sched/deadline: Clean up various coding style details
[linux-2.6-block.git] / kernel / sched / fair.c
CommitLineData
b2441318 1// SPDX-License-Identifier: GPL-2.0
bf0f6f24
IM
2/*
3 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
4 *
5 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6 *
7 * Interactivity improvements by Mike Galbraith
8 * (C) 2007 Mike Galbraith <efault@gmx.de>
9 *
10 * Various enhancements by Dmitry Adamushko.
11 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
12 *
13 * Group scheduling enhancements by Srivatsa Vaddagiri
14 * Copyright IBM Corporation, 2007
15 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
16 *
17 * Scaled math optimizations by Thomas Gleixner
18 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
21805085
PZ
19 *
20 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
90eec103 21 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
bf0f6f24
IM
22 */
23
589ee628 24#include <linux/sched/mm.h>
105ab3d8
IM
25#include <linux/sched/topology.h>
26
cb251765 27#include <linux/latencytop.h>
3436ae12 28#include <linux/cpumask.h>
83a0a96a 29#include <linux/cpuidle.h>
029632fb
PZ
30#include <linux/slab.h>
31#include <linux/profile.h>
32#include <linux/interrupt.h>
cbee9f88 33#include <linux/mempolicy.h>
e14808b4 34#include <linux/migrate.h>
cbee9f88 35#include <linux/task_work.h>
78634061 36#include <linux/sched/isolation.h>
029632fb
PZ
37
38#include <trace/events/sched.h>
39
40#include "sched.h"
9745512c 41
bf0f6f24 42/*
21805085 43 * Targeted preemption latency for CPU-bound tasks:
bf0f6f24 44 *
21805085 45 * NOTE: this latency value is not the same as the concept of
d274a4ce
IM
46 * 'timeslice length' - timeslices in CFS are of variable length
47 * and have no persistent notion like in traditional, time-slice
48 * based scheduling concepts.
bf0f6f24 49 *
d274a4ce
IM
50 * (to see the precise effective timeslice length of your workload,
51 * run vmstat and monitor the context-switches (cs) field)
2b4d5b25
IM
52 *
53 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
bf0f6f24 54 */
2b4d5b25
IM
55unsigned int sysctl_sched_latency = 6000000ULL;
56unsigned int normalized_sysctl_sched_latency = 6000000ULL;
2bd8e6d4 57
1983a922
CE
58/*
59 * The initial- and re-scaling of tunables is configurable
1983a922
CE
60 *
61 * Options are:
2b4d5b25
IM
62 *
63 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
64 * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
65 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
66 *
67 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
1983a922 68 */
2b4d5b25 69enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
1983a922 70
2bd8e6d4 71/*
b2be5e96 72 * Minimal preemption granularity for CPU-bound tasks:
2b4d5b25 73 *
864616ee 74 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
2bd8e6d4 75 */
2b4d5b25
IM
76unsigned int sysctl_sched_min_granularity = 750000ULL;
77unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
21805085
PZ
78
79/*
2b4d5b25 80 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
b2be5e96 81 */
0bf377bb 82static unsigned int sched_nr_latency = 8;
b2be5e96
PZ
83
84/*
2bba22c5 85 * After fork, child runs first. If set to 0 (default) then
b2be5e96 86 * parent will (try to) run first.
21805085 87 */
2bba22c5 88unsigned int sysctl_sched_child_runs_first __read_mostly;
bf0f6f24 89
bf0f6f24
IM
90/*
91 * SCHED_OTHER wake-up granularity.
bf0f6f24
IM
92 *
93 * This option delays the preemption effects of decoupled workloads
94 * and reduces their over-scheduling. Synchronous workloads will still
95 * have immediate wakeup/sleep latencies.
2b4d5b25
IM
96 *
97 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
bf0f6f24 98 */
2b4d5b25
IM
99unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
100unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
bf0f6f24 101
2b4d5b25 102const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
da84d961 103
afe06efd
TC
104#ifdef CONFIG_SMP
105/*
106 * For asym packing, by default the lower numbered cpu has higher priority.
107 */
108int __weak arch_asym_cpu_priority(int cpu)
109{
110 return -cpu;
111}
112#endif
113
ec12cb7f
PT
114#ifdef CONFIG_CFS_BANDWIDTH
115/*
116 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
117 * each time a cfs_rq requests quota.
118 *
119 * Note: in the case that the slice exceeds the runtime remaining (either due
120 * to consumption or the quota being specified to be smaller than the slice)
121 * we will always only issue the remaining available time.
122 *
2b4d5b25
IM
123 * (default: 5 msec, units: microseconds)
124 */
125unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
ec12cb7f
PT
126#endif
127
3273163c
MR
128/*
129 * The margin used when comparing utilization with CPU capacity:
893c5d22 130 * util * margin < capacity * 1024
2b4d5b25
IM
131 *
132 * (default: ~20%)
3273163c 133 */
2b4d5b25 134unsigned int capacity_margin = 1280;
3273163c 135
8527632d
PG
136static inline void update_load_add(struct load_weight *lw, unsigned long inc)
137{
138 lw->weight += inc;
139 lw->inv_weight = 0;
140}
141
142static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
143{
144 lw->weight -= dec;
145 lw->inv_weight = 0;
146}
147
148static inline void update_load_set(struct load_weight *lw, unsigned long w)
149{
150 lw->weight = w;
151 lw->inv_weight = 0;
152}
153
029632fb
PZ
154/*
155 * Increase the granularity value when there are more CPUs,
156 * because with more CPUs the 'effective latency' as visible
157 * to users decreases. But the relationship is not linear,
158 * so pick a second-best guess by going with the log2 of the
159 * number of CPUs.
160 *
161 * This idea comes from the SD scheduler of Con Kolivas:
162 */
58ac93e4 163static unsigned int get_update_sysctl_factor(void)
029632fb 164{
58ac93e4 165 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
029632fb
PZ
166 unsigned int factor;
167
168 switch (sysctl_sched_tunable_scaling) {
169 case SCHED_TUNABLESCALING_NONE:
170 factor = 1;
171 break;
172 case SCHED_TUNABLESCALING_LINEAR:
173 factor = cpus;
174 break;
175 case SCHED_TUNABLESCALING_LOG:
176 default:
177 factor = 1 + ilog2(cpus);
178 break;
179 }
180
181 return factor;
182}
183
184static void update_sysctl(void)
185{
186 unsigned int factor = get_update_sysctl_factor();
187
188#define SET_SYSCTL(name) \
189 (sysctl_##name = (factor) * normalized_sysctl_##name)
190 SET_SYSCTL(sched_min_granularity);
191 SET_SYSCTL(sched_latency);
192 SET_SYSCTL(sched_wakeup_granularity);
193#undef SET_SYSCTL
194}
195
196void sched_init_granularity(void)
197{
198 update_sysctl();
199}
200
9dbdb155 201#define WMULT_CONST (~0U)
029632fb
PZ
202#define WMULT_SHIFT 32
203
9dbdb155
PZ
204static void __update_inv_weight(struct load_weight *lw)
205{
206 unsigned long w;
207
208 if (likely(lw->inv_weight))
209 return;
210
211 w = scale_load_down(lw->weight);
212
213 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
214 lw->inv_weight = 1;
215 else if (unlikely(!w))
216 lw->inv_weight = WMULT_CONST;
217 else
218 lw->inv_weight = WMULT_CONST / w;
219}
029632fb
PZ
220
221/*
9dbdb155
PZ
222 * delta_exec * weight / lw.weight
223 * OR
224 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
225 *
1c3de5e1 226 * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
9dbdb155
PZ
227 * we're guaranteed shift stays positive because inv_weight is guaranteed to
228 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
229 *
230 * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
231 * weight/lw.weight <= 1, and therefore our shift will also be positive.
029632fb 232 */
9dbdb155 233static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
029632fb 234{
9dbdb155
PZ
235 u64 fact = scale_load_down(weight);
236 int shift = WMULT_SHIFT;
029632fb 237
9dbdb155 238 __update_inv_weight(lw);
029632fb 239
9dbdb155
PZ
240 if (unlikely(fact >> 32)) {
241 while (fact >> 32) {
242 fact >>= 1;
243 shift--;
244 }
029632fb
PZ
245 }
246
9dbdb155
PZ
247 /* hint to use a 32x32->64 mul */
248 fact = (u64)(u32)fact * lw->inv_weight;
029632fb 249
9dbdb155
PZ
250 while (fact >> 32) {
251 fact >>= 1;
252 shift--;
253 }
029632fb 254
9dbdb155 255 return mul_u64_u32_shr(delta_exec, fact, shift);
029632fb
PZ
256}
257
258
259const struct sched_class fair_sched_class;
a4c2f00f 260
bf0f6f24
IM
261/**************************************************************
262 * CFS operations on generic schedulable entities:
263 */
264
62160e3f 265#ifdef CONFIG_FAIR_GROUP_SCHED
bf0f6f24 266
62160e3f 267/* cpu runqueue to which this cfs_rq is attached */
bf0f6f24
IM
268static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
269{
62160e3f 270 return cfs_rq->rq;
bf0f6f24
IM
271}
272
62160e3f
IM
273/* An entity is a task if it doesn't "own" a runqueue */
274#define entity_is_task(se) (!se->my_q)
bf0f6f24 275
8f48894f
PZ
276static inline struct task_struct *task_of(struct sched_entity *se)
277{
9148a3a1 278 SCHED_WARN_ON(!entity_is_task(se));
8f48894f
PZ
279 return container_of(se, struct task_struct, se);
280}
281
b758149c
PZ
282/* Walk up scheduling entities hierarchy */
283#define for_each_sched_entity(se) \
284 for (; se; se = se->parent)
285
286static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
287{
288 return p->se.cfs_rq;
289}
290
291/* runqueue on which this entity is (to be) queued */
292static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
293{
294 return se->cfs_rq;
295}
296
297/* runqueue "owned" by this group */
298static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
299{
300 return grp->my_q;
301}
302
3d4b47b4
PZ
303static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
304{
305 if (!cfs_rq->on_list) {
9c2791f9
VG
306 struct rq *rq = rq_of(cfs_rq);
307 int cpu = cpu_of(rq);
67e86250
PT
308 /*
309 * Ensure we either appear before our parent (if already
310 * enqueued) or force our parent to appear after us when it is
9c2791f9
VG
311 * enqueued. The fact that we always enqueue bottom-up
312 * reduces this to two cases and a special case for the root
313 * cfs_rq. Furthermore, it also means that we will always reset
314 * tmp_alone_branch either when the branch is connected
315 * to a tree or when we reach the beg of the tree
67e86250
PT
316 */
317 if (cfs_rq->tg->parent &&
9c2791f9
VG
318 cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
319 /*
320 * If parent is already on the list, we add the child
321 * just before. Thanks to circular linked property of
322 * the list, this means to put the child at the tail
323 * of the list that starts by parent.
324 */
325 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
326 &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
327 /*
328 * The branch is now connected to its tree so we can
329 * reset tmp_alone_branch to the beginning of the
330 * list.
331 */
332 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
333 } else if (!cfs_rq->tg->parent) {
334 /*
335 * cfs rq without parent should be put
336 * at the tail of the list.
337 */
67e86250 338 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
9c2791f9
VG
339 &rq->leaf_cfs_rq_list);
340 /*
341 * We have reach the beg of a tree so we can reset
342 * tmp_alone_branch to the beginning of the list.
343 */
344 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
345 } else {
346 /*
347 * The parent has not already been added so we want to
348 * make sure that it will be put after us.
349 * tmp_alone_branch points to the beg of the branch
350 * where we will add parent.
351 */
352 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
353 rq->tmp_alone_branch);
354 /*
355 * update tmp_alone_branch to points to the new beg
356 * of the branch
357 */
358 rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
67e86250 359 }
3d4b47b4
PZ
360
361 cfs_rq->on_list = 1;
362 }
363}
364
365static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
366{
367 if (cfs_rq->on_list) {
368 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
369 cfs_rq->on_list = 0;
370 }
371}
372
b758149c 373/* Iterate thr' all leaf cfs_rq's on a runqueue */
a9e7f654
TH
374#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
375 list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
376 leaf_cfs_rq_list)
b758149c
PZ
377
378/* Do the two (enqueued) entities belong to the same group ? */
fed14d45 379static inline struct cfs_rq *
b758149c
PZ
380is_same_group(struct sched_entity *se, struct sched_entity *pse)
381{
382 if (se->cfs_rq == pse->cfs_rq)
fed14d45 383 return se->cfs_rq;
b758149c 384
fed14d45 385 return NULL;
b758149c
PZ
386}
387
388static inline struct sched_entity *parent_entity(struct sched_entity *se)
389{
390 return se->parent;
391}
392
464b7527
PZ
393static void
394find_matching_se(struct sched_entity **se, struct sched_entity **pse)
395{
396 int se_depth, pse_depth;
397
398 /*
399 * preemption test can be made between sibling entities who are in the
400 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
401 * both tasks until we find their ancestors who are siblings of common
402 * parent.
403 */
404
405 /* First walk up until both entities are at same depth */
fed14d45
PZ
406 se_depth = (*se)->depth;
407 pse_depth = (*pse)->depth;
464b7527
PZ
408
409 while (se_depth > pse_depth) {
410 se_depth--;
411 *se = parent_entity(*se);
412 }
413
414 while (pse_depth > se_depth) {
415 pse_depth--;
416 *pse = parent_entity(*pse);
417 }
418
419 while (!is_same_group(*se, *pse)) {
420 *se = parent_entity(*se);
421 *pse = parent_entity(*pse);
422 }
423}
424
8f48894f
PZ
425#else /* !CONFIG_FAIR_GROUP_SCHED */
426
427static inline struct task_struct *task_of(struct sched_entity *se)
428{
429 return container_of(se, struct task_struct, se);
430}
bf0f6f24 431
62160e3f
IM
432static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
433{
434 return container_of(cfs_rq, struct rq, cfs);
bf0f6f24
IM
435}
436
437#define entity_is_task(se) 1
438
b758149c
PZ
439#define for_each_sched_entity(se) \
440 for (; se; se = NULL)
bf0f6f24 441
b758149c 442static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
bf0f6f24 443{
b758149c 444 return &task_rq(p)->cfs;
bf0f6f24
IM
445}
446
b758149c
PZ
447static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
448{
449 struct task_struct *p = task_of(se);
450 struct rq *rq = task_rq(p);
451
452 return &rq->cfs;
453}
454
455/* runqueue "owned" by this group */
456static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
457{
458 return NULL;
459}
460
3d4b47b4
PZ
461static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
462{
463}
464
465static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
466{
467}
468
a9e7f654
TH
469#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
470 for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
b758149c 471
b758149c
PZ
472static inline struct sched_entity *parent_entity(struct sched_entity *se)
473{
474 return NULL;
475}
476
464b7527
PZ
477static inline void
478find_matching_se(struct sched_entity **se, struct sched_entity **pse)
479{
480}
481
b758149c
PZ
482#endif /* CONFIG_FAIR_GROUP_SCHED */
483
6c16a6dc 484static __always_inline
9dbdb155 485void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
bf0f6f24
IM
486
487/**************************************************************
488 * Scheduling class tree data structure manipulation methods:
489 */
490
1bf08230 491static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
02e0431a 492{
1bf08230 493 s64 delta = (s64)(vruntime - max_vruntime);
368059a9 494 if (delta > 0)
1bf08230 495 max_vruntime = vruntime;
02e0431a 496
1bf08230 497 return max_vruntime;
02e0431a
PZ
498}
499
0702e3eb 500static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
b0ffd246
PZ
501{
502 s64 delta = (s64)(vruntime - min_vruntime);
503 if (delta < 0)
504 min_vruntime = vruntime;
505
506 return min_vruntime;
507}
508
54fdc581
FC
509static inline int entity_before(struct sched_entity *a,
510 struct sched_entity *b)
511{
512 return (s64)(a->vruntime - b->vruntime) < 0;
513}
514
1af5f730
PZ
515static void update_min_vruntime(struct cfs_rq *cfs_rq)
516{
b60205c7 517 struct sched_entity *curr = cfs_rq->curr;
bfb06889 518 struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
b60205c7 519
1af5f730
PZ
520 u64 vruntime = cfs_rq->min_vruntime;
521
b60205c7
PZ
522 if (curr) {
523 if (curr->on_rq)
524 vruntime = curr->vruntime;
525 else
526 curr = NULL;
527 }
1af5f730 528
bfb06889
DB
529 if (leftmost) { /* non-empty tree */
530 struct sched_entity *se;
531 se = rb_entry(leftmost, struct sched_entity, run_node);
1af5f730 532
b60205c7 533 if (!curr)
1af5f730
PZ
534 vruntime = se->vruntime;
535 else
536 vruntime = min_vruntime(vruntime, se->vruntime);
537 }
538
1bf08230 539 /* ensure we never gain time by being placed backwards. */
1af5f730 540 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
3fe1698b
PZ
541#ifndef CONFIG_64BIT
542 smp_wmb();
543 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
544#endif
1af5f730
PZ
545}
546
bf0f6f24
IM
547/*
548 * Enqueue an entity into the rb-tree:
549 */
0702e3eb 550static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 551{
bfb06889 552 struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
bf0f6f24
IM
553 struct rb_node *parent = NULL;
554 struct sched_entity *entry;
bfb06889 555 bool leftmost = true;
bf0f6f24
IM
556
557 /*
558 * Find the right place in the rbtree:
559 */
560 while (*link) {
561 parent = *link;
562 entry = rb_entry(parent, struct sched_entity, run_node);
563 /*
564 * We dont care about collisions. Nodes with
565 * the same key stay together.
566 */
2bd2d6f2 567 if (entity_before(se, entry)) {
bf0f6f24
IM
568 link = &parent->rb_left;
569 } else {
570 link = &parent->rb_right;
bfb06889 571 leftmost = false;
bf0f6f24
IM
572 }
573 }
574
bf0f6f24 575 rb_link_node(&se->run_node, parent, link);
bfb06889
DB
576 rb_insert_color_cached(&se->run_node,
577 &cfs_rq->tasks_timeline, leftmost);
bf0f6f24
IM
578}
579
0702e3eb 580static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 581{
bfb06889 582 rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
bf0f6f24
IM
583}
584
029632fb 585struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
bf0f6f24 586{
bfb06889 587 struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
f4b6755f
PZ
588
589 if (!left)
590 return NULL;
591
592 return rb_entry(left, struct sched_entity, run_node);
bf0f6f24
IM
593}
594
ac53db59
RR
595static struct sched_entity *__pick_next_entity(struct sched_entity *se)
596{
597 struct rb_node *next = rb_next(&se->run_node);
598
599 if (!next)
600 return NULL;
601
602 return rb_entry(next, struct sched_entity, run_node);
603}
604
605#ifdef CONFIG_SCHED_DEBUG
029632fb 606struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
aeb73b04 607{
bfb06889 608 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
aeb73b04 609
70eee74b
BS
610 if (!last)
611 return NULL;
7eee3e67
IM
612
613 return rb_entry(last, struct sched_entity, run_node);
aeb73b04
PZ
614}
615
bf0f6f24
IM
616/**************************************************************
617 * Scheduling class statistics methods:
618 */
619
acb4a848 620int sched_proc_update_handler(struct ctl_table *table, int write,
8d65af78 621 void __user *buffer, size_t *lenp,
b2be5e96
PZ
622 loff_t *ppos)
623{
8d65af78 624 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
58ac93e4 625 unsigned int factor = get_update_sysctl_factor();
b2be5e96
PZ
626
627 if (ret || !write)
628 return ret;
629
630 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
631 sysctl_sched_min_granularity);
632
acb4a848
CE
633#define WRT_SYSCTL(name) \
634 (normalized_sysctl_##name = sysctl_##name / (factor))
635 WRT_SYSCTL(sched_min_granularity);
636 WRT_SYSCTL(sched_latency);
637 WRT_SYSCTL(sched_wakeup_granularity);
acb4a848
CE
638#undef WRT_SYSCTL
639
b2be5e96
PZ
640 return 0;
641}
642#endif
647e7cac 643
a7be37ac 644/*
f9c0b095 645 * delta /= w
a7be37ac 646 */
9dbdb155 647static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
a7be37ac 648{
f9c0b095 649 if (unlikely(se->load.weight != NICE_0_LOAD))
9dbdb155 650 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
a7be37ac
PZ
651
652 return delta;
653}
654
647e7cac
IM
655/*
656 * The idea is to set a period in which each task runs once.
657 *
532b1858 658 * When there are too many tasks (sched_nr_latency) we have to stretch
647e7cac
IM
659 * this period because otherwise the slices get too small.
660 *
661 * p = (nr <= nl) ? l : l*nr/nl
662 */
4d78e7b6
PZ
663static u64 __sched_period(unsigned long nr_running)
664{
8e2b0bf3
BF
665 if (unlikely(nr_running > sched_nr_latency))
666 return nr_running * sysctl_sched_min_granularity;
667 else
668 return sysctl_sched_latency;
4d78e7b6
PZ
669}
670
647e7cac
IM
671/*
672 * We calculate the wall-time slice from the period by taking a part
673 * proportional to the weight.
674 *
f9c0b095 675 * s = p*P[w/rw]
647e7cac 676 */
6d0f0ebd 677static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
21805085 678{
0a582440 679 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
f9c0b095 680
0a582440 681 for_each_sched_entity(se) {
6272d68c 682 struct load_weight *load;
3104bf03 683 struct load_weight lw;
6272d68c
LM
684
685 cfs_rq = cfs_rq_of(se);
686 load = &cfs_rq->load;
f9c0b095 687
0a582440 688 if (unlikely(!se->on_rq)) {
3104bf03 689 lw = cfs_rq->load;
0a582440
MG
690
691 update_load_add(&lw, se->load.weight);
692 load = &lw;
693 }
9dbdb155 694 slice = __calc_delta(slice, se->load.weight, load);
0a582440
MG
695 }
696 return slice;
bf0f6f24
IM
697}
698
647e7cac 699/*
660cc00f 700 * We calculate the vruntime slice of a to-be-inserted task.
647e7cac 701 *
f9c0b095 702 * vs = s/w
647e7cac 703 */
f9c0b095 704static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
67e9fb2a 705{
f9c0b095 706 return calc_delta_fair(sched_slice(cfs_rq, se), se);
a7be37ac
PZ
707}
708
a75cdaa9 709#ifdef CONFIG_SMP
283e2ed3
PZ
710
711#include "sched-pelt.h"
712
772bd008 713static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
fb13c7ee
MG
714static unsigned long task_h_load(struct task_struct *p);
715
540247fb
YD
716/* Give new sched_entity start runnable values to heavy its load in infant time */
717void init_entity_runnable_average(struct sched_entity *se)
a75cdaa9 718{
540247fb 719 struct sched_avg *sa = &se->avg;
a75cdaa9 720
f207934f
PZ
721 memset(sa, 0, sizeof(*sa));
722
b5a9b340
VG
723 /*
724 * Tasks are intialized with full load to be seen as heavy tasks until
725 * they get a chance to stabilize to their real load level.
726 * Group entities are intialized with zero load to reflect the fact that
727 * nothing has been attached to the task group yet.
728 */
729 if (entity_is_task(se))
1ea6c46a 730 sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
1ea6c46a 731
f207934f
PZ
732 se->runnable_weight = se->load.weight;
733
9d89c257 734 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
a75cdaa9 735}
7ea241af 736
7dc603c9 737static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
df217913 738static void attach_entity_cfs_rq(struct sched_entity *se);
7dc603c9 739
2b8c41da
YD
740/*
741 * With new tasks being created, their initial util_avgs are extrapolated
742 * based on the cfs_rq's current util_avg:
743 *
744 * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
745 *
746 * However, in many cases, the above util_avg does not give a desired
747 * value. Moreover, the sum of the util_avgs may be divergent, such
748 * as when the series is a harmonic series.
749 *
750 * To solve this problem, we also cap the util_avg of successive tasks to
751 * only 1/2 of the left utilization budget:
752 *
753 * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
754 *
755 * where n denotes the nth task.
756 *
757 * For example, a simplest series from the beginning would be like:
758 *
759 * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
760 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
761 *
762 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
763 * if util_avg > util_avg_cap.
764 */
765void post_init_entity_util_avg(struct sched_entity *se)
766{
767 struct cfs_rq *cfs_rq = cfs_rq_of(se);
768 struct sched_avg *sa = &se->avg;
172895e6 769 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
2b8c41da
YD
770
771 if (cap > 0) {
772 if (cfs_rq->avg.util_avg != 0) {
773 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
774 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
775
776 if (sa->util_avg > cap)
777 sa->util_avg = cap;
778 } else {
779 sa->util_avg = cap;
780 }
2b8c41da 781 }
7dc603c9
PZ
782
783 if (entity_is_task(se)) {
784 struct task_struct *p = task_of(se);
785 if (p->sched_class != &fair_sched_class) {
786 /*
787 * For !fair tasks do:
788 *
3a123bbb 789 update_cfs_rq_load_avg(now, cfs_rq);
7dc603c9
PZ
790 attach_entity_load_avg(cfs_rq, se);
791 switched_from_fair(rq, p);
792 *
793 * such that the next switched_to_fair() has the
794 * expected state.
795 */
df217913 796 se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
7dc603c9
PZ
797 return;
798 }
799 }
800
df217913 801 attach_entity_cfs_rq(se);
2b8c41da
YD
802}
803
7dc603c9 804#else /* !CONFIG_SMP */
540247fb 805void init_entity_runnable_average(struct sched_entity *se)
a75cdaa9
AS
806{
807}
2b8c41da
YD
808void post_init_entity_util_avg(struct sched_entity *se)
809{
810}
3d30544f
PZ
811static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
812{
813}
7dc603c9 814#endif /* CONFIG_SMP */
a75cdaa9 815
bf0f6f24 816/*
9dbdb155 817 * Update the current task's runtime statistics.
bf0f6f24 818 */
b7cc0896 819static void update_curr(struct cfs_rq *cfs_rq)
bf0f6f24 820{
429d43bc 821 struct sched_entity *curr = cfs_rq->curr;
78becc27 822 u64 now = rq_clock_task(rq_of(cfs_rq));
9dbdb155 823 u64 delta_exec;
bf0f6f24
IM
824
825 if (unlikely(!curr))
826 return;
827
9dbdb155
PZ
828 delta_exec = now - curr->exec_start;
829 if (unlikely((s64)delta_exec <= 0))
34f28ecd 830 return;
bf0f6f24 831
8ebc91d9 832 curr->exec_start = now;
d842de87 833
9dbdb155
PZ
834 schedstat_set(curr->statistics.exec_max,
835 max(delta_exec, curr->statistics.exec_max));
836
837 curr->sum_exec_runtime += delta_exec;
ae92882e 838 schedstat_add(cfs_rq->exec_clock, delta_exec);
9dbdb155
PZ
839
840 curr->vruntime += calc_delta_fair(delta_exec, curr);
841 update_min_vruntime(cfs_rq);
842
d842de87
SV
843 if (entity_is_task(curr)) {
844 struct task_struct *curtask = task_of(curr);
845
f977bb49 846 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
d2cc5ed6 847 cgroup_account_cputime(curtask, delta_exec);
f06febc9 848 account_group_exec_runtime(curtask, delta_exec);
d842de87 849 }
ec12cb7f
PT
850
851 account_cfs_rq_runtime(cfs_rq, delta_exec);
bf0f6f24
IM
852}
853
6e998916
SG
854static void update_curr_fair(struct rq *rq)
855{
856 update_curr(cfs_rq_of(&rq->curr->se));
857}
858
bf0f6f24 859static inline void
5870db5b 860update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 861{
4fa8d299
JP
862 u64 wait_start, prev_wait_start;
863
864 if (!schedstat_enabled())
865 return;
866
867 wait_start = rq_clock(rq_of(cfs_rq));
868 prev_wait_start = schedstat_val(se->statistics.wait_start);
3ea94de1
JP
869
870 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
4fa8d299
JP
871 likely(wait_start > prev_wait_start))
872 wait_start -= prev_wait_start;
3ea94de1 873
2ed41a55 874 __schedstat_set(se->statistics.wait_start, wait_start);
bf0f6f24
IM
875}
876
4fa8d299 877static inline void
3ea94de1
JP
878update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
879{
880 struct task_struct *p;
cb251765
MG
881 u64 delta;
882
4fa8d299
JP
883 if (!schedstat_enabled())
884 return;
885
886 delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
3ea94de1
JP
887
888 if (entity_is_task(se)) {
889 p = task_of(se);
890 if (task_on_rq_migrating(p)) {
891 /*
892 * Preserve migrating task's wait time so wait_start
893 * time stamp can be adjusted to accumulate wait time
894 * prior to migration.
895 */
2ed41a55 896 __schedstat_set(se->statistics.wait_start, delta);
3ea94de1
JP
897 return;
898 }
899 trace_sched_stat_wait(p, delta);
900 }
901
2ed41a55 902 __schedstat_set(se->statistics.wait_max,
4fa8d299 903 max(schedstat_val(se->statistics.wait_max), delta));
2ed41a55
PZ
904 __schedstat_inc(se->statistics.wait_count);
905 __schedstat_add(se->statistics.wait_sum, delta);
906 __schedstat_set(se->statistics.wait_start, 0);
3ea94de1 907}
3ea94de1 908
4fa8d299 909static inline void
1a3d027c
JP
910update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
911{
912 struct task_struct *tsk = NULL;
4fa8d299
JP
913 u64 sleep_start, block_start;
914
915 if (!schedstat_enabled())
916 return;
917
918 sleep_start = schedstat_val(se->statistics.sleep_start);
919 block_start = schedstat_val(se->statistics.block_start);
1a3d027c
JP
920
921 if (entity_is_task(se))
922 tsk = task_of(se);
923
4fa8d299
JP
924 if (sleep_start) {
925 u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
1a3d027c
JP
926
927 if ((s64)delta < 0)
928 delta = 0;
929
4fa8d299 930 if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
2ed41a55 931 __schedstat_set(se->statistics.sleep_max, delta);
1a3d027c 932
2ed41a55
PZ
933 __schedstat_set(se->statistics.sleep_start, 0);
934 __schedstat_add(se->statistics.sum_sleep_runtime, delta);
1a3d027c
JP
935
936 if (tsk) {
937 account_scheduler_latency(tsk, delta >> 10, 1);
938 trace_sched_stat_sleep(tsk, delta);
939 }
940 }
4fa8d299
JP
941 if (block_start) {
942 u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
1a3d027c
JP
943
944 if ((s64)delta < 0)
945 delta = 0;
946
4fa8d299 947 if (unlikely(delta > schedstat_val(se->statistics.block_max)))
2ed41a55 948 __schedstat_set(se->statistics.block_max, delta);
1a3d027c 949
2ed41a55
PZ
950 __schedstat_set(se->statistics.block_start, 0);
951 __schedstat_add(se->statistics.sum_sleep_runtime, delta);
1a3d027c
JP
952
953 if (tsk) {
954 if (tsk->in_iowait) {
2ed41a55
PZ
955 __schedstat_add(se->statistics.iowait_sum, delta);
956 __schedstat_inc(se->statistics.iowait_count);
1a3d027c
JP
957 trace_sched_stat_iowait(tsk, delta);
958 }
959
960 trace_sched_stat_blocked(tsk, delta);
961
962 /*
963 * Blocking time is in units of nanosecs, so shift by
964 * 20 to get a milliseconds-range estimation of the
965 * amount of time that the task spent sleeping:
966 */
967 if (unlikely(prof_on == SLEEP_PROFILING)) {
968 profile_hits(SLEEP_PROFILING,
969 (void *)get_wchan(tsk),
970 delta >> 20);
971 }
972 account_scheduler_latency(tsk, delta >> 10, 0);
973 }
974 }
3ea94de1 975}
3ea94de1 976
bf0f6f24
IM
977/*
978 * Task is being enqueued - update stats:
979 */
cb251765 980static inline void
1a3d027c 981update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
bf0f6f24 982{
4fa8d299
JP
983 if (!schedstat_enabled())
984 return;
985
bf0f6f24
IM
986 /*
987 * Are we enqueueing a waiting task? (for current tasks
988 * a dequeue/enqueue event is a NOP)
989 */
429d43bc 990 if (se != cfs_rq->curr)
5870db5b 991 update_stats_wait_start(cfs_rq, se);
1a3d027c
JP
992
993 if (flags & ENQUEUE_WAKEUP)
994 update_stats_enqueue_sleeper(cfs_rq, se);
bf0f6f24
IM
995}
996
bf0f6f24 997static inline void
cb251765 998update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
bf0f6f24 999{
4fa8d299
JP
1000
1001 if (!schedstat_enabled())
1002 return;
1003
bf0f6f24
IM
1004 /*
1005 * Mark the end of the wait period if dequeueing a
1006 * waiting task:
1007 */
429d43bc 1008 if (se != cfs_rq->curr)
9ef0a961 1009 update_stats_wait_end(cfs_rq, se);
cb251765 1010
4fa8d299
JP
1011 if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
1012 struct task_struct *tsk = task_of(se);
cb251765 1013
4fa8d299 1014 if (tsk->state & TASK_INTERRUPTIBLE)
2ed41a55 1015 __schedstat_set(se->statistics.sleep_start,
4fa8d299
JP
1016 rq_clock(rq_of(cfs_rq)));
1017 if (tsk->state & TASK_UNINTERRUPTIBLE)
2ed41a55 1018 __schedstat_set(se->statistics.block_start,
4fa8d299 1019 rq_clock(rq_of(cfs_rq)));
cb251765 1020 }
cb251765
MG
1021}
1022
bf0f6f24
IM
1023/*
1024 * We are picking a new current task - update its stats:
1025 */
1026static inline void
79303e9e 1027update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24
IM
1028{
1029 /*
1030 * We are starting a new run period:
1031 */
78becc27 1032 se->exec_start = rq_clock_task(rq_of(cfs_rq));
bf0f6f24
IM
1033}
1034
bf0f6f24
IM
1035/**************************************************
1036 * Scheduling class queueing methods:
1037 */
1038
cbee9f88
PZ
1039#ifdef CONFIG_NUMA_BALANCING
1040/*
598f0ec0
MG
1041 * Approximate time to scan a full NUMA task in ms. The task scan period is
1042 * calculated based on the tasks virtual memory size and
1043 * numa_balancing_scan_size.
cbee9f88 1044 */
598f0ec0
MG
1045unsigned int sysctl_numa_balancing_scan_period_min = 1000;
1046unsigned int sysctl_numa_balancing_scan_period_max = 60000;
6e5fb223
PZ
1047
1048/* Portion of address space to scan in MB */
1049unsigned int sysctl_numa_balancing_scan_size = 256;
cbee9f88 1050
4b96a29b
PZ
1051/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
1052unsigned int sysctl_numa_balancing_scan_delay = 1000;
1053
b5dd77c8
RR
1054struct numa_group {
1055 atomic_t refcount;
1056
1057 spinlock_t lock; /* nr_tasks, tasks */
1058 int nr_tasks;
1059 pid_t gid;
1060 int active_nodes;
1061
1062 struct rcu_head rcu;
1063 unsigned long total_faults;
1064 unsigned long max_faults_cpu;
1065 /*
1066 * Faults_cpu is used to decide whether memory should move
1067 * towards the CPU. As a consequence, these stats are weighted
1068 * more by CPU use than by memory faults.
1069 */
1070 unsigned long *faults_cpu;
1071 unsigned long faults[0];
1072};
1073
1074static inline unsigned long group_faults_priv(struct numa_group *ng);
1075static inline unsigned long group_faults_shared(struct numa_group *ng);
1076
598f0ec0
MG
1077static unsigned int task_nr_scan_windows(struct task_struct *p)
1078{
1079 unsigned long rss = 0;
1080 unsigned long nr_scan_pages;
1081
1082 /*
1083 * Calculations based on RSS as non-present and empty pages are skipped
1084 * by the PTE scanner and NUMA hinting faults should be trapped based
1085 * on resident pages
1086 */
1087 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1088 rss = get_mm_rss(p->mm);
1089 if (!rss)
1090 rss = nr_scan_pages;
1091
1092 rss = round_up(rss, nr_scan_pages);
1093 return rss / nr_scan_pages;
1094}
1095
1096/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
1097#define MAX_SCAN_WINDOW 2560
1098
1099static unsigned int task_scan_min(struct task_struct *p)
1100{
316c1608 1101 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
598f0ec0
MG
1102 unsigned int scan, floor;
1103 unsigned int windows = 1;
1104
64192658
KT
1105 if (scan_size < MAX_SCAN_WINDOW)
1106 windows = MAX_SCAN_WINDOW / scan_size;
598f0ec0
MG
1107 floor = 1000 / windows;
1108
1109 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1110 return max_t(unsigned int, floor, scan);
1111}
1112
b5dd77c8
RR
1113static unsigned int task_scan_start(struct task_struct *p)
1114{
1115 unsigned long smin = task_scan_min(p);
1116 unsigned long period = smin;
1117
1118 /* Scale the maximum scan period with the amount of shared memory. */
1119 if (p->numa_group) {
1120 struct numa_group *ng = p->numa_group;
1121 unsigned long shared = group_faults_shared(ng);
1122 unsigned long private = group_faults_priv(ng);
1123
1124 period *= atomic_read(&ng->refcount);
1125 period *= shared + 1;
1126 period /= private + shared + 1;
1127 }
1128
1129 return max(smin, period);
1130}
1131
598f0ec0
MG
1132static unsigned int task_scan_max(struct task_struct *p)
1133{
b5dd77c8
RR
1134 unsigned long smin = task_scan_min(p);
1135 unsigned long smax;
598f0ec0
MG
1136
1137 /* Watch for min being lower than max due to floor calculations */
1138 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
b5dd77c8
RR
1139
1140 /* Scale the maximum scan period with the amount of shared memory. */
1141 if (p->numa_group) {
1142 struct numa_group *ng = p->numa_group;
1143 unsigned long shared = group_faults_shared(ng);
1144 unsigned long private = group_faults_priv(ng);
1145 unsigned long period = smax;
1146
1147 period *= atomic_read(&ng->refcount);
1148 period *= shared + 1;
1149 period /= private + shared + 1;
1150
1151 smax = max(smax, period);
1152 }
1153
598f0ec0
MG
1154 return max(smin, smax);
1155}
1156
0ec8aa00
PZ
1157static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1158{
1159 rq->nr_numa_running += (p->numa_preferred_nid != -1);
1160 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1161}
1162
1163static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1164{
1165 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
1166 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1167}
1168
be1e4e76
RR
1169/* Shared or private faults. */
1170#define NR_NUMA_HINT_FAULT_TYPES 2
1171
1172/* Memory and CPU locality */
1173#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1174
1175/* Averaged statistics, and temporary buffers. */
1176#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1177
e29cf08b
MG
1178pid_t task_numa_group_id(struct task_struct *p)
1179{
1180 return p->numa_group ? p->numa_group->gid : 0;
1181}
1182
44dba3d5
IM
1183/*
1184 * The averaged statistics, shared & private, memory & cpu,
1185 * occupy the first half of the array. The second half of the
1186 * array is for current counters, which are averaged into the
1187 * first set by task_numa_placement.
1188 */
1189static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
ac8e895b 1190{
44dba3d5 1191 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
ac8e895b
MG
1192}
1193
1194static inline unsigned long task_faults(struct task_struct *p, int nid)
1195{
44dba3d5 1196 if (!p->numa_faults)
ac8e895b
MG
1197 return 0;
1198
44dba3d5
IM
1199 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1200 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
ac8e895b
MG
1201}
1202
83e1d2cd
MG
1203static inline unsigned long group_faults(struct task_struct *p, int nid)
1204{
1205 if (!p->numa_group)
1206 return 0;
1207
44dba3d5
IM
1208 return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1209 p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
83e1d2cd
MG
1210}
1211
20e07dea
RR
1212static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1213{
44dba3d5
IM
1214 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1215 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
20e07dea
RR
1216}
1217
b5dd77c8
RR
1218static inline unsigned long group_faults_priv(struct numa_group *ng)
1219{
1220 unsigned long faults = 0;
1221 int node;
1222
1223 for_each_online_node(node) {
1224 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
1225 }
1226
1227 return faults;
1228}
1229
1230static inline unsigned long group_faults_shared(struct numa_group *ng)
1231{
1232 unsigned long faults = 0;
1233 int node;
1234
1235 for_each_online_node(node) {
1236 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
1237 }
1238
1239 return faults;
1240}
1241
4142c3eb
RR
1242/*
1243 * A node triggering more than 1/3 as many NUMA faults as the maximum is
1244 * considered part of a numa group's pseudo-interleaving set. Migrations
1245 * between these nodes are slowed down, to allow things to settle down.
1246 */
1247#define ACTIVE_NODE_FRACTION 3
1248
1249static bool numa_is_active_node(int nid, struct numa_group *ng)
1250{
1251 return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1252}
1253
6c6b1193
RR
1254/* Handle placement on systems where not all nodes are directly connected. */
1255static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1256 int maxdist, bool task)
1257{
1258 unsigned long score = 0;
1259 int node;
1260
1261 /*
1262 * All nodes are directly connected, and the same distance
1263 * from each other. No need for fancy placement algorithms.
1264 */
1265 if (sched_numa_topology_type == NUMA_DIRECT)
1266 return 0;
1267
1268 /*
1269 * This code is called for each node, introducing N^2 complexity,
1270 * which should be ok given the number of nodes rarely exceeds 8.
1271 */
1272 for_each_online_node(node) {
1273 unsigned long faults;
1274 int dist = node_distance(nid, node);
1275
1276 /*
1277 * The furthest away nodes in the system are not interesting
1278 * for placement; nid was already counted.
1279 */
1280 if (dist == sched_max_numa_distance || node == nid)
1281 continue;
1282
1283 /*
1284 * On systems with a backplane NUMA topology, compare groups
1285 * of nodes, and move tasks towards the group with the most
1286 * memory accesses. When comparing two nodes at distance
1287 * "hoplimit", only nodes closer by than "hoplimit" are part
1288 * of each group. Skip other nodes.
1289 */
1290 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1291 dist > maxdist)
1292 continue;
1293
1294 /* Add up the faults from nearby nodes. */
1295 if (task)
1296 faults = task_faults(p, node);
1297 else
1298 faults = group_faults(p, node);
1299
1300 /*
1301 * On systems with a glueless mesh NUMA topology, there are
1302 * no fixed "groups of nodes". Instead, nodes that are not
1303 * directly connected bounce traffic through intermediate
1304 * nodes; a numa_group can occupy any set of nodes.
1305 * The further away a node is, the less the faults count.
1306 * This seems to result in good task placement.
1307 */
1308 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1309 faults *= (sched_max_numa_distance - dist);
1310 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1311 }
1312
1313 score += faults;
1314 }
1315
1316 return score;
1317}
1318
83e1d2cd
MG
1319/*
1320 * These return the fraction of accesses done by a particular task, or
1321 * task group, on a particular numa node. The group weight is given a
1322 * larger multiplier, in order to group tasks together that are almost
1323 * evenly spread out between numa nodes.
1324 */
7bd95320
RR
1325static inline unsigned long task_weight(struct task_struct *p, int nid,
1326 int dist)
83e1d2cd 1327{
7bd95320 1328 unsigned long faults, total_faults;
83e1d2cd 1329
44dba3d5 1330 if (!p->numa_faults)
83e1d2cd
MG
1331 return 0;
1332
1333 total_faults = p->total_numa_faults;
1334
1335 if (!total_faults)
1336 return 0;
1337
7bd95320 1338 faults = task_faults(p, nid);
6c6b1193
RR
1339 faults += score_nearby_nodes(p, nid, dist, true);
1340
7bd95320 1341 return 1000 * faults / total_faults;
83e1d2cd
MG
1342}
1343
7bd95320
RR
1344static inline unsigned long group_weight(struct task_struct *p, int nid,
1345 int dist)
83e1d2cd 1346{
7bd95320
RR
1347 unsigned long faults, total_faults;
1348
1349 if (!p->numa_group)
1350 return 0;
1351
1352 total_faults = p->numa_group->total_faults;
1353
1354 if (!total_faults)
83e1d2cd
MG
1355 return 0;
1356
7bd95320 1357 faults = group_faults(p, nid);
6c6b1193
RR
1358 faults += score_nearby_nodes(p, nid, dist, false);
1359
7bd95320 1360 return 1000 * faults / total_faults;
83e1d2cd
MG
1361}
1362
10f39042
RR
1363bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1364 int src_nid, int dst_cpu)
1365{
1366 struct numa_group *ng = p->numa_group;
1367 int dst_nid = cpu_to_node(dst_cpu);
1368 int last_cpupid, this_cpupid;
1369
1370 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1371
1372 /*
1373 * Multi-stage node selection is used in conjunction with a periodic
1374 * migration fault to build a temporal task<->page relation. By using
1375 * a two-stage filter we remove short/unlikely relations.
1376 *
1377 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1378 * a task's usage of a particular page (n_p) per total usage of this
1379 * page (n_t) (in a given time-span) to a probability.
1380 *
1381 * Our periodic faults will sample this probability and getting the
1382 * same result twice in a row, given these samples are fully
1383 * independent, is then given by P(n)^2, provided our sample period
1384 * is sufficiently short compared to the usage pattern.
1385 *
1386 * This quadric squishes small probabilities, making it less likely we
1387 * act on an unlikely task<->page relation.
1388 */
1389 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1390 if (!cpupid_pid_unset(last_cpupid) &&
1391 cpupid_to_nid(last_cpupid) != dst_nid)
1392 return false;
1393
1394 /* Always allow migrate on private faults */
1395 if (cpupid_match_pid(p, last_cpupid))
1396 return true;
1397
1398 /* A shared fault, but p->numa_group has not been set up yet. */
1399 if (!ng)
1400 return true;
1401
1402 /*
4142c3eb
RR
1403 * Destination node is much more heavily used than the source
1404 * node? Allow migration.
10f39042 1405 */
4142c3eb
RR
1406 if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1407 ACTIVE_NODE_FRACTION)
10f39042
RR
1408 return true;
1409
1410 /*
4142c3eb
RR
1411 * Distribute memory according to CPU & memory use on each node,
1412 * with 3/4 hysteresis to avoid unnecessary memory migrations:
1413 *
1414 * faults_cpu(dst) 3 faults_cpu(src)
1415 * --------------- * - > ---------------
1416 * faults_mem(dst) 4 faults_mem(src)
10f39042 1417 */
4142c3eb
RR
1418 return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1419 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
10f39042
RR
1420}
1421
c7132dd6 1422static unsigned long weighted_cpuload(struct rq *rq);
58d081b5
MG
1423static unsigned long source_load(int cpu, int type);
1424static unsigned long target_load(int cpu, int type);
ced549fa 1425static unsigned long capacity_of(int cpu);
58d081b5 1426
fb13c7ee 1427/* Cached statistics for all CPUs within a node */
58d081b5 1428struct numa_stats {
fb13c7ee 1429 unsigned long nr_running;
58d081b5 1430 unsigned long load;
fb13c7ee
MG
1431
1432 /* Total compute capacity of CPUs on a node */
5ef20ca1 1433 unsigned long compute_capacity;
fb13c7ee
MG
1434
1435 /* Approximate capacity in terms of runnable tasks on a node */
5ef20ca1 1436 unsigned long task_capacity;
1b6a7495 1437 int has_free_capacity;
58d081b5 1438};
e6628d5b 1439
fb13c7ee
MG
1440/*
1441 * XXX borrowed from update_sg_lb_stats
1442 */
1443static void update_numa_stats(struct numa_stats *ns, int nid)
1444{
83d7f242
RR
1445 int smt, cpu, cpus = 0;
1446 unsigned long capacity;
fb13c7ee
MG
1447
1448 memset(ns, 0, sizeof(*ns));
1449 for_each_cpu(cpu, cpumask_of_node(nid)) {
1450 struct rq *rq = cpu_rq(cpu);
1451
1452 ns->nr_running += rq->nr_running;
c7132dd6 1453 ns->load += weighted_cpuload(rq);
ced549fa 1454 ns->compute_capacity += capacity_of(cpu);
5eca82a9
PZ
1455
1456 cpus++;
fb13c7ee
MG
1457 }
1458
5eca82a9
PZ
1459 /*
1460 * If we raced with hotplug and there are no CPUs left in our mask
1461 * the @ns structure is NULL'ed and task_numa_compare() will
1462 * not find this node attractive.
1463 *
1b6a7495
NP
1464 * We'll either bail at !has_free_capacity, or we'll detect a huge
1465 * imbalance and bail there.
5eca82a9
PZ
1466 */
1467 if (!cpus)
1468 return;
1469
83d7f242
RR
1470 /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1471 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1472 capacity = cpus / smt; /* cores */
1473
1474 ns->task_capacity = min_t(unsigned, capacity,
1475 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1b6a7495 1476 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
fb13c7ee
MG
1477}
1478
58d081b5
MG
1479struct task_numa_env {
1480 struct task_struct *p;
e6628d5b 1481
58d081b5
MG
1482 int src_cpu, src_nid;
1483 int dst_cpu, dst_nid;
e6628d5b 1484
58d081b5 1485 struct numa_stats src_stats, dst_stats;
e6628d5b 1486
40ea2b42 1487 int imbalance_pct;
7bd95320 1488 int dist;
fb13c7ee
MG
1489
1490 struct task_struct *best_task;
1491 long best_imp;
58d081b5
MG
1492 int best_cpu;
1493};
1494
fb13c7ee
MG
1495static void task_numa_assign(struct task_numa_env *env,
1496 struct task_struct *p, long imp)
1497{
1498 if (env->best_task)
1499 put_task_struct(env->best_task);
bac78573
ON
1500 if (p)
1501 get_task_struct(p);
fb13c7ee
MG
1502
1503 env->best_task = p;
1504 env->best_imp = imp;
1505 env->best_cpu = env->dst_cpu;
1506}
1507
28a21745 1508static bool load_too_imbalanced(long src_load, long dst_load,
e63da036
RR
1509 struct task_numa_env *env)
1510{
e4991b24
RR
1511 long imb, old_imb;
1512 long orig_src_load, orig_dst_load;
28a21745
RR
1513 long src_capacity, dst_capacity;
1514
1515 /*
1516 * The load is corrected for the CPU capacity available on each node.
1517 *
1518 * src_load dst_load
1519 * ------------ vs ---------
1520 * src_capacity dst_capacity
1521 */
1522 src_capacity = env->src_stats.compute_capacity;
1523 dst_capacity = env->dst_stats.compute_capacity;
e63da036
RR
1524
1525 /* We care about the slope of the imbalance, not the direction. */
e4991b24
RR
1526 if (dst_load < src_load)
1527 swap(dst_load, src_load);
e63da036
RR
1528
1529 /* Is the difference below the threshold? */
e4991b24
RR
1530 imb = dst_load * src_capacity * 100 -
1531 src_load * dst_capacity * env->imbalance_pct;
e63da036
RR
1532 if (imb <= 0)
1533 return false;
1534
1535 /*
1536 * The imbalance is above the allowed threshold.
e4991b24 1537 * Compare it with the old imbalance.
e63da036 1538 */
28a21745 1539 orig_src_load = env->src_stats.load;
e4991b24 1540 orig_dst_load = env->dst_stats.load;
28a21745 1541
e4991b24
RR
1542 if (orig_dst_load < orig_src_load)
1543 swap(orig_dst_load, orig_src_load);
e63da036 1544
e4991b24
RR
1545 old_imb = orig_dst_load * src_capacity * 100 -
1546 orig_src_load * dst_capacity * env->imbalance_pct;
1547
1548 /* Would this change make things worse? */
1549 return (imb > old_imb);
e63da036
RR
1550}
1551
fb13c7ee
MG
1552/*
1553 * This checks if the overall compute and NUMA accesses of the system would
1554 * be improved if the source tasks was migrated to the target dst_cpu taking
1555 * into account that it might be best if task running on the dst_cpu should
1556 * be exchanged with the source task
1557 */
887c290e
RR
1558static void task_numa_compare(struct task_numa_env *env,
1559 long taskimp, long groupimp)
fb13c7ee
MG
1560{
1561 struct rq *src_rq = cpu_rq(env->src_cpu);
1562 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1563 struct task_struct *cur;
28a21745 1564 long src_load, dst_load;
fb13c7ee 1565 long load;
1c5d3eb3 1566 long imp = env->p->numa_group ? groupimp : taskimp;
0132c3e1 1567 long moveimp = imp;
7bd95320 1568 int dist = env->dist;
fb13c7ee
MG
1569
1570 rcu_read_lock();
bac78573
ON
1571 cur = task_rcu_dereference(&dst_rq->curr);
1572 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
fb13c7ee
MG
1573 cur = NULL;
1574
7af68335
PZ
1575 /*
1576 * Because we have preemption enabled we can get migrated around and
1577 * end try selecting ourselves (current == env->p) as a swap candidate.
1578 */
1579 if (cur == env->p)
1580 goto unlock;
1581
fb13c7ee
MG
1582 /*
1583 * "imp" is the fault differential for the source task between the
1584 * source and destination node. Calculate the total differential for
1585 * the source task and potential destination task. The more negative
1586 * the value is, the more rmeote accesses that would be expected to
1587 * be incurred if the tasks were swapped.
1588 */
1589 if (cur) {
1590 /* Skip this swap candidate if cannot move to the source cpu */
0c98d344 1591 if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
fb13c7ee
MG
1592 goto unlock;
1593
887c290e
RR
1594 /*
1595 * If dst and source tasks are in the same NUMA group, or not
ca28aa53 1596 * in any group then look only at task weights.
887c290e 1597 */
ca28aa53 1598 if (cur->numa_group == env->p->numa_group) {
7bd95320
RR
1599 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1600 task_weight(cur, env->dst_nid, dist);
ca28aa53
RR
1601 /*
1602 * Add some hysteresis to prevent swapping the
1603 * tasks within a group over tiny differences.
1604 */
1605 if (cur->numa_group)
1606 imp -= imp/16;
887c290e 1607 } else {
ca28aa53
RR
1608 /*
1609 * Compare the group weights. If a task is all by
1610 * itself (not part of a group), use the task weight
1611 * instead.
1612 */
ca28aa53 1613 if (cur->numa_group)
7bd95320
RR
1614 imp += group_weight(cur, env->src_nid, dist) -
1615 group_weight(cur, env->dst_nid, dist);
ca28aa53 1616 else
7bd95320
RR
1617 imp += task_weight(cur, env->src_nid, dist) -
1618 task_weight(cur, env->dst_nid, dist);
887c290e 1619 }
fb13c7ee
MG
1620 }
1621
0132c3e1 1622 if (imp <= env->best_imp && moveimp <= env->best_imp)
fb13c7ee
MG
1623 goto unlock;
1624
1625 if (!cur) {
1626 /* Is there capacity at our destination? */
b932c03c 1627 if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1b6a7495 1628 !env->dst_stats.has_free_capacity)
fb13c7ee
MG
1629 goto unlock;
1630
1631 goto balance;
1632 }
1633
1634 /* Balance doesn't matter much if we're running a task per cpu */
0132c3e1
RR
1635 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1636 dst_rq->nr_running == 1)
fb13c7ee
MG
1637 goto assign;
1638
1639 /*
1640 * In the overloaded case, try and keep the load balanced.
1641 */
1642balance:
e720fff6
PZ
1643 load = task_h_load(env->p);
1644 dst_load = env->dst_stats.load + load;
1645 src_load = env->src_stats.load - load;
fb13c7ee 1646
0132c3e1
RR
1647 if (moveimp > imp && moveimp > env->best_imp) {
1648 /*
1649 * If the improvement from just moving env->p direction is
1650 * better than swapping tasks around, check if a move is
1651 * possible. Store a slightly smaller score than moveimp,
1652 * so an actually idle CPU will win.
1653 */
1654 if (!load_too_imbalanced(src_load, dst_load, env)) {
1655 imp = moveimp - 1;
1656 cur = NULL;
1657 goto assign;
1658 }
1659 }
1660
1661 if (imp <= env->best_imp)
1662 goto unlock;
1663
fb13c7ee 1664 if (cur) {
e720fff6
PZ
1665 load = task_h_load(cur);
1666 dst_load -= load;
1667 src_load += load;
fb13c7ee
MG
1668 }
1669
28a21745 1670 if (load_too_imbalanced(src_load, dst_load, env))
fb13c7ee
MG
1671 goto unlock;
1672
ba7e5a27
RR
1673 /*
1674 * One idle CPU per node is evaluated for a task numa move.
1675 * Call select_idle_sibling to maybe find a better one.
1676 */
10e2f1ac
PZ
1677 if (!cur) {
1678 /*
1679 * select_idle_siblings() uses an per-cpu cpumask that
1680 * can be used from IRQ context.
1681 */
1682 local_irq_disable();
772bd008
MR
1683 env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1684 env->dst_cpu);
10e2f1ac
PZ
1685 local_irq_enable();
1686 }
ba7e5a27 1687
fb13c7ee
MG
1688assign:
1689 task_numa_assign(env, cur, imp);
1690unlock:
1691 rcu_read_unlock();
1692}
1693
887c290e
RR
1694static void task_numa_find_cpu(struct task_numa_env *env,
1695 long taskimp, long groupimp)
2c8a50aa
MG
1696{
1697 int cpu;
1698
1699 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1700 /* Skip this CPU if the source task cannot migrate */
0c98d344 1701 if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
2c8a50aa
MG
1702 continue;
1703
1704 env->dst_cpu = cpu;
887c290e 1705 task_numa_compare(env, taskimp, groupimp);
2c8a50aa
MG
1706 }
1707}
1708
6f9aad0b
RR
1709/* Only move tasks to a NUMA node less busy than the current node. */
1710static bool numa_has_capacity(struct task_numa_env *env)
1711{
1712 struct numa_stats *src = &env->src_stats;
1713 struct numa_stats *dst = &env->dst_stats;
1714
1715 if (src->has_free_capacity && !dst->has_free_capacity)
1716 return false;
1717
1718 /*
1719 * Only consider a task move if the source has a higher load
1720 * than the destination, corrected for CPU capacity on each node.
1721 *
1722 * src->load dst->load
1723 * --------------------- vs ---------------------
1724 * src->compute_capacity dst->compute_capacity
1725 */
44dcb04f
SD
1726 if (src->load * dst->compute_capacity * env->imbalance_pct >
1727
1728 dst->load * src->compute_capacity * 100)
6f9aad0b
RR
1729 return true;
1730
1731 return false;
1732}
1733
58d081b5
MG
1734static int task_numa_migrate(struct task_struct *p)
1735{
58d081b5
MG
1736 struct task_numa_env env = {
1737 .p = p,
fb13c7ee 1738
58d081b5 1739 .src_cpu = task_cpu(p),
b32e86b4 1740 .src_nid = task_node(p),
fb13c7ee
MG
1741
1742 .imbalance_pct = 112,
1743
1744 .best_task = NULL,
1745 .best_imp = 0,
4142c3eb 1746 .best_cpu = -1,
58d081b5
MG
1747 };
1748 struct sched_domain *sd;
887c290e 1749 unsigned long taskweight, groupweight;
7bd95320 1750 int nid, ret, dist;
887c290e 1751 long taskimp, groupimp;
e6628d5b 1752
58d081b5 1753 /*
fb13c7ee
MG
1754 * Pick the lowest SD_NUMA domain, as that would have the smallest
1755 * imbalance and would be the first to start moving tasks about.
1756 *
1757 * And we want to avoid any moving of tasks about, as that would create
1758 * random movement of tasks -- counter the numa conditions we're trying
1759 * to satisfy here.
58d081b5
MG
1760 */
1761 rcu_read_lock();
fb13c7ee 1762 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
46a73e8a
RR
1763 if (sd)
1764 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
e6628d5b
MG
1765 rcu_read_unlock();
1766
46a73e8a
RR
1767 /*
1768 * Cpusets can break the scheduler domain tree into smaller
1769 * balance domains, some of which do not cross NUMA boundaries.
1770 * Tasks that are "trapped" in such domains cannot be migrated
1771 * elsewhere, so there is no point in (re)trying.
1772 */
1773 if (unlikely(!sd)) {
de1b301a 1774 p->numa_preferred_nid = task_node(p);
46a73e8a
RR
1775 return -EINVAL;
1776 }
1777
2c8a50aa 1778 env.dst_nid = p->numa_preferred_nid;
7bd95320
RR
1779 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1780 taskweight = task_weight(p, env.src_nid, dist);
1781 groupweight = group_weight(p, env.src_nid, dist);
1782 update_numa_stats(&env.src_stats, env.src_nid);
1783 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1784 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
2c8a50aa 1785 update_numa_stats(&env.dst_stats, env.dst_nid);
58d081b5 1786
a43455a1 1787 /* Try to find a spot on the preferred nid. */
6f9aad0b
RR
1788 if (numa_has_capacity(&env))
1789 task_numa_find_cpu(&env, taskimp, groupimp);
e1dda8a7 1790
9de05d48
RR
1791 /*
1792 * Look at other nodes in these cases:
1793 * - there is no space available on the preferred_nid
1794 * - the task is part of a numa_group that is interleaved across
1795 * multiple NUMA nodes; in order to better consolidate the group,
1796 * we need to check other locations.
1797 */
4142c3eb 1798 if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
2c8a50aa
MG
1799 for_each_online_node(nid) {
1800 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1801 continue;
58d081b5 1802
7bd95320 1803 dist = node_distance(env.src_nid, env.dst_nid);
6c6b1193
RR
1804 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1805 dist != env.dist) {
1806 taskweight = task_weight(p, env.src_nid, dist);
1807 groupweight = group_weight(p, env.src_nid, dist);
1808 }
7bd95320 1809
83e1d2cd 1810 /* Only consider nodes where both task and groups benefit */
7bd95320
RR
1811 taskimp = task_weight(p, nid, dist) - taskweight;
1812 groupimp = group_weight(p, nid, dist) - groupweight;
887c290e 1813 if (taskimp < 0 && groupimp < 0)
fb13c7ee
MG
1814 continue;
1815
7bd95320 1816 env.dist = dist;
2c8a50aa
MG
1817 env.dst_nid = nid;
1818 update_numa_stats(&env.dst_stats, env.dst_nid);
6f9aad0b
RR
1819 if (numa_has_capacity(&env))
1820 task_numa_find_cpu(&env, taskimp, groupimp);
58d081b5
MG
1821 }
1822 }
1823
68d1b02a
RR
1824 /*
1825 * If the task is part of a workload that spans multiple NUMA nodes,
1826 * and is migrating into one of the workload's active nodes, remember
1827 * this node as the task's preferred numa node, so the workload can
1828 * settle down.
1829 * A task that migrated to a second choice node will be better off
1830 * trying for a better one later. Do not set the preferred node here.
1831 */
db015dae 1832 if (p->numa_group) {
4142c3eb
RR
1833 struct numa_group *ng = p->numa_group;
1834
db015dae
RR
1835 if (env.best_cpu == -1)
1836 nid = env.src_nid;
1837 else
1838 nid = env.dst_nid;
1839
4142c3eb 1840 if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
db015dae
RR
1841 sched_setnuma(p, env.dst_nid);
1842 }
1843
1844 /* No better CPU than the current one was found. */
1845 if (env.best_cpu == -1)
1846 return -EAGAIN;
0ec8aa00 1847
04bb2f94
RR
1848 /*
1849 * Reset the scan period if the task is being rescheduled on an
1850 * alternative node to recheck if the tasks is now properly placed.
1851 */
b5dd77c8 1852 p->numa_scan_period = task_scan_start(p);
04bb2f94 1853
fb13c7ee 1854 if (env.best_task == NULL) {
286549dc
MG
1855 ret = migrate_task_to(p, env.best_cpu);
1856 if (ret != 0)
1857 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
fb13c7ee
MG
1858 return ret;
1859 }
1860
1861 ret = migrate_swap(p, env.best_task);
286549dc
MG
1862 if (ret != 0)
1863 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
fb13c7ee
MG
1864 put_task_struct(env.best_task);
1865 return ret;
e6628d5b
MG
1866}
1867
6b9a7460
MG
1868/* Attempt to migrate a task to a CPU on the preferred node. */
1869static void numa_migrate_preferred(struct task_struct *p)
1870{
5085e2a3 1871 unsigned long interval = HZ;
7347fc87 1872 unsigned long numa_migrate_retry;
5085e2a3 1873
2739d3ee 1874 /* This task has no NUMA fault statistics yet */
44dba3d5 1875 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
6b9a7460
MG
1876 return;
1877
2739d3ee 1878 /* Periodically retry migrating the task to the preferred node */
5085e2a3 1879 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
7347fc87
MG
1880 numa_migrate_retry = jiffies + interval;
1881
1882 /*
1883 * Check that the new retry threshold is after the current one. If
1884 * the retry is in the future, it implies that wake_affine has
1885 * temporarily asked NUMA balancing to backoff from placement.
1886 */
1887 if (numa_migrate_retry > p->numa_migrate_retry)
1888 return;
1889
1890 /* Safe to try placing the task on the preferred node */
1891 p->numa_migrate_retry = numa_migrate_retry;
2739d3ee
RR
1892
1893 /* Success if task is already running on preferred CPU */
de1b301a 1894 if (task_node(p) == p->numa_preferred_nid)
6b9a7460
MG
1895 return;
1896
1897 /* Otherwise, try migrate to a CPU on the preferred node */
2739d3ee 1898 task_numa_migrate(p);
6b9a7460
MG
1899}
1900
20e07dea 1901/*
4142c3eb 1902 * Find out how many nodes on the workload is actively running on. Do this by
20e07dea
RR
1903 * tracking the nodes from which NUMA hinting faults are triggered. This can
1904 * be different from the set of nodes where the workload's memory is currently
1905 * located.
20e07dea 1906 */
4142c3eb 1907static void numa_group_count_active_nodes(struct numa_group *numa_group)
20e07dea
RR
1908{
1909 unsigned long faults, max_faults = 0;
4142c3eb 1910 int nid, active_nodes = 0;
20e07dea
RR
1911
1912 for_each_online_node(nid) {
1913 faults = group_faults_cpu(numa_group, nid);
1914 if (faults > max_faults)
1915 max_faults = faults;
1916 }
1917
1918 for_each_online_node(nid) {
1919 faults = group_faults_cpu(numa_group, nid);
4142c3eb
RR
1920 if (faults * ACTIVE_NODE_FRACTION > max_faults)
1921 active_nodes++;
20e07dea 1922 }
4142c3eb
RR
1923
1924 numa_group->max_faults_cpu = max_faults;
1925 numa_group->active_nodes = active_nodes;
20e07dea
RR
1926}
1927
04bb2f94
RR
1928/*
1929 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1930 * increments. The more local the fault statistics are, the higher the scan
a22b4b01
RR
1931 * period will be for the next scan window. If local/(local+remote) ratio is
1932 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
1933 * the scan period will decrease. Aim for 70% local accesses.
04bb2f94
RR
1934 */
1935#define NUMA_PERIOD_SLOTS 10
a22b4b01 1936#define NUMA_PERIOD_THRESHOLD 7
04bb2f94
RR
1937
1938/*
1939 * Increase the scan period (slow down scanning) if the majority of
1940 * our memory is already on our local node, or if the majority of
1941 * the page accesses are shared with other processes.
1942 * Otherwise, decrease the scan period.
1943 */
1944static void update_task_scan_period(struct task_struct *p,
1945 unsigned long shared, unsigned long private)
1946{
1947 unsigned int period_slot;
37ec97de 1948 int lr_ratio, ps_ratio;
04bb2f94
RR
1949 int diff;
1950
1951 unsigned long remote = p->numa_faults_locality[0];
1952 unsigned long local = p->numa_faults_locality[1];
1953
1954 /*
1955 * If there were no record hinting faults then either the task is
1956 * completely idle or all activity is areas that are not of interest
074c2381
MG
1957 * to automatic numa balancing. Related to that, if there were failed
1958 * migration then it implies we are migrating too quickly or the local
1959 * node is overloaded. In either case, scan slower
04bb2f94 1960 */
074c2381 1961 if (local + shared == 0 || p->numa_faults_locality[2]) {
04bb2f94
RR
1962 p->numa_scan_period = min(p->numa_scan_period_max,
1963 p->numa_scan_period << 1);
1964
1965 p->mm->numa_next_scan = jiffies +
1966 msecs_to_jiffies(p->numa_scan_period);
1967
1968 return;
1969 }
1970
1971 /*
1972 * Prepare to scale scan period relative to the current period.
1973 * == NUMA_PERIOD_THRESHOLD scan period stays the same
1974 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1975 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1976 */
1977 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
37ec97de
RR
1978 lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1979 ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
1980
1981 if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
1982 /*
1983 * Most memory accesses are local. There is no need to
1984 * do fast NUMA scanning, since memory is already local.
1985 */
1986 int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
1987 if (!slot)
1988 slot = 1;
1989 diff = slot * period_slot;
1990 } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
1991 /*
1992 * Most memory accesses are shared with other tasks.
1993 * There is no point in continuing fast NUMA scanning,
1994 * since other tasks may just move the memory elsewhere.
1995 */
1996 int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
04bb2f94
RR
1997 if (!slot)
1998 slot = 1;
1999 diff = slot * period_slot;
2000 } else {
04bb2f94 2001 /*
37ec97de
RR
2002 * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
2003 * yet they are not on the local NUMA node. Speed up
2004 * NUMA scanning to get the memory moved over.
04bb2f94 2005 */
37ec97de
RR
2006 int ratio = max(lr_ratio, ps_ratio);
2007 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
04bb2f94
RR
2008 }
2009
2010 p->numa_scan_period = clamp(p->numa_scan_period + diff,
2011 task_scan_min(p), task_scan_max(p));
2012 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2013}
2014
7e2703e6
RR
2015/*
2016 * Get the fraction of time the task has been running since the last
2017 * NUMA placement cycle. The scheduler keeps similar statistics, but
2018 * decays those on a 32ms period, which is orders of magnitude off
2019 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
2020 * stats only if the task is so new there are no NUMA statistics yet.
2021 */
2022static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2023{
2024 u64 runtime, delta, now;
2025 /* Use the start of this time slice to avoid calculations. */
2026 now = p->se.exec_start;
2027 runtime = p->se.sum_exec_runtime;
2028
2029 if (p->last_task_numa_placement) {
2030 delta = runtime - p->last_sum_exec_runtime;
2031 *period = now - p->last_task_numa_placement;
2032 } else {
c7b50216 2033 delta = p->se.avg.load_sum;
9d89c257 2034 *period = LOAD_AVG_MAX;
7e2703e6
RR
2035 }
2036
2037 p->last_sum_exec_runtime = runtime;
2038 p->last_task_numa_placement = now;
2039
2040 return delta;
2041}
2042
54009416
RR
2043/*
2044 * Determine the preferred nid for a task in a numa_group. This needs to
2045 * be done in a way that produces consistent results with group_weight,
2046 * otherwise workloads might not converge.
2047 */
2048static int preferred_group_nid(struct task_struct *p, int nid)
2049{
2050 nodemask_t nodes;
2051 int dist;
2052
2053 /* Direct connections between all NUMA nodes. */
2054 if (sched_numa_topology_type == NUMA_DIRECT)
2055 return nid;
2056
2057 /*
2058 * On a system with glueless mesh NUMA topology, group_weight
2059 * scores nodes according to the number of NUMA hinting faults on
2060 * both the node itself, and on nearby nodes.
2061 */
2062 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
2063 unsigned long score, max_score = 0;
2064 int node, max_node = nid;
2065
2066 dist = sched_max_numa_distance;
2067
2068 for_each_online_node(node) {
2069 score = group_weight(p, node, dist);
2070 if (score > max_score) {
2071 max_score = score;
2072 max_node = node;
2073 }
2074 }
2075 return max_node;
2076 }
2077
2078 /*
2079 * Finding the preferred nid in a system with NUMA backplane
2080 * interconnect topology is more involved. The goal is to locate
2081 * tasks from numa_groups near each other in the system, and
2082 * untangle workloads from different sides of the system. This requires
2083 * searching down the hierarchy of node groups, recursively searching
2084 * inside the highest scoring group of nodes. The nodemask tricks
2085 * keep the complexity of the search down.
2086 */
2087 nodes = node_online_map;
2088 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
2089 unsigned long max_faults = 0;
81907478 2090 nodemask_t max_group = NODE_MASK_NONE;
54009416
RR
2091 int a, b;
2092
2093 /* Are there nodes at this distance from each other? */
2094 if (!find_numa_distance(dist))
2095 continue;
2096
2097 for_each_node_mask(a, nodes) {
2098 unsigned long faults = 0;
2099 nodemask_t this_group;
2100 nodes_clear(this_group);
2101
2102 /* Sum group's NUMA faults; includes a==b case. */
2103 for_each_node_mask(b, nodes) {
2104 if (node_distance(a, b) < dist) {
2105 faults += group_faults(p, b);
2106 node_set(b, this_group);
2107 node_clear(b, nodes);
2108 }
2109 }
2110
2111 /* Remember the top group. */
2112 if (faults > max_faults) {
2113 max_faults = faults;
2114 max_group = this_group;
2115 /*
2116 * subtle: at the smallest distance there is
2117 * just one node left in each "group", the
2118 * winner is the preferred nid.
2119 */
2120 nid = a;
2121 }
2122 }
2123 /* Next round, evaluate the nodes within max_group. */
890a5409
JB
2124 if (!max_faults)
2125 break;
54009416
RR
2126 nodes = max_group;
2127 }
2128 return nid;
2129}
2130
cbee9f88
PZ
2131static void task_numa_placement(struct task_struct *p)
2132{
83e1d2cd
MG
2133 int seq, nid, max_nid = -1, max_group_nid = -1;
2134 unsigned long max_faults = 0, max_group_faults = 0;
04bb2f94 2135 unsigned long fault_types[2] = { 0, 0 };
7e2703e6
RR
2136 unsigned long total_faults;
2137 u64 runtime, period;
7dbd13ed 2138 spinlock_t *group_lock = NULL;
cbee9f88 2139
7e5a2c17
JL
2140 /*
2141 * The p->mm->numa_scan_seq field gets updated without
2142 * exclusive access. Use READ_ONCE() here to ensure
2143 * that the field is read in a single access:
2144 */
316c1608 2145 seq = READ_ONCE(p->mm->numa_scan_seq);
cbee9f88
PZ
2146 if (p->numa_scan_seq == seq)
2147 return;
2148 p->numa_scan_seq = seq;
598f0ec0 2149 p->numa_scan_period_max = task_scan_max(p);
cbee9f88 2150
7e2703e6
RR
2151 total_faults = p->numa_faults_locality[0] +
2152 p->numa_faults_locality[1];
2153 runtime = numa_get_avg_runtime(p, &period);
2154
7dbd13ed
MG
2155 /* If the task is part of a group prevent parallel updates to group stats */
2156 if (p->numa_group) {
2157 group_lock = &p->numa_group->lock;
60e69eed 2158 spin_lock_irq(group_lock);
7dbd13ed
MG
2159 }
2160
688b7585
MG
2161 /* Find the node with the highest number of faults */
2162 for_each_online_node(nid) {
44dba3d5
IM
2163 /* Keep track of the offsets in numa_faults array */
2164 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
83e1d2cd 2165 unsigned long faults = 0, group_faults = 0;
44dba3d5 2166 int priv;
745d6147 2167
be1e4e76 2168 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
7e2703e6 2169 long diff, f_diff, f_weight;
8c8a743c 2170
44dba3d5
IM
2171 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2172 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2173 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2174 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
745d6147 2175
ac8e895b 2176 /* Decay existing window, copy faults since last scan */
44dba3d5
IM
2177 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2178 fault_types[priv] += p->numa_faults[membuf_idx];
2179 p->numa_faults[membuf_idx] = 0;
fb13c7ee 2180
7e2703e6
RR
2181 /*
2182 * Normalize the faults_from, so all tasks in a group
2183 * count according to CPU use, instead of by the raw
2184 * number of faults. Tasks with little runtime have
2185 * little over-all impact on throughput, and thus their
2186 * faults are less important.
2187 */
2188 f_weight = div64_u64(runtime << 16, period + 1);
44dba3d5 2189 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
7e2703e6 2190 (total_faults + 1);
44dba3d5
IM
2191 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2192 p->numa_faults[cpubuf_idx] = 0;
50ec8a40 2193
44dba3d5
IM
2194 p->numa_faults[mem_idx] += diff;
2195 p->numa_faults[cpu_idx] += f_diff;
2196 faults += p->numa_faults[mem_idx];
83e1d2cd 2197 p->total_numa_faults += diff;
8c8a743c 2198 if (p->numa_group) {
44dba3d5
IM
2199 /*
2200 * safe because we can only change our own group
2201 *
2202 * mem_idx represents the offset for a given
2203 * nid and priv in a specific region because it
2204 * is at the beginning of the numa_faults array.
2205 */
2206 p->numa_group->faults[mem_idx] += diff;
2207 p->numa_group->faults_cpu[mem_idx] += f_diff;
989348b5 2208 p->numa_group->total_faults += diff;
44dba3d5 2209 group_faults += p->numa_group->faults[mem_idx];
8c8a743c 2210 }
ac8e895b
MG
2211 }
2212
688b7585
MG
2213 if (faults > max_faults) {
2214 max_faults = faults;
2215 max_nid = nid;
2216 }
83e1d2cd
MG
2217
2218 if (group_faults > max_group_faults) {
2219 max_group_faults = group_faults;
2220 max_group_nid = nid;
2221 }
2222 }
2223
04bb2f94
RR
2224 update_task_scan_period(p, fault_types[0], fault_types[1]);
2225
7dbd13ed 2226 if (p->numa_group) {
4142c3eb 2227 numa_group_count_active_nodes(p->numa_group);
60e69eed 2228 spin_unlock_irq(group_lock);
54009416 2229 max_nid = preferred_group_nid(p, max_group_nid);
688b7585
MG
2230 }
2231
bb97fc31
RR
2232 if (max_faults) {
2233 /* Set the new preferred node */
2234 if (max_nid != p->numa_preferred_nid)
2235 sched_setnuma(p, max_nid);
2236
2237 if (task_node(p) != p->numa_preferred_nid)
2238 numa_migrate_preferred(p);
3a7053b3 2239 }
cbee9f88
PZ
2240}
2241
8c8a743c
PZ
2242static inline int get_numa_group(struct numa_group *grp)
2243{
2244 return atomic_inc_not_zero(&grp->refcount);
2245}
2246
2247static inline void put_numa_group(struct numa_group *grp)
2248{
2249 if (atomic_dec_and_test(&grp->refcount))
2250 kfree_rcu(grp, rcu);
2251}
2252
3e6a9418
MG
2253static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2254 int *priv)
8c8a743c
PZ
2255{
2256 struct numa_group *grp, *my_grp;
2257 struct task_struct *tsk;
2258 bool join = false;
2259 int cpu = cpupid_to_cpu(cpupid);
2260 int i;
2261
2262 if (unlikely(!p->numa_group)) {
2263 unsigned int size = sizeof(struct numa_group) +
50ec8a40 2264 4*nr_node_ids*sizeof(unsigned long);
8c8a743c
PZ
2265
2266 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2267 if (!grp)
2268 return;
2269
2270 atomic_set(&grp->refcount, 1);
4142c3eb
RR
2271 grp->active_nodes = 1;
2272 grp->max_faults_cpu = 0;
8c8a743c 2273 spin_lock_init(&grp->lock);
e29cf08b 2274 grp->gid = p->pid;
50ec8a40 2275 /* Second half of the array tracks nids where faults happen */
be1e4e76
RR
2276 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2277 nr_node_ids;
8c8a743c 2278
be1e4e76 2279 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
44dba3d5 2280 grp->faults[i] = p->numa_faults[i];
8c8a743c 2281
989348b5 2282 grp->total_faults = p->total_numa_faults;
83e1d2cd 2283
8c8a743c
PZ
2284 grp->nr_tasks++;
2285 rcu_assign_pointer(p->numa_group, grp);
2286 }
2287
2288 rcu_read_lock();
316c1608 2289 tsk = READ_ONCE(cpu_rq(cpu)->curr);
8c8a743c
PZ
2290
2291 if (!cpupid_match_pid(tsk, cpupid))
3354781a 2292 goto no_join;
8c8a743c
PZ
2293
2294 grp = rcu_dereference(tsk->numa_group);
2295 if (!grp)
3354781a 2296 goto no_join;
8c8a743c
PZ
2297
2298 my_grp = p->numa_group;
2299 if (grp == my_grp)
3354781a 2300 goto no_join;
8c8a743c
PZ
2301
2302 /*
2303 * Only join the other group if its bigger; if we're the bigger group,
2304 * the other task will join us.
2305 */
2306 if (my_grp->nr_tasks > grp->nr_tasks)
3354781a 2307 goto no_join;
8c8a743c
PZ
2308
2309 /*
2310 * Tie-break on the grp address.
2311 */
2312 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
3354781a 2313 goto no_join;
8c8a743c 2314
dabe1d99
RR
2315 /* Always join threads in the same process. */
2316 if (tsk->mm == current->mm)
2317 join = true;
2318
2319 /* Simple filter to avoid false positives due to PID collisions */
2320 if (flags & TNF_SHARED)
2321 join = true;
8c8a743c 2322
3e6a9418
MG
2323 /* Update priv based on whether false sharing was detected */
2324 *priv = !join;
2325
dabe1d99 2326 if (join && !get_numa_group(grp))
3354781a 2327 goto no_join;
8c8a743c 2328
8c8a743c
PZ
2329 rcu_read_unlock();
2330
2331 if (!join)
2332 return;
2333
60e69eed
MG
2334 BUG_ON(irqs_disabled());
2335 double_lock_irq(&my_grp->lock, &grp->lock);
989348b5 2336
be1e4e76 2337 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
44dba3d5
IM
2338 my_grp->faults[i] -= p->numa_faults[i];
2339 grp->faults[i] += p->numa_faults[i];
8c8a743c 2340 }
989348b5
MG
2341 my_grp->total_faults -= p->total_numa_faults;
2342 grp->total_faults += p->total_numa_faults;
8c8a743c 2343
8c8a743c
PZ
2344 my_grp->nr_tasks--;
2345 grp->nr_tasks++;
2346
2347 spin_unlock(&my_grp->lock);
60e69eed 2348 spin_unlock_irq(&grp->lock);
8c8a743c
PZ
2349
2350 rcu_assign_pointer(p->numa_group, grp);
2351
2352 put_numa_group(my_grp);
3354781a
PZ
2353 return;
2354
2355no_join:
2356 rcu_read_unlock();
2357 return;
8c8a743c
PZ
2358}
2359
2360void task_numa_free(struct task_struct *p)
2361{
2362 struct numa_group *grp = p->numa_group;
44dba3d5 2363 void *numa_faults = p->numa_faults;
e9dd685c
SR
2364 unsigned long flags;
2365 int i;
8c8a743c
PZ
2366
2367 if (grp) {
e9dd685c 2368 spin_lock_irqsave(&grp->lock, flags);
be1e4e76 2369 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
44dba3d5 2370 grp->faults[i] -= p->numa_faults[i];
989348b5 2371 grp->total_faults -= p->total_numa_faults;
83e1d2cd 2372
8c8a743c 2373 grp->nr_tasks--;
e9dd685c 2374 spin_unlock_irqrestore(&grp->lock, flags);
35b123e2 2375 RCU_INIT_POINTER(p->numa_group, NULL);
8c8a743c
PZ
2376 put_numa_group(grp);
2377 }
2378
44dba3d5 2379 p->numa_faults = NULL;
82727018 2380 kfree(numa_faults);
8c8a743c
PZ
2381}
2382
cbee9f88
PZ
2383/*
2384 * Got a PROT_NONE fault for a page on @node.
2385 */
58b46da3 2386void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
cbee9f88
PZ
2387{
2388 struct task_struct *p = current;
6688cc05 2389 bool migrated = flags & TNF_MIGRATED;
58b46da3 2390 int cpu_node = task_node(current);
792568ec 2391 int local = !!(flags & TNF_FAULT_LOCAL);
4142c3eb 2392 struct numa_group *ng;
ac8e895b 2393 int priv;
cbee9f88 2394
2a595721 2395 if (!static_branch_likely(&sched_numa_balancing))
1a687c2e
MG
2396 return;
2397
9ff1d9ff
MG
2398 /* for example, ksmd faulting in a user's mm */
2399 if (!p->mm)
2400 return;
2401
f809ca9a 2402 /* Allocate buffer to track faults on a per-node basis */
44dba3d5
IM
2403 if (unlikely(!p->numa_faults)) {
2404 int size = sizeof(*p->numa_faults) *
be1e4e76 2405 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
f809ca9a 2406
44dba3d5
IM
2407 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2408 if (!p->numa_faults)
f809ca9a 2409 return;
745d6147 2410
83e1d2cd 2411 p->total_numa_faults = 0;
04bb2f94 2412 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
f809ca9a 2413 }
cbee9f88 2414
8c8a743c
PZ
2415 /*
2416 * First accesses are treated as private, otherwise consider accesses
2417 * to be private if the accessing pid has not changed
2418 */
2419 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2420 priv = 1;
2421 } else {
2422 priv = cpupid_match_pid(p, last_cpupid);
6688cc05 2423 if (!priv && !(flags & TNF_NO_GROUP))
3e6a9418 2424 task_numa_group(p, last_cpupid, flags, &priv);
8c8a743c
PZ
2425 }
2426
792568ec
RR
2427 /*
2428 * If a workload spans multiple NUMA nodes, a shared fault that
2429 * occurs wholly within the set of nodes that the workload is
2430 * actively using should be counted as local. This allows the
2431 * scan rate to slow down when a workload has settled down.
2432 */
4142c3eb
RR
2433 ng = p->numa_group;
2434 if (!priv && !local && ng && ng->active_nodes > 1 &&
2435 numa_is_active_node(cpu_node, ng) &&
2436 numa_is_active_node(mem_node, ng))
792568ec
RR
2437 local = 1;
2438
cbee9f88 2439 task_numa_placement(p);
f809ca9a 2440
2739d3ee
RR
2441 /*
2442 * Retry task to preferred node migration periodically, in case it
2443 * case it previously failed, or the scheduler moved us.
2444 */
2445 if (time_after(jiffies, p->numa_migrate_retry))
6b9a7460
MG
2446 numa_migrate_preferred(p);
2447
b32e86b4
IM
2448 if (migrated)
2449 p->numa_pages_migrated += pages;
074c2381
MG
2450 if (flags & TNF_MIGRATE_FAIL)
2451 p->numa_faults_locality[2] += pages;
b32e86b4 2452
44dba3d5
IM
2453 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2454 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
792568ec 2455 p->numa_faults_locality[local] += pages;
cbee9f88
PZ
2456}
2457
6e5fb223
PZ
2458static void reset_ptenuma_scan(struct task_struct *p)
2459{
7e5a2c17
JL
2460 /*
2461 * We only did a read acquisition of the mmap sem, so
2462 * p->mm->numa_scan_seq is written to without exclusive access
2463 * and the update is not guaranteed to be atomic. That's not
2464 * much of an issue though, since this is just used for
2465 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2466 * expensive, to avoid any form of compiler optimizations:
2467 */
316c1608 2468 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
6e5fb223
PZ
2469 p->mm->numa_scan_offset = 0;
2470}
2471
cbee9f88
PZ
2472/*
2473 * The expensive part of numa migration is done from task_work context.
2474 * Triggered from task_tick_numa().
2475 */
2476void task_numa_work(struct callback_head *work)
2477{
2478 unsigned long migrate, next_scan, now = jiffies;
2479 struct task_struct *p = current;
2480 struct mm_struct *mm = p->mm;
51170840 2481 u64 runtime = p->se.sum_exec_runtime;
6e5fb223 2482 struct vm_area_struct *vma;
9f40604c 2483 unsigned long start, end;
598f0ec0 2484 unsigned long nr_pte_updates = 0;
4620f8c1 2485 long pages, virtpages;
cbee9f88 2486
9148a3a1 2487 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
cbee9f88
PZ
2488
2489 work->next = work; /* protect against double add */
2490 /*
2491 * Who cares about NUMA placement when they're dying.
2492 *
2493 * NOTE: make sure not to dereference p->mm before this check,
2494 * exit_task_work() happens _after_ exit_mm() so we could be called
2495 * without p->mm even though we still had it when we enqueued this
2496 * work.
2497 */
2498 if (p->flags & PF_EXITING)
2499 return;
2500
930aa174 2501 if (!mm->numa_next_scan) {
7e8d16b6
MG
2502 mm->numa_next_scan = now +
2503 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
b8593bfd
MG
2504 }
2505
cbee9f88
PZ
2506 /*
2507 * Enforce maximal scan/migration frequency..
2508 */
2509 migrate = mm->numa_next_scan;
2510 if (time_before(now, migrate))
2511 return;
2512
598f0ec0
MG
2513 if (p->numa_scan_period == 0) {
2514 p->numa_scan_period_max = task_scan_max(p);
b5dd77c8 2515 p->numa_scan_period = task_scan_start(p);
598f0ec0 2516 }
cbee9f88 2517
fb003b80 2518 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
cbee9f88
PZ
2519 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2520 return;
2521
19a78d11
PZ
2522 /*
2523 * Delay this task enough that another task of this mm will likely win
2524 * the next time around.
2525 */
2526 p->node_stamp += 2 * TICK_NSEC;
2527
9f40604c
MG
2528 start = mm->numa_scan_offset;
2529 pages = sysctl_numa_balancing_scan_size;
2530 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
4620f8c1 2531 virtpages = pages * 8; /* Scan up to this much virtual space */
9f40604c
MG
2532 if (!pages)
2533 return;
cbee9f88 2534
4620f8c1 2535
8655d549
VB
2536 if (!down_read_trylock(&mm->mmap_sem))
2537 return;
9f40604c 2538 vma = find_vma(mm, start);
6e5fb223
PZ
2539 if (!vma) {
2540 reset_ptenuma_scan(p);
9f40604c 2541 start = 0;
6e5fb223
PZ
2542 vma = mm->mmap;
2543 }
9f40604c 2544 for (; vma; vma = vma->vm_next) {
6b79c57b 2545 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
8e76d4ee 2546 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
6e5fb223 2547 continue;
6b79c57b 2548 }
6e5fb223 2549
4591ce4f
MG
2550 /*
2551 * Shared library pages mapped by multiple processes are not
2552 * migrated as it is expected they are cache replicated. Avoid
2553 * hinting faults in read-only file-backed mappings or the vdso
2554 * as migrating the pages will be of marginal benefit.
2555 */
2556 if (!vma->vm_mm ||
2557 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2558 continue;
2559
3c67f474
MG
2560 /*
2561 * Skip inaccessible VMAs to avoid any confusion between
2562 * PROT_NONE and NUMA hinting ptes
2563 */
2564 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2565 continue;
4591ce4f 2566
9f40604c
MG
2567 do {
2568 start = max(start, vma->vm_start);
2569 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2570 end = min(end, vma->vm_end);
4620f8c1 2571 nr_pte_updates = change_prot_numa(vma, start, end);
598f0ec0
MG
2572
2573 /*
4620f8c1
RR
2574 * Try to scan sysctl_numa_balancing_size worth of
2575 * hpages that have at least one present PTE that
2576 * is not already pte-numa. If the VMA contains
2577 * areas that are unused or already full of prot_numa
2578 * PTEs, scan up to virtpages, to skip through those
2579 * areas faster.
598f0ec0
MG
2580 */
2581 if (nr_pte_updates)
2582 pages -= (end - start) >> PAGE_SHIFT;
4620f8c1 2583 virtpages -= (end - start) >> PAGE_SHIFT;
6e5fb223 2584
9f40604c 2585 start = end;
4620f8c1 2586 if (pages <= 0 || virtpages <= 0)
9f40604c 2587 goto out;
3cf1962c
RR
2588
2589 cond_resched();
9f40604c 2590 } while (end != vma->vm_end);
cbee9f88 2591 }
6e5fb223 2592
9f40604c 2593out:
6e5fb223 2594 /*
c69307d5
PZ
2595 * It is possible to reach the end of the VMA list but the last few
2596 * VMAs are not guaranteed to the vma_migratable. If they are not, we
2597 * would find the !migratable VMA on the next scan but not reset the
2598 * scanner to the start so check it now.
6e5fb223
PZ
2599 */
2600 if (vma)
9f40604c 2601 mm->numa_scan_offset = start;
6e5fb223
PZ
2602 else
2603 reset_ptenuma_scan(p);
2604 up_read(&mm->mmap_sem);
51170840
RR
2605
2606 /*
2607 * Make sure tasks use at least 32x as much time to run other code
2608 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
2609 * Usually update_task_scan_period slows down scanning enough; on an
2610 * overloaded system we need to limit overhead on a per task basis.
2611 */
2612 if (unlikely(p->se.sum_exec_runtime != runtime)) {
2613 u64 diff = p->se.sum_exec_runtime - runtime;
2614 p->node_stamp += 32 * diff;
2615 }
cbee9f88
PZ
2616}
2617
2618/*
2619 * Drive the periodic memory faults..
2620 */
2621void task_tick_numa(struct rq *rq, struct task_struct *curr)
2622{
2623 struct callback_head *work = &curr->numa_work;
2624 u64 period, now;
2625
2626 /*
2627 * We don't care about NUMA placement if we don't have memory.
2628 */
2629 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2630 return;
2631
2632 /*
2633 * Using runtime rather than walltime has the dual advantage that
2634 * we (mostly) drive the selection from busy threads and that the
2635 * task needs to have done some actual work before we bother with
2636 * NUMA placement.
2637 */
2638 now = curr->se.sum_exec_runtime;
2639 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2640
25b3e5a3 2641 if (now > curr->node_stamp + period) {
4b96a29b 2642 if (!curr->node_stamp)
b5dd77c8 2643 curr->numa_scan_period = task_scan_start(curr);
19a78d11 2644 curr->node_stamp += period;
cbee9f88
PZ
2645
2646 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2647 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2648 task_work_add(curr, work, true);
2649 }
2650 }
2651}
3fed382b 2652
cbee9f88
PZ
2653#else
2654static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2655{
2656}
0ec8aa00
PZ
2657
2658static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2659{
2660}
2661
2662static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2663{
2664}
3fed382b 2665
cbee9f88
PZ
2666#endif /* CONFIG_NUMA_BALANCING */
2667
30cfdcfc
DA
2668static void
2669account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2670{
2671 update_load_add(&cfs_rq->load, se->load.weight);
c09595f6 2672 if (!parent_entity(se))
029632fb 2673 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
367456c7 2674#ifdef CONFIG_SMP
0ec8aa00
PZ
2675 if (entity_is_task(se)) {
2676 struct rq *rq = rq_of(cfs_rq);
2677
2678 account_numa_enqueue(rq, task_of(se));
2679 list_add(&se->group_node, &rq->cfs_tasks);
2680 }
367456c7 2681#endif
30cfdcfc 2682 cfs_rq->nr_running++;
30cfdcfc
DA
2683}
2684
2685static void
2686account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2687{
2688 update_load_sub(&cfs_rq->load, se->load.weight);
c09595f6 2689 if (!parent_entity(se))
029632fb 2690 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
bfdb198c 2691#ifdef CONFIG_SMP
0ec8aa00
PZ
2692 if (entity_is_task(se)) {
2693 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
b87f1724 2694 list_del_init(&se->group_node);
0ec8aa00 2695 }
bfdb198c 2696#endif
30cfdcfc 2697 cfs_rq->nr_running--;
30cfdcfc
DA
2698}
2699
8d5b9025
PZ
2700/*
2701 * Signed add and clamp on underflow.
2702 *
2703 * Explicitly do a load-store to ensure the intermediate value never hits
2704 * memory. This allows lockless observations without ever seeing the negative
2705 * values.
2706 */
2707#define add_positive(_ptr, _val) do { \
2708 typeof(_ptr) ptr = (_ptr); \
2709 typeof(_val) val = (_val); \
2710 typeof(*ptr) res, var = READ_ONCE(*ptr); \
2711 \
2712 res = var + val; \
2713 \
2714 if (val < 0 && res > var) \
2715 res = 0; \
2716 \
2717 WRITE_ONCE(*ptr, res); \
2718} while (0)
2719
2720/*
2721 * Unsigned subtract and clamp on underflow.
2722 *
2723 * Explicitly do a load-store to ensure the intermediate value never hits
2724 * memory. This allows lockless observations without ever seeing the negative
2725 * values.
2726 */
2727#define sub_positive(_ptr, _val) do { \
2728 typeof(_ptr) ptr = (_ptr); \
2729 typeof(*ptr) val = (_val); \
2730 typeof(*ptr) res, var = READ_ONCE(*ptr); \
2731 res = var - val; \
2732 if (res > var) \
2733 res = 0; \
2734 WRITE_ONCE(*ptr, res); \
2735} while (0)
2736
2737#ifdef CONFIG_SMP
2738/*
1ea6c46a 2739 * XXX we want to get rid of these helpers and use the full load resolution.
8d5b9025
PZ
2740 */
2741static inline long se_weight(struct sched_entity *se)
2742{
2743 return scale_load_down(se->load.weight);
2744}
2745
1ea6c46a
PZ
2746static inline long se_runnable(struct sched_entity *se)
2747{
2748 return scale_load_down(se->runnable_weight);
2749}
2750
8d5b9025
PZ
2751static inline void
2752enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2753{
1ea6c46a
PZ
2754 cfs_rq->runnable_weight += se->runnable_weight;
2755
2756 cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
2757 cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
8d5b9025
PZ
2758}
2759
2760static inline void
2761dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2762{
1ea6c46a
PZ
2763 cfs_rq->runnable_weight -= se->runnable_weight;
2764
2765 sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
2766 sub_positive(&cfs_rq->avg.runnable_load_sum,
2767 se_runnable(se) * se->avg.runnable_load_sum);
8d5b9025
PZ
2768}
2769
2770static inline void
2771enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2772{
2773 cfs_rq->avg.load_avg += se->avg.load_avg;
2774 cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
2775}
2776
2777static inline void
2778dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2779{
2780 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
2781 sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
2782}
2783#else
2784static inline void
2785enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2786static inline void
2787dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2788static inline void
2789enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2790static inline void
2791dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2792#endif
2793
9059393e 2794static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
1ea6c46a 2795 unsigned long weight, unsigned long runnable)
9059393e
VG
2796{
2797 if (se->on_rq) {
2798 /* commit outstanding execution time */
2799 if (cfs_rq->curr == se)
2800 update_curr(cfs_rq);
2801 account_entity_dequeue(cfs_rq, se);
2802 dequeue_runnable_load_avg(cfs_rq, se);
2803 }
2804 dequeue_load_avg(cfs_rq, se);
2805
1ea6c46a 2806 se->runnable_weight = runnable;
9059393e
VG
2807 update_load_set(&se->load, weight);
2808
2809#ifdef CONFIG_SMP
1ea6c46a
PZ
2810 do {
2811 u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
2812
2813 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
2814 se->avg.runnable_load_avg =
2815 div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
2816 } while (0);
9059393e
VG
2817#endif
2818
2819 enqueue_load_avg(cfs_rq, se);
2820 if (se->on_rq) {
2821 account_entity_enqueue(cfs_rq, se);
2822 enqueue_runnable_load_avg(cfs_rq, se);
2823 }
2824}
2825
2826void reweight_task(struct task_struct *p, int prio)
2827{
2828 struct sched_entity *se = &p->se;
2829 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2830 struct load_weight *load = &se->load;
2831 unsigned long weight = scale_load(sched_prio_to_weight[prio]);
2832
1ea6c46a 2833 reweight_entity(cfs_rq, se, weight, weight);
9059393e
VG
2834 load->inv_weight = sched_prio_to_wmult[prio];
2835}
2836
3ff6dcac 2837#ifdef CONFIG_FAIR_GROUP_SCHED
387f77cc 2838#ifdef CONFIG_SMP
cef27403
PZ
2839/*
2840 * All this does is approximate the hierarchical proportion which includes that
2841 * global sum we all love to hate.
2842 *
2843 * That is, the weight of a group entity, is the proportional share of the
2844 * group weight based on the group runqueue weights. That is:
2845 *
2846 * tg->weight * grq->load.weight
2847 * ge->load.weight = ----------------------------- (1)
2848 * \Sum grq->load.weight
2849 *
2850 * Now, because computing that sum is prohibitively expensive to compute (been
2851 * there, done that) we approximate it with this average stuff. The average
2852 * moves slower and therefore the approximation is cheaper and more stable.
2853 *
2854 * So instead of the above, we substitute:
2855 *
2856 * grq->load.weight -> grq->avg.load_avg (2)
2857 *
2858 * which yields the following:
2859 *
2860 * tg->weight * grq->avg.load_avg
2861 * ge->load.weight = ------------------------------ (3)
2862 * tg->load_avg
2863 *
2864 * Where: tg->load_avg ~= \Sum grq->avg.load_avg
2865 *
2866 * That is shares_avg, and it is right (given the approximation (2)).
2867 *
2868 * The problem with it is that because the average is slow -- it was designed
2869 * to be exactly that of course -- this leads to transients in boundary
2870 * conditions. In specific, the case where the group was idle and we start the
2871 * one task. It takes time for our CPU's grq->avg.load_avg to build up,
2872 * yielding bad latency etc..
2873 *
2874 * Now, in that special case (1) reduces to:
2875 *
2876 * tg->weight * grq->load.weight
17de4ee0 2877 * ge->load.weight = ----------------------------- = tg->weight (4)
cef27403
PZ
2878 * grp->load.weight
2879 *
2880 * That is, the sum collapses because all other CPUs are idle; the UP scenario.
2881 *
2882 * So what we do is modify our approximation (3) to approach (4) in the (near)
2883 * UP case, like:
2884 *
2885 * ge->load.weight =
2886 *
2887 * tg->weight * grq->load.weight
2888 * --------------------------------------------------- (5)
2889 * tg->load_avg - grq->avg.load_avg + grq->load.weight
2890 *
17de4ee0
PZ
2891 * But because grq->load.weight can drop to 0, resulting in a divide by zero,
2892 * we need to use grq->avg.load_avg as its lower bound, which then gives:
2893 *
2894 *
2895 * tg->weight * grq->load.weight
2896 * ge->load.weight = ----------------------------- (6)
2897 * tg_load_avg'
2898 *
2899 * Where:
2900 *
2901 * tg_load_avg' = tg->load_avg - grq->avg.load_avg +
2902 * max(grq->load.weight, grq->avg.load_avg)
cef27403
PZ
2903 *
2904 * And that is shares_weight and is icky. In the (near) UP case it approaches
2905 * (4) while in the normal case it approaches (3). It consistently
2906 * overestimates the ge->load.weight and therefore:
2907 *
2908 * \Sum ge->load.weight >= tg->weight
2909 *
2910 * hence icky!
2911 */
2c8e4dce 2912static long calc_group_shares(struct cfs_rq *cfs_rq)
cf5f0acf 2913{
7c80cfc9
PZ
2914 long tg_weight, tg_shares, load, shares;
2915 struct task_group *tg = cfs_rq->tg;
2916
2917 tg_shares = READ_ONCE(tg->shares);
cf5f0acf 2918
3d4b60d3 2919 load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
cf5f0acf 2920
ea1dc6fc 2921 tg_weight = atomic_long_read(&tg->load_avg);
3ff6dcac 2922
ea1dc6fc
PZ
2923 /* Ensure tg_weight >= load */
2924 tg_weight -= cfs_rq->tg_load_avg_contrib;
2925 tg_weight += load;
3ff6dcac 2926
7c80cfc9 2927 shares = (tg_shares * load);
cf5f0acf
PZ
2928 if (tg_weight)
2929 shares /= tg_weight;
3ff6dcac 2930
b8fd8423
DE
2931 /*
2932 * MIN_SHARES has to be unscaled here to support per-CPU partitioning
2933 * of a group with small tg->shares value. It is a floor value which is
2934 * assigned as a minimum load.weight to the sched_entity representing
2935 * the group on a CPU.
2936 *
2937 * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
2938 * on an 8-core system with 8 tasks each runnable on one CPU shares has
2939 * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
2940 * case no task is runnable on a CPU MIN_SHARES=2 should be returned
2941 * instead of 0.
2942 */
7c80cfc9 2943 return clamp_t(long, shares, MIN_SHARES, tg_shares);
3ff6dcac 2944}
2c8e4dce
JB
2945
2946/*
17de4ee0
PZ
2947 * This calculates the effective runnable weight for a group entity based on
2948 * the group entity weight calculated above.
2949 *
2950 * Because of the above approximation (2), our group entity weight is
2951 * an load_avg based ratio (3). This means that it includes blocked load and
2952 * does not represent the runnable weight.
2953 *
2954 * Approximate the group entity's runnable weight per ratio from the group
2955 * runqueue:
2956 *
2957 * grq->avg.runnable_load_avg
2958 * ge->runnable_weight = ge->load.weight * -------------------------- (7)
2959 * grq->avg.load_avg
2960 *
2961 * However, analogous to above, since the avg numbers are slow, this leads to
2962 * transients in the from-idle case. Instead we use:
2963 *
2964 * ge->runnable_weight = ge->load.weight *
2965 *
2966 * max(grq->avg.runnable_load_avg, grq->runnable_weight)
2967 * ----------------------------------------------------- (8)
2968 * max(grq->avg.load_avg, grq->load.weight)
2969 *
2970 * Where these max() serve both to use the 'instant' values to fix the slow
2971 * from-idle and avoid the /0 on to-idle, similar to (6).
2c8e4dce
JB
2972 */
2973static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
2974{
17de4ee0
PZ
2975 long runnable, load_avg;
2976
2977 load_avg = max(cfs_rq->avg.load_avg,
2978 scale_load_down(cfs_rq->load.weight));
2979
2980 runnable = max(cfs_rq->avg.runnable_load_avg,
2981 scale_load_down(cfs_rq->runnable_weight));
2c8e4dce
JB
2982
2983 runnable *= shares;
2984 if (load_avg)
2985 runnable /= load_avg;
17de4ee0 2986
2c8e4dce
JB
2987 return clamp_t(long, runnable, MIN_SHARES, shares);
2988}
387f77cc 2989#endif /* CONFIG_SMP */
ea1dc6fc 2990
82958366
PT
2991static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2992
1ea6c46a
PZ
2993/*
2994 * Recomputes the group entity based on the current state of its group
2995 * runqueue.
2996 */
2997static void update_cfs_group(struct sched_entity *se)
2069dd75 2998{
1ea6c46a
PZ
2999 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3000 long shares, runnable;
2069dd75 3001
1ea6c46a 3002 if (!gcfs_rq)
89ee048f
VG
3003 return;
3004
1ea6c46a 3005 if (throttled_hierarchy(gcfs_rq))
2069dd75 3006 return;
89ee048f 3007
3ff6dcac 3008#ifndef CONFIG_SMP
1ea6c46a 3009 runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
7c80cfc9
PZ
3010
3011 if (likely(se->load.weight == shares))
3ff6dcac 3012 return;
7c80cfc9 3013#else
2c8e4dce
JB
3014 shares = calc_group_shares(gcfs_rq);
3015 runnable = calc_group_runnable(gcfs_rq, shares);
3ff6dcac 3016#endif
2069dd75 3017
1ea6c46a 3018 reweight_entity(cfs_rq_of(se), se, shares, runnable);
2069dd75 3019}
89ee048f 3020
2069dd75 3021#else /* CONFIG_FAIR_GROUP_SCHED */
1ea6c46a 3022static inline void update_cfs_group(struct sched_entity *se)
2069dd75
PZ
3023{
3024}
3025#endif /* CONFIG_FAIR_GROUP_SCHED */
3026
a030d738
VK
3027static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
3028{
43964409
LT
3029 struct rq *rq = rq_of(cfs_rq);
3030
3031 if (&rq->cfs == cfs_rq) {
a030d738
VK
3032 /*
3033 * There are a few boundary cases this might miss but it should
3034 * get called often enough that that should (hopefully) not be
9783be2c 3035 * a real problem.
a030d738
VK
3036 *
3037 * It will not get called when we go idle, because the idle
3038 * thread is a different class (!fair), nor will the utilization
3039 * number include things like RT tasks.
3040 *
3041 * As is, the util number is not freq-invariant (we'd have to
3042 * implement arch_scale_freq_capacity() for that).
3043 *
3044 * See cpu_util().
3045 */
43964409 3046 cpufreq_update_util(rq, 0);
a030d738
VK
3047 }
3048}
3049
141965c7 3050#ifdef CONFIG_SMP
9d85f21c
PT
3051/*
3052 * Approximate:
3053 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
3054 */
a481db34 3055static u64 decay_load(u64 val, u64 n)
9d85f21c 3056{
5b51f2f8
PT
3057 unsigned int local_n;
3058
05296e75 3059 if (unlikely(n > LOAD_AVG_PERIOD * 63))
5b51f2f8
PT
3060 return 0;
3061
3062 /* after bounds checking we can collapse to 32-bit */
3063 local_n = n;
3064
3065 /*
3066 * As y^PERIOD = 1/2, we can combine
9c58c79a
ZZ
3067 * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
3068 * With a look-up table which covers y^n (n<PERIOD)
5b51f2f8
PT
3069 *
3070 * To achieve constant time decay_load.
3071 */
3072 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
3073 val >>= local_n / LOAD_AVG_PERIOD;
3074 local_n %= LOAD_AVG_PERIOD;
9d85f21c
PT
3075 }
3076
9d89c257
YD
3077 val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
3078 return val;
5b51f2f8
PT
3079}
3080
05296e75 3081static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
5b51f2f8 3082{
05296e75 3083 u32 c1, c2, c3 = d3; /* y^0 == 1 */
5b51f2f8 3084
a481db34 3085 /*
3841cdc3 3086 * c1 = d1 y^p
a481db34 3087 */
05296e75 3088 c1 = decay_load((u64)d1, periods);
a481db34 3089
a481db34 3090 /*
3841cdc3 3091 * p-1
05296e75
PZ
3092 * c2 = 1024 \Sum y^n
3093 * n=1
a481db34 3094 *
05296e75
PZ
3095 * inf inf
3096 * = 1024 ( \Sum y^n - \Sum y^n - y^0 )
3841cdc3 3097 * n=0 n=p
a481db34 3098 */
05296e75 3099 c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
a481db34
YD
3100
3101 return c1 + c2 + c3;
9d85f21c
PT
3102}
3103
a481db34
YD
3104/*
3105 * Accumulate the three separate parts of the sum; d1 the remainder
3106 * of the last (incomplete) period, d2 the span of full periods and d3
3107 * the remainder of the (incomplete) current period.
3108 *
3109 * d1 d2 d3
3110 * ^ ^ ^
3111 * | | |
3112 * |<->|<----------------->|<--->|
3113 * ... |---x---|------| ... |------|-----x (now)
3114 *
3841cdc3
PZ
3115 * p-1
3116 * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
3117 * n=1
a481db34 3118 *
3841cdc3 3119 * = u y^p + (Step 1)
a481db34 3120 *
3841cdc3
PZ
3121 * p-1
3122 * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2)
3123 * n=1
a481db34
YD
3124 */
3125static __always_inline u32
3126accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
1ea6c46a 3127 unsigned long load, unsigned long runnable, int running)
a481db34
YD
3128{
3129 unsigned long scale_freq, scale_cpu;
05296e75 3130 u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
a481db34 3131 u64 periods;
a481db34 3132
7673c8a4 3133 scale_freq = arch_scale_freq_capacity(cpu);
a481db34
YD
3134 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
3135
3136 delta += sa->period_contrib;
3137 periods = delta / 1024; /* A period is 1024us (~1ms) */
3138
3139 /*
3140 * Step 1: decay old *_sum if we crossed period boundaries.
3141 */
3142 if (periods) {
3143 sa->load_sum = decay_load(sa->load_sum, periods);
1ea6c46a
PZ
3144 sa->runnable_load_sum =
3145 decay_load(sa->runnable_load_sum, periods);
a481db34 3146 sa->util_sum = decay_load((u64)(sa->util_sum), periods);
a481db34 3147
05296e75
PZ
3148 /*
3149 * Step 2
3150 */
3151 delta %= 1024;
3152 contrib = __accumulate_pelt_segments(periods,
3153 1024 - sa->period_contrib, delta);
3154 }
a481db34
YD
3155 sa->period_contrib = delta;
3156
3157 contrib = cap_scale(contrib, scale_freq);
1ea6c46a
PZ
3158 if (load)
3159 sa->load_sum += load * contrib;
3160 if (runnable)
3161 sa->runnable_load_sum += runnable * contrib;
a481db34
YD
3162 if (running)
3163 sa->util_sum += contrib * scale_cpu;
3164
3165 return periods;
3166}
3167
9d85f21c
PT
3168/*
3169 * We can represent the historical contribution to runnable average as the
3170 * coefficients of a geometric series. To do this we sub-divide our runnable
3171 * history into segments of approximately 1ms (1024us); label the segment that
3172 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
3173 *
3174 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
3175 * p0 p1 p2
3176 * (now) (~1ms ago) (~2ms ago)
3177 *
3178 * Let u_i denote the fraction of p_i that the entity was runnable.
3179 *
3180 * We then designate the fractions u_i as our co-efficients, yielding the
3181 * following representation of historical load:
3182 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
3183 *
3184 * We choose y based on the with of a reasonably scheduling period, fixing:
3185 * y^32 = 0.5
3186 *
3187 * This means that the contribution to load ~32ms ago (u_32) will be weighted
3188 * approximately half as much as the contribution to load within the last ms
3189 * (u_0).
3190 *
3191 * When a period "rolls over" and we have new u_0`, multiplying the previous
3192 * sum again by y is sufficient to update:
3193 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
3194 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
3195 */
9d89c257 3196static __always_inline int
c7b50216 3197___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
1ea6c46a 3198 unsigned long load, unsigned long runnable, int running)
9d85f21c 3199{
a481db34 3200 u64 delta;
9d85f21c 3201
9d89c257 3202 delta = now - sa->last_update_time;
9d85f21c
PT
3203 /*
3204 * This should only happen when time goes backwards, which it
3205 * unfortunately does during sched clock init when we swap over to TSC.
3206 */
3207 if ((s64)delta < 0) {
9d89c257 3208 sa->last_update_time = now;
9d85f21c
PT
3209 return 0;
3210 }
3211
3212 /*
3213 * Use 1024ns as the unit of measurement since it's a reasonable
3214 * approximation of 1us and fast to compute.
3215 */
3216 delta >>= 10;
3217 if (!delta)
3218 return 0;
bb0bd044
PZ
3219
3220 sa->last_update_time += delta << 10;
9d85f21c 3221
f235a54f
VG
3222 /*
3223 * running is a subset of runnable (weight) so running can't be set if
3224 * runnable is clear. But there are some corner cases where the current
3225 * se has been already dequeued but cfs_rq->curr still points to it.
3226 * This means that weight will be 0 but not running for a sched_entity
3227 * but also for a cfs_rq if the latter becomes idle. As an example,
3228 * this happens during idle_balance() which calls
3229 * update_blocked_averages()
3230 */
1ea6c46a
PZ
3231 if (!load)
3232 runnable = running = 0;
f235a54f 3233
a481db34
YD
3234 /*
3235 * Now we know we crossed measurement unit boundaries. The *_avg
3236 * accrues by two steps:
3237 *
3238 * Step 1: accumulate *_sum since last_update_time. If we haven't
3239 * crossed period boundaries, finish.
3240 */
1ea6c46a 3241 if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
a481db34 3242 return 0;
9ee474f5 3243
c7b50216
PZ
3244 return 1;
3245}
3246
3247static __always_inline void
1ea6c46a 3248___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
c7b50216
PZ
3249{
3250 u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
3251
a481db34
YD
3252 /*
3253 * Step 2: update *_avg.
3254 */
1ea6c46a
PZ
3255 sa->load_avg = div_u64(load * sa->load_sum, divider);
3256 sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
c7b50216
PZ
3257 sa->util_avg = sa->util_sum / divider;
3258}
aff3e498 3259
c7b50216
PZ
3260/*
3261 * sched_entity:
3262 *
1ea6c46a
PZ
3263 * task:
3264 * se_runnable() == se_weight()
3265 *
3266 * group: [ see update_cfs_group() ]
3267 * se_weight() = tg->weight * grq->load_avg / tg->load_avg
3268 * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
3269 *
c7b50216
PZ
3270 * load_sum := runnable_sum
3271 * load_avg = se_weight(se) * runnable_avg
3272 *
1ea6c46a
PZ
3273 * runnable_load_sum := runnable_sum
3274 * runnable_load_avg = se_runnable(se) * runnable_avg
3275 *
3276 * XXX collapse load_sum and runnable_load_sum
3277 *
c7b50216
PZ
3278 * cfq_rs:
3279 *
3280 * load_sum = \Sum se_weight(se) * se->avg.load_sum
3281 * load_avg = \Sum se->avg.load_avg
1ea6c46a
PZ
3282 *
3283 * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
3284 * runnable_load_avg = \Sum se->avg.runable_load_avg
c7b50216
PZ
3285 */
3286
0ccb977f
PZ
3287static int
3288__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
3289{
1ea6c46a
PZ
3290 if (entity_is_task(se))
3291 se->runnable_weight = se->load.weight;
3292
3293 if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
3294 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
c7b50216
PZ
3295 return 1;
3296 }
3297
3298 return 0;
0ccb977f
PZ
3299}
3300
3301static int
3302__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
3303{
1ea6c46a
PZ
3304 if (entity_is_task(se))
3305 se->runnable_weight = se->load.weight;
3306
3307 if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
3308 cfs_rq->curr == se)) {
c7b50216 3309
1ea6c46a 3310 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
c7b50216
PZ
3311 return 1;
3312 }
3313
3314 return 0;
0ccb977f
PZ
3315}
3316
3317static int
3318__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
3319{
c7b50216
PZ
3320 if (___update_load_sum(now, cpu, &cfs_rq->avg,
3321 scale_load_down(cfs_rq->load.weight),
1ea6c46a
PZ
3322 scale_load_down(cfs_rq->runnable_weight),
3323 cfs_rq->curr != NULL)) {
3324
3325 ___update_load_avg(&cfs_rq->avg, 1, 1);
c7b50216
PZ
3326 return 1;
3327 }
3328
3329 return 0;
0ccb977f
PZ
3330}
3331
c566e8e9 3332#ifdef CONFIG_FAIR_GROUP_SCHED
7c3edd2c
PZ
3333/**
3334 * update_tg_load_avg - update the tg's load avg
3335 * @cfs_rq: the cfs_rq whose avg changed
3336 * @force: update regardless of how small the difference
3337 *
3338 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
3339 * However, because tg->load_avg is a global value there are performance
3340 * considerations.
3341 *
3342 * In order to avoid having to look at the other cfs_rq's, we use a
3343 * differential update where we store the last value we propagated. This in
3344 * turn allows skipping updates if the differential is 'small'.
3345 *
815abf5a 3346 * Updating tg's load_avg is necessary before update_cfs_share().
bb17f655 3347 */
9d89c257 3348static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
bb17f655 3349{
9d89c257 3350 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
bb17f655 3351
aa0b7ae0
WL
3352 /*
3353 * No need to update load_avg for root_task_group as it is not used.
3354 */
3355 if (cfs_rq->tg == &root_task_group)
3356 return;
3357
9d89c257
YD
3358 if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3359 atomic_long_add(delta, &cfs_rq->tg->load_avg);
3360 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
bb17f655 3361 }
8165e145 3362}
f5f9739d 3363
ad936d86
BP
3364/*
3365 * Called within set_task_rq() right before setting a task's cpu. The
3366 * caller only guarantees p->pi_lock is held; no other assumptions,
3367 * including the state of rq->lock, should be made.
3368 */
3369void set_task_rq_fair(struct sched_entity *se,
3370 struct cfs_rq *prev, struct cfs_rq *next)
3371{
0ccb977f
PZ
3372 u64 p_last_update_time;
3373 u64 n_last_update_time;
3374
ad936d86
BP
3375 if (!sched_feat(ATTACH_AGE_LOAD))
3376 return;
3377
3378 /*
3379 * We are supposed to update the task to "current" time, then its up to
3380 * date and ready to go to new CPU/cfs_rq. But we have difficulty in
3381 * getting what current time is, so simply throw away the out-of-date
3382 * time. This will result in the wakee task is less decayed, but giving
3383 * the wakee more load sounds not bad.
3384 */
0ccb977f
PZ
3385 if (!(se->avg.last_update_time && prev))
3386 return;
ad936d86
BP
3387
3388#ifndef CONFIG_64BIT
0ccb977f 3389 {
ad936d86
BP
3390 u64 p_last_update_time_copy;
3391 u64 n_last_update_time_copy;
3392
3393 do {
3394 p_last_update_time_copy = prev->load_last_update_time_copy;
3395 n_last_update_time_copy = next->load_last_update_time_copy;
3396
3397 smp_rmb();
3398
3399 p_last_update_time = prev->avg.last_update_time;
3400 n_last_update_time = next->avg.last_update_time;
3401
3402 } while (p_last_update_time != p_last_update_time_copy ||
3403 n_last_update_time != n_last_update_time_copy);
0ccb977f 3404 }
ad936d86 3405#else
0ccb977f
PZ
3406 p_last_update_time = prev->avg.last_update_time;
3407 n_last_update_time = next->avg.last_update_time;
ad936d86 3408#endif
0ccb977f
PZ
3409 __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
3410 se->avg.last_update_time = n_last_update_time;
ad936d86 3411}
09a43ace 3412
0e2d2aaa
PZ
3413
3414/*
3415 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
3416 * propagate its contribution. The key to this propagation is the invariant
3417 * that for each group:
3418 *
3419 * ge->avg == grq->avg (1)
3420 *
3421 * _IFF_ we look at the pure running and runnable sums. Because they
3422 * represent the very same entity, just at different points in the hierarchy.
3423 *
a4c3c049
VG
3424 * Per the above update_tg_cfs_util() is trivial and simply copies the running
3425 * sum over (but still wrong, because the group entity and group rq do not have
3426 * their PELT windows aligned).
0e2d2aaa
PZ
3427 *
3428 * However, update_tg_cfs_runnable() is more complex. So we have:
3429 *
3430 * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
3431 *
3432 * And since, like util, the runnable part should be directly transferable,
3433 * the following would _appear_ to be the straight forward approach:
3434 *
a4c3c049 3435 * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3)
0e2d2aaa
PZ
3436 *
3437 * And per (1) we have:
3438 *
a4c3c049 3439 * ge->avg.runnable_avg == grq->avg.runnable_avg
0e2d2aaa
PZ
3440 *
3441 * Which gives:
3442 *
3443 * ge->load.weight * grq->avg.load_avg
3444 * ge->avg.load_avg = ----------------------------------- (4)
3445 * grq->load.weight
3446 *
3447 * Except that is wrong!
3448 *
3449 * Because while for entities historical weight is not important and we
3450 * really only care about our future and therefore can consider a pure
3451 * runnable sum, runqueues can NOT do this.
3452 *
3453 * We specifically want runqueues to have a load_avg that includes
3454 * historical weights. Those represent the blocked load, the load we expect
3455 * to (shortly) return to us. This only works by keeping the weights as
3456 * integral part of the sum. We therefore cannot decompose as per (3).
3457 *
a4c3c049
VG
3458 * Another reason this doesn't work is that runnable isn't a 0-sum entity.
3459 * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
3460 * rq itself is runnable anywhere between 2/3 and 1 depending on how the
3461 * runnable section of these tasks overlap (or not). If they were to perfectly
3462 * align the rq as a whole would be runnable 2/3 of the time. If however we
3463 * always have at least 1 runnable task, the rq as a whole is always runnable.
0e2d2aaa 3464 *
a4c3c049 3465 * So we'll have to approximate.. :/
0e2d2aaa 3466 *
a4c3c049 3467 * Given the constraint:
0e2d2aaa 3468 *
a4c3c049 3469 * ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
0e2d2aaa 3470 *
a4c3c049
VG
3471 * We can construct a rule that adds runnable to a rq by assuming minimal
3472 * overlap.
0e2d2aaa 3473 *
a4c3c049 3474 * On removal, we'll assume each task is equally runnable; which yields:
0e2d2aaa 3475 *
a4c3c049 3476 * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
0e2d2aaa 3477 *
a4c3c049 3478 * XXX: only do this for the part of runnable > running ?
0e2d2aaa 3479 *
0e2d2aaa
PZ
3480 */
3481
09a43ace 3482static inline void
0e2d2aaa 3483update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
09a43ace 3484{
09a43ace
VG
3485 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3486
3487 /* Nothing to update */
3488 if (!delta)
3489 return;
3490
a4c3c049
VG
3491 /*
3492 * The relation between sum and avg is:
3493 *
3494 * LOAD_AVG_MAX - 1024 + sa->period_contrib
3495 *
3496 * however, the PELT windows are not aligned between grq and gse.
3497 */
3498
09a43ace
VG
3499 /* Set new sched_entity's utilization */
3500 se->avg.util_avg = gcfs_rq->avg.util_avg;
3501 se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
3502
3503 /* Update parent cfs_rq utilization */
3504 add_positive(&cfs_rq->avg.util_avg, delta);
3505 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
3506}
3507
09a43ace 3508static inline void
0e2d2aaa 3509update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
09a43ace 3510{
a4c3c049
VG
3511 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3512 unsigned long runnable_load_avg, load_avg;
3513 u64 runnable_load_sum, load_sum = 0;
3514 s64 delta_sum;
09a43ace 3515
0e2d2aaa
PZ
3516 if (!runnable_sum)
3517 return;
09a43ace 3518
0e2d2aaa 3519 gcfs_rq->prop_runnable_sum = 0;
09a43ace 3520
a4c3c049
VG
3521 if (runnable_sum >= 0) {
3522 /*
3523 * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
3524 * the CPU is saturated running == runnable.
3525 */
3526 runnable_sum += se->avg.load_sum;
3527 runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
3528 } else {
3529 /*
3530 * Estimate the new unweighted runnable_sum of the gcfs_rq by
3531 * assuming all tasks are equally runnable.
3532 */
3533 if (scale_load_down(gcfs_rq->load.weight)) {
3534 load_sum = div_s64(gcfs_rq->avg.load_sum,
3535 scale_load_down(gcfs_rq->load.weight));
3536 }
3537
3538 /* But make sure to not inflate se's runnable */
3539 runnable_sum = min(se->avg.load_sum, load_sum);
3540 }
3541
3542 /*
3543 * runnable_sum can't be lower than running_sum
3544 * As running sum is scale with cpu capacity wehreas the runnable sum
3545 * is not we rescale running_sum 1st
3546 */
3547 running_sum = se->avg.util_sum /
3548 arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
3549 runnable_sum = max(runnable_sum, running_sum);
3550
0e2d2aaa
PZ
3551 load_sum = (s64)se_weight(se) * runnable_sum;
3552 load_avg = div_s64(load_sum, LOAD_AVG_MAX);
09a43ace 3553
a4c3c049
VG
3554 delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
3555 delta_avg = load_avg - se->avg.load_avg;
09a43ace 3556
a4c3c049
VG
3557 se->avg.load_sum = runnable_sum;
3558 se->avg.load_avg = load_avg;
3559 add_positive(&cfs_rq->avg.load_avg, delta_avg);
3560 add_positive(&cfs_rq->avg.load_sum, delta_sum);
09a43ace 3561
1ea6c46a
PZ
3562 runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
3563 runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
a4c3c049
VG
3564 delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
3565 delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
1ea6c46a 3566
a4c3c049
VG
3567 se->avg.runnable_load_sum = runnable_sum;
3568 se->avg.runnable_load_avg = runnable_load_avg;
1ea6c46a 3569
09a43ace 3570 if (se->on_rq) {
a4c3c049
VG
3571 add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
3572 add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
09a43ace
VG
3573 }
3574}
3575
0e2d2aaa 3576static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
09a43ace 3577{
0e2d2aaa
PZ
3578 cfs_rq->propagate = 1;
3579 cfs_rq->prop_runnable_sum += runnable_sum;
09a43ace
VG
3580}
3581
3582/* Update task and its cfs_rq load average */
3583static inline int propagate_entity_load_avg(struct sched_entity *se)
3584{
0e2d2aaa 3585 struct cfs_rq *cfs_rq, *gcfs_rq;
09a43ace
VG
3586
3587 if (entity_is_task(se))
3588 return 0;
3589
0e2d2aaa
PZ
3590 gcfs_rq = group_cfs_rq(se);
3591 if (!gcfs_rq->propagate)
09a43ace
VG
3592 return 0;
3593
0e2d2aaa
PZ
3594 gcfs_rq->propagate = 0;
3595
09a43ace
VG
3596 cfs_rq = cfs_rq_of(se);
3597
0e2d2aaa 3598 add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
09a43ace 3599
0e2d2aaa
PZ
3600 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
3601 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
09a43ace
VG
3602
3603 return 1;
3604}
3605
bc427898
VG
3606/*
3607 * Check if we need to update the load and the utilization of a blocked
3608 * group_entity:
3609 */
3610static inline bool skip_blocked_update(struct sched_entity *se)
3611{
3612 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3613
3614 /*
3615 * If sched_entity still have not zero load or utilization, we have to
3616 * decay it:
3617 */
3618 if (se->avg.load_avg || se->avg.util_avg)
3619 return false;
3620
3621 /*
3622 * If there is a pending propagation, we have to update the load and
3623 * the utilization of the sched_entity:
3624 */
0e2d2aaa 3625 if (gcfs_rq->propagate)
bc427898
VG
3626 return false;
3627
3628 /*
3629 * Otherwise, the load and the utilization of the sched_entity is
3630 * already zero and there is no pending propagation, so it will be a
3631 * waste of time to try to decay it:
3632 */
3633 return true;
3634}
3635
6e83125c 3636#else /* CONFIG_FAIR_GROUP_SCHED */
09a43ace 3637
9d89c257 3638static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
09a43ace
VG
3639
3640static inline int propagate_entity_load_avg(struct sched_entity *se)
3641{
3642 return 0;
3643}
3644
0e2d2aaa 3645static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
09a43ace 3646
6e83125c 3647#endif /* CONFIG_FAIR_GROUP_SCHED */
c566e8e9 3648
3d30544f
PZ
3649/**
3650 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
3651 * @now: current time, as per cfs_rq_clock_task()
3652 * @cfs_rq: cfs_rq to update
3d30544f
PZ
3653 *
3654 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
3655 * avg. The immediate corollary is that all (fair) tasks must be attached, see
3656 * post_init_entity_util_avg().
3657 *
3658 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
3659 *
7c3edd2c
PZ
3660 * Returns true if the load decayed or we removed load.
3661 *
3662 * Since both these conditions indicate a changed cfs_rq->avg.load we should
3663 * call update_tg_load_avg() when this function returns true.
3d30544f 3664 */
a2c6c91f 3665static inline int
3a123bbb 3666update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
2dac754e 3667{
0e2d2aaa 3668 unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
9d89c257 3669 struct sched_avg *sa = &cfs_rq->avg;
2a2f5d4e 3670 int decayed = 0;
2dac754e 3671
2a2f5d4e
PZ
3672 if (cfs_rq->removed.nr) {
3673 unsigned long r;
9a2dd585 3674 u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
2a2f5d4e
PZ
3675
3676 raw_spin_lock(&cfs_rq->removed.lock);
3677 swap(cfs_rq->removed.util_avg, removed_util);
3678 swap(cfs_rq->removed.load_avg, removed_load);
0e2d2aaa 3679 swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
2a2f5d4e
PZ
3680 cfs_rq->removed.nr = 0;
3681 raw_spin_unlock(&cfs_rq->removed.lock);
3682
2a2f5d4e 3683 r = removed_load;
89741892 3684 sub_positive(&sa->load_avg, r);
9a2dd585 3685 sub_positive(&sa->load_sum, r * divider);
2dac754e 3686
2a2f5d4e 3687 r = removed_util;
89741892 3688 sub_positive(&sa->util_avg, r);
9a2dd585 3689 sub_positive(&sa->util_sum, r * divider);
2a2f5d4e 3690
0e2d2aaa 3691 add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
2a2f5d4e
PZ
3692
3693 decayed = 1;
9d89c257 3694 }
36ee28e4 3695
2a2f5d4e 3696 decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
36ee28e4 3697
9d89c257
YD
3698#ifndef CONFIG_64BIT
3699 smp_wmb();
3700 cfs_rq->load_last_update_time_copy = sa->last_update_time;
3701#endif
36ee28e4 3702
2a2f5d4e 3703 if (decayed)
a2c6c91f 3704 cfs_rq_util_change(cfs_rq);
21e96f88 3705
2a2f5d4e 3706 return decayed;
21e96f88
SM
3707}
3708
3d30544f
PZ
3709/**
3710 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
3711 * @cfs_rq: cfs_rq to attach to
3712 * @se: sched_entity to attach
3713 *
3714 * Must call update_cfs_rq_load_avg() before this, since we rely on
3715 * cfs_rq->avg.last_update_time being current.
3716 */
a05e8c51
BP
3717static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3718{
f207934f
PZ
3719 u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
3720
3721 /*
3722 * When we attach the @se to the @cfs_rq, we must align the decay
3723 * window because without that, really weird and wonderful things can
3724 * happen.
3725 *
3726 * XXX illustrate
3727 */
a05e8c51 3728 se->avg.last_update_time = cfs_rq->avg.last_update_time;
f207934f
PZ
3729 se->avg.period_contrib = cfs_rq->avg.period_contrib;
3730
3731 /*
3732 * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
3733 * period_contrib. This isn't strictly correct, but since we're
3734 * entirely outside of the PELT hierarchy, nobody cares if we truncate
3735 * _sum a little.
3736 */
3737 se->avg.util_sum = se->avg.util_avg * divider;
3738
3739 se->avg.load_sum = divider;
3740 if (se_weight(se)) {
3741 se->avg.load_sum =
3742 div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
3743 }
3744
3745 se->avg.runnable_load_sum = se->avg.load_sum;
3746
8d5b9025 3747 enqueue_load_avg(cfs_rq, se);
a05e8c51
BP
3748 cfs_rq->avg.util_avg += se->avg.util_avg;
3749 cfs_rq->avg.util_sum += se->avg.util_sum;
0e2d2aaa
PZ
3750
3751 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
a2c6c91f
SM
3752
3753 cfs_rq_util_change(cfs_rq);
a05e8c51
BP
3754}
3755
3d30544f
PZ
3756/**
3757 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
3758 * @cfs_rq: cfs_rq to detach from
3759 * @se: sched_entity to detach
3760 *
3761 * Must call update_cfs_rq_load_avg() before this, since we rely on
3762 * cfs_rq->avg.last_update_time being current.
3763 */
a05e8c51
BP
3764static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3765{
8d5b9025 3766 dequeue_load_avg(cfs_rq, se);
89741892
PZ
3767 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3768 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
0e2d2aaa
PZ
3769
3770 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
a2c6c91f
SM
3771
3772 cfs_rq_util_change(cfs_rq);
a05e8c51
BP
3773}
3774
b382a531
PZ
3775/*
3776 * Optional action to be done while updating the load average
3777 */
3778#define UPDATE_TG 0x1
3779#define SKIP_AGE_LOAD 0x2
3780#define DO_ATTACH 0x4
3781
3782/* Update task and its cfs_rq load average */
3783static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3784{
3785 u64 now = cfs_rq_clock_task(cfs_rq);
3786 struct rq *rq = rq_of(cfs_rq);
3787 int cpu = cpu_of(rq);
3788 int decayed;
3789
3790 /*
3791 * Track task load average for carrying it to new CPU after migrated, and
3792 * track group sched_entity load average for task_h_load calc in migration
3793 */
3794 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
3795 __update_load_avg_se(now, cpu, cfs_rq, se);
3796
3797 decayed = update_cfs_rq_load_avg(now, cfs_rq);
3798 decayed |= propagate_entity_load_avg(se);
3799
3800 if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
3801
3802 attach_entity_load_avg(cfs_rq, se);
3803 update_tg_load_avg(cfs_rq, 0);
3804
3805 } else if (decayed && (flags & UPDATE_TG))
3806 update_tg_load_avg(cfs_rq, 0);
3807}
3808
9d89c257 3809#ifndef CONFIG_64BIT
0905f04e
YD
3810static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3811{
9d89c257 3812 u64 last_update_time_copy;
0905f04e 3813 u64 last_update_time;
9ee474f5 3814
9d89c257
YD
3815 do {
3816 last_update_time_copy = cfs_rq->load_last_update_time_copy;
3817 smp_rmb();
3818 last_update_time = cfs_rq->avg.last_update_time;
3819 } while (last_update_time != last_update_time_copy);
0905f04e
YD
3820
3821 return last_update_time;
3822}
9d89c257 3823#else
0905f04e
YD
3824static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3825{
3826 return cfs_rq->avg.last_update_time;
3827}
9d89c257
YD
3828#endif
3829
104cb16d
MR
3830/*
3831 * Synchronize entity load avg of dequeued entity without locking
3832 * the previous rq.
3833 */
3834void sync_entity_load_avg(struct sched_entity *se)
3835{
3836 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3837 u64 last_update_time;
3838
3839 last_update_time = cfs_rq_last_update_time(cfs_rq);
0ccb977f 3840 __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
104cb16d
MR
3841}
3842
0905f04e
YD
3843/*
3844 * Task first catches up with cfs_rq, and then subtract
3845 * itself from the cfs_rq (task must be off the queue now).
3846 */
3847void remove_entity_load_avg(struct sched_entity *se)
3848{
3849 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2a2f5d4e 3850 unsigned long flags;
0905f04e
YD
3851
3852 /*
7dc603c9
PZ
3853 * tasks cannot exit without having gone through wake_up_new_task() ->
3854 * post_init_entity_util_avg() which will have added things to the
3855 * cfs_rq, so we can remove unconditionally.
3856 *
3857 * Similarly for groups, they will have passed through
3858 * post_init_entity_util_avg() before unregister_sched_fair_group()
3859 * calls this.
0905f04e 3860 */
0905f04e 3861
104cb16d 3862 sync_entity_load_avg(se);
2a2f5d4e
PZ
3863
3864 raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
3865 ++cfs_rq->removed.nr;
3866 cfs_rq->removed.util_avg += se->avg.util_avg;
3867 cfs_rq->removed.load_avg += se->avg.load_avg;
0e2d2aaa 3868 cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */
2a2f5d4e 3869 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
2dac754e 3870}
642dbc39 3871
7ea241af
YD
3872static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3873{
1ea6c46a 3874 return cfs_rq->avg.runnable_load_avg;
7ea241af
YD
3875}
3876
3877static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3878{
3879 return cfs_rq->avg.load_avg;
3880}
3881
46f69fa3 3882static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
6e83125c 3883
38033c37
PZ
3884#else /* CONFIG_SMP */
3885
01011473 3886static inline int
3a123bbb 3887update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
01011473
PZ
3888{
3889 return 0;
3890}
3891
d31b1a66
VG
3892#define UPDATE_TG 0x0
3893#define SKIP_AGE_LOAD 0x0
b382a531 3894#define DO_ATTACH 0x0
d31b1a66 3895
88c0616e 3896static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
536bd00c 3897{
88c0616e 3898 cfs_rq_util_change(cfs_rq);
536bd00c
RW
3899}
3900
9d89c257 3901static inline void remove_entity_load_avg(struct sched_entity *se) {}
6e83125c 3902
a05e8c51
BP
3903static inline void
3904attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3905static inline void
3906detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3907
46f69fa3 3908static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
6e83125c
PZ
3909{
3910 return 0;
3911}
3912
38033c37 3913#endif /* CONFIG_SMP */
9d85f21c 3914
ddc97297
PZ
3915static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3916{
3917#ifdef CONFIG_SCHED_DEBUG
3918 s64 d = se->vruntime - cfs_rq->min_vruntime;
3919
3920 if (d < 0)
3921 d = -d;
3922
3923 if (d > 3*sysctl_sched_latency)
ae92882e 3924 schedstat_inc(cfs_rq->nr_spread_over);
ddc97297
PZ
3925#endif
3926}
3927
aeb73b04
PZ
3928static void
3929place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
3930{
1af5f730 3931 u64 vruntime = cfs_rq->min_vruntime;
94dfb5e7 3932
2cb8600e
PZ
3933 /*
3934 * The 'current' period is already promised to the current tasks,
3935 * however the extra weight of the new task will slow them down a
3936 * little, place the new task so that it fits in the slot that
3937 * stays open at the end.
3938 */
94dfb5e7 3939 if (initial && sched_feat(START_DEBIT))
f9c0b095 3940 vruntime += sched_vslice(cfs_rq, se);
aeb73b04 3941
a2e7a7eb 3942 /* sleeps up to a single latency don't count. */
5ca9880c 3943 if (!initial) {
a2e7a7eb 3944 unsigned long thresh = sysctl_sched_latency;
a7be37ac 3945
a2e7a7eb
MG
3946 /*
3947 * Halve their sleep time's effect, to allow
3948 * for a gentler effect of sleepers:
3949 */
3950 if (sched_feat(GENTLE_FAIR_SLEEPERS))
3951 thresh >>= 1;
51e0304c 3952
a2e7a7eb 3953 vruntime -= thresh;
aeb73b04
PZ
3954 }
3955
b5d9d734 3956 /* ensure we never gain time by being placed backwards. */
16c8f1c7 3957 se->vruntime = max_vruntime(se->vruntime, vruntime);
aeb73b04
PZ
3958}
3959
d3d9dc33
PT
3960static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
3961
cb251765
MG
3962static inline void check_schedstat_required(void)
3963{
3964#ifdef CONFIG_SCHEDSTATS
3965 if (schedstat_enabled())
3966 return;
3967
3968 /* Force schedstat enabled if a dependent tracepoint is active */
3969 if (trace_sched_stat_wait_enabled() ||
3970 trace_sched_stat_sleep_enabled() ||
3971 trace_sched_stat_iowait_enabled() ||
3972 trace_sched_stat_blocked_enabled() ||
3973 trace_sched_stat_runtime_enabled()) {
eda8dca5 3974 printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
cb251765 3975 "stat_blocked and stat_runtime require the "
f67abed5 3976 "kernel parameter schedstats=enable or "
cb251765
MG
3977 "kernel.sched_schedstats=1\n");
3978 }
3979#endif
3980}
3981
b5179ac7
PZ
3982
3983/*
3984 * MIGRATION
3985 *
3986 * dequeue
3987 * update_curr()
3988 * update_min_vruntime()
3989 * vruntime -= min_vruntime
3990 *
3991 * enqueue
3992 * update_curr()
3993 * update_min_vruntime()
3994 * vruntime += min_vruntime
3995 *
3996 * this way the vruntime transition between RQs is done when both
3997 * min_vruntime are up-to-date.
3998 *
3999 * WAKEUP (remote)
4000 *
59efa0ba 4001 * ->migrate_task_rq_fair() (p->state == TASK_WAKING)
b5179ac7
PZ
4002 * vruntime -= min_vruntime
4003 *
4004 * enqueue
4005 * update_curr()
4006 * update_min_vruntime()
4007 * vruntime += min_vruntime
4008 *
4009 * this way we don't have the most up-to-date min_vruntime on the originating
4010 * CPU and an up-to-date min_vruntime on the destination CPU.
4011 */
4012
bf0f6f24 4013static void
88ec22d3 4014enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
bf0f6f24 4015{
2f950354
PZ
4016 bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
4017 bool curr = cfs_rq->curr == se;
4018
88ec22d3 4019 /*
2f950354
PZ
4020 * If we're the current task, we must renormalise before calling
4021 * update_curr().
88ec22d3 4022 */
2f950354 4023 if (renorm && curr)
88ec22d3
PZ
4024 se->vruntime += cfs_rq->min_vruntime;
4025
2f950354
PZ
4026 update_curr(cfs_rq);
4027
bf0f6f24 4028 /*
2f950354
PZ
4029 * Otherwise, renormalise after, such that we're placed at the current
4030 * moment in time, instead of some random moment in the past. Being
4031 * placed in the past could significantly boost this task to the
4032 * fairness detriment of existing tasks.
bf0f6f24 4033 */
2f950354
PZ
4034 if (renorm && !curr)
4035 se->vruntime += cfs_rq->min_vruntime;
4036
89ee048f
VG
4037 /*
4038 * When enqueuing a sched_entity, we must:
4039 * - Update loads to have both entity and cfs_rq synced with now.
4040 * - Add its load to cfs_rq->runnable_avg
4041 * - For group_entity, update its weight to reflect the new share of
4042 * its group cfs_rq
4043 * - Add its new weight to cfs_rq->load.weight
4044 */
b382a531 4045 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
1ea6c46a 4046 update_cfs_group(se);
b5b3e35f 4047 enqueue_runnable_load_avg(cfs_rq, se);
17bc14b7 4048 account_entity_enqueue(cfs_rq, se);
bf0f6f24 4049
1a3d027c 4050 if (flags & ENQUEUE_WAKEUP)
aeb73b04 4051 place_entity(cfs_rq, se, 0);
bf0f6f24 4052
cb251765 4053 check_schedstat_required();
4fa8d299
JP
4054 update_stats_enqueue(cfs_rq, se, flags);
4055 check_spread(cfs_rq, se);
2f950354 4056 if (!curr)
83b699ed 4057 __enqueue_entity(cfs_rq, se);
2069dd75 4058 se->on_rq = 1;
3d4b47b4 4059
d3d9dc33 4060 if (cfs_rq->nr_running == 1) {
3d4b47b4 4061 list_add_leaf_cfs_rq(cfs_rq);
d3d9dc33
PT
4062 check_enqueue_throttle(cfs_rq);
4063 }
bf0f6f24
IM
4064}
4065
2c13c919 4066static void __clear_buddies_last(struct sched_entity *se)
2002c695 4067{
2c13c919
RR
4068 for_each_sched_entity(se) {
4069 struct cfs_rq *cfs_rq = cfs_rq_of(se);
f1044799 4070 if (cfs_rq->last != se)
2c13c919 4071 break;
f1044799
PZ
4072
4073 cfs_rq->last = NULL;
2c13c919
RR
4074 }
4075}
2002c695 4076
2c13c919
RR
4077static void __clear_buddies_next(struct sched_entity *se)
4078{
4079 for_each_sched_entity(se) {
4080 struct cfs_rq *cfs_rq = cfs_rq_of(se);
f1044799 4081 if (cfs_rq->next != se)
2c13c919 4082 break;
f1044799
PZ
4083
4084 cfs_rq->next = NULL;
2c13c919 4085 }
2002c695
PZ
4086}
4087
ac53db59
RR
4088static void __clear_buddies_skip(struct sched_entity *se)
4089{
4090 for_each_sched_entity(se) {
4091 struct cfs_rq *cfs_rq = cfs_rq_of(se);
f1044799 4092 if (cfs_rq->skip != se)
ac53db59 4093 break;
f1044799
PZ
4094
4095 cfs_rq->skip = NULL;
ac53db59
RR
4096 }
4097}
4098
a571bbea
PZ
4099static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
4100{
2c13c919
RR
4101 if (cfs_rq->last == se)
4102 __clear_buddies_last(se);
4103
4104 if (cfs_rq->next == se)
4105 __clear_buddies_next(se);
ac53db59
RR
4106
4107 if (cfs_rq->skip == se)
4108 __clear_buddies_skip(se);
a571bbea
PZ
4109}
4110
6c16a6dc 4111static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
d8b4986d 4112
bf0f6f24 4113static void
371fd7e7 4114dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
bf0f6f24 4115{
a2a2d680
DA
4116 /*
4117 * Update run-time statistics of the 'current'.
4118 */
4119 update_curr(cfs_rq);
89ee048f
VG
4120
4121 /*
4122 * When dequeuing a sched_entity, we must:
4123 * - Update loads to have both entity and cfs_rq synced with now.
4124 * - Substract its load from the cfs_rq->runnable_avg.
4125 * - Substract its previous weight from cfs_rq->load.weight.
4126 * - For group entity, update its weight to reflect the new share
4127 * of its group cfs_rq.
4128 */
88c0616e 4129 update_load_avg(cfs_rq, se, UPDATE_TG);
b5b3e35f 4130 dequeue_runnable_load_avg(cfs_rq, se);
a2a2d680 4131
4fa8d299 4132 update_stats_dequeue(cfs_rq, se, flags);
67e9fb2a 4133
2002c695 4134 clear_buddies(cfs_rq, se);
4793241b 4135
83b699ed 4136 if (se != cfs_rq->curr)
30cfdcfc 4137 __dequeue_entity(cfs_rq, se);
17bc14b7 4138 se->on_rq = 0;
30cfdcfc 4139 account_entity_dequeue(cfs_rq, se);
88ec22d3
PZ
4140
4141 /*
b60205c7
PZ
4142 * Normalize after update_curr(); which will also have moved
4143 * min_vruntime if @se is the one holding it back. But before doing
4144 * update_min_vruntime() again, which will discount @se's position and
4145 * can move min_vruntime forward still more.
88ec22d3 4146 */
371fd7e7 4147 if (!(flags & DEQUEUE_SLEEP))
88ec22d3 4148 se->vruntime -= cfs_rq->min_vruntime;
1e876231 4149
d8b4986d
PT
4150 /* return excess runtime on last dequeue */
4151 return_cfs_rq_runtime(cfs_rq);
4152
1ea6c46a 4153 update_cfs_group(se);
b60205c7
PZ
4154
4155 /*
4156 * Now advance min_vruntime if @se was the entity holding it back,
4157 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
4158 * put back on, and if we advance min_vruntime, we'll be placed back
4159 * further than we started -- ie. we'll be penalized.
4160 */
4161 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
4162 update_min_vruntime(cfs_rq);
bf0f6f24
IM
4163}
4164
4165/*
4166 * Preempt the current task with a newly woken task if needed:
4167 */
7c92e54f 4168static void
2e09bf55 4169check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
bf0f6f24 4170{
11697830 4171 unsigned long ideal_runtime, delta_exec;
f4cfb33e
WX
4172 struct sched_entity *se;
4173 s64 delta;
11697830 4174
6d0f0ebd 4175 ideal_runtime = sched_slice(cfs_rq, curr);
11697830 4176 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
a9f3e2b5 4177 if (delta_exec > ideal_runtime) {
8875125e 4178 resched_curr(rq_of(cfs_rq));
a9f3e2b5
MG
4179 /*
4180 * The current task ran long enough, ensure it doesn't get
4181 * re-elected due to buddy favours.
4182 */
4183 clear_buddies(cfs_rq, curr);
f685ceac
MG
4184 return;
4185 }
4186
4187 /*
4188 * Ensure that a task that missed wakeup preemption by a
4189 * narrow margin doesn't have to wait for a full slice.
4190 * This also mitigates buddy induced latencies under load.
4191 */
f685ceac
MG
4192 if (delta_exec < sysctl_sched_min_granularity)
4193 return;
4194
f4cfb33e
WX
4195 se = __pick_first_entity(cfs_rq);
4196 delta = curr->vruntime - se->vruntime;
f685ceac 4197
f4cfb33e
WX
4198 if (delta < 0)
4199 return;
d7d82944 4200
f4cfb33e 4201 if (delta > ideal_runtime)
8875125e 4202 resched_curr(rq_of(cfs_rq));
bf0f6f24
IM
4203}
4204
83b699ed 4205static void
8494f412 4206set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 4207{
83b699ed
SV
4208 /* 'current' is not kept within the tree. */
4209 if (se->on_rq) {
4210 /*
4211 * Any task has to be enqueued before it get to execute on
4212 * a CPU. So account for the time it spent waiting on the
4213 * runqueue.
4214 */
4fa8d299 4215 update_stats_wait_end(cfs_rq, se);
83b699ed 4216 __dequeue_entity(cfs_rq, se);
88c0616e 4217 update_load_avg(cfs_rq, se, UPDATE_TG);
83b699ed
SV
4218 }
4219
79303e9e 4220 update_stats_curr_start(cfs_rq, se);
429d43bc 4221 cfs_rq->curr = se;
4fa8d299 4222
eba1ed4b
IM
4223 /*
4224 * Track our maximum slice length, if the CPU's load is at
4225 * least twice that of our own weight (i.e. dont track it
4226 * when there are only lesser-weight tasks around):
4227 */
cb251765 4228 if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
4fa8d299
JP
4229 schedstat_set(se->statistics.slice_max,
4230 max((u64)schedstat_val(se->statistics.slice_max),
4231 se->sum_exec_runtime - se->prev_sum_exec_runtime));
eba1ed4b 4232 }
4fa8d299 4233
4a55b450 4234 se->prev_sum_exec_runtime = se->sum_exec_runtime;
bf0f6f24
IM
4235}
4236
3f3a4904
PZ
4237static int
4238wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
4239
ac53db59
RR
4240/*
4241 * Pick the next process, keeping these things in mind, in this order:
4242 * 1) keep things fair between processes/task groups
4243 * 2) pick the "next" process, since someone really wants that to run
4244 * 3) pick the "last" process, for cache locality
4245 * 4) do not run the "skip" process, if something else is available
4246 */
678d5718
PZ
4247static struct sched_entity *
4248pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
aa2ac252 4249{
678d5718
PZ
4250 struct sched_entity *left = __pick_first_entity(cfs_rq);
4251 struct sched_entity *se;
4252
4253 /*
4254 * If curr is set we have to see if its left of the leftmost entity
4255 * still in the tree, provided there was anything in the tree at all.
4256 */
4257 if (!left || (curr && entity_before(curr, left)))
4258 left = curr;
4259
4260 se = left; /* ideally we run the leftmost entity */
f4b6755f 4261
ac53db59
RR
4262 /*
4263 * Avoid running the skip buddy, if running something else can
4264 * be done without getting too unfair.
4265 */
4266 if (cfs_rq->skip == se) {
678d5718
PZ
4267 struct sched_entity *second;
4268
4269 if (se == curr) {
4270 second = __pick_first_entity(cfs_rq);
4271 } else {
4272 second = __pick_next_entity(se);
4273 if (!second || (curr && entity_before(curr, second)))
4274 second = curr;
4275 }
4276
ac53db59
RR
4277 if (second && wakeup_preempt_entity(second, left) < 1)
4278 se = second;
4279 }
aa2ac252 4280
f685ceac
MG
4281 /*
4282 * Prefer last buddy, try to return the CPU to a preempted task.
4283 */
4284 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
4285 se = cfs_rq->last;
4286
ac53db59
RR
4287 /*
4288 * Someone really wants this to run. If it's not unfair, run it.
4289 */
4290 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
4291 se = cfs_rq->next;
4292
f685ceac 4293 clear_buddies(cfs_rq, se);
4793241b
PZ
4294
4295 return se;
aa2ac252
PZ
4296}
4297
678d5718 4298static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
d3d9dc33 4299
ab6cde26 4300static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
bf0f6f24
IM
4301{
4302 /*
4303 * If still on the runqueue then deactivate_task()
4304 * was not called and update_curr() has to be done:
4305 */
4306 if (prev->on_rq)
b7cc0896 4307 update_curr(cfs_rq);
bf0f6f24 4308
d3d9dc33
PT
4309 /* throttle cfs_rqs exceeding runtime */
4310 check_cfs_rq_runtime(cfs_rq);
4311
4fa8d299 4312 check_spread(cfs_rq, prev);
cb251765 4313
30cfdcfc 4314 if (prev->on_rq) {
4fa8d299 4315 update_stats_wait_start(cfs_rq, prev);
30cfdcfc
DA
4316 /* Put 'current' back into the tree. */
4317 __enqueue_entity(cfs_rq, prev);
9d85f21c 4318 /* in !on_rq case, update occurred at dequeue */
88c0616e 4319 update_load_avg(cfs_rq, prev, 0);
30cfdcfc 4320 }
429d43bc 4321 cfs_rq->curr = NULL;
bf0f6f24
IM
4322}
4323
8f4d37ec
PZ
4324static void
4325entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
bf0f6f24 4326{
bf0f6f24 4327 /*
30cfdcfc 4328 * Update run-time statistics of the 'current'.
bf0f6f24 4329 */
30cfdcfc 4330 update_curr(cfs_rq);
bf0f6f24 4331
9d85f21c
PT
4332 /*
4333 * Ensure that runnable average is periodically updated.
4334 */
88c0616e 4335 update_load_avg(cfs_rq, curr, UPDATE_TG);
1ea6c46a 4336 update_cfs_group(curr);
9d85f21c 4337
8f4d37ec
PZ
4338#ifdef CONFIG_SCHED_HRTICK
4339 /*
4340 * queued ticks are scheduled to match the slice, so don't bother
4341 * validating it and just reschedule.
4342 */
983ed7a6 4343 if (queued) {
8875125e 4344 resched_curr(rq_of(cfs_rq));
983ed7a6
HH
4345 return;
4346 }
8f4d37ec
PZ
4347 /*
4348 * don't let the period tick interfere with the hrtick preemption
4349 */
4350 if (!sched_feat(DOUBLE_TICK) &&
4351 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4352 return;
4353#endif
4354
2c2efaed 4355 if (cfs_rq->nr_running > 1)
2e09bf55 4356 check_preempt_tick(cfs_rq, curr);
bf0f6f24
IM
4357}
4358
ab84d31e
PT
4359
4360/**************************************************
4361 * CFS bandwidth control machinery
4362 */
4363
4364#ifdef CONFIG_CFS_BANDWIDTH
029632fb
PZ
4365
4366#ifdef HAVE_JUMP_LABEL
c5905afb 4367static struct static_key __cfs_bandwidth_used;
029632fb
PZ
4368
4369static inline bool cfs_bandwidth_used(void)
4370{
c5905afb 4371 return static_key_false(&__cfs_bandwidth_used);
029632fb
PZ
4372}
4373
1ee14e6c 4374void cfs_bandwidth_usage_inc(void)
029632fb 4375{
ce48c146 4376 static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
1ee14e6c
BS
4377}
4378
4379void cfs_bandwidth_usage_dec(void)
4380{
ce48c146 4381 static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
029632fb
PZ
4382}
4383#else /* HAVE_JUMP_LABEL */
4384static bool cfs_bandwidth_used(void)
4385{
4386 return true;
4387}
4388
1ee14e6c
BS
4389void cfs_bandwidth_usage_inc(void) {}
4390void cfs_bandwidth_usage_dec(void) {}
029632fb
PZ
4391#endif /* HAVE_JUMP_LABEL */
4392
ab84d31e
PT
4393/*
4394 * default period for cfs group bandwidth.
4395 * default: 0.1s, units: nanoseconds
4396 */
4397static inline u64 default_cfs_period(void)
4398{
4399 return 100000000ULL;
4400}
ec12cb7f
PT
4401
4402static inline u64 sched_cfs_bandwidth_slice(void)
4403{
4404 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
4405}
4406
a9cf55b2
PT
4407/*
4408 * Replenish runtime according to assigned quota and update expiration time.
4409 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
4410 * additional synchronization around rq->lock.
4411 *
4412 * requires cfs_b->lock
4413 */
029632fb 4414void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
a9cf55b2
PT
4415{
4416 u64 now;
4417
4418 if (cfs_b->quota == RUNTIME_INF)
4419 return;
4420
4421 now = sched_clock_cpu(smp_processor_id());
4422 cfs_b->runtime = cfs_b->quota;
4423 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
4424}
4425
029632fb
PZ
4426static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4427{
4428 return &tg->cfs_bandwidth;
4429}
4430
f1b17280
PT
4431/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
4432static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4433{
4434 if (unlikely(cfs_rq->throttle_count))
1a99ae3f 4435 return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
f1b17280 4436
78becc27 4437 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
f1b17280
PT
4438}
4439
85dac906
PT
4440/* returns 0 on failure to allocate runtime */
4441static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
ec12cb7f
PT
4442{
4443 struct task_group *tg = cfs_rq->tg;
4444 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
a9cf55b2 4445 u64 amount = 0, min_amount, expires;
ec12cb7f
PT
4446
4447 /* note: this is a positive sum as runtime_remaining <= 0 */
4448 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
4449
4450 raw_spin_lock(&cfs_b->lock);
4451 if (cfs_b->quota == RUNTIME_INF)
4452 amount = min_amount;
58088ad0 4453 else {
77a4d1a1 4454 start_cfs_bandwidth(cfs_b);
58088ad0
PT
4455
4456 if (cfs_b->runtime > 0) {
4457 amount = min(cfs_b->runtime, min_amount);
4458 cfs_b->runtime -= amount;
4459 cfs_b->idle = 0;
4460 }
ec12cb7f 4461 }
a9cf55b2 4462 expires = cfs_b->runtime_expires;
ec12cb7f
PT
4463 raw_spin_unlock(&cfs_b->lock);
4464
4465 cfs_rq->runtime_remaining += amount;
a9cf55b2
PT
4466 /*
4467 * we may have advanced our local expiration to account for allowed
4468 * spread between our sched_clock and the one on which runtime was
4469 * issued.
4470 */
4471 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
4472 cfs_rq->runtime_expires = expires;
85dac906
PT
4473
4474 return cfs_rq->runtime_remaining > 0;
ec12cb7f
PT
4475}
4476
a9cf55b2
PT
4477/*
4478 * Note: This depends on the synchronization provided by sched_clock and the
4479 * fact that rq->clock snapshots this value.
4480 */
4481static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
ec12cb7f 4482{
a9cf55b2 4483 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
a9cf55b2
PT
4484
4485 /* if the deadline is ahead of our clock, nothing to do */
78becc27 4486 if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
ec12cb7f
PT
4487 return;
4488
a9cf55b2
PT
4489 if (cfs_rq->runtime_remaining < 0)
4490 return;
4491
4492 /*
4493 * If the local deadline has passed we have to consider the
4494 * possibility that our sched_clock is 'fast' and the global deadline
4495 * has not truly expired.
4496 *
4497 * Fortunately we can check determine whether this the case by checking
51f2176d
BS
4498 * whether the global deadline has advanced. It is valid to compare
4499 * cfs_b->runtime_expires without any locks since we only care about
4500 * exact equality, so a partial write will still work.
a9cf55b2
PT
4501 */
4502
51f2176d 4503 if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
a9cf55b2
PT
4504 /* extend local deadline, drift is bounded above by 2 ticks */
4505 cfs_rq->runtime_expires += TICK_NSEC;
4506 } else {
4507 /* global deadline is ahead, expiration has passed */
4508 cfs_rq->runtime_remaining = 0;
4509 }
4510}
4511
9dbdb155 4512static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
a9cf55b2
PT
4513{
4514 /* dock delta_exec before expiring quota (as it could span periods) */
ec12cb7f 4515 cfs_rq->runtime_remaining -= delta_exec;
a9cf55b2
PT
4516 expire_cfs_rq_runtime(cfs_rq);
4517
4518 if (likely(cfs_rq->runtime_remaining > 0))
ec12cb7f
PT
4519 return;
4520
85dac906
PT
4521 /*
4522 * if we're unable to extend our runtime we resched so that the active
4523 * hierarchy can be throttled
4524 */
4525 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
8875125e 4526 resched_curr(rq_of(cfs_rq));
ec12cb7f
PT
4527}
4528
6c16a6dc 4529static __always_inline
9dbdb155 4530void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
ec12cb7f 4531{
56f570e5 4532 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
ec12cb7f
PT
4533 return;
4534
4535 __account_cfs_rq_runtime(cfs_rq, delta_exec);
4536}
4537
85dac906
PT
4538static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4539{
56f570e5 4540 return cfs_bandwidth_used() && cfs_rq->throttled;
85dac906
PT
4541}
4542
64660c86
PT
4543/* check whether cfs_rq, or any parent, is throttled */
4544static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4545{
56f570e5 4546 return cfs_bandwidth_used() && cfs_rq->throttle_count;
64660c86
PT
4547}
4548
4549/*
4550 * Ensure that neither of the group entities corresponding to src_cpu or
4551 * dest_cpu are members of a throttled hierarchy when performing group
4552 * load-balance operations.
4553 */
4554static inline int throttled_lb_pair(struct task_group *tg,
4555 int src_cpu, int dest_cpu)
4556{
4557 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
4558
4559 src_cfs_rq = tg->cfs_rq[src_cpu];
4560 dest_cfs_rq = tg->cfs_rq[dest_cpu];
4561
4562 return throttled_hierarchy(src_cfs_rq) ||
4563 throttled_hierarchy(dest_cfs_rq);
4564}
4565
4566/* updated child weight may affect parent so we have to do this bottom up */
4567static int tg_unthrottle_up(struct task_group *tg, void *data)
4568{
4569 struct rq *rq = data;
4570 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4571
4572 cfs_rq->throttle_count--;
64660c86 4573 if (!cfs_rq->throttle_count) {
f1b17280 4574 /* adjust cfs_rq_clock_task() */
78becc27 4575 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
f1b17280 4576 cfs_rq->throttled_clock_task;
64660c86 4577 }
64660c86
PT
4578
4579 return 0;
4580}
4581
4582static int tg_throttle_down(struct task_group *tg, void *data)
4583{
4584 struct rq *rq = data;
4585 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4586
82958366
PT
4587 /* group is entering throttled state, stop time */
4588 if (!cfs_rq->throttle_count)
78becc27 4589 cfs_rq->throttled_clock_task = rq_clock_task(rq);
64660c86
PT
4590 cfs_rq->throttle_count++;
4591
4592 return 0;
4593}
4594
d3d9dc33 4595static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
85dac906
PT
4596{
4597 struct rq *rq = rq_of(cfs_rq);
4598 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4599 struct sched_entity *se;
4600 long task_delta, dequeue = 1;
77a4d1a1 4601 bool empty;
85dac906
PT
4602
4603 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
4604
f1b17280 4605 /* freeze hierarchy runnable averages while throttled */
64660c86
PT
4606 rcu_read_lock();
4607 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
4608 rcu_read_unlock();
85dac906
PT
4609
4610 task_delta = cfs_rq->h_nr_running;
4611 for_each_sched_entity(se) {
4612 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4613 /* throttled entity or throttle-on-deactivate */
4614 if (!se->on_rq)
4615 break;
4616
4617 if (dequeue)
4618 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
4619 qcfs_rq->h_nr_running -= task_delta;
4620
4621 if (qcfs_rq->load.weight)
4622 dequeue = 0;
4623 }
4624
4625 if (!se)
72465447 4626 sub_nr_running(rq, task_delta);
85dac906
PT
4627
4628 cfs_rq->throttled = 1;
78becc27 4629 cfs_rq->throttled_clock = rq_clock(rq);
85dac906 4630 raw_spin_lock(&cfs_b->lock);
d49db342 4631 empty = list_empty(&cfs_b->throttled_cfs_rq);
77a4d1a1 4632
c06f04c7
BS
4633 /*
4634 * Add to the _head_ of the list, so that an already-started
4635 * distribute_cfs_runtime will not see us
4636 */
4637 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
77a4d1a1
PZ
4638
4639 /*
4640 * If we're the first throttled task, make sure the bandwidth
4641 * timer is running.
4642 */
4643 if (empty)
4644 start_cfs_bandwidth(cfs_b);
4645
85dac906
PT
4646 raw_spin_unlock(&cfs_b->lock);
4647}
4648
029632fb 4649void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
671fd9da
PT
4650{
4651 struct rq *rq = rq_of(cfs_rq);
4652 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4653 struct sched_entity *se;
4654 int enqueue = 1;
4655 long task_delta;
4656
22b958d8 4657 se = cfs_rq->tg->se[cpu_of(rq)];
671fd9da
PT
4658
4659 cfs_rq->throttled = 0;
1a55af2e
FW
4660
4661 update_rq_clock(rq);
4662
671fd9da 4663 raw_spin_lock(&cfs_b->lock);
78becc27 4664 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
671fd9da
PT
4665 list_del_rcu(&cfs_rq->throttled_list);
4666 raw_spin_unlock(&cfs_b->lock);
4667
64660c86
PT
4668 /* update hierarchical throttle state */
4669 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
4670
671fd9da
PT
4671 if (!cfs_rq->load.weight)
4672 return;
4673
4674 task_delta = cfs_rq->h_nr_running;
4675 for_each_sched_entity(se) {
4676 if (se->on_rq)
4677 enqueue = 0;
4678
4679 cfs_rq = cfs_rq_of(se);
4680 if (enqueue)
4681 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4682 cfs_rq->h_nr_running += task_delta;
4683
4684 if (cfs_rq_throttled(cfs_rq))
4685 break;
4686 }
4687
4688 if (!se)
72465447 4689 add_nr_running(rq, task_delta);
671fd9da
PT
4690
4691 /* determine whether we need to wake up potentially idle cpu */
4692 if (rq->curr == rq->idle && rq->cfs.nr_running)
8875125e 4693 resched_curr(rq);
671fd9da
PT
4694}
4695
4696static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
4697 u64 remaining, u64 expires)
4698{
4699 struct cfs_rq *cfs_rq;
c06f04c7
BS
4700 u64 runtime;
4701 u64 starting_runtime = remaining;
671fd9da
PT
4702
4703 rcu_read_lock();
4704 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
4705 throttled_list) {
4706 struct rq *rq = rq_of(cfs_rq);
8a8c69c3 4707 struct rq_flags rf;
671fd9da 4708
8a8c69c3 4709 rq_lock(rq, &rf);
671fd9da
PT
4710 if (!cfs_rq_throttled(cfs_rq))
4711 goto next;
4712
4713 runtime = -cfs_rq->runtime_remaining + 1;
4714 if (runtime > remaining)
4715 runtime = remaining;
4716 remaining -= runtime;
4717
4718 cfs_rq->runtime_remaining += runtime;
4719 cfs_rq->runtime_expires = expires;
4720
4721 /* we check whether we're throttled above */
4722 if (cfs_rq->runtime_remaining > 0)
4723 unthrottle_cfs_rq(cfs_rq);
4724
4725next:
8a8c69c3 4726 rq_unlock(rq, &rf);
671fd9da
PT
4727
4728 if (!remaining)
4729 break;
4730 }
4731 rcu_read_unlock();
4732
c06f04c7 4733 return starting_runtime - remaining;
671fd9da
PT
4734}
4735
58088ad0
PT
4736/*
4737 * Responsible for refilling a task_group's bandwidth and unthrottling its
4738 * cfs_rqs as appropriate. If there has been no activity within the last
4739 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
4740 * used to track this state.
4741 */
4742static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
4743{
671fd9da 4744 u64 runtime, runtime_expires;
51f2176d 4745 int throttled;
58088ad0 4746
58088ad0
PT
4747 /* no need to continue the timer with no bandwidth constraint */
4748 if (cfs_b->quota == RUNTIME_INF)
51f2176d 4749 goto out_deactivate;
58088ad0 4750
671fd9da 4751 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
e8da1b18 4752 cfs_b->nr_periods += overrun;
671fd9da 4753
51f2176d
BS
4754 /*
4755 * idle depends on !throttled (for the case of a large deficit), and if
4756 * we're going inactive then everything else can be deferred
4757 */
4758 if (cfs_b->idle && !throttled)
4759 goto out_deactivate;
a9cf55b2
PT
4760
4761 __refill_cfs_bandwidth_runtime(cfs_b);
4762
671fd9da
PT
4763 if (!throttled) {
4764 /* mark as potentially idle for the upcoming period */
4765 cfs_b->idle = 1;
51f2176d 4766 return 0;
671fd9da
PT
4767 }
4768
e8da1b18
NR
4769 /* account preceding periods in which throttling occurred */
4770 cfs_b->nr_throttled += overrun;
4771
671fd9da 4772 runtime_expires = cfs_b->runtime_expires;
671fd9da
PT
4773
4774 /*
c06f04c7
BS
4775 * This check is repeated as we are holding onto the new bandwidth while
4776 * we unthrottle. This can potentially race with an unthrottled group
4777 * trying to acquire new bandwidth from the global pool. This can result
4778 * in us over-using our runtime if it is all used during this loop, but
4779 * only by limited amounts in that extreme case.
671fd9da 4780 */
c06f04c7
BS
4781 while (throttled && cfs_b->runtime > 0) {
4782 runtime = cfs_b->runtime;
671fd9da
PT
4783 raw_spin_unlock(&cfs_b->lock);
4784 /* we can't nest cfs_b->lock while distributing bandwidth */
4785 runtime = distribute_cfs_runtime(cfs_b, runtime,
4786 runtime_expires);
4787 raw_spin_lock(&cfs_b->lock);
4788
4789 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
c06f04c7
BS
4790
4791 cfs_b->runtime -= min(runtime, cfs_b->runtime);
671fd9da 4792 }
58088ad0 4793
671fd9da
PT
4794 /*
4795 * While we are ensured activity in the period following an
4796 * unthrottle, this also covers the case in which the new bandwidth is
4797 * insufficient to cover the existing bandwidth deficit. (Forcing the
4798 * timer to remain active while there are any throttled entities.)
4799 */
4800 cfs_b->idle = 0;
58088ad0 4801
51f2176d
BS
4802 return 0;
4803
4804out_deactivate:
51f2176d 4805 return 1;
58088ad0 4806}
d3d9dc33 4807
d8b4986d
PT
4808/* a cfs_rq won't donate quota below this amount */
4809static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
4810/* minimum remaining period time to redistribute slack quota */
4811static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
4812/* how long we wait to gather additional slack before distributing */
4813static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
4814
db06e78c
BS
4815/*
4816 * Are we near the end of the current quota period?
4817 *
4818 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
4961b6e1 4819 * hrtimer base being cleared by hrtimer_start. In the case of
db06e78c
BS
4820 * migrate_hrtimers, base is never cleared, so we are fine.
4821 */
d8b4986d
PT
4822static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
4823{
4824 struct hrtimer *refresh_timer = &cfs_b->period_timer;
4825 u64 remaining;
4826
4827 /* if the call-back is running a quota refresh is already occurring */
4828 if (hrtimer_callback_running(refresh_timer))
4829 return 1;
4830
4831 /* is a quota refresh about to occur? */
4832 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
4833 if (remaining < min_expire)
4834 return 1;
4835
4836 return 0;
4837}
4838
4839static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
4840{
4841 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
4842
4843 /* if there's a quota refresh soon don't bother with slack */
4844 if (runtime_refresh_within(cfs_b, min_left))
4845 return;
4846
4cfafd30
PZ
4847 hrtimer_start(&cfs_b->slack_timer,
4848 ns_to_ktime(cfs_bandwidth_slack_period),
4849 HRTIMER_MODE_REL);
d8b4986d
PT
4850}
4851
4852/* we know any runtime found here is valid as update_curr() precedes return */
4853static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4854{
4855 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4856 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
4857
4858 if (slack_runtime <= 0)
4859 return;
4860
4861 raw_spin_lock(&cfs_b->lock);
4862 if (cfs_b->quota != RUNTIME_INF &&
4863 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
4864 cfs_b->runtime += slack_runtime;
4865
4866 /* we are under rq->lock, defer unthrottling using a timer */
4867 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
4868 !list_empty(&cfs_b->throttled_cfs_rq))
4869 start_cfs_slack_bandwidth(cfs_b);
4870 }
4871 raw_spin_unlock(&cfs_b->lock);
4872
4873 /* even if it's not valid for return we don't want to try again */
4874 cfs_rq->runtime_remaining -= slack_runtime;
4875}
4876
4877static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4878{
56f570e5
PT
4879 if (!cfs_bandwidth_used())
4880 return;
4881
fccfdc6f 4882 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
d8b4986d
PT
4883 return;
4884
4885 __return_cfs_rq_runtime(cfs_rq);
4886}
4887
4888/*
4889 * This is done with a timer (instead of inline with bandwidth return) since
4890 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
4891 */
4892static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4893{
4894 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
4895 u64 expires;
4896
4897 /* confirm we're still not at a refresh boundary */
db06e78c
BS
4898 raw_spin_lock(&cfs_b->lock);
4899 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
4900 raw_spin_unlock(&cfs_b->lock);
d8b4986d 4901 return;
db06e78c 4902 }
d8b4986d 4903
c06f04c7 4904 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
d8b4986d 4905 runtime = cfs_b->runtime;
c06f04c7 4906
d8b4986d
PT
4907 expires = cfs_b->runtime_expires;
4908 raw_spin_unlock(&cfs_b->lock);
4909
4910 if (!runtime)
4911 return;
4912
4913 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
4914
4915 raw_spin_lock(&cfs_b->lock);
4916 if (expires == cfs_b->runtime_expires)
c06f04c7 4917 cfs_b->runtime -= min(runtime, cfs_b->runtime);
d8b4986d
PT
4918 raw_spin_unlock(&cfs_b->lock);
4919}
4920
d3d9dc33
PT
4921/*
4922 * When a group wakes up we want to make sure that its quota is not already
4923 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
4924 * runtime as update_curr() throttling can not not trigger until it's on-rq.
4925 */
4926static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
4927{
56f570e5
PT
4928 if (!cfs_bandwidth_used())
4929 return;
4930
d3d9dc33
PT
4931 /* an active group must be handled by the update_curr()->put() path */
4932 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
4933 return;
4934
4935 /* ensure the group is not already throttled */
4936 if (cfs_rq_throttled(cfs_rq))
4937 return;
4938
4939 /* update runtime allocation */
4940 account_cfs_rq_runtime(cfs_rq, 0);
4941 if (cfs_rq->runtime_remaining <= 0)
4942 throttle_cfs_rq(cfs_rq);
4943}
4944
55e16d30
PZ
4945static void sync_throttle(struct task_group *tg, int cpu)
4946{
4947 struct cfs_rq *pcfs_rq, *cfs_rq;
4948
4949 if (!cfs_bandwidth_used())
4950 return;
4951
4952 if (!tg->parent)
4953 return;
4954
4955 cfs_rq = tg->cfs_rq[cpu];
4956 pcfs_rq = tg->parent->cfs_rq[cpu];
4957
4958 cfs_rq->throttle_count = pcfs_rq->throttle_count;
b8922125 4959 cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
55e16d30
PZ
4960}
4961
d3d9dc33 4962/* conditionally throttle active cfs_rq's from put_prev_entity() */
678d5718 4963static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
d3d9dc33 4964{
56f570e5 4965 if (!cfs_bandwidth_used())
678d5718 4966 return false;
56f570e5 4967
d3d9dc33 4968 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
678d5718 4969 return false;
d3d9dc33
PT
4970
4971 /*
4972 * it's possible for a throttled entity to be forced into a running
4973 * state (e.g. set_curr_task), in this case we're finished.
4974 */
4975 if (cfs_rq_throttled(cfs_rq))
678d5718 4976 return true;
d3d9dc33
PT
4977
4978 throttle_cfs_rq(cfs_rq);
678d5718 4979 return true;
d3d9dc33 4980}
029632fb 4981
029632fb
PZ
4982static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
4983{
4984 struct cfs_bandwidth *cfs_b =
4985 container_of(timer, struct cfs_bandwidth, slack_timer);
77a4d1a1 4986
029632fb
PZ
4987 do_sched_cfs_slack_timer(cfs_b);
4988
4989 return HRTIMER_NORESTART;
4990}
4991
4992static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4993{
4994 struct cfs_bandwidth *cfs_b =
4995 container_of(timer, struct cfs_bandwidth, period_timer);
029632fb
PZ
4996 int overrun;
4997 int idle = 0;
4998
51f2176d 4999 raw_spin_lock(&cfs_b->lock);
029632fb 5000 for (;;) {
77a4d1a1 5001 overrun = hrtimer_forward_now(timer, cfs_b->period);
029632fb
PZ
5002 if (!overrun)
5003 break;
5004
5005 idle = do_sched_cfs_period_timer(cfs_b, overrun);
5006 }
4cfafd30
PZ
5007 if (idle)
5008 cfs_b->period_active = 0;
51f2176d 5009 raw_spin_unlock(&cfs_b->lock);
029632fb
PZ
5010
5011 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
5012}
5013
5014void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5015{
5016 raw_spin_lock_init(&cfs_b->lock);
5017 cfs_b->runtime = 0;
5018 cfs_b->quota = RUNTIME_INF;
5019 cfs_b->period = ns_to_ktime(default_cfs_period());
5020
5021 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
4cfafd30 5022 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
029632fb
PZ
5023 cfs_b->period_timer.function = sched_cfs_period_timer;
5024 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5025 cfs_b->slack_timer.function = sched_cfs_slack_timer;
5026}
5027
5028static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5029{
5030 cfs_rq->runtime_enabled = 0;
5031 INIT_LIST_HEAD(&cfs_rq->throttled_list);
5032}
5033
77a4d1a1 5034void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
029632fb 5035{
4cfafd30 5036 lockdep_assert_held(&cfs_b->lock);
029632fb 5037
4cfafd30
PZ
5038 if (!cfs_b->period_active) {
5039 cfs_b->period_active = 1;
5040 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
5041 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
5042 }
029632fb
PZ
5043}
5044
5045static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5046{
7f1a169b
TH
5047 /* init_cfs_bandwidth() was not called */
5048 if (!cfs_b->throttled_cfs_rq.next)
5049 return;
5050
029632fb
PZ
5051 hrtimer_cancel(&cfs_b->period_timer);
5052 hrtimer_cancel(&cfs_b->slack_timer);
5053}
5054
502ce005
PZ
5055/*
5056 * Both these cpu hotplug callbacks race against unregister_fair_sched_group()
5057 *
5058 * The race is harmless, since modifying bandwidth settings of unhooked group
5059 * bits doesn't do much.
5060 */
5061
5062/* cpu online calback */
0e59bdae
KT
5063static void __maybe_unused update_runtime_enabled(struct rq *rq)
5064{
502ce005 5065 struct task_group *tg;
0e59bdae 5066
502ce005
PZ
5067 lockdep_assert_held(&rq->lock);
5068
5069 rcu_read_lock();
5070 list_for_each_entry_rcu(tg, &task_groups, list) {
5071 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
5072 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
0e59bdae
KT
5073
5074 raw_spin_lock(&cfs_b->lock);
5075 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
5076 raw_spin_unlock(&cfs_b->lock);
5077 }
502ce005 5078 rcu_read_unlock();
0e59bdae
KT
5079}
5080
502ce005 5081/* cpu offline callback */
38dc3348 5082static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
029632fb 5083{
502ce005
PZ
5084 struct task_group *tg;
5085
5086 lockdep_assert_held(&rq->lock);
5087
5088 rcu_read_lock();
5089 list_for_each_entry_rcu(tg, &task_groups, list) {
5090 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
029632fb 5091
029632fb
PZ
5092 if (!cfs_rq->runtime_enabled)
5093 continue;
5094
5095 /*
5096 * clock_task is not advancing so we just need to make sure
5097 * there's some valid quota amount
5098 */
51f2176d 5099 cfs_rq->runtime_remaining = 1;
0e59bdae
KT
5100 /*
5101 * Offline rq is schedulable till cpu is completely disabled
5102 * in take_cpu_down(), so we prevent new cfs throttling here.
5103 */
5104 cfs_rq->runtime_enabled = 0;
5105
029632fb
PZ
5106 if (cfs_rq_throttled(cfs_rq))
5107 unthrottle_cfs_rq(cfs_rq);
5108 }
502ce005 5109 rcu_read_unlock();
029632fb
PZ
5110}
5111
5112#else /* CONFIG_CFS_BANDWIDTH */
f1b17280
PT
5113static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
5114{
78becc27 5115 return rq_clock_task(rq_of(cfs_rq));
f1b17280
PT
5116}
5117
9dbdb155 5118static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
678d5718 5119static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
d3d9dc33 5120static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
55e16d30 5121static inline void sync_throttle(struct task_group *tg, int cpu) {}
6c16a6dc 5122static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
85dac906
PT
5123
5124static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5125{
5126 return 0;
5127}
64660c86
PT
5128
5129static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5130{
5131 return 0;
5132}
5133
5134static inline int throttled_lb_pair(struct task_group *tg,
5135 int src_cpu, int dest_cpu)
5136{
5137 return 0;
5138}
029632fb
PZ
5139
5140void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5141
5142#ifdef CONFIG_FAIR_GROUP_SCHED
5143static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
ab84d31e
PT
5144#endif
5145
029632fb
PZ
5146static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5147{
5148 return NULL;
5149}
5150static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
0e59bdae 5151static inline void update_runtime_enabled(struct rq *rq) {}
a4c96ae3 5152static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
029632fb
PZ
5153
5154#endif /* CONFIG_CFS_BANDWIDTH */
5155
bf0f6f24
IM
5156/**************************************************
5157 * CFS operations on tasks:
5158 */
5159
8f4d37ec
PZ
5160#ifdef CONFIG_SCHED_HRTICK
5161static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
5162{
8f4d37ec
PZ
5163 struct sched_entity *se = &p->se;
5164 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5165
9148a3a1 5166 SCHED_WARN_ON(task_rq(p) != rq);
8f4d37ec 5167
8bf46a39 5168 if (rq->cfs.h_nr_running > 1) {
8f4d37ec
PZ
5169 u64 slice = sched_slice(cfs_rq, se);
5170 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
5171 s64 delta = slice - ran;
5172
5173 if (delta < 0) {
5174 if (rq->curr == p)
8875125e 5175 resched_curr(rq);
8f4d37ec
PZ
5176 return;
5177 }
31656519 5178 hrtick_start(rq, delta);
8f4d37ec
PZ
5179 }
5180}
a4c2f00f
PZ
5181
5182/*
5183 * called from enqueue/dequeue and updates the hrtick when the
5184 * current task is from our class and nr_running is low enough
5185 * to matter.
5186 */
5187static void hrtick_update(struct rq *rq)
5188{
5189 struct task_struct *curr = rq->curr;
5190
b39e66ea 5191 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
a4c2f00f
PZ
5192 return;
5193
5194 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
5195 hrtick_start_fair(rq, curr);
5196}
55e12e5e 5197#else /* !CONFIG_SCHED_HRTICK */
8f4d37ec
PZ
5198static inline void
5199hrtick_start_fair(struct rq *rq, struct task_struct *p)
5200{
5201}
a4c2f00f
PZ
5202
5203static inline void hrtick_update(struct rq *rq)
5204{
5205}
8f4d37ec
PZ
5206#endif
5207
bf0f6f24
IM
5208/*
5209 * The enqueue_task method is called before nr_running is
5210 * increased. Here we update the fair scheduling stats and
5211 * then put the task into the rbtree:
5212 */
ea87bb78 5213static void
371fd7e7 5214enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
bf0f6f24
IM
5215{
5216 struct cfs_rq *cfs_rq;
62fb1851 5217 struct sched_entity *se = &p->se;
bf0f6f24 5218
8c34ab19
RW
5219 /*
5220 * If in_iowait is set, the code below may not trigger any cpufreq
5221 * utilization updates, so do it here explicitly with the IOWAIT flag
5222 * passed.
5223 */
5224 if (p->in_iowait)
674e7541 5225 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
8c34ab19 5226
bf0f6f24 5227 for_each_sched_entity(se) {
62fb1851 5228 if (se->on_rq)
bf0f6f24
IM
5229 break;
5230 cfs_rq = cfs_rq_of(se);
88ec22d3 5231 enqueue_entity(cfs_rq, se, flags);
85dac906
PT
5232
5233 /*
5234 * end evaluation on encountering a throttled cfs_rq
5235 *
5236 * note: in the case of encountering a throttled cfs_rq we will
5237 * post the final h_nr_running increment below.
e210bffd 5238 */
85dac906
PT
5239 if (cfs_rq_throttled(cfs_rq))
5240 break;
953bfcd1 5241 cfs_rq->h_nr_running++;
85dac906 5242
88ec22d3 5243 flags = ENQUEUE_WAKEUP;
bf0f6f24 5244 }
8f4d37ec 5245
2069dd75 5246 for_each_sched_entity(se) {
0f317143 5247 cfs_rq = cfs_rq_of(se);
953bfcd1 5248 cfs_rq->h_nr_running++;
2069dd75 5249
85dac906
PT
5250 if (cfs_rq_throttled(cfs_rq))
5251 break;
5252
88c0616e 5253 update_load_avg(cfs_rq, se, UPDATE_TG);
1ea6c46a 5254 update_cfs_group(se);
2069dd75
PZ
5255 }
5256
cd126afe 5257 if (!se)
72465447 5258 add_nr_running(rq, 1);
cd126afe 5259
a4c2f00f 5260 hrtick_update(rq);
bf0f6f24
IM
5261}
5262
2f36825b
VP
5263static void set_next_buddy(struct sched_entity *se);
5264
bf0f6f24
IM
5265/*
5266 * The dequeue_task method is called before nr_running is
5267 * decreased. We remove the task from the rbtree and
5268 * update the fair scheduling stats:
5269 */
371fd7e7 5270static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
bf0f6f24
IM
5271{
5272 struct cfs_rq *cfs_rq;
62fb1851 5273 struct sched_entity *se = &p->se;
2f36825b 5274 int task_sleep = flags & DEQUEUE_SLEEP;
bf0f6f24
IM
5275
5276 for_each_sched_entity(se) {
5277 cfs_rq = cfs_rq_of(se);
371fd7e7 5278 dequeue_entity(cfs_rq, se, flags);
85dac906
PT
5279
5280 /*
5281 * end evaluation on encountering a throttled cfs_rq
5282 *
5283 * note: in the case of encountering a throttled cfs_rq we will
5284 * post the final h_nr_running decrement below.
5285 */
5286 if (cfs_rq_throttled(cfs_rq))
5287 break;
953bfcd1 5288 cfs_rq->h_nr_running--;
2069dd75 5289
bf0f6f24 5290 /* Don't dequeue parent if it has other entities besides us */
2f36825b 5291 if (cfs_rq->load.weight) {
754bd598
KK
5292 /* Avoid re-evaluating load for this entity: */
5293 se = parent_entity(se);
2f36825b
VP
5294 /*
5295 * Bias pick_next to pick a task from this cfs_rq, as
5296 * p is sleeping when it is within its sched_slice.
5297 */
754bd598
KK
5298 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
5299 set_next_buddy(se);
bf0f6f24 5300 break;
2f36825b 5301 }
371fd7e7 5302 flags |= DEQUEUE_SLEEP;
bf0f6f24 5303 }
8f4d37ec 5304
2069dd75 5305 for_each_sched_entity(se) {
0f317143 5306 cfs_rq = cfs_rq_of(se);
953bfcd1 5307 cfs_rq->h_nr_running--;
2069dd75 5308
85dac906
PT
5309 if (cfs_rq_throttled(cfs_rq))
5310 break;
5311
88c0616e 5312 update_load_avg(cfs_rq, se, UPDATE_TG);
1ea6c46a 5313 update_cfs_group(se);
2069dd75
PZ
5314 }
5315
cd126afe 5316 if (!se)
72465447 5317 sub_nr_running(rq, 1);
cd126afe 5318
a4c2f00f 5319 hrtick_update(rq);
bf0f6f24
IM
5320}
5321
e7693a36 5322#ifdef CONFIG_SMP
10e2f1ac
PZ
5323
5324/* Working cpumask for: load_balance, load_balance_newidle. */
5325DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5326DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5327
9fd81dd5 5328#ifdef CONFIG_NO_HZ_COMMON
3289bdb4
PZ
5329/*
5330 * per rq 'load' arrray crap; XXX kill this.
5331 */
5332
5333/*
d937cdc5 5334 * The exact cpuload calculated at every tick would be:
3289bdb4 5335 *
d937cdc5
PZ
5336 * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
5337 *
5338 * If a cpu misses updates for n ticks (as it was idle) and update gets
5339 * called on the n+1-th tick when cpu may be busy, then we have:
5340 *
5341 * load_n = (1 - 1/2^i)^n * load_0
5342 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
3289bdb4
PZ
5343 *
5344 * decay_load_missed() below does efficient calculation of
3289bdb4 5345 *
d937cdc5
PZ
5346 * load' = (1 - 1/2^i)^n * load
5347 *
5348 * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
5349 * This allows us to precompute the above in said factors, thereby allowing the
5350 * reduction of an arbitrary n in O(log_2 n) steps. (See also
5351 * fixed_power_int())
3289bdb4 5352 *
d937cdc5 5353 * The calculation is approximated on a 128 point scale.
3289bdb4
PZ
5354 */
5355#define DEGRADE_SHIFT 7
d937cdc5
PZ
5356
5357static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
5358static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
5359 { 0, 0, 0, 0, 0, 0, 0, 0 },
5360 { 64, 32, 8, 0, 0, 0, 0, 0 },
5361 { 96, 72, 40, 12, 1, 0, 0, 0 },
5362 { 112, 98, 75, 43, 15, 1, 0, 0 },
5363 { 120, 112, 98, 76, 45, 16, 2, 0 }
5364};
3289bdb4
PZ
5365
5366/*
5367 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
5368 * would be when CPU is idle and so we just decay the old load without
5369 * adding any new load.
5370 */
5371static unsigned long
5372decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
5373{
5374 int j = 0;
5375
5376 if (!missed_updates)
5377 return load;
5378
5379 if (missed_updates >= degrade_zero_ticks[idx])
5380 return 0;
5381
5382 if (idx == 1)
5383 return load >> missed_updates;
5384
5385 while (missed_updates) {
5386 if (missed_updates % 2)
5387 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
5388
5389 missed_updates >>= 1;
5390 j++;
5391 }
5392 return load;
5393}
9fd81dd5 5394#endif /* CONFIG_NO_HZ_COMMON */
3289bdb4 5395
59543275 5396/**
cee1afce 5397 * __cpu_load_update - update the rq->cpu_load[] statistics
59543275
BP
5398 * @this_rq: The rq to update statistics for
5399 * @this_load: The current load
5400 * @pending_updates: The number of missed updates
59543275 5401 *
3289bdb4 5402 * Update rq->cpu_load[] statistics. This function is usually called every
59543275
BP
5403 * scheduler tick (TICK_NSEC).
5404 *
5405 * This function computes a decaying average:
5406 *
5407 * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
5408 *
5409 * Because of NOHZ it might not get called on every tick which gives need for
5410 * the @pending_updates argument.
5411 *
5412 * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
5413 * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
5414 * = A * (A * load[i]_n-2 + B) + B
5415 * = A * (A * (A * load[i]_n-3 + B) + B) + B
5416 * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
5417 * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
5418 * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
5419 * = (1 - 1/2^i)^n * (load[i]_0 - load) + load
5420 *
5421 * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
5422 * any change in load would have resulted in the tick being turned back on.
5423 *
5424 * For regular NOHZ, this reduces to:
5425 *
5426 * load[i]_n = (1 - 1/2^i)^n * load[i]_0
5427 *
5428 * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
1f41906a 5429 * term.
3289bdb4 5430 */
1f41906a
FW
5431static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
5432 unsigned long pending_updates)
3289bdb4 5433{
9fd81dd5 5434 unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
3289bdb4
PZ
5435 int i, scale;
5436
5437 this_rq->nr_load_updates++;
5438
5439 /* Update our load: */
5440 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
5441 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
5442 unsigned long old_load, new_load;
5443
5444 /* scale is effectively 1 << i now, and >> i divides by scale */
5445
7400d3bb 5446 old_load = this_rq->cpu_load[i];
9fd81dd5 5447#ifdef CONFIG_NO_HZ_COMMON
3289bdb4 5448 old_load = decay_load_missed(old_load, pending_updates - 1, i);
7400d3bb
BP
5449 if (tickless_load) {
5450 old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
5451 /*
5452 * old_load can never be a negative value because a
5453 * decayed tickless_load cannot be greater than the
5454 * original tickless_load.
5455 */
5456 old_load += tickless_load;
5457 }
9fd81dd5 5458#endif
3289bdb4
PZ
5459 new_load = this_load;
5460 /*
5461 * Round up the averaging division if load is increasing. This
5462 * prevents us from getting stuck on 9 if the load is 10, for
5463 * example.
5464 */
5465 if (new_load > old_load)
5466 new_load += scale - 1;
5467
5468 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
5469 }
5470
5471 sched_avg_update(this_rq);
5472}
5473
7ea241af 5474/* Used instead of source_load when we know the type == 0 */
c7132dd6 5475static unsigned long weighted_cpuload(struct rq *rq)
7ea241af 5476{
c7132dd6 5477 return cfs_rq_runnable_load_avg(&rq->cfs);
7ea241af
YD
5478}
5479
3289bdb4 5480#ifdef CONFIG_NO_HZ_COMMON
1f41906a
FW
5481/*
5482 * There is no sane way to deal with nohz on smp when using jiffies because the
5483 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
5484 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
5485 *
5486 * Therefore we need to avoid the delta approach from the regular tick when
5487 * possible since that would seriously skew the load calculation. This is why we
5488 * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
5489 * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
5490 * loop exit, nohz_idle_balance, nohz full exit...)
5491 *
5492 * This means we might still be one tick off for nohz periods.
5493 */
5494
5495static void cpu_load_update_nohz(struct rq *this_rq,
5496 unsigned long curr_jiffies,
5497 unsigned long load)
be68a682
FW
5498{
5499 unsigned long pending_updates;
5500
5501 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
5502 if (pending_updates) {
5503 this_rq->last_load_update_tick = curr_jiffies;
5504 /*
5505 * In the regular NOHZ case, we were idle, this means load 0.
5506 * In the NOHZ_FULL case, we were non-idle, we should consider
5507 * its weighted load.
5508 */
1f41906a 5509 cpu_load_update(this_rq, load, pending_updates);
be68a682
FW
5510 }
5511}
5512
3289bdb4
PZ
5513/*
5514 * Called from nohz_idle_balance() to update the load ratings before doing the
5515 * idle balance.
5516 */
cee1afce 5517static void cpu_load_update_idle(struct rq *this_rq)
3289bdb4 5518{
3289bdb4
PZ
5519 /*
5520 * bail if there's load or we're actually up-to-date.
5521 */
c7132dd6 5522 if (weighted_cpuload(this_rq))
3289bdb4
PZ
5523 return;
5524
1f41906a 5525 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
3289bdb4
PZ
5526}
5527
5528/*
1f41906a
FW
5529 * Record CPU load on nohz entry so we know the tickless load to account
5530 * on nohz exit. cpu_load[0] happens then to be updated more frequently
5531 * than other cpu_load[idx] but it should be fine as cpu_load readers
5532 * shouldn't rely into synchronized cpu_load[*] updates.
3289bdb4 5533 */
1f41906a 5534void cpu_load_update_nohz_start(void)
3289bdb4
PZ
5535{
5536 struct rq *this_rq = this_rq();
1f41906a
FW
5537
5538 /*
5539 * This is all lockless but should be fine. If weighted_cpuload changes
5540 * concurrently we'll exit nohz. And cpu_load write can race with
5541 * cpu_load_update_idle() but both updater would be writing the same.
5542 */
c7132dd6 5543 this_rq->cpu_load[0] = weighted_cpuload(this_rq);
1f41906a
FW
5544}
5545
5546/*
5547 * Account the tickless load in the end of a nohz frame.
5548 */
5549void cpu_load_update_nohz_stop(void)
5550{
316c1608 5551 unsigned long curr_jiffies = READ_ONCE(jiffies);
1f41906a
FW
5552 struct rq *this_rq = this_rq();
5553 unsigned long load;
8a8c69c3 5554 struct rq_flags rf;
3289bdb4
PZ
5555
5556 if (curr_jiffies == this_rq->last_load_update_tick)
5557 return;
5558
c7132dd6 5559 load = weighted_cpuload(this_rq);
8a8c69c3 5560 rq_lock(this_rq, &rf);
b52fad2d 5561 update_rq_clock(this_rq);
1f41906a 5562 cpu_load_update_nohz(this_rq, curr_jiffies, load);
8a8c69c3 5563 rq_unlock(this_rq, &rf);
3289bdb4 5564}
1f41906a
FW
5565#else /* !CONFIG_NO_HZ_COMMON */
5566static inline void cpu_load_update_nohz(struct rq *this_rq,
5567 unsigned long curr_jiffies,
5568 unsigned long load) { }
5569#endif /* CONFIG_NO_HZ_COMMON */
5570
5571static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
5572{
9fd81dd5 5573#ifdef CONFIG_NO_HZ_COMMON
1f41906a
FW
5574 /* See the mess around cpu_load_update_nohz(). */
5575 this_rq->last_load_update_tick = READ_ONCE(jiffies);
9fd81dd5 5576#endif
1f41906a
FW
5577 cpu_load_update(this_rq, load, 1);
5578}
3289bdb4
PZ
5579
5580/*
5581 * Called from scheduler_tick()
5582 */
cee1afce 5583void cpu_load_update_active(struct rq *this_rq)
3289bdb4 5584{
c7132dd6 5585 unsigned long load = weighted_cpuload(this_rq);
1f41906a
FW
5586
5587 if (tick_nohz_tick_stopped())
5588 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
5589 else
5590 cpu_load_update_periodic(this_rq, load);
3289bdb4
PZ
5591}
5592
029632fb
PZ
5593/*
5594 * Return a low guess at the load of a migration-source cpu weighted
5595 * according to the scheduling class and "nice" value.
5596 *
5597 * We want to under-estimate the load of migration sources, to
5598 * balance conservatively.
5599 */
5600static unsigned long source_load(int cpu, int type)
5601{
5602 struct rq *rq = cpu_rq(cpu);
c7132dd6 5603 unsigned long total = weighted_cpuload(rq);
029632fb
PZ
5604
5605 if (type == 0 || !sched_feat(LB_BIAS))
5606 return total;
5607
5608 return min(rq->cpu_load[type-1], total);
5609}
5610
5611/*
5612 * Return a high guess at the load of a migration-target cpu weighted
5613 * according to the scheduling class and "nice" value.
5614 */
5615static unsigned long target_load(int cpu, int type)
5616{
5617 struct rq *rq = cpu_rq(cpu);
c7132dd6 5618 unsigned long total = weighted_cpuload(rq);
029632fb
PZ
5619
5620 if (type == 0 || !sched_feat(LB_BIAS))
5621 return total;
5622
5623 return max(rq->cpu_load[type-1], total);
5624}
5625
ced549fa 5626static unsigned long capacity_of(int cpu)
029632fb 5627{
ced549fa 5628 return cpu_rq(cpu)->cpu_capacity;
029632fb
PZ
5629}
5630
ca6d75e6
VG
5631static unsigned long capacity_orig_of(int cpu)
5632{
5633 return cpu_rq(cpu)->cpu_capacity_orig;
5634}
5635
029632fb
PZ
5636static unsigned long cpu_avg_load_per_task(int cpu)
5637{
5638 struct rq *rq = cpu_rq(cpu);
316c1608 5639 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
c7132dd6 5640 unsigned long load_avg = weighted_cpuload(rq);
029632fb
PZ
5641
5642 if (nr_running)
b92486cb 5643 return load_avg / nr_running;
029632fb
PZ
5644
5645 return 0;
5646}
5647
c58d25f3
PZ
5648static void record_wakee(struct task_struct *p)
5649{
5650 /*
5651 * Only decay a single time; tasks that have less then 1 wakeup per
5652 * jiffy will not have built up many flips.
5653 */
5654 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
5655 current->wakee_flips >>= 1;
5656 current->wakee_flip_decay_ts = jiffies;
5657 }
5658
5659 if (current->last_wakee != p) {
5660 current->last_wakee = p;
5661 current->wakee_flips++;
5662 }
5663}
5664
63b0e9ed
MG
5665/*
5666 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
c58d25f3 5667 *
63b0e9ed 5668 * A waker of many should wake a different task than the one last awakened
c58d25f3
PZ
5669 * at a frequency roughly N times higher than one of its wakees.
5670 *
5671 * In order to determine whether we should let the load spread vs consolidating
5672 * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
5673 * partner, and a factor of lls_size higher frequency in the other.
5674 *
5675 * With both conditions met, we can be relatively sure that the relationship is
5676 * non-monogamous, with partner count exceeding socket size.
5677 *
5678 * Waker/wakee being client/server, worker/dispatcher, interrupt source or
5679 * whatever is irrelevant, spread criteria is apparent partner count exceeds
5680 * socket size.
63b0e9ed 5681 */
62470419
MW
5682static int wake_wide(struct task_struct *p)
5683{
63b0e9ed
MG
5684 unsigned int master = current->wakee_flips;
5685 unsigned int slave = p->wakee_flips;
7d9ffa89 5686 int factor = this_cpu_read(sd_llc_size);
62470419 5687
63b0e9ed
MG
5688 if (master < slave)
5689 swap(master, slave);
5690 if (slave < factor || master < slave * factor)
5691 return 0;
5692 return 1;
62470419
MW
5693}
5694
90001d67 5695/*
d153b153
PZ
5696 * The purpose of wake_affine() is to quickly determine on which CPU we can run
5697 * soonest. For the purpose of speed we only consider the waking and previous
5698 * CPU.
90001d67 5699 *
7332dec0
MG
5700 * wake_affine_idle() - only considers 'now', it check if the waking CPU is
5701 * cache-affine and is (or will be) idle.
f2cdd9cc
PZ
5702 *
5703 * wake_affine_weight() - considers the weight to reflect the average
5704 * scheduling latency of the CPUs. This seems to work
5705 * for the overloaded case.
90001d67 5706 */
3b76c4a3 5707static int
89a55f56 5708wake_affine_idle(int this_cpu, int prev_cpu, int sync)
90001d67 5709{
7332dec0
MG
5710 /*
5711 * If this_cpu is idle, it implies the wakeup is from interrupt
5712 * context. Only allow the move if cache is shared. Otherwise an
5713 * interrupt intensive workload could force all tasks onto one
5714 * node depending on the IO topology or IRQ affinity settings.
806486c3
MG
5715 *
5716 * If the prev_cpu is idle and cache affine then avoid a migration.
5717 * There is no guarantee that the cache hot data from an interrupt
5718 * is more important than cache hot data on the prev_cpu and from
5719 * a cpufreq perspective, it's better to have higher utilisation
5720 * on one CPU.
7332dec0
MG
5721 */
5722 if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
806486c3 5723 return idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
90001d67 5724
d153b153 5725 if (sync && cpu_rq(this_cpu)->nr_running == 1)
3b76c4a3 5726 return this_cpu;
90001d67 5727
3b76c4a3 5728 return nr_cpumask_bits;
90001d67
PZ
5729}
5730
3b76c4a3 5731static int
f2cdd9cc
PZ
5732wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5733 int this_cpu, int prev_cpu, int sync)
90001d67 5734{
90001d67
PZ
5735 s64 this_eff_load, prev_eff_load;
5736 unsigned long task_load;
5737
f2cdd9cc 5738 this_eff_load = target_load(this_cpu, sd->wake_idx);
90001d67 5739
90001d67
PZ
5740 if (sync) {
5741 unsigned long current_load = task_h_load(current);
5742
f2cdd9cc 5743 if (current_load > this_eff_load)
3b76c4a3 5744 return this_cpu;
90001d67 5745
f2cdd9cc 5746 this_eff_load -= current_load;
90001d67
PZ
5747 }
5748
90001d67
PZ
5749 task_load = task_h_load(p);
5750
f2cdd9cc
PZ
5751 this_eff_load += task_load;
5752 if (sched_feat(WA_BIAS))
5753 this_eff_load *= 100;
5754 this_eff_load *= capacity_of(prev_cpu);
90001d67 5755
eeb60398 5756 prev_eff_load = source_load(prev_cpu, sd->wake_idx);
f2cdd9cc
PZ
5757 prev_eff_load -= task_load;
5758 if (sched_feat(WA_BIAS))
5759 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
5760 prev_eff_load *= capacity_of(this_cpu);
90001d67 5761
082f764a
MG
5762 /*
5763 * If sync, adjust the weight of prev_eff_load such that if
5764 * prev_eff == this_eff that select_idle_sibling() will consider
5765 * stacking the wakee on top of the waker if no other CPU is
5766 * idle.
5767 */
5768 if (sync)
5769 prev_eff_load += 1;
5770
5771 return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
90001d67
PZ
5772}
5773
7347fc87
MG
5774#ifdef CONFIG_NUMA_BALANCING
5775static void
5776update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
5777{
5778 unsigned long interval;
5779
5780 if (!static_branch_likely(&sched_numa_balancing))
5781 return;
5782
5783 /* If balancing has no preference then continue gathering data */
5784 if (p->numa_preferred_nid == -1)
5785 return;
5786
5787 /*
5788 * If the wakeup is not affecting locality then it is neutral from
5789 * the perspective of NUMA balacing so continue gathering data.
5790 */
5791 if (cpu_to_node(prev_cpu) == cpu_to_node(target))
5792 return;
5793
5794 /*
5795 * Temporarily prevent NUMA balancing trying to place waker/wakee after
5796 * wakee has been moved by wake_affine. This will potentially allow
5797 * related tasks to converge and update their data placement. The
5798 * 4 * numa_scan_period is to allow the two-pass filter to migrate
5799 * hot data to the wakers node.
5800 */
5801 interval = max(sysctl_numa_balancing_scan_delay,
5802 p->numa_scan_period << 2);
5803 p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
5804
5805 interval = max(sysctl_numa_balancing_scan_delay,
5806 current->numa_scan_period << 2);
5807 current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
5808}
5809#else
5810static void
5811update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
5812{
5813}
5814#endif
5815
772bd008 5816static int wake_affine(struct sched_domain *sd, struct task_struct *p,
7ebb66a1 5817 int this_cpu, int prev_cpu, int sync)
098fb9db 5818{
3b76c4a3 5819 int target = nr_cpumask_bits;
098fb9db 5820
89a55f56 5821 if (sched_feat(WA_IDLE))
3b76c4a3 5822 target = wake_affine_idle(this_cpu, prev_cpu, sync);
90001d67 5823
3b76c4a3
MG
5824 if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
5825 target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
098fb9db 5826
ae92882e 5827 schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
3b76c4a3
MG
5828 if (target == nr_cpumask_bits)
5829 return prev_cpu;
098fb9db 5830
7347fc87 5831 update_wa_numa_placement(p, prev_cpu, target);
3b76c4a3
MG
5832 schedstat_inc(sd->ttwu_move_affine);
5833 schedstat_inc(p->se.statistics.nr_wakeups_affine);
5834 return target;
098fb9db
IM
5835}
5836
f01415fd
PB
5837static inline unsigned long task_util(struct task_struct *p);
5838static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
6a0b19c0
MR
5839
5840static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
5841{
f453ae22 5842 return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
6a0b19c0
MR
5843}
5844
aaee1203
PZ
5845/*
5846 * find_idlest_group finds and returns the least busy CPU group within the
5847 * domain.
6fee85cc
BJ
5848 *
5849 * Assumes p is allowed on at least one CPU in sd.
aaee1203
PZ
5850 */
5851static struct sched_group *
78e7ed53 5852find_idlest_group(struct sched_domain *sd, struct task_struct *p,
c44f2a02 5853 int this_cpu, int sd_flag)
e7693a36 5854{
b3bd3de6 5855 struct sched_group *idlest = NULL, *group = sd->groups;
6a0b19c0 5856 struct sched_group *most_spare_sg = NULL;
0d10ab95
BJ
5857 unsigned long min_runnable_load = ULONG_MAX;
5858 unsigned long this_runnable_load = ULONG_MAX;
5859 unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
6a0b19c0 5860 unsigned long most_spare = 0, this_spare = 0;
c44f2a02 5861 int load_idx = sd->forkexec_idx;
6b94780e
VG
5862 int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
5863 unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
5864 (sd->imbalance_pct-100) / 100;
e7693a36 5865
c44f2a02
VG
5866 if (sd_flag & SD_BALANCE_WAKE)
5867 load_idx = sd->wake_idx;
5868
aaee1203 5869 do {
6b94780e
VG
5870 unsigned long load, avg_load, runnable_load;
5871 unsigned long spare_cap, max_spare_cap;
aaee1203
PZ
5872 int local_group;
5873 int i;
e7693a36 5874
aaee1203 5875 /* Skip over this group if it has no CPUs allowed */
ae4df9d6 5876 if (!cpumask_intersects(sched_group_span(group),
0c98d344 5877 &p->cpus_allowed))
aaee1203
PZ
5878 continue;
5879
5880 local_group = cpumask_test_cpu(this_cpu,
ae4df9d6 5881 sched_group_span(group));
aaee1203 5882
6a0b19c0
MR
5883 /*
5884 * Tally up the load of all CPUs in the group and find
5885 * the group containing the CPU with most spare capacity.
5886 */
aaee1203 5887 avg_load = 0;
6b94780e 5888 runnable_load = 0;
6a0b19c0 5889 max_spare_cap = 0;
aaee1203 5890
ae4df9d6 5891 for_each_cpu(i, sched_group_span(group)) {
aaee1203
PZ
5892 /* Bias balancing toward cpus of our domain */
5893 if (local_group)
5894 load = source_load(i, load_idx);
5895 else
5896 load = target_load(i, load_idx);
5897
6b94780e
VG
5898 runnable_load += load;
5899
5900 avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
6a0b19c0
MR
5901
5902 spare_cap = capacity_spare_wake(i, p);
5903
5904 if (spare_cap > max_spare_cap)
5905 max_spare_cap = spare_cap;
aaee1203
PZ
5906 }
5907
63b2ca30 5908 /* Adjust by relative CPU capacity of the group */
6b94780e
VG
5909 avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
5910 group->sgc->capacity;
5911 runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
5912 group->sgc->capacity;
aaee1203
PZ
5913
5914 if (local_group) {
6b94780e
VG
5915 this_runnable_load = runnable_load;
5916 this_avg_load = avg_load;
6a0b19c0
MR
5917 this_spare = max_spare_cap;
5918 } else {
6b94780e
VG
5919 if (min_runnable_load > (runnable_load + imbalance)) {
5920 /*
5921 * The runnable load is significantly smaller
5922 * so we can pick this new cpu
5923 */
5924 min_runnable_load = runnable_load;
5925 min_avg_load = avg_load;
5926 idlest = group;
5927 } else if ((runnable_load < (min_runnable_load + imbalance)) &&
5928 (100*min_avg_load > imbalance_scale*avg_load)) {
5929 /*
5930 * The runnable loads are close so take the
5931 * blocked load into account through avg_load.
5932 */
5933 min_avg_load = avg_load;
6a0b19c0
MR
5934 idlest = group;
5935 }
5936
5937 if (most_spare < max_spare_cap) {
5938 most_spare = max_spare_cap;
5939 most_spare_sg = group;
5940 }
aaee1203
PZ
5941 }
5942 } while (group = group->next, group != sd->groups);
5943
6a0b19c0
MR
5944 /*
5945 * The cross-over point between using spare capacity or least load
5946 * is too conservative for high utilization tasks on partially
5947 * utilized systems if we require spare_capacity > task_util(p),
5948 * so we allow for some task stuffing by using
5949 * spare_capacity > task_util(p)/2.
f519a3f1
VG
5950 *
5951 * Spare capacity can't be used for fork because the utilization has
5952 * not been set yet, we must first select a rq to compute the initial
5953 * utilization.
6a0b19c0 5954 */
f519a3f1
VG
5955 if (sd_flag & SD_BALANCE_FORK)
5956 goto skip_spare;
5957
6a0b19c0 5958 if (this_spare > task_util(p) / 2 &&
6b94780e 5959 imbalance_scale*this_spare > 100*most_spare)
6a0b19c0 5960 return NULL;
6b94780e
VG
5961
5962 if (most_spare > task_util(p) / 2)
6a0b19c0
MR
5963 return most_spare_sg;
5964
f519a3f1 5965skip_spare:
6b94780e
VG
5966 if (!idlest)
5967 return NULL;
5968
2c833627
MG
5969 /*
5970 * When comparing groups across NUMA domains, it's possible for the
5971 * local domain to be very lightly loaded relative to the remote
5972 * domains but "imbalance" skews the comparison making remote CPUs
5973 * look much more favourable. When considering cross-domain, add
5974 * imbalance to the runnable load on the remote node and consider
5975 * staying local.
5976 */
5977 if ((sd->flags & SD_NUMA) &&
5978 min_runnable_load + imbalance >= this_runnable_load)
5979 return NULL;
5980
6b94780e 5981 if (min_runnable_load > (this_runnable_load + imbalance))
aaee1203 5982 return NULL;
6b94780e
VG
5983
5984 if ((this_runnable_load < (min_runnable_load + imbalance)) &&
5985 (100*this_avg_load < imbalance_scale*min_avg_load))
5986 return NULL;
5987
aaee1203
PZ
5988 return idlest;
5989}
5990
5991/*
18bd1b4b 5992 * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
aaee1203
PZ
5993 */
5994static int
18bd1b4b 5995find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
aaee1203
PZ
5996{
5997 unsigned long load, min_load = ULONG_MAX;
83a0a96a
NP
5998 unsigned int min_exit_latency = UINT_MAX;
5999 u64 latest_idle_timestamp = 0;
6000 int least_loaded_cpu = this_cpu;
6001 int shallowest_idle_cpu = -1;
aaee1203
PZ
6002 int i;
6003
eaecf41f
MR
6004 /* Check if we have any choice: */
6005 if (group->group_weight == 1)
ae4df9d6 6006 return cpumask_first(sched_group_span(group));
eaecf41f 6007
aaee1203 6008 /* Traverse only the allowed CPUs */
ae4df9d6 6009 for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
83a0a96a
NP
6010 if (idle_cpu(i)) {
6011 struct rq *rq = cpu_rq(i);
6012 struct cpuidle_state *idle = idle_get_state(rq);
6013 if (idle && idle->exit_latency < min_exit_latency) {
6014 /*
6015 * We give priority to a CPU whose idle state
6016 * has the smallest exit latency irrespective
6017 * of any idle timestamp.
6018 */
6019 min_exit_latency = idle->exit_latency;
6020 latest_idle_timestamp = rq->idle_stamp;
6021 shallowest_idle_cpu = i;
6022 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
6023 rq->idle_stamp > latest_idle_timestamp) {
6024 /*
6025 * If equal or no active idle state, then
6026 * the most recently idled CPU might have
6027 * a warmer cache.
6028 */
6029 latest_idle_timestamp = rq->idle_stamp;
6030 shallowest_idle_cpu = i;
6031 }
9f96742a 6032 } else if (shallowest_idle_cpu == -1) {
c7132dd6 6033 load = weighted_cpuload(cpu_rq(i));
18cec7e0 6034 if (load < min_load) {
83a0a96a
NP
6035 min_load = load;
6036 least_loaded_cpu = i;
6037 }
e7693a36
GH
6038 }
6039 }
6040
83a0a96a 6041 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
aaee1203 6042}
e7693a36 6043
18bd1b4b
BJ
6044static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
6045 int cpu, int prev_cpu, int sd_flag)
6046{
93f50f90 6047 int new_cpu = cpu;
18bd1b4b 6048
6fee85cc
BJ
6049 if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
6050 return prev_cpu;
6051
18bd1b4b
BJ
6052 while (sd) {
6053 struct sched_group *group;
6054 struct sched_domain *tmp;
6055 int weight;
6056
6057 if (!(sd->flags & sd_flag)) {
6058 sd = sd->child;
6059 continue;
6060 }
6061
6062 group = find_idlest_group(sd, p, cpu, sd_flag);
6063 if (!group) {
6064 sd = sd->child;
6065 continue;
6066 }
6067
6068 new_cpu = find_idlest_group_cpu(group, p, cpu);
e90381ea 6069 if (new_cpu == cpu) {
18bd1b4b
BJ
6070 /* Now try balancing at a lower domain level of cpu */
6071 sd = sd->child;
6072 continue;
6073 }
6074
6075 /* Now try balancing at a lower domain level of new_cpu */
6076 cpu = new_cpu;
6077 weight = sd->span_weight;
6078 sd = NULL;
6079 for_each_domain(cpu, tmp) {
6080 if (weight <= tmp->span_weight)
6081 break;
6082 if (tmp->flags & sd_flag)
6083 sd = tmp;
6084 }
6085 /* while loop will break here if sd == NULL */
6086 }
6087
6088 return new_cpu;
6089}
6090
10e2f1ac
PZ
6091#ifdef CONFIG_SCHED_SMT
6092
6093static inline void set_idle_cores(int cpu, int val)
6094{
6095 struct sched_domain_shared *sds;
6096
6097 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
6098 if (sds)
6099 WRITE_ONCE(sds->has_idle_cores, val);
6100}
6101
6102static inline bool test_idle_cores(int cpu, bool def)
6103{
6104 struct sched_domain_shared *sds;
6105
6106 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
6107 if (sds)
6108 return READ_ONCE(sds->has_idle_cores);
6109
6110 return def;
6111}
6112
6113/*
6114 * Scans the local SMT mask to see if the entire core is idle, and records this
6115 * information in sd_llc_shared->has_idle_cores.
6116 *
6117 * Since SMT siblings share all cache levels, inspecting this limited remote
6118 * state should be fairly cheap.
6119 */
1b568f0a 6120void __update_idle_core(struct rq *rq)
10e2f1ac
PZ
6121{
6122 int core = cpu_of(rq);
6123 int cpu;
6124
6125 rcu_read_lock();
6126 if (test_idle_cores(core, true))
6127 goto unlock;
6128
6129 for_each_cpu(cpu, cpu_smt_mask(core)) {
6130 if (cpu == core)
6131 continue;
6132
6133 if (!idle_cpu(cpu))
6134 goto unlock;
6135 }
6136
6137 set_idle_cores(core, 1);
6138unlock:
6139 rcu_read_unlock();
6140}
6141
6142/*
6143 * Scan the entire LLC domain for idle cores; this dynamically switches off if
6144 * there are no idle cores left in the system; tracked through
6145 * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
6146 */
6147static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
6148{
6149 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
c743f0a5 6150 int core, cpu;
10e2f1ac 6151
1b568f0a
PZ
6152 if (!static_branch_likely(&sched_smt_present))
6153 return -1;
6154
10e2f1ac
PZ
6155 if (!test_idle_cores(target, false))
6156 return -1;
6157
0c98d344 6158 cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
10e2f1ac 6159
c743f0a5 6160 for_each_cpu_wrap(core, cpus, target) {
10e2f1ac
PZ
6161 bool idle = true;
6162
6163 for_each_cpu(cpu, cpu_smt_mask(core)) {
6164 cpumask_clear_cpu(cpu, cpus);
6165 if (!idle_cpu(cpu))
6166 idle = false;
6167 }
6168
6169 if (idle)
6170 return core;
6171 }
6172
6173 /*
6174 * Failed to find an idle core; stop looking for one.
6175 */
6176 set_idle_cores(target, 0);
6177
6178 return -1;
6179}
6180
6181/*
6182 * Scan the local SMT mask for idle CPUs.
6183 */
6184static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
6185{
6186 int cpu;
6187
1b568f0a
PZ
6188 if (!static_branch_likely(&sched_smt_present))
6189 return -1;
6190
10e2f1ac 6191 for_each_cpu(cpu, cpu_smt_mask(target)) {
0c98d344 6192 if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
10e2f1ac
PZ
6193 continue;
6194 if (idle_cpu(cpu))
6195 return cpu;
6196 }
6197
6198 return -1;
6199}
6200
6201#else /* CONFIG_SCHED_SMT */
6202
6203static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
6204{
6205 return -1;
6206}
6207
6208static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
6209{
6210 return -1;
6211}
6212
6213#endif /* CONFIG_SCHED_SMT */
6214
6215/*
6216 * Scan the LLC domain for idle CPUs; this is dynamically regulated by
6217 * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
6218 * average idle time for this rq (as found in rq->avg_idle).
a50bde51 6219 */
10e2f1ac
PZ
6220static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
6221{
9cfb38a7 6222 struct sched_domain *this_sd;
1ad3aaf3 6223 u64 avg_cost, avg_idle;
10e2f1ac
PZ
6224 u64 time, cost;
6225 s64 delta;
1ad3aaf3 6226 int cpu, nr = INT_MAX;
10e2f1ac 6227
9cfb38a7
WL
6228 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
6229 if (!this_sd)
6230 return -1;
6231
10e2f1ac
PZ
6232 /*
6233 * Due to large variance we need a large fuzz factor; hackbench in
6234 * particularly is sensitive here.
6235 */
1ad3aaf3
PZ
6236 avg_idle = this_rq()->avg_idle / 512;
6237 avg_cost = this_sd->avg_scan_cost + 1;
6238
6239 if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
10e2f1ac
PZ
6240 return -1;
6241
1ad3aaf3
PZ
6242 if (sched_feat(SIS_PROP)) {
6243 u64 span_avg = sd->span_weight * avg_idle;
6244 if (span_avg > 4*avg_cost)
6245 nr = div_u64(span_avg, avg_cost);
6246 else
6247 nr = 4;
6248 }
6249
10e2f1ac
PZ
6250 time = local_clock();
6251
c743f0a5 6252 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
1ad3aaf3
PZ
6253 if (!--nr)
6254 return -1;
0c98d344 6255 if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
10e2f1ac
PZ
6256 continue;
6257 if (idle_cpu(cpu))
6258 break;
6259 }
6260
6261 time = local_clock() - time;
6262 cost = this_sd->avg_scan_cost;
6263 delta = (s64)(time - cost) / 8;
6264 this_sd->avg_scan_cost += delta;
6265
6266 return cpu;
6267}
6268
6269/*
6270 * Try and locate an idle core/thread in the LLC cache domain.
a50bde51 6271 */
772bd008 6272static int select_idle_sibling(struct task_struct *p, int prev, int target)
a50bde51 6273{
99bd5e2f 6274 struct sched_domain *sd;
32e839dd 6275 int i, recent_used_cpu;
a50bde51 6276
e0a79f52
MG
6277 if (idle_cpu(target))
6278 return target;
99bd5e2f
SS
6279
6280 /*
10e2f1ac 6281 * If the previous cpu is cache affine and idle, don't be stupid.
99bd5e2f 6282 */
772bd008
MR
6283 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
6284 return prev;
a50bde51 6285
32e839dd
MG
6286 /* Check a recently used CPU as a potential idle candidate */
6287 recent_used_cpu = p->recent_used_cpu;
6288 if (recent_used_cpu != prev &&
6289 recent_used_cpu != target &&
6290 cpus_share_cache(recent_used_cpu, target) &&
6291 idle_cpu(recent_used_cpu) &&
6292 cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
6293 /*
6294 * Replace recent_used_cpu with prev as it is a potential
6295 * candidate for the next wake.
6296 */
6297 p->recent_used_cpu = prev;
6298 return recent_used_cpu;
6299 }
6300
518cd623 6301 sd = rcu_dereference(per_cpu(sd_llc, target));
10e2f1ac
PZ
6302 if (!sd)
6303 return target;
772bd008 6304
10e2f1ac
PZ
6305 i = select_idle_core(p, sd, target);
6306 if ((unsigned)i < nr_cpumask_bits)
6307 return i;
37407ea7 6308
10e2f1ac
PZ
6309 i = select_idle_cpu(p, sd, target);
6310 if ((unsigned)i < nr_cpumask_bits)
6311 return i;
6312
6313 i = select_idle_smt(p, sd, target);
6314 if ((unsigned)i < nr_cpumask_bits)
6315 return i;
970e1789 6316
a50bde51
PZ
6317 return target;
6318}
231678b7 6319
8bb5b00c 6320/*
9e91d61d 6321 * cpu_util returns the amount of capacity of a CPU that is used by CFS
8bb5b00c 6322 * tasks. The unit of the return value must be the one of capacity so we can
9e91d61d
DE
6323 * compare the utilization with the capacity of the CPU that is available for
6324 * CFS task (ie cpu_capacity).
231678b7
DE
6325 *
6326 * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
6327 * recent utilization of currently non-runnable tasks on a CPU. It represents
6328 * the amount of utilization of a CPU in the range [0..capacity_orig] where
6329 * capacity_orig is the cpu_capacity available at the highest frequency
6330 * (arch_scale_freq_capacity()).
6331 * The utilization of a CPU converges towards a sum equal to or less than the
6332 * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
6333 * the running time on this CPU scaled by capacity_curr.
6334 *
6335 * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
6336 * higher than capacity_orig because of unfortunate rounding in
6337 * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
6338 * the average stabilizes with the new running time. We need to check that the
6339 * utilization stays within the range of [0..capacity_orig] and cap it if
6340 * necessary. Without utilization capping, a group could be seen as overloaded
6341 * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
6342 * available capacity. We allow utilization to overshoot capacity_curr (but not
6343 * capacity_orig) as it useful for predicting the capacity required after task
6344 * migrations (scheduler-driven DVFS).
8bb5b00c 6345 */
f01415fd 6346static unsigned long cpu_util(int cpu)
8bb5b00c 6347{
9e91d61d 6348 unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
8bb5b00c
VG
6349 unsigned long capacity = capacity_orig_of(cpu);
6350
231678b7 6351 return (util >= capacity) ? capacity : util;
8bb5b00c 6352}
a50bde51 6353
f01415fd 6354static inline unsigned long task_util(struct task_struct *p)
3273163c
MR
6355{
6356 return p->se.avg.util_avg;
6357}
6358
104cb16d
MR
6359/*
6360 * cpu_util_wake: Compute cpu utilization with any contributions from
6361 * the waking task p removed.
6362 */
f01415fd 6363static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
104cb16d
MR
6364{
6365 unsigned long util, capacity;
6366
6367 /* Task has no contribution or is new */
6368 if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
6369 return cpu_util(cpu);
6370
6371 capacity = capacity_orig_of(cpu);
6372 util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
6373
6374 return (util >= capacity) ? capacity : util;
6375}
6376
3273163c
MR
6377/*
6378 * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
6379 * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
6380 *
6381 * In that case WAKE_AFFINE doesn't make sense and we'll let
6382 * BALANCE_WAKE sort things out.
6383 */
6384static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
6385{
6386 long min_cap, max_cap;
6387
6388 min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
6389 max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
6390
6391 /* Minimum capacity is close to max, no need to abort wake_affine */
6392 if (max_cap - min_cap < max_cap >> 3)
6393 return 0;
6394
104cb16d
MR
6395 /* Bring task utilization in sync with prev_cpu */
6396 sync_entity_load_avg(&p->se);
6397
3273163c
MR
6398 return min_cap * 1024 < task_util(p) * capacity_margin;
6399}
6400
aaee1203 6401/*
de91b9cb
MR
6402 * select_task_rq_fair: Select target runqueue for the waking task in domains
6403 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
6404 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
aaee1203 6405 *
de91b9cb
MR
6406 * Balances load by selecting the idlest cpu in the idlest group, or under
6407 * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
aaee1203 6408 *
de91b9cb 6409 * Returns the target cpu number.
aaee1203
PZ
6410 *
6411 * preempt must be disabled.
6412 */
0017d735 6413static int
ac66f547 6414select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
aaee1203 6415{
29cd8bae 6416 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
c88d5910 6417 int cpu = smp_processor_id();
63b0e9ed 6418 int new_cpu = prev_cpu;
99bd5e2f 6419 int want_affine = 0;
24d0c1d6 6420 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
c88d5910 6421
c58d25f3
PZ
6422 if (sd_flag & SD_BALANCE_WAKE) {
6423 record_wakee(p);
3273163c 6424 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
0c98d344 6425 && cpumask_test_cpu(cpu, &p->cpus_allowed);
c58d25f3 6426 }
aaee1203 6427
dce840a0 6428 rcu_read_lock();
aaee1203 6429 for_each_domain(cpu, tmp) {
e4f42888 6430 if (!(tmp->flags & SD_LOAD_BALANCE))
63b0e9ed 6431 break;
e4f42888 6432
fe3bcfe1 6433 /*
99bd5e2f
SS
6434 * If both cpu and prev_cpu are part of this domain,
6435 * cpu is a valid SD_WAKE_AFFINE target.
fe3bcfe1 6436 */
99bd5e2f
SS
6437 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
6438 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
6439 affine_sd = tmp;
29cd8bae 6440 break;
f03542a7 6441 }
29cd8bae 6442
f03542a7 6443 if (tmp->flags & sd_flag)
29cd8bae 6444 sd = tmp;
63b0e9ed
MG
6445 else if (!want_affine)
6446 break;
29cd8bae
PZ
6447 }
6448
63b0e9ed
MG
6449 if (affine_sd) {
6450 sd = NULL; /* Prefer wake_affine over balance flags */
7d894e6e
RR
6451 if (cpu == prev_cpu)
6452 goto pick_cpu;
6453
7ebb66a1 6454 new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync);
8b911acd 6455 }
e7693a36 6456
ea16f0ea
BJ
6457 if (sd && !(sd_flag & SD_BALANCE_FORK)) {
6458 /*
6459 * We're going to need the task's util for capacity_spare_wake
6460 * in find_idlest_group. Sync it up to prev_cpu's
6461 * last_update_time.
6462 */
6463 sync_entity_load_avg(&p->se);
6464 }
6465
63b0e9ed 6466 if (!sd) {
ea16f0ea 6467pick_cpu:
32e839dd 6468 if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
772bd008 6469 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
63b0e9ed 6470
32e839dd
MG
6471 if (want_affine)
6472 current->recent_used_cpu = cpu;
6473 }
18bd1b4b
BJ
6474 } else {
6475 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
e7693a36 6476 }
dce840a0 6477 rcu_read_unlock();
e7693a36 6478
c88d5910 6479 return new_cpu;
e7693a36 6480}
0a74bef8 6481
144d8487
PZ
6482static void detach_entity_cfs_rq(struct sched_entity *se);
6483
0a74bef8
PT
6484/*
6485 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
6486 * cfs_rq_of(p) references at time of call are still valid and identify the
525628c7 6487 * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
0a74bef8 6488 */
5a4fd036 6489static void migrate_task_rq_fair(struct task_struct *p)
0a74bef8 6490{
59efa0ba
PZ
6491 /*
6492 * As blocked tasks retain absolute vruntime the migration needs to
6493 * deal with this by subtracting the old and adding the new
6494 * min_vruntime -- the latter is done by enqueue_entity() when placing
6495 * the task on the new runqueue.
6496 */
6497 if (p->state == TASK_WAKING) {
6498 struct sched_entity *se = &p->se;
6499 struct cfs_rq *cfs_rq = cfs_rq_of(se);
6500 u64 min_vruntime;
6501
6502#ifndef CONFIG_64BIT
6503 u64 min_vruntime_copy;
6504
6505 do {
6506 min_vruntime_copy = cfs_rq->min_vruntime_copy;
6507 smp_rmb();
6508 min_vruntime = cfs_rq->min_vruntime;
6509 } while (min_vruntime != min_vruntime_copy);
6510#else
6511 min_vruntime = cfs_rq->min_vruntime;
6512#endif
6513
6514 se->vruntime -= min_vruntime;
6515 }
6516
144d8487
PZ
6517 if (p->on_rq == TASK_ON_RQ_MIGRATING) {
6518 /*
6519 * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
6520 * rq->lock and can modify state directly.
6521 */
6522 lockdep_assert_held(&task_rq(p)->lock);
6523 detach_entity_cfs_rq(&p->se);
6524
6525 } else {
6526 /*
6527 * We are supposed to update the task to "current" time, then
6528 * its up to date and ready to go to new CPU/cfs_rq. But we
6529 * have difficulty in getting what current time is, so simply
6530 * throw away the out-of-date time. This will result in the
6531 * wakee task is less decayed, but giving the wakee more load
6532 * sounds not bad.
6533 */
6534 remove_entity_load_avg(&p->se);
6535 }
9d89c257
YD
6536
6537 /* Tell new CPU we are migrated */
6538 p->se.avg.last_update_time = 0;
3944a927
BS
6539
6540 /* We have migrated, no longer consider this task hot */
9d89c257 6541 p->se.exec_start = 0;
0a74bef8 6542}
12695578
YD
6543
6544static void task_dead_fair(struct task_struct *p)
6545{
6546 remove_entity_load_avg(&p->se);
6547}
e7693a36
GH
6548#endif /* CONFIG_SMP */
6549
a555e9d8 6550static unsigned long wakeup_gran(struct sched_entity *se)
0bbd3336
PZ
6551{
6552 unsigned long gran = sysctl_sched_wakeup_granularity;
6553
6554 /*
e52fb7c0
PZ
6555 * Since its curr running now, convert the gran from real-time
6556 * to virtual-time in his units.
13814d42
MG
6557 *
6558 * By using 'se' instead of 'curr' we penalize light tasks, so
6559 * they get preempted easier. That is, if 'se' < 'curr' then
6560 * the resulting gran will be larger, therefore penalizing the
6561 * lighter, if otoh 'se' > 'curr' then the resulting gran will
6562 * be smaller, again penalizing the lighter task.
6563 *
6564 * This is especially important for buddies when the leftmost
6565 * task is higher priority than the buddy.
0bbd3336 6566 */
f4ad9bd2 6567 return calc_delta_fair(gran, se);
0bbd3336
PZ
6568}
6569
464b7527
PZ
6570/*
6571 * Should 'se' preempt 'curr'.
6572 *
6573 * |s1
6574 * |s2
6575 * |s3
6576 * g
6577 * |<--->|c
6578 *
6579 * w(c, s1) = -1
6580 * w(c, s2) = 0
6581 * w(c, s3) = 1
6582 *
6583 */
6584static int
6585wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
6586{
6587 s64 gran, vdiff = curr->vruntime - se->vruntime;
6588
6589 if (vdiff <= 0)
6590 return -1;
6591
a555e9d8 6592 gran = wakeup_gran(se);
464b7527
PZ
6593 if (vdiff > gran)
6594 return 1;
6595
6596 return 0;
6597}
6598
02479099
PZ
6599static void set_last_buddy(struct sched_entity *se)
6600{
69c80f3e
VP
6601 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
6602 return;
6603
c5ae366e
DA
6604 for_each_sched_entity(se) {
6605 if (SCHED_WARN_ON(!se->on_rq))
6606 return;
69c80f3e 6607 cfs_rq_of(se)->last = se;
c5ae366e 6608 }
02479099
PZ
6609}
6610
6611static void set_next_buddy(struct sched_entity *se)
6612{
69c80f3e
VP
6613 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
6614 return;
6615
c5ae366e
DA
6616 for_each_sched_entity(se) {
6617 if (SCHED_WARN_ON(!se->on_rq))
6618 return;
69c80f3e 6619 cfs_rq_of(se)->next = se;
c5ae366e 6620 }
02479099
PZ
6621}
6622
ac53db59
RR
6623static void set_skip_buddy(struct sched_entity *se)
6624{
69c80f3e
VP
6625 for_each_sched_entity(se)
6626 cfs_rq_of(se)->skip = se;
ac53db59
RR
6627}
6628
bf0f6f24
IM
6629/*
6630 * Preempt the current task with a newly woken task if needed:
6631 */
5a9b86f6 6632static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
bf0f6f24
IM
6633{
6634 struct task_struct *curr = rq->curr;
8651a86c 6635 struct sched_entity *se = &curr->se, *pse = &p->se;
03e89e45 6636 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
f685ceac 6637 int scale = cfs_rq->nr_running >= sched_nr_latency;
2f36825b 6638 int next_buddy_marked = 0;
bf0f6f24 6639
4ae7d5ce
IM
6640 if (unlikely(se == pse))
6641 return;
6642
5238cdd3 6643 /*
163122b7 6644 * This is possible from callers such as attach_tasks(), in which we
5238cdd3
PT
6645 * unconditionally check_prempt_curr() after an enqueue (which may have
6646 * lead to a throttle). This both saves work and prevents false
6647 * next-buddy nomination below.
6648 */
6649 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
6650 return;
6651
2f36825b 6652 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
3cb63d52 6653 set_next_buddy(pse);
2f36825b
VP
6654 next_buddy_marked = 1;
6655 }
57fdc26d 6656
aec0a514
BR
6657 /*
6658 * We can come here with TIF_NEED_RESCHED already set from new task
6659 * wake up path.
5238cdd3
PT
6660 *
6661 * Note: this also catches the edge-case of curr being in a throttled
6662 * group (e.g. via set_curr_task), since update_curr() (in the
6663 * enqueue of curr) will have resulted in resched being set. This
6664 * prevents us from potentially nominating it as a false LAST_BUDDY
6665 * below.
aec0a514
BR
6666 */
6667 if (test_tsk_need_resched(curr))
6668 return;
6669
a2f5c9ab
DH
6670 /* Idle tasks are by definition preempted by non-idle tasks. */
6671 if (unlikely(curr->policy == SCHED_IDLE) &&
6672 likely(p->policy != SCHED_IDLE))
6673 goto preempt;
6674
91c234b4 6675 /*
a2f5c9ab
DH
6676 * Batch and idle tasks do not preempt non-idle tasks (their preemption
6677 * is driven by the tick):
91c234b4 6678 */
8ed92e51 6679 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
91c234b4 6680 return;
bf0f6f24 6681
464b7527 6682 find_matching_se(&se, &pse);
9bbd7374 6683 update_curr(cfs_rq_of(se));
002f128b 6684 BUG_ON(!pse);
2f36825b
VP
6685 if (wakeup_preempt_entity(se, pse) == 1) {
6686 /*
6687 * Bias pick_next to pick the sched entity that is
6688 * triggering this preemption.
6689 */
6690 if (!next_buddy_marked)
6691 set_next_buddy(pse);
3a7e73a2 6692 goto preempt;
2f36825b 6693 }
464b7527 6694
3a7e73a2 6695 return;
a65ac745 6696
3a7e73a2 6697preempt:
8875125e 6698 resched_curr(rq);
3a7e73a2
PZ
6699 /*
6700 * Only set the backward buddy when the current task is still
6701 * on the rq. This can happen when a wakeup gets interleaved
6702 * with schedule on the ->pre_schedule() or idle_balance()
6703 * point, either of which can * drop the rq lock.
6704 *
6705 * Also, during early boot the idle thread is in the fair class,
6706 * for obvious reasons its a bad idea to schedule back to it.
6707 */
6708 if (unlikely(!se->on_rq || curr == rq->idle))
6709 return;
6710
6711 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
6712 set_last_buddy(se);
bf0f6f24
IM
6713}
6714
606dba2e 6715static struct task_struct *
d8ac8971 6716pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
bf0f6f24
IM
6717{
6718 struct cfs_rq *cfs_rq = &rq->cfs;
6719 struct sched_entity *se;
678d5718 6720 struct task_struct *p;
37e117c0 6721 int new_tasks;
678d5718 6722
6e83125c 6723again:
678d5718 6724 if (!cfs_rq->nr_running)
38033c37 6725 goto idle;
678d5718 6726
9674f5ca 6727#ifdef CONFIG_FAIR_GROUP_SCHED
3f1d2a31 6728 if (prev->sched_class != &fair_sched_class)
678d5718
PZ
6729 goto simple;
6730
6731 /*
6732 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
6733 * likely that a next task is from the same cgroup as the current.
6734 *
6735 * Therefore attempt to avoid putting and setting the entire cgroup
6736 * hierarchy, only change the part that actually changes.
6737 */
6738
6739 do {
6740 struct sched_entity *curr = cfs_rq->curr;
6741
6742 /*
6743 * Since we got here without doing put_prev_entity() we also
6744 * have to consider cfs_rq->curr. If it is still a runnable
6745 * entity, update_curr() will update its vruntime, otherwise
6746 * forget we've ever seen it.
6747 */
54d27365
BS
6748 if (curr) {
6749 if (curr->on_rq)
6750 update_curr(cfs_rq);
6751 else
6752 curr = NULL;
678d5718 6753
54d27365
BS
6754 /*
6755 * This call to check_cfs_rq_runtime() will do the
6756 * throttle and dequeue its entity in the parent(s).
9674f5ca 6757 * Therefore the nr_running test will indeed
54d27365
BS
6758 * be correct.
6759 */
9674f5ca
VK
6760 if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
6761 cfs_rq = &rq->cfs;
6762
6763 if (!cfs_rq->nr_running)
6764 goto idle;
6765
54d27365 6766 goto simple;
9674f5ca 6767 }
54d27365 6768 }
678d5718
PZ
6769
6770 se = pick_next_entity(cfs_rq, curr);
6771 cfs_rq = group_cfs_rq(se);
6772 } while (cfs_rq);
6773
6774 p = task_of(se);
6775
6776 /*
6777 * Since we haven't yet done put_prev_entity and if the selected task
6778 * is a different task than we started out with, try and touch the
6779 * least amount of cfs_rqs.
6780 */
6781 if (prev != p) {
6782 struct sched_entity *pse = &prev->se;
6783
6784 while (!(cfs_rq = is_same_group(se, pse))) {
6785 int se_depth = se->depth;
6786 int pse_depth = pse->depth;
6787
6788 if (se_depth <= pse_depth) {
6789 put_prev_entity(cfs_rq_of(pse), pse);
6790 pse = parent_entity(pse);
6791 }
6792 if (se_depth >= pse_depth) {
6793 set_next_entity(cfs_rq_of(se), se);
6794 se = parent_entity(se);
6795 }
6796 }
6797
6798 put_prev_entity(cfs_rq, pse);
6799 set_next_entity(cfs_rq, se);
6800 }
6801
93824900 6802 goto done;
678d5718 6803simple:
678d5718 6804#endif
bf0f6f24 6805
3f1d2a31 6806 put_prev_task(rq, prev);
606dba2e 6807
bf0f6f24 6808 do {
678d5718 6809 se = pick_next_entity(cfs_rq, NULL);
f4b6755f 6810 set_next_entity(cfs_rq, se);
bf0f6f24
IM
6811 cfs_rq = group_cfs_rq(se);
6812 } while (cfs_rq);
6813
8f4d37ec 6814 p = task_of(se);
678d5718 6815
93824900
UR
6816done: __maybe_unused
6817#ifdef CONFIG_SMP
6818 /*
6819 * Move the next running task to the front of
6820 * the list, so our cfs_tasks list becomes MRU
6821 * one.
6822 */
6823 list_move(&p->se.group_node, &rq->cfs_tasks);
6824#endif
6825
b39e66ea
MG
6826 if (hrtick_enabled(rq))
6827 hrtick_start_fair(rq, p);
8f4d37ec
PZ
6828
6829 return p;
38033c37
PZ
6830
6831idle:
46f69fa3
MF
6832 new_tasks = idle_balance(rq, rf);
6833
37e117c0
PZ
6834 /*
6835 * Because idle_balance() releases (and re-acquires) rq->lock, it is
6836 * possible for any higher priority task to appear. In that case we
6837 * must re-start the pick_next_entity() loop.
6838 */
e4aa358b 6839 if (new_tasks < 0)
37e117c0
PZ
6840 return RETRY_TASK;
6841
e4aa358b 6842 if (new_tasks > 0)
38033c37 6843 goto again;
38033c37
PZ
6844
6845 return NULL;
bf0f6f24
IM
6846}
6847
6848/*
6849 * Account for a descheduled task:
6850 */
31ee529c 6851static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
bf0f6f24
IM
6852{
6853 struct sched_entity *se = &prev->se;
6854 struct cfs_rq *cfs_rq;
6855
6856 for_each_sched_entity(se) {
6857 cfs_rq = cfs_rq_of(se);
ab6cde26 6858 put_prev_entity(cfs_rq, se);
bf0f6f24
IM
6859 }
6860}
6861
ac53db59
RR
6862/*
6863 * sched_yield() is very simple
6864 *
6865 * The magic of dealing with the ->skip buddy is in pick_next_entity.
6866 */
6867static void yield_task_fair(struct rq *rq)
6868{
6869 struct task_struct *curr = rq->curr;
6870 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
6871 struct sched_entity *se = &curr->se;
6872
6873 /*
6874 * Are we the only task in the tree?
6875 */
6876 if (unlikely(rq->nr_running == 1))
6877 return;
6878
6879 clear_buddies(cfs_rq, se);
6880
6881 if (curr->policy != SCHED_BATCH) {
6882 update_rq_clock(rq);
6883 /*
6884 * Update run-time statistics of the 'current'.
6885 */
6886 update_curr(cfs_rq);
916671c0
MG
6887 /*
6888 * Tell update_rq_clock() that we've just updated,
6889 * so we don't do microscopic update in schedule()
6890 * and double the fastpath cost.
6891 */
9edfbfed 6892 rq_clock_skip_update(rq, true);
ac53db59
RR
6893 }
6894
6895 set_skip_buddy(se);
6896}
6897
d95f4122
MG
6898static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
6899{
6900 struct sched_entity *se = &p->se;
6901
5238cdd3
PT
6902 /* throttled hierarchies are not runnable */
6903 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
d95f4122
MG
6904 return false;
6905
6906 /* Tell the scheduler that we'd really like pse to run next. */
6907 set_next_buddy(se);
6908
d95f4122
MG
6909 yield_task_fair(rq);
6910
6911 return true;
6912}
6913
681f3e68 6914#ifdef CONFIG_SMP
bf0f6f24 6915/**************************************************
e9c84cb8
PZ
6916 * Fair scheduling class load-balancing methods.
6917 *
6918 * BASICS
6919 *
6920 * The purpose of load-balancing is to achieve the same basic fairness the
6921 * per-cpu scheduler provides, namely provide a proportional amount of compute
6922 * time to each task. This is expressed in the following equation:
6923 *
6924 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
6925 *
6926 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
6927 * W_i,0 is defined as:
6928 *
6929 * W_i,0 = \Sum_j w_i,j (2)
6930 *
6931 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
1c3de5e1 6932 * is derived from the nice value as per sched_prio_to_weight[].
e9c84cb8
PZ
6933 *
6934 * The weight average is an exponential decay average of the instantaneous
6935 * weight:
6936 *
6937 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
6938 *
ced549fa 6939 * C_i is the compute capacity of cpu i, typically it is the
e9c84cb8
PZ
6940 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
6941 * can also include other factors [XXX].
6942 *
6943 * To achieve this balance we define a measure of imbalance which follows
6944 * directly from (1):
6945 *
ced549fa 6946 * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
e9c84cb8
PZ
6947 *
6948 * We them move tasks around to minimize the imbalance. In the continuous
6949 * function space it is obvious this converges, in the discrete case we get
6950 * a few fun cases generally called infeasible weight scenarios.
6951 *
6952 * [XXX expand on:
6953 * - infeasible weights;
6954 * - local vs global optima in the discrete case. ]
6955 *
6956 *
6957 * SCHED DOMAINS
6958 *
6959 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
6960 * for all i,j solution, we create a tree of cpus that follows the hardware
6961 * topology where each level pairs two lower groups (or better). This results
6962 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
6963 * tree to only the first of the previous level and we decrease the frequency
6964 * of load-balance at each level inv. proportional to the number of cpus in
6965 * the groups.
6966 *
6967 * This yields:
6968 *
6969 * log_2 n 1 n
6970 * \Sum { --- * --- * 2^i } = O(n) (5)
6971 * i = 0 2^i 2^i
6972 * `- size of each group
6973 * | | `- number of cpus doing load-balance
6974 * | `- freq
6975 * `- sum over all levels
6976 *
6977 * Coupled with a limit on how many tasks we can migrate every balance pass,
6978 * this makes (5) the runtime complexity of the balancer.
6979 *
6980 * An important property here is that each CPU is still (indirectly) connected
6981 * to every other cpu in at most O(log n) steps:
6982 *
6983 * The adjacency matrix of the resulting graph is given by:
6984 *
97a7142f 6985 * log_2 n
e9c84cb8
PZ
6986 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
6987 * k = 0
6988 *
6989 * And you'll find that:
6990 *
6991 * A^(log_2 n)_i,j != 0 for all i,j (7)
6992 *
6993 * Showing there's indeed a path between every cpu in at most O(log n) steps.
6994 * The task movement gives a factor of O(m), giving a convergence complexity
6995 * of:
6996 *
6997 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
6998 *
6999 *
7000 * WORK CONSERVING
7001 *
7002 * In order to avoid CPUs going idle while there's still work to do, new idle
7003 * balancing is more aggressive and has the newly idle cpu iterate up the domain
7004 * tree itself instead of relying on other CPUs to bring it work.
7005 *
7006 * This adds some complexity to both (5) and (8) but it reduces the total idle
7007 * time.
7008 *
7009 * [XXX more?]
7010 *
7011 *
7012 * CGROUPS
7013 *
7014 * Cgroups make a horror show out of (2), instead of a simple sum we get:
7015 *
7016 * s_k,i
7017 * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
7018 * S_k
7019 *
7020 * Where
7021 *
7022 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
7023 *
7024 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
7025 *
7026 * The big problem is S_k, its a global sum needed to compute a local (W_i)
7027 * property.
7028 *
7029 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
7030 * rewrite all of this once again.]
97a7142f 7031 */
bf0f6f24 7032
ed387b78
HS
7033static unsigned long __read_mostly max_load_balance_interval = HZ/10;
7034
0ec8aa00
PZ
7035enum fbq_type { regular, remote, all };
7036
ddcdf6e7 7037#define LBF_ALL_PINNED 0x01
367456c7 7038#define LBF_NEED_BREAK 0x02
6263322c
PZ
7039#define LBF_DST_PINNED 0x04
7040#define LBF_SOME_PINNED 0x08
ddcdf6e7
PZ
7041
7042struct lb_env {
7043 struct sched_domain *sd;
7044
ddcdf6e7 7045 struct rq *src_rq;
85c1e7da 7046 int src_cpu;
ddcdf6e7
PZ
7047
7048 int dst_cpu;
7049 struct rq *dst_rq;
7050
88b8dac0
SV
7051 struct cpumask *dst_grpmask;
7052 int new_dst_cpu;
ddcdf6e7 7053 enum cpu_idle_type idle;
bd939f45 7054 long imbalance;
b9403130
MW
7055 /* The set of CPUs under consideration for load-balancing */
7056 struct cpumask *cpus;
7057
ddcdf6e7 7058 unsigned int flags;
367456c7
PZ
7059
7060 unsigned int loop;
7061 unsigned int loop_break;
7062 unsigned int loop_max;
0ec8aa00
PZ
7063
7064 enum fbq_type fbq_type;
163122b7 7065 struct list_head tasks;
ddcdf6e7
PZ
7066};
7067
029632fb
PZ
7068/*
7069 * Is this task likely cache-hot:
7070 */
5d5e2b1b 7071static int task_hot(struct task_struct *p, struct lb_env *env)
029632fb
PZ
7072{
7073 s64 delta;
7074
e5673f28
KT
7075 lockdep_assert_held(&env->src_rq->lock);
7076
029632fb
PZ
7077 if (p->sched_class != &fair_sched_class)
7078 return 0;
7079
7080 if (unlikely(p->policy == SCHED_IDLE))
7081 return 0;
7082
7083 /*
7084 * Buddy candidates are cache hot:
7085 */
5d5e2b1b 7086 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
029632fb
PZ
7087 (&p->se == cfs_rq_of(&p->se)->next ||
7088 &p->se == cfs_rq_of(&p->se)->last))
7089 return 1;
7090
7091 if (sysctl_sched_migration_cost == -1)
7092 return 1;
7093 if (sysctl_sched_migration_cost == 0)
7094 return 0;
7095
5d5e2b1b 7096 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
029632fb
PZ
7097
7098 return delta < (s64)sysctl_sched_migration_cost;
7099}
7100
3a7053b3 7101#ifdef CONFIG_NUMA_BALANCING
c1ceac62 7102/*
2a1ed24c
SD
7103 * Returns 1, if task migration degrades locality
7104 * Returns 0, if task migration improves locality i.e migration preferred.
7105 * Returns -1, if task migration is not affected by locality.
c1ceac62 7106 */
2a1ed24c 7107static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
3a7053b3 7108{
b1ad065e 7109 struct numa_group *numa_group = rcu_dereference(p->numa_group);
c1ceac62 7110 unsigned long src_faults, dst_faults;
3a7053b3
MG
7111 int src_nid, dst_nid;
7112
2a595721 7113 if (!static_branch_likely(&sched_numa_balancing))
2a1ed24c
SD
7114 return -1;
7115
c3b9bc5b 7116 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
2a1ed24c 7117 return -1;
7a0f3083
MG
7118
7119 src_nid = cpu_to_node(env->src_cpu);
7120 dst_nid = cpu_to_node(env->dst_cpu);
7121
83e1d2cd 7122 if (src_nid == dst_nid)
2a1ed24c 7123 return -1;
7a0f3083 7124
2a1ed24c
SD
7125 /* Migrating away from the preferred node is always bad. */
7126 if (src_nid == p->numa_preferred_nid) {
7127 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
7128 return 1;
7129 else
7130 return -1;
7131 }
b1ad065e 7132
c1ceac62
RR
7133 /* Encourage migration to the preferred node. */
7134 if (dst_nid == p->numa_preferred_nid)
2a1ed24c 7135 return 0;
b1ad065e 7136
739294fb
RR
7137 /* Leaving a core idle is often worse than degrading locality. */
7138 if (env->idle != CPU_NOT_IDLE)
7139 return -1;
7140
c1ceac62
RR
7141 if (numa_group) {
7142 src_faults = group_faults(p, src_nid);
7143 dst_faults = group_faults(p, dst_nid);
7144 } else {
7145 src_faults = task_faults(p, src_nid);
7146 dst_faults = task_faults(p, dst_nid);
b1ad065e
RR
7147 }
7148
c1ceac62 7149 return dst_faults < src_faults;
7a0f3083
MG
7150}
7151
3a7053b3 7152#else
2a1ed24c 7153static inline int migrate_degrades_locality(struct task_struct *p,
3a7053b3
MG
7154 struct lb_env *env)
7155{
2a1ed24c 7156 return -1;
7a0f3083 7157}
3a7053b3
MG
7158#endif
7159
1e3c88bd
PZ
7160/*
7161 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
7162 */
7163static
8e45cb54 7164int can_migrate_task(struct task_struct *p, struct lb_env *env)
1e3c88bd 7165{
2a1ed24c 7166 int tsk_cache_hot;
e5673f28
KT
7167
7168 lockdep_assert_held(&env->src_rq->lock);
7169
1e3c88bd
PZ
7170 /*
7171 * We do not migrate tasks that are:
d3198084 7172 * 1) throttled_lb_pair, or
1e3c88bd 7173 * 2) cannot be migrated to this CPU due to cpus_allowed, or
d3198084
JK
7174 * 3) running (obviously), or
7175 * 4) are cache-hot on their current CPU.
1e3c88bd 7176 */
d3198084
JK
7177 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7178 return 0;
7179
0c98d344 7180 if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
e02e60c1 7181 int cpu;
88b8dac0 7182
ae92882e 7183 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
88b8dac0 7184
6263322c
PZ
7185 env->flags |= LBF_SOME_PINNED;
7186
88b8dac0
SV
7187 /*
7188 * Remember if this task can be migrated to any other cpu in
7189 * our sched_group. We may want to revisit it if we couldn't
7190 * meet load balance goals by pulling other tasks on src_cpu.
7191 *
65a4433a
JH
7192 * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
7193 * already computed one in current iteration.
88b8dac0 7194 */
65a4433a 7195 if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
88b8dac0
SV
7196 return 0;
7197
e02e60c1
JK
7198 /* Prevent to re-select dst_cpu via env's cpus */
7199 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
0c98d344 7200 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
6263322c 7201 env->flags |= LBF_DST_PINNED;
e02e60c1
JK
7202 env->new_dst_cpu = cpu;
7203 break;
7204 }
88b8dac0 7205 }
e02e60c1 7206
1e3c88bd
PZ
7207 return 0;
7208 }
88b8dac0
SV
7209
7210 /* Record that we found atleast one task that could run on dst_cpu */
8e45cb54 7211 env->flags &= ~LBF_ALL_PINNED;
1e3c88bd 7212
ddcdf6e7 7213 if (task_running(env->src_rq, p)) {
ae92882e 7214 schedstat_inc(p->se.statistics.nr_failed_migrations_running);
1e3c88bd
PZ
7215 return 0;
7216 }
7217
7218 /*
7219 * Aggressive migration if:
3a7053b3
MG
7220 * 1) destination numa is preferred
7221 * 2) task is cache cold, or
7222 * 3) too many balance attempts have failed.
1e3c88bd 7223 */
2a1ed24c
SD
7224 tsk_cache_hot = migrate_degrades_locality(p, env);
7225 if (tsk_cache_hot == -1)
7226 tsk_cache_hot = task_hot(p, env);
3a7053b3 7227
2a1ed24c 7228 if (tsk_cache_hot <= 0 ||
7a96c231 7229 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
2a1ed24c 7230 if (tsk_cache_hot == 1) {
ae92882e
JP
7231 schedstat_inc(env->sd->lb_hot_gained[env->idle]);
7232 schedstat_inc(p->se.statistics.nr_forced_migrations);
3a7053b3 7233 }
1e3c88bd
PZ
7234 return 1;
7235 }
7236
ae92882e 7237 schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
4e2dcb73 7238 return 0;
1e3c88bd
PZ
7239}
7240
897c395f 7241/*
163122b7
KT
7242 * detach_task() -- detach the task for the migration specified in env
7243 */
7244static void detach_task(struct task_struct *p, struct lb_env *env)
7245{
7246 lockdep_assert_held(&env->src_rq->lock);
7247
163122b7 7248 p->on_rq = TASK_ON_RQ_MIGRATING;
5704ac0a 7249 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
163122b7
KT
7250 set_task_cpu(p, env->dst_cpu);
7251}
7252
897c395f 7253/*
e5673f28 7254 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
897c395f 7255 * part of active balancing operations within "domain".
897c395f 7256 *
e5673f28 7257 * Returns a task if successful and NULL otherwise.
897c395f 7258 */
e5673f28 7259static struct task_struct *detach_one_task(struct lb_env *env)
897c395f 7260{
93824900 7261 struct task_struct *p;
897c395f 7262
e5673f28
KT
7263 lockdep_assert_held(&env->src_rq->lock);
7264
93824900
UR
7265 list_for_each_entry_reverse(p,
7266 &env->src_rq->cfs_tasks, se.group_node) {
367456c7
PZ
7267 if (!can_migrate_task(p, env))
7268 continue;
897c395f 7269
163122b7 7270 detach_task(p, env);
e5673f28 7271
367456c7 7272 /*
e5673f28 7273 * Right now, this is only the second place where
163122b7 7274 * lb_gained[env->idle] is updated (other is detach_tasks)
e5673f28 7275 * so we can safely collect stats here rather than
163122b7 7276 * inside detach_tasks().
367456c7 7277 */
ae92882e 7278 schedstat_inc(env->sd->lb_gained[env->idle]);
e5673f28 7279 return p;
897c395f 7280 }
e5673f28 7281 return NULL;
897c395f
PZ
7282}
7283
eb95308e
PZ
7284static const unsigned int sched_nr_migrate_break = 32;
7285
5d6523eb 7286/*
163122b7
KT
7287 * detach_tasks() -- tries to detach up to imbalance weighted load from
7288 * busiest_rq, as part of a balancing operation within domain "sd".
5d6523eb 7289 *
163122b7 7290 * Returns number of detached tasks if successful and 0 otherwise.
5d6523eb 7291 */
163122b7 7292static int detach_tasks(struct lb_env *env)
1e3c88bd 7293{
5d6523eb
PZ
7294 struct list_head *tasks = &env->src_rq->cfs_tasks;
7295 struct task_struct *p;
367456c7 7296 unsigned long load;
163122b7
KT
7297 int detached = 0;
7298
7299 lockdep_assert_held(&env->src_rq->lock);
1e3c88bd 7300
bd939f45 7301 if (env->imbalance <= 0)
5d6523eb 7302 return 0;
1e3c88bd 7303
5d6523eb 7304 while (!list_empty(tasks)) {
985d3a4c
YD
7305 /*
7306 * We don't want to steal all, otherwise we may be treated likewise,
7307 * which could at worst lead to a livelock crash.
7308 */
7309 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
7310 break;
7311
93824900 7312 p = list_last_entry(tasks, struct task_struct, se.group_node);
1e3c88bd 7313
367456c7
PZ
7314 env->loop++;
7315 /* We've more or less seen every task there is, call it quits */
5d6523eb 7316 if (env->loop > env->loop_max)
367456c7 7317 break;
5d6523eb
PZ
7318
7319 /* take a breather every nr_migrate tasks */
367456c7 7320 if (env->loop > env->loop_break) {
eb95308e 7321 env->loop_break += sched_nr_migrate_break;
8e45cb54 7322 env->flags |= LBF_NEED_BREAK;
ee00e66f 7323 break;
a195f004 7324 }
1e3c88bd 7325
d3198084 7326 if (!can_migrate_task(p, env))
367456c7
PZ
7327 goto next;
7328
7329 load = task_h_load(p);
5d6523eb 7330
eb95308e 7331 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
367456c7
PZ
7332 goto next;
7333
bd939f45 7334 if ((load / 2) > env->imbalance)
367456c7 7335 goto next;
1e3c88bd 7336
163122b7
KT
7337 detach_task(p, env);
7338 list_add(&p->se.group_node, &env->tasks);
7339
7340 detached++;
bd939f45 7341 env->imbalance -= load;
1e3c88bd
PZ
7342
7343#ifdef CONFIG_PREEMPT
ee00e66f
PZ
7344 /*
7345 * NEWIDLE balancing is a source of latency, so preemptible
163122b7 7346 * kernels will stop after the first task is detached to minimize
ee00e66f
PZ
7347 * the critical section.
7348 */
5d6523eb 7349 if (env->idle == CPU_NEWLY_IDLE)
ee00e66f 7350 break;
1e3c88bd
PZ
7351#endif
7352
ee00e66f
PZ
7353 /*
7354 * We only want to steal up to the prescribed amount of
7355 * weighted load.
7356 */
bd939f45 7357 if (env->imbalance <= 0)
ee00e66f 7358 break;
367456c7
PZ
7359
7360 continue;
7361next:
93824900 7362 list_move(&p->se.group_node, tasks);
1e3c88bd 7363 }
5d6523eb 7364
1e3c88bd 7365 /*
163122b7
KT
7366 * Right now, this is one of only two places we collect this stat
7367 * so we can safely collect detach_one_task() stats here rather
7368 * than inside detach_one_task().
1e3c88bd 7369 */
ae92882e 7370 schedstat_add(env->sd->lb_gained[env->idle], detached);
1e3c88bd 7371
163122b7
KT
7372 return detached;
7373}
7374
7375/*
7376 * attach_task() -- attach the task detached by detach_task() to its new rq.
7377 */
7378static void attach_task(struct rq *rq, struct task_struct *p)
7379{
7380 lockdep_assert_held(&rq->lock);
7381
7382 BUG_ON(task_rq(p) != rq);
5704ac0a 7383 activate_task(rq, p, ENQUEUE_NOCLOCK);
3ea94de1 7384 p->on_rq = TASK_ON_RQ_QUEUED;
163122b7
KT
7385 check_preempt_curr(rq, p, 0);
7386}
7387
7388/*
7389 * attach_one_task() -- attaches the task returned from detach_one_task() to
7390 * its new rq.
7391 */
7392static void attach_one_task(struct rq *rq, struct task_struct *p)
7393{
8a8c69c3
PZ
7394 struct rq_flags rf;
7395
7396 rq_lock(rq, &rf);
5704ac0a 7397 update_rq_clock(rq);
163122b7 7398 attach_task(rq, p);
8a8c69c3 7399 rq_unlock(rq, &rf);
163122b7
KT
7400}
7401
7402/*
7403 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
7404 * new rq.
7405 */
7406static void attach_tasks(struct lb_env *env)
7407{
7408 struct list_head *tasks = &env->tasks;
7409 struct task_struct *p;
8a8c69c3 7410 struct rq_flags rf;
163122b7 7411
8a8c69c3 7412 rq_lock(env->dst_rq, &rf);
5704ac0a 7413 update_rq_clock(env->dst_rq);
163122b7
KT
7414
7415 while (!list_empty(tasks)) {
7416 p = list_first_entry(tasks, struct task_struct, se.group_node);
7417 list_del_init(&p->se.group_node);
1e3c88bd 7418
163122b7
KT
7419 attach_task(env->dst_rq, p);
7420 }
7421
8a8c69c3 7422 rq_unlock(env->dst_rq, &rf);
1e3c88bd
PZ
7423}
7424
230059de 7425#ifdef CONFIG_FAIR_GROUP_SCHED
a9e7f654
TH
7426
7427static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
7428{
7429 if (cfs_rq->load.weight)
7430 return false;
7431
7432 if (cfs_rq->avg.load_sum)
7433 return false;
7434
7435 if (cfs_rq->avg.util_sum)
7436 return false;
7437
1ea6c46a 7438 if (cfs_rq->avg.runnable_load_sum)
a9e7f654
TH
7439 return false;
7440
7441 return true;
7442}
7443
48a16753 7444static void update_blocked_averages(int cpu)
9e3081ca 7445{
9e3081ca 7446 struct rq *rq = cpu_rq(cpu);
a9e7f654 7447 struct cfs_rq *cfs_rq, *pos;
8a8c69c3 7448 struct rq_flags rf;
9e3081ca 7449
8a8c69c3 7450 rq_lock_irqsave(rq, &rf);
48a16753 7451 update_rq_clock(rq);
9d89c257 7452
9763b67f
PZ
7453 /*
7454 * Iterates the task_group tree in a bottom up fashion, see
7455 * list_add_leaf_cfs_rq() for details.
7456 */
a9e7f654 7457 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
bc427898
VG
7458 struct sched_entity *se;
7459
9d89c257
YD
7460 /* throttled entities do not contribute to load */
7461 if (throttled_hierarchy(cfs_rq))
7462 continue;
48a16753 7463
3a123bbb 7464 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
9d89c257 7465 update_tg_load_avg(cfs_rq, 0);
4e516076 7466
bc427898
VG
7467 /* Propagate pending load changes to the parent, if any: */
7468 se = cfs_rq->tg->se[cpu];
7469 if (se && !skip_blocked_update(se))
88c0616e 7470 update_load_avg(cfs_rq_of(se), se, 0);
a9e7f654
TH
7471
7472 /*
7473 * There can be a lot of idle CPU cgroups. Don't let fully
7474 * decayed cfs_rqs linger on the list.
7475 */
7476 if (cfs_rq_is_decayed(cfs_rq))
7477 list_del_leaf_cfs_rq(cfs_rq);
9d89c257 7478 }
8a8c69c3 7479 rq_unlock_irqrestore(rq, &rf);
9e3081ca
PZ
7480}
7481
9763b67f 7482/*
68520796 7483 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
9763b67f
PZ
7484 * This needs to be done in a top-down fashion because the load of a child
7485 * group is a fraction of its parents load.
7486 */
68520796 7487static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
9763b67f 7488{
68520796
VD
7489 struct rq *rq = rq_of(cfs_rq);
7490 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
a35b6466 7491 unsigned long now = jiffies;
68520796 7492 unsigned long load;
a35b6466 7493
68520796 7494 if (cfs_rq->last_h_load_update == now)
a35b6466
PZ
7495 return;
7496
68520796
VD
7497 cfs_rq->h_load_next = NULL;
7498 for_each_sched_entity(se) {
7499 cfs_rq = cfs_rq_of(se);
7500 cfs_rq->h_load_next = se;
7501 if (cfs_rq->last_h_load_update == now)
7502 break;
7503 }
a35b6466 7504
68520796 7505 if (!se) {
7ea241af 7506 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
68520796
VD
7507 cfs_rq->last_h_load_update = now;
7508 }
7509
7510 while ((se = cfs_rq->h_load_next) != NULL) {
7511 load = cfs_rq->h_load;
7ea241af
YD
7512 load = div64_ul(load * se->avg.load_avg,
7513 cfs_rq_load_avg(cfs_rq) + 1);
68520796
VD
7514 cfs_rq = group_cfs_rq(se);
7515 cfs_rq->h_load = load;
7516 cfs_rq->last_h_load_update = now;
7517 }
9763b67f
PZ
7518}
7519
367456c7 7520static unsigned long task_h_load(struct task_struct *p)
230059de 7521{
367456c7 7522 struct cfs_rq *cfs_rq = task_cfs_rq(p);
230059de 7523
68520796 7524 update_cfs_rq_h_load(cfs_rq);
9d89c257 7525 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
7ea241af 7526 cfs_rq_load_avg(cfs_rq) + 1);
230059de
PZ
7527}
7528#else
48a16753 7529static inline void update_blocked_averages(int cpu)
9e3081ca 7530{
6c1d47c0
VG
7531 struct rq *rq = cpu_rq(cpu);
7532 struct cfs_rq *cfs_rq = &rq->cfs;
8a8c69c3 7533 struct rq_flags rf;
6c1d47c0 7534
8a8c69c3 7535 rq_lock_irqsave(rq, &rf);
6c1d47c0 7536 update_rq_clock(rq);
3a123bbb 7537 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
8a8c69c3 7538 rq_unlock_irqrestore(rq, &rf);
9e3081ca
PZ
7539}
7540
367456c7 7541static unsigned long task_h_load(struct task_struct *p)
1e3c88bd 7542{
9d89c257 7543 return p->se.avg.load_avg;
1e3c88bd 7544}
230059de 7545#endif
1e3c88bd 7546
1e3c88bd 7547/********** Helpers for find_busiest_group ************************/
caeb178c
RR
7548
7549enum group_type {
7550 group_other = 0,
7551 group_imbalanced,
7552 group_overloaded,
7553};
7554
1e3c88bd
PZ
7555/*
7556 * sg_lb_stats - stats of a sched_group required for load_balancing
7557 */
7558struct sg_lb_stats {
7559 unsigned long avg_load; /*Avg load across the CPUs of the group */
7560 unsigned long group_load; /* Total load over the CPUs of the group */
1e3c88bd 7561 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
56cf515b 7562 unsigned long load_per_task;
63b2ca30 7563 unsigned long group_capacity;
9e91d61d 7564 unsigned long group_util; /* Total utilization of the group */
147c5fc2 7565 unsigned int sum_nr_running; /* Nr tasks running in the group */
147c5fc2
PZ
7566 unsigned int idle_cpus;
7567 unsigned int group_weight;
caeb178c 7568 enum group_type group_type;
ea67821b 7569 int group_no_capacity;
0ec8aa00
PZ
7570#ifdef CONFIG_NUMA_BALANCING
7571 unsigned int nr_numa_running;
7572 unsigned int nr_preferred_running;
7573#endif
1e3c88bd
PZ
7574};
7575
56cf515b
JK
7576/*
7577 * sd_lb_stats - Structure to store the statistics of a sched_domain
7578 * during load balancing.
7579 */
7580struct sd_lb_stats {
7581 struct sched_group *busiest; /* Busiest group in this sd */
7582 struct sched_group *local; /* Local group in this sd */
90001d67 7583 unsigned long total_running;
56cf515b 7584 unsigned long total_load; /* Total load of all groups in sd */
63b2ca30 7585 unsigned long total_capacity; /* Total capacity of all groups in sd */
56cf515b
JK
7586 unsigned long avg_load; /* Average load across all groups in sd */
7587
56cf515b 7588 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
147c5fc2 7589 struct sg_lb_stats local_stat; /* Statistics of the local group */
56cf515b
JK
7590};
7591
147c5fc2
PZ
7592static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
7593{
7594 /*
7595 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
7596 * local_stat because update_sg_lb_stats() does a full clear/assignment.
7597 * We must however clear busiest_stat::avg_load because
7598 * update_sd_pick_busiest() reads this before assignment.
7599 */
7600 *sds = (struct sd_lb_stats){
7601 .busiest = NULL,
7602 .local = NULL,
90001d67 7603 .total_running = 0UL,
147c5fc2 7604 .total_load = 0UL,
63b2ca30 7605 .total_capacity = 0UL,
147c5fc2
PZ
7606 .busiest_stat = {
7607 .avg_load = 0UL,
caeb178c
RR
7608 .sum_nr_running = 0,
7609 .group_type = group_other,
147c5fc2
PZ
7610 },
7611 };
7612}
7613
1e3c88bd
PZ
7614/**
7615 * get_sd_load_idx - Obtain the load index for a given sched domain.
7616 * @sd: The sched_domain whose load_idx is to be obtained.
ed1b7732 7617 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
e69f6186
YB
7618 *
7619 * Return: The load index.
1e3c88bd
PZ
7620 */
7621static inline int get_sd_load_idx(struct sched_domain *sd,
7622 enum cpu_idle_type idle)
7623{
7624 int load_idx;
7625
7626 switch (idle) {
7627 case CPU_NOT_IDLE:
7628 load_idx = sd->busy_idx;
7629 break;
7630
7631 case CPU_NEWLY_IDLE:
7632 load_idx = sd->newidle_idx;
7633 break;
7634 default:
7635 load_idx = sd->idle_idx;
7636 break;
7637 }
7638
7639 return load_idx;
7640}
7641
ced549fa 7642static unsigned long scale_rt_capacity(int cpu)
1e3c88bd
PZ
7643{
7644 struct rq *rq = cpu_rq(cpu);
b5b4860d 7645 u64 total, used, age_stamp, avg;
cadefd3d 7646 s64 delta;
1e3c88bd 7647
b654f7de
PZ
7648 /*
7649 * Since we're reading these variables without serialization make sure
7650 * we read them once before doing sanity checks on them.
7651 */
316c1608
JL
7652 age_stamp = READ_ONCE(rq->age_stamp);
7653 avg = READ_ONCE(rq->rt_avg);
cebde6d6 7654 delta = __rq_clock_broken(rq) - age_stamp;
b654f7de 7655
cadefd3d
PZ
7656 if (unlikely(delta < 0))
7657 delta = 0;
7658
7659 total = sched_avg_period() + delta;
aa483808 7660
b5b4860d 7661 used = div_u64(avg, total);
1e3c88bd 7662
b5b4860d
VG
7663 if (likely(used < SCHED_CAPACITY_SCALE))
7664 return SCHED_CAPACITY_SCALE - used;
1e3c88bd 7665
b5b4860d 7666 return 1;
1e3c88bd
PZ
7667}
7668
ced549fa 7669static void update_cpu_capacity(struct sched_domain *sd, int cpu)
1e3c88bd 7670{
8cd5601c 7671 unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
1e3c88bd
PZ
7672 struct sched_group *sdg = sd->groups;
7673
ca6d75e6 7674 cpu_rq(cpu)->cpu_capacity_orig = capacity;
9d5efe05 7675
ced549fa 7676 capacity *= scale_rt_capacity(cpu);
ca8ce3d0 7677 capacity >>= SCHED_CAPACITY_SHIFT;
1e3c88bd 7678
ced549fa
NP
7679 if (!capacity)
7680 capacity = 1;
1e3c88bd 7681
ced549fa
NP
7682 cpu_rq(cpu)->cpu_capacity = capacity;
7683 sdg->sgc->capacity = capacity;
bf475ce0 7684 sdg->sgc->min_capacity = capacity;
1e3c88bd
PZ
7685}
7686
63b2ca30 7687void update_group_capacity(struct sched_domain *sd, int cpu)
1e3c88bd
PZ
7688{
7689 struct sched_domain *child = sd->child;
7690 struct sched_group *group, *sdg = sd->groups;
bf475ce0 7691 unsigned long capacity, min_capacity;
4ec4412e
VG
7692 unsigned long interval;
7693
7694 interval = msecs_to_jiffies(sd->balance_interval);
7695 interval = clamp(interval, 1UL, max_load_balance_interval);
63b2ca30 7696 sdg->sgc->next_update = jiffies + interval;
1e3c88bd
PZ
7697
7698 if (!child) {
ced549fa 7699 update_cpu_capacity(sd, cpu);
1e3c88bd
PZ
7700 return;
7701 }
7702
dc7ff76e 7703 capacity = 0;
bf475ce0 7704 min_capacity = ULONG_MAX;
1e3c88bd 7705
74a5ce20
PZ
7706 if (child->flags & SD_OVERLAP) {
7707 /*
7708 * SD_OVERLAP domains cannot assume that child groups
7709 * span the current group.
7710 */
7711
ae4df9d6 7712 for_each_cpu(cpu, sched_group_span(sdg)) {
63b2ca30 7713 struct sched_group_capacity *sgc;
9abf24d4 7714 struct rq *rq = cpu_rq(cpu);
863bffc8 7715
9abf24d4 7716 /*
63b2ca30 7717 * build_sched_domains() -> init_sched_groups_capacity()
9abf24d4
SD
7718 * gets here before we've attached the domains to the
7719 * runqueues.
7720 *
ced549fa
NP
7721 * Use capacity_of(), which is set irrespective of domains
7722 * in update_cpu_capacity().
9abf24d4 7723 *
dc7ff76e 7724 * This avoids capacity from being 0 and
9abf24d4 7725 * causing divide-by-zero issues on boot.
9abf24d4
SD
7726 */
7727 if (unlikely(!rq->sd)) {
ced549fa 7728 capacity += capacity_of(cpu);
bf475ce0
MR
7729 } else {
7730 sgc = rq->sd->groups->sgc;
7731 capacity += sgc->capacity;
9abf24d4 7732 }
863bffc8 7733
bf475ce0 7734 min_capacity = min(capacity, min_capacity);
863bffc8 7735 }
74a5ce20
PZ
7736 } else {
7737 /*
7738 * !SD_OVERLAP domains can assume that child groups
7739 * span the current group.
97a7142f 7740 */
74a5ce20
PZ
7741
7742 group = child->groups;
7743 do {
bf475ce0
MR
7744 struct sched_group_capacity *sgc = group->sgc;
7745
7746 capacity += sgc->capacity;
7747 min_capacity = min(sgc->min_capacity, min_capacity);
74a5ce20
PZ
7748 group = group->next;
7749 } while (group != child->groups);
7750 }
1e3c88bd 7751
63b2ca30 7752 sdg->sgc->capacity = capacity;
bf475ce0 7753 sdg->sgc->min_capacity = min_capacity;
1e3c88bd
PZ
7754}
7755
9d5efe05 7756/*
ea67821b
VG
7757 * Check whether the capacity of the rq has been noticeably reduced by side
7758 * activity. The imbalance_pct is used for the threshold.
7759 * Return true is the capacity is reduced
9d5efe05
SV
7760 */
7761static inline int
ea67821b 7762check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
9d5efe05 7763{
ea67821b
VG
7764 return ((rq->cpu_capacity * sd->imbalance_pct) <
7765 (rq->cpu_capacity_orig * 100));
9d5efe05
SV
7766}
7767
30ce5dab
PZ
7768/*
7769 * Group imbalance indicates (and tries to solve) the problem where balancing
0c98d344 7770 * groups is inadequate due to ->cpus_allowed constraints.
30ce5dab
PZ
7771 *
7772 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
7773 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
7774 * Something like:
7775 *
2b4d5b25
IM
7776 * { 0 1 2 3 } { 4 5 6 7 }
7777 * * * * *
30ce5dab
PZ
7778 *
7779 * If we were to balance group-wise we'd place two tasks in the first group and
7780 * two tasks in the second group. Clearly this is undesired as it will overload
7781 * cpu 3 and leave one of the cpus in the second group unused.
7782 *
7783 * The current solution to this issue is detecting the skew in the first group
6263322c
PZ
7784 * by noticing the lower domain failed to reach balance and had difficulty
7785 * moving tasks due to affinity constraints.
30ce5dab
PZ
7786 *
7787 * When this is so detected; this group becomes a candidate for busiest; see
ed1b7732 7788 * update_sd_pick_busiest(). And calculate_imbalance() and
6263322c 7789 * find_busiest_group() avoid some of the usual balance conditions to allow it
30ce5dab
PZ
7790 * to create an effective group imbalance.
7791 *
7792 * This is a somewhat tricky proposition since the next run might not find the
7793 * group imbalance and decide the groups need to be balanced again. A most
7794 * subtle and fragile situation.
7795 */
7796
6263322c 7797static inline int sg_imbalanced(struct sched_group *group)
30ce5dab 7798{
63b2ca30 7799 return group->sgc->imbalance;
30ce5dab
PZ
7800}
7801
b37d9316 7802/*
ea67821b
VG
7803 * group_has_capacity returns true if the group has spare capacity that could
7804 * be used by some tasks.
7805 * We consider that a group has spare capacity if the * number of task is
9e91d61d
DE
7806 * smaller than the number of CPUs or if the utilization is lower than the
7807 * available capacity for CFS tasks.
ea67821b
VG
7808 * For the latter, we use a threshold to stabilize the state, to take into
7809 * account the variance of the tasks' load and to return true if the available
7810 * capacity in meaningful for the load balancer.
7811 * As an example, an available capacity of 1% can appear but it doesn't make
7812 * any benefit for the load balance.
b37d9316 7813 */
ea67821b
VG
7814static inline bool
7815group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
b37d9316 7816{
ea67821b
VG
7817 if (sgs->sum_nr_running < sgs->group_weight)
7818 return true;
c61037e9 7819
ea67821b 7820 if ((sgs->group_capacity * 100) >
9e91d61d 7821 (sgs->group_util * env->sd->imbalance_pct))
ea67821b 7822 return true;
b37d9316 7823
ea67821b
VG
7824 return false;
7825}
7826
7827/*
7828 * group_is_overloaded returns true if the group has more tasks than it can
7829 * handle.
7830 * group_is_overloaded is not equals to !group_has_capacity because a group
7831 * with the exact right number of tasks, has no more spare capacity but is not
7832 * overloaded so both group_has_capacity and group_is_overloaded return
7833 * false.
7834 */
7835static inline bool
7836group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
7837{
7838 if (sgs->sum_nr_running <= sgs->group_weight)
7839 return false;
b37d9316 7840
ea67821b 7841 if ((sgs->group_capacity * 100) <
9e91d61d 7842 (sgs->group_util * env->sd->imbalance_pct))
ea67821b 7843 return true;
b37d9316 7844
ea67821b 7845 return false;
b37d9316
PZ
7846}
7847
9e0994c0
MR
7848/*
7849 * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
7850 * per-CPU capacity than sched_group ref.
7851 */
7852static inline bool
7853group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
7854{
7855 return sg->sgc->min_capacity * capacity_margin <
7856 ref->sgc->min_capacity * 1024;
7857}
7858
79a89f92
LY
7859static inline enum
7860group_type group_classify(struct sched_group *group,
7861 struct sg_lb_stats *sgs)
caeb178c 7862{
ea67821b 7863 if (sgs->group_no_capacity)
caeb178c
RR
7864 return group_overloaded;
7865
7866 if (sg_imbalanced(group))
7867 return group_imbalanced;
7868
7869 return group_other;
7870}
7871
1e3c88bd
PZ
7872/**
7873 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
cd96891d 7874 * @env: The load balancing environment.
1e3c88bd 7875 * @group: sched_group whose statistics are to be updated.
1e3c88bd 7876 * @load_idx: Load index of sched_domain of this_cpu for load calc.
1e3c88bd 7877 * @local_group: Does group contain this_cpu.
1e3c88bd 7878 * @sgs: variable to hold the statistics for this group.
cd3bd4e6 7879 * @overload: Indicate more than one runnable task for any CPU.
1e3c88bd 7880 */
bd939f45
PZ
7881static inline void update_sg_lb_stats(struct lb_env *env,
7882 struct sched_group *group, int load_idx,
4486edd1
TC
7883 int local_group, struct sg_lb_stats *sgs,
7884 bool *overload)
1e3c88bd 7885{
30ce5dab 7886 unsigned long load;
a426f99c 7887 int i, nr_running;
1e3c88bd 7888
b72ff13c
PZ
7889 memset(sgs, 0, sizeof(*sgs));
7890
ae4df9d6 7891 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
1e3c88bd
PZ
7892 struct rq *rq = cpu_rq(i);
7893
1e3c88bd 7894 /* Bias balancing toward cpus of our domain */
6263322c 7895 if (local_group)
04f733b4 7896 load = target_load(i, load_idx);
6263322c 7897 else
1e3c88bd 7898 load = source_load(i, load_idx);
1e3c88bd
PZ
7899
7900 sgs->group_load += load;
9e91d61d 7901 sgs->group_util += cpu_util(i);
65fdac08 7902 sgs->sum_nr_running += rq->cfs.h_nr_running;
4486edd1 7903
a426f99c
WL
7904 nr_running = rq->nr_running;
7905 if (nr_running > 1)
4486edd1
TC
7906 *overload = true;
7907
0ec8aa00
PZ
7908#ifdef CONFIG_NUMA_BALANCING
7909 sgs->nr_numa_running += rq->nr_numa_running;
7910 sgs->nr_preferred_running += rq->nr_preferred_running;
7911#endif
c7132dd6 7912 sgs->sum_weighted_load += weighted_cpuload(rq);
a426f99c
WL
7913 /*
7914 * No need to call idle_cpu() if nr_running is not 0
7915 */
7916 if (!nr_running && idle_cpu(i))
aae6d3dd 7917 sgs->idle_cpus++;
1e3c88bd
PZ
7918 }
7919
63b2ca30
NP
7920 /* Adjust by relative CPU capacity of the group */
7921 sgs->group_capacity = group->sgc->capacity;
ca8ce3d0 7922 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
1e3c88bd 7923
dd5feea1 7924 if (sgs->sum_nr_running)
38d0f770 7925 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
1e3c88bd 7926
aae6d3dd 7927 sgs->group_weight = group->group_weight;
b37d9316 7928
ea67821b 7929 sgs->group_no_capacity = group_is_overloaded(env, sgs);
79a89f92 7930 sgs->group_type = group_classify(group, sgs);
1e3c88bd
PZ
7931}
7932
532cb4c4
MN
7933/**
7934 * update_sd_pick_busiest - return 1 on busiest group
cd96891d 7935 * @env: The load balancing environment.
532cb4c4
MN
7936 * @sds: sched_domain statistics
7937 * @sg: sched_group candidate to be checked for being the busiest
b6b12294 7938 * @sgs: sched_group statistics
532cb4c4
MN
7939 *
7940 * Determine if @sg is a busier group than the previously selected
7941 * busiest group.
e69f6186
YB
7942 *
7943 * Return: %true if @sg is a busier group than the previously selected
7944 * busiest group. %false otherwise.
532cb4c4 7945 */
bd939f45 7946static bool update_sd_pick_busiest(struct lb_env *env,
532cb4c4
MN
7947 struct sd_lb_stats *sds,
7948 struct sched_group *sg,
bd939f45 7949 struct sg_lb_stats *sgs)
532cb4c4 7950{
caeb178c 7951 struct sg_lb_stats *busiest = &sds->busiest_stat;
532cb4c4 7952
caeb178c 7953 if (sgs->group_type > busiest->group_type)
532cb4c4
MN
7954 return true;
7955
caeb178c
RR
7956 if (sgs->group_type < busiest->group_type)
7957 return false;
7958
7959 if (sgs->avg_load <= busiest->avg_load)
7960 return false;
7961
9e0994c0
MR
7962 if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
7963 goto asym_packing;
7964
7965 /*
7966 * Candidate sg has no more than one task per CPU and
7967 * has higher per-CPU capacity. Migrating tasks to less
7968 * capable CPUs may harm throughput. Maximize throughput,
7969 * power/energy consequences are not considered.
7970 */
7971 if (sgs->sum_nr_running <= sgs->group_weight &&
7972 group_smaller_cpu_capacity(sds->local, sg))
7973 return false;
7974
7975asym_packing:
caeb178c
RR
7976 /* This is the busiest node in its class. */
7977 if (!(env->sd->flags & SD_ASYM_PACKING))
532cb4c4
MN
7978 return true;
7979
1f621e02
SD
7980 /* No ASYM_PACKING if target cpu is already busy */
7981 if (env->idle == CPU_NOT_IDLE)
7982 return true;
532cb4c4 7983 /*
afe06efd
TC
7984 * ASYM_PACKING needs to move all the work to the highest
7985 * prority CPUs in the group, therefore mark all groups
7986 * of lower priority than ourself as busy.
532cb4c4 7987 */
afe06efd
TC
7988 if (sgs->sum_nr_running &&
7989 sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
532cb4c4
MN
7990 if (!sds->busiest)
7991 return true;
7992
afe06efd
TC
7993 /* Prefer to move from lowest priority cpu's work */
7994 if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
7995 sg->asym_prefer_cpu))
532cb4c4
MN
7996 return true;
7997 }
7998
7999 return false;
8000}
8001
0ec8aa00
PZ
8002#ifdef CONFIG_NUMA_BALANCING
8003static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8004{
8005 if (sgs->sum_nr_running > sgs->nr_numa_running)
8006 return regular;
8007 if (sgs->sum_nr_running > sgs->nr_preferred_running)
8008 return remote;
8009 return all;
8010}
8011
8012static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8013{
8014 if (rq->nr_running > rq->nr_numa_running)
8015 return regular;
8016 if (rq->nr_running > rq->nr_preferred_running)
8017 return remote;
8018 return all;
8019}
8020#else
8021static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8022{
8023 return all;
8024}
8025
8026static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8027{
8028 return regular;
8029}
8030#endif /* CONFIG_NUMA_BALANCING */
8031
1e3c88bd 8032/**
461819ac 8033 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
cd96891d 8034 * @env: The load balancing environment.
1e3c88bd
PZ
8035 * @sds: variable to hold the statistics for this sched_domain.
8036 */
0ec8aa00 8037static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
1e3c88bd 8038{
bd939f45
PZ
8039 struct sched_domain *child = env->sd->child;
8040 struct sched_group *sg = env->sd->groups;
05b40e05 8041 struct sg_lb_stats *local = &sds->local_stat;
56cf515b 8042 struct sg_lb_stats tmp_sgs;
1e3c88bd 8043 int load_idx, prefer_sibling = 0;
4486edd1 8044 bool overload = false;
1e3c88bd
PZ
8045
8046 if (child && child->flags & SD_PREFER_SIBLING)
8047 prefer_sibling = 1;
8048
bd939f45 8049 load_idx = get_sd_load_idx(env->sd, env->idle);
1e3c88bd
PZ
8050
8051 do {
56cf515b 8052 struct sg_lb_stats *sgs = &tmp_sgs;
1e3c88bd
PZ
8053 int local_group;
8054
ae4df9d6 8055 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
56cf515b
JK
8056 if (local_group) {
8057 sds->local = sg;
05b40e05 8058 sgs = local;
b72ff13c
PZ
8059
8060 if (env->idle != CPU_NEWLY_IDLE ||
63b2ca30
NP
8061 time_after_eq(jiffies, sg->sgc->next_update))
8062 update_group_capacity(env->sd, env->dst_cpu);
56cf515b 8063 }
1e3c88bd 8064
4486edd1
TC
8065 update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
8066 &overload);
1e3c88bd 8067
b72ff13c
PZ
8068 if (local_group)
8069 goto next_group;
8070
1e3c88bd
PZ
8071 /*
8072 * In case the child domain prefers tasks go to siblings
ea67821b 8073 * first, lower the sg capacity so that we'll try
75dd321d
NR
8074 * and move all the excess tasks away. We lower the capacity
8075 * of a group only if the local group has the capacity to fit
ea67821b
VG
8076 * these excess tasks. The extra check prevents the case where
8077 * you always pull from the heaviest group when it is already
8078 * under-utilized (possible with a large weight task outweighs
8079 * the tasks on the system).
1e3c88bd 8080 */
b72ff13c 8081 if (prefer_sibling && sds->local &&
05b40e05
SD
8082 group_has_capacity(env, local) &&
8083 (sgs->sum_nr_running > local->sum_nr_running + 1)) {
ea67821b 8084 sgs->group_no_capacity = 1;
79a89f92 8085 sgs->group_type = group_classify(sg, sgs);
cb0b9f24 8086 }
1e3c88bd 8087
b72ff13c 8088 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
532cb4c4 8089 sds->busiest = sg;
56cf515b 8090 sds->busiest_stat = *sgs;
1e3c88bd
PZ
8091 }
8092
b72ff13c
PZ
8093next_group:
8094 /* Now, start updating sd_lb_stats */
90001d67 8095 sds->total_running += sgs->sum_nr_running;
b72ff13c 8096 sds->total_load += sgs->group_load;
63b2ca30 8097 sds->total_capacity += sgs->group_capacity;
b72ff13c 8098
532cb4c4 8099 sg = sg->next;
bd939f45 8100 } while (sg != env->sd->groups);
0ec8aa00
PZ
8101
8102 if (env->sd->flags & SD_NUMA)
8103 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
4486edd1
TC
8104
8105 if (!env->sd->parent) {
8106 /* update overload indicator if we are at root domain */
8107 if (env->dst_rq->rd->overload != overload)
8108 env->dst_rq->rd->overload = overload;
8109 }
532cb4c4
MN
8110}
8111
532cb4c4
MN
8112/**
8113 * check_asym_packing - Check to see if the group is packed into the
0ba42a59 8114 * sched domain.
532cb4c4
MN
8115 *
8116 * This is primarily intended to used at the sibling level. Some
8117 * cores like POWER7 prefer to use lower numbered SMT threads. In the
8118 * case of POWER7, it can move to lower SMT modes only when higher
8119 * threads are idle. When in lower SMT modes, the threads will
8120 * perform better since they share less core resources. Hence when we
8121 * have idle threads, we want them to be the higher ones.
8122 *
8123 * This packing function is run on idle threads. It checks to see if
8124 * the busiest CPU in this domain (core in the P7 case) has a higher
8125 * CPU number than the packing function is being run on. Here we are
8126 * assuming lower CPU number will be equivalent to lower a SMT thread
8127 * number.
8128 *
e69f6186 8129 * Return: 1 when packing is required and a task should be moved to
46123355 8130 * this CPU. The amount of the imbalance is returned in env->imbalance.
b6b12294 8131 *
cd96891d 8132 * @env: The load balancing environment.
532cb4c4 8133 * @sds: Statistics of the sched_domain which is to be packed
532cb4c4 8134 */
bd939f45 8135static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
532cb4c4
MN
8136{
8137 int busiest_cpu;
8138
bd939f45 8139 if (!(env->sd->flags & SD_ASYM_PACKING))
532cb4c4
MN
8140 return 0;
8141
1f621e02
SD
8142 if (env->idle == CPU_NOT_IDLE)
8143 return 0;
8144
532cb4c4
MN
8145 if (!sds->busiest)
8146 return 0;
8147
afe06efd
TC
8148 busiest_cpu = sds->busiest->asym_prefer_cpu;
8149 if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
532cb4c4
MN
8150 return 0;
8151
bd939f45 8152 env->imbalance = DIV_ROUND_CLOSEST(
63b2ca30 8153 sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
ca8ce3d0 8154 SCHED_CAPACITY_SCALE);
bd939f45 8155
532cb4c4 8156 return 1;
1e3c88bd
PZ
8157}
8158
8159/**
8160 * fix_small_imbalance - Calculate the minor imbalance that exists
8161 * amongst the groups of a sched_domain, during
8162 * load balancing.
cd96891d 8163 * @env: The load balancing environment.
1e3c88bd 8164 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
1e3c88bd 8165 */
bd939f45
PZ
8166static inline
8167void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
1e3c88bd 8168{
63b2ca30 8169 unsigned long tmp, capa_now = 0, capa_move = 0;
1e3c88bd 8170 unsigned int imbn = 2;
dd5feea1 8171 unsigned long scaled_busy_load_per_task;
56cf515b 8172 struct sg_lb_stats *local, *busiest;
1e3c88bd 8173
56cf515b
JK
8174 local = &sds->local_stat;
8175 busiest = &sds->busiest_stat;
1e3c88bd 8176
56cf515b
JK
8177 if (!local->sum_nr_running)
8178 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
8179 else if (busiest->load_per_task > local->load_per_task)
8180 imbn = 1;
dd5feea1 8181
56cf515b 8182 scaled_busy_load_per_task =
ca8ce3d0 8183 (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
63b2ca30 8184 busiest->group_capacity;
56cf515b 8185
3029ede3
VD
8186 if (busiest->avg_load + scaled_busy_load_per_task >=
8187 local->avg_load + (scaled_busy_load_per_task * imbn)) {
56cf515b 8188 env->imbalance = busiest->load_per_task;
1e3c88bd
PZ
8189 return;
8190 }
8191
8192 /*
8193 * OK, we don't have enough imbalance to justify moving tasks,
ced549fa 8194 * however we may be able to increase total CPU capacity used by
1e3c88bd
PZ
8195 * moving them.
8196 */
8197
63b2ca30 8198 capa_now += busiest->group_capacity *
56cf515b 8199 min(busiest->load_per_task, busiest->avg_load);
63b2ca30 8200 capa_now += local->group_capacity *
56cf515b 8201 min(local->load_per_task, local->avg_load);
ca8ce3d0 8202 capa_now /= SCHED_CAPACITY_SCALE;
1e3c88bd
PZ
8203
8204 /* Amount of load we'd subtract */
a2cd4260 8205 if (busiest->avg_load > scaled_busy_load_per_task) {
63b2ca30 8206 capa_move += busiest->group_capacity *
56cf515b 8207 min(busiest->load_per_task,
a2cd4260 8208 busiest->avg_load - scaled_busy_load_per_task);
56cf515b 8209 }
1e3c88bd
PZ
8210
8211 /* Amount of load we'd add */
63b2ca30 8212 if (busiest->avg_load * busiest->group_capacity <
ca8ce3d0 8213 busiest->load_per_task * SCHED_CAPACITY_SCALE) {
63b2ca30
NP
8214 tmp = (busiest->avg_load * busiest->group_capacity) /
8215 local->group_capacity;
56cf515b 8216 } else {
ca8ce3d0 8217 tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
63b2ca30 8218 local->group_capacity;
56cf515b 8219 }
63b2ca30 8220 capa_move += local->group_capacity *
3ae11c90 8221 min(local->load_per_task, local->avg_load + tmp);
ca8ce3d0 8222 capa_move /= SCHED_CAPACITY_SCALE;
1e3c88bd
PZ
8223
8224 /* Move if we gain throughput */
63b2ca30 8225 if (capa_move > capa_now)
56cf515b 8226 env->imbalance = busiest->load_per_task;
1e3c88bd
PZ
8227}
8228
8229/**
8230 * calculate_imbalance - Calculate the amount of imbalance present within the
8231 * groups of a given sched_domain during load balance.
bd939f45 8232 * @env: load balance environment
1e3c88bd 8233 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
1e3c88bd 8234 */
bd939f45 8235static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
1e3c88bd 8236{
dd5feea1 8237 unsigned long max_pull, load_above_capacity = ~0UL;
56cf515b
JK
8238 struct sg_lb_stats *local, *busiest;
8239
8240 local = &sds->local_stat;
56cf515b 8241 busiest = &sds->busiest_stat;
dd5feea1 8242
caeb178c 8243 if (busiest->group_type == group_imbalanced) {
30ce5dab
PZ
8244 /*
8245 * In the group_imb case we cannot rely on group-wide averages
8246 * to ensure cpu-load equilibrium, look at wider averages. XXX
8247 */
56cf515b
JK
8248 busiest->load_per_task =
8249 min(busiest->load_per_task, sds->avg_load);
dd5feea1
SS
8250 }
8251
1e3c88bd 8252 /*
885e542c
DE
8253 * Avg load of busiest sg can be less and avg load of local sg can
8254 * be greater than avg load across all sgs of sd because avg load
8255 * factors in sg capacity and sgs with smaller group_type are
8256 * skipped when updating the busiest sg:
1e3c88bd 8257 */
b1885550
VD
8258 if (busiest->avg_load <= sds->avg_load ||
8259 local->avg_load >= sds->avg_load) {
bd939f45
PZ
8260 env->imbalance = 0;
8261 return fix_small_imbalance(env, sds);
1e3c88bd
PZ
8262 }
8263
9a5d9ba6
PZ
8264 /*
8265 * If there aren't any idle cpus, avoid creating some.
8266 */
8267 if (busiest->group_type == group_overloaded &&
8268 local->group_type == group_overloaded) {
1be0eb2a 8269 load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
cfa10334 8270 if (load_above_capacity > busiest->group_capacity) {
ea67821b 8271 load_above_capacity -= busiest->group_capacity;
26656215 8272 load_above_capacity *= scale_load_down(NICE_0_LOAD);
cfa10334
MR
8273 load_above_capacity /= busiest->group_capacity;
8274 } else
ea67821b 8275 load_above_capacity = ~0UL;
dd5feea1
SS
8276 }
8277
8278 /*
8279 * We're trying to get all the cpus to the average_load, so we don't
8280 * want to push ourselves above the average load, nor do we wish to
8281 * reduce the max loaded cpu below the average load. At the same time,
0a9b23ce
DE
8282 * we also don't want to reduce the group load below the group
8283 * capacity. Thus we look for the minimum possible imbalance.
dd5feea1 8284 */
30ce5dab 8285 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
1e3c88bd
PZ
8286
8287 /* How much load to actually move to equalise the imbalance */
56cf515b 8288 env->imbalance = min(
63b2ca30
NP
8289 max_pull * busiest->group_capacity,
8290 (sds->avg_load - local->avg_load) * local->group_capacity
ca8ce3d0 8291 ) / SCHED_CAPACITY_SCALE;
1e3c88bd
PZ
8292
8293 /*
8294 * if *imbalance is less than the average load per runnable task
25985edc 8295 * there is no guarantee that any tasks will be moved so we'll have
1e3c88bd
PZ
8296 * a think about bumping its value to force at least one task to be
8297 * moved
8298 */
56cf515b 8299 if (env->imbalance < busiest->load_per_task)
bd939f45 8300 return fix_small_imbalance(env, sds);
1e3c88bd 8301}
fab47622 8302
1e3c88bd
PZ
8303/******* find_busiest_group() helpers end here *********************/
8304
8305/**
8306 * find_busiest_group - Returns the busiest group within the sched_domain
0a9b23ce 8307 * if there is an imbalance.
1e3c88bd
PZ
8308 *
8309 * Also calculates the amount of weighted load which should be moved
8310 * to restore balance.
8311 *
cd96891d 8312 * @env: The load balancing environment.
1e3c88bd 8313 *
e69f6186 8314 * Return: - The busiest group if imbalance exists.
1e3c88bd 8315 */
56cf515b 8316static struct sched_group *find_busiest_group(struct lb_env *env)
1e3c88bd 8317{
56cf515b 8318 struct sg_lb_stats *local, *busiest;
1e3c88bd
PZ
8319 struct sd_lb_stats sds;
8320
147c5fc2 8321 init_sd_lb_stats(&sds);
1e3c88bd
PZ
8322
8323 /*
8324 * Compute the various statistics relavent for load balancing at
8325 * this level.
8326 */
23f0d209 8327 update_sd_lb_stats(env, &sds);
56cf515b
JK
8328 local = &sds.local_stat;
8329 busiest = &sds.busiest_stat;
1e3c88bd 8330
ea67821b 8331 /* ASYM feature bypasses nice load balance check */
1f621e02 8332 if (check_asym_packing(env, &sds))
532cb4c4
MN
8333 return sds.busiest;
8334
cc57aa8f 8335 /* There is no busy sibling group to pull tasks from */
56cf515b 8336 if (!sds.busiest || busiest->sum_nr_running == 0)
1e3c88bd
PZ
8337 goto out_balanced;
8338
90001d67 8339 /* XXX broken for overlapping NUMA groups */
ca8ce3d0
NP
8340 sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
8341 / sds.total_capacity;
b0432d8f 8342
866ab43e
PZ
8343 /*
8344 * If the busiest group is imbalanced the below checks don't
30ce5dab 8345 * work because they assume all things are equal, which typically
866ab43e
PZ
8346 * isn't true due to cpus_allowed constraints and the like.
8347 */
caeb178c 8348 if (busiest->group_type == group_imbalanced)
866ab43e
PZ
8349 goto force_balance;
8350
583ffd99
BJ
8351 /*
8352 * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
8353 * capacities from resulting in underutilization due to avg_load.
8354 */
8355 if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
ea67821b 8356 busiest->group_no_capacity)
fab47622
NR
8357 goto force_balance;
8358
cc57aa8f 8359 /*
9c58c79a 8360 * If the local group is busier than the selected busiest group
cc57aa8f
PZ
8361 * don't try and pull any tasks.
8362 */
56cf515b 8363 if (local->avg_load >= busiest->avg_load)
1e3c88bd
PZ
8364 goto out_balanced;
8365
cc57aa8f
PZ
8366 /*
8367 * Don't pull any tasks if this group is already above the domain
8368 * average load.
8369 */
56cf515b 8370 if (local->avg_load >= sds.avg_load)
1e3c88bd
PZ
8371 goto out_balanced;
8372
bd939f45 8373 if (env->idle == CPU_IDLE) {
aae6d3dd 8374 /*
43f4d666
VG
8375 * This cpu is idle. If the busiest group is not overloaded
8376 * and there is no imbalance between this and busiest group
8377 * wrt idle cpus, it is balanced. The imbalance becomes
8378 * significant if the diff is greater than 1 otherwise we
8379 * might end up to just move the imbalance on another group
aae6d3dd 8380 */
43f4d666
VG
8381 if ((busiest->group_type != group_overloaded) &&
8382 (local->idle_cpus <= (busiest->idle_cpus + 1)))
aae6d3dd 8383 goto out_balanced;
c186fafe
PZ
8384 } else {
8385 /*
8386 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
8387 * imbalance_pct to be conservative.
8388 */
56cf515b
JK
8389 if (100 * busiest->avg_load <=
8390 env->sd->imbalance_pct * local->avg_load)
c186fafe 8391 goto out_balanced;
aae6d3dd 8392 }
1e3c88bd 8393
fab47622 8394force_balance:
1e3c88bd 8395 /* Looks like there is an imbalance. Compute it */
bd939f45 8396 calculate_imbalance(env, &sds);
1e3c88bd
PZ
8397 return sds.busiest;
8398
8399out_balanced:
bd939f45 8400 env->imbalance = 0;
1e3c88bd
PZ
8401 return NULL;
8402}
8403
8404/*
8405 * find_busiest_queue - find the busiest runqueue among the cpus in group.
8406 */
bd939f45 8407static struct rq *find_busiest_queue(struct lb_env *env,
b9403130 8408 struct sched_group *group)
1e3c88bd
PZ
8409{
8410 struct rq *busiest = NULL, *rq;
ced549fa 8411 unsigned long busiest_load = 0, busiest_capacity = 1;
1e3c88bd
PZ
8412 int i;
8413
ae4df9d6 8414 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
ea67821b 8415 unsigned long capacity, wl;
0ec8aa00
PZ
8416 enum fbq_type rt;
8417
8418 rq = cpu_rq(i);
8419 rt = fbq_classify_rq(rq);
1e3c88bd 8420
0ec8aa00
PZ
8421 /*
8422 * We classify groups/runqueues into three groups:
8423 * - regular: there are !numa tasks
8424 * - remote: there are numa tasks that run on the 'wrong' node
8425 * - all: there is no distinction
8426 *
8427 * In order to avoid migrating ideally placed numa tasks,
8428 * ignore those when there's better options.
8429 *
8430 * If we ignore the actual busiest queue to migrate another
8431 * task, the next balance pass can still reduce the busiest
8432 * queue by moving tasks around inside the node.
8433 *
8434 * If we cannot move enough load due to this classification
8435 * the next pass will adjust the group classification and
8436 * allow migration of more tasks.
8437 *
8438 * Both cases only affect the total convergence complexity.
8439 */
8440 if (rt > env->fbq_type)
8441 continue;
8442
ced549fa 8443 capacity = capacity_of(i);
9d5efe05 8444
c7132dd6 8445 wl = weighted_cpuload(rq);
1e3c88bd 8446
6e40f5bb
TG
8447 /*
8448 * When comparing with imbalance, use weighted_cpuload()
ced549fa 8449 * which is not scaled with the cpu capacity.
6e40f5bb 8450 */
ea67821b
VG
8451
8452 if (rq->nr_running == 1 && wl > env->imbalance &&
8453 !check_cpu_capacity(rq, env->sd))
1e3c88bd
PZ
8454 continue;
8455
6e40f5bb
TG
8456 /*
8457 * For the load comparisons with the other cpu's, consider
ced549fa
NP
8458 * the weighted_cpuload() scaled with the cpu capacity, so
8459 * that the load can be moved away from the cpu that is
8460 * potentially running at a lower capacity.
95a79b80 8461 *
ced549fa 8462 * Thus we're looking for max(wl_i / capacity_i), crosswise
95a79b80 8463 * multiplication to rid ourselves of the division works out
ced549fa
NP
8464 * to: wl_i * capacity_j > wl_j * capacity_i; where j is
8465 * our previous maximum.
6e40f5bb 8466 */
ced549fa 8467 if (wl * busiest_capacity > busiest_load * capacity) {
95a79b80 8468 busiest_load = wl;
ced549fa 8469 busiest_capacity = capacity;
1e3c88bd
PZ
8470 busiest = rq;
8471 }
8472 }
8473
8474 return busiest;
8475}
8476
8477/*
8478 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
8479 * so long as it is large enough.
8480 */
8481#define MAX_PINNED_INTERVAL 512
8482
bd939f45 8483static int need_active_balance(struct lb_env *env)
1af3ed3d 8484{
bd939f45
PZ
8485 struct sched_domain *sd = env->sd;
8486
8487 if (env->idle == CPU_NEWLY_IDLE) {
532cb4c4
MN
8488
8489 /*
8490 * ASYM_PACKING needs to force migrate tasks from busy but
afe06efd
TC
8491 * lower priority CPUs in order to pack all tasks in the
8492 * highest priority CPUs.
532cb4c4 8493 */
afe06efd
TC
8494 if ((sd->flags & SD_ASYM_PACKING) &&
8495 sched_asym_prefer(env->dst_cpu, env->src_cpu))
532cb4c4 8496 return 1;
1af3ed3d
PZ
8497 }
8498
1aaf90a4
VG
8499 /*
8500 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
8501 * It's worth migrating the task if the src_cpu's capacity is reduced
8502 * because of other sched_class or IRQs if more capacity stays
8503 * available on dst_cpu.
8504 */
8505 if ((env->idle != CPU_NOT_IDLE) &&
8506 (env->src_rq->cfs.h_nr_running == 1)) {
8507 if ((check_cpu_capacity(env->src_rq, sd)) &&
8508 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
8509 return 1;
8510 }
8511
1af3ed3d
PZ
8512 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
8513}
8514
969c7921
TH
8515static int active_load_balance_cpu_stop(void *data);
8516
23f0d209
JK
8517static int should_we_balance(struct lb_env *env)
8518{
8519 struct sched_group *sg = env->sd->groups;
23f0d209
JK
8520 int cpu, balance_cpu = -1;
8521
024c9d2f
PZ
8522 /*
8523 * Ensure the balancing environment is consistent; can happen
8524 * when the softirq triggers 'during' hotplug.
8525 */
8526 if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
8527 return 0;
8528
23f0d209
JK
8529 /*
8530 * In the newly idle case, we will allow all the cpu's
8531 * to do the newly idle load balance.
8532 */
8533 if (env->idle == CPU_NEWLY_IDLE)
8534 return 1;
8535
23f0d209 8536 /* Try to find first idle cpu */
e5c14b1f 8537 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
af218122 8538 if (!idle_cpu(cpu))
23f0d209
JK
8539 continue;
8540
8541 balance_cpu = cpu;
8542 break;
8543 }
8544
8545 if (balance_cpu == -1)
8546 balance_cpu = group_balance_cpu(sg);
8547
8548 /*
8549 * First idle cpu or the first cpu(busiest) in this sched group
8550 * is eligible for doing load balancing at this and above domains.
8551 */
b0cff9d8 8552 return balance_cpu == env->dst_cpu;
23f0d209
JK
8553}
8554
1e3c88bd
PZ
8555/*
8556 * Check this_cpu to ensure it is balanced within domain. Attempt to move
8557 * tasks if there is an imbalance.
8558 */
8559static int load_balance(int this_cpu, struct rq *this_rq,
8560 struct sched_domain *sd, enum cpu_idle_type idle,
23f0d209 8561 int *continue_balancing)
1e3c88bd 8562{
88b8dac0 8563 int ld_moved, cur_ld_moved, active_balance = 0;
6263322c 8564 struct sched_domain *sd_parent = sd->parent;
1e3c88bd 8565 struct sched_group *group;
1e3c88bd 8566 struct rq *busiest;
8a8c69c3 8567 struct rq_flags rf;
4ba29684 8568 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
1e3c88bd 8569
8e45cb54
PZ
8570 struct lb_env env = {
8571 .sd = sd,
ddcdf6e7
PZ
8572 .dst_cpu = this_cpu,
8573 .dst_rq = this_rq,
ae4df9d6 8574 .dst_grpmask = sched_group_span(sd->groups),
8e45cb54 8575 .idle = idle,
eb95308e 8576 .loop_break = sched_nr_migrate_break,
b9403130 8577 .cpus = cpus,
0ec8aa00 8578 .fbq_type = all,
163122b7 8579 .tasks = LIST_HEAD_INIT(env.tasks),
8e45cb54
PZ
8580 };
8581
65a4433a 8582 cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
1e3c88bd 8583
ae92882e 8584 schedstat_inc(sd->lb_count[idle]);
1e3c88bd
PZ
8585
8586redo:
23f0d209
JK
8587 if (!should_we_balance(&env)) {
8588 *continue_balancing = 0;
1e3c88bd 8589 goto out_balanced;
23f0d209 8590 }
1e3c88bd 8591
23f0d209 8592 group = find_busiest_group(&env);
1e3c88bd 8593 if (!group) {
ae92882e 8594 schedstat_inc(sd->lb_nobusyg[idle]);
1e3c88bd
PZ
8595 goto out_balanced;
8596 }
8597
b9403130 8598 busiest = find_busiest_queue(&env, group);
1e3c88bd 8599 if (!busiest) {
ae92882e 8600 schedstat_inc(sd->lb_nobusyq[idle]);
1e3c88bd
PZ
8601 goto out_balanced;
8602 }
8603
78feefc5 8604 BUG_ON(busiest == env.dst_rq);
1e3c88bd 8605
ae92882e 8606 schedstat_add(sd->lb_imbalance[idle], env.imbalance);
1e3c88bd 8607
1aaf90a4
VG
8608 env.src_cpu = busiest->cpu;
8609 env.src_rq = busiest;
8610
1e3c88bd
PZ
8611 ld_moved = 0;
8612 if (busiest->nr_running > 1) {
8613 /*
8614 * Attempt to move tasks. If find_busiest_group has found
8615 * an imbalance but busiest->nr_running <= 1, the group is
8616 * still unbalanced. ld_moved simply stays zero, so it is
8617 * correctly treated as an imbalance.
8618 */
8e45cb54 8619 env.flags |= LBF_ALL_PINNED;
c82513e5 8620 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
8e45cb54 8621
5d6523eb 8622more_balance:
8a8c69c3 8623 rq_lock_irqsave(busiest, &rf);
3bed5e21 8624 update_rq_clock(busiest);
88b8dac0
SV
8625
8626 /*
8627 * cur_ld_moved - load moved in current iteration
8628 * ld_moved - cumulative load moved across iterations
8629 */
163122b7 8630 cur_ld_moved = detach_tasks(&env);
1e3c88bd
PZ
8631
8632 /*
163122b7
KT
8633 * We've detached some tasks from busiest_rq. Every
8634 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
8635 * unlock busiest->lock, and we are able to be sure
8636 * that nobody can manipulate the tasks in parallel.
8637 * See task_rq_lock() family for the details.
1e3c88bd 8638 */
163122b7 8639
8a8c69c3 8640 rq_unlock(busiest, &rf);
163122b7
KT
8641
8642 if (cur_ld_moved) {
8643 attach_tasks(&env);
8644 ld_moved += cur_ld_moved;
8645 }
8646
8a8c69c3 8647 local_irq_restore(rf.flags);
88b8dac0 8648
f1cd0858
JK
8649 if (env.flags & LBF_NEED_BREAK) {
8650 env.flags &= ~LBF_NEED_BREAK;
8651 goto more_balance;
8652 }
8653
88b8dac0
SV
8654 /*
8655 * Revisit (affine) tasks on src_cpu that couldn't be moved to
8656 * us and move them to an alternate dst_cpu in our sched_group
8657 * where they can run. The upper limit on how many times we
8658 * iterate on same src_cpu is dependent on number of cpus in our
8659 * sched_group.
8660 *
8661 * This changes load balance semantics a bit on who can move
8662 * load to a given_cpu. In addition to the given_cpu itself
8663 * (or a ilb_cpu acting on its behalf where given_cpu is
8664 * nohz-idle), we now have balance_cpu in a position to move
8665 * load to given_cpu. In rare situations, this may cause
8666 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
8667 * _independently_ and at _same_ time to move some load to
8668 * given_cpu) causing exceess load to be moved to given_cpu.
8669 * This however should not happen so much in practice and
8670 * moreover subsequent load balance cycles should correct the
8671 * excess load moved.
8672 */
6263322c 8673 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
88b8dac0 8674
7aff2e3a
VD
8675 /* Prevent to re-select dst_cpu via env's cpus */
8676 cpumask_clear_cpu(env.dst_cpu, env.cpus);
8677
78feefc5 8678 env.dst_rq = cpu_rq(env.new_dst_cpu);
88b8dac0 8679 env.dst_cpu = env.new_dst_cpu;
6263322c 8680 env.flags &= ~LBF_DST_PINNED;
88b8dac0
SV
8681 env.loop = 0;
8682 env.loop_break = sched_nr_migrate_break;
e02e60c1 8683
88b8dac0
SV
8684 /*
8685 * Go back to "more_balance" rather than "redo" since we
8686 * need to continue with same src_cpu.
8687 */
8688 goto more_balance;
8689 }
1e3c88bd 8690
6263322c
PZ
8691 /*
8692 * We failed to reach balance because of affinity.
8693 */
8694 if (sd_parent) {
63b2ca30 8695 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
6263322c 8696
afdeee05 8697 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
6263322c 8698 *group_imbalance = 1;
6263322c
PZ
8699 }
8700
1e3c88bd 8701 /* All tasks on this runqueue were pinned by CPU affinity */
8e45cb54 8702 if (unlikely(env.flags & LBF_ALL_PINNED)) {
1e3c88bd 8703 cpumask_clear_cpu(cpu_of(busiest), cpus);
65a4433a
JH
8704 /*
8705 * Attempting to continue load balancing at the current
8706 * sched_domain level only makes sense if there are
8707 * active CPUs remaining as possible busiest CPUs to
8708 * pull load from which are not contained within the
8709 * destination group that is receiving any migrated
8710 * load.
8711 */
8712 if (!cpumask_subset(cpus, env.dst_grpmask)) {
bbf18b19
PN
8713 env.loop = 0;
8714 env.loop_break = sched_nr_migrate_break;
1e3c88bd 8715 goto redo;
bbf18b19 8716 }
afdeee05 8717 goto out_all_pinned;
1e3c88bd
PZ
8718 }
8719 }
8720
8721 if (!ld_moved) {
ae92882e 8722 schedstat_inc(sd->lb_failed[idle]);
58b26c4c
VP
8723 /*
8724 * Increment the failure counter only on periodic balance.
8725 * We do not want newidle balance, which can be very
8726 * frequent, pollute the failure counter causing
8727 * excessive cache_hot migrations and active balances.
8728 */
8729 if (idle != CPU_NEWLY_IDLE)
8730 sd->nr_balance_failed++;
1e3c88bd 8731
bd939f45 8732 if (need_active_balance(&env)) {
8a8c69c3
PZ
8733 unsigned long flags;
8734
1e3c88bd
PZ
8735 raw_spin_lock_irqsave(&busiest->lock, flags);
8736
969c7921
TH
8737 /* don't kick the active_load_balance_cpu_stop,
8738 * if the curr task on busiest cpu can't be
8739 * moved to this_cpu
1e3c88bd 8740 */
0c98d344 8741 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
1e3c88bd
PZ
8742 raw_spin_unlock_irqrestore(&busiest->lock,
8743 flags);
8e45cb54 8744 env.flags |= LBF_ALL_PINNED;
1e3c88bd
PZ
8745 goto out_one_pinned;
8746 }
8747
969c7921
TH
8748 /*
8749 * ->active_balance synchronizes accesses to
8750 * ->active_balance_work. Once set, it's cleared
8751 * only after active load balance is finished.
8752 */
1e3c88bd
PZ
8753 if (!busiest->active_balance) {
8754 busiest->active_balance = 1;
8755 busiest->push_cpu = this_cpu;
8756 active_balance = 1;
8757 }
8758 raw_spin_unlock_irqrestore(&busiest->lock, flags);
969c7921 8759
bd939f45 8760 if (active_balance) {
969c7921
TH
8761 stop_one_cpu_nowait(cpu_of(busiest),
8762 active_load_balance_cpu_stop, busiest,
8763 &busiest->active_balance_work);
bd939f45 8764 }
1e3c88bd 8765
d02c0711 8766 /* We've kicked active balancing, force task migration. */
1e3c88bd
PZ
8767 sd->nr_balance_failed = sd->cache_nice_tries+1;
8768 }
8769 } else
8770 sd->nr_balance_failed = 0;
8771
8772 if (likely(!active_balance)) {
8773 /* We were unbalanced, so reset the balancing interval */
8774 sd->balance_interval = sd->min_interval;
8775 } else {
8776 /*
8777 * If we've begun active balancing, start to back off. This
8778 * case may not be covered by the all_pinned logic if there
8779 * is only 1 task on the busy runqueue (because we don't call
163122b7 8780 * detach_tasks).
1e3c88bd
PZ
8781 */
8782 if (sd->balance_interval < sd->max_interval)
8783 sd->balance_interval *= 2;
8784 }
8785
1e3c88bd
PZ
8786 goto out;
8787
8788out_balanced:
afdeee05
VG
8789 /*
8790 * We reach balance although we may have faced some affinity
8791 * constraints. Clear the imbalance flag if it was set.
8792 */
8793 if (sd_parent) {
8794 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
8795
8796 if (*group_imbalance)
8797 *group_imbalance = 0;
8798 }
8799
8800out_all_pinned:
8801 /*
8802 * We reach balance because all tasks are pinned at this level so
8803 * we can't migrate them. Let the imbalance flag set so parent level
8804 * can try to migrate them.
8805 */
ae92882e 8806 schedstat_inc(sd->lb_balanced[idle]);
1e3c88bd
PZ
8807
8808 sd->nr_balance_failed = 0;
8809
8810out_one_pinned:
8811 /* tune up the balancing interval */
8e45cb54 8812 if (((env.flags & LBF_ALL_PINNED) &&
5b54b56b 8813 sd->balance_interval < MAX_PINNED_INTERVAL) ||
1e3c88bd
PZ
8814 (sd->balance_interval < sd->max_interval))
8815 sd->balance_interval *= 2;
8816
46e49b38 8817 ld_moved = 0;
1e3c88bd 8818out:
1e3c88bd
PZ
8819 return ld_moved;
8820}
8821
52a08ef1
JL
8822static inline unsigned long
8823get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
8824{
8825 unsigned long interval = sd->balance_interval;
8826
8827 if (cpu_busy)
8828 interval *= sd->busy_factor;
8829
8830 /* scale ms to jiffies */
8831 interval = msecs_to_jiffies(interval);
8832 interval = clamp(interval, 1UL, max_load_balance_interval);
8833
8834 return interval;
8835}
8836
8837static inline void
31851a98 8838update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
52a08ef1
JL
8839{
8840 unsigned long interval, next;
8841
31851a98
LY
8842 /* used by idle balance, so cpu_busy = 0 */
8843 interval = get_sd_balance_interval(sd, 0);
52a08ef1
JL
8844 next = sd->last_balance + interval;
8845
8846 if (time_after(*next_balance, next))
8847 *next_balance = next;
8848}
8849
1e3c88bd
PZ
8850/*
8851 * idle_balance is called by schedule() if this_cpu is about to become
8852 * idle. Attempts to pull tasks from other CPUs.
8853 */
46f69fa3 8854static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
1e3c88bd 8855{
52a08ef1
JL
8856 unsigned long next_balance = jiffies + HZ;
8857 int this_cpu = this_rq->cpu;
1e3c88bd
PZ
8858 struct sched_domain *sd;
8859 int pulled_task = 0;
9bd721c5 8860 u64 curr_cost = 0;
1e3c88bd 8861
6e83125c
PZ
8862 /*
8863 * We must set idle_stamp _before_ calling idle_balance(), such that we
8864 * measure the duration of idle_balance() as idle time.
8865 */
8866 this_rq->idle_stamp = rq_clock(this_rq);
8867
2800486e
PZ
8868 /*
8869 * Do not pull tasks towards !active CPUs...
8870 */
8871 if (!cpu_active(this_cpu))
8872 return 0;
8873
46f69fa3
MF
8874 /*
8875 * This is OK, because current is on_cpu, which avoids it being picked
8876 * for load-balance and preemption/IRQs are still disabled avoiding
8877 * further scheduler activity on it and we're being very careful to
8878 * re-start the picking loop.
8879 */
8880 rq_unpin_lock(this_rq, rf);
8881
4486edd1
TC
8882 if (this_rq->avg_idle < sysctl_sched_migration_cost ||
8883 !this_rq->rd->overload) {
52a08ef1
JL
8884 rcu_read_lock();
8885 sd = rcu_dereference_check_sched_domain(this_rq->sd);
8886 if (sd)
31851a98 8887 update_next_balance(sd, &next_balance);
52a08ef1
JL
8888 rcu_read_unlock();
8889
6e83125c 8890 goto out;
52a08ef1 8891 }
1e3c88bd 8892
f492e12e
PZ
8893 raw_spin_unlock(&this_rq->lock);
8894
48a16753 8895 update_blocked_averages(this_cpu);
dce840a0 8896 rcu_read_lock();
1e3c88bd 8897 for_each_domain(this_cpu, sd) {
23f0d209 8898 int continue_balancing = 1;
9bd721c5 8899 u64 t0, domain_cost;
1e3c88bd
PZ
8900
8901 if (!(sd->flags & SD_LOAD_BALANCE))
8902 continue;
8903
52a08ef1 8904 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
31851a98 8905 update_next_balance(sd, &next_balance);
9bd721c5 8906 break;
52a08ef1 8907 }
9bd721c5 8908
f492e12e 8909 if (sd->flags & SD_BALANCE_NEWIDLE) {
9bd721c5
JL
8910 t0 = sched_clock_cpu(this_cpu);
8911
f492e12e 8912 pulled_task = load_balance(this_cpu, this_rq,
23f0d209
JK
8913 sd, CPU_NEWLY_IDLE,
8914 &continue_balancing);
9bd721c5
JL
8915
8916 domain_cost = sched_clock_cpu(this_cpu) - t0;
8917 if (domain_cost > sd->max_newidle_lb_cost)
8918 sd->max_newidle_lb_cost = domain_cost;
8919
8920 curr_cost += domain_cost;
f492e12e 8921 }
1e3c88bd 8922
31851a98 8923 update_next_balance(sd, &next_balance);
39a4d9ca
JL
8924
8925 /*
8926 * Stop searching for tasks to pull if there are
8927 * now runnable tasks on this rq.
8928 */
8929 if (pulled_task || this_rq->nr_running > 0)
1e3c88bd 8930 break;
1e3c88bd 8931 }
dce840a0 8932 rcu_read_unlock();
f492e12e
PZ
8933
8934 raw_spin_lock(&this_rq->lock);
8935
0e5b5337
JL
8936 if (curr_cost > this_rq->max_idle_balance_cost)
8937 this_rq->max_idle_balance_cost = curr_cost;
8938
e5fc6611 8939 /*
0e5b5337
JL
8940 * While browsing the domains, we released the rq lock, a task could
8941 * have been enqueued in the meantime. Since we're not going idle,
8942 * pretend we pulled a task.
e5fc6611 8943 */
0e5b5337 8944 if (this_rq->cfs.h_nr_running && !pulled_task)
6e83125c 8945 pulled_task = 1;
e5fc6611 8946
52a08ef1
JL
8947out:
8948 /* Move the next balance forward */
8949 if (time_after(this_rq->next_balance, next_balance))
1e3c88bd 8950 this_rq->next_balance = next_balance;
9bd721c5 8951
e4aa358b 8952 /* Is there a task of a high priority class? */
46383648 8953 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
e4aa358b
KT
8954 pulled_task = -1;
8955
38c6ade2 8956 if (pulled_task)
6e83125c
PZ
8957 this_rq->idle_stamp = 0;
8958
46f69fa3
MF
8959 rq_repin_lock(this_rq, rf);
8960
3c4017c1 8961 return pulled_task;
1e3c88bd
PZ
8962}
8963
8964/*
969c7921
TH
8965 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
8966 * running tasks off the busiest CPU onto idle CPUs. It requires at
8967 * least 1 task to be running on each physical CPU where possible, and
8968 * avoids physical / logical imbalances.
1e3c88bd 8969 */
969c7921 8970static int active_load_balance_cpu_stop(void *data)
1e3c88bd 8971{
969c7921
TH
8972 struct rq *busiest_rq = data;
8973 int busiest_cpu = cpu_of(busiest_rq);
1e3c88bd 8974 int target_cpu = busiest_rq->push_cpu;
969c7921 8975 struct rq *target_rq = cpu_rq(target_cpu);
1e3c88bd 8976 struct sched_domain *sd;
e5673f28 8977 struct task_struct *p = NULL;
8a8c69c3 8978 struct rq_flags rf;
969c7921 8979
8a8c69c3 8980 rq_lock_irq(busiest_rq, &rf);
edd8e41d
PZ
8981 /*
8982 * Between queueing the stop-work and running it is a hole in which
8983 * CPUs can become inactive. We should not move tasks from or to
8984 * inactive CPUs.
8985 */
8986 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
8987 goto out_unlock;
969c7921
TH
8988
8989 /* make sure the requested cpu hasn't gone down in the meantime */
8990 if (unlikely(busiest_cpu != smp_processor_id() ||
8991 !busiest_rq->active_balance))
8992 goto out_unlock;
1e3c88bd
PZ
8993
8994 /* Is there any task to move? */
8995 if (busiest_rq->nr_running <= 1)
969c7921 8996 goto out_unlock;
1e3c88bd
PZ
8997
8998 /*
8999 * This condition is "impossible", if it occurs
9000 * we need to fix it. Originally reported by
9001 * Bjorn Helgaas on a 128-cpu setup.
9002 */
9003 BUG_ON(busiest_rq == target_rq);
9004
1e3c88bd 9005 /* Search for an sd spanning us and the target CPU. */
dce840a0 9006 rcu_read_lock();
1e3c88bd
PZ
9007 for_each_domain(target_cpu, sd) {
9008 if ((sd->flags & SD_LOAD_BALANCE) &&
9009 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
9010 break;
9011 }
9012
9013 if (likely(sd)) {
8e45cb54
PZ
9014 struct lb_env env = {
9015 .sd = sd,
ddcdf6e7
PZ
9016 .dst_cpu = target_cpu,
9017 .dst_rq = target_rq,
9018 .src_cpu = busiest_rq->cpu,
9019 .src_rq = busiest_rq,
8e45cb54 9020 .idle = CPU_IDLE,
65a4433a
JH
9021 /*
9022 * can_migrate_task() doesn't need to compute new_dst_cpu
9023 * for active balancing. Since we have CPU_IDLE, but no
9024 * @dst_grpmask we need to make that test go away with lying
9025 * about DST_PINNED.
9026 */
9027 .flags = LBF_DST_PINNED,
8e45cb54
PZ
9028 };
9029
ae92882e 9030 schedstat_inc(sd->alb_count);
3bed5e21 9031 update_rq_clock(busiest_rq);
1e3c88bd 9032
e5673f28 9033 p = detach_one_task(&env);
d02c0711 9034 if (p) {
ae92882e 9035 schedstat_inc(sd->alb_pushed);
d02c0711
SD
9036 /* Active balancing done, reset the failure counter. */
9037 sd->nr_balance_failed = 0;
9038 } else {
ae92882e 9039 schedstat_inc(sd->alb_failed);
d02c0711 9040 }
1e3c88bd 9041 }
dce840a0 9042 rcu_read_unlock();
969c7921
TH
9043out_unlock:
9044 busiest_rq->active_balance = 0;
8a8c69c3 9045 rq_unlock(busiest_rq, &rf);
e5673f28
KT
9046
9047 if (p)
9048 attach_one_task(target_rq, p);
9049
9050 local_irq_enable();
9051
969c7921 9052 return 0;
1e3c88bd
PZ
9053}
9054
d987fc7f
MG
9055static inline int on_null_domain(struct rq *rq)
9056{
9057 return unlikely(!rcu_dereference_sched(rq->sd));
9058}
9059
3451d024 9060#ifdef CONFIG_NO_HZ_COMMON
83cd4fe2
VP
9061/*
9062 * idle load balancing details
83cd4fe2
VP
9063 * - When one of the busy CPUs notice that there may be an idle rebalancing
9064 * needed, they will kick the idle load balancer, which then does idle
9065 * load balancing for all the idle CPUs.
9066 */
1e3c88bd 9067static struct {
83cd4fe2 9068 cpumask_var_t idle_cpus_mask;
0b005cf5 9069 atomic_t nr_cpus;
83cd4fe2
VP
9070 unsigned long next_balance; /* in jiffy units */
9071} nohz ____cacheline_aligned;
1e3c88bd 9072
3dd0337d 9073static inline int find_new_ilb(void)
1e3c88bd 9074{
0b005cf5 9075 int ilb = cpumask_first(nohz.idle_cpus_mask);
1e3c88bd 9076
786d6dc7
SS
9077 if (ilb < nr_cpu_ids && idle_cpu(ilb))
9078 return ilb;
9079
9080 return nr_cpu_ids;
1e3c88bd 9081}
1e3c88bd 9082
83cd4fe2
VP
9083/*
9084 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
9085 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
9086 * CPU (if there is one).
9087 */
0aeeeeba 9088static void nohz_balancer_kick(void)
83cd4fe2
VP
9089{
9090 int ilb_cpu;
9091
9092 nohz.next_balance++;
9093
3dd0337d 9094 ilb_cpu = find_new_ilb();
83cd4fe2 9095
0b005cf5
SS
9096 if (ilb_cpu >= nr_cpu_ids)
9097 return;
83cd4fe2 9098
cd490c5b 9099 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
1c792db7
SS
9100 return;
9101 /*
9102 * Use smp_send_reschedule() instead of resched_cpu().
9103 * This way we generate a sched IPI on the target cpu which
9104 * is idle. And the softirq performing nohz idle load balance
9105 * will be run before returning from the IPI.
9106 */
9107 smp_send_reschedule(ilb_cpu);
83cd4fe2
VP
9108 return;
9109}
9110
20a5c8cc 9111void nohz_balance_exit_idle(unsigned int cpu)
71325960
SS
9112{
9113 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
d987fc7f
MG
9114 /*
9115 * Completely isolated CPUs don't ever set, so we must test.
9116 */
9117 if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
9118 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
9119 atomic_dec(&nohz.nr_cpus);
9120 }
71325960
SS
9121 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
9122 }
9123}
9124
69e1e811
SS
9125static inline void set_cpu_sd_state_busy(void)
9126{
9127 struct sched_domain *sd;
37dc6b50 9128 int cpu = smp_processor_id();
69e1e811 9129
69e1e811 9130 rcu_read_lock();
0e369d75 9131 sd = rcu_dereference(per_cpu(sd_llc, cpu));
25f55d9d
VG
9132
9133 if (!sd || !sd->nohz_idle)
9134 goto unlock;
9135 sd->nohz_idle = 0;
9136
0e369d75 9137 atomic_inc(&sd->shared->nr_busy_cpus);
25f55d9d 9138unlock:
69e1e811
SS
9139 rcu_read_unlock();
9140}
9141
9142void set_cpu_sd_state_idle(void)
9143{
9144 struct sched_domain *sd;
37dc6b50 9145 int cpu = smp_processor_id();
69e1e811 9146
69e1e811 9147 rcu_read_lock();
0e369d75 9148 sd = rcu_dereference(per_cpu(sd_llc, cpu));
25f55d9d
VG
9149
9150 if (!sd || sd->nohz_idle)
9151 goto unlock;
9152 sd->nohz_idle = 1;
9153
0e369d75 9154 atomic_dec(&sd->shared->nr_busy_cpus);
25f55d9d 9155unlock:
69e1e811
SS
9156 rcu_read_unlock();
9157}
9158
1e3c88bd 9159/*
c1cc017c 9160 * This routine will record that the cpu is going idle with tick stopped.
0b005cf5 9161 * This info will be used in performing idle load balancing in the future.
1e3c88bd 9162 */
c1cc017c 9163void nohz_balance_enter_idle(int cpu)
1e3c88bd 9164{
71325960
SS
9165 /*
9166 * If this cpu is going down, then nothing needs to be done.
9167 */
9168 if (!cpu_active(cpu))
9169 return;
9170
387bc8b5 9171 /* Spare idle load balancing on CPUs that don't want to be disturbed: */
de201559 9172 if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
387bc8b5
FW
9173 return;
9174
c1cc017c
AS
9175 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
9176 return;
1e3c88bd 9177
d987fc7f
MG
9178 /*
9179 * If we're a completely isolated CPU, we don't play.
9180 */
9181 if (on_null_domain(cpu_rq(cpu)))
9182 return;
9183
c1cc017c
AS
9184 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
9185 atomic_inc(&nohz.nr_cpus);
9186 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
1e3c88bd
PZ
9187}
9188#endif
9189
9190static DEFINE_SPINLOCK(balancing);
9191
49c022e6
PZ
9192/*
9193 * Scale the max load_balance interval with the number of CPUs in the system.
9194 * This trades load-balance latency on larger machines for less cross talk.
9195 */
029632fb 9196void update_max_interval(void)
49c022e6
PZ
9197{
9198 max_load_balance_interval = HZ*num_online_cpus()/10;
9199}
9200
1e3c88bd
PZ
9201/*
9202 * It checks each scheduling domain to see if it is due to be balanced,
9203 * and initiates a balancing operation if so.
9204 *
b9b0853a 9205 * Balancing parameters are set up in init_sched_domains.
1e3c88bd 9206 */
f7ed0a89 9207static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
1e3c88bd 9208{
23f0d209 9209 int continue_balancing = 1;
f7ed0a89 9210 int cpu = rq->cpu;
1e3c88bd 9211 unsigned long interval;
04f733b4 9212 struct sched_domain *sd;
1e3c88bd
PZ
9213 /* Earliest time when we have to do rebalance again */
9214 unsigned long next_balance = jiffies + 60*HZ;
9215 int update_next_balance = 0;
f48627e6
JL
9216 int need_serialize, need_decay = 0;
9217 u64 max_cost = 0;
1e3c88bd 9218
48a16753 9219 update_blocked_averages(cpu);
2069dd75 9220
dce840a0 9221 rcu_read_lock();
1e3c88bd 9222 for_each_domain(cpu, sd) {
f48627e6
JL
9223 /*
9224 * Decay the newidle max times here because this is a regular
9225 * visit to all the domains. Decay ~1% per second.
9226 */
9227 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
9228 sd->max_newidle_lb_cost =
9229 (sd->max_newidle_lb_cost * 253) / 256;
9230 sd->next_decay_max_lb_cost = jiffies + HZ;
9231 need_decay = 1;
9232 }
9233 max_cost += sd->max_newidle_lb_cost;
9234
1e3c88bd
PZ
9235 if (!(sd->flags & SD_LOAD_BALANCE))
9236 continue;
9237
f48627e6
JL
9238 /*
9239 * Stop the load balance at this level. There is another
9240 * CPU in our sched group which is doing load balancing more
9241 * actively.
9242 */
9243 if (!continue_balancing) {
9244 if (need_decay)
9245 continue;
9246 break;
9247 }
9248
52a08ef1 9249 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
1e3c88bd
PZ
9250
9251 need_serialize = sd->flags & SD_SERIALIZE;
1e3c88bd
PZ
9252 if (need_serialize) {
9253 if (!spin_trylock(&balancing))
9254 goto out;
9255 }
9256
9257 if (time_after_eq(jiffies, sd->last_balance + interval)) {
23f0d209 9258 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
1e3c88bd 9259 /*
6263322c 9260 * The LBF_DST_PINNED logic could have changed
de5eb2dd
JK
9261 * env->dst_cpu, so we can't know our idle
9262 * state even if we migrated tasks. Update it.
1e3c88bd 9263 */
de5eb2dd 9264 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
1e3c88bd
PZ
9265 }
9266 sd->last_balance = jiffies;
52a08ef1 9267 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
1e3c88bd
PZ
9268 }
9269 if (need_serialize)
9270 spin_unlock(&balancing);
9271out:
9272 if (time_after(next_balance, sd->last_balance + interval)) {
9273 next_balance = sd->last_balance + interval;
9274 update_next_balance = 1;
9275 }
f48627e6
JL
9276 }
9277 if (need_decay) {
1e3c88bd 9278 /*
f48627e6
JL
9279 * Ensure the rq-wide value also decays but keep it at a
9280 * reasonable floor to avoid funnies with rq->avg_idle.
1e3c88bd 9281 */
f48627e6
JL
9282 rq->max_idle_balance_cost =
9283 max((u64)sysctl_sched_migration_cost, max_cost);
1e3c88bd 9284 }
dce840a0 9285 rcu_read_unlock();
1e3c88bd
PZ
9286
9287 /*
9288 * next_balance will be updated only when there is a need.
9289 * When the cpu is attached to null domain for ex, it will not be
9290 * updated.
9291 */
c5afb6a8 9292 if (likely(update_next_balance)) {
1e3c88bd 9293 rq->next_balance = next_balance;
c5afb6a8
VG
9294
9295#ifdef CONFIG_NO_HZ_COMMON
9296 /*
9297 * If this CPU has been elected to perform the nohz idle
9298 * balance. Other idle CPUs have already rebalanced with
9299 * nohz_idle_balance() and nohz.next_balance has been
9300 * updated accordingly. This CPU is now running the idle load
9301 * balance for itself and we need to update the
9302 * nohz.next_balance accordingly.
9303 */
9304 if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
9305 nohz.next_balance = rq->next_balance;
9306#endif
9307 }
1e3c88bd
PZ
9308}
9309
3451d024 9310#ifdef CONFIG_NO_HZ_COMMON
1e3c88bd 9311/*
3451d024 9312 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
1e3c88bd
PZ
9313 * rebalancing for all the cpus for whom scheduler ticks are stopped.
9314 */
208cb16b 9315static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
83cd4fe2 9316{
208cb16b 9317 int this_cpu = this_rq->cpu;
83cd4fe2
VP
9318 struct rq *rq;
9319 int balance_cpu;
c5afb6a8
VG
9320 /* Earliest time when we have to do rebalance again */
9321 unsigned long next_balance = jiffies + 60*HZ;
9322 int update_next_balance = 0;
83cd4fe2 9323
1c792db7
SS
9324 if (idle != CPU_IDLE ||
9325 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
9326 goto end;
83cd4fe2
VP
9327
9328 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
8a6d42d1 9329 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
83cd4fe2
VP
9330 continue;
9331
9332 /*
9333 * If this cpu gets work to do, stop the load balancing
9334 * work being done for other cpus. Next load
9335 * balancing owner will pick it up.
9336 */
1c792db7 9337 if (need_resched())
83cd4fe2 9338 break;
83cd4fe2 9339
5ed4f1d9
VG
9340 rq = cpu_rq(balance_cpu);
9341
ed61bbc6
TC
9342 /*
9343 * If time for next balance is due,
9344 * do the balance.
9345 */
9346 if (time_after_eq(jiffies, rq->next_balance)) {
8a8c69c3
PZ
9347 struct rq_flags rf;
9348
9349 rq_lock_irq(rq, &rf);
ed61bbc6 9350 update_rq_clock(rq);
cee1afce 9351 cpu_load_update_idle(rq);
8a8c69c3
PZ
9352 rq_unlock_irq(rq, &rf);
9353
ed61bbc6
TC
9354 rebalance_domains(rq, CPU_IDLE);
9355 }
83cd4fe2 9356
c5afb6a8
VG
9357 if (time_after(next_balance, rq->next_balance)) {
9358 next_balance = rq->next_balance;
9359 update_next_balance = 1;
9360 }
83cd4fe2 9361 }
c5afb6a8
VG
9362
9363 /*
9364 * next_balance will be updated only when there is a need.
9365 * When the CPU is attached to null domain for ex, it will not be
9366 * updated.
9367 */
9368 if (likely(update_next_balance))
9369 nohz.next_balance = next_balance;
1c792db7
SS
9370end:
9371 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
83cd4fe2
VP
9372}
9373
9374/*
0b005cf5 9375 * Current heuristic for kicking the idle load balancer in the presence
1aaf90a4 9376 * of an idle cpu in the system.
0b005cf5 9377 * - This rq has more than one task.
1aaf90a4
VG
9378 * - This rq has at least one CFS task and the capacity of the CPU is
9379 * significantly reduced because of RT tasks or IRQs.
9380 * - At parent of LLC scheduler domain level, this cpu's scheduler group has
9381 * multiple busy cpu.
0b005cf5
SS
9382 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
9383 * domain span are idle.
83cd4fe2 9384 */
1aaf90a4 9385static inline bool nohz_kick_needed(struct rq *rq)
83cd4fe2
VP
9386{
9387 unsigned long now = jiffies;
0e369d75 9388 struct sched_domain_shared *sds;
0b005cf5 9389 struct sched_domain *sd;
afe06efd 9390 int nr_busy, i, cpu = rq->cpu;
1aaf90a4 9391 bool kick = false;
83cd4fe2 9392
4a725627 9393 if (unlikely(rq->idle_balance))
1aaf90a4 9394 return false;
83cd4fe2 9395
1c792db7
SS
9396 /*
9397 * We may be recently in ticked or tickless idle mode. At the first
9398 * busy tick after returning from idle, we will update the busy stats.
9399 */
69e1e811 9400 set_cpu_sd_state_busy();
c1cc017c 9401 nohz_balance_exit_idle(cpu);
0b005cf5
SS
9402
9403 /*
9404 * None are in tickless mode and hence no need for NOHZ idle load
9405 * balancing.
9406 */
9407 if (likely(!atomic_read(&nohz.nr_cpus)))
1aaf90a4 9408 return false;
1c792db7
SS
9409
9410 if (time_before(now, nohz.next_balance))
1aaf90a4 9411 return false;
83cd4fe2 9412
0b005cf5 9413 if (rq->nr_running >= 2)
1aaf90a4 9414 return true;
83cd4fe2 9415
067491b7 9416 rcu_read_lock();
0e369d75
PZ
9417 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
9418 if (sds) {
9419 /*
9420 * XXX: write a coherent comment on why we do this.
9421 * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
9422 */
9423 nr_busy = atomic_read(&sds->nr_busy_cpus);
1aaf90a4
VG
9424 if (nr_busy > 1) {
9425 kick = true;
9426 goto unlock;
9427 }
9428
83cd4fe2 9429 }
37dc6b50 9430
1aaf90a4
VG
9431 sd = rcu_dereference(rq->sd);
9432 if (sd) {
9433 if ((rq->cfs.h_nr_running >= 1) &&
9434 check_cpu_capacity(rq, sd)) {
9435 kick = true;
9436 goto unlock;
9437 }
9438 }
37dc6b50 9439
1aaf90a4 9440 sd = rcu_dereference(per_cpu(sd_asym, cpu));
afe06efd
TC
9441 if (sd) {
9442 for_each_cpu(i, sched_domain_span(sd)) {
9443 if (i == cpu ||
9444 !cpumask_test_cpu(i, nohz.idle_cpus_mask))
9445 continue;
067491b7 9446
afe06efd
TC
9447 if (sched_asym_prefer(i, cpu)) {
9448 kick = true;
9449 goto unlock;
9450 }
9451 }
9452 }
1aaf90a4 9453unlock:
067491b7 9454 rcu_read_unlock();
1aaf90a4 9455 return kick;
83cd4fe2
VP
9456}
9457#else
208cb16b 9458static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
83cd4fe2
VP
9459#endif
9460
9461/*
9462 * run_rebalance_domains is triggered when needed from the scheduler tick.
9463 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
9464 */
0766f788 9465static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
1e3c88bd 9466{
208cb16b 9467 struct rq *this_rq = this_rq();
6eb57e0d 9468 enum cpu_idle_type idle = this_rq->idle_balance ?
1e3c88bd
PZ
9469 CPU_IDLE : CPU_NOT_IDLE;
9470
1e3c88bd 9471 /*
83cd4fe2 9472 * If this cpu has a pending nohz_balance_kick, then do the
1e3c88bd 9473 * balancing on behalf of the other idle cpus whose ticks are
d4573c3e
PM
9474 * stopped. Do nohz_idle_balance *before* rebalance_domains to
9475 * give the idle cpus a chance to load balance. Else we may
9476 * load balance only within the local sched_domain hierarchy
9477 * and abort nohz_idle_balance altogether if we pull some load.
1e3c88bd 9478 */
208cb16b 9479 nohz_idle_balance(this_rq, idle);
d4573c3e 9480 rebalance_domains(this_rq, idle);
1e3c88bd
PZ
9481}
9482
1e3c88bd
PZ
9483/*
9484 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
1e3c88bd 9485 */
7caff66f 9486void trigger_load_balance(struct rq *rq)
1e3c88bd 9487{
1e3c88bd 9488 /* Don't need to rebalance while attached to NULL domain */
c726099e
DL
9489 if (unlikely(on_null_domain(rq)))
9490 return;
9491
9492 if (time_after_eq(jiffies, rq->next_balance))
1e3c88bd 9493 raise_softirq(SCHED_SOFTIRQ);
3451d024 9494#ifdef CONFIG_NO_HZ_COMMON
c726099e 9495 if (nohz_kick_needed(rq))
0aeeeeba 9496 nohz_balancer_kick();
83cd4fe2 9497#endif
1e3c88bd
PZ
9498}
9499
0bcdcf28
CE
9500static void rq_online_fair(struct rq *rq)
9501{
9502 update_sysctl();
0e59bdae
KT
9503
9504 update_runtime_enabled(rq);
0bcdcf28
CE
9505}
9506
9507static void rq_offline_fair(struct rq *rq)
9508{
9509 update_sysctl();
a4c96ae3
PB
9510
9511 /* Ensure any throttled groups are reachable by pick_next_task */
9512 unthrottle_offline_cfs_rqs(rq);
0bcdcf28
CE
9513}
9514
55e12e5e 9515#endif /* CONFIG_SMP */
e1d1484f 9516
bf0f6f24 9517/*
d84b3131
FW
9518 * scheduler tick hitting a task of our scheduling class.
9519 *
9520 * NOTE: This function can be called remotely by the tick offload that
9521 * goes along full dynticks. Therefore no local assumption can be made
9522 * and everything must be accessed through the @rq and @curr passed in
9523 * parameters.
bf0f6f24 9524 */
8f4d37ec 9525static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
bf0f6f24
IM
9526{
9527 struct cfs_rq *cfs_rq;
9528 struct sched_entity *se = &curr->se;
9529
9530 for_each_sched_entity(se) {
9531 cfs_rq = cfs_rq_of(se);
8f4d37ec 9532 entity_tick(cfs_rq, se, queued);
bf0f6f24 9533 }
18bf2805 9534
b52da86e 9535 if (static_branch_unlikely(&sched_numa_balancing))
cbee9f88 9536 task_tick_numa(rq, curr);
bf0f6f24
IM
9537}
9538
9539/*
cd29fe6f
PZ
9540 * called on fork with the child task as argument from the parent's context
9541 * - child not yet on the tasklist
9542 * - preemption disabled
bf0f6f24 9543 */
cd29fe6f 9544static void task_fork_fair(struct task_struct *p)
bf0f6f24 9545{
4fc420c9
DN
9546 struct cfs_rq *cfs_rq;
9547 struct sched_entity *se = &p->se, *curr;
cd29fe6f 9548 struct rq *rq = this_rq();
8a8c69c3 9549 struct rq_flags rf;
bf0f6f24 9550
8a8c69c3 9551 rq_lock(rq, &rf);
861d034e
PZ
9552 update_rq_clock(rq);
9553
4fc420c9
DN
9554 cfs_rq = task_cfs_rq(current);
9555 curr = cfs_rq->curr;
e210bffd
PZ
9556 if (curr) {
9557 update_curr(cfs_rq);
b5d9d734 9558 se->vruntime = curr->vruntime;
e210bffd 9559 }
aeb73b04 9560 place_entity(cfs_rq, se, 1);
4d78e7b6 9561
cd29fe6f 9562 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
87fefa38 9563 /*
edcb60a3
IM
9564 * Upon rescheduling, sched_class::put_prev_task() will place
9565 * 'current' within the tree based on its new key value.
9566 */
4d78e7b6 9567 swap(curr->vruntime, se->vruntime);
8875125e 9568 resched_curr(rq);
4d78e7b6 9569 }
bf0f6f24 9570
88ec22d3 9571 se->vruntime -= cfs_rq->min_vruntime;
8a8c69c3 9572 rq_unlock(rq, &rf);
bf0f6f24
IM
9573}
9574
cb469845
SR
9575/*
9576 * Priority of the task has changed. Check to see if we preempt
9577 * the current task.
9578 */
da7a735e
PZ
9579static void
9580prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
cb469845 9581{
da0c1e65 9582 if (!task_on_rq_queued(p))
da7a735e
PZ
9583 return;
9584
cb469845
SR
9585 /*
9586 * Reschedule if we are currently running on this runqueue and
9587 * our priority decreased, or if we are not currently running on
9588 * this runqueue and our priority is higher than the current's
9589 */
da7a735e 9590 if (rq->curr == p) {
cb469845 9591 if (p->prio > oldprio)
8875125e 9592 resched_curr(rq);
cb469845 9593 } else
15afe09b 9594 check_preempt_curr(rq, p, 0);
cb469845
SR
9595}
9596
daa59407 9597static inline bool vruntime_normalized(struct task_struct *p)
da7a735e
PZ
9598{
9599 struct sched_entity *se = &p->se;
da7a735e
PZ
9600
9601 /*
daa59407
BP
9602 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
9603 * the dequeue_entity(.flags=0) will already have normalized the
9604 * vruntime.
9605 */
9606 if (p->on_rq)
9607 return true;
9608
9609 /*
9610 * When !on_rq, vruntime of the task has usually NOT been normalized.
9611 * But there are some cases where it has already been normalized:
da7a735e 9612 *
daa59407
BP
9613 * - A forked child which is waiting for being woken up by
9614 * wake_up_new_task().
9615 * - A task which has been woken up by try_to_wake_up() and
9616 * waiting for actually being woken up by sched_ttwu_pending().
da7a735e 9617 */
daa59407
BP
9618 if (!se->sum_exec_runtime || p->state == TASK_WAKING)
9619 return true;
9620
9621 return false;
9622}
9623
09a43ace
VG
9624#ifdef CONFIG_FAIR_GROUP_SCHED
9625/*
9626 * Propagate the changes of the sched_entity across the tg tree to make it
9627 * visible to the root
9628 */
9629static void propagate_entity_cfs_rq(struct sched_entity *se)
9630{
9631 struct cfs_rq *cfs_rq;
9632
9633 /* Start to propagate at parent */
9634 se = se->parent;
9635
9636 for_each_sched_entity(se) {
9637 cfs_rq = cfs_rq_of(se);
9638
9639 if (cfs_rq_throttled(cfs_rq))
9640 break;
9641
88c0616e 9642 update_load_avg(cfs_rq, se, UPDATE_TG);
09a43ace
VG
9643 }
9644}
9645#else
9646static void propagate_entity_cfs_rq(struct sched_entity *se) { }
9647#endif
9648
df217913 9649static void detach_entity_cfs_rq(struct sched_entity *se)
daa59407 9650{
daa59407
BP
9651 struct cfs_rq *cfs_rq = cfs_rq_of(se);
9652
9d89c257 9653 /* Catch up with the cfs_rq and remove our load when we leave */
88c0616e 9654 update_load_avg(cfs_rq, se, 0);
a05e8c51 9655 detach_entity_load_avg(cfs_rq, se);
7c3edd2c 9656 update_tg_load_avg(cfs_rq, false);
09a43ace 9657 propagate_entity_cfs_rq(se);
da7a735e
PZ
9658}
9659
df217913 9660static void attach_entity_cfs_rq(struct sched_entity *se)
cb469845 9661{
daa59407 9662 struct cfs_rq *cfs_rq = cfs_rq_of(se);
7855a35a
BP
9663
9664#ifdef CONFIG_FAIR_GROUP_SCHED
eb7a59b2
M
9665 /*
9666 * Since the real-depth could have been changed (only FAIR
9667 * class maintain depth value), reset depth properly.
9668 */
9669 se->depth = se->parent ? se->parent->depth + 1 : 0;
9670#endif
7855a35a 9671
df217913 9672 /* Synchronize entity with its cfs_rq */
88c0616e 9673 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
daa59407 9674 attach_entity_load_avg(cfs_rq, se);
7c3edd2c 9675 update_tg_load_avg(cfs_rq, false);
09a43ace 9676 propagate_entity_cfs_rq(se);
df217913
VG
9677}
9678
9679static void detach_task_cfs_rq(struct task_struct *p)
9680{
9681 struct sched_entity *se = &p->se;
9682 struct cfs_rq *cfs_rq = cfs_rq_of(se);
9683
9684 if (!vruntime_normalized(p)) {
9685 /*
9686 * Fix up our vruntime so that the current sleep doesn't
9687 * cause 'unlimited' sleep bonus.
9688 */
9689 place_entity(cfs_rq, se, 0);
9690 se->vruntime -= cfs_rq->min_vruntime;
9691 }
9692
9693 detach_entity_cfs_rq(se);
9694}
9695
9696static void attach_task_cfs_rq(struct task_struct *p)
9697{
9698 struct sched_entity *se = &p->se;
9699 struct cfs_rq *cfs_rq = cfs_rq_of(se);
9700
9701 attach_entity_cfs_rq(se);
daa59407
BP
9702
9703 if (!vruntime_normalized(p))
9704 se->vruntime += cfs_rq->min_vruntime;
9705}
6efdb105 9706
daa59407
BP
9707static void switched_from_fair(struct rq *rq, struct task_struct *p)
9708{
9709 detach_task_cfs_rq(p);
9710}
9711
9712static void switched_to_fair(struct rq *rq, struct task_struct *p)
9713{
9714 attach_task_cfs_rq(p);
7855a35a 9715
daa59407 9716 if (task_on_rq_queued(p)) {
7855a35a 9717 /*
daa59407
BP
9718 * We were most likely switched from sched_rt, so
9719 * kick off the schedule if running, otherwise just see
9720 * if we can still preempt the current task.
7855a35a 9721 */
daa59407
BP
9722 if (rq->curr == p)
9723 resched_curr(rq);
9724 else
9725 check_preempt_curr(rq, p, 0);
7855a35a 9726 }
cb469845
SR
9727}
9728
83b699ed
SV
9729/* Account for a task changing its policy or group.
9730 *
9731 * This routine is mostly called to set cfs_rq->curr field when a task
9732 * migrates between groups/classes.
9733 */
9734static void set_curr_task_fair(struct rq *rq)
9735{
9736 struct sched_entity *se = &rq->curr->se;
9737
ec12cb7f
PT
9738 for_each_sched_entity(se) {
9739 struct cfs_rq *cfs_rq = cfs_rq_of(se);
9740
9741 set_next_entity(cfs_rq, se);
9742 /* ensure bandwidth has been allocated on our new cfs_rq */
9743 account_cfs_rq_runtime(cfs_rq, 0);
9744 }
83b699ed
SV
9745}
9746
029632fb
PZ
9747void init_cfs_rq(struct cfs_rq *cfs_rq)
9748{
bfb06889 9749 cfs_rq->tasks_timeline = RB_ROOT_CACHED;
029632fb
PZ
9750 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
9751#ifndef CONFIG_64BIT
9752 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
9753#endif
141965c7 9754#ifdef CONFIG_SMP
2a2f5d4e 9755 raw_spin_lock_init(&cfs_rq->removed.lock);
9ee474f5 9756#endif
029632fb
PZ
9757}
9758
810b3817 9759#ifdef CONFIG_FAIR_GROUP_SCHED
ea86cb4b
VG
9760static void task_set_group_fair(struct task_struct *p)
9761{
9762 struct sched_entity *se = &p->se;
9763
9764 set_task_rq(p, task_cpu(p));
9765 se->depth = se->parent ? se->parent->depth + 1 : 0;
9766}
9767
bc54da21 9768static void task_move_group_fair(struct task_struct *p)
810b3817 9769{
daa59407 9770 detach_task_cfs_rq(p);
b2b5ce02 9771 set_task_rq(p, task_cpu(p));
6efdb105
BP
9772
9773#ifdef CONFIG_SMP
9774 /* Tell se's cfs_rq has been changed -- migrated */
9775 p->se.avg.last_update_time = 0;
9776#endif
daa59407 9777 attach_task_cfs_rq(p);
810b3817 9778}
029632fb 9779
ea86cb4b
VG
9780static void task_change_group_fair(struct task_struct *p, int type)
9781{
9782 switch (type) {
9783 case TASK_SET_GROUP:
9784 task_set_group_fair(p);
9785 break;
9786
9787 case TASK_MOVE_GROUP:
9788 task_move_group_fair(p);
9789 break;
9790 }
9791}
9792
029632fb
PZ
9793void free_fair_sched_group(struct task_group *tg)
9794{
9795 int i;
9796
9797 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
9798
9799 for_each_possible_cpu(i) {
9800 if (tg->cfs_rq)
9801 kfree(tg->cfs_rq[i]);
6fe1f348 9802 if (tg->se)
029632fb
PZ
9803 kfree(tg->se[i]);
9804 }
9805
9806 kfree(tg->cfs_rq);
9807 kfree(tg->se);
9808}
9809
9810int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9811{
029632fb 9812 struct sched_entity *se;
b7fa30c9 9813 struct cfs_rq *cfs_rq;
029632fb
PZ
9814 int i;
9815
9816 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
9817 if (!tg->cfs_rq)
9818 goto err;
9819 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
9820 if (!tg->se)
9821 goto err;
9822
9823 tg->shares = NICE_0_LOAD;
9824
9825 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
9826
9827 for_each_possible_cpu(i) {
9828 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
9829 GFP_KERNEL, cpu_to_node(i));
9830 if (!cfs_rq)
9831 goto err;
9832
9833 se = kzalloc_node(sizeof(struct sched_entity),
9834 GFP_KERNEL, cpu_to_node(i));
9835 if (!se)
9836 goto err_free_rq;
9837
9838 init_cfs_rq(cfs_rq);
9839 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
540247fb 9840 init_entity_runnable_average(se);
029632fb
PZ
9841 }
9842
9843 return 1;
9844
9845err_free_rq:
9846 kfree(cfs_rq);
9847err:
9848 return 0;
9849}
9850
8663e24d
PZ
9851void online_fair_sched_group(struct task_group *tg)
9852{
9853 struct sched_entity *se;
9854 struct rq *rq;
9855 int i;
9856
9857 for_each_possible_cpu(i) {
9858 rq = cpu_rq(i);
9859 se = tg->se[i];
9860
9861 raw_spin_lock_irq(&rq->lock);
4126bad6 9862 update_rq_clock(rq);
d0326691 9863 attach_entity_cfs_rq(se);
55e16d30 9864 sync_throttle(tg, i);
8663e24d
PZ
9865 raw_spin_unlock_irq(&rq->lock);
9866 }
9867}
9868
6fe1f348 9869void unregister_fair_sched_group(struct task_group *tg)
029632fb 9870{
029632fb 9871 unsigned long flags;
6fe1f348
PZ
9872 struct rq *rq;
9873 int cpu;
029632fb 9874
6fe1f348
PZ
9875 for_each_possible_cpu(cpu) {
9876 if (tg->se[cpu])
9877 remove_entity_load_avg(tg->se[cpu]);
029632fb 9878
6fe1f348
PZ
9879 /*
9880 * Only empty task groups can be destroyed; so we can speculatively
9881 * check on_list without danger of it being re-added.
9882 */
9883 if (!tg->cfs_rq[cpu]->on_list)
9884 continue;
9885
9886 rq = cpu_rq(cpu);
9887
9888 raw_spin_lock_irqsave(&rq->lock, flags);
9889 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
9890 raw_spin_unlock_irqrestore(&rq->lock, flags);
9891 }
029632fb
PZ
9892}
9893
9894void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
9895 struct sched_entity *se, int cpu,
9896 struct sched_entity *parent)
9897{
9898 struct rq *rq = cpu_rq(cpu);
9899
9900 cfs_rq->tg = tg;
9901 cfs_rq->rq = rq;
029632fb
PZ
9902 init_cfs_rq_runtime(cfs_rq);
9903
9904 tg->cfs_rq[cpu] = cfs_rq;
9905 tg->se[cpu] = se;
9906
9907 /* se could be NULL for root_task_group */
9908 if (!se)
9909 return;
9910
fed14d45 9911 if (!parent) {
029632fb 9912 se->cfs_rq = &rq->cfs;
fed14d45
PZ
9913 se->depth = 0;
9914 } else {
029632fb 9915 se->cfs_rq = parent->my_q;
fed14d45
PZ
9916 se->depth = parent->depth + 1;
9917 }
029632fb
PZ
9918
9919 se->my_q = cfs_rq;
0ac9b1c2
PT
9920 /* guarantee group entities always have weight */
9921 update_load_set(&se->load, NICE_0_LOAD);
029632fb
PZ
9922 se->parent = parent;
9923}
9924
9925static DEFINE_MUTEX(shares_mutex);
9926
9927int sched_group_set_shares(struct task_group *tg, unsigned long shares)
9928{
9929 int i;
029632fb
PZ
9930
9931 /*
9932 * We can't change the weight of the root cgroup.
9933 */
9934 if (!tg->se[0])
9935 return -EINVAL;
9936
9937 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
9938
9939 mutex_lock(&shares_mutex);
9940 if (tg->shares == shares)
9941 goto done;
9942
9943 tg->shares = shares;
9944 for_each_possible_cpu(i) {
9945 struct rq *rq = cpu_rq(i);
8a8c69c3
PZ
9946 struct sched_entity *se = tg->se[i];
9947 struct rq_flags rf;
029632fb 9948
029632fb 9949 /* Propagate contribution to hierarchy */
8a8c69c3 9950 rq_lock_irqsave(rq, &rf);
71b1da46 9951 update_rq_clock(rq);
89ee048f 9952 for_each_sched_entity(se) {
88c0616e 9953 update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
1ea6c46a 9954 update_cfs_group(se);
89ee048f 9955 }
8a8c69c3 9956 rq_unlock_irqrestore(rq, &rf);
029632fb
PZ
9957 }
9958
9959done:
9960 mutex_unlock(&shares_mutex);
9961 return 0;
9962}
9963#else /* CONFIG_FAIR_GROUP_SCHED */
9964
9965void free_fair_sched_group(struct task_group *tg) { }
9966
9967int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9968{
9969 return 1;
9970}
9971
8663e24d
PZ
9972void online_fair_sched_group(struct task_group *tg) { }
9973
6fe1f348 9974void unregister_fair_sched_group(struct task_group *tg) { }
029632fb
PZ
9975
9976#endif /* CONFIG_FAIR_GROUP_SCHED */
9977
810b3817 9978
6d686f45 9979static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
0d721cea
PW
9980{
9981 struct sched_entity *se = &task->se;
0d721cea
PW
9982 unsigned int rr_interval = 0;
9983
9984 /*
9985 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
9986 * idle runqueue:
9987 */
0d721cea 9988 if (rq->cfs.load.weight)
a59f4e07 9989 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
0d721cea
PW
9990
9991 return rr_interval;
9992}
9993
bf0f6f24
IM
9994/*
9995 * All the scheduling class methods:
9996 */
029632fb 9997const struct sched_class fair_sched_class = {
5522d5d5 9998 .next = &idle_sched_class,
bf0f6f24
IM
9999 .enqueue_task = enqueue_task_fair,
10000 .dequeue_task = dequeue_task_fair,
10001 .yield_task = yield_task_fair,
d95f4122 10002 .yield_to_task = yield_to_task_fair,
bf0f6f24 10003
2e09bf55 10004 .check_preempt_curr = check_preempt_wakeup,
bf0f6f24
IM
10005
10006 .pick_next_task = pick_next_task_fair,
10007 .put_prev_task = put_prev_task_fair,
10008
681f3e68 10009#ifdef CONFIG_SMP
4ce72a2c 10010 .select_task_rq = select_task_rq_fair,
0a74bef8 10011 .migrate_task_rq = migrate_task_rq_fair,
141965c7 10012
0bcdcf28
CE
10013 .rq_online = rq_online_fair,
10014 .rq_offline = rq_offline_fair,
88ec22d3 10015
12695578 10016 .task_dead = task_dead_fair,
c5b28038 10017 .set_cpus_allowed = set_cpus_allowed_common,
681f3e68 10018#endif
bf0f6f24 10019
83b699ed 10020 .set_curr_task = set_curr_task_fair,
bf0f6f24 10021 .task_tick = task_tick_fair,
cd29fe6f 10022 .task_fork = task_fork_fair,
cb469845
SR
10023
10024 .prio_changed = prio_changed_fair,
da7a735e 10025 .switched_from = switched_from_fair,
cb469845 10026 .switched_to = switched_to_fair,
810b3817 10027
0d721cea
PW
10028 .get_rr_interval = get_rr_interval_fair,
10029
6e998916
SG
10030 .update_curr = update_curr_fair,
10031
810b3817 10032#ifdef CONFIG_FAIR_GROUP_SCHED
ea86cb4b 10033 .task_change_group = task_change_group_fair,
810b3817 10034#endif
bf0f6f24
IM
10035};
10036
10037#ifdef CONFIG_SCHED_DEBUG
029632fb 10038void print_cfs_stats(struct seq_file *m, int cpu)
bf0f6f24 10039{
a9e7f654 10040 struct cfs_rq *cfs_rq, *pos;
bf0f6f24 10041
5973e5b9 10042 rcu_read_lock();
a9e7f654 10043 for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
5cef9eca 10044 print_cfs_rq(m, cpu, cfs_rq);
5973e5b9 10045 rcu_read_unlock();
bf0f6f24 10046}
397f2378
SD
10047
10048#ifdef CONFIG_NUMA_BALANCING
10049void show_numa_stats(struct task_struct *p, struct seq_file *m)
10050{
10051 int node;
10052 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
10053
10054 for_each_online_node(node) {
10055 if (p->numa_faults) {
10056 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
10057 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
10058 }
10059 if (p->numa_group) {
10060 gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
10061 gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
10062 }
10063 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
10064 }
10065}
10066#endif /* CONFIG_NUMA_BALANCING */
10067#endif /* CONFIG_SCHED_DEBUG */
029632fb
PZ
10068
10069__init void init_sched_fair_class(void)
10070{
10071#ifdef CONFIG_SMP
10072 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
10073
3451d024 10074#ifdef CONFIG_NO_HZ_COMMON
554cecaf 10075 nohz.next_balance = jiffies;
029632fb 10076 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
029632fb
PZ
10077#endif
10078#endif /* SMP */
10079
10080}