sched: sched.h: make rq locking and clock functions available in stats.h
kernel/sched/sched.h (linux-block.git)
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Scheduler internal types and methods:
 */
#include <linux/sched.h>

#include <linux/sched/autogroup.h>
#include <linux/sched/clock.h>
#include <linux/sched/coredump.h>
#include <linux/sched/cpufreq.h>
#include <linux/sched/cputime.h>
#include <linux/sched/deadline.h>
#include <linux/sched/debug.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/idle.h>
#include <linux/sched/init.h>
#include <linux/sched/isolation.h>
#include <linux/sched/jobctl.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/mm.h>
#include <linux/sched/nohz.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/prio.h>
#include <linux/sched/rt.h>
#include <linux/sched/signal.h>
#include <linux/sched/stat.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/topology.h>
#include <linux/sched/user.h>
#include <linux/sched/wake_q.h>
#include <linux/sched/xacct.h>

#include <uapi/linux/sched/types.h>

#include <linux/binfmts.h>
#include <linux/blkdev.h>
#include <linux/compat.h>
#include <linux/context_tracking.h>
#include <linux/cpufreq.h>
#include <linux/cpuidle.h>
#include <linux/cpuset.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/delayacct.h>
#include <linux/init_task.h>
#include <linux/kprobes.h>
#include <linux/kthread.h>
#include <linux/membarrier.h>
#include <linux/migrate.h>
#include <linux/mmu_context.h>
#include <linux/nmi.h>
#include <linux/proc_fs.h>
#include <linux/prefetch.h>
#include <linux/profile.h>
#include <linux/rcupdate_wait.h>
#include <linux/security.h>
#include <linux/stop_machine.h>
#include <linux/suspend.h>
#include <linux/swait.h>
#include <linux/syscalls.h>
#include <linux/task_work.h>
#include <linux/tsacct_kern.h>

#include <asm/tlb.h>

#ifdef CONFIG_PARAVIRT
# include <asm/paravirt.h>
#endif

#include "cpupri.h"
#include "cpudeadline.h"

9148a3a1 75#ifdef CONFIG_SCHED_DEBUG
6d3aed3d 76# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
9148a3a1 77#else
6d3aed3d 78# define SCHED_WARN_ON(x) ({ (void)(x), 0; })
9148a3a1
PZ
79#endif
80
45ceebf7 81struct rq;
442bf3aa 82struct cpuidle_state;
45ceebf7 83
da0c1e65
KT
84/* task_struct::on_rq states: */
85#define TASK_ON_RQ_QUEUED 1
cca26e80 86#define TASK_ON_RQ_MIGRATING 2
da0c1e65 87
029632fb
PZ
88extern __read_mostly int scheduler_running;
89
45ceebf7
PG
90extern unsigned long calc_load_update;
91extern atomic_long_t calc_load_tasks;
92
3289bdb4 93extern void calc_global_load_tick(struct rq *this_rq);
d60585c5 94extern long calc_load_fold_active(struct rq *this_rq, long adjust);
3289bdb4
PZ
95
96#ifdef CONFIG_SMP
cee1afce 97extern void cpu_load_update_active(struct rq *this_rq);
3289bdb4 98#else
cee1afce 99static inline void cpu_load_update_active(struct rq *this_rq) { }
3289bdb4 100#endif
45ceebf7 101
029632fb
PZ
102/*
103 * Helpers for converting nanosecond timing to jiffy resolution
104 */
105#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
106
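/*
 * Worked example (illustrative, not used by the code below): with HZ == 250,
 * NSEC_PER_SEC / HZ == 4,000,000, so NS_TO_JIFFIES(10,000,000) == 2, i.e.
 * 10ms of nanosecond-resolution time rounds down to 2 jiffies.
 */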
/*
 * Increase resolution of nice-level calculations for 64-bit architectures.
 * The extra resolution improves shares distribution and load balancing of
 * low-weight task groups (e.g. nice +19 on an autogroup), deeper taskgroup
 * hierarchies, especially on larger systems. This is not a user-visible change
 * and does not change the user-interface for setting shares/weights.
 *
 * We increase resolution only if we have enough bits to allow this increased
 * resolution (i.e. 64-bit). The costs of increasing resolution on 32-bit
 * are pretty high and the returns do not justify the increased costs.
 *
 * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to
 * increase coverage and consistency always enable it on 64-bit platforms.
 */
2159197d 121#ifdef CONFIG_64BIT
172895e6 122# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
6ecdd749
YD
123# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT)
124# define scale_load_down(w) ((w) >> SCHED_FIXEDPOINT_SHIFT)
cc1f4b1f 125#else
172895e6 126# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT)
cc1f4b1f
LZ
127# define scale_load(w) (w)
128# define scale_load_down(w) (w)
129#endif
130
/*
 * Task weight (visible to users) and its load (invisible to users) have
 * independent resolution, but they should be well calibrated. We use
 * scale_load() and scale_load_down() to convert between them. The
 * following must be true:
 *
 *	scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD
 *
 */
#define NICE_0_LOAD		(1L << NICE_0_LOAD_SHIFT)
029632fb 141
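/*
 * Illustrative sketch of the invariant above, assuming the usual
 * SCHED_FIXEDPOINT_SHIFT of 10 (numbers are examples, not definitions):
 *
 *	nice 0 weight = sched_prio_to_weight[20] = 1024
 *	64-bit: scale_load(1024) = 1024 << 10 = 1 << 20 = NICE_0_LOAD
 *	32-bit: scale_load(1024) = 1024       = 1 << 10 = NICE_0_LOAD
 */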
332ac17e
DF
142/*
143 * Single value that decides SCHED_DEADLINE internal math precision.
144 * 10 -> just above 1us
145 * 9 -> just above 0.5us
146 */
97fb7a0a 147#define DL_SCALE 10
029632fb
PZ
148
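/*
 * Worked example (illustrative only): with DL_SCALE == 10 the smallest
 * quantity handled by the deadline math is 2^10 ns == 1024 ns, which is the
 * "just above 1us" granularity mentioned above; DL_SCALE == 9 would give
 * 512 ns instead.
 */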
149/*
97fb7a0a 150 * Single value that denotes runtime == period, ie unlimited time.
029632fb 151 */
97fb7a0a 152#define RUNTIME_INF ((u64)~0ULL)
029632fb 153
20f9cd2a
HA
154static inline int idle_policy(int policy)
155{
156 return policy == SCHED_IDLE;
157}
d50dde5a
DF
158static inline int fair_policy(int policy)
159{
160 return policy == SCHED_NORMAL || policy == SCHED_BATCH;
161}
162
029632fb
PZ
163static inline int rt_policy(int policy)
164{
d50dde5a 165 return policy == SCHED_FIFO || policy == SCHED_RR;
029632fb
PZ
166}
167
aab03e05
DF
168static inline int dl_policy(int policy)
169{
170 return policy == SCHED_DEADLINE;
171}
20f9cd2a
HA
172static inline bool valid_policy(int policy)
173{
174 return idle_policy(policy) || fair_policy(policy) ||
175 rt_policy(policy) || dl_policy(policy);
176}
aab03e05 177
029632fb
PZ
178static inline int task_has_rt_policy(struct task_struct *p)
179{
180 return rt_policy(p->policy);
181}
182
aab03e05
DF
183static inline int task_has_dl_policy(struct task_struct *p)
184{
185 return dl_policy(p->policy);
186}
187
07881166
JL
188#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
189
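/*
 * Example (illustrative): with SCHED_CAPACITY_SHIFT == 10, a CPU running at
 * half capacity has s == 512, so cap_scale(1000, 512) == 1000*512 >> 10 == 500;
 * i.e. the value v is scaled down in proportion to the capacity s.
 */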
/*
 * !! For sched_setattr_nocheck() (kernel) only !!
 *
 * This is actually gross. :(
 *
 * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE
 * tasks, but still be able to sleep. We need this on platforms that cannot
 * atomically change clock frequency. Remove once fast switching is available
 * on such platforms.
 *
 * SUGOV stands for SchedUtil GOVernor.
 */
202#define SCHED_FLAG_SUGOV 0x10000000
203
204static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
205{
206#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
207 return unlikely(dl_se->flags & SCHED_FLAG_SUGOV);
208#else
209 return false;
210#endif
211}
212
2d3d891d
DF
213/*
214 * Tells if entity @a should preempt entity @b.
215 */
332ac17e
DF
216static inline bool
217dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
2d3d891d 218{
794a56eb
JL
219 return dl_entity_is_special(a) ||
220 dl_time_before(a->deadline, b->deadline);
2d3d891d
DF
221}
222
029632fb
PZ
223/*
224 * This is the priority-queue data structure of the RT scheduling class:
225 */
226struct rt_prio_array {
227 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
228 struct list_head queue[MAX_RT_PRIO];
229};
230
231struct rt_bandwidth {
232 /* nests inside the rq lock: */
233 raw_spinlock_t rt_runtime_lock;
234 ktime_t rt_period;
235 u64 rt_runtime;
236 struct hrtimer rt_period_timer;
4cfafd30 237 unsigned int rt_period_active;
029632fb 238};
a5e7be3b
JL
239
240void __dl_clear_params(struct task_struct *p);
241
/*
 * To keep the bandwidth of -deadline tasks and groups under control
 * we need some place where:
 *  - store the maximum -deadline bandwidth of the system (the group);
 *  - cache the fraction of that bandwidth that is currently allocated.
 *
 * This is all done in the data structure below. It is similar to the
 * one used for RT-throttling (rt_bandwidth), with the main difference
 * that, since here we are only interested in admission control, we
 * do not decrease any runtime while the group "executes", nor do we
 * need a timer to replenish it.
 *
 * With respect to SMP, the bandwidth is given on a per-CPU basis,
 * meaning that:
 *  - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
 *  - dl_total_bw array contains, in the i-th element, the currently
 *    allocated bandwidth on the i-th CPU.
 * Moreover, groups consume bandwidth on each CPU, while tasks only
 * consume bandwidth on the CPU they're running on.
 * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
 * that will be shown the next time the proc or cgroup controls are
 * read. It in turn can be changed by writing to its own control.
 */
266struct dl_bandwidth {
97fb7a0a
IM
267 raw_spinlock_t dl_runtime_lock;
268 u64 dl_runtime;
269 u64 dl_period;
332ac17e
DF
270};
271
272static inline int dl_bandwidth_enabled(void)
273{
1724813d 274 return sysctl_sched_rt_runtime >= 0;
332ac17e
DF
275}
276
332ac17e 277struct dl_bw {
97fb7a0a
IM
278 raw_spinlock_t lock;
279 u64 bw;
280 u64 total_bw;
332ac17e
DF
281};
282
daec5798
LA
283static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
284
7f51412a 285static inline
8c0944ce 286void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
7f51412a
JL
287{
288 dl_b->total_bw -= tsk_bw;
daec5798 289 __dl_update(dl_b, (s32)tsk_bw / cpus);
7f51412a
JL
290}
291
292static inline
daec5798 293void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
7f51412a
JL
294{
295 dl_b->total_bw += tsk_bw;
daec5798 296 __dl_update(dl_b, -((s32)tsk_bw / cpus));
7f51412a
JL
297}
298
299static inline
300bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
301{
302 return dl_b->bw != -1 &&
303 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
304}
305
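/*
 * Worked example for the admission test above (illustrative numbers only):
 * assume a 4-CPU root domain with dl_b->bw == 0.95 scaled by BW_UNIT (the
 * usual 95% limit), so at most 4 * 0.95 == 3.80 "CPUs" worth of bandwidth
 * may be allocated.  Replacing a task of old_bw == 0.10 with one of
 * new_bw == 0.50 is admitted only if total_bw - 0.10 + 0.50 stays at or
 * below 3.80; otherwise __dl_overflow() reports an overflow.
 */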
97fb7a0a 306extern void dl_change_utilization(struct task_struct *p, u64 new_bw);
f2cb1360 307extern void init_dl_bw(struct dl_bw *dl_b);
97fb7a0a 308extern int sched_dl_global_validate(void);
06a76fe0 309extern void sched_dl_do_global(void);
97fb7a0a 310extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr);
06a76fe0
NP
311extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
312extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
313extern bool __checkparam_dl(const struct sched_attr *attr);
06a76fe0 314extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
97fb7a0a
IM
315extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
316extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
06a76fe0 317extern bool dl_cpu_busy(unsigned int cpu);
029632fb
PZ
318
319#ifdef CONFIG_CGROUP_SCHED
320
321#include <linux/cgroup.h>
322
323struct cfs_rq;
324struct rt_rq;
325
35cf4e50 326extern struct list_head task_groups;
029632fb
PZ
327
328struct cfs_bandwidth {
329#ifdef CONFIG_CFS_BANDWIDTH
97fb7a0a
IM
330 raw_spinlock_t lock;
331 ktime_t period;
332 u64 quota;
333 u64 runtime;
334 s64 hierarchical_quota;
335 u64 runtime_expires;
512ac999 336 int expires_seq;
97fb7a0a 337
512ac999
XP
338 short idle;
339 short period_active;
97fb7a0a
IM
340 struct hrtimer period_timer;
341 struct hrtimer slack_timer;
342 struct list_head throttled_cfs_rq;
343
344 /* Statistics: */
345 int nr_periods;
346 int nr_throttled;
347 u64 throttled_time;
baa9be4f
PA
348
349 bool distribute_running;
029632fb
PZ
350#endif
351};
352
97fb7a0a 353/* Task group related information */
029632fb
PZ
354struct task_group {
355 struct cgroup_subsys_state css;
356
357#ifdef CONFIG_FAIR_GROUP_SCHED
97fb7a0a
IM
358 /* schedulable entities of this group on each CPU */
359 struct sched_entity **se;
360 /* runqueue "owned" by this group on each CPU */
361 struct cfs_rq **cfs_rq;
362 unsigned long shares;
029632fb 363
fa6bddeb 364#ifdef CONFIG_SMP
b0367629
WL
365 /*
366 * load_avg can be heavily contended at clock tick time, so put
367 * it in its own cacheline separated from the fields above which
368 * will also be accessed at each tick.
369 */
97fb7a0a 370 atomic_long_t load_avg ____cacheline_aligned;
029632fb 371#endif
fa6bddeb 372#endif
029632fb
PZ
373
374#ifdef CONFIG_RT_GROUP_SCHED
97fb7a0a
IM
375 struct sched_rt_entity **rt_se;
376 struct rt_rq **rt_rq;
029632fb 377
97fb7a0a 378 struct rt_bandwidth rt_bandwidth;
029632fb
PZ
379#endif
380
97fb7a0a
IM
381 struct rcu_head rcu;
382 struct list_head list;
029632fb 383
97fb7a0a
IM
384 struct task_group *parent;
385 struct list_head siblings;
386 struct list_head children;
029632fb
PZ
387
388#ifdef CONFIG_SCHED_AUTOGROUP
97fb7a0a 389 struct autogroup *autogroup;
029632fb
PZ
390#endif
391
97fb7a0a 392 struct cfs_bandwidth cfs_bandwidth;
029632fb
PZ
393};
394
395#ifdef CONFIG_FAIR_GROUP_SCHED
396#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
397
/*
 * A weight of 0 or 1 can cause arithmetic problems.
 * The weight of a cfs_rq is the sum of the weights of the entities
 * queued on that cfs_rq, so the weight of a single entity should not
 * be too large, and neither should the shares value of a task group.
 * (The default weight is 1024 - so there's no practical
 * limitation from this.)
 */
97fb7a0a
IM
406#define MIN_SHARES (1UL << 1)
407#define MAX_SHARES (1UL << 18)
029632fb
PZ
408#endif
409
029632fb
PZ
410typedef int (*tg_visitor)(struct task_group *, void *);
411
412extern int walk_tg_tree_from(struct task_group *from,
413 tg_visitor down, tg_visitor up, void *data);
414
415/*
416 * Iterate the full tree, calling @down when first entering a node and @up when
417 * leaving it for the final time.
418 *
419 * Caller must hold rcu_lock or sufficient equivalent.
420 */
421static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
422{
423 return walk_tg_tree_from(&root_task_group, down, up, data);
424}
425
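/*
 * Usage sketch (illustrative only, not part of this header): a caller that
 * wants to run a callback on every task group could do:
 *
 *	static int visit_tg(struct task_group *tg, void *data)
 *	{
 *		return 0;
 *	}
 *
 *	rcu_read_lock();
 *	walk_tg_tree(visit_tg, tg_nop, NULL);
 *	rcu_read_unlock();
 *
 * visit_tg() is a hypothetical "down" visitor (returning non-zero aborts the
 * walk), tg_nop() is the provided no-op for the direction the caller does not
 * care about, and holding the RCU read lock satisfies the rule stated above.
 */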
426extern int tg_nop(struct task_group *tg, void *data);
427
428extern void free_fair_sched_group(struct task_group *tg);
429extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
8663e24d 430extern void online_fair_sched_group(struct task_group *tg);
6fe1f348 431extern void unregister_fair_sched_group(struct task_group *tg);
029632fb
PZ
432extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
433 struct sched_entity *se, int cpu,
434 struct sched_entity *parent);
435extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
029632fb
PZ
436
437extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
77a4d1a1 438extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
029632fb
PZ
439extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
440
441extern void free_rt_sched_group(struct task_group *tg);
442extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
443extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
444 struct sched_rt_entity *rt_se, int cpu,
445 struct sched_rt_entity *parent);
8887cd99
NP
446extern int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us);
447extern int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us);
448extern long sched_group_rt_runtime(struct task_group *tg);
449extern long sched_group_rt_period(struct task_group *tg);
450extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
029632fb 451
25cc7da7
LZ
452extern struct task_group *sched_create_group(struct task_group *parent);
453extern void sched_online_group(struct task_group *tg,
454 struct task_group *parent);
455extern void sched_destroy_group(struct task_group *tg);
456extern void sched_offline_group(struct task_group *tg);
457
458extern void sched_move_task(struct task_struct *tsk);
459
460#ifdef CONFIG_FAIR_GROUP_SCHED
461extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
ad936d86
BP
462
463#ifdef CONFIG_SMP
464extern void set_task_rq_fair(struct sched_entity *se,
465 struct cfs_rq *prev, struct cfs_rq *next);
466#else /* !CONFIG_SMP */
467static inline void set_task_rq_fair(struct sched_entity *se,
468 struct cfs_rq *prev, struct cfs_rq *next) { }
469#endif /* CONFIG_SMP */
470#endif /* CONFIG_FAIR_GROUP_SCHED */
25cc7da7 471
029632fb
PZ
472#else /* CONFIG_CGROUP_SCHED */
473
474struct cfs_bandwidth { };
475
476#endif /* CONFIG_CGROUP_SCHED */
477
478/* CFS-related fields in a runqueue */
479struct cfs_rq {
97fb7a0a
IM
480 struct load_weight load;
481 unsigned long runnable_weight;
482 unsigned int nr_running;
483 unsigned int h_nr_running;
029632fb 484
97fb7a0a
IM
485 u64 exec_clock;
486 u64 min_vruntime;
029632fb 487#ifndef CONFIG_64BIT
97fb7a0a 488 u64 min_vruntime_copy;
029632fb
PZ
489#endif
490
97fb7a0a 491 struct rb_root_cached tasks_timeline;
029632fb 492
029632fb
PZ
	/*
	 * 'curr' points to the currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e. when none are currently running).
	 */
97fb7a0a
IM
497 struct sched_entity *curr;
498 struct sched_entity *next;
499 struct sched_entity *last;
500 struct sched_entity *skip;
029632fb
PZ
501
502#ifdef CONFIG_SCHED_DEBUG
97fb7a0a 503 unsigned int nr_spread_over;
029632fb
PZ
504#endif
505
2dac754e
PT
506#ifdef CONFIG_SMP
507 /*
9d89c257 508 * CFS load tracking
2dac754e 509 */
97fb7a0a 510 struct sched_avg avg;
2a2f5d4e 511#ifndef CONFIG_64BIT
97fb7a0a 512 u64 load_last_update_time_copy;
9d89c257 513#endif
2a2f5d4e
PZ
514 struct {
515 raw_spinlock_t lock ____cacheline_aligned;
516 int nr;
517 unsigned long load_avg;
518 unsigned long util_avg;
0e2d2aaa 519 unsigned long runnable_sum;
2a2f5d4e 520 } removed;
82958366 521
9d89c257 522#ifdef CONFIG_FAIR_GROUP_SCHED
97fb7a0a
IM
523 unsigned long tg_load_avg_contrib;
524 long propagate;
525 long prop_runnable_sum;
0e2d2aaa 526
82958366
PT
527 /*
528 * h_load = weight * f(tg)
529 *
530 * Where f(tg) is the recursive weight fraction assigned to
531 * this group.
532 */
97fb7a0a
IM
533 unsigned long h_load;
534 u64 last_h_load_update;
535 struct sched_entity *h_load_next;
68520796 536#endif /* CONFIG_FAIR_GROUP_SCHED */
82958366
PT
537#endif /* CONFIG_SMP */
538
029632fb 539#ifdef CONFIG_FAIR_GROUP_SCHED
97fb7a0a 540 struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */
029632fb
PZ
541
	/*
	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf runqueues hold other higher schedulable
	 * entities (like users, containers etc.)
	 *
	 * leaf_cfs_rq_list ties together the list of leaf cfs_rq's in a CPU.
	 * This list is used during load balance.
	 */
97fb7a0a
IM
550 int on_list;
551 struct list_head leaf_cfs_rq_list;
552 struct task_group *tg; /* group that "owns" this runqueue */
029632fb 553
029632fb 554#ifdef CONFIG_CFS_BANDWIDTH
97fb7a0a 555 int runtime_enabled;
512ac999 556 int expires_seq;
97fb7a0a
IM
557 u64 runtime_expires;
558 s64 runtime_remaining;
559
560 u64 throttled_clock;
561 u64 throttled_clock_task;
562 u64 throttled_clock_task_time;
563 int throttled;
564 int throttle_count;
565 struct list_head throttled_list;
029632fb
PZ
566#endif /* CONFIG_CFS_BANDWIDTH */
567#endif /* CONFIG_FAIR_GROUP_SCHED */
568};
569
570static inline int rt_bandwidth_enabled(void)
571{
572 return sysctl_sched_rt_runtime >= 0;
573}
574
b6366f04 575/* RT IPI pull logic requires IRQ_WORK */
4bdced5c 576#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP)
b6366f04
SR
577# define HAVE_RT_PUSH_IPI
578#endif
579
029632fb
PZ
580/* Real-Time classes' related field in a runqueue: */
581struct rt_rq {
97fb7a0a
IM
582 struct rt_prio_array active;
583 unsigned int rt_nr_running;
584 unsigned int rr_nr_running;
029632fb
PZ
585#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
586 struct {
97fb7a0a 587 int curr; /* highest queued rt task prio */
029632fb 588#ifdef CONFIG_SMP
97fb7a0a 589 int next; /* next highest */
029632fb
PZ
590#endif
591 } highest_prio;
592#endif
593#ifdef CONFIG_SMP
97fb7a0a
IM
594 unsigned long rt_nr_migratory;
595 unsigned long rt_nr_total;
596 int overloaded;
597 struct plist_head pushable_tasks;
371bf427 598
b6366f04 599#endif /* CONFIG_SMP */
97fb7a0a 600 int rt_queued;
f4ebcbc0 601
97fb7a0a
IM
602 int rt_throttled;
603 u64 rt_time;
604 u64 rt_runtime;
029632fb 605 /* Nests inside the rq lock: */
97fb7a0a 606 raw_spinlock_t rt_runtime_lock;
029632fb
PZ
607
608#ifdef CONFIG_RT_GROUP_SCHED
97fb7a0a 609 unsigned long rt_nr_boosted;
029632fb 610
97fb7a0a
IM
611 struct rq *rq;
612 struct task_group *tg;
029632fb
PZ
613#endif
614};
615
296b2ffe
VG
616static inline bool rt_rq_is_runnable(struct rt_rq *rt_rq)
617{
618 return rt_rq->rt_queued && rt_rq->rt_nr_running;
619}
620
aab03e05
DF
621/* Deadline class' related fields in a runqueue */
622struct dl_rq {
623 /* runqueue is an rbtree, ordered by deadline */
97fb7a0a 624 struct rb_root_cached root;
aab03e05 625
97fb7a0a 626 unsigned long dl_nr_running;
1baca4ce
JL
627
628#ifdef CONFIG_SMP
629 /*
630 * Deadline values of the currently executing and the
631 * earliest ready task on this rq. Caching these facilitates
632 * the decision wether or not a ready but not running task
633 * should migrate somewhere else.
634 */
635 struct {
97fb7a0a
IM
636 u64 curr;
637 u64 next;
1baca4ce
JL
638 } earliest_dl;
639
97fb7a0a
IM
640 unsigned long dl_nr_migratory;
641 int overloaded;
1baca4ce
JL
642
643 /*
644 * Tasks on this rq that can be pushed away. They are kept in
645 * an rb-tree, ordered by tasks' deadlines, with caching
646 * of the leftmost (earliest deadline) element.
647 */
97fb7a0a 648 struct rb_root_cached pushable_dl_tasks_root;
332ac17e 649#else
97fb7a0a 650 struct dl_bw dl_bw;
1baca4ce 651#endif
e36d8677
LA
652 /*
653 * "Active utilization" for this runqueue: increased when a
654 * task wakes up (becomes TASK_RUNNING) and decreased when a
655 * task blocks
656 */
97fb7a0a 657 u64 running_bw;
4da3abce 658
8fd27231
LA
659 /*
660 * Utilization of the tasks "assigned" to this runqueue (including
661 * the tasks that are in runqueue and the tasks that executed on this
662 * CPU and blocked). Increased when a task moves to this runqueue, and
663 * decreased when the task moves away (migrates, changes scheduling
664 * policy, or terminates).
665 * This is needed to compute the "inactive utilization" for the
666 * runqueue (inactive utilization = this_bw - running_bw).
667 */
97fb7a0a
IM
668 u64 this_bw;
669 u64 extra_bw;
8fd27231 670
4da3abce
LA
671 /*
672 * Inverse of the fraction of CPU utilization that can be reclaimed
673 * by the GRUB algorithm.
674 */
97fb7a0a 675 u64 bw_ratio;
aab03e05
DF
676};
677
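/*
 * Worked relation for the fields above (illustrative): if three -deadline
 * tasks with bandwidths 0.1, 0.2 and 0.3 are assigned to this runqueue but
 * only the first two are currently runnable, then running_bw == 0.3,
 * this_bw == 0.6, and the inactive utilization reclaimable by GRUB is
 * this_bw - running_bw == 0.3 (all values stored scaled by BW_UNIT).
 */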
c0796298
VG
678#ifdef CONFIG_FAIR_GROUP_SCHED
679/* An entity is a task if it doesn't "own" a runqueue */
680#define entity_is_task(se) (!se->my_q)
681#else
682#define entity_is_task(se) 1
683#endif
684
029632fb 685#ifdef CONFIG_SMP
c0796298
VG
686/*
687 * XXX we want to get rid of these helpers and use the full load resolution.
688 */
689static inline long se_weight(struct sched_entity *se)
690{
691 return scale_load_down(se->load.weight);
692}
693
694static inline long se_runnable(struct sched_entity *se)
695{
696 return scale_load_down(se->runnable_weight);
697}
029632fb 698
afe06efd
TC
699static inline bool sched_asym_prefer(int a, int b)
700{
701 return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
702}
703
029632fb
PZ
704/*
705 * We add the notion of a root-domain which will be used to define per-domain
706 * variables. Each exclusive cpuset essentially defines an island domain by
97fb7a0a 707 * fully partitioning the member CPUs from any other cpuset. Whenever a new
029632fb
PZ
708 * exclusive cpuset is created, we also create and attach a new root-domain
709 * object.
710 *
711 */
712struct root_domain {
97fb7a0a
IM
713 atomic_t refcount;
714 atomic_t rto_count;
715 struct rcu_head rcu;
716 cpumask_var_t span;
717 cpumask_var_t online;
029632fb 718
757ffdd7
VS
719 /*
720 * Indicate pullable load on at least one CPU, e.g:
721 * - More than one runnable task
722 * - Running task is misfit
723 */
575638d1 724 int overload;
4486edd1 725
1baca4ce
JL
726 /*
727 * The bit corresponding to a CPU gets set here if such CPU has more
728 * than one runnable -deadline task (as it is below for RT tasks).
729 */
97fb7a0a
IM
730 cpumask_var_t dlo_mask;
731 atomic_t dlo_count;
732 struct dl_bw dl_bw;
733 struct cpudl cpudl;
1baca4ce 734
4bdced5c
SRRH
735#ifdef HAVE_RT_PUSH_IPI
736 /*
737 * For IPI pull requests, loop across the rto_mask.
738 */
97fb7a0a
IM
739 struct irq_work rto_push_work;
740 raw_spinlock_t rto_lock;
4bdced5c 741 /* These are only updated and read within rto_lock */
97fb7a0a
IM
742 int rto_loop;
743 int rto_cpu;
4bdced5c 744 /* These atomics are updated outside of a lock */
97fb7a0a
IM
745 atomic_t rto_loop_next;
746 atomic_t rto_loop_start;
4bdced5c 747#endif
029632fb
PZ
748 /*
749 * The "RT overload" flag: it gets set if a CPU has more than
750 * one runnable RT task.
751 */
97fb7a0a
IM
752 cpumask_var_t rto_mask;
753 struct cpupri cpupri;
cd92bfd3 754
97fb7a0a 755 unsigned long max_cpu_capacity;
029632fb
PZ
756};
757
758extern struct root_domain def_root_domain;
f2cb1360 759extern struct mutex sched_domains_mutex;
f2cb1360
IM
760
761extern void init_defrootdomain(void);
8d5dc512 762extern int sched_init_domains(const struct cpumask *cpu_map);
f2cb1360 763extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
364f5665
SRV
764extern void sched_get_rd(struct root_domain *rd);
765extern void sched_put_rd(struct root_domain *rd);
029632fb 766
4bdced5c
SRRH
767#ifdef HAVE_RT_PUSH_IPI
768extern void rto_push_irq_work_func(struct irq_work *work);
769#endif
029632fb
PZ
770#endif /* CONFIG_SMP */
771
/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: code that needs to lock multiple runqueues (such as
 * the load balancing or the thread migration code) must acquire the
 * locks in ascending &runqueue order.
 */
779struct rq {
780 /* runqueue lock: */
97fb7a0a 781 raw_spinlock_t lock;
029632fb
PZ
782
783 /*
784 * nr_running and cpu_load should be in the same cacheline because
785 * remote CPUs use both these fields when doing load calculation.
786 */
97fb7a0a 787 unsigned int nr_running;
0ec8aa00 788#ifdef CONFIG_NUMA_BALANCING
97fb7a0a
IM
789 unsigned int nr_numa_running;
790 unsigned int nr_preferred_running;
a4739eca 791 unsigned int numa_migrate_on;
0ec8aa00 792#endif
029632fb 793 #define CPU_LOAD_IDX_MAX 5
97fb7a0a 794 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
3451d024 795#ifdef CONFIG_NO_HZ_COMMON
9fd81dd5 796#ifdef CONFIG_SMP
97fb7a0a 797 unsigned long last_load_update_tick;
e022e0d3 798 unsigned long last_blocked_load_update_tick;
f643ea22 799 unsigned int has_blocked_load;
9fd81dd5 800#endif /* CONFIG_SMP */
00357f5e 801 unsigned int nohz_tick_stopped;
a22e47a4 802 atomic_t nohz_flags;
9fd81dd5 803#endif /* CONFIG_NO_HZ_COMMON */
dcdedb24 804
97fb7a0a
IM
805 /* capture load from *all* tasks on this CPU: */
806 struct load_weight load;
807 unsigned long nr_load_updates;
808 u64 nr_switches;
029632fb 809
97fb7a0a
IM
810 struct cfs_rq cfs;
811 struct rt_rq rt;
812 struct dl_rq dl;
029632fb
PZ
813
814#ifdef CONFIG_FAIR_GROUP_SCHED
97fb7a0a
IM
815 /* list of leaf cfs_rq on this CPU: */
816 struct list_head leaf_cfs_rq_list;
817 struct list_head *tmp_alone_branch;
a35b6466
PZ
818#endif /* CONFIG_FAIR_GROUP_SCHED */
819
029632fb
PZ
820 /*
821 * This is part of a global counter where only the total sum
822 * over all CPUs matters. A task can increase this counter on
823 * one CPU and if it got migrated afterwards it may decrease
824 * it on another CPU. Always updated under the runqueue lock:
825 */
97fb7a0a 826 unsigned long nr_uninterruptible;
029632fb 827
97fb7a0a
IM
828 struct task_struct *curr;
829 struct task_struct *idle;
830 struct task_struct *stop;
831 unsigned long next_balance;
832 struct mm_struct *prev_mm;
029632fb 833
97fb7a0a
IM
834 unsigned int clock_update_flags;
835 u64 clock;
836 u64 clock_task;
029632fb 837
97fb7a0a 838 atomic_t nr_iowait;
029632fb
PZ
839
840#ifdef CONFIG_SMP
97fb7a0a
IM
841 struct root_domain *rd;
842 struct sched_domain *sd;
843
844 unsigned long cpu_capacity;
845 unsigned long cpu_capacity_orig;
029632fb 846
97fb7a0a 847 struct callback_head *balance_callback;
029632fb 848
97fb7a0a 849 unsigned char idle_balance;
e3fca9e7 850
3b1baa64
MR
851 unsigned long misfit_task_load;
852
029632fb 853 /* For active balancing */
97fb7a0a
IM
854 int active_balance;
855 int push_cpu;
856 struct cpu_stop_work active_balance_work;
857
858 /* CPU of this runqueue: */
859 int cpu;
860 int online;
029632fb 861
367456c7
PZ
862 struct list_head cfs_tasks;
863
371bf427 864 struct sched_avg avg_rt;
3727e0e1 865 struct sched_avg avg_dl;
11d4afd4 866#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
91c27493
VG
867 struct sched_avg avg_irq;
868#endif
97fb7a0a
IM
869 u64 idle_stamp;
870 u64 avg_idle;
9bd721c5
JL
871
872 /* This is used to determine avg_idle's max value */
97fb7a0a 873 u64 max_idle_balance_cost;
029632fb
PZ
874#endif
875
876#ifdef CONFIG_IRQ_TIME_ACCOUNTING
97fb7a0a 877 u64 prev_irq_time;
029632fb
PZ
878#endif
879#ifdef CONFIG_PARAVIRT
97fb7a0a 880 u64 prev_steal_time;
029632fb
PZ
881#endif
882#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
97fb7a0a 883 u64 prev_steal_time_rq;
029632fb
PZ
884#endif
885
886 /* calc_load related fields */
97fb7a0a
IM
887 unsigned long calc_load_update;
888 long calc_load_active;
029632fb
PZ
889
890#ifdef CONFIG_SCHED_HRTICK
891#ifdef CONFIG_SMP
97fb7a0a
IM
892 int hrtick_csd_pending;
893 call_single_data_t hrtick_csd;
029632fb 894#endif
97fb7a0a 895 struct hrtimer hrtick_timer;
029632fb
PZ
896#endif
897
898#ifdef CONFIG_SCHEDSTATS
899 /* latency stats */
97fb7a0a
IM
900 struct sched_info rq_sched_info;
901 unsigned long long rq_cpu_time;
029632fb
PZ
902 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
903
904 /* sys_sched_yield() stats */
97fb7a0a 905 unsigned int yld_count;
029632fb
PZ
906
907 /* schedule() stats */
97fb7a0a
IM
908 unsigned int sched_count;
909 unsigned int sched_goidle;
029632fb
PZ
910
911 /* try_to_wake_up() stats */
97fb7a0a
IM
912 unsigned int ttwu_count;
913 unsigned int ttwu_local;
029632fb
PZ
914#endif
915
916#ifdef CONFIG_SMP
97fb7a0a 917 struct llist_head wake_list;
029632fb 918#endif
442bf3aa
DL
919
920#ifdef CONFIG_CPU_IDLE
921 /* Must be inspected within a rcu lock section */
97fb7a0a 922 struct cpuidle_state *idle_state;
442bf3aa 923#endif
029632fb
PZ
924};
925
926static inline int cpu_of(struct rq *rq)
927{
928#ifdef CONFIG_SMP
929 return rq->cpu;
930#else
931 return 0;
932#endif
933}
934
1b568f0a
PZ
935
936#ifdef CONFIG_SCHED_SMT
937
938extern struct static_key_false sched_smt_present;
939
940extern void __update_idle_core(struct rq *rq);
941
942static inline void update_idle_core(struct rq *rq)
943{
944 if (static_branch_unlikely(&sched_smt_present))
945 __update_idle_core(rq);
946}
947
948#else
949static inline void update_idle_core(struct rq *rq) { }
950#endif
951
8b06c55b 952DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
029632fb 953
518cd623 954#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
4a32fea9 955#define this_rq() this_cpu_ptr(&runqueues)
518cd623
PZ
956#define task_rq(p) cpu_rq(task_cpu(p))
957#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
4a32fea9 958#define raw_rq() raw_cpu_ptr(&runqueues)
518cd623 959
1f351d7f
JW
960extern void update_rq_clock(struct rq *rq);
961
cebde6d6
PZ
962static inline u64 __rq_clock_broken(struct rq *rq)
963{
316c1608 964 return READ_ONCE(rq->clock);
cebde6d6
PZ
965}
966
cb42c9a3
MF
/*
 * rq::clock_update_flags bits
 *
 * %RQCF_REQ_SKIP - will request skipping of clock update on the next
 *  call to __schedule(). This is an optimisation to avoid
 *  neighbouring rq clock updates.
 *
 * %RQCF_ACT_SKIP - is set from inside of __schedule() when skipping is
 *  in effect and calls to update_rq_clock() are being ignored.
 *
 * %RQCF_UPDATED - is a debug flag that indicates whether a call has been
 *  made to update_rq_clock() since the last time rq::lock was pinned.
 *
 * If inside of __schedule(), clock_update_flags will have been
 * shifted left (a left shift is a cheap operation for the fast path
 * to promote %RQCF_REQ_SKIP to %RQCF_ACT_SKIP), so you must use,
 *
 *	if (rq->clock_update_flags >= RQCF_UPDATED)
 *
 * to check if %RQCF_UPDATED is set. It'll never be shifted more than
 * one position though, because the next rq_unpin_lock() will shift it
 * back.
 */
#define RQCF_REQ_SKIP	0x01
#define RQCF_ACT_SKIP	0x02
#define RQCF_UPDATED	0x04
cb42c9a3
MF
993
994static inline void assert_clock_updated(struct rq *rq)
995{
996 /*
997 * The only reason for not seeing a clock update since the
998 * last rq_pin_lock() is if we're currently skipping updates.
999 */
1000 SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP);
1001}
1002
78becc27
FW
1003static inline u64 rq_clock(struct rq *rq)
1004{
cebde6d6 1005 lockdep_assert_held(&rq->lock);
cb42c9a3
MF
1006 assert_clock_updated(rq);
1007
78becc27
FW
1008 return rq->clock;
1009}
1010
1011static inline u64 rq_clock_task(struct rq *rq)
1012{
cebde6d6 1013 lockdep_assert_held(&rq->lock);
cb42c9a3
MF
1014 assert_clock_updated(rq);
1015
78becc27
FW
1016 return rq->clock_task;
1017}
1018
adcc8da8 1019static inline void rq_clock_skip_update(struct rq *rq)
9edfbfed
PZ
1020{
1021 lockdep_assert_held(&rq->lock);
adcc8da8
DB
1022 rq->clock_update_flags |= RQCF_REQ_SKIP;
1023}
1024
1025/*
595058b6 1026 * See rt task throttling, which is the only time a skip
adcc8da8
DB
1027 * request is cancelled.
1028 */
1029static inline void rq_clock_cancel_skipupdate(struct rq *rq)
1030{
1031 lockdep_assert_held(&rq->lock);
1032 rq->clock_update_flags &= ~RQCF_REQ_SKIP;
9edfbfed
PZ
1033}
1034
d8ac8971
MF
1035struct rq_flags {
1036 unsigned long flags;
1037 struct pin_cookie cookie;
cb42c9a3
MF
1038#ifdef CONFIG_SCHED_DEBUG
1039 /*
1040 * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the
1041 * current pin context is stashed here in case it needs to be
1042 * restored in rq_repin_lock().
1043 */
1044 unsigned int clock_update_flags;
1045#endif
d8ac8971
MF
1046};
1047
1048static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
1049{
1050 rf->cookie = lockdep_pin_lock(&rq->lock);
cb42c9a3
MF
1051
1052#ifdef CONFIG_SCHED_DEBUG
1053 rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
1054 rf->clock_update_flags = 0;
1055#endif
d8ac8971
MF
1056}
1057
1058static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
1059{
cb42c9a3
MF
1060#ifdef CONFIG_SCHED_DEBUG
1061 if (rq->clock_update_flags > RQCF_ACT_SKIP)
1062 rf->clock_update_flags = RQCF_UPDATED;
1063#endif
1064
d8ac8971
MF
1065 lockdep_unpin_lock(&rq->lock, rf->cookie);
1066}
1067
1068static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
1069{
1070 lockdep_repin_lock(&rq->lock, rf->cookie);
cb42c9a3
MF
1071
1072#ifdef CONFIG_SCHED_DEBUG
1073 /*
1074 * Restore the value we stashed in @rf for this pin context.
1075 */
1076 rq->clock_update_flags |= rf->clock_update_flags;
1077#endif
d8ac8971
MF
1078}
1079
1f351d7f
JW
1080struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
1081 __acquires(rq->lock);
1082
1083struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
1084 __acquires(p->pi_lock)
1085 __acquires(rq->lock);
1086
1087static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
1088 __releases(rq->lock)
1089{
1090 rq_unpin_lock(rq, rf);
1091 raw_spin_unlock(&rq->lock);
1092}
1093
1094static inline void
1095task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
1096 __releases(rq->lock)
1097 __releases(p->pi_lock)
1098{
1099 rq_unpin_lock(rq, rf);
1100 raw_spin_unlock(&rq->lock);
1101 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
1102}
1103
1104static inline void
1105rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
1106 __acquires(rq->lock)
1107{
1108 raw_spin_lock_irqsave(&rq->lock, rf->flags);
1109 rq_pin_lock(rq, rf);
1110}
1111
1112static inline void
1113rq_lock_irq(struct rq *rq, struct rq_flags *rf)
1114 __acquires(rq->lock)
1115{
1116 raw_spin_lock_irq(&rq->lock);
1117 rq_pin_lock(rq, rf);
1118}
1119
1120static inline void
1121rq_lock(struct rq *rq, struct rq_flags *rf)
1122 __acquires(rq->lock)
1123{
1124 raw_spin_lock(&rq->lock);
1125 rq_pin_lock(rq, rf);
1126}
1127
1128static inline void
1129rq_relock(struct rq *rq, struct rq_flags *rf)
1130 __acquires(rq->lock)
1131{
1132 raw_spin_lock(&rq->lock);
1133 rq_repin_lock(rq, rf);
1134}
1135
1136static inline void
1137rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
1138 __releases(rq->lock)
1139{
1140 rq_unpin_lock(rq, rf);
1141 raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
1142}
1143
1144static inline void
1145rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
1146 __releases(rq->lock)
1147{
1148 rq_unpin_lock(rq, rf);
1149 raw_spin_unlock_irq(&rq->lock);
1150}
1151
1152static inline void
1153rq_unlock(struct rq *rq, struct rq_flags *rf)
1154 __releases(rq->lock)
1155{
1156 rq_unpin_lock(rq, rf);
1157 raw_spin_unlock(&rq->lock);
1158}
1159
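/*
 * Typical usage of the helpers above (illustrative sketch, assuming the
 * caller can be interrupted and therefore needs the irqsave variant):
 *
 *	struct rq_flags rf;
 *	struct rq *rq = cpu_rq(cpu);
 *
 *	rq_lock_irqsave(rq, &rf);
 *	update_rq_clock(rq);
 *	... read rq_clock(rq) / rq_clock_task(rq), modify the runqueue ...
 *	rq_unlock_irqrestore(rq, &rf);
 *
 * The pin/unpin done by these wrappers is what lets assert_clock_updated()
 * catch paths that read the clock without having updated it first.
 */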
9942f79b 1160#ifdef CONFIG_NUMA
e3fe70b1
RR
1161enum numa_topology_type {
1162 NUMA_DIRECT,
1163 NUMA_GLUELESS_MESH,
1164 NUMA_BACKPLANE,
1165};
1166extern enum numa_topology_type sched_numa_topology_type;
9942f79b
RR
1167extern int sched_max_numa_distance;
1168extern bool find_numa_distance(int distance);
1169#endif
1170
f2cb1360
IM
1171#ifdef CONFIG_NUMA
1172extern void sched_init_numa(void);
1173extern void sched_domains_numa_masks_set(unsigned int cpu);
1174extern void sched_domains_numa_masks_clear(unsigned int cpu);
1175#else
1176static inline void sched_init_numa(void) { }
1177static inline void sched_domains_numa_masks_set(unsigned int cpu) { }
1178static inline void sched_domains_numa_masks_clear(unsigned int cpu) { }
1179#endif
1180
f809ca9a 1181#ifdef CONFIG_NUMA_BALANCING
44dba3d5
IM
1182/* The regions in numa_faults array from task_struct */
1183enum numa_faults_stats {
1184 NUMA_MEM = 0,
1185 NUMA_CPU,
1186 NUMA_MEMBUF,
1187 NUMA_CPUBUF
1188};
0ec8aa00 1189extern void sched_setnuma(struct task_struct *p, int node);
e6628d5b 1190extern int migrate_task_to(struct task_struct *p, int cpu);
0ad4e3df
SD
1191extern int migrate_swap(struct task_struct *p, struct task_struct *t,
1192 int cpu, int scpu);
13784475
MG
1193extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
1194#else
1195static inline void
1196init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
1197{
1198}
f809ca9a
MG
1199#endif /* CONFIG_NUMA_BALANCING */
1200
518cd623
PZ
1201#ifdef CONFIG_SMP
1202
e3fca9e7
PZ
1203static inline void
1204queue_balance_callback(struct rq *rq,
1205 struct callback_head *head,
1206 void (*func)(struct rq *rq))
1207{
1208 lockdep_assert_held(&rq->lock);
1209
1210 if (unlikely(head->next))
1211 return;
1212
1213 head->func = (void (*)(struct callback_head *))func;
1214 head->next = rq->balance_callback;
1215 rq->balance_callback = head;
1216}
1217
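/*
 * Usage sketch (illustrative; the names below are placeholders, see the rt
 * and deadline classes for the real users): a scheduling class that wants to
 * push tasks once it is safe to drop the rq lock queues a per-CPU callback
 * head while still holding the lock:
 *
 *	queue_balance_callback(rq, &per_cpu(my_balance_head, rq->cpu),
 *			       my_push_callback);
 *
 * The early return above makes queueing an already-pending callback a no-op,
 * and the queued function is invoked later by the balance-callback machinery
 * in core.c.
 */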
e3baac47
PZ
1218extern void sched_ttwu_pending(void);
1219
029632fb
PZ
1220#define rcu_dereference_check_sched_domain(p) \
1221 rcu_dereference_check((p), \
1222 lockdep_is_held(&sched_domains_mutex))
1223
1224/*
1225 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
1226 * See detach_destroy_domains: synchronize_sched for details.
1227 *
1228 * The domain tree of any CPU may only be accessed from within
1229 * preempt-disabled sections.
1230 */
1231#define for_each_domain(cpu, __sd) \
518cd623
PZ
1232 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
1233 __sd; __sd = __sd->parent)
029632fb 1234
77e81365
SS
1235#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
1236
518cd623
PZ
1237/**
1238 * highest_flag_domain - Return highest sched_domain containing flag.
97fb7a0a 1239 * @cpu: The CPU whose highest level of sched domain is to
518cd623
PZ
1240 * be returned.
1241 * @flag: The flag to check for the highest sched_domain
97fb7a0a 1242 * for the given CPU.
518cd623 1243 *
97fb7a0a 1244 * Returns the highest sched_domain of a CPU which contains the given flag.
518cd623
PZ
1245 */
1246static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
1247{
1248 struct sched_domain *sd, *hsd = NULL;
1249
1250 for_each_domain(cpu, sd) {
1251 if (!(sd->flags & flag))
1252 break;
1253 hsd = sd;
1254 }
1255
1256 return hsd;
1257}
1258
fb13c7ee
MG
1259static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
1260{
1261 struct sched_domain *sd;
1262
1263 for_each_domain(cpu, sd) {
1264 if (sd->flags & flag)
1265 break;
1266 }
1267
1268 return sd;
1269}
1270
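/*
 * Example (illustrative): the per-CPU sd_llc pointer declared below is
 * effectively derived from highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES),
 * i.e. the widest domain whose CPUs still share cache, while
 * lowest_flag_domain(cpu, SD_NUMA) yields the smallest domain that spans
 * multiple NUMA nodes.
 */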
518cd623 1271DECLARE_PER_CPU(struct sched_domain *, sd_llc);
7d9ffa89 1272DECLARE_PER_CPU(int, sd_llc_size);
518cd623 1273DECLARE_PER_CPU(int, sd_llc_id);
0e369d75 1274DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
fb13c7ee 1275DECLARE_PER_CPU(struct sched_domain *, sd_numa);
37dc6b50 1276DECLARE_PER_CPU(struct sched_domain *, sd_asym);
df054e84 1277extern struct static_key_false sched_asym_cpucapacity;
518cd623 1278
63b2ca30 1279struct sched_group_capacity {
97fb7a0a 1280 atomic_t ref;
5e6521ea 1281 /*
172895e6 1282 * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
63b2ca30 1283 * for a single CPU.
5e6521ea 1284 */
97fb7a0a
IM
1285 unsigned long capacity;
1286 unsigned long min_capacity; /* Min per-CPU capacity in group */
e3d6d0cb 1287 unsigned long max_capacity; /* Max per-CPU capacity in group */
97fb7a0a
IM
1288 unsigned long next_update;
1289 int imbalance; /* XXX unrelated to capacity but shared group state */
5e6521ea 1290
005f874d 1291#ifdef CONFIG_SCHED_DEBUG
97fb7a0a 1292 int id;
005f874d
PZ
1293#endif
1294
97fb7a0a 1295 unsigned long cpumask[0]; /* Balance mask */
5e6521ea
LZ
1296};
1297
1298struct sched_group {
97fb7a0a
IM
1299 struct sched_group *next; /* Must be a circular list */
1300 atomic_t ref;
5e6521ea 1301
97fb7a0a 1302 unsigned int group_weight;
63b2ca30 1303 struct sched_group_capacity *sgc;
97fb7a0a 1304 int asym_prefer_cpu; /* CPU of highest priority in group */
5e6521ea
LZ
1305
1306 /*
1307 * The CPUs this group covers.
1308 *
1309 * NOTE: this field is variable length. (Allocated dynamically
1310 * by attaching extra space to the end of the structure,
1311 * depending on how many CPUs the kernel has booted up with)
1312 */
97fb7a0a 1313 unsigned long cpumask[0];
5e6521ea
LZ
1314};
1315
ae4df9d6 1316static inline struct cpumask *sched_group_span(struct sched_group *sg)
5e6521ea
LZ
1317{
1318 return to_cpumask(sg->cpumask);
1319}
1320
1321/*
e5c14b1f 1322 * See build_balance_mask().
5e6521ea 1323 */
e5c14b1f 1324static inline struct cpumask *group_balance_mask(struct sched_group *sg)
5e6521ea 1325{
63b2ca30 1326 return to_cpumask(sg->sgc->cpumask);
5e6521ea
LZ
1327}
1328
1329/**
97fb7a0a
IM
1330 * group_first_cpu - Returns the first CPU in the cpumask of a sched_group.
1331 * @group: The group whose first CPU is to be returned.
5e6521ea
LZ
1332 */
1333static inline unsigned int group_first_cpu(struct sched_group *group)
1334{
ae4df9d6 1335 return cpumask_first(sched_group_span(group));
5e6521ea
LZ
1336}
1337
c1174876
PZ
1338extern int group_balance_cpu(struct sched_group *sg);
1339
3866e845
SRRH
1340#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
1341void register_sched_domain_sysctl(void);
bbdacdfe 1342void dirty_sched_domain_sysctl(int cpu);
3866e845
SRRH
1343void unregister_sched_domain_sysctl(void);
1344#else
1345static inline void register_sched_domain_sysctl(void)
1346{
1347}
bbdacdfe
PZ
1348static inline void dirty_sched_domain_sysctl(int cpu)
1349{
1350}
3866e845
SRRH
1351static inline void unregister_sched_domain_sysctl(void)
1352{
1353}
1354#endif
1355
e3baac47
PZ
1356#else
1357
1358static inline void sched_ttwu_pending(void) { }
1359
518cd623 1360#endif /* CONFIG_SMP */
029632fb 1361
391e43da 1362#include "stats.h"
1051408f 1363#include "autogroup.h"
029632fb
PZ
1364
1365#ifdef CONFIG_CGROUP_SCHED
1366
/*
 * Return the group to which this task belongs.
 *
 * We cannot use task_css() and friends because the cgroup subsystem
 * changes that value before the cgroup_subsys::attach() method is called,
 * therefore we cannot pin it and might observe the wrong value.
 *
 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
 * core changes this before calling sched_move_task().
 *
 * Instead we use a 'copy' which is updated from sched_move_task() while
 * holding both task_struct::pi_lock and rq::lock.
 */
1380static inline struct task_group *task_group(struct task_struct *p)
1381{
8323f26c 1382 return p->sched_task_group;
029632fb
PZ
1383}
1384
1385/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
1386static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
1387{
1388#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
1389 struct task_group *tg = task_group(p);
1390#endif
1391
1392#ifdef CONFIG_FAIR_GROUP_SCHED
ad936d86 1393 set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]);
029632fb
PZ
1394 p->se.cfs_rq = tg->cfs_rq[cpu];
1395 p->se.parent = tg->se[cpu];
1396#endif
1397
1398#ifdef CONFIG_RT_GROUP_SCHED
1399 p->rt.rt_rq = tg->rt_rq[cpu];
1400 p->rt.parent = tg->rt_se[cpu];
1401#endif
1402}
1403
1404#else /* CONFIG_CGROUP_SCHED */
1405
1406static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
1407static inline struct task_group *task_group(struct task_struct *p)
1408{
1409 return NULL;
1410}
1411
1412#endif /* CONFIG_CGROUP_SCHED */
1413
1414static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1415{
1416 set_task_rq(p, cpu);
1417#ifdef CONFIG_SMP
	/*
	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
	 * successfully executed on another CPU. We must ensure that updates of
	 * per-task data have been completed by this moment.
	 */
1423 smp_wmb();
c65eacbe
AL
1424#ifdef CONFIG_THREAD_INFO_IN_TASK
1425 p->cpu = cpu;
1426#else
029632fb 1427 task_thread_info(p)->cpu = cpu;
c65eacbe 1428#endif
ac66f547 1429 p->wake_cpu = cpu;
029632fb
PZ
1430#endif
1431}
1432
1433/*
1434 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
1435 */
1436#ifdef CONFIG_SCHED_DEBUG
c5905afb 1437# include <linux/static_key.h>
029632fb
PZ
1438# define const_debug __read_mostly
1439#else
1440# define const_debug const
1441#endif
1442
029632fb
PZ
1443#define SCHED_FEAT(name, enabled) \
1444 __SCHED_FEAT_##name ,
1445
1446enum {
391e43da 1447#include "features.h"
f8b6d1cc 1448 __SCHED_FEAT_NR,
029632fb
PZ
1449};
1450
1451#undef SCHED_FEAT
1452
f8b6d1cc 1453#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
765cc3a4
PB
1454
/*
 * To support run-time toggling of sched features, all the translation units
 * (except core.c) reference the sysctl_sched_features defined in core.c.
 */
1459extern const_debug unsigned int sysctl_sched_features;
1460
f8b6d1cc 1461#define SCHED_FEAT(name, enabled) \
c5905afb 1462static __always_inline bool static_branch_##name(struct static_key *key) \
f8b6d1cc 1463{ \
6e76ea8a 1464 return static_key_##enabled(key); \
f8b6d1cc
PZ
1465}
1466
1467#include "features.h"
f8b6d1cc
PZ
1468#undef SCHED_FEAT
1469
c5905afb 1470extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
f8b6d1cc 1471#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
765cc3a4 1472
f8b6d1cc 1473#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
765cc3a4
PB
1474
1475/*
1476 * Each translation unit has its own copy of sysctl_sched_features to allow
1477 * constants propagation at compile time and compiler optimization based on
1478 * features default.
1479 */
1480#define SCHED_FEAT(name, enabled) \
1481 (1UL << __SCHED_FEAT_##name) * enabled |
1482static const_debug __maybe_unused unsigned int sysctl_sched_features =
1483#include "features.h"
1484 0;
1485#undef SCHED_FEAT
1486
7e6f4c5d 1487#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
765cc3a4 1488
f8b6d1cc 1489#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
029632fb 1490
2a595721 1491extern struct static_key_false sched_numa_balancing;
cb251765 1492extern struct static_key_false sched_schedstats;
cbee9f88 1493
029632fb
PZ
1494static inline u64 global_rt_period(void)
1495{
1496 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
1497}
1498
1499static inline u64 global_rt_runtime(void)
1500{
1501 if (sysctl_sched_rt_runtime < 0)
1502 return RUNTIME_INF;
1503
1504 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
1505}
1506
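/*
 * Worked example with the usual defaults (illustrative only): with
 * sysctl_sched_rt_period == 1000000us and sysctl_sched_rt_runtime == 950000us,
 * global_rt_period() returns 1e9 ns and global_rt_runtime() returns 9.5e8 ns,
 * i.e. RT tasks may consume at most 95% of every 1s period.  Writing -1 to
 * the runtime sysctl makes global_rt_runtime() return RUNTIME_INF.
 */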
029632fb
PZ
1507static inline int task_current(struct rq *rq, struct task_struct *p)
1508{
1509 return rq->curr == p;
1510}
1511
1512static inline int task_running(struct rq *rq, struct task_struct *p)
1513{
1514#ifdef CONFIG_SMP
1515 return p->on_cpu;
1516#else
1517 return task_current(rq, p);
1518#endif
1519}
1520
da0c1e65
KT
1521static inline int task_on_rq_queued(struct task_struct *p)
1522{
1523 return p->on_rq == TASK_ON_RQ_QUEUED;
1524}
029632fb 1525
cca26e80
KT
1526static inline int task_on_rq_migrating(struct task_struct *p)
1527{
1528 return p->on_rq == TASK_ON_RQ_MIGRATING;
1529}
1530
b13095f0
LZ
1531/*
1532 * wake flags
1533 */
97fb7a0a
IM
1534#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */
1535#define WF_FORK 0x02 /* Child wakeup after fork */
1536#define WF_MIGRATED 0x4 /* Internal use, task got migrated */
b13095f0 1537
029632fb
PZ
1538/*
1539 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1540 * of tasks with abnormal "nice" values across CPUs the contribution that
1541 * each task makes to its run queue's load is weighted according to its
1542 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1543 * scaled version of the new time slice allocation that they receive on time
1544 * slice expiry etc.
1545 */
1546
97fb7a0a
IM
1547#define WEIGHT_IDLEPRIO 3
1548#define WMULT_IDLEPRIO 1431655765
029632fb 1549
97fb7a0a
IM
1550extern const int sched_prio_to_weight[40];
1551extern const u32 sched_prio_to_wmult[40];
029632fb 1552
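/*
 * Illustrative values from the tables above: sched_prio_to_weight[20]
 * (nice 0) is 1024 and each nice level changes the weight by roughly 25%,
 * e.g. nice -1 -> 1277 and nice +1 -> 820.  sched_prio_to_wmult[] caches
 * 2^32 / weight so the division in the delta_exec scaling can be replaced
 * by a multiply and shift.
 */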
ff77e468
PZ
1553/*
1554 * {de,en}queue flags:
1555 *
1556 * DEQUEUE_SLEEP - task is no longer runnable
1557 * ENQUEUE_WAKEUP - task just became runnable
1558 *
1559 * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
1560 * are in a known state which allows modification. Such pairs
1561 * should preserve as much state as possible.
1562 *
1563 * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
1564 * in the runqueue.
1565 *
1566 * ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
1567 * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
59efa0ba 1568 * ENQUEUE_MIGRATED - the task was migrated during wakeup
ff77e468
PZ
1569 *
1570 */
1571
1572#define DEQUEUE_SLEEP 0x01
97fb7a0a
IM
1573#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */
1574#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
1575#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
ff77e468 1576
1de64443 1577#define ENQUEUE_WAKEUP 0x01
ff77e468
PZ
1578#define ENQUEUE_RESTORE 0x02
1579#define ENQUEUE_MOVE 0x04
0a67d1ee 1580#define ENQUEUE_NOCLOCK 0x08
ff77e468 1581
0a67d1ee
PZ
1582#define ENQUEUE_HEAD 0x10
1583#define ENQUEUE_REPLENISH 0x20
c82ba9fa 1584#ifdef CONFIG_SMP
0a67d1ee 1585#define ENQUEUE_MIGRATED 0x40
c82ba9fa 1586#else
59efa0ba 1587#define ENQUEUE_MIGRATED 0x00
c82ba9fa 1588#endif
c82ba9fa 1589
37e117c0
PZ
1590#define RETRY_TASK ((void *)-1UL)
1591
c82ba9fa
LZ
1592struct sched_class {
1593 const struct sched_class *next;
1594
1595 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
1596 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
97fb7a0a
IM
1597 void (*yield_task) (struct rq *rq);
1598 bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);
c82ba9fa 1599
97fb7a0a 1600 void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
c82ba9fa 1601
606dba2e
PZ
1602 /*
1603 * It is the responsibility of the pick_next_task() method that will
1604 * return the next task to call put_prev_task() on the @prev task or
1605 * something equivalent.
37e117c0
PZ
1606 *
1607 * May return RETRY_TASK when it finds a higher prio class has runnable
1608 * tasks.
606dba2e 1609 */
97fb7a0a
IM
1610 struct task_struct * (*pick_next_task)(struct rq *rq,
1611 struct task_struct *prev,
1612 struct rq_flags *rf);
1613 void (*put_prev_task)(struct rq *rq, struct task_struct *p);
c82ba9fa
LZ
1614
1615#ifdef CONFIG_SMP
ac66f547 1616 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
1327237a 1617 void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
c82ba9fa 1618
97fb7a0a 1619 void (*task_woken)(struct rq *this_rq, struct task_struct *task);
c82ba9fa
LZ
1620
1621 void (*set_cpus_allowed)(struct task_struct *p,
1622 const struct cpumask *newmask);
1623
1624 void (*rq_online)(struct rq *rq);
1625 void (*rq_offline)(struct rq *rq);
1626#endif
1627
97fb7a0a
IM
1628 void (*set_curr_task)(struct rq *rq);
1629 void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
1630 void (*task_fork)(struct task_struct *p);
1631 void (*task_dead)(struct task_struct *p);
c82ba9fa 1632
67dfa1b7
KT
	/*
	 * The switched_from() call is allowed to drop rq->lock, therefore we
	 * cannot assume the switched_from/switched_to pair is serialized by
	 * rq->lock. They are however serialized by p->pi_lock.
	 */
97fb7a0a
IM
1638 void (*switched_from)(struct rq *this_rq, struct task_struct *task);
1639 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
c82ba9fa 1640 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
97fb7a0a 1641 int oldprio);
c82ba9fa 1642
97fb7a0a
IM
1643 unsigned int (*get_rr_interval)(struct rq *rq,
1644 struct task_struct *task);
c82ba9fa 1645
97fb7a0a 1646 void (*update_curr)(struct rq *rq);
6e998916 1647
97fb7a0a
IM
1648#define TASK_SET_GROUP 0
1649#define TASK_MOVE_GROUP 1
ea86cb4b 1650
c82ba9fa 1651#ifdef CONFIG_FAIR_GROUP_SCHED
97fb7a0a 1652 void (*task_change_group)(struct task_struct *p, int type);
c82ba9fa
LZ
1653#endif
1654};
029632fb 1655
3f1d2a31
PZ
1656static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
1657{
1658 prev->sched_class->put_prev_task(rq, prev);
1659}
1660
b2bf6c31
PZ
1661static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
1662{
1663 curr->sched_class->set_curr_task(rq);
1664}
1665
f5832c19 1666#ifdef CONFIG_SMP
029632fb 1667#define sched_class_highest (&stop_sched_class)
f5832c19
NP
1668#else
1669#define sched_class_highest (&dl_sched_class)
1670#endif
029632fb
PZ
1671#define for_each_class(class) \
1672 for (class = sched_class_highest; class; class = class->next)
1673
1674extern const struct sched_class stop_sched_class;
aab03e05 1675extern const struct sched_class dl_sched_class;
029632fb
PZ
1676extern const struct sched_class rt_sched_class;
1677extern const struct sched_class fair_sched_class;
1678extern const struct sched_class idle_sched_class;
1679
1680
1681#ifdef CONFIG_SMP
1682
63b2ca30 1683extern void update_group_capacity(struct sched_domain *sd, int cpu);
b719203b 1684
7caff66f 1685extern void trigger_load_balance(struct rq *rq);
029632fb 1686
c5b28038
PZ
1687extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
1688
029632fb
PZ
1689#endif
1690
442bf3aa
DL
1691#ifdef CONFIG_CPU_IDLE
1692static inline void idle_set_state(struct rq *rq,
1693 struct cpuidle_state *idle_state)
1694{
1695 rq->idle_state = idle_state;
1696}
1697
1698static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1699{
9148a3a1 1700 SCHED_WARN_ON(!rcu_read_lock_held());
97fb7a0a 1701
442bf3aa
DL
1702 return rq->idle_state;
1703}
1704#else
1705static inline void idle_set_state(struct rq *rq,
1706 struct cpuidle_state *idle_state)
1707{
1708}
1709
1710static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1711{
1712 return NULL;
1713}
1714#endif
1715
8663effb
SRV
1716extern void schedule_idle(void);
1717
029632fb
PZ
1718extern void sysrq_sched_debug_show(void);
1719extern void sched_init_granularity(void);
1720extern void update_max_interval(void);
1baca4ce
JL
1721
1722extern void init_sched_dl_class(void);
029632fb
PZ
1723extern void init_sched_rt_class(void);
1724extern void init_sched_fair_class(void);
1725
9059393e
VG
1726extern void reweight_task(struct task_struct *p, int prio);
1727
8875125e 1728extern void resched_curr(struct rq *rq);
029632fb
PZ
1729extern void resched_cpu(int cpu);
1730
1731extern struct rt_bandwidth def_rt_bandwidth;
1732extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
1733
332ac17e
DF
1734extern struct dl_bandwidth def_dl_bandwidth;
1735extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
aab03e05 1736extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
209a0cbd 1737extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
4da3abce 1738extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
aab03e05 1739
97fb7a0a
IM
1740#define BW_SHIFT 20
1741#define BW_UNIT (1 << BW_SHIFT)
1742#define RATIO_SHIFT 8
332ac17e
DF
1743unsigned long to_ratio(u64 period, u64 runtime);
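/*
 * Worked example (illustrative, assuming to_ratio() keeps its usual
 * definition of (runtime << BW_SHIFT) / period): a deadline reservation of
 * runtime = 10ms out of period = 100ms becomes
 *
 *	to_ratio(100 * NSEC_PER_MSEC, 10 * NSEC_PER_MSEC)
 *		== (10 << BW_SHIFT) / 100 ~= 104857	(~10% of BW_UNIT)
 *
 * Comparing such fixed-point ratios against BW_UNIT lets the admission
 * control code check "does this task set fit?" without floating point.
 */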
1744
540247fb 1745extern void init_entity_runnable_average(struct sched_entity *se);
2b8c41da 1746extern void post_init_entity_util_avg(struct sched_entity *se);
a75cdaa9 1747
76d92ac3
FW
1748#ifdef CONFIG_NO_HZ_FULL
1749extern bool sched_can_stop_tick(struct rq *rq);
d84b3131 1750extern int __init sched_tick_offload_init(void);
76d92ac3
FW
1751
1752/*
1753 * The tick may be needed by tasks in the runqueue depending on their policy
1754 * and requirements. If the tick is needed, send the target CPU an IPI to kick
1755 * it out of NOHZ mode if necessary.
1756 */
1757static inline void sched_update_tick_dependency(struct rq *rq)
1758{
1759 int cpu;
1760
1761 if (!tick_nohz_full_enabled())
1762 return;
1763
1764 cpu = cpu_of(rq);
1765
1766 if (!tick_nohz_full_cpu(cpu))
1767 return;
1768
1769 if (sched_can_stop_tick(rq))
1770 tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED);
1771 else
1772 tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
1773}
1774#else
d84b3131 1775static inline int sched_tick_offload_init(void) { return 0; }
76d92ac3
FW
1776static inline void sched_update_tick_dependency(struct rq *rq) { }
1777#endif
1778
72465447 1779static inline void add_nr_running(struct rq *rq, unsigned count)
029632fb 1780{
72465447
KT
1781 unsigned prev_nr = rq->nr_running;
1782
1783 rq->nr_running = prev_nr + count;
9f3660c2 1784
72465447 1785 if (prev_nr < 2 && rq->nr_running >= 2) {
4486edd1 1786#ifdef CONFIG_SMP
e90c8fe1
VS
1787 if (!READ_ONCE(rq->rd->overload))
1788 WRITE_ONCE(rq->rd->overload, 1);
4486edd1 1789#endif
4486edd1 1790 }
76d92ac3
FW
1791
1792 sched_update_tick_dependency(rq);
029632fb
PZ
1793}
1794
72465447 1795static inline void sub_nr_running(struct rq *rq, unsigned count)
029632fb 1796{
72465447 1797 rq->nr_running -= count;
76d92ac3
FW
1798 /* Re-evaluate whether the tick is still needed (NOHZ full): */
1799 sched_update_tick_dependency(rq);
029632fb
PZ
1800}
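/*
 * Usage sketch (illustrative, not verbatim in-tree code): a sched class's
 * enqueue path bumps the count once the task is actually on its runqueue,
 * with rq->lock held, roughly like:
 *
 *	static void enqueue_task_example(struct rq *rq, struct task_struct *p,
 *					 int flags)
 *	{
 *		... link p into the class's runqueue structure ...
 *		add_nr_running(rq, 1);
 *	}
 *
 * sub_nr_running() mirrors this on dequeue, so the root-domain overload flag
 * and the NOHZ-full tick dependency are re-evaluated on every change.
 */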
1801
029632fb
PZ
1802extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
1803extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
1804
1805extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
1806
029632fb
PZ
1807extern const_debug unsigned int sysctl_sched_nr_migrate;
1808extern const_debug unsigned int sysctl_sched_migration_cost;
1809
029632fb
PZ
1810#ifdef CONFIG_SCHED_HRTICK
1811
1812/*
1813 * Use hrtick when:
1814 * - enabled by features
1815 * - hrtimer is actually high res
1816 */
1817static inline int hrtick_enabled(struct rq *rq)
1818{
1819 if (!sched_feat(HRTICK))
1820 return 0;
1821 if (!cpu_active(cpu_of(rq)))
1822 return 0;
1823 return hrtimer_is_hres_active(&rq->hrtick_timer);
1824}
1825
1826void hrtick_start(struct rq *rq, u64 delay);
1827
b39e66ea
MG
1828#else
1829
1830static inline int hrtick_enabled(struct rq *rq)
1831{
1832 return 0;
1833}
1834
029632fb
PZ
1835#endif /* CONFIG_SCHED_HRTICK */
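/*
 * Usage sketch: callers gate hrtick_start() behind hrtick_enabled() so that
 * the !CONFIG_SCHED_HRTICK and low-resolution cases silently fall back to
 * the regular tick, e.g.:
 *
 *	if (hrtick_enabled(rq))
 *		hrtick_start(rq, delta);
 *
 * where 'delta' is the remaining time slice in nanoseconds.
 */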
1836
dfbca41f
PZ
1837#ifndef arch_scale_freq_capacity
1838static __always_inline
7673c8a4 1839unsigned long arch_scale_freq_capacity(int cpu)
dfbca41f
PZ
1840{
1841 return SCHED_CAPACITY_SCALE;
1842}
1843#endif
b5b4860d 1844
7e1a9208 1845#ifdef CONFIG_SMP
8cd5601c
MR
1846#ifndef arch_scale_cpu_capacity
1847static __always_inline
1848unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
1849{
e3279a2e 1850 if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
8cd5601c
MR
1851 return sd->smt_gain / sd->span_weight;
1852
1853 return SCHED_CAPACITY_SCALE;
1854}
1855#endif
029632fb 1856#else
7e1a9208
JL
1857#ifndef arch_scale_cpu_capacity
1858static __always_inline
1859unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
1860{
1861 return SCHED_CAPACITY_SCALE;
1862}
1863#endif
029632fb
PZ
1864#endif
1865
029632fb
PZ
1866#ifdef CONFIG_SMP
1867#ifdef CONFIG_PREEMPT
1868
1869static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
1870
1871/*
1872 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1873 * way at the expense of forcing extra atomic operations in all
1874 * invocations. This ensures that the double_lock is acquired using the
1875 * same underlying policy as the spinlock_t on this architecture, which
1876 * reduces latency compared to the unfair variant below. However, it
1877 * also adds more overhead and therefore may reduce throughput.
1878 */
1879static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1880 __releases(this_rq->lock)
1881 __acquires(busiest->lock)
1882 __acquires(this_rq->lock)
1883{
1884 raw_spin_unlock(&this_rq->lock);
1885 double_rq_lock(this_rq, busiest);
1886
1887 return 1;
1888}
1889
1890#else
1891/*
1892 * Unfair double_lock_balance: Optimizes throughput at the expense of
1893 * latency by eliminating extra atomic operations when the locks are
97fb7a0a
IM
1894 * already in proper order on entry. This favors lower CPU-ids and will
1895 * grant the double lock to lower CPUs over higher ids under contention,
029632fb
PZ
1896 * regardless of entry order into the function.
1897 */
1898static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1899 __releases(this_rq->lock)
1900 __acquires(busiest->lock)
1901 __acquires(this_rq->lock)
1902{
1903 int ret = 0;
1904
1905 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1906 if (busiest < this_rq) {
1907 raw_spin_unlock(&this_rq->lock);
1908 raw_spin_lock(&busiest->lock);
1909 raw_spin_lock_nested(&this_rq->lock,
1910 SINGLE_DEPTH_NESTING);
1911 ret = 1;
1912 } else
1913 raw_spin_lock_nested(&busiest->lock,
1914 SINGLE_DEPTH_NESTING);
1915 }
1916 return ret;
1917}
1918
1919#endif /* CONFIG_PREEMPT */
1920
1921/*
1922 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1923 */
1924static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1925{
1926 if (unlikely(!irqs_disabled())) {
97fb7a0a 1927 /* printk() doesn't work well under rq->lock */
029632fb
PZ
1928 raw_spin_unlock(&this_rq->lock);
1929 BUG_ON(1);
1930 }
1931
1932 return _double_lock_balance(this_rq, busiest);
1933}
1934
1935static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1936 __releases(busiest->lock)
1937{
1938 raw_spin_unlock(&busiest->lock);
1939 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1940}
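/*
 * Usage sketch (illustrative): a pull path that already holds this_rq->lock
 * acquires the second lock via double_lock_balance() and must revalidate its
 * assumptions afterwards, because both the fair variant and the unfair one
 * (on lock-order reversal) may have dropped this_rq->lock in between:
 *
 *	if (double_lock_balance(this_rq, busiest)) {
 *		... this_rq->lock was released; re-check the task/rq state ...
 *	}
 *	... migrate work from busiest towards this_rq ...
 *	double_unlock_balance(this_rq, busiest);
 */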
1941
74602315
PZ
1942static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
1943{
1944 if (l1 > l2)
1945 swap(l1, l2);
1946
1947 spin_lock(l1);
1948 spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1949}
1950
60e69eed
MG
1951static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2)
1952{
1953 if (l1 > l2)
1954 swap(l1, l2);
1955
1956 spin_lock_irq(l1);
1957 spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1958}
1959
74602315
PZ
1960static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
1961{
1962 if (l1 > l2)
1963 swap(l1, l2);
1964
1965 raw_spin_lock(l1);
1966 raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1967}
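/*
 * All three helpers above avoid ABBA deadlocks by always taking the lock at
 * the lower address first, so two CPUs locking the same pair in opposite
 * argument order still acquire them in one global order, e.g. (illustrative,
 * 'a' and 'b' being any two objects embedding a raw_spinlock_t):
 *
 *	double_raw_lock(&a->lock, &b->lock);
 *	...
 *	raw_spin_unlock(&b->lock);
 *	raw_spin_unlock(&a->lock);
 */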
1968
029632fb
PZ
1969/*
1970 * double_rq_lock - safely lock two runqueues
1971 *
1972 * Note this does not disable interrupts like task_rq_lock,
1973 * you need to do so manually before calling.
1974 */
1975static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1976 __acquires(rq1->lock)
1977 __acquires(rq2->lock)
1978{
1979 BUG_ON(!irqs_disabled());
1980 if (rq1 == rq2) {
1981 raw_spin_lock(&rq1->lock);
1982 __acquire(rq2->lock); /* Fake it out ;) */
1983 } else {
1984 if (rq1 < rq2) {
1985 raw_spin_lock(&rq1->lock);
1986 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1987 } else {
1988 raw_spin_lock(&rq2->lock);
1989 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1990 }
1991 }
1992}
1993
1994/*
1995 * double_rq_unlock - safely unlock two runqueues
1996 *
1997 * Note this does not restore interrupts like task_rq_unlock,
1998 * you need to do so manually after calling.
1999 */
2000static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2001 __releases(rq1->lock)
2002 __releases(rq2->lock)
2003{
2004 raw_spin_unlock(&rq1->lock);
2005 if (rq1 != rq2)
2006 raw_spin_unlock(&rq2->lock);
2007 else
2008 __release(rq2->lock);
2009}
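/*
 * Usage sketch (illustrative): since double_rq_lock() does not disable
 * interrupts itself, a caller that is not already in an atomic context pairs
 * it with explicit IRQ handling:
 *
 *	local_irq_save(flags);
 *	double_rq_lock(src_rq, dst_rq);
 *	... both runqueues are now stable ...
 *	double_rq_unlock(src_rq, dst_rq);
 *	local_irq_restore(flags);
 */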
2010
f2cb1360
IM
2011extern void set_rq_online (struct rq *rq);
2012extern void set_rq_offline(struct rq *rq);
2013extern bool sched_smp_initialized;
2014
029632fb
PZ
2015#else /* CONFIG_SMP */
2016
2017/*
2018 * double_rq_lock - safely lock two runqueues
2019 *
2020 * Note this does not disable interrupts like task_rq_lock,
2021 * you need to do so manually before calling.
2022 */
2023static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
2024 __acquires(rq1->lock)
2025 __acquires(rq2->lock)
2026{
2027 BUG_ON(!irqs_disabled());
2028 BUG_ON(rq1 != rq2);
2029 raw_spin_lock(&rq1->lock);
2030 __acquire(rq2->lock); /* Fake it out ;) */
2031}
2032
2033/*
2034 * double_rq_unlock - safely unlock two runqueues
2035 *
2036 * Note this does not restore interrupts like task_rq_unlock,
2037 * you need to do so manually after calling.
2038 */
2039static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2040 __releases(rq1->lock)
2041 __releases(rq2->lock)
2042{
2043 BUG_ON(rq1 != rq2);
2044 raw_spin_unlock(&rq1->lock);
2045 __release(rq2->lock);
2046}
2047
2048#endif
2049
2050extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
2051extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
6b55c965
SD
2052
2053#ifdef CONFIG_SCHED_DEBUG
9469eb01
PZ
2054extern bool sched_debug_enabled;
2055
029632fb
PZ
2056extern void print_cfs_stats(struct seq_file *m, int cpu);
2057extern void print_rt_stats(struct seq_file *m, int cpu);
acb32132 2058extern void print_dl_stats(struct seq_file *m, int cpu);
f6a34630
MM
2059extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
2060extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
2061extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
397f2378
SD
2062#ifdef CONFIG_NUMA_BALANCING
2063extern void
2064show_numa_stats(struct task_struct *p, struct seq_file *m);
2065extern void
2066print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
2067 unsigned long tpf, unsigned long gsf, unsigned long gpf);
2068#endif /* CONFIG_NUMA_BALANCING */
2069#endif /* CONFIG_SCHED_DEBUG */
029632fb
PZ
2070
2071extern void init_cfs_rq(struct cfs_rq *cfs_rq);
07c54f7a
AV
2072extern void init_rt_rq(struct rt_rq *rt_rq);
2073extern void init_dl_rq(struct dl_rq *dl_rq);
029632fb 2074
1ee14e6c
BS
2075extern void cfs_bandwidth_usage_inc(void);
2076extern void cfs_bandwidth_usage_dec(void);
1c792db7 2077
3451d024 2078#ifdef CONFIG_NO_HZ_COMMON
00357f5e
PZ
2079#define NOHZ_BALANCE_KICK_BIT 0
2080#define NOHZ_STATS_KICK_BIT 1
a22e47a4 2081
a22e47a4 2082#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
b7031a02
PZ
2083#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
2084
2085#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
1c792db7
SS
2086
2087#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
20a5c8cc 2088
00357f5e 2089extern void nohz_balance_exit_idle(struct rq *rq);
20a5c8cc 2090#else
00357f5e 2091static inline void nohz_balance_exit_idle(struct rq *rq) { }
1c792db7 2092#endif
73fbec60 2093
daec5798
LA
2094
2095#ifdef CONFIG_SMP
2096static inline
2097void __dl_update(struct dl_bw *dl_b, s64 bw)
2098{
2099 struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw);
2100 int i;
2101
2102 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
2103 "sched RCU must be held");
2104 for_each_cpu_and(i, rd->span, cpu_active_mask) {
2105 struct rq *rq = cpu_rq(i);
2106
2107 rq->dl.extra_bw += bw;
2108 }
2109}
2110#else
2111static inline
2112void __dl_update(struct dl_bw *dl_b, s64 bw)
2113{
2114 struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
2115
2116 dl->extra_bw += bw;
2117}
2118#endif
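/*
 * Note: __dl_update() fans a signed bandwidth delta out to every active CPU
 * of the root domain (or to the single dl_rq on UP), keeping each rq's
 * 'extra_bw' consistent for the bandwidth reclaiming logic when deadline
 * bandwidth is reserved or released.
 */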
2119
2120
73fbec60 2121#ifdef CONFIG_IRQ_TIME_ACCOUNTING
19d23dbf 2122struct irqtime {
25e2d8c1 2123 u64 total;
a499a5a1 2124 u64 tick_delta;
19d23dbf
FW
2125 u64 irq_start_time;
2126 struct u64_stats_sync sync;
2127};
73fbec60 2128
19d23dbf 2129DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
73fbec60 2130
25e2d8c1
FW
2131/*
2132 * Returns the irqtime minus the softirq time computed by ksoftirqd.
2133 * Otherwise ksoftirqd's sum_exec_runtime would have its own runtime
2134 * subtracted and would never move forward.
2135 */
73fbec60
FW
2136static inline u64 irq_time_read(int cpu)
2137{
19d23dbf
FW
2138 struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
2139 unsigned int seq;
2140 u64 total;
73fbec60
FW
2141
2142 do {
19d23dbf 2143 seq = __u64_stats_fetch_begin(&irqtime->sync);
25e2d8c1 2144 total = irqtime->total;
19d23dbf 2145 } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
73fbec60 2146
19d23dbf 2147 return total;
73fbec60 2148}
73fbec60 2149#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
adaf9fcd
RW
2150
2151#ifdef CONFIG_CPU_FREQ
2152DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
2153
2154/**
2155 * cpufreq_update_util - Take a note about CPU utilization changes.
12bde33d 2156 * @rq: Runqueue to carry out the update for.
58919e83 2157 * @flags: Update reason flags.
adaf9fcd 2158 *
58919e83
RW
2159 * This function is called by the scheduler on the CPU whose utilization is
2160 * being updated.
adaf9fcd
RW
2161 *
2162 * It can only be called from RCU-sched read-side critical sections.
adaf9fcd
RW
2163 *
2164 * The way cpufreq is currently arranged requires it to evaluate the CPU
2165 * performance state (frequency/voltage) on a regular basis to prevent it from
2166 * being stuck in a completely inadequate performance level for too long.
e0367b12
JL
2167 * That is not guaranteed to happen if the updates are only triggered from CFS
2168 * and DL, though, because they may not be coming in at all if only RT tasks
2169 * are active on the CPU.
adaf9fcd 2170 *
e0367b12
JL
2171 * As a workaround for that issue, this function is called periodically by the
2172 * RT sched class to trigger extra cpufreq updates to prevent it from stalling,
adaf9fcd 2173 * but that really is a band-aid. Going forward it should be replaced with
e0367b12 2174 * solutions targeted more specifically at RT tasks.
adaf9fcd 2175 */
12bde33d 2176static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
adaf9fcd 2177{
58919e83
RW
2178 struct update_util_data *data;
2179
674e7541
VK
2180 data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
2181 cpu_of(rq)));
58919e83 2182 if (data)
12bde33d
RW
2183 data->func(data, rq_clock(rq), flags);
2184}
adaf9fcd 2185#else
12bde33d 2186static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
adaf9fcd 2187#endif /* CONFIG_CPU_FREQ */
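/*
 * Usage sketch (illustrative): utilization-update hooks are invoked from
 * scheduler paths that already hold rq->lock (which implies an RCU-sched
 * read-side critical section), typically as:
 *
 *	cpufreq_update_util(rq, 0);
 *
 * or with a flag such as SCHED_CPUFREQ_IOWAIT when the update stems from an
 * iowait wakeup, so the governor can boost frequency accordingly.
 */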
be53f58f 2188
9bdcb44e 2189#ifdef arch_scale_freq_capacity
97fb7a0a
IM
2190# ifndef arch_scale_freq_invariant
2191# define arch_scale_freq_invariant() true
2192# endif
2193#else
2194# define arch_scale_freq_invariant() false
9bdcb44e 2195#endif
d4edd662 2196
794a56eb 2197#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
8cc90515 2198static inline unsigned long cpu_bw_dl(struct rq *rq)
d4edd662
JL
2199{
2200 return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
2201}
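/*
 * Worked example (illustrative): dl.running_bw is kept in BW_SHIFT fixed
 * point, so a reserved deadline bandwidth of 25% is stored as
 * 0.25 * BW_UNIT == 262144, and cpu_bw_dl() rescales it to the capacity
 * range:
 *
 *	(262144 * SCHED_CAPACITY_SCALE) >> BW_SHIFT
 *		== (262144 * 1024) >> 20 == 256		(25% of 1024)
 */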
2202
8cc90515
VG
2203static inline unsigned long cpu_util_dl(struct rq *rq)
2204{
2205 return READ_ONCE(rq->avg_dl.util_avg);
2206}
2207
d4edd662
JL
2208static inline unsigned long cpu_util_cfs(struct rq *rq)
2209{
a07630b8
PB
2210 unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);
2211
2212 if (sched_feat(UTIL_EST)) {
2213 util = max_t(unsigned long, util,
2214 READ_ONCE(rq->cfs.avg.util_est.enqueued));
2215 }
2216
2217 return util;
d4edd662 2218}
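/*
 * With UTIL_EST enabled this returns the larger of the decaying PELT average
 * and the estimated utilization of the enqueued tasks; e.g. util_avg == 300
 * with util_est.enqueued == 450 yields 450, so a periodic task whose PELT
 * signal decayed while it slept is not under-served right after wakeup.
 */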
371bf427
VG
2219
2220static inline unsigned long cpu_util_rt(struct rq *rq)
2221{
dfa444dc 2222 return READ_ONCE(rq->avg_rt.util_avg);
371bf427 2223}
2e62c474 2224#endif
9033ea11 2225
11d4afd4 2226#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
9033ea11
VG
2227static inline unsigned long cpu_util_irq(struct rq *rq)
2228{
2229 return rq->avg_irq.util_avg;
2230}
2e62c474
VG
2231
2232static inline
2233unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
2234{
2235 util *= (max - irq);
2236 util /= max;
2237
2238 return util;
2239
2240}
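/*
 * Worked example (illustrative): with max == 1024, irq == 256 and
 * util == 600, scale_irq_capacity() returns 600 * (1024 - 256) / 1024 == 450,
 * i.e. the CFS/RT utilization is scaled down by the fraction of capacity
 * already consumed by IRQ (and, where accounted, paravirt steal) time.
 */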
9033ea11
VG
2241#else
2242static inline unsigned long cpu_util_irq(struct rq *rq)
2243{
2244 return 0;
2245}
2246
2e62c474
VG
2247static inline
2248unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
2249{
2250 return util;
2251}
794a56eb 2252#endif