License cleanup: add SPDX GPL-2.0 license identifier to files with no license
[linux-block.git] / kernel / sched / rt.c
b2441318 1// SPDX-License-Identifier: GPL-2.0
bb44e5d1
IM
2/*
3 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
4 * policies)
5 */
6
029632fb
PZ
7#include "sched.h"
8
9#include <linux/slab.h>
b6366f04 10#include <linux/irq_work.h>
029632fb 11
ce0dbbbb 12int sched_rr_timeslice = RR_TIMESLICE;
975e155e 13int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
ce0dbbbb 14
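/*
 * Worked example (assuming the usual RR_TIMESLICE of 100 * HZ / 1000
 * jiffies from sched.h): with HZ == 250 the timeslice is 25 jiffies,
 * so sysctl_sched_rr_timeslice starts out as (1000 / 250) * 25 = 100,
 * i.e. the default SCHED_RR slice reported via the sysctl is 100 ms.
 */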
029632fb
PZ
15static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
16
17struct rt_bandwidth def_rt_bandwidth;
18
19static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
20{
21 struct rt_bandwidth *rt_b =
22 container_of(timer, struct rt_bandwidth, rt_period_timer);
029632fb 23 int idle = 0;
77a4d1a1 24 int overrun;
029632fb 25
77a4d1a1 26 raw_spin_lock(&rt_b->rt_runtime_lock);
029632fb 27 for (;;) {
77a4d1a1 28 overrun = hrtimer_forward_now(timer, rt_b->rt_period);
029632fb
PZ
29 if (!overrun)
30 break;
31
77a4d1a1 32 raw_spin_unlock(&rt_b->rt_runtime_lock);
029632fb 33 idle = do_sched_rt_period_timer(rt_b, overrun);
77a4d1a1 34 raw_spin_lock(&rt_b->rt_runtime_lock);
029632fb 35 }
4cfafd30
PZ
36 if (idle)
37 rt_b->rt_period_active = 0;
77a4d1a1 38 raw_spin_unlock(&rt_b->rt_runtime_lock);
029632fb
PZ
39
40 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
41}
42
43void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
44{
45 rt_b->rt_period = ns_to_ktime(period);
46 rt_b->rt_runtime = runtime;
47
48 raw_spin_lock_init(&rt_b->rt_runtime_lock);
49
50 hrtimer_init(&rt_b->rt_period_timer,
51 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
52 rt_b->rt_period_timer.function = sched_rt_period_timer;
53}
54
55static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
56{
57 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
58 return;
59
029632fb 60 raw_spin_lock(&rt_b->rt_runtime_lock);
4cfafd30
PZ
61 if (!rt_b->rt_period_active) {
62 rt_b->rt_period_active = 1;
c3a990dc
SR
63 /*
 64 * SCHED_DEADLINE updates the bandwidth, as a runaway
65 * RT task with a DL task could hog a CPU. But DL does
66 * not reset the period. If a deadline task was running
67 * without an RT task running, it can cause RT tasks to
68 * throttle when they start up. Kick the timer right away
69 * to update the period.
70 */
71 hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
4cfafd30
PZ
72 hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
73 }
029632fb
PZ
74 raw_spin_unlock(&rt_b->rt_runtime_lock);
75}
76
89b41108 77#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI)
b6366f04
SR
78static void push_irq_work_func(struct irq_work *work);
79#endif
80
07c54f7a 81void init_rt_rq(struct rt_rq *rt_rq)
029632fb
PZ
82{
83 struct rt_prio_array *array;
84 int i;
85
86 array = &rt_rq->active;
87 for (i = 0; i < MAX_RT_PRIO; i++) {
88 INIT_LIST_HEAD(array->queue + i);
89 __clear_bit(i, array->bitmap);
90 }
91 /* delimiter for bitsearch: */
92 __set_bit(MAX_RT_PRIO, array->bitmap);
93
94#if defined CONFIG_SMP
95 rt_rq->highest_prio.curr = MAX_RT_PRIO;
96 rt_rq->highest_prio.next = MAX_RT_PRIO;
97 rt_rq->rt_nr_migratory = 0;
98 rt_rq->overloaded = 0;
99 plist_head_init(&rt_rq->pushable_tasks);
b6366f04
SR
100
101#ifdef HAVE_RT_PUSH_IPI
102 rt_rq->push_flags = 0;
103 rt_rq->push_cpu = nr_cpu_ids;
104 raw_spin_lock_init(&rt_rq->push_lock);
105 init_irq_work(&rt_rq->push_work, push_irq_work_func);
029632fb 106#endif
b6366f04 107#endif /* CONFIG_SMP */
f4ebcbc0
KT
 108 /* We start in dequeued state, because no RT tasks are queued */
109 rt_rq->rt_queued = 0;
029632fb
PZ
110
111 rt_rq->rt_time = 0;
112 rt_rq->rt_throttled = 0;
113 rt_rq->rt_runtime = 0;
114 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
115}
116
8f48894f 117#ifdef CONFIG_RT_GROUP_SCHED
029632fb
PZ
118static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
119{
120 hrtimer_cancel(&rt_b->rt_period_timer);
121}
8f48894f
PZ
122
123#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
124
398a153b
GH
125static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
126{
8f48894f
PZ
127#ifdef CONFIG_SCHED_DEBUG
128 WARN_ON_ONCE(!rt_entity_is_task(rt_se));
129#endif
398a153b
GH
130 return container_of(rt_se, struct task_struct, rt);
131}
132
398a153b
GH
133static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
134{
135 return rt_rq->rq;
136}
137
138static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
139{
140 return rt_se->rt_rq;
141}
142
653d07a6
KT
143static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
144{
145 struct rt_rq *rt_rq = rt_se->rt_rq;
146
147 return rt_rq->rq;
148}
149
029632fb
PZ
150void free_rt_sched_group(struct task_group *tg)
151{
152 int i;
153
154 if (tg->rt_se)
155 destroy_rt_bandwidth(&tg->rt_bandwidth);
156
157 for_each_possible_cpu(i) {
158 if (tg->rt_rq)
159 kfree(tg->rt_rq[i]);
160 if (tg->rt_se)
161 kfree(tg->rt_se[i]);
162 }
163
164 kfree(tg->rt_rq);
165 kfree(tg->rt_se);
166}
167
168void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
169 struct sched_rt_entity *rt_se, int cpu,
170 struct sched_rt_entity *parent)
171{
172 struct rq *rq = cpu_rq(cpu);
173
174 rt_rq->highest_prio.curr = MAX_RT_PRIO;
175 rt_rq->rt_nr_boosted = 0;
176 rt_rq->rq = rq;
177 rt_rq->tg = tg;
178
179 tg->rt_rq[cpu] = rt_rq;
180 tg->rt_se[cpu] = rt_se;
181
182 if (!rt_se)
183 return;
184
185 if (!parent)
186 rt_se->rt_rq = &rq->rt;
187 else
188 rt_se->rt_rq = parent->my_q;
189
190 rt_se->my_q = rt_rq;
191 rt_se->parent = parent;
192 INIT_LIST_HEAD(&rt_se->run_list);
193}
194
195int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
196{
197 struct rt_rq *rt_rq;
198 struct sched_rt_entity *rt_se;
199 int i;
200
201 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
202 if (!tg->rt_rq)
203 goto err;
204 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
205 if (!tg->rt_se)
206 goto err;
207
208 init_rt_bandwidth(&tg->rt_bandwidth,
209 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
210
211 for_each_possible_cpu(i) {
212 rt_rq = kzalloc_node(sizeof(struct rt_rq),
213 GFP_KERNEL, cpu_to_node(i));
214 if (!rt_rq)
215 goto err;
216
217 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
218 GFP_KERNEL, cpu_to_node(i));
219 if (!rt_se)
220 goto err_free_rq;
221
07c54f7a 222 init_rt_rq(rt_rq);
029632fb
PZ
223 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
224 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
225 }
226
227 return 1;
228
229err_free_rq:
230 kfree(rt_rq);
231err:
232 return 0;
233}
234
398a153b
GH
235#else /* CONFIG_RT_GROUP_SCHED */
236
a1ba4d8b
PZ
237#define rt_entity_is_task(rt_se) (1)
238
8f48894f
PZ
239static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
240{
241 return container_of(rt_se, struct task_struct, rt);
242}
243
398a153b
GH
244static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
245{
246 return container_of(rt_rq, struct rq, rt);
247}
248
653d07a6 249static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
398a153b
GH
250{
251 struct task_struct *p = rt_task_of(rt_se);
653d07a6
KT
252
253 return task_rq(p);
254}
255
256static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
257{
258 struct rq *rq = rq_of_rt_se(rt_se);
398a153b
GH
259
260 return &rq->rt;
261}
262
029632fb
PZ
263void free_rt_sched_group(struct task_group *tg) { }
264
265int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
266{
267 return 1;
268}
398a153b
GH
269#endif /* CONFIG_RT_GROUP_SCHED */
270
4fd29176 271#ifdef CONFIG_SMP
84de4274 272
8046d680 273static void pull_rt_task(struct rq *this_rq);
38033c37 274
dc877341
PZ
275static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
276{
277 /* Try to pull RT tasks here if we lower this rq's prio */
278 return rq->rt.highest_prio.curr > prev->prio;
279}
280
637f5085 281static inline int rt_overloaded(struct rq *rq)
4fd29176 282{
637f5085 283 return atomic_read(&rq->rd->rto_count);
4fd29176 284}
84de4274 285
4fd29176
SR
286static inline void rt_set_overload(struct rq *rq)
287{
1f11eb6a
GH
288 if (!rq->online)
289 return;
290
c6c4927b 291 cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
4fd29176
SR
292 /*
293 * Make sure the mask is visible before we set
294 * the overload count. That is checked to determine
295 * if we should look at the mask. It would be a shame
296 * if we looked at the mask, but the mask was not
297 * updated yet.
7c3f2ab7
PZ
298 *
299 * Matched by the barrier in pull_rt_task().
4fd29176 300 */
7c3f2ab7 301 smp_wmb();
637f5085 302 atomic_inc(&rq->rd->rto_count);
4fd29176 303}
84de4274 304
4fd29176
SR
305static inline void rt_clear_overload(struct rq *rq)
306{
1f11eb6a
GH
307 if (!rq->online)
308 return;
309
4fd29176 310 /* the order here really doesn't matter */
637f5085 311 atomic_dec(&rq->rd->rto_count);
c6c4927b 312 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
4fd29176 313}
73fe6aae 314
398a153b 315static void update_rt_migration(struct rt_rq *rt_rq)
73fe6aae 316{
a1ba4d8b 317 if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
398a153b
GH
318 if (!rt_rq->overloaded) {
319 rt_set_overload(rq_of_rt_rq(rt_rq));
320 rt_rq->overloaded = 1;
cdc8eb98 321 }
398a153b
GH
322 } else if (rt_rq->overloaded) {
323 rt_clear_overload(rq_of_rt_rq(rt_rq));
324 rt_rq->overloaded = 0;
637f5085 325 }
73fe6aae 326}
4fd29176 327
398a153b
GH
328static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
329{
29baa747
PZ
330 struct task_struct *p;
331
a1ba4d8b
PZ
332 if (!rt_entity_is_task(rt_se))
333 return;
334
29baa747 335 p = rt_task_of(rt_se);
a1ba4d8b
PZ
336 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
337
338 rt_rq->rt_nr_total++;
4b53a341 339 if (p->nr_cpus_allowed > 1)
398a153b
GH
340 rt_rq->rt_nr_migratory++;
341
342 update_rt_migration(rt_rq);
343}
344
345static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
346{
29baa747
PZ
347 struct task_struct *p;
348
a1ba4d8b
PZ
349 if (!rt_entity_is_task(rt_se))
350 return;
351
29baa747 352 p = rt_task_of(rt_se);
a1ba4d8b
PZ
353 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
354
355 rt_rq->rt_nr_total--;
4b53a341 356 if (p->nr_cpus_allowed > 1)
398a153b
GH
357 rt_rq->rt_nr_migratory--;
358
359 update_rt_migration(rt_rq);
360}
361
5181f4a4
SR
362static inline int has_pushable_tasks(struct rq *rq)
363{
364 return !plist_head_empty(&rq->rt.pushable_tasks);
365}
366
fd7a4bed
PZ
367static DEFINE_PER_CPU(struct callback_head, rt_push_head);
368static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
e3fca9e7
PZ
369
370static void push_rt_tasks(struct rq *);
fd7a4bed 371static void pull_rt_task(struct rq *);
e3fca9e7
PZ
372
373static inline void queue_push_tasks(struct rq *rq)
dc877341 374{
e3fca9e7
PZ
375 if (!has_pushable_tasks(rq))
376 return;
377
fd7a4bed
PZ
378 queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
379}
380
381static inline void queue_pull_task(struct rq *rq)
382{
383 queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
dc877341
PZ
384}
385
917b627d
GH
386static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
387{
388 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
389 plist_node_init(&p->pushable_tasks, p->prio);
390 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
5181f4a4
SR
391
392 /* Update the highest prio pushable task */
393 if (p->prio < rq->rt.highest_prio.next)
394 rq->rt.highest_prio.next = p->prio;
917b627d
GH
395}
396
397static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
398{
399 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
917b627d 400
5181f4a4
SR
401 /* Update the new highest prio pushable task */
402 if (has_pushable_tasks(rq)) {
403 p = plist_first_entry(&rq->rt.pushable_tasks,
404 struct task_struct, pushable_tasks);
405 rq->rt.highest_prio.next = p->prio;
406 } else
407 rq->rt.highest_prio.next = MAX_RT_PRIO;
bcf08df3
IM
408}
409
917b627d
GH
410#else
411
ceacc2c1 412static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
fa85ae24 413{
6f505b16
PZ
414}
415
ceacc2c1
PZ
416static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
417{
418}
419
b07430ac 420static inline
ceacc2c1
PZ
421void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
422{
423}
424
398a153b 425static inline
ceacc2c1
PZ
426void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
427{
428}
917b627d 429
dc877341
PZ
430static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
431{
432 return false;
433}
434
8046d680 435static inline void pull_rt_task(struct rq *this_rq)
dc877341 436{
dc877341
PZ
437}
438
e3fca9e7 439static inline void queue_push_tasks(struct rq *rq)
dc877341
PZ
440{
441}
4fd29176
SR
442#endif /* CONFIG_SMP */
443
f4ebcbc0
KT
444static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
445static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
446
6f505b16
PZ
447static inline int on_rt_rq(struct sched_rt_entity *rt_se)
448{
ff77e468 449 return rt_se->on_rq;
6f505b16
PZ
450}
451
052f1dc7 452#ifdef CONFIG_RT_GROUP_SCHED
6f505b16 453
9f0c1e56 454static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
6f505b16
PZ
455{
456 if (!rt_rq->tg)
9f0c1e56 457 return RUNTIME_INF;
6f505b16 458
ac086bc2
PZ
459 return rt_rq->rt_runtime;
460}
461
462static inline u64 sched_rt_period(struct rt_rq *rt_rq)
463{
464 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
6f505b16
PZ
465}
466
ec514c48
CX
467typedef struct task_group *rt_rq_iter_t;
468
1c09ab0d
YZ
469static inline struct task_group *next_task_group(struct task_group *tg)
470{
471 do {
472 tg = list_entry_rcu(tg->list.next,
473 typeof(struct task_group), list);
474 } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
475
476 if (&tg->list == &task_groups)
477 tg = NULL;
478
479 return tg;
480}
481
482#define for_each_rt_rq(rt_rq, iter, rq) \
483 for (iter = container_of(&task_groups, typeof(*iter), list); \
484 (iter = next_task_group(iter)) && \
485 (rt_rq = iter->rt_rq[cpu_of(rq)]);)
ec514c48 486
6f505b16
PZ
487#define for_each_sched_rt_entity(rt_se) \
488 for (; rt_se; rt_se = rt_se->parent)
489
490static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
491{
492 return rt_se->my_q;
493}
494
ff77e468
PZ
495static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
496static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
6f505b16 497
9f0c1e56 498static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
6f505b16 499{
f6121f4f 500 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
8875125e 501 struct rq *rq = rq_of_rt_rq(rt_rq);
74b7eb58
YZ
502 struct sched_rt_entity *rt_se;
503
8875125e 504 int cpu = cpu_of(rq);
0c3b9168
BS
505
506 rt_se = rt_rq->tg->rt_se[cpu];
6f505b16 507
f6121f4f 508 if (rt_rq->rt_nr_running) {
f4ebcbc0
KT
509 if (!rt_se)
510 enqueue_top_rt_rq(rt_rq);
511 else if (!on_rt_rq(rt_se))
ff77e468 512 enqueue_rt_entity(rt_se, 0);
f4ebcbc0 513
e864c499 514 if (rt_rq->highest_prio.curr < curr->prio)
8875125e 515 resched_curr(rq);
6f505b16
PZ
516 }
517}
518
9f0c1e56 519static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
6f505b16 520{
74b7eb58 521 struct sched_rt_entity *rt_se;
0c3b9168 522 int cpu = cpu_of(rq_of_rt_rq(rt_rq));
74b7eb58 523
0c3b9168 524 rt_se = rt_rq->tg->rt_se[cpu];
6f505b16 525
f4ebcbc0
KT
526 if (!rt_se)
527 dequeue_top_rt_rq(rt_rq);
528 else if (on_rt_rq(rt_se))
ff77e468 529 dequeue_rt_entity(rt_se, 0);
6f505b16
PZ
530}
531
46383648
KT
532static inline int rt_rq_throttled(struct rt_rq *rt_rq)
533{
534 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
535}
536
23b0fdfc
PZ
537static int rt_se_boosted(struct sched_rt_entity *rt_se)
538{
539 struct rt_rq *rt_rq = group_rt_rq(rt_se);
540 struct task_struct *p;
541
542 if (rt_rq)
543 return !!rt_rq->rt_nr_boosted;
544
545 p = rt_task_of(rt_se);
546 return p->prio != p->normal_prio;
547}
548
d0b27fa7 549#ifdef CONFIG_SMP
c6c4927b 550static inline const struct cpumask *sched_rt_period_mask(void)
d0b27fa7 551{
424c93fe 552 return this_rq()->rd->span;
d0b27fa7 553}
6f505b16 554#else
c6c4927b 555static inline const struct cpumask *sched_rt_period_mask(void)
d0b27fa7 556{
c6c4927b 557 return cpu_online_mask;
d0b27fa7
PZ
558}
559#endif
6f505b16 560
d0b27fa7
PZ
561static inline
562struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
6f505b16 563{
d0b27fa7
PZ
564 return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
565}
9f0c1e56 566
ac086bc2
PZ
567static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
568{
569 return &rt_rq->tg->rt_bandwidth;
570}
571
55e12e5e 572#else /* !CONFIG_RT_GROUP_SCHED */
d0b27fa7
PZ
573
574static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
575{
ac086bc2
PZ
576 return rt_rq->rt_runtime;
577}
578
579static inline u64 sched_rt_period(struct rt_rq *rt_rq)
580{
581 return ktime_to_ns(def_rt_bandwidth.rt_period);
6f505b16
PZ
582}
583
ec514c48
CX
584typedef struct rt_rq *rt_rq_iter_t;
585
586#define for_each_rt_rq(rt_rq, iter, rq) \
587 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
588
6f505b16
PZ
589#define for_each_sched_rt_entity(rt_se) \
590 for (; rt_se; rt_se = NULL)
591
592static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
593{
594 return NULL;
595}
596
9f0c1e56 597static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
6f505b16 598{
f4ebcbc0
KT
599 struct rq *rq = rq_of_rt_rq(rt_rq);
600
601 if (!rt_rq->rt_nr_running)
602 return;
603
604 enqueue_top_rt_rq(rt_rq);
8875125e 605 resched_curr(rq);
6f505b16
PZ
606}
607
9f0c1e56 608static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
6f505b16 609{
f4ebcbc0 610 dequeue_top_rt_rq(rt_rq);
6f505b16
PZ
611}
612
46383648
KT
613static inline int rt_rq_throttled(struct rt_rq *rt_rq)
614{
615 return rt_rq->rt_throttled;
616}
617
c6c4927b 618static inline const struct cpumask *sched_rt_period_mask(void)
d0b27fa7 619{
c6c4927b 620 return cpu_online_mask;
d0b27fa7
PZ
621}
622
623static inline
624struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
625{
626 return &cpu_rq(cpu)->rt;
627}
628
ac086bc2
PZ
629static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
630{
631 return &def_rt_bandwidth;
632}
633
55e12e5e 634#endif /* CONFIG_RT_GROUP_SCHED */
d0b27fa7 635
faa59937
JL
636bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
637{
638 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
639
640 return (hrtimer_active(&rt_b->rt_period_timer) ||
641 rt_rq->rt_time < rt_b->rt_runtime);
642}
643
ac086bc2 644#ifdef CONFIG_SMP
78333cdd
PZ
645/*
646 * We ran out of runtime, see if we can borrow some from our neighbours.
647 */
269b26a5 648static void do_balance_runtime(struct rt_rq *rt_rq)
ac086bc2
PZ
649{
650 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
aa7f6730 651 struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
269b26a5 652 int i, weight;
ac086bc2
PZ
653 u64 rt_period;
654
c6c4927b 655 weight = cpumask_weight(rd->span);
ac086bc2 656
0986b11b 657 raw_spin_lock(&rt_b->rt_runtime_lock);
ac086bc2 658 rt_period = ktime_to_ns(rt_b->rt_period);
c6c4927b 659 for_each_cpu(i, rd->span) {
ac086bc2
PZ
660 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
661 s64 diff;
662
663 if (iter == rt_rq)
664 continue;
665
0986b11b 666 raw_spin_lock(&iter->rt_runtime_lock);
78333cdd
PZ
667 /*
668 * Either all rqs have inf runtime and there's nothing to steal
669 * or __disable_runtime() below sets a specific rq to inf to
 670 * indicate it's been disabled and disallow stealing.
671 */
7def2be1
PZ
672 if (iter->rt_runtime == RUNTIME_INF)
673 goto next;
674
78333cdd
PZ
675 /*
676 * From runqueues with spare time, take 1/n part of their
677 * spare time, but no more than our period.
678 */
ac086bc2
PZ
679 diff = iter->rt_runtime - iter->rt_time;
680 if (diff > 0) {
58838cf3 681 diff = div_u64((u64)diff, weight);
ac086bc2
PZ
682 if (rt_rq->rt_runtime + diff > rt_period)
683 diff = rt_period - rt_rq->rt_runtime;
684 iter->rt_runtime -= diff;
685 rt_rq->rt_runtime += diff;
ac086bc2 686 if (rt_rq->rt_runtime == rt_period) {
0986b11b 687 raw_spin_unlock(&iter->rt_runtime_lock);
ac086bc2
PZ
688 break;
689 }
690 }
7def2be1 691next:
0986b11b 692 raw_spin_unlock(&iter->rt_runtime_lock);
ac086bc2 693 }
0986b11b 694 raw_spin_unlock(&rt_b->rt_runtime_lock);
ac086bc2 695}
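/*
 * Illustrative numbers (hypothetical, not from the original source):
 * with weight == 4 CPUs in the root domain and a neighbour that still
 * has 850 ms of unused runtime (rt_runtime - rt_time), the loop above
 * moves 850 / 4 = 212.5 ms to the starved rt_rq, capped so that the
 * borrower's rt_runtime never exceeds rt_period.
 */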
7def2be1 696
78333cdd
PZ
697/*
 698 * Ensure this RQ takes back all the runtime it lent to its neighbours.
699 */
7def2be1
PZ
700static void __disable_runtime(struct rq *rq)
701{
702 struct root_domain *rd = rq->rd;
ec514c48 703 rt_rq_iter_t iter;
7def2be1
PZ
704 struct rt_rq *rt_rq;
705
706 if (unlikely(!scheduler_running))
707 return;
708
ec514c48 709 for_each_rt_rq(rt_rq, iter, rq) {
7def2be1
PZ
710 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
711 s64 want;
712 int i;
713
0986b11b
TG
714 raw_spin_lock(&rt_b->rt_runtime_lock);
715 raw_spin_lock(&rt_rq->rt_runtime_lock);
78333cdd
PZ
716 /*
717 * Either we're all inf and nobody needs to borrow, or we're
718 * already disabled and thus have nothing to do, or we have
719 * exactly the right amount of runtime to take out.
720 */
7def2be1
PZ
721 if (rt_rq->rt_runtime == RUNTIME_INF ||
722 rt_rq->rt_runtime == rt_b->rt_runtime)
723 goto balanced;
0986b11b 724 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7def2be1 725
78333cdd
PZ
726 /*
727 * Calculate the difference between what we started out with
 728 * and what we currently have; that's the amount of runtime
 729 * we lent and now have to reclaim.
730 */
7def2be1
PZ
731 want = rt_b->rt_runtime - rt_rq->rt_runtime;
732
78333cdd
PZ
733 /*
734 * Greedy reclaim, take back as much as we can.
735 */
c6c4927b 736 for_each_cpu(i, rd->span) {
7def2be1
PZ
737 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
738 s64 diff;
739
78333cdd
PZ
740 /*
741 * Can't reclaim from ourselves or disabled runqueues.
742 */
f1679d08 743 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
7def2be1
PZ
744 continue;
745
0986b11b 746 raw_spin_lock(&iter->rt_runtime_lock);
7def2be1
PZ
747 if (want > 0) {
748 diff = min_t(s64, iter->rt_runtime, want);
749 iter->rt_runtime -= diff;
750 want -= diff;
751 } else {
752 iter->rt_runtime -= want;
753 want -= want;
754 }
0986b11b 755 raw_spin_unlock(&iter->rt_runtime_lock);
7def2be1
PZ
756
757 if (!want)
758 break;
759 }
760
0986b11b 761 raw_spin_lock(&rt_rq->rt_runtime_lock);
78333cdd
PZ
762 /*
763 * We cannot be left wanting - that would mean some runtime
764 * leaked out of the system.
765 */
7def2be1
PZ
766 BUG_ON(want);
767balanced:
78333cdd
PZ
768 /*
769 * Disable all the borrow logic by pretending we have inf
770 * runtime - in which case borrowing doesn't make sense.
771 */
7def2be1 772 rt_rq->rt_runtime = RUNTIME_INF;
a4c96ae3 773 rt_rq->rt_throttled = 0;
0986b11b
TG
774 raw_spin_unlock(&rt_rq->rt_runtime_lock);
775 raw_spin_unlock(&rt_b->rt_runtime_lock);
99b62567
KT
776
777 /* Make rt_rq available for pick_next_task() */
778 sched_rt_rq_enqueue(rt_rq);
7def2be1
PZ
779 }
780}
781
7def2be1
PZ
782static void __enable_runtime(struct rq *rq)
783{
ec514c48 784 rt_rq_iter_t iter;
7def2be1
PZ
785 struct rt_rq *rt_rq;
786
787 if (unlikely(!scheduler_running))
788 return;
789
78333cdd
PZ
790 /*
791 * Reset each runqueue's bandwidth settings
792 */
ec514c48 793 for_each_rt_rq(rt_rq, iter, rq) {
7def2be1
PZ
794 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
795
0986b11b
TG
796 raw_spin_lock(&rt_b->rt_runtime_lock);
797 raw_spin_lock(&rt_rq->rt_runtime_lock);
7def2be1
PZ
798 rt_rq->rt_runtime = rt_b->rt_runtime;
799 rt_rq->rt_time = 0;
baf25731 800 rt_rq->rt_throttled = 0;
0986b11b
TG
801 raw_spin_unlock(&rt_rq->rt_runtime_lock);
802 raw_spin_unlock(&rt_b->rt_runtime_lock);
7def2be1
PZ
803 }
804}
805
269b26a5 806static void balance_runtime(struct rt_rq *rt_rq)
eff6549b 807{
4a6184ce 808 if (!sched_feat(RT_RUNTIME_SHARE))
269b26a5 809 return;
4a6184ce 810
eff6549b 811 if (rt_rq->rt_time > rt_rq->rt_runtime) {
0986b11b 812 raw_spin_unlock(&rt_rq->rt_runtime_lock);
269b26a5 813 do_balance_runtime(rt_rq);
0986b11b 814 raw_spin_lock(&rt_rq->rt_runtime_lock);
eff6549b 815 }
eff6549b 816}
55e12e5e 817#else /* !CONFIG_SMP */
269b26a5 818static inline void balance_runtime(struct rt_rq *rt_rq) {}
55e12e5e 819#endif /* CONFIG_SMP */
ac086bc2 820
eff6549b
PZ
821static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
822{
42c62a58 823 int i, idle = 1, throttled = 0;
c6c4927b 824 const struct cpumask *span;
eff6549b 825
eff6549b 826 span = sched_rt_period_mask();
e221d028
MG
827#ifdef CONFIG_RT_GROUP_SCHED
828 /*
829 * FIXME: isolated CPUs should really leave the root task group,
830 * whether they are isolcpus or were isolated via cpusets, lest
831 * the timer run on a CPU which does not service all runqueues,
832 * potentially leaving other CPUs indefinitely throttled. If
833 * isolation is really required, the user will turn the throttle
834 * off to kill the perturbations it causes anyway. Meanwhile,
835 * this maintains functionality for boot and/or troubleshooting.
836 */
837 if (rt_b == &root_task_group.rt_bandwidth)
838 span = cpu_online_mask;
839#endif
c6c4927b 840 for_each_cpu(i, span) {
eff6549b
PZ
841 int enqueue = 0;
842 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
843 struct rq *rq = rq_of_rt_rq(rt_rq);
c249f255
DK
844 int skip;
845
846 /*
847 * When span == cpu_online_mask, taking each rq->lock
848 * can be time-consuming. Try to avoid it when possible.
849 */
850 raw_spin_lock(&rt_rq->rt_runtime_lock);
851 skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
852 raw_spin_unlock(&rt_rq->rt_runtime_lock);
853 if (skip)
854 continue;
eff6549b 855
05fa785c 856 raw_spin_lock(&rq->lock);
eff6549b
PZ
857 if (rt_rq->rt_time) {
858 u64 runtime;
859
0986b11b 860 raw_spin_lock(&rt_rq->rt_runtime_lock);
eff6549b
PZ
861 if (rt_rq->rt_throttled)
862 balance_runtime(rt_rq);
863 runtime = rt_rq->rt_runtime;
864 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
865 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
866 rt_rq->rt_throttled = 0;
867 enqueue = 1;
61eadef6
MG
868
869 /*
9edfbfed
PZ
870 * When we're idle and a woken (rt) task is
871 * throttled check_preempt_curr() will set
872 * skip_update and the time between the wakeup
873 * and this unthrottle will get accounted as
874 * 'runtime'.
61eadef6
MG
875 */
876 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
9edfbfed 877 rq_clock_skip_update(rq, false);
eff6549b
PZ
878 }
879 if (rt_rq->rt_time || rt_rq->rt_nr_running)
880 idle = 0;
0986b11b 881 raw_spin_unlock(&rt_rq->rt_runtime_lock);
0c3b9168 882 } else if (rt_rq->rt_nr_running) {
6c3df255 883 idle = 0;
0c3b9168
BS
884 if (!rt_rq_throttled(rt_rq))
885 enqueue = 1;
886 }
42c62a58
PZ
887 if (rt_rq->rt_throttled)
888 throttled = 1;
eff6549b
PZ
889
890 if (enqueue)
891 sched_rt_rq_enqueue(rt_rq);
05fa785c 892 raw_spin_unlock(&rq->lock);
eff6549b
PZ
893 }
894
42c62a58
PZ
895 if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
896 return 1;
897
eff6549b
PZ
898 return idle;
899}
ac086bc2 900
6f505b16
PZ
901static inline int rt_se_prio(struct sched_rt_entity *rt_se)
902{
052f1dc7 903#ifdef CONFIG_RT_GROUP_SCHED
6f505b16
PZ
904 struct rt_rq *rt_rq = group_rt_rq(rt_se);
905
906 if (rt_rq)
e864c499 907 return rt_rq->highest_prio.curr;
6f505b16
PZ
908#endif
909
910 return rt_task_of(rt_se)->prio;
911}
912
9f0c1e56 913static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
6f505b16 914{
9f0c1e56 915 u64 runtime = sched_rt_runtime(rt_rq);
fa85ae24 916
fa85ae24 917 if (rt_rq->rt_throttled)
23b0fdfc 918 return rt_rq_throttled(rt_rq);
fa85ae24 919
5b680fd6 920 if (runtime >= sched_rt_period(rt_rq))
ac086bc2
PZ
921 return 0;
922
b79f3833
PZ
923 balance_runtime(rt_rq);
924 runtime = sched_rt_runtime(rt_rq);
925 if (runtime == RUNTIME_INF)
926 return 0;
ac086bc2 927
9f0c1e56 928 if (rt_rq->rt_time > runtime) {
7abc63b1
PZ
929 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
930
931 /*
932 * Don't actually throttle groups that have no runtime assigned
933 * but accrue some time due to boosting.
934 */
935 if (likely(rt_b->rt_runtime)) {
936 rt_rq->rt_throttled = 1;
c224815d 937 printk_deferred_once("sched: RT throttling activated\n");
7abc63b1
PZ
938 } else {
939 /*
940 * In case we did anyway, make it go away,
941 * replenishment is a joke, since it will replenish us
942 * with exactly 0 ns.
943 */
944 rt_rq->rt_time = 0;
945 }
946
23b0fdfc 947 if (rt_rq_throttled(rt_rq)) {
9f0c1e56 948 sched_rt_rq_dequeue(rt_rq);
23b0fdfc
PZ
949 return 1;
950 }
fa85ae24
PZ
951 }
952
953 return 0;
954}
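/*
 * Rough picture, assuming the usual defaults of sched_rt_runtime_us =
 * 950000 and sched_rt_period_us = 1000000: once an rt_rq accumulates
 * more than 0.95 s of rt_time within a 1 s period and cannot borrow
 * enough from its neighbours, rt_throttled is set, the rt_rq is
 * dequeued and its tasks stop running until do_sched_rt_period_timer()
 * replenishes the runtime at the next period boundary.
 */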
955
bb44e5d1
IM
956/*
957 * Update the current task's runtime statistics. Skip current tasks that
958 * are not in our scheduling class.
959 */
a9957449 960static void update_curr_rt(struct rq *rq)
bb44e5d1
IM
961{
962 struct task_struct *curr = rq->curr;
6f505b16 963 struct sched_rt_entity *rt_se = &curr->rt;
bb44e5d1
IM
964 u64 delta_exec;
965
06c3bc65 966 if (curr->sched_class != &rt_sched_class)
bb44e5d1
IM
967 return;
968
78becc27 969 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
fc79e240
KT
970 if (unlikely((s64)delta_exec <= 0))
971 return;
6cfb0d5d 972
58919e83 973 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
674e7541 974 cpufreq_update_util(rq, SCHED_CPUFREQ_RT);
594dd290 975
42c62a58
PZ
976 schedstat_set(curr->se.statistics.exec_max,
977 max(curr->se.statistics.exec_max, delta_exec));
bb44e5d1
IM
978
979 curr->se.sum_exec_runtime += delta_exec;
f06febc9
FM
980 account_group_exec_runtime(curr, delta_exec);
981
78becc27 982 curr->se.exec_start = rq_clock_task(rq);
d842de87 983 cpuacct_charge(curr, delta_exec);
fa85ae24 984
e9e9250b
PZ
985 sched_rt_avg_update(rq, delta_exec);
986
0b148fa0
PZ
987 if (!rt_bandwidth_enabled())
988 return;
989
354d60c2 990 for_each_sched_rt_entity(rt_se) {
0b07939c 991 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
354d60c2 992
cc2991cf 993 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
0986b11b 994 raw_spin_lock(&rt_rq->rt_runtime_lock);
cc2991cf
PZ
995 rt_rq->rt_time += delta_exec;
996 if (sched_rt_runtime_exceeded(rt_rq))
8875125e 997 resched_curr(rq);
0986b11b 998 raw_spin_unlock(&rt_rq->rt_runtime_lock);
cc2991cf 999 }
354d60c2 1000 }
bb44e5d1
IM
1001}
1002
f4ebcbc0
KT
1003static void
1004dequeue_top_rt_rq(struct rt_rq *rt_rq)
1005{
1006 struct rq *rq = rq_of_rt_rq(rt_rq);
1007
1008 BUG_ON(&rq->rt != rt_rq);
1009
1010 if (!rt_rq->rt_queued)
1011 return;
1012
1013 BUG_ON(!rq->nr_running);
1014
72465447 1015 sub_nr_running(rq, rt_rq->rt_nr_running);
f4ebcbc0
KT
1016 rt_rq->rt_queued = 0;
1017}
1018
1019static void
1020enqueue_top_rt_rq(struct rt_rq *rt_rq)
1021{
1022 struct rq *rq = rq_of_rt_rq(rt_rq);
1023
1024 BUG_ON(&rq->rt != rt_rq);
1025
1026 if (rt_rq->rt_queued)
1027 return;
1028 if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
1029 return;
1030
72465447 1031 add_nr_running(rq, rt_rq->rt_nr_running);
f4ebcbc0
KT
1032 rt_rq->rt_queued = 1;
1033}
1034
398a153b 1035#if defined CONFIG_SMP
e864c499 1036
398a153b
GH
1037static void
1038inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
63489e45 1039{
4d984277 1040 struct rq *rq = rq_of_rt_rq(rt_rq);
1f11eb6a 1041
757dfcaa
KT
1042#ifdef CONFIG_RT_GROUP_SCHED
1043 /*
1044 * Change rq's cpupri only if rt_rq is the top queue.
1045 */
1046 if (&rq->rt != rt_rq)
1047 return;
1048#endif
5181f4a4
SR
1049 if (rq->online && prio < prev_prio)
1050 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
398a153b 1051}
73fe6aae 1052
398a153b
GH
1053static void
1054dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1055{
1056 struct rq *rq = rq_of_rt_rq(rt_rq);
d0b27fa7 1057
757dfcaa
KT
1058#ifdef CONFIG_RT_GROUP_SCHED
1059 /*
1060 * Change rq's cpupri only if rt_rq is the top queue.
1061 */
1062 if (&rq->rt != rt_rq)
1063 return;
1064#endif
398a153b
GH
1065 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
1066 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
63489e45
SR
1067}
1068
398a153b
GH
1069#else /* CONFIG_SMP */
1070
6f505b16 1071static inline
398a153b
GH
1072void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1073static inline
1074void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1075
1076#endif /* CONFIG_SMP */
6e0534f2 1077
052f1dc7 1078#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
398a153b
GH
1079static void
1080inc_rt_prio(struct rt_rq *rt_rq, int prio)
1081{
1082 int prev_prio = rt_rq->highest_prio.curr;
1083
1084 if (prio < prev_prio)
1085 rt_rq->highest_prio.curr = prio;
1086
1087 inc_rt_prio_smp(rt_rq, prio, prev_prio);
1088}
1089
1090static void
1091dec_rt_prio(struct rt_rq *rt_rq, int prio)
1092{
1093 int prev_prio = rt_rq->highest_prio.curr;
1094
6f505b16 1095 if (rt_rq->rt_nr_running) {
764a9d6f 1096
398a153b 1097 WARN_ON(prio < prev_prio);
764a9d6f 1098
e864c499 1099 /*
398a153b
GH
1100 * This may have been our highest task, and therefore
1101 * we may have some recomputation to do
e864c499 1102 */
398a153b 1103 if (prio == prev_prio) {
e864c499
GH
1104 struct rt_prio_array *array = &rt_rq->active;
1105
1106 rt_rq->highest_prio.curr =
764a9d6f 1107 sched_find_first_bit(array->bitmap);
e864c499
GH
1108 }
1109
764a9d6f 1110 } else
e864c499 1111 rt_rq->highest_prio.curr = MAX_RT_PRIO;
73fe6aae 1112
398a153b
GH
1113 dec_rt_prio_smp(rt_rq, prio, prev_prio);
1114}
1f11eb6a 1115
398a153b
GH
1116#else
1117
1118static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
1119static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
1120
1121#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
6e0534f2 1122
052f1dc7 1123#ifdef CONFIG_RT_GROUP_SCHED
398a153b
GH
1124
1125static void
1126inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1127{
1128 if (rt_se_boosted(rt_se))
1129 rt_rq->rt_nr_boosted++;
1130
1131 if (rt_rq->tg)
1132 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
1133}
1134
1135static void
1136dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1137{
23b0fdfc
PZ
1138 if (rt_se_boosted(rt_se))
1139 rt_rq->rt_nr_boosted--;
1140
1141 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
398a153b
GH
1142}
1143
1144#else /* CONFIG_RT_GROUP_SCHED */
1145
1146static void
1147inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1148{
1149 start_rt_bandwidth(&def_rt_bandwidth);
1150}
1151
1152static inline
1153void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1154
1155#endif /* CONFIG_RT_GROUP_SCHED */
1156
22abdef3
KT
1157static inline
1158unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1159{
1160 struct rt_rq *group_rq = group_rt_rq(rt_se);
1161
1162 if (group_rq)
1163 return group_rq->rt_nr_running;
1164 else
1165 return 1;
1166}
1167
01d36d0a
FW
1168static inline
1169unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
1170{
1171 struct rt_rq *group_rq = group_rt_rq(rt_se);
1172 struct task_struct *tsk;
1173
1174 if (group_rq)
1175 return group_rq->rr_nr_running;
1176
1177 tsk = rt_task_of(rt_se);
1178
1179 return (tsk->policy == SCHED_RR) ? 1 : 0;
1180}
1181
398a153b
GH
1182static inline
1183void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1184{
1185 int prio = rt_se_prio(rt_se);
1186
1187 WARN_ON(!rt_prio(prio));
22abdef3 1188 rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
01d36d0a 1189 rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
398a153b
GH
1190
1191 inc_rt_prio(rt_rq, prio);
1192 inc_rt_migration(rt_se, rt_rq);
1193 inc_rt_group(rt_se, rt_rq);
1194}
1195
1196static inline
1197void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1198{
1199 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1200 WARN_ON(!rt_rq->rt_nr_running);
22abdef3 1201 rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
01d36d0a 1202 rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
398a153b
GH
1203
1204 dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1205 dec_rt_migration(rt_se, rt_rq);
1206 dec_rt_group(rt_se, rt_rq);
63489e45
SR
1207}
1208
ff77e468
PZ
1209/*
1210 * Change rt_se->run_list location unless SAVE && !MOVE
1211 *
1212 * assumes ENQUEUE/DEQUEUE flags match
1213 */
1214static inline bool move_entity(unsigned int flags)
1215{
1216 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
1217 return false;
1218
1219 return true;
1220}
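/*
 * Put differently (restating the check above): only DEQUEUE_SAVE
 * without DEQUEUE_MOVE - i.e. the entity will be put back exactly
 * where it was - leaves run_list alone; a plain dequeue or a
 * SAVE+MOVE requeue both relink the entity on its priority queue.
 */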
1221
1222static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
1223{
1224 list_del_init(&rt_se->run_list);
1225
1226 if (list_empty(array->queue + rt_se_prio(rt_se)))
1227 __clear_bit(rt_se_prio(rt_se), array->bitmap);
1228
1229 rt_se->on_list = 0;
1230}
1231
1232static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
bb44e5d1 1233{
6f505b16
PZ
1234 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1235 struct rt_prio_array *array = &rt_rq->active;
1236 struct rt_rq *group_rq = group_rt_rq(rt_se);
20b6331b 1237 struct list_head *queue = array->queue + rt_se_prio(rt_se);
bb44e5d1 1238
ad2a3f13
PZ
1239 /*
 1240 * Don't enqueue the group if it's throttled, or when empty.
 1241 * The latter is a consequence of the former when a child group
 1242 * gets throttled and the current group doesn't have any other
1243 * active members.
1244 */
ff77e468
PZ
1245 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
1246 if (rt_se->on_list)
1247 __delist_rt_entity(rt_se, array);
6f505b16 1248 return;
ff77e468 1249 }
63489e45 1250
ff77e468
PZ
1251 if (move_entity(flags)) {
1252 WARN_ON_ONCE(rt_se->on_list);
1253 if (flags & ENQUEUE_HEAD)
1254 list_add(&rt_se->run_list, queue);
1255 else
1256 list_add_tail(&rt_se->run_list, queue);
1257
1258 __set_bit(rt_se_prio(rt_se), array->bitmap);
1259 rt_se->on_list = 1;
1260 }
1261 rt_se->on_rq = 1;
78f2c7db 1262
6f505b16
PZ
1263 inc_rt_tasks(rt_se, rt_rq);
1264}
1265
ff77e468 1266static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
6f505b16
PZ
1267{
1268 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1269 struct rt_prio_array *array = &rt_rq->active;
1270
ff77e468
PZ
1271 if (move_entity(flags)) {
1272 WARN_ON_ONCE(!rt_se->on_list);
1273 __delist_rt_entity(rt_se, array);
1274 }
1275 rt_se->on_rq = 0;
6f505b16
PZ
1276
1277 dec_rt_tasks(rt_se, rt_rq);
1278}
1279
1280/*
1281 * Because the prio of an upper entry depends on the lower
1282 * entries, we must remove entries top - down.
6f505b16 1283 */
ff77e468 1284static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
6f505b16 1285{
ad2a3f13 1286 struct sched_rt_entity *back = NULL;
6f505b16 1287
58d6c2d7
PZ
1288 for_each_sched_rt_entity(rt_se) {
1289 rt_se->back = back;
1290 back = rt_se;
1291 }
1292
f4ebcbc0
KT
1293 dequeue_top_rt_rq(rt_rq_of_se(back));
1294
58d6c2d7
PZ
1295 for (rt_se = back; rt_se; rt_se = rt_se->back) {
1296 if (on_rt_rq(rt_se))
ff77e468 1297 __dequeue_rt_entity(rt_se, flags);
ad2a3f13
PZ
1298 }
1299}
1300
ff77e468 1301static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
ad2a3f13 1302{
f4ebcbc0
KT
1303 struct rq *rq = rq_of_rt_se(rt_se);
1304
ff77e468 1305 dequeue_rt_stack(rt_se, flags);
ad2a3f13 1306 for_each_sched_rt_entity(rt_se)
ff77e468 1307 __enqueue_rt_entity(rt_se, flags);
f4ebcbc0 1308 enqueue_top_rt_rq(&rq->rt);
ad2a3f13
PZ
1309}
1310
ff77e468 1311static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
ad2a3f13 1312{
f4ebcbc0
KT
1313 struct rq *rq = rq_of_rt_se(rt_se);
1314
ff77e468 1315 dequeue_rt_stack(rt_se, flags);
ad2a3f13
PZ
1316
1317 for_each_sched_rt_entity(rt_se) {
1318 struct rt_rq *rt_rq = group_rt_rq(rt_se);
1319
1320 if (rt_rq && rt_rq->rt_nr_running)
ff77e468 1321 __enqueue_rt_entity(rt_se, flags);
58d6c2d7 1322 }
f4ebcbc0 1323 enqueue_top_rt_rq(&rq->rt);
bb44e5d1
IM
1324}
1325
1326/*
1327 * Adding/removing a task to/from a priority array:
1328 */
ea87bb78 1329static void
371fd7e7 1330enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
6f505b16
PZ
1331{
1332 struct sched_rt_entity *rt_se = &p->rt;
1333
371fd7e7 1334 if (flags & ENQUEUE_WAKEUP)
6f505b16
PZ
1335 rt_se->timeout = 0;
1336
ff77e468 1337 enqueue_rt_entity(rt_se, flags);
c09595f6 1338
4b53a341 1339 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
917b627d 1340 enqueue_pushable_task(rq, p);
6f505b16
PZ
1341}
1342
371fd7e7 1343static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
bb44e5d1 1344{
6f505b16 1345 struct sched_rt_entity *rt_se = &p->rt;
bb44e5d1 1346
f1e14ef6 1347 update_curr_rt(rq);
ff77e468 1348 dequeue_rt_entity(rt_se, flags);
c09595f6 1349
917b627d 1350 dequeue_pushable_task(rq, p);
bb44e5d1
IM
1351}
1352
1353/*
60686317
RW
1354 * Put task to the head or the end of the run list without the overhead of
1355 * dequeue followed by enqueue.
bb44e5d1 1356 */
7ebefa8c
DA
1357static void
1358requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
6f505b16 1359{
1cdad715 1360 if (on_rt_rq(rt_se)) {
7ebefa8c
DA
1361 struct rt_prio_array *array = &rt_rq->active;
1362 struct list_head *queue = array->queue + rt_se_prio(rt_se);
1363
1364 if (head)
1365 list_move(&rt_se->run_list, queue);
1366 else
1367 list_move_tail(&rt_se->run_list, queue);
1cdad715 1368 }
6f505b16
PZ
1369}
1370
7ebefa8c 1371static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
bb44e5d1 1372{
6f505b16
PZ
1373 struct sched_rt_entity *rt_se = &p->rt;
1374 struct rt_rq *rt_rq;
bb44e5d1 1375
6f505b16
PZ
1376 for_each_sched_rt_entity(rt_se) {
1377 rt_rq = rt_rq_of_se(rt_se);
7ebefa8c 1378 requeue_rt_entity(rt_rq, rt_se, head);
6f505b16 1379 }
bb44e5d1
IM
1380}
1381
6f505b16 1382static void yield_task_rt(struct rq *rq)
bb44e5d1 1383{
7ebefa8c 1384 requeue_task_rt(rq, rq->curr, 0);
bb44e5d1
IM
1385}
1386
e7693a36 1387#ifdef CONFIG_SMP
318e0893
GH
1388static int find_lowest_rq(struct task_struct *task);
1389
0017d735 1390static int
ac66f547 1391select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
e7693a36 1392{
7608dec2
PZ
1393 struct task_struct *curr;
1394 struct rq *rq;
c37495fd
SR
1395
1396 /* For anything but wake ups, just return the task_cpu */
1397 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1398 goto out;
1399
7608dec2
PZ
1400 rq = cpu_rq(cpu);
1401
1402 rcu_read_lock();
316c1608 1403 curr = READ_ONCE(rq->curr); /* unlocked access */
7608dec2 1404
318e0893 1405 /*
7608dec2 1406 * If the current task on @p's runqueue is an RT task, then
e1f47d89
SR
1407 * try to see if we can wake this RT task up on another
1408 * runqueue. Otherwise simply start this RT task
1409 * on its current runqueue.
1410 *
43fa5460
SR
1411 * We want to avoid overloading runqueues. If the woken
1412 * task is a higher priority, then it will stay on this CPU
1413 * and the lower prio task should be moved to another CPU.
1414 * Even though this will probably make the lower prio task
1415 * lose its cache, we do not want to bounce a higher task
1416 * around just because it gave up its CPU, perhaps for a
1417 * lock?
1418 *
1419 * For equal prio tasks, we just let the scheduler sort it out.
7608dec2
PZ
1420 *
1421 * Otherwise, just let it ride on the affined RQ and the
1422 * post-schedule router will push the preempted task away
1423 *
1424 * This test is optimistic, if we get it wrong the load-balancer
1425 * will have to sort it out.
318e0893 1426 */
7608dec2 1427 if (curr && unlikely(rt_task(curr)) &&
4b53a341 1428 (curr->nr_cpus_allowed < 2 ||
6bfa687c 1429 curr->prio <= p->prio)) {
7608dec2 1430 int target = find_lowest_rq(p);
318e0893 1431
80e3d87b
TC
1432 /*
1433 * Don't bother moving it if the destination CPU is
1434 * not running a lower priority task.
1435 */
1436 if (target != -1 &&
1437 p->prio < cpu_rq(target)->rt.highest_prio.curr)
7608dec2 1438 cpu = target;
318e0893 1439 }
7608dec2 1440 rcu_read_unlock();
318e0893 1441
c37495fd 1442out:
7608dec2 1443 return cpu;
e7693a36 1444}
7ebefa8c
DA
1445
1446static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1447{
308a623a
WL
1448 /*
1449 * Current can't be migrated, useless to reschedule,
1450 * let's hope p can move out.
1451 */
4b53a341 1452 if (rq->curr->nr_cpus_allowed == 1 ||
308a623a 1453 !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
7ebefa8c
DA
1454 return;
1455
308a623a
WL
1456 /*
1457 * p is migratable, so let's not schedule it and
1458 * see if it is pushed or pulled somewhere else.
1459 */
4b53a341 1460 if (p->nr_cpus_allowed != 1
13b8bd0a
RR
1461 && cpupri_find(&rq->rd->cpupri, p, NULL))
1462 return;
24600ce8 1463
7ebefa8c
DA
1464 /*
 1465 * There appear to be other cpus that can accept
 1466 * current and none to run 'p', so let's reschedule
1467 * to try and push current away:
1468 */
1469 requeue_task_rt(rq, p, 1);
8875125e 1470 resched_curr(rq);
7ebefa8c
DA
1471}
1472
e7693a36
GH
1473#endif /* CONFIG_SMP */
1474
bb44e5d1
IM
1475/*
1476 * Preempt the current task with a newly woken task if needed:
1477 */
7d478721 1478static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
bb44e5d1 1479{
45c01e82 1480 if (p->prio < rq->curr->prio) {
8875125e 1481 resched_curr(rq);
45c01e82
GH
1482 return;
1483 }
1484
1485#ifdef CONFIG_SMP
1486 /*
1487 * If:
1488 *
1489 * - the newly woken task is of equal priority to the current task
1490 * - the newly woken task is non-migratable while current is migratable
1491 * - current will be preempted on the next reschedule
1492 *
1493 * we should check to see if current can readily move to a different
1494 * cpu. If so, we will reschedule to allow the push logic to try
1495 * to move current somewhere else, making room for our non-migratable
1496 * task.
1497 */
8dd0de8b 1498 if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
7ebefa8c 1499 check_preempt_equal_prio(rq, p);
45c01e82 1500#endif
bb44e5d1
IM
1501}
1502
6f505b16
PZ
1503static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1504 struct rt_rq *rt_rq)
bb44e5d1 1505{
6f505b16
PZ
1506 struct rt_prio_array *array = &rt_rq->active;
1507 struct sched_rt_entity *next = NULL;
bb44e5d1
IM
1508 struct list_head *queue;
1509 int idx;
1510
1511 idx = sched_find_first_bit(array->bitmap);
6f505b16 1512 BUG_ON(idx >= MAX_RT_PRIO);
bb44e5d1
IM
1513
1514 queue = array->queue + idx;
6f505b16 1515 next = list_entry(queue->next, struct sched_rt_entity, run_list);
326587b8 1516
6f505b16
PZ
1517 return next;
1518}
bb44e5d1 1519
917b627d 1520static struct task_struct *_pick_next_task_rt(struct rq *rq)
6f505b16
PZ
1521{
1522 struct sched_rt_entity *rt_se;
1523 struct task_struct *p;
606dba2e 1524 struct rt_rq *rt_rq = &rq->rt;
6f505b16
PZ
1525
1526 do {
1527 rt_se = pick_next_rt_entity(rq, rt_rq);
326587b8 1528 BUG_ON(!rt_se);
6f505b16
PZ
1529 rt_rq = group_rt_rq(rt_se);
1530 } while (rt_rq);
1531
1532 p = rt_task_of(rt_se);
78becc27 1533 p->se.exec_start = rq_clock_task(rq);
917b627d
GH
1534
1535 return p;
1536}
1537
606dba2e 1538static struct task_struct *
d8ac8971 1539pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
917b627d 1540{
606dba2e
PZ
1541 struct task_struct *p;
1542 struct rt_rq *rt_rq = &rq->rt;
1543
37e117c0 1544 if (need_pull_rt_task(rq, prev)) {
cbce1a68
PZ
1545 /*
1546 * This is OK, because current is on_cpu, which avoids it being
1547 * picked for load-balance and preemption/IRQs are still
1548 * disabled avoiding further scheduler activity on it and we're
1549 * being very careful to re-start the picking loop.
1550 */
d8ac8971 1551 rq_unpin_lock(rq, rf);
38033c37 1552 pull_rt_task(rq);
d8ac8971 1553 rq_repin_lock(rq, rf);
37e117c0
PZ
1554 /*
1555 * pull_rt_task() can drop (and re-acquire) rq->lock; this
a1d9a323
KT
1556 * means a dl or stop task can slip in, in which case we need
1557 * to re-start task selection.
37e117c0 1558 */
da0c1e65 1559 if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
a1d9a323 1560 rq->dl.dl_nr_running))
37e117c0
PZ
1561 return RETRY_TASK;
1562 }
38033c37 1563
734ff2a7
KT
1564 /*
1565 * We may dequeue prev's rt_rq in put_prev_task().
1566 * So, we update time before rt_nr_running check.
1567 */
1568 if (prev->sched_class == &rt_sched_class)
1569 update_curr_rt(rq);
1570
f4ebcbc0 1571 if (!rt_rq->rt_queued)
606dba2e
PZ
1572 return NULL;
1573
3f1d2a31 1574 put_prev_task(rq, prev);
606dba2e
PZ
1575
1576 p = _pick_next_task_rt(rq);
917b627d
GH
1577
1578 /* The running task is never eligible for pushing */
f3f1768f 1579 dequeue_pushable_task(rq, p);
917b627d 1580
e3fca9e7 1581 queue_push_tasks(rq);
3f029d3c 1582
6f505b16 1583 return p;
bb44e5d1
IM
1584}
1585
31ee529c 1586static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
bb44e5d1 1587{
f1e14ef6 1588 update_curr_rt(rq);
917b627d
GH
1589
1590 /*
1591 * The previous task needs to be made eligible for pushing
1592 * if it is still active
1593 */
4b53a341 1594 if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
917b627d 1595 enqueue_pushable_task(rq, p);
bb44e5d1
IM
1596}
1597
681f3e68 1598#ifdef CONFIG_SMP
6f505b16 1599
e8fa1362
SR
1600/* Only try algorithms three times */
1601#define RT_MAX_TRIES 3
1602
f65eda4f
SR
1603static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1604{
1605 if (!task_running(rq, p) &&
0c98d344 1606 cpumask_test_cpu(cpu, &p->cpus_allowed))
f65eda4f
SR
1607 return 1;
1608 return 0;
1609}
1610
e23ee747
KT
1611/*
1612 * Return the highest pushable rq's task, which is suitable to be executed
1613 * on the cpu, NULL otherwise
1614 */
1615static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
e8fa1362 1616{
e23ee747
KT
1617 struct plist_head *head = &rq->rt.pushable_tasks;
1618 struct task_struct *p;
3d07467b 1619
e23ee747
KT
1620 if (!has_pushable_tasks(rq))
1621 return NULL;
3d07467b 1622
e23ee747
KT
1623 plist_for_each_entry(p, head, pushable_tasks) {
1624 if (pick_rt_task(rq, p, cpu))
1625 return p;
f65eda4f
SR
1626 }
1627
e23ee747 1628 return NULL;
e8fa1362
SR
1629}
1630
0e3900e6 1631static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
e8fa1362 1632
6e1254d2
GH
1633static int find_lowest_rq(struct task_struct *task)
1634{
1635 struct sched_domain *sd;
4ba29684 1636 struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
6e1254d2
GH
1637 int this_cpu = smp_processor_id();
1638 int cpu = task_cpu(task);
06f90dbd 1639
0da938c4
SR
1640 /* Make sure the mask is initialized first */
1641 if (unlikely(!lowest_mask))
1642 return -1;
1643
4b53a341 1644 if (task->nr_cpus_allowed == 1)
6e0534f2 1645 return -1; /* No other targets possible */
6e1254d2 1646
6e0534f2
GH
1647 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
1648 return -1; /* No targets found */
6e1254d2
GH
1649
1650 /*
1651 * At this point we have built a mask of cpus representing the
1652 * lowest priority tasks in the system. Now we want to elect
1653 * the best one based on our affinity and topology.
1654 *
1655 * We prioritize the last cpu that the task executed on since
1656 * it is most likely cache-hot in that location.
1657 */
96f874e2 1658 if (cpumask_test_cpu(cpu, lowest_mask))
6e1254d2
GH
1659 return cpu;
1660
1661 /*
1662 * Otherwise, we consult the sched_domains span maps to figure
1663 * out which cpu is logically closest to our hot cache data.
1664 */
e2c88063
RR
1665 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1666 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
6e1254d2 1667
cd4ae6ad 1668 rcu_read_lock();
e2c88063
RR
1669 for_each_domain(cpu, sd) {
1670 if (sd->flags & SD_WAKE_AFFINE) {
1671 int best_cpu;
6e1254d2 1672
e2c88063
RR
1673 /*
1674 * "this_cpu" is cheaper to preempt than a
1675 * remote processor.
1676 */
1677 if (this_cpu != -1 &&
cd4ae6ad
XF
1678 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1679 rcu_read_unlock();
e2c88063 1680 return this_cpu;
cd4ae6ad 1681 }
e2c88063
RR
1682
1683 best_cpu = cpumask_first_and(lowest_mask,
1684 sched_domain_span(sd));
cd4ae6ad
XF
1685 if (best_cpu < nr_cpu_ids) {
1686 rcu_read_unlock();
e2c88063 1687 return best_cpu;
cd4ae6ad 1688 }
6e1254d2
GH
1689 }
1690 }
cd4ae6ad 1691 rcu_read_unlock();
6e1254d2
GH
1692
1693 /*
1694 * And finally, if there were no matches within the domains
1695 * just give the caller *something* to work with from the compatible
1696 * locations.
1697 */
e2c88063
RR
1698 if (this_cpu != -1)
1699 return this_cpu;
1700
1701 cpu = cpumask_any(lowest_mask);
1702 if (cpu < nr_cpu_ids)
1703 return cpu;
1704 return -1;
07b4032c
GH
1705}
1706
1707/* Will lock the rq it finds */
4df64c0b 1708static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
07b4032c
GH
1709{
1710 struct rq *lowest_rq = NULL;
07b4032c 1711 int tries;
4df64c0b 1712 int cpu;
e8fa1362 1713
07b4032c
GH
1714 for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1715 cpu = find_lowest_rq(task);
1716
2de0b463 1717 if ((cpu == -1) || (cpu == rq->cpu))
e8fa1362
SR
1718 break;
1719
07b4032c
GH
1720 lowest_rq = cpu_rq(cpu);
1721
80e3d87b
TC
1722 if (lowest_rq->rt.highest_prio.curr <= task->prio) {
1723 /*
1724 * Target rq has tasks of equal or higher priority,
1725 * retrying does not release any lock and is unlikely
1726 * to yield a different result.
1727 */
1728 lowest_rq = NULL;
1729 break;
1730 }
1731
e8fa1362 1732 /* if the prio of this runqueue changed, try again */
07b4032c 1733 if (double_lock_balance(rq, lowest_rq)) {
e8fa1362
SR
1734 /*
1735 * We had to unlock the run queue. In
 1736 * the meantime, the task could have
1737 * migrated already or had its affinity changed.
1738 * Also make sure that it wasn't scheduled on its rq.
1739 */
07b4032c 1740 if (unlikely(task_rq(task) != rq ||
0c98d344 1741 !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
07b4032c 1742 task_running(rq, task) ||
13b5ab02 1743 !rt_task(task) ||
da0c1e65 1744 !task_on_rq_queued(task))) {
4df64c0b 1745
7f1b4393 1746 double_unlock_balance(rq, lowest_rq);
e8fa1362
SR
1747 lowest_rq = NULL;
1748 break;
1749 }
1750 }
1751
1752 /* If this rq is still suitable use it. */
e864c499 1753 if (lowest_rq->rt.highest_prio.curr > task->prio)
e8fa1362
SR
1754 break;
1755
1756 /* try again */
1b12bbc7 1757 double_unlock_balance(rq, lowest_rq);
e8fa1362
SR
1758 lowest_rq = NULL;
1759 }
1760
1761 return lowest_rq;
1762}
1763
917b627d
GH
1764static struct task_struct *pick_next_pushable_task(struct rq *rq)
1765{
1766 struct task_struct *p;
1767
1768 if (!has_pushable_tasks(rq))
1769 return NULL;
1770
1771 p = plist_first_entry(&rq->rt.pushable_tasks,
1772 struct task_struct, pushable_tasks);
1773
1774 BUG_ON(rq->cpu != task_cpu(p));
1775 BUG_ON(task_current(rq, p));
4b53a341 1776 BUG_ON(p->nr_cpus_allowed <= 1);
917b627d 1777
da0c1e65 1778 BUG_ON(!task_on_rq_queued(p));
917b627d
GH
1779 BUG_ON(!rt_task(p));
1780
1781 return p;
1782}
1783
e8fa1362
SR
1784/*
1785 * If the current CPU has more than one RT task, see if the non
1786 * running task can migrate over to a CPU that is running a task
1787 * of lesser priority.
1788 */
697f0a48 1789static int push_rt_task(struct rq *rq)
e8fa1362
SR
1790{
1791 struct task_struct *next_task;
1792 struct rq *lowest_rq;
311e800e 1793 int ret = 0;
e8fa1362 1794
a22d7fc1
GH
1795 if (!rq->rt.overloaded)
1796 return 0;
1797
917b627d 1798 next_task = pick_next_pushable_task(rq);
e8fa1362
SR
1799 if (!next_task)
1800 return 0;
1801
49246274 1802retry:
697f0a48 1803 if (unlikely(next_task == rq->curr)) {
f65eda4f 1804 WARN_ON(1);
e8fa1362 1805 return 0;
f65eda4f 1806 }
e8fa1362
SR
1807
1808 /*
 1809 * It's possible that the next_task slipped in with a
 1810 * higher priority than current. If that's the case
1811 * just reschedule current.
1812 */
697f0a48 1813 if (unlikely(next_task->prio < rq->curr->prio)) {
8875125e 1814 resched_curr(rq);
e8fa1362
SR
1815 return 0;
1816 }
1817
697f0a48 1818 /* We might release rq lock */
e8fa1362
SR
1819 get_task_struct(next_task);
1820
1821 /* find_lock_lowest_rq locks the rq if found */
697f0a48 1822 lowest_rq = find_lock_lowest_rq(next_task, rq);
e8fa1362
SR
1823 if (!lowest_rq) {
1824 struct task_struct *task;
1825 /*
311e800e 1826 * find_lock_lowest_rq releases rq->lock
1563513d
GH
1827 * so it is possible that next_task has migrated.
1828 *
1829 * We need to make sure that the task is still on the same
1830 * run-queue and is also still the next task eligible for
1831 * pushing.
e8fa1362 1832 */
917b627d 1833 task = pick_next_pushable_task(rq);
de16b91e 1834 if (task == next_task) {
1563513d 1835 /*
311e800e
HD
1836 * The task hasn't migrated, and is still the next
1837 * eligible task, but we failed to find a run-queue
1838 * to push it to. Do not retry in this case, since
1839 * other cpus will pull from us when ready.
1563513d 1840 */
1563513d 1841 goto out;
e8fa1362 1842 }
917b627d 1843
1563513d
GH
1844 if (!task)
1845 /* No more tasks, just exit */
1846 goto out;
1847
917b627d 1848 /*
1563513d 1849 * Something has shifted, try again.
917b627d 1850 */
1563513d
GH
1851 put_task_struct(next_task);
1852 next_task = task;
1853 goto retry;
e8fa1362
SR
1854 }
1855
697f0a48 1856 deactivate_task(rq, next_task, 0);
e8fa1362
SR
1857 set_task_cpu(next_task, lowest_rq->cpu);
1858 activate_task(lowest_rq, next_task, 0);
311e800e 1859 ret = 1;
e8fa1362 1860
8875125e 1861 resched_curr(lowest_rq);
e8fa1362 1862
1b12bbc7 1863 double_unlock_balance(rq, lowest_rq);
e8fa1362 1864
e8fa1362
SR
1865out:
1866 put_task_struct(next_task);
1867
311e800e 1868 return ret;
e8fa1362
SR
1869}
1870
e8fa1362
SR
1871static void push_rt_tasks(struct rq *rq)
1872{
 1873 /* push_rt_task() will return true if it moved an RT task */
1874 while (push_rt_task(rq))
1875 ;
1876}
1877
b6366f04
SR
1878#ifdef HAVE_RT_PUSH_IPI
1879/*
1880 * The search for the next cpu always starts at rq->cpu and ends
1881 * when we reach rq->cpu again. It will never return rq->cpu.
1882 * This returns the next cpu to check, or nr_cpu_ids if the loop
1883 * is complete.
1884 *
1885 * rq->rt.push_cpu holds the last cpu returned by this function,
 1886 * or, on the first invocation, it must hold rq->cpu.
1887 */
1888static int rto_next_cpu(struct rq *rq)
1889{
1890 int prev_cpu = rq->rt.push_cpu;
1891 int cpu;
1892
1893 cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
1894
1895 /*
1896 * If the previous cpu is less than the rq's CPU, then it already
1897 * passed the end of the mask, and has started from the beginning.
 1898 * We end if the next CPU is greater than or equal to rq's CPU.
1899 */
1900 if (prev_cpu < rq->cpu) {
1901 if (cpu >= rq->cpu)
1902 return nr_cpu_ids;
1903
1904 } else if (cpu >= nr_cpu_ids) {
1905 /*
1906 * We passed the end of the mask, start at the beginning.
 1907 * If the result is greater than or equal to the rq's CPU, then
1908 * the loop is finished.
1909 */
1910 cpu = cpumask_first(rq->rd->rto_mask);
1911 if (cpu >= rq->cpu)
1912 return nr_cpu_ids;
1913 }
1914 rq->rt.push_cpu = cpu;
1915
1916 /* Return cpu to let the caller know if the loop is finished or not */
1917 return cpu;
1918}
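/*
 * Worked example (illustrative, hypothetical values): on an 8-CPU system
 * with rq->cpu == 3 and rto_mask == { 1, 5 }, starting from push_cpu == 3
 * the successive return values are 5, then 1, then nr_cpu_ids once the
 * scan wraps back past rq->cpu, so the loop terminates without ever
 * returning 3 itself.
 */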
1919
1920static int find_next_push_cpu(struct rq *rq)
1921{
1922 struct rq *next_rq;
1923 int cpu;
1924
1925 while (1) {
1926 cpu = rto_next_cpu(rq);
1927 if (cpu >= nr_cpu_ids)
1928 break;
1929 next_rq = cpu_rq(cpu);
1930
1931 /* Make sure the next rq can push to this rq */
1932 if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
1933 break;
1934 }
1935
1936 return cpu;
1937}
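/*
 * Descriptive note (added for clarity): find_next_push_cpu() walks the
 * rto_mask in the order produced by rto_next_cpu() above and stops at the
 * first overloaded CPU whose next-highest RT task could actually run here
 * (its prio beats this rq's current highest prio), or returns nr_cpu_ids
 * when no such CPU remains.
 */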
1938
1939#define RT_PUSH_IPI_EXECUTING 1
1940#define RT_PUSH_IPI_RESTART 2
1941
3e777f99
SRV
1942/*
1943 * When a high priority task schedules out from a CPU and a lower priority
 1944 * task is scheduled in, a check is made to see if there are any RT tasks
1945 * on other CPUs that are waiting to run because a higher priority RT task
1946 * is currently running on its CPU. In this case, the CPU with multiple RT
1947 * tasks queued on it (overloaded) needs to be notified that a CPU has opened
1948 * up that may be able to run one of its non-running queued RT tasks.
1949 *
1950 * On large CPU boxes, there's the case that several CPUs could schedule
1951 * a lower priority task at the same time, in which case it will look for
1952 * any overloaded CPUs that it could pull a task from. To do this, the runqueue
1953 * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting
1954 * for a single overloaded CPU's runqueue lock can produce a large latency.
1955 * (This has actually been observed on large boxes running cyclictest).
1956 * Instead of taking the runqueue lock of the overloaded CPU, each of the
1957 * CPUs that scheduled a lower priority task simply sends an IPI to the
 1958 * overloaded CPU. An IPI is much cheaper than taking a runqueue lock with
1959 * lots of contention. The overloaded CPU will look to push its non-running
1960 * RT task off, and if it does, it can then ignore the other IPIs coming
1961 * in, and just pass those IPIs off to any other overloaded CPU.
1962 *
1963 * When a CPU schedules a lower priority task, it only sends an IPI to
1964 * the "next" CPU that has overloaded RT tasks. This prevents IPI storms,
1965 * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with
 1966 * RT overloaded tasks would cause 100 IPIs to go out at once.
1967 *
1968 * The overloaded RT CPU, when receiving an IPI, will try to push off its
1969 * overloaded RT tasks and then send an IPI to the next CPU that has
1970 * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks
1971 * have completed. Just because a CPU may have pushed off its own overloaded
1972 * RT task does not mean it should stop sending the IPI around to other
1973 * overloaded CPUs. There may be another RT task waiting to run on one of
 1974 * those CPUs that is of higher priority than the one that was just
1975 * pushed.
1976 *
 1977 * A possible optimization would be to build a CPU array, similar
 1978 * to the cpupri array mask of all running RT tasks, but for the overloaded
 1979 * case. The IPI could then be sent only to the CPU with the highest priority
1980 * RT task waiting, and that CPU could send off further IPIs to the CPU with
1981 * the next highest waiting task. Since the overloaded case is much less likely
1982 * to happen, the complexity of this implementation may not be worth it.
1983 * Instead, just send an IPI around to all overloaded CPUs.
1984 *
1985 * The rq->rt.push_flags holds the status of the IPI that is going around.
1986 * A run queue can only send out a single IPI at a time. The possible flags
1987 * for rq->rt.push_flags are:
1988 *
1989 * (None or zero): No IPI is going around for the current rq
1990 * RT_PUSH_IPI_EXECUTING: An IPI for the rq is being passed around
1991 * RT_PUSH_IPI_RESTART: The priority of the running task for the rq
1992 * has changed, and the IPI should restart
1993 * circulating the overloaded CPUs again.
1994 *
1995 * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated
1996 * before sending to the next CPU.
1997 *
1998 * Instead of having all CPUs that schedule a lower priority task send
1999 * an IPI to the same "first" CPU in the RT overload mask, they send it
2000 * to the next overloaded CPU after their own CPU. This helps distribute
2001 * the work when there's more than one overloaded CPU and multiple CPUs
2002 * scheduling in lower priority tasks.
2003 *
2004 * When a rq schedules a lower priority task than what was currently
2005 * running, the next CPU with overloaded RT tasks is examined first.
2006 * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower
2007 * priority task, it will send an IPI first to CPU 5, then CPU 5 will
2008 * send to CPU 1 if it is still overloaded. CPU 1 will clear the
2009 * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set.
2010 *
 2011 * The first CPU to notice that IPI_RESTART is set will clear that flag and then
2012 * send an IPI to the next overloaded CPU after the rq->cpu and not the next
2013 * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3
2014 * schedules a lower priority task, and the IPI_RESTART gets set while the
2015 * handling is being done on CPU 5, it will clear the flag and send it back to
2016 * CPU 4 instead of CPU 1.
2017 *
2018 * Note, the above logic can be disabled by turning off the sched_feature
2019 * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be
2020 * taken by the CPU requesting a pull and the waiting RT task will be pulled
2021 * by that CPU. This may be fine for machines with few CPUs.
2022 */
b6366f04
SR
2023static void tell_cpu_to_push(struct rq *rq)
2024{
2025 int cpu;
2026
2027 if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
2028 raw_spin_lock(&rq->rt.push_lock);
2029 /* Make sure it's still executing */
2030 if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
2031 /*
2032 * Tell the IPI to restart the loop as things have
2033 * changed since it started.
2034 */
2035 rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
2036 raw_spin_unlock(&rq->rt.push_lock);
2037 return;
2038 }
2039 raw_spin_unlock(&rq->rt.push_lock);
2040 }
2041
2042 /* When here, there's no IPI going around */
2043
2044 rq->rt.push_cpu = rq->cpu;
2045 cpu = find_next_push_cpu(rq);
2046 if (cpu >= nr_cpu_ids)
2047 return;
2048
2049 rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
2050
2051 irq_work_queue_on(&rq->rt.push_work, cpu);
2052}
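/*
 * Illustrative walk-through (reusing the hypothetical CPUs from the
 * comment above): if CPUs 1 and 5 are RT overloaded and CPU 3 schedules a
 * lower priority task, tell_cpu_to_push() resets push_cpu to 3, picks 5 as
 * the next candidate (assuming 5 passes the priority filter), sets
 * RT_PUSH_IPI_EXECUTING and queues the irq work on CPU 5. A second
 * lower-priority switch on CPU 3 while that IPI is still circulating only
 * sets RT_PUSH_IPI_RESTART; it does not start a second IPI chain.
 */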
2053
2054/* Called from hardirq context */
2055static void try_to_push_tasks(void *arg)
2056{
2057 struct rt_rq *rt_rq = arg;
2058 struct rq *rq, *src_rq;
2059 int this_cpu;
2060 int cpu;
2061
2062 this_cpu = rt_rq->push_cpu;
2063
2064 /* Paranoid check */
2065 BUG_ON(this_cpu != smp_processor_id());
2066
2067 rq = cpu_rq(this_cpu);
2068 src_rq = rq_of_rt_rq(rt_rq);
2069
2070again:
2071 if (has_pushable_tasks(rq)) {
2072 raw_spin_lock(&rq->lock);
2073 push_rt_task(rq);
2074 raw_spin_unlock(&rq->lock);
2075 }
2076
2077 /* Pass the IPI to the next rt overloaded queue */
2078 raw_spin_lock(&rt_rq->push_lock);
2079 /*
2080 * If the source queue changed since the IPI went out,
2081 * we need to restart the search from that CPU again.
2082 */
2083 if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
2084 rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
2085 rt_rq->push_cpu = src_rq->cpu;
2086 }
2087
2088 cpu = find_next_push_cpu(src_rq);
2089
2090 if (cpu >= nr_cpu_ids)
2091 rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
2092 raw_spin_unlock(&rt_rq->push_lock);
2093
2094 if (cpu >= nr_cpu_ids)
2095 return;
2096
2097 /*
2098 * It is possible that a restart caused this CPU to be
2099 * chosen again. Don't bother with an IPI, just see if we
2100 * have more to push.
2101 */
2102 if (unlikely(cpu == rq->cpu))
2103 goto again;
2104
2105 /* Try the next RT overloaded CPU */
2106 irq_work_queue_on(&rt_rq->push_work, cpu);
2107}
2108
2109static void push_irq_work_func(struct irq_work *work)
2110{
2111 struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
2112
2113 try_to_push_tasks(rt_rq);
2114}
2115#endif /* HAVE_RT_PUSH_IPI */
2116
8046d680 2117static void pull_rt_task(struct rq *this_rq)
f65eda4f 2118{
8046d680
PZ
2119 int this_cpu = this_rq->cpu, cpu;
2120 bool resched = false;
a8728944 2121 struct task_struct *p;
f65eda4f 2122 struct rq *src_rq;
f65eda4f 2123
637f5085 2124 if (likely(!rt_overloaded(this_rq)))
8046d680 2125 return;
f65eda4f 2126
7c3f2ab7
PZ
2127 /*
2128 * Match the barrier from rt_set_overloaded; this guarantees that if we
2129 * see overloaded we must also see the rto_mask bit.
2130 */
2131 smp_rmb();
2132
b6366f04
SR
2133#ifdef HAVE_RT_PUSH_IPI
2134 if (sched_feat(RT_PUSH_IPI)) {
2135 tell_cpu_to_push(this_rq);
8046d680 2136 return;
b6366f04
SR
2137 }
2138#endif
2139
c6c4927b 2140 for_each_cpu(cpu, this_rq->rd->rto_mask) {
f65eda4f
SR
2141 if (this_cpu == cpu)
2142 continue;
2143
2144 src_rq = cpu_rq(cpu);
74ab8e4f
GH
2145
2146 /*
2147 * Don't bother taking the src_rq->lock if the next highest
2148 * task is known to be lower-priority than our current task.
2149 * This may look racy, but if this value is about to go
2150 * logically higher, the src_rq will push this task away.
 2151 * And if it's going logically lower, we do not care.
2152 */
2153 if (src_rq->rt.highest_prio.next >=
2154 this_rq->rt.highest_prio.curr)
2155 continue;
2156
f65eda4f
SR
2157 /*
2158 * We can potentially drop this_rq's lock in
2159 * double_lock_balance, and another CPU could
a8728944 2160 * alter this_rq
f65eda4f 2161 */
a8728944 2162 double_lock_balance(this_rq, src_rq);
f65eda4f
SR
2163
2164 /*
e23ee747
KT
 2165 * We can only pull a task that is pushable
 2166 * on its rq, and no others.
f65eda4f 2167 */
e23ee747 2168 p = pick_highest_pushable_task(src_rq, this_cpu);
f65eda4f
SR
2169
2170 /*
2171 * Do we have an RT task that preempts
2172 * the to-be-scheduled task?
2173 */
a8728944 2174 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
f65eda4f 2175 WARN_ON(p == src_rq->curr);
da0c1e65 2176 WARN_ON(!task_on_rq_queued(p));
f65eda4f
SR
2177
2178 /*
2179 * There's a chance that p is higher in priority
2180 * than what's currently running on its cpu.
 2181 * This is just that p is waking up and hasn't
2182 * had a chance to schedule. We only pull
2183 * p if it is lower in priority than the
a8728944 2184 * current task on the run queue
f65eda4f 2185 */
a8728944 2186 if (p->prio < src_rq->curr->prio)
614ee1f6 2187 goto skip;
f65eda4f 2188
8046d680 2189 resched = true;
f65eda4f
SR
2190
2191 deactivate_task(src_rq, p, 0);
2192 set_task_cpu(p, this_cpu);
2193 activate_task(this_rq, p, 0);
2194 /*
2195 * We continue with the search, just in
2196 * case there's an even higher prio task
25985edc 2197 * in another runqueue. (low likelihood
f65eda4f 2198 * but possible)
f65eda4f 2199 */
f65eda4f 2200 }
49246274 2201skip:
1b12bbc7 2202 double_unlock_balance(this_rq, src_rq);
f65eda4f
SR
2203 }
2204
8046d680
PZ
2205 if (resched)
2206 resched_curr(this_rq);
f65eda4f
SR
2207}
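/*
 * Descriptive note (added for clarity): a task p is only pulled when it
 * beats this rq's best queued RT priority but does not beat the task
 * currently running on the source rq; in the latter case the source CPU
 * is about to run p itself, so migrating it here would be wasted work.
 */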
2208
8ae121ac
GH
2209/*
2210 * If we are not running and we are not going to reschedule soon, we should
2211 * try to push tasks away now
2212 */
efbbd05a 2213static void task_woken_rt(struct rq *rq, struct task_struct *p)
4642dafd 2214{
9a897c5a 2215 if (!task_running(rq, p) &&
8ae121ac 2216 !test_tsk_need_resched(rq->curr) &&
4b53a341 2217 p->nr_cpus_allowed > 1 &&
1baca4ce 2218 (dl_task(rq->curr) || rt_task(rq->curr)) &&
4b53a341 2219 (rq->curr->nr_cpus_allowed < 2 ||
3be209a8 2220 rq->curr->prio <= p->prio))
4642dafd
SR
2221 push_rt_tasks(rq);
2222}
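/*
 * Descriptive note (added for clarity): the conditions above mean a push
 * is attempted on wakeup only when p is not already running here, no
 * reschedule is pending, p is allowed to run on more than one CPU, and
 * the current task is a DL/RT task that is either pinned or at least as
 * high in priority as p, so p would otherwise just sit and wait.
 */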
2223
bdd7c81b 2224/* Assumes rq->lock is held */
1f11eb6a 2225static void rq_online_rt(struct rq *rq)
bdd7c81b
IM
2226{
2227 if (rq->rt.overloaded)
2228 rt_set_overload(rq);
6e0534f2 2229
7def2be1
PZ
2230 __enable_runtime(rq);
2231
e864c499 2232 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
bdd7c81b
IM
2233}
2234
2235/* Assumes rq->lock is held */
1f11eb6a 2236static void rq_offline_rt(struct rq *rq)
bdd7c81b
IM
2237{
2238 if (rq->rt.overloaded)
2239 rt_clear_overload(rq);
6e0534f2 2240
7def2be1
PZ
2241 __disable_runtime(rq);
2242
6e0534f2 2243 cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
bdd7c81b 2244}
cb469845
SR
2245
2246/*
 2247 * When switching from the rt queue, we bring ourselves to a position
 2248 * where we might want to pull RT tasks from other runqueues.
2249 */
da7a735e 2250static void switched_from_rt(struct rq *rq, struct task_struct *p)
cb469845
SR
2251{
2252 /*
2253 * If there are other RT tasks then we will reschedule
2254 * and the scheduling of the other RT tasks will handle
2255 * the balancing. But if we are the last RT task
2256 * we may need to handle the pulling of RT tasks
2257 * now.
2258 */
da0c1e65 2259 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
1158ddb5
KT
2260 return;
2261
fd7a4bed 2262 queue_pull_task(rq);
cb469845 2263}
3d8cbdf8 2264
11c785b7 2265void __init init_sched_rt_class(void)
3d8cbdf8
RR
2266{
2267 unsigned int i;
2268
029632fb 2269 for_each_possible_cpu(i) {
eaa95840 2270 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
6ca09dfc 2271 GFP_KERNEL, cpu_to_node(i));
029632fb 2272 }
3d8cbdf8 2273}
cb469845
SR
2274#endif /* CONFIG_SMP */
2275
2276/*
2277 * When switching a task to RT, we may overload the runqueue
2278 * with RT tasks. In this case we try to push them off to
2279 * other runqueues.
2280 */
da7a735e 2281static void switched_to_rt(struct rq *rq, struct task_struct *p)
cb469845 2282{
cb469845
SR
2283 /*
2284 * If we are already running, then there's nothing
2285 * that needs to be done. But if we are not running
2286 * we may need to preempt the current running task.
2287 * If that current running task is also an RT task
2288 * then see if we can move to another run queue.
2289 */
da0c1e65 2290 if (task_on_rq_queued(p) && rq->curr != p) {
cb469845 2291#ifdef CONFIG_SMP
4b53a341 2292 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
fd7a4bed 2293 queue_push_tasks(rq);
619bd4a7 2294#endif /* CONFIG_SMP */
fd7a4bed 2295 if (p->prio < rq->curr->prio)
8875125e 2296 resched_curr(rq);
cb469845
SR
2297 }
2298}
2299
2300/*
2301 * Priority of the task has changed. This may cause
2302 * us to initiate a push or pull.
2303 */
da7a735e
PZ
2304static void
2305prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
cb469845 2306{
da0c1e65 2307 if (!task_on_rq_queued(p))
da7a735e
PZ
2308 return;
2309
2310 if (rq->curr == p) {
cb469845
SR
2311#ifdef CONFIG_SMP
2312 /*
2313 * If our priority decreases while running, we
2314 * may need to pull tasks to this runqueue.
2315 */
2316 if (oldprio < p->prio)
fd7a4bed
PZ
2317 queue_pull_task(rq);
2318
cb469845
SR
2319 /*
2320 * If there's a higher priority task waiting to run
fd7a4bed 2321 * then reschedule.
cb469845 2322 */
fd7a4bed 2323 if (p->prio > rq->rt.highest_prio.curr)
8875125e 2324 resched_curr(rq);
cb469845
SR
2325#else
2326 /* For UP simply resched on drop of prio */
2327 if (oldprio < p->prio)
8875125e 2328 resched_curr(rq);
e8fa1362 2329#endif /* CONFIG_SMP */
cb469845
SR
2330 } else {
2331 /*
 2332 * This task is not running, but if its priority is
 2333 * higher than that of the current running task,
 2334 * then reschedule.
2335 */
2336 if (p->prio < rq->curr->prio)
8875125e 2337 resched_curr(rq);
cb469845
SR
2338 }
2339}
2340
b18b6a9c 2341#ifdef CONFIG_POSIX_TIMERS
78f2c7db
PZ
2342static void watchdog(struct rq *rq, struct task_struct *p)
2343{
2344 unsigned long soft, hard;
2345
78d7d407
JS
2346 /* max may change after cur was read, this will be fixed next tick */
2347 soft = task_rlimit(p, RLIMIT_RTTIME);
2348 hard = task_rlimit_max(p, RLIMIT_RTTIME);
78f2c7db
PZ
2349
2350 if (soft != RLIM_INFINITY) {
2351 unsigned long next;
2352
57d2aa00
YX
2353 if (p->rt.watchdog_stamp != jiffies) {
2354 p->rt.timeout++;
2355 p->rt.watchdog_stamp = jiffies;
2356 }
2357
78f2c7db 2358 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
5a52dd50 2359 if (p->rt.timeout > next)
f06febc9 2360 p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
78f2c7db
PZ
2361 }
2362}
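/*
 * Worked example (illustrative, assuming HZ == 1000): with a soft
 * RLIMIT_RTTIME of 500000us, USEC_PER_SEC/HZ is 1000, so "next" is
 * DIV_ROUND_UP(500000, 1000) == 500 ticks; once p->rt.timeout exceeds
 * that, sched_exp is armed and the posix CPU timer code can act on the
 * RLIMIT_RTTIME limit at the next cputime check.
 */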
b18b6a9c
NP
2363#else
2364static inline void watchdog(struct rq *rq, struct task_struct *p) { }
2365#endif
bb44e5d1 2366
8f4d37ec 2367static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
bb44e5d1 2368{
454c7999
CC
2369 struct sched_rt_entity *rt_se = &p->rt;
2370
67e2be02
PZ
2371 update_curr_rt(rq);
2372
78f2c7db
PZ
2373 watchdog(rq, p);
2374
bb44e5d1
IM
2375 /*
2376 * RR tasks need a special form of timeslice management.
2377 * FIFO tasks have no timeslices.
2378 */
2379 if (p->policy != SCHED_RR)
2380 return;
2381
fa717060 2382 if (--p->rt.time_slice)
bb44e5d1
IM
2383 return;
2384
ce0dbbbb 2385 p->rt.time_slice = sched_rr_timeslice;
bb44e5d1 2386
98fbc798 2387 /*
e9aa39bb
LB
 2388 * Requeue to the end of the queue if we (and all of our ancestors) are not
2389 * the only element on the queue
98fbc798 2390 */
454c7999
CC
2391 for_each_sched_rt_entity(rt_se) {
2392 if (rt_se->run_list.prev != rt_se->run_list.next) {
2393 requeue_task_rt(rq, p, 0);
8aa6f0eb 2394 resched_curr(rq);
454c7999
CC
2395 return;
2396 }
98fbc798 2397 }
bb44e5d1
IM
2398}
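/*
 * Illustrative note: sched_rr_timeslice defaults to RR_TIMESLICE (100ms
 * worth of jiffies), so with HZ == 250 an RR task is requeued behind its
 * peers roughly every 25 ticks, while SCHED_FIFO tasks take the early
 * return above and never have a timeslice decremented.
 */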
2399
83b699ed
SV
2400static void set_curr_task_rt(struct rq *rq)
2401{
2402 struct task_struct *p = rq->curr;
2403
78becc27 2404 p->se.exec_start = rq_clock_task(rq);
917b627d
GH
2405
2406 /* The running task is never eligible for pushing */
2407 dequeue_pushable_task(rq, p);
83b699ed
SV
2408}
2409
6d686f45 2410static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
0d721cea
PW
2411{
2412 /*
2413 * Time slice is 0 for SCHED_FIFO tasks
2414 */
2415 if (task->policy == SCHED_RR)
ce0dbbbb 2416 return sched_rr_timeslice;
0d721cea
PW
2417 else
2418 return 0;
2419}
2420
029632fb 2421const struct sched_class rt_sched_class = {
5522d5d5 2422 .next = &fair_sched_class,
bb44e5d1
IM
2423 .enqueue_task = enqueue_task_rt,
2424 .dequeue_task = dequeue_task_rt,
2425 .yield_task = yield_task_rt,
2426
2427 .check_preempt_curr = check_preempt_curr_rt,
2428
2429 .pick_next_task = pick_next_task_rt,
2430 .put_prev_task = put_prev_task_rt,
2431
681f3e68 2432#ifdef CONFIG_SMP
4ce72a2c
LZ
2433 .select_task_rq = select_task_rq_rt,
2434
6c37067e 2435 .set_cpus_allowed = set_cpus_allowed_common,
1f11eb6a
GH
2436 .rq_online = rq_online_rt,
2437 .rq_offline = rq_offline_rt,
efbbd05a 2438 .task_woken = task_woken_rt,
cb469845 2439 .switched_from = switched_from_rt,
681f3e68 2440#endif
bb44e5d1 2441
83b699ed 2442 .set_curr_task = set_curr_task_rt,
bb44e5d1 2443 .task_tick = task_tick_rt,
cb469845 2444
0d721cea
PW
2445 .get_rr_interval = get_rr_interval_rt,
2446
cb469845
SR
2447 .prio_changed = prio_changed_rt,
2448 .switched_to = switched_to_rt,
6e998916
SG
2449
2450 .update_curr = update_curr_rt,
bb44e5d1 2451};
ada18de2 2452
8887cd99
NP
2453#ifdef CONFIG_RT_GROUP_SCHED
2454/*
2455 * Ensure that the real time constraints are schedulable.
2456 */
2457static DEFINE_MUTEX(rt_constraints_mutex);
2458
2459/* Must be called with tasklist_lock held */
2460static inline int tg_has_rt_tasks(struct task_group *tg)
2461{
2462 struct task_struct *g, *p;
2463
2464 /*
2465 * Autogroups do not have RT tasks; see autogroup_create().
2466 */
2467 if (task_group_is_autogroup(tg))
2468 return 0;
2469
2470 for_each_process_thread(g, p) {
2471 if (rt_task(p) && task_group(p) == tg)
2472 return 1;
2473 }
2474
2475 return 0;
2476}
2477
2478struct rt_schedulable_data {
2479 struct task_group *tg;
2480 u64 rt_period;
2481 u64 rt_runtime;
2482};
2483
2484static int tg_rt_schedulable(struct task_group *tg, void *data)
2485{
2486 struct rt_schedulable_data *d = data;
2487 struct task_group *child;
2488 unsigned long total, sum = 0;
2489 u64 period, runtime;
2490
2491 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2492 runtime = tg->rt_bandwidth.rt_runtime;
2493
2494 if (tg == d->tg) {
2495 period = d->rt_period;
2496 runtime = d->rt_runtime;
2497 }
2498
2499 /*
2500 * Cannot have more runtime than the period.
2501 */
2502 if (runtime > period && runtime != RUNTIME_INF)
2503 return -EINVAL;
2504
2505 /*
2506 * Ensure we don't starve existing RT tasks.
2507 */
2508 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
2509 return -EBUSY;
2510
2511 total = to_ratio(period, runtime);
2512
2513 /*
2514 * Nobody can have more than the global setting allows.
2515 */
2516 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
2517 return -EINVAL;
2518
2519 /*
2520 * The sum of our children's runtime should not exceed our own.
2521 */
2522 list_for_each_entry_rcu(child, &tg->children, siblings) {
2523 period = ktime_to_ns(child->rt_bandwidth.rt_period);
2524 runtime = child->rt_bandwidth.rt_runtime;
2525
2526 if (child == d->tg) {
2527 period = d->rt_period;
2528 runtime = d->rt_runtime;
2529 }
2530
2531 sum += to_ratio(period, runtime);
2532 }
2533
2534 if (sum > total)
2535 return -EINVAL;
2536
2537 return 0;
2538}
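/*
 * Worked example (illustrative, hypothetical group settings): with the
 * global default of 950000us runtime per 1000000us period, a single group
 * asking for 400000/1000000 passes the global check (0.4 < 0.95), but two
 * such siblings plus a third asking for 200000/1000000 would make the
 * children's sum (1.0) exceed their parent's ratio, and the tree walk
 * returns -EINVAL.
 */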
2539
2540static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
2541{
2542 int ret;
2543
2544 struct rt_schedulable_data data = {
2545 .tg = tg,
2546 .rt_period = period,
2547 .rt_runtime = runtime,
2548 };
2549
2550 rcu_read_lock();
2551 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
2552 rcu_read_unlock();
2553
2554 return ret;
2555}
2556
2557static int tg_set_rt_bandwidth(struct task_group *tg,
2558 u64 rt_period, u64 rt_runtime)
2559{
2560 int i, err = 0;
2561
2562 /*
 2563 * Disallowing the root group RT runtime is BAD; it would prevent the
 2564 * kernel from creating (and/or operating) RT threads.
2565 */
2566 if (tg == &root_task_group && rt_runtime == 0)
2567 return -EINVAL;
2568
 2569 /* A period of zero doesn't make any sense. */
2570 if (rt_period == 0)
2571 return -EINVAL;
2572
2573 mutex_lock(&rt_constraints_mutex);
2574 read_lock(&tasklist_lock);
2575 err = __rt_schedulable(tg, rt_period, rt_runtime);
2576 if (err)
2577 goto unlock;
2578
2579 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2580 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
2581 tg->rt_bandwidth.rt_runtime = rt_runtime;
2582
2583 for_each_possible_cpu(i) {
2584 struct rt_rq *rt_rq = tg->rt_rq[i];
2585
2586 raw_spin_lock(&rt_rq->rt_runtime_lock);
2587 rt_rq->rt_runtime = rt_runtime;
2588 raw_spin_unlock(&rt_rq->rt_runtime_lock);
2589 }
2590 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2591unlock:
2592 read_unlock(&tasklist_lock);
2593 mutex_unlock(&rt_constraints_mutex);
2594
2595 return err;
2596}
2597
2598int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
2599{
2600 u64 rt_runtime, rt_period;
2601
2602 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2603 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
2604 if (rt_runtime_us < 0)
2605 rt_runtime = RUNTIME_INF;
2606
2607 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2608}
2609
2610long sched_group_rt_runtime(struct task_group *tg)
2611{
2612 u64 rt_runtime_us;
2613
2614 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
2615 return -1;
2616
2617 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
2618 do_div(rt_runtime_us, NSEC_PER_USEC);
2619 return rt_runtime_us;
2620}
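/*
 * Usage sketch (illustrative; cgroup-v1 cpu controller mount point
 * assumed):
 *
 *   # give the "rtgroup" task group 300ms of RT time per 1s period
 *   echo 1000000 > /sys/fs/cgroup/cpu/rtgroup/cpu.rt_period_us
 *   echo 300000  > /sys/fs/cgroup/cpu/rtgroup/cpu.rt_runtime_us
 *
 * Those writes land in the setters in this block; a negative runtime maps
 * to RUNTIME_INF, i.e. no group-local limit.
 */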
2621
2622int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
2623{
2624 u64 rt_runtime, rt_period;
2625
2626 rt_period = rt_period_us * NSEC_PER_USEC;
2627 rt_runtime = tg->rt_bandwidth.rt_runtime;
2628
2629 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2630}
2631
2632long sched_group_rt_period(struct task_group *tg)
2633{
2634 u64 rt_period_us;
2635
2636 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
2637 do_div(rt_period_us, NSEC_PER_USEC);
2638 return rt_period_us;
2639}
2640
2641static int sched_rt_global_constraints(void)
2642{
2643 int ret = 0;
2644
2645 mutex_lock(&rt_constraints_mutex);
2646 read_lock(&tasklist_lock);
2647 ret = __rt_schedulable(NULL, 0, 0);
2648 read_unlock(&tasklist_lock);
2649 mutex_unlock(&rt_constraints_mutex);
2650
2651 return ret;
2652}
2653
2654int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
2655{
2656 /* Don't accept realtime tasks when there is no way for them to run */
2657 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
2658 return 0;
2659
2660 return 1;
2661}
2662
2663#else /* !CONFIG_RT_GROUP_SCHED */
2664static int sched_rt_global_constraints(void)
2665{
2666 unsigned long flags;
2667 int i;
2668
2669 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
2670 for_each_possible_cpu(i) {
2671 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
2672
2673 raw_spin_lock(&rt_rq->rt_runtime_lock);
2674 rt_rq->rt_runtime = global_rt_runtime();
2675 raw_spin_unlock(&rt_rq->rt_runtime_lock);
2676 }
2677 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
2678
2679 return 0;
2680}
2681#endif /* CONFIG_RT_GROUP_SCHED */
2682
2683static int sched_rt_global_validate(void)
2684{
2685 if (sysctl_sched_rt_period <= 0)
2686 return -EINVAL;
2687
2688 if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
2689 (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
2690 return -EINVAL;
2691
2692 return 0;
2693}
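/*
 * Example (kernel defaults, for illustration): sched_rt_period_us is
 * 1000000 and sched_rt_runtime_us is 950000, i.e. RT tasks may consume at
 * most 95% of every 1s period. A non-positive period, or a runtime larger
 * than the period (other than RUNTIME_INF), is rejected here with
 * -EINVAL; writing -1 for the runtime disables RT throttling.
 */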
2694
2695static void sched_rt_do_global(void)
2696{
2697 def_rt_bandwidth.rt_runtime = global_rt_runtime();
2698 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
2699}
2700
2701int sched_rt_handler(struct ctl_table *table, int write,
2702 void __user *buffer, size_t *lenp,
2703 loff_t *ppos)
2704{
2705 int old_period, old_runtime;
2706 static DEFINE_MUTEX(mutex);
2707 int ret;
2708
2709 mutex_lock(&mutex);
2710 old_period = sysctl_sched_rt_period;
2711 old_runtime = sysctl_sched_rt_runtime;
2712
2713 ret = proc_dointvec(table, write, buffer, lenp, ppos);
2714
2715 if (!ret && write) {
2716 ret = sched_rt_global_validate();
2717 if (ret)
2718 goto undo;
2719
2720 ret = sched_dl_global_validate();
2721 if (ret)
2722 goto undo;
2723
2724 ret = sched_rt_global_constraints();
2725 if (ret)
2726 goto undo;
2727
2728 sched_rt_do_global();
2729 sched_dl_do_global();
2730 }
2731 if (0) {
2732undo:
2733 sysctl_sched_rt_period = old_period;
2734 sysctl_sched_rt_runtime = old_runtime;
2735 }
2736 mutex_unlock(&mutex);
2737
2738 return ret;
2739}
2740
2741int sched_rr_handler(struct ctl_table *table, int write,
2742 void __user *buffer, size_t *lenp,
2743 loff_t *ppos)
2744{
2745 int ret;
2746 static DEFINE_MUTEX(mutex);
2747
2748 mutex_lock(&mutex);
2749 ret = proc_dointvec(table, write, buffer, lenp, ppos);
2750 /*
2751 * Make sure that internally we keep jiffies.
2752 * Also, writing zero resets the timeslice to default:
2753 */
2754 if (!ret && write) {
2755 sched_rr_timeslice =
2756 sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
2757 msecs_to_jiffies(sysctl_sched_rr_timeslice);
2758 }
2759 mutex_unlock(&mutex);
2760 return ret;
2761}
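/*
 * Worked example (illustrative): sysctl_sched_rr_timeslice is taken in
 * milliseconds, so with HZ == 250 writing 100 to sched_rr_timeslice_ms
 * yields sched_rr_timeslice = msecs_to_jiffies(100) == 25 ticks, while
 * writing 0 or a negative value falls back to the RR_TIMESLICE default.
 */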
2762
ada18de2
PZ
2763#ifdef CONFIG_SCHED_DEBUG
2764extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
2765
029632fb 2766void print_rt_stats(struct seq_file *m, int cpu)
ada18de2 2767{
ec514c48 2768 rt_rq_iter_t iter;
ada18de2
PZ
2769 struct rt_rq *rt_rq;
2770
2771 rcu_read_lock();
ec514c48 2772 for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
ada18de2
PZ
2773 print_rt_rq(m, cpu, rt_rq);
2774 rcu_read_unlock();
2775}
55e12e5e 2776#endif /* CONFIG_SCHED_DEBUG */