kernel/sched/rt.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
   4  * policies)
   5  */
   6
   7 #include "sched.h"
   8 #include "pelt.h"
   9
  10 int sched_rr_timeslice = RR_TIMESLICE;
  11 /* More than 4 hours if BW_SHIFT equals 20. */
  12 static const u64 max_rt_runtime = MAX_BW;
  13
  14 /*
  15  * period over which we measure -rt task CPU usage in us.
  16  * default: 1s
  17  */
  18 int sysctl_sched_rt_period = 1000000;
  19
  20 /*
  21  * part of the period that we allow rt tasks to run in us.
  22  * default: 0.95s
  23  */
  24 int sysctl_sched_rt_runtime = 950000;
  25
  26 #ifdef CONFIG_SYSCTL
  27 static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ;
  28 static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,
  29                 size_t *lenp, loff_t *ppos);
  30 static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer,
  31                 size_t *lenp, loff_t *ppos);
  32 static const struct ctl_table sched_rt_sysctls[] = {
  33         {
  34                 .procname       = "sched_rt_period_us",
  35                 .data           = &sysctl_sched_rt_period,
  36                 .maxlen         = sizeof(int),
  37                 .mode           = 0644,
  38                 .proc_handler   = sched_rt_handler,
  39                 .extra1         = SYSCTL_ONE,
  40                 .extra2         = SYSCTL_INT_MAX,
  41         },
  42         {
  43                 .procname       = "sched_rt_runtime_us",
  44                 .data           = &sysctl_sched_rt_runtime,
  45                 .maxlen         = sizeof(int),
  46                 .mode           = 0644,
  47                 .proc_handler   = sched_rt_handler,
  48                 .extra1         = SYSCTL_NEG_ONE,
  49                 .extra2         = (void *)&sysctl_sched_rt_period,
  50         },
  51         {
  52                 .procname       = "sched_rr_timeslice_ms",
  53                 .data           = &sysctl_sched_rr_timeslice,
  54                 .maxlen         = sizeof(int),
  55                 .mode           = 0644,
  56                 .proc_handler   = sched_rr_handler,
  57         },
  58 };
  59
  60 static int __init sched_rt_sysctl_init(void)
  61 {
  62         register_sysctl_init("kernel", sched_rt_sysctls);
  63         return 0;
  64 }
  65 late_initcall(sched_rt_sysctl_init);
  66 #endif /* CONFIG_SYSCTL */
  67
  68 void init_rt_rq(struct rt_rq *rt_rq)
  69 {
  70         struct rt_prio_array *array;
  71         int i;
  72
  73         array = &rt_rq->active;
  74         for (i = 0; i < MAX_RT_PRIO; i++) {
  75                 INIT_LIST_HEAD(array->queue + i);
  76                 __clear_bit(i, array->bitmap);
  77         }
  78         /* delimiter for bitsearch: */
  79         __set_bit(MAX_RT_PRIO, array->bitmap);
  80
  81         rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
  82         rt_rq->highest_prio.next = MAX_RT_PRIO-1;
  83         rt_rq->overloaded = 0;
  84         plist_head_init(&rt_rq->pushable_tasks);
  85         /* We start is dequeued state, because no RT tasks are queued */
  86         rt_rq->rt_queued = 0;
  87
  88 #ifdef CONFIG_RT_GROUP_SCHED
  89         rt_rq->rt_time = 0;
  90         rt_rq->rt_throttled = 0;
  91         rt_rq->rt_runtime = 0;
  92         raw_spin_lock_init(&rt_rq->rt_runtime_lock);
  93         rt_rq->tg = &root_task_group;
  94 #endif
  95 }
  96
  97 #ifdef CONFIG_RT_GROUP_SCHED
  98
  99 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
 100
 101 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
 102 {
 103         struct rt_bandwidth *rt_b =
 104                 container_of(timer, struct rt_bandwidth, rt_period_timer);
 105         int idle = 0;
 106         int overrun;
 107
 108         raw_spin_lock(&rt_b->rt_runtime_lock);
 109         for (;;) {
 110                 overrun = hrtimer_forward_now(timer, rt_b->rt_period);
 111                 if (!overrun)
 112                         break;
 113
 114                 raw_spin_unlock(&rt_b->rt_runtime_lock);
 115                 idle = do_sched_rt_period_timer(rt_b, overrun);
 116                 raw_spin_lock(&rt_b->rt_runtime_lock);
 117         }
 118         if (idle)
 119                 rt_b->rt_period_active = 0;
 120         raw_spin_unlock(&rt_b->rt_runtime_lock);
 121
 122         return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
 123 }
 124
 125 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
 126 {
 127         rt_b->rt_period = ns_to_ktime(period);
 128         rt_b->rt_runtime = runtime;
 129
 130         raw_spin_lock_init(&rt_b->rt_runtime_lock);
 131
 132         hrtimer_setup(&rt_b->rt_period_timer, sched_rt_period_timer, CLOCK_MONOTONIC,
 133                       HRTIMER_MODE_REL_HARD);
 134 }
 135
 136 static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
 137 {
 138         raw_spin_lock(&rt_b->rt_runtime_lock);
 139         if (!rt_b->rt_period_active) {
 140                 rt_b->rt_period_active = 1;
 141                 /*
 142                  * SCHED_DEADLINE updates the bandwidth, as a run away
 143                  * RT task with a DL task could hog a CPU. But DL does
 144                  * not reset the period. If a deadline task was running
 145                  * without an RT task running, it can cause RT tasks to
 146                  * throttle when they start up. Kick the timer right away
 147                  * to update the period.
 148                  */
 149                 hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
 150                 hrtimer_start_expires(&rt_b->rt_period_timer,
 151                                       HRTIMER_MODE_ABS_PINNED_HARD);
 152         }
 153         raw_spin_unlock(&rt_b->rt_runtime_lock);
 154 }
 155
 156 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 157 {
 158         if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 159                 return;
 160
 161         do_start_rt_bandwidth(rt_b);
 162 }
 163
 164 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
 165 {
 166         hrtimer_cancel(&rt_b->rt_period_timer);
 167 }
 168
 169 #define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
 170
 171 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
 172 {
 173         WARN_ON_ONCE(!rt_entity_is_task(rt_se));
 174
 175         return container_of(rt_se, struct task_struct, rt);
 176 }
 177
 178 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 179 {
 180         /* Cannot fold with non-CONFIG_RT_GROUP_SCHED version, layout */
 181         WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group);
 182         return rt_rq->rq;
 183 }
 184
 185 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
 186 {
 187         WARN_ON(!rt_group_sched_enabled() && rt_se->rt_rq->tg != &root_task_group);
 188         return rt_se->rt_rq;
 189 }
 190
 191 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
 192 {
 193         struct rt_rq *rt_rq = rt_se->rt_rq;
 194
 195         WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group);
 196         return rt_rq->rq;
 197 }
 198
 199 void unregister_rt_sched_group(struct task_group *tg)
 200 {
 201         if (!rt_group_sched_enabled())
 202                 return;
 203
 204         if (tg->rt_se)
 205                 destroy_rt_bandwidth(&tg->rt_bandwidth);
 206 }
 207
 208 void free_rt_sched_group(struct task_group *tg)
 209 {
 210         int i;
 211
 212         if (!rt_group_sched_enabled())
 213                 return;
 214
 215         for_each_possible_cpu(i) {
 216                 if (tg->rt_rq)
 217                         kfree(tg->rt_rq[i]);
 218                 if (tg->rt_se)
 219                         kfree(tg->rt_se[i]);
 220         }
 221
 222         kfree(tg->rt_rq);
 223         kfree(tg->rt_se);
 224 }
 225
 226 void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
 227                 struct sched_rt_entity *rt_se, int cpu,
 228                 struct sched_rt_entity *parent)
 229 {
 230         struct rq *rq = cpu_rq(cpu);
 231
 232         rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
 233         rt_rq->rt_nr_boosted = 0;
 234         rt_rq->rq = rq;
 235         rt_rq->tg = tg;
 236
 237         tg->rt_rq[cpu] = rt_rq;
 238         tg->rt_se[cpu] = rt_se;
 239
 240         if (!rt_se)
 241                 return;
 242
 243         if (!parent)
 244                 rt_se->rt_rq = &rq->rt;
 245         else
 246                 rt_se->rt_rq = parent->my_q;
 247
 248         rt_se->my_q = rt_rq;
 249         rt_se->parent = parent;
 250         INIT_LIST_HEAD(&rt_se->run_list);
 251 }
 252
 253 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 254 {
 255         struct rt_rq *rt_rq;
 256         struct sched_rt_entity *rt_se;
 257         int i;
 258
 259         if (!rt_group_sched_enabled())
 260                 return 1;
 261
 262         tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
 263         if (!tg->rt_rq)
 264                 goto err;
 265         tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL);
 266         if (!tg->rt_se)
 267                 goto err;
 268
 269         init_rt_bandwidth(&tg->rt_bandwidth, ktime_to_ns(global_rt_period()), 0);
 270
 271         for_each_possible_cpu(i) {
 272                 rt_rq = kzalloc_node(sizeof(struct rt_rq),
 273                                      GFP_KERNEL, cpu_to_node(i));
 274                 if (!rt_rq)
 275                         goto err;
 276
 277                 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
 278                                      GFP_KERNEL, cpu_to_node(i));
 279                 if (!rt_se)
 280                         goto err_free_rq;
 281
 282                 init_rt_rq(rt_rq);
 283                 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
 284                 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
 285         }
 286
 287         return 1;
 288
 289 err_free_rq:
 290         kfree(rt_rq);
 291 err:
 292         return 0;
 293 }
 294
 295 #else /* !CONFIG_RT_GROUP_SCHED: */
 296
 297 #define rt_entity_is_task(rt_se) (1)
 298
 299 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
 300 {
 301         return container_of(rt_se, struct task_struct, rt);
 302 }
 303
 304 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 305 {
 306         return container_of(rt_rq, struct rq, rt);
 307 }
 308
 309 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
 310 {
 311         struct task_struct *p = rt_task_of(rt_se);
 312
 313         return task_rq(p);
 314 }
 315
 316 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
 317 {
 318         struct rq *rq = rq_of_rt_se(rt_se);
 319
 320         return &rq->rt;
 321 }
 322
 323 void unregister_rt_sched_group(struct task_group *tg) { }
 324
 325 void free_rt_sched_group(struct task_group *tg) { }
 326
 327 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 328 {
 329         return 1;
 330 }
 331 #endif /* !CONFIG_RT_GROUP_SCHED */
 332
 333 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 334 {
 335         /* Try to pull RT tasks here if we lower this rq's prio */
 336         return rq->online && rq->rt.highest_prio.curr > prev->prio;
 337 }
 338
 339 static inline int rt_overloaded(struct rq *rq)
 340 {
 341         return atomic_read(&rq->rd->rto_count);
 342 }
 343
 344 static inline void rt_set_overload(struct rq *rq)
 345 {
 346         if (!rq->online)
 347                 return;
 348
 349         cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
 350         /*
 351          * Make sure the mask is visible before we set
 352          * the overload count. That is checked to determine
 353          * if we should look at the mask. It would be a shame
 354          * if we looked at the mask, but the mask was not
 355          * updated yet.
 356          *
 357          * Matched by the barrier in pull_rt_task().
 358          */
 359         smp_wmb();
 360         atomic_inc(&rq->rd->rto_count);
 361 }
 362
 363 static inline void rt_clear_overload(struct rq *rq)
 364 {
 365         if (!rq->online)
 366                 return;
 367
 368         /* the order here really doesn't matter */
 369         atomic_dec(&rq->rd->rto_count);
 370         cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
 371 }
 372
 373 static inline int has_pushable_tasks(struct rq *rq)
 374 {
 375         return !plist_head_empty(&rq->rt.pushable_tasks);
 376 }
 377
 378 static DEFINE_PER_CPU(struct balance_callback, rt_push_head);
 379 static DEFINE_PER_CPU(struct balance_callback, rt_pull_head);
 380
 381 static void push_rt_tasks(struct rq *);
 382 static void pull_rt_task(struct rq *);
 383
 384 static inline void rt_queue_push_tasks(struct rq *rq)
 385 {
 386         if (!has_pushable_tasks(rq))
 387                 return;
 388
 389         queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
 390 }
 391
 392 static inline void rt_queue_pull_task(struct rq *rq)
 393 {
 394         queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
 395 }
 396
 397 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 398 {
 399         plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 400         plist_node_init(&p->pushable_tasks, p->prio);
 401         plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
 402
 403         /* Update the highest prio pushable task */
 404         if (p->prio < rq->rt.highest_prio.next)
 405                 rq->rt.highest_prio.next = p->prio;
 406
 407         if (!rq->rt.overloaded) {
 408                 rt_set_overload(rq);
 409                 rq->rt.overloaded = 1;
 410         }
 411 }
 412
 413 static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 414 {
 415         plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 416
 417         /* Update the new highest prio pushable task */
 418         if (has_pushable_tasks(rq)) {
 419                 p = plist_first_entry(&rq->rt.pushable_tasks,
 420                                       struct task_struct, pushable_tasks);
 421                 rq->rt.highest_prio.next = p->prio;
 422         } else {
 423                 rq->rt.highest_prio.next = MAX_RT_PRIO-1;
 424
 425                 if (rq->rt.overloaded) {
 426                         rt_clear_overload(rq);
 427                         rq->rt.overloaded = 0;
 428                 }
 429         }
 430 }
 431
 432 static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
 433 static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);
 434
 435 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 436 {
 437         return rt_se->on_rq;
 438 }
 439
 440 #ifdef CONFIG_UCLAMP_TASK
 441 /*
 442  * Verify the fitness of task @p to run on @cpu taking into account the uclamp
 443  * settings.
 444  *
 445  * This check is only important for heterogeneous systems where uclamp_min value
 446  * is higher than the capacity of a @cpu. For non-heterogeneous system this
 447  * function will always return true.
 448  *
 449  * The function will return true if the capacity of the @cpu is >= the
 450  * uclamp_min and false otherwise.
 451  *
 452  * Note that uclamp_min will be clamped to uclamp_max if uclamp_min
 453  * > uclamp_max.
 454  */
 455 static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
 456 {
 457         unsigned int min_cap;
 458         unsigned int max_cap;
 459         unsigned int cpu_cap;
 460
 461         /* Only heterogeneous systems can benefit from this check */
 462         if (!sched_asym_cpucap_active())
 463                 return true;
 464
 465         min_cap = uclamp_eff_value(p, UCLAMP_MIN);
 466         max_cap = uclamp_eff_value(p, UCLAMP_MAX);
 467
 468         cpu_cap = arch_scale_cpu_capacity(cpu);
 469
 470         return cpu_cap >= min(min_cap, max_cap);
 471 }
 472 #else /* !CONFIG_UCLAMP_TASK: */
 473 static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
 474 {
 475         return true;
 476 }
 477 #endif /* !CONFIG_UCLAMP_TASK */
 478
 479 #ifdef CONFIG_RT_GROUP_SCHED
 480
 481 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 482 {
 483         return rt_rq->rt_runtime;
 484 }
 485
 486 static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 487 {
 488         return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
 489 }
 490
 491 typedef struct task_group *rt_rq_iter_t;
 492
 493 static inline struct task_group *next_task_group(struct task_group *tg)
 494 {
 495         if (!rt_group_sched_enabled()) {
 496                 WARN_ON(tg != &root_task_group);
 497                 return NULL;
 498         }
 499
 500         do {
 501                 tg = list_entry_rcu(tg->list.next,
 502                         typeof(struct task_group), list);
 503         } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
 504
 505         if (&tg->list == &task_groups)
 506                 tg = NULL;
 507
 508         return tg;
 509 }
 510
 511 #define for_each_rt_rq(rt_rq, iter, rq)                                 \
 512         for (iter = &root_task_group;                                   \
 513                 iter && (rt_rq = iter->rt_rq[cpu_of(rq)]);              \
 514                 iter = next_task_group(iter))
 515
 516 #define for_each_sched_rt_entity(rt_se) \
 517         for (; rt_se; rt_se = rt_se->parent)
 518
 519 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 520 {
 521         return rt_se->my_q;
 522 }
 523
 524 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
 525 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
 526
 527 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 528 {
 529         struct task_struct *donor = rq_of_rt_rq(rt_rq)->donor;
 530         struct rq *rq = rq_of_rt_rq(rt_rq);
 531         struct sched_rt_entity *rt_se;
 532
 533         int cpu = cpu_of(rq);
 534
 535         rt_se = rt_rq->tg->rt_se[cpu];
 536
 537         if (rt_rq->rt_nr_running) {
 538                 if (!rt_se)
 539                         enqueue_top_rt_rq(rt_rq);
 540                 else if (!on_rt_rq(rt_se))
 541                         enqueue_rt_entity(rt_se, 0);
 542
 543                 if (rt_rq->highest_prio.curr < donor->prio)
 544                         resched_curr(rq);
 545         }
 546 }
 547
 548 static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 549 {
 550         struct sched_rt_entity *rt_se;
 551         int cpu = cpu_of(rq_of_rt_rq(rt_rq));
 552
 553         rt_se = rt_rq->tg->rt_se[cpu];
 554
 555         if (!rt_se) {
 556                 dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
 557                 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
 558                 cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
 559         }
 560         else if (on_rt_rq(rt_se))
 561                 dequeue_rt_entity(rt_se, 0);
 562 }
 563
 564 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 565 {
 566         return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
 567 }
 568
 569 static int rt_se_boosted(struct sched_rt_entity *rt_se)
 570 {
 571         struct rt_rq *rt_rq = group_rt_rq(rt_se);
 572         struct task_struct *p;
 573
 574         if (rt_rq)
 575                 return !!rt_rq->rt_nr_boosted;
 576
 577         p = rt_task_of(rt_se);
 578         return p->prio != p->normal_prio;
 579 }
 580
 581 static inline const struct cpumask *sched_rt_period_mask(void)
 582 {
 583         return this_rq()->rd->span;
 584 }
 585
 586 static inline
 587 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 588 {
 589         return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
 590 }
 591
 592 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 593 {
 594         return &rt_rq->tg->rt_bandwidth;
 595 }
 596
 597 bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
 598 {
 599         struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 600
 601         return (hrtimer_active(&rt_b->rt_period_timer) ||
 602                 rt_rq->rt_time < rt_b->rt_runtime);
 603 }
 604
 605 /*
 606  * We ran out of runtime, see if we can borrow some from our neighbours.
 607  */
 608 static void do_balance_runtime(struct rt_rq *rt_rq)
 609 {
 610         struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 611         struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
 612         int i, weight;
 613         u64 rt_period;
 614
 615         weight = cpumask_weight(rd->span);
 616
 617         raw_spin_lock(&rt_b->rt_runtime_lock);
 618         rt_period = ktime_to_ns(rt_b->rt_period);
 619         for_each_cpu(i, rd->span) {
 620                 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 621                 s64 diff;
 622
 623                 if (iter == rt_rq)
 624                         continue;
 625
 626                 raw_spin_lock(&iter->rt_runtime_lock);
 627                 /*
 628                  * Either all rqs have inf runtime and there's nothing to steal
 629                  * or __disable_runtime() below sets a specific rq to inf to
 630                  * indicate its been disabled and disallow stealing.
 631                  */
 632                 if (iter->rt_runtime == RUNTIME_INF)
 633                         goto next;
 634
 635                 /*
 636                  * From runqueues with spare time, take 1/n part of their
 637                  * spare time, but no more than our period.
 638                  */
 639                 diff = iter->rt_runtime - iter->rt_time;
 640                 if (diff > 0) {
 641                         diff = div_u64((u64)diff, weight);
 642                         if (rt_rq->rt_runtime + diff > rt_period)
 643                                 diff = rt_period - rt_rq->rt_runtime;
 644                         iter->rt_runtime -= diff;
 645                         rt_rq->rt_runtime += diff;
 646                         if (rt_rq->rt_runtime == rt_period) {
 647                                 raw_spin_unlock(&iter->rt_runtime_lock);
 648                                 break;
 649                         }
 650                 }
 651 next:
 652                 raw_spin_unlock(&iter->rt_runtime_lock);
 653         }
 654         raw_spin_unlock(&rt_b->rt_runtime_lock);
 655 }
 656
 657 /*
 658  * Ensure this RQ takes back all the runtime it lend to its neighbours.
 659  */
 660 static void __disable_runtime(struct rq *rq)
 661 {
 662         struct root_domain *rd = rq->rd;
 663         rt_rq_iter_t iter;
 664         struct rt_rq *rt_rq;
 665
 666         if (unlikely(!scheduler_running))
 667                 return;
 668
 669         for_each_rt_rq(rt_rq, iter, rq) {
 670                 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 671                 s64 want;
 672                 int i;
 673
 674                 raw_spin_lock(&rt_b->rt_runtime_lock);
 675                 raw_spin_lock(&rt_rq->rt_runtime_lock);
 676                 /*
 677                  * Either we're all inf and nobody needs to borrow, or we're
 678                  * already disabled and thus have nothing to do, or we have
 679                  * exactly the right amount of runtime to take out.
 680                  */
 681                 if (rt_rq->rt_runtime == RUNTIME_INF ||
 682                                 rt_rq->rt_runtime == rt_b->rt_runtime)
 683                         goto balanced;
 684                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
 685
 686                 /*
 687                  * Calculate the difference between what we started out with
 688                  * and what we current have, that's the amount of runtime
 689                  * we lend and now have to reclaim.
 690                  */
 691                 want = rt_b->rt_runtime - rt_rq->rt_runtime;
 692
 693                 /*
 694                  * Greedy reclaim, take back as much as we can.
 695                  */
 696                 for_each_cpu(i, rd->span) {
 697                         struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 698                         s64 diff;
 699
 700                         /*
 701                          * Can't reclaim from ourselves or disabled runqueues.
 702                          */
 703                         if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
 704                                 continue;
 705
 706                         raw_spin_lock(&iter->rt_runtime_lock);
 707                         if (want > 0) {
 708                                 diff = min_t(s64, iter->rt_runtime, want);
 709                                 iter->rt_runtime -= diff;
 710                                 want -= diff;
 711                         } else {
 712                                 iter->rt_runtime -= want;
 713                                 want -= want;
 714                         }
 715                         raw_spin_unlock(&iter->rt_runtime_lock);
 716
 717                         if (!want)
 718                                 break;
 719                 }
 720
 721                 raw_spin_lock(&rt_rq->rt_runtime_lock);
 722                 /*
 723                  * We cannot be left wanting - that would mean some runtime
 724                  * leaked out of the system.
 725                  */
 726                 WARN_ON_ONCE(want);
 727 balanced:
 728                 /*
 729                  * Disable all the borrow logic by pretending we have inf
 730                  * runtime - in which case borrowing doesn't make sense.
 731                  */
 732                 rt_rq->rt_runtime = RUNTIME_INF;
 733                 rt_rq->rt_throttled = 0;
 734                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
 735                 raw_spin_unlock(&rt_b->rt_runtime_lock);
 736
 737                 /* Make rt_rq available for pick_next_task() */
 738                 sched_rt_rq_enqueue(rt_rq);
 739         }
 740 }
 741
 742 static void __enable_runtime(struct rq *rq)
 743 {
 744         rt_rq_iter_t iter;
 745         struct rt_rq *rt_rq;
 746
 747         if (unlikely(!scheduler_running))
 748                 return;
 749
 750         /*
 751          * Reset each runqueue's bandwidth settings
 752          */
 753         for_each_rt_rq(rt_rq, iter, rq) {
 754                 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 755
 756                 raw_spin_lock(&rt_b->rt_runtime_lock);
 757                 raw_spin_lock(&rt_rq->rt_runtime_lock);
 758                 rt_rq->rt_runtime = rt_b->rt_runtime;
 759                 rt_rq->rt_time = 0;
 760                 rt_rq->rt_throttled = 0;
 761                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
 762                 raw_spin_unlock(&rt_b->rt_runtime_lock);
 763         }
 764 }
 765
 766 static void balance_runtime(struct rt_rq *rt_rq)
 767 {
 768         if (!sched_feat(RT_RUNTIME_SHARE))
 769                 return;
 770
 771         if (rt_rq->rt_time > rt_rq->rt_runtime) {
 772                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
 773                 do_balance_runtime(rt_rq);
 774                 raw_spin_lock(&rt_rq->rt_runtime_lock);
 775         }
 776 }
 777
 778 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 779 {
 780         int i, idle = 1, throttled = 0;
 781         const struct cpumask *span;
 782
 783         span = sched_rt_period_mask();
 784
 785         /*
 786          * FIXME: isolated CPUs should really leave the root task group,
 787          * whether they are isolcpus or were isolated via cpusets, lest
 788          * the timer run on a CPU which does not service all runqueues,
 789          * potentially leaving other CPUs indefinitely throttled.  If
 790          * isolation is really required, the user will turn the throttle
 791          * off to kill the perturbations it causes anyway.  Meanwhile,
 792          * this maintains functionality for boot and/or troubleshooting.
 793          */
 794         if (rt_b == &root_task_group.rt_bandwidth)
 795                 span = cpu_online_mask;
 796
 797         for_each_cpu(i, span) {
 798                 int enqueue = 0;
 799                 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
 800                 struct rq *rq = rq_of_rt_rq(rt_rq);
 801                 struct rq_flags rf;
 802                 int skip;
 803
 804                 /*
 805                  * When span == cpu_online_mask, taking each rq->lock
 806                  * can be time-consuming. Try to avoid it when possible.
 807                  */
 808                 raw_spin_lock(&rt_rq->rt_runtime_lock);
 809                 if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
 810                         rt_rq->rt_runtime = rt_b->rt_runtime;
 811                 skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
 812                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
 813                 if (skip)
 814                         continue;
 815
 816                 rq_lock(rq, &rf);
 817                 update_rq_clock(rq);
 818
 819                 if (rt_rq->rt_time) {
 820                         u64 runtime;
 821
 822                         raw_spin_lock(&rt_rq->rt_runtime_lock);
 823                         if (rt_rq->rt_throttled)
 824                                 balance_runtime(rt_rq);
 825                         runtime = rt_rq->rt_runtime;
 826                         rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
 827                         if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
 828                                 rt_rq->rt_throttled = 0;
 829                                 enqueue = 1;
 830
 831                                 /*
 832                                  * When we're idle and a woken (rt) task is
 833                                  * throttled wakeup_preempt() will set
 834                                  * skip_update and the time between the wakeup
 835                                  * and this unthrottle will get accounted as
 836                                  * 'runtime'.
 837                                  */
 838                                 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
 839                                         rq_clock_cancel_skipupdate(rq);
 840                         }
 841                         if (rt_rq->rt_time || rt_rq->rt_nr_running)
 842                                 idle = 0;
 843                         raw_spin_unlock(&rt_rq->rt_runtime_lock);
 844                 } else if (rt_rq->rt_nr_running) {
 845                         idle = 0;
 846                         if (!rt_rq_throttled(rt_rq))
 847                                 enqueue = 1;
 848                 }
 849                 if (rt_rq->rt_throttled)
 850                         throttled = 1;
 851
 852                 if (enqueue)
 853                         sched_rt_rq_enqueue(rt_rq);
 854                 rq_unlock(rq, &rf);
 855         }
 856
 857         if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
 858                 return 1;
 859
 860         return idle;
 861 }
 862
 863 static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 864 {
 865         u64 runtime = sched_rt_runtime(rt_rq);
 866
 867         if (rt_rq->rt_throttled)
 868                 return rt_rq_throttled(rt_rq);
 869
 870         if (runtime >= sched_rt_period(rt_rq))
 871                 return 0;
 872
 873         balance_runtime(rt_rq);
 874         runtime = sched_rt_runtime(rt_rq);
 875         if (runtime == RUNTIME_INF)
 876                 return 0;
 877
 878         if (rt_rq->rt_time > runtime) {
 879                 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 880
 881                 /*
 882                  * Don't actually throttle groups that have no runtime assigned
 883                  * but accrue some time due to boosting.
 884                  */
 885                 if (likely(rt_b->rt_runtime)) {
 886                         rt_rq->rt_throttled = 1;
 887                         printk_deferred_once("sched: RT throttling activated\n");
 888                 } else {
 889                         /*
 890                          * In case we did anyway, make it go away,
 891                          * replenishment is a joke, since it will replenish us
 892                          * with exactly 0 ns.
 893                          */
 894                         rt_rq->rt_time = 0;
 895                 }
 896
 897                 if (rt_rq_throttled(rt_rq)) {
 898                         sched_rt_rq_dequeue(rt_rq);
 899                         return 1;
 900                 }
 901         }
 902
 903         return 0;
 904 }
 905
 906 #else /* !CONFIG_RT_GROUP_SCHED: */
 907
/* !CONFIG_RT_GROUP_SCHED: the iterator type is simply an rt_rq pointer. */
typedef struct rt_rq *rt_rq_iter_t;

/*
 * Visit the one rt_rq embedded in @rq. @iter is evaluated and discarded;
 * the loop terminates by setting @rt_rq to NULL after the first pass.
 */
#define for_each_rt_rq(rt_rq, iter, rq) \
	for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)

/*
 * Run the body once for @rt_se if it is non-NULL. Note that @rt_se is
 * NULL after the loop has run to completion.
 */
#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = NULL)
 915
 916 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 917 {
 918         return NULL;
 919 }
 920
 921 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 922 {
 923         struct rq *rq = rq_of_rt_rq(rt_rq);
 924
 925         if (!rt_rq->rt_nr_running)
 926                 return;
 927
 928         enqueue_top_rt_rq(rt_rq);
 929         resched_curr(rq);
 930 }
 931
 932 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 933 {
 934         dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
 935 }
 936
 937 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 938 {
 939         return false;
 940 }
 941
 942 static inline const struct cpumask *sched_rt_period_mask(void)
 943 {
 944         return cpu_online_mask;
 945 }
 946
 947 static inline
 948 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 949 {
 950         return &cpu_rq(cpu)->rt;
 951 }
 952
 953 static void __enable_runtime(struct rq *rq) { }
 954 static void __disable_runtime(struct rq *rq) { }
 955
 956 #endif /* !CONFIG_RT_GROUP_SCHED */
 957
 958 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 959 {
 960 #ifdef CONFIG_RT_GROUP_SCHED
 961         struct rt_rq *rt_rq = group_rt_rq(rt_se);
 962
 963         if (rt_rq)
 964                 return rt_rq->highest_prio.curr;
 965 #endif
 966
 967         return rt_task_of(rt_se)->prio;
 968 }
 969
 970 /*
 971  * Update the current task's runtime statistics. Skip current tasks that
 972  * are not in our scheduling class.
 973  */
 974 static void update_curr_rt(struct rq *rq)
 975 {
 976         struct task_struct *donor = rq->donor;
 977         s64 delta_exec;
 978
 979         if (donor->sched_class != &rt_sched_class)
 980                 return;
 981
 982         delta_exec = update_curr_common(rq);
 983         if (unlikely(delta_exec <= 0))
 984                 return;
 985
 986 #ifdef CONFIG_RT_GROUP_SCHED
 987         struct sched_rt_entity *rt_se = &donor->rt;
 988
 989         if (!rt_bandwidth_enabled())
 990                 return;
 991
 992         for_each_sched_rt_entity(rt_se) {
 993                 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 994                 int exceeded;
 995
 996                 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
 997                         raw_spin_lock(&rt_rq->rt_runtime_lock);
 998                         rt_rq->rt_time += delta_exec;
 999                         exceeded = sched_rt_runtime_exceeded(rt_rq);
1000                         if (exceeded)
1001                                 resched_curr(rq);
1002                         raw_spin_unlock(&rt_rq->rt_runtime_lock);
1003                         if (exceeded)
1004                                 do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
1005                 }
1006         }
1007 #endif /* CONFIG_RT_GROUP_SCHED */
1008 }
1009
1010 static void
1011 dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count)
1012 {
1013         struct rq *rq = rq_of_rt_rq(rt_rq);
1014
1015         BUG_ON(&rq->rt != rt_rq);
1016
1017         if (!rt_rq->rt_queued)
1018                 return;
1019
1020         BUG_ON(!rq->nr_running);
1021
1022         sub_nr_running(rq, count);
1023         rt_rq->rt_queued = 0;
1024
1025 }
1026
1027 static void
1028 enqueue_top_rt_rq(struct rt_rq *rt_rq)
1029 {
1030         struct rq *rq = rq_of_rt_rq(rt_rq);
1031
1032         BUG_ON(&rq->rt != rt_rq);
1033
1034         if (rt_rq->rt_queued)
1035                 return;
1036
1037         if (rt_rq_throttled(rt_rq))
1038                 return;
1039
1040         if (rt_rq->rt_nr_running) {
1041                 add_nr_running(rq, rt_rq->rt_nr_running);
1042                 rt_rq->rt_queued = 1;
1043         }
1044
1045         /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
1046         cpufreq_update_util(rq, 0);
1047 }
1048
1049 static void
1050 inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1051 {
1052         struct rq *rq = rq_of_rt_rq(rt_rq);
1053
1054         /*
1055          * Change rq's cpupri only if rt_rq is the top queue.
1056          */
1057         if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq)
1058                 return;
1059
1060         if (rq->online && prio < prev_prio)
1061                 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
1062 }
1063
1064 static void
1065 dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1066 {
1067         struct rq *rq = rq_of_rt_rq(rt_rq);
1068
1069         /*
1070          * Change rq's cpupri only if rt_rq is the top queue.
1071          */
1072         if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq)
1073                 return;
1074
1075         if (rq->online && rt_rq->highest_prio.curr != prev_prio)
1076                 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
1077 }
1078
1079 static void
1080 inc_rt_prio(struct rt_rq *rt_rq, int prio)
1081 {
1082         int prev_prio = rt_rq->highest_prio.curr;
1083
1084         if (prio < prev_prio)
1085                 rt_rq->highest_prio.curr = prio;
1086
1087         inc_rt_prio_smp(rt_rq, prio, prev_prio);
1088 }
1089
1090 static void
1091 dec_rt_prio(struct rt_rq *rt_rq, int prio)
1092 {
1093         int prev_prio = rt_rq->highest_prio.curr;
1094
1095         if (rt_rq->rt_nr_running) {
1096
1097                 WARN_ON(prio < prev_prio);
1098
1099                 /*
1100                  * This may have been our highest task, and therefore
1101                  * we may have some re-computation to do
1102                  */
1103                 if (prio == prev_prio) {
1104                         struct rt_prio_array *array = &rt_rq->active;
1105
1106                         rt_rq->highest_prio.curr =
1107                                 sched_find_first_bit(array->bitmap);
1108                 }
1109
1110         } else {
1111                 rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
1112         }
1113
1114         dec_rt_prio_smp(rt_rq, prio, prev_prio);
1115 }
1116
1117 #ifdef CONFIG_RT_GROUP_SCHED
1118
1119 static void
1120 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1121 {
1122         if (rt_se_boosted(rt_se))
1123                 rt_rq->rt_nr_boosted++;
1124
1125         start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
1126 }
1127
1128 static void
1129 dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1130 {
1131         if (rt_se_boosted(rt_se))
1132                 rt_rq->rt_nr_boosted--;
1133
1134         WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
1135 }
1136
1137 #else /* !CONFIG_RT_GROUP_SCHED: */
1138
/* !CONFIG_RT_GROUP_SCHED variant: no group accounting, nothing to do. */
static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}
1143
/* !CONFIG_RT_GROUP_SCHED variant: no group accounting, nothing to do. */
static inline
void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}
1146
1147 #endif /* !CONFIG_RT_GROUP_SCHED */
1148
1149 static inline
1150 unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1151 {
1152         struct rt_rq *group_rq = group_rt_rq(rt_se);
1153
1154         if (group_rq)
1155                 return group_rq->rt_nr_running;
1156         else
1157                 return 1;
1158 }
1159
1160 static inline
1161 unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
1162 {
1163         struct rt_rq *group_rq = group_rt_rq(rt_se);
1164         struct task_struct *tsk;
1165
1166         if (group_rq)
1167                 return group_rq->rr_nr_running;
1168
1169         tsk = rt_task_of(rt_se);
1170
1171         return (tsk->policy == SCHED_RR) ? 1 : 0;
1172 }
1173
1174 static inline
1175 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1176 {
1177         int prio = rt_se_prio(rt_se);
1178
1179         WARN_ON(!rt_prio(prio));
1180         rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1181         rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
1182
1183         inc_rt_prio(rt_rq, prio);
1184         inc_rt_group(rt_se, rt_rq);
1185 }
1186
1187 static inline
1188 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1189 {
1190         WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1191         WARN_ON(!rt_rq->rt_nr_running);
1192         rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1193         rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
1194
1195         dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1196         dec_rt_group(rt_se, rt_rq);
1197 }
1198
1199 /*
1200  * Change rt_se->run_list location unless SAVE && !MOVE
1201  *
1202  * assumes ENQUEUE/DEQUEUE flags match
1203  */
1204 static inline bool move_entity(unsigned int flags)
1205 {
1206         if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
1207                 return false;
1208
1209         return true;
1210 }
1211
1212 static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
1213 {
1214         list_del_init(&rt_se->run_list);
1215
1216         if (list_empty(array->queue + rt_se_prio(rt_se)))
1217                 __clear_bit(rt_se_prio(rt_se), array->bitmap);
1218
1219         rt_se->on_list = 0;
1220 }
1221
1222 static inline struct sched_statistics *
1223 __schedstats_from_rt_se(struct sched_rt_entity *rt_se)
1224 {
1225         /* schedstats is not supported for rt group. */
1226         if (!rt_entity_is_task(rt_se))
1227                 return NULL;
1228
1229         return &rt_task_of(rt_se)->stats;
1230 }
1231
1232 static inline void
1233 update_stats_wait_start_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
1234 {
1235         struct sched_statistics *stats;
1236         struct task_struct *p = NULL;
1237
1238         if (!schedstat_enabled())
1239                 return;
1240
1241         if (rt_entity_is_task(rt_se))
1242                 p = rt_task_of(rt_se);
1243
1244         stats = __schedstats_from_rt_se(rt_se);
1245         if (!stats)
1246                 return;
1247
1248         __update_stats_wait_start(rq_of_rt_rq(rt_rq), p, stats);
1249 }
1250
1251 static inline void
1252 update_stats_enqueue_sleeper_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
1253 {
1254         struct sched_statistics *stats;
1255         struct task_struct *p = NULL;
1256
1257         if (!schedstat_enabled())
1258                 return;
1259
1260         if (rt_entity_is_task(rt_se))
1261                 p = rt_task_of(rt_se);
1262
1263         stats = __schedstats_from_rt_se(rt_se);
1264         if (!stats)
1265                 return;
1266
1267         __update_stats_enqueue_sleeper(rq_of_rt_rq(rt_rq), p, stats);
1268 }
1269
1270 static inline void
1271 update_stats_enqueue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
1272                         int flags)
1273 {
1274         if (!schedstat_enabled())
1275                 return;
1276
1277         if (flags & ENQUEUE_WAKEUP)
1278                 update_stats_enqueue_sleeper_rt(rt_rq, rt_se);
1279 }
1280
1281 static inline void
1282 update_stats_wait_end_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
1283 {
1284         struct sched_statistics *stats;
1285         struct task_struct *p = NULL;
1286
1287         if (!schedstat_enabled())
1288                 return;
1289
1290         if (rt_entity_is_task(rt_se))
1291                 p = rt_task_of(rt_se);
1292
1293         stats = __schedstats_from_rt_se(rt_se);
1294         if (!stats)
1295                 return;
1296
1297         __update_stats_wait_end(rq_of_rt_rq(rt_rq), p, stats);
1298 }
1299
1300 static inline void
1301 update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
1302                         int flags)
1303 {
1304         struct task_struct *p = NULL;
1305
1306         if (!schedstat_enabled())
1307                 return;
1308
1309         if (rt_entity_is_task(rt_se))
1310                 p = rt_task_of(rt_se);
1311
1312         if ((flags & DEQUEUE_SLEEP) && p) {
1313                 unsigned int state;
1314
1315                 state = READ_ONCE(p->__state);
1316                 if (state & TASK_INTERRUPTIBLE)
1317                         __schedstat_set(p->stats.sleep_start,
1318                                         rq_clock(rq_of_rt_rq(rt_rq)));
1319
1320                 if (state & TASK_UNINTERRUPTIBLE)
1321                         __schedstat_set(p->stats.block_start,
1322                                         rq_clock(rq_of_rt_rq(rt_rq)));
1323         }
1324 }
1325
1326 static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1327 {
1328         struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1329         struct rt_prio_array *array = &rt_rq->active;
1330         struct rt_rq *group_rq = group_rt_rq(rt_se);
1331         struct list_head *queue = array->queue + rt_se_prio(rt_se);
1332
1333         /*
1334          * Don't enqueue the group if its throttled, or when empty.
1335          * The latter is a consequence of the former when a child group
1336          * get throttled and the current group doesn't have any other
1337          * active members.
1338          */
1339         if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
1340                 if (rt_se->on_list)
1341                         __delist_rt_entity(rt_se, array);
1342                 return;
1343         }
1344
1345         if (move_entity(flags)) {
1346                 WARN_ON_ONCE(rt_se->on_list);
1347                 if (flags & ENQUEUE_HEAD)
1348                         list_add(&rt_se->run_list, queue);
1349                 else
1350                         list_add_tail(&rt_se->run_list, queue);
1351
1352                 __set_bit(rt_se_prio(rt_se), array->bitmap);
1353                 rt_se->on_list = 1;
1354         }
1355         rt_se->on_rq = 1;
1356
1357         inc_rt_tasks(rt_se, rt_rq);
1358 }
1359
1360 static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1361 {
1362         struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1363         struct rt_prio_array *array = &rt_rq->active;
1364
1365         if (move_entity(flags)) {
1366                 WARN_ON_ONCE(!rt_se->on_list);
1367                 __delist_rt_entity(rt_se, array);
1368         }
1369         rt_se->on_rq = 0;
1370
1371         dec_rt_tasks(rt_se, rt_rq);
1372 }
1373
1374 /*
1375  * Because the prio of an upper entry depends on the lower
1376  * entries, we must remove entries top - down.
1377  */
1378 static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
1379 {
1380         struct sched_rt_entity *back = NULL;
1381         unsigned int rt_nr_running;
1382
1383         for_each_sched_rt_entity(rt_se) {
1384                 rt_se->back = back;
1385                 back = rt_se;
1386         }
1387
1388         rt_nr_running = rt_rq_of_se(back)->rt_nr_running;
1389
1390         for (rt_se = back; rt_se; rt_se = rt_se->back) {
1391                 if (on_rt_rq(rt_se))
1392                         __dequeue_rt_entity(rt_se, flags);
1393         }
1394
1395         dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
1396 }
1397
1398 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1399 {
1400         struct rq *rq = rq_of_rt_se(rt_se);
1401
1402         update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags);
1403
1404         dequeue_rt_stack(rt_se, flags);
1405         for_each_sched_rt_entity(rt_se)
1406                 __enqueue_rt_entity(rt_se, flags);
1407         enqueue_top_rt_rq(&rq->rt);
1408 }
1409
1410 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1411 {
1412         struct rq *rq = rq_of_rt_se(rt_se);
1413
1414         update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags);
1415
1416         dequeue_rt_stack(rt_se, flags);
1417
1418         for_each_sched_rt_entity(rt_se) {
1419                 struct rt_rq *rt_rq = group_rt_rq(rt_se);
1420
1421                 if (rt_rq && rt_rq->rt_nr_running)
1422                         __enqueue_rt_entity(rt_se, flags);
1423         }
1424         enqueue_top_rt_rq(&rq->rt);
1425 }
1426
1427 /*
1428  * Adding/removing a task to/from a priority array:
1429  */
1430 static void
1431 enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1432 {
1433         struct sched_rt_entity *rt_se = &p->rt;
1434
1435         if (flags & ENQUEUE_WAKEUP)
1436                 rt_se->timeout = 0;
1437
1438         check_schedstat_required();
1439         update_stats_wait_start_rt(rt_rq_of_se(rt_se), rt_se);
1440
1441         enqueue_rt_entity(rt_se, flags);
1442
1443         if (task_is_blocked(p))
1444                 return;
1445
1446         if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1447                 enqueue_pushable_task(rq, p);
1448 }
1449
1450 static bool dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1451 {
1452         struct sched_rt_entity *rt_se = &p->rt;
1453
1454         update_curr_rt(rq);
1455         dequeue_rt_entity(rt_se, flags);
1456
1457         dequeue_pushable_task(rq, p);
1458
1459         return true;
1460 }
1461
1462 /*
1463  * Put task to the head or the end of the run list without the overhead of
1464  * dequeue followed by enqueue.
1465  */
1466 static void
1467 requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
1468 {
1469         if (on_rt_rq(rt_se)) {
1470                 struct rt_prio_array *array = &rt_rq->active;
1471                 struct list_head *queue = array->queue + rt_se_prio(rt_se);
1472
1473                 if (head)
1474                         list_move(&rt_se->run_list, queue);
1475                 else
1476                         list_move_tail(&rt_se->run_list, queue);
1477         }
1478 }
1479
1480 static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
1481 {
1482         struct sched_rt_entity *rt_se = &p->rt;
1483         struct rt_rq *rt_rq;
1484
1485         for_each_sched_rt_entity(rt_se) {
1486                 rt_rq = rt_rq_of_se(rt_se);
1487                 requeue_rt_entity(rt_rq, rt_se, head);
1488         }
1489 }
1490
1491 static void yield_task_rt(struct rq *rq)
1492 {
1493         requeue_task_rt(rq, rq->curr, 0);
1494 }
1495
1496 static int find_lowest_rq(struct task_struct *task);
1497
1498 static int
1499 select_task_rq_rt(struct task_struct *p, int cpu, int flags)
1500 {
1501         struct task_struct *curr, *donor;
1502         struct rq *rq;
1503         bool test;
1504
1505         /* For anything but wake ups, just return the task_cpu */
1506         if (!(flags & (WF_TTWU | WF_FORK)))
1507                 goto out;
1508
1509         rq = cpu_rq(cpu);
1510
1511         rcu_read_lock();
1512         curr = READ_ONCE(rq->curr); /* unlocked access */
1513         donor = READ_ONCE(rq->donor);
1514
1515         /*
1516          * If the current task on @p's runqueue is an RT task, then
1517          * try to see if we can wake this RT task up on another
1518          * runqueue. Otherwise simply start this RT task
1519          * on its current runqueue.
1520          *
1521          * We want to avoid overloading runqueues. If the woken
1522          * task is a higher priority, then it will stay on this CPU
1523          * and the lower prio task should be moved to another CPU.
1524          * Even though this will probably make the lower prio task
1525          * lose its cache, we do not want to bounce a higher task
1526          * around just because it gave up its CPU, perhaps for a
1527          * lock?
1528          *
1529          * For equal prio tasks, we just let the scheduler sort it out.
1530          *
1531          * Otherwise, just let it ride on the affine RQ and the
1532          * post-schedule router will push the preempted task away
1533          *
1534          * This test is optimistic, if we get it wrong the load-balancer
1535          * will have to sort it out.
1536          *
1537          * We take into account the capacity of the CPU to ensure it fits the
1538          * requirement of the task - which is only important on heterogeneous
1539          * systems like big.LITTLE.
1540          */
1541         test = curr &&
1542                unlikely(rt_task(donor)) &&
1543                (curr->nr_cpus_allowed < 2 || donor->prio <= p->prio);
1544
1545         if (test || !rt_task_fits_capacity(p, cpu)) {
1546                 int target = find_lowest_rq(p);
1547
1548                 /*
1549                  * Bail out if we were forcing a migration to find a better
1550                  * fitting CPU but our search failed.
1551                  */
1552                 if (!test && target != -1 && !rt_task_fits_capacity(p, target))
1553                         goto out_unlock;
1554
1555                 /*
1556                  * Don't bother moving it if the destination CPU is
1557                  * not running a lower priority task.
1558                  */
1559                 if (target != -1 &&
1560                     p->prio < cpu_rq(target)->rt.highest_prio.curr)
1561                         cpu = target;
1562         }
1563
1564 out_unlock:
1565         rcu_read_unlock();
1566
1567 out:
1568         return cpu;
1569 }
1570
1571 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1572 {
1573         if (rq->curr->nr_cpus_allowed == 1 ||
1574             !cpupri_find(&rq->rd->cpupri, rq->donor, NULL))
1575                 return;
1576
1577         /*
1578          * p is migratable, so let's not schedule it and
1579          * see if it is pushed or pulled somewhere else.
1580          */
1581         if (p->nr_cpus_allowed != 1 &&
1582             cpupri_find(&rq->rd->cpupri, p, NULL))
1583                 return;
1584
1585         /*
1586          * There appear to be other CPUs that can accept
1587          * the current task but none can run 'p', so lets reschedule
1588          * to try and push the current task away:
1589          */
1590         requeue_task_rt(rq, p, 1);
1591         resched_curr(rq);
1592 }
1593
1594 static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
1595 {
1596         if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
1597                 /*
1598                  * This is OK, because current is on_cpu, which avoids it being
1599                  * picked for load-balance and preemption/IRQs are still
1600                  * disabled avoiding further scheduler activity on it and we've
1601                  * not yet started the picking loop.
1602                  */
1603                 rq_unpin_lock(rq, rf);
1604                 pull_rt_task(rq);
1605                 rq_repin_lock(rq, rf);
1606         }
1607
1608         return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
1609 }
1610
1611 /*
1612  * Preempt the current task with a newly woken task if needed:
1613  */
1614 static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
1615 {
1616         struct task_struct *donor = rq->donor;
1617
1618         if (p->prio < donor->prio) {
1619                 resched_curr(rq);
1620                 return;
1621         }
1622
1623         /*
1624          * If:
1625          *
1626          * - the newly woken task is of equal priority to the current task
1627          * - the newly woken task is non-migratable while current is migratable
1628          * - current will be preempted on the next reschedule
1629          *
1630          * we should check to see if current can readily move to a different
1631          * cpu.  If so, we will reschedule to allow the push logic to try
1632          * to move current somewhere else, making room for our non-migratable
1633          * task.
1634          */
1635         if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr))
1636                 check_preempt_equal_prio(rq, p);
1637 }
1638
1639 static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
1640 {
1641         struct sched_rt_entity *rt_se = &p->rt;
1642         struct rt_rq *rt_rq = &rq->rt;
1643
1644         p->se.exec_start = rq_clock_task(rq);
1645         if (on_rt_rq(&p->rt))
1646                 update_stats_wait_end_rt(rt_rq, rt_se);
1647
1648         /* The running task is never eligible for pushing */
1649         dequeue_pushable_task(rq, p);
1650
1651         if (!first)
1652                 return;
1653
1654         /*
1655          * If prev task was rt, put_prev_task() has already updated the
1656          * utilization. We only care of the case where we start to schedule a
1657          * rt task
1658          */
1659         if (rq->donor->sched_class != &rt_sched_class)
1660                 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
1661
1662         rt_queue_push_tasks(rq);
1663 }
1664
1665 static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
1666 {
1667         struct rt_prio_array *array = &rt_rq->active;
1668         struct sched_rt_entity *next = NULL;
1669         struct list_head *queue;
1670         int idx;
1671
1672         idx = sched_find_first_bit(array->bitmap);
1673         BUG_ON(idx >= MAX_RT_PRIO);
1674
1675         queue = array->queue + idx;
1676         if (WARN_ON_ONCE(list_empty(queue)))
1677                 return NULL;
1678         next = list_entry(queue->next, struct sched_rt_entity, run_list);
1679
1680         return next;
1681 }
1682
1683 static struct task_struct *_pick_next_task_rt(struct rq *rq)
1684 {
1685         struct sched_rt_entity *rt_se;
1686         struct rt_rq *rt_rq  = &rq->rt;
1687
1688         do {
1689                 rt_se = pick_next_rt_entity(rt_rq);
1690                 if (unlikely(!rt_se))
1691                         return NULL;
1692                 rt_rq = group_rt_rq(rt_se);
1693         } while (rt_rq);
1694
1695         return rt_task_of(rt_se);
1696 }
1697
1698 static struct task_struct *pick_task_rt(struct rq *rq)
1699 {
1700         struct task_struct *p;
1701
1702         if (!sched_rt_runnable(rq))
1703                 return NULL;
1704
1705         p = _pick_next_task_rt(rq);
1706
1707         return p;
1708 }
1709
1710 static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_struct *next)
1711 {
1712         struct sched_rt_entity *rt_se = &p->rt;
1713         struct rt_rq *rt_rq = &rq->rt;
1714
1715         if (on_rt_rq(&p->rt))
1716                 update_stats_wait_start_rt(rt_rq, rt_se);
1717
1718         update_curr_rt(rq);
1719
1720         update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
1721
1722         if (task_is_blocked(p))
1723                 return;
1724         /*
1725          * The previous task needs to be made eligible for pushing
1726          * if it is still active
1727          */
1728         if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1729                 enqueue_pushable_task(rq, p);
1730 }
1731
/*
 * Only try algorithms three times: upper bound on the attempts made by
 * retry loops such as the one in find_lock_lowest_rq().
 */
#define RT_MAX_TRIES 3
1734
1735 /*
1736  * Return the highest pushable rq's task, which is suitable to be executed
1737  * on the CPU, NULL otherwise
1738  */
1739 static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1740 {
1741         struct plist_head *head = &rq->rt.pushable_tasks;
1742         struct task_struct *p;
1743
1744         if (!has_pushable_tasks(rq))
1745                 return NULL;
1746
1747         plist_for_each_entry(p, head, pushable_tasks) {
1748                 if (task_is_pushable(rq, p, cpu))
1749                         return p;
1750         }
1751
1752         return NULL;
1753 }
1754
/* Per-CPU cpumask; find_lowest_rq() uses this CPU's instance as its lowest_mask. */
static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1756
1757 static int find_lowest_rq(struct task_struct *task)
1758 {
1759         struct sched_domain *sd;
1760         struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
1761         int this_cpu = smp_processor_id();
1762         int cpu      = task_cpu(task);
1763         int ret;
1764
1765         /* Make sure the mask is initialized first */
1766         if (unlikely(!lowest_mask))
1767                 return -1;
1768
1769         if (task->nr_cpus_allowed == 1)
1770                 return -1; /* No other targets possible */
1771
1772         /*
1773          * If we're on asym system ensure we consider the different capacities
1774          * of the CPUs when searching for the lowest_mask.
1775          */
1776         if (sched_asym_cpucap_active()) {
1777
1778                 ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri,
1779                                           task, lowest_mask,
1780                                           rt_task_fits_capacity);
1781         } else {
1782
1783                 ret = cpupri_find(&task_rq(task)->rd->cpupri,
1784                                   task, lowest_mask);
1785         }
1786
1787         if (!ret)
1788                 return -1; /* No targets found */
1789
1790         /*
1791          * At this point we have built a mask of CPUs representing the
1792          * lowest priority tasks in the system.  Now we want to elect
1793          * the best one based on our affinity and topology.
1794          *
1795          * We prioritize the last CPU that the task executed on since
1796          * it is most likely cache-hot in that location.
1797          */
1798         if (cpumask_test_cpu(cpu, lowest_mask))
1799                 return cpu;
1800
1801         /*
1802          * Otherwise, we consult the sched_domains span maps to figure
1803          * out which CPU is logically closest to our hot cache data.
1804          */
1805         if (!cpumask_test_cpu(this_cpu, lowest_mask))
1806                 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1807
1808         rcu_read_lock();
1809         for_each_domain(cpu, sd) {
1810                 if (sd->flags & SD_WAKE_AFFINE) {
1811                         int best_cpu;
1812
1813                         /*
1814                          * "this_cpu" is cheaper to preempt than a
1815                          * remote processor.
1816                          */
1817                         if (this_cpu != -1 &&
1818                             cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1819                                 rcu_read_unlock();
1820                                 return this_cpu;
1821                         }
1822
1823                         best_cpu = cpumask_any_and_distribute(lowest_mask,
1824                                                               sched_domain_span(sd));
1825                         if (best_cpu < nr_cpu_ids) {
1826                                 rcu_read_unlock();
1827                                 return best_cpu;
1828                         }
1829                 }
1830         }
1831         rcu_read_unlock();
1832
1833         /*
1834          * And finally, if there were no matches within the domains
1835          * just give the caller *something* to work with from the compatible
1836          * locations.
1837          */
1838         if (this_cpu != -1)
1839                 return this_cpu;
1840
1841         cpu = cpumask_any_distribute(lowest_mask);
1842         if (cpu < nr_cpu_ids)
1843                 return cpu;
1844
1845         return -1;
1846 }
1847
1848 static struct task_struct *pick_next_pushable_task(struct rq *rq)
1849 {
1850         struct task_struct *p;
1851
1852         if (!has_pushable_tasks(rq))
1853                 return NULL;
1854
1855         p = plist_first_entry(&rq->rt.pushable_tasks,
1856                               struct task_struct, pushable_tasks);
1857
1858         BUG_ON(rq->cpu != task_cpu(p));
1859         BUG_ON(task_current(rq, p));
1860         BUG_ON(task_current_donor(rq, p));
1861         BUG_ON(p->nr_cpus_allowed <= 1);
1862
1863         BUG_ON(!task_on_rq_queued(p));
1864         BUG_ON(!rt_task(p));
1865
1866         return p;
1867 }
1868
1869 /* Will lock the rq it finds */
1870 static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1871 {
1872         struct rq *lowest_rq = NULL;
1873         int tries;
1874         int cpu;
1875
1876         for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1877                 cpu = find_lowest_rq(task);
1878
1879                 if ((cpu == -1) || (cpu == rq->cpu))
1880                         break;
1881
1882                 lowest_rq = cpu_rq(cpu);
1883
1884                 if (lowest_rq->rt.highest_prio.curr <= task->prio) {
1885                         /*
1886                          * Target rq has tasks of equal or higher priority,
1887                          * retrying does not release any lock and is unlikely
1888                          * to yield a different result.
1889                          */
1890                         lowest_rq = NULL;
1891                         break;
1892                 }
1893
1894                 /* if the prio of this runqueue changed, try again */
1895                 if (double_lock_balance(rq, lowest_rq)) {
1896                         /*
1897                          * We had to unlock the run queue. In
1898                          * the mean time, task could have
1899                          * migrated already or had its affinity changed,
1900                          * therefore check if the task is still at the
1901                          * head of the pushable tasks list.
1902                          * It is possible the task was scheduled, set
1903                          * "migrate_disabled" and then got preempted, so we must
1904                          * check the task migration disable flag here too.
1905                          */
1906                         if (unlikely(is_migration_disabled(task) ||
1907                                      !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
1908                                      task != pick_next_pushable_task(rq))) {
1909
1910                                 double_unlock_balance(rq, lowest_rq);
1911                                 lowest_rq = NULL;
1912                                 break;
1913                         }
1914                 }
1915
1916                 /* If this rq is still suitable use it. */
1917                 if (lowest_rq->rt.highest_prio.curr > task->prio)
1918                         break;
1919
1920                 /* try again */
1921                 double_unlock_balance(rq, lowest_rq);
1922                 lowest_rq = NULL;
1923         }
1924
1925         return lowest_rq;
1926 }
1927
1928 /*
1929  * If the current CPU has more than one RT task, see if the non
1930  * running task can migrate over to a CPU that is running a task
1931  * of lesser priority.
1932  */
1933 static int push_rt_task(struct rq *rq, bool pull)
1934 {
1935         struct task_struct *next_task;
1936         struct rq *lowest_rq;
1937         int ret = 0;
1938
1939         if (!rq->rt.overloaded)
1940                 return 0;
1941
1942         next_task = pick_next_pushable_task(rq);
1943         if (!next_task)
1944                 return 0;
1945
1946 retry:
1947         /*
1948          * It's possible that the next_task slipped in of
1949          * higher priority than current. If that's the case
1950          * just reschedule current.
1951          */
1952         if (unlikely(next_task->prio < rq->donor->prio)) {
1953                 resched_curr(rq);
1954                 return 0;
1955         }
1956
1957         if (is_migration_disabled(next_task)) {
1958                 struct task_struct *push_task = NULL;
1959                 int cpu;
1960
1961                 if (!pull || rq->push_busy)
1962                         return 0;
1963
1964                 /*
1965                  * Invoking find_lowest_rq() on anything but an RT task doesn't
1966                  * make sense. Per the above priority check, curr has to
1967                  * be of higher priority than next_task, so no need to
1968                  * reschedule when bailing out.
1969                  *
1970                  * Note that the stoppers are masqueraded as SCHED_FIFO
1971                  * (cf. sched_set_stop_task()), so we can't rely on rt_task().
1972                  */
1973                 if (rq->donor->sched_class != &rt_sched_class)
1974                         return 0;
1975
1976                 cpu = find_lowest_rq(rq->curr);
1977                 if (cpu == -1 || cpu == rq->cpu)
1978                         return 0;
1979
1980                 /*
1981                  * Given we found a CPU with lower priority than @next_task,
1982                  * therefore it should be running. However we cannot migrate it
1983                  * to this other CPU, instead attempt to push the current
1984                  * running task on this CPU away.
1985                  */
1986                 push_task = get_push_task(rq);
1987                 if (push_task) {
1988                         preempt_disable();
1989                         raw_spin_rq_unlock(rq);
1990                         stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
1991                                             push_task, &rq->push_work);
1992                         preempt_enable();
1993                         raw_spin_rq_lock(rq);
1994                 }
1995
1996                 return 0;
1997         }
1998
1999         if (WARN_ON(next_task == rq->curr))
2000                 return 0;
2001
2002         /* We might release rq lock */
2003         get_task_struct(next_task);
2004
2005         /* find_lock_lowest_rq locks the rq if found */
2006         lowest_rq = find_lock_lowest_rq(next_task, rq);
2007         if (!lowest_rq) {
2008                 struct task_struct *task;
2009                 /*
2010                  * find_lock_lowest_rq releases rq->lock
2011                  * so it is possible that next_task has migrated.
2012                  *
2013                  * We need to make sure that the task is still on the same
2014                  * run-queue and is also still the next task eligible for
2015                  * pushing.
2016                  */
2017                 task = pick_next_pushable_task(rq);
2018                 if (task == next_task) {
2019                         /*
2020                          * The task hasn't migrated, and is still the next
2021                          * eligible task, but we failed to find a run-queue
2022                          * to push it to.  Do not retry in this case, since
2023                          * other CPUs will pull from us when ready.
2024                          */
2025                         goto out;
2026                 }
2027
2028                 if (!task)
2029                         /* No more tasks, just exit */
2030                         goto out;
2031
2032                 /*
2033                  * Something has shifted, try again.
2034                  */
2035                 put_task_struct(next_task);
2036                 next_task = task;
2037                 goto retry;
2038         }
2039
2040         move_queued_task_locked(rq, lowest_rq, next_task);
2041         resched_curr(lowest_rq);
2042         ret = 1;
2043
2044         double_unlock_balance(rq, lowest_rq);
2045 out:
2046         put_task_struct(next_task);
2047
2048         return ret;
2049 }
2050
2051 static void push_rt_tasks(struct rq *rq)
2052 {
2053         /* push_rt_task will return true if it moved an RT */
2054         while (push_rt_task(rq, false))
2055                 ;
2056 }
2057
2058 #ifdef HAVE_RT_PUSH_IPI
2059
2060 /*
2061  * When a high priority task schedules out from a CPU and a lower priority
2062  * task is scheduled in, a check is made to see if there's any RT tasks
2063  * on other CPUs that are waiting to run because a higher priority RT task
2064  * is currently running on its CPU. In this case, the CPU with multiple RT
2065  * tasks queued on it (overloaded) needs to be notified that a CPU has opened
2066  * up that may be able to run one of its non-running queued RT tasks.
2067  *
2068  * All CPUs with overloaded RT tasks need to be notified as there is currently
2069  * no way to know which of these CPUs have the highest priority task waiting
2070  * to run. Instead of trying to take a spinlock on each of these CPUs,
2071  * which has been shown to cause large latency when done on machines with
2072  * many CPUs, an IPI is sent to the CPUs to have them push off the overloaded
2073  * RT tasks waiting to run.
2074  *
2075  * Just sending an IPI to each of the CPUs is also an issue, as on large
2076  * count CPU machines, this can cause an IPI storm on a CPU, especially
2077  * if it's the only CPU with multiple RT tasks queued, and a large number
2078  * of CPUs scheduling a lower priority task at the same time.
2079  *
2080  * Each root domain has its own IRQ work function that can iterate over
2081  * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
2082  * task must be checked if there's one or many CPUs that are lowering
2083  * their priority, there's a single IRQ work iterator that will try to
2084  * push off RT tasks that are waiting to run.
2085  *
2086  * When a CPU schedules a lower priority task, it will kick off the
2087  * IRQ work iterator that will jump to each CPU with overloaded RT tasks.
2088  * As it only takes the first CPU that schedules a lower priority task
2089  * to start the process, rto_loop_start is atomically flipped from 0 to 1,
2090  * and only the CPU that wins the exchange will try to take the rto_lock.
2091  * This prevents high contention on the lock as the process handles all
2092  * CPUs scheduling lower priority tasks.
2093  *
2094  * All CPUs that are scheduling a lower priority task will increment the
2095  * rto_loop_next variable. This will make sure that the IRQ work iterator
2096  * checks all RT overloaded CPUs whenever a CPU schedules a new lower
2097  * priority task, even if the iterator is in the middle of a scan. Incrementing
2098  * the rto_loop_next will cause the iterator to perform another scan.
2099  *
2100  */
2101 static int rto_next_cpu(struct root_domain *rd)
2102 {
2103         int next;
2104         int cpu;
2105
2106         /*
2107          * When starting the IPI RT pushing, the rto_cpu is set to -1,
2108          * rt_next_cpu() will simply return the first CPU found in
2109          * the rto_mask.
2110          *
2111          * If rto_next_cpu() is called with rto_cpu is a valid CPU, it
2112          * will return the next CPU found in the rto_mask.
2113          *
2114          * If there are no more CPUs left in the rto_mask, then a check is made
2115          * against rto_loop and rto_loop_next. rto_loop is only updated with
2116          * the rto_lock held, but any CPU may increment the rto_loop_next
2117          * without any locking.
2118          */
2119         for (;;) {
2120
2121                 /* When rto_cpu is -1 this acts like cpumask_first() */
2122                 cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
2123
2124                 rd->rto_cpu = cpu;
2125
2126                 if (cpu < nr_cpu_ids)
2127                         return cpu;
2128
2129                 rd->rto_cpu = -1;
2130
2131                 /*
2132                  * ACQUIRE ensures we see the @rto_mask changes
2133                  * made prior to the @next value observed.
2134                  *
2135                  * Matches WMB in rt_set_overload().
2136                  */
2137                 next = atomic_read_acquire(&rd->rto_loop_next);
2138
2139                 if (rd->rto_loop == next)
2140                         break;
2141
2142                 rd->rto_loop = next;
2143         }
2144
2145         return -1;
2146 }
2147
2148 static inline bool rto_start_trylock(atomic_t *v)
2149 {
2150         return !atomic_cmpxchg_acquire(v, 0, 1);
2151 }
2152
2153 static inline void rto_start_unlock(atomic_t *v)
2154 {
2155         atomic_set_release(v, 0);
2156 }
2157
2158 static void tell_cpu_to_push(struct rq *rq)
2159 {
2160         int cpu = -1;
2161
2162         /* Keep the loop going if the IPI is currently active */
2163         atomic_inc(&rq->rd->rto_loop_next);
2164
2165         /* Only one CPU can initiate a loop at a time */
2166         if (!rto_start_trylock(&rq->rd->rto_loop_start))
2167                 return;
2168
2169         raw_spin_lock(&rq->rd->rto_lock);
2170
2171         /*
2172          * The rto_cpu is updated under the lock, if it has a valid CPU
2173          * then the IPI is still running and will continue due to the
2174          * update to loop_next, and nothing needs to be done here.
2175          * Otherwise it is finishing up and an IPI needs to be sent.
2176          */
2177         if (rq->rd->rto_cpu < 0)
2178                 cpu = rto_next_cpu(rq->rd);
2179
2180         raw_spin_unlock(&rq->rd->rto_lock);
2181
2182         rto_start_unlock(&rq->rd->rto_loop_start);
2183
2184         if (cpu >= 0) {
2185                 /* Make sure the rd does not get freed while pushing */
2186                 sched_get_rd(rq->rd);
2187                 irq_work_queue_on(&rq->rd->rto_push_work, cpu);
2188         }
2189 }
2190
2191 /* Called from hardirq context */
2192 void rto_push_irq_work_func(struct irq_work *work)
2193 {
2194         struct root_domain *rd =
2195                 container_of(work, struct root_domain, rto_push_work);
2196         struct rq *rq;
2197         int cpu;
2198
2199         rq = this_rq();
2200
2201         /*
2202          * We do not need to grab the lock to check for has_pushable_tasks.
2203          * When it gets updated, a check is made if a push is possible.
2204          */
2205         if (has_pushable_tasks(rq)) {
2206                 raw_spin_rq_lock(rq);
2207                 while (push_rt_task(rq, true))
2208                         ;
2209                 raw_spin_rq_unlock(rq);
2210         }
2211
2212         raw_spin_lock(&rd->rto_lock);
2213
2214         /* Pass the IPI to the next rt overloaded queue */
2215         cpu = rto_next_cpu(rd);
2216
2217         raw_spin_unlock(&rd->rto_lock);
2218
2219         if (cpu < 0) {
2220                 sched_put_rd(rd);
2221                 return;
2222         }
2223
2224         /* Try the next RT overloaded CPU */
2225         irq_work_queue_on(&rd->rto_push_work, cpu);
2226 }
2227 #endif /* HAVE_RT_PUSH_IPI */
2228
2229 static void pull_rt_task(struct rq *this_rq)
2230 {
2231         int this_cpu = this_rq->cpu, cpu;
2232         bool resched = false;
2233         struct task_struct *p, *push_task;
2234         struct rq *src_rq;
2235         int rt_overload_count = rt_overloaded(this_rq);
2236
2237         if (likely(!rt_overload_count))
2238                 return;
2239
2240         /*
2241          * Match the barrier from rt_set_overloaded; this guarantees that if we
2242          * see overloaded we must also see the rto_mask bit.
2243          */
2244         smp_rmb();
2245
2246         /* If we are the only overloaded CPU do nothing */
2247         if (rt_overload_count == 1 &&
2248             cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
2249                 return;
2250
2251 #ifdef HAVE_RT_PUSH_IPI
2252         if (sched_feat(RT_PUSH_IPI)) {
2253                 tell_cpu_to_push(this_rq);
2254                 return;
2255         }
2256 #endif
2257
2258         for_each_cpu(cpu, this_rq->rd->rto_mask) {
2259                 if (this_cpu == cpu)
2260                         continue;
2261
2262                 src_rq = cpu_rq(cpu);
2263
2264                 /*
2265                  * Don't bother taking the src_rq->lock if the next highest
2266                  * task is known to be lower-priority than our current task.
2267                  * This may look racy, but if this value is about to go
2268                  * logically higher, the src_rq will push this task away.
2269                  * And if its going logically lower, we do not care
2270                  */
2271                 if (src_rq->rt.highest_prio.next >=
2272                     this_rq->rt.highest_prio.curr)
2273                         continue;
2274
2275                 /*
2276                  * We can potentially drop this_rq's lock in
2277                  * double_lock_balance, and another CPU could
2278                  * alter this_rq
2279                  */
2280                 push_task = NULL;
2281                 double_lock_balance(this_rq, src_rq);
2282
2283                 /*
2284                  * We can pull only a task, which is pushable
2285                  * on its rq, and no others.
2286                  */
2287                 p = pick_highest_pushable_task(src_rq, this_cpu);
2288
2289                 /*
2290                  * Do we have an RT task that preempts
2291                  * the to-be-scheduled task?
2292                  */
2293                 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
2294                         WARN_ON(p == src_rq->curr);
2295                         WARN_ON(!task_on_rq_queued(p));
2296
2297                         /*
2298                          * There's a chance that p is higher in priority
2299                          * than what's currently running on its CPU.
2300                          * This is just that p is waking up and hasn't
2301                          * had a chance to schedule. We only pull
2302                          * p if it is lower in priority than the
2303                          * current task on the run queue
2304                          */
2305                         if (p->prio < src_rq->donor->prio)
2306                                 goto skip;
2307
2308                         if (is_migration_disabled(p)) {
2309                                 push_task = get_push_task(src_rq);
2310                         } else {
2311                                 move_queued_task_locked(src_rq, this_rq, p);
2312                                 resched = true;
2313                         }
2314                         /*
2315                          * We continue with the search, just in
2316                          * case there's an even higher prio task
2317                          * in another runqueue. (low likelihood
2318                          * but possible)
2319                          */
2320                 }
2321 skip:
2322                 double_unlock_balance(this_rq, src_rq);
2323
2324                 if (push_task) {
2325                         preempt_disable();
2326                         raw_spin_rq_unlock(this_rq);
2327                         stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
2328                                             push_task, &src_rq->push_work);
2329                         preempt_enable();
2330                         raw_spin_rq_lock(this_rq);
2331                 }
2332         }
2333
2334         if (resched)
2335                 resched_curr(this_rq);
2336 }
2337
2338 /*
2339  * If we are not running and we are not going to reschedule soon, we should
2340  * try to push tasks away now
2341  */
2342 static void task_woken_rt(struct rq *rq, struct task_struct *p)
2343 {
2344         bool need_to_push = !task_on_cpu(rq, p) &&
2345                             !test_tsk_need_resched(rq->curr) &&
2346                             p->nr_cpus_allowed > 1 &&
2347                             (dl_task(rq->donor) || rt_task(rq->donor)) &&
2348                             (rq->curr->nr_cpus_allowed < 2 ||
2349                              rq->donor->prio <= p->prio);
2350
2351         if (need_to_push)
2352                 push_rt_tasks(rq);
2353 }
2354
2355 /* Assumes rq->lock is held */
2356 static void rq_online_rt(struct rq *rq)
2357 {
2358         if (rq->rt.overloaded)
2359                 rt_set_overload(rq);
2360
2361         __enable_runtime(rq);
2362
2363         cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
2364 }
2365
2366 /* Assumes rq->lock is held */
2367 static void rq_offline_rt(struct rq *rq)
2368 {
2369         if (rq->rt.overloaded)
2370                 rt_clear_overload(rq);
2371
2372         __disable_runtime(rq);
2373
2374         cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
2375 }
2376
2377 /*
2378  * When switch from the rt queue, we bring ourselves to a position
2379  * that we might want to pull RT tasks from other runqueues.
2380  */
2381 static void switched_from_rt(struct rq *rq, struct task_struct *p)
2382 {
2383         /*
2384          * If there are other RT tasks then we will reschedule
2385          * and the scheduling of the other RT tasks will handle
2386          * the balancing. But if we are the last RT task
2387          * we may need to handle the pulling of RT tasks
2388          * now.
2389          */
2390         if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
2391                 return;
2392
2393         rt_queue_pull_task(rq);
2394 }
2395
2396 void __init init_sched_rt_class(void)
2397 {
2398         unsigned int i;
2399
2400         for_each_possible_cpu(i) {
2401                 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
2402                                         GFP_KERNEL, cpu_to_node(i));
2403         }
2404 }
2405
2406 /*
2407  * When switching a task to RT, we may overload the runqueue
2408  * with RT tasks. In this case we try to push them off to
2409  * other runqueues.
2410  */
2411 static void switched_to_rt(struct rq *rq, struct task_struct *p)
2412 {
2413         /*
2414          * If we are running, update the avg_rt tracking, as the running time
2415          * will now on be accounted into the latter.
2416          */
2417         if (task_current(rq, p)) {
2418                 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
2419                 return;
2420         }
2421
2422         /*
2423          * If we are not running we may need to preempt the current
2424          * running task. If that current running task is also an RT task
2425          * then see if we can move to another run queue.
2426          */
2427         if (task_on_rq_queued(p)) {
2428                 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
2429                         rt_queue_push_tasks(rq);
2430                 if (p->prio < rq->donor->prio && cpu_online(cpu_of(rq)))
2431                         resched_curr(rq);
2432         }
2433 }
2434
2435 /*
2436  * Priority of the task has changed. This may cause
2437  * us to initiate a push or pull.
2438  */
2439 static void
2440 prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2441 {
2442         if (!task_on_rq_queued(p))
2443                 return;
2444
2445         if (task_current_donor(rq, p)) {
2446                 /*
2447                  * If our priority decreases while running, we
2448                  * may need to pull tasks to this runqueue.
2449                  */
2450                 if (oldprio < p->prio)
2451                         rt_queue_pull_task(rq);
2452
2453                 /*
2454                  * If there's a higher priority task waiting to run
2455                  * then reschedule.
2456                  */
2457                 if (p->prio > rq->rt.highest_prio.curr)
2458                         resched_curr(rq);
2459         } else {
2460                 /*
2461                  * This task is not running, but if it is
2462                  * greater than the current running task
2463                  * then reschedule.
2464                  */
2465                 if (p->prio < rq->donor->prio)
2466                         resched_curr(rq);
2467         }
2468 }
2469
#ifdef CONFIG_POSIX_TIMERS
/*
 * Per-tick RLIMIT_RTTIME bookkeeping for @p.
 *
 * p->rt.timeout is bumped at most once per jiffy. Once it exceeds the
 * smaller of the soft and hard limit, scaled by USEC_PER_SEC/HZ, the POSIX
 * CPU timer code is notified. Nothing is done while the soft limit is
 * RLIM_INFINITY.
 */
static void watchdog(struct rq *rq, struct task_struct *p)
{
	unsigned long soft, hard, limit;

	/*
	 * The hard limit may change after the soft limit was read; the
	 * next tick sorts that out.
	 */
	soft = task_rlimit(p, RLIMIT_RTTIME);
	hard = task_rlimit_max(p, RLIMIT_RTTIME);

	if (soft == RLIM_INFINITY)
		return;

	/* Count each jiffy only once. */
	if (p->rt.watchdog_stamp != jiffies) {
		p->rt.timeout++;
		p->rt.watchdog_stamp = jiffies;
	}

	limit = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
	if (p->rt.timeout > limit)
		posix_cputimers_rt_watchdog(&p->posix_cputimers,
					    p->se.sum_exec_runtime);
}
#else /* !CONFIG_POSIX_TIMERS: */
/* Without POSIX timers there is nothing to watch. */
static inline void watchdog(struct rq *rq, struct task_struct *p) { }
#endif /* !CONFIG_POSIX_TIMERS */
2497
2498 /*
2499  * scheduler tick hitting a task of our scheduling class.
2500  *
2501  * NOTE: This function can be called remotely by the tick offload that
2502  * goes along full dynticks. Therefore no local assumption can be made
2503  * and everything must be accessed through the @rq and @curr passed in
2504  * parameters.
2505  */
2506 static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2507 {
2508         struct sched_rt_entity *rt_se = &p->rt;
2509
2510         update_curr_rt(rq);
2511         update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
2512
2513         watchdog(rq, p);
2514
2515         /*
2516          * RR tasks need a special form of time-slice management.
2517          * FIFO tasks have no timeslices.
2518          */
2519         if (p->policy != SCHED_RR)
2520                 return;
2521
2522         if (--p->rt.time_slice)
2523                 return;
2524
2525         p->rt.time_slice = sched_rr_timeslice;
2526
2527         /*
2528          * Requeue to the end of queue if we (and all of our ancestors) are not
2529          * the only element on the queue
2530          */
2531         for_each_sched_rt_entity(rt_se) {
2532                 if (rt_se->run_list.prev != rt_se->run_list.next) {
2533                         requeue_task_rt(rq, p, 0);
2534                         resched_curr(rq);
2535                         return;
2536                 }
2537         }
2538 }
2539
2540 static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2541 {
2542         /*
2543          * Time slice is 0 for SCHED_FIFO tasks
2544          */
2545         if (task->policy == SCHED_RR)
2546                 return sched_rr_timeslice;
2547         else
2548                 return 0;
2549 }
2550
2551 #ifdef CONFIG_SCHED_CORE
2552 static int task_is_throttled_rt(struct task_struct *p, int cpu)
2553 {
2554         struct rt_rq *rt_rq;
2555
2556 #ifdef CONFIG_RT_GROUP_SCHED // XXX maybe add task_rt_rq(), see also sched_rt_period_rt_rq
2557         rt_rq = task_group(p)->rt_rq[cpu];
2558         WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group);
2559 #else
2560         rt_rq = &cpu_rq(cpu)->rt;
2561 #endif
2562
2563         return rt_rq_throttled(rt_rq);
2564 }
2565 #endif /* CONFIG_SCHED_CORE */
2566
2567 DEFINE_SCHED_CLASS(rt) = {
2568
2569         .enqueue_task           = enqueue_task_rt,
2570         .dequeue_task           = dequeue_task_rt,
2571         .yield_task             = yield_task_rt,
2572
2573         .wakeup_preempt         = wakeup_preempt_rt,
2574
2575         .pick_task              = pick_task_rt,
2576         .put_prev_task          = put_prev_task_rt,
2577         .set_next_task          = set_next_task_rt,
2578
2579         .balance                = balance_rt,
2580         .select_task_rq         = select_task_rq_rt,
2581         .set_cpus_allowed       = set_cpus_allowed_common,
2582         .rq_online              = rq_online_rt,
2583         .rq_offline             = rq_offline_rt,
2584         .task_woken             = task_woken_rt,
2585         .switched_from          = switched_from_rt,
2586         .find_lock_rq           = find_lock_lowest_rq,
2587
2588         .task_tick              = task_tick_rt,
2589
2590         .get_rr_interval        = get_rr_interval_rt,
2591
2592         .prio_changed           = prio_changed_rt,
2593         .switched_to            = switched_to_rt,
2594
2595         .update_curr            = update_curr_rt,
2596
2597 #ifdef CONFIG_SCHED_CORE
2598         .task_is_throttled      = task_is_throttled_rt,
2599 #endif
2600
2601 #ifdef CONFIG_UCLAMP_TASK
2602         .uclamp_enabled         = 1,
2603 #endif
2604 };
2605
2606 #ifdef CONFIG_RT_GROUP_SCHED
/*
 * Ensure that the real time constraints are schedulable.
 *
 * NOTE(review): the lock/unlock sites of this mutex lie outside this
 * excerpt; presumably it serializes changes to the RT bandwidth settings
 * that the schedulability checks below validate - confirm.
 */
static DEFINE_MUTEX(rt_constraints_mutex);
2611
2612 static inline int tg_has_rt_tasks(struct task_group *tg)
2613 {
2614         struct task_struct *task;
2615         struct css_task_iter it;
2616         int ret = 0;
2617
2618         /*
2619          * Autogroups do not have RT tasks; see autogroup_create().
2620          */
2621         if (task_group_is_autogroup(tg))
2622                 return 0;
2623
2624         css_task_iter_start(&tg->css, 0, &it);
2625         while (!ret && (task = css_task_iter_next(&it)))
2626                 ret |= rt_task(task);
2627         css_task_iter_end(&it);
2628
2629         return ret;
2630 }
2631
2632 struct rt_schedulable_data {
2633         struct task_group *tg;
2634         u64 rt_period;
2635         u64 rt_runtime;
2636 };
2637
2638 static int tg_rt_schedulable(struct task_group *tg, void *data)
2639 {
2640         struct rt_schedulable_data *d = data;
2641         struct task_group *child;
2642         unsigned long total, sum = 0;
2643         u64 period, runtime;
2644
2645         period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2646         runtime = tg->rt_bandwidth.rt_runtime;
2647
2648         if (tg == d->tg) {
2649                 period = d->rt_period;
2650                 runtime = d->rt_runtime;
2651         }
2652
2653         /*
2654          * Cannot have more runtime than the period.
2655          */
2656         if (runtime > period && runtime != RUNTIME_INF)
2657                 return -EINVAL;
2658
2659         /*
2660          * Ensure we don't starve existing RT tasks if runtime turns zero.
2661          */
2662         if (rt_bandwidth_enabled() && !runtime &&
2663             tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
2664                 return -EBUSY;
2665
2666         if (WARN_ON(!rt_group_sched_enabled() && tg != &root_task_group))
2667                 return -EBUSY;
2668
2669         total = to_ratio(period, runtime);
2670
2671         /*
2672          * Nobody can have more than the global setting allows.
2673          */
2674         if (total > to_ratio(global_rt_period(), global_rt_runtime()))
2675                 return -EINVAL;
2676
2677         /*
2678          * The sum of our children's runtime should not exceed our own.
2679          */
2680         list_for_each_entry_rcu(child, &tg->children, siblings) {
2681                 period = ktime_to_ns(child->rt_bandwidth.rt_period);
2682                 runtime = child->rt_bandwidth.rt_runtime;
2683
2684                 if (child == d->tg) {
2685                         period = d->rt_period;
2686                         runtime = d->rt_runtime;
2687                 }
2688
2689                 sum += to_ratio(period, runtime);
2690         }
2691
2692         if (sum > total)
2693                 return -EINVAL;
2694
2695         return 0;
2696 }
2697
2698 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
2699 {
2700         int ret;
2701
2702         struct rt_schedulable_data data = {
2703                 .tg = tg,
2704                 .rt_period = period,
2705                 .rt_runtime = runtime,
2706         };
2707
2708         rcu_read_lock();
2709         ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
2710         rcu_read_unlock();
2711
2712         return ret;
2713 }
2714
2715 static int tg_set_rt_bandwidth(struct task_group *tg,
2716                 u64 rt_period, u64 rt_runtime)
2717 {
2718         int i, err = 0;
2719
2720         /*
2721          * Disallowing the root group RT runtime is BAD, it would disallow the
2722          * kernel creating (and or operating) RT threads.
2723          */
2724         if (tg == &root_task_group && rt_runtime == 0)
2725                 return -EINVAL;
2726
2727         /* No period doesn't make any sense. */
2728         if (rt_period == 0)
2729                 return -EINVAL;
2730
2731         /*
2732          * Bound quota to defend quota against overflow during bandwidth shift.
2733          */
2734         if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
2735                 return -EINVAL;
2736
2737         mutex_lock(&rt_constraints_mutex);
2738         err = __rt_schedulable(tg, rt_period, rt_runtime);
2739         if (err)
2740                 goto unlock;
2741
2742         raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2743         tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
2744         tg->rt_bandwidth.rt_runtime = rt_runtime;
2745
2746         for_each_possible_cpu(i) {
2747                 struct rt_rq *rt_rq = tg->rt_rq[i];
2748
2749                 raw_spin_lock(&rt_rq->rt_runtime_lock);
2750                 rt_rq->rt_runtime = rt_runtime;
2751                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
2752         }
2753         raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2754 unlock:
2755         mutex_unlock(&rt_constraints_mutex);
2756
2757         return err;
2758 }
2759
2760 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
2761 {
2762         u64 rt_runtime, rt_period;
2763
2764         rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2765         rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
2766         if (rt_runtime_us < 0)
2767                 rt_runtime = RUNTIME_INF;
2768         else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
2769                 return -EINVAL;
2770
2771         return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2772 }
2773
2774 long sched_group_rt_runtime(struct task_group *tg)
2775 {
2776         u64 rt_runtime_us;
2777
2778         if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
2779                 return -1;
2780
2781         rt_runtime_us = tg->rt_bandwidth.rt_runtime;
2782         do_div(rt_runtime_us, NSEC_PER_USEC);
2783         return rt_runtime_us;
2784 }
2785
2786 int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
2787 {
2788         u64 rt_runtime, rt_period;
2789
2790         if (rt_period_us > U64_MAX / NSEC_PER_USEC)
2791                 return -EINVAL;
2792
2793         rt_period = rt_period_us * NSEC_PER_USEC;
2794         rt_runtime = tg->rt_bandwidth.rt_runtime;
2795
2796         return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2797 }
2798
2799 long sched_group_rt_period(struct task_group *tg)
2800 {
2801         u64 rt_period_us;
2802
2803         rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
2804         do_div(rt_period_us, NSEC_PER_USEC);
2805         return rt_period_us;
2806 }
2807
2808 #ifdef CONFIG_SYSCTL
2809 static int sched_rt_global_constraints(void)
2810 {
2811         int ret = 0;
2812
2813         mutex_lock(&rt_constraints_mutex);
2814         ret = __rt_schedulable(NULL, 0, 0);
2815         mutex_unlock(&rt_constraints_mutex);
2816
2817         return ret;
2818 }
2819 #endif /* CONFIG_SYSCTL */
2820
2821 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
2822 {
2823         /* Don't accept real-time tasks when there is no way for them to run */
2824         if (rt_group_sched_enabled() && rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
2825                 return 0;
2826
2827         return 1;
2828 }
2829
2830 #else /* !CONFIG_RT_GROUP_SCHED: */
2831
2832 #ifdef CONFIG_SYSCTL
/*
 * !CONFIG_RT_GROUP_SCHED variant: no per-group check is performed here,
 * so the call from sched_rt_handler() always succeeds.
 */
static int sched_rt_global_constraints(void)
{
        return 0;
}
2837 #endif /* CONFIG_SYSCTL */
2838 #endif /* !CONFIG_RT_GROUP_SCHED */
2839
2840 #ifdef CONFIG_SYSCTL
2841 static int sched_rt_global_validate(void)
2842 {
2843         if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
2844                 ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
2845                  ((u64)sysctl_sched_rt_runtime *
2846                         NSEC_PER_USEC > max_rt_runtime)))
2847                 return -EINVAL;
2848
2849         return 0;
2850 }
2851
/*
 * Called by sched_rt_handler() once a write has passed all validation
 * steps. The body is empty: there is nothing further to apply here.
 */
static void sched_rt_do_global(void)
{
}
2855
2856 static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,
2857                 size_t *lenp, loff_t *ppos)
2858 {
2859         int old_period, old_runtime;
2860         static DEFINE_MUTEX(mutex);
2861         int ret;
2862
2863         mutex_lock(&mutex);
2864         sched_domains_mutex_lock();
2865         old_period = sysctl_sched_rt_period;
2866         old_runtime = sysctl_sched_rt_runtime;
2867
2868         ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2869
2870         if (!ret && write) {
2871                 ret = sched_rt_global_validate();
2872                 if (ret)
2873                         goto undo;
2874
2875                 ret = sched_dl_global_validate();
2876                 if (ret)
2877                         goto undo;
2878
2879                 ret = sched_rt_global_constraints();
2880                 if (ret)
2881                         goto undo;
2882
2883                 sched_rt_do_global();
2884                 sched_dl_do_global();
2885         }
2886         if (0) {
2887 undo:
2888                 sysctl_sched_rt_period = old_period;
2889                 sysctl_sched_rt_runtime = old_runtime;
2890         }
2891         sched_domains_mutex_unlock();
2892         mutex_unlock(&mutex);
2893
2894         /*
2895          * After changing maximum available bandwidth for DEADLINE, we need to
2896          * recompute per root domain and per cpus variables accordingly.
2897          */
2898         rebuild_sched_domains();
2899
2900         return ret;
2901 }
2902
2903 static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer,
2904                 size_t *lenp, loff_t *ppos)
2905 {
2906         int ret;
2907         static DEFINE_MUTEX(mutex);
2908
2909         mutex_lock(&mutex);
2910         ret = proc_dointvec(table, write, buffer, lenp, ppos);
2911         /*
2912          * Make sure that internally we keep jiffies.
2913          * Also, writing zero resets the time-slice to default:
2914          */
2915         if (!ret && write) {
2916                 sched_rr_timeslice =
2917                         sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
2918                         msecs_to_jiffies(sysctl_sched_rr_timeslice);
2919
2920                 if (sysctl_sched_rr_timeslice <= 0)
2921                         sysctl_sched_rr_timeslice = jiffies_to_msecs(RR_TIMESLICE);
2922         }
2923         mutex_unlock(&mutex);
2924
2925         return ret;
2926 }
2927 #endif /* CONFIG_SYSCTL */
2928
2929 void print_rt_stats(struct seq_file *m, int cpu)
2930 {
2931         rt_rq_iter_t iter;
2932         struct rt_rq *rt_rq;
2933
2934         rcu_read_lock();
2935         for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
2936                 print_rt_rq(m, cpu, rt_rq);
2937         rcu_read_unlock();
2938 }