kernel/sched/fair.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
   4  *
   5  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
   6  *
   7  *  Interactivity improvements by Mike Galbraith
   8  *  (C) 2007 Mike Galbraith <efault@gmx.de>
   9  *
  10  *  Various enhancements by Dmitry Adamushko.
  11  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
  12  *
  13  *  Group scheduling enhancements by Srivatsa Vaddagiri
  14  *  Copyright IBM Corporation, 2007
  15  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  16  *
  17  *  Scaled math optimizations by Thomas Gleixner
  18  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
  19  *
  20  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
  21  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
  22  */
  23 #include <linux/energy_model.h>
  24 #include <linux/mmap_lock.h>
  25 #include <linux/hugetlb_inline.h>
  26 #include <linux/jiffies.h>
  27 #include <linux/mm_api.h>
  28 #include <linux/highmem.h>
  29 #include <linux/spinlock_api.h>
  30 #include <linux/cpumask_api.h>
  31 #include <linux/lockdep_api.h>
  32 #include <linux/softirq.h>
  33 #include <linux/refcount_api.h>
  34 #include <linux/topology.h>
  35 #include <linux/sched/clock.h>
  36 #include <linux/sched/cond_resched.h>
  37 #include <linux/sched/cputime.h>
  38 #include <linux/sched/isolation.h>
  39 #include <linux/sched/nohz.h>
  40
  41 #include <linux/cpuidle.h>
  42 #include <linux/interrupt.h>
  43 #include <linux/memory-tiers.h>
  44 #include <linux/mempolicy.h>
  45 #include <linux/mutex_api.h>
  46 #include <linux/profile.h>
  47 #include <linux/psi.h>
  48 #include <linux/ratelimit.h>
  49 #include <linux/task_work.h>
  50 #include <linux/rbtree_augmented.h>
  51
  52 #include <asm/switch_to.h>
  53
  54 #include "sched.h"
  55 #include "stats.h"
  56 #include "autogroup.h"
  57
  58 /*
  59  * The initial- and re-scaling of tunables is configurable
  60  *
  61  * Options are:
  62  *
  63  *   SCHED_TUNABLESCALING_NONE - unscaled, always *1
  64  *   SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
  65  *   SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
  66  *
  67  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
  68  */
  69 unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
  70
  71 /*
  72  * Minimal preemption granularity for CPU-bound tasks:
  73  *
  74  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  75  */
  76 unsigned int sysctl_sched_base_slice                    = 750000ULL;
  77 static unsigned int normalized_sysctl_sched_base_slice  = 750000ULL;
  78
  79 const_debug unsigned int sysctl_sched_migration_cost    = 500000UL;
  80
  81 int sched_thermal_decay_shift;
  82 static int __init setup_sched_thermal_decay_shift(char *str)
  83 {
  84         int _shift = 0;
  85
  86         if (kstrtoint(str, 0, &_shift))
  87                 pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
  88
  89         sched_thermal_decay_shift = clamp(_shift, 0, 10);
  90         return 1;
  91 }
  92 __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
  93
  94 #ifdef CONFIG_SMP
  95 /*
  96  * For asym packing, by default the lower numbered CPU has higher priority.
  97  */
  98 int __weak arch_asym_cpu_priority(int cpu)
  99 {
 100         return -cpu;
 101 }
 102
 103 /*
 104  * The margin used when comparing utilization with CPU capacity.
 105  *
 106  * (default: ~20%)
 107  */
 108 #define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
 109
 110 /*
 111  * The margin used when comparing CPU capacities.
 112  * is 'cap1' noticeably greater than 'cap2'
 113  *
 114  * (default: ~5%)
 115  */
 116 #define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
 117 #endif
 118
 119 #ifdef CONFIG_CFS_BANDWIDTH
 120 /*
 121  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 122  * each time a cfs_rq requests quota.
 123  *
 124  * Note: in the case that the slice exceeds the runtime remaining (either due
 125  * to consumption or the quota being specified to be smaller than the slice)
 126  * we will always only issue the remaining available time.
 127  *
 128  * (default: 5 msec, units: microseconds)
 129  */
 130 static unsigned int sysctl_sched_cfs_bandwidth_slice            = 5000UL;
 131 #endif
 132
 133 #ifdef CONFIG_NUMA_BALANCING
 134 /* Restrict the NUMA promotion throughput (MB/s) for each target node. */
 135 static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
 136 #endif
 137
 138 #ifdef CONFIG_SYSCTL
 139 static struct ctl_table sched_fair_sysctls[] = {
 140 #ifdef CONFIG_CFS_BANDWIDTH
 141         {
 142                 .procname       = "sched_cfs_bandwidth_slice_us",
 143                 .data           = &sysctl_sched_cfs_bandwidth_slice,
 144                 .maxlen         = sizeof(unsigned int),
 145                 .mode           = 0644,
 146                 .proc_handler   = proc_dointvec_minmax,
 147                 .extra1         = SYSCTL_ONE,
 148         },
 149 #endif
 150 #ifdef CONFIG_NUMA_BALANCING
 151         {
 152                 .procname       = "numa_balancing_promote_rate_limit_MBps",
 153                 .data           = &sysctl_numa_balancing_promote_rate_limit,
 154                 .maxlen         = sizeof(unsigned int),
 155                 .mode           = 0644,
 156                 .proc_handler   = proc_dointvec_minmax,
 157                 .extra1         = SYSCTL_ZERO,
 158         },
 159 #endif /* CONFIG_NUMA_BALANCING */
 160         {}
 161 };
 162
 163 static int __init sched_fair_sysctl_init(void)
 164 {
 165         register_sysctl_init("kernel", sched_fair_sysctls);
 166         return 0;
 167 }
 168 late_initcall(sched_fair_sysctl_init);
 169 #endif
 170
 171 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 172 {
 173         lw->weight += inc;
 174         lw->inv_weight = 0;
 175 }
 176
 177 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
 178 {
 179         lw->weight -= dec;
 180         lw->inv_weight = 0;
 181 }
 182
 183 static inline void update_load_set(struct load_weight *lw, unsigned long w)
 184 {
 185         lw->weight = w;
 186         lw->inv_weight = 0;
 187 }
 188
 189 /*
 190  * Increase the granularity value when there are more CPUs,
 191  * because with more CPUs the 'effective latency' as visible
 192  * to users decreases. But the relationship is not linear,
 193  * so pick a second-best guess by going with the log2 of the
 194  * number of CPUs.
 195  *
 196  * This idea comes from the SD scheduler of Con Kolivas:
 197  */
 198 static unsigned int get_update_sysctl_factor(void)
 199 {
 200         unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
 201         unsigned int factor;
 202
 203         switch (sysctl_sched_tunable_scaling) {
 204         case SCHED_TUNABLESCALING_NONE:
 205                 factor = 1;
 206                 break;
 207         case SCHED_TUNABLESCALING_LINEAR:
 208                 factor = cpus;
 209                 break;
 210         case SCHED_TUNABLESCALING_LOG:
 211         default:
 212                 factor = 1 + ilog2(cpus);
 213                 break;
 214         }
 215
 216         return factor;
 217 }
 218
 219 static void update_sysctl(void)
 220 {
 221         unsigned int factor = get_update_sysctl_factor();
 222
 223 #define SET_SYSCTL(name) \
 224         (sysctl_##name = (factor) * normalized_sysctl_##name)
 225         SET_SYSCTL(sched_base_slice);
 226 #undef SET_SYSCTL
 227 }
 228
 229 void __init sched_init_granularity(void)
 230 {
 231         update_sysctl();
 232 }
 233
 234 #define WMULT_CONST     (~0U)
 235 #define WMULT_SHIFT     32
 236
 237 static void __update_inv_weight(struct load_weight *lw)
 238 {
 239         unsigned long w;
 240
 241         if (likely(lw->inv_weight))
 242                 return;
 243
 244         w = scale_load_down(lw->weight);
 245
 246         if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
 247                 lw->inv_weight = 1;
 248         else if (unlikely(!w))
 249                 lw->inv_weight = WMULT_CONST;
 250         else
 251                 lw->inv_weight = WMULT_CONST / w;
 252 }
 253
 254 /*
 255  * delta_exec * weight / lw.weight
 256  *   OR
 257  * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 258  *
 259  * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
 260  * we're guaranteed shift stays positive because inv_weight is guaranteed to
 261  * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
 262  *
 263  * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
 264  * weight/lw.weight <= 1, and therefore our shift will also be positive.
 265  */
 266 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
 267 {
 268         u64 fact = scale_load_down(weight);
 269         u32 fact_hi = (u32)(fact >> 32);
 270         int shift = WMULT_SHIFT;
 271         int fs;
 272
 273         __update_inv_weight(lw);
 274
 275         if (unlikely(fact_hi)) {
 276                 fs = fls(fact_hi);
 277                 shift -= fs;
 278                 fact >>= fs;
 279         }
 280
 281         fact = mul_u32_u32(fact, lw->inv_weight);
 282
 283         fact_hi = (u32)(fact >> 32);
 284         if (fact_hi) {
 285                 fs = fls(fact_hi);
 286                 shift -= fs;
 287                 fact >>= fs;
 288         }
 289
 290         return mul_u64_u32_shr(delta_exec, fact, shift);
 291 }
 292
 293 /*
 294  * delta /= w
 295  */
 296 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
 297 {
 298         if (unlikely(se->load.weight != NICE_0_LOAD))
 299                 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
 300
 301         return delta;
 302 }
 303
 304 const struct sched_class fair_sched_class;
 305
 306 /**************************************************************
 307  * CFS operations on generic schedulable entities:
 308  */
 309
 310 #ifdef CONFIG_FAIR_GROUP_SCHED
 311
 312 /* Walk up scheduling entities hierarchy */
 313 #define for_each_sched_entity(se) \
 314                 for (; se; se = se->parent)
 315
 316 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 317 {
 318         struct rq *rq = rq_of(cfs_rq);
 319         int cpu = cpu_of(rq);
 320
 321         if (cfs_rq->on_list)
 322                 return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
 323
 324         cfs_rq->on_list = 1;
 325
 326         /*
 327          * Ensure we either appear before our parent (if already
 328          * enqueued) or force our parent to appear after us when it is
 329          * enqueued. The fact that we always enqueue bottom-up
 330          * reduces this to two cases and a special case for the root
 331          * cfs_rq. Furthermore, it also means that we will always reset
 332          * tmp_alone_branch either when the branch is connected
 333          * to a tree or when we reach the top of the tree
 334          */
 335         if (cfs_rq->tg->parent &&
 336             cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
 337                 /*
 338                  * If parent is already on the list, we add the child
 339                  * just before. Thanks to circular linked property of
 340                  * the list, this means to put the child at the tail
 341                  * of the list that starts by parent.
 342                  */
 343                 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
 344                         &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
 345                 /*
 346                  * The branch is now connected to its tree so we can
 347                  * reset tmp_alone_branch to the beginning of the
 348                  * list.
 349                  */
 350                 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
 351                 return true;
 352         }
 353
 354         if (!cfs_rq->tg->parent) {
 355                 /*
 356                  * cfs rq without parent should be put
 357                  * at the tail of the list.
 358                  */
 359                 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
 360                         &rq->leaf_cfs_rq_list);
 361                 /*
 362                  * We have reach the top of a tree so we can reset
 363                  * tmp_alone_branch to the beginning of the list.
 364                  */
 365                 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
 366                 return true;
 367         }
 368
 369         /*
 370          * The parent has not already been added so we want to
 371          * make sure that it will be put after us.
 372          * tmp_alone_branch points to the begin of the branch
 373          * where we will add parent.
 374          */
 375         list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
 376         /*
 377          * update tmp_alone_branch to points to the new begin
 378          * of the branch
 379          */
 380         rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
 381         return false;
 382 }
 383
 384 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 385 {
 386         if (cfs_rq->on_list) {
 387                 struct rq *rq = rq_of(cfs_rq);
 388
 389                 /*
 390                  * With cfs_rq being unthrottled/throttled during an enqueue,
 391                  * it can happen the tmp_alone_branch points the a leaf that
 392                  * we finally want to del. In this case, tmp_alone_branch moves
 393                  * to the prev element but it will point to rq->leaf_cfs_rq_list
 394                  * at the end of the enqueue.
 395                  */
 396                 if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
 397                         rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
 398
 399                 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
 400                 cfs_rq->on_list = 0;
 401         }
 402 }
 403
 404 static inline void assert_list_leaf_cfs_rq(struct rq *rq)
 405 {
 406         SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
 407 }
 408
 409 /* Iterate thr' all leaf cfs_rq's on a runqueue */
 410 #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)                      \
 411         list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,    \
 412                                  leaf_cfs_rq_list)
 413
 414 /* Do the two (enqueued) entities belong to the same group ? */
 415 static inline struct cfs_rq *
 416 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 417 {
 418         if (se->cfs_rq == pse->cfs_rq)
 419                 return se->cfs_rq;
 420
 421         return NULL;
 422 }
 423
 424 static inline struct sched_entity *parent_entity(const struct sched_entity *se)
 425 {
 426         return se->parent;
 427 }
 428
 429 static void
 430 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 431 {
 432         int se_depth, pse_depth;
 433
 434         /*
 435          * preemption test can be made between sibling entities who are in the
 436          * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
 437          * both tasks until we find their ancestors who are siblings of common
 438          * parent.
 439          */
 440
 441         /* First walk up until both entities are at same depth */
 442         se_depth = (*se)->depth;
 443         pse_depth = (*pse)->depth;
 444
 445         while (se_depth > pse_depth) {
 446                 se_depth--;
 447                 *se = parent_entity(*se);
 448         }
 449
 450         while (pse_depth > se_depth) {
 451                 pse_depth--;
 452                 *pse = parent_entity(*pse);
 453         }
 454
 455         while (!is_same_group(*se, *pse)) {
 456                 *se = parent_entity(*se);
 457                 *pse = parent_entity(*pse);
 458         }
 459 }
 460
 461 static int tg_is_idle(struct task_group *tg)
 462 {
 463         return tg->idle > 0;
 464 }
 465
 466 static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
 467 {
 468         return cfs_rq->idle > 0;
 469 }
 470
 471 static int se_is_idle(struct sched_entity *se)
 472 {
 473         if (entity_is_task(se))
 474                 return task_has_idle_policy(task_of(se));
 475         return cfs_rq_is_idle(group_cfs_rq(se));
 476 }
 477
 478 #else   /* !CONFIG_FAIR_GROUP_SCHED */
 479
 480 #define for_each_sched_entity(se) \
 481                 for (; se; se = NULL)
 482
 483 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 484 {
 485         return true;
 486 }
 487
 488 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 489 {
 490 }
 491
 492 static inline void assert_list_leaf_cfs_rq(struct rq *rq)
 493 {
 494 }
 495
 496 #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)      \
 497                 for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
 498
 499 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 500 {
 501         return NULL;
 502 }
 503
 504 static inline void
 505 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 506 {
 507 }
 508
 509 static inline int tg_is_idle(struct task_group *tg)
 510 {
 511         return 0;
 512 }
 513
 514 static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
 515 {
 516         return 0;
 517 }
 518
 519 static int se_is_idle(struct sched_entity *se)
 520 {
 521         return 0;
 522 }
 523
 524 #endif  /* CONFIG_FAIR_GROUP_SCHED */
 525
 526 static __always_inline
 527 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 528
 529 /**************************************************************
 530  * Scheduling class tree data structure manipulation methods:
 531  */
 532
 533 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
 534 {
 535         s64 delta = (s64)(vruntime - max_vruntime);
 536         if (delta > 0)
 537                 max_vruntime = vruntime;
 538
 539         return max_vruntime;
 540 }
 541
 542 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
 543 {
 544         s64 delta = (s64)(vruntime - min_vruntime);
 545         if (delta < 0)
 546                 min_vruntime = vruntime;
 547
 548         return min_vruntime;
 549 }
 550
 551 static inline bool entity_before(const struct sched_entity *a,
 552                                  const struct sched_entity *b)
 553 {
 554         /*
 555          * Tiebreak on vruntime seems unnecessary since it can
 556          * hardly happen.
 557          */
 558         return (s64)(a->deadline - b->deadline) < 0;
 559 }
 560
 561 static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
 562 {
 563         return (s64)(se->vruntime - cfs_rq->min_vruntime);
 564 }
 565
 566 #define __node_2_se(node) \
 567         rb_entry((node), struct sched_entity, run_node)
 568
 569 /*
 570  * Compute virtual time from the per-task service numbers:
 571  *
 572  * Fair schedulers conserve lag:
 573  *
 574  *   \Sum lag_i = 0
 575  *
 576  * Where lag_i is given by:
 577  *
 578  *   lag_i = S - s_i = w_i * (V - v_i)
 579  *
 580  * Where S is the ideal service time and V is it's virtual time counterpart.
 581  * Therefore:
 582  *
 583  *   \Sum lag_i = 0
 584  *   \Sum w_i * (V - v_i) = 0
 585  *   \Sum w_i * V - w_i * v_i = 0
 586  *
 587  * From which we can solve an expression for V in v_i (which we have in
 588  * se->vruntime):
 589  *
 590  *       \Sum v_i * w_i   \Sum v_i * w_i
 591  *   V = -------------- = --------------
 592  *          \Sum w_i            W
 593  *
 594  * Specifically, this is the weighted average of all entity virtual runtimes.
 595  *
 596  * [[ NOTE: this is only equal to the ideal scheduler under the condition
 597  *          that join/leave operations happen at lag_i = 0, otherwise the
 598  *          virtual time has non-continguous motion equivalent to:
 599  *
 600  *            V +-= lag_i / W
 601  *
 602  *          Also see the comment in place_entity() that deals with this. ]]
 603  *
 604  * However, since v_i is u64, and the multiplcation could easily overflow
 605  * transform it into a relative form that uses smaller quantities:
 606  *
 607  * Substitute: v_i == (v_i - v0) + v0
 608  *
 609  *     \Sum ((v_i - v0) + v0) * w_i   \Sum (v_i - v0) * w_i
 610  * V = ---------------------------- = --------------------- + v0
 611  *                  W                            W
 612  *
 613  * Which we track using:
 614  *
 615  *                    v0 := cfs_rq->min_vruntime
 616  * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
 617  *              \Sum w_i := cfs_rq->avg_load
 618  *
 619  * Since min_vruntime is a monotonic increasing variable that closely tracks
 620  * the per-task service, these deltas: (v_i - v), will be in the order of the
 621  * maximal (virtual) lag induced in the system due to quantisation.
 622  *
 623  * Also, we use scale_load_down() to reduce the size.
 624  *
 625  * As measured, the max (key * weight) value was ~44 bits for a kernel build.
 626  */
 627 static void
 628 avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 629 {
 630         unsigned long weight = scale_load_down(se->load.weight);
 631         s64 key = entity_key(cfs_rq, se);
 632
 633         cfs_rq->avg_vruntime += key * weight;
 634         cfs_rq->avg_load += weight;
 635 }
 636
 637 static void
 638 avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
 639 {
 640         unsigned long weight = scale_load_down(se->load.weight);
 641         s64 key = entity_key(cfs_rq, se);
 642
 643         cfs_rq->avg_vruntime -= key * weight;
 644         cfs_rq->avg_load -= weight;
 645 }
 646
 647 static inline
 648 void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
 649 {
 650         /*
 651          * v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load
 652          */
 653         cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
 654 }
 655
 656 /*
 657  * Specifically: avg_runtime() + 0 must result in entity_eligible() := true
 658  * For this to be so, the result of this function must have a left bias.
 659  */
 660 u64 avg_vruntime(struct cfs_rq *cfs_rq)
 661 {
 662         struct sched_entity *curr = cfs_rq->curr;
 663         s64 avg = cfs_rq->avg_vruntime;
 664         long load = cfs_rq->avg_load;
 665
 666         if (curr && curr->on_rq) {
 667                 unsigned long weight = scale_load_down(curr->load.weight);
 668
 669                 avg += entity_key(cfs_rq, curr) * weight;
 670                 load += weight;
 671         }
 672
 673         if (load) {
 674                 /* sign flips effective floor / ceil */
 675                 if (avg < 0)
 676                         avg -= (load - 1);
 677                 avg = div_s64(avg, load);
 678         }
 679
 680         return cfs_rq->min_vruntime + avg;
 681 }
 682
 683 /*
 684  * lag_i = S - s_i = w_i * (V - v_i)
 685  *
 686  * However, since V is approximated by the weighted average of all entities it
 687  * is possible -- by addition/removal/reweight to the tree -- to move V around
 688  * and end up with a larger lag than we started with.
 689  *
 690  * Limit this to either double the slice length with a minimum of TICK_NSEC
 691  * since that is the timing granularity.
 692  *
 693  * EEVDF gives the following limit for a steady state system:
 694  *
 695  *   -r_max < lag < max(r_max, q)
 696  *
 697  * XXX could add max_slice to the augmented data to track this.
 698  */
 699 static s64 entity_lag(u64 avruntime, struct sched_entity *se)
 700 {
 701         s64 vlag, limit;
 702
 703         vlag = avruntime - se->vruntime;
 704         limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
 705
 706         return clamp(vlag, -limit, limit);
 707 }
 708
 709 static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
 710 {
 711         SCHED_WARN_ON(!se->on_rq);
 712
 713         se->vlag = entity_lag(avg_vruntime(cfs_rq), se);
 714 }
 715
 716 /*
 717  * Entity is eligible once it received less service than it ought to have,
 718  * eg. lag >= 0.
 719  *
 720  * lag_i = S - s_i = w_i*(V - v_i)
 721  *
 722  * lag_i >= 0 -> V >= v_i
 723  *
 724  *     \Sum (v_i - v)*w_i
 725  * V = ------------------ + v
 726  *          \Sum w_i
 727  *
 728  * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
 729  *
 730  * Note: using 'avg_vruntime() > se->vruntime' is inacurate due
 731  *       to the loss in precision caused by the division.
 732  */
 733 static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
 734 {
 735         struct sched_entity *curr = cfs_rq->curr;
 736         s64 avg = cfs_rq->avg_vruntime;
 737         long load = cfs_rq->avg_load;
 738
 739         if (curr && curr->on_rq) {
 740                 unsigned long weight = scale_load_down(curr->load.weight);
 741
 742                 avg += entity_key(cfs_rq, curr) * weight;
 743                 load += weight;
 744         }
 745
 746         return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load;
 747 }
 748
 749 int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
 750 {
 751         return vruntime_eligible(cfs_rq, se->vruntime);
 752 }
 753
 754 static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
 755 {
 756         u64 min_vruntime = cfs_rq->min_vruntime;
 757         /*
 758          * open coded max_vruntime() to allow updating avg_vruntime
 759          */
 760         s64 delta = (s64)(vruntime - min_vruntime);
 761         if (delta > 0) {
 762                 avg_vruntime_update(cfs_rq, delta);
 763                 min_vruntime = vruntime;
 764         }
 765         return min_vruntime;
 766 }
 767
 768 static void update_min_vruntime(struct cfs_rq *cfs_rq)
 769 {
 770         struct sched_entity *se = __pick_root_entity(cfs_rq);
 771         struct sched_entity *curr = cfs_rq->curr;
 772         u64 vruntime = cfs_rq->min_vruntime;
 773
 774         if (curr) {
 775                 if (curr->on_rq)
 776                         vruntime = curr->vruntime;
 777                 else
 778                         curr = NULL;
 779         }
 780
 781         if (se) {
 782                 if (!curr)
 783                         vruntime = se->min_vruntime;
 784                 else
 785                         vruntime = min_vruntime(vruntime, se->min_vruntime);
 786         }
 787
 788         /* ensure we never gain time by being placed backwards. */
 789         u64_u32_store(cfs_rq->min_vruntime,
 790                       __update_min_vruntime(cfs_rq, vruntime));
 791 }
 792
 793 static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
 794 {
 795         return entity_before(__node_2_se(a), __node_2_se(b));
 796 }
 797
 798 #define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
 799
 800 static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node)
 801 {
 802         if (node) {
 803                 struct sched_entity *rse = __node_2_se(node);
 804                 if (vruntime_gt(min_vruntime, se, rse))
 805                         se->min_vruntime = rse->min_vruntime;
 806         }
 807 }
 808
 809 /*
 810  * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
 811  */
 812 static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
 813 {
 814         u64 old_min_vruntime = se->min_vruntime;
 815         struct rb_node *node = &se->run_node;
 816
 817         se->min_vruntime = se->vruntime;
 818         __min_vruntime_update(se, node->rb_right);
 819         __min_vruntime_update(se, node->rb_left);
 820
 821         return se->min_vruntime == old_min_vruntime;
 822 }
 823
 824 RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
 825                      run_node, min_vruntime, min_vruntime_update);
 826
 827 /*
 828  * Enqueue an entity into the rb-tree:
 829  */
 830 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 831 {
 832         avg_vruntime_add(cfs_rq, se);
 833         se->min_vruntime = se->vruntime;
 834         rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
 835                                 __entity_less, &min_vruntime_cb);
 836 }
 837
 838 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 839 {
 840         rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
 841                                   &min_vruntime_cb);
 842         avg_vruntime_sub(cfs_rq, se);
 843 }
 844
 845 struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
 846 {
 847         struct rb_node *root = cfs_rq->tasks_timeline.rb_root.rb_node;
 848
 849         if (!root)
 850                 return NULL;
 851
 852         return __node_2_se(root);
 853 }
 854
 855 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 856 {
 857         struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
 858
 859         if (!left)
 860                 return NULL;
 861
 862         return __node_2_se(left);
 863 }
 864
 865 /*
 866  * Earliest Eligible Virtual Deadline First
 867  *
 868  * In order to provide latency guarantees for different request sizes
 869  * EEVDF selects the best runnable task from two criteria:
 870  *
 871  *  1) the task must be eligible (must be owed service)
 872  *
 873  *  2) from those tasks that meet 1), we select the one
 874  *     with the earliest virtual deadline.
 875  *
 876  * We can do this in O(log n) time due to an augmented RB-tree. The
 877  * tree keeps the entries sorted on deadline, but also functions as a
 878  * heap based on the vruntime by keeping:
 879  *
 880  *  se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime)
 881  *
 882  * Which allows tree pruning through eligibility.
 883  */
 884 static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
 885 {
 886         struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
 887         struct sched_entity *se = __pick_first_entity(cfs_rq);
 888         struct sched_entity *curr = cfs_rq->curr;
 889         struct sched_entity *best = NULL;
 890
 891         /*
 892          * We can safely skip eligibility check if there is only one entity
 893          * in this cfs_rq, saving some cycles.
 894          */
 895         if (cfs_rq->nr_running == 1)
 896                 return curr && curr->on_rq ? curr : se;
 897
 898         if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
 899                 curr = NULL;
 900
 901         /*
 902          * Once selected, run a task until it either becomes non-eligible or
 903          * until it gets a new slice. See the HACK in set_next_entity().
 904          */
 905         if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
 906                 return curr;
 907
 908         /* Pick the leftmost entity if it's eligible */
 909         if (se && entity_eligible(cfs_rq, se)) {
 910                 best = se;
 911                 goto found;
 912         }
 913
 914         /* Heap search for the EEVD entity */
 915         while (node) {
 916                 struct rb_node *left = node->rb_left;
 917
 918                 /*
 919                  * Eligible entities in left subtree are always better
 920                  * choices, since they have earlier deadlines.
 921                  */
 922                 if (left && vruntime_eligible(cfs_rq,
 923                                         __node_2_se(left)->min_vruntime)) {
 924                         node = left;
 925                         continue;
 926                 }
 927
 928                 se = __node_2_se(node);
 929
 930                 /*
 931                  * The left subtree either is empty or has no eligible
 932                  * entity, so check the current node since it is the one
 933                  * with earliest deadline that might be eligible.
 934                  */
 935                 if (entity_eligible(cfs_rq, se)) {
 936                         best = se;
 937                         break;
 938                 }
 939
 940                 node = node->rb_right;
 941         }
 942 found:
 943         if (!best || (curr && entity_before(curr, best)))
 944                 best = curr;
 945
 946         return best;
 947 }
 948
 949 #ifdef CONFIG_SCHED_DEBUG
 950 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 951 {
 952         struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
 953
 954         if (!last)
 955                 return NULL;
 956
 957         return __node_2_se(last);
 958 }
 959
 960 /**************************************************************
 961  * Scheduling class statistics methods:
 962  */
 963 #ifdef CONFIG_SMP
 964 int sched_update_scaling(void)
 965 {
 966         unsigned int factor = get_update_sysctl_factor();
 967
 968 #define WRT_SYSCTL(name) \
 969         (normalized_sysctl_##name = sysctl_##name / (factor))
 970         WRT_SYSCTL(sched_base_slice);
 971 #undef WRT_SYSCTL
 972
 973         return 0;
 974 }
 975 #endif
 976 #endif
 977
 978 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
 979
 980 /*
 981  * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
 982  * this is probably good enough.
 983  */
 984 static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
 985 {
 986         if ((s64)(se->vruntime - se->deadline) < 0)
 987                 return;
 988
 989         /*
 990          * For EEVDF the virtual time slope is determined by w_i (iow.
 991          * nice) while the request time r_i is determined by
 992          * sysctl_sched_base_slice.
 993          */
 994         se->slice = sysctl_sched_base_slice;
 995
 996         /*
 997          * EEVDF: vd_i = ve_i + r_i / w_i
 998          */
 999         se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
1000
1001         /*
1002          * The task has consumed its request, reschedule.
1003          */
1004         if (cfs_rq->nr_running > 1) {
1005                 resched_curr(rq_of(cfs_rq));
1006                 clear_buddies(cfs_rq, se);
1007         }
1008 }
1009
1010 #include "pelt.h"
1011 #ifdef CONFIG_SMP
1012
1013 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
1014 static unsigned long task_h_load(struct task_struct *p);
1015 static unsigned long capacity_of(int cpu);
1016
1017 /* Give new sched_entity start runnable values to heavy its load in infant time */
1018 void init_entity_runnable_average(struct sched_entity *se)
1019 {
1020         struct sched_avg *sa = &se->avg;
1021
1022         memset(sa, 0, sizeof(*sa));
1023
1024         /*
1025          * Tasks are initialized with full load to be seen as heavy tasks until
1026          * they get a chance to stabilize to their real load level.
1027          * Group entities are initialized with zero load to reflect the fact that
1028          * nothing has been attached to the task group yet.
1029          */
1030         if (entity_is_task(se))
1031                 sa->load_avg = scale_load_down(se->load.weight);
1032
1033         /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
1034 }
1035
1036 /*
1037  * With new tasks being created, their initial util_avgs are extrapolated
1038  * based on the cfs_rq's current util_avg:
1039  *
1040  *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
1041  *
1042  * However, in many cases, the above util_avg does not give a desired
1043  * value. Moreover, the sum of the util_avgs may be divergent, such
1044  * as when the series is a harmonic series.
1045  *
1046  * To solve this problem, we also cap the util_avg of successive tasks to
1047  * only 1/2 of the left utilization budget:
1048  *
1049  *   util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
1050  *
1051  * where n denotes the nth task and cpu_scale the CPU capacity.
1052  *
1053  * For example, for a CPU with 1024 of capacity, a simplest series from
1054  * the beginning would be like:
1055  *
1056  *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
1057  * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
1058  *
1059  * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
1060  * if util_avg > util_avg_cap.
1061  */
1062 void post_init_entity_util_avg(struct task_struct *p)
1063 {
1064         struct sched_entity *se = &p->se;
1065         struct cfs_rq *cfs_rq = cfs_rq_of(se);
1066         struct sched_avg *sa = &se->avg;
1067         long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
1068         long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
1069
1070         if (p->sched_class != &fair_sched_class) {
1071                 /*
1072                  * For !fair tasks do:
1073                  *
1074                 update_cfs_rq_load_avg(now, cfs_rq);
1075                 attach_entity_load_avg(cfs_rq, se);
1076                 switched_from_fair(rq, p);
1077                  *
1078                  * such that the next switched_to_fair() has the
1079                  * expected state.
1080                  */
1081                 se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
1082                 return;
1083         }
1084
1085         if (cap > 0) {
1086                 if (cfs_rq->avg.util_avg != 0) {
1087                         sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
1088                         sa->util_avg /= (cfs_rq->avg.load_avg + 1);
1089
1090                         if (sa->util_avg > cap)
1091                                 sa->util_avg = cap;
1092                 } else {
1093                         sa->util_avg = cap;
1094                 }
1095         }
1096
1097         sa->runnable_avg = sa->util_avg;
1098 }
1099
1100 #else /* !CONFIG_SMP */
1101 void init_entity_runnable_average(struct sched_entity *se)
1102 {
1103 }
1104 void post_init_entity_util_avg(struct task_struct *p)
1105 {
1106 }
1107 static void update_tg_load_avg(struct cfs_rq *cfs_rq)
1108 {
1109 }
1110 #endif /* CONFIG_SMP */
1111
1112 static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
1113 {
1114         u64 now = rq_clock_task(rq);
1115         s64 delta_exec;
1116
1117         delta_exec = now - curr->exec_start;
1118         if (unlikely(delta_exec <= 0))
1119                 return delta_exec;
1120
1121         curr->exec_start = now;
1122         curr->sum_exec_runtime += delta_exec;
1123
1124         if (schedstat_enabled()) {
1125                 struct sched_statistics *stats;
1126
1127                 stats = __schedstats_from_se(curr);
1128                 __schedstat_set(stats->exec_max,
1129                                 max(delta_exec, stats->exec_max));
1130         }
1131
1132         return delta_exec;
1133 }
1134
1135 static inline void update_curr_task(struct task_struct *p, s64 delta_exec)
1136 {
1137         trace_sched_stat_runtime(p, delta_exec);
1138         account_group_exec_runtime(p, delta_exec);
1139         cgroup_account_cputime(p, delta_exec);
1140         if (p->dl_server)
1141                 dl_server_update(p->dl_server, delta_exec);
1142 }
1143
1144 /*
1145  * Used by other classes to account runtime.
1146  */
1147 s64 update_curr_common(struct rq *rq)
1148 {
1149         struct task_struct *curr = rq->curr;
1150         s64 delta_exec;
1151
1152         delta_exec = update_curr_se(rq, &curr->se);
1153         if (likely(delta_exec > 0))
1154                 update_curr_task(curr, delta_exec);
1155
1156         return delta_exec;
1157 }
1158
1159 /*
1160  * Update the current task's runtime statistics.
1161  */
1162 static void update_curr(struct cfs_rq *cfs_rq)
1163 {
1164         struct sched_entity *curr = cfs_rq->curr;
1165         s64 delta_exec;
1166
1167         if (unlikely(!curr))
1168                 return;
1169
1170         delta_exec = update_curr_se(rq_of(cfs_rq), curr);
1171         if (unlikely(delta_exec <= 0))
1172                 return;
1173
1174         curr->vruntime += calc_delta_fair(delta_exec, curr);
1175         update_deadline(cfs_rq, curr);
1176         update_min_vruntime(cfs_rq);
1177
1178         if (entity_is_task(curr))
1179                 update_curr_task(task_of(curr), delta_exec);
1180
1181         account_cfs_rq_runtime(cfs_rq, delta_exec);
1182 }
1183
1184 static void update_curr_fair(struct rq *rq)
1185 {
1186         update_curr(cfs_rq_of(&rq->curr->se));
1187 }
1188
1189 static inline void
1190 update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
1191 {
1192         struct sched_statistics *stats;
1193         struct task_struct *p = NULL;
1194
1195         if (!schedstat_enabled())
1196                 return;
1197
1198         stats = __schedstats_from_se(se);
1199
1200         if (entity_is_task(se))
1201                 p = task_of(se);
1202
1203         __update_stats_wait_start(rq_of(cfs_rq), p, stats);
1204 }
1205
1206 static inline void
1207 update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
1208 {
1209         struct sched_statistics *stats;
1210         struct task_struct *p = NULL;
1211
1212         if (!schedstat_enabled())
1213                 return;
1214
1215         stats = __schedstats_from_se(se);
1216
1217         /*
1218          * When the sched_schedstat changes from 0 to 1, some sched se
1219          * maybe already in the runqueue, the se->statistics.wait_start
1220          * will be 0.So it will let the delta wrong. We need to avoid this
1221          * scenario.
1222          */
1223         if (unlikely(!schedstat_val(stats->wait_start)))
1224                 return;
1225
1226         if (entity_is_task(se))
1227                 p = task_of(se);
1228
1229         __update_stats_wait_end(rq_of(cfs_rq), p, stats);
1230 }
1231
1232 static inline void
1233 update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
1234 {
1235         struct sched_statistics *stats;
1236         struct task_struct *tsk = NULL;
1237
1238         if (!schedstat_enabled())
1239                 return;
1240
1241         stats = __schedstats_from_se(se);
1242
1243         if (entity_is_task(se))
1244                 tsk = task_of(se);
1245
1246         __update_stats_enqueue_sleeper(rq_of(cfs_rq), tsk, stats);
1247 }
1248
1249 /*
1250  * Task is being enqueued - update stats:
1251  */
1252 static inline void
1253 update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1254 {
1255         if (!schedstat_enabled())
1256                 return;
1257
1258         /*
1259          * Are we enqueueing a waiting task? (for current tasks
1260          * a dequeue/enqueue event is a NOP)
1261          */
1262         if (se != cfs_rq->curr)
1263                 update_stats_wait_start_fair(cfs_rq, se);
1264
1265         if (flags & ENQUEUE_WAKEUP)
1266                 update_stats_enqueue_sleeper_fair(cfs_rq, se);
1267 }
1268
1269 static inline void
1270 update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1271 {
1272
1273         if (!schedstat_enabled())
1274                 return;
1275
1276         /*
1277          * Mark the end of the wait period if dequeueing a
1278          * waiting task:
1279          */
1280         if (se != cfs_rq->curr)
1281                 update_stats_wait_end_fair(cfs_rq, se);
1282
1283         if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
1284                 struct task_struct *tsk = task_of(se);
1285                 unsigned int state;
1286
1287                 /* XXX racy against TTWU */
1288                 state = READ_ONCE(tsk->__state);
1289                 if (state & TASK_INTERRUPTIBLE)
1290                         __schedstat_set(tsk->stats.sleep_start,
1291                                       rq_clock(rq_of(cfs_rq)));
1292                 if (state & TASK_UNINTERRUPTIBLE)
1293                         __schedstat_set(tsk->stats.block_start,
1294                                       rq_clock(rq_of(cfs_rq)));
1295         }
1296 }
1297
1298 /*
1299  * We are picking a new current task - update its stats:
1300  */
1301 static inline void
1302 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
1303 {
1304         /*
1305          * We are starting a new run period:
1306          */
1307         se->exec_start = rq_clock_task(rq_of(cfs_rq));
1308 }
1309
1310 /**************************************************
1311  * Scheduling class queueing methods:
1312  */
1313
1314 static inline bool is_core_idle(int cpu)
1315 {
1316 #ifdef CONFIG_SCHED_SMT
1317         int sibling;
1318
1319         for_each_cpu(sibling, cpu_smt_mask(cpu)) {
1320                 if (cpu == sibling)
1321                         continue;
1322
1323                 if (!idle_cpu(sibling))
1324                         return false;
1325         }
1326 #endif
1327
1328         return true;
1329 }
1330
1331 #ifdef CONFIG_NUMA
1332 #define NUMA_IMBALANCE_MIN 2
1333
1334 static inline long
1335 adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
1336 {
1337         /*
1338          * Allow a NUMA imbalance if busy CPUs is less than the maximum
1339          * threshold. Above this threshold, individual tasks may be contending
1340          * for both memory bandwidth and any shared HT resources.  This is an
1341          * approximation as the number of running tasks may not be related to
1342          * the number of busy CPUs due to sched_setaffinity.
1343          */
1344         if (dst_running > imb_numa_nr)
1345                 return imbalance;
1346
1347         /*
1348          * Allow a small imbalance based on a simple pair of communicating
1349          * tasks that remain local when the destination is lightly loaded.
1350          */
1351         if (imbalance <= NUMA_IMBALANCE_MIN)
1352                 return 0;
1353
1354         return imbalance;
1355 }
1356 #endif /* CONFIG_NUMA */
1357
1358 #ifdef CONFIG_NUMA_BALANCING
1359 /*
1360  * Approximate time to scan a full NUMA task in ms. The task scan period is
1361  * calculated based on the tasks virtual memory size and
1362  * numa_balancing_scan_size.
1363  */
1364 unsigned int sysctl_numa_balancing_scan_period_min = 1000;
1365 unsigned int sysctl_numa_balancing_scan_period_max = 60000;
1366
1367 /* Portion of address space to scan in MB */
1368 unsigned int sysctl_numa_balancing_scan_size = 256;
1369
1370 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
1371 unsigned int sysctl_numa_balancing_scan_delay = 1000;
1372
1373 /* The page with hint page fault latency < threshold in ms is considered hot */
1374 unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
1375
1376 struct numa_group {
1377         refcount_t refcount;
1378
1379         spinlock_t lock; /* nr_tasks, tasks */
1380         int nr_tasks;
1381         pid_t gid;
1382         int active_nodes;
1383
1384         struct rcu_head rcu;
1385         unsigned long total_faults;
1386         unsigned long max_faults_cpu;
1387         /*
1388          * faults[] array is split into two regions: faults_mem and faults_cpu.
1389          *
1390          * Faults_cpu is used to decide whether memory should move
1391          * towards the CPU. As a consequence, these stats are weighted
1392          * more by CPU use than by memory faults.
1393          */
1394         unsigned long faults[];
1395 };
1396
1397 /*
1398  * For functions that can be called in multiple contexts that permit reading
1399  * ->numa_group (see struct task_struct for locking rules).
1400  */
1401 static struct numa_group *deref_task_numa_group(struct task_struct *p)
1402 {
1403         return rcu_dereference_check(p->numa_group, p == current ||
1404                 (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
1405 }
1406
1407 static struct numa_group *deref_curr_numa_group(struct task_struct *p)
1408 {
1409         return rcu_dereference_protected(p->numa_group, p == current);
1410 }
1411
1412 static inline unsigned long group_faults_priv(struct numa_group *ng);
1413 static inline unsigned long group_faults_shared(struct numa_group *ng);
1414
1415 static unsigned int task_nr_scan_windows(struct task_struct *p)
1416 {
1417         unsigned long rss = 0;
1418         unsigned long nr_scan_pages;
1419
1420         /*
1421          * Calculations based on RSS as non-present and empty pages are skipped
1422          * by the PTE scanner and NUMA hinting faults should be trapped based
1423          * on resident pages
1424          */
1425         nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1426         rss = get_mm_rss(p->mm);
1427         if (!rss)
1428                 rss = nr_scan_pages;
1429
1430         rss = round_up(rss, nr_scan_pages);
1431         return rss / nr_scan_pages;
1432 }
1433
1434 /* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
1435 #define MAX_SCAN_WINDOW 2560
1436
1437 static unsigned int task_scan_min(struct task_struct *p)
1438 {
1439         unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
1440         unsigned int scan, floor;
1441         unsigned int windows = 1;
1442
1443         if (scan_size < MAX_SCAN_WINDOW)
1444                 windows = MAX_SCAN_WINDOW / scan_size;
1445         floor = 1000 / windows;
1446
1447         scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1448         return max_t(unsigned int, floor, scan);
1449 }
1450
1451 static unsigned int task_scan_start(struct task_struct *p)
1452 {
1453         unsigned long smin = task_scan_min(p);
1454         unsigned long period = smin;
1455         struct numa_group *ng;
1456
1457         /* Scale the maximum scan period with the amount of shared memory. */
1458         rcu_read_lock();
1459         ng = rcu_dereference(p->numa_group);
1460         if (ng) {
1461                 unsigned long shared = group_faults_shared(ng);
1462                 unsigned long private = group_faults_priv(ng);
1463
1464                 period *= refcount_read(&ng->refcount);
1465                 period *= shared + 1;
1466                 period /= private + shared + 1;
1467         }
1468         rcu_read_unlock();
1469
1470         return max(smin, period);
1471 }
1472
1473 static unsigned int task_scan_max(struct task_struct *p)
1474 {
1475         unsigned long smin = task_scan_min(p);
1476         unsigned long smax;
1477         struct numa_group *ng;
1478
1479         /* Watch for min being lower than max due to floor calculations */
1480         smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1481
1482         /* Scale the maximum scan period with the amount of shared memory. */
1483         ng = deref_curr_numa_group(p);
1484         if (ng) {
1485                 unsigned long shared = group_faults_shared(ng);
1486                 unsigned long private = group_faults_priv(ng);
1487                 unsigned long period = smax;
1488
1489                 period *= refcount_read(&ng->refcount);
1490                 period *= shared + 1;
1491                 period /= private + shared + 1;
1492
1493                 smax = max(smax, period);
1494         }
1495
1496         return max(smin, smax);
1497 }
1498
1499 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1500 {
1501         rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
1502         rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1503 }
1504
1505 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1506 {
1507         rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
1508         rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1509 }
1510
1511 /* Shared or private faults. */
1512 #define NR_NUMA_HINT_FAULT_TYPES 2
1513
1514 /* Memory and CPU locality */
1515 #define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1516
1517 /* Averaged statistics, and temporary buffers. */
1518 #define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1519
1520 pid_t task_numa_group_id(struct task_struct *p)
1521 {
1522         struct numa_group *ng;
1523         pid_t gid = 0;
1524
1525         rcu_read_lock();
1526         ng = rcu_dereference(p->numa_group);
1527         if (ng)
1528                 gid = ng->gid;
1529         rcu_read_unlock();
1530
1531         return gid;
1532 }
1533
1534 /*
1535  * The averaged statistics, shared & private, memory & CPU,
1536  * occupy the first half of the array. The second half of the
1537  * array is for current counters, which are averaged into the
1538  * first set by task_numa_placement.
1539  */
1540 static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1541 {
1542         return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1543 }
1544
1545 static inline unsigned long task_faults(struct task_struct *p, int nid)
1546 {
1547         if (!p->numa_faults)
1548                 return 0;
1549
1550         return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1551                 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1552 }
1553
1554 static inline unsigned long group_faults(struct task_struct *p, int nid)
1555 {
1556         struct numa_group *ng = deref_task_numa_group(p);
1557
1558         if (!ng)
1559                 return 0;
1560
1561         return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1562                 ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1563 }
1564
1565 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1566 {
1567         return group->faults[task_faults_idx(NUMA_CPU, nid, 0)] +
1568                 group->faults[task_faults_idx(NUMA_CPU, nid, 1)];
1569 }
1570
1571 static inline unsigned long group_faults_priv(struct numa_group *ng)
1572 {
1573         unsigned long faults = 0;
1574         int node;
1575
1576         for_each_online_node(node) {
1577                 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
1578         }
1579
1580         return faults;
1581 }
1582
1583 static inline unsigned long group_faults_shared(struct numa_group *ng)
1584 {
1585         unsigned long faults = 0;
1586         int node;
1587
1588         for_each_online_node(node) {
1589                 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
1590         }
1591
1592         return faults;
1593 }
1594
1595 /*
1596  * A node triggering more than 1/3 as many NUMA faults as the maximum is
1597  * considered part of a numa group's pseudo-interleaving set. Migrations
1598  * between these nodes are slowed down, to allow things to settle down.
1599  */
1600 #define ACTIVE_NODE_FRACTION 3
1601
1602 static bool numa_is_active_node(int nid, struct numa_group *ng)
1603 {
1604         return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1605 }
1606
1607 /* Handle placement on systems where not all nodes are directly connected. */
1608 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1609                                         int lim_dist, bool task)
1610 {
1611         unsigned long score = 0;
1612         int node, max_dist;
1613
1614         /*
1615          * All nodes are directly connected, and the same distance
1616          * from each other. No need for fancy placement algorithms.
1617          */
1618         if (sched_numa_topology_type == NUMA_DIRECT)
1619                 return 0;
1620
1621         /* sched_max_numa_distance may be changed in parallel. */
1622         max_dist = READ_ONCE(sched_max_numa_distance);
1623         /*
1624          * This code is called for each node, introducing N^2 complexity,
1625          * which should be ok given the number of nodes rarely exceeds 8.
1626          */
1627         for_each_online_node(node) {
1628                 unsigned long faults;
1629                 int dist = node_distance(nid, node);
1630
1631                 /*
1632                  * The furthest away nodes in the system are not interesting
1633                  * for placement; nid was already counted.
1634                  */
1635                 if (dist >= max_dist || node == nid)
1636                         continue;
1637
1638                 /*
1639                  * On systems with a backplane NUMA topology, compare groups
1640                  * of nodes, and move tasks towards the group with the most
1641                  * memory accesses. When comparing two nodes at distance
1642                  * "hoplimit", only nodes closer by than "hoplimit" are part
1643                  * of each group. Skip other nodes.
1644                  */
1645                 if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist)
1646                         continue;
1647
1648                 /* Add up the faults from nearby nodes. */
1649                 if (task)
1650                         faults = task_faults(p, node);
1651                 else
1652                         faults = group_faults(p, node);
1653
1654                 /*
1655                  * On systems with a glueless mesh NUMA topology, there are
1656                  * no fixed "groups of nodes". Instead, nodes that are not
1657                  * directly connected bounce traffic through intermediate
1658                  * nodes; a numa_group can occupy any set of nodes.
1659                  * The further away a node is, the less the faults count.
1660                  * This seems to result in good task placement.
1661                  */
1662                 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1663                         faults *= (max_dist - dist);
1664                         faults /= (max_dist - LOCAL_DISTANCE);
1665                 }
1666
1667                 score += faults;
1668         }
1669
1670         return score;
1671 }
1672
1673 /*
1674  * These return the fraction of accesses done by a particular task, or
1675  * task group, on a particular numa node.  The group weight is given a
1676  * larger multiplier, in order to group tasks together that are almost
1677  * evenly spread out between numa nodes.
1678  */
1679 static inline unsigned long task_weight(struct task_struct *p, int nid,
1680                                         int dist)
1681 {
1682         unsigned long faults, total_faults;
1683
1684         if (!p->numa_faults)
1685                 return 0;
1686
1687         total_faults = p->total_numa_faults;
1688
1689         if (!total_faults)
1690                 return 0;
1691
1692         faults = task_faults(p, nid);
1693         faults += score_nearby_nodes(p, nid, dist, true);
1694
1695         return 1000 * faults / total_faults;
1696 }
1697
1698 static inline unsigned long group_weight(struct task_struct *p, int nid,
1699                                          int dist)
1700 {
1701         struct numa_group *ng = deref_task_numa_group(p);
1702         unsigned long faults, total_faults;
1703
1704         if (!ng)
1705                 return 0;
1706
1707         total_faults = ng->total_faults;
1708
1709         if (!total_faults)
1710                 return 0;
1711
1712         faults = group_faults(p, nid);
1713         faults += score_nearby_nodes(p, nid, dist, false);
1714
1715         return 1000 * faults / total_faults;
1716 }
1717
1718 /*
1719  * If memory tiering mode is enabled, cpupid of slow memory page is
1720  * used to record scan time instead of CPU and PID.  When tiering mode
1721  * is disabled at run time, the scan time (in cpupid) will be
1722  * interpreted as CPU and PID.  So CPU needs to be checked to avoid to
1723  * access out of array bound.
1724  */
1725 static inline bool cpupid_valid(int cpupid)
1726 {
1727         return cpupid_to_cpu(cpupid) < nr_cpu_ids;
1728 }
1729
1730 /*
1731  * For memory tiering mode, if there are enough free pages (more than
1732  * enough watermark defined here) in fast memory node, to take full
1733  * advantage of fast memory capacity, all recently accessed slow
1734  * memory pages will be migrated to fast memory node without
1735  * considering hot threshold.
1736  */
1737 static bool pgdat_free_space_enough(struct pglist_data *pgdat)
1738 {
1739         int z;
1740         unsigned long enough_wmark;
1741
1742         enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
1743                            pgdat->node_present_pages >> 4);
1744         for (z = pgdat->nr_zones - 1; z >= 0; z--) {
1745                 struct zone *zone = pgdat->node_zones + z;
1746
1747                 if (!populated_zone(zone))
1748                         continue;
1749
1750                 if (zone_watermark_ok(zone, 0,
1751                                       wmark_pages(zone, WMARK_PROMO) + enough_wmark,
1752                                       ZONE_MOVABLE, 0))
1753                         return true;
1754         }
1755         return false;
1756 }
1757
1758 /*
1759  * For memory tiering mode, when page tables are scanned, the scan
1760  * time will be recorded in struct page in addition to make page
1761  * PROT_NONE for slow memory page.  So when the page is accessed, in
1762  * hint page fault handler, the hint page fault latency is calculated
1763  * via,
1764  *
1765  *      hint page fault latency = hint page fault time - scan time
1766  *
1767  * The smaller the hint page fault latency, the higher the possibility
1768  * for the page to be hot.
1769  */
1770 static int numa_hint_fault_latency(struct folio *folio)
1771 {
1772         int last_time, time;
1773
1774         time = jiffies_to_msecs(jiffies);
1775         last_time = folio_xchg_access_time(folio, time);
1776
1777         return (time - last_time) & PAGE_ACCESS_TIME_MASK;
1778 }
1779
1780 /*
1781  * For memory tiering mode, too high promotion/demotion throughput may
1782  * hurt application latency.  So we provide a mechanism to rate limit
1783  * the number of pages that are tried to be promoted.
1784  */
1785 static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
1786                                       unsigned long rate_limit, int nr)
1787 {
1788         unsigned long nr_cand;
1789         unsigned int now, start;
1790
1791         now = jiffies_to_msecs(jiffies);
1792         mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
1793         nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
1794         start = pgdat->nbp_rl_start;
1795         if (now - start > MSEC_PER_SEC &&
1796             cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
1797                 pgdat->nbp_rl_nr_cand = nr_cand;
1798         if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
1799                 return true;
1800         return false;
1801 }
1802
1803 #define NUMA_MIGRATION_ADJUST_STEPS     16
1804
1805 static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
1806                                             unsigned long rate_limit,
1807                                             unsigned int ref_th)
1808 {
1809         unsigned int now, start, th_period, unit_th, th;
1810         unsigned long nr_cand, ref_cand, diff_cand;
1811
1812         now = jiffies_to_msecs(jiffies);
1813         th_period = sysctl_numa_balancing_scan_period_max;
1814         start = pgdat->nbp_th_start;
1815         if (now - start > th_period &&
1816             cmpxchg(&pgdat->nbp_th_start, start, now) == start) {
1817                 ref_cand = rate_limit *
1818                         sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC;
1819                 nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
1820                 diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
1821                 unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS;
1822                 th = pgdat->nbp_threshold ? : ref_th;
1823                 if (diff_cand > ref_cand * 11 / 10)
1824                         th = max(th - unit_th, unit_th);
1825                 else if (diff_cand < ref_cand * 9 / 10)
1826                         th = min(th + unit_th, ref_th * 2);
1827                 pgdat->nbp_th_nr_cand = nr_cand;
1828                 pgdat->nbp_threshold = th;
1829         }
1830 }
1831
1832 bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
1833                                 int src_nid, int dst_cpu)
1834 {
1835         struct numa_group *ng = deref_curr_numa_group(p);
1836         int dst_nid = cpu_to_node(dst_cpu);
1837         int last_cpupid, this_cpupid;
1838
1839         /*
1840          * Cannot migrate to memoryless nodes.
1841          */
1842         if (!node_state(dst_nid, N_MEMORY))
1843                 return false;
1844
1845         /*
1846          * The pages in slow memory node should be migrated according
1847          * to hot/cold instead of private/shared.
1848          */
1849         if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
1850             !node_is_toptier(src_nid)) {
1851                 struct pglist_data *pgdat;
1852                 unsigned long rate_limit;
1853                 unsigned int latency, th, def_th;
1854
1855                 pgdat = NODE_DATA(dst_nid);
1856                 if (pgdat_free_space_enough(pgdat)) {
1857                         /* workload changed, reset hot threshold */
1858                         pgdat->nbp_threshold = 0;
1859                         return true;
1860                 }
1861
1862                 def_th = sysctl_numa_balancing_hot_threshold;
1863                 rate_limit = sysctl_numa_balancing_promote_rate_limit << \
1864                         (20 - PAGE_SHIFT);
1865                 numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
1866
1867                 th = pgdat->nbp_threshold ? : def_th;
1868                 latency = numa_hint_fault_latency(folio);
1869                 if (latency >= th)
1870                         return false;
1871
1872                 return !numa_promotion_rate_limit(pgdat, rate_limit,
1873                                                   folio_nr_pages(folio));
1874         }
1875
1876         this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1877         last_cpupid = folio_xchg_last_cpupid(folio, this_cpupid);
1878
1879         if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
1880             !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
1881                 return false;
1882
1883         /*
1884          * Allow first faults or private faults to migrate immediately early in
1885          * the lifetime of a task. The magic number 4 is based on waiting for
1886          * two full passes of the "multi-stage node selection" test that is
1887          * executed below.
1888          */
1889         if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
1890             (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
1891                 return true;
1892
1893         /*
1894          * Multi-stage node selection is used in conjunction with a periodic
1895          * migration fault to build a temporal task<->page relation. By using
1896          * a two-stage filter we remove short/unlikely relations.
1897          *
1898          * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1899          * a task's usage of a particular page (n_p) per total usage of this
1900          * page (n_t) (in a given time-span) to a probability.
1901          *
1902          * Our periodic faults will sample this probability and getting the
1903          * same result twice in a row, given these samples are fully
1904          * independent, is then given by P(n)^2, provided our sample period
1905          * is sufficiently short compared to the usage pattern.
1906          *
1907          * This quadric squishes small probabilities, making it less likely we
1908          * act on an unlikely task<->page relation.
1909          */
1910         if (!cpupid_pid_unset(last_cpupid) &&
1911                                 cpupid_to_nid(last_cpupid) != dst_nid)
1912                 return false;
1913
1914         /* Always allow migrate on private faults */
1915         if (cpupid_match_pid(p, last_cpupid))
1916                 return true;
1917
1918         /* A shared fault, but p->numa_group has not been set up yet. */
1919         if (!ng)
1920                 return true;
1921
1922         /*
1923          * Destination node is much more heavily used than the source
1924          * node? Allow migration.
1925          */
1926         if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1927                                         ACTIVE_NODE_FRACTION)
1928                 return true;
1929
1930         /*
1931          * Distribute memory according to CPU & memory use on each node,
1932          * with 3/4 hysteresis to avoid unnecessary memory migrations:
1933          *
1934          * faults_cpu(dst)   3   faults_cpu(src)
1935          * --------------- * - > ---------------
1936          * faults_mem(dst)   4   faults_mem(src)
1937          */
1938         return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1939                group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
1940 }
1941
1942 /*
1943  * 'numa_type' describes the node at the moment of load balancing.
1944  */
1945 enum numa_type {
1946         /* The node has spare capacity that can be used to run more tasks.  */
1947         node_has_spare = 0,
1948         /*
1949          * The node is fully used and the tasks don't compete for more CPU
1950          * cycles. Nevertheless, some tasks might wait before running.
1951          */
1952         node_fully_busy,
1953         /*
1954          * The node is overloaded and can't provide expected CPU cycles to all
1955          * tasks.
1956          */
1957         node_overloaded
1958 };
1959
1960 /* Cached statistics for all CPUs within a node */
1961 struct numa_stats {
1962         unsigned long load;
1963         unsigned long runnable;
1964         unsigned long util;
1965         /* Total compute capacity of CPUs on a node */
1966         unsigned long compute_capacity;
1967         unsigned int nr_running;
1968         unsigned int weight;
1969         enum numa_type node_type;
1970         int idle_cpu;
1971 };
1972
1973 struct task_numa_env {
1974         struct task_struct *p;
1975
1976         int src_cpu, src_nid;
1977         int dst_cpu, dst_nid;
1978         int imb_numa_nr;
1979
1980         struct numa_stats src_stats, dst_stats;
1981
1982         int imbalance_pct;
1983         int dist;
1984
1985         struct task_struct *best_task;
1986         long best_imp;
1987         int best_cpu;
1988 };
1989
1990 static unsigned long cpu_load(struct rq *rq);
1991 static unsigned long cpu_runnable(struct rq *rq);
1992
1993 static inline enum
1994 numa_type numa_classify(unsigned int imbalance_pct,
1995                          struct numa_stats *ns)
1996 {
1997         if ((ns->nr_running > ns->weight) &&
1998             (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
1999              ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
2000                 return node_overloaded;
2001
2002         if ((ns->nr_running < ns->weight) ||
2003             (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
2004              ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
2005                 return node_has_spare;
2006
2007         return node_fully_busy;
2008 }
2009
2010 #ifdef CONFIG_SCHED_SMT
2011 /* Forward declarations of select_idle_sibling helpers */
2012 static inline bool test_idle_cores(int cpu);
2013 static inline int numa_idle_core(int idle_core, int cpu)
2014 {
2015         if (!static_branch_likely(&sched_smt_present) ||
2016             idle_core >= 0 || !test_idle_cores(cpu))
2017                 return idle_core;
2018
2019         /*
2020          * Prefer cores instead of packing HT siblings
2021          * and triggering future load balancing.
2022          */
2023         if (is_core_idle(cpu))
2024                 idle_core = cpu;
2025
2026         return idle_core;
2027 }
2028 #else
2029 static inline int numa_idle_core(int idle_core, int cpu)
2030 {
2031         return idle_core;
2032 }
2033 #endif
2034
2035 /*
2036  * Gather all necessary information to make NUMA balancing placement
2037  * decisions that are compatible with standard load balancer. This
2038  * borrows code and logic from update_sg_lb_stats but sharing a
2039  * common implementation is impractical.
2040  */
2041 static void update_numa_stats(struct task_numa_env *env,
2042                               struct numa_stats *ns, int nid,
2043                               bool find_idle)
2044 {
2045         int cpu, idle_core = -1;
2046
2047         memset(ns, 0, sizeof(*ns));
2048         ns->idle_cpu = -1;
2049
2050         rcu_read_lock();
2051         for_each_cpu(cpu, cpumask_of_node(nid)) {
2052                 struct rq *rq = cpu_rq(cpu);
2053
2054                 ns->load += cpu_load(rq);
2055                 ns->runnable += cpu_runnable(rq);
2056                 ns->util += cpu_util_cfs(cpu);
2057                 ns->nr_running += rq->cfs.h_nr_running;
2058                 ns->compute_capacity += capacity_of(cpu);
2059
2060                 if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) {
2061                         if (READ_ONCE(rq->numa_migrate_on) ||
2062                             !cpumask_test_cpu(cpu, env->p->cpus_ptr))
2063                                 continue;
2064
2065                         if (ns->idle_cpu == -1)
2066                                 ns->idle_cpu = cpu;
2067
2068                         idle_core = numa_idle_core(idle_core, cpu);
2069                 }
2070         }
2071         rcu_read_unlock();
2072
2073         ns->weight = cpumask_weight(cpumask_of_node(nid));
2074
2075         ns->node_type = numa_classify(env->imbalance_pct, ns);
2076
2077         if (idle_core >= 0)
2078                 ns->idle_cpu = idle_core;
2079 }
2080
2081 static void task_numa_assign(struct task_numa_env *env,
2082                              struct task_struct *p, long imp)
2083 {
2084         struct rq *rq = cpu_rq(env->dst_cpu);
2085
2086         /* Check if run-queue part of active NUMA balance. */
2087         if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
2088                 int cpu;
2089                 int start = env->dst_cpu;
2090
2091                 /* Find alternative idle CPU. */
2092                 for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + 1) {
2093                         if (cpu == env->best_cpu || !idle_cpu(cpu) ||
2094                             !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
2095                                 continue;
2096                         }
2097
2098                         env->dst_cpu = cpu;
2099                         rq = cpu_rq(env->dst_cpu);
2100                         if (!xchg(&rq->numa_migrate_on, 1))
2101                                 goto assign;
2102                 }
2103
2104                 /* Failed to find an alternative idle CPU */
2105                 return;
2106         }
2107
2108 assign:
2109         /*
2110          * Clear previous best_cpu/rq numa-migrate flag, since task now
2111          * found a better CPU to move/swap.
2112          */
2113         if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
2114                 rq = cpu_rq(env->best_cpu);
2115                 WRITE_ONCE(rq->numa_migrate_on, 0);
2116         }
2117
2118         if (env->best_task)
2119                 put_task_struct(env->best_task);
2120         if (p)
2121                 get_task_struct(p);
2122
2123         env->best_task = p;
2124         env->best_imp = imp;
2125         env->best_cpu = env->dst_cpu;
2126 }
2127
2128 static bool load_too_imbalanced(long src_load, long dst_load,
2129                                 struct task_numa_env *env)
2130 {
2131         long imb, old_imb;
2132         long orig_src_load, orig_dst_load;
2133         long src_capacity, dst_capacity;
2134
2135         /*
2136          * The load is corrected for the CPU capacity available on each node.
2137          *
2138          * src_load        dst_load
2139          * ------------ vs ---------
2140          * src_capacity    dst_capacity
2141          */
2142         src_capacity = env->src_stats.compute_capacity;
2143         dst_capacity = env->dst_stats.compute_capacity;
2144
2145         imb = abs(dst_load * src_capacity - src_load * dst_capacity);
2146
2147         orig_src_load = env->src_stats.load;
2148         orig_dst_load = env->dst_stats.load;
2149
2150         old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
2151
2152         /* Would this change make things worse? */
2153         return (imb > old_imb);
2154 }
2155
2156 /*
2157  * Maximum NUMA importance can be 1998 (2*999);
2158  * SMALLIMP @ 30 would be close to 1998/64.
2159  * Used to deter task migration.
2160  */
2161 #define SMALLIMP        30
2162
2163 /*
2164  * This checks if the overall compute and NUMA accesses of the system would
2165  * be improved if the source tasks was migrated to the target dst_cpu taking
2166  * into account that it might be best if task running on the dst_cpu should
2167  * be exchanged with the source task
2168  */
2169 static bool task_numa_compare(struct task_numa_env *env,
2170                               long taskimp, long groupimp, bool maymove)
2171 {
2172         struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
2173         struct rq *dst_rq = cpu_rq(env->dst_cpu);
2174         long imp = p_ng ? groupimp : taskimp;
2175         struct task_struct *cur;
2176         long src_load, dst_load;
2177         int dist = env->dist;
2178         long moveimp = imp;
2179         long load;
2180         bool stopsearch = false;
2181
2182         if (READ_ONCE(dst_rq->numa_migrate_on))
2183                 return false;
2184
2185         rcu_read_lock();
2186         cur = rcu_dereference(dst_rq->curr);
2187         if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
2188                 cur = NULL;
2189
2190         /*
2191          * Because we have preemption enabled we can get migrated around and
2192          * end try selecting ourselves (current == env->p) as a swap candidate.
2193          */
2194         if (cur == env->p) {
2195                 stopsearch = true;
2196                 goto unlock;
2197         }
2198
2199         if (!cur) {
2200                 if (maymove && moveimp >= env->best_imp)
2201                         goto assign;
2202                 else
2203                         goto unlock;
2204         }
2205
2206         /* Skip this swap candidate if cannot move to the source cpu. */
2207         if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
2208                 goto unlock;
2209
2210         /*
2211          * Skip this swap candidate if it is not moving to its preferred
2212          * node and the best task is.
2213          */
2214         if (env->best_task &&
2215             env->best_task->numa_preferred_nid == env->src_nid &&
2216             cur->numa_preferred_nid != env->src_nid) {
2217                 goto unlock;
2218         }
2219
2220         /*
2221          * "imp" is the fault differential for the source task between the
2222          * source and destination node. Calculate the total differential for
2223          * the source task and potential destination task. The more negative
2224          * the value is, the more remote accesses that would be expected to
2225          * be incurred if the tasks were swapped.
2226          *
2227          * If dst and source tasks are in the same NUMA group, or not
2228          * in any group then look only at task weights.
2229          */
2230         cur_ng = rcu_dereference(cur->numa_group);
2231         if (cur_ng == p_ng) {
2232                 /*
2233                  * Do not swap within a group or between tasks that have
2234                  * no group if there is spare capacity. Swapping does
2235                  * not address the load imbalance and helps one task at
2236                  * the cost of punishing another.
2237                  */
2238                 if (env->dst_stats.node_type == node_has_spare)
2239                         goto unlock;
2240
2241                 imp = taskimp + task_weight(cur, env->src_nid, dist) -
2242                       task_weight(cur, env->dst_nid, dist);
2243                 /*
2244                  * Add some hysteresis to prevent swapping the
2245                  * tasks within a group over tiny differences.
2246                  */
2247                 if (cur_ng)
2248                         imp -= imp / 16;
2249         } else {
2250                 /*
2251                  * Compare the group weights. If a task is all by itself
2252                  * (not part of a group), use the task weight instead.
2253                  */
2254                 if (cur_ng && p_ng)
2255                         imp += group_weight(cur, env->src_nid, dist) -
2256                                group_weight(cur, env->dst_nid, dist);
2257                 else
2258                         imp += task_weight(cur, env->src_nid, dist) -
2259                                task_weight(cur, env->dst_nid, dist);
2260         }
2261
2262         /* Discourage picking a task already on its preferred node */
2263         if (cur->numa_preferred_nid == env->dst_nid)
2264                 imp -= imp / 16;
2265
2266         /*
2267          * Encourage picking a task that moves to its preferred node.
2268          * This potentially makes imp larger than it's maximum of
2269          * 1998 (see SMALLIMP and task_weight for why) but in this
2270          * case, it does not matter.
2271          */
2272         if (cur->numa_preferred_nid == env->src_nid)
2273                 imp += imp / 8;
2274
2275         if (maymove && moveimp > imp && moveimp > env->best_imp) {
2276                 imp = moveimp;
2277                 cur = NULL;
2278                 goto assign;
2279         }
2280
2281         /*
2282          * Prefer swapping with a task moving to its preferred node over a
2283          * task that is not.
2284          */
2285         if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
2286             env->best_task->numa_preferred_nid != env->src_nid) {
2287                 goto assign;
2288         }
2289
2290         /*
2291          * If the NUMA importance is less than SMALLIMP,
2292          * task migration might only result in ping pong
2293          * of tasks and also hurt performance due to cache
2294          * misses.
2295          */
2296         if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
2297                 goto unlock;
2298
2299         /*
2300          * In the overloaded case, try and keep the load balanced.
2301          */
2302         load = task_h_load(env->p) - task_h_load(cur);
2303         if (!load)
2304                 goto assign;
2305
2306         dst_load = env->dst_stats.load + load;
2307         src_load = env->src_stats.load - load;
2308
2309         if (load_too_imbalanced(src_load, dst_load, env))
2310                 goto unlock;
2311
2312 assign:
2313         /* Evaluate an idle CPU for a task numa move. */
2314         if (!cur) {
2315                 int cpu = env->dst_stats.idle_cpu;
2316
2317                 /* Nothing cached so current CPU went idle since the search. */
2318                 if (cpu < 0)
2319                         cpu = env->dst_cpu;
2320
2321                 /*
2322                  * If the CPU is no longer truly idle and the previous best CPU
2323                  * is, keep using it.
2324                  */
2325                 if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
2326                     idle_cpu(env->best_cpu)) {
2327                         cpu = env->best_cpu;
2328                 }
2329
2330                 env->dst_cpu = cpu;
2331         }
2332
2333         task_numa_assign(env, cur, imp);
2334
2335         /*
2336          * If a move to idle is allowed because there is capacity or load
2337          * balance improves then stop the search. While a better swap
2338          * candidate may exist, a search is not free.
2339          */
2340         if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
2341                 stopsearch = true;
2342
2343         /*
2344          * If a swap candidate must be identified and the current best task
2345          * moves its preferred node then stop the search.
2346          */
2347         if (!maymove && env->best_task &&
2348             env->best_task->numa_preferred_nid == env->src_nid) {
2349                 stopsearch = true;
2350         }
2351 unlock:
2352         rcu_read_unlock();
2353
2354         return stopsearch;
2355 }
2356
2357 static void task_numa_find_cpu(struct task_numa_env *env,
2358                                 long taskimp, long groupimp)
2359 {
2360         bool maymove = false;
2361         int cpu;
2362
2363         /*
2364          * If dst node has spare capacity, then check if there is an
2365          * imbalance that would be overruled by the load balancer.
2366          */
2367         if (env->dst_stats.node_type == node_has_spare) {
2368                 unsigned int imbalance;
2369                 int src_running, dst_running;
2370
2371                 /*
2372                  * Would movement cause an imbalance? Note that if src has
2373                  * more running tasks that the imbalance is ignored as the
2374                  * move improves the imbalance from the perspective of the
2375                  * CPU load balancer.
2376                  * */
2377                 src_running = env->src_stats.nr_running - 1;
2378                 dst_running = env->dst_stats.nr_running + 1;
2379                 imbalance = max(0, dst_running - src_running);
2380                 imbalance = adjust_numa_imbalance(imbalance, dst_running,
2381                                                   env->imb_numa_nr);
2382
2383                 /* Use idle CPU if there is no imbalance */
2384                 if (!imbalance) {
2385                         maymove = true;
2386                         if (env->dst_stats.idle_cpu >= 0) {
2387                                 env->dst_cpu = env->dst_stats.idle_cpu;
2388                                 task_numa_assign(env, NULL, 0);
2389                                 return;
2390                         }
2391                 }
2392         } else {
2393                 long src_load, dst_load, load;
2394                 /*
2395                  * If the improvement from just moving env->p direction is better
2396                  * than swapping tasks around, check if a move is possible.
2397                  */
2398                 load = task_h_load(env->p);
2399                 dst_load = env->dst_stats.load + load;
2400                 src_load = env->src_stats.load - load;
2401                 maymove = !load_too_imbalanced(src_load, dst_load, env);
2402         }
2403
2404         for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
2405                 /* Skip this CPU if the source task cannot migrate */
2406                 if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
2407                         continue;
2408
2409                 env->dst_cpu = cpu;
2410                 if (task_numa_compare(env, taskimp, groupimp, maymove))
2411                         break;
2412         }
2413 }
2414
2415 static int task_numa_migrate(struct task_struct *p)
2416 {
2417         struct task_numa_env env = {
2418                 .p = p,
2419
2420                 .src_cpu = task_cpu(p),
2421                 .src_nid = task_node(p),
2422
2423                 .imbalance_pct = 112,
2424
2425                 .best_task = NULL,
2426                 .best_imp = 0,
2427                 .best_cpu = -1,
2428         };
2429         unsigned long taskweight, groupweight;
2430         struct sched_domain *sd;
2431         long taskimp, groupimp;
2432         struct numa_group *ng;
2433         struct rq *best_rq;
2434         int nid, ret, dist;
2435
2436         /*
2437          * Pick the lowest SD_NUMA domain, as that would have the smallest
2438          * imbalance and would be the first to start moving tasks about.
2439          *
2440          * And we want to avoid any moving of tasks about, as that would create
2441          * random movement of tasks -- counter the numa conditions we're trying
2442          * to satisfy here.
2443          */
2444         rcu_read_lock();
2445         sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
2446         if (sd) {
2447                 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
2448                 env.imb_numa_nr = sd->imb_numa_nr;
2449         }
2450         rcu_read_unlock();
2451
2452         /*
2453          * Cpusets can break the scheduler domain tree into smaller
2454          * balance domains, some of which do not cross NUMA boundaries.
2455          * Tasks that are "trapped" in such domains cannot be migrated
2456          * elsewhere, so there is no point in (re)trying.
2457          */
2458         if (unlikely(!sd)) {
2459                 sched_setnuma(p, task_node(p));
2460                 return -EINVAL;
2461         }
2462
2463         env.dst_nid = p->numa_preferred_nid;
2464         dist = env.dist = node_distance(env.src_nid, env.dst_nid);
2465         taskweight = task_weight(p, env.src_nid, dist);
2466         groupweight = group_weight(p, env.src_nid, dist);
2467         update_numa_stats(&env, &env.src_stats, env.src_nid, false);
2468         taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
2469         groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
2470         update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
2471
2472         /* Try to find a spot on the preferred nid. */
2473         task_numa_find_cpu(&env, taskimp, groupimp);
2474
2475         /*
2476          * Look at other nodes in these cases:
2477          * - there is no space available on the preferred_nid
2478          * - the task is part of a numa_group that is interleaved across
2479          *   multiple NUMA nodes; in order to better consolidate the group,
2480          *   we need to check other locations.
2481          */
2482         ng = deref_curr_numa_group(p);
2483         if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
2484                 for_each_node_state(nid, N_CPU) {
2485                         if (nid == env.src_nid || nid == p->numa_preferred_nid)
2486                                 continue;
2487
2488                         dist = node_distance(env.src_nid, env.dst_nid);
2489                         if (sched_numa_topology_type == NUMA_BACKPLANE &&
2490                                                 dist != env.dist) {
2491                                 taskweight = task_weight(p, env.src_nid, dist);
2492                                 groupweight = group_weight(p, env.src_nid, dist);
2493                         }
2494
2495                         /* Only consider nodes where both task and groups benefit */
2496                         taskimp = task_weight(p, nid, dist) - taskweight;
2497                         groupimp = group_weight(p, nid, dist) - groupweight;
2498                         if (taskimp < 0 && groupimp < 0)
2499                                 continue;
2500
2501                         env.dist = dist;
2502                         env.dst_nid = nid;
2503                         update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
2504                         task_numa_find_cpu(&env, taskimp, groupimp);
2505                 }
2506         }
2507
2508         /*
2509          * If the task is part of a workload that spans multiple NUMA nodes,
2510          * and is migrating into one of the workload's active nodes, remember
2511          * this node as the task's preferred numa node, so the workload can
2512          * settle down.
2513          * A task that migrated to a second choice node will be better off
2514          * trying for a better one later. Do not set the preferred node here.
2515          */
2516         if (ng) {
2517                 if (env.best_cpu == -1)
2518                         nid = env.src_nid;
2519                 else
2520                         nid = cpu_to_node(env.best_cpu);
2521
2522                 if (nid != p->numa_preferred_nid)
2523                         sched_setnuma(p, nid);
2524         }
2525
2526         /* No better CPU than the current one was found. */
2527         if (env.best_cpu == -1) {
2528                 trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
2529                 return -EAGAIN;
2530         }
2531
2532         best_rq = cpu_rq(env.best_cpu);
2533         if (env.best_task == NULL) {
2534                 ret = migrate_task_to(p, env.best_cpu);
2535                 WRITE_ONCE(best_rq->numa_migrate_on, 0);
2536                 if (ret != 0)
2537                         trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
2538                 return ret;
2539         }
2540
2541         ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
2542         WRITE_ONCE(best_rq->numa_migrate_on, 0);
2543
2544         if (ret != 0)
2545                 trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
2546         put_task_struct(env.best_task);
2547         return ret;
2548 }
2549
2550 /* Attempt to migrate a task to a CPU on the preferred node. */
2551 static void numa_migrate_preferred(struct task_struct *p)
2552 {
2553         unsigned long interval = HZ;
2554
2555         /* This task has no NUMA fault statistics yet */
2556         if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
2557                 return;
2558
2559         /* Periodically retry migrating the task to the preferred node */
2560         interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
2561         p->numa_migrate_retry = jiffies + interval;
2562
2563         /* Success if task is already running on preferred CPU */
2564         if (task_node(p) == p->numa_preferred_nid)
2565                 return;
2566
2567         /* Otherwise, try migrate to a CPU on the preferred node */
2568         task_numa_migrate(p);
2569 }
2570
2571 /*
2572  * Find out how many nodes the workload is actively running on. Do this by
2573  * tracking the nodes from which NUMA hinting faults are triggered. This can
2574  * be different from the set of nodes where the workload's memory is currently
2575  * located.
2576  */
2577 static void numa_group_count_active_nodes(struct numa_group *numa_group)
2578 {
2579         unsigned long faults, max_faults = 0;
2580         int nid, active_nodes = 0;
2581
2582         for_each_node_state(nid, N_CPU) {
2583                 faults = group_faults_cpu(numa_group, nid);
2584                 if (faults > max_faults)
2585                         max_faults = faults;
2586         }
2587
2588         for_each_node_state(nid, N_CPU) {
2589                 faults = group_faults_cpu(numa_group, nid);
2590                 if (faults * ACTIVE_NODE_FRACTION > max_faults)
2591                         active_nodes++;
2592         }
2593
2594         numa_group->max_faults_cpu = max_faults;
2595         numa_group->active_nodes = active_nodes;
2596 }
2597
2598 /*
2599  * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
2600  * increments. The more local the fault statistics are, the higher the scan
2601  * period will be for the next scan window. If local/(local+remote) ratio is
2602  * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
2603  * the scan period will decrease. Aim for 70% local accesses.
2604  */
2605 #define NUMA_PERIOD_SLOTS 10
2606 #define NUMA_PERIOD_THRESHOLD 7
2607
2608 /*
2609  * Increase the scan period (slow down scanning) if the majority of
2610  * our memory is already on our local node, or if the majority of
2611  * the page accesses are shared with other processes.
2612  * Otherwise, decrease the scan period.
2613  */
2614 static void update_task_scan_period(struct task_struct *p,
2615                         unsigned long shared, unsigned long private)
2616 {
2617         unsigned int period_slot;
2618         int lr_ratio, ps_ratio;
2619         int diff;
2620
2621         unsigned long remote = p->numa_faults_locality[0];
2622         unsigned long local = p->numa_faults_locality[1];
2623
2624         /*
2625          * If there were no record hinting faults then either the task is
2626          * completely idle or all activity is in areas that are not of interest
2627          * to automatic numa balancing. Related to that, if there were failed
2628          * migration then it implies we are migrating too quickly or the local
2629          * node is overloaded. In either case, scan slower
2630          */
2631         if (local + shared == 0 || p->numa_faults_locality[2]) {
2632                 p->numa_scan_period = min(p->numa_scan_period_max,
2633                         p->numa_scan_period << 1);
2634
2635                 p->mm->numa_next_scan = jiffies +
2636                         msecs_to_jiffies(p->numa_scan_period);
2637
2638                 return;
2639         }
2640
2641         /*
2642          * Prepare to scale scan period relative to the current period.
2643          *       == NUMA_PERIOD_THRESHOLD scan period stays the same
2644          *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
2645          *       >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
2646          */
2647         period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
2648         lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
2649         ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
2650
2651         if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
2652                 /*
2653                  * Most memory accesses are local. There is no need to
2654                  * do fast NUMA scanning, since memory is already local.
2655                  */
2656                 int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
2657                 if (!slot)
2658                         slot = 1;
2659                 diff = slot * period_slot;
2660         } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
2661                 /*
2662                  * Most memory accesses are shared with other tasks.
2663                  * There is no point in continuing fast NUMA scanning,
2664                  * since other tasks may just move the memory elsewhere.
2665                  */
2666                 int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
2667                 if (!slot)
2668                         slot = 1;
2669                 diff = slot * period_slot;
2670         } else {
2671                 /*
2672                  * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
2673                  * yet they are not on the local NUMA node. Speed up
2674                  * NUMA scanning to get the memory moved over.
2675                  */
2676                 int ratio = max(lr_ratio, ps_ratio);
2677                 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
2678         }
2679
2680         p->numa_scan_period = clamp(p->numa_scan_period + diff,
2681                         task_scan_min(p), task_scan_max(p));
2682         memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2683 }
2684
2685 /*
2686  * Get the fraction of time the task has been running since the last
2687  * NUMA placement cycle. The scheduler keeps similar statistics, but
2688  * decays those on a 32ms period, which is orders of magnitude off
2689  * from the dozens-of-seconds NUMA balancing period. Use the scheduler
2690  * stats only if the task is so new there are no NUMA statistics yet.
2691  */
2692 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2693 {
2694         u64 runtime, delta, now;
2695         /* Use the start of this time slice to avoid calculations. */
2696         now = p->se.exec_start;
2697         runtime = p->se.sum_exec_runtime;
2698
2699         if (p->last_task_numa_placement) {
2700                 delta = runtime - p->last_sum_exec_runtime;
2701                 *period = now - p->last_task_numa_placement;
2702
2703                 /* Avoid time going backwards, prevent potential divide error: */
2704                 if (unlikely((s64)*period < 0))
2705                         *period = 0;
2706         } else {
2707                 delta = p->se.avg.load_sum;
2708                 *period = LOAD_AVG_MAX;
2709         }
2710
2711         p->last_sum_exec_runtime = runtime;
2712         p->last_task_numa_placement = now;
2713
2714         return delta;
2715 }
2716
2717 /*
2718  * Determine the preferred nid for a task in a numa_group. This needs to
2719  * be done in a way that produces consistent results with group_weight,
2720  * otherwise workloads might not converge.
2721  */
2722 static int preferred_group_nid(struct task_struct *p, int nid)
2723 {
2724         nodemask_t nodes;
2725         int dist;
2726
2727         /* Direct connections between all NUMA nodes. */
2728         if (sched_numa_topology_type == NUMA_DIRECT)
2729                 return nid;
2730
2731         /*
2732          * On a system with glueless mesh NUMA topology, group_weight
2733          * scores nodes according to the number of NUMA hinting faults on
2734          * both the node itself, and on nearby nodes.
2735          */
2736         if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
2737                 unsigned long score, max_score = 0;
2738                 int node, max_node = nid;
2739
2740                 dist = sched_max_numa_distance;
2741
2742                 for_each_node_state(node, N_CPU) {
2743                         score = group_weight(p, node, dist);
2744                         if (score > max_score) {
2745                                 max_score = score;
2746                                 max_node = node;
2747                         }
2748                 }
2749                 return max_node;
2750         }
2751
2752         /*
2753          * Finding the preferred nid in a system with NUMA backplane
2754          * interconnect topology is more involved. The goal is to locate
2755          * tasks from numa_groups near each other in the system, and
2756          * untangle workloads from different sides of the system. This requires
2757          * searching down the hierarchy of node groups, recursively searching
2758          * inside the highest scoring group of nodes. The nodemask tricks
2759          * keep the complexity of the search down.
2760          */
2761         nodes = node_states[N_CPU];
2762         for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
2763                 unsigned long max_faults = 0;
2764                 nodemask_t max_group = NODE_MASK_NONE;
2765                 int a, b;
2766
2767                 /* Are there nodes at this distance from each other? */
2768                 if (!find_numa_distance(dist))
2769                         continue;
2770
2771                 for_each_node_mask(a, nodes) {
2772                         unsigned long faults = 0;
2773                         nodemask_t this_group;
2774                         nodes_clear(this_group);
2775
2776                         /* Sum group's NUMA faults; includes a==b case. */
2777                         for_each_node_mask(b, nodes) {
2778                                 if (node_distance(a, b) < dist) {
2779                                         faults += group_faults(p, b);
2780                                         node_set(b, this_group);
2781                                         node_clear(b, nodes);
2782                                 }
2783                         }
2784
2785                         /* Remember the top group. */
2786                         if (faults > max_faults) {
2787                                 max_faults = faults;
2788                                 max_group = this_group;
2789                                 /*
2790                                  * subtle: at the smallest distance there is
2791                                  * just one node left in each "group", the
2792                                  * winner is the preferred nid.
2793                                  */
2794                                 nid = a;
2795                         }
2796                 }
2797                 /* Next round, evaluate the nodes within max_group. */
2798                 if (!max_faults)
2799                         break;
2800                 nodes = max_group;
2801         }
2802         return nid;
2803 }
2804
2805 static void task_numa_placement(struct task_struct *p)
2806 {
2807         int seq, nid, max_nid = NUMA_NO_NODE;
2808         unsigned long max_faults = 0;
2809         unsigned long fault_types[2] = { 0, 0 };
2810         unsigned long total_faults;
2811         u64 runtime, period;
2812         spinlock_t *group_lock = NULL;
2813         struct numa_group *ng;
2814
2815         /*
2816          * The p->mm->numa_scan_seq field gets updated without
2817          * exclusive access. Use READ_ONCE() here to ensure
2818          * that the field is read in a single access:
2819          */
2820         seq = READ_ONCE(p->mm->numa_scan_seq);
2821         if (p->numa_scan_seq == seq)
2822                 return;
2823         p->numa_scan_seq = seq;
2824         p->numa_scan_period_max = task_scan_max(p);
2825
2826         total_faults = p->numa_faults_locality[0] +
2827                        p->numa_faults_locality[1];
2828         runtime = numa_get_avg_runtime(p, &period);
2829
2830         /* If the task is part of a group prevent parallel updates to group stats */
2831         ng = deref_curr_numa_group(p);
2832         if (ng) {
2833                 group_lock = &ng->lock;
2834                 spin_lock_irq(group_lock);
2835         }
2836
2837         /* Find the node with the highest number of faults */
2838         for_each_online_node(nid) {
2839                 /* Keep track of the offsets in numa_faults array */
2840                 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2841                 unsigned long faults = 0, group_faults = 0;
2842                 int priv;
2843
2844                 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2845                         long diff, f_diff, f_weight;
2846
2847                         mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2848                         membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2849                         cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2850                         cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2851
2852                         /* Decay existing window, copy faults since last scan */
2853                         diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2854                         fault_types[priv] += p->numa_faults[membuf_idx];
2855                         p->numa_faults[membuf_idx] = 0;
2856
2857                         /*
2858                          * Normalize the faults_from, so all tasks in a group
2859                          * count according to CPU use, instead of by the raw
2860                          * number of faults. Tasks with little runtime have
2861                          * little over-all impact on throughput, and thus their
2862                          * faults are less important.
2863                          */
2864                         f_weight = div64_u64(runtime << 16, period + 1);
2865                         f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2866                                    (total_faults + 1);
2867                         f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2868                         p->numa_faults[cpubuf_idx] = 0;
2869
2870                         p->numa_faults[mem_idx] += diff;
2871                         p->numa_faults[cpu_idx] += f_diff;
2872                         faults += p->numa_faults[mem_idx];
2873                         p->total_numa_faults += diff;
2874                         if (ng) {
2875                                 /*
2876                                  * safe because we can only change our own group
2877                                  *
2878                                  * mem_idx represents the offset for a given
2879                                  * nid and priv in a specific region because it
2880                                  * is at the beginning of the numa_faults array.
2881                                  */
2882                                 ng->faults[mem_idx] += diff;
2883                                 ng->faults[cpu_idx] += f_diff;
2884                                 ng->total_faults += diff;
2885                                 group_faults += ng->faults[mem_idx];
2886                         }
2887                 }
2888
2889                 if (!ng) {
2890                         if (faults > max_faults) {
2891                                 max_faults = faults;
2892                                 max_nid = nid;
2893                         }
2894                 } else if (group_faults > max_faults) {
2895                         max_faults = group_faults;
2896                         max_nid = nid;
2897                 }
2898         }
2899
2900         /* Cannot migrate task to CPU-less node */
2901         max_nid = numa_nearest_node(max_nid, N_CPU);
2902
2903         if (ng) {
2904                 numa_group_count_active_nodes(ng);
2905                 spin_unlock_irq(group_lock);
2906                 max_nid = preferred_group_nid(p, max_nid);
2907         }
2908
2909         if (max_faults) {
2910                 /* Set the new preferred node */
2911                 if (max_nid != p->numa_preferred_nid)
2912                         sched_setnuma(p, max_nid);
2913         }
2914
2915         update_task_scan_period(p, fault_types[0], fault_types[1]);
2916 }
2917
2918 static inline int get_numa_group(struct numa_group *grp)
2919 {
2920         return refcount_inc_not_zero(&grp->refcount);
2921 }
2922
2923 static inline void put_numa_group(struct numa_group *grp)
2924 {
2925         if (refcount_dec_and_test(&grp->refcount))
2926                 kfree_rcu(grp, rcu);
2927 }
2928
2929 static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2930                         int *priv)
2931 {
2932         struct numa_group *grp, *my_grp;
2933         struct task_struct *tsk;
2934         bool join = false;
2935         int cpu = cpupid_to_cpu(cpupid);
2936         int i;
2937
2938         if (unlikely(!deref_curr_numa_group(p))) {
2939                 unsigned int size = sizeof(struct numa_group) +
2940                                     NR_NUMA_HINT_FAULT_STATS *
2941                                     nr_node_ids * sizeof(unsigned long);
2942
2943                 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2944                 if (!grp)
2945                         return;
2946
2947                 refcount_set(&grp->refcount, 1);
2948                 grp->active_nodes = 1;
2949                 grp->max_faults_cpu = 0;
2950                 spin_lock_init(&grp->lock);
2951                 grp->gid = p->pid;
2952
2953                 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2954                         grp->faults[i] = p->numa_faults[i];
2955
2956                 grp->total_faults = p->total_numa_faults;
2957
2958                 grp->nr_tasks++;
2959                 rcu_assign_pointer(p->numa_group, grp);
2960         }
2961
2962         rcu_read_lock();
2963         tsk = READ_ONCE(cpu_rq(cpu)->curr);
2964
2965         if (!cpupid_match_pid(tsk, cpupid))
2966                 goto no_join;
2967
2968         grp = rcu_dereference(tsk->numa_group);
2969         if (!grp)
2970                 goto no_join;
2971
2972         my_grp = deref_curr_numa_group(p);
2973         if (grp == my_grp)
2974                 goto no_join;
2975
2976         /*
2977          * Only join the other group if its bigger; if we're the bigger group,
2978          * the other task will join us.
2979          */
2980         if (my_grp->nr_tasks > grp->nr_tasks)
2981                 goto no_join;
2982
2983         /*
2984          * Tie-break on the grp address.
2985          */
2986         if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2987                 goto no_join;
2988
2989         /* Always join threads in the same process. */
2990         if (tsk->mm == current->mm)
2991                 join = true;
2992
2993         /* Simple filter to avoid false positives due to PID collisions */
2994         if (flags & TNF_SHARED)
2995                 join = true;
2996
2997         /* Update priv based on whether false sharing was detected */
2998         *priv = !join;
2999
3000         if (join && !get_numa_group(grp))
3001                 goto no_join;
3002
3003         rcu_read_unlock();
3004
3005         if (!join)
3006                 return;
3007
3008         WARN_ON_ONCE(irqs_disabled());
3009         double_lock_irq(&my_grp->lock, &grp->lock);
3010
3011         for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
3012                 my_grp->faults[i] -= p->numa_faults[i];
3013                 grp->faults[i] += p->numa_faults[i];
3014         }
3015         my_grp->total_faults -= p->total_numa_faults;
3016         grp->total_faults += p->total_numa_faults;
3017
3018         my_grp->nr_tasks--;
3019         grp->nr_tasks++;
3020
3021         spin_unlock(&my_grp->lock);
3022         spin_unlock_irq(&grp->lock);
3023
3024         rcu_assign_pointer(p->numa_group, grp);
3025
3026         put_numa_group(my_grp);
3027         return;
3028
3029 no_join:
3030         rcu_read_unlock();
3031         return;
3032 }
3033
3034 /*
3035  * Get rid of NUMA statistics associated with a task (either current or dead).
3036  * If @final is set, the task is dead and has reached refcount zero, so we can
3037  * safely free all relevant data structures. Otherwise, there might be
3038  * concurrent reads from places like load balancing and procfs, and we should
3039  * reset the data back to default state without freeing ->numa_faults.
3040  */
3041 void task_numa_free(struct task_struct *p, bool final)
3042 {
3043         /* safe: p either is current or is being freed by current */
3044         struct numa_group *grp = rcu_dereference_raw(p->numa_group);
3045         unsigned long *numa_faults = p->numa_faults;
3046         unsigned long flags;
3047         int i;
3048
3049         if (!numa_faults)
3050                 return;
3051
3052         if (grp) {
3053                 spin_lock_irqsave(&grp->lock, flags);
3054                 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
3055                         grp->faults[i] -= p->numa_faults[i];
3056                 grp->total_faults -= p->total_numa_faults;
3057
3058                 grp->nr_tasks--;
3059                 spin_unlock_irqrestore(&grp->lock, flags);
3060                 RCU_INIT_POINTER(p->numa_group, NULL);
3061                 put_numa_group(grp);
3062         }
3063
3064         if (final) {
3065                 p->numa_faults = NULL;
3066                 kfree(numa_faults);
3067         } else {
3068                 p->total_numa_faults = 0;
3069                 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
3070                         numa_faults[i] = 0;
3071         }
3072 }
3073
3074 /*
3075  * Got a PROT_NONE fault for a page on @node.
3076  */
3077 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
3078 {
3079         struct task_struct *p = current;
3080         bool migrated = flags & TNF_MIGRATED;
3081         int cpu_node = task_node(current);
3082         int local = !!(flags & TNF_FAULT_LOCAL);
3083         struct numa_group *ng;
3084         int priv;
3085
3086         if (!static_branch_likely(&sched_numa_balancing))
3087                 return;
3088
3089         /* for example, ksmd faulting in a user's mm */
3090         if (!p->mm)
3091                 return;
3092
3093         /*
3094          * NUMA faults statistics are unnecessary for the slow memory
3095          * node for memory tiering mode.
3096          */
3097         if (!node_is_toptier(mem_node) &&
3098             (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ||
3099              !cpupid_valid(last_cpupid)))
3100                 return;
3101
3102         /* Allocate buffer to track faults on a per-node basis */
3103         if (unlikely(!p->numa_faults)) {
3104                 int size = sizeof(*p->numa_faults) *
3105                            NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
3106
3107                 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
3108                 if (!p->numa_faults)
3109                         return;
3110
3111                 p->total_numa_faults = 0;
3112                 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
3113         }
3114
3115         /*
3116          * First accesses are treated as private, otherwise consider accesses
3117          * to be private if the accessing pid has not changed
3118          */
3119         if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
3120                 priv = 1;
3121         } else {
3122                 priv = cpupid_match_pid(p, last_cpupid);
3123                 if (!priv && !(flags & TNF_NO_GROUP))
3124                         task_numa_group(p, last_cpupid, flags, &priv);
3125         }
3126
3127         /*
3128          * If a workload spans multiple NUMA nodes, a shared fault that
3129          * occurs wholly within the set of nodes that the workload is
3130          * actively using should be counted as local. This allows the
3131          * scan rate to slow down when a workload has settled down.
3132          */
3133         ng = deref_curr_numa_group(p);
3134         if (!priv && !local && ng && ng->active_nodes > 1 &&
3135                                 numa_is_active_node(cpu_node, ng) &&
3136                                 numa_is_active_node(mem_node, ng))
3137                 local = 1;
3138
3139         /*
3140          * Retry to migrate task to preferred node periodically, in case it
3141          * previously failed, or the scheduler moved us.
3142          */
3143         if (time_after(jiffies, p->numa_migrate_retry)) {
3144                 task_numa_placement(p);
3145                 numa_migrate_preferred(p);
3146         }
3147
3148         if (migrated)
3149                 p->numa_pages_migrated += pages;
3150         if (flags & TNF_MIGRATE_FAIL)
3151                 p->numa_faults_locality[2] += pages;
3152
3153         p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
3154         p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
3155         p->numa_faults_locality[local] += pages;
3156 }
3157
3158 static void reset_ptenuma_scan(struct task_struct *p)
3159 {
3160         /*
3161          * We only did a read acquisition of the mmap sem, so
3162          * p->mm->numa_scan_seq is written to without exclusive access
3163          * and the update is not guaranteed to be atomic. That's not
3164          * much of an issue though, since this is just used for
3165          * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
3166          * expensive, to avoid any form of compiler optimizations:
3167          */
3168         WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
3169         p->mm->numa_scan_offset = 0;
3170 }
3171
3172 static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
3173 {
3174         unsigned long pids;
3175         /*
3176          * Allow unconditional access first two times, so that all the (pages)
3177          * of VMAs get prot_none fault introduced irrespective of accesses.
3178          * This is also done to avoid any side effect of task scanning
3179          * amplifying the unfairness of disjoint set of VMAs' access.
3180          */
3181         if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2)
3182                 return true;
3183
3184         pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
3185         if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids))
3186                 return true;
3187
3188         /*
3189          * Complete a scan that has already started regardless of PID access, or
3190          * some VMAs may never be scanned in multi-threaded applications:
3191          */
3192         if (mm->numa_scan_offset > vma->vm_start) {
3193                 trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID);
3194                 return true;
3195         }
3196
3197         return false;
3198 }
3199
3200 #define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
3201
3202 /*
3203  * The expensive part of numa migration is done from task_work context.
3204  * Triggered from task_tick_numa().
3205  */
3206 static void task_numa_work(struct callback_head *work)
3207 {
3208         unsigned long migrate, next_scan, now = jiffies;
3209         struct task_struct *p = current;
3210         struct mm_struct *mm = p->mm;
3211         u64 runtime = p->se.sum_exec_runtime;
3212         struct vm_area_struct *vma;
3213         unsigned long start, end;
3214         unsigned long nr_pte_updates = 0;
3215         long pages, virtpages;
3216         struct vma_iterator vmi;
3217         bool vma_pids_skipped;
3218         bool vma_pids_forced = false;
3219
3220         SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
3221
3222         work->next = work;
3223         /*
3224          * Who cares about NUMA placement when they're dying.
3225          *
3226          * NOTE: make sure not to dereference p->mm before this check,
3227          * exit_task_work() happens _after_ exit_mm() so we could be called
3228          * without p->mm even though we still had it when we enqueued this
3229          * work.
3230          */
3231         if (p->flags & PF_EXITING)
3232                 return;
3233
3234         if (!mm->numa_next_scan) {
3235                 mm->numa_next_scan = now +
3236                         msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
3237         }
3238
3239         /*
3240          * Enforce maximal scan/migration frequency..
3241          */
3242         migrate = mm->numa_next_scan;
3243         if (time_before(now, migrate))
3244                 return;
3245
3246         if (p->numa_scan_period == 0) {
3247                 p->numa_scan_period_max = task_scan_max(p);
3248                 p->numa_scan_period = task_scan_start(p);
3249         }
3250
3251         next_scan = now + msecs_to_jiffies(p->numa_scan_period);
3252         if (!try_cmpxchg(&mm->numa_next_scan, &migrate, next_scan))
3253                 return;
3254
3255         /*
3256          * Delay this task enough that another task of this mm will likely win
3257          * the next time around.
3258          */
3259         p->node_stamp += 2 * TICK_NSEC;
3260
3261         pages = sysctl_numa_balancing_scan_size;
3262         pages <<= 20 - PAGE_SHIFT; /* MB in pages */
3263         virtpages = pages * 8;     /* Scan up to this much virtual space */
3264         if (!pages)
3265                 return;
3266
3267
3268         if (!mmap_read_trylock(mm))
3269                 return;
3270
3271         /*
3272          * VMAs are skipped if the current PID has not trapped a fault within
3273          * the VMA recently. Allow scanning to be forced if there is no
3274          * suitable VMA remaining.
3275          */
3276         vma_pids_skipped = false;
3277
3278 retry_pids:
3279         start = mm->numa_scan_offset;
3280         vma_iter_init(&vmi, mm, start);
3281         vma = vma_next(&vmi);
3282         if (!vma) {
3283                 reset_ptenuma_scan(p);
3284                 start = 0;
3285                 vma_iter_set(&vmi, start);
3286                 vma = vma_next(&vmi);
3287         }
3288
3289         do {
3290                 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
3291                         is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
3292                         trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE);
3293                         continue;
3294                 }
3295
3296                 /*
3297                  * Shared library pages mapped by multiple processes are not
3298                  * migrated as it is expected they are cache replicated. Avoid
3299                  * hinting faults in read-only file-backed mappings or the vdso
3300                  * as migrating the pages will be of marginal benefit.
3301                  */
3302                 if (!vma->vm_mm ||
3303                     (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) {
3304                         trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO);
3305                         continue;
3306                 }
3307
3308                 /*
3309                  * Skip inaccessible VMAs to avoid any confusion between
3310                  * PROT_NONE and NUMA hinting ptes
3311                  */
3312                 if (!vma_is_accessible(vma)) {
3313                         trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE);
3314                         continue;
3315                 }
3316
3317                 /* Initialise new per-VMA NUMAB state. */
3318                 if (!vma->numab_state) {
3319                         vma->numab_state = kzalloc(sizeof(struct vma_numab_state),
3320                                 GFP_KERNEL);
3321                         if (!vma->numab_state)
3322                                 continue;
3323
3324                         vma->numab_state->start_scan_seq = mm->numa_scan_seq;
3325
3326                         vma->numab_state->next_scan = now +
3327                                 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
3328
3329                         /* Reset happens after 4 times scan delay of scan start */
3330                         vma->numab_state->pids_active_reset =  vma->numab_state->next_scan +
3331                                 msecs_to_jiffies(VMA_PID_RESET_PERIOD);
3332
3333                         /*
3334                          * Ensure prev_scan_seq does not match numa_scan_seq,
3335                          * to prevent VMAs being skipped prematurely on the
3336                          * first scan:
3337                          */
3338                          vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1;
3339                 }
3340
3341                 /*
3342                  * Scanning the VMA's of short lived tasks add more overhead. So
3343                  * delay the scan for new VMAs.
3344                  */
3345                 if (mm->numa_scan_seq && time_before(jiffies,
3346                                                 vma->numab_state->next_scan)) {
3347                         trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY);
3348                         continue;
3349                 }
3350
3351                 /* RESET access PIDs regularly for old VMAs. */
3352                 if (mm->numa_scan_seq &&
3353                                 time_after(jiffies, vma->numab_state->pids_active_reset)) {
3354                         vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset +
3355                                 msecs_to_jiffies(VMA_PID_RESET_PERIOD);
3356                         vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]);
3357                         vma->numab_state->pids_active[1] = 0;
3358                 }
3359
3360                 /* Do not rescan VMAs twice within the same sequence. */
3361                 if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) {
3362                         mm->numa_scan_offset = vma->vm_end;
3363                         trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED);
3364                         continue;
3365                 }
3366
3367                 /*
3368                  * Do not scan the VMA if task has not accessed it, unless no other
3369                  * VMA candidate exists.
3370                  */
3371                 if (!vma_pids_forced && !vma_is_accessed(mm, vma)) {
3372                         vma_pids_skipped = true;
3373                         trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
3374                         continue;
3375                 }
3376
3377                 do {
3378                         start = max(start, vma->vm_start);
3379                         end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
3380                         end = min(end, vma->vm_end);
3381                         nr_pte_updates = change_prot_numa(vma, start, end);
3382
3383                         /*
3384                          * Try to scan sysctl_numa_balancing_size worth of
3385                          * hpages that have at least one present PTE that
3386                          * is not already pte-numa. If the VMA contains
3387                          * areas that are unused or already full of prot_numa
3388                          * PTEs, scan up to virtpages, to skip through those
3389                          * areas faster.
3390                          */
3391                         if (nr_pte_updates)
3392                                 pages -= (end - start) >> PAGE_SHIFT;
3393                         virtpages -= (end - start) >> PAGE_SHIFT;
3394
3395                         start = end;
3396                         if (pages <= 0 || virtpages <= 0)
3397                                 goto out;
3398
3399                         cond_resched();
3400                 } while (end != vma->vm_end);
3401
3402                 /* VMA scan is complete, do not scan until next sequence. */
3403                 vma->numab_state->prev_scan_seq = mm->numa_scan_seq;
3404
3405                 /*
3406                  * Only force scan within one VMA at a time, to limit the
3407                  * cost of scanning a potentially uninteresting VMA.
3408                  */
3409                 if (vma_pids_forced)
3410                         break;
3411         } for_each_vma(vmi, vma);
3412
3413         /*
3414          * If no VMAs are remaining and VMAs were skipped due to the PID
3415          * not accessing the VMA previously, then force a scan to ensure
3416          * forward progress:
3417          */
3418         if (!vma && !vma_pids_forced && vma_pids_skipped) {
3419                 vma_pids_forced = true;
3420                 goto retry_pids;
3421         }
3422
3423 out:
3424         /*
3425          * It is possible to reach the end of the VMA list but the last few
3426          * VMAs are not guaranteed to the vma_migratable. If they are not, we
3427          * would find the !migratable VMA on the next scan but not reset the
3428          * scanner to the start so check it now.
3429          */
3430         if (vma)
3431                 mm->numa_scan_offset = start;
3432         else
3433                 reset_ptenuma_scan(p);
3434         mmap_read_unlock(mm);
3435
3436         /*
3437          * Make sure tasks use at least 32x as much time to run other code
3438          * than they used here, to limit NUMA PTE scanning overhead to 3% max.
3439          * Usually update_task_scan_period slows down scanning enough; on an
3440          * overloaded system we need to limit overhead on a per task basis.
3441          */
3442         if (unlikely(p->se.sum_exec_runtime != runtime)) {
3443                 u64 diff = p->se.sum_exec_runtime - runtime;
3444                 p->node_stamp += 32 * diff;
3445         }
3446 }
3447
3448 void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
3449 {
3450         int mm_users = 0;
3451         struct mm_struct *mm = p->mm;
3452
3453         if (mm) {
3454                 mm_users = atomic_read(&mm->mm_users);
3455                 if (mm_users == 1) {
3456                         mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
3457                         mm->numa_scan_seq = 0;
3458                 }
3459         }
3460         p->node_stamp                   = 0;
3461         p->numa_scan_seq                = mm ? mm->numa_scan_seq : 0;
3462         p->numa_scan_period             = sysctl_numa_balancing_scan_delay;
3463         p->numa_migrate_retry           = 0;
3464         /* Protect against double add, see task_tick_numa and task_numa_work */
3465         p->numa_work.next               = &p->numa_work;
3466         p->numa_faults                  = NULL;
3467         p->numa_pages_migrated          = 0;
3468         p->total_numa_faults            = 0;
3469         RCU_INIT_POINTER(p->numa_group, NULL);
3470         p->last_task_numa_placement     = 0;
3471         p->last_sum_exec_runtime        = 0;
3472
3473         init_task_work(&p->numa_work, task_numa_work);
3474
3475         /* New address space, reset the preferred nid */
3476         if (!(clone_flags & CLONE_VM)) {
3477                 p->numa_preferred_nid = NUMA_NO_NODE;
3478                 return;
3479         }
3480
3481         /*
3482          * New thread, keep existing numa_preferred_nid which should be copied
3483          * already by arch_dup_task_struct but stagger when scans start.
3484          */
3485         if (mm) {
3486                 unsigned int delay;
3487
3488                 delay = min_t(unsigned int, task_scan_max(current),
3489                         current->numa_scan_period * mm_users * NSEC_PER_MSEC);
3490                 delay += 2 * TICK_NSEC;
3491                 p->node_stamp = delay;
3492         }
3493 }
3494
3495 /*
3496  * Drive the periodic memory faults..
3497  */
3498 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
3499 {
3500         struct callback_head *work = &curr->numa_work;
3501         u64 period, now;
3502
3503         /*
3504          * We don't care about NUMA placement if we don't have memory.
3505          */
3506         if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
3507                 return;
3508
3509         /*
3510          * Using runtime rather than walltime has the dual advantage that
3511          * we (mostly) drive the selection from busy threads and that the
3512          * task needs to have done some actual work before we bother with
3513          * NUMA placement.
3514          */
3515         now = curr->se.sum_exec_runtime;
3516         period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
3517
3518         if (now > curr->node_stamp + period) {
3519                 if (!curr->node_stamp)
3520                         curr->numa_scan_period = task_scan_start(curr);
3521                 curr->node_stamp += period;
3522
3523                 if (!time_before(jiffies, curr->mm->numa_next_scan))
3524                         task_work_add(curr, work, TWA_RESUME);
3525         }
3526 }
3527
3528 static void update_scan_period(struct task_struct *p, int new_cpu)
3529 {
3530         int src_nid = cpu_to_node(task_cpu(p));
3531         int dst_nid = cpu_to_node(new_cpu);
3532
3533         if (!static_branch_likely(&sched_numa_balancing))
3534                 return;
3535
3536         if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
3537                 return;
3538
3539         if (src_nid == dst_nid)
3540                 return;
3541
3542         /*
3543          * Allow resets if faults have been trapped before one scan
3544          * has completed. This is most likely due to a new task that
3545          * is pulled cross-node due to wakeups or load balancing.
3546          */
3547         if (p->numa_scan_seq) {
3548                 /*
3549                  * Avoid scan adjustments if moving to the preferred
3550                  * node or if the task was not previously running on
3551                  * the preferred node.
3552                  */
3553                 if (dst_nid == p->numa_preferred_nid ||
3554                     (p->numa_preferred_nid != NUMA_NO_NODE &&
3555                         src_nid != p->numa_preferred_nid))
3556                         return;
3557         }
3558
3559         p->numa_scan_period = task_scan_start(p);
3560 }
3561
3562 #else
3563 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
3564 {
3565 }
3566
3567 static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
3568 {
3569 }
3570
3571 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
3572 {
3573 }
3574
3575 static inline void update_scan_period(struct task_struct *p, int new_cpu)
3576 {
3577 }
3578
3579 #endif /* CONFIG_NUMA_BALANCING */
3580
3581 static void
3582 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
3583 {
3584         update_load_add(&cfs_rq->load, se->load.weight);
3585 #ifdef CONFIG_SMP
3586         if (entity_is_task(se)) {
3587                 struct rq *rq = rq_of(cfs_rq);
3588
3589                 account_numa_enqueue(rq, task_of(se));
3590                 list_add(&se->group_node, &rq->cfs_tasks);
3591         }
3592 #endif
3593         cfs_rq->nr_running++;
3594         if (se_is_idle(se))
3595                 cfs_rq->idle_nr_running++;
3596 }
3597
3598 static void
3599 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
3600 {
3601         update_load_sub(&cfs_rq->load, se->load.weight);
3602 #ifdef CONFIG_SMP
3603         if (entity_is_task(se)) {
3604                 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
3605                 list_del_init(&se->group_node);
3606         }
3607 #endif
3608         cfs_rq->nr_running--;
3609         if (se_is_idle(se))
3610                 cfs_rq->idle_nr_running--;
3611 }
3612
3613 /*
3614  * Signed add and clamp on underflow.
3615  *
3616  * Explicitly do a load-store to ensure the intermediate value never hits
3617  * memory. This allows lockless observations without ever seeing the negative
3618  * values.
3619  */
3620 #define add_positive(_ptr, _val) do {                           \
3621         typeof(_ptr) ptr = (_ptr);                              \
3622         typeof(_val) val = (_val);                              \
3623         typeof(*ptr) res, var = READ_ONCE(*ptr);                \
3624                                                                 \
3625         res = var + val;                                        \
3626                                                                 \
3627         if (val < 0 && res > var)                               \
3628                 res = 0;                                        \
3629                                                                 \
3630         WRITE_ONCE(*ptr, res);                                  \
3631 } while (0)
3632
3633 /*
3634  * Unsigned subtract and clamp on underflow.
3635  *
3636  * Explicitly do a load-store to ensure the intermediate value never hits
3637  * memory. This allows lockless observations without ever seeing the negative
3638  * values.
3639  */
3640 #define sub_positive(_ptr, _val) do {                           \
3641         typeof(_ptr) ptr = (_ptr);                              \
3642         typeof(*ptr) val = (_val);                              \
3643         typeof(*ptr) res, var = READ_ONCE(*ptr);                \
3644         res = var - val;                                        \
3645         if (res > var)                                          \
3646                 res = 0;                                        \
3647         WRITE_ONCE(*ptr, res);                                  \
3648 } while (0)
3649
3650 /*
3651  * Remove and clamp on negative, from a local variable.
3652  *
3653  * A variant of sub_positive(), which does not use explicit load-store
3654  * and is thus optimized for local variable updates.
3655  */
3656 #define lsub_positive(_ptr, _val) do {                          \
3657         typeof(_ptr) ptr = (_ptr);                              \
3658         *ptr -= min_t(typeof(*ptr), *ptr, _val);                \
3659 } while (0)
3660
3661 #ifdef CONFIG_SMP
3662 static inline void
3663 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3664 {
3665         cfs_rq->avg.load_avg += se->avg.load_avg;
3666         cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
3667 }
3668
3669 static inline void
3670 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3671 {
3672         sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
3673         sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
3674         /* See update_cfs_rq_load_avg() */
3675         cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
3676                                           cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
3677 }
3678 #else
3679 static inline void
3680 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3681 static inline void
3682 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3683 #endif
3684
3685 static void reweight_eevdf(struct sched_entity *se, u64 avruntime,
3686                            unsigned long weight)
3687 {
3688         unsigned long old_weight = se->load.weight;
3689         s64 vlag, vslice;
3690
3691         /*
3692          * VRUNTIME
3693          * ========
3694          *
3695          * COROLLARY #1: The virtual runtime of the entity needs to be
3696          * adjusted if re-weight at !0-lag point.
3697          *
3698          * Proof: For contradiction assume this is not true, so we can
3699          * re-weight without changing vruntime at !0-lag point.
3700          *
3701          *             Weight   VRuntime   Avg-VRuntime
3702          *     before    w          v            V
3703          *      after    w'         v'           V'
3704          *
3705          * Since lag needs to be preserved through re-weight:
3706          *
3707          *      lag = (V - v)*w = (V'- v')*w', where v = v'
3708          *      ==>     V' = (V - v)*w/w' + v           (1)
3709          *
3710          * Let W be the total weight of the entities before reweight,
3711          * since V' is the new weighted average of entities:
3712          *
3713          *      V' = (WV + w'v - wv) / (W + w' - w)     (2)
3714          *
3715          * by using (1) & (2) we obtain:
3716          *
3717          *      (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
3718          *      ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
3719          *      ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
3720          *      ==>     (V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
3721          *
3722          * Since we are doing at !0-lag point which means V != v, we
3723          * can simplify (3):
3724          *
3725          *      ==>     W / (W + w' - w) = w / w'
3726          *      ==>     Ww' = Ww + ww' - ww
3727          *      ==>     W * (w' - w) = w * (w' - w)
3728          *      ==>     W = w   (re-weight indicates w' != w)
3729          *
3730          * So the cfs_rq contains only one entity, hence vruntime of
3731          * the entity @v should always equal to the cfs_rq's weighted
3732          * average vruntime @V, which means we will always re-weight
3733          * at 0-lag point, thus breach assumption. Proof completed.
3734          *
3735          *
3736          * COROLLARY #2: Re-weight does NOT affect weighted average
3737          * vruntime of all the entities.
3738          *
3739          * Proof: According to corollary #1, Eq. (1) should be:
3740          *
3741          *      (V - v)*w = (V' - v')*w'
3742          *      ==>    v' = V' - (V - v)*w/w'           (4)
3743          *
3744          * According to the weighted average formula, we have:
3745          *
3746          *      V' = (WV - wv + w'v') / (W - w + w')
3747          *         = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
3748          *         = (WV - wv + w'V' - Vw + wv) / (W - w + w')
3749          *         = (WV + w'V' - Vw) / (W - w + w')
3750          *
3751          *      ==>  V'*(W - w + w') = WV + w'V' - Vw
3752          *      ==>     V' * (W - w) = (W - w) * V      (5)
3753          *
3754          * If the entity is the only one in the cfs_rq, then reweight
3755          * always occurs at 0-lag point, so V won't change. Or else
3756          * there are other entities, hence W != w, then Eq. (5) turns
3757          * into V' = V. So V won't change in either case, proof done.
3758          *
3759          *
3760          * So according to corollary #1 & #2, the effect of re-weight
3761          * on vruntime should be:
3762          *
3763          *      v' = V' - (V - v) * w / w'              (4)
3764          *         = V  - (V - v) * w / w'
3765          *         = V  - vl * w / w'
3766          *         = V  - vl'
3767          */
3768         if (avruntime != se->vruntime) {
3769                 vlag = entity_lag(avruntime, se);
3770                 vlag = div_s64(vlag * old_weight, weight);
3771                 se->vruntime = avruntime - vlag;
3772         }
3773
3774         /*
3775          * DEADLINE
3776          * ========
3777          *
3778          * When the weight changes, the virtual time slope changes and
3779          * we should adjust the relative virtual deadline accordingly.
3780          *
3781          *      d' = v' + (d - v)*w/w'
3782          *         = V' - (V - v)*w/w' + (d - v)*w/w'
3783          *         = V  - (V - v)*w/w' + (d - v)*w/w'
3784          *         = V  + (d - V)*w/w'
3785          */
3786         vslice = (s64)(se->deadline - avruntime);
3787         vslice = div_s64(vslice * old_weight, weight);
3788         se->deadline = avruntime + vslice;
3789 }
3790
3791 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
3792                             unsigned long weight)
3793 {
3794         bool curr = cfs_rq->curr == se;
3795         u64 avruntime;
3796
3797         if (se->on_rq) {
3798                 /* commit outstanding execution time */
3799                 update_curr(cfs_rq);
3800                 avruntime = avg_vruntime(cfs_rq);
3801                 if (!curr)
3802                         __dequeue_entity(cfs_rq, se);
3803                 update_load_sub(&cfs_rq->load, se->load.weight);
3804         }
3805         dequeue_load_avg(cfs_rq, se);
3806
3807         if (se->on_rq) {
3808                 reweight_eevdf(se, avruntime, weight);
3809         } else {
3810                 /*
3811                  * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
3812                  * we need to scale se->vlag when w_i changes.
3813                  */
3814                 se->vlag = div_s64(se->vlag * se->load.weight, weight);
3815         }
3816
3817         update_load_set(&se->load, weight);
3818
3819 #ifdef CONFIG_SMP
3820         do {
3821                 u32 divider = get_pelt_divider(&se->avg);
3822
3823                 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
3824         } while (0);
3825 #endif
3826
3827         enqueue_load_avg(cfs_rq, se);
3828         if (se->on_rq) {
3829                 update_load_add(&cfs_rq->load, se->load.weight);
3830                 if (!curr)
3831                         __enqueue_entity(cfs_rq, se);
3832
3833                 /*
3834                  * The entity's vruntime has been adjusted, so let's check
3835                  * whether the rq-wide min_vruntime needs updated too. Since
3836                  * the calculations above require stable min_vruntime rather
3837                  * than up-to-date one, we do the update at the end of the
3838                  * reweight process.
3839                  */
3840                 update_min_vruntime(cfs_rq);
3841         }
3842 }
3843
3844 void reweight_task(struct task_struct *p, int prio)
3845 {
3846         struct sched_entity *se = &p->se;
3847         struct cfs_rq *cfs_rq = cfs_rq_of(se);
3848         struct load_weight *load = &se->load;
3849         unsigned long weight = scale_load(sched_prio_to_weight[prio]);
3850
3851         reweight_entity(cfs_rq, se, weight);
3852         load->inv_weight = sched_prio_to_wmult[prio];
3853 }
3854
3855 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
3856
3857 #ifdef CONFIG_FAIR_GROUP_SCHED
3858 #ifdef CONFIG_SMP
3859 /*
3860  * All this does is approximate the hierarchical proportion which includes that
3861  * global sum we all love to hate.
3862  *
3863  * That is, the weight of a group entity, is the proportional share of the
3864  * group weight based on the group runqueue weights. That is:
3865  *
3866  *                     tg->weight * grq->load.weight
3867  *   ge->load.weight = -----------------------------               (1)
3868  *                       \Sum grq->load.weight
3869  *
3870  * Now, because computing that sum is prohibitively expensive to compute (been
3871  * there, done that) we approximate it with this average stuff. The average
3872  * moves slower and therefore the approximation is cheaper and more stable.
3873  *
3874  * So instead of the above, we substitute:
3875  *
3876  *   grq->load.weight -> grq->avg.load_avg                         (2)
3877  *
3878  * which yields the following:
3879  *
3880  *                     tg->weight * grq->avg.load_avg
3881  *   ge->load.weight = ------------------------------              (3)
3882  *                             tg->load_avg
3883  *
3884  * Where: tg->load_avg ~= \Sum grq->avg.load_avg
3885  *
3886  * That is shares_avg, and it is right (given the approximation (2)).
3887  *
3888  * The problem with it is that because the average is slow -- it was designed
3889  * to be exactly that of course -- this leads to transients in boundary
3890  * conditions. In specific, the case where the group was idle and we start the
3891  * one task. It takes time for our CPU's grq->avg.load_avg to build up,
3892  * yielding bad latency etc..
3893  *
3894  * Now, in that special case (1) reduces to:
3895  *
3896  *                     tg->weight * grq->load.weight
3897  *   ge->load.weight = ----------------------------- = tg->weight   (4)
3898  *                         grp->load.weight
3899  *
3900  * That is, the sum collapses because all other CPUs are idle; the UP scenario.
3901  *
3902  * So what we do is modify our approximation (3) to approach (4) in the (near)
3903  * UP case, like:
3904  *
3905  *   ge->load.weight =
3906  *
3907  *              tg->weight * grq->load.weight
3908  *     ---------------------------------------------------         (5)
3909  *     tg->load_avg - grq->avg.load_avg + grq->load.weight
3910  *
3911  * But because grq->load.weight can drop to 0, resulting in a divide by zero,
3912  * we need to use grq->avg.load_avg as its lower bound, which then gives:
3913  *
3914  *
3915  *                     tg->weight * grq->load.weight
3916  *   ge->load.weight = -----------------------------               (6)
3917  *                             tg_load_avg'
3918  *
3919  * Where:
3920  *
3921  *   tg_load_avg' = tg->load_avg - grq->avg.load_avg +
3922  *                  max(grq->load.weight, grq->avg.load_avg)
3923  *
3924  * And that is shares_weight and is icky. In the (near) UP case it approaches
3925  * (4) while in the normal case it approaches (3). It consistently
3926  * overestimates the ge->load.weight and therefore:
3927  *
3928  *   \Sum ge->load.weight >= tg->weight
3929  *
3930  * hence icky!
3931  */
3932 static long calc_group_shares(struct cfs_rq *cfs_rq)
3933 {
3934         long tg_weight, tg_shares, load, shares;
3935         struct task_group *tg = cfs_rq->tg;
3936
3937         tg_shares = READ_ONCE(tg->shares);
3938
3939         load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
3940
3941         tg_weight = atomic_long_read(&tg->load_avg);
3942
3943         /* Ensure tg_weight >= load */
3944         tg_weight -= cfs_rq->tg_load_avg_contrib;
3945         tg_weight += load;
3946
3947         shares = (tg_shares * load);
3948         if (tg_weight)
3949                 shares /= tg_weight;
3950
3951         /*
3952          * MIN_SHARES has to be unscaled here to support per-CPU partitioning
3953          * of a group with small tg->shares value. It is a floor value which is
3954          * assigned as a minimum load.weight to the sched_entity representing
3955          * the group on a CPU.
3956          *
3957          * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
3958          * on an 8-core system with 8 tasks each runnable on one CPU shares has
3959          * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
3960          * case no task is runnable on a CPU MIN_SHARES=2 should be returned
3961          * instead of 0.
3962          */
3963         return clamp_t(long, shares, MIN_SHARES, tg_shares);
3964 }
3965 #endif /* CONFIG_SMP */
3966
3967 /*
3968  * Recomputes the group entity based on the current state of its group
3969  * runqueue.
3970  */
3971 static void update_cfs_group(struct sched_entity *se)
3972 {
3973         struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3974         long shares;
3975
3976         if (!gcfs_rq)
3977                 return;
3978
3979         if (throttled_hierarchy(gcfs_rq))
3980                 return;
3981
3982 #ifndef CONFIG_SMP
3983         shares = READ_ONCE(gcfs_rq->tg->shares);
3984 #else
3985         shares = calc_group_shares(gcfs_rq);
3986 #endif
3987         if (unlikely(se->load.weight != shares))
3988                 reweight_entity(cfs_rq_of(se), se, shares);
3989 }
3990
3991 #else /* CONFIG_FAIR_GROUP_SCHED */
3992 static inline void update_cfs_group(struct sched_entity *se)
3993 {
3994 }
3995 #endif /* CONFIG_FAIR_GROUP_SCHED */
3996
3997 static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
3998 {
3999         struct rq *rq = rq_of(cfs_rq);
4000
4001         if (&rq->cfs == cfs_rq) {
4002                 /*
4003                  * There are a few boundary cases this might miss but it should
4004                  * get called often enough that that should (hopefully) not be
4005                  * a real problem.
4006                  *
4007                  * It will not get called when we go idle, because the idle
4008                  * thread is a different class (!fair), nor will the utilization
4009                  * number include things like RT tasks.
4010                  *
4011                  * As is, the util number is not freq-invariant (we'd have to
4012                  * implement arch_scale_freq_capacity() for that).
4013                  *
4014                  * See cpu_util_cfs().
4015                  */
4016                 cpufreq_update_util(rq, flags);
4017         }
4018 }
4019
4020 #ifdef CONFIG_SMP
4021 static inline bool load_avg_is_decayed(struct sched_avg *sa)
4022 {
4023         if (sa->load_sum)
4024                 return false;
4025
4026         if (sa->util_sum)
4027                 return false;
4028
4029         if (sa->runnable_sum)
4030                 return false;
4031
4032         /*
4033          * _avg must be null when _sum are null because _avg = _sum / divider
4034          * Make sure that rounding and/or propagation of PELT values never
4035          * break this.
4036          */
4037         SCHED_WARN_ON(sa->load_avg ||
4038                       sa->util_avg ||
4039                       sa->runnable_avg);
4040
4041         return true;
4042 }
4043
4044 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
4045 {
4046         return u64_u32_load_copy(cfs_rq->avg.last_update_time,
4047                                  cfs_rq->last_update_time_copy);
4048 }
4049 #ifdef CONFIG_FAIR_GROUP_SCHED
4050 /*
4051  * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
4052  * immediately before a parent cfs_rq, and cfs_rqs are removed from the list
4053  * bottom-up, we only have to test whether the cfs_rq before us on the list
4054  * is our child.
4055  * If cfs_rq is not on the list, test whether a child needs its to be added to
4056  * connect a branch to the tree  * (see list_add_leaf_cfs_rq() for details).
4057  */
4058 static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
4059 {
4060         struct cfs_rq *prev_cfs_rq;
4061         struct list_head *prev;
4062
4063         if (cfs_rq->on_list) {
4064                 prev = cfs_rq->leaf_cfs_rq_list.prev;
4065         } else {
4066                 struct rq *rq = rq_of(cfs_rq);
4067
4068                 prev = rq->tmp_alone_branch;
4069         }
4070
4071         prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
4072
4073         return (prev_cfs_rq->tg->parent == cfs_rq->tg);
4074 }
4075
4076 static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
4077 {
4078         if (cfs_rq->load.weight)
4079                 return false;
4080
4081         if (!load_avg_is_decayed(&cfs_rq->avg))
4082                 return false;
4083
4084         if (child_cfs_rq_on_list(cfs_rq))
4085                 return false;
4086
4087         return true;
4088 }
4089
4090 /**
4091  * update_tg_load_avg - update the tg's load avg
4092  * @cfs_rq: the cfs_rq whose avg changed
4093  *
4094  * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
4095  * However, because tg->load_avg is a global value there are performance
4096  * considerations.
4097  *
4098  * In order to avoid having to look at the other cfs_rq's, we use a
4099  * differential update where we store the last value we propagated. This in
4100  * turn allows skipping updates if the differential is 'small'.
4101  *
4102  * Updating tg's load_avg is necessary before update_cfs_share().
4103  */
4104 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
4105 {
4106         long delta;
4107         u64 now;
4108
4109         /*
4110          * No need to update load_avg for root_task_group as it is not used.
4111          */
4112         if (cfs_rq->tg == &root_task_group)
4113                 return;
4114
4115         /* rq has been offline and doesn't contribute to the share anymore: */
4116         if (!cpu_active(cpu_of(rq_of(cfs_rq))))
4117                 return;
4118
4119         /*
4120          * For migration heavy workloads, access to tg->load_avg can be
4121          * unbound. Limit the update rate to at most once per ms.
4122          */
4123         now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
4124         if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC)
4125                 return;
4126
4127         delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
4128         if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
4129                 atomic_long_add(delta, &cfs_rq->tg->load_avg);
4130                 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
4131                 cfs_rq->last_update_tg_load_avg = now;
4132         }
4133 }
4134
4135 static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
4136 {
4137         long delta;
4138         u64 now;
4139
4140         /*
4141          * No need to update load_avg for root_task_group, as it is not used.
4142          */
4143         if (cfs_rq->tg == &root_task_group)
4144                 return;
4145
4146         now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
4147         delta = 0 - cfs_rq->tg_load_avg_contrib;
4148         atomic_long_add(delta, &cfs_rq->tg->load_avg);
4149         cfs_rq->tg_load_avg_contrib = 0;
4150         cfs_rq->last_update_tg_load_avg = now;
4151 }
4152
4153 /* CPU offline callback: */
4154 static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
4155 {
4156         struct task_group *tg;
4157
4158         lockdep_assert_rq_held(rq);
4159
4160         /*
4161          * The rq clock has already been updated in
4162          * set_rq_offline(), so we should skip updating
4163          * the rq clock again in unthrottle_cfs_rq().
4164          */
4165         rq_clock_start_loop_update(rq);
4166
4167         rcu_read_lock();
4168         list_for_each_entry_rcu(tg, &task_groups, list) {
4169                 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4170
4171                 clear_tg_load_avg(cfs_rq);
4172         }
4173         rcu_read_unlock();
4174
4175         rq_clock_stop_loop_update(rq);
4176 }
4177
4178 /*
4179  * Called within set_task_rq() right before setting a task's CPU. The
4180  * caller only guarantees p->pi_lock is held; no other assumptions,
4181  * including the state of rq->lock, should be made.
4182  */
4183 void set_task_rq_fair(struct sched_entity *se,
4184                       struct cfs_rq *prev, struct cfs_rq *next)
4185 {
4186         u64 p_last_update_time;
4187         u64 n_last_update_time;
4188
4189         if (!sched_feat(ATTACH_AGE_LOAD))
4190                 return;
4191
4192         /*
4193          * We are supposed to update the task to "current" time, then its up to
4194          * date and ready to go to new CPU/cfs_rq. But we have difficulty in
4195          * getting what current time is, so simply throw away the out-of-date
4196          * time. This will result in the wakee task is less decayed, but giving
4197          * the wakee more load sounds not bad.
4198          */
4199         if (!(se->avg.last_update_time && prev))
4200                 return;
4201
4202         p_last_update_time = cfs_rq_last_update_time(prev);
4203         n_last_update_time = cfs_rq_last_update_time(next);
4204
4205         __update_load_avg_blocked_se(p_last_update_time, se);
4206         se->avg.last_update_time = n_last_update_time;
4207 }
4208
4209 /*
4210  * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
4211  * propagate its contribution. The key to this propagation is the invariant
4212  * that for each group:
4213  *
4214  *   ge->avg == grq->avg                                                (1)
4215  *
4216  * _IFF_ we look at the pure running and runnable sums. Because they
4217  * represent the very same entity, just at different points in the hierarchy.
4218  *
4219  * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial
4220  * and simply copies the running/runnable sum over (but still wrong, because
4221  * the group entity and group rq do not have their PELT windows aligned).
4222  *
4223  * However, update_tg_cfs_load() is more complex. So we have:
4224  *
4225  *   ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg          (2)
4226  *
4227  * And since, like util, the runnable part should be directly transferable,
4228  * the following would _appear_ to be the straight forward approach:
4229  *
4230  *   grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg       (3)
4231  *
4232  * And per (1) we have:
4233  *
4234  *   ge->avg.runnable_avg == grq->avg.runnable_avg
4235  *
4236  * Which gives:
4237  *
4238  *                      ge->load.weight * grq->avg.load_avg
4239  *   ge->avg.load_avg = -----------------------------------             (4)
4240  *                               grq->load.weight
4241  *
4242  * Except that is wrong!
4243  *
4244  * Because while for entities historical weight is not important and we
4245  * really only care about our future and therefore can consider a pure
4246  * runnable sum, runqueues can NOT do this.
4247  *
4248  * We specifically want runqueues to have a load_avg that includes
4249  * historical weights. Those represent the blocked load, the load we expect
4250  * to (shortly) return to us. This only works by keeping the weights as
4251  * integral part of the sum. We therefore cannot decompose as per (3).
4252  *
4253  * Another reason this doesn't work is that runnable isn't a 0-sum entity.
4254  * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
4255  * rq itself is runnable anywhere between 2/3 and 1 depending on how the
4256  * runnable section of these tasks overlap (or not). If they were to perfectly
4257  * align the rq as a whole would be runnable 2/3 of the time. If however we
4258  * always have at least 1 runnable task, the rq as a whole is always runnable.
4259  *
4260  * So we'll have to approximate.. :/
4261  *
4262  * Given the constraint:
4263  *
4264  *   ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
4265  *
4266  * We can construct a rule that adds runnable to a rq by assuming minimal
4267  * overlap.
4268  *
4269  * On removal, we'll assume each task is equally runnable; which yields:
4270  *
4271  *   grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
4272  *
4273  * XXX: only do this for the part of runnable > running ?
4274  *
4275  */
4276 static inline void
4277 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
4278 {
4279         long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
4280         u32 new_sum, divider;
4281
4282         /* Nothing to update */
4283         if (!delta_avg)
4284                 return;
4285
4286         /*
4287          * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4288          * See ___update_load_avg() for details.
4289          */
4290         divider = get_pelt_divider(&cfs_rq->avg);
4291
4292
4293         /* Set new sched_entity's utilization */
4294         se->avg.util_avg = gcfs_rq->avg.util_avg;
4295         new_sum = se->avg.util_avg * divider;
4296         delta_sum = (long)new_sum - (long)se->avg.util_sum;
4297         se->avg.util_sum = new_sum;
4298
4299         /* Update parent cfs_rq utilization */
4300         add_positive(&cfs_rq->avg.util_avg, delta_avg);
4301         add_positive(&cfs_rq->avg.util_sum, delta_sum);
4302
4303         /* See update_cfs_rq_load_avg() */
4304         cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
4305                                           cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
4306 }
4307
4308 static inline void
4309 update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
4310 {
4311         long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
4312         u32 new_sum, divider;
4313
4314         /* Nothing to update */
4315         if (!delta_avg)
4316                 return;
4317
4318         /*
4319          * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4320          * See ___update_load_avg() for details.
4321          */
4322         divider = get_pelt_divider(&cfs_rq->avg);
4323
4324         /* Set new sched_entity's runnable */
4325         se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
4326         new_sum = se->avg.runnable_avg * divider;
4327         delta_sum = (long)new_sum - (long)se->avg.runnable_sum;
4328         se->avg.runnable_sum = new_sum;
4329
4330         /* Update parent cfs_rq runnable */
4331         add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
4332         add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
4333         /* See update_cfs_rq_load_avg() */
4334         cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
4335                                               cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
4336 }
4337
4338 static inline void
4339 update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
4340 {
4341         long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
4342         unsigned long load_avg;
4343         u64 load_sum = 0;
4344         s64 delta_sum;
4345         u32 divider;
4346
4347         if (!runnable_sum)
4348                 return;
4349
4350         gcfs_rq->prop_runnable_sum = 0;
4351
4352         /*
4353          * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4354          * See ___update_load_avg() for details.
4355          */
4356         divider = get_pelt_divider(&cfs_rq->avg);
4357
4358         if (runnable_sum >= 0) {
4359                 /*
4360                  * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
4361                  * the CPU is saturated running == runnable.
4362                  */
4363                 runnable_sum += se->avg.load_sum;
4364                 runnable_sum = min_t(long, runnable_sum, divider);
4365         } else {
4366                 /*
4367                  * Estimate the new unweighted runnable_sum of the gcfs_rq by
4368                  * assuming all tasks are equally runnable.
4369                  */
4370                 if (scale_load_down(gcfs_rq->load.weight)) {
4371                         load_sum = div_u64(gcfs_rq->avg.load_sum,
4372                                 scale_load_down(gcfs_rq->load.weight));
4373                 }
4374
4375                 /* But make sure to not inflate se's runnable */
4376                 runnable_sum = min(se->avg.load_sum, load_sum);
4377         }
4378
4379         /*
4380          * runnable_sum can't be lower than running_sum
4381          * Rescale running sum to be in the same range as runnable sum
4382          * running_sum is in [0 : LOAD_AVG_MAX <<  SCHED_CAPACITY_SHIFT]
4383          * runnable_sum is in [0 : LOAD_AVG_MAX]
4384          */
4385         running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
4386         runnable_sum = max(runnable_sum, running_sum);
4387
4388         load_sum = se_weight(se) * runnable_sum;
4389         load_avg = div_u64(load_sum, divider);
4390
4391         delta_avg = load_avg - se->avg.load_avg;
4392         if (!delta_avg)
4393                 return;
4394
4395         delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
4396
4397         se->avg.load_sum = runnable_sum;
4398         se->avg.load_avg = load_avg;
4399         add_positive(&cfs_rq->avg.load_avg, delta_avg);
4400         add_positive(&cfs_rq->avg.load_sum, delta_sum);
4401         /* See update_cfs_rq_load_avg() */
4402         cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
4403                                           cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
4404 }
4405
4406 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
4407 {
4408         cfs_rq->propagate = 1;
4409         cfs_rq->prop_runnable_sum += runnable_sum;
4410 }
4411
4412 /* Update task and its cfs_rq load average */
4413 static inline int propagate_entity_load_avg(struct sched_entity *se)
4414 {
4415         struct cfs_rq *cfs_rq, *gcfs_rq;
4416
4417         if (entity_is_task(se))
4418                 return 0;
4419
4420         gcfs_rq = group_cfs_rq(se);
4421         if (!gcfs_rq->propagate)
4422                 return 0;
4423
4424         gcfs_rq->propagate = 0;
4425
4426         cfs_rq = cfs_rq_of(se);
4427
4428         add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
4429
4430         update_tg_cfs_util(cfs_rq, se, gcfs_rq);
4431         update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
4432         update_tg_cfs_load(cfs_rq, se, gcfs_rq);
4433
4434         trace_pelt_cfs_tp(cfs_rq);
4435         trace_pelt_se_tp(se);
4436
4437         return 1;
4438 }
4439
4440 /*
4441  * Check if we need to update the load and the utilization of a blocked
4442  * group_entity:
4443  */
4444 static inline bool skip_blocked_update(struct sched_entity *se)
4445 {
4446         struct cfs_rq *gcfs_rq = group_cfs_rq(se);
4447
4448         /*
4449          * If sched_entity still have not zero load or utilization, we have to
4450          * decay it:
4451          */
4452         if (se->avg.load_avg || se->avg.util_avg)
4453                 return false;
4454
4455         /*
4456          * If there is a pending propagation, we have to update the load and
4457          * the utilization of the sched_entity:
4458          */
4459         if (gcfs_rq->propagate)
4460                 return false;
4461
4462         /*
4463          * Otherwise, the load and the utilization of the sched_entity is
4464          * already zero and there is no pending propagation, so it will be a
4465          * waste of time to try to decay it:
4466          */
4467         return true;
4468 }
4469
4470 #else /* CONFIG_FAIR_GROUP_SCHED */
4471
4472 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
4473
4474 static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {}
4475
4476 static inline int propagate_entity_load_avg(struct sched_entity *se)
4477 {
4478         return 0;
4479 }
4480
4481 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
4482
4483 #endif /* CONFIG_FAIR_GROUP_SCHED */
4484
4485 #ifdef CONFIG_NO_HZ_COMMON
4486 static inline void migrate_se_pelt_lag(struct sched_entity *se)
4487 {
4488         u64 throttled = 0, now, lut;
4489         struct cfs_rq *cfs_rq;
4490         struct rq *rq;
4491         bool is_idle;
4492
4493         if (load_avg_is_decayed(&se->avg))
4494                 return;
4495
4496         cfs_rq = cfs_rq_of(se);
4497         rq = rq_of(cfs_rq);
4498
4499         rcu_read_lock();
4500         is_idle = is_idle_task(rcu_dereference(rq->curr));
4501         rcu_read_unlock();
4502
4503         /*
4504          * The lag estimation comes with a cost we don't want to pay all the
4505          * time. Hence, limiting to the case where the source CPU is idle and
4506          * we know we are at the greatest risk to have an outdated clock.
4507          */
4508         if (!is_idle)
4509                 return;
4510
4511         /*
4512          * Estimated "now" is: last_update_time + cfs_idle_lag + rq_idle_lag, where:
4513          *
4514          *   last_update_time (the cfs_rq's last_update_time)
4515          *      = cfs_rq_clock_pelt()@cfs_rq_idle
4516          *      = rq_clock_pelt()@cfs_rq_idle
4517          *        - cfs->throttled_clock_pelt_time@cfs_rq_idle
4518          *
4519          *   cfs_idle_lag (delta between rq's update and cfs_rq's update)
4520          *      = rq_clock_pelt()@rq_idle - rq_clock_pelt()@cfs_rq_idle
4521          *
4522          *   rq_idle_lag (delta between now and rq's update)
4523          *      = sched_clock_cpu() - rq_clock()@rq_idle
4524          *
4525          * We can then write:
4526          *
4527          *    now = rq_clock_pelt()@rq_idle - cfs->throttled_clock_pelt_time +
4528          *          sched_clock_cpu() - rq_clock()@rq_idle
4529          * Where:
4530          *      rq_clock_pelt()@rq_idle is rq->clock_pelt_idle
4531          *      rq_clock()@rq_idle      is rq->clock_idle
4532          *      cfs->throttled_clock_pelt_time@cfs_rq_idle
4533          *                              is cfs_rq->throttled_pelt_idle
4534          */
4535
4536 #ifdef CONFIG_CFS_BANDWIDTH
4537         throttled = u64_u32_load(cfs_rq->throttled_pelt_idle);
4538         /* The clock has been stopped for throttling */
4539         if (throttled == U64_MAX)
4540                 return;
4541 #endif
4542         now = u64_u32_load(rq->clock_pelt_idle);
4543         /*
4544          * Paired with _update_idle_rq_clock_pelt(). It ensures at the worst case
4545          * is observed the old clock_pelt_idle value and the new clock_idle,
4546          * which lead to an underestimation. The opposite would lead to an
4547          * overestimation.
4548          */
4549         smp_rmb();
4550         lut = cfs_rq_last_update_time(cfs_rq);
4551
4552         now -= throttled;
4553         if (now < lut)
4554                 /*
4555                  * cfs_rq->avg.last_update_time is more recent than our
4556                  * estimation, let's use it.
4557                  */
4558                 now = lut;
4559         else
4560                 now += sched_clock_cpu(cpu_of(rq)) - u64_u32_load(rq->clock_idle);
4561
4562         __update_load_avg_blocked_se(now, se);
4563 }
4564 #else
4565 static void migrate_se_pelt_lag(struct sched_entity *se) {}
4566 #endif
4567
4568 /**
4569  * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
4570  * @now: current time, as per cfs_rq_clock_pelt()
4571  * @cfs_rq: cfs_rq to update
4572  *
4573  * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
4574  * avg. The immediate corollary is that all (fair) tasks must be attached.
4575  *
4576  * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
4577  *
4578  * Return: true if the load decayed or we removed load.
4579  *
4580  * Since both these conditions indicate a changed cfs_rq->avg.load we should
4581  * call update_tg_load_avg() when this function returns true.
4582  */
4583 static inline int
4584 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
4585 {
4586         unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
4587         struct sched_avg *sa = &cfs_rq->avg;
4588         int decayed = 0;
4589
4590         if (cfs_rq->removed.nr) {
4591                 unsigned long r;
4592                 u32 divider = get_pelt_divider(&cfs_rq->avg);
4593
4594                 raw_spin_lock(&cfs_rq->removed.lock);
4595                 swap(cfs_rq->removed.util_avg, removed_util);
4596                 swap(cfs_rq->removed.load_avg, removed_load);
4597                 swap(cfs_rq->removed.runnable_avg, removed_runnable);
4598                 cfs_rq->removed.nr = 0;
4599                 raw_spin_unlock(&cfs_rq->removed.lock);
4600
4601                 r = removed_load;
4602                 sub_positive(&sa->load_avg, r);
4603                 sub_positive(&sa->load_sum, r * divider);
4604                 /* See sa->util_sum below */
4605                 sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
4606
4607                 r = removed_util;
4608                 sub_positive(&sa->util_avg, r);
4609                 sub_positive(&sa->util_sum, r * divider);
4610                 /*
4611                  * Because of rounding, se->util_sum might ends up being +1 more than
4612                  * cfs->util_sum. Although this is not a problem by itself, detaching
4613                  * a lot of tasks with the rounding problem between 2 updates of
4614                  * util_avg (~1ms) can make cfs->util_sum becoming null whereas
4615                  * cfs_util_avg is not.
4616                  * Check that util_sum is still above its lower bound for the new
4617                  * util_avg. Given that period_contrib might have moved since the last
4618                  * sync, we are only sure that util_sum must be above or equal to
4619                  *    util_avg * minimum possible divider
4620                  */
4621                 sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
4622
4623                 r = removed_runnable;
4624                 sub_positive(&sa->runnable_avg, r);
4625                 sub_positive(&sa->runnable_sum, r * divider);
4626                 /* See sa->util_sum above */
4627                 sa->runnable_sum = max_t(u32, sa->runnable_sum,
4628                                               sa->runnable_avg * PELT_MIN_DIVIDER);
4629
4630                 /*
4631                  * removed_runnable is the unweighted version of removed_load so we
4632                  * can use it to estimate removed_load_sum.
4633                  */
4634                 add_tg_cfs_propagate(cfs_rq,
4635                         -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
4636
4637                 decayed = 1;
4638         }
4639
4640         decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
4641         u64_u32_store_copy(sa->last_update_time,
4642                            cfs_rq->last_update_time_copy,
4643                            sa->last_update_time);
4644         return decayed;
4645 }
4646
4647 /**
4648  * attach_entity_load_avg - attach this entity to its cfs_rq load avg
4649  * @cfs_rq: cfs_rq to attach to
4650  * @se: sched_entity to attach
4651  *
4652  * Must call update_cfs_rq_load_avg() before this, since we rely on
4653  * cfs_rq->avg.last_update_time being current.
4654  */
4655 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4656 {
4657         /*
4658          * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4659          * See ___update_load_avg() for details.
4660          */
4661         u32 divider = get_pelt_divider(&cfs_rq->avg);
4662
4663         /*
4664          * When we attach the @se to the @cfs_rq, we must align the decay
4665          * window because without that, really weird and wonderful things can
4666          * happen.
4667          *
4668          * XXX illustrate
4669          */
4670         se->avg.last_update_time = cfs_rq->avg.last_update_time;
4671         se->avg.period_contrib = cfs_rq->avg.period_contrib;
4672
4673         /*
4674          * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
4675          * period_contrib. This isn't strictly correct, but since we're
4676          * entirely outside of the PELT hierarchy, nobody cares if we truncate
4677          * _sum a little.
4678          */
4679         se->avg.util_sum = se->avg.util_avg * divider;
4680
4681         se->avg.runnable_sum = se->avg.runnable_avg * divider;
4682
4683         se->avg.load_sum = se->avg.load_avg * divider;
4684         if (se_weight(se) < se->avg.load_sum)
4685                 se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
4686         else
4687                 se->avg.load_sum = 1;
4688
4689         enqueue_load_avg(cfs_rq, se);
4690         cfs_rq->avg.util_avg += se->avg.util_avg;
4691         cfs_rq->avg.util_sum += se->avg.util_sum;
4692         cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
4693         cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
4694
4695         add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
4696
4697         cfs_rq_util_change(cfs_rq, 0);
4698
4699         trace_pelt_cfs_tp(cfs_rq);
4700 }
4701
4702 /**
4703  * detach_entity_load_avg - detach this entity from its cfs_rq load avg
4704  * @cfs_rq: cfs_rq to detach from
4705  * @se: sched_entity to detach
4706  *
4707  * Must call update_cfs_rq_load_avg() before this, since we rely on
4708  * cfs_rq->avg.last_update_time being current.
4709  */
4710 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4711 {
4712         dequeue_load_avg(cfs_rq, se);
4713         sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
4714         sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
4715         /* See update_cfs_rq_load_avg() */
4716         cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
4717                                           cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
4718
4719         sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
4720         sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
4721         /* See update_cfs_rq_load_avg() */
4722         cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
4723                                               cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
4724
4725         add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
4726
4727         cfs_rq_util_change(cfs_rq, 0);
4728
4729         trace_pelt_cfs_tp(cfs_rq);
4730 }
4731
4732 /*
4733  * Optional action to be done while updating the load average
4734  */
4735 #define UPDATE_TG       0x1
4736 #define SKIP_AGE_LOAD   0x2
4737 #define DO_ATTACH       0x4
4738 #define DO_DETACH       0x8
4739
4740 /* Update task and its cfs_rq load average */
4741 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4742 {
4743         u64 now = cfs_rq_clock_pelt(cfs_rq);
4744         int decayed;
4745
4746         /*
4747          * Track task load average for carrying it to new CPU after migrated, and
4748          * track group sched_entity load average for task_h_load calc in migration
4749          */
4750         if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
4751                 __update_load_avg_se(now, cfs_rq, se);
4752
4753         decayed  = update_cfs_rq_load_avg(now, cfs_rq);
4754         decayed |= propagate_entity_load_avg(se);
4755
4756         if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
4757
4758                 /*
4759                  * DO_ATTACH means we're here from enqueue_entity().
4760                  * !last_update_time means we've passed through
4761                  * migrate_task_rq_fair() indicating we migrated.
4762                  *
4763                  * IOW we're enqueueing a task on a new CPU.
4764                  */
4765                 attach_entity_load_avg(cfs_rq, se);
4766                 update_tg_load_avg(cfs_rq);
4767
4768         } else if (flags & DO_DETACH) {
4769                 /*
4770                  * DO_DETACH means we're here from dequeue_entity()
4771                  * and we are migrating task out of the CPU.
4772                  */
4773                 detach_entity_load_avg(cfs_rq, se);
4774                 update_tg_load_avg(cfs_rq);
4775         } else if (decayed) {
4776                 cfs_rq_util_change(cfs_rq, 0);
4777
4778                 if (flags & UPDATE_TG)
4779                         update_tg_load_avg(cfs_rq);
4780         }
4781 }
4782
4783 /*
4784  * Synchronize entity load avg of dequeued entity without locking
4785  * the previous rq.
4786  */
4787 static void sync_entity_load_avg(struct sched_entity *se)
4788 {
4789         struct cfs_rq *cfs_rq = cfs_rq_of(se);
4790         u64 last_update_time;
4791
4792         last_update_time = cfs_rq_last_update_time(cfs_rq);
4793         __update_load_avg_blocked_se(last_update_time, se);
4794 }
4795
4796 /*
4797  * Task first catches up with cfs_rq, and then subtract
4798  * itself from the cfs_rq (task must be off the queue now).
4799  */
4800 static void remove_entity_load_avg(struct sched_entity *se)
4801 {
4802         struct cfs_rq *cfs_rq = cfs_rq_of(se);
4803         unsigned long flags;
4804
4805         /*
4806          * tasks cannot exit without having gone through wake_up_new_task() ->
4807          * enqueue_task_fair() which will have added things to the cfs_rq,
4808          * so we can remove unconditionally.
4809          */
4810
4811         sync_entity_load_avg(se);
4812
4813         raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
4814         ++cfs_rq->removed.nr;
4815         cfs_rq->removed.util_avg        += se->avg.util_avg;
4816         cfs_rq->removed.load_avg        += se->avg.load_avg;
4817         cfs_rq->removed.runnable_avg    += se->avg.runnable_avg;
4818         raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
4819 }
4820
4821 static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
4822 {
4823         return cfs_rq->avg.runnable_avg;
4824 }
4825
4826 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
4827 {
4828         return cfs_rq->avg.load_avg;
4829 }
4830
4831 static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
4832
4833 static inline unsigned long task_util(struct task_struct *p)
4834 {
4835         return READ_ONCE(p->se.avg.util_avg);
4836 }
4837
4838 static inline unsigned long task_runnable(struct task_struct *p)
4839 {
4840         return READ_ONCE(p->se.avg.runnable_avg);
4841 }
4842
4843 static inline unsigned long _task_util_est(struct task_struct *p)
4844 {
4845         return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED;
4846 }
4847
4848 static inline unsigned long task_util_est(struct task_struct *p)
4849 {
4850         return max(task_util(p), _task_util_est(p));
4851 }
4852
4853 static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
4854                                     struct task_struct *p)
4855 {
4856         unsigned int enqueued;
4857
4858         if (!sched_feat(UTIL_EST))
4859                 return;
4860
4861         /* Update root cfs_rq's estimated utilization */
4862         enqueued  = cfs_rq->avg.util_est;
4863         enqueued += _task_util_est(p);
4864         WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
4865
4866         trace_sched_util_est_cfs_tp(cfs_rq);
4867 }
4868
4869 static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
4870                                     struct task_struct *p)
4871 {
4872         unsigned int enqueued;
4873
4874         if (!sched_feat(UTIL_EST))
4875                 return;
4876
4877         /* Update root cfs_rq's estimated utilization */
4878         enqueued  = cfs_rq->avg.util_est;
4879         enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
4880         WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
4881
4882         trace_sched_util_est_cfs_tp(cfs_rq);
4883 }
4884
4885 #define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
4886
4887 static inline void util_est_update(struct cfs_rq *cfs_rq,
4888                                    struct task_struct *p,
4889                                    bool task_sleep)
4890 {
4891         unsigned int ewma, dequeued, last_ewma_diff;
4892
4893         if (!sched_feat(UTIL_EST))
4894                 return;
4895
4896         /*
4897          * Skip update of task's estimated utilization when the task has not
4898          * yet completed an activation, e.g. being migrated.
4899          */
4900         if (!task_sleep)
4901                 return;
4902
4903         /* Get current estimate of utilization */
4904         ewma = READ_ONCE(p->se.avg.util_est);
4905
4906         /*
4907          * If the PELT values haven't changed since enqueue time,
4908          * skip the util_est update.
4909          */
4910         if (ewma & UTIL_AVG_UNCHANGED)
4911                 return;
4912
4913         /* Get utilization at dequeue */
4914         dequeued = task_util(p);
4915
4916         /*
4917          * Reset EWMA on utilization increases, the moving average is used only
4918          * to smooth utilization decreases.
4919          */
4920         if (ewma <= dequeued) {
4921                 ewma = dequeued;
4922                 goto done;
4923         }
4924
4925         /*
4926          * Skip update of task's estimated utilization when its members are
4927          * already ~1% close to its last activation value.
4928          */
4929         last_ewma_diff = ewma - dequeued;
4930         if (last_ewma_diff < UTIL_EST_MARGIN)
4931                 goto done;
4932
4933         /*
4934          * To avoid overestimation of actual task utilization, skip updates if
4935          * we cannot grant there is idle time in this CPU.
4936          */
4937         if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
4938                 return;
4939
4940         /*
4941          * To avoid underestimate of task utilization, skip updates of EWMA if
4942          * we cannot grant that thread got all CPU time it wanted.
4943          */
4944         if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p))
4945                 goto done;
4946
4947
4948         /*
4949          * Update Task's estimated utilization
4950          *
4951          * When *p completes an activation we can consolidate another sample
4952          * of the task size. This is done by using this value to update the
4953          * Exponential Weighted Moving Average (EWMA):
4954          *
4955          *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
4956          *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
4957          *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
4958          *          = w * (      -last_ewma_diff           ) +     ewma(t-1)
4959          *          = w * (-last_ewma_diff +  ewma(t-1) / w)
4960          *
4961          * Where 'w' is the weight of new samples, which is configured to be
4962          * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
4963          */
4964         ewma <<= UTIL_EST_WEIGHT_SHIFT;
4965         ewma  -= last_ewma_diff;
4966         ewma >>= UTIL_EST_WEIGHT_SHIFT;
4967 done:
4968         ewma |= UTIL_AVG_UNCHANGED;
4969         WRITE_ONCE(p->se.avg.util_est, ewma);
4970
4971         trace_sched_util_est_se_tp(&p->se);
4972 }
4973
4974 static inline int util_fits_cpu(unsigned long util,
4975                                 unsigned long uclamp_min,
4976                                 unsigned long uclamp_max,
4977                                 int cpu)
4978 {
4979         unsigned long capacity_orig, capacity_orig_thermal;
4980         unsigned long capacity = capacity_of(cpu);
4981         bool fits, uclamp_max_fits;
4982
4983         /*
4984          * Check if the real util fits without any uclamp boost/cap applied.
4985          */
4986         fits = fits_capacity(util, capacity);
4987
4988         if (!uclamp_is_used())
4989                 return fits;
4990
4991         /*
4992          * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and
4993          * uclamp_max. We only care about capacity pressure (by using
4994          * capacity_of()) for comparing against the real util.
4995          *
4996          * If a task is boosted to 1024 for example, we don't want a tiny
4997          * pressure to skew the check whether it fits a CPU or not.
4998          *
4999          * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
5000          * should fit a little cpu even if there's some pressure.
5001          *
5002          * Only exception is for thermal pressure since it has a direct impact
5003          * on available OPP of the system.
5004          *
5005          * We honour it for uclamp_min only as a drop in performance level
5006          * could result in not getting the requested minimum performance level.
5007          *
5008          * For uclamp_max, we can tolerate a drop in performance level as the
5009          * goal is to cap the task. So it's okay if it's getting less.
5010          */
5011         capacity_orig = arch_scale_cpu_capacity(cpu);
5012         capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
5013
5014         /*
5015          * We want to force a task to fit a cpu as implied by uclamp_max.
5016          * But we do have some corner cases to cater for..
5017          *
5018          *
5019          *                                 C=z
5020          *   |                             ___
5021          *   |                  C=y       |   |
5022          *   |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _  uclamp_max
5023          *   |      C=x        |   |      |   |
5024          *   |      ___        |   |      |   |
5025          *   |     |   |       |   |      |   |    (util somewhere in this region)
5026          *   |     |   |       |   |      |   |
5027          *   |     |   |       |   |      |   |
5028          *   +----------------------------------------
5029          *         cpu0        cpu1       cpu2
5030          *
5031          *   In the above example if a task is capped to a specific performance
5032          *   point, y, then when:
5033          *
5034          *   * util = 80% of x then it does not fit on cpu0 and should migrate
5035          *     to cpu1
5036          *   * util = 80% of y then it is forced to fit on cpu1 to honour
5037          *     uclamp_max request.
5038          *
5039          *   which is what we're enforcing here. A task always fits if
5040          *   uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
5041          *   the normal upmigration rules should withhold still.
5042          *
5043          *   Only exception is when we are on max capacity, then we need to be
5044          *   careful not to block overutilized state. This is so because:
5045          *
5046          *     1. There's no concept of capping at max_capacity! We can't go
5047          *        beyond this performance level anyway.
5048          *     2. The system is being saturated when we're operating near
5049          *        max capacity, it doesn't make sense to block overutilized.
5050          */
5051         uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
5052         uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
5053         fits = fits || uclamp_max_fits;
5054
5055         /*
5056          *
5057          *                                 C=z
5058          *   |                             ___       (region a, capped, util >= uclamp_max)
5059          *   |                  C=y       |   |
5060          *   |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
5061          *   |      C=x        |   |      |   |
5062          *   |      ___        |   |      |   |      (region b, uclamp_min <= util <= uclamp_max)
5063          *   |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
5064          *   |     |   |       |   |      |   |
5065          *   |     |   |       |   |      |   |      (region c, boosted, util < uclamp_min)
5066          *   +----------------------------------------
5067          *         cpu0        cpu1       cpu2
5068          *
5069          * a) If util > uclamp_max, then we're capped, we don't care about
5070          *    actual fitness value here. We only care if uclamp_max fits
5071          *    capacity without taking margin/pressure into account.
5072          *    See comment above.
5073          *
5074          * b) If uclamp_min <= util <= uclamp_max, then the normal
5075          *    fits_capacity() rules apply. Except we need to ensure that we
5076          *    enforce we remain within uclamp_max, see comment above.
5077          *
5078          * c) If util < uclamp_min, then we are boosted. Same as (b) but we
5079          *    need to take into account the boosted value fits the CPU without
5080          *    taking margin/pressure into account.
5081          *
5082          * Cases (a) and (b) are handled in the 'fits' variable already. We
5083          * just need to consider an extra check for case (c) after ensuring we
5084          * handle the case uclamp_min > uclamp_max.
5085          */
5086         uclamp_min = min(uclamp_min, uclamp_max);
5087         if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
5088                 return -1;
5089
5090         return fits;
5091 }
5092
5093 static inline int task_fits_cpu(struct task_struct *p, int cpu)
5094 {
5095         unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
5096         unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
5097         unsigned long util = task_util_est(p);
5098         /*
5099          * Return true only if the cpu fully fits the task requirements, which
5100          * include the utilization but also the performance hints.
5101          */
5102         return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0);
5103 }
5104
5105 static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
5106 {
5107         if (!sched_asym_cpucap_active())
5108                 return;
5109
5110         if (!p || p->nr_cpus_allowed == 1) {
5111                 rq->misfit_task_load = 0;
5112                 return;
5113         }
5114
5115         if (task_fits_cpu(p, cpu_of(rq))) {
5116                 rq->misfit_task_load = 0;
5117                 return;
5118         }
5119
5120         /*
5121          * Make sure that misfit_task_load will not be null even if
5122          * task_h_load() returns 0.
5123          */
5124         rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
5125 }
5126
5127 #else /* CONFIG_SMP */
5128
5129 static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
5130 {
5131         return !cfs_rq->nr_running;
5132 }
5133
5134 #define UPDATE_TG       0x0
5135 #define SKIP_AGE_LOAD   0x0
5136 #define DO_ATTACH       0x0
5137 #define DO_DETACH       0x0
5138
5139 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
5140 {
5141         cfs_rq_util_change(cfs_rq, 0);
5142 }
5143
5144 static inline void remove_entity_load_avg(struct sched_entity *se) {}
5145
5146 static inline void
5147 attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
5148 static inline void
5149 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
5150
5151 static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
5152 {
5153         return 0;
5154 }
5155
5156 static inline void
5157 util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
5158
5159 static inline void
5160 util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
5161
5162 static inline void
5163 util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
5164                 bool task_sleep) {}
5165 static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
5166
5167 #endif /* CONFIG_SMP */
5168
5169 static void
5170 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
5171 {
5172         u64 vslice, vruntime = avg_vruntime(cfs_rq);
5173         s64 lag = 0;
5174
5175         se->slice = sysctl_sched_base_slice;
5176         vslice = calc_delta_fair(se->slice, se);
5177
5178         /*
5179          * Due to how V is constructed as the weighted average of entities,
5180          * adding tasks with positive lag, or removing tasks with negative lag
5181          * will move 'time' backwards, this can screw around with the lag of
5182          * other tasks.
5183          *
5184          * EEVDF: placement strategy #1 / #2
5185          */
5186         if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) {
5187                 struct sched_entity *curr = cfs_rq->curr;
5188                 unsigned long load;
5189
5190                 lag = se->vlag;
5191
5192                 /*
5193                  * If we want to place a task and preserve lag, we have to
5194                  * consider the effect of the new entity on the weighted
5195                  * average and compensate for this, otherwise lag can quickly
5196                  * evaporate.
5197                  *
5198                  * Lag is defined as:
5199                  *
5200                  *   lag_i = S - s_i = w_i * (V - v_i)
5201                  *
5202                  * To avoid the 'w_i' term all over the place, we only track
5203                  * the virtual lag:
5204                  *
5205                  *   vl_i = V - v_i <=> v_i = V - vl_i
5206                  *
5207                  * And we take V to be the weighted average of all v:
5208                  *
5209                  *   V = (\Sum w_j*v_j) / W
5210                  *
5211                  * Where W is: \Sum w_j
5212                  *
5213                  * Then, the weighted average after adding an entity with lag
5214                  * vl_i is given by:
5215                  *
5216                  *   V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
5217                  *      = (W*V + w_i*(V - vl_i)) / (W + w_i)
5218                  *      = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
5219                  *      = (V*(W + w_i) - w_i*l) / (W + w_i)
5220                  *      = V - w_i*vl_i / (W + w_i)
5221                  *
5222                  * And the actual lag after adding an entity with vl_i is:
5223                  *
5224                  *   vl'_i = V' - v_i
5225                  *         = V - w_i*vl_i / (W + w_i) - (V - vl_i)
5226                  *         = vl_i - w_i*vl_i / (W + w_i)
5227                  *
5228                  * Which is strictly less than vl_i. So in order to preserve lag
5229                  * we should inflate the lag before placement such that the
5230                  * effective lag after placement comes out right.
5231                  *
5232                  * As such, invert the above relation for vl'_i to get the vl_i
5233                  * we need to use such that the lag after placement is the lag
5234                  * we computed before dequeue.
5235                  *
5236                  *   vl'_i = vl_i - w_i*vl_i / (W + w_i)
5237                  *         = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
5238                  *
5239                  *   (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
5240                  *                   = W*vl_i
5241                  *
5242                  *   vl_i = (W + w_i)*vl'_i / W
5243                  */
5244                 load = cfs_rq->avg_load;
5245                 if (curr && curr->on_rq)
5246                         load += scale_load_down(curr->load.weight);
5247
5248                 lag *= load + scale_load_down(se->load.weight);
5249                 if (WARN_ON_ONCE(!load))
5250                         load = 1;
5251                 lag = div_s64(lag, load);
5252         }
5253
5254         se->vruntime = vruntime - lag;
5255
5256         /*
5257          * When joining the competition; the exisiting tasks will be,
5258          * on average, halfway through their slice, as such start tasks
5259          * off with half a slice to ease into the competition.
5260          */
5261         if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
5262                 vslice /= 2;
5263
5264         /*
5265          * EEVDF: vd_i = ve_i + r_i/w_i
5266          */
5267         se->deadline = se->vruntime + vslice;
5268 }
5269
5270 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
5271 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
5272
5273 static inline bool cfs_bandwidth_used(void);
5274
5275 static void
5276 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
5277 {
5278         bool curr = cfs_rq->curr == se;
5279
5280         /*
5281          * If we're the current task, we must renormalise before calling
5282          * update_curr().
5283          */
5284         if (curr)
5285                 place_entity(cfs_rq, se, flags);
5286
5287         update_curr(cfs_rq);
5288
5289         /*
5290          * When enqueuing a sched_entity, we must:
5291          *   - Update loads to have both entity and cfs_rq synced with now.
5292          *   - For group_entity, update its runnable_weight to reflect the new
5293          *     h_nr_running of its group cfs_rq.
5294          *   - For group_entity, update its weight to reflect the new share of
5295          *     its group cfs_rq
5296          *   - Add its new weight to cfs_rq->load.weight
5297          */
5298         update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
5299         se_update_runnable(se);
5300         /*
5301          * XXX update_load_avg() above will have attached us to the pelt sum;
5302          * but update_cfs_group() here will re-adjust the weight and have to
5303          * undo/redo all that. Seems wasteful.
5304          */
5305         update_cfs_group(se);
5306
5307         /*
5308          * XXX now that the entity has been re-weighted, and it's lag adjusted,
5309          * we can place the entity.
5310          */
5311         if (!curr)
5312                 place_entity(cfs_rq, se, flags);
5313
5314         account_entity_enqueue(cfs_rq, se);
5315
5316         /* Entity has migrated, no longer consider this task hot */
5317         if (flags & ENQUEUE_MIGRATED)
5318                 se->exec_start = 0;
5319
5320         check_schedstat_required();
5321         update_stats_enqueue_fair(cfs_rq, se, flags);
5322         if (!curr)
5323                 __enqueue_entity(cfs_rq, se);
5324         se->on_rq = 1;
5325
5326         if (cfs_rq->nr_running == 1) {
5327                 check_enqueue_throttle(cfs_rq);
5328                 if (!throttled_hierarchy(cfs_rq)) {
5329                         list_add_leaf_cfs_rq(cfs_rq);
5330                 } else {
5331 #ifdef CONFIG_CFS_BANDWIDTH
5332                         struct rq *rq = rq_of(cfs_rq);
5333
5334                         if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
5335                                 cfs_rq->throttled_clock = rq_clock(rq);
5336                         if (!cfs_rq->throttled_clock_self)
5337                                 cfs_rq->throttled_clock_self = rq_clock(rq);
5338 #endif
5339                 }
5340         }
5341 }
5342
5343 static void __clear_buddies_next(struct sched_entity *se)
5344 {
5345         for_each_sched_entity(se) {
5346                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5347                 if (cfs_rq->next != se)
5348                         break;
5349
5350                 cfs_rq->next = NULL;
5351         }
5352 }
5353
5354 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
5355 {
5356         if (cfs_rq->next == se)
5357                 __clear_buddies_next(se);
5358 }
5359
5360 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
5361
5362 static void
5363 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
5364 {
5365         int action = UPDATE_TG;
5366
5367         if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
5368                 action |= DO_DETACH;
5369
5370         /*
5371          * Update run-time statistics of the 'current'.
5372          */
5373         update_curr(cfs_rq);
5374
5375         /*
5376          * When dequeuing a sched_entity, we must:
5377          *   - Update loads to have both entity and cfs_rq synced with now.
5378          *   - For group_entity, update its runnable_weight to reflect the new
5379          *     h_nr_running of its group cfs_rq.
5380          *   - Subtract its previous weight from cfs_rq->load.weight.
5381          *   - For group entity, update its weight to reflect the new share
5382          *     of its group cfs_rq.
5383          */
5384         update_load_avg(cfs_rq, se, action);
5385         se_update_runnable(se);
5386
5387         update_stats_dequeue_fair(cfs_rq, se, flags);
5388
5389         clear_buddies(cfs_rq, se);
5390
5391         update_entity_lag(cfs_rq, se);
5392         if (se != cfs_rq->curr)
5393                 __dequeue_entity(cfs_rq, se);
5394         se->on_rq = 0;
5395         account_entity_dequeue(cfs_rq, se);
5396
5397         /* return excess runtime on last dequeue */
5398         return_cfs_rq_runtime(cfs_rq);
5399
5400         update_cfs_group(se);
5401
5402         /*
5403          * Now advance min_vruntime if @se was the entity holding it back,
5404          * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
5405          * put back on, and if we advance min_vruntime, we'll be placed back
5406          * further than we started -- ie. we'll be penalized.
5407          */
5408         if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
5409                 update_min_vruntime(cfs_rq);
5410
5411         if (cfs_rq->nr_running == 0)
5412                 update_idle_cfs_rq_clock_pelt(cfs_rq);
5413 }
5414
5415 static void
5416 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
5417 {
5418         clear_buddies(cfs_rq, se);
5419
5420         /* 'current' is not kept within the tree. */
5421         if (se->on_rq) {
5422                 /*
5423                  * Any task has to be enqueued before it get to execute on
5424                  * a CPU. So account for the time it spent waiting on the
5425                  * runqueue.
5426                  */
5427                 update_stats_wait_end_fair(cfs_rq, se);
5428                 __dequeue_entity(cfs_rq, se);
5429                 update_load_avg(cfs_rq, se, UPDATE_TG);
5430                 /*
5431                  * HACK, stash a copy of deadline at the point of pick in vlag,
5432                  * which isn't used until dequeue.
5433                  */
5434                 se->vlag = se->deadline;
5435         }
5436
5437         update_stats_curr_start(cfs_rq, se);
5438         cfs_rq->curr = se;
5439
5440         /*
5441          * Track our maximum slice length, if the CPU's load is at
5442          * least twice that of our own weight (i.e. dont track it
5443          * when there are only lesser-weight tasks around):
5444          */
5445         if (schedstat_enabled() &&
5446             rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
5447                 struct sched_statistics *stats;
5448
5449                 stats = __schedstats_from_se(se);
5450                 __schedstat_set(stats->slice_max,
5451                                 max((u64)stats->slice_max,
5452                                     se->sum_exec_runtime - se->prev_sum_exec_runtime));
5453         }
5454
5455         se->prev_sum_exec_runtime = se->sum_exec_runtime;
5456 }
5457
5458 /*
5459  * Pick the next process, keeping these things in mind, in this order:
5460  * 1) keep things fair between processes/task groups
5461  * 2) pick the "next" process, since someone really wants that to run
5462  * 3) pick the "last" process, for cache locality
5463  * 4) do not run the "skip" process, if something else is available
5464  */
5465 static struct sched_entity *
5466 pick_next_entity(struct cfs_rq *cfs_rq)
5467 {
5468         /*
5469          * Enabling NEXT_BUDDY will affect latency but not fairness.
5470          */
5471         if (sched_feat(NEXT_BUDDY) &&
5472             cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
5473                 return cfs_rq->next;
5474
5475         return pick_eevdf(cfs_rq);
5476 }
5477
5478 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
5479
5480 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
5481 {
5482         /*
5483          * If still on the runqueue then deactivate_task()
5484          * was not called and update_curr() has to be done:
5485          */
5486         if (prev->on_rq)
5487                 update_curr(cfs_rq);
5488
5489         /* throttle cfs_rqs exceeding runtime */
5490         check_cfs_rq_runtime(cfs_rq);
5491
5492         if (prev->on_rq) {
5493                 update_stats_wait_start_fair(cfs_rq, prev);
5494                 /* Put 'current' back into the tree. */
5495                 __enqueue_entity(cfs_rq, prev);
5496                 /* in !on_rq case, update occurred at dequeue */
5497                 update_load_avg(cfs_rq, prev, 0);
5498         }
5499         cfs_rq->curr = NULL;
5500 }
5501
5502 static void
5503 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
5504 {
5505         /*
5506          * Update run-time statistics of the 'current'.
5507          */
5508         update_curr(cfs_rq);
5509
5510         /*
5511          * Ensure that runnable average is periodically updated.
5512          */
5513         update_load_avg(cfs_rq, curr, UPDATE_TG);
5514         update_cfs_group(curr);
5515
5516 #ifdef CONFIG_SCHED_HRTICK
5517         /*
5518          * queued ticks are scheduled to match the slice, so don't bother
5519          * validating it and just reschedule.
5520          */
5521         if (queued) {
5522                 resched_curr(rq_of(cfs_rq));
5523                 return;
5524         }
5525         /*
5526          * don't let the period tick interfere with the hrtick preemption
5527          */
5528         if (!sched_feat(DOUBLE_TICK) &&
5529                         hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
5530                 return;
5531 #endif
5532 }
5533
5534
5535 /**************************************************
5536  * CFS bandwidth control machinery
5537  */
5538
5539 #ifdef CONFIG_CFS_BANDWIDTH
5540
5541 #ifdef CONFIG_JUMP_LABEL
5542 static struct static_key __cfs_bandwidth_used;
5543
5544 static inline bool cfs_bandwidth_used(void)
5545 {
5546         return static_key_false(&__cfs_bandwidth_used);
5547 }
5548
5549 void cfs_bandwidth_usage_inc(void)
5550 {
5551         static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
5552 }
5553
5554 void cfs_bandwidth_usage_dec(void)
5555 {
5556         static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
5557 }
5558 #else /* CONFIG_JUMP_LABEL */
5559 static bool cfs_bandwidth_used(void)
5560 {
5561         return true;
5562 }
5563
5564 void cfs_bandwidth_usage_inc(void) {}
5565 void cfs_bandwidth_usage_dec(void) {}
5566 #endif /* CONFIG_JUMP_LABEL */
5567
5568 /*
5569  * default period for cfs group bandwidth.
5570  * default: 0.1s, units: nanoseconds
5571  */
5572 static inline u64 default_cfs_period(void)
5573 {
5574         return 100000000ULL;
5575 }
5576
5577 static inline u64 sched_cfs_bandwidth_slice(void)
5578 {
5579         return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
5580 }
5581
5582 /*
5583  * Replenish runtime according to assigned quota. We use sched_clock_cpu
5584  * directly instead of rq->clock to avoid adding additional synchronization
5585  * around rq->lock.
5586  *
5587  * requires cfs_b->lock
5588  */
5589 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
5590 {
5591         s64 runtime;
5592
5593         if (unlikely(cfs_b->quota == RUNTIME_INF))
5594                 return;
5595
5596         cfs_b->runtime += cfs_b->quota;
5597         runtime = cfs_b->runtime_snap - cfs_b->runtime;
5598         if (runtime > 0) {
5599                 cfs_b->burst_time += runtime;
5600                 cfs_b->nr_burst++;
5601         }
5602
5603         cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst);
5604         cfs_b->runtime_snap = cfs_b->runtime;
5605 }
5606
5607 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5608 {
5609         return &tg->cfs_bandwidth;
5610 }
5611
5612 /* returns 0 on failure to allocate runtime */
5613 static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
5614                                    struct cfs_rq *cfs_rq, u64 target_runtime)
5615 {
5616         u64 min_amount, amount = 0;
5617
5618         lockdep_assert_held(&cfs_b->lock);
5619
5620         /* note: this is a positive sum as runtime_remaining <= 0 */
5621         min_amount = target_runtime - cfs_rq->runtime_remaining;
5622
5623         if (cfs_b->quota == RUNTIME_INF)
5624                 amount = min_amount;
5625         else {
5626                 start_cfs_bandwidth(cfs_b);
5627
5628                 if (cfs_b->runtime > 0) {
5629                         amount = min(cfs_b->runtime, min_amount);
5630                         cfs_b->runtime -= amount;
5631                         cfs_b->idle = 0;
5632                 }
5633         }
5634
5635         cfs_rq->runtime_remaining += amount;
5636
5637         return cfs_rq->runtime_remaining > 0;
5638 }
5639
5640 /* returns 0 on failure to allocate runtime */
5641 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5642 {
5643         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5644         int ret;
5645
5646         raw_spin_lock(&cfs_b->lock);
5647         ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
5648         raw_spin_unlock(&cfs_b->lock);
5649
5650         return ret;
5651 }
5652
5653 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
5654 {
5655         /* dock delta_exec before expiring quota (as it could span periods) */
5656         cfs_rq->runtime_remaining -= delta_exec;
5657
5658         if (likely(cfs_rq->runtime_remaining > 0))
5659                 return;
5660
5661         if (cfs_rq->throttled)
5662                 return;
5663         /*
5664          * if we're unable to extend our runtime we resched so that the active
5665          * hierarchy can be throttled
5666          */
5667         if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
5668                 resched_curr(rq_of(cfs_rq));
5669 }
5670
5671 static __always_inline
5672 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
5673 {
5674         if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
5675                 return;
5676
5677         __account_cfs_rq_runtime(cfs_rq, delta_exec);
5678 }
5679
5680 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5681 {
5682         return cfs_bandwidth_used() && cfs_rq->throttled;
5683 }
5684
5685 /* check whether cfs_rq, or any parent, is throttled */
5686 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5687 {
5688         return cfs_bandwidth_used() && cfs_rq->throttle_count;
5689 }
5690
5691 /*
5692  * Ensure that neither of the group entities corresponding to src_cpu or
5693  * dest_cpu are members of a throttled hierarchy when performing group
5694  * load-balance operations.
5695  */
5696 static inline int throttled_lb_pair(struct task_group *tg,
5697                                     int src_cpu, int dest_cpu)
5698 {
5699         struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
5700
5701         src_cfs_rq = tg->cfs_rq[src_cpu];
5702         dest_cfs_rq = tg->cfs_rq[dest_cpu];
5703
5704         return throttled_hierarchy(src_cfs_rq) ||
5705                throttled_hierarchy(dest_cfs_rq);
5706 }
5707
5708 static int tg_unthrottle_up(struct task_group *tg, void *data)
5709 {
5710         struct rq *rq = data;
5711         struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5712
5713         cfs_rq->throttle_count--;
5714         if (!cfs_rq->throttle_count) {
5715                 cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
5716                                              cfs_rq->throttled_clock_pelt;
5717
5718                 /* Add cfs_rq with load or one or more already running entities to the list */
5719                 if (!cfs_rq_is_decayed(cfs_rq))
5720                         list_add_leaf_cfs_rq(cfs_rq);
5721
5722                 if (cfs_rq->throttled_clock_self) {
5723                         u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
5724
5725                         cfs_rq->throttled_clock_self = 0;
5726
5727                         if (SCHED_WARN_ON((s64)delta < 0))
5728                                 delta = 0;
5729
5730                         cfs_rq->throttled_clock_self_time += delta;
5731                 }
5732         }
5733
5734         return 0;
5735 }
5736
5737 static int tg_throttle_down(struct task_group *tg, void *data)
5738 {
5739         struct rq *rq = data;
5740         struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5741
5742         /* group is entering throttled state, stop time */
5743         if (!cfs_rq->throttle_count) {
5744                 cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
5745                 list_del_leaf_cfs_rq(cfs_rq);
5746
5747                 SCHED_WARN_ON(cfs_rq->throttled_clock_self);
5748                 if (cfs_rq->nr_running)
5749                         cfs_rq->throttled_clock_self = rq_clock(rq);
5750         }
5751         cfs_rq->throttle_count++;
5752
5753         return 0;
5754 }
5755
5756 static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
5757 {
5758         struct rq *rq = rq_of(cfs_rq);
5759         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5760         struct sched_entity *se;
5761         long task_delta, idle_task_delta, dequeue = 1;
5762
5763         raw_spin_lock(&cfs_b->lock);
5764         /* This will start the period timer if necessary */
5765         if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
5766                 /*
5767                  * We have raced with bandwidth becoming available, and if we
5768                  * actually throttled the timer might not unthrottle us for an
5769                  * entire period. We additionally needed to make sure that any
5770                  * subsequent check_cfs_rq_runtime calls agree not to throttle
5771                  * us, as we may commit to do cfs put_prev+pick_next, so we ask
5772                  * for 1ns of runtime rather than just check cfs_b.
5773                  */
5774                 dequeue = 0;
5775         } else {
5776                 list_add_tail_rcu(&cfs_rq->throttled_list,
5777                                   &cfs_b->throttled_cfs_rq);
5778         }
5779         raw_spin_unlock(&cfs_b->lock);
5780
5781         if (!dequeue)
5782                 return false;  /* Throttle no longer required. */
5783
5784         se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
5785
5786         /* freeze hierarchy runnable averages while throttled */
5787         rcu_read_lock();
5788         walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
5789         rcu_read_unlock();
5790
5791         task_delta = cfs_rq->h_nr_running;
5792         idle_task_delta = cfs_rq->idle_h_nr_running;
5793         for_each_sched_entity(se) {
5794                 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5795                 /* throttled entity or throttle-on-deactivate */
5796                 if (!se->on_rq)
5797                         goto done;
5798
5799                 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
5800
5801                 if (cfs_rq_is_idle(group_cfs_rq(se)))
5802                         idle_task_delta = cfs_rq->h_nr_running;
5803
5804                 qcfs_rq->h_nr_running -= task_delta;
5805                 qcfs_rq->idle_h_nr_running -= idle_task_delta;
5806
5807                 if (qcfs_rq->load.weight) {
5808                         /* Avoid re-evaluating load for this entity: */
5809                         se = parent_entity(se);
5810                         break;
5811                 }
5812         }
5813
5814         for_each_sched_entity(se) {
5815                 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5816                 /* throttled entity or throttle-on-deactivate */
5817                 if (!se->on_rq)
5818                         goto done;
5819
5820                 update_load_avg(qcfs_rq, se, 0);
5821                 se_update_runnable(se);
5822
5823                 if (cfs_rq_is_idle(group_cfs_rq(se)))
5824                         idle_task_delta = cfs_rq->h_nr_running;
5825
5826                 qcfs_rq->h_nr_running -= task_delta;
5827                 qcfs_rq->idle_h_nr_running -= idle_task_delta;
5828         }
5829
5830         /* At this point se is NULL and we are at root level*/
5831         sub_nr_running(rq, task_delta);
5832
5833 done:
5834         /*
5835          * Note: distribution will already see us throttled via the
5836          * throttled-list.  rq->lock protects completion.
5837          */
5838         cfs_rq->throttled = 1;
5839         SCHED_WARN_ON(cfs_rq->throttled_clock);
5840         if (cfs_rq->nr_running)
5841                 cfs_rq->throttled_clock = rq_clock(rq);
5842         return true;
5843 }
5844
5845 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
5846 {
5847         struct rq *rq = rq_of(cfs_rq);
5848         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5849         struct sched_entity *se;
5850         long task_delta, idle_task_delta;
5851
5852         se = cfs_rq->tg->se[cpu_of(rq)];
5853
5854         cfs_rq->throttled = 0;
5855
5856         update_rq_clock(rq);
5857
5858         raw_spin_lock(&cfs_b->lock);
5859         if (cfs_rq->throttled_clock) {
5860                 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
5861                 cfs_rq->throttled_clock = 0;
5862         }
5863         list_del_rcu(&cfs_rq->throttled_list);
5864         raw_spin_unlock(&cfs_b->lock);
5865
5866         /* update hierarchical throttle state */
5867         walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
5868
5869         if (!cfs_rq->load.weight) {
5870                 if (!cfs_rq->on_list)
5871                         return;
5872                 /*
5873                  * Nothing to run but something to decay (on_list)?
5874                  * Complete the branch.
5875                  */
5876                 for_each_sched_entity(se) {
5877                         if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
5878                                 break;
5879                 }
5880                 goto unthrottle_throttle;
5881         }
5882
5883         task_delta = cfs_rq->h_nr_running;
5884         idle_task_delta = cfs_rq->idle_h_nr_running;
5885         for_each_sched_entity(se) {
5886                 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5887
5888                 if (se->on_rq)
5889                         break;
5890                 enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
5891
5892                 if (cfs_rq_is_idle(group_cfs_rq(se)))
5893                         idle_task_delta = cfs_rq->h_nr_running;
5894
5895                 qcfs_rq->h_nr_running += task_delta;
5896                 qcfs_rq->idle_h_nr_running += idle_task_delta;
5897
5898                 /* end evaluation on encountering a throttled cfs_rq */
5899                 if (cfs_rq_throttled(qcfs_rq))
5900                         goto unthrottle_throttle;
5901         }
5902
5903         for_each_sched_entity(se) {
5904                 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5905
5906                 update_load_avg(qcfs_rq, se, UPDATE_TG);
5907                 se_update_runnable(se);
5908
5909                 if (cfs_rq_is_idle(group_cfs_rq(se)))
5910                         idle_task_delta = cfs_rq->h_nr_running;
5911
5912                 qcfs_rq->h_nr_running += task_delta;
5913                 qcfs_rq->idle_h_nr_running += idle_task_delta;
5914
5915                 /* end evaluation on encountering a throttled cfs_rq */
5916                 if (cfs_rq_throttled(qcfs_rq))
5917                         goto unthrottle_throttle;
5918         }
5919
5920         /* At this point se is NULL and we are at root level*/
5921         add_nr_running(rq, task_delta);
5922
5923 unthrottle_throttle:
5924         assert_list_leaf_cfs_rq(rq);
5925
5926         /* Determine whether we need to wake up potentially idle CPU: */
5927         if (rq->curr == rq->idle && rq->cfs.nr_running)
5928                 resched_curr(rq);
5929 }
5930
5931 #ifdef CONFIG_SMP
5932 static void __cfsb_csd_unthrottle(void *arg)
5933 {
5934         struct cfs_rq *cursor, *tmp;
5935         struct rq *rq = arg;
5936         struct rq_flags rf;
5937
5938         rq_lock(rq, &rf);
5939
5940         /*
5941          * Iterating over the list can trigger several call to
5942          * update_rq_clock() in unthrottle_cfs_rq().
5943          * Do it once and skip the potential next ones.
5944          */
5945         update_rq_clock(rq);
5946         rq_clock_start_loop_update(rq);
5947
5948         /*
5949          * Since we hold rq lock we're safe from concurrent manipulation of
5950          * the CSD list. However, this RCU critical section annotates the
5951          * fact that we pair with sched_free_group_rcu(), so that we cannot
5952          * race with group being freed in the window between removing it
5953          * from the list and advancing to the next entry in the list.
5954          */
5955         rcu_read_lock();
5956
5957         list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
5958                                  throttled_csd_list) {
5959                 list_del_init(&cursor->throttled_csd_list);
5960
5961                 if (cfs_rq_throttled(cursor))
5962                         unthrottle_cfs_rq(cursor);
5963         }
5964
5965         rcu_read_unlock();
5966
5967         rq_clock_stop_loop_update(rq);
5968         rq_unlock(rq, &rf);
5969 }
5970
5971 static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
5972 {
5973         struct rq *rq = rq_of(cfs_rq);
5974         bool first;
5975
5976         if (rq == this_rq()) {
5977                 unthrottle_cfs_rq(cfs_rq);
5978                 return;
5979         }
5980
5981         /* Already enqueued */
5982         if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
5983                 return;
5984
5985         first = list_empty(&rq->cfsb_csd_list);
5986         list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
5987         if (first)
5988                 smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
5989 }
5990 #else
5991 static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
5992 {
5993         unthrottle_cfs_rq(cfs_rq);
5994 }
5995 #endif
5996
5997 static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
5998 {
5999         lockdep_assert_rq_held(rq_of(cfs_rq));
6000
6001         if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) ||
6002             cfs_rq->runtime_remaining <= 0))
6003                 return;
6004
6005         __unthrottle_cfs_rq_async(cfs_rq);
6006 }
6007
6008 static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
6009 {
6010         int this_cpu = smp_processor_id();
6011         u64 runtime, remaining = 1;
6012         bool throttled = false;
6013         struct cfs_rq *cfs_rq, *tmp;
6014         struct rq_flags rf;
6015         struct rq *rq;
6016         LIST_HEAD(local_unthrottle);
6017
6018         rcu_read_lock();
6019         list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
6020                                 throttled_list) {
6021                 rq = rq_of(cfs_rq);
6022
6023                 if (!remaining) {
6024                         throttled = true;
6025                         break;
6026                 }
6027
6028                 rq_lock_irqsave(rq, &rf);
6029                 if (!cfs_rq_throttled(cfs_rq))
6030                         goto next;
6031
6032                 /* Already queued for async unthrottle */
6033                 if (!list_empty(&cfs_rq->throttled_csd_list))
6034                         goto next;
6035
6036                 /* By the above checks, this should never be true */
6037                 SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
6038
6039                 raw_spin_lock(&cfs_b->lock);
6040                 runtime = -cfs_rq->runtime_remaining + 1;
6041                 if (runtime > cfs_b->runtime)
6042                         runtime = cfs_b->runtime;
6043                 cfs_b->runtime -= runtime;
6044                 remaining = cfs_b->runtime;
6045                 raw_spin_unlock(&cfs_b->lock);
6046
6047                 cfs_rq->runtime_remaining += runtime;
6048
6049                 /* we check whether we're throttled above */
6050                 if (cfs_rq->runtime_remaining > 0) {
6051                         if (cpu_of(rq) != this_cpu) {
6052                                 unthrottle_cfs_rq_async(cfs_rq);
6053                         } else {
6054                                 /*
6055                                  * We currently only expect to be unthrottling
6056                                  * a single cfs_rq locally.
6057                                  */
6058                                 SCHED_WARN_ON(!list_empty(&local_unthrottle));
6059                                 list_add_tail(&cfs_rq->throttled_csd_list,
6060                                               &local_unthrottle);
6061                         }
6062                 } else {
6063                         throttled = true;
6064                 }
6065
6066 next:
6067                 rq_unlock_irqrestore(rq, &rf);
6068         }
6069
6070         list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle,
6071                                  throttled_csd_list) {
6072                 struct rq *rq = rq_of(cfs_rq);
6073
6074                 rq_lock_irqsave(rq, &rf);
6075
6076                 list_del_init(&cfs_rq->throttled_csd_list);
6077
6078                 if (cfs_rq_throttled(cfs_rq))
6079                         unthrottle_cfs_rq(cfs_rq);
6080
6081                 rq_unlock_irqrestore(rq, &rf);
6082         }
6083         SCHED_WARN_ON(!list_empty(&local_unthrottle));
6084
6085         rcu_read_unlock();
6086
6087         return throttled;
6088 }
6089
6090 /*
6091  * Responsible for refilling a task_group's bandwidth and unthrottling its
6092  * cfs_rqs as appropriate. If there has been no activity within the last
6093  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
6094  * used to track this state.
6095  */
6096 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
6097 {
6098         int throttled;
6099
6100         /* no need to continue the timer with no bandwidth constraint */
6101         if (cfs_b->quota == RUNTIME_INF)
6102                 goto out_deactivate;
6103
6104         throttled = !list_empty(&cfs_b->throttled_cfs_rq);
6105         cfs_b->nr_periods += overrun;
6106
6107         /* Refill extra burst quota even if cfs_b->idle */
6108         __refill_cfs_bandwidth_runtime(cfs_b);
6109
6110         /*
6111          * idle depends on !throttled (for the case of a large deficit), and if
6112          * we're going inactive then everything else can be deferred
6113          */
6114         if (cfs_b->idle && !throttled)
6115                 goto out_deactivate;
6116
6117         if (!throttled) {
6118                 /* mark as potentially idle for the upcoming period */
6119                 cfs_b->idle = 1;
6120                 return 0;
6121         }
6122
6123         /* account preceding periods in which throttling occurred */
6124         cfs_b->nr_throttled += overrun;
6125
6126         /*
6127          * This check is repeated as we release cfs_b->lock while we unthrottle.
6128          */
6129         while (throttled && cfs_b->runtime > 0) {
6130                 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6131                 /* we can't nest cfs_b->lock while distributing bandwidth */
6132                 throttled = distribute_cfs_runtime(cfs_b);
6133                 raw_spin_lock_irqsave(&cfs_b->lock, flags);
6134         }
6135
6136         /*
6137          * While we are ensured activity in the period following an
6138          * unthrottle, this also covers the case in which the new bandwidth is
6139          * insufficient to cover the existing bandwidth deficit.  (Forcing the
6140          * timer to remain active while there are any throttled entities.)
6141          */
6142         cfs_b->idle = 0;
6143
6144         return 0;
6145
6146 out_deactivate:
6147         return 1;
6148 }
6149
6150 /* a cfs_rq won't donate quota below this amount */
6151 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
6152 /* minimum remaining period time to redistribute slack quota */
6153 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
6154 /* how long we wait to gather additional slack before distributing */
6155 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
6156
6157 /*
6158  * Are we near the end of the current quota period?
6159  *
6160  * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
6161  * hrtimer base being cleared by hrtimer_start. In the case of
6162  * migrate_hrtimers, base is never cleared, so we are fine.
6163  */
6164 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
6165 {
6166         struct hrtimer *refresh_timer = &cfs_b->period_timer;
6167         s64 remaining;
6168
6169         /* if the call-back is running a quota refresh is already occurring */
6170         if (hrtimer_callback_running(refresh_timer))
6171                 return 1;
6172
6173         /* is a quota refresh about to occur? */
6174         remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
6175         if (remaining < (s64)min_expire)
6176                 return 1;
6177
6178         return 0;
6179 }
6180
6181 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
6182 {
6183         u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
6184
6185         /* if there's a quota refresh soon don't bother with slack */
6186         if (runtime_refresh_within(cfs_b, min_left))
6187                 return;
6188
6189         /* don't push forwards an existing deferred unthrottle */
6190         if (cfs_b->slack_started)
6191                 return;
6192         cfs_b->slack_started = true;
6193
6194         hrtimer_start(&cfs_b->slack_timer,
6195                         ns_to_ktime(cfs_bandwidth_slack_period),
6196                         HRTIMER_MODE_REL);
6197 }
6198
6199 /* we know any runtime found here is valid as update_curr() precedes return */
6200 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6201 {
6202         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6203         s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
6204
6205         if (slack_runtime <= 0)
6206                 return;
6207
6208         raw_spin_lock(&cfs_b->lock);
6209         if (cfs_b->quota != RUNTIME_INF) {
6210                 cfs_b->runtime += slack_runtime;
6211
6212                 /* we are under rq->lock, defer unthrottling using a timer */
6213                 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
6214                     !list_empty(&cfs_b->throttled_cfs_rq))
6215                         start_cfs_slack_bandwidth(cfs_b);
6216         }
6217         raw_spin_unlock(&cfs_b->lock);
6218
6219         /* even if it's not valid for return we don't want to try again */
6220         cfs_rq->runtime_remaining -= slack_runtime;
6221 }
6222
6223 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6224 {
6225         if (!cfs_bandwidth_used())
6226                 return;
6227
6228         if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
6229                 return;
6230
6231         __return_cfs_rq_runtime(cfs_rq);
6232 }
6233
6234 /*
6235  * This is done with a timer (instead of inline with bandwidth return) since
6236  * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
6237  */
6238 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
6239 {
6240         u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
6241         unsigned long flags;
6242
6243         /* confirm we're still not at a refresh boundary */
6244         raw_spin_lock_irqsave(&cfs_b->lock, flags);
6245         cfs_b->slack_started = false;
6246
6247         if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
6248                 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6249                 return;
6250         }
6251
6252         if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
6253                 runtime = cfs_b->runtime;
6254
6255         raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6256
6257         if (!runtime)
6258                 return;
6259
6260         distribute_cfs_runtime(cfs_b);
6261 }
6262
6263 /*
6264  * When a group wakes up we want to make sure that its quota is not already
6265  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
6266  * runtime as update_curr() throttling can not trigger until it's on-rq.
6267  */
6268 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
6269 {
6270         if (!cfs_bandwidth_used())
6271                 return;
6272
6273         /* an active group must be handled by the update_curr()->put() path */
6274         if (!cfs_rq->runtime_enabled || cfs_rq->curr)
6275                 return;
6276
6277         /* ensure the group is not already throttled */
6278         if (cfs_rq_throttled(cfs_rq))
6279                 return;
6280
6281         /* update runtime allocation */
6282         account_cfs_rq_runtime(cfs_rq, 0);
6283         if (cfs_rq->runtime_remaining <= 0)
6284                 throttle_cfs_rq(cfs_rq);
6285 }
6286
6287 static void sync_throttle(struct task_group *tg, int cpu)
6288 {
6289         struct cfs_rq *pcfs_rq, *cfs_rq;
6290
6291         if (!cfs_bandwidth_used())
6292                 return;
6293
6294         if (!tg->parent)
6295                 return;
6296
6297         cfs_rq = tg->cfs_rq[cpu];
6298         pcfs_rq = tg->parent->cfs_rq[cpu];
6299
6300         cfs_rq->throttle_count = pcfs_rq->throttle_count;
6301         cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
6302 }
6303
6304 /* conditionally throttle active cfs_rq's from put_prev_entity() */
6305 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6306 {
6307         if (!cfs_bandwidth_used())
6308                 return false;
6309
6310         if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
6311                 return false;
6312
6313         /*
6314          * it's possible for a throttled entity to be forced into a running
6315          * state (e.g. set_curr_task), in this case we're finished.
6316          */
6317         if (cfs_rq_throttled(cfs_rq))
6318                 return true;
6319
6320         return throttle_cfs_rq(cfs_rq);
6321 }
6322
6323 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
6324 {
6325         struct cfs_bandwidth *cfs_b =
6326                 container_of(timer, struct cfs_bandwidth, slack_timer);
6327
6328         do_sched_cfs_slack_timer(cfs_b);
6329
6330         return HRTIMER_NORESTART;
6331 }
6332
6333 extern const u64 max_cfs_quota_period;
6334
6335 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
6336 {
6337         struct cfs_bandwidth *cfs_b =
6338                 container_of(timer, struct cfs_bandwidth, period_timer);
6339         unsigned long flags;
6340         int overrun;
6341         int idle = 0;
6342         int count = 0;
6343
6344         raw_spin_lock_irqsave(&cfs_b->lock, flags);
6345         for (;;) {
6346                 overrun = hrtimer_forward_now(timer, cfs_b->period);
6347                 if (!overrun)
6348                         break;
6349
6350                 idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
6351
6352                 if (++count > 3) {
6353                         u64 new, old = ktime_to_ns(cfs_b->period);
6354
6355                         /*
6356                          * Grow period by a factor of 2 to avoid losing precision.
6357                          * Precision loss in the quota/period ratio can cause __cfs_schedulable
6358                          * to fail.
6359                          */
6360                         new = old * 2;
6361                         if (new < max_cfs_quota_period) {
6362                                 cfs_b->period = ns_to_ktime(new);
6363                                 cfs_b->quota *= 2;
6364                                 cfs_b->burst *= 2;
6365
6366                                 pr_warn_ratelimited(
6367         "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
6368                                         smp_processor_id(),
6369                                         div_u64(new, NSEC_PER_USEC),
6370                                         div_u64(cfs_b->quota, NSEC_PER_USEC));
6371                         } else {
6372                                 pr_warn_ratelimited(
6373         "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
6374                                         smp_processor_id(),
6375                                         div_u64(old, NSEC_PER_USEC),
6376                                         div_u64(cfs_b->quota, NSEC_PER_USEC));
6377                         }
6378
6379                         /* reset count so we don't come right back in here */
6380                         count = 0;
6381                 }
6382         }
6383         if (idle)
6384                 cfs_b->period_active = 0;
6385         raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6386
6387         return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
6388 }
6389
6390 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent)
6391 {
6392         raw_spin_lock_init(&cfs_b->lock);
6393         cfs_b->runtime = 0;
6394         cfs_b->quota = RUNTIME_INF;
6395         cfs_b->period = ns_to_ktime(default_cfs_period());
6396         cfs_b->burst = 0;
6397         cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF;
6398
6399         INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
6400         hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
6401         cfs_b->period_timer.function = sched_cfs_period_timer;
6402
6403         /* Add a random offset so that timers interleave */
6404         hrtimer_set_expires(&cfs_b->period_timer,
6405                             get_random_u32_below(cfs_b->period));
6406         hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
6407         cfs_b->slack_timer.function = sched_cfs_slack_timer;
6408         cfs_b->slack_started = false;
6409 }
6410
6411 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6412 {
6413         cfs_rq->runtime_enabled = 0;
6414         INIT_LIST_HEAD(&cfs_rq->throttled_list);
6415         INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
6416 }
6417
6418 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
6419 {
6420         lockdep_assert_held(&cfs_b->lock);
6421
6422         if (cfs_b->period_active)
6423                 return;
6424
6425         cfs_b->period_active = 1;
6426         hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
6427         hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
6428 }
6429
6430 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
6431 {
6432         int __maybe_unused i;
6433
6434         /* init_cfs_bandwidth() was not called */
6435         if (!cfs_b->throttled_cfs_rq.next)
6436                 return;
6437
6438         hrtimer_cancel(&cfs_b->period_timer);
6439         hrtimer_cancel(&cfs_b->slack_timer);
6440
6441         /*
6442          * It is possible that we still have some cfs_rq's pending on a CSD
6443          * list, though this race is very rare. In order for this to occur, we
6444          * must have raced with the last task leaving the group while there
6445          * exist throttled cfs_rq(s), and the period_timer must have queued the
6446          * CSD item but the remote cpu has not yet processed it. To handle this,
6447          * we can simply flush all pending CSD work inline here. We're
6448          * guaranteed at this point that no additional cfs_rq of this group can
6449          * join a CSD list.
6450          */
6451 #ifdef CONFIG_SMP
6452         for_each_possible_cpu(i) {
6453                 struct rq *rq = cpu_rq(i);
6454                 unsigned long flags;
6455
6456                 if (list_empty(&rq->cfsb_csd_list))
6457                         continue;
6458
6459                 local_irq_save(flags);
6460                 __cfsb_csd_unthrottle(rq);
6461                 local_irq_restore(flags);
6462         }
6463 #endif
6464 }
6465
6466 /*
6467  * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
6468  *
6469  * The race is harmless, since modifying bandwidth settings of unhooked group
6470  * bits doesn't do much.
6471  */
6472
6473 /* cpu online callback */
6474 static void __maybe_unused update_runtime_enabled(struct rq *rq)
6475 {
6476         struct task_group *tg;
6477
6478         lockdep_assert_rq_held(rq);
6479
6480         rcu_read_lock();
6481         list_for_each_entry_rcu(tg, &task_groups, list) {
6482                 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
6483                 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
6484
6485                 raw_spin_lock(&cfs_b->lock);
6486                 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
6487                 raw_spin_unlock(&cfs_b->lock);
6488         }
6489         rcu_read_unlock();
6490 }
6491
6492 /* cpu offline callback */
6493 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
6494 {
6495         struct task_group *tg;
6496
6497         lockdep_assert_rq_held(rq);
6498
6499         /*
6500          * The rq clock has already been updated in the
6501          * set_rq_offline(), so we should skip updating
6502          * the rq clock again in unthrottle_cfs_rq().
6503          */
6504         rq_clock_start_loop_update(rq);
6505
6506         rcu_read_lock();
6507         list_for_each_entry_rcu(tg, &task_groups, list) {
6508                 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
6509
6510                 if (!cfs_rq->runtime_enabled)
6511                         continue;
6512
6513                 /*
6514                  * clock_task is not advancing so we just need to make sure
6515                  * there's some valid quota amount
6516                  */
6517                 cfs_rq->runtime_remaining = 1;
6518                 /*
6519                  * Offline rq is schedulable till CPU is completely disabled
6520                  * in take_cpu_down(), so we prevent new cfs throttling here.
6521                  */
6522                 cfs_rq->runtime_enabled = 0;
6523
6524                 if (cfs_rq_throttled(cfs_rq))
6525                         unthrottle_cfs_rq(cfs_rq);
6526         }
6527         rcu_read_unlock();
6528
6529         rq_clock_stop_loop_update(rq);
6530 }
6531
6532 bool cfs_task_bw_constrained(struct task_struct *p)
6533 {
6534         struct cfs_rq *cfs_rq = task_cfs_rq(p);
6535
6536         if (!cfs_bandwidth_used())
6537                 return false;
6538
6539         if (cfs_rq->runtime_enabled ||
6540             tg_cfs_bandwidth(cfs_rq->tg)->hierarchical_quota != RUNTIME_INF)
6541                 return true;
6542
6543         return false;
6544 }
6545
6546 #ifdef CONFIG_NO_HZ_FULL
6547 /* called from pick_next_task_fair() */
6548 static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
6549 {
6550         int cpu = cpu_of(rq);
6551
6552         if (!sched_feat(HZ_BW) || !cfs_bandwidth_used())
6553                 return;
6554
6555         if (!tick_nohz_full_cpu(cpu))
6556                 return;
6557
6558         if (rq->nr_running != 1)
6559                 return;
6560
6561         /*
6562          *  We know there is only one task runnable and we've just picked it. The
6563          *  normal enqueue path will have cleared TICK_DEP_BIT_SCHED if we will
6564          *  be otherwise able to stop the tick. Just need to check if we are using
6565          *  bandwidth control.
6566          */
6567         if (cfs_task_bw_constrained(p))
6568                 tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
6569 }
6570 #endif
6571
6572 #else /* CONFIG_CFS_BANDWIDTH */
6573
6574 static inline bool cfs_bandwidth_used(void)
6575 {
6576         return false;
6577 }
6578
6579 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
6580 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
6581 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
6582 static inline void sync_throttle(struct task_group *tg, int cpu) {}
6583 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
6584
6585 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
6586 {
6587         return 0;
6588 }
6589
6590 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
6591 {
6592         return 0;
6593 }
6594
6595 static inline int throttled_lb_pair(struct task_group *tg,
6596                                     int src_cpu, int dest_cpu)
6597 {
6598         return 0;
6599 }
6600
6601 #ifdef CONFIG_FAIR_GROUP_SCHED
6602 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {}
6603 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
6604 #endif
6605
6606 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
6607 {
6608         return NULL;
6609 }
6610 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
6611 static inline void update_runtime_enabled(struct rq *rq) {}
6612 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
6613 #ifdef CONFIG_CGROUP_SCHED
6614 bool cfs_task_bw_constrained(struct task_struct *p)
6615 {
6616         return false;
6617 }
6618 #endif
6619 #endif /* CONFIG_CFS_BANDWIDTH */
6620
6621 #if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL)
6622 static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {}
6623 #endif
6624
6625 /**************************************************
6626  * CFS operations on tasks:
6627  */
6628
6629 #ifdef CONFIG_SCHED_HRTICK
6630 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
6631 {
6632         struct sched_entity *se = &p->se;
6633
6634         SCHED_WARN_ON(task_rq(p) != rq);
6635
6636         if (rq->cfs.h_nr_running > 1) {
6637                 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
6638                 u64 slice = se->slice;
6639                 s64 delta = slice - ran;
6640
6641                 if (delta < 0) {
6642                         if (task_current(rq, p))
6643                                 resched_curr(rq);
6644                         return;
6645                 }
6646                 hrtick_start(rq, delta);
6647         }
6648 }
6649
6650 /*
6651  * called from enqueue/dequeue and updates the hrtick when the
6652  * current task is from our class and nr_running is low enough
6653  * to matter.
6654  */
6655 static void hrtick_update(struct rq *rq)
6656 {
6657         struct task_struct *curr = rq->curr;
6658
6659         if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class)
6660                 return;
6661
6662         hrtick_start_fair(rq, curr);
6663 }
6664 #else /* !CONFIG_SCHED_HRTICK */
6665 static inline void
6666 hrtick_start_fair(struct rq *rq, struct task_struct *p)
6667 {
6668 }
6669
6670 static inline void hrtick_update(struct rq *rq)
6671 {
6672 }
6673 #endif
6674
6675 #ifdef CONFIG_SMP
6676 static inline bool cpu_overutilized(int cpu)
6677 {
6678         unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
6679         unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
6680
6681         /* Return true only if the utilization doesn't fit CPU's capacity */
6682         return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
6683 }
6684
6685 static inline void update_overutilized_status(struct rq *rq)
6686 {
6687         if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
6688                 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
6689                 trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
6690         }
6691 }
6692 #else
6693 static inline void update_overutilized_status(struct rq *rq) { }
6694 #endif
6695
6696 /* Runqueue only has SCHED_IDLE tasks enqueued */
6697 static int sched_idle_rq(struct rq *rq)
6698 {
6699         return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
6700                         rq->nr_running);
6701 }
6702
6703 #ifdef CONFIG_SMP
6704 static int sched_idle_cpu(int cpu)
6705 {
6706         return sched_idle_rq(cpu_rq(cpu));
6707 }
6708 #endif
6709
6710 /*
6711  * The enqueue_task method is called before nr_running is
6712  * increased. Here we update the fair scheduling stats and
6713  * then put the task into the rbtree:
6714  */
6715 static void
6716 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
6717 {
6718         struct cfs_rq *cfs_rq;
6719         struct sched_entity *se = &p->se;
6720         int idle_h_nr_running = task_has_idle_policy(p);
6721         int task_new = !(flags & ENQUEUE_WAKEUP);
6722
6723         /*
6724          * The code below (indirectly) updates schedutil which looks at
6725          * the cfs_rq utilization to select a frequency.
6726          * Let's add the task's estimated utilization to the cfs_rq's
6727          * estimated utilization, before we update schedutil.
6728          */
6729         util_est_enqueue(&rq->cfs, p);
6730
6731         /*
6732          * If in_iowait is set, the code below may not trigger any cpufreq
6733          * utilization updates, so do it here explicitly with the IOWAIT flag
6734          * passed.
6735          */
6736         if (p->in_iowait)
6737                 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
6738
6739         for_each_sched_entity(se) {
6740                 if (se->on_rq)
6741                         break;
6742                 cfs_rq = cfs_rq_of(se);
6743                 enqueue_entity(cfs_rq, se, flags);
6744
6745                 cfs_rq->h_nr_running++;
6746                 cfs_rq->idle_h_nr_running += idle_h_nr_running;
6747
6748                 if (cfs_rq_is_idle(cfs_rq))
6749                         idle_h_nr_running = 1;
6750
6751                 /* end evaluation on encountering a throttled cfs_rq */
6752                 if (cfs_rq_throttled(cfs_rq))
6753                         goto enqueue_throttle;
6754
6755                 flags = ENQUEUE_WAKEUP;
6756         }
6757
6758         for_each_sched_entity(se) {
6759                 cfs_rq = cfs_rq_of(se);
6760
6761                 update_load_avg(cfs_rq, se, UPDATE_TG);
6762                 se_update_runnable(se);
6763                 update_cfs_group(se);
6764
6765                 cfs_rq->h_nr_running++;
6766                 cfs_rq->idle_h_nr_running += idle_h_nr_running;
6767
6768                 if (cfs_rq_is_idle(cfs_rq))
6769                         idle_h_nr_running = 1;
6770
6771                 /* end evaluation on encountering a throttled cfs_rq */
6772                 if (cfs_rq_throttled(cfs_rq))
6773                         goto enqueue_throttle;
6774         }
6775
6776         /* At this point se is NULL and we are at root level*/
6777         add_nr_running(rq, 1);
6778
6779         /*
6780          * Since new tasks are assigned an initial util_avg equal to
6781          * half of the spare capacity of their CPU, tiny tasks have the
6782          * ability to cross the overutilized threshold, which will
6783          * result in the load balancer ruining all the task placement
6784          * done by EAS. As a way to mitigate that effect, do not account
6785          * for the first enqueue operation of new tasks during the
6786          * overutilized flag detection.
6787          *
6788          * A better way of solving this problem would be to wait for
6789          * the PELT signals of tasks to converge before taking them
6790          * into account, but that is not straightforward to implement,
6791          * and the following generally works well enough in practice.
6792          */
6793         if (!task_new)
6794                 update_overutilized_status(rq);
6795
6796 enqueue_throttle:
6797         assert_list_leaf_cfs_rq(rq);
6798
6799         hrtick_update(rq);
6800 }
6801
6802 static void set_next_buddy(struct sched_entity *se);
6803
6804 /*
6805  * The dequeue_task method is called before nr_running is
6806  * decreased. We remove the task from the rbtree and
6807  * update the fair scheduling stats:
6808  */
6809 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
6810 {
6811         struct cfs_rq *cfs_rq;
6812         struct sched_entity *se = &p->se;
6813         int task_sleep = flags & DEQUEUE_SLEEP;
6814         int idle_h_nr_running = task_has_idle_policy(p);
6815         bool was_sched_idle = sched_idle_rq(rq);
6816
6817         util_est_dequeue(&rq->cfs, p);
6818
6819         for_each_sched_entity(se) {
6820                 cfs_rq = cfs_rq_of(se);
6821                 dequeue_entity(cfs_rq, se, flags);
6822
6823                 cfs_rq->h_nr_running--;
6824                 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
6825
6826                 if (cfs_rq_is_idle(cfs_rq))
6827                         idle_h_nr_running = 1;
6828
6829                 /* end evaluation on encountering a throttled cfs_rq */
6830                 if (cfs_rq_throttled(cfs_rq))
6831                         goto dequeue_throttle;
6832
6833                 /* Don't dequeue parent if it has other entities besides us */
6834                 if (cfs_rq->load.weight) {
6835                         /* Avoid re-evaluating load for this entity: */
6836                         se = parent_entity(se);
6837                         /*
6838                          * Bias pick_next to pick a task from this cfs_rq, as
6839                          * p is sleeping when it is within its sched_slice.
6840                          */
6841                         if (task_sleep && se && !throttled_hierarchy(cfs_rq))
6842                                 set_next_buddy(se);
6843                         break;
6844                 }
6845                 flags |= DEQUEUE_SLEEP;
6846         }
6847
6848         for_each_sched_entity(se) {
6849                 cfs_rq = cfs_rq_of(se);
6850
6851                 update_load_avg(cfs_rq, se, UPDATE_TG);
6852                 se_update_runnable(se);
6853                 update_cfs_group(se);
6854
6855                 cfs_rq->h_nr_running--;
6856                 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
6857
6858                 if (cfs_rq_is_idle(cfs_rq))
6859                         idle_h_nr_running = 1;
6860
6861                 /* end evaluation on encountering a throttled cfs_rq */
6862                 if (cfs_rq_throttled(cfs_rq))
6863                         goto dequeue_throttle;
6864
6865         }
6866
6867         /* At this point se is NULL and we are at root level*/
6868         sub_nr_running(rq, 1);
6869
6870         /* balance early to pull high priority tasks */
6871         if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
6872                 rq->next_balance = jiffies;
6873
6874 dequeue_throttle:
6875         util_est_update(&rq->cfs, p, task_sleep);
6876         hrtick_update(rq);
6877 }
6878
6879 #ifdef CONFIG_SMP
6880
6881 /* Working cpumask for: load_balance, load_balance_newidle. */
6882 static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
6883 static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
6884 static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
6885
6886 #ifdef CONFIG_NO_HZ_COMMON
6887
6888 static struct {
6889         cpumask_var_t idle_cpus_mask;
6890         atomic_t nr_cpus;
6891         int has_blocked;                /* Idle CPUS has blocked load */
6892         int needs_update;               /* Newly idle CPUs need their next_balance collated */
6893         unsigned long next_balance;     /* in jiffy units */
6894         unsigned long next_blocked;     /* Next update of blocked load in jiffies */
6895 } nohz ____cacheline_aligned;
6896
6897 #endif /* CONFIG_NO_HZ_COMMON */
6898
6899 static unsigned long cpu_load(struct rq *rq)
6900 {
6901         return cfs_rq_load_avg(&rq->cfs);
6902 }
6903
6904 /*
6905  * cpu_load_without - compute CPU load without any contributions from *p
6906  * @cpu: the CPU which load is requested
6907  * @p: the task which load should be discounted
6908  *
6909  * The load of a CPU is defined by the load of tasks currently enqueued on that
6910  * CPU as well as tasks which are currently sleeping after an execution on that
6911  * CPU.
6912  *
6913  * This method returns the load of the specified CPU by discounting the load of
6914  * the specified task, whenever the task is currently contributing to the CPU
6915  * load.
6916  */
6917 static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
6918 {
6919         struct cfs_rq *cfs_rq;
6920         unsigned int load;
6921
6922         /* Task has no contribution or is new */
6923         if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
6924                 return cpu_load(rq);
6925
6926         cfs_rq = &rq->cfs;
6927         load = READ_ONCE(cfs_rq->avg.load_avg);
6928
6929         /* Discount task's util from CPU's util */
6930         lsub_positive(&load, task_h_load(p));
6931
6932         return load;
6933 }
6934
6935 static unsigned long cpu_runnable(struct rq *rq)
6936 {
6937         return cfs_rq_runnable_avg(&rq->cfs);
6938 }
6939
6940 static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p)
6941 {
6942         struct cfs_rq *cfs_rq;
6943         unsigned int runnable;
6944
6945         /* Task has no contribution or is new */
6946         if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
6947                 return cpu_runnable(rq);
6948
6949         cfs_rq = &rq->cfs;
6950         runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
6951
6952         /* Discount task's runnable from CPU's runnable */
6953         lsub_positive(&runnable, p->se.avg.runnable_avg);
6954
6955         return runnable;
6956 }
6957
6958 static unsigned long capacity_of(int cpu)
6959 {
6960         return cpu_rq(cpu)->cpu_capacity;
6961 }
6962
6963 static void record_wakee(struct task_struct *p)
6964 {
6965         /*
6966          * Only decay a single time; tasks that have less then 1 wakeup per
6967          * jiffy will not have built up many flips.
6968          */
6969         if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
6970                 current->wakee_flips >>= 1;
6971                 current->wakee_flip_decay_ts = jiffies;
6972         }
6973
6974         if (current->last_wakee != p) {
6975                 current->last_wakee = p;
6976                 current->wakee_flips++;
6977         }
6978 }
6979
6980 /*
6981  * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
6982  *
6983  * A waker of many should wake a different task than the one last awakened
6984  * at a frequency roughly N times higher than one of its wakees.
6985  *
6986  * In order to determine whether we should let the load spread vs consolidating
6987  * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
6988  * partner, and a factor of lls_size higher frequency in the other.
6989  *
6990  * With both conditions met, we can be relatively sure that the relationship is
6991  * non-monogamous, with partner count exceeding socket size.
6992  *
6993  * Waker/wakee being client/server, worker/dispatcher, interrupt source or
6994  * whatever is irrelevant, spread criteria is apparent partner count exceeds
6995  * socket size.
6996  */
6997 static int wake_wide(struct task_struct *p)
6998 {
6999         unsigned int master = current->wakee_flips;
7000         unsigned int slave = p->wakee_flips;
7001         int factor = __this_cpu_read(sd_llc_size);
7002
7003         if (master < slave)
7004                 swap(master, slave);
7005         if (slave < factor || master < slave * factor)
7006                 return 0;
7007         return 1;
7008 }
7009
7010 /*
7011  * The purpose of wake_affine() is to quickly determine on which CPU we can run
7012  * soonest. For the purpose of speed we only consider the waking and previous
7013  * CPU.
7014  *
7015  * wake_affine_idle() - only considers 'now', it check if the waking CPU is
7016  *                      cache-affine and is (or will be) idle.
7017  *
7018  * wake_affine_weight() - considers the weight to reflect the average
7019  *                        scheduling latency of the CPUs. This seems to work
7020  *                        for the overloaded case.
7021  */
7022 static int
7023 wake_affine_idle(int this_cpu, int prev_cpu, int sync)
7024 {
7025         /*
7026          * If this_cpu is idle, it implies the wakeup is from interrupt
7027          * context. Only allow the move if cache is shared. Otherwise an
7028          * interrupt intensive workload could force all tasks onto one
7029          * node depending on the IO topology or IRQ affinity settings.
7030          *
7031          * If the prev_cpu is idle and cache affine then avoid a migration.
7032          * There is no guarantee that the cache hot data from an interrupt
7033          * is more important than cache hot data on the prev_cpu and from
7034          * a cpufreq perspective, it's better to have higher utilisation
7035          * on one CPU.
7036          */
7037         if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
7038                 return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
7039
7040         if (sync && cpu_rq(this_cpu)->nr_running == 1)
7041                 return this_cpu;
7042
7043         if (available_idle_cpu(prev_cpu))
7044                 return prev_cpu;
7045
7046         return nr_cpumask_bits;
7047 }
7048
7049 static int
7050 wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
7051                    int this_cpu, int prev_cpu, int sync)
7052 {
7053         s64 this_eff_load, prev_eff_load;
7054         unsigned long task_load;
7055
7056         this_eff_load = cpu_load(cpu_rq(this_cpu));
7057
7058         if (sync) {
7059                 unsigned long current_load = task_h_load(current);
7060
7061                 if (current_load > this_eff_load)
7062                         return this_cpu;
7063
7064                 this_eff_load -= current_load;
7065         }
7066
7067         task_load = task_h_load(p);
7068
7069         this_eff_load += task_load;
7070         if (sched_feat(WA_BIAS))
7071                 this_eff_load *= 100;
7072         this_eff_load *= capacity_of(prev_cpu);
7073
7074         prev_eff_load = cpu_load(cpu_rq(prev_cpu));
7075         prev_eff_load -= task_load;
7076         if (sched_feat(WA_BIAS))
7077                 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
7078         prev_eff_load *= capacity_of(this_cpu);
7079
7080         /*
7081          * If sync, adjust the weight of prev_eff_load such that if
7082          * prev_eff == this_eff that select_idle_sibling() will consider
7083          * stacking the wakee on top of the waker if no other CPU is
7084          * idle.
7085          */
7086         if (sync)
7087                 prev_eff_load += 1;
7088
7089         return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
7090 }
7091
7092 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
7093                        int this_cpu, int prev_cpu, int sync)
7094 {
7095         int target = nr_cpumask_bits;
7096
7097         if (sched_feat(WA_IDLE))
7098                 target = wake_affine_idle(this_cpu, prev_cpu, sync);
7099
7100         if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
7101                 target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
7102
7103         schedstat_inc(p->stats.nr_wakeups_affine_attempts);
7104         if (target != this_cpu)
7105                 return prev_cpu;
7106
7107         schedstat_inc(sd->ttwu_move_affine);
7108         schedstat_inc(p->stats.nr_wakeups_affine);
7109         return target;
7110 }
7111
7112 static struct sched_group *
7113 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
7114
7115 /*
7116  * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
7117  */
7118 static int
7119 find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
7120 {
7121         unsigned long load, min_load = ULONG_MAX;
7122         unsigned int min_exit_latency = UINT_MAX;
7123         u64 latest_idle_timestamp = 0;
7124         int least_loaded_cpu = this_cpu;
7125         int shallowest_idle_cpu = -1;
7126         int i;
7127
7128         /* Check if we have any choice: */
7129         if (group->group_weight == 1)
7130                 return cpumask_first(sched_group_span(group));
7131
7132         /* Traverse only the allowed CPUs */
7133         for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
7134                 struct rq *rq = cpu_rq(i);
7135
7136                 if (!sched_core_cookie_match(rq, p))
7137                         continue;
7138
7139                 if (sched_idle_cpu(i))
7140                         return i;
7141
7142                 if (available_idle_cpu(i)) {
7143                         struct cpuidle_state *idle = idle_get_state(rq);
7144                         if (idle && idle->exit_latency < min_exit_latency) {
7145                                 /*
7146                                  * We give priority to a CPU whose idle state
7147                                  * has the smallest exit latency irrespective
7148                                  * of any idle timestamp.
7149                                  */
7150                                 min_exit_latency = idle->exit_latency;
7151                                 latest_idle_timestamp = rq->idle_stamp;
7152                                 shallowest_idle_cpu = i;
7153                         } else if ((!idle || idle->exit_latency == min_exit_latency) &&
7154                                    rq->idle_stamp > latest_idle_timestamp) {
7155                                 /*
7156                                  * If equal or no active idle state, then
7157                                  * the most recently idled CPU might have
7158                                  * a warmer cache.
7159                                  */
7160                                 latest_idle_timestamp = rq->idle_stamp;
7161                                 shallowest_idle_cpu = i;
7162                         }
7163                 } else if (shallowest_idle_cpu == -1) {
7164                         load = cpu_load(cpu_rq(i));
7165                         if (load < min_load) {
7166                                 min_load = load;
7167                                 least_loaded_cpu = i;
7168                         }
7169                 }
7170         }
7171
7172         return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
7173 }
7174
7175 static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
7176                                   int cpu, int prev_cpu, int sd_flag)
7177 {
7178         int new_cpu = cpu;
7179
7180         if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
7181                 return prev_cpu;
7182
7183         /*
7184          * We need task's util for cpu_util_without, sync it up to
7185          * prev_cpu's last_update_time.
7186          */
7187         if (!(sd_flag & SD_BALANCE_FORK))
7188                 sync_entity_load_avg(&p->se);
7189
7190         while (sd) {
7191                 struct sched_group *group;
7192                 struct sched_domain *tmp;
7193                 int weight;
7194
7195                 if (!(sd->flags & sd_flag)) {
7196                         sd = sd->child;
7197                         continue;
7198                 }
7199
7200                 group = find_idlest_group(sd, p, cpu);
7201                 if (!group) {
7202                         sd = sd->child;
7203                         continue;
7204                 }
7205
7206                 new_cpu = find_idlest_group_cpu(group, p, cpu);
7207                 if (new_cpu == cpu) {
7208                         /* Now try balancing at a lower domain level of 'cpu': */
7209                         sd = sd->child;
7210                         continue;
7211                 }
7212
7213                 /* Now try balancing at a lower domain level of 'new_cpu': */
7214                 cpu = new_cpu;
7215                 weight = sd->span_weight;
7216                 sd = NULL;
7217                 for_each_domain(cpu, tmp) {
7218                         if (weight <= tmp->span_weight)
7219                                 break;
7220                         if (tmp->flags & sd_flag)
7221                                 sd = tmp;
7222                 }
7223         }
7224
7225         return new_cpu;
7226 }
7227
7228 static inline int __select_idle_cpu(int cpu, struct task_struct *p)
7229 {
7230         if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
7231             sched_cpu_cookie_match(cpu_rq(cpu), p))
7232                 return cpu;
7233
7234         return -1;
7235 }
7236
7237 #ifdef CONFIG_SCHED_SMT
7238 DEFINE_STATIC_KEY_FALSE(sched_smt_present);
7239 EXPORT_SYMBOL_GPL(sched_smt_present);
7240
7241 static inline void set_idle_cores(int cpu, int val)
7242 {
7243         struct sched_domain_shared *sds;
7244
7245         sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
7246         if (sds)
7247                 WRITE_ONCE(sds->has_idle_cores, val);
7248 }
7249
7250 static inline bool test_idle_cores(int cpu)
7251 {
7252         struct sched_domain_shared *sds;
7253
7254         sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
7255         if (sds)
7256                 return READ_ONCE(sds->has_idle_cores);
7257
7258         return false;
7259 }
7260
7261 /*
7262  * Scans the local SMT mask to see if the entire core is idle, and records this
7263  * information in sd_llc_shared->has_idle_cores.
7264  *
7265  * Since SMT siblings share all cache levels, inspecting this limited remote
7266  * state should be fairly cheap.
7267  */
7268 void __update_idle_core(struct rq *rq)
7269 {
7270         int core = cpu_of(rq);
7271         int cpu;
7272
7273         rcu_read_lock();
7274         if (test_idle_cores(core))
7275                 goto unlock;
7276
7277         for_each_cpu(cpu, cpu_smt_mask(core)) {
7278                 if (cpu == core)
7279                         continue;
7280
7281                 if (!available_idle_cpu(cpu))
7282                         goto unlock;
7283         }
7284
7285         set_idle_cores(core, 1);
7286 unlock:
7287         rcu_read_unlock();
7288 }
7289
7290 /*
7291  * Scan the entire LLC domain for idle cores; this dynamically switches off if
7292  * there are no idle cores left in the system; tracked through
7293  * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
7294  */
7295 static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
7296 {
7297         bool idle = true;
7298         int cpu;
7299
7300         for_each_cpu(cpu, cpu_smt_mask(core)) {
7301                 if (!available_idle_cpu(cpu)) {
7302                         idle = false;
7303                         if (*idle_cpu == -1) {
7304                                 if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
7305                                         *idle_cpu = cpu;
7306                                         break;
7307                                 }
7308                                 continue;
7309                         }
7310                         break;
7311                 }
7312                 if (*idle_cpu == -1 && cpumask_test_cpu(cpu, cpus))
7313                         *idle_cpu = cpu;
7314         }
7315
7316         if (idle)
7317                 return core;
7318
7319         cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
7320         return -1;
7321 }
7322
7323 /*
7324  * Scan the local SMT mask for idle CPUs.
7325  */
7326 static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
7327 {
7328         int cpu;
7329
7330         for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) {
7331                 if (cpu == target)
7332                         continue;
7333                 /*
7334                  * Check if the CPU is in the LLC scheduling domain of @target.
7335                  * Due to isolcpus, there is no guarantee that all the siblings are in the domain.
7336                  */
7337                 if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
7338                         continue;
7339                 if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
7340                         return cpu;
7341         }
7342
7343         return -1;
7344 }
7345
7346 #else /* CONFIG_SCHED_SMT */
7347
7348 static inline void set_idle_cores(int cpu, int val)
7349 {
7350 }
7351
7352 static inline bool test_idle_cores(int cpu)
7353 {
7354         return false;
7355 }
7356
7357 static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
7358 {
7359         return __select_idle_cpu(core, p);
7360 }
7361
7362 static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
7363 {
7364         return -1;
7365 }
7366
7367 #endif /* CONFIG_SCHED_SMT */
7368
7369 /*
7370  * Scan the LLC domain for idle CPUs; this is dynamically regulated by
7371  * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
7372  * average idle time for this rq (as found in rq->avg_idle).
7373  */
7374 static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
7375 {
7376         struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
7377         int i, cpu, idle_cpu = -1, nr = INT_MAX;
7378         struct sched_domain_shared *sd_share;
7379
7380         cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
7381
7382         if (sched_feat(SIS_UTIL)) {
7383                 sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
7384                 if (sd_share) {
7385                         /* because !--nr is the condition to stop scan */
7386                         nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
7387                         /* overloaded LLC is unlikely to have idle cpu/core */
7388                         if (nr == 1)
7389                                 return -1;
7390                 }
7391         }
7392
7393         if (static_branch_unlikely(&sched_cluster_active)) {
7394                 struct sched_group *sg = sd->groups;
7395
7396                 if (sg->flags & SD_CLUSTER) {
7397                         for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) {
7398                                 if (!cpumask_test_cpu(cpu, cpus))
7399                                         continue;
7400
7401                                 if (has_idle_core) {
7402                                         i = select_idle_core(p, cpu, cpus, &idle_cpu);
7403                                         if ((unsigned int)i < nr_cpumask_bits)
7404                                                 return i;
7405                                 } else {
7406                                         if (--nr <= 0)
7407                                                 return -1;
7408                                         idle_cpu = __select_idle_cpu(cpu, p);
7409                                         if ((unsigned int)idle_cpu < nr_cpumask_bits)
7410                                                 return idle_cpu;
7411                                 }
7412                         }
7413                         cpumask_andnot(cpus, cpus, sched_group_span(sg));
7414                 }
7415         }
7416
7417         for_each_cpu_wrap(cpu, cpus, target + 1) {
7418                 if (has_idle_core) {
7419                         i = select_idle_core(p, cpu, cpus, &idle_cpu);
7420                         if ((unsigned int)i < nr_cpumask_bits)
7421                                 return i;
7422
7423                 } else {
7424                         if (--nr <= 0)
7425                                 return -1;
7426                         idle_cpu = __select_idle_cpu(cpu, p);
7427                         if ((unsigned int)idle_cpu < nr_cpumask_bits)
7428                                 break;
7429                 }
7430         }
7431
7432         if (has_idle_core)
7433                 set_idle_cores(target, false);
7434
7435         return idle_cpu;
7436 }
7437
7438 /*
7439  * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
7440  * the task fits. If no CPU is big enough, but there are idle ones, try to
7441  * maximize capacity.
7442  */
7443 static int
7444 select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
7445 {
7446         unsigned long task_util, util_min, util_max, best_cap = 0;
7447         int fits, best_fits = 0;
7448         int cpu, best_cpu = -1;
7449         struct cpumask *cpus;
7450
7451         cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
7452         cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
7453
7454         task_util = task_util_est(p);
7455         util_min = uclamp_eff_value(p, UCLAMP_MIN);
7456         util_max = uclamp_eff_value(p, UCLAMP_MAX);
7457
7458         for_each_cpu_wrap(cpu, cpus, target) {
7459                 unsigned long cpu_cap = capacity_of(cpu);
7460
7461                 if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
7462                         continue;
7463
7464                 fits = util_fits_cpu(task_util, util_min, util_max, cpu);
7465
7466                 /* This CPU fits with all requirements */
7467                 if (fits > 0)
7468                         return cpu;
7469                 /*
7470                  * Only the min performance hint (i.e. uclamp_min) doesn't fit.
7471                  * Look for the CPU with best capacity.
7472                  */
7473                 else if (fits < 0)
7474                         cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
7475
7476                 /*
7477                  * First, select CPU which fits better (-1 being better than 0).
7478                  * Then, select the one with best capacity at same level.
7479                  */
7480                 if ((fits < best_fits) ||
7481                     ((fits == best_fits) && (cpu_cap > best_cap))) {
7482                         best_cap = cpu_cap;
7483                         best_cpu = cpu;
7484                         best_fits = fits;
7485                 }
7486         }
7487
7488         return best_cpu;
7489 }
7490
7491 static inline bool asym_fits_cpu(unsigned long util,
7492                                  unsigned long util_min,
7493                                  unsigned long util_max,
7494                                  int cpu)
7495 {
7496         if (sched_asym_cpucap_active())
7497                 /*
7498                  * Return true only if the cpu fully fits the task requirements
7499                  * which include the utilization and the performance hints.
7500                  */
7501                 return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
7502
7503         return true;
7504 }
7505
7506 /*
7507  * Try and locate an idle core/thread in the LLC cache domain.
7508  */
7509 static int select_idle_sibling(struct task_struct *p, int prev, int target)
7510 {
7511         bool has_idle_core = false;
7512         struct sched_domain *sd;
7513         unsigned long task_util, util_min, util_max;
7514         int i, recent_used_cpu, prev_aff = -1;
7515
7516         /*
7517          * On asymmetric system, update task utilization because we will check
7518          * that the task fits with cpu's capacity.
7519          */
7520         if (sched_asym_cpucap_active()) {
7521                 sync_entity_load_avg(&p->se);
7522                 task_util = task_util_est(p);
7523                 util_min = uclamp_eff_value(p, UCLAMP_MIN);
7524                 util_max = uclamp_eff_value(p, UCLAMP_MAX);
7525         }
7526
7527         /*
7528          * per-cpu select_rq_mask usage
7529          */
7530         lockdep_assert_irqs_disabled();
7531
7532         if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
7533             asym_fits_cpu(task_util, util_min, util_max, target))
7534                 return target;
7535
7536         /*
7537          * If the previous CPU is cache affine and idle, don't be stupid:
7538          */
7539         if (prev != target && cpus_share_cache(prev, target) &&
7540             (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
7541             asym_fits_cpu(task_util, util_min, util_max, prev)) {
7542
7543                 if (!static_branch_unlikely(&sched_cluster_active) ||
7544                     cpus_share_resources(prev, target))
7545                         return prev;
7546
7547                 prev_aff = prev;
7548         }
7549
7550         /*
7551          * Allow a per-cpu kthread to stack with the wakee if the
7552          * kworker thread and the tasks previous CPUs are the same.
7553          * The assumption is that the wakee queued work for the
7554          * per-cpu kthread that is now complete and the wakeup is
7555          * essentially a sync wakeup. An obvious example of this
7556          * pattern is IO completions.
7557          */
7558         if (is_per_cpu_kthread(current) &&
7559             in_task() &&
7560             prev == smp_processor_id() &&
7561             this_rq()->nr_running <= 1 &&
7562             asym_fits_cpu(task_util, util_min, util_max, prev)) {
7563                 return prev;
7564         }
7565
7566         /* Check a recently used CPU as a potential idle candidate: */
7567         recent_used_cpu = p->recent_used_cpu;
7568         p->recent_used_cpu = prev;
7569         if (recent_used_cpu != prev &&
7570             recent_used_cpu != target &&
7571             cpus_share_cache(recent_used_cpu, target) &&
7572             (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
7573             cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
7574             asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
7575
7576                 if (!static_branch_unlikely(&sched_cluster_active) ||
7577                     cpus_share_resources(recent_used_cpu, target))
7578                         return recent_used_cpu;
7579
7580         } else {
7581                 recent_used_cpu = -1;
7582         }
7583
7584         /*
7585          * For asymmetric CPU capacity systems, our domain of interest is
7586          * sd_asym_cpucapacity rather than sd_llc.
7587          */
7588         if (sched_asym_cpucap_active()) {
7589                 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
7590                 /*
7591                  * On an asymmetric CPU capacity system where an exclusive
7592                  * cpuset defines a symmetric island (i.e. one unique
7593                  * capacity_orig value through the cpuset), the key will be set
7594                  * but the CPUs within that cpuset will not have a domain with
7595                  * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
7596                  * capacity path.
7597                  */
7598                 if (sd) {
7599                         i = select_idle_capacity(p, sd, target);
7600                         return ((unsigned)i < nr_cpumask_bits) ? i : target;
7601                 }
7602         }
7603
7604         sd = rcu_dereference(per_cpu(sd_llc, target));
7605         if (!sd)
7606                 return target;
7607
7608         if (sched_smt_active()) {
7609                 has_idle_core = test_idle_cores(target);
7610
7611                 if (!has_idle_core && cpus_share_cache(prev, target)) {
7612                         i = select_idle_smt(p, sd, prev);
7613                         if ((unsigned int)i < nr_cpumask_bits)
7614                                 return i;
7615                 }
7616         }
7617
7618         i = select_idle_cpu(p, sd, has_idle_core, target);
7619         if ((unsigned)i < nr_cpumask_bits)
7620                 return i;
7621
7622         /*
7623          * For cluster machines which have lower sharing cache like L2 or
7624          * LLC Tag, we tend to find an idle CPU in the target's cluster
7625          * first. But prev_cpu or recent_used_cpu may also be a good candidate,
7626          * use them if possible when no idle CPU found in select_idle_cpu().
7627          */
7628         if ((unsigned int)prev_aff < nr_cpumask_bits)
7629                 return prev_aff;
7630         if ((unsigned int)recent_used_cpu < nr_cpumask_bits)
7631                 return recent_used_cpu;
7632
7633         return target;
7634 }
7635
7636 /**
7637  * cpu_util() - Estimates the amount of CPU capacity used by CFS tasks.
7638  * @cpu: the CPU to get the utilization for
7639  * @p: task for which the CPU utilization should be predicted or NULL
7640  * @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL
7641  * @boost: 1 to enable boosting, otherwise 0
7642  *
7643  * The unit of the return value must be the same as the one of CPU capacity
7644  * so that CPU utilization can be compared with CPU capacity.
7645  *
7646  * CPU utilization is the sum of running time of runnable tasks plus the
7647  * recent utilization of currently non-runnable tasks on that CPU.
7648  * It represents the amount of CPU capacity currently used by CFS tasks in
7649  * the range [0..max CPU capacity] with max CPU capacity being the CPU
7650  * capacity at f_max.
7651  *
7652  * The estimated CPU utilization is defined as the maximum between CPU
7653  * utilization and sum of the estimated utilization of the currently
7654  * runnable tasks on that CPU. It preserves a utilization "snapshot" of
7655  * previously-executed tasks, which helps better deduce how busy a CPU will
7656  * be when a long-sleeping task wakes up. The contribution to CPU utilization
7657  * of such a task would be significantly decayed at this point of time.
7658  *
7659  * Boosted CPU utilization is defined as max(CPU runnable, CPU utilization).
7660  * CPU contention for CFS tasks can be detected by CPU runnable > CPU
7661  * utilization. Boosting is implemented in cpu_util() so that internal
7662  * users (e.g. EAS) can use it next to external users (e.g. schedutil),
7663  * latter via cpu_util_cfs_boost().
7664  *
7665  * CPU utilization can be higher than the current CPU capacity
7666  * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
7667  * of rounding errors as well as task migrations or wakeups of new tasks.
7668  * CPU utilization has to be capped to fit into the [0..max CPU capacity]
7669  * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
7670  * could be seen as over-utilized even though CPU1 has 20% of spare CPU
7671  * capacity. CPU utilization is allowed to overshoot current CPU capacity
7672  * though since this is useful for predicting the CPU capacity required
7673  * after task migrations (scheduler-driven DVFS).
7674  *
7675  * Return: (Boosted) (estimated) utilization for the specified CPU.
7676  */
7677 static unsigned long
7678 cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
7679 {
7680         struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
7681         unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
7682         unsigned long runnable;
7683
7684         if (boost) {
7685                 runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
7686                 util = max(util, runnable);
7687         }
7688
7689         /*
7690          * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
7691          * contribution. If @p migrates from another CPU to @cpu add its
7692          * contribution. In all the other cases @cpu is not impacted by the
7693          * migration so its util_avg is already correct.
7694          */
7695         if (p && task_cpu(p) == cpu && dst_cpu != cpu)
7696                 lsub_positive(&util, task_util(p));
7697         else if (p && task_cpu(p) != cpu && dst_cpu == cpu)
7698                 util += task_util(p);
7699
7700         if (sched_feat(UTIL_EST)) {
7701                 unsigned long util_est;
7702
7703                 util_est = READ_ONCE(cfs_rq->avg.util_est);
7704
7705                 /*
7706                  * During wake-up @p isn't enqueued yet and doesn't contribute
7707                  * to any cpu_rq(cpu)->cfs.avg.util_est.
7708                  * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
7709                  * has been enqueued.
7710                  *
7711                  * During exec (@dst_cpu = -1) @p is enqueued and does
7712                  * contribute to cpu_rq(cpu)->cfs.util_est.
7713                  * Remove it to "simulate" cpu_util without @p's contribution.
7714                  *
7715                  * Despite the task_on_rq_queued(@p) check there is still a
7716                  * small window for a possible race when an exec
7717                  * select_task_rq_fair() races with LB's detach_task().
7718                  *
7719                  *   detach_task()
7720                  *     deactivate_task()
7721                  *       p->on_rq = TASK_ON_RQ_MIGRATING;
7722                  *       -------------------------------- A
7723                  *       dequeue_task()                    \
7724                  *         dequeue_task_fair()              + Race Time
7725                  *           util_est_dequeue()            /
7726                  *       -------------------------------- B
7727                  *
7728                  * The additional check "current == p" is required to further
7729                  * reduce the race window.
7730                  */
7731                 if (dst_cpu == cpu)
7732                         util_est += _task_util_est(p);
7733                 else if (p && unlikely(task_on_rq_queued(p) || current == p))
7734                         lsub_positive(&util_est, _task_util_est(p));
7735
7736                 util = max(util, util_est);
7737         }
7738
7739         return min(util, arch_scale_cpu_capacity(cpu));
7740 }
7741
7742 unsigned long cpu_util_cfs(int cpu)
7743 {
7744         return cpu_util(cpu, NULL, -1, 0);
7745 }
7746
7747 unsigned long cpu_util_cfs_boost(int cpu)
7748 {
7749         return cpu_util(cpu, NULL, -1, 1);
7750 }
7751
7752 /*
7753  * cpu_util_without: compute cpu utilization without any contributions from *p
7754  * @cpu: the CPU which utilization is requested
7755  * @p: the task which utilization should be discounted
7756  *
7757  * The utilization of a CPU is defined by the utilization of tasks currently
7758  * enqueued on that CPU as well as tasks which are currently sleeping after an
7759  * execution on that CPU.
7760  *
7761  * This method returns the utilization of the specified CPU by discounting the
7762  * utilization of the specified task, whenever the task is currently
7763  * contributing to the CPU utilization.
7764  */
7765 static unsigned long cpu_util_without(int cpu, struct task_struct *p)
7766 {
7767         /* Task has no contribution or is new */
7768         if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
7769                 p = NULL;
7770
7771         return cpu_util(cpu, p, -1, 0);
7772 }
7773
7774 /*
7775  * energy_env - Utilization landscape for energy estimation.
7776  * @task_busy_time: Utilization contribution by the task for which we test the
7777  *                  placement. Given by eenv_task_busy_time().
7778  * @pd_busy_time:   Utilization of the whole perf domain without the task
7779  *                  contribution. Given by eenv_pd_busy_time().
7780  * @cpu_cap:        Maximum CPU capacity for the perf domain.
7781  * @pd_cap:         Entire perf domain capacity. (pd->nr_cpus * cpu_cap).
7782  */
7783 struct energy_env {
7784         unsigned long task_busy_time;
7785         unsigned long pd_busy_time;
7786         unsigned long cpu_cap;
7787         unsigned long pd_cap;
7788 };
7789
7790 /*
7791  * Compute the task busy time for compute_energy(). This time cannot be
7792  * injected directly into effective_cpu_util() because of the IRQ scaling.
7793  * The latter only makes sense with the most recent CPUs where the task has
7794  * run.
7795  */
7796 static inline void eenv_task_busy_time(struct energy_env *eenv,
7797                                        struct task_struct *p, int prev_cpu)
7798 {
7799         unsigned long busy_time, max_cap = arch_scale_cpu_capacity(prev_cpu);
7800         unsigned long irq = cpu_util_irq(cpu_rq(prev_cpu));
7801
7802         if (unlikely(irq >= max_cap))
7803                 busy_time = max_cap;
7804         else
7805                 busy_time = scale_irq_capacity(task_util_est(p), irq, max_cap);
7806
7807         eenv->task_busy_time = busy_time;
7808 }
7809
7810 /*
7811  * Compute the perf_domain (PD) busy time for compute_energy(). Based on the
7812  * utilization for each @pd_cpus, it however doesn't take into account
7813  * clamping since the ratio (utilization / cpu_capacity) is already enough to
7814  * scale the EM reported power consumption at the (eventually clamped)
7815  * cpu_capacity.
7816  *
7817  * The contribution of the task @p for which we want to estimate the
7818  * energy cost is removed (by cpu_util()) and must be calculated
7819  * separately (see eenv_task_busy_time). This ensures:
7820  *
7821  *   - A stable PD utilization, no matter which CPU of that PD we want to place
7822  *     the task on.
7823  *
7824  *   - A fair comparison between CPUs as the task contribution (task_util())
7825  *     will always be the same no matter which CPU utilization we rely on
7826  *     (util_avg or util_est).
7827  *
7828  * Set @eenv busy time for the PD that spans @pd_cpus. This busy time can't
7829  * exceed @eenv->pd_cap.
7830  */
7831 static inline void eenv_pd_busy_time(struct energy_env *eenv,
7832                                      struct cpumask *pd_cpus,
7833                                      struct task_struct *p)
7834 {
7835         unsigned long busy_time = 0;
7836         int cpu;
7837
7838         for_each_cpu(cpu, pd_cpus) {
7839                 unsigned long util = cpu_util(cpu, p, -1, 0);
7840
7841                 busy_time += effective_cpu_util(cpu, util, NULL, NULL);
7842         }
7843
7844         eenv->pd_busy_time = min(eenv->pd_cap, busy_time);
7845 }
7846
7847 /*
7848  * Compute the maximum utilization for compute_energy() when the task @p
7849  * is placed on the cpu @dst_cpu.
7850  *
7851  * Returns the maximum utilization among @eenv->cpus. This utilization can't
7852  * exceed @eenv->cpu_cap.
7853  */
7854 static inline unsigned long
7855 eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
7856                  struct task_struct *p, int dst_cpu)
7857 {
7858         unsigned long max_util = 0;
7859         int cpu;
7860
7861         for_each_cpu(cpu, pd_cpus) {
7862                 struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
7863                 unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
7864                 unsigned long eff_util, min, max;
7865
7866                 /*
7867                  * Performance domain frequency: utilization clamping
7868                  * must be considered since it affects the selection
7869                  * of the performance domain frequency.
7870                  * NOTE: in case RT tasks are running, by default the
7871                  * FREQUENCY_UTIL's utilization can be max OPP.
7872                  */
7873                 eff_util = effective_cpu_util(cpu, util, &min, &max);
7874
7875                 /* Task's uclamp can modify min and max value */
7876                 if (tsk && uclamp_is_used()) {
7877                         min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
7878
7879                         /*
7880                          * If there is no active max uclamp constraint,
7881                          * directly use task's one, otherwise keep max.
7882                          */
7883                         if (uclamp_rq_is_idle(cpu_rq(cpu)))
7884                                 max = uclamp_eff_value(p, UCLAMP_MAX);
7885                         else
7886                                 max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
7887                 }
7888
7889                 eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
7890                 max_util = max(max_util, eff_util);
7891         }
7892
7893         return min(max_util, eenv->cpu_cap);
7894 }
7895
7896 /*
7897  * compute_energy(): Use the Energy Model to estimate the energy that @pd would
7898  * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task
7899  * contribution is ignored.
7900  */
7901 static inline unsigned long
7902 compute_energy(struct energy_env *eenv, struct perf_domain *pd,
7903                struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
7904 {
7905         unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
7906         unsigned long busy_time = eenv->pd_busy_time;
7907         unsigned long energy;
7908
7909         if (dst_cpu >= 0)
7910                 busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
7911
7912         energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
7913
7914         trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time);
7915
7916         return energy;
7917 }
7918
7919 /*
7920  * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
7921  * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
7922  * spare capacity in each performance domain and uses it as a potential
7923  * candidate to execute the task. Then, it uses the Energy Model to figure
7924  * out which of the CPU candidates is the most energy-efficient.
7925  *
7926  * The rationale for this heuristic is as follows. In a performance domain,
7927  * all the most energy efficient CPU candidates (according to the Energy
7928  * Model) are those for which we'll request a low frequency. When there are
7929  * several CPUs for which the frequency request will be the same, we don't
7930  * have enough data to break the tie between them, because the Energy Model
7931  * only includes active power costs. With this model, if we assume that
7932  * frequency requests follow utilization (e.g. using schedutil), the CPU with
7933  * the maximum spare capacity in a performance domain is guaranteed to be among
7934  * the best candidates of the performance domain.
7935  *
7936  * In practice, it could be preferable from an energy standpoint to pack
7937  * small tasks on a CPU in order to let other CPUs go in deeper idle states,
7938  * but that could also hurt our chances to go cluster idle, and we have no
7939  * ways to tell with the current Energy Model if this is actually a good
7940  * idea or not. So, find_energy_efficient_cpu() basically favors
7941  * cluster-packing, and spreading inside a cluster. That should at least be
7942  * a good thing for latency, and this is consistent with the idea that most
7943  * of the energy savings of EAS come from the asymmetry of the system, and
7944  * not so much from breaking the tie between identical CPUs. That's also the
7945  * reason why EAS is enabled in the topology code only for systems where
7946  * SD_ASYM_CPUCAPACITY is set.
7947  *
7948  * NOTE: Forkees are not accepted in the energy-aware wake-up path because
7949  * they don't have any useful utilization data yet and it's not possible to
7950  * forecast their impact on energy consumption. Consequently, they will be
7951  * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
7952  * to be energy-inefficient in some use-cases. The alternative would be to
7953  * bias new tasks towards specific types of CPUs first, or to try to infer
7954  * their util_avg from the parent task, but those heuristics could hurt
7955  * other use-cases too. So, until someone finds a better way to solve this,
7956  * let's keep things simple by re-using the existing slow path.
7957  */
7958 static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
7959 {
7960         struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
7961         unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
7962         unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
7963         unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
7964         struct root_domain *rd = this_rq()->rd;
7965         int cpu, best_energy_cpu, target = -1;
7966         int prev_fits = -1, best_fits = -1;
7967         unsigned long best_thermal_cap = 0;
7968         unsigned long prev_thermal_cap = 0;
7969         struct sched_domain *sd;
7970         struct perf_domain *pd;
7971         struct energy_env eenv;
7972
7973         rcu_read_lock();
7974         pd = rcu_dereference(rd->pd);
7975         if (!pd || READ_ONCE(rd->overutilized))
7976                 goto unlock;
7977
7978         /*
7979          * Energy-aware wake-up happens on the lowest sched_domain starting
7980          * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
7981          */
7982         sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
7983         while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
7984                 sd = sd->parent;
7985         if (!sd)
7986                 goto unlock;
7987
7988         target = prev_cpu;
7989
7990         sync_entity_load_avg(&p->se);
7991         if (!task_util_est(p) && p_util_min == 0)
7992                 goto unlock;
7993
7994         eenv_task_busy_time(&eenv, p, prev_cpu);
7995
7996         for (; pd; pd = pd->next) {
7997                 unsigned long util_min = p_util_min, util_max = p_util_max;
7998                 unsigned long cpu_cap, cpu_thermal_cap, util;
7999                 long prev_spare_cap = -1, max_spare_cap = -1;
8000                 unsigned long rq_util_min, rq_util_max;
8001                 unsigned long cur_delta, base_energy;
8002                 int max_spare_cap_cpu = -1;
8003                 int fits, max_fits = -1;
8004
8005                 cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
8006
8007                 if (cpumask_empty(cpus))
8008                         continue;
8009
8010                 /* Account thermal pressure for the energy estimation */
8011                 cpu = cpumask_first(cpus);
8012                 cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
8013                 cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
8014
8015                 eenv.cpu_cap = cpu_thermal_cap;
8016                 eenv.pd_cap = 0;
8017
8018                 for_each_cpu(cpu, cpus) {
8019                         struct rq *rq = cpu_rq(cpu);
8020
8021                         eenv.pd_cap += cpu_thermal_cap;
8022
8023                         if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
8024                                 continue;
8025
8026                         if (!cpumask_test_cpu(cpu, p->cpus_ptr))
8027                                 continue;
8028
8029                         util = cpu_util(cpu, p, cpu, 0);
8030                         cpu_cap = capacity_of(cpu);
8031
8032                         /*
8033                          * Skip CPUs that cannot satisfy the capacity request.
8034                          * IOW, placing the task there would make the CPU
8035                          * overutilized. Take uclamp into account to see how
8036                          * much capacity we can get out of the CPU; this is
8037                          * aligned with sched_cpu_util().
8038                          */
8039                         if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) {
8040                                 /*
8041                                  * Open code uclamp_rq_util_with() except for
8042                                  * the clamp() part. Ie: apply max aggregation
8043                                  * only. util_fits_cpu() logic requires to
8044                                  * operate on non clamped util but must use the
8045                                  * max-aggregated uclamp_{min, max}.
8046                                  */
8047                                 rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN);
8048                                 rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX);
8049
8050                                 util_min = max(rq_util_min, p_util_min);
8051                                 util_max = max(rq_util_max, p_util_max);
8052                         }
8053
8054                         fits = util_fits_cpu(util, util_min, util_max, cpu);
8055                         if (!fits)
8056                                 continue;
8057
8058                         lsub_positive(&cpu_cap, util);
8059
8060                         if (cpu == prev_cpu) {
8061                                 /* Always use prev_cpu as a candidate. */
8062                                 prev_spare_cap = cpu_cap;
8063                                 prev_fits = fits;
8064                         } else if ((fits > max_fits) ||
8065                                    ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) {
8066                                 /*
8067                                  * Find the CPU with the maximum spare capacity
8068                                  * among the remaining CPUs in the performance
8069                                  * domain.
8070                                  */
8071                                 max_spare_cap = cpu_cap;
8072                                 max_spare_cap_cpu = cpu;
8073                                 max_fits = fits;
8074                         }
8075                 }
8076
8077                 if (max_spare_cap_cpu < 0 && prev_spare_cap < 0)
8078                         continue;
8079
8080                 eenv_pd_busy_time(&eenv, cpus, p);
8081                 /* Compute the 'base' energy of the pd, without @p */
8082                 base_energy = compute_energy(&eenv, pd, cpus, p, -1);
8083
8084                 /* Evaluate the energy impact of using prev_cpu. */
8085                 if (prev_spare_cap > -1) {
8086                         prev_delta = compute_energy(&eenv, pd, cpus, p,
8087                                                     prev_cpu);
8088                         /* CPU utilization has changed */
8089                         if (prev_delta < base_energy)
8090                                 goto unlock;
8091                         prev_delta -= base_energy;
8092                         prev_thermal_cap = cpu_thermal_cap;
8093                         best_delta = min(best_delta, prev_delta);
8094                 }
8095
8096                 /* Evaluate the energy impact of using max_spare_cap_cpu. */
8097                 if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
8098                         /* Current best energy cpu fits better */
8099                         if (max_fits < best_fits)
8100                                 continue;
8101
8102                         /*
8103                          * Both don't fit performance hint (i.e. uclamp_min)
8104                          * but best energy cpu has better capacity.
8105                          */
8106                         if ((max_fits < 0) &&
8107                             (cpu_thermal_cap <= best_thermal_cap))
8108                                 continue;
8109
8110                         cur_delta = compute_energy(&eenv, pd, cpus, p,
8111                                                    max_spare_cap_cpu);
8112                         /* CPU utilization has changed */
8113                         if (cur_delta < base_energy)
8114                                 goto unlock;
8115                         cur_delta -= base_energy;
8116
8117                         /*
8118                          * Both fit for the task but best energy cpu has lower
8119                          * energy impact.
8120                          */
8121                         if ((max_fits > 0) && (best_fits > 0) &&
8122                             (cur_delta >= best_delta))
8123                                 continue;
8124
8125                         best_delta = cur_delta;
8126                         best_energy_cpu = max_spare_cap_cpu;
8127                         best_fits = max_fits;
8128                         best_thermal_cap = cpu_thermal_cap;
8129                 }
8130         }
8131         rcu_read_unlock();
8132
8133         if ((best_fits > prev_fits) ||
8134             ((best_fits > 0) && (best_delta < prev_delta)) ||
8135             ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
8136                 target = best_energy_cpu;
8137
8138         return target;
8139
8140 unlock:
8141         rcu_read_unlock();
8142
8143         return target;
8144 }
8145
8146 /*
8147  * select_task_rq_fair: Select target runqueue for the waking task in domains
8148  * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
8149  * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
8150  *
8151  * Balances load by selecting the idlest CPU in the idlest group, or under
8152  * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
8153  *
8154  * Returns the target CPU number.
8155  */
8156 static int
8157 select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
8158 {
8159         int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
8160         struct sched_domain *tmp, *sd = NULL;
8161         int cpu = smp_processor_id();
8162         int new_cpu = prev_cpu;
8163         int want_affine = 0;
8164         /* SD_flags and WF_flags share the first nibble */
8165         int sd_flag = wake_flags & 0xF;
8166
8167         /*
8168          * required for stable ->cpus_allowed
8169          */
8170         lockdep_assert_held(&p->pi_lock);
8171         if (wake_flags & WF_TTWU) {
8172                 record_wakee(p);
8173
8174                 if ((wake_flags & WF_CURRENT_CPU) &&
8175                     cpumask_test_cpu(cpu, p->cpus_ptr))
8176                         return cpu;
8177
8178                 if (sched_energy_enabled()) {
8179                         new_cpu = find_energy_efficient_cpu(p, prev_cpu);
8180                         if (new_cpu >= 0)
8181                                 return new_cpu;
8182                         new_cpu = prev_cpu;
8183                 }
8184
8185                 want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
8186         }
8187
8188         rcu_read_lock();
8189         for_each_domain(cpu, tmp) {
8190                 /*
8191                  * If both 'cpu' and 'prev_cpu' are part of this domain,
8192                  * cpu is a valid SD_WAKE_AFFINE target.
8193                  */
8194                 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
8195                     cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
8196                         if (cpu != prev_cpu)
8197                                 new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
8198
8199                         sd = NULL; /* Prefer wake_affine over balance flags */
8200                         break;
8201                 }
8202
8203                 /*
8204                  * Usually only true for WF_EXEC and WF_FORK, as sched_domains
8205                  * usually do not have SD_BALANCE_WAKE set. That means wakeup
8206                  * will usually go to the fast path.
8207                  */
8208                 if (tmp->flags & sd_flag)
8209                         sd = tmp;
8210                 else if (!want_affine)
8211                         break;
8212         }
8213
8214         if (unlikely(sd)) {
8215                 /* Slow path */
8216                 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
8217         } else if (wake_flags & WF_TTWU) { /* XXX always ? */
8218                 /* Fast path */
8219                 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
8220         }
8221         rcu_read_unlock();
8222
8223         return new_cpu;
8224 }
8225
8226 /*
8227  * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
8228  * cfs_rq_of(p) references at time of call are still valid and identify the
8229  * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
8230  */
8231 static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
8232 {
8233         struct sched_entity *se = &p->se;
8234
8235         if (!task_on_rq_migrating(p)) {
8236                 remove_entity_load_avg(se);
8237
8238                 /*
8239                  * Here, the task's PELT values have been updated according to
8240                  * the current rq's clock. But if that clock hasn't been
8241                  * updated in a while, a substantial idle time will be missed,
8242                  * leading to an inflation after wake-up on the new rq.
8243                  *
8244                  * Estimate the missing time from the cfs_rq last_update_time
8245                  * and update sched_avg to improve the PELT continuity after
8246                  * migration.
8247                  */
8248                 migrate_se_pelt_lag(se);
8249         }
8250
8251         /* Tell new CPU we are migrated */
8252         se->avg.last_update_time = 0;
8253
8254         update_scan_period(p, new_cpu);
8255 }
8256
8257 static void task_dead_fair(struct task_struct *p)
8258 {
8259         remove_entity_load_avg(&p->se);
8260 }
8261
8262 static int
8263 balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
8264 {
8265         if (rq->nr_running)
8266                 return 1;
8267
8268         return newidle_balance(rq, rf) != 0;
8269 }
8270 #endif /* CONFIG_SMP */
8271
8272 static void set_next_buddy(struct sched_entity *se)
8273 {
8274         for_each_sched_entity(se) {
8275                 if (SCHED_WARN_ON(!se->on_rq))
8276                         return;
8277                 if (se_is_idle(se))
8278                         return;
8279                 cfs_rq_of(se)->next = se;
8280         }
8281 }
8282
8283 /*
8284  * Preempt the current task with a newly woken task if needed:
8285  */
8286 static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
8287 {
8288         struct task_struct *curr = rq->curr;
8289         struct sched_entity *se = &curr->se, *pse = &p->se;
8290         struct cfs_rq *cfs_rq = task_cfs_rq(curr);
8291         int cse_is_idle, pse_is_idle;
8292
8293         if (unlikely(se == pse))
8294                 return;
8295
8296         /*
8297          * This is possible from callers such as attach_tasks(), in which we
8298          * unconditionally wakeup_preempt() after an enqueue (which may have
8299          * lead to a throttle).  This both saves work and prevents false
8300          * next-buddy nomination below.
8301          */
8302         if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
8303                 return;
8304
8305         if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) {
8306                 set_next_buddy(pse);
8307         }
8308
8309         /*
8310          * We can come here with TIF_NEED_RESCHED already set from new task
8311          * wake up path.
8312          *
8313          * Note: this also catches the edge-case of curr being in a throttled
8314          * group (e.g. via set_curr_task), since update_curr() (in the
8315          * enqueue of curr) will have resulted in resched being set.  This
8316          * prevents us from potentially nominating it as a false LAST_BUDDY
8317          * below.
8318          */
8319         if (test_tsk_need_resched(curr))
8320                 return;
8321
8322         /* Idle tasks are by definition preempted by non-idle tasks. */
8323         if (unlikely(task_has_idle_policy(curr)) &&
8324             likely(!task_has_idle_policy(p)))
8325                 goto preempt;
8326
8327         /*
8328          * Batch and idle tasks do not preempt non-idle tasks (their preemption
8329          * is driven by the tick):
8330          */
8331         if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
8332                 return;
8333
8334         find_matching_se(&se, &pse);
8335         WARN_ON_ONCE(!pse);
8336
8337         cse_is_idle = se_is_idle(se);
8338         pse_is_idle = se_is_idle(pse);
8339
8340         /*
8341          * Preempt an idle group in favor of a non-idle group (and don't preempt
8342          * in the inverse case).
8343          */
8344         if (cse_is_idle && !pse_is_idle)
8345                 goto preempt;
8346         if (cse_is_idle != pse_is_idle)
8347                 return;
8348
8349         cfs_rq = cfs_rq_of(se);
8350         update_curr(cfs_rq);
8351
8352         /*
8353          * XXX pick_eevdf(cfs_rq) != se ?
8354          */
8355         if (pick_eevdf(cfs_rq) == pse)
8356                 goto preempt;
8357
8358         return;
8359
8360 preempt:
8361         resched_curr(rq);
8362 }
8363
8364 #ifdef CONFIG_SMP
8365 static struct task_struct *pick_task_fair(struct rq *rq)
8366 {
8367         struct sched_entity *se;
8368         struct cfs_rq *cfs_rq;
8369
8370 again:
8371         cfs_rq = &rq->cfs;
8372         if (!cfs_rq->nr_running)
8373                 return NULL;
8374
8375         do {
8376                 struct sched_entity *curr = cfs_rq->curr;
8377
8378                 /* When we pick for a remote RQ, we'll not have done put_prev_entity() */
8379                 if (curr) {
8380                         if (curr->on_rq)
8381                                 update_curr(cfs_rq);
8382                         else
8383                                 curr = NULL;
8384
8385                         if (unlikely(check_cfs_rq_runtime(cfs_rq)))
8386                                 goto again;
8387                 }
8388
8389                 se = pick_next_entity(cfs_rq);
8390                 cfs_rq = group_cfs_rq(se);
8391         } while (cfs_rq);
8392
8393         return task_of(se);
8394 }
8395 #endif
8396
8397 struct task_struct *
8398 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
8399 {
8400         struct cfs_rq *cfs_rq = &rq->cfs;
8401         struct sched_entity *se;
8402         struct task_struct *p;
8403         int new_tasks;
8404
8405 again:
8406         if (!sched_fair_runnable(rq))
8407                 goto idle;
8408
8409 #ifdef CONFIG_FAIR_GROUP_SCHED
8410         if (!prev || prev->sched_class != &fair_sched_class)
8411                 goto simple;
8412
8413         /*
8414          * Because of the set_next_buddy() in dequeue_task_fair() it is rather
8415          * likely that a next task is from the same cgroup as the current.
8416          *
8417          * Therefore attempt to avoid putting and setting the entire cgroup
8418          * hierarchy, only change the part that actually changes.
8419          */
8420
8421         do {
8422                 struct sched_entity *curr = cfs_rq->curr;
8423
8424                 /*
8425                  * Since we got here without doing put_prev_entity() we also
8426                  * have to consider cfs_rq->curr. If it is still a runnable
8427                  * entity, update_curr() will update its vruntime, otherwise
8428                  * forget we've ever seen it.
8429                  */
8430                 if (curr) {
8431                         if (curr->on_rq)
8432                                 update_curr(cfs_rq);
8433                         else
8434                                 curr = NULL;
8435
8436                         /*
8437                          * This call to check_cfs_rq_runtime() will do the
8438                          * throttle and dequeue its entity in the parent(s).
8439                          * Therefore the nr_running test will indeed
8440                          * be correct.
8441                          */
8442                         if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
8443                                 cfs_rq = &rq->cfs;
8444
8445                                 if (!cfs_rq->nr_running)
8446                                         goto idle;
8447
8448                                 goto simple;
8449                         }
8450                 }
8451
8452                 se = pick_next_entity(cfs_rq);
8453                 cfs_rq = group_cfs_rq(se);
8454         } while (cfs_rq);
8455
8456         p = task_of(se);
8457
8458         /*
8459          * Since we haven't yet done put_prev_entity and if the selected task
8460          * is a different task than we started out with, try and touch the
8461          * least amount of cfs_rqs.
8462          */
8463         if (prev != p) {
8464                 struct sched_entity *pse = &prev->se;
8465
8466                 while (!(cfs_rq = is_same_group(se, pse))) {
8467                         int se_depth = se->depth;
8468                         int pse_depth = pse->depth;
8469
8470                         if (se_depth <= pse_depth) {
8471                                 put_prev_entity(cfs_rq_of(pse), pse);
8472                                 pse = parent_entity(pse);
8473                         }
8474                         if (se_depth >= pse_depth) {
8475                                 set_next_entity(cfs_rq_of(se), se);
8476                                 se = parent_entity(se);
8477                         }
8478                 }
8479
8480                 put_prev_entity(cfs_rq, pse);
8481                 set_next_entity(cfs_rq, se);
8482         }
8483
8484         goto done;
8485 simple:
8486 #endif
8487         if (prev)
8488                 put_prev_task(rq, prev);
8489
8490         do {
8491                 se = pick_next_entity(cfs_rq);
8492                 set_next_entity(cfs_rq, se);
8493                 cfs_rq = group_cfs_rq(se);
8494         } while (cfs_rq);
8495
8496         p = task_of(se);
8497
8498 done: __maybe_unused;
8499 #ifdef CONFIG_SMP
8500         /*
8501          * Move the next running task to the front of
8502          * the list, so our cfs_tasks list becomes MRU
8503          * one.
8504          */
8505         list_move(&p->se.group_node, &rq->cfs_tasks);
8506 #endif
8507
8508         if (hrtick_enabled_fair(rq))
8509                 hrtick_start_fair(rq, p);
8510
8511         update_misfit_status(p, rq);
8512         sched_fair_update_stop_tick(rq, p);
8513
8514         return p;
8515
8516 idle:
8517         if (!rf)
8518                 return NULL;
8519
8520         new_tasks = newidle_balance(rq, rf);
8521
8522         /*
8523          * Because newidle_balance() releases (and re-acquires) rq->lock, it is
8524          * possible for any higher priority task to appear. In that case we
8525          * must re-start the pick_next_entity() loop.
8526          */
8527         if (new_tasks < 0)
8528                 return RETRY_TASK;
8529
8530         if (new_tasks > 0)
8531                 goto again;
8532
8533         /*
8534          * rq is about to be idle, check if we need to update the
8535          * lost_idle_time of clock_pelt
8536          */
8537         update_idle_rq_clock_pelt(rq);
8538
8539         return NULL;
8540 }
8541
8542 static struct task_struct *__pick_next_task_fair(struct rq *rq)
8543 {
8544         return pick_next_task_fair(rq, NULL, NULL);
8545 }
8546
8547 /*
8548  * Account for a descheduled task:
8549  */
8550 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
8551 {
8552         struct sched_entity *se = &prev->se;
8553         struct cfs_rq *cfs_rq;
8554
8555         for_each_sched_entity(se) {
8556                 cfs_rq = cfs_rq_of(se);
8557                 put_prev_entity(cfs_rq, se);
8558         }
8559 }
8560
8561 /*
8562  * sched_yield() is very simple
8563  */
8564 static void yield_task_fair(struct rq *rq)
8565 {
8566         struct task_struct *curr = rq->curr;
8567         struct cfs_rq *cfs_rq = task_cfs_rq(curr);
8568         struct sched_entity *se = &curr->se;
8569
8570         /*
8571          * Are we the only task in the tree?
8572          */
8573         if (unlikely(rq->nr_running == 1))
8574                 return;
8575
8576         clear_buddies(cfs_rq, se);
8577
8578         update_rq_clock(rq);
8579         /*
8580          * Update run-time statistics of the 'current'.
8581          */
8582         update_curr(cfs_rq);
8583         /*
8584          * Tell update_rq_clock() that we've just updated,
8585          * so we don't do microscopic update in schedule()
8586          * and double the fastpath cost.
8587          */
8588         rq_clock_skip_update(rq);
8589
8590         se->deadline += calc_delta_fair(se->slice, se);
8591 }
8592
8593 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
8594 {
8595         struct sched_entity *se = &p->se;
8596
8597         /* throttled hierarchies are not runnable */
8598         if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
8599                 return false;
8600
8601         /* Tell the scheduler that we'd really like pse to run next. */
8602         set_next_buddy(se);
8603
8604         yield_task_fair(rq);
8605
8606         return true;
8607 }
8608
8609 #ifdef CONFIG_SMP
8610 /**************************************************
8611  * Fair scheduling class load-balancing methods.
8612  *
8613  * BASICS
8614  *
8615  * The purpose of load-balancing is to achieve the same basic fairness the
8616  * per-CPU scheduler provides, namely provide a proportional amount of compute
8617  * time to each task. This is expressed in the following equation:
8618  *
8619  *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
8620  *
8621  * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
8622  * W_i,0 is defined as:
8623  *
8624  *   W_i,0 = \Sum_j w_i,j                                             (2)
8625  *
8626  * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
8627  * is derived from the nice value as per sched_prio_to_weight[].
8628  *
8629  * The weight average is an exponential decay average of the instantaneous
8630  * weight:
8631  *
8632  *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
8633  *
8634  * C_i is the compute capacity of CPU i, typically it is the
8635  * fraction of 'recent' time available for SCHED_OTHER task execution. But it
8636  * can also include other factors [XXX].
8637  *
8638  * To achieve this balance we define a measure of imbalance which follows
8639  * directly from (1):
8640  *
8641  *   imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j }    (4)
8642  *
8643  * We them move tasks around to minimize the imbalance. In the continuous
8644  * function space it is obvious this converges, in the discrete case we get
8645  * a few fun cases generally called infeasible weight scenarios.
8646  *
8647  * [XXX expand on:
8648  *     - infeasible weights;
8649  *     - local vs global optima in the discrete case. ]
8650  *
8651  *
8652  * SCHED DOMAINS
8653  *
8654  * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
8655  * for all i,j solution, we create a tree of CPUs that follows the hardware
8656  * topology where each level pairs two lower groups (or better). This results
8657  * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
8658  * tree to only the first of the previous level and we decrease the frequency
8659  * of load-balance at each level inv. proportional to the number of CPUs in
8660  * the groups.
8661  *
8662  * This yields:
8663  *
8664  *     log_2 n     1     n
8665  *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
8666  *     i = 0      2^i   2^i
8667  *                               `- size of each group
8668  *         |         |     `- number of CPUs doing load-balance
8669  *         |         `- freq
8670  *         `- sum over all levels
8671  *
8672  * Coupled with a limit on how many tasks we can migrate every balance pass,
8673  * this makes (5) the runtime complexity of the balancer.
8674  *
8675  * An important property here is that each CPU is still (indirectly) connected
8676  * to every other CPU in at most O(log n) steps:
8677  *
8678  * The adjacency matrix of the resulting graph is given by:
8679  *
8680  *             log_2 n
8681  *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
8682  *             k = 0
8683  *
8684  * And you'll find that:
8685  *
8686  *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
8687  *
8688  * Showing there's indeed a path between every CPU in at most O(log n) steps.
8689  * The task movement gives a factor of O(m), giving a convergence complexity
8690  * of:
8691  *
8692  *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
8693  *
8694  *
8695  * WORK CONSERVING
8696  *
8697  * In order to avoid CPUs going idle while there's still work to do, new idle
8698  * balancing is more aggressive and has the newly idle CPU iterate up the domain
8699  * tree itself instead of relying on other CPUs to bring it work.
8700  *
8701  * This adds some complexity to both (5) and (8) but it reduces the total idle
8702  * time.
8703  *
8704  * [XXX more?]
8705  *
8706  *
8707  * CGROUPS
8708  *
8709  * Cgroups make a horror show out of (2), instead of a simple sum we get:
8710  *
8711  *                                s_k,i
8712  *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
8713  *                                 S_k
8714  *
8715  * Where
8716  *
8717  *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
8718  *
8719  * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
8720  *
8721  * The big problem is S_k, its a global sum needed to compute a local (W_i)
8722  * property.
8723  *
8724  * [XXX write more on how we solve this.. _after_ merging pjt's patches that
8725  *      rewrite all of this once again.]
8726  */
8727
8728 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
8729
8730 enum fbq_type { regular, remote, all };
8731
8732 /*
8733  * 'group_type' describes the group of CPUs at the moment of load balancing.
8734  *
8735  * The enum is ordered by pulling priority, with the group with lowest priority
8736  * first so the group_type can simply be compared when selecting the busiest
8737  * group. See update_sd_pick_busiest().
8738  */
8739 enum group_type {
8740         /* The group has spare capacity that can be used to run more tasks.  */
8741         group_has_spare = 0,
8742         /*
8743          * The group is fully used and the tasks don't compete for more CPU
8744          * cycles. Nevertheless, some tasks might wait before running.
8745          */
8746         group_fully_busy,
8747         /*
8748          * One task doesn't fit with CPU's capacity and must be migrated to a
8749          * more powerful CPU.
8750          */
8751         group_misfit_task,
8752         /*
8753          * Balance SMT group that's fully busy. Can benefit from migration
8754          * a task on SMT with busy sibling to another CPU on idle core.
8755          */
8756         group_smt_balance,
8757         /*
8758          * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
8759          * and the task should be migrated to it instead of running on the
8760          * current CPU.
8761          */
8762         group_asym_packing,
8763         /*
8764          * The tasks' affinity constraints previously prevented the scheduler
8765          * from balancing the load across the system.
8766          */
8767         group_imbalanced,
8768         /*
8769          * The CPU is overloaded and can't provide expected CPU cycles to all
8770          * tasks.
8771          */
8772         group_overloaded
8773 };
8774
8775 enum migration_type {
8776         migrate_load = 0,
8777         migrate_util,
8778         migrate_task,
8779         migrate_misfit
8780 };
8781
8782 #define LBF_ALL_PINNED  0x01
8783 #define LBF_NEED_BREAK  0x02
8784 #define LBF_DST_PINNED  0x04
8785 #define LBF_SOME_PINNED 0x08
8786 #define LBF_ACTIVE_LB   0x10
8787
8788 struct lb_env {
8789         struct sched_domain     *sd;
8790
8791         struct rq               *src_rq;
8792         int                     src_cpu;
8793
8794         int                     dst_cpu;
8795         struct rq               *dst_rq;
8796
8797         struct cpumask          *dst_grpmask;
8798         int                     new_dst_cpu;
8799         enum cpu_idle_type      idle;
8800         long                    imbalance;
8801         /* The set of CPUs under consideration for load-balancing */
8802         struct cpumask          *cpus;
8803
8804         unsigned int            flags;
8805
8806         unsigned int            loop;
8807         unsigned int            loop_break;
8808         unsigned int            loop_max;
8809
8810         enum fbq_type           fbq_type;
8811         enum migration_type     migration_type;
8812         struct list_head        tasks;
8813 };
8814
8815 /*
8816  * Is this task likely cache-hot:
8817  */
8818 static int task_hot(struct task_struct *p, struct lb_env *env)
8819 {
8820         s64 delta;
8821
8822         lockdep_assert_rq_held(env->src_rq);
8823
8824         if (p->sched_class != &fair_sched_class)
8825                 return 0;
8826
8827         if (unlikely(task_has_idle_policy(p)))
8828                 return 0;
8829
8830         /* SMT siblings share cache */
8831         if (env->sd->flags & SD_SHARE_CPUCAPACITY)
8832                 return 0;
8833
8834         /*
8835          * Buddy candidates are cache hot:
8836          */
8837         if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
8838             (&p->se == cfs_rq_of(&p->se)->next))
8839                 return 1;
8840
8841         if (sysctl_sched_migration_cost == -1)
8842                 return 1;
8843
8844         /*
8845          * Don't migrate task if the task's cookie does not match
8846          * with the destination CPU's core cookie.
8847          */
8848         if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p))
8849                 return 1;
8850
8851         if (sysctl_sched_migration_cost == 0)
8852                 return 0;
8853
8854         delta = rq_clock_task(env->src_rq) - p->se.exec_start;
8855
8856         return delta < (s64)sysctl_sched_migration_cost;
8857 }
8858
8859 #ifdef CONFIG_NUMA_BALANCING
8860 /*
8861  * Returns 1, if task migration degrades locality
8862  * Returns 0, if task migration improves locality i.e migration preferred.
8863  * Returns -1, if task migration is not affected by locality.
8864  */
8865 static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
8866 {
8867         struct numa_group *numa_group = rcu_dereference(p->numa_group);
8868         unsigned long src_weight, dst_weight;
8869         int src_nid, dst_nid, dist;
8870
8871         if (!static_branch_likely(&sched_numa_balancing))
8872                 return -1;
8873
8874         if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
8875                 return -1;
8876
8877         src_nid = cpu_to_node(env->src_cpu);
8878         dst_nid = cpu_to_node(env->dst_cpu);
8879
8880         if (src_nid == dst_nid)
8881                 return -1;
8882
8883         /* Migrating away from the preferred node is always bad. */
8884         if (src_nid == p->numa_preferred_nid) {
8885                 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
8886                         return 1;
8887                 else
8888                         return -1;
8889         }
8890
8891         /* Encourage migration to the preferred node. */
8892         if (dst_nid == p->numa_preferred_nid)
8893                 return 0;
8894
8895         /* Leaving a core idle is often worse than degrading locality. */
8896         if (env->idle == CPU_IDLE)
8897                 return -1;
8898
8899         dist = node_distance(src_nid, dst_nid);
8900         if (numa_group) {
8901                 src_weight = group_weight(p, src_nid, dist);
8902                 dst_weight = group_weight(p, dst_nid, dist);
8903         } else {
8904                 src_weight = task_weight(p, src_nid, dist);
8905                 dst_weight = task_weight(p, dst_nid, dist);
8906         }
8907
8908         return dst_weight < src_weight;
8909 }
8910
8911 #else
8912 static inline int migrate_degrades_locality(struct task_struct *p,
8913                                              struct lb_env *env)
8914 {
8915         return -1;
8916 }
8917 #endif
8918
8919 /*
8920  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
8921  */
8922 static
8923 int can_migrate_task(struct task_struct *p, struct lb_env *env)
8924 {
8925         int tsk_cache_hot;
8926
8927         lockdep_assert_rq_held(env->src_rq);
8928
8929         /*
8930          * We do not migrate tasks that are:
8931          * 1) throttled_lb_pair, or
8932          * 2) cannot be migrated to this CPU due to cpus_ptr, or
8933          * 3) running (obviously), or
8934          * 4) are cache-hot on their current CPU.
8935          */
8936         if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
8937                 return 0;
8938
8939         /* Disregard pcpu kthreads; they are where they need to be. */
8940         if (kthread_is_per_cpu(p))
8941                 return 0;
8942
8943         if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
8944                 int cpu;
8945
8946                 schedstat_inc(p->stats.nr_failed_migrations_affine);
8947
8948                 env->flags |= LBF_SOME_PINNED;
8949
8950                 /*
8951                  * Remember if this task can be migrated to any other CPU in
8952                  * our sched_group. We may want to revisit it if we couldn't
8953                  * meet load balance goals by pulling other tasks on src_cpu.
8954                  *
8955                  * Avoid computing new_dst_cpu
8956                  * - for NEWLY_IDLE
8957                  * - if we have already computed one in current iteration
8958                  * - if it's an active balance
8959                  */
8960                 if (env->idle == CPU_NEWLY_IDLE ||
8961                     env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB))
8962                         return 0;
8963
8964                 /* Prevent to re-select dst_cpu via env's CPUs: */
8965                 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
8966                         if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
8967                                 env->flags |= LBF_DST_PINNED;
8968                                 env->new_dst_cpu = cpu;
8969                                 break;
8970                         }
8971                 }
8972
8973                 return 0;
8974         }
8975
8976         /* Record that we found at least one task that could run on dst_cpu */
8977         env->flags &= ~LBF_ALL_PINNED;
8978
8979         if (task_on_cpu(env->src_rq, p)) {
8980                 schedstat_inc(p->stats.nr_failed_migrations_running);
8981                 return 0;
8982         }
8983
8984         /*
8985          * Aggressive migration if:
8986          * 1) active balance
8987          * 2) destination numa is preferred
8988          * 3) task is cache cold, or
8989          * 4) too many balance attempts have failed.
8990          */
8991         if (env->flags & LBF_ACTIVE_LB)
8992                 return 1;
8993
8994         tsk_cache_hot = migrate_degrades_locality(p, env);
8995         if (tsk_cache_hot == -1)
8996                 tsk_cache_hot = task_hot(p, env);
8997
8998         if (tsk_cache_hot <= 0 ||
8999             env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
9000                 if (tsk_cache_hot == 1) {
9001                         schedstat_inc(env->sd->lb_hot_gained[env->idle]);
9002                         schedstat_inc(p->stats.nr_forced_migrations);
9003                 }
9004                 return 1;
9005         }
9006
9007         schedstat_inc(p->stats.nr_failed_migrations_hot);
9008         return 0;
9009 }
9010
9011 /*
9012  * detach_task() -- detach the task for the migration specified in env
9013  */
9014 static void detach_task(struct task_struct *p, struct lb_env *env)
9015 {
9016         lockdep_assert_rq_held(env->src_rq);
9017
9018         deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
9019         set_task_cpu(p, env->dst_cpu);
9020 }
9021
9022 /*
9023  * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
9024  * part of active balancing operations within "domain".
9025  *
9026  * Returns a task if successful and NULL otherwise.
9027  */
9028 static struct task_struct *detach_one_task(struct lb_env *env)
9029 {
9030         struct task_struct *p;
9031
9032         lockdep_assert_rq_held(env->src_rq);
9033
9034         list_for_each_entry_reverse(p,
9035                         &env->src_rq->cfs_tasks, se.group_node) {
9036                 if (!can_migrate_task(p, env))
9037                         continue;
9038
9039                 detach_task(p, env);
9040
9041                 /*
9042                  * Right now, this is only the second place where
9043                  * lb_gained[env->idle] is updated (other is detach_tasks)
9044                  * so we can safely collect stats here rather than
9045                  * inside detach_tasks().
9046                  */
9047                 schedstat_inc(env->sd->lb_gained[env->idle]);
9048                 return p;
9049         }
9050         return NULL;
9051 }
9052
9053 /*
9054  * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
9055  * busiest_rq, as part of a balancing operation within domain "sd".
9056  *
9057  * Returns number of detached tasks if successful and 0 otherwise.
9058  */
9059 static int detach_tasks(struct lb_env *env)
9060 {
9061         struct list_head *tasks = &env->src_rq->cfs_tasks;
9062         unsigned long util, load;
9063         struct task_struct *p;
9064         int detached = 0;
9065
9066         lockdep_assert_rq_held(env->src_rq);
9067
9068         /*
9069          * Source run queue has been emptied by another CPU, clear
9070          * LBF_ALL_PINNED flag as we will not test any task.
9071          */
9072         if (env->src_rq->nr_running <= 1) {
9073                 env->flags &= ~LBF_ALL_PINNED;
9074                 return 0;
9075         }
9076
9077         if (env->imbalance <= 0)
9078                 return 0;
9079
9080         while (!list_empty(tasks)) {
9081                 /*
9082                  * We don't want to steal all, otherwise we may be treated likewise,
9083                  * which could at worst lead to a livelock crash.
9084                  */
9085                 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
9086                         break;
9087
9088                 env->loop++;
9089                 /*
9090                  * We've more or less seen every task there is, call it quits
9091                  * unless we haven't found any movable task yet.
9092                  */
9093                 if (env->loop > env->loop_max &&
9094                     !(env->flags & LBF_ALL_PINNED))
9095                         break;
9096
9097                 /* take a breather every nr_migrate tasks */
9098                 if (env->loop > env->loop_break) {
9099                         env->loop_break += SCHED_NR_MIGRATE_BREAK;
9100                         env->flags |= LBF_NEED_BREAK;
9101                         break;
9102                 }
9103
9104                 p = list_last_entry(tasks, struct task_struct, se.group_node);
9105
9106                 if (!can_migrate_task(p, env))
9107                         goto next;
9108
9109                 switch (env->migration_type) {
9110                 case migrate_load:
9111                         /*
9112                          * Depending of the number of CPUs and tasks and the
9113                          * cgroup hierarchy, task_h_load() can return a null
9114                          * value. Make sure that env->imbalance decreases
9115                          * otherwise detach_tasks() will stop only after
9116                          * detaching up to loop_max tasks.
9117                          */
9118                         load = max_t(unsigned long, task_h_load(p), 1);
9119
9120                         if (sched_feat(LB_MIN) &&
9121                             load < 16 && !env->sd->nr_balance_failed)
9122                                 goto next;
9123
9124                         /*
9125                          * Make sure that we don't migrate too much load.
9126                          * Nevertheless, let relax the constraint if
9127                          * scheduler fails to find a good waiting task to
9128                          * migrate.
9129                          */
9130                         if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
9131                                 goto next;
9132
9133                         env->imbalance -= load;
9134                         break;
9135
9136                 case migrate_util:
9137                         util = task_util_est(p);
9138
9139                         if (shr_bound(util, env->sd->nr_balance_failed) > env->imbalance)
9140                                 goto next;
9141
9142                         env->imbalance -= util;
9143                         break;
9144
9145                 case migrate_task:
9146                         env->imbalance--;
9147                         break;
9148
9149                 case migrate_misfit:
9150                         /* This is not a misfit task */
9151                         if (task_fits_cpu(p, env->src_cpu))
9152                                 goto next;
9153
9154                         env->imbalance = 0;
9155                         break;
9156                 }
9157
9158                 detach_task(p, env);
9159                 list_add(&p->se.group_node, &env->tasks);
9160
9161                 detached++;
9162
9163 #ifdef CONFIG_PREEMPTION
9164                 /*
9165                  * NEWIDLE balancing is a source of latency, so preemptible
9166                  * kernels will stop after the first task is detached to minimize
9167                  * the critical section.
9168                  */
9169                 if (env->idle == CPU_NEWLY_IDLE)
9170                         break;
9171 #endif
9172
9173                 /*
9174                  * We only want to steal up to the prescribed amount of
9175                  * load/util/tasks.
9176                  */
9177                 if (env->imbalance <= 0)
9178                         break;
9179
9180                 continue;
9181 next:
9182                 list_move(&p->se.group_node, tasks);
9183         }
9184
9185         /*
9186          * Right now, this is one of only two places we collect this stat
9187          * so we can safely collect detach_one_task() stats here rather
9188          * than inside detach_one_task().
9189          */
9190         schedstat_add(env->sd->lb_gained[env->idle], detached);
9191
9192         return detached;
9193 }
9194
9195 /*
9196  * attach_task() -- attach the task detached by detach_task() to its new rq.
9197  */
9198 static void attach_task(struct rq *rq, struct task_struct *p)
9199 {
9200         lockdep_assert_rq_held(rq);
9201
9202         WARN_ON_ONCE(task_rq(p) != rq);
9203         activate_task(rq, p, ENQUEUE_NOCLOCK);
9204         wakeup_preempt(rq, p, 0);
9205 }
9206
9207 /*
9208  * attach_one_task() -- attaches the task returned from detach_one_task() to
9209  * its new rq.
9210  */
9211 static void attach_one_task(struct rq *rq, struct task_struct *p)
9212 {
9213         struct rq_flags rf;
9214
9215         rq_lock(rq, &rf);
9216         update_rq_clock(rq);
9217         attach_task(rq, p);
9218         rq_unlock(rq, &rf);
9219 }
9220
9221 /*
9222  * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
9223  * new rq.
9224  */
9225 static void attach_tasks(struct lb_env *env)
9226 {
9227         struct list_head *tasks = &env->tasks;
9228         struct task_struct *p;
9229         struct rq_flags rf;
9230
9231         rq_lock(env->dst_rq, &rf);
9232         update_rq_clock(env->dst_rq);
9233
9234         while (!list_empty(tasks)) {
9235                 p = list_first_entry(tasks, struct task_struct, se.group_node);
9236                 list_del_init(&p->se.group_node);
9237
9238                 attach_task(env->dst_rq, p);
9239         }
9240
9241         rq_unlock(env->dst_rq, &rf);
9242 }
9243
9244 #ifdef CONFIG_NO_HZ_COMMON
9245 static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
9246 {
9247         if (cfs_rq->avg.load_avg)
9248                 return true;
9249
9250         if (cfs_rq->avg.util_avg)
9251                 return true;
9252
9253         return false;
9254 }
9255
9256 static inline bool others_have_blocked(struct rq *rq)
9257 {
9258         if (cpu_util_rt(rq))
9259                 return true;
9260
9261         if (cpu_util_dl(rq))
9262                 return true;
9263
9264         if (thermal_load_avg(rq))
9265                 return true;
9266
9267         if (cpu_util_irq(rq))
9268                 return true;
9269
9270         return false;
9271 }
9272
9273 static inline void update_blocked_load_tick(struct rq *rq)
9274 {
9275         WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
9276 }
9277
9278 static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
9279 {
9280         if (!has_blocked)
9281                 rq->has_blocked_load = 0;
9282 }
9283 #else
9284 static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
9285 static inline bool others_have_blocked(struct rq *rq) { return false; }
9286 static inline void update_blocked_load_tick(struct rq *rq) {}
9287 static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
9288 #endif
9289
9290 static bool __update_blocked_others(struct rq *rq, bool *done)
9291 {
9292         const struct sched_class *curr_class;
9293         u64 now = rq_clock_pelt(rq);
9294         unsigned long thermal_pressure;
9295         bool decayed;
9296
9297         /*
9298          * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
9299          * DL and IRQ signals have been updated before updating CFS.
9300          */
9301         curr_class = rq->curr->sched_class;
9302
9303         thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
9304
9305         decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
9306                   update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
9307                   update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
9308                   update_irq_load_avg(rq, 0);
9309
9310         if (others_have_blocked(rq))
9311                 *done = false;
9312
9313         return decayed;
9314 }
9315
9316 #ifdef CONFIG_FAIR_GROUP_SCHED
9317
9318 static bool __update_blocked_fair(struct rq *rq, bool *done)
9319 {
9320         struct cfs_rq *cfs_rq, *pos;
9321         bool decayed = false;
9322         int cpu = cpu_of(rq);
9323
9324         /*
9325          * Iterates the task_group tree in a bottom up fashion, see
9326          * list_add_leaf_cfs_rq() for details.
9327          */
9328         for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
9329                 struct sched_entity *se;
9330
9331                 if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
9332                         update_tg_load_avg(cfs_rq);
9333
9334                         if (cfs_rq->nr_running == 0)
9335                                 update_idle_cfs_rq_clock_pelt(cfs_rq);
9336
9337                         if (cfs_rq == &rq->cfs)
9338                                 decayed = true;
9339                 }
9340
9341                 /* Propagate pending load changes to the parent, if any: */
9342                 se = cfs_rq->tg->se[cpu];
9343                 if (se && !skip_blocked_update(se))
9344                         update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
9345
9346                 /*
9347                  * There can be a lot of idle CPU cgroups.  Don't let fully
9348                  * decayed cfs_rqs linger on the list.
9349                  */
9350                 if (cfs_rq_is_decayed(cfs_rq))
9351                         list_del_leaf_cfs_rq(cfs_rq);
9352
9353                 /* Don't need periodic decay once load/util_avg are null */
9354                 if (cfs_rq_has_blocked(cfs_rq))
9355                         *done = false;
9356         }
9357
9358         return decayed;
9359 }
9360
9361 /*
9362  * Compute the hierarchical load factor for cfs_rq and all its ascendants.
9363  * This needs to be done in a top-down fashion because the load of a child
9364  * group is a fraction of its parents load.
9365  */
9366 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
9367 {
9368         struct rq *rq = rq_of(cfs_rq);
9369         struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
9370         unsigned long now = jiffies;
9371         unsigned long load;
9372
9373         if (cfs_rq->last_h_load_update == now)
9374                 return;
9375
9376         WRITE_ONCE(cfs_rq->h_load_next, NULL);
9377         for_each_sched_entity(se) {
9378                 cfs_rq = cfs_rq_of(se);
9379                 WRITE_ONCE(cfs_rq->h_load_next, se);
9380                 if (cfs_rq->last_h_load_update == now)
9381                         break;
9382         }
9383
9384         if (!se) {
9385                 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
9386                 cfs_rq->last_h_load_update = now;
9387         }
9388
9389         while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
9390                 load = cfs_rq->h_load;
9391                 load = div64_ul(load * se->avg.load_avg,
9392                         cfs_rq_load_avg(cfs_rq) + 1);
9393                 cfs_rq = group_cfs_rq(se);
9394                 cfs_rq->h_load = load;
9395                 cfs_rq->last_h_load_update = now;
9396         }
9397 }
9398
9399 static unsigned long task_h_load(struct task_struct *p)
9400 {
9401         struct cfs_rq *cfs_rq = task_cfs_rq(p);
9402
9403         update_cfs_rq_h_load(cfs_rq);
9404         return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
9405                         cfs_rq_load_avg(cfs_rq) + 1);
9406 }
9407 #else
9408 static bool __update_blocked_fair(struct rq *rq, bool *done)
9409 {
9410         struct cfs_rq *cfs_rq = &rq->cfs;
9411         bool decayed;
9412
9413         decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
9414         if (cfs_rq_has_blocked(cfs_rq))
9415                 *done = false;
9416
9417         return decayed;
9418 }
9419
9420 static unsigned long task_h_load(struct task_struct *p)
9421 {
9422         return p->se.avg.load_avg;
9423 }
9424 #endif
9425
9426 static void update_blocked_averages(int cpu)
9427 {
9428         bool decayed = false, done = true;
9429         struct rq *rq = cpu_rq(cpu);
9430         struct rq_flags rf;
9431
9432         rq_lock_irqsave(rq, &rf);
9433         update_blocked_load_tick(rq);
9434         update_rq_clock(rq);
9435
9436         decayed |= __update_blocked_others(rq, &done);
9437         decayed |= __update_blocked_fair(rq, &done);
9438
9439         update_blocked_load_status(rq, !done);
9440         if (decayed)
9441                 cpufreq_update_util(rq, 0);
9442         rq_unlock_irqrestore(rq, &rf);
9443 }
9444
9445 /********** Helpers for find_busiest_group ************************/
9446
9447 /*
9448  * sg_lb_stats - stats of a sched_group required for load_balancing
9449  */
9450 struct sg_lb_stats {
9451         unsigned long avg_load; /*Avg load across the CPUs of the group */
9452         unsigned long group_load; /* Total load over the CPUs of the group */
9453         unsigned long group_capacity;
9454         unsigned long group_util; /* Total utilization over the CPUs of the group */
9455         unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
9456         unsigned int sum_nr_running; /* Nr of tasks running in the group */
9457         unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
9458         unsigned int idle_cpus;
9459         unsigned int group_weight;
9460         enum group_type group_type;
9461         unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
9462         unsigned int group_smt_balance;  /* Task on busy SMT be moved */
9463         unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
9464 #ifdef CONFIG_NUMA_BALANCING
9465         unsigned int nr_numa_running;
9466         unsigned int nr_preferred_running;
9467 #endif
9468 };
9469
9470 /*
9471  * sd_lb_stats - Structure to store the statistics of a sched_domain
9472  *               during load balancing.
9473  */
9474 struct sd_lb_stats {
9475         struct sched_group *busiest;    /* Busiest group in this sd */
9476         struct sched_group *local;      /* Local group in this sd */
9477         unsigned long total_load;       /* Total load of all groups in sd */
9478         unsigned long total_capacity;   /* Total capacity of all groups in sd */
9479         unsigned long avg_load; /* Average load across all groups in sd */
9480         unsigned int prefer_sibling; /* tasks should go to sibling first */
9481
9482         struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
9483         struct sg_lb_stats local_stat;  /* Statistics of the local group */
9484 };
9485
9486 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
9487 {
9488         /*
9489          * Skimp on the clearing to avoid duplicate work. We can avoid clearing
9490          * local_stat because update_sg_lb_stats() does a full clear/assignment.
9491          * We must however set busiest_stat::group_type and
9492          * busiest_stat::idle_cpus to the worst busiest group because
9493          * update_sd_pick_busiest() reads these before assignment.
9494          */
9495         *sds = (struct sd_lb_stats){
9496                 .busiest = NULL,
9497                 .local = NULL,
9498                 .total_load = 0UL,
9499                 .total_capacity = 0UL,
9500                 .busiest_stat = {
9501                         .idle_cpus = UINT_MAX,
9502                         .group_type = group_has_spare,
9503                 },
9504         };
9505 }
9506
9507 static unsigned long scale_rt_capacity(int cpu)
9508 {
9509         struct rq *rq = cpu_rq(cpu);
9510         unsigned long max = arch_scale_cpu_capacity(cpu);
9511         unsigned long used, free;
9512         unsigned long irq;
9513
9514         irq = cpu_util_irq(rq);
9515
9516         if (unlikely(irq >= max))
9517                 return 1;
9518
9519         /*
9520          * avg_rt.util_avg and avg_dl.util_avg track binary signals
9521          * (running and not running) with weights 0 and 1024 respectively.
9522          * avg_thermal.load_avg tracks thermal pressure and the weighted
9523          * average uses the actual delta max capacity(load).
9524          */
9525         used = cpu_util_rt(rq);
9526         used += cpu_util_dl(rq);
9527         used += thermal_load_avg(rq);
9528
9529         if (unlikely(used >= max))
9530                 return 1;
9531
9532         free = max - used;
9533
9534         return scale_irq_capacity(free, irq, max);
9535 }
9536
9537 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
9538 {
9539         unsigned long capacity = scale_rt_capacity(cpu);
9540         struct sched_group *sdg = sd->groups;
9541
9542         if (!capacity)
9543                 capacity = 1;
9544
9545         cpu_rq(cpu)->cpu_capacity = capacity;
9546         trace_sched_cpu_capacity_tp(cpu_rq(cpu));
9547
9548         sdg->sgc->capacity = capacity;
9549         sdg->sgc->min_capacity = capacity;
9550         sdg->sgc->max_capacity = capacity;
9551 }
9552
9553 void update_group_capacity(struct sched_domain *sd, int cpu)
9554 {
9555         struct sched_domain *child = sd->child;
9556         struct sched_group *group, *sdg = sd->groups;
9557         unsigned long capacity, min_capacity, max_capacity;
9558         unsigned long interval;
9559
9560         interval = msecs_to_jiffies(sd->balance_interval);
9561         interval = clamp(interval, 1UL, max_load_balance_interval);
9562         sdg->sgc->next_update = jiffies + interval;
9563
9564         if (!child) {
9565                 update_cpu_capacity(sd, cpu);
9566                 return;
9567         }
9568
9569         capacity = 0;
9570         min_capacity = ULONG_MAX;
9571         max_capacity = 0;
9572
9573         if (child->flags & SD_OVERLAP) {
9574                 /*
9575                  * SD_OVERLAP domains cannot assume that child groups
9576                  * span the current group.
9577                  */
9578
9579                 for_each_cpu(cpu, sched_group_span(sdg)) {
9580                         unsigned long cpu_cap = capacity_of(cpu);
9581
9582                         capacity += cpu_cap;
9583                         min_capacity = min(cpu_cap, min_capacity);
9584                         max_capacity = max(cpu_cap, max_capacity);
9585                 }
9586         } else  {
9587                 /*
9588                  * !SD_OVERLAP domains can assume that child groups
9589                  * span the current group.
9590                  */
9591
9592                 group = child->groups;
9593                 do {
9594                         struct sched_group_capacity *sgc = group->sgc;
9595
9596                         capacity += sgc->capacity;
9597                         min_capacity = min(sgc->min_capacity, min_capacity);
9598                         max_capacity = max(sgc->max_capacity, max_capacity);
9599                         group = group->next;
9600                 } while (group != child->groups);
9601         }
9602
9603         sdg->sgc->capacity = capacity;
9604         sdg->sgc->min_capacity = min_capacity;
9605         sdg->sgc->max_capacity = max_capacity;
9606 }
9607
9608 /*
9609  * Check whether the capacity of the rq has been noticeably reduced by side
9610  * activity. The imbalance_pct is used for the threshold.
9611  * Return true is the capacity is reduced
9612  */
9613 static inline int
9614 check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
9615 {
9616         return ((rq->cpu_capacity * sd->imbalance_pct) <
9617                                 (arch_scale_cpu_capacity(cpu_of(rq)) * 100));
9618 }
9619
9620 /*
9621  * Check whether a rq has a misfit task and if it looks like we can actually
9622  * help that task: we can migrate the task to a CPU of higher capacity, or
9623  * the task's current CPU is heavily pressured.
9624  */
9625 static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
9626 {
9627         return rq->misfit_task_load &&
9628                 (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity ||
9629                  check_cpu_capacity(rq, sd));
9630 }
9631
9632 /*
9633  * Group imbalance indicates (and tries to solve) the problem where balancing
9634  * groups is inadequate due to ->cpus_ptr constraints.
9635  *
9636  * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
9637  * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
9638  * Something like:
9639  *
9640  *      { 0 1 2 3 } { 4 5 6 7 }
9641  *              *     * * *
9642  *
9643  * If we were to balance group-wise we'd place two tasks in the first group and
9644  * two tasks in the second group. Clearly this is undesired as it will overload
9645  * cpu 3 and leave one of the CPUs in the second group unused.
9646  *
9647  * The current solution to this issue is detecting the skew in the first group
9648  * by noticing the lower domain failed to reach balance and had difficulty
9649  * moving tasks due to affinity constraints.
9650  *
9651  * When this is so detected; this group becomes a candidate for busiest; see
9652  * update_sd_pick_busiest(). And calculate_imbalance() and
9653  * find_busiest_group() avoid some of the usual balance conditions to allow it
9654  * to create an effective group imbalance.
9655  *
9656  * This is a somewhat tricky proposition since the next run might not find the
9657  * group imbalance and decide the groups need to be balanced again. A most
9658  * subtle and fragile situation.
9659  */
9660
9661 static inline int sg_imbalanced(struct sched_group *group)
9662 {
9663         return group->sgc->imbalance;
9664 }
9665
9666 /*
9667  * group_has_capacity returns true if the group has spare capacity that could
9668  * be used by some tasks.
9669  * We consider that a group has spare capacity if the number of task is
9670  * smaller than the number of CPUs or if the utilization is lower than the
9671  * available capacity for CFS tasks.
9672  * For the latter, we use a threshold to stabilize the state, to take into
9673  * account the variance of the tasks' load and to return true if the available
9674  * capacity in meaningful for the load balancer.
9675  * As an example, an available capacity of 1% can appear but it doesn't make
9676  * any benefit for the load balance.
9677  */
9678 static inline bool
9679 group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
9680 {
9681         if (sgs->sum_nr_running < sgs->group_weight)
9682                 return true;
9683
9684         if ((sgs->group_capacity * imbalance_pct) <
9685                         (sgs->group_runnable * 100))
9686                 return false;
9687
9688         if ((sgs->group_capacity * 100) >
9689                         (sgs->group_util * imbalance_pct))
9690                 return true;
9691
9692         return false;
9693 }
9694
9695 /*
9696  *  group_is_overloaded returns true if the group has more tasks than it can
9697  *  handle.
9698  *  group_is_overloaded is not equals to !group_has_capacity because a group
9699  *  with the exact right number of tasks, has no more spare capacity but is not
9700  *  overloaded so both group_has_capacity and group_is_overloaded return
9701  *  false.
9702  */
9703 static inline bool
9704 group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
9705 {
9706         if (sgs->sum_nr_running <= sgs->group_weight)
9707                 return false;
9708
9709         if ((sgs->group_capacity * 100) <
9710                         (sgs->group_util * imbalance_pct))
9711                 return true;
9712
9713         if ((sgs->group_capacity * imbalance_pct) <
9714                         (sgs->group_runnable * 100))
9715                 return true;
9716
9717         return false;
9718 }
9719
9720 static inline enum
9721 group_type group_classify(unsigned int imbalance_pct,
9722                           struct sched_group *group,
9723                           struct sg_lb_stats *sgs)
9724 {
9725         if (group_is_overloaded(imbalance_pct, sgs))
9726                 return group_overloaded;
9727
9728         if (sg_imbalanced(group))
9729                 return group_imbalanced;
9730
9731         if (sgs->group_asym_packing)
9732                 return group_asym_packing;
9733
9734         if (sgs->group_smt_balance)
9735                 return group_smt_balance;
9736
9737         if (sgs->group_misfit_task_load)
9738                 return group_misfit_task;
9739
9740         if (!group_has_capacity(imbalance_pct, sgs))
9741                 return group_fully_busy;
9742
9743         return group_has_spare;
9744 }
9745
9746 /**
9747  * sched_use_asym_prio - Check whether asym_packing priority must be used
9748  * @sd:         The scheduling domain of the load balancing
9749  * @cpu:        A CPU
9750  *
9751  * Always use CPU priority when balancing load between SMT siblings. When
9752  * balancing load between cores, it is not sufficient that @cpu is idle. Only
9753  * use CPU priority if the whole core is idle.
9754  *
9755  * Returns: True if the priority of @cpu must be followed. False otherwise.
9756  */
9757 static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
9758 {
9759         if (!(sd->flags & SD_ASYM_PACKING))
9760                 return false;
9761
9762         if (!sched_smt_active())
9763                 return true;
9764
9765         return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu);
9766 }
9767
9768 static inline bool sched_asym(struct sched_domain *sd, int dst_cpu, int src_cpu)
9769 {
9770         /*
9771          * First check if @dst_cpu can do asym_packing load balance. Only do it
9772          * if it has higher priority than @src_cpu.
9773          */
9774         return sched_use_asym_prio(sd, dst_cpu) &&
9775                 sched_asym_prefer(dst_cpu, src_cpu);
9776 }
9777
9778 /**
9779  * sched_group_asym - Check if the destination CPU can do asym_packing balance
9780  * @env:        The load balancing environment
9781  * @sgs:        Load-balancing statistics of the candidate busiest group
9782  * @group:      The candidate busiest group
9783  *
9784  * @env::dst_cpu can do asym_packing if it has higher priority than the
9785  * preferred CPU of @group.
9786  *
9787  * Return: true if @env::dst_cpu can do with asym_packing load balance. False
9788  * otherwise.
9789  */
9790 static inline bool
9791 sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group)
9792 {
9793         /*
9794          * CPU priorities do not make sense for SMT cores with more than one
9795          * busy sibling.
9796          */
9797         if ((group->flags & SD_SHARE_CPUCAPACITY) &&
9798             (sgs->group_weight - sgs->idle_cpus != 1))
9799                 return false;
9800
9801         return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu);
9802 }
9803
9804 /* One group has more than one SMT CPU while the other group does not */
9805 static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1,
9806                                     struct sched_group *sg2)
9807 {
9808         if (!sg1 || !sg2)
9809                 return false;
9810
9811         return (sg1->flags & SD_SHARE_CPUCAPACITY) !=
9812                 (sg2->flags & SD_SHARE_CPUCAPACITY);
9813 }
9814
9815 static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
9816                                struct sched_group *group)
9817 {
9818         if (env->idle == CPU_NOT_IDLE)
9819                 return false;
9820
9821         /*
9822          * For SMT source group, it is better to move a task
9823          * to a CPU that doesn't have multiple tasks sharing its CPU capacity.
9824          * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY
9825          * will not be on.
9826          */
9827         if (group->flags & SD_SHARE_CPUCAPACITY &&
9828             sgs->sum_h_nr_running > 1)
9829                 return true;
9830
9831         return false;
9832 }
9833
9834 static inline long sibling_imbalance(struct lb_env *env,
9835                                     struct sd_lb_stats *sds,
9836                                     struct sg_lb_stats *busiest,
9837                                     struct sg_lb_stats *local)
9838 {
9839         int ncores_busiest, ncores_local;
9840         long imbalance;
9841
9842         if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running)
9843                 return 0;
9844
9845         ncores_busiest = sds->busiest->cores;
9846         ncores_local = sds->local->cores;
9847
9848         if (ncores_busiest == ncores_local) {
9849                 imbalance = busiest->sum_nr_running;
9850                 lsub_positive(&imbalance, local->sum_nr_running);
9851                 return imbalance;
9852         }
9853
9854         /* Balance such that nr_running/ncores ratio are same on both groups */
9855         imbalance = ncores_local * busiest->sum_nr_running;
9856         lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running);
9857         /* Normalize imbalance and do rounding on normalization */
9858         imbalance = 2 * imbalance + ncores_local + ncores_busiest;
9859         imbalance /= ncores_local + ncores_busiest;
9860
9861         /* Take advantage of resource in an empty sched group */
9862         if (imbalance <= 1 && local->sum_nr_running == 0 &&
9863             busiest->sum_nr_running > 1)
9864                 imbalance = 2;
9865
9866         return imbalance;
9867 }
9868
9869 static inline bool
9870 sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
9871 {
9872         /*
9873          * When there is more than 1 task, the group_overloaded case already
9874          * takes care of cpu with reduced capacity
9875          */
9876         if (rq->cfs.h_nr_running != 1)
9877                 return false;
9878
9879         return check_cpu_capacity(rq, sd);
9880 }
9881
9882 /**
9883  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
9884  * @env: The load balancing environment.
9885  * @sds: Load-balancing data with statistics of the local group.
9886  * @group: sched_group whose statistics are to be updated.
9887  * @sgs: variable to hold the statistics for this group.
9888  * @sg_status: Holds flag indicating the status of the sched_group
9889  */
9890 static inline void update_sg_lb_stats(struct lb_env *env,
9891                                       struct sd_lb_stats *sds,
9892                                       struct sched_group *group,
9893                                       struct sg_lb_stats *sgs,
9894                                       int *sg_status)
9895 {
9896         int i, nr_running, local_group;
9897
9898         memset(sgs, 0, sizeof(*sgs));
9899
9900         local_group = group == sds->local;
9901
9902         for_each_cpu_and(i, sched_group_span(group), env->cpus) {
9903                 struct rq *rq = cpu_rq(i);
9904                 unsigned long load = cpu_load(rq);
9905
9906                 sgs->group_load += load;
9907                 sgs->group_util += cpu_util_cfs(i);
9908                 sgs->group_runnable += cpu_runnable(rq);
9909                 sgs->sum_h_nr_running += rq->cfs.h_nr_running;
9910
9911                 nr_running = rq->nr_running;
9912                 sgs->sum_nr_running += nr_running;
9913
9914                 if (nr_running > 1)
9915                         *sg_status |= SG_OVERLOAD;
9916
9917                 if (cpu_overutilized(i))
9918                         *sg_status |= SG_OVERUTILIZED;
9919
9920 #ifdef CONFIG_NUMA_BALANCING
9921                 sgs->nr_numa_running += rq->nr_numa_running;
9922                 sgs->nr_preferred_running += rq->nr_preferred_running;
9923 #endif
9924                 /*
9925                  * No need to call idle_cpu() if nr_running is not 0
9926                  */
9927                 if (!nr_running && idle_cpu(i)) {
9928                         sgs->idle_cpus++;
9929                         /* Idle cpu can't have misfit task */
9930                         continue;
9931                 }
9932
9933                 if (local_group)
9934                         continue;
9935
9936                 if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
9937                         /* Check for a misfit task on the cpu */
9938                         if (sgs->group_misfit_task_load < rq->misfit_task_load) {
9939                                 sgs->group_misfit_task_load = rq->misfit_task_load;
9940                                 *sg_status |= SG_OVERLOAD;
9941                         }
9942                 } else if ((env->idle != CPU_NOT_IDLE) &&
9943                            sched_reduced_capacity(rq, env->sd)) {
9944                         /* Check for a task running on a CPU with reduced capacity */
9945                         if (sgs->group_misfit_task_load < load)
9946                                 sgs->group_misfit_task_load = load;
9947                 }
9948         }
9949
9950         sgs->group_capacity = group->sgc->capacity;
9951
9952         sgs->group_weight = group->group_weight;
9953
9954         /* Check if dst CPU is idle and preferred to this group */
9955         if (!local_group && env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running &&
9956             sched_group_asym(env, sgs, group))
9957                 sgs->group_asym_packing = 1;
9958
9959         /* Check for loaded SMT group to be balanced to dst CPU */
9960         if (!local_group && smt_balance(env, sgs, group))
9961                 sgs->group_smt_balance = 1;
9962
9963         sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
9964
9965         /* Computing avg_load makes sense only when group is overloaded */
9966         if (sgs->group_type == group_overloaded)
9967                 sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
9968                                 sgs->group_capacity;
9969 }
9970
9971 /**
9972  * update_sd_pick_busiest - return 1 on busiest group
9973  * @env: The load balancing environment.
9974  * @sds: sched_domain statistics
9975  * @sg: sched_group candidate to be checked for being the busiest
9976  * @sgs: sched_group statistics
9977  *
9978  * Determine if @sg is a busier group than the previously selected
9979  * busiest group.
9980  *
9981  * Return: %true if @sg is a busier group than the previously selected
9982  * busiest group. %false otherwise.
9983  */
9984 static bool update_sd_pick_busiest(struct lb_env *env,
9985                                    struct sd_lb_stats *sds,
9986                                    struct sched_group *sg,
9987                                    struct sg_lb_stats *sgs)
9988 {
9989         struct sg_lb_stats *busiest = &sds->busiest_stat;
9990
9991         /* Make sure that there is at least one task to pull */
9992         if (!sgs->sum_h_nr_running)
9993                 return false;
9994
9995         /*
9996          * Don't try to pull misfit tasks we can't help.
9997          * We can use max_capacity here as reduction in capacity on some
9998          * CPUs in the group should either be possible to resolve
9999          * internally or be covered by avg_load imbalance (eventually).
10000          */
10001         if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
10002             (sgs->group_type == group_misfit_task) &&
10003             (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
10004              sds->local_stat.group_type != group_has_spare))
10005                 return false;
10006
10007         if (sgs->group_type > busiest->group_type)
10008                 return true;
10009
10010         if (sgs->group_type < busiest->group_type)
10011                 return false;
10012
10013         /*
10014          * The candidate and the current busiest group are the same type of
10015          * group. Let check which one is the busiest according to the type.
10016          */
10017
10018         switch (sgs->group_type) {
10019         case group_overloaded:
10020                 /* Select the overloaded group with highest avg_load. */
10021                 return sgs->avg_load > busiest->avg_load;
10022
10023         case group_imbalanced:
10024                 /*
10025                  * Select the 1st imbalanced group as we don't have any way to
10026                  * choose one more than another.
10027                  */
10028                 return false;
10029
10030         case group_asym_packing:
10031                 /* Prefer to move from lowest priority CPU's work */
10032                 return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu);
10033
10034         case group_misfit_task:
10035                 /*
10036                  * If we have more than one misfit sg go with the biggest
10037                  * misfit.
10038                  */
10039                 return sgs->group_misfit_task_load > busiest->group_misfit_task_load;
10040
10041         case group_smt_balance:
10042                 /*
10043                  * Check if we have spare CPUs on either SMT group to
10044                  * choose has spare or fully busy handling.
10045                  */
10046                 if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0)
10047                         goto has_spare;
10048
10049                 fallthrough;
10050
10051         case group_fully_busy:
10052                 /*
10053                  * Select the fully busy group with highest avg_load. In
10054                  * theory, there is no need to pull task from such kind of
10055                  * group because tasks have all compute capacity that they need
10056                  * but we can still improve the overall throughput by reducing
10057                  * contention when accessing shared HW resources.
10058                  *
10059                  * XXX for now avg_load is not computed and always 0 so we
10060                  * select the 1st one, except if @sg is composed of SMT
10061                  * siblings.
10062                  */
10063
10064                 if (sgs->avg_load < busiest->avg_load)
10065                         return false;
10066
10067                 if (sgs->avg_load == busiest->avg_load) {
10068                         /*
10069                          * SMT sched groups need more help than non-SMT groups.
10070                          * If @sg happens to also be SMT, either choice is good.
10071                          */
10072                         if (sds->busiest->flags & SD_SHARE_CPUCAPACITY)
10073                                 return false;
10074                 }
10075
10076                 break;
10077
10078         case group_has_spare:
10079                 /*
10080                  * Do not pick sg with SMT CPUs over sg with pure CPUs,
10081                  * as we do not want to pull task off SMT core with one task
10082                  * and make the core idle.
10083                  */
10084                 if (smt_vs_nonsmt_groups(sds->busiest, sg)) {
10085                         if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1)
10086                                 return false;
10087                         else
10088                                 return true;
10089                 }
10090 has_spare:
10091
10092                 /*
10093                  * Select not overloaded group with lowest number of idle cpus
10094                  * and highest number of running tasks. We could also compare
10095                  * the spare capacity which is more stable but it can end up
10096                  * that the group has less spare capacity but finally more idle
10097                  * CPUs which means less opportunity to pull tasks.
10098                  */
10099                 if (sgs->idle_cpus > busiest->idle_cpus)
10100                         return false;
10101                 else if ((sgs->idle_cpus == busiest->idle_cpus) &&
10102                          (sgs->sum_nr_running <= busiest->sum_nr_running))
10103                         return false;
10104
10105                 break;
10106         }
10107
10108         /*
10109          * Candidate sg has no more than one task per CPU and has higher
10110          * per-CPU capacity. Migrating tasks to less capable CPUs may harm
10111          * throughput. Maximize throughput, power/energy consequences are not
10112          * considered.
10113          */
10114         if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
10115             (sgs->group_type <= group_fully_busy) &&
10116             (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu))))
10117                 return false;
10118
10119         return true;
10120 }
10121
10122 #ifdef CONFIG_NUMA_BALANCING
10123 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
10124 {
10125         if (sgs->sum_h_nr_running > sgs->nr_numa_running)
10126                 return regular;
10127         if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
10128                 return remote;
10129         return all;
10130 }
10131
10132 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
10133 {
10134         if (rq->nr_running > rq->nr_numa_running)
10135                 return regular;
10136         if (rq->nr_running > rq->nr_preferred_running)
10137                 return remote;
10138         return all;
10139 }
10140 #else
10141 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
10142 {
10143         return all;
10144 }
10145
10146 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
10147 {
10148         return regular;
10149 }
10150 #endif /* CONFIG_NUMA_BALANCING */
10151
10152
10153 struct sg_lb_stats;
10154
10155 /*
10156  * task_running_on_cpu - return 1 if @p is running on @cpu.
10157  */
10158
10159 static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
10160 {
10161         /* Task has no contribution or is new */
10162         if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
10163                 return 0;
10164
10165         if (task_on_rq_queued(p))
10166                 return 1;
10167
10168         return 0;
10169 }
10170
10171 /**
10172  * idle_cpu_without - would a given CPU be idle without p ?
10173  * @cpu: the processor on which idleness is tested.
10174  * @p: task which should be ignored.
10175  *
10176  * Return: 1 if the CPU would be idle. 0 otherwise.
10177  */
10178 static int idle_cpu_without(int cpu, struct task_struct *p)
10179 {
10180         struct rq *rq = cpu_rq(cpu);
10181
10182         if (rq->curr != rq->idle && rq->curr != p)
10183                 return 0;
10184
10185         /*
10186          * rq->nr_running can't be used but an updated version without the
10187          * impact of p on cpu must be used instead. The updated nr_running
10188          * be computed and tested before calling idle_cpu_without().
10189          */
10190
10191         if (rq->ttwu_pending)
10192                 return 0;
10193
10194         return 1;
10195 }
10196
10197 /*
10198  * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
10199  * @sd: The sched_domain level to look for idlest group.
10200  * @group: sched_group whose statistics are to be updated.
10201  * @sgs: variable to hold the statistics for this group.
10202  * @p: The task for which we look for the idlest group/CPU.
10203  */
10204 static inline void update_sg_wakeup_stats(struct sched_domain *sd,
10205                                           struct sched_group *group,
10206                                           struct sg_lb_stats *sgs,
10207                                           struct task_struct *p)
10208 {
10209         int i, nr_running;
10210
10211         memset(sgs, 0, sizeof(*sgs));
10212
10213         /* Assume that task can't fit any CPU of the group */
10214         if (sd->flags & SD_ASYM_CPUCAPACITY)
10215                 sgs->group_misfit_task_load = 1;
10216
10217         for_each_cpu(i, sched_group_span(group)) {
10218                 struct rq *rq = cpu_rq(i);
10219                 unsigned int local;
10220
10221                 sgs->group_load += cpu_load_without(rq, p);
10222                 sgs->group_util += cpu_util_without(i, p);
10223                 sgs->group_runnable += cpu_runnable_without(rq, p);
10224                 local = task_running_on_cpu(i, p);
10225                 sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
10226
10227                 nr_running = rq->nr_running - local;
10228                 sgs->sum_nr_running += nr_running;
10229
10230                 /*
10231                  * No need to call idle_cpu_without() if nr_running is not 0
10232                  */
10233                 if (!nr_running && idle_cpu_without(i, p))
10234                         sgs->idle_cpus++;
10235
10236                 /* Check if task fits in the CPU */
10237                 if (sd->flags & SD_ASYM_CPUCAPACITY &&
10238                     sgs->group_misfit_task_load &&
10239                     task_fits_cpu(p, i))
10240                         sgs->group_misfit_task_load = 0;
10241
10242         }
10243
10244         sgs->group_capacity = group->sgc->capacity;
10245
10246         sgs->group_weight = group->group_weight;
10247
10248         sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
10249
10250         /*
10251          * Computing avg_load makes sense only when group is fully busy or
10252          * overloaded
10253          */
10254         if (sgs->group_type == group_fully_busy ||
10255                 sgs->group_type == group_overloaded)
10256                 sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
10257                                 sgs->group_capacity;
10258 }
10259
10260 static bool update_pick_idlest(struct sched_group *idlest,
10261                                struct sg_lb_stats *idlest_sgs,
10262                                struct sched_group *group,
10263                                struct sg_lb_stats *sgs)
10264 {
10265         if (sgs->group_type < idlest_sgs->group_type)
10266                 return true;
10267
10268         if (sgs->group_type > idlest_sgs->group_type)
10269                 return false;
10270
10271         /*
10272          * The candidate and the current idlest group are the same type of
10273          * group. Let check which one is the idlest according to the type.
10274          */
10275
10276         switch (sgs->group_type) {
10277         case group_overloaded:
10278         case group_fully_busy:
10279                 /* Select the group with lowest avg_load. */
10280                 if (idlest_sgs->avg_load <= sgs->avg_load)
10281                         return false;
10282                 break;
10283
10284         case group_imbalanced:
10285         case group_asym_packing:
10286         case group_smt_balance:
10287                 /* Those types are not used in the slow wakeup path */
10288                 return false;
10289
10290         case group_misfit_task:
10291                 /* Select group with the highest max capacity */
10292                 if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
10293                         return false;
10294                 break;
10295
10296         case group_has_spare:
10297                 /* Select group with most idle CPUs */
10298                 if (idlest_sgs->idle_cpus > sgs->idle_cpus)
10299                         return false;
10300
10301                 /* Select group with lowest group_util */
10302                 if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
10303                         idlest_sgs->group_util <= sgs->group_util)
10304                         return false;
10305
10306                 break;
10307         }
10308
10309         return true;
10310 }
10311
10312 /*
10313  * find_idlest_group() finds and returns the least busy CPU group within the
10314  * domain.
10315  *
10316  * Assumes p is allowed on at least one CPU in sd.
10317  */
10318 static struct sched_group *
10319 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
10320 {
10321         struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
10322         struct sg_lb_stats local_sgs, tmp_sgs;
10323         struct sg_lb_stats *sgs;
10324         unsigned long imbalance;
10325         struct sg_lb_stats idlest_sgs = {
10326                         .avg_load = UINT_MAX,
10327                         .group_type = group_overloaded,
10328         };
10329
10330         do {
10331                 int local_group;
10332
10333                 /* Skip over this group if it has no CPUs allowed */
10334                 if (!cpumask_intersects(sched_group_span(group),
10335                                         p->cpus_ptr))
10336                         continue;
10337
10338                 /* Skip over this group if no cookie matched */
10339                 if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group))
10340                         continue;
10341
10342                 local_group = cpumask_test_cpu(this_cpu,
10343                                                sched_group_span(group));
10344
10345                 if (local_group) {
10346                         sgs = &local_sgs;
10347                         local = group;
10348                 } else {
10349                         sgs = &tmp_sgs;
10350                 }
10351
10352                 update_sg_wakeup_stats(sd, group, sgs, p);
10353
10354                 if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
10355                         idlest = group;
10356                         idlest_sgs = *sgs;
10357                 }
10358
10359         } while (group = group->next, group != sd->groups);
10360
10361
10362         /* There is no idlest group to push tasks to */
10363         if (!idlest)
10364                 return NULL;
10365
10366         /* The local group has been skipped because of CPU affinity */
10367         if (!local)
10368                 return idlest;
10369
10370         /*
10371          * If the local group is idler than the selected idlest group
10372          * don't try and push the task.
10373          */
10374         if (local_sgs.group_type < idlest_sgs.group_type)
10375                 return NULL;
10376
10377         /*
10378          * If the local group is busier than the selected idlest group
10379          * try and push the task.
10380          */
10381         if (local_sgs.group_type > idlest_sgs.group_type)
10382                 return idlest;
10383
10384         switch (local_sgs.group_type) {
10385         case group_overloaded:
10386         case group_fully_busy:
10387
10388                 /* Calculate allowed imbalance based on load */
10389                 imbalance = scale_load_down(NICE_0_LOAD) *
10390                                 (sd->imbalance_pct-100) / 100;
10391
10392                 /*
10393                  * When comparing groups across NUMA domains, it's possible for
10394                  * the local domain to be very lightly loaded relative to the
10395                  * remote domains but "imbalance" skews the comparison making
10396                  * remote CPUs look much more favourable. When considering
10397                  * cross-domain, add imbalance to the load on the remote node
10398                  * and consider staying local.
10399                  */
10400
10401                 if ((sd->flags & SD_NUMA) &&
10402                     ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
10403                         return NULL;
10404
10405                 /*
10406                  * If the local group is less loaded than the selected
10407                  * idlest group don't try and push any tasks.
10408                  */
10409                 if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
10410                         return NULL;
10411
10412                 if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
10413                         return NULL;
10414                 break;
10415
10416         case group_imbalanced:
10417         case group_asym_packing:
10418         case group_smt_balance:
10419                 /* Those type are not used in the slow wakeup path */
10420                 return NULL;
10421
10422         case group_misfit_task:
10423                 /* Select group with the highest max capacity */
10424                 if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
10425                         return NULL;
10426                 break;
10427
10428         case group_has_spare:
10429 #ifdef CONFIG_NUMA
10430                 if (sd->flags & SD_NUMA) {
10431                         int imb_numa_nr = sd->imb_numa_nr;
10432 #ifdef CONFIG_NUMA_BALANCING
10433                         int idlest_cpu;
10434                         /*
10435                          * If there is spare capacity at NUMA, try to select
10436                          * the preferred node
10437                          */
10438                         if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
10439                                 return NULL;
10440
10441                         idlest_cpu = cpumask_first(sched_group_span(idlest));
10442                         if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
10443                                 return idlest;
10444 #endif /* CONFIG_NUMA_BALANCING */
10445                         /*
10446                          * Otherwise, keep the task close to the wakeup source
10447                          * and improve locality if the number of running tasks
10448                          * would remain below threshold where an imbalance is
10449                          * allowed while accounting for the possibility the
10450                          * task is pinned to a subset of CPUs. If there is a
10451                          * real need of migration, periodic load balance will
10452                          * take care of it.
10453                          */
10454                         if (p->nr_cpus_allowed != NR_CPUS) {
10455                                 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
10456
10457                                 cpumask_and(cpus, sched_group_span(local), p->cpus_ptr);
10458                                 imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr);
10459                         }
10460
10461                         imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus);
10462                         if (!adjust_numa_imbalance(imbalance,
10463                                                    local_sgs.sum_nr_running + 1,
10464                                                    imb_numa_nr)) {
10465                                 return NULL;
10466                         }
10467                 }
10468 #endif /* CONFIG_NUMA */
10469
10470                 /*
10471                  * Select group with highest number of idle CPUs. We could also
10472                  * compare the utilization which is more stable but it can end
10473                  * up that the group has less spare capacity but finally more
10474                  * idle CPUs which means more opportunity to run task.
10475                  */
10476                 if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
10477                         return NULL;
10478                 break;
10479         }
10480
10481         return idlest;
10482 }
10483
10484 static void update_idle_cpu_scan(struct lb_env *env,
10485                                  unsigned long sum_util)
10486 {
10487         struct sched_domain_shared *sd_share;
10488         int llc_weight, pct;
10489         u64 x, y, tmp;
10490         /*
10491          * Update the number of CPUs to scan in LLC domain, which could
10492          * be used as a hint in select_idle_cpu(). The update of sd_share
10493          * could be expensive because it is within a shared cache line.
10494          * So the write of this hint only occurs during periodic load
10495          * balancing, rather than CPU_NEWLY_IDLE, because the latter
10496          * can fire way more frequently than the former.
10497          */
10498         if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE)
10499                 return;
10500
10501         llc_weight = per_cpu(sd_llc_size, env->dst_cpu);
10502         if (env->sd->span_weight != llc_weight)
10503                 return;
10504
10505         sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu));
10506         if (!sd_share)
10507                 return;
10508
10509         /*
10510          * The number of CPUs to search drops as sum_util increases, when
10511          * sum_util hits 85% or above, the scan stops.
10512          * The reason to choose 85% as the threshold is because this is the
10513          * imbalance_pct(117) when a LLC sched group is overloaded.
10514          *
10515          * let y = SCHED_CAPACITY_SCALE - p * x^2                       [1]
10516          * and y'= y / SCHED_CAPACITY_SCALE
10517          *
10518          * x is the ratio of sum_util compared to the CPU capacity:
10519          * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE)
10520          * y' is the ratio of CPUs to be scanned in the LLC domain,
10521          * and the number of CPUs to scan is calculated by:
10522          *
10523          * nr_scan = llc_weight * y'                                    [2]
10524          *
10525          * When x hits the threshold of overloaded, AKA, when
10526          * x = 100 / pct, y drops to 0. According to [1],
10527          * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000
10528          *
10529          * Scale x by SCHED_CAPACITY_SCALE:
10530          * x' = sum_util / llc_weight;                                  [3]
10531          *
10532          * and finally [1] becomes:
10533          * y = SCHED_CAPACITY_SCALE -
10534          *     x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE)            [4]
10535          *
10536          */
10537         /* equation [3] */
10538         x = sum_util;
10539         do_div(x, llc_weight);
10540
10541         /* equation [4] */
10542         pct = env->sd->imbalance_pct;
10543         tmp = x * x * pct * pct;
10544         do_div(tmp, 10000 * SCHED_CAPACITY_SCALE);
10545         tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE);
10546         y = SCHED_CAPACITY_SCALE - tmp;
10547
10548         /* equation [2] */
10549         y *= llc_weight;
10550         do_div(y, SCHED_CAPACITY_SCALE);
10551         if ((int)y != sd_share->nr_idle_scan)
10552                 WRITE_ONCE(sd_share->nr_idle_scan, (int)y);
10553 }
10554
10555 /**
10556  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
10557  * @env: The load balancing environment.
10558  * @sds: variable to hold the statistics for this sched_domain.
10559  */
10560
10561 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
10562 {
10563         struct sched_group *sg = env->sd->groups;
10564         struct sg_lb_stats *local = &sds->local_stat;
10565         struct sg_lb_stats tmp_sgs;
10566         unsigned long sum_util = 0;
10567         int sg_status = 0;
10568
10569         do {
10570                 struct sg_lb_stats *sgs = &tmp_sgs;
10571                 int local_group;
10572
10573                 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
10574                 if (local_group) {
10575                         sds->local = sg;
10576                         sgs = local;
10577
10578                         if (env->idle != CPU_NEWLY_IDLE ||
10579                             time_after_eq(jiffies, sg->sgc->next_update))
10580                                 update_group_capacity(env->sd, env->dst_cpu);
10581                 }
10582
10583                 update_sg_lb_stats(env, sds, sg, sgs, &sg_status);
10584
10585                 if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
10586                         sds->busiest = sg;
10587                         sds->busiest_stat = *sgs;
10588                 }
10589
10590                 /* Now, start updating sd_lb_stats */
10591                 sds->total_load += sgs->group_load;
10592                 sds->total_capacity += sgs->group_capacity;
10593
10594                 sum_util += sgs->group_util;
10595                 sg = sg->next;
10596         } while (sg != env->sd->groups);
10597
10598         /*
10599          * Indicate that the child domain of the busiest group prefers tasks
10600          * go to a child's sibling domains first. NB the flags of a sched group
10601          * are those of the child domain.
10602          */
10603         if (sds->busiest)
10604                 sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING);
10605
10606
10607         if (env->sd->flags & SD_NUMA)
10608                 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
10609
10610         if (!env->sd->parent) {
10611                 struct root_domain *rd = env->dst_rq->rd;
10612
10613                 /* update overload indicator if we are at root domain */
10614                 WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
10615
10616                 /* Update over-utilization (tipping point, U >= 0) indicator */
10617                 WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
10618                 trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
10619         } else if (sg_status & SG_OVERUTILIZED) {
10620                 struct root_domain *rd = env->dst_rq->rd;
10621
10622                 WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
10623                 trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
10624         }
10625
10626         update_idle_cpu_scan(env, sum_util);
10627 }
10628
10629 /**
10630  * calculate_imbalance - Calculate the amount of imbalance present within the
10631  *                       groups of a given sched_domain during load balance.
10632  * @env: load balance environment
10633  * @sds: statistics of the sched_domain whose imbalance is to be calculated.
10634  */
10635 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
10636 {
10637         struct sg_lb_stats *local, *busiest;
10638
10639         local = &sds->local_stat;
10640         busiest = &sds->busiest_stat;
10641
10642         if (busiest->group_type == group_misfit_task) {
10643                 if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
10644                         /* Set imbalance to allow misfit tasks to be balanced. */
10645                         env->migration_type = migrate_misfit;
10646                         env->imbalance = 1;
10647                 } else {
10648                         /*
10649                          * Set load imbalance to allow moving task from cpu
10650                          * with reduced capacity.
10651                          */
10652                         env->migration_type = migrate_load;
10653                         env->imbalance = busiest->group_misfit_task_load;
10654                 }
10655                 return;
10656         }
10657
10658         if (busiest->group_type == group_asym_packing) {
10659                 /*
10660                  * In case of asym capacity, we will try to migrate all load to
10661                  * the preferred CPU.
10662                  */
10663                 env->migration_type = migrate_task;
10664                 env->imbalance = busiest->sum_h_nr_running;
10665                 return;
10666         }
10667
10668         if (busiest->group_type == group_smt_balance) {
10669                 /* Reduce number of tasks sharing CPU capacity */
10670                 env->migration_type = migrate_task;
10671                 env->imbalance = 1;
10672                 return;
10673         }
10674
10675         if (busiest->group_type == group_imbalanced) {
10676                 /*
10677                  * In the group_imb case we cannot rely on group-wide averages
10678                  * to ensure CPU-load equilibrium, try to move any task to fix
10679                  * the imbalance. The next load balance will take care of
10680                  * balancing back the system.
10681                  */
10682                 env->migration_type = migrate_task;
10683                 env->imbalance = 1;
10684                 return;
10685         }
10686
10687         /*
10688          * Try to use spare capacity of local group without overloading it or
10689          * emptying busiest.
10690          */
10691         if (local->group_type == group_has_spare) {
10692                 if ((busiest->group_type > group_fully_busy) &&
10693                     !(env->sd->flags & SD_SHARE_LLC)) {
10694                         /*
10695                          * If busiest is overloaded, try to fill spare
10696                          * capacity. This might end up creating spare capacity
10697                          * in busiest or busiest still being overloaded but
10698                          * there is no simple way to directly compute the
10699                          * amount of load to migrate in order to balance the
10700                          * system.
10701                          */
10702                         env->migration_type = migrate_util;
10703                         env->imbalance = max(local->group_capacity, local->group_util) -
10704                                          local->group_util;
10705
10706                         /*
10707                          * In some cases, the group's utilization is max or even
10708                          * higher than capacity because of migrations but the
10709                          * local CPU is (newly) idle. There is at least one
10710                          * waiting task in this overloaded busiest group. Let's
10711                          * try to pull it.
10712                          */
10713                         if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
10714                                 env->migration_type = migrate_task;
10715                                 env->imbalance = 1;
10716                         }
10717
10718                         return;
10719                 }
10720
10721                 if (busiest->group_weight == 1 || sds->prefer_sibling) {
10722                         /*
10723                          * When prefer sibling, evenly spread running tasks on
10724                          * groups.
10725                          */
10726                         env->migration_type = migrate_task;
10727                         env->imbalance = sibling_imbalance(env, sds, busiest, local);
10728                 } else {
10729
10730                         /*
10731                          * If there is no overload, we just want to even the number of
10732                          * idle cpus.
10733                          */
10734                         env->migration_type = migrate_task;
10735                         env->imbalance = max_t(long, 0,
10736                                                (local->idle_cpus - busiest->idle_cpus));
10737                 }
10738
10739 #ifdef CONFIG_NUMA
10740                 /* Consider allowing a small imbalance between NUMA groups */
10741                 if (env->sd->flags & SD_NUMA) {
10742                         env->imbalance = adjust_numa_imbalance(env->imbalance,
10743                                                                local->sum_nr_running + 1,
10744                                                                env->sd->imb_numa_nr);
10745                 }
10746 #endif
10747
10748                 /* Number of tasks to move to restore balance */
10749                 env->imbalance >>= 1;
10750
10751                 return;
10752         }
10753
10754         /*
10755          * Local is fully busy but has to take more load to relieve the
10756          * busiest group
10757          */
10758         if (local->group_type < group_overloaded) {
10759                 /*
10760                  * Local will become overloaded so the avg_load metrics are
10761                  * finally needed.
10762                  */
10763
10764                 local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
10765                                   local->group_capacity;
10766
10767                 /*
10768                  * If the local group is more loaded than the selected
10769                  * busiest group don't try to pull any tasks.
10770                  */
10771                 if (local->avg_load >= busiest->avg_load) {
10772                         env->imbalance = 0;
10773                         return;
10774                 }
10775
10776                 sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
10777                                 sds->total_capacity;
10778
10779                 /*
10780                  * If the local group is more loaded than the average system
10781                  * load, don't try to pull any tasks.
10782                  */
10783                 if (local->avg_load >= sds->avg_load) {
10784                         env->imbalance = 0;
10785                         return;
10786                 }
10787
10788         }
10789
10790         /*
10791          * Both group are or will become overloaded and we're trying to get all
10792          * the CPUs to the average_load, so we don't want to push ourselves
10793          * above the average load, nor do we wish to reduce the max loaded CPU
10794          * below the average load. At the same time, we also don't want to
10795          * reduce the group load below the group capacity. Thus we look for
10796          * the minimum possible imbalance.
10797          */
10798         env->migration_type = migrate_load;
10799         env->imbalance = min(
10800                 (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
10801                 (sds->avg_load - local->avg_load) * local->group_capacity
10802         ) / SCHED_CAPACITY_SCALE;
10803 }
10804
10805 /******* find_busiest_group() helpers end here *********************/
10806
10807 /*
10808  * Decision matrix according to the local and busiest group type:
10809  *
10810  * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
10811  * has_spare        nr_idle   balanced   N/A    N/A  balanced   balanced
10812  * fully_busy       nr_idle   nr_idle    N/A    N/A  balanced   balanced
10813  * misfit_task      force     N/A        N/A    N/A  N/A        N/A
10814  * asym_packing     force     force      N/A    N/A  force      force
10815  * imbalanced       force     force      N/A    N/A  force      force
10816  * overloaded       force     force      N/A    N/A  force      avg_load
10817  *
10818  * N/A :      Not Applicable because already filtered while updating
10819  *            statistics.
10820  * balanced : The system is balanced for these 2 groups.
10821  * force :    Calculate the imbalance as load migration is probably needed.
10822  * avg_load : Only if imbalance is significant enough.
10823  * nr_idle :  dst_cpu is not busy and the number of idle CPUs is quite
10824  *            different in groups.
10825  */
10826
10827 /**
10828  * find_busiest_group - Returns the busiest group within the sched_domain
10829  * if there is an imbalance.
10830  * @env: The load balancing environment.
10831  *
10832  * Also calculates the amount of runnable load which should be moved
10833  * to restore balance.
10834  *
10835  * Return:      - The busiest group if imbalance exists.
10836  */
10837 static struct sched_group *find_busiest_group(struct lb_env *env)
10838 {
10839         struct sg_lb_stats *local, *busiest;
10840         struct sd_lb_stats sds;
10841
10842         init_sd_lb_stats(&sds);
10843
10844         /*
10845          * Compute the various statistics relevant for load balancing at
10846          * this level.
10847          */
10848         update_sd_lb_stats(env, &sds);
10849
10850         /* There is no busy sibling group to pull tasks from */
10851         if (!sds.busiest)
10852                 goto out_balanced;
10853
10854         busiest = &sds.busiest_stat;
10855
10856         /* Misfit tasks should be dealt with regardless of the avg load */
10857         if (busiest->group_type == group_misfit_task)
10858                 goto force_balance;
10859
10860         if (sched_energy_enabled()) {
10861                 struct root_domain *rd = env->dst_rq->rd;
10862
10863                 if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
10864                         goto out_balanced;
10865         }
10866
10867         /* ASYM feature bypasses nice load balance check */
10868         if (busiest->group_type == group_asym_packing)
10869                 goto force_balance;
10870
10871         /*
10872          * If the busiest group is imbalanced the below checks don't
10873          * work because they assume all things are equal, which typically
10874          * isn't true due to cpus_ptr constraints and the like.
10875          */
10876         if (busiest->group_type == group_imbalanced)
10877                 goto force_balance;
10878
10879         local = &sds.local_stat;
10880         /*
10881          * If the local group is busier than the selected busiest group
10882          * don't try and pull any tasks.
10883          */
10884         if (local->group_type > busiest->group_type)
10885                 goto out_balanced;
10886
10887         /*
10888          * When groups are overloaded, use the avg_load to ensure fairness
10889          * between tasks.
10890          */
10891         if (local->group_type == group_overloaded) {
10892                 /*
10893                  * If the local group is more loaded than the selected
10894                  * busiest group don't try to pull any tasks.
10895                  */
10896                 if (local->avg_load >= busiest->avg_load)
10897                         goto out_balanced;
10898
10899                 /* XXX broken for overlapping NUMA groups */
10900                 sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
10901                                 sds.total_capacity;
10902
10903                 /*
10904                  * Don't pull any tasks if this group is already above the
10905                  * domain average load.
10906                  */
10907                 if (local->avg_load >= sds.avg_load)
10908                         goto out_balanced;
10909
10910                 /*
10911                  * If the busiest group is more loaded, use imbalance_pct to be
10912                  * conservative.
10913                  */
10914                 if (100 * busiest->avg_load <=
10915                                 env->sd->imbalance_pct * local->avg_load)
10916                         goto out_balanced;
10917         }
10918
10919         /*
10920          * Try to move all excess tasks to a sibling domain of the busiest
10921          * group's child domain.
10922          */
10923         if (sds.prefer_sibling && local->group_type == group_has_spare &&
10924             sibling_imbalance(env, &sds, busiest, local) > 1)
10925                 goto force_balance;
10926
10927         if (busiest->group_type != group_overloaded) {
10928                 if (env->idle == CPU_NOT_IDLE) {
10929                         /*
10930                          * If the busiest group is not overloaded (and as a
10931                          * result the local one too) but this CPU is already
10932                          * busy, let another idle CPU try to pull task.
10933                          */
10934                         goto out_balanced;
10935                 }
10936
10937                 if (busiest->group_type == group_smt_balance &&
10938                     smt_vs_nonsmt_groups(sds.local, sds.busiest)) {
10939                         /* Let non SMT CPU pull from SMT CPU sharing with sibling */
10940                         goto force_balance;
10941                 }
10942
10943                 if (busiest->group_weight > 1 &&
10944                     local->idle_cpus <= (busiest->idle_cpus + 1)) {
10945                         /*
10946                          * If the busiest group is not overloaded
10947                          * and there is no imbalance between this and busiest
10948                          * group wrt idle CPUs, it is balanced. The imbalance
10949                          * becomes significant if the diff is greater than 1
10950                          * otherwise we might end up to just move the imbalance
10951                          * on another group. Of course this applies only if
10952                          * there is more than 1 CPU per group.
10953                          */
10954                         goto out_balanced;
10955                 }
10956
10957                 if (busiest->sum_h_nr_running == 1) {
10958                         /*
10959                          * busiest doesn't have any tasks waiting to run
10960                          */
10961                         goto out_balanced;
10962                 }
10963         }
10964
10965 force_balance:
10966         /* Looks like there is an imbalance. Compute it */
10967         calculate_imbalance(env, &sds);
10968         return env->imbalance ? sds.busiest : NULL;
10969
10970 out_balanced:
10971         env->imbalance = 0;
10972         return NULL;
10973 }
10974
10975 /*
10976  * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
10977  */
10978 static struct rq *find_busiest_queue(struct lb_env *env,
10979                                      struct sched_group *group)
10980 {
10981         struct rq *busiest = NULL, *rq;
10982         unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
10983         unsigned int busiest_nr = 0;
10984         int i;
10985
10986         for_each_cpu_and(i, sched_group_span(group), env->cpus) {
10987                 unsigned long capacity, load, util;
10988                 unsigned int nr_running;
10989                 enum fbq_type rt;
10990
10991                 rq = cpu_rq(i);
10992                 rt = fbq_classify_rq(rq);
10993
10994                 /*
10995                  * We classify groups/runqueues into three groups:
10996                  *  - regular: there are !numa tasks
10997                  *  - remote:  there are numa tasks that run on the 'wrong' node
10998                  *  - all:     there is no distinction
10999                  *
11000                  * In order to avoid migrating ideally placed numa tasks,
11001                  * ignore those when there's better options.
11002                  *
11003                  * If we ignore the actual busiest queue to migrate another
11004                  * task, the next balance pass can still reduce the busiest
11005                  * queue by moving tasks around inside the node.
11006                  *
11007                  * If we cannot move enough load due to this classification
11008                  * the next pass will adjust the group classification and
11009                  * allow migration of more tasks.
11010                  *
11011                  * Both cases only affect the total convergence complexity.
11012                  */
11013                 if (rt > env->fbq_type)
11014                         continue;
11015
11016                 nr_running = rq->cfs.h_nr_running;
11017                 if (!nr_running)
11018                         continue;
11019
11020                 capacity = capacity_of(i);
11021
11022                 /*
11023                  * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
11024                  * eventually lead to active_balancing high->low capacity.
11025                  * Higher per-CPU capacity is considered better than balancing
11026                  * average load.
11027                  */
11028                 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
11029                     !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
11030                     nr_running == 1)
11031                         continue;
11032
11033                 /*
11034                  * Make sure we only pull tasks from a CPU of lower priority
11035                  * when balancing between SMT siblings.
11036                  *
11037                  * If balancing between cores, let lower priority CPUs help
11038                  * SMT cores with more than one busy sibling.
11039                  */
11040                 if (sched_asym(env->sd, i, env->dst_cpu) && nr_running == 1)
11041                         continue;
11042
11043                 switch (env->migration_type) {
11044                 case migrate_load:
11045                         /*
11046                          * When comparing with load imbalance, use cpu_load()
11047                          * which is not scaled with the CPU capacity.
11048                          */
11049                         load = cpu_load(rq);
11050
11051                         if (nr_running == 1 && load > env->imbalance &&
11052                             !check_cpu_capacity(rq, env->sd))
11053                                 break;
11054
11055                         /*
11056                          * For the load comparisons with the other CPUs,
11057                          * consider the cpu_load() scaled with the CPU
11058                          * capacity, so that the load can be moved away
11059                          * from the CPU that is potentially running at a
11060                          * lower capacity.
11061                          *
11062                          * Thus we're looking for max(load_i / capacity_i),
11063                          * crosswise multiplication to rid ourselves of the
11064                          * division works out to:
11065                          * load_i * capacity_j > load_j * capacity_i;
11066                          * where j is our previous maximum.
11067                          */
11068                         if (load * busiest_capacity > busiest_load * capacity) {
11069                                 busiest_load = load;
11070                                 busiest_capacity = capacity;
11071                                 busiest = rq;
11072                         }
11073                         break;
11074
11075                 case migrate_util:
11076                         util = cpu_util_cfs_boost(i);
11077
11078                         /*
11079                          * Don't try to pull utilization from a CPU with one
11080                          * running task. Whatever its utilization, we will fail
11081                          * detach the task.
11082                          */
11083                         if (nr_running <= 1)
11084                                 continue;
11085
11086                         if (busiest_util < util) {
11087                                 busiest_util = util;
11088                                 busiest = rq;
11089                         }
11090                         break;
11091
11092                 case migrate_task:
11093                         if (busiest_nr < nr_running) {
11094                                 busiest_nr = nr_running;
11095                                 busiest = rq;
11096                         }
11097                         break;
11098
11099                 case migrate_misfit:
11100                         /*
11101                          * For ASYM_CPUCAPACITY domains with misfit tasks we
11102                          * simply seek the "biggest" misfit task.
11103                          */
11104                         if (rq->misfit_task_load > busiest_load) {
11105                                 busiest_load = rq->misfit_task_load;
11106                                 busiest = rq;
11107                         }
11108
11109                         break;
11110
11111                 }
11112         }
11113
11114         return busiest;
11115 }
11116
11117 /*
11118  * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
11119  * so long as it is large enough.
11120  */
11121 #define MAX_PINNED_INTERVAL     512
11122
11123 static inline bool
11124 asym_active_balance(struct lb_env *env)
11125 {
11126         /*
11127          * ASYM_PACKING needs to force migrate tasks from busy but lower
11128          * priority CPUs in order to pack all tasks in the highest priority
11129          * CPUs. When done between cores, do it only if the whole core if the
11130          * whole core is idle.
11131          *
11132          * If @env::src_cpu is an SMT core with busy siblings, let
11133          * the lower priority @env::dst_cpu help it. Do not follow
11134          * CPU priority.
11135          */
11136         return env->idle != CPU_NOT_IDLE && sched_use_asym_prio(env->sd, env->dst_cpu) &&
11137                (sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
11138                 !sched_use_asym_prio(env->sd, env->src_cpu));
11139 }
11140
11141 static inline bool
11142 imbalanced_active_balance(struct lb_env *env)
11143 {
11144         struct sched_domain *sd = env->sd;
11145
11146         /*
11147          * The imbalanced case includes the case of pinned tasks preventing a fair
11148          * distribution of the load on the system but also the even distribution of the
11149          * threads on a system with spare capacity
11150          */
11151         if ((env->migration_type == migrate_task) &&
11152             (sd->nr_balance_failed > sd->cache_nice_tries+2))
11153                 return 1;
11154
11155         return 0;
11156 }
11157
11158 static int need_active_balance(struct lb_env *env)
11159 {
11160         struct sched_domain *sd = env->sd;
11161
11162         if (asym_active_balance(env))
11163                 return 1;
11164
11165         if (imbalanced_active_balance(env))
11166                 return 1;
11167
11168         /*
11169          * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
11170          * It's worth migrating the task if the src_cpu's capacity is reduced
11171          * because of other sched_class or IRQs if more capacity stays
11172          * available on dst_cpu.
11173          */
11174         if ((env->idle != CPU_NOT_IDLE) &&
11175             (env->src_rq->cfs.h_nr_running == 1)) {
11176                 if ((check_cpu_capacity(env->src_rq, sd)) &&
11177                     (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
11178                         return 1;
11179         }
11180
11181         if (env->migration_type == migrate_misfit)
11182                 return 1;
11183
11184         return 0;
11185 }
11186
11187 static int active_load_balance_cpu_stop(void *data);
11188
11189 static int should_we_balance(struct lb_env *env)
11190 {
11191         struct cpumask *swb_cpus = this_cpu_cpumask_var_ptr(should_we_balance_tmpmask);
11192         struct sched_group *sg = env->sd->groups;
11193         int cpu, idle_smt = -1;
11194
11195         /*
11196          * Ensure the balancing environment is consistent; can happen
11197          * when the softirq triggers 'during' hotplug.
11198          */
11199         if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
11200                 return 0;
11201
11202         /*
11203          * In the newly idle case, we will allow all the CPUs
11204          * to do the newly idle load balance.
11205          *
11206          * However, we bail out if we already have tasks or a wakeup pending,
11207          * to optimize wakeup latency.
11208          */
11209         if (env->idle == CPU_NEWLY_IDLE) {
11210                 if (env->dst_rq->nr_running > 0 || env->dst_rq->ttwu_pending)
11211                         return 0;
11212                 return 1;
11213         }
11214
11215         cpumask_copy(swb_cpus, group_balance_mask(sg));
11216         /* Try to find first idle CPU */
11217         for_each_cpu_and(cpu, swb_cpus, env->cpus) {
11218                 if (!idle_cpu(cpu))
11219                         continue;
11220
11221                 /*
11222                  * Don't balance to idle SMT in busy core right away when
11223                  * balancing cores, but remember the first idle SMT CPU for
11224                  * later consideration.  Find CPU on an idle core first.
11225                  */
11226                 if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) {
11227                         if (idle_smt == -1)
11228                                 idle_smt = cpu;
11229                         /*
11230                          * If the core is not idle, and first SMT sibling which is
11231                          * idle has been found, then its not needed to check other
11232                          * SMT siblings for idleness:
11233                          */
11234 #ifdef CONFIG_SCHED_SMT
11235                         cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu));
11236 #endif
11237                         continue;
11238                 }
11239
11240                 /*
11241                  * Are we the first idle core in a non-SMT domain or higher,
11242                  * or the first idle CPU in a SMT domain?
11243                  */
11244                 return cpu == env->dst_cpu;
11245         }
11246
11247         /* Are we the first idle CPU with busy siblings? */
11248         if (idle_smt != -1)
11249                 return idle_smt == env->dst_cpu;
11250
11251         /* Are we the first CPU of this group ? */
11252         return group_balance_cpu(sg) == env->dst_cpu;
11253 }
11254
11255 /*
11256  * Check this_cpu to ensure it is balanced within domain. Attempt to move
11257  * tasks if there is an imbalance.
11258  */
11259 static int load_balance(int this_cpu, struct rq *this_rq,
11260                         struct sched_domain *sd, enum cpu_idle_type idle,
11261                         int *continue_balancing)
11262 {
11263         int ld_moved, cur_ld_moved, active_balance = 0;
11264         struct sched_domain *sd_parent = sd->parent;
11265         struct sched_group *group;
11266         struct rq *busiest;
11267         struct rq_flags rf;
11268         struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
11269         struct lb_env env = {
11270                 .sd             = sd,
11271                 .dst_cpu        = this_cpu,
11272                 .dst_rq         = this_rq,
11273                 .dst_grpmask    = group_balance_mask(sd->groups),
11274                 .idle           = idle,
11275                 .loop_break     = SCHED_NR_MIGRATE_BREAK,
11276                 .cpus           = cpus,
11277                 .fbq_type       = all,
11278                 .tasks          = LIST_HEAD_INIT(env.tasks),
11279         };
11280
11281         cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
11282
11283         schedstat_inc(sd->lb_count[idle]);
11284
11285 redo:
11286         if (!should_we_balance(&env)) {
11287                 *continue_balancing = 0;
11288                 goto out_balanced;
11289         }
11290
11291         group = find_busiest_group(&env);
11292         if (!group) {
11293                 schedstat_inc(sd->lb_nobusyg[idle]);
11294                 goto out_balanced;
11295         }
11296
11297         busiest = find_busiest_queue(&env, group);
11298         if (!busiest) {
11299                 schedstat_inc(sd->lb_nobusyq[idle]);
11300                 goto out_balanced;
11301         }
11302
11303         WARN_ON_ONCE(busiest == env.dst_rq);
11304
11305         schedstat_add(sd->lb_imbalance[idle], env.imbalance);
11306
11307         env.src_cpu = busiest->cpu;
11308         env.src_rq = busiest;
11309
11310         ld_moved = 0;
11311         /* Clear this flag as soon as we find a pullable task */
11312         env.flags |= LBF_ALL_PINNED;
11313         if (busiest->nr_running > 1) {
11314                 /*
11315                  * Attempt to move tasks. If find_busiest_group has found
11316                  * an imbalance but busiest->nr_running <= 1, the group is
11317                  * still unbalanced. ld_moved simply stays zero, so it is
11318                  * correctly treated as an imbalance.
11319                  */
11320                 env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
11321
11322 more_balance:
11323                 rq_lock_irqsave(busiest, &rf);
11324                 update_rq_clock(busiest);
11325
11326                 /*
11327                  * cur_ld_moved - load moved in current iteration
11328                  * ld_moved     - cumulative load moved across iterations
11329                  */
11330                 cur_ld_moved = detach_tasks(&env);
11331
11332                 /*
11333                  * We've detached some tasks from busiest_rq. Every
11334                  * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
11335                  * unlock busiest->lock, and we are able to be sure
11336                  * that nobody can manipulate the tasks in parallel.
11337                  * See task_rq_lock() family for the details.
11338                  */
11339
11340                 rq_unlock(busiest, &rf);
11341
11342                 if (cur_ld_moved) {
11343                         attach_tasks(&env);
11344                         ld_moved += cur_ld_moved;
11345                 }
11346
11347                 local_irq_restore(rf.flags);
11348
11349                 if (env.flags & LBF_NEED_BREAK) {
11350                         env.flags &= ~LBF_NEED_BREAK;
11351                         /* Stop if we tried all running tasks */
11352                         if (env.loop < busiest->nr_running)
11353                                 goto more_balance;
11354                 }
11355
11356                 /*
11357                  * Revisit (affine) tasks on src_cpu that couldn't be moved to
11358                  * us and move them to an alternate dst_cpu in our sched_group
11359                  * where they can run. The upper limit on how many times we
11360                  * iterate on same src_cpu is dependent on number of CPUs in our
11361                  * sched_group.
11362                  *
11363                  * This changes load balance semantics a bit on who can move
11364                  * load to a given_cpu. In addition to the given_cpu itself
11365                  * (or a ilb_cpu acting on its behalf where given_cpu is
11366                  * nohz-idle), we now have balance_cpu in a position to move
11367                  * load to given_cpu. In rare situations, this may cause
11368                  * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
11369                  * _independently_ and at _same_ time to move some load to
11370                  * given_cpu) causing excess load to be moved to given_cpu.
11371                  * This however should not happen so much in practice and
11372                  * moreover subsequent load balance cycles should correct the
11373                  * excess load moved.
11374                  */
11375                 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
11376
11377                         /* Prevent to re-select dst_cpu via env's CPUs */
11378                         __cpumask_clear_cpu(env.dst_cpu, env.cpus);
11379
11380                         env.dst_rq       = cpu_rq(env.new_dst_cpu);
11381                         env.dst_cpu      = env.new_dst_cpu;
11382                         env.flags       &= ~LBF_DST_PINNED;
11383                         env.loop         = 0;
11384                         env.loop_break   = SCHED_NR_MIGRATE_BREAK;
11385
11386                         /*
11387                          * Go back to "more_balance" rather than "redo" since we
11388                          * need to continue with same src_cpu.
11389                          */
11390                         goto more_balance;
11391                 }
11392
11393                 /*
11394                  * We failed to reach balance because of affinity.
11395                  */
11396                 if (sd_parent) {
11397                         int *group_imbalance = &sd_parent->groups->sgc->imbalance;
11398
11399                         if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
11400                                 *group_imbalance = 1;
11401                 }
11402
11403                 /* All tasks on this runqueue were pinned by CPU affinity */
11404                 if (unlikely(env.flags & LBF_ALL_PINNED)) {
11405                         __cpumask_clear_cpu(cpu_of(busiest), cpus);
11406                         /*
11407                          * Attempting to continue load balancing at the current
11408                          * sched_domain level only makes sense if there are
11409                          * active CPUs remaining as possible busiest CPUs to
11410                          * pull load from which are not contained within the
11411                          * destination group that is receiving any migrated
11412                          * load.
11413                          */
11414                         if (!cpumask_subset(cpus, env.dst_grpmask)) {
11415                                 env.loop = 0;
11416                                 env.loop_break = SCHED_NR_MIGRATE_BREAK;
11417                                 goto redo;
11418                         }
11419                         goto out_all_pinned;
11420                 }
11421         }
11422
11423         if (!ld_moved) {
11424                 schedstat_inc(sd->lb_failed[idle]);
11425                 /*
11426                  * Increment the failure counter only on periodic balance.
11427                  * We do not want newidle balance, which can be very
11428                  * frequent, pollute the failure counter causing
11429                  * excessive cache_hot migrations and active balances.
11430                  */
11431                 if (idle != CPU_NEWLY_IDLE)
11432                         sd->nr_balance_failed++;
11433
11434                 if (need_active_balance(&env)) {
11435                         unsigned long flags;
11436
11437                         raw_spin_rq_lock_irqsave(busiest, flags);
11438
11439                         /*
11440                          * Don't kick the active_load_balance_cpu_stop,
11441                          * if the curr task on busiest CPU can't be
11442                          * moved to this_cpu:
11443                          */
11444                         if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
11445                                 raw_spin_rq_unlock_irqrestore(busiest, flags);
11446                                 goto out_one_pinned;
11447                         }
11448
11449                         /* Record that we found at least one task that could run on this_cpu */
11450                         env.flags &= ~LBF_ALL_PINNED;
11451
11452                         /*
11453                          * ->active_balance synchronizes accesses to
11454                          * ->active_balance_work.  Once set, it's cleared
11455                          * only after active load balance is finished.
11456                          */
11457                         if (!busiest->active_balance) {
11458                                 busiest->active_balance = 1;
11459                                 busiest->push_cpu = this_cpu;
11460                                 active_balance = 1;
11461                         }
11462
11463                         preempt_disable();
11464                         raw_spin_rq_unlock_irqrestore(busiest, flags);
11465                         if (active_balance) {
11466                                 stop_one_cpu_nowait(cpu_of(busiest),
11467                                         active_load_balance_cpu_stop, busiest,
11468                                         &busiest->active_balance_work);
11469                         }
11470                         preempt_enable();
11471                 }
11472         } else {
11473                 sd->nr_balance_failed = 0;
11474         }
11475
11476         if (likely(!active_balance) || need_active_balance(&env)) {
11477                 /* We were unbalanced, so reset the balancing interval */
11478                 sd->balance_interval = sd->min_interval;
11479         }
11480
11481         goto out;
11482
11483 out_balanced:
11484         /*
11485          * We reach balance although we may have faced some affinity
11486          * constraints. Clear the imbalance flag only if other tasks got
11487          * a chance to move and fix the imbalance.
11488          */
11489         if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
11490                 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
11491
11492                 if (*group_imbalance)
11493                         *group_imbalance = 0;
11494         }
11495
11496 out_all_pinned:
11497         /*
11498          * We reach balance because all tasks are pinned at this level so
11499          * we can't migrate them. Let the imbalance flag set so parent level
11500          * can try to migrate them.
11501          */
11502         schedstat_inc(sd->lb_balanced[idle]);
11503
11504         sd->nr_balance_failed = 0;
11505
11506 out_one_pinned:
11507         ld_moved = 0;
11508
11509         /*
11510          * newidle_balance() disregards balance intervals, so we could
11511          * repeatedly reach this code, which would lead to balance_interval
11512          * skyrocketing in a short amount of time. Skip the balance_interval
11513          * increase logic to avoid that.
11514          */
11515         if (env.idle == CPU_NEWLY_IDLE)
11516                 goto out;
11517
11518         /* tune up the balancing interval */
11519         if ((env.flags & LBF_ALL_PINNED &&
11520              sd->balance_interval < MAX_PINNED_INTERVAL) ||
11521             sd->balance_interval < sd->max_interval)
11522                 sd->balance_interval *= 2;
11523 out:
11524         return ld_moved;
11525 }
11526
11527 static inline unsigned long
11528 get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
11529 {
11530         unsigned long interval = sd->balance_interval;
11531
11532         if (cpu_busy)
11533                 interval *= sd->busy_factor;
11534
11535         /* scale ms to jiffies */
11536         interval = msecs_to_jiffies(interval);
11537
11538         /*
11539          * Reduce likelihood of busy balancing at higher domains racing with
11540          * balancing at lower domains by preventing their balancing periods
11541          * from being multiples of each other.
11542          */
11543         if (cpu_busy)
11544                 interval -= 1;
11545
11546         interval = clamp(interval, 1UL, max_load_balance_interval);
11547
11548         return interval;
11549 }
11550
11551 static inline void
11552 update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
11553 {
11554         unsigned long interval, next;
11555
11556         /* used by idle balance, so cpu_busy = 0 */
11557         interval = get_sd_balance_interval(sd, 0);
11558         next = sd->last_balance + interval;
11559
11560         if (time_after(*next_balance, next))
11561                 *next_balance = next;
11562 }
11563
11564 /*
11565  * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
11566  * running tasks off the busiest CPU onto idle CPUs. It requires at
11567  * least 1 task to be running on each physical CPU where possible, and
11568  * avoids physical / logical imbalances.
11569  */
11570 static int active_load_balance_cpu_stop(void *data)
11571 {
11572         struct rq *busiest_rq = data;
11573         int busiest_cpu = cpu_of(busiest_rq);
11574         int target_cpu = busiest_rq->push_cpu;
11575         struct rq *target_rq = cpu_rq(target_cpu);
11576         struct sched_domain *sd;
11577         struct task_struct *p = NULL;
11578         struct rq_flags rf;
11579
11580         rq_lock_irq(busiest_rq, &rf);
11581         /*
11582          * Between queueing the stop-work and running it is a hole in which
11583          * CPUs can become inactive. We should not move tasks from or to
11584          * inactive CPUs.
11585          */
11586         if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
11587                 goto out_unlock;
11588
11589         /* Make sure the requested CPU hasn't gone down in the meantime: */
11590         if (unlikely(busiest_cpu != smp_processor_id() ||
11591                      !busiest_rq->active_balance))
11592                 goto out_unlock;
11593
11594         /* Is there any task to move? */
11595         if (busiest_rq->nr_running <= 1)
11596                 goto out_unlock;
11597
11598         /*
11599          * This condition is "impossible", if it occurs
11600          * we need to fix it. Originally reported by
11601          * Bjorn Helgaas on a 128-CPU setup.
11602          */
11603         WARN_ON_ONCE(busiest_rq == target_rq);
11604
11605         /* Search for an sd spanning us and the target CPU. */
11606         rcu_read_lock();
11607         for_each_domain(target_cpu, sd) {
11608                 if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
11609                         break;
11610         }
11611
11612         if (likely(sd)) {
11613                 struct lb_env env = {
11614                         .sd             = sd,
11615                         .dst_cpu        = target_cpu,
11616                         .dst_rq         = target_rq,
11617                         .src_cpu        = busiest_rq->cpu,
11618                         .src_rq         = busiest_rq,
11619                         .idle           = CPU_IDLE,
11620                         .flags          = LBF_ACTIVE_LB,
11621                 };
11622
11623                 schedstat_inc(sd->alb_count);
11624                 update_rq_clock(busiest_rq);
11625
11626                 p = detach_one_task(&env);
11627                 if (p) {
11628                         schedstat_inc(sd->alb_pushed);
11629                         /* Active balancing done, reset the failure counter. */
11630                         sd->nr_balance_failed = 0;
11631                 } else {
11632                         schedstat_inc(sd->alb_failed);
11633                 }
11634         }
11635         rcu_read_unlock();
11636 out_unlock:
11637         busiest_rq->active_balance = 0;
11638         rq_unlock(busiest_rq, &rf);
11639
11640         if (p)
11641                 attach_one_task(target_rq, p);
11642
11643         local_irq_enable();
11644
11645         return 0;
11646 }
11647
11648 static DEFINE_SPINLOCK(balancing);
11649
11650 /*
11651  * Scale the max load_balance interval with the number of CPUs in the system.
11652  * This trades load-balance latency on larger machines for less cross talk.
11653  */
11654 void update_max_interval(void)
11655 {
11656         max_load_balance_interval = HZ*num_online_cpus()/10;
11657 }
11658
11659 static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
11660 {
11661         if (cost > sd->max_newidle_lb_cost) {
11662                 /*
11663                  * Track max cost of a domain to make sure to not delay the
11664                  * next wakeup on the CPU.
11665                  */
11666                 sd->max_newidle_lb_cost = cost;
11667                 sd->last_decay_max_lb_cost = jiffies;
11668         } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
11669                 /*
11670                  * Decay the newidle max times by ~1% per second to ensure that
11671                  * it is not outdated and the current max cost is actually
11672                  * shorter.
11673                  */
11674                 sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256;
11675                 sd->last_decay_max_lb_cost = jiffies;
11676
11677                 return true;
11678         }
11679
11680         return false;
11681 }
11682
11683 /*
11684  * It checks each scheduling domain to see if it is due to be balanced,
11685  * and initiates a balancing operation if so.
11686  *
11687  * Balancing parameters are set up in init_sched_domains.
11688  */
11689 static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
11690 {
11691         int continue_balancing = 1;
11692         int cpu = rq->cpu;
11693         int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
11694         unsigned long interval;
11695         struct sched_domain *sd;
11696         /* Earliest time when we have to do rebalance again */
11697         unsigned long next_balance = jiffies + 60*HZ;
11698         int update_next_balance = 0;
11699         int need_serialize, need_decay = 0;
11700         u64 max_cost = 0;
11701
11702         rcu_read_lock();
11703         for_each_domain(cpu, sd) {
11704                 /*
11705                  * Decay the newidle max times here because this is a regular
11706                  * visit to all the domains.
11707                  */
11708                 need_decay = update_newidle_cost(sd, 0);
11709                 max_cost += sd->max_newidle_lb_cost;
11710
11711                 /*
11712                  * Stop the load balance at this level. There is another
11713                  * CPU in our sched group which is doing load balancing more
11714                  * actively.
11715                  */
11716                 if (!continue_balancing) {
11717                         if (need_decay)
11718                                 continue;
11719                         break;
11720                 }
11721
11722                 interval = get_sd_balance_interval(sd, busy);
11723
11724                 need_serialize = sd->flags & SD_SERIALIZE;
11725                 if (need_serialize) {
11726                         if (!spin_trylock(&balancing))
11727                                 goto out;
11728                 }
11729
11730                 if (time_after_eq(jiffies, sd->last_balance + interval)) {
11731                         if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
11732                                 /*
11733                                  * The LBF_DST_PINNED logic could have changed
11734                                  * env->dst_cpu, so we can't know our idle
11735                                  * state even if we migrated tasks. Update it.
11736                                  */
11737                                 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
11738                                 busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
11739                         }
11740                         sd->last_balance = jiffies;
11741                         interval = get_sd_balance_interval(sd, busy);
11742                 }
11743                 if (need_serialize)
11744                         spin_unlock(&balancing);
11745 out:
11746                 if (time_after(next_balance, sd->last_balance + interval)) {
11747                         next_balance = sd->last_balance + interval;
11748                         update_next_balance = 1;
11749                 }
11750         }
11751         if (need_decay) {
11752                 /*
11753                  * Ensure the rq-wide value also decays but keep it at a
11754                  * reasonable floor to avoid funnies with rq->avg_idle.
11755                  */
11756                 rq->max_idle_balance_cost =
11757                         max((u64)sysctl_sched_migration_cost, max_cost);
11758         }
11759         rcu_read_unlock();
11760
11761         /*
11762          * next_balance will be updated only when there is a need.
11763          * When the cpu is attached to null domain for ex, it will not be
11764          * updated.
11765          */
11766         if (likely(update_next_balance))
11767                 rq->next_balance = next_balance;
11768
11769 }
11770
11771 static inline int on_null_domain(struct rq *rq)
11772 {
11773         return unlikely(!rcu_dereference_sched(rq->sd));
11774 }
11775
11776 #ifdef CONFIG_NO_HZ_COMMON
11777 /*
11778  * NOHZ idle load balancing (ILB) details:
11779  *
11780  * - When one of the busy CPUs notices that there may be an idle rebalancing
11781  *   needed, they will kick the idle load balancer, which then does idle
11782  *   load balancing for all the idle CPUs.
11783  *
11784  * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set
11785  *   anywhere yet.
11786  */
11787 static inline int find_new_ilb(void)
11788 {
11789         const struct cpumask *hk_mask;
11790         int ilb_cpu;
11791
11792         hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
11793
11794         for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
11795
11796                 if (ilb_cpu == smp_processor_id())
11797                         continue;
11798
11799                 if (idle_cpu(ilb_cpu))
11800                         return ilb_cpu;
11801         }
11802
11803         return -1;
11804 }
11805
11806 /*
11807  * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU
11808  * SMP function call (IPI).
11809  *
11810  * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
11811  */
11812 static void kick_ilb(unsigned int flags)
11813 {
11814         int ilb_cpu;
11815
11816         /*
11817          * Increase nohz.next_balance only when if full ilb is triggered but
11818          * not if we only update stats.
11819          */
11820         if (flags & NOHZ_BALANCE_KICK)
11821                 nohz.next_balance = jiffies+1;
11822
11823         ilb_cpu = find_new_ilb();
11824         if (ilb_cpu < 0)
11825                 return;
11826
11827         /*
11828          * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
11829          * the first flag owns it; cleared by nohz_csd_func().
11830          */
11831         flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
11832         if (flags & NOHZ_KICK_MASK)
11833                 return;
11834
11835         /*
11836          * This way we generate an IPI on the target CPU which
11837          * is idle, and the softirq performing NOHZ idle load balancing
11838          * will be run before returning from the IPI.
11839          */
11840         smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
11841 }
11842
11843 /*
11844  * Current decision point for kicking the idle load balancer in the presence
11845  * of idle CPUs in the system.
11846  */
11847 static void nohz_balancer_kick(struct rq *rq)
11848 {
11849         unsigned long now = jiffies;
11850         struct sched_domain_shared *sds;
11851         struct sched_domain *sd;
11852         int nr_busy, i, cpu = rq->cpu;
11853         unsigned int flags = 0;
11854
11855         if (unlikely(rq->idle_balance))
11856                 return;
11857
11858         /*
11859          * We may be recently in ticked or tickless idle mode. At the first
11860          * busy tick after returning from idle, we will update the busy stats.
11861          */
11862         nohz_balance_exit_idle(rq);
11863
11864         /*
11865          * None are in tickless mode and hence no need for NOHZ idle load
11866          * balancing:
11867          */
11868         if (likely(!atomic_read(&nohz.nr_cpus)))
11869                 return;
11870
11871         if (READ_ONCE(nohz.has_blocked) &&
11872             time_after(now, READ_ONCE(nohz.next_blocked)))
11873                 flags = NOHZ_STATS_KICK;
11874
11875         if (time_before(now, nohz.next_balance))
11876                 goto out;
11877
11878         if (rq->nr_running >= 2) {
11879                 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
11880                 goto out;
11881         }
11882
11883         rcu_read_lock();
11884
11885         sd = rcu_dereference(rq->sd);
11886         if (sd) {
11887                 /*
11888                  * If there's a runnable CFS task and the current CPU has reduced
11889                  * capacity, kick the ILB to see if there's a better CPU to run on:
11890                  */
11891                 if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
11892                         flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
11893                         goto unlock;
11894                 }
11895         }
11896
11897         sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
11898         if (sd) {
11899                 /*
11900                  * When ASYM_PACKING; see if there's a more preferred CPU
11901                  * currently idle; in which case, kick the ILB to move tasks
11902                  * around.
11903                  *
11904                  * When balancing betwen cores, all the SMT siblings of the
11905                  * preferred CPU must be idle.
11906                  */
11907                 for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
11908                         if (sched_asym(sd, i, cpu)) {
11909                                 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
11910                                 goto unlock;
11911                         }
11912                 }
11913         }
11914
11915         sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
11916         if (sd) {
11917                 /*
11918                  * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
11919                  * to run the misfit task on.
11920                  */
11921                 if (check_misfit_status(rq, sd)) {
11922                         flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
11923                         goto unlock;
11924                 }
11925
11926                 /*
11927                  * For asymmetric systems, we do not want to nicely balance
11928                  * cache use, instead we want to embrace asymmetry and only
11929                  * ensure tasks have enough CPU capacity.
11930                  *
11931                  * Skip the LLC logic because it's not relevant in that case.
11932                  */
11933                 goto unlock;
11934         }
11935
11936         sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
11937         if (sds) {
11938                 /*
11939                  * If there is an imbalance between LLC domains (IOW we could
11940                  * increase the overall cache utilization), we need a less-loaded LLC
11941                  * domain to pull some load from. Likewise, we may need to spread
11942                  * load within the current LLC domain (e.g. packed SMT cores but
11943                  * other CPUs are idle). We can't really know from here how busy
11944                  * the others are - so just get a NOHZ balance going if it looks
11945                  * like this LLC domain has tasks we could move.
11946                  */
11947                 nr_busy = atomic_read(&sds->nr_busy_cpus);
11948                 if (nr_busy > 1) {
11949                         flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
11950                         goto unlock;
11951                 }
11952         }
11953 unlock:
11954         rcu_read_unlock();
11955 out:
11956         if (READ_ONCE(nohz.needs_update))
11957                 flags |= NOHZ_NEXT_KICK;
11958
11959         if (flags)
11960                 kick_ilb(flags);
11961 }
11962
11963 static void set_cpu_sd_state_busy(int cpu)
11964 {
11965         struct sched_domain *sd;
11966
11967         rcu_read_lock();
11968         sd = rcu_dereference(per_cpu(sd_llc, cpu));
11969
11970         if (!sd || !sd->nohz_idle)
11971                 goto unlock;
11972         sd->nohz_idle = 0;
11973
11974         atomic_inc(&sd->shared->nr_busy_cpus);
11975 unlock:
11976         rcu_read_unlock();
11977 }
11978
11979 void nohz_balance_exit_idle(struct rq *rq)
11980 {
11981         SCHED_WARN_ON(rq != this_rq());
11982
11983         if (likely(!rq->nohz_tick_stopped))
11984                 return;
11985
11986         rq->nohz_tick_stopped = 0;
11987         cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
11988         atomic_dec(&nohz.nr_cpus);
11989
11990         set_cpu_sd_state_busy(rq->cpu);
11991 }
11992
11993 static void set_cpu_sd_state_idle(int cpu)
11994 {
11995         struct sched_domain *sd;
11996
11997         rcu_read_lock();
11998         sd = rcu_dereference(per_cpu(sd_llc, cpu));
11999
12000         if (!sd || sd->nohz_idle)
12001                 goto unlock;
12002         sd->nohz_idle = 1;
12003
12004         atomic_dec(&sd->shared->nr_busy_cpus);
12005 unlock:
12006         rcu_read_unlock();
12007 }
12008
12009 /*
12010  * This routine will record that the CPU is going idle with tick stopped.
12011  * This info will be used in performing idle load balancing in the future.
12012  */
12013 void nohz_balance_enter_idle(int cpu)
12014 {
12015         struct rq *rq = cpu_rq(cpu);
12016
12017         SCHED_WARN_ON(cpu != smp_processor_id());
12018
12019         /* If this CPU is going down, then nothing needs to be done: */
12020         if (!cpu_active(cpu))
12021                 return;
12022
12023         /* Spare idle load balancing on CPUs that don't want to be disturbed: */
12024         if (!housekeeping_cpu(cpu, HK_TYPE_SCHED))
12025                 return;
12026
12027         /*
12028          * Can be set safely without rq->lock held
12029          * If a clear happens, it will have evaluated last additions because
12030          * rq->lock is held during the check and the clear
12031          */
12032         rq->has_blocked_load = 1;
12033
12034         /*
12035          * The tick is still stopped but load could have been added in the
12036          * meantime. We set the nohz.has_blocked flag to trig a check of the
12037          * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
12038          * of nohz.has_blocked can only happen after checking the new load
12039          */
12040         if (rq->nohz_tick_stopped)
12041                 goto out;
12042
12043         /* If we're a completely isolated CPU, we don't play: */
12044         if (on_null_domain(rq))
12045                 return;
12046
12047         rq->nohz_tick_stopped = 1;
12048
12049         cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
12050         atomic_inc(&nohz.nr_cpus);
12051
12052         /*
12053          * Ensures that if nohz_idle_balance() fails to observe our
12054          * @idle_cpus_mask store, it must observe the @has_blocked
12055          * and @needs_update stores.
12056          */
12057         smp_mb__after_atomic();
12058
12059         set_cpu_sd_state_idle(cpu);
12060
12061         WRITE_ONCE(nohz.needs_update, 1);
12062 out:
12063         /*
12064          * Each time a cpu enter idle, we assume that it has blocked load and
12065          * enable the periodic update of the load of idle cpus
12066          */
12067         WRITE_ONCE(nohz.has_blocked, 1);
12068 }
12069
12070 static bool update_nohz_stats(struct rq *rq)
12071 {
12072         unsigned int cpu = rq->cpu;
12073
12074         if (!rq->has_blocked_load)
12075                 return false;
12076
12077         if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
12078                 return false;
12079
12080         if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
12081                 return true;
12082
12083         update_blocked_averages(cpu);
12084
12085         return rq->has_blocked_load;
12086 }
12087
12088 /*
12089  * Internal function that runs load balance for all idle cpus. The load balance
12090  * can be a simple update of blocked load or a complete load balance with
12091  * tasks movement depending of flags.
12092  */
12093 static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
12094 {
12095         /* Earliest time when we have to do rebalance again */
12096         unsigned long now = jiffies;
12097         unsigned long next_balance = now + 60*HZ;
12098         bool has_blocked_load = false;
12099         int update_next_balance = 0;
12100         int this_cpu = this_rq->cpu;
12101         int balance_cpu;
12102         struct rq *rq;
12103
12104         SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
12105
12106         /*
12107          * We assume there will be no idle load after this update and clear
12108          * the has_blocked flag. If a cpu enters idle in the mean time, it will
12109          * set the has_blocked flag and trigger another update of idle load.
12110          * Because a cpu that becomes idle, is added to idle_cpus_mask before
12111          * setting the flag, we are sure to not clear the state and not
12112          * check the load of an idle cpu.
12113          *
12114          * Same applies to idle_cpus_mask vs needs_update.
12115          */
12116         if (flags & NOHZ_STATS_KICK)
12117                 WRITE_ONCE(nohz.has_blocked, 0);
12118         if (flags & NOHZ_NEXT_KICK)
12119                 WRITE_ONCE(nohz.needs_update, 0);
12120
12121         /*
12122          * Ensures that if we miss the CPU, we must see the has_blocked
12123          * store from nohz_balance_enter_idle().
12124          */
12125         smp_mb();
12126
12127         /*
12128          * Start with the next CPU after this_cpu so we will end with this_cpu and let a
12129          * chance for other idle cpu to pull load.
12130          */
12131         for_each_cpu_wrap(balance_cpu,  nohz.idle_cpus_mask, this_cpu+1) {
12132                 if (!idle_cpu(balance_cpu))
12133                         continue;
12134
12135                 /*
12136                  * If this CPU gets work to do, stop the load balancing
12137                  * work being done for other CPUs. Next load
12138                  * balancing owner will pick it up.
12139                  */
12140                 if (need_resched()) {
12141                         if (flags & NOHZ_STATS_KICK)
12142                                 has_blocked_load = true;
12143                         if (flags & NOHZ_NEXT_KICK)
12144                                 WRITE_ONCE(nohz.needs_update, 1);
12145                         goto abort;
12146                 }
12147
12148                 rq = cpu_rq(balance_cpu);
12149
12150                 if (flags & NOHZ_STATS_KICK)
12151                         has_blocked_load |= update_nohz_stats(rq);
12152
12153                 /*
12154                  * If time for next balance is due,
12155                  * do the balance.
12156                  */
12157                 if (time_after_eq(jiffies, rq->next_balance)) {
12158                         struct rq_flags rf;
12159
12160                         rq_lock_irqsave(rq, &rf);
12161                         update_rq_clock(rq);
12162                         rq_unlock_irqrestore(rq, &rf);
12163
12164                         if (flags & NOHZ_BALANCE_KICK)
12165                                 rebalance_domains(rq, CPU_IDLE);
12166                 }
12167
12168                 if (time_after(next_balance, rq->next_balance)) {
12169                         next_balance = rq->next_balance;
12170                         update_next_balance = 1;
12171                 }
12172         }
12173
12174         /*
12175          * next_balance will be updated only when there is a need.
12176          * When the CPU is attached to null domain for ex, it will not be
12177          * updated.
12178          */
12179         if (likely(update_next_balance))
12180                 nohz.next_balance = next_balance;
12181
12182         if (flags & NOHZ_STATS_KICK)
12183                 WRITE_ONCE(nohz.next_blocked,
12184                            now + msecs_to_jiffies(LOAD_AVG_PERIOD));
12185
12186 abort:
12187         /* There is still blocked load, enable periodic update */
12188         if (has_blocked_load)
12189                 WRITE_ONCE(nohz.has_blocked, 1);
12190 }
12191
12192 /*
12193  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
12194  * rebalancing for all the cpus for whom scheduler ticks are stopped.
12195  */
12196 static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
12197 {
12198         unsigned int flags = this_rq->nohz_idle_balance;
12199
12200         if (!flags)
12201                 return false;
12202
12203         this_rq->nohz_idle_balance = 0;
12204
12205         if (idle != CPU_IDLE)
12206                 return false;
12207
12208         _nohz_idle_balance(this_rq, flags);
12209
12210         return true;
12211 }
12212
12213 /*
12214  * Check if we need to directly run the ILB for updating blocked load before
12215  * entering idle state. Here we run ILB directly without issuing IPIs.
12216  *
12217  * Note that when this function is called, the tick may not yet be stopped on
12218  * this CPU yet. nohz.idle_cpus_mask is updated only when tick is stopped and
12219  * cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates
12220  * don't align with CPUs enter/exit idle to avoid bottlenecks due to high idle
12221  * entry/exit rate (usec). So it is possible that _nohz_idle_balance() is
12222  * called from this function on (this) CPU that's not yet in the mask. That's
12223  * OK because the goal of nohz_run_idle_balance() is to run ILB only for
12224  * updating the blocked load of already idle CPUs without waking up one of
12225  * those idle CPUs and outside the preempt disable / irq off phase of the local
12226  * cpu about to enter idle, because it can take a long time.
12227  */
12228 void nohz_run_idle_balance(int cpu)
12229 {
12230         unsigned int flags;
12231
12232         flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu));
12233
12234         /*
12235          * Update the blocked load only if no SCHED_SOFTIRQ is about to happen
12236          * (ie NOHZ_STATS_KICK set) and will do the same.
12237          */
12238         if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
12239                 _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK);
12240 }
12241
12242 static void nohz_newidle_balance(struct rq *this_rq)
12243 {
12244         int this_cpu = this_rq->cpu;
12245
12246         /*
12247          * This CPU doesn't want to be disturbed by scheduler
12248          * housekeeping
12249          */
12250         if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED))
12251                 return;
12252
12253         /* Will wake up very soon. No time for doing anything else*/
12254         if (this_rq->avg_idle < sysctl_sched_migration_cost)
12255                 return;
12256
12257         /* Don't need to update blocked load of idle CPUs*/
12258         if (!READ_ONCE(nohz.has_blocked) ||
12259             time_before(jiffies, READ_ONCE(nohz.next_blocked)))
12260                 return;
12261
12262         /*
12263          * Set the need to trigger ILB in order to update blocked load
12264          * before entering idle state.
12265          */
12266         atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
12267 }
12268
12269 #else /* !CONFIG_NO_HZ_COMMON */
12270 static inline void nohz_balancer_kick(struct rq *rq) { }
12271
12272 static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
12273 {
12274         return false;
12275 }
12276
12277 static inline void nohz_newidle_balance(struct rq *this_rq) { }
12278 #endif /* CONFIG_NO_HZ_COMMON */
12279
12280 /*
12281  * newidle_balance is called by schedule() if this_cpu is about to become
12282  * idle. Attempts to pull tasks from other CPUs.
12283  *
12284  * Returns:
12285  *   < 0 - we released the lock and there are !fair tasks present
12286  *     0 - failed, no new tasks
12287  *   > 0 - success, new (fair) tasks present
12288  */
12289 static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
12290 {
12291         unsigned long next_balance = jiffies + HZ;
12292         int this_cpu = this_rq->cpu;
12293         u64 t0, t1, curr_cost = 0;
12294         struct sched_domain *sd;
12295         int pulled_task = 0;
12296
12297         update_misfit_status(NULL, this_rq);
12298
12299         /*
12300          * There is a task waiting to run. No need to search for one.
12301          * Return 0; the task will be enqueued when switching to idle.
12302          */
12303         if (this_rq->ttwu_pending)
12304                 return 0;
12305
12306         /*
12307          * We must set idle_stamp _before_ calling idle_balance(), such that we
12308          * measure the duration of idle_balance() as idle time.
12309          */
12310         this_rq->idle_stamp = rq_clock(this_rq);
12311
12312         /*
12313          * Do not pull tasks towards !active CPUs...
12314          */
12315         if (!cpu_active(this_cpu))
12316                 return 0;
12317
12318         /*
12319          * This is OK, because current is on_cpu, which avoids it being picked
12320          * for load-balance and preemption/IRQs are still disabled avoiding
12321          * further scheduler activity on it and we're being very careful to
12322          * re-start the picking loop.
12323          */
12324         rq_unpin_lock(this_rq, rf);
12325
12326         rcu_read_lock();
12327         sd = rcu_dereference_check_sched_domain(this_rq->sd);
12328
12329         if (!READ_ONCE(this_rq->rd->overload) ||
12330             (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
12331
12332                 if (sd)
12333                         update_next_balance(sd, &next_balance);
12334                 rcu_read_unlock();
12335
12336                 goto out;
12337         }
12338         rcu_read_unlock();
12339
12340         raw_spin_rq_unlock(this_rq);
12341
12342         t0 = sched_clock_cpu(this_cpu);
12343         update_blocked_averages(this_cpu);
12344
12345         rcu_read_lock();
12346         for_each_domain(this_cpu, sd) {
12347                 int continue_balancing = 1;
12348                 u64 domain_cost;
12349
12350                 update_next_balance(sd, &next_balance);
12351
12352                 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
12353                         break;
12354
12355                 if (sd->flags & SD_BALANCE_NEWIDLE) {
12356
12357                         pulled_task = load_balance(this_cpu, this_rq,
12358                                                    sd, CPU_NEWLY_IDLE,
12359                                                    &continue_balancing);
12360
12361                         t1 = sched_clock_cpu(this_cpu);
12362                         domain_cost = t1 - t0;
12363                         update_newidle_cost(sd, domain_cost);
12364
12365                         curr_cost += domain_cost;
12366                         t0 = t1;
12367                 }
12368
12369                 /*
12370                  * Stop searching for tasks to pull if there are
12371                  * now runnable tasks on this rq.
12372                  */
12373                 if (pulled_task || this_rq->nr_running > 0 ||
12374                     this_rq->ttwu_pending)
12375                         break;
12376         }
12377         rcu_read_unlock();
12378
12379         raw_spin_rq_lock(this_rq);
12380
12381         if (curr_cost > this_rq->max_idle_balance_cost)
12382                 this_rq->max_idle_balance_cost = curr_cost;
12383
12384         /*
12385          * While browsing the domains, we released the rq lock, a task could
12386          * have been enqueued in the meantime. Since we're not going idle,
12387          * pretend we pulled a task.
12388          */
12389         if (this_rq->cfs.h_nr_running && !pulled_task)
12390                 pulled_task = 1;
12391
12392         /* Is there a task of a high priority class? */
12393         if (this_rq->nr_running != this_rq->cfs.h_nr_running)
12394                 pulled_task = -1;
12395
12396 out:
12397         /* Move the next balance forward */
12398         if (time_after(this_rq->next_balance, next_balance))
12399                 this_rq->next_balance = next_balance;
12400
12401         if (pulled_task)
12402                 this_rq->idle_stamp = 0;
12403         else
12404                 nohz_newidle_balance(this_rq);
12405
12406         rq_repin_lock(this_rq, rf);
12407
12408         return pulled_task;
12409 }
12410
12411 /*
12412  * run_rebalance_domains is triggered when needed from the scheduler tick.
12413  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
12414  */
12415 static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
12416 {
12417         struct rq *this_rq = this_rq();
12418         enum cpu_idle_type idle = this_rq->idle_balance ?
12419                                                 CPU_IDLE : CPU_NOT_IDLE;
12420
12421         /*
12422          * If this CPU has a pending nohz_balance_kick, then do the
12423          * balancing on behalf of the other idle CPUs whose ticks are
12424          * stopped. Do nohz_idle_balance *before* rebalance_domains to
12425          * give the idle CPUs a chance to load balance. Else we may
12426          * load balance only within the local sched_domain hierarchy
12427          * and abort nohz_idle_balance altogether if we pull some load.
12428          */
12429         if (nohz_idle_balance(this_rq, idle))
12430                 return;
12431
12432         /* normal load balance */
12433         update_blocked_averages(this_rq->cpu);
12434         rebalance_domains(this_rq, idle);
12435 }
12436
12437 /*
12438  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
12439  */
12440 void trigger_load_balance(struct rq *rq)
12441 {
12442         /*
12443          * Don't need to rebalance while attached to NULL domain or
12444          * runqueue CPU is not active
12445          */
12446         if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq))))
12447                 return;
12448
12449         if (time_after_eq(jiffies, rq->next_balance))
12450                 raise_softirq(SCHED_SOFTIRQ);
12451
12452         nohz_balancer_kick(rq);
12453 }
12454
12455 static void rq_online_fair(struct rq *rq)
12456 {
12457         update_sysctl();
12458
12459         update_runtime_enabled(rq);
12460 }
12461
12462 static void rq_offline_fair(struct rq *rq)
12463 {
12464         update_sysctl();
12465
12466         /* Ensure any throttled groups are reachable by pick_next_task */
12467         unthrottle_offline_cfs_rqs(rq);
12468
12469         /* Ensure that we remove rq contribution to group share: */
12470         clear_tg_offline_cfs_rqs(rq);
12471 }
12472
12473 #endif /* CONFIG_SMP */
12474
12475 #ifdef CONFIG_SCHED_CORE
12476 static inline bool
12477 __entity_slice_used(struct sched_entity *se, int min_nr_tasks)
12478 {
12479         u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
12480         u64 slice = se->slice;
12481
12482         return (rtime * min_nr_tasks > slice);
12483 }
12484
12485 #define MIN_NR_TASKS_DURING_FORCEIDLE   2
12486 static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
12487 {
12488         if (!sched_core_enabled(rq))
12489                 return;
12490
12491         /*
12492          * If runqueue has only one task which used up its slice and
12493          * if the sibling is forced idle, then trigger schedule to
12494          * give forced idle task a chance.
12495          *
12496          * sched_slice() considers only this active rq and it gets the
12497          * whole slice. But during force idle, we have siblings acting
12498          * like a single runqueue and hence we need to consider runnable
12499          * tasks on this CPU and the forced idle CPU. Ideally, we should
12500          * go through the forced idle rq, but that would be a perf hit.
12501          * We can assume that the forced idle CPU has at least
12502          * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
12503          * if we need to give up the CPU.
12504          */
12505         if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
12506             __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
12507                 resched_curr(rq);
12508 }
12509
12510 /*
12511  * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
12512  */
12513 static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
12514                          bool forceidle)
12515 {
12516         for_each_sched_entity(se) {
12517                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
12518
12519                 if (forceidle) {
12520                         if (cfs_rq->forceidle_seq == fi_seq)
12521                                 break;
12522                         cfs_rq->forceidle_seq = fi_seq;
12523                 }
12524
12525                 cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
12526         }
12527 }
12528
12529 void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
12530 {
12531         struct sched_entity *se = &p->se;
12532
12533         if (p->sched_class != &fair_sched_class)
12534                 return;
12535
12536         se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
12537 }
12538
12539 bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
12540                         bool in_fi)
12541 {
12542         struct rq *rq = task_rq(a);
12543         const struct sched_entity *sea = &a->se;
12544         const struct sched_entity *seb = &b->se;
12545         struct cfs_rq *cfs_rqa;
12546         struct cfs_rq *cfs_rqb;
12547         s64 delta;
12548
12549         SCHED_WARN_ON(task_rq(b)->core != rq->core);
12550
12551 #ifdef CONFIG_FAIR_GROUP_SCHED
12552         /*
12553          * Find an se in the hierarchy for tasks a and b, such that the se's
12554          * are immediate siblings.
12555          */
12556         while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
12557                 int sea_depth = sea->depth;
12558                 int seb_depth = seb->depth;
12559
12560                 if (sea_depth >= seb_depth)
12561                         sea = parent_entity(sea);
12562                 if (sea_depth <= seb_depth)
12563                         seb = parent_entity(seb);
12564         }
12565
12566         se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
12567         se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);
12568
12569         cfs_rqa = sea->cfs_rq;
12570         cfs_rqb = seb->cfs_rq;
12571 #else
12572         cfs_rqa = &task_rq(a)->cfs;
12573         cfs_rqb = &task_rq(b)->cfs;
12574 #endif
12575
12576         /*
12577          * Find delta after normalizing se's vruntime with its cfs_rq's
12578          * min_vruntime_fi, which would have been updated in prior calls
12579          * to se_fi_update().
12580          */
12581         delta = (s64)(sea->vruntime - seb->vruntime) +
12582                 (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
12583
12584         return delta > 0;
12585 }
12586
12587 static int task_is_throttled_fair(struct task_struct *p, int cpu)
12588 {
12589         struct cfs_rq *cfs_rq;
12590
12591 #ifdef CONFIG_FAIR_GROUP_SCHED
12592         cfs_rq = task_group(p)->cfs_rq[cpu];
12593 #else
12594         cfs_rq = &cpu_rq(cpu)->cfs;
12595 #endif
12596         return throttled_hierarchy(cfs_rq);
12597 }
12598 #else
12599 static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
12600 #endif
12601
12602 /*
12603  * scheduler tick hitting a task of our scheduling class.
12604  *
12605  * NOTE: This function can be called remotely by the tick offload that
12606  * goes along full dynticks. Therefore no local assumption can be made
12607  * and everything must be accessed through the @rq and @curr passed in
12608  * parameters.
12609  */
12610 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
12611 {
12612         struct cfs_rq *cfs_rq;
12613         struct sched_entity *se = &curr->se;
12614
12615         for_each_sched_entity(se) {
12616                 cfs_rq = cfs_rq_of(se);
12617                 entity_tick(cfs_rq, se, queued);
12618         }
12619
12620         if (static_branch_unlikely(&sched_numa_balancing))
12621                 task_tick_numa(rq, curr);
12622
12623         update_misfit_status(curr, rq);
12624         update_overutilized_status(task_rq(curr));
12625
12626         task_tick_core(rq, curr);
12627 }
12628
12629 /*
12630  * called on fork with the child task as argument from the parent's context
12631  *  - child not yet on the tasklist
12632  *  - preemption disabled
12633  */
12634 static void task_fork_fair(struct task_struct *p)
12635 {
12636         struct sched_entity *se = &p->se, *curr;
12637         struct cfs_rq *cfs_rq;
12638         struct rq *rq = this_rq();
12639         struct rq_flags rf;
12640
12641         rq_lock(rq, &rf);
12642         update_rq_clock(rq);
12643
12644         cfs_rq = task_cfs_rq(current);
12645         curr = cfs_rq->curr;
12646         if (curr)
12647                 update_curr(cfs_rq);
12648         place_entity(cfs_rq, se, ENQUEUE_INITIAL);
12649         rq_unlock(rq, &rf);
12650 }
12651
12652 /*
12653  * Priority of the task has changed. Check to see if we preempt
12654  * the current task.
12655  */
12656 static void
12657 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
12658 {
12659         if (!task_on_rq_queued(p))
12660                 return;
12661
12662         if (rq->cfs.nr_running == 1)
12663                 return;
12664
12665         /*
12666          * Reschedule if we are currently running on this runqueue and
12667          * our priority decreased, or if we are not currently running on
12668          * this runqueue and our priority is higher than the current's
12669          */
12670         if (task_current(rq, p)) {
12671                 if (p->prio > oldprio)
12672                         resched_curr(rq);
12673         } else
12674                 wakeup_preempt(rq, p, 0);
12675 }
12676
12677 #ifdef CONFIG_FAIR_GROUP_SCHED
12678 /*
12679  * Propagate the changes of the sched_entity across the tg tree to make it
12680  * visible to the root
12681  */
12682 static void propagate_entity_cfs_rq(struct sched_entity *se)
12683 {
12684         struct cfs_rq *cfs_rq = cfs_rq_of(se);
12685
12686         if (cfs_rq_throttled(cfs_rq))
12687                 return;
12688
12689         if (!throttled_hierarchy(cfs_rq))
12690                 list_add_leaf_cfs_rq(cfs_rq);
12691
12692         /* Start to propagate at parent */
12693         se = se->parent;
12694
12695         for_each_sched_entity(se) {
12696                 cfs_rq = cfs_rq_of(se);
12697
12698                 update_load_avg(cfs_rq, se, UPDATE_TG);
12699
12700                 if (cfs_rq_throttled(cfs_rq))
12701                         break;
12702
12703                 if (!throttled_hierarchy(cfs_rq))
12704                         list_add_leaf_cfs_rq(cfs_rq);
12705         }
12706 }
12707 #else
12708 static void propagate_entity_cfs_rq(struct sched_entity *se) { }
12709 #endif
12710
12711 static void detach_entity_cfs_rq(struct sched_entity *se)
12712 {
12713         struct cfs_rq *cfs_rq = cfs_rq_of(se);
12714
12715 #ifdef CONFIG_SMP
12716         /*
12717          * In case the task sched_avg hasn't been attached:
12718          * - A forked task which hasn't been woken up by wake_up_new_task().
12719          * - A task which has been woken up by try_to_wake_up() but is
12720          *   waiting for actually being woken up by sched_ttwu_pending().
12721          */
12722         if (!se->avg.last_update_time)
12723                 return;
12724 #endif
12725
12726         /* Catch up with the cfs_rq and remove our load when we leave */
12727         update_load_avg(cfs_rq, se, 0);
12728         detach_entity_load_avg(cfs_rq, se);
12729         update_tg_load_avg(cfs_rq);
12730         propagate_entity_cfs_rq(se);
12731 }
12732
12733 static void attach_entity_cfs_rq(struct sched_entity *se)
12734 {
12735         struct cfs_rq *cfs_rq = cfs_rq_of(se);
12736
12737         /* Synchronize entity with its cfs_rq */
12738         update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
12739         attach_entity_load_avg(cfs_rq, se);
12740         update_tg_load_avg(cfs_rq);
12741         propagate_entity_cfs_rq(se);
12742 }
12743
12744 static void detach_task_cfs_rq(struct task_struct *p)
12745 {
12746         struct sched_entity *se = &p->se;
12747
12748         detach_entity_cfs_rq(se);
12749 }
12750
12751 static void attach_task_cfs_rq(struct task_struct *p)
12752 {
12753         struct sched_entity *se = &p->se;
12754
12755         attach_entity_cfs_rq(se);
12756 }
12757
12758 static void switched_from_fair(struct rq *rq, struct task_struct *p)
12759 {
12760         detach_task_cfs_rq(p);
12761 }
12762
12763 static void switched_to_fair(struct rq *rq, struct task_struct *p)
12764 {
12765         attach_task_cfs_rq(p);
12766
12767         if (task_on_rq_queued(p)) {
12768                 /*
12769                  * We were most likely switched from sched_rt, so
12770                  * kick off the schedule if running, otherwise just see
12771                  * if we can still preempt the current task.
12772                  */
12773                 if (task_current(rq, p))
12774                         resched_curr(rq);
12775                 else
12776                         wakeup_preempt(rq, p, 0);
12777         }
12778 }
12779
12780 /* Account for a task changing its policy or group.
12781  *
12782  * This routine is mostly called to set cfs_rq->curr field when a task
12783  * migrates between groups/classes.
12784  */
12785 static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
12786 {
12787         struct sched_entity *se = &p->se;
12788
12789 #ifdef CONFIG_SMP
12790         if (task_on_rq_queued(p)) {
12791                 /*
12792                  * Move the next running task to the front of the list, so our
12793                  * cfs_tasks list becomes MRU one.
12794                  */
12795                 list_move(&se->group_node, &rq->cfs_tasks);
12796         }
12797 #endif
12798
12799         for_each_sched_entity(se) {
12800                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
12801
12802                 set_next_entity(cfs_rq, se);
12803                 /* ensure bandwidth has been allocated on our new cfs_rq */
12804                 account_cfs_rq_runtime(cfs_rq, 0);
12805         }
12806 }
12807
12808 void init_cfs_rq(struct cfs_rq *cfs_rq)
12809 {
12810         cfs_rq->tasks_timeline = RB_ROOT_CACHED;
12811         u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20)));
12812 #ifdef CONFIG_SMP
12813         raw_spin_lock_init(&cfs_rq->removed.lock);
12814 #endif
12815 }
12816
12817 #ifdef CONFIG_FAIR_GROUP_SCHED
12818 static void task_change_group_fair(struct task_struct *p)
12819 {
12820         /*
12821          * We couldn't detach or attach a forked task which
12822          * hasn't been woken up by wake_up_new_task().
12823          */
12824         if (READ_ONCE(p->__state) == TASK_NEW)
12825                 return;
12826
12827         detach_task_cfs_rq(p);
12828
12829 #ifdef CONFIG_SMP
12830         /* Tell se's cfs_rq has been changed -- migrated */
12831         p->se.avg.last_update_time = 0;
12832 #endif
12833         set_task_rq(p, task_cpu(p));
12834         attach_task_cfs_rq(p);
12835 }
12836
12837 void free_fair_sched_group(struct task_group *tg)
12838 {
12839         int i;
12840
12841         for_each_possible_cpu(i) {
12842                 if (tg->cfs_rq)
12843                         kfree(tg->cfs_rq[i]);
12844                 if (tg->se)
12845                         kfree(tg->se[i]);
12846         }
12847
12848         kfree(tg->cfs_rq);
12849         kfree(tg->se);
12850 }
12851
12852 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
12853 {
12854         struct sched_entity *se;
12855         struct cfs_rq *cfs_rq;
12856         int i;
12857
12858         tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
12859         if (!tg->cfs_rq)
12860                 goto err;
12861         tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
12862         if (!tg->se)
12863                 goto err;
12864
12865         tg->shares = NICE_0_LOAD;
12866
12867         init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent));
12868
12869         for_each_possible_cpu(i) {
12870                 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
12871                                       GFP_KERNEL, cpu_to_node(i));
12872                 if (!cfs_rq)
12873                         goto err;
12874
12875                 se = kzalloc_node(sizeof(struct sched_entity_stats),
12876                                   GFP_KERNEL, cpu_to_node(i));
12877                 if (!se)
12878                         goto err_free_rq;
12879
12880                 init_cfs_rq(cfs_rq);
12881                 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
12882                 init_entity_runnable_average(se);
12883         }
12884
12885         return 1;
12886
12887 err_free_rq:
12888         kfree(cfs_rq);
12889 err:
12890         return 0;
12891 }
12892
12893 void online_fair_sched_group(struct task_group *tg)
12894 {
12895         struct sched_entity *se;
12896         struct rq_flags rf;
12897         struct rq *rq;
12898         int i;
12899
12900         for_each_possible_cpu(i) {
12901                 rq = cpu_rq(i);
12902                 se = tg->se[i];
12903                 rq_lock_irq(rq, &rf);
12904                 update_rq_clock(rq);
12905                 attach_entity_cfs_rq(se);
12906                 sync_throttle(tg, i);
12907                 rq_unlock_irq(rq, &rf);
12908         }
12909 }
12910
12911 void unregister_fair_sched_group(struct task_group *tg)
12912 {
12913         unsigned long flags;
12914         struct rq *rq;
12915         int cpu;
12916
12917         destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
12918
12919         for_each_possible_cpu(cpu) {
12920                 if (tg->se[cpu])
12921                         remove_entity_load_avg(tg->se[cpu]);
12922
12923                 /*
12924                  * Only empty task groups can be destroyed; so we can speculatively
12925                  * check on_list without danger of it being re-added.
12926                  */
12927                 if (!tg->cfs_rq[cpu]->on_list)
12928                         continue;
12929
12930                 rq = cpu_rq(cpu);
12931
12932                 raw_spin_rq_lock_irqsave(rq, flags);
12933                 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
12934                 raw_spin_rq_unlock_irqrestore(rq, flags);
12935         }
12936 }
12937
12938 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
12939                         struct sched_entity *se, int cpu,
12940                         struct sched_entity *parent)
12941 {
12942         struct rq *rq = cpu_rq(cpu);
12943
12944         cfs_rq->tg = tg;
12945         cfs_rq->rq = rq;
12946         init_cfs_rq_runtime(cfs_rq);
12947
12948         tg->cfs_rq[cpu] = cfs_rq;
12949         tg->se[cpu] = se;
12950
12951         /* se could be NULL for root_task_group */
12952         if (!se)
12953                 return;
12954
12955         if (!parent) {
12956                 se->cfs_rq = &rq->cfs;
12957                 se->depth = 0;
12958         } else {
12959                 se->cfs_rq = parent->my_q;
12960                 se->depth = parent->depth + 1;
12961         }
12962
12963         se->my_q = cfs_rq;
12964         /* guarantee group entities always have weight */
12965         update_load_set(&se->load, NICE_0_LOAD);
12966         se->parent = parent;
12967 }
12968
12969 static DEFINE_MUTEX(shares_mutex);
12970
12971 static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
12972 {
12973         int i;
12974
12975         lockdep_assert_held(&shares_mutex);
12976
12977         /*
12978          * We can't change the weight of the root cgroup.
12979          */
12980         if (!tg->se[0])
12981                 return -EINVAL;
12982
12983         shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
12984
12985         if (tg->shares == shares)
12986                 return 0;
12987
12988         tg->shares = shares;
12989         for_each_possible_cpu(i) {
12990                 struct rq *rq = cpu_rq(i);
12991                 struct sched_entity *se = tg->se[i];
12992                 struct rq_flags rf;
12993
12994                 /* Propagate contribution to hierarchy */
12995                 rq_lock_irqsave(rq, &rf);
12996                 update_rq_clock(rq);
12997                 for_each_sched_entity(se) {
12998                         update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
12999                         update_cfs_group(se);
13000                 }
13001                 rq_unlock_irqrestore(rq, &rf);
13002         }
13003
13004         return 0;
13005 }
13006
13007 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
13008 {
13009         int ret;
13010
13011         mutex_lock(&shares_mutex);
13012         if (tg_is_idle(tg))
13013                 ret = -EINVAL;
13014         else
13015                 ret = __sched_group_set_shares(tg, shares);
13016         mutex_unlock(&shares_mutex);
13017
13018         return ret;
13019 }
13020
13021 int sched_group_set_idle(struct task_group *tg, long idle)
13022 {
13023         int i;
13024
13025         if (tg == &root_task_group)
13026                 return -EINVAL;
13027
13028         if (idle < 0 || idle > 1)
13029                 return -EINVAL;
13030
13031         mutex_lock(&shares_mutex);
13032
13033         if (tg->idle == idle) {
13034                 mutex_unlock(&shares_mutex);
13035                 return 0;
13036         }
13037
13038         tg->idle = idle;
13039
13040         for_each_possible_cpu(i) {
13041                 struct rq *rq = cpu_rq(i);
13042                 struct sched_entity *se = tg->se[i];
13043                 struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i];
13044                 bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
13045                 long idle_task_delta;
13046                 struct rq_flags rf;
13047
13048                 rq_lock_irqsave(rq, &rf);
13049
13050                 grp_cfs_rq->idle = idle;
13051                 if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
13052                         goto next_cpu;
13053
13054                 if (se->on_rq) {
13055                         parent_cfs_rq = cfs_rq_of(se);
13056                         if (cfs_rq_is_idle(grp_cfs_rq))
13057                                 parent_cfs_rq->idle_nr_running++;
13058                         else
13059                                 parent_cfs_rq->idle_nr_running--;
13060                 }
13061
13062                 idle_task_delta = grp_cfs_rq->h_nr_running -
13063                                   grp_cfs_rq->idle_h_nr_running;
13064                 if (!cfs_rq_is_idle(grp_cfs_rq))
13065                         idle_task_delta *= -1;
13066
13067                 for_each_sched_entity(se) {
13068                         struct cfs_rq *cfs_rq = cfs_rq_of(se);
13069
13070                         if (!se->on_rq)
13071                                 break;
13072
13073                         cfs_rq->idle_h_nr_running += idle_task_delta;
13074
13075                         /* Already accounted at parent level and above. */
13076                         if (cfs_rq_is_idle(cfs_rq))
13077                                 break;
13078                 }
13079
13080 next_cpu:
13081                 rq_unlock_irqrestore(rq, &rf);
13082         }
13083
13084         /* Idle groups have minimum weight. */
13085         if (tg_is_idle(tg))
13086                 __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
13087         else
13088                 __sched_group_set_shares(tg, NICE_0_LOAD);
13089
13090         mutex_unlock(&shares_mutex);
13091         return 0;
13092 }
13093
13094 #endif /* CONFIG_FAIR_GROUP_SCHED */
13095
13096
13097 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
13098 {
13099         struct sched_entity *se = &task->se;
13100         unsigned int rr_interval = 0;
13101
13102         /*
13103          * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
13104          * idle runqueue:
13105          */
13106         if (rq->cfs.load.weight)
13107                 rr_interval = NS_TO_JIFFIES(se->slice);
13108
13109         return rr_interval;
13110 }
13111
13112 /*
13113  * All the scheduling class methods:
13114  */
13115 DEFINE_SCHED_CLASS(fair) = {
13116
13117         .enqueue_task           = enqueue_task_fair,
13118         .dequeue_task           = dequeue_task_fair,
13119         .yield_task             = yield_task_fair,
13120         .yield_to_task          = yield_to_task_fair,
13121
13122         .wakeup_preempt         = check_preempt_wakeup_fair,
13123
13124         .pick_next_task         = __pick_next_task_fair,
13125         .put_prev_task          = put_prev_task_fair,
13126         .set_next_task          = set_next_task_fair,
13127
13128 #ifdef CONFIG_SMP
13129         .balance                = balance_fair,
13130         .pick_task              = pick_task_fair,
13131         .select_task_rq         = select_task_rq_fair,
13132         .migrate_task_rq        = migrate_task_rq_fair,
13133
13134         .rq_online              = rq_online_fair,
13135         .rq_offline             = rq_offline_fair,
13136
13137         .task_dead              = task_dead_fair,
13138         .set_cpus_allowed       = set_cpus_allowed_common,
13139 #endif
13140
13141         .task_tick              = task_tick_fair,
13142         .task_fork              = task_fork_fair,
13143
13144         .prio_changed           = prio_changed_fair,
13145         .switched_from          = switched_from_fair,
13146         .switched_to            = switched_to_fair,
13147
13148         .get_rr_interval        = get_rr_interval_fair,
13149
13150         .update_curr            = update_curr_fair,
13151
13152 #ifdef CONFIG_FAIR_GROUP_SCHED
13153         .task_change_group      = task_change_group_fair,
13154 #endif
13155
13156 #ifdef CONFIG_SCHED_CORE
13157         .task_is_throttled      = task_is_throttled_fair,
13158 #endif
13159
13160 #ifdef CONFIG_UCLAMP_TASK
13161         .uclamp_enabled         = 1,
13162 #endif
13163 };
13164
13165 #ifdef CONFIG_SCHED_DEBUG
13166 void print_cfs_stats(struct seq_file *m, int cpu)
13167 {
13168         struct cfs_rq *cfs_rq, *pos;
13169
13170         rcu_read_lock();
13171         for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
13172                 print_cfs_rq(m, cpu, cfs_rq);
13173         rcu_read_unlock();
13174 }
13175
13176 #ifdef CONFIG_NUMA_BALANCING
13177 void show_numa_stats(struct task_struct *p, struct seq_file *m)
13178 {
13179         int node;
13180         unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
13181         struct numa_group *ng;
13182
13183         rcu_read_lock();
13184         ng = rcu_dereference(p->numa_group);
13185         for_each_online_node(node) {
13186                 if (p->numa_faults) {
13187                         tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
13188                         tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
13189                 }
13190                 if (ng) {
13191                         gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
13192                         gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
13193                 }
13194                 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
13195         }
13196         rcu_read_unlock();
13197 }
13198 #endif /* CONFIG_NUMA_BALANCING */
13199 #endif /* CONFIG_SCHED_DEBUG */
13200
13201 __init void init_sched_fair_class(void)
13202 {
13203 #ifdef CONFIG_SMP
13204         int i;
13205
13206         for_each_possible_cpu(i) {
13207                 zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
13208                 zalloc_cpumask_var_node(&per_cpu(select_rq_mask,    i), GFP_KERNEL, cpu_to_node(i));
13209                 zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i),
13210                                         GFP_KERNEL, cpu_to_node(i));
13211
13212 #ifdef CONFIG_CFS_BANDWIDTH
13213                 INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
13214                 INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list);
13215 #endif
13216         }
13217
13218         open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
13219
13220 #ifdef CONFIG_NO_HZ_COMMON
13221         nohz.next_balance = jiffies;
13222         nohz.next_blocked = jiffies;
13223         zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
13224 #endif
13225 #endif /* SMP */
13226
13227 }