sched/fair: Tune down misfit NOHZ kicks
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 310d0637fe4b23e6bff5e090be1ea18c09a7decf..f0d2f8a352bf19360d7e5860e2124972e1d73cd7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -248,13 +248,6 @@ const struct sched_class fair_sched_class;
  */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-
-/* cpu runqueue to which this cfs_rq is attached */
-static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
-{
-       return cfs_rq->rq;
-}
-
 static inline struct task_struct *task_of(struct sched_entity *se)
 {
        SCHED_WARN_ON(!entity_is_task(se));
@@ -282,79 +275,103 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
        return grp->my_q;
 }
 
-static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
-       if (!cfs_rq->on_list) {
-               struct rq *rq = rq_of(cfs_rq);
-               int cpu = cpu_of(rq);
+       struct rq *rq = rq_of(cfs_rq);
+       int cpu = cpu_of(rq);
+
+       if (cfs_rq->on_list)
+               return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
+
+       cfs_rq->on_list = 1;
+
+       /*
+        * Ensure we either appear before our parent (if already
+        * enqueued) or force our parent to appear after us when it is
+        * enqueued. The fact that we always enqueue bottom-up
+        * reduces this to two cases and a special case for the root
+        * cfs_rq. Furthermore, it also means that we will always reset
+        * tmp_alone_branch either when the branch is connected
+        * to a tree or when we reach the top of the tree.
+        */
+       if (cfs_rq->tg->parent &&
+           cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
                /*
-                * Ensure we either appear before our parent (if already
-                * enqueued) or force our parent to appear after us when it is
-                * enqueued. The fact that we always enqueue bottom-up
-                * reduces this to two cases and a special case for the root
-                * cfs_rq. Furthermore, it also means that we will always reset
-                * tmp_alone_branch either when the branch is connected
-                * to a tree or when we reach the beg of the tree
+                * If the parent is already on the list, we add the child
+                * just before. Thanks to the circular linked property of
+                * the list, this puts the child at the tail of the
+                * sub-list that starts at the parent.
                 */
-               if (cfs_rq->tg->parent &&
-                   cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
-                       /*
-                        * If parent is already on the list, we add the child
-                        * just before. Thanks to circular linked property of
-                        * the list, this means to put the child at the tail
-                        * of the list that starts by parent.
-                        */
-                       list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
-                               &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
-                       /*
-                        * The branch is now connected to its tree so we can
-                        * reset tmp_alone_branch to the beginning of the
-                        * list.
-                        */
-                       rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
-               } else if (!cfs_rq->tg->parent) {
-                       /*
-                        * cfs rq without parent should be put
-                        * at the tail of the list.
-                        */
-                       list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
-                               &rq->leaf_cfs_rq_list);
-                       /*
-                        * We have reach the beg of a tree so we can reset
-                        * tmp_alone_branch to the beginning of the list.
-                        */
-                       rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
-               } else {
-                       /*
-                        * The parent has not already been added so we want to
-                        * make sure that it will be put after us.
-                        * tmp_alone_branch points to the beg of the branch
-                        * where we will add parent.
-                        */
-                       list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
-                               rq->tmp_alone_branch);
-                       /*
-                        * update tmp_alone_branch to points to the new beg
-                        * of the branch
-                        */
-                       rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
-               }
+               list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+                       &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
+               /*
+                * The branch is now connected to its tree so we can
+                * reset tmp_alone_branch to the beginning of the
+                * list.
+                */
+               rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+               return true;
+       }
 
-               cfs_rq->on_list = 1;
+       if (!cfs_rq->tg->parent) {
+               /*
+                * A cfs_rq without a parent should be put
+                * at the tail of the list.
+                */
+               list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+                       &rq->leaf_cfs_rq_list);
+               /*
+                * We have reached the top of a tree, so we can reset
+                * tmp_alone_branch to the beginning of the list.
+                */
+               rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+               return true;
        }
+
+       /*
+        * The parent has not been added yet, so we want to
+        * make sure that it will be put after us.
+        * tmp_alone_branch points to the beginning of the branch
+        * where we will add the parent.
+        */
+       list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
+       /*
+        * Update tmp_alone_branch to point to the new beginning
+        * of the branch.
+        */
+       rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
+       return false;
 }
 
 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
        if (cfs_rq->on_list) {
+               struct rq *rq = rq_of(cfs_rq);
+
+               /*
+                * With a cfs_rq being unthrottled/throttled during an enqueue,
+                * it can happen that tmp_alone_branch points to a leaf that
+                * we are about to delete. In that case, move tmp_alone_branch
+                * to the previous element; it will point back to
+                * rq->leaf_cfs_rq_list by the end of the enqueue.
+                */
+               if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
+                       rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
+
                list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
                cfs_rq->on_list = 0;
        }
 }
 
-/* Iterate through all leaf cfs_rq's on a runqueue: */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-       list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+static inline void assert_list_leaf_cfs_rq(struct rq *rq)
+{
+       SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
+}
+
+/* Iterate through all leaf cfs_rq's on a runqueue */
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)                     \
+       list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,    \
+                                leaf_cfs_rq_list)
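
As an aside, the move from for_each_leaf_cfs_rq() to a "_safe" variant matters
because later hunks delete entries (list_del_leaf_cfs_rq()) while walking the
list. A minimal sketch of the underlying list_for_each_entry_safe() idiom; the
struct and function names below are illustrative, not part of this patch:

#include <linux/list.h>
#include <linux/slab.h>

struct item {
        struct list_head node;
        bool decayed;
};

/* Unlinking/freeing 'it' is safe: 'tmp' already caches the next entry. */
static void prune_decayed(struct list_head *head)
{
        struct item *it, *tmp;

        list_for_each_entry_safe(it, tmp, head, node) {
                if (it->decayed) {
                        list_del(&it->node);
                        kfree(it);
                }
        }
}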
 
 /* Do the two (enqueued) entities belong to the same group ? */
 static inline struct cfs_rq *
@@ -410,12 +427,6 @@ static inline struct task_struct *task_of(struct sched_entity *se)
        return container_of(se, struct task_struct, se);
 }
 
-static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
-{
-       return container_of(cfs_rq, struct rq, cfs);
-}
-
-
 #define for_each_sched_entity(se) \
                for (; se; se = NULL)
 
@@ -438,16 +449,21 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
        return NULL;
 }
 
-static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
+       return true;
 }
 
 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 }
 
-#define for_each_leaf_cfs_rq(rq, cfs_rq)       \
-               for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+static inline void assert_list_leaf_cfs_rq(struct rq *rq)
+{
+}
+
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)     \
+               for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 {
@@ -686,9 +702,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
        return calc_delta_fair(sched_slice(cfs_rq, se), se);
 }
 
-#ifdef CONFIG_SMP
 #include "pelt.h"
-#include "sched-pelt.h"
+#ifdef CONFIG_SMP
 
 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
 static unsigned long task_h_load(struct task_struct *p);
@@ -744,8 +759,9 @@ static void attach_entity_cfs_rq(struct sched_entity *se);
  * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
  * if util_avg > util_avg_cap.
  */
-void post_init_entity_util_avg(struct sched_entity *se)
+void post_init_entity_util_avg(struct task_struct *p)
 {
+       struct sched_entity *se = &p->se;
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
        struct sched_avg *sa = &se->avg;
        long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
@@ -763,22 +779,19 @@ void post_init_entity_util_avg(struct sched_entity *se)
                }
        }
 
-       if (entity_is_task(se)) {
-               struct task_struct *p = task_of(se);
-               if (p->sched_class != &fair_sched_class) {
-                       /*
-                        * For !fair tasks do:
-                        *
-                       update_cfs_rq_load_avg(now, cfs_rq);
-                       attach_entity_load_avg(cfs_rq, se, 0);
-                       switched_from_fair(rq, p);
-                        *
-                        * such that the next switched_to_fair() has the
-                        * expected state.
-                        */
-                       se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
-                       return;
-               }
+       if (p->sched_class != &fair_sched_class) {
+               /*
+                * For !fair tasks do:
+                *
+               update_cfs_rq_load_avg(now, cfs_rq);
+               attach_entity_load_avg(cfs_rq, se, 0);
+               switched_from_fair(rq, p);
+                *
+                * such that the next switched_to_fair() has the
+                * expected state.
+                */
+               se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
+               return;
        }
 
        attach_entity_cfs_rq(se);
@@ -788,7 +801,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
 void init_entity_runnable_average(struct sched_entity *se)
 {
 }
-void post_init_entity_util_avg(struct sched_entity *se)
+void post_init_entity_util_avg(struct task_struct *p)
 {
 }
 static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
@@ -1035,7 +1048,7 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
 unsigned int sysctl_numa_balancing_scan_delay = 1000;
 
 struct numa_group {
-       atomic_t refcount;
+       refcount_t refcount;
 
        spinlock_t lock; /* nr_tasks, tasks */
        int nr_tasks;
@@ -1104,7 +1117,7 @@ static unsigned int task_scan_start(struct task_struct *p)
                unsigned long shared = group_faults_shared(ng);
                unsigned long private = group_faults_priv(ng);
 
-               period *= atomic_read(&ng->refcount);
+               period *= refcount_read(&ng->refcount);
                period *= shared + 1;
                period /= private + shared + 1;
        }
@@ -1127,7 +1140,7 @@ static unsigned int task_scan_max(struct task_struct *p)
                unsigned long private = group_faults_priv(ng);
                unsigned long period = smax;
 
-               period *= atomic_read(&ng->refcount);
+               period *= refcount_read(&ng->refcount);
                period *= shared + 1;
                period /= private + shared + 1;
 
@@ -2203,12 +2216,12 @@ static void task_numa_placement(struct task_struct *p)
 
 static inline int get_numa_group(struct numa_group *grp)
 {
-       return atomic_inc_not_zero(&grp->refcount);
+       return refcount_inc_not_zero(&grp->refcount);
 }
 
 static inline void put_numa_group(struct numa_group *grp)
 {
-       if (atomic_dec_and_test(&grp->refcount))
+       if (refcount_dec_and_test(&grp->refcount))
                kfree_rcu(grp, rcu);
 }
 
@@ -2229,7 +2242,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
                if (!grp)
                        return;
 
-               atomic_set(&grp->refcount, 1);
+               refcount_set(&grp->refcount, 1);
                grp->active_nodes = 1;
                grp->max_faults_cpu = 0;
                spin_lock_init(&grp->lock);
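
For reference, a minimal sketch of the get/put pattern the atomic_t ->
refcount_t conversion above follows, using the standard <linux/refcount.h>
helpers; refcount_t additionally saturates and warns on overflow/underflow,
which plain atomic_t does not. The object and function names here are
illustrative, not from this patch:

#include <linux/refcount.h>
#include <linux/slab.h>

struct obj {
        refcount_t refcount;
};

static struct obj *obj_create(void)
{
        struct obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

        if (o)
                refcount_set(&o->refcount, 1);  /* initial reference */
        return o;
}

/* Returns false if the object is already on its way to being freed. */
static bool obj_get(struct obj *o)
{
        return refcount_inc_not_zero(&o->refcount);
}

static void obj_put(struct obj *o)
{
        if (refcount_dec_and_test(&o->refcount))
                kfree(o);
}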
@@ -3122,7 +3135,7 @@ void set_task_rq_fair(struct sched_entity *se,
        p_last_update_time = prev->avg.last_update_time;
        n_last_update_time = next->avg.last_update_time;
 #endif
-       __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
+       __update_load_avg_blocked_se(p_last_update_time, se);
        se->avg.last_update_time = n_last_update_time;
 }
 
@@ -3257,11 +3270,11 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
 
        /*
         * runnable_sum can't be lower than running_sum
-        * As running sum is scale with CPU capacity wehreas the runnable sum
-        * is not we rescale running_sum 1st
+        * Rescale running sum to be in the same range as runnable sum
+        * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT]
+        * runnable_sum is in [0 : LOAD_AVG_MAX]
         */
-       running_sum = se->avg.util_sum /
-               arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
+       running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
        runnable_sum = max(runnable_sum, running_sum);
 
        load_sum = (s64)se_weight(se) * runnable_sum;
@@ -3364,7 +3377,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
 
 /**
  * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
- * @now: current time, as per cfs_rq_clock_task()
+ * @now: current time, as per cfs_rq_clock_pelt()
  * @cfs_rq: cfs_rq to update
  *
  * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
@@ -3409,7 +3422,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
                decayed = 1;
        }
 
-       decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
+       decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
 
 #ifndef CONFIG_64BIT
        smp_wmb();
@@ -3499,9 +3512,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 /* Update task and its cfs_rq load average */
 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
-       u64 now = cfs_rq_clock_task(cfs_rq);
-       struct rq *rq = rq_of(cfs_rq);
-       int cpu = cpu_of(rq);
+       u64 now = cfs_rq_clock_pelt(cfs_rq);
        int decayed;
 
        /*
@@ -3509,7 +3520,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
         * track group sched_entity load average for task_h_load calc in migration
         */
        if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
-               __update_load_avg_se(now, cpu, cfs_rq, se);
+               __update_load_avg_se(now, cfs_rq, se);
 
        decayed  = update_cfs_rq_load_avg(now, cfs_rq);
        decayed |= propagate_entity_load_avg(se);
@@ -3561,7 +3572,7 @@ void sync_entity_load_avg(struct sched_entity *se)
        u64 last_update_time;
 
        last_update_time = cfs_rq_last_update_time(cfs_rq);
-       __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
+       __update_load_avg_blocked_se(last_update_time, se);
 }
 
 /*
@@ -3577,10 +3588,6 @@ void remove_entity_load_avg(struct sched_entity *se)
         * tasks cannot exit without having gone through wake_up_new_task() ->
         * post_init_entity_util_avg() which will have added things to the
         * cfs_rq, so we can remove unconditionally.
-        *
-        * Similarly for groups, they will have passed through
-        * post_init_entity_util_avg() before unregister_sched_fair_group()
-        * calls this.
         */
 
        sync_entity_load_avg(se);
@@ -3654,6 +3661,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
 {
        long last_ewma_diff;
        struct util_est ue;
+       int cpu;
 
        if (!sched_feat(UTIL_EST))
                return;
@@ -3687,6 +3695,14 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
        if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
                return;
 
+       /*
+        * To avoid overestimating the actual task utilization, skip the
+        * update if we cannot guarantee that there is idle time on this CPU.
+        */
+       cpu = cpu_of(rq_of(cfs_rq));
+       if (task_util(p) > capacity_orig_of(cpu))
+               return;
+
        /*
         * Update Task's estimated utilization
         *
@@ -4429,6 +4445,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
                /* adjust cfs_rq_clock_task() */
                cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
                                             cfs_rq->throttled_clock_task;
+
+               /* Add a cfs_rq with an already running entity back to the list */
+               if (cfs_rq->nr_running >= 1)
+                       list_add_leaf_cfs_rq(cfs_rq);
        }
 
        return 0;
@@ -4440,8 +4460,10 @@ static int tg_throttle_down(struct task_group *tg, void *data)
        struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
        /* group is entering throttled state, stop time */
-       if (!cfs_rq->throttle_count)
+       if (!cfs_rq->throttle_count) {
                cfs_rq->throttled_clock_task = rq_clock_task(rq);
+               list_del_leaf_cfs_rq(cfs_rq);
+       }
        cfs_rq->throttle_count++;
 
        return 0;
@@ -4544,6 +4566,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
                        break;
        }
 
+       assert_list_leaf_cfs_rq(rq);
+
        if (!se)
                add_nr_running(rq, task_delta);
 
@@ -4565,7 +4589,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
                struct rq *rq = rq_of(cfs_rq);
                struct rq_flags rf;
 
-               rq_lock(rq, &rf);
+               rq_lock_irqsave(rq, &rf);
                if (!cfs_rq_throttled(cfs_rq))
                        goto next;
 
@@ -4582,7 +4606,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
                        unthrottle_cfs_rq(cfs_rq);
 
 next:
-               rq_unlock(rq, &rf);
+               rq_unlock_irqrestore(rq, &rf);
 
                if (!remaining)
                        break;
@@ -4598,7 +4622,7 @@ next:
  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
  * used to track this state.
  */
-static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
 {
        u64 runtime, runtime_expires;
        int throttled;
@@ -4640,11 +4664,11 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
        while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
                runtime = cfs_b->runtime;
                cfs_b->distribute_running = 1;
-               raw_spin_unlock(&cfs_b->lock);
+               raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
                /* we can't nest cfs_b->lock while distributing bandwidth */
                runtime = distribute_cfs_runtime(cfs_b, runtime,
                                                 runtime_expires);
-               raw_spin_lock(&cfs_b->lock);
+               raw_spin_lock_irqsave(&cfs_b->lock, flags);
 
                cfs_b->distribute_running = 0;
                throttled = !list_empty(&cfs_b->throttled_cfs_rq);
@@ -4753,17 +4777,18 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 {
        u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+       unsigned long flags;
        u64 expires;
 
        /* confirm we're still not at a refresh boundary */
-       raw_spin_lock(&cfs_b->lock);
+       raw_spin_lock_irqsave(&cfs_b->lock, flags);
        if (cfs_b->distribute_running) {
-               raw_spin_unlock(&cfs_b->lock);
+               raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
                return;
        }
 
        if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
-               raw_spin_unlock(&cfs_b->lock);
+               raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
                return;
        }
 
@@ -4774,18 +4799,18 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
        if (runtime)
                cfs_b->distribute_running = 1;
 
-       raw_spin_unlock(&cfs_b->lock);
+       raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
 
        if (!runtime)
                return;
 
        runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
 
-       raw_spin_lock(&cfs_b->lock);
+       raw_spin_lock_irqsave(&cfs_b->lock, flags);
        if (expires == cfs_b->runtime_expires)
                lsub_positive(&cfs_b->runtime, runtime);
        cfs_b->distribute_running = 0;
-       raw_spin_unlock(&cfs_b->lock);
+       raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
 }
 
 /*
@@ -4863,20 +4888,21 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
 {
        struct cfs_bandwidth *cfs_b =
                container_of(timer, struct cfs_bandwidth, period_timer);
+       unsigned long flags;
        int overrun;
        int idle = 0;
 
-       raw_spin_lock(&cfs_b->lock);
+       raw_spin_lock_irqsave(&cfs_b->lock, flags);
        for (;;) {
                overrun = hrtimer_forward_now(timer, cfs_b->period);
                if (!overrun)
                        break;
 
-               idle = do_sched_cfs_period_timer(cfs_b, overrun);
+               idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
        }
        if (idle)
                cfs_b->period_active = 0;
-       raw_spin_unlock(&cfs_b->lock);
+       raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
 
        return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
 }
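
A minimal sketch of the irqsave locking pattern adopted in the bandwidth code
above: the saved IRQ flags are handed down so a callee can temporarily drop
the lock with the matching irqrestore, mirroring how do_sched_cfs_period_timer()
now receives 'flags'. The function names below are illustrative only:

#include <linux/spinlock.h>

static void work_outside_lock(raw_spinlock_t *lock, unsigned long flags)
{
        raw_spin_unlock_irqrestore(lock, flags);
        /* ... work that must not run under 'lock' ... */
        raw_spin_lock_irqsave(lock, flags);
}

static void timer_like_path(raw_spinlock_t *lock)
{
        unsigned long flags;

        raw_spin_lock_irqsave(lock, flags);
        work_outside_lock(lock, flags);
        raw_spin_unlock_irqrestore(lock, flags);
}

Unlike a plain raw_spin_lock()/raw_spin_unlock() pair, the _irqsave/_irqrestore
variants preserve the caller's IRQ state, which appears to be the point of this
conversion: the same code can be reached from contexts with different IRQ
configurations.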
@@ -4986,6 +5012,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 }
 
 #else /* CONFIG_CFS_BANDWIDTH */
+
+static inline bool cfs_bandwidth_used(void)
+{
+       return false;
+}
+
 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 {
        return rq_clock_task(rq_of(cfs_rq));
@@ -5177,6 +5209,23 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
        }
 
+       if (cfs_bandwidth_used()) {
+               /*
+                * When bandwidth control is enabled, the cfs_rq_throttled()
+                * breaks in the above iteration can result in incomplete
+                * leaf list maintenance, which would then trigger the
+                * assertion below.
+                */
+               for_each_sched_entity(se) {
+                       cfs_rq = cfs_rq_of(se);
+
+                       if (list_add_leaf_cfs_rq(cfs_rq))
+                               break;
+               }
+       }
+
+       assert_list_leaf_cfs_rq(rq);
+
        hrtick_update(rq);
 }
 
@@ -5556,11 +5605,6 @@ static unsigned long capacity_of(int cpu)
        return cpu_rq(cpu)->cpu_capacity;
 }
 
-static unsigned long capacity_orig_of(int cpu)
-{
-       return cpu_rq(cpu)->cpu_capacity_orig;
-}
-
 static unsigned long cpu_avg_load_per_task(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
@@ -6053,7 +6097,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
                bool idle = true;
 
                for_each_cpu(cpu, cpu_smt_mask(core)) {
-                       cpumask_clear_cpu(cpu, cpus);
+                       __cpumask_clear_cpu(cpu, cpus);
                        if (!available_idle_cpu(cpu))
                                idle = false;
                }
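
The cpumask_clear_cpu() -> __cpumask_clear_cpu() changes in this file swap an
atomic bit operation for the non-atomic one, which is sufficient when the mask
being modified is not shared with concurrent writers (here, a per-CPU scratch
mask). An illustrative sketch with hypothetical names:

#include <linux/cpumask.h>
#include <linux/gfp.h>

static int pick_other_cpu(const struct cpumask *candidates, int skip)
{
        cpumask_var_t scratch;  /* local scratch mask, not shared */
        unsigned int cpu;

        if (!zalloc_cpumask_var(&scratch, GFP_KERNEL))
                return -ENOMEM;

        cpumask_copy(scratch, candidates);
        /* Only this function touches 'scratch', so no atomics are needed. */
        __cpumask_clear_cpu(skip, scratch);

        cpu = cpumask_first(scratch);
        free_cpumask_var(scratch);

        return cpu < nr_cpu_ids ? (int)cpu : -1;
}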
@@ -6073,7 +6117,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
 /*
  * Scan the local SMT mask for idle CPUs.
  */
-static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_smt(struct task_struct *p, int target)
 {
        int cpu;
 
@@ -6097,7 +6141,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s
        return -1;
 }
 
-static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+static inline int select_idle_smt(struct task_struct *p, int target)
 {
        return -1;
 }
@@ -6202,7 +6246,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
        if ((unsigned)i < nr_cpumask_bits)
                return i;
 
-       i = select_idle_smt(p, sd, target);
+       i = select_idle_smt(p, target);
        if ((unsigned)i < nr_cpumask_bits)
                return i;
 
@@ -6608,7 +6652,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
        if (sd_flag & SD_BALANCE_WAKE) {
                record_wakee(p);
 
-               if (static_branch_unlikely(&sched_energy_present)) {
+               if (sched_energy_enabled()) {
                        new_cpu = find_energy_efficient_cpu(p, prev_cpu);
                        if (new_cpu >= 0)
                                return new_cpu;
@@ -7027,6 +7071,12 @@ idle:
        if (new_tasks > 0)
                goto again;
 
+       /*
+        * The rq is about to go idle; check whether we need to update the
+        * lost_idle_time of clock_pelt.
+        */
+       update_idle_rq_clock_pelt(rq);
+
        return NULL;
 }
 
@@ -7647,10 +7697,27 @@ static inline bool others_have_blocked(struct rq *rq)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+       if (cfs_rq->load.weight)
+               return false;
+
+       if (cfs_rq->avg.load_sum)
+               return false;
+
+       if (cfs_rq->avg.util_sum)
+               return false;
+
+       if (cfs_rq->avg.runnable_load_sum)
+               return false;
+
+       return true;
+}
+
 static void update_blocked_averages(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
-       struct cfs_rq *cfs_rq;
+       struct cfs_rq *cfs_rq, *pos;
        const struct sched_class *curr_class;
        struct rq_flags rf;
        bool done = true;
@@ -7662,14 +7729,10 @@ static void update_blocked_averages(int cpu)
         * Iterates the task_group tree in a bottom up fashion, see
         * list_add_leaf_cfs_rq() for details.
         */
-       for_each_leaf_cfs_rq(rq, cfs_rq) {
+       for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
                struct sched_entity *se;
 
-               /* throttled entities do not contribute to load */
-               if (throttled_hierarchy(cfs_rq))
-                       continue;
-
-               if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+               if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
                        update_tg_load_avg(cfs_rq, 0);
 
                /* Propagate pending load changes to the parent, if any: */
@@ -7677,14 +7740,21 @@ static void update_blocked_averages(int cpu)
                if (se && !skip_blocked_update(se))
                        update_load_avg(cfs_rq_of(se), se, 0);
 
+               /*
+                * There can be a lot of idle CPU cgroups.  Don't let fully
+                * decayed cfs_rqs linger on the list.
+                */
+               if (cfs_rq_is_decayed(cfs_rq))
+                       list_del_leaf_cfs_rq(cfs_rq);
+
                /* Don't need periodic decay once load/util_avg are null */
                if (cfs_rq_has_blocked(cfs_rq))
                        done = false;
        }
 
        curr_class = rq->curr->sched_class;
-       update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
-       update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
+       update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
+       update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
        update_irq_load_avg(rq, 0);
        /* Don't need periodic decay once load/util_avg are null */
        if (others_have_blocked(rq))
@@ -7754,11 +7824,11 @@ static inline void update_blocked_averages(int cpu)
 
        rq_lock_irqsave(rq, &rf);
        update_rq_clock(rq);
-       update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+       update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
 
        curr_class = rq->curr->sched_class;
-       update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
-       update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
+       update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
+       update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
        update_irq_load_avg(rq, 0);
 #ifdef CONFIG_NO_HZ_COMMON
        rq->last_blocked_load_update_tick = jiffies;
@@ -7988,6 +8058,18 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
                                (rq->cpu_capacity_orig * 100));
 }
 
+/*
+ * Check whether an rq has a misfit task and, if it does, whether it looks like
+ * we can actually help that task: either we can migrate the task to a CPU of
+ * higher capacity, or the task's current CPU is heavily pressured.
+ */
+static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
+{
+       return rq->misfit_task_load &&
+               (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
+                check_cpu_capacity(rq, sd));
+}
+
 /*
  * Group imbalance indicates (and tries to solve) the problem where balancing
  * groups is inadequate due to ->cpus_allowed constraints.
@@ -8452,9 +8534,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
        if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
                return 0;
 
-       env->imbalance = DIV_ROUND_CLOSEST(
-               sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
-               SCHED_CAPACITY_SCALE);
+       env->imbalance = sds->busiest_stat.group_load;
 
        return 1;
 }
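
Why dropping the avg_load round-trip does not change the intended value:
assuming avg_load is maintained as group_load * SCHED_CAPACITY_SCALE /
group_capacity (which is how update_sg_lb_stats() computes it elsewhere in
this file), the removed expression reduces to group_load:

        avg_load * group_capacity / SCHED_CAPACITY_SCALE
          = (group_load * SCHED_CAPACITY_SCALE / group_capacity)
                * group_capacity / SCHED_CAPACITY_SCALE
          = group_load                    (up to integer rounding)

Using group_load directly simply avoids the intermediate divisions and the
rounding error they introduce.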
@@ -8636,7 +8716,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
         */
        update_sd_lb_stats(env, &sds);
 
-       if (static_branch_unlikely(&sched_energy_present)) {
+       if (sched_energy_enabled()) {
                struct root_domain *rd = env->dst_rq->rd;
 
                if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
@@ -8827,21 +8907,25 @@ static struct rq *find_busiest_queue(struct lb_env *env,
  */
 #define MAX_PINNED_INTERVAL    512
 
-static int need_active_balance(struct lb_env *env)
+static inline bool
+asym_active_balance(struct lb_env *env)
 {
-       struct sched_domain *sd = env->sd;
+       /*
+        * ASYM_PACKING needs to force migrate tasks from busy but
+        * lower priority CPUs in order to pack all tasks in the
+        * highest priority CPUs.
+        */
+       return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
+              sched_asym_prefer(env->dst_cpu, env->src_cpu);
+}
 
-       if (env->idle == CPU_NEWLY_IDLE) {
+static inline bool
+voluntary_active_balance(struct lb_env *env)
+{
+       struct sched_domain *sd = env->sd;
 
-               /*
-                * ASYM_PACKING needs to force migrate tasks from busy but
-                * lower priority CPUs in order to pack all tasks in the
-                * highest priority CPUs.
-                */
-               if ((sd->flags & SD_ASYM_PACKING) &&
-                   sched_asym_prefer(env->dst_cpu, env->src_cpu))
-                       return 1;
-       }
+       if (asym_active_balance(env))
+               return 1;
 
        /*
         * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
@@ -8859,6 +8943,16 @@ static int need_active_balance(struct lb_env *env)
        if (env->src_grp_type == group_misfit_task)
                return 1;
 
+       return 0;
+}
+
+static int need_active_balance(struct lb_env *env)
+{
+       struct sched_domain *sd = env->sd;
+
+       if (voluntary_active_balance(env))
+               return 1;
+
        return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
 }
 
@@ -9023,7 +9117,7 @@ more_balance:
                if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
 
                        /* Prevent to re-select dst_cpu via env's CPUs */
-                       cpumask_clear_cpu(env.dst_cpu, env.cpus);
+                       __cpumask_clear_cpu(env.dst_cpu, env.cpus);
 
                        env.dst_rq       = cpu_rq(env.new_dst_cpu);
                        env.dst_cpu      = env.new_dst_cpu;
@@ -9050,7 +9144,7 @@ more_balance:
 
                /* All tasks on this runqueue were pinned by CPU affinity */
                if (unlikely(env.flags & LBF_ALL_PINNED)) {
-                       cpumask_clear_cpu(cpu_of(busiest), cpus);
+                       __cpumask_clear_cpu(cpu_of(busiest), cpus);
                        /*
                         * Attempting to continue load balancing at the current
                         * sched_domain level only makes sense if there are
@@ -9120,7 +9214,7 @@ more_balance:
        } else
                sd->nr_balance_failed = 0;
 
-       if (likely(!active_balance)) {
+       if (likely(!active_balance) || voluntary_active_balance(&env)) {
                /* We were unbalanced, so reset the balancing interval */
                sd->balance_interval = sd->min_interval;
        } else {
@@ -9469,15 +9563,8 @@ static void kick_ilb(unsigned int flags)
 }
 
 /*
- * Current heuristic for kicking the idle load balancer in the presence
- * of an idle cpu in the system.
- *   - This rq has more than one task.
- *   - This rq has at least one CFS task and the capacity of the CPU is
- *     significantly reduced because of RT tasks or IRQs.
- *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
- *     multiple busy cpu.
- *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
- *     domain span are idle.
+ * Current decision point for kicking the idle load balancer in the presence
+ * of idle CPUs in the system.
  */
 static void nohz_balancer_kick(struct rq *rq)
 {
@@ -9510,7 +9597,7 @@ static void nohz_balancer_kick(struct rq *rq)
        if (time_before(now, nohz.next_balance))
                goto out;
 
-       if (rq->nr_running >= 2 || rq->misfit_task_load) {
+       if (rq->nr_running >= 2) {
                flags = NOHZ_KICK_MASK;
                goto out;
        }
@@ -9519,8 +9606,13 @@ static void nohz_balancer_kick(struct rq *rq)
        sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
        if (sds) {
                /*
-                * XXX: write a coherent comment on why we do this.
-                * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
+                * If there is an imbalance between LLC domains (IOW we could
+                * increase the overall cache use), we need some less-loaded LLC
+                * domain to pull some load. Likewise, we may need to spread
+                * load within the current LLC domain (e.g. packed SMT cores but
+                * other CPUs are idle). We can't really know from here how busy
+                * the others are - so just get a nohz balance going if it looks
+                * like this LLC domain has tasks we could move.
                 */
                nr_busy = atomic_read(&sds->nr_busy_cpus);
                if (nr_busy > 1) {
@@ -9532,20 +9624,37 @@ static void nohz_balancer_kick(struct rq *rq)
 
        sd = rcu_dereference(rq->sd);
        if (sd) {
-               if ((rq->cfs.h_nr_running >= 1) &&
-                               check_cpu_capacity(rq, sd)) {
+               /*
+                * If there's a CFS task and the current CPU has reduced
+                * capacity, kick the ILB to see if there's a better CPU to run
+                * on.
+                */
+               if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
                        flags = NOHZ_KICK_MASK;
                        goto unlock;
                }
        }
 
-       sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
+       sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
        if (sd) {
-               for_each_cpu(i, sched_domain_span(sd)) {
-                       if (i == cpu ||
-                           !cpumask_test_cpu(i, nohz.idle_cpus_mask))
-                               continue;
+               /*
+                * When ASYM_CPUCAPACITY is set, see if there's a higher
+                * capacity CPU to run the misfit task on.
+                */
+               if (check_misfit_status(rq, sd)) {
+                       flags = NOHZ_KICK_MASK;
+                       goto unlock;
+               }
+       }
 
+       sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
+       if (sd) {
+               /*
+                * When ASYM_PACKING is set, see if a more preferred CPU is
+                * currently idle; if so, kick the ILB to move tasks around.
+                */
+               for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
                        if (sched_asym_prefer(i, cpu)) {
                                flags = NOHZ_KICK_MASK;
                                goto unlock;
@@ -10546,10 +10655,10 @@ const struct sched_class fair_sched_class = {
 #ifdef CONFIG_SCHED_DEBUG
 void print_cfs_stats(struct seq_file *m, int cpu)
 {
-       struct cfs_rq *cfs_rq;
+       struct cfs_rq *cfs_rq, *pos;
 
        rcu_read_lock();
-       for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+       for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
                print_cfs_rq(m, cpu, cfs_rq);
        rcu_read_unlock();
 }