sched/fair: Tune down misfit NOHZ kicks
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d6a536dec0ca678b0f4e1f8c8a5e9ebf1853872a..f0d2f8a352bf19360d7e5860e2124972e1d73cd7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -275,13 +275,13 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
        return grp->my_q;
 }
 
-static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
        struct rq *rq = rq_of(cfs_rq);
        int cpu = cpu_of(rq);
 
        if (cfs_rq->on_list)
-               return;
+               return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
 
        cfs_rq->on_list = 1;
 
@@ -310,7 +310,7 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
                 * list.
                 */
                rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
-               return;
+               return true;
        }
 
        if (!cfs_rq->tg->parent) {
@@ -325,7 +325,7 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
                 * tmp_alone_branch to the beginning of the list.
                 */
                rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
-               return;
+               return true;
        }
 
        /*
@@ -340,11 +340,24 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
         * of the branch
         */
        rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
+       return false;
 }
 
 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
        if (cfs_rq->on_list) {
+               struct rq *rq = rq_of(cfs_rq);
+
+               /*
+                * With cfs_rq being unthrottled/throttled during an enqueue,
+                * it can happen that tmp_alone_branch points to the leaf we
+                * finally want to delete. In this case, tmp_alone_branch is
+                * moved to the prev element, but it will point back to
+                * rq->leaf_cfs_rq_list at the end of the enqueue.
+                */
+               if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
+                       rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
+
                list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
                cfs_rq->on_list = 0;
        }
@@ -355,9 +368,10 @@ static inline void assert_list_leaf_cfs_rq(struct rq *rq)
        SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
 }
 
-/* Iterate through all cfs_rq's on a runqueue in bottom-up order */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-       list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+/* Iterate through all leaf cfs_rq's on a runqueue */
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)                     \
+       list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,    \
+                                leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
 static inline struct cfs_rq *
@@ -435,8 +449,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
        return NULL;
 }
 
-static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
+       return true;
 }
 
 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
@@ -447,8 +462,8 @@ static inline void assert_list_leaf_cfs_rq(struct rq *rq)
 {
 }
 
-#define for_each_leaf_cfs_rq(rq, cfs_rq)       \
-               for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)     \
+               for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 {
@@ -744,8 +759,9 @@ static void attach_entity_cfs_rq(struct sched_entity *se);
  * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
  * if util_avg > util_avg_cap.
  */
-void post_init_entity_util_avg(struct sched_entity *se)
+void post_init_entity_util_avg(struct task_struct *p)
 {
+       struct sched_entity *se = &p->se;
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
        struct sched_avg *sa = &se->avg;
        long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
@@ -763,22 +779,19 @@ void post_init_entity_util_avg(struct sched_entity *se)
                }
        }
 
-       if (entity_is_task(se)) {
-               struct task_struct *p = task_of(se);
-               if (p->sched_class != &fair_sched_class) {
-                       /*
-                        * For !fair tasks do:
-                        *
-                       update_cfs_rq_load_avg(now, cfs_rq);
-                       attach_entity_load_avg(cfs_rq, se, 0);
-                       switched_from_fair(rq, p);
-                        *
-                        * such that the next switched_to_fair() has the
-                        * expected state.
-                        */
-                       se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
-                       return;
-               }
+       if (p->sched_class != &fair_sched_class) {
+               /*
+                * For !fair tasks do:
+                *
+               update_cfs_rq_load_avg(now, cfs_rq);
+               attach_entity_load_avg(cfs_rq, se, 0);
+               switched_from_fair(rq, p);
+                *
+                * such that the next switched_to_fair() has the
+                * expected state.
+                */
+               se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
+               return;
        }
 
        attach_entity_cfs_rq(se);
@@ -788,7 +801,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
 void init_entity_runnable_average(struct sched_entity *se)
 {
 }
-void post_init_entity_util_avg(struct sched_entity *se)
+void post_init_entity_util_avg(struct task_struct *p)
 {
 }
 static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
@@ -3575,10 +3588,6 @@ void remove_entity_load_avg(struct sched_entity *se)
         * tasks cannot exit without having gone through wake_up_new_task() ->
         * post_init_entity_util_avg() which will have added things to the
         * cfs_rq, so we can remove unconditionally.
-        *
-        * Similarly for groups, they will have passed through
-        * post_init_entity_util_avg() before unregister_sched_fair_group()
-        * calls this.
         */
 
        sync_entity_load_avg(se);
@@ -4436,6 +4445,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
                /* adjust cfs_rq_clock_task() */
                cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
                                             cfs_rq->throttled_clock_task;
+
+               /* Add cfs_rq with an already running entity to the list */
+               if (cfs_rq->nr_running >= 1)
+                       list_add_leaf_cfs_rq(cfs_rq);
        }
 
        return 0;
@@ -4447,8 +4460,10 @@ static int tg_throttle_down(struct task_group *tg, void *data)
        struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
        /* group is entering throttled state, stop time */
-       if (!cfs_rq->throttle_count)
+       if (!cfs_rq->throttle_count) {
                cfs_rq->throttled_clock_task = rq_clock_task(rq);
+               list_del_leaf_cfs_rq(cfs_rq);
+       }
        cfs_rq->throttle_count++;
 
        return 0;
@@ -4551,6 +4566,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
                        break;
        }
 
+       assert_list_leaf_cfs_rq(rq);
+
        if (!se)
                add_nr_running(rq, task_delta);
 
@@ -4995,6 +5012,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 }
 
 #else /* CONFIG_CFS_BANDWIDTH */
+
+static inline bool cfs_bandwidth_used(void)
+{
+       return false;
+}
+
 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 {
        return rq_clock_task(rq_of(cfs_rq));
@@ -5186,6 +5209,21 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
        }
 
+       if (cfs_bandwidth_used()) {
+               /*
+                * When bandwidth control is enabled, the cfs_rq_throttled()
+                * breaks in the above iteration can result in incomplete
+                * leaf list maintenance, which then triggers the assertion
+                * below.
+                */
+               for_each_sched_entity(se) {
+                       cfs_rq = cfs_rq_of(se);
+
+                       if (list_add_leaf_cfs_rq(cfs_rq))
+                               break;
+               }
+       }
+
        assert_list_leaf_cfs_rq(rq);
 
        hrtick_update(rq);
@@ -5986,6 +6024,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 
 #ifdef CONFIG_SCHED_SMT
 DEFINE_STATIC_KEY_FALSE(sched_smt_present);
+EXPORT_SYMBOL_GPL(sched_smt_present);
 
 static inline void set_idle_cores(int cpu, int val)
 {
@@ -6058,7 +6097,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
                bool idle = true;
 
                for_each_cpu(cpu, cpu_smt_mask(core)) {
-                       cpumask_clear_cpu(cpu, cpus);
+                       __cpumask_clear_cpu(cpu, cpus);
                        if (!available_idle_cpu(cpu))
                                idle = false;
                }
@@ -6078,7 +6117,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
 /*
  * Scan the local SMT mask for idle CPUs.
  */
-static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_smt(struct task_struct *p, int target)
 {
        int cpu;
 
@@ -6102,7 +6141,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s
        return -1;
 }
 
-static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+static inline int select_idle_smt(struct task_struct *p, int target)
 {
        return -1;
 }
@@ -6207,7 +6246,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
        if ((unsigned)i < nr_cpumask_bits)
                return i;
 
-       i = select_idle_smt(p, sd, target);
+       i = select_idle_smt(p, target);
        if ((unsigned)i < nr_cpumask_bits)
                return i;
 
@@ -7658,10 +7697,27 @@ static inline bool others_have_blocked(struct rq *rq)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+       if (cfs_rq->load.weight)
+               return false;
+
+       if (cfs_rq->avg.load_sum)
+               return false;
+
+       if (cfs_rq->avg.util_sum)
+               return false;
+
+       if (cfs_rq->avg.runnable_load_sum)
+               return false;
+
+       return true;
+}
+
 static void update_blocked_averages(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
-       struct cfs_rq *cfs_rq;
+       struct cfs_rq *cfs_rq, *pos;
        const struct sched_class *curr_class;
        struct rq_flags rf;
        bool done = true;
@@ -7673,13 +7729,9 @@ static void update_blocked_averages(int cpu)
         * Iterates the task_group tree in a bottom up fashion, see
         * list_add_leaf_cfs_rq() for details.
         */
-       for_each_leaf_cfs_rq(rq, cfs_rq) {
+       for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
                struct sched_entity *se;
 
-               /* throttled entities do not contribute to load */
-               if (throttled_hierarchy(cfs_rq))
-                       continue;
-
                if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
                        update_tg_load_avg(cfs_rq, 0);
 
@@ -7688,6 +7740,13 @@ static void update_blocked_averages(int cpu)
                if (se && !skip_blocked_update(se))
                        update_load_avg(cfs_rq_of(se), se, 0);
 
+               /*
+                * There can be a lot of idle CPU cgroups.  Don't let fully
+                * decayed cfs_rqs linger on the list.
+                */
+               if (cfs_rq_is_decayed(cfs_rq))
+                       list_del_leaf_cfs_rq(cfs_rq);
+
                /* Don't need periodic decay once load/util_avg are null */
                if (cfs_rq_has_blocked(cfs_rq))
                        done = false;
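
The loop above now deletes entries (fully decayed cfs_rqs) from the very list it is walking, which is why the plain for_each_leaf_cfs_rq() iterator is replaced by a _safe variant that caches the next element before the body runs. Below is a minimal, self-contained userspace sketch of that delete-while-iterating pattern; the toy_* types and the "load == 0 means fully decayed" shortcut are illustrative only, not kernel code.

  #include <stdio.h>
  #include <stdlib.h>

  struct toy_cfs_rq {
          unsigned long load;             /* 0 == "fully decayed" */
          struct toy_cfs_rq *prev, *next; /* circular leaf list */
  };

  /* Unlink @n from whatever circular list it is on. */
  static void toy_list_del(struct toy_cfs_rq *n)
  {
          n->prev->next = n->next;
          n->next->prev = n->prev;
  }

  int main(void)
  {
          struct toy_cfs_rq head = { 0, &head, &head };
          unsigned long loads[] = { 0, 5, 0, 7 };

          /* Build a small circular list hanging off the head sentinel. */
          for (int i = 0; i < 4; i++) {
                  struct toy_cfs_rq *n = malloc(sizeof(*n));
                  n->load = loads[i];
                  n->prev = head.prev;
                  n->next = &head;
                  head.prev->next = n;
                  head.prev = n;
          }

          /* "Safe" walk: remember ->next before possibly deleting pos. */
          for (struct toy_cfs_rq *pos = head.next, *tmp = pos->next;
               pos != &head; pos = tmp, tmp = pos->next) {
                  if (!pos->load) {       /* decayed: drop it from the list */
                          toy_list_del(pos);
                          free(pos);
                  }
          }

          for (struct toy_cfs_rq *p = head.next; p != &head; p = p->next)
                  printf("remaining load %lu\n", p->load);
          return 0;
  }
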
@@ -7999,6 +8058,18 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
                                (rq->cpu_capacity_orig * 100));
 }
 
+/*
+ * Check whether a rq has a misfit task and if it looks like we can actually
+ * help that task: the task can be migrated to a CPU of higher capacity, or
+ * its current CPU is heavily pressured.
+ */
+static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
+{
+       return rq->misfit_task_load &&
+               (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
+                check_cpu_capacity(rq, sd));
+}
+
 /*
  * Group imbalance indicates (and tries to solve) the problem where balancing
  * groups is inadequate due to ->cpus_allowed constraints.
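
To make the new check_misfit_status() predicate concrete, here is a toy userspace evaluation of it with made-up capacity numbers (the toy_* helpers and the values 430/1024/117 are illustrative, not taken from the kernel): a misfit task on a small CPU of an asymmetric system warrants a kick, while the same load reported on the biggest, unpressured CPU does not.

  #include <stdbool.h>
  #include <stdio.h>

  struct toy_rq {
          unsigned long misfit_task_load;  /* 0 when there is no misfit task */
          unsigned long cpu_capacity_orig; /* full capacity of this CPU */
          unsigned long cpu_capacity;      /* capacity left after RT/IRQ pressure */
          unsigned long max_cpu_capacity;  /* biggest CPU in the root domain */
  };

  /* Same shape as check_cpu_capacity(): is usable capacity markedly reduced? */
  static bool toy_check_cpu_capacity(const struct toy_rq *rq, unsigned int imbalance_pct)
  {
          return rq->cpu_capacity * imbalance_pct < rq->cpu_capacity_orig * 100;
  }

  /* Mirrors the predicate added above. */
  static bool toy_check_misfit_status(const struct toy_rq *rq, unsigned int imbalance_pct)
  {
          return rq->misfit_task_load &&
                 (rq->cpu_capacity_orig < rq->max_cpu_capacity ||
                  toy_check_cpu_capacity(rq, imbalance_pct));
  }

  int main(void)
  {
          /* Misfit task on a LITTLE CPU of a big.LITTLE system: worth a kick. */
          struct toy_rq little = { 600, 430, 430, 1024 };
          /* Same load on the biggest, barely pressured CPU: nothing to gain. */
          struct toy_rq big    = { 600, 1024, 1000, 1024 };

          printf("little: %d, big: %d\n",
                 toy_check_misfit_status(&little, 117),
                 toy_check_misfit_status(&big, 117));
          return 0;
  }
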
@@ -9046,7 +9117,7 @@ more_balance:
                if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
 
                        /* Prevent to re-select dst_cpu via env's CPUs */
-                       cpumask_clear_cpu(env.dst_cpu, env.cpus);
+                       __cpumask_clear_cpu(env.dst_cpu, env.cpus);
 
                        env.dst_rq       = cpu_rq(env.new_dst_cpu);
                        env.dst_cpu      = env.new_dst_cpu;
@@ -9073,7 +9144,7 @@ more_balance:
 
                /* All tasks on this runqueue were pinned by CPU affinity */
                if (unlikely(env.flags & LBF_ALL_PINNED)) {
-                       cpumask_clear_cpu(cpu_of(busiest), cpus);
+                       __cpumask_clear_cpu(cpu_of(busiest), cpus);
                        /*
                         * Attempting to continue load balancing at the current
                         * sched_domain level only makes sense if there are
@@ -9492,15 +9563,8 @@ static void kick_ilb(unsigned int flags)
 }
 
 /*
- * Current heuristic for kicking the idle load balancer in the presence
- * of an idle cpu in the system.
- *   - This rq has more than one task.
- *   - This rq has at least one CFS task and the capacity of the CPU is
- *     significantly reduced because of RT tasks or IRQs.
- *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
- *     multiple busy cpu.
- *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
- *     domain span are idle.
+ * Current decision point for kicking the idle load balancer in the presence
+ * of idle CPUs in the system.
  */
 static void nohz_balancer_kick(struct rq *rq)
 {
@@ -9533,7 +9597,7 @@ static void nohz_balancer_kick(struct rq *rq)
        if (time_before(now, nohz.next_balance))
                goto out;
 
-       if (rq->nr_running >= 2 || rq->misfit_task_load) {
+       if (rq->nr_running >= 2) {
                flags = NOHZ_KICK_MASK;
                goto out;
        }
@@ -9542,8 +9606,13 @@ static void nohz_balancer_kick(struct rq *rq)
        sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
        if (sds) {
                /*
-                * XXX: write a coherent comment on why we do this.
-                * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
+                * If there is an imbalance between LLC domains (IOW we could
+                * increase the overall cache use), we need some less-loaded LLC
+                * domain to pull some load. Likewise, we may need to spread
+                * load within the current LLC domain (e.g. packed SMT cores but
+                * other CPUs are idle). We can't really know from here how busy
+                * the others are - so just get a nohz balance going if it looks
+                * like this LLC domain has tasks we could move.
                 */
                nr_busy = atomic_read(&sds->nr_busy_cpus);
                if (nr_busy > 1) {
@@ -9555,20 +9624,37 @@ static void nohz_balancer_kick(struct rq *rq)
 
        sd = rcu_dereference(rq->sd);
        if (sd) {
-               if ((rq->cfs.h_nr_running >= 1) &&
-                               check_cpu_capacity(rq, sd)) {
+               /*
+                * If there's a CFS task and the current CPU has reduced
+                * capacity, kick the ILB to see if there's a better CPU to
+                * run on.
+                */
+               if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
                        flags = NOHZ_KICK_MASK;
                        goto unlock;
                }
        }
 
-       sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
+       sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
        if (sd) {
-               for_each_cpu(i, sched_domain_span(sd)) {
-                       if (i == cpu ||
-                           !cpumask_test_cpu(i, nohz.idle_cpus_mask))
-                               continue;
+               /*
+                * When ASYM_CPUCAPACITY is set, see if there's a higher
+                * capacity CPU to run the misfit task on.
+                */
+               if (check_misfit_status(rq, sd)) {
+                       flags = NOHZ_KICK_MASK;
+                       goto unlock;
+               }
+       }
 
+       sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
+       if (sd) {
+               /*
+                * When ASYM_PACKING is set, see if a more preferred CPU is
+                * currently idle; if so, kick the ILB to move tasks
+                * around.
+                */
+               for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
                        if (sched_asym_prefer(i, cpu)) {
                                flags = NOHZ_KICK_MASK;
                                goto unlock;
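
Taken together, the nohz_balancer_kick() hunks above mean a misfit task alone no longer forces a kick; it only does so under sd_asym_cpucapacity when check_misfit_status() says the task can actually be helped. The following userspace sketch condenses the resulting decision order (the struct fields and should_kick_ilb() are illustrative stand-ins, and the early gating on idle CPUs, nohz.next_balance and the stats-only kick is omitted):

  #include <stdbool.h>
  #include <stdio.h>

  struct kick_input {
          int  nr_running;                /* rq->nr_running */
          int  llc_nr_busy;               /* sds->nr_busy_cpus */
          bool cfs_task_reduced_capacity; /* h_nr_running >= 1 && check_cpu_capacity() */
          bool misfit_can_be_helped;      /* check_misfit_status() under ASYM_CPUCAPACITY */
          bool asym_packing_pref_idle;    /* sched_asym_prefer() hit under ASYM_PACKING */
  };

  /* Return true when an idle load balance should be kicked. */
  static bool should_kick_ilb(const struct kick_input *in)
  {
          if (in->nr_running >= 2)
                  return true;
          if (in->llc_nr_busy > 1)
                  return true;
          if (in->cfs_task_reduced_capacity)
                  return true;
          if (in->misfit_can_be_helped)
                  return true;
          if (in->asym_packing_pref_idle)
                  return true;
          return false;
  }

  int main(void)
  {
          /* Misfit load on the biggest CPU, nothing else going on: no kick. */
          struct kick_input unhelpable_misfit = { 1, 1, false, false, false };
          /* Misfit load that a bigger or less pressured CPU could absorb. */
          struct kick_input helpable_misfit   = { 1, 1, false, true,  false };

          printf("%d %d\n", should_kick_ilb(&unhelpable_misfit),
                 should_kick_ilb(&helpable_misfit));
          return 0;
  }
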
@@ -10569,10 +10655,10 @@ const struct sched_class fair_sched_class = {
 #ifdef CONFIG_SCHED_DEBUG
 void print_cfs_stats(struct seq_file *m, int cpu)
 {
-       struct cfs_rq *cfs_rq;
+       struct cfs_rq *cfs_rq, *pos;
 
        rcu_read_lock();
-       for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+       for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
                print_cfs_rq(m, cpu, cfs_rq);
        rcu_read_unlock();
 }