sched/fair: Fix O(nr_cgroups) in load balance path
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d711093218415d77ead6405004dd9e41323ad924..219fe58e30238592b7fc879a8e65992df84ec752 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -369,8 +369,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 }
 
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-       list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)                     \
+       list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,    \
+                                leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
 static inline struct cfs_rq *
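
The iterator grows a second cursor because update_blocked_averages() further down will start unlinking entries while it walks the list; list_for_each_entry_safe() pre-fetches the next element so the current one can be removed mid-walk. A minimal userspace sketch of that idea (hypothetical names, not kernel code):

    /*
     * Userspace sketch only.  The second cursor caches the next element
     * before the loop body runs, so the body may free or unlink the
     * current element without derailing the walk.
     */
    #include <stdio.h>
    #include <stdlib.h>

    struct node { int val; struct node *next; };

    #define for_each_node_safe(n, nxt, head) \
            for ((n) = (head); (n) && ((nxt) = (n)->next, 1); (n) = (nxt))

    int main(void)
    {
            struct node *head = NULL, *n, *nxt;

            for (int i = 3; i >= 1; i--) {          /* build 1 -> 2 -> 3 */
                    n = malloc(sizeof(*n));
                    n->val = i;
                    n->next = head;
                    head = n;
            }

            for_each_node_safe(n, nxt, head) {      /* delete while walking */
                    printf("visiting %d\n", n->val);
                    free(n);                        /* safe: next was cached */
            }
            return 0;
    }

The !CONFIG_FAIR_GROUP_SCHED stub below keeps the same three-argument shape by parking pos at NULL, so the loop still visits only &rq->cfs.
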
@@ -463,8 +464,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 }
 
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-               for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)     \
+               for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 {
@@ -2916,12 +2917,12 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
        /*
         * Step 2: update *_avg.
         */
-       sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+       sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
        if (cfs_rq) {
                cfs_rq->runnable_load_avg =
-                       div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
+                       div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
        }
-       sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+       sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
 
        return 1;
 }
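
The sums can only ever reach LOAD_AVG_MAX across the completed 1024us periods plus the period_contrib of the period in flight, so dividing by the full LOAD_AVG_MAX leaves *_avg about 2% short of its ceiling (a 100% running task never quite reaches 1024). A rough userspace check of the arithmetic, assuming the LOAD_AVG_MAX value this kernel generation uses (47742) and an arbitrary period_contrib:

    /*
     * Rough arithmetic check, userspace only.  LOAD_AVG_MAX is as
     * understood from this kernel's PELT code; period_contrib is an
     * illustrative value.
     */
    #include <stdio.h>

    #define LOAD_AVG_MAX 47742  /* max accumulable sum over whole periods */

    int main(void)
    {
            unsigned int period_contrib = 200;  /* us into the current 1024us period */
            unsigned long long util_sum =
                    1024ULL * (LOAD_AVG_MAX - 1024 + period_contrib); /* ceiling for an always-running task */

            printf("old divisor: util_avg = %llu\n", util_sum / LOAD_AVG_MAX);
            printf("new divisor: util_avg = %llu\n",
                   util_sum / (LOAD_AVG_MAX - 1024 + period_contrib));
            return 0;
    }

With the old divisor the always-running case prints 1006; with the corrected divisor it reaches 1024 exactly.
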
@@ -4642,24 +4643,43 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
        hrtimer_cancel(&cfs_b->slack_timer);
 }
 
+/*
+ * Both these cpu hotplug callbacks race against unregister_fair_sched_group()
+ *
+ * The race is harmless, since modifying bandwidth settings of unhooked group
+ * bits doesn't do much.
+ */
+
+/* cpu online callback */
 static void __maybe_unused update_runtime_enabled(struct rq *rq)
 {
-       struct cfs_rq *cfs_rq;
+       struct task_group *tg;
+
+       lockdep_assert_held(&rq->lock);
 
-       for_each_leaf_cfs_rq(rq, cfs_rq) {
-               struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+       rcu_read_lock();
+       list_for_each_entry_rcu(tg, &task_groups, list) {
+               struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+               struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
                raw_spin_lock(&cfs_b->lock);
                cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
                raw_spin_unlock(&cfs_b->lock);
        }
+       rcu_read_unlock();
 }
 
+/* cpu offline callback */
 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 {
-       struct cfs_rq *cfs_rq;
+       struct task_group *tg;
+
+       lockdep_assert_held(&rq->lock);
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(tg, &task_groups, list) {
+               struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
-       for_each_leaf_cfs_rq(rq, cfs_rq) {
                if (!cfs_rq->runtime_enabled)
                        continue;
 
@@ -4677,6 +4697,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
                if (cfs_rq_throttled(cfs_rq))
                        unthrottle_cfs_rq(cfs_rq);
        }
+       rcu_read_unlock();
 }
 
 #else /* CONFIG_CFS_BANDWIDTH */
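
Because the leaf list is about to become prunable (see update_blocked_averages() below), it can no longer stand in for "every cfs_rq on this runqueue". The two hotplug callbacks therefore switch to the global task_groups list under rcu_read_lock(), indexing each group's per-CPU cfs_rq directly; the rq lock is already held by the hotplug path, which the lockdep_assert_held() documents. A userspace sketch of the coverage argument (hypothetical stub types):

    /*
     * Userspace sketch, hypothetical names: the per-CPU leaf list may now
     * be missing idle groups, so paths that must see every group walk the
     * global registry and pick out this CPU's cfs_rq by index instead.
     */
    #include <stdio.h>

    #define NR_CPUS   2
    #define NR_GROUPS 3

    struct cfs_rq_stub     { int runtime_enabled; };
    struct task_group_stub { struct cfs_rq_stub cfs_rq[NR_CPUS]; };

    static struct task_group_stub groups[NR_GROUPS]; /* stand-in for the task_groups list */

    static void set_runtime_enabled(int cpu, int enabled)
    {
            /* visits every group, including ones absent from any leaf list */
            for (int g = 0; g < NR_GROUPS; g++)
                    groups[g].cfs_rq[cpu].runtime_enabled = enabled;
    }

    int main(void)
    {
            set_runtime_enabled(0, 1);
            printf("group 2 / cpu 0: %d\n", groups[2].cfs_rq[0].runtime_enabled);
            return 0;
    }
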
@@ -5484,12 +5505,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                int i;
 
                /* Skip over this group if it has no CPUs allowed */
-               if (!cpumask_intersects(sched_group_cpus(group),
+               if (!cpumask_intersects(sched_group_span(group),
                                        &p->cpus_allowed))
                        continue;
 
                local_group = cpumask_test_cpu(this_cpu,
-                                              sched_group_cpus(group));
+                                              sched_group_span(group));
 
                /*
                 * Tally up the load of all CPUs in the group and find
@@ -5499,7 +5520,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                runnable_load = 0;
                max_spare_cap = 0;
 
-               for_each_cpu(i, sched_group_cpus(group)) {
+               for_each_cpu(i, sched_group_span(group)) {
                        /* Bias balancing toward cpus of our domain */
                        if (local_group)
                                load = source_load(i, load_idx);
@@ -5602,10 +5623,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 
        /* Check if we have any choice: */
        if (group->group_weight == 1)
-               return cpumask_first(sched_group_cpus(group));
+               return cpumask_first(sched_group_span(group));
 
        /* Traverse only the allowed CPUs */
-       for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
+       for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
                if (idle_cpu(i)) {
                        struct rq *rq = cpu_rq(i);
                        struct cpuidle_state *idle = idle_get_state(rq);
@@ -5640,43 +5661,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
        return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
 }
 
-/*
- * Implement a for_each_cpu() variant that starts the scan at a given cpu
- * (@start), and wraps around.
- *
- * This is used to scan for idle CPUs; such that not all CPUs looking for an
- * idle CPU find the same CPU. The down-side is that tasks tend to cycle
- * through the LLC domain.
- *
- * Especially tbench is found sensitive to this.
- */
-
-static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
-{
-       int next;
-
-again:
-       next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
-
-       if (*wrapped) {
-               if (next >= start)
-                       return nr_cpumask_bits;
-       } else {
-               if (next >= nr_cpumask_bits) {
-                       *wrapped = 1;
-                       n = -1;
-                       goto again;
-               }
-       }
-
-       return next;
-}
-
-#define for_each_cpu_wrap(cpu, mask, start, wrap)                              \
-       for ((wrap) = 0, (cpu) = (start)-1;                                     \
-               (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)),     \
-               (cpu) < nr_cpumask_bits; )
-
 #ifdef CONFIG_SCHED_SMT
 
 static inline void set_idle_cores(int cpu, int val)
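
The open-coded wrap-around scan is removed outright; the callers below keep using for_each_cpu_wrap(), just in a three-argument form without the wrap flag, so the helper has presumably been promoted to the generic cpumask code in this same series. For reference, a userspace sketch of the behaviour the deleted code implemented (a plain 64-bit mask standing in for struct cpumask):

    /*
     * Userspace sketch of the removed helper's behaviour: the scan starts
     * at 'start', runs to the top of the mask, wraps to bit 0, and stops
     * before revisiting 'start'.
     */
    #include <stdio.h>

    static int next_set_bit(unsigned long long mask, int from) /* first set bit >= from, or 64 */
    {
            for (int b = from; b < 64; b++)
                    if (mask & (1ULL << b))
                            return b;
            return 64;
    }

    int main(void)
    {
            unsigned long long mask = 0x96ULL;  /* bits 1, 2, 4, 7 */
            int start = 4;

            /* first leg: start..63, second leg: 0..start-1 */
            for (int b = next_set_bit(mask, start); b < 64; b = next_set_bit(mask, b + 1))
                    printf("cpu %d\n", b);
            for (int b = next_set_bit(mask, 0); b < start && b < 64; b = next_set_bit(mask, b + 1))
                    printf("cpu %d\n", b);
            return 0;
    }

The wrap bookkeeping lives entirely inside the iteration, which is why the callers can drop their local wrap variables.
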
@@ -5736,7 +5720,7 @@ unlock:
 static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
 {
        struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
-       int core, cpu, wrap;
+       int core, cpu;
 
        if (!static_branch_likely(&sched_smt_present))
                return -1;
@@ -5746,7 +5730,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
 
        cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
 
-       for_each_cpu_wrap(core, cpus, target, wrap) {
+       for_each_cpu_wrap(core, cpus, target) {
                bool idle = true;
 
                for_each_cpu(cpu, cpu_smt_mask(core)) {
@@ -5812,7 +5796,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
        u64 avg_cost, avg_idle = this_rq()->avg_idle;
        u64 time, cost;
        s64 delta;
-       int cpu, wrap;
+       int cpu;
 
        this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
        if (!this_sd)
@@ -5829,7 +5813,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 
        time = local_clock();
 
-       for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
+       for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
                if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
                        continue;
                if (idle_cpu(cpu))
@@ -6970,10 +6954,28 @@ static void attach_tasks(struct lb_env *env)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+       if (cfs_rq->load.weight)
+               return false;
+
+       if (cfs_rq->avg.load_sum)
+               return false;
+
+       if (cfs_rq->avg.util_sum)
+               return false;
+
+       if (cfs_rq->runnable_load_sum)
+               return false;
+
+       return true;
+}
+
 static void update_blocked_averages(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
-       struct cfs_rq *cfs_rq;
+       struct cfs_rq *cfs_rq, *pos;
        struct rq_flags rf;
 
        rq_lock_irqsave(rq, &rf);
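
cfs_rq_is_decayed() deliberately tests the raw *_sum accumulators, not the derived *_avg values: the averages are the sums scaled down by a large divisor, so they can already read zero while a sum still carries residue that later updates must keep decaying. A one-line userspace illustration of that gap:

    /*
     * The derived average can be 0 while the raw sum is not (divisor is
     * illustrative, of the same order as LOAD_AVG_MAX).
     */
    #include <stdio.h>

    int main(void)
    {
            unsigned int load_sum = 400, divisor = 47742;

            printf("load_avg = %u, safe to drop: %s\n",
                   load_sum / divisor, load_sum == 0 ? "yes" : "no");
            return 0;
    }
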
@@ -6983,7 +6985,7 @@ static void update_blocked_averages(int cpu)
         * Iterates the task_group tree in a bottom up fashion, see
         * list_add_leaf_cfs_rq() for details.
         */
-       for_each_leaf_cfs_rq(rq, cfs_rq) {
+       for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
                struct sched_entity *se;
 
                /* throttled entities do not contribute to load */
@@ -6997,6 +6999,13 @@ static void update_blocked_averages(int cpu)
                se = cfs_rq->tg->se[cpu];
                if (se && !skip_blocked_update(se))
                        update_load_avg(se, 0);
+
+               /*
+                * There can be a lot of idle CPU cgroups.  Don't let fully
+                * decayed cfs_rqs linger on the list.
+                */
+               if (cfs_rq_is_decayed(cfs_rq))
+                       list_del_leaf_cfs_rq(cfs_rq);
        }
        rq_unlock_irqrestore(rq, &rf);
 }
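
This is the heart of the fix: update_blocked_averages() runs from the load-balancing path, so visiting one cfs_rq per cgroup made that path O(nr_cgroups) even when nearly all cgroups were idle. Once every signal of a cfs_rq has decayed to zero it contributes nothing, so it is dropped here and put back by list_add_leaf_cfs_rq() the next time the group gets something enqueued. A back-of-the-envelope estimate of how quickly an idle group falls off the list, assuming the usual ~32 ms PELT half-life (illustrative magnitudes only):

    /*
     * Rough estimate, userspace: the PELT sums roughly halve every ~32 ms,
     * so a sum of magnitude S needs about ceil(log2(S)) half-lives to
     * decay to zero.  Build with -lm.
     */
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
            double sums[] = { 1024.0, 47742.0, 47742.0 * 1024 };

            for (int i = 0; i < 3; i++)
                    printf("sum ~%.0f decays in ~%.0f ms\n",
                           sums[i], ceil(log2(sums[i])) * 32);
            return 0;
    }

By that estimate an idle cgroup stops costing the balancer anything within roughly a second of going quiet.
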
@@ -7229,7 +7238,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
                 * span the current group.
                 */
 
-               for_each_cpu(cpu, sched_group_cpus(sdg)) {
+               for_each_cpu(cpu, sched_group_span(sdg)) {
                        struct sched_group_capacity *sgc;
                        struct rq *rq = cpu_rq(cpu);
 
@@ -7408,7 +7417,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
        memset(sgs, 0, sizeof(*sgs));
 
-       for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
+       for_each_cpu_and(i, sched_group_span(group), env->cpus) {
                struct rq *rq = cpu_rq(i);
 
                /* Bias balancing toward cpus of our domain */
@@ -7572,7 +7581,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
                struct sg_lb_stats *sgs = &tmp_sgs;
                int local_group;
 
-               local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
+               local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
                if (local_group) {
                        sds->local = sg;
                        sgs = local;
@@ -7927,7 +7936,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
        unsigned long busiest_load = 0, busiest_capacity = 1;
        int i;
 
-       for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
+       for_each_cpu_and(i, sched_group_span(group), env->cpus) {
                unsigned long capacity, wl;
                enum fbq_type rt;
 
@@ -8033,7 +8042,6 @@ static int active_load_balance_cpu_stop(void *data);
 static int should_we_balance(struct lb_env *env)
 {
        struct sched_group *sg = env->sd->groups;
-       struct cpumask *sg_cpus, *sg_mask;
        int cpu, balance_cpu = -1;
 
        /*
@@ -8043,11 +8051,9 @@ static int should_we_balance(struct lb_env *env)
        if (env->idle == CPU_NEWLY_IDLE)
                return 1;
 
-       sg_cpus = sched_group_cpus(sg);
-       sg_mask = sched_group_mask(sg);
        /* Try to find first idle cpu */
-       for_each_cpu_and(cpu, sg_cpus, env->cpus) {
-               if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
+       for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
+               if (!idle_cpu(cpu))
                        continue;
 
                balance_cpu = cpu;
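
The explicit span test goes away because the balance mask is built from, and is therefore a subset of, the group span; walking (balance mask & env->cpus) visits exactly the CPUs the old span walk plus mask test visited. A tiny bitmask check of that equivalence (plain integers standing in for cpumasks, subset relation assumed):

    /*
     * Userspace check with plain integers standing in for cpumasks; the
     * subset relation between balance mask and span is the assumption.
     */
    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned span  = 0x0f;  /* CPUs 0-3 in the group */
            unsigned bmask = 0x05;  /* balance mask, subset of span */
            unsigned cpus  = 0x07;  /* env->cpus */
            unsigned old_way = 0, new_way = 0;

            for (int cpu = 0; cpu < 8; cpu++) {
                    if ((span & cpus & (1u << cpu)) && (bmask & (1u << cpu)))
                            old_way |= 1u << cpu;  /* old: walk span&cpus, test mask */
                    if (bmask & cpus & (1u << cpu))
                            new_way |= 1u << cpu;  /* new: walk mask&cpus */
            }
            assert(old_way == new_way);
            printf("both visit %#x\n", old_way);
            return 0;
    }
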
@@ -8083,7 +8089,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                .sd             = sd,
                .dst_cpu        = this_cpu,
                .dst_rq         = this_rq,
-               .dst_grpmask    = sched_group_cpus(sd->groups),
+               .dst_grpmask    = sched_group_span(sd->groups),
                .idle           = idle,
                .loop_break     = sched_nr_migrate_break,
                .cpus           = cpus,
@@ -9523,10 +9529,10 @@ const struct sched_class fair_sched_class = {
 #ifdef CONFIG_SCHED_DEBUG
 void print_cfs_stats(struct seq_file *m, int cpu)
 {
-       struct cfs_rq *cfs_rq;
+       struct cfs_rq *cfs_rq, *pos;
 
        rcu_read_lock();
-       for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+       for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
                print_cfs_rq(m, cpu, cfs_rq);
        rcu_read_unlock();
 }