sched/fair: Fix O(nr_cgroups) in load balance path
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d711093218415d77ead6405004dd9e41323ad924..219fe58e30238592b7fc879a8e65992df84ec752 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -369,8 +369,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 }
 
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-       list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)                     \
+       list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,    \
+                                leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
 static inline struct cfs_rq *
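
The iterator grows a second cursor because update_blocked_averages() further down will start unlinking entries while it walks the list; list_for_each_entry_safe() pre-fetches the next element so the current one can be removed mid-walk. A minimal userspace sketch of that idea (hypothetical names, not kernel code):

    /*
     * Userspace sketch only.  The second cursor caches the next element
     * before the loop body runs, so the body may free or unlink the
     * current element without derailing the walk.
     */
    #include <stdio.h>
    #include <stdlib.h>

    struct node { int val; struct node *next; };

    #define for_each_node_safe(n, nxt, head) \
            for ((n) = (head); (n) && ((nxt) = (n)->next, 1); (n) = (nxt))

    int main(void)
    {
            struct node *head = NULL, *n, *nxt;

            for (int i = 3; i >= 1; i--) {          /* build 1 -> 2 -> 3 */
                    n = malloc(sizeof(*n));
                    n->val = i;
                    n->next = head;
                    head = n;
            }

            for_each_node_safe(n, nxt, head) {      /* delete while walking */
                    printf("visiting %d\n", n->val);
                    free(n);                        /* safe: next was cached */
            }
            return 0;
    }

The !CONFIG_FAIR_GROUP_SCHED stub below keeps the same three-argument shape by parking pos at NULL, so the loop still visits only &rq->cfs.
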
@@ -463,8 +464,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 }
 
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-               for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)     \
+               for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 {
@@ -2916,12 +2917,12 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
        /*
         * Step 2: update *_avg.
         */
-       sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+       sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
        if (cfs_rq) {
                cfs_rq->runnable_load_avg =
-                       div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
+                       div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
        }
-       sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+       sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
 
        return 1;
 }
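
The sums can only ever reach LOAD_AVG_MAX across the completed 1024us periods plus the period_contrib of the period in flight, so dividing by the full LOAD_AVG_MAX leaves *_avg about 2% short of its ceiling (a 100% running task never quite reaches 1024). A rough userspace check of the arithmetic, assuming the LOAD_AVG_MAX value this kernel generation uses (47742) and an arbitrary period_contrib:

    /*
     * Rough arithmetic check, userspace only.  LOAD_AVG_MAX is as
     * understood from this kernel's PELT code; period_contrib is an
     * illustrative value.
     */
    #include <stdio.h>

    #define LOAD_AVG_MAX 47742  /* max accumulable sum over whole periods */

    int main(void)
    {
            unsigned int period_contrib = 200;  /* us into the current 1024us period */
            unsigned long long util_sum =
                    1024ULL * (LOAD_AVG_MAX - 1024 + period_contrib); /* ceiling for an always-running task */

            printf("old divisor: util_avg = %llu\n", util_sum / LOAD_AVG_MAX);
            printf("new divisor: util_avg = %llu\n",
                   util_sum / (LOAD_AVG_MAX - 1024 + period_contrib));
            return 0;
    }

With the old divisor the always-running case prints 1006; with the corrected divisor it reaches 1024 exactly.
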
@@ -4642,24 +4643,43 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
        hrtimer_cancel(&cfs_b->slack_timer);
 }
 
+/*
+ * Both these cpu hotplug callbacks race against unregister_fair_sched_group()
+ *
+ * The race is harmless, since modifying bandwidth settings of unhooked group
+ * bits doesn't do much.
+ */
+
+/* cpu online callback */
 static void __maybe_unused update_runtime_enabled(struct rq *rq)
 {
-       struct cfs_rq *cfs_rq;
+       struct task_group *tg;
+
+       lockdep_assert_held(&rq->lock);
 
-       for_each_leaf_cfs_rq(rq, cfs_rq) {
-               struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+       rcu_read_lock();
+       list_for_each_entry_rcu(tg, &task_groups, list) {
+               struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+               struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
                raw_spin_lock(&cfs_b->lock);
                cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
                raw_spin_unlock(&cfs_b->lock);
        }
+       rcu_read_unlock();
 }
 
+/* cpu offline callback */
 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 {
-       struct cfs_rq *cfs_rq;
+       struct task_group *tg;
+
+       lockdep_assert_held(&rq->lock);
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(tg, &task_groups, list) {
+               struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
-       for_each_leaf_cfs_rq(rq, cfs_rq) {
                if (!cfs_rq->runtime_enabled)
                        continue;
 
@@ -4677,6 +4697,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
                if (cfs_rq_throttled(cfs_rq))
                        unthrottle_cfs_rq(cfs_rq);
        }
+       rcu_read_unlock();
 }
 
 #else /* CONFIG_CFS_BANDWIDTH */
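
Because the leaf list is about to become prunable (see update_blocked_averages() below), it can no longer stand in for "every cfs_rq on this runqueue". The two hotplug callbacks therefore switch to the global task_groups list under rcu_read_lock(), indexing each group's per-CPU cfs_rq directly; the rq lock is already held by the hotplug path, which the lockdep_assert_held() documents. A userspace sketch of the coverage argument (hypothetical stub types):

    /*
     * Userspace sketch, hypothetical names: the per-CPU leaf list may now
     * be missing idle groups, so paths that must see every group walk the
     * global registry and pick out this CPU's cfs_rq by index instead.
     */
    #include <stdio.h>

    #define NR_CPUS   2
    #define NR_GROUPS 3

    struct cfs_rq_stub     { int runtime_enabled; };
    struct task_group_stub { struct cfs_rq_stub cfs_rq[NR_CPUS]; };

    static struct task_group_stub groups[NR_GROUPS]; /* stand-in for the task_groups list */

    static void set_runtime_enabled(int cpu, int enabled)
    {
            /* visits every group, including ones absent from any leaf list */
            for (int g = 0; g < NR_GROUPS; g++)
                    groups[g].cfs_rq[cpu].runtime_enabled = enabled;
    }

    int main(void)
    {
            set_runtime_enabled(0, 1);
            printf("group 2 / cpu 0: %d\n", groups[2].cfs_rq[0].runtime_enabled);
            return 0;
    }
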
@@ -5484,12 +5505,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                int i;
 
                /* Skip over this group if it has no CPUs allowed */
-               if (!cpumask_intersects(sched_group_cpus(group),
+               if (!cpumask_intersects(sched_group_span(group),
                                        &p->cpus_allowed))
                        continue;
 
                local_group = cpumask_test_cpu(this_cpu,
-                                              sched_group_cpus(group));
+                                              sched_group_span(group));
 
                /*
                 * Tally up the load of all CPUs in the group and find
@@ -5499,7 +5520,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                runnable_load = 0;
                max_spare_cap = 0;
 
-               for_each_cpu(i, sched_group_cpus(group)) {
+               for_each_cpu(i, sched_group_span(group)) {
                        /* Bias balancing toward cpus of our domain */
                        if (local_group)
                                load = source_load(i, load_idx);
@@ -5602,10 +5623,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 
        /* Check if we have any choice: */
        if (group->group_weight == 1)
-               return cpumask_first(sched_group_cpus(group));
+               return cpumask_first(sched_group_span(group));
 
        /* Traverse only the allowed CPUs */
-       for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
+       for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
                if (idle_cpu(i)) {
                        struct rq *rq = cpu_rq(i);
                        struct cpuidle_state *idle = idle_get_state(rq);
@@ -5640,43 +5661,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
        return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
 }
 
-/*
- * Implement a for_each_cpu() variant that starts the scan at a given cpu
- * (@start), and wraps around.
- *
- * This is used to scan for idle CPUs; such that not all CPUs looking for an
- * idle CPU find the same CPU. The down-side is that tasks tend to cycle
- * through the LLC domain.
- *
- * Especially tbench is found sensitive to this.
- */
-
-static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
-{
-       int next;
-
-again:
-       next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
-
-       if (*wrapped) {
-               if (next >= start)
-                       return nr_cpumask_bits;
-       } else {
-               if (next >= nr_cpumask_bits) {
-                       *wrapped = 1;
-                       n = -1;
-                       goto again;
-               }
-       }
-
-       return next;
-}
-
-#define for_each_cpu_wrap(cpu, mask, start, wrap)                              \
-       for ((wrap) = 0, (cpu) = (start)-1;                                     \
-               (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)),     \
-               (cpu) < nr_cpumask_bits; )
-
 #ifdef CONFIG_SCHED_SMT
 
 static inline void set_idle_cores(int cpu, int val)
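
The open-coded wrap-around scan is removed outright; the callers below keep using for_each_cpu_wrap(), just in a three-argument form without the wrap flag, so the helper has presumably been promoted to the generic cpumask code in this same series. For reference, a userspace sketch of the behaviour the deleted code implemented (a plain 64-bit mask standing in for struct cpumask):

    /*
     * Userspace sketch of the removed helper's behaviour: the scan starts
     * at 'start', runs to the top of the mask, wraps to bit 0, and stops
     * before revisiting 'start'.
     */
    #include <stdio.h>

    static int next_set_bit(unsigned long long mask, int from) /* first set bit >= from, or 64 */
    {
            for (int b = from; b < 64; b++)
                    if (mask & (1ULL << b))
                            return b;
            return 64;
    }

    int main(void)
    {
            unsigned long long mask = 0x96ULL;  /* bits 1, 2, 4, 7 */
            int start = 4;

            /* first leg: start..63, second leg: 0..start-1 */
            for (int b = next_set_bit(mask, start); b < 64; b = next_set_bit(mask, b + 1))
                    printf("cpu %d\n", b);
            for (int b = next_set_bit(mask, 0); b < start && b < 64; b = next_set_bit(mask, b + 1))
                    printf("cpu %d\n", b);
            return 0;
    }

The wrap bookkeeping lives entirely inside the iteration, which is why the callers can drop their local wrap variables.
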
@@ -5736,7 +5720,7 @@ unlock:
 static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
 {
        struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
-       int core, cpu, wrap;
+       int core, cpu;
 
        if (!static_branch_likely(&sched_smt_present))
                return -1;
@@ -5746,7 +5730,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
 
        cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
 
-       for_each_cpu_wrap(core, cpus, target, wrap) {
+       for_each_cpu_wrap(core, cpus, target) {
                bool idle = true;
 
                for_each_cpu(cpu, cpu_smt_mask(core)) {
@@ -5812,7 +5796,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
        u64 avg_cost, avg_idle = this_rq()->avg_idle;
        u64 time, cost;
        s64 delta;
-       int cpu, wrap;
+       int cpu;
 
        this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
        if (!this_sd)
@@ -5829,7 +5813,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 
        time = local_clock();
 
-       for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
+       for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
                if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
                        continue;
                if (idle_cpu(cpu))
@@ -6970,10 +6954,28 @@ static void attach_tasks(struct lb_env *env)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+       if (cfs_rq->load.weight)
+               return false;
+
+       if (cfs_rq->avg.load_sum)
+               return false;
+
+       if (cfs_rq->avg.util_sum)
+               return false;
+
+       if (cfs_rq->runnable_load_sum)
+               return false;
+
+       return true;
+}
+
 static void update_blocked_averages(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
-       struct cfs_rq *cfs_rq;
+       struct cfs_rq *cfs_rq, *pos;
        struct rq_flags rf;
 
        rq_lock_irqsave(rq, &rf);
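
cfs_rq_is_decayed() deliberately tests the raw *_sum accumulators, not the derived *_avg values: the averages are the sums scaled down by a large divisor, so they can already read zero while a sum still carries residue that later updates must keep decaying. A one-line userspace illustration of that gap:

    /*
     * The derived average can be 0 while the raw sum is not (divisor is
     * illustrative, of the same order as LOAD_AVG_MAX).
     */
    #include <stdio.h>

    int main(void)
    {
            unsigned int load_sum = 400, divisor = 47742;

            printf("load_avg = %u, safe to drop: %s\n",
                   load_sum / divisor, load_sum == 0 ? "yes" : "no");
            return 0;
    }
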
@@ -6983,7 +6985,7 @@ static void update_blocked_averages(int cpu)
         * Iterates the task_group tree in a bottom up fashion, see
         * list_add_leaf_cfs_rq() for details.
         */
-       for_each_leaf_cfs_rq(rq, cfs_rq) {
+       for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
                struct sched_entity *se;
 
                /* throttled entities do not contribute to load */
@@ -6997,6 +6999,13 @@ static void update_blocked_averages(int cpu)
                se = cfs_rq->tg->se[cpu];
                if (se && !skip_blocked_update(se))
                        update_load_avg(se, 0);
+
+               /*
+                * There can be a lot of idle CPU cgroups.  Don't let fully
+                * decayed cfs_rqs linger on the list.
+                */
+               if (cfs_rq_is_decayed(cfs_rq))
+                       list_del_leaf_cfs_rq(cfs_rq);
        }
        rq_unlock_irqrestore(rq, &rf);
 }
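
This is the heart of the fix: update_blocked_averages() runs from the load-balancing path, so visiting one cfs_rq per cgroup made that path O(nr_cgroups) even when nearly all cgroups were idle. Once every signal of a cfs_rq has decayed to zero it contributes nothing, so it is dropped here and put back by list_add_leaf_cfs_rq() the next time the group gets something enqueued. A back-of-the-envelope estimate of how quickly an idle group falls off the list, assuming the usual ~32 ms PELT half-life (illustrative magnitudes only):

    /*
     * Rough estimate, userspace: the PELT sums roughly halve every ~32 ms,
     * so a sum of magnitude S needs about ceil(log2(S)) half-lives to
     * decay to zero.  Build with -lm.
     */
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
            double sums[] = { 1024.0, 47742.0, 47742.0 * 1024 };

            for (int i = 0; i < 3; i++)
                    printf("sum ~%.0f decays in ~%.0f ms\n",
                           sums[i], ceil(log2(sums[i])) * 32);
            return 0;
    }

By that estimate an idle cgroup stops costing the balancer anything within roughly a second of going quiet.
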
@@ -7229,7 +7238,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
                 * span the current group.
                 */
 
-               for_each_cpu(cpu, sched_group_cpus(sdg)) {
+               for_each_cpu(cpu, sched_group_span(sdg)) {
                        struct sched_group_capacity *sgc;
                        struct rq *rq = cpu_rq(cpu);
 
@@ -7408,7 +7417,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
        memset(sgs, 0, sizeof(*sgs));
 
-       for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
+       for_each_cpu_and(i, sched_group_span(group), env->cpus) {
                struct rq *rq = cpu_rq(i);
 
                /* Bias balancing toward cpus of our domain */
@@ -7572,7 +7581,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
                struct sg_lb_stats *sgs = &tmp_sgs;
                int local_group;
 
-               local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
+               local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
                if (local_group) {
                        sds->local = sg;
                        sgs = local;
@@ -7927,7 +7936,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
        unsigned long busiest_load = 0, busiest_capacity = 1;
        int i;
 
-       for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
+       for_each_cpu_and(i, sched_group_span(group), env->cpus) {
                unsigned long capacity, wl;
                enum fbq_type rt;
 
@@ -8033,7 +8042,6 @@ static int active_load_balance_cpu_stop(void *data);
 static int should_we_balance(struct lb_env *env)
 {
        struct sched_group *sg = env->sd->groups;
-       struct cpumask *sg_cpus, *sg_mask;
        int cpu, balance_cpu = -1;
 
        /*
@@ -8043,11 +8051,9 @@ static int should_we_balance(struct lb_env *env)
        if (env->idle == CPU_NEWLY_IDLE)
                return 1;
 
-       sg_cpus = sched_group_cpus(sg);
-       sg_mask = sched_group_mask(sg);
        /* Try to find first idle cpu */
-       for_each_cpu_and(cpu, sg_cpus, env->cpus) {
-               if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
+       for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
+               if (!idle_cpu(cpu))
                        continue;
 
                balance_cpu = cpu;
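
The explicit span test goes away because the balance mask is built from, and is therefore a subset of, the group span; walking (balance mask & env->cpus) visits exactly the CPUs the old span walk plus mask test visited. A tiny bitmask check of that equivalence (plain integers standing in for cpumasks, subset relation assumed):

    /*
     * Userspace check with plain integers standing in for cpumasks; the
     * subset relation between balance mask and span is the assumption.
     */
    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned span  = 0x0f;  /* CPUs 0-3 in the group */
            unsigned bmask = 0x05;  /* balance mask, subset of span */
            unsigned cpus  = 0x07;  /* env->cpus */
            unsigned old_way = 0, new_way = 0;

            for (int cpu = 0; cpu < 8; cpu++) {
                    if ((span & cpus & (1u << cpu)) && (bmask & (1u << cpu)))
                            old_way |= 1u << cpu;  /* old: walk span&cpus, test mask */
                    if (bmask & cpus & (1u << cpu))
                            new_way |= 1u << cpu;  /* new: walk mask&cpus */
            }
            assert(old_way == new_way);
            printf("both visit %#x\n", old_way);
            return 0;
    }
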
@@ -8083,7 +8089,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                .sd             = sd,
                .dst_cpu        = this_cpu,
                .dst_rq         = this_rq,
-               .dst_grpmask    = sched_group_cpus(sd->groups),
+               .dst_grpmask    = sched_group_span(sd->groups),
                .idle           = idle,
                .loop_break     = sched_nr_migrate_break,
                .cpus           = cpus,
@@ -9523,10 +9529,10 @@ const struct sched_class fair_sched_class = {
 #ifdef CONFIG_SCHED_DEBUG
 void print_cfs_stats(struct seq_file *m, int cpu)
 {
-       struct cfs_rq *cfs_rq;
+       struct cfs_rq *cfs_rq, *pos;
 
        rcu_read_lock();
-       for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+       for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
                print_cfs_rq(m, cpu, cfs_rq);
        rcu_read_unlock();
 }