Revert "sched: Improve scalability via 'CPU buddies', which withstand random perturba...
authorLinus Torvalds <torvalds@linux-foundation.org>
Sun, 16 Sep 2012 19:29:43 +0000 (12:29 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Sun, 16 Sep 2012 19:29:43 +0000 (12:29 -0700)
This reverts commit 970e178985cadbca660feb02f4d2ee3a09f7fdda.

Nikolay Ulyanitsky reported thatthe 3.6-rc5 kernel has a 15-20%
performance drop on PostgreSQL 9.2 on his machine (running "pgbench").

Borislav Petkov was able to reproduce this, and bisected it to this
commit 970e178985ca ("sched: Improve scalability via 'CPU buddies' ...")
apparently because the new single-idle-buddy model simply doesn't find
idle CPU's to reschedule on aggressively enough.

Mike Galbraith suspects that it is likely due to the user-mode spinlocks
in PostgreSQL not reacting well to preemption, but we don't really know
the details - I'll just revert the commit for now.

There are hopefully other approaches to improve scheduler scalability
without it causing these kinds of downsides.

Reported-by: Nikolay Ulyanitsky <lystor@gmail.com>
Bisected-by: Borislav Petkov <bp@alien8.de>
Acked-by: Mike Galbraith <efault@gmx.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
include/linux/sched.h
kernel/sched/core.c
kernel/sched/fair.c

index b8c86648a2f95dc6f83aab668fd9a7e07f277b4e..23bddac4bad8d08f3781d1e8a453aa41edb28632 100644 (file)
@@ -954,7 +954,6 @@ struct sched_domain {
        unsigned int smt_gain;
        int flags;                      /* See SD_* */
        int level;
-       int idle_buddy;                 /* cpu assigned to select_idle_sibling() */
 
        /* Runtime fields. */
        unsigned long last_balance;     /* init to jiffies. units in jiffies */
index a4ea245f3d85a5383475b49df5292667535a1df0..649c9f876cb164b0e16683479b59bf4d4f3b794b 100644 (file)
@@ -6014,11 +6014,6 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
  * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
  * allows us to avoid some pointer chasing select_idle_sibling().
  *
- * Iterate domains and sched_groups downward, assigning CPUs to be
- * select_idle_sibling() hw buddy.  Cross-wiring hw makes bouncing
- * due to random perturbation self canceling, ie sw buddies pull
- * their counterpart to their CPU's hw counterpart.
- *
  * Also keep a unique ID per domain (we use the first cpu number in
  * the cpumask of the domain), this allows us to quickly tell if
  * two cpus are in the same cache domain, see cpus_share_cache().
@@ -6032,40 +6027,8 @@ static void update_top_cache_domain(int cpu)
        int id = cpu;
 
        sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
-       if (sd) {
-               struct sched_domain *tmp = sd;
-               struct sched_group *sg, *prev;
-               bool right;
-
-               /*
-                * Traverse to first CPU in group, and count hops
-                * to cpu from there, switching direction on each
-                * hop, never ever pointing the last CPU rightward.
-                */
-               do {
-                       id = cpumask_first(sched_domain_span(tmp));
-                       prev = sg = tmp->groups;
-                       right = 1;
-
-                       while (cpumask_first(sched_group_cpus(sg)) != id)
-                               sg = sg->next;
-
-                       while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
-                               prev = sg;
-                               sg = sg->next;
-                               right = !right;
-                       }
-
-                       /* A CPU went down, never point back to domain start. */
-                       if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
-                               right = false;
-
-                       sg = right ? sg->next : prev;
-                       tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
-               } while ((tmp = tmp->child));
-
+       if (sd)
                id = cpumask_first(sched_domain_span(sd));
-       }
 
        rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
        per_cpu(sd_llc_id, cpu) = id;
index 42d9df6a5ca4c7db04ba73cf300b75084321533d..96e2b18b628312dd69e8ad2ee6e3a6e08f33cce7 100644 (file)
@@ -2637,6 +2637,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
        int cpu = smp_processor_id();
        int prev_cpu = task_cpu(p);
        struct sched_domain *sd;
+       struct sched_group *sg;
+       int i;
 
        /*
         * If the task is going to be woken-up on this cpu and if it is
@@ -2653,17 +2655,29 @@ static int select_idle_sibling(struct task_struct *p, int target)
                return prev_cpu;
 
        /*
-        * Otherwise, check assigned siblings to find an elegible idle cpu.
+        * Otherwise, iterate the domains and find an elegible idle cpu.
         */
        sd = rcu_dereference(per_cpu(sd_llc, target));
-
        for_each_lower_domain(sd) {
-               if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
-                       continue;
-               if (idle_cpu(sd->idle_buddy))
-                       return sd->idle_buddy;
-       }
+               sg = sd->groups;
+               do {
+                       if (!cpumask_intersects(sched_group_cpus(sg),
+                                               tsk_cpus_allowed(p)))
+                               goto next;
 
+                       for_each_cpu(i, sched_group_cpus(sg)) {
+                               if (!idle_cpu(i))
+                                       goto next;
+                       }
+
+                       target = cpumask_first_and(sched_group_cpus(sg),
+                                       tsk_cpus_allowed(p));
+                       goto done;
+next:
+                       sg = sg->next;
+               } while (sg != sd->groups);
+       }
+done:
        return target;
 }