[PATCH] sched: new sched domain for representing multi-core
authorSiddha, Suresh B <suresh.b.siddha@intel.com>
Mon, 27 Mar 2006 09:15:22 +0000 (01:15 -0800)
committerLinus Torvalds <torvalds@g5.osdl.org>
Mon, 27 Mar 2006 16:44:43 +0000 (08:44 -0800)
Add a new sched domain for representing multi-core with shared caches
between cores.  Consider a dual package system, each package containing two
cores and with last level cache shared between cores with in a package.  If
there are two runnable processes, with this appended patch those two
processes will be scheduled on different packages.

On such systems, with this patch we have observed 8% perf improvement with
specJBB(2 warehouse) benchmark and 35% improvement with CFP2000 rate(with 2
users).

This new domain will come into play only on multi-core systems with shared
caches.  On other systems, this sched domain will be removed by domain
degeneration code.  This new domain can be also used for implementing power
savings policy (see OLS 2005 CMP kernel scheduler paper for more details..
I will post another patch for power savings policy soon)

Most of the arch/* file changes are for cpu_coregroup_map() implementation.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
14 files changed:
arch/i386/Kconfig
arch/i386/kernel/cpu/common.c
arch/i386/kernel/cpu/intel_cacheinfo.c
arch/i386/kernel/smpboot.c
arch/x86_64/Kconfig
arch/x86_64/kernel/setup.c
arch/x86_64/kernel/smpboot.c
include/asm-i386/processor.h
include/asm-i386/topology.h
include/asm-x86_64/processor.h
include/asm-x86_64/smp.h
include/asm-x86_64/topology.h
include/linux/topology.h
kernel/sched.c

index f7db71d0b913ed6408480cdf1d4facb4ead77ec1..f17bd1d2707eb31be18fd1010fb225b641e823d5 100644 (file)
@@ -231,6 +231,15 @@ config SCHED_SMT
          cost of slightly increased overhead in some places. If unsure say
          N here.
 
+config SCHED_MC
+       bool "Multi-core scheduler support"
+       depends on SMP
+       default y
+       help
+         Multi-core scheduler support improves the CPU scheduler's decision
+         making when dealing with multi-core CPU chips at a cost of slightly
+         increased overhead in some places. If unsure say N here.
+
 source "kernel/Kconfig.preempt"
 
 config X86_UP_APIC
index 7e3d6b6a4e963bc7db6beee8dccbdd94ae96532e..a06a49075f107ec47e45aa8db6a91e214bfc1d1e 100644 (file)
@@ -266,7 +266,7 @@ static void __init early_cpu_detect(void)
 void __cpuinit generic_identify(struct cpuinfo_x86 * c)
 {
        u32 tfms, xlvl;
-       int junk;
+       int ebx;
 
        if (have_cpuid_p()) {
                /* Get vendor name */
@@ -282,7 +282,7 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c)
                /* Intel-defined flags: level 0x00000001 */
                if ( c->cpuid_level >= 0x00000001 ) {
                        u32 capability, excap;
-                       cpuid(0x00000001, &tfms, &junk, &excap, &capability);
+                       cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
                        c->x86_capability[0] = capability;
                        c->x86_capability[4] = excap;
                        c->x86 = (tfms >> 8) & 15;
@@ -292,6 +292,11 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c)
                        if (c->x86 >= 0x6)
                                c->x86_model += ((tfms >> 16) & 0xF) << 4;
                        c->x86_mask = tfms & 15;
+#ifdef CONFIG_SMP
+                       c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
+#else
+                       c->apicid = (ebx >> 24) & 0xFF;
+#endif
                } else {
                        /* Have CPUID level 0 only - unheard of */
                        c->x86 = 4;
@@ -474,7 +479,6 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 
        cpuid(1, &eax, &ebx, &ecx, &edx);
 
-       c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
 
        if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
                return;
index ce61921369e53aa5827e8262daece57cbe2fca2a..7e7fd4e67dd0613dcf0322c3a77a37e9ed1eba57 100644 (file)
@@ -173,6 +173,10 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
        unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */
        unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
        unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
+       unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
+#ifdef CONFIG_SMP
+       unsigned int cpu = (c == &boot_cpu_data) ? 0 : (c - cpu_data);
+#endif
 
        if (c->cpuid_level > 3) {
                static int is_initialized;
@@ -205,9 +209,15 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
                                        break;
                                    case 2:
                                        new_l2 = this_leaf.size/1024;
+                                       num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
+                                       index_msb = get_count_order(num_threads_sharing);
+                                       l2_id = c->apicid >> index_msb;
                                        break;
                                    case 3:
                                        new_l3 = this_leaf.size/1024;
+                                       num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
+                                       index_msb = get_count_order(num_threads_sharing);
+                                       l3_id = c->apicid >> index_msb;
                                        break;
                                    default:
                                        break;
@@ -273,11 +283,19 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
                if (new_l1i)
                        l1i = new_l1i;
 
-               if (new_l2)
+               if (new_l2) {
                        l2 = new_l2;
+#ifdef CONFIG_SMP
+                       cpu_llc_id[cpu] = l2_id;
+#endif
+               }
 
-               if (new_l3)
+               if (new_l3) {
                        l3 = new_l3;
+#ifdef CONFIG_SMP
+                       cpu_llc_id[cpu] = l3_id;
+#endif
+               }
 
                if ( trace )
                        printk (KERN_INFO "CPU: Trace cache: %dK uops", trace);
index 82371d83bfa912719c57971980c103b28504cea2..a6969903f2d6ff80ef0116ec7dc262f36b7763cc 100644 (file)
@@ -72,6 +72,9 @@ int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
 /* Core ID of each logical CPU */
 int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
 
+/* Last level cache ID of each logical CPU */
+int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
+
 /* representing HT siblings of each logical CPU */
 cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(cpu_sibling_map);
@@ -440,6 +443,18 @@ static void __devinit smp_callin(void)
 
 static int cpucount;
 
+/* maps the cpu to the sched domain representing multi-core */
+cpumask_t cpu_coregroup_map(int cpu)
+{
+       struct cpuinfo_x86 *c = cpu_data + cpu;
+       /*
+        * For perf, we return last level cache shared map.
+        * TBD: when power saving sched policy is added, we will return
+        *      cpu_core_map when power saving policy is enabled
+        */
+       return c->llc_shared_map;
+}
+
 /* representing cpus for which sibling maps can be computed */
 static cpumask_t cpu_sibling_setup_map;
 
@@ -459,12 +474,16 @@ set_cpu_sibling_map(int cpu)
                                cpu_set(cpu, cpu_sibling_map[i]);
                                cpu_set(i, cpu_core_map[cpu]);
                                cpu_set(cpu, cpu_core_map[i]);
+                               cpu_set(i, c[cpu].llc_shared_map);
+                               cpu_set(cpu, c[i].llc_shared_map);
                        }
                }
        } else {
                cpu_set(cpu, cpu_sibling_map[cpu]);
        }
 
+       cpu_set(cpu, c[cpu].llc_shared_map);
+
        if (current_cpu_data.x86_max_cores == 1) {
                cpu_core_map[cpu] = cpu_sibling_map[cpu];
                c[cpu].booted_cores = 1;
@@ -472,6 +491,11 @@ set_cpu_sibling_map(int cpu)
        }
 
        for_each_cpu_mask(i, cpu_sibling_setup_map) {
+               if (cpu_llc_id[cpu] != BAD_APICID &&
+                   cpu_llc_id[cpu] == cpu_llc_id[i]) {
+                       cpu_set(i, c[cpu].llc_shared_map);
+                       cpu_set(cpu, c[i].llc_shared_map);
+               }
                if (phys_proc_id[cpu] == phys_proc_id[i]) {
                        cpu_set(i, cpu_core_map[cpu]);
                        cpu_set(cpu, cpu_core_map[i]);
index 45efe0ca88f81290227f0ab2262451664be92225..1cb4aa241c8cf7dff07eb05c9c2353b558e14166 100644 (file)
@@ -250,6 +250,15 @@ config SCHED_SMT
          cost of slightly increased overhead in some places. If unsure say
          N here.
 
+config SCHED_MC
+       bool "Multi-core scheduler support"
+       depends on SMP
+       default y
+       help
+         Multi-core scheduler support improves the CPU scheduler's decision
+         making when dealing with multi-core CPU chips at a cost of slightly
+         increased overhead in some places. If unsure say N here.
+
 source "kernel/Kconfig.preempt"
 
 config NUMA
index a57eec8311a79a5ac10bdc76622e2dc0627598e0..d1f3e9272c054b6efe4db3e032fbeb7883f371b4 100644 (file)
@@ -962,7 +962,6 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 
        cpuid(1, &eax, &ebx, &ecx, &edx);
 
-       c->apicid = phys_pkg_id(0);
 
        if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
                return;
@@ -1171,6 +1170,8 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
                        c->x86_capability[2] = cpuid_edx(0x80860001);
        }
 
+       c->apicid = phys_pkg_id(0);
+
        /*
         * Vendor-specific initialization.  In this section we
         * canonicalize the feature flags, meaning if there are
index 66e98659d077ea9c6f9ab8ce3890aa7318113d63..ea48fa638070060e05a257bad6ba4c03ecdc88f8 100644 (file)
@@ -68,6 +68,9 @@ u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
 /* core ID of each logical CPU */
 u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
 
+/* Last level cache ID of each logical CPU */
+u8 cpu_llc_id[NR_CPUS] __cpuinitdata  = {[0 ... NR_CPUS-1] = BAD_APICID};
+
 /* Bitmask of currently online CPUs */
 cpumask_t cpu_online_map __read_mostly;
 
@@ -445,6 +448,18 @@ void __cpuinit smp_callin(void)
        cpu_set(cpuid, cpu_callin_map);
 }
 
+/* maps the cpu to the sched domain representing multi-core */
+cpumask_t cpu_coregroup_map(int cpu)
+{
+       struct cpuinfo_x86 *c = cpu_data + cpu;
+       /*
+        * For perf, we return last level cache shared map.
+        * TBD: when power saving sched policy is added, we will return
+        *      cpu_core_map when power saving policy is enabled
+        */
+       return c->llc_shared_map;
+}
+
 /* representing cpus for which sibling maps can be computed */
 static cpumask_t cpu_sibling_setup_map;
 
@@ -463,12 +478,16 @@ static inline void set_cpu_sibling_map(int cpu)
                                cpu_set(cpu, cpu_sibling_map[i]);
                                cpu_set(i, cpu_core_map[cpu]);
                                cpu_set(cpu, cpu_core_map[i]);
+                               cpu_set(i, c[cpu].llc_shared_map);
+                               cpu_set(cpu, c[i].llc_shared_map);
                        }
                }
        } else {
                cpu_set(cpu, cpu_sibling_map[cpu]);
        }
 
+       cpu_set(cpu, c[cpu].llc_shared_map);
+
        if (current_cpu_data.x86_max_cores == 1) {
                cpu_core_map[cpu] = cpu_sibling_map[cpu];
                c[cpu].booted_cores = 1;
@@ -476,6 +495,11 @@ static inline void set_cpu_sibling_map(int cpu)
        }
 
        for_each_cpu_mask(i, cpu_sibling_setup_map) {
+               if (cpu_llc_id[cpu] != BAD_APICID &&
+                   cpu_llc_id[cpu] == cpu_llc_id[i]) {
+                       cpu_set(i, c[cpu].llc_shared_map);
+                       cpu_set(cpu, c[i].llc_shared_map);
+               }
                if (phys_proc_id[cpu] == phys_proc_id[i]) {
                        cpu_set(i, cpu_core_map[cpu]);
                        cpu_set(cpu, cpu_core_map[i]);
index feca5d961e2b0c26a20c89f2b67c1cac230866fc..af4bfd012475ced0edb62d553a1691872b5dbc9c 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/config.h>
 #include <linux/threads.h>
 #include <asm/percpu.h>
+#include <linux/cpumask.h>
 
 /* flag for disabling the tsc */
 extern int tsc_disable;
@@ -67,6 +68,9 @@ struct cpuinfo_x86 {
        char    pad0;
        int     x86_power;
        unsigned long loops_per_jiffy;
+#ifdef CONFIG_SMP
+       cpumask_t llc_shared_map;       /* cpus sharing the last level cache */
+#endif
        unsigned char x86_max_cores;    /* cpuid returned max cores value */
        unsigned char booted_cores;     /* number of cores as seen by OS */
        unsigned char apicid;
@@ -103,6 +107,7 @@ extern struct cpuinfo_x86 cpu_data[];
 
 extern int phys_proc_id[NR_CPUS];
 extern int cpu_core_id[NR_CPUS];
+extern int cpu_llc_id[NR_CPUS];
 extern char ignore_fpu_irq;
 
 extern void identify_cpu(struct cpuinfo_x86 *);
index aa958c6ee83e35898ad50ccecab748f18daf7d50..b94e5eeef917ea92d9f7339593e1bda8dcc3bafd 100644 (file)
@@ -112,4 +112,6 @@ extern unsigned long node_remap_size[];
 
 #endif /* CONFIG_NUMA */
 
+extern cpumask_t cpu_coregroup_map(int cpu);
+
 #endif /* _ASM_I386_TOPOLOGY_H */
index 8c8d88c036ed07d037f55435196b91aba545c586..1aa2cee433443caabeee201580020f0c852b5a5d 100644 (file)
@@ -20,6 +20,7 @@
 #include <asm/mmsegment.h>
 #include <asm/percpu.h>
 #include <linux/personality.h>
+#include <linux/cpumask.h>
 
 #define TF_MASK                0x00000100
 #define IF_MASK                0x00000200
@@ -65,6 +66,9 @@ struct cpuinfo_x86 {
         __u32   x86_power;     
        __u32   extended_cpuid_level;   /* Max extended CPUID function supported */
        unsigned long loops_per_jiffy;
+#ifdef CONFIG_SMP
+       cpumask_t llc_shared_map;       /* cpus sharing the last level cache */
+#endif
        __u8    apicid;
        __u8    booted_cores;   /* number of cores as seen by OS */
 } ____cacheline_aligned;
index 9ccbb2cfd5c093110c3a853d2fe0c1d3e11ea9e3..a4fdaeb5c3977b189f919153a0e51aa46cd94477 100644 (file)
@@ -56,6 +56,7 @@ extern cpumask_t cpu_sibling_map[NR_CPUS];
 extern cpumask_t cpu_core_map[NR_CPUS];
 extern u8 phys_proc_id[NR_CPUS];
 extern u8 cpu_core_id[NR_CPUS];
+extern u8 cpu_llc_id[NR_CPUS];
 
 #define SMP_TRAMPOLINE_BASE 0x6000
 
index c642f5d9882df400e91b3f1fdf7ba3ff966c54bf..9db54e9d17bb0c68f31677fc2cc6fceccb8df6d7 100644 (file)
@@ -68,4 +68,6 @@ extern int __node_distance(int, int);
 
 #include <asm-generic/topology.h>
 
+extern cpumask_t cpu_coregroup_map(int cpu);
+
 #endif
index e8eb0040ce3a241928476e46f8c1e6ed673e158f..a305ae2e44b6dde305d3afe241768e32c47d8907 100644 (file)
        .nr_balance_failed      = 0,                    \
 }
 
+#ifdef CONFIG_SCHED_MC
+#ifndef SD_MC_INIT
+/* for now its same as SD_CPU_INIT.
+ * TBD: Tune Domain parameters!
+ */
+#define SD_MC_INIT   SD_CPU_INIT
+#endif
+#endif
+
 #ifdef CONFIG_NUMA
 #ifndef SD_NODE_INIT
 #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
index a96a05d23262be5353420d9398454fe90c4b7e90..8a8b71b5751ba3c14be002fc5ded28521ff4ef44 100644 (file)
@@ -5574,11 +5574,31 @@ static int cpu_to_cpu_group(int cpu)
 }
 #endif
 
+#ifdef CONFIG_SCHED_MC
+static DEFINE_PER_CPU(struct sched_domain, core_domains);
+static struct sched_group sched_group_core[NR_CPUS];
+#endif
+
+#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
+static int cpu_to_core_group(int cpu)
+{
+       return first_cpu(cpu_sibling_map[cpu]);
+}
+#elif defined(CONFIG_SCHED_MC)
+static int cpu_to_core_group(int cpu)
+{
+       return cpu;
+}
+#endif
+
 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
 static struct sched_group sched_group_phys[NR_CPUS];
 static int cpu_to_phys_group(int cpu)
 {
-#ifdef CONFIG_SCHED_SMT
+#if defined(CONFIG_SCHED_MC)
+       cpumask_t mask = cpu_coregroup_map(cpu);
+       return first_cpu(mask);
+#elif defined(CONFIG_SCHED_SMT)
        return first_cpu(cpu_sibling_map[cpu]);
 #else
        return cpu;
@@ -5676,6 +5696,17 @@ void build_sched_domains(const cpumask_t *cpu_map)
                sd->parent = p;
                sd->groups = &sched_group_phys[group];
 
+#ifdef CONFIG_SCHED_MC
+               p = sd;
+               sd = &per_cpu(core_domains, i);
+               group = cpu_to_core_group(i);
+               *sd = SD_MC_INIT;
+               sd->span = cpu_coregroup_map(i);
+               cpus_and(sd->span, sd->span, *cpu_map);
+               sd->parent = p;
+               sd->groups = &sched_group_core[group];
+#endif
+
 #ifdef CONFIG_SCHED_SMT
                p = sd;
                sd = &per_cpu(cpu_domains, i);
@@ -5701,6 +5732,19 @@ void build_sched_domains(const cpumask_t *cpu_map)
        }
 #endif
 
+#ifdef CONFIG_SCHED_MC
+       /* Set up multi-core groups */
+       for_each_cpu_mask(i, *cpu_map) {
+               cpumask_t this_core_map = cpu_coregroup_map(i);
+               cpus_and(this_core_map, this_core_map, *cpu_map);
+               if (i != first_cpu(this_core_map))
+                       continue;
+               init_sched_build_groups(sched_group_core, this_core_map,
+                                       &cpu_to_core_group);
+       }
+#endif
+
+
        /* Set up physical groups */
        for (i = 0; i < MAX_NUMNODES; i++) {
                cpumask_t nodemask = node_to_cpumask(i);
@@ -5797,11 +5841,31 @@ void build_sched_domains(const cpumask_t *cpu_map)
                power = SCHED_LOAD_SCALE;
                sd->groups->cpu_power = power;
 #endif
+#ifdef CONFIG_SCHED_MC
+               sd = &per_cpu(core_domains, i);
+               power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
+                                           * SCHED_LOAD_SCALE / 10;
+               sd->groups->cpu_power = power;
+
+               sd = &per_cpu(phys_domains, i);
 
+               /*
+                * This has to be < 2 * SCHED_LOAD_SCALE
+                * Lets keep it SCHED_LOAD_SCALE, so that
+                * while calculating NUMA group's cpu_power
+                * we can simply do
+                *  numa_group->cpu_power += phys_group->cpu_power;
+                *
+                * See "only add power once for each physical pkg"
+                * comment below
+                */
+               sd->groups->cpu_power = SCHED_LOAD_SCALE;
+#else
                sd = &per_cpu(phys_domains, i);
                power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
                                (cpus_weight(sd->groups->cpumask)-1) / 10;
                sd->groups->cpu_power = power;
+#endif
 
 #ifdef CONFIG_NUMA
                sd = &per_cpu(allnodes_domains, i);
@@ -5823,7 +5887,6 @@ void build_sched_domains(const cpumask_t *cpu_map)
 next_sg:
                for_each_cpu_mask(j, sg->cpumask) {
                        struct sched_domain *sd;
-                       int power;
 
                        sd = &per_cpu(phys_domains, j);
                        if (j != first_cpu(sd->groups->cpumask)) {
@@ -5833,10 +5896,8 @@ next_sg:
                                 */
                                continue;
                        }
-                       power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-                               (cpus_weight(sd->groups->cpumask)-1) / 10;
 
-                       sg->cpu_power += power;
+                       sg->cpu_power += sd->groups->cpu_power;
                }
                sg = sg->next;
                if (sg != sched_group_nodes[i])
@@ -5849,6 +5910,8 @@ next_sg:
                struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
                sd = &per_cpu(cpu_domains, i);
+#elif defined(CONFIG_SCHED_MC)
+               sd = &per_cpu(core_domains, i);
 #else
                sd = &per_cpu(phys_domains, i);
 #endif