Merge branch 'linus' into x86/cleanups
author		Ingo Molnar <mingo@elte.hu>
		Fri, 18 Jul 2008 11:53:16 +0000 (13:53 +0200)
committer	Ingo Molnar <mingo@elte.hu>
		Fri, 18 Jul 2008 11:53:16 +0000 (13:53 +0200)
arch/x86/kernel/process.c
arch/x86/kernel/smpboot.c

index 58325a6604a4fa650952bf49323abbb82e5007e5,4d629c62f4f8fbb993a49c0d5f12d7e88bbd94b5..158bd6a16f6ab79aee5d918f93698aed2805a937
@@@ -6,6 -6,13 +6,13 @@@
  #include <linux/sched.h>
  #include <linux/module.h>
  #include <linux/pm.h>
+ #include <linux/clockchips.h>
+ #include <asm/system.h>
+ unsigned long idle_halt;
+ EXPORT_SYMBOL(idle_halt);
+ unsigned long idle_nomwait;
+ EXPORT_SYMBOL(idle_nomwait);
  
  struct kmem_cache *task_xstate_cachep;
  
@@@ -45,6 -52,76 +52,76 @@@ void arch_task_cache_init(void
                                  SLAB_PANIC, NULL);
  }
  
+ /*
+  * Idle related variables and functions
+  */
+ unsigned long boot_option_idle_override = 0;
+ EXPORT_SYMBOL(boot_option_idle_override);
+ /*
+  * Power management idle function, if any.
+  */
+ void (*pm_idle)(void);
+ EXPORT_SYMBOL(pm_idle);
+ #ifdef CONFIG_X86_32
+ /*
+  * This halt magic was a workaround for ancient floppy DMA
+  * wreckage. It should be safe to remove.
+  */
+ static int hlt_counter;
+ void disable_hlt(void)
+ {
+       hlt_counter++;
+ }
+ EXPORT_SYMBOL(disable_hlt);
+ void enable_hlt(void)
+ {
+       hlt_counter--;
+ }
+ EXPORT_SYMBOL(enable_hlt);
+ static inline int hlt_use_halt(void)
+ {
+       return (!hlt_counter && boot_cpu_data.hlt_works_ok);
+ }
+ #else
+ static inline int hlt_use_halt(void)
+ {
+       return 1;
+ }
+ #endif
+ /*
+  * We use this if we don't have any better
+  * idle routine..
+  */
+ void default_idle(void)
+ {
+       if (hlt_use_halt()) {
+               current_thread_info()->status &= ~TS_POLLING;
+               /*
+                * TS_POLLING-cleared state must be visible before we
+                * test NEED_RESCHED:
+                */
+               smp_mb();
+               if (!need_resched())
+                       safe_halt();    /* enables interrupts racelessly */
+               else
+                       local_irq_enable();
+               current_thread_info()->status |= TS_POLLING;
+       } else {
+               local_irq_enable();
+               /* loop is done by the caller */
+               cpu_relax();
+       }
+ }
+ #ifdef CONFIG_APM_MODULE
+ EXPORT_SYMBOL(default_idle);
+ #endif
  static void do_nothing(void *unused)
  {
  }
@@@ -61,7 -138,7 +138,7 @@@ void cpu_idle_wait(void
  {
        smp_mb();
        /* kick all the CPUs so that they exit out of pm_idle */
-       smp_call_function(do_nothing, NULL, 0, 1);
+       smp_call_function(do_nothing, NULL, 1);
  }
  EXPORT_SYMBOL_GPL(cpu_idle_wait);
  
@@@ -122,57 -199,159 +199,162 @@@ static void poll_idle(void
   *
   * idle=mwait overrides this decision and forces the usage of mwait.
   */
+ #define MWAIT_INFO                    0x05
+ #define MWAIT_ECX_EXTENDED_INFO               0x01
+ #define MWAIT_EDX_C1                  0xf0
  static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
  {
+       u32 eax, ebx, ecx, edx;
        if (force_mwait)
                return 1;
  
-       if (c->x86_vendor == X86_VENDOR_AMD) {
-               switch(c->x86) {
-               case 0x10:
-               case 0x11:
-                       return 0;
-               }
-       }
+       if (c->cpuid_level < MWAIT_INFO)
+               return 0;
+       cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
+       /* Check whether EDX has extended info about MWAIT */
+       if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
+               return 1;
+       /*
+        * EDX enumerates MONITOR/MWAIT extensions. Check whether
+        * C1 supports MWAIT.
+        */
+       return (edx & MWAIT_EDX_C1);
+ }
+ /*
+  * Check for AMD CPUs, which potentially have C1E support
+  */
+ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
+ {
+       if (c->x86_vendor != X86_VENDOR_AMD)
+               return 0;
+       if (c->x86 < 0x0F)
+               return 0;
+       /* Family 0x0f models < rev F do not have C1E */
+       if (c->x86 == 0x0f && c->x86_model < 0x40)
+               return 0;
        return 1;
  }
  
- void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
+ /*
+  * C1E aware idle routine. We check for C1E active in the interrupt
+  * pending message MSR. If we detect C1E, then we handle it the same
+  * way as C3 power states (local apic timer and TSC stop)
+  */
+ static void c1e_idle(void)
  {
-       static int selected;
+       static cpumask_t c1e_mask = CPU_MASK_NONE;
+       static int c1e_detected;
  
-       if (selected)
+       if (need_resched())
                return;
+       if (!c1e_detected) {
+               u32 lo, hi;
+               rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+               if (lo & K8_INTP_C1E_ACTIVE_MASK) {
+                       c1e_detected = 1;
+                       mark_tsc_unstable("TSC halt in C1E");
+                       printk(KERN_INFO "System has C1E enabled\n");
+               }
+       }
+       if (c1e_detected) {
+               int cpu = smp_processor_id();
+               if (!cpu_isset(cpu, c1e_mask)) {
+                       cpu_set(cpu, c1e_mask);
+                       /*
+                        * Force broadcast so ACPI cannot interfere. Needs
+                        * to run with interrupts enabled as it uses
+                        * smp_call_function().
+                        */
+                       local_irq_enable();
+                       clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
+                                          &cpu);
+                       printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
+                              cpu);
+                       local_irq_disable();
+               }
+               clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
+               default_idle();
+               /*
+                * The switch back from broadcast mode needs to be
+                * called with interrupts disabled.
+                */
+               local_irq_disable();
+               clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+               local_irq_enable();
+       } else
+               default_idle();
+ }
+ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
+ {
  #ifdef CONFIG_X86_SMP
        if (pm_idle == poll_idle && smp_num_siblings > 1) {
                printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
                        " performance may degrade.\n");
        }
  #endif
+       if (pm_idle)
+               return;
        if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
                /*
-                * Skip, if setup has overridden idle.
                 * One CPU supports mwait => all CPUs support mwait
                 */
-               if (!pm_idle) {
-                       printk(KERN_INFO "using mwait in idle threads.\n");
-                       pm_idle = mwait_idle;
-               }
-       }
-       selected = 1;
+               printk(KERN_INFO "using mwait in idle threads.\n");
+               pm_idle = mwait_idle;
+       } else if (check_c1e_idle(c)) {
+               printk(KERN_INFO "using C1E aware idle routine\n");
+               pm_idle = c1e_idle;
+       } else
+               pm_idle = default_idle;
  }
  
  static int __init idle_setup(char *str)
  {
 +      if (!str)
 +              return -EINVAL;
 +
        if (!strcmp(str, "poll")) {
                printk("using polling idle threads.\n");
                pm_idle = poll_idle;
        } else if (!strcmp(str, "mwait"))
                force_mwait = 1;
-       else
+       else if (!strcmp(str, "halt")) {
+               /*
+                * With the idle=halt boot option, halt is forced to be
+                * used for CPU idle, so the C2/C3 states won't be entered.
+                * Leave boot_option_idle_override alone so that the CPU
+                * idle driver can still be loaded.
+                */
+               pm_idle = default_idle;
+               idle_halt = 1;
+               return 0;
+       } else if (!strcmp(str, "nomwait")) {
+               /*
+                * With the "idle=nomwait" boot option, mwait is disabled
+                * for the CPU C2/C3 states. Leave boot_option_idle_override
+                * untouched here as well.
+                */
+               idle_nomwait = 1;
+               return 0;
+       } else
                return -1;
  
        boot_option_idle_override = 1;
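
For readers who want to inspect the same CPUID data from user space, here is a minimal sketch of the leaf 5 check that mwait_usable() performs above, built on GCC's <cpuid.h>. The constants mirror MWAIT_INFO, MWAIT_ECX_EXTENDED_INFO and MWAIT_EDX_C1 from the hunk; the program itself is illustrative only:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Leaf 5 is the MONITOR/MWAIT enumeration leaf (MWAIT_INFO). */
	if (__get_cpuid_max(0, NULL) < 0x05) {
		puts("CPUID leaf 5 not available");
		return 1;
	}
	__cpuid_count(0x05, 0, eax, ebx, ecx, edx);

	/* ECX bit 0: EDX carries extended MONITOR/MWAIT enumeration. */
	if (!(ecx & 0x01)) {
		puts("no extended enumeration; MWAIT assumed usable");
		return 0;
	}

	/* EDX[7:4]: number of C1 sub-states reachable with MWAIT. */
	printf("C1 MWAIT sub-states: %u\n", (edx >> 4) & 0xf);
	return (edx & 0xf0) ? 0 : 1;
}
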
index e47bfac70c3807a7d5587329c83d39de21c6cb1f,687376ab07e82ece4ab1eeb609d738d76245d226..a9ca7dadc85215f988fc68a75722833dbeb05f1d
  #include <mach_wakecpu.h>
  #include <smpboot_hooks.h>
  
- /*
-  * FIXME: For x86_64, those are defined in other files. But moving them here,
-  * would make the setup areas dependent on smp, which is a loss. When we
-  * integrate apic between arches, we can probably do a better job, but
-  * right now, they'll stay here -- glommer
-  */
- /* which logical CPU number maps to which CPU (physical APIC ID) */
- u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata =
-                       { [0 ... NR_CPUS-1] = BAD_APICID };
- void *x86_cpu_to_apicid_early_ptr;
- u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
-                               = { [0 ... NR_CPUS-1] = BAD_APICID };
- void *x86_bios_cpu_apicid_early_ptr;
  #ifdef CONFIG_X86_32
  u8 apicid_2_node[MAX_APICID];
  static int low_mappings;
@@@ -197,13 -181,12 +181,12 @@@ static void map_cpu_to_logical_apicid(v
        map_cpu_to_node(cpu, node);
  }
  
- static void unmap_cpu_to_logical_apicid(int cpu)
+ void numa_remove_cpu(int cpu)
  {
        cpu_2_logical_apicid[cpu] = BAD_APICID;
        unmap_cpu_to_node(cpu);
  }
  #else
- #define unmap_cpu_to_logical_apicid(cpu) do {} while (0)
  #define map_cpu_to_logical_apicid()  do {} while (0)
  #endif
  
@@@ -344,19 -327,12 +327,12 @@@ static void __cpuinit start_secondary(v
         * lock helps us to not include this cpu in a currently in progress
         * smp_call_function().
         */
-       lock_ipi_call_lock();
- #ifdef CONFIG_X86_64
-       spin_lock(&vector_lock);
-       /* Setup the per cpu irq handling data structures */
-       __setup_vector_irq(smp_processor_id());
-       /*
-        * Allow the master to continue.
-        */
-       spin_unlock(&vector_lock);
+       ipi_call_lock_irq();
+ #ifdef CONFIG_X86_IO_APIC
+       setup_vector_irq(smp_processor_id());
  #endif
        cpu_set(smp_processor_id(), cpu_online_map);
-       unlock_ipi_call_lock();
+       ipi_call_unlock_irq();
        per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
  
        setup_secondary_clock();
        cpu_idle();
  }
  
- #ifdef CONFIG_X86_32
- /*
-  * Everything has been set up for the secondary
-  * CPUs - they just need to reload everything
-  * from the task structure
-  * This function must not return.
-  */
- void __devinit initialize_secondary(void)
- {
-       /*
-        * We don't actually need to load the full TSS,
-        * basically just the stack pointer and the ip.
-        */
-       asm volatile(
-               "movl %0,%%esp\n\t"
-               "jmp *%1"
-               :
-               :"m" (current->thread.sp), "m" (current->thread.ip));
- }
- #endif
  static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
  {
- #ifdef CONFIG_X86_32
        /*
         * Mask B, Pentium, but not Pentium MMX
         */
  
  valid_k7:
        ;
- #endif
  }
  
  static void __cpuinit smp_checks(void)
@@@ -554,23 -506,6 +506,6 @@@ cpumask_t cpu_coregroup_map(int cpu
                return c->llc_shared_map;
  }
  
- #ifdef CONFIG_X86_32
- /*
-  * We are called very early to get the low memory for the
-  * SMP bootup trampoline page.
-  */
- void __init smp_alloc_memory(void)
- {
-       trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE);
-       /*
-        * Has to be in very low memory so we can execute
-        * real-mode AP code.
-        */
-       if (__pa(trampoline_base) >= 0x9F000)
-               BUG();
- }
- #endif
  static void impress_friends(void)
  {
        int cpu;
@@@ -747,11 -682,7 +682,7 @@@ wakeup_secondary_cpu(int phys_apicid, u
         * target processor state.
         */
        startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
- #ifdef CONFIG_X86_64
-                        (unsigned long)init_rsp);
- #else
                         (unsigned long)stack_start.sp);
- #endif
  
        /*
         * Run STARTUP IPI loop.
@@@ -831,6 -762,45 +762,45 @@@ static void __cpuinit do_fork_idle(stru
        complete(&c_idle->done);
  }
  
+ #ifdef CONFIG_X86_64
+ /*
+  * Allocate node local memory for the AP pda.
+  *
+  * Must be called after the _cpu_pda pointer table is initialized.
+  */
+ static int __cpuinit get_local_pda(int cpu)
+ {
+       struct x8664_pda *oldpda, *newpda;
+       unsigned long size = sizeof(struct x8664_pda);
+       int node = cpu_to_node(cpu);
+       if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
+               return 0;
+       oldpda = cpu_pda(cpu);
+       newpda = kmalloc_node(size, GFP_ATOMIC, node);
+       if (!newpda) {
+               printk(KERN_ERR "Could not allocate node local PDA "
+                       "for CPU %d on node %d\n", cpu, node);
+               if (oldpda)
+                       return 0;       /* have a usable pda */
+               else
+                       return -1;
+       }
+       if (oldpda) {
+               memcpy(newpda, oldpda, size);
+               if (!after_bootmem)
+                       free_bootmem((unsigned long)oldpda, size);
+       }
+       newpda->in_bootmem = 0;
+       cpu_pda(cpu) = newpda;
+       return 0;
+ }
+ #endif /* CONFIG_X86_64 */
  static int __cpuinit do_boot_cpu(int apicid, int cpu)
  /*
   * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
                .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
        };
        INIT_WORK(&c_idle.work, do_fork_idle);
- #ifdef CONFIG_X86_64
-       /* allocate memory for gdts of secondary cpus. Hotplug is considered */
-       if (!cpu_gdt_descr[cpu].address &&
-               !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) {
-               printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu);
-               return -1;
-       }
  
+ #ifdef CONFIG_X86_64
        /* Allocate node local memory for AP pdas */
-       if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) {
-               struct x8664_pda *newpda, *pda;
-               int node = cpu_to_node(cpu);
-               pda = cpu_pda(cpu);
-               newpda = kmalloc_node(sizeof(struct x8664_pda), GFP_ATOMIC,
-                                     node);
-               if (newpda) {
-                       memcpy(newpda, pda, sizeof(struct x8664_pda));
-                       cpu_pda(cpu) = newpda;
-               } else
-                       printk(KERN_ERR
-               "Could not allocate node local PDA for CPU %d on node %d\n",
-                               cpu, node);
+       if (cpu > 0) {
+               boot_error = get_local_pda(cpu);
+               if (boot_error)
+                       goto restore_state;
+                       /* if we can't get PDA memory, we can't start the CPU */
        }
  #endif
  
@@@ -904,18 -860,15 +860,15 @@@ do_rest
  #ifdef CONFIG_X86_32
        per_cpu(current_task, cpu) = c_idle.idle;
        init_gdt(cpu);
-       early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
-       c_idle.idle->thread.ip = (unsigned long) start_secondary;
        /* Stack for startup_32 can be just as for start_secondary onwards */
-       stack_start.sp = (void *) c_idle.idle->thread.sp;
        irq_ctx_init(cpu);
  #else
        cpu_pda(cpu)->pcurrent = c_idle.idle;
-       init_rsp = c_idle.idle->thread.sp;
-       load_sp0(&per_cpu(init_tss, cpu), &c_idle.idle->thread);
-       initial_code = (unsigned long)start_secondary;
        clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
  #endif
+       early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
+       initial_code = (unsigned long)start_secondary;
+       stack_start.sp = (void *) c_idle.idle->thread.sp;
  
        /* start_ip had better be page-aligned! */
        start_ip = setup_trampoline();
                                inquire_remote_apic(apicid);
                }
        }
-       if (boot_error) {
-               /* Try to put things back the way they were before ... */
-               unmap_cpu_to_logical_apicid(cpu);
  #ifdef CONFIG_X86_64
-               clear_node_cpumask(cpu); /* was set by numa_add_cpu */
+ restore_state:
  #endif
+       if (boot_error) {
+               /* Try to put things back the way they were before ... */
+               numa_remove_cpu(cpu); /* was set by numa_add_cpu */
                cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */
                cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
-               cpu_clear(cpu, cpu_possible_map);
                cpu_clear(cpu, cpu_present_map);
                per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
        }
@@@ -1087,14 -1038,12 +1038,12 @@@ static __init void disable_smp(void
  {
        cpu_present_map = cpumask_of_cpu(0);
        cpu_possible_map = cpumask_of_cpu(0);
- #ifdef CONFIG_X86_32
        smpboot_clear_io_apic_irqs();
- #endif
        if (smp_found_config)
-               phys_cpu_present_map =
-                               physid_mask_of_physid(boot_cpu_physical_apicid);
+               physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
        else
-               phys_cpu_present_map = physid_mask_of_physid(0);
+               physid_set_mask_of_physid(0, &phys_cpu_present_map);
        map_cpu_to_logical_apicid();
        cpu_set(0, per_cpu(cpu_sibling_map, 0));
        cpu_set(0, per_cpu(cpu_core_map, 0));
@@@ -1157,12 -1106,12 +1106,12 @@@ static int __init smp_sanity_check(unsi
         * If SMP should be disabled, then really disable it!
         */
        if (!max_cpus) {
-               printk(KERN_INFO "SMP mode deactivated,"
-                                "forcing use of dummy APIC emulation.\n");
+               printk(KERN_INFO "SMP mode deactivated.\n");
                smpboot_clear_io_apic();
- #ifdef CONFIG_X86_32
+               localise_nmi_watchdog();
                connect_bsp_APIC();
- #endif
                setup_local_APIC();
                end_local_APIC_setup();
                return -1;
@@@ -1190,7 -1139,6 +1139,6 @@@ static void __init smp_cpu_index_defaul
  void __init native_smp_prepare_cpus(unsigned int max_cpus)
  {
        preempt_disable();
-       nmi_watchdog_default();
        smp_cpu_index_default();
        current_cpu_data = boot_cpu_data;
        cpu_callin_map = cpumask_of_cpu(0);
        }
        preempt_enable();
  
- #ifdef CONFIG_X86_32
        connect_bsp_APIC();
- #endif
        /*
         * Switch from PIC to APIC mode.
         */
@@@ -1257,8 -1204,8 +1204,8 @@@ void __init native_smp_prepare_boot_cpu
        int me = smp_processor_id();
  #ifdef CONFIG_X86_32
        init_gdt(me);
-       switch_to_new_gdt();
  #endif
+       switch_to_new_gdt();
        /* already set me in cpu_online_map in boot_cpu_init() */
        cpu_set(me, cpu_callout_map);
        per_cpu(cpu_state, me) = CPU_ONLINE;
@@@ -1278,23 -1225,6 +1225,6 @@@ void __init native_smp_cpus_done(unsign
  
  #ifdef CONFIG_HOTPLUG_CPU
  
- #  ifdef CONFIG_X86_32
- void cpu_exit_clear(void)
- {
-       int cpu = raw_smp_processor_id();
-       idle_task_exit();
-       cpu_uninit();
-       irq_ctx_exit(cpu);
-       cpu_clear(cpu, cpu_callout_map);
-       cpu_clear(cpu, cpu_callin_map);
-       unmap_cpu_to_logical_apicid(cpu);
- }
- #  endif /* CONFIG_X86_32 */
  static void remove_siblinginfo(int cpu)
  {
        int sibling;
@@@ -1348,12 -1278,20 +1278,20 @@@ __init void prefill_possible_map(void
        int i;
        int possible;
  
+       /* no processor from mptable or madt */
+       if (!num_processors)
+               num_processors = 1;
+ #ifdef CONFIG_HOTPLUG_CPU
        if (additional_cpus == -1) {
                if (disabled_cpus > 0)
                        additional_cpus = disabled_cpus;
                else
                        additional_cpus = 0;
        }
+ #else
+       additional_cpus = 0;
+ #endif
        possible = num_processors + additional_cpus;
        if (possible > NR_CPUS)
                possible = NR_CPUS;
  
        for (i = 0; i < possible; i++)
                cpu_set(i, cpu_possible_map);
+       nr_cpu_ids = possible;
  }
  
  static void __ref remove_cpu_from_maps(int cpu)
  {
        cpu_clear(cpu, cpu_online_map);
- #ifdef CONFIG_X86_64
        cpu_clear(cpu, cpu_callout_map);
        cpu_clear(cpu, cpu_callin_map);
        /* was set by cpu_init() */
        clear_bit(cpu, (unsigned long *)&cpu_initialized);
-       clear_node_cpumask(cpu);
- #endif
+       numa_remove_cpu(cpu);
  }
  
  int __cpu_disable(void)
@@@ -1452,8 -1390,7 +1390,8 @@@ static int __init parse_maxcpus(char *a
  {
        extern unsigned int maxcpus;
  
 -      maxcpus = simple_strtoul(arg, NULL, 0);
 +      if (arg)
 +              maxcpus = simple_strtoul(arg, NULL, 0);
        return 0;
  }
  early_param("maxcpus", parse_maxcpus);