Merge x86-64 update from Andi

author Linus Torvalds <torvalds@g5.osdl.org>

Tue, 15 Nov 2005 03:56:02 +0000 (19:56 -0800)

committer Linus Torvalds <torvalds@g5.osdl.org>

Tue, 15 Nov 2005 03:56:02 +0000 (19:56 -0800)
author Linus Torvalds <torvalds@g5.osdl.org>
Tue, 15 Nov 2005 03:56:02 +0000 (19:56 -0800)
committer Linus Torvalds <torvalds@g5.osdl.org>
Tue, 15 Nov 2005 03:56:02 +0000 (19:56 -0800)
diff --combined arch/i386/kernel/cpu/common.c

index c145fb30002ed6894d37ffa03b33227166337243,4e9c2e99b0a52754f5c32aeff7ba430b8c96979c..31e344b26bae824f255baa1446980486d8aff52a
--- 1/arch/i386/kernel/cpu/common.c
--- 2/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@@ -30,6 -30,8 +30,6 @@@ static int disable_x86_serial_nr __devi
   
   struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
   
- -extern void mcheck_init(struct cpuinfo_x86 *c);
- -
   extern int disable_pse;
   
   static void default_init(struct cpuinfo_x86 * c)
@@@ -231,10 -233,10 +231,10 @@@ static void __init early_cpu_detect(voi
                 cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
                 c->x86 = (tfms >> 8) & 15;
                 c->x86_model = (tfms >> 4) & 15;
-               if (c->x86 == 0xf) {
+               if (c->x86 == 0xf)
                         c->x86 += (tfms >> 20) & 0xff;
+               if (c->x86 >= 0x6)
                         c->x86_model += ((tfms >> 16) & 0xF) << 4;
-               }
                 c->x86_mask = tfms & 15;
                 if (cap0 & (1<<19))
                         c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
@@@ -333,7 -335,7 +333,7 @@@ void __devinit identify_cpu(struct cpui
         c->x86_model = c->x86_mask = 0; /* So far unknown... */
         c->x86_vendor_id[0] = '\0'; /* Unset */
         c->x86_model_id[0] = '\0';  /* Unset */
-       c->x86_num_cores = 1;
+       c->x86_max_cores = 1;
         memset(&c->x86_capability, 0, sizeof c->x86_capability);
   
         if (!have_cpuid_p()) {
@@@ -427,8 -429,9 +427,8 @@@
         }
   
         /* Init Machine Check Exception if available. */
- -#ifdef CONFIG_X86_MCE
         mcheck_init(c);
- -#endif
+ +
         if (c == &boot_cpu_data)
                 sysenter_setup();
         enable_sep_cpu();
@@@ -443,52 -446,44 +443,44 @@@
   void __devinit detect_ht(struct cpuinfo_x86 *c)
   {
         u32     eax, ebx, ecx, edx;
-       int     index_msb, tmp;
+       int     index_msb, core_bits;
         int     cpu = smp_processor_id();
   
+       cpuid(1, &eax, &ebx, &ecx, &edx);
+ 
+       c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
+ 
         if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
                 return;
   
-       cpuid(1, &eax, &ebx, &ecx, &edx);
         smp_num_siblings = (ebx & 0xff0000) >> 16;
   
         if (smp_num_siblings == 1) {
                 printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
         } else if (smp_num_siblings > 1 ) {
-               index_msb = 31;
   
                 if (smp_num_siblings > NR_CPUS) {
                         printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
                         smp_num_siblings = 1;
                         return;
                 }
-               tmp = smp_num_siblings;
-               while ((tmp & 0x80000000 ) == 0) {
-                       tmp <<=1 ;
-                       index_msb--;
-               }
-               if (smp_num_siblings & (smp_num_siblings - 1))
-                       index_msb++;
+ 
+               index_msb = get_count_order(smp_num_siblings);
                 phys_proc_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
   
                 printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
                        phys_proc_id[cpu]);
   
-               smp_num_siblings = smp_num_siblings / c->x86_num_cores;
+               smp_num_siblings = smp_num_siblings / c->x86_max_cores;
   
-               tmp = smp_num_siblings;
-               index_msb = 31;
-               while ((tmp & 0x80000000) == 0) {
-                       tmp <<=1 ;
-                       index_msb--;
-               }
+               index_msb = get_count_order(smp_num_siblings) ;
   
-               if (smp_num_siblings & (smp_num_siblings - 1))
-                       index_msb++;
+               core_bits = get_count_order(c->x86_max_cores);
   
-               cpu_core_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
+               cpu_core_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
+                                              ((1 << core_bits) - 1);
   
-               if (c->x86_num_cores > 1)
+               if (c->x86_max_cores > 1)
                         printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
                                cpu_core_id[cpu]);
         }
diff --combined arch/i386/kernel/cpu/intel.c

index c28d26fb5f241891c02b3fd5fd7f508083315334,8d603ba2812695e16f7b7ed9a0d0d5ceb55a51d2..5e2da704f0faa8200fa744e39d8ffb86dd860542
--- 1/arch/i386/kernel/cpu/intel.c
--- 2/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@@ -6,7 -6,6 +6,7 @@@
   #include <linux/bitops.h>
   #include <linux/smp.h>
   #include <linux/thread_info.h>
+ +#include <linux/module.h>
   
   #include <asm/processor.h>
   #include <asm/msr.h>
@@@ -158,7 -157,7 +158,7 @@@ static void __devinit init_intel(struc
         if ( p )
                 strcpy(c->x86_model_id, p);
         
-       c->x86_num_cores = num_cpu_cores(c);
+       c->x86_max_cores = num_cpu_cores(c);
   
         detect_ht(c);
   
@@@ -265,52 -264,5 +265,52 @@@ __init int intel_cpu_init(void
         return 0;
   }
   
+ +#ifndef CONFIG_X86_CMPXCHG
+ +unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
+ +{
+ +      u8 prev;
+ +      unsigned long flags;
+ +
+ +      /* Poor man's cmpxchg for 386. Unsuitable for SMP */
+ +      local_irq_save(flags);
+ +      prev = *(u8 *)ptr;
+ +      if (prev == old)
+ +              *(u8 *)ptr = new;
+ +      local_irq_restore(flags);
+ +      return prev;
+ +}
+ +EXPORT_SYMBOL(cmpxchg_386_u8);
+ +
+ +unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
+ +{
+ +      u16 prev;
+ +      unsigned long flags;
+ +
+ +      /* Poor man's cmpxchg for 386. Unsuitable for SMP */
+ +      local_irq_save(flags);
+ +      prev = *(u16 *)ptr;
+ +      if (prev == old)
+ +              *(u16 *)ptr = new;
+ +      local_irq_restore(flags);
+ +      return prev;
+ +}
+ +EXPORT_SYMBOL(cmpxchg_386_u16);
+ +
+ +unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
+ +{
+ +      u32 prev;
+ +      unsigned long flags;
+ +
+ +      /* Poor man's cmpxchg for 386. Unsuitable for SMP */
+ +      local_irq_save(flags);
+ +      prev = *(u32 *)ptr;
+ +      if (prev == old)
+ +              *(u32 *)ptr = new;
+ +      local_irq_restore(flags);
+ +      return prev;
+ +}
+ +EXPORT_SYMBOL(cmpxchg_386_u32);
+ +#endif
+ +
   // arch_initcall(intel_cpu_init);
   
diff --combined arch/i386/kernel/smpboot.c

index bc5a9d97466b572cf5e6755fc5e5de0eb828b11f,0a9c6465523694bab79436088ae0d21ba4ed0489..d16520da4550e6d1b5d318a56a2f1ba1b852728c
--- 1/arch/i386/kernel/smpboot.c
--- 2/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@@ -68,13 -68,17 +68,15 @@@ EXPORT_SYMBOL(smp_num_siblings)
   
   /* Package ID of each logical CPU */
   int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
- -EXPORT_SYMBOL(phys_proc_id);
   
   /* Core ID of each logical CPU */
   int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
- -EXPORT_SYMBOL(cpu_core_id);
   
+ /* representing HT siblings of each logical CPU */
   cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
   EXPORT_SYMBOL(cpu_sibling_map);
   
+ /* representing HT and core siblings of each logical CPU */
   cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
   EXPORT_SYMBOL(cpu_core_map);
   
@@@ -442,35 -446,60 +444,60 @@@ static void __devinit smp_callin(void
   
   static int cpucount;
   
+ /* representing cpus for which sibling maps can be computed */
+ static cpumask_t cpu_sibling_setup_map;
+ 
   static inline void
   set_cpu_sibling_map(int cpu)
   {
         int i;
+       struct cpuinfo_x86 *c = cpu_data;
+ 
+       cpu_set(cpu, cpu_sibling_setup_map);
   
         if (smp_num_siblings > 1) {
-               for (i = 0; i < NR_CPUS; i++) {
-                       if (!cpu_isset(i, cpu_callout_map))
-                               continue;
-                       if (cpu_core_id[cpu] == cpu_core_id[i]) {
+               for_each_cpu_mask(i, cpu_sibling_setup_map) {
+                       if (phys_proc_id[cpu] == phys_proc_id[i] &&
+                           cpu_core_id[cpu] == cpu_core_id[i]) {
                                 cpu_set(i, cpu_sibling_map[cpu]);
                                 cpu_set(cpu, cpu_sibling_map[i]);
+                               cpu_set(i, cpu_core_map[cpu]);
+                               cpu_set(cpu, cpu_core_map[i]);
                         }
                 }
         } else {
                 cpu_set(cpu, cpu_sibling_map[cpu]);
         }
   
-       if (current_cpu_data.x86_num_cores > 1) {
-               for (i = 0; i < NR_CPUS; i++) {
-                       if (!cpu_isset(i, cpu_callout_map))
-                               continue;
-                       if (phys_proc_id[cpu] == phys_proc_id[i]) {
-                               cpu_set(i, cpu_core_map[cpu]);
-                               cpu_set(cpu, cpu_core_map[i]);
-                       }
-               }
-       } else {
+       if (current_cpu_data.x86_max_cores == 1) {
                 cpu_core_map[cpu] = cpu_sibling_map[cpu];
+               c[cpu].booted_cores = 1;
+               return;
+       }
+ 
+       for_each_cpu_mask(i, cpu_sibling_setup_map) {
+               if (phys_proc_id[cpu] == phys_proc_id[i]) {
+                       cpu_set(i, cpu_core_map[cpu]);
+                       cpu_set(cpu, cpu_core_map[i]);
+                       /*
+                        *  Does this new cpu bringup a new core?
+                        */
+                       if (cpus_weight(cpu_sibling_map[cpu]) == 1) {
+                               /*
+                                * for each core in package, increment
+                                * the booted_cores for this new cpu
+                                */
+                               if (first_cpu(cpu_sibling_map[i]) == i)
+                                       c[cpu].booted_cores++;
+                               /*
+                                * increment the core count for all
+                                * the other cpus in this package
+                                */
+                               if (i != cpu)
+                                       c[i].booted_cores++;
+                       } else if (i != cpu && !c[cpu].booted_cores)
+                               c[cpu].booted_cores = c[i].booted_cores;
+               }
         }
   }
   
@@@ -485,7 -514,6 +512,7 @@@ static void __devinit start_secondary(v
          * things done here to the most necessary things.
          */
         cpu_init();
+ +      preempt_disable();
         smp_callin();
         while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
                 rep_nop();
@@@ -611,7 -639,7 +638,7 @@@ static inline void __inquire_remote_api
   
         printk("Inquiring remote APIC #%d...\n", apicid);
   
- -      for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) {
+ +      for (i = 0; i < ARRAY_SIZE(regs); i++) {
                 printk("... APIC #%d %s: ", apicid, names[i]);
   
                 /*
@@@ -1095,11 -1123,8 +1122,8 @@@ static void __init smp_boot_cpus(unsign
   
         current_thread_info()->cpu = 0;
         smp_tune_scheduling();
-       cpus_clear(cpu_sibling_map[0]);
-       cpu_set(0, cpu_sibling_map[0]);
   
-       cpus_clear(cpu_core_map[0]);
-       cpu_set(0, cpu_core_map[0]);
+       set_cpu_sibling_map(0);
   
         /*
          * If we couldn't find an SMP configuration at boot time,
@@@ -1278,15 -1303,24 +1302,24 @@@ static voi
   remove_siblinginfo(int cpu)
   {
         int sibling;
+       struct cpuinfo_x86 *c = cpu_data;
   
+       for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
+               cpu_clear(cpu, cpu_core_map[sibling]);
+               /*
+                * last thread sibling in this cpu core going down
+                */
+               if (cpus_weight(cpu_sibling_map[cpu]) == 1)
+                       c[sibling].booted_cores--;
+       }
+                       
         for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
                 cpu_clear(cpu, cpu_sibling_map[sibling]);
-       for_each_cpu_mask(sibling, cpu_core_map[cpu])
-               cpu_clear(cpu, cpu_core_map[sibling]);
         cpus_clear(cpu_sibling_map[cpu]);
         cpus_clear(cpu_core_map[cpu]);
         phys_proc_id[cpu] = BAD_APICID;
         cpu_core_id[cpu] = BAD_APICID;
+       cpu_clear(cpu, cpu_sibling_setup_map);
   }
   
   int __cpu_disable(void)
diff --combined arch/ia64/Kconfig

index 8796e12c56f3f08b504f3e539249c0767e880870,3dba24c318b7cc7b81a3157169deed2bebd84c83..b76ce1fe2e7f31c04858901fac13dd66e7efff29
--- 1/arch/ia64/Kconfig
--- 2/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@@ -58,6 -58,10 +58,10 @@@ config IA64_UNCACHED_ALLOCATO
         bool
         select GENERIC_ALLOCATOR
   
+ config ZONE_DMA_IS_DMA32
+       bool
+       default y
+ 
   choice
         prompt "System type"
         default IA64_GENERIC
@@@ -164,19 -168,6 +168,19 @@@ config IA64_PAGE_SIZE_64K
   
   endchoice
   
+ +choice
+ +      prompt "Page Table Levels"
+ +      default PGTABLE_3
+ +
+ +config PGTABLE_3
+ +      bool "3 Levels"
+ +
+ +config PGTABLE_4
+ +      depends on !IA64_PAGE_SIZE_64KB
+ +      bool "4 Levels"
+ +
+ +endchoice
+ +
   source kernel/Kconfig.hz
   
   config IA64_BRL_EMU
@@@ -204,7 -195,6 +208,7 @@@ config IOSAPI
   
   config IA64_SGI_SN_XP
         tristate "Support communication between SGI SSIs"
+ +      depends on IA64_GENERIC || IA64_SGI_SN2
         select IA64_UNCACHED_ALLOCATOR
         help
           An SGI machine can be divided into multiple Single System
@@@ -440,21 -430,8 +444,21 @@@ config GENERIC_PENDING_IR
   
   source "arch/ia64/hp/sim/Kconfig"
   
+ +menu "Instrumentation Support"
+ +        depends on EXPERIMENTAL
+ +
   source "arch/ia64/oprofile/Kconfig"
   
+ +config KPROBES
+ +      bool "Kprobes (EXPERIMENTAL)"
+ +      help
+ +        Kprobes allows you to trap at almost any kernel address and
+ +        execute a callback function.  register_kprobe() establishes
+ +        a probepoint and specifies the callback.  Kprobes is useful
+ +        for kernel debugging, non-intrusive instrumentation and testing.
+ +        If in doubt, say "N".
+ +endmenu
+ +
   source "arch/ia64/Kconfig.debug"
   
   source "security/Kconfig"
diff --combined arch/x86_64/Kconfig

index 4cce2f6f170c0712dda1a1e6f60cbf545def6b70,1d6242a5cd0aec3c16be94b00af6383d5c76c15b..6ece645e4dbea691191ccdaa8d49ec42025ab25f
--- 1/arch/x86_64/Kconfig
--- 2/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@@ -226,22 -226,42 +226,42 @@@ config SCHED_SM
   
   source "kernel/Kconfig.preempt"
   
- config K8_NUMA
-        bool "K8 NUMA support"
-        select NUMA
+ config NUMA
+        bool "Non Uniform Memory Access (NUMA) Support"
          depends on SMP
          help
-         Enable NUMA (Non Unified Memory Architecture) support for
-         AMD Opteron Multiprocessor systems. The kernel will try to allocate
-         memory used by a CPU on the local memory controller of the CPU
-         and add some more NUMA awareness to the kernel.
-         This code is recommended on all multiprocessor Opteron systems
-         and normally doesn't hurt on others.
+        Enable NUMA (Non Uniform Memory Access) support. The kernel 
+        will try to allocate memory used by a CPU on the local memory 
+        controller of the CPU and add some more NUMA awareness to the kernel.
+        This code is recommended on all multiprocessor Opteron systems.
+        If the system is EM64T, you should say N unless your system is EM64T 
+        NUMA. 
+ 
+ config K8_NUMA
+        bool "Old style AMD Opteron NUMA detection"
+        depends on NUMA
+        default y
+        help
+        Enable K8 NUMA node topology detection.  You should say Y here if
+        you have a multi processor AMD K8 system. This uses an old
+        method to read the NUMA configuration directly from the builtin
+        Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
+        instead, which also takes priority if both are compiled in.   
+ 
+ # Dummy CONFIG option to select ACPI_NUMA from drivers/acpi/Kconfig.
+ 
+ config X86_64_ACPI_NUMA
+        bool "ACPI NUMA detection"
+        depends on NUMA
+        select ACPI 
+        select ACPI_NUMA
+        default y
+        help
+        Enable ACPI SRAT based node topology detection.
   
   config NUMA_EMU
-       bool "NUMA emulation support"
-       select NUMA
-       depends on SMP
+       bool "NUMA emulation"
+       depends on NUMA
         help
           Enable NUMA emulation. A flat machine will be split
           into virtual nodes when booted with "numa=fake=N", where N is the
@@@ -252,9 -272,6 +272,6 @@@ config ARCH_DISCONTIGMEM_ENABL
          depends on NUMA
          default y
   
- config NUMA
-        bool
-        default n
   
   config ARCH_DISCONTIGMEM_ENABLE
         def_bool y
@@@ -374,6 -391,14 +391,14 @@@ config X86_MCE_INTE
            Additional support for intel specific MCE features such as
            the thermal monitor.
   
+ config X86_MCE_AMD
+       bool "AMD MCE features"
+       depends on X86_MCE && X86_LOCAL_APIC
+       default y
+       help
+          Additional support for AMD specific MCE features such as
+          the DRAM Error Threshold.
+ 
   config PHYSICAL_START
         hex "Physical address where the kernel is loaded" if EMBEDDED
         default "0x100000"
@@@ -502,7 -527,7 +527,7 @@@ config IA32_EMULATIO
           left.
   
   config IA32_AOUT
-        bool "IA32 a.out support"
+        tristate "IA32 a.out support"
          depends on IA32_EMULATION
          help
            Support old a.out binaries in the 32bit emulation.
@@@ -532,21 -557,8 +557,21 @@@ source "drivers/firmware/Kconfig
   
   source fs/Kconfig
   
+ +menu "Instrumentation Support"
+ +        depends on EXPERIMENTAL
+ +
   source "arch/x86_64/oprofile/Kconfig"
   
+ +config KPROBES
+ +      bool "Kprobes (EXPERIMENTAL)"
+ +      help
+ +        Kprobes allows you to trap at almost any kernel address and
+ +        execute a callback function.  register_kprobe() establishes
+ +        a probepoint and specifies the callback.  Kprobes is useful
+ +        for kernel debugging, non-intrusive instrumentation and testing.
+ +        If in doubt, say "N".
+ +endmenu
+ +
   source "arch/x86_64/Kconfig.debug"
   
   source "security/Kconfig"
diff --combined arch/x86_64/Kconfig.debug

index d584ecc27ea1902e038329335486a60b8e72f9eb,3ccf6f4d1068523876634bf36e37f2e1f5699624..e2c6e64a85ec27318d0a76d51e421301e121dd3e
--- 1/arch/x86_64/Kconfig.debug
--- 2/arch/x86_64/Kconfig.debug
+++ b/arch/x86_64/Kconfig.debug
@@@ -2,15 -2,6 +2,6 @@@ menu "Kernel hacking
   
   source "lib/Kconfig.debug"
   
- # !SMP for now because the context switch early causes GPF in segment reloading
- # and the GS base checking does the wrong thing then, causing a hang.
- config CHECKING
-       bool "Additional run-time checks"
-       depends on DEBUG_KERNEL && !SMP
-       help
-         Enables some internal consistency checks for kernel debugging.
-         You should normally say N.
- 
   config INIT_DEBUG
         bool "Debug __init statements"
         depends on DEBUG_KERNEL
@@@ -33,6 -24,16 +24,6 @@@ config IOMMU_DEBU
          options. See Documentation/x86_64/boot-options.txt for more
          details.
   
- -config KPROBES
- -      bool "Kprobes"
- -      depends on DEBUG_KERNEL
- -      help
- -        Kprobes allows you to trap at almost any kernel address and
- -        execute a callback function.  register_kprobe() establishes
- -        a probepoint and specifies the callback.  Kprobes is useful
- -        for kernel debugging, non-intrusive instrumentation and testing.
- -        If in doubt, say "N".
- -
   config IOMMU_LEAK
          bool "IOMMU leak tracing"
          depends on DEBUG_KERNEL
diff --combined arch/x86_64/kernel/i8259.c

index a9368d4c4aba1ceba5df433b9fdfb90713f1f543,5de30035e54bedaa5d420c2469e7a74db13718ed..6e5101ad3d1a075290e3acb298b78e0f5e430c8d
--- 1/arch/x86_64/kernel/i8259.c
--- 2/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@@ -492,6 -492,7 +492,7 @@@ void invalidate_interrupt5(void)
   void invalidate_interrupt6(void);
   void invalidate_interrupt7(void);
   void thermal_interrupt(void);
+ void threshold_interrupt(void);
   void i8254_timer_resume(void);
   
   static void setup_timer_hardware(void)
@@@ -515,7 -516,7 +516,7 @@@ void i8254_timer_resume(void
   }
   
   static struct sysdev_class timer_sysclass = {
- -      set_kset_name("timer"),
+ +      set_kset_name("timer_pit"),
         .resume         = timer_resume,
   };
   
@@@ -580,6 -581,7 +581,7 @@@ void __init init_IRQ(void
         set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
   #endif        
         set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+       set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
   
   #ifdef CONFIG_X86_LOCAL_APIC
         /* self generated IPI for local APIC timer */
diff --combined arch/x86_64/kernel/process.c

index 59be85d9a4bc166284ac102aea770d52c77a69cf,bc44c92ca1981102ce239889b14ab9e1f71b31f7..5afd63e8cef7ab9c9a71bb1f35df69a738ce90d5
--- 1/arch/x86_64/kernel/process.c
--- 2/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@@ -86,22 -86,12 +86,22 @@@ EXPORT_SYMBOL(enable_hlt)
    */
   void default_idle(void)
   {
+ +      local_irq_enable();
+ +
         if (!atomic_read(&hlt_counter)) {
- -              local_irq_disable();
- -              if (!need_resched())
- -                      safe_halt();
- -              else
- -                      local_irq_enable();
+ +              clear_thread_flag(TIF_POLLING_NRFLAG);
+ +              smp_mb__after_clear_bit();
+ +              while (!need_resched()) {
+ +                      local_irq_disable();
+ +                      if (!need_resched())
+ +                              safe_halt();
+ +                      else
+ +                              local_irq_enable();
+ +              }
+ +              set_thread_flag(TIF_POLLING_NRFLAG);
+ +      } else {
+ +              while (!need_resched())
+ +                      cpu_relax();
         }
   }
   
@@@ -112,16 -102,30 +112,16 @@@
    */
   static void poll_idle (void)
   {
- -      int oldval;
- -
         local_irq_enable();
   
- -      /*
- -       * Deal with another CPU just having chosen a thread to
- -       * run here:
- -       */
- -      oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
- -
- -      if (!oldval) {
- -              set_thread_flag(TIF_POLLING_NRFLAG); 
- -              asm volatile(
- -                      "2:"
- -                      "testl %0,%1;"
- -                      "rep; nop;"
- -                      "je 2b;"
- -                      : :
- -                      "i" (_TIF_NEED_RESCHED), 
- -                      "m" (current_thread_info()->flags));
- -              clear_thread_flag(TIF_POLLING_NRFLAG);
- -      } else {
- -              set_need_resched();
- -      }
+ +      asm volatile(
+ +              "2:"
+ +              "testl %0,%1;"
+ +              "rep; nop;"
+ +              "je 2b;"
+ +              : :
+ +              "i" (_TIF_NEED_RESCHED),
+ +              "m" (current_thread_info()->flags));
   }
   
   void cpu_idle_wait(void)
@@@ -144,7 -148,8 +144,8 @@@
         do {
                 ssleep(1);
                 for_each_online_cpu(cpu) {
-                       if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
+                       if (cpu_isset(cpu, map) &&
+                                       !per_cpu(cpu_idle_state, cpu))
                                 cpu_clear(cpu, map);
                 }
                 cpus_and(map, map, cpu_online_map);
@@@ -183,8 -188,6 +184,8 @@@ static inline void play_dead(void
    */
   void cpu_idle (void)
   {
+ +      set_thread_flag(TIF_POLLING_NRFLAG);
+ +
         /* endless idle loop with no priority at all */
         while (1) {
                 while (!need_resched()) {
@@@ -202,9 -205,7 +203,9 @@@
                         idle();
                 }
   
+ +              preempt_enable_no_resched();
                 schedule();
+ +              preempt_disable();
         }
   }
   
@@@ -219,12 -220,15 +220,12 @@@ static void mwait_idle(void
   {
         local_irq_enable();
   
- -      if (!need_resched()) {
- -              set_thread_flag(TIF_POLLING_NRFLAG);
- -              do {
- -                      __monitor((void *)&current_thread_info()->flags, 0, 0);
- -                      if (need_resched())
- -                              break;
- -                      __mwait(0, 0);
- -              } while (!need_resched());
- -              clear_thread_flag(TIF_POLLING_NRFLAG);
+ +      while (!need_resched()) {
+ +              __monitor((void *)&current_thread_info()->flags, 0, 0);
+ +              smp_mb();
+ +              if (need_resched())
+ +                      break;
+ +              __mwait(0, 0);
         }
   }
   
@@@ -275,7 -279,8 +276,8 @@@ void __show_regs(struct pt_regs * regs
                 system_utsname.version);
         printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
         printk_address(regs->rip); 
-       printk("\nRSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags);
+       printk("\nRSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp,
+               regs->eflags);
         printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
                regs->rax, regs->rbx, regs->rcx);
         printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
@@@ -427,15 -432,14 +429,14 @@@ int copy_thread(int nr, unsigned long c
         struct pt_regs * childregs;
         struct task_struct *me = current;
   
-       childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
- 
+       childregs = ((struct pt_regs *)
+                       (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
         *childregs = *regs;
   
         childregs->rax = 0;
         childregs->rsp = rsp;
-       if (rsp == ~0UL) {
+       if (rsp == ~0UL)
                 childregs->rsp = (unsigned long)childregs;
-       }
   
         p->thread.rsp = (unsigned long) childregs;
         p->thread.rsp0 = (unsigned long) (childregs+1);
@@@ -457,7 -461,8 +458,8 @@@
                         p->thread.io_bitmap_max = 0;
                         return -ENOMEM;
                 }
-               memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES);
+               memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
+                               IO_BITMAP_BYTES);
         } 
   
         /*
@@@ -494,7 -499,8 +496,8 @@@ out
    * - fold all the options into a flag word and test it with a single test.
    * - could test fs/gs bitsliced
    */
- struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+ struct task_struct *
+ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
   {
         struct thread_struct *prev = &prev_p->thread,
                                  *next = &next_p->thread;
@@@ -565,7 -571,8 +568,8 @@@
         prev->userrsp = read_pda(oldrsp); 
         write_pda(oldrsp, next->userrsp); 
         write_pda(pcurrent, next_p); 
-       write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);
+       write_pda(kernelstack,
+           (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);
   
         /*
          * Now maybe reload the debug registers
@@@ -646,7 -653,9 +650,9 @@@ asmlinkage long sys_fork(struct pt_reg
         return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
   }
   
- asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
+ asmlinkage long
+ sys_clone(unsigned long clone_flags, unsigned long newsp,
+         void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
   {
         if (!newsp)
                 newsp = regs->rsp;
@@@ -682,7 -691,8 +688,8 @@@ unsigned long get_wchan(struct task_str
                 return 0;
         fp = *(u64 *)(p->thread.rsp);
         do { 
-               if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE)
+               if (fp < (unsigned long)stack ||
+                   fp > (unsigned long)stack+THREAD_SIZE)
                         return 0; 
                 rip = *(u64 *)(fp+8); 
                 if (!in_sched_functions(rip))
@@@ -717,8 -727,8 +724,8 @@@ long do_arch_prctl(struct task_struct *
                         task->thread.gsindex = 0;
                         task->thread.gs = addr;
                         if (doit) {
-               load_gs_index(0);
-               ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); 
+                               load_gs_index(0);
+                               ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
                         } 
                 }
                 put_cpu();
@@@ -735,7 -745,7 +742,7 @@@
                         set_32bit_tls(task, FS_TLS, addr);
                         if (doit) { 
                                 load_TLS(&task->thread, cpu); 
-                               asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL));
+                               asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
                         }
                         task->thread.fsindex = FS_TLS_SEL;
                         task->thread.fs = 0;
@@@ -745,8 -755,8 +752,8 @@@
                         if (doit) {
                                 /* set the selector to 0 to not confuse
                                    __switch_to */
-               asm volatile("movl %0,%%fs" :: "r" (0));
-               ret = checking_wrmsrl(MSR_FS_BASE, addr); 
+                               asm volatile("movl %0,%%fs" :: "r" (0));
+                               ret = checking_wrmsrl(MSR_FS_BASE, addr);
                         }
                 }
                 put_cpu();
@@@ -755,9 -765,9 +762,9 @@@
                 unsigned long base; 
                 if (task->thread.fsindex == FS_TLS_SEL)
                         base = read_32bit_tls(task, FS_TLS);
-               else if (doit) {
+               else if (doit)
                         rdmsrl(MSR_FS_BASE, base);
-               } else
+               else
                         base = task->thread.fs;
                 ret = put_user(base, (unsigned long __user *)addr); 
                 break; 
@@@ -766,9 -776,9 +773,9 @@@
                 unsigned long base;
                 if (task->thread.gsindex == GS_TLS_SEL)
                         base = read_32bit_tls(task, GS_TLS);
-               else if (doit) {
+               else if (doit)
                         rdmsrl(MSR_KERNEL_GS_BASE, base);
-               } else
+               else
                         base = task->thread.gs;
                 ret = put_user(base, (unsigned long __user *)addr); 
                 break;
diff --combined arch/x86_64/kernel/smpboot.c

index c4e59bbdc1872b4ce599f452b125183603cf8f8e,2b9ddba61b37a9e675e830b2172854ea565303b3..683c33f7b96753219a2c1043246dcdfac12b999e
--- 1/arch/x86_64/kernel/smpboot.c
--- 2/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@@ -64,7 -64,10 +64,8 @@@
   int smp_num_siblings = 1;
   /* Package ID of each logical CPU */
   u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
+ /* core ID of each logical CPU */
   u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
- -EXPORT_SYMBOL(phys_proc_id);
- -EXPORT_SYMBOL(cpu_core_id);
   
   /* Bitmask of currently online CPUs */
   cpumask_t cpu_online_map __read_mostly;
@@@ -87,7 -90,10 +88,10 @@@ struct cpuinfo_x86 cpu_data[NR_CPUS] __
   /* Set when the idlers are all forked */
   int smp_threads_ready;
   
+ /* representing HT siblings of each logical CPU */
   cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
+ 
+ /* representing HT and core siblings of each logical CPU */
   cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
   EXPORT_SYMBOL(cpu_core_map);
   
@@@ -434,30 -440,59 +438,59 @@@ void __cpuinit smp_callin(void
         cpu_set(cpuid, cpu_callin_map);
   }
   
+ /* representing cpus for which sibling maps can be computed */
+ static cpumask_t cpu_sibling_setup_map;
+ 
   static inline void set_cpu_sibling_map(int cpu)
   {
         int i;
+       struct cpuinfo_x86 *c = cpu_data;
+ 
+       cpu_set(cpu, cpu_sibling_setup_map);
   
         if (smp_num_siblings > 1) {
-               for_each_cpu(i) {
-                       if (cpu_core_id[cpu] == cpu_core_id[i]) {
+               for_each_cpu_mask(i, cpu_sibling_setup_map) {
+                       if (phys_proc_id[cpu] == phys_proc_id[i] &&
+                           cpu_core_id[cpu] == cpu_core_id[i]) {
                                 cpu_set(i, cpu_sibling_map[cpu]);
                                 cpu_set(cpu, cpu_sibling_map[i]);
+                               cpu_set(i, cpu_core_map[cpu]);
+                               cpu_set(cpu, cpu_core_map[i]);
                         }
                 }
         } else {
                 cpu_set(cpu, cpu_sibling_map[cpu]);
         }
   
-       if (current_cpu_data.x86_num_cores > 1) {
-               for_each_cpu(i) {
-                       if (phys_proc_id[cpu] == phys_proc_id[i]) {
-                               cpu_set(i, cpu_core_map[cpu]);
-                               cpu_set(cpu, cpu_core_map[i]);
-                       }
-               }
-       } else {
+       if (current_cpu_data.x86_max_cores == 1) {
                 cpu_core_map[cpu] = cpu_sibling_map[cpu];
+               c[cpu].booted_cores = 1;
+               return;
+       }
+ 
+       for_each_cpu_mask(i, cpu_sibling_setup_map) {
+               if (phys_proc_id[cpu] == phys_proc_id[i]) {
+                       cpu_set(i, cpu_core_map[cpu]);
+                       cpu_set(cpu, cpu_core_map[i]);
+                       /*
+                        *  Does this new cpu bringup a new core?
+                        */
+                       if (cpus_weight(cpu_sibling_map[cpu]) == 1) {
+                               /*
+                                * for each core in package, increment
+                                * the booted_cores for this new cpu
+                                */
+                               if (first_cpu(cpu_sibling_map[i]) == i)
+                                       c[cpu].booted_cores++;
+                               /*
+                                * increment the core count for all
+                                * the other cpus in this package
+                                */
+                               if (i != cpu)
+                                       c[i].booted_cores++;
+                       } else if (i != cpu && !c[cpu].booted_cores)
+                               c[cpu].booted_cores = c[i].booted_cores;
+               }
         }
   }
   
@@@ -472,7 -507,6 +505,7 @@@ void __cpuinit start_secondary(void
          * things done here to the most necessary things.
          */
         cpu_init();
+ +      preempt_disable();
         smp_callin();
   
         /* otherwise gcc will move up the smp_processor_id before the cpu_init */
@@@ -879,6 -913,9 +912,9 @@@ static __init void disable_smp(void
   }
   
   #ifdef CONFIG_HOTPLUG_CPU
+ 
+ int additional_cpus __initdata = -1;
+ 
   /*
    * cpu_possible_map should be static, it cannot change as cpu's
    * are onlined, or offlined. The reason is per-cpu data-structures
@@@ -887,14 -924,38 +923,38 @@@
    * cpu_present_map on the other hand can change dynamically.
    * In case when cpu_hotplug is not compiled, then we resort to current
    * behaviour, which is cpu_possible == cpu_present.
-  * If cpu-hotplug is supported, then we need to preallocate for all
-  * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range.
    * - Ashok Raj
+  *
+  * Three ways to find out the number of additional hotplug CPUs:
+  * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
+  * - otherwise use half of the available CPUs, or 2 if that half is zero.
+  * - The user can override it with additional_cpus=NUM
+  * We do this because additional CPUs waste a lot of memory.
+  * -AK
    */
   __init void prefill_possible_map(void)
   {
         int i;
-       for (i = 0; i < NR_CPUS; i++)
+       int possible;
+ 
+       if (additional_cpus == -1) {
+               if (disabled_cpus > 0) {
+                       additional_cpus = disabled_cpus;
+               } else {
+                       additional_cpus = num_processors / 2;
+                       if (additional_cpus == 0)
+                               additional_cpus = 2;
+               }
+       }
+       possible = num_processors + additional_cpus;
+       if (possible > NR_CPUS) 
+               possible = NR_CPUS;
+ 
+       printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
+               possible,
+               max_t(int, possible - num_processors, 0));
+ 
+       for (i = 0; i < possible; i++)
                 cpu_set(i, cpu_possible_map);
   }
   #endif
@@@ -965,6 -1026,7 +1025,7 @@@ void __init smp_prepare_cpus(unsigned i
         nmi_watchdog_default();
         current_cpu_data = boot_cpu_data;
         current_thread_info()->cpu = 0;  /* needed? */
+       set_cpu_sibling_map(0);
   
         if (smp_sanity_check(max_cpus) < 0) {
                 printk(KERN_INFO "SMP disabled\n");
@@@ -1008,8 -1070,6 +1069,6 @@@ void __init smp_prepare_boot_cpu(void
         int me = smp_processor_id();
         cpu_set(me, cpu_online_map);
         cpu_set(me, cpu_callout_map);
-       cpu_set(0, cpu_sibling_map[0]);
-       cpu_set(0, cpu_core_map[0]);
         per_cpu(cpu_state, me) = CPU_ONLINE;
   }
   
@@@ -1062,9 -1122,6 +1121,6 @@@ int __cpuinit __cpu_up(unsigned int cpu
    */
   void __init smp_cpus_done(unsigned int max_cpus)
   {
- #ifndef CONFIG_HOTPLUG_CPU
-       zap_low_mappings();
- #endif
         smp_cleanup_boot();
   
   #ifdef CONFIG_X86_IO_APIC
@@@ -1081,15 -1138,24 +1137,24 @@@
   static void remove_siblinginfo(int cpu)
   {
         int sibling;
+       struct cpuinfo_x86 *c = cpu_data;
   
+       for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
+               cpu_clear(cpu, cpu_core_map[sibling]);
+               /*
+                * last thread sibling in this cpu core going down
+                */
+               if (cpus_weight(cpu_sibling_map[cpu]) == 1)
+                       c[sibling].booted_cores--;
+       }
+                       
         for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
                 cpu_clear(cpu, cpu_sibling_map[sibling]);
-       for_each_cpu_mask(sibling, cpu_core_map[cpu])
-               cpu_clear(cpu, cpu_core_map[sibling]);
         cpus_clear(cpu_sibling_map[cpu]);
         cpus_clear(cpu_core_map[cpu]);
         phys_proc_id[cpu] = BAD_APICID;
         cpu_core_id[cpu] = BAD_APICID;
+       cpu_clear(cpu, cpu_sibling_setup_map);
   }
   
   void remove_cpu_from_maps(void)
@@@ -1153,6 -1219,12 +1218,12 @@@ void __cpu_die(unsigned int cpu
         printk(KERN_ERR "CPU %u didn't die...\n", cpu);
   }
   
+ static __init int setup_additional_cpus(char *s)
+ {
+       return get_option(&s, &additional_cpus);
+ }
+ __setup("additional_cpus=", setup_additional_cpus);
+ 
   #else /* ... !CONFIG_HOTPLUG_CPU */
   
   int __cpu_disable(void)
diff --combined drivers/char/agp/amd64-agp.c

index 78ce98a69f37044dfe17c98d6f264f3989535a23,49996c692a734cdb8784cc247871d8c73cb99bb7..76589782adcbf2f2f2372b919c6b49490bc4d593
--- 1/drivers/char/agp/amd64-agp.c
--- 2/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@@ -13,7 -13,6 +13,7 @@@
   #include <linux/pci.h>
   #include <linux/init.h>
   #include <linux/agp_backend.h>
+ +#include <linux/mmzone.h>
   #include <asm/page.h>         /* PAGE_SIZE */
   #include "agp.h"
   
@@@ -57,9 -56,8 +57,8 @@@ static int nr_garts
   static struct pci_dev * hammers[MAX_HAMMER_GARTS];
   
   static struct resource *aperture_resource;
- static int __initdata agp_try_unsupported;
+ static int __initdata agp_try_unsupported = 1;
   
- static int gart_iterator;
   #define for_each_nb() for(gart_iterator=0;gart_iterator<nr_garts;gart_iterator++)
   
   static void flush_amd64_tlb(struct pci_dev *dev)
@@@ -73,6 -71,7 +72,7 @@@
   
   static void amd64_tlbflush(struct agp_memory *temp)
   {
+       int gart_iterator;
         for_each_nb()
                 flush_amd64_tlb(hammers[gart_iterator]);
   }
@@@ -222,6 -221,7 +222,7 @@@ static struct aper_size_info_32 amd_815
   static int amd_8151_configure(void)
   {
         unsigned long gatt_bus = virt_to_gart(agp_bridge->gatt_table_real);
+       int gart_iterator;
   
         /* Configure AGP regs in each x86-64 host bridge. */
         for_each_nb() {
@@@ -235,7 -235,7 +236,7 @@@
   static void amd64_cleanup(void)
   {
         u32 tmp;
- 
+       int gart_iterator;
         for_each_nb() {
                 /* disable gart translation */
                 pci_read_config_dword (hammers[gart_iterator], AMD64_GARTAPERTURECTL, &tmp);
@@@ -697,6 -697,16 +698,16 @@@ static struct pci_device_id agp_amd64_p
         .subvendor      = PCI_ANY_ID,
         .subdevice      = PCI_ANY_ID,
         },
+       /* ALI/ULI M1695 */
+       {
+       .class          = (PCI_CLASS_BRIDGE_HOST << 8),
+       .class_mask     = ~0,
+       .vendor         = PCI_VENDOR_ID_AL,
+       .device         = 0x1689,
+       .subvendor      = PCI_ANY_ID,
+       .subdevice      = PCI_ANY_ID,
+       },
+ 
         { }
   };
   
diff --combined include/asm-i386/processor.h

index 8c02b0318703b4c9cf8d68e665949215f97455d6,9cd4a05234a16a2037df6575684b43d84186aa90..5c96cf6dcb3916f5735cdf6005b2ac7b3755cd4d
--- 1/include/asm-i386/processor.h
--- 2/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@@ -65,7 -65,9 +65,9 @@@ struct cpuinfo_x86 
         int     f00f_bug;
         int     coma_bug;
         unsigned long loops_per_jiffy;
-       unsigned char x86_num_cores;
+       unsigned char x86_max_cores;    /* cpuid returned max cores value */
+       unsigned char booted_cores;     /* number of cores as seen by OS */
+       unsigned char apicid;
   } __attribute__((__aligned__(SMP_CACHE_BYTES)));
   
   #define X86_VENDOR_INTEL 0
@@@ -718,10 -720,4 +720,10 @@@ extern void mtrr_bp_init(void)
   #define mtrr_bp_init() do {} while (0)
   #endif
   
+ +#ifdef CONFIG_X86_MCE
+ +extern void mcheck_init(struct cpuinfo_x86 *c);
+ +#else
+ +#define mcheck_init(c) do {} while(0)
+ +#endif
+ +
   #endif /* __ASM_I386_PROCESSOR_H */
diff --combined include/asm-x86_64/desc.h

index b837820c90733c51534d7eb100a160f19e251fe5,1a3d380f9d5df87335a16abd6923d26f6f6792f0..33764869387bc1c5ad9a707df503e5eef0c8bedf
--- 1/include/asm-x86_64/desc.h
--- 2/include/asm-x86_64/desc.h
+++ b/include/asm-x86_64/desc.h
@@@ -98,16 -98,19 +98,19 @@@ static inline void _set_gate(void *adr
   
   static inline void set_intr_gate(int nr, void *func) 
   { 
+       BUG_ON((unsigned)nr > 0xFF);
         _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0); 
   } 
   
   static inline void set_intr_gate_ist(int nr, void *func, unsigned ist) 
   { 
+       BUG_ON((unsigned)nr > 0xFF);
         _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist); 
   } 
   
   static inline void set_system_gate(int nr, void *func) 
   { 
+       BUG_ON((unsigned)nr > 0xFF);
         _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0); 
   } 
   
@@@ -129,16 -132,9 +132,16 @@@ static inline void set_tssldt_descripto
   
   static inline void set_tss_desc(unsigned cpu, void *addr)
   { 
- -      set_tssldt_descriptor(&cpu_gdt_table[cpu][GDT_ENTRY_TSS], (unsigned long)addr, 
- -                            DESC_TSS,
- -                            sizeof(struct tss_struct) - 1);
+ +      /*
+ +       * sizeof(unsigned long) coming from an extra "long" at the end
+ +       * of the iobitmap. See tss_struct definition in processor.h
+ +       *
+ +       * -1? seg base+limit should be pointing to the address of the
+ +       * last valid byte
+ +       */
+ +      set_tssldt_descriptor(&cpu_gdt_table[cpu][GDT_ENTRY_TSS],
+ +              (unsigned long)addr, DESC_TSS,
+ +              IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
   } 
   
   static inline void set_ldt_desc(unsigned cpu, void *addr, int size)
diff --combined include/asm-x86_64/pgtable.h

index 7309fffeec9a04fe0b45abd0b7a39060a9abe19d,f8e87a57f3a7084879f516424b66f6230a2f9c5a..ecf58c7c16500908f40d93f76e5f00959d3ac9ca
--- 1/include/asm-x86_64/pgtable.h
--- 2/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@@ -16,6 -16,7 +16,7 @@@ extern pud_t level3_physmem_pgt[512]
   extern pud_t level3_ident_pgt[512];
   extern pmd_t level2_kernel_pgt[512];
   extern pgd_t init_level4_pgt[];
+ extern pgd_t boot_level4_pgt[];
   extern unsigned long __supported_pte_mask;
   
   #define swapper_pg_dir init_level4_pgt
@@@ -105,8 -106,6 +106,8 @@@ static inline void pgd_clear (pgd_t * p
   
   #define ptep_get_and_clear(mm,addr,xp)        __pte(xchg(&(xp)->pte, 0))
   
+ +struct mm_struct;
+ +
   static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
   {
         pte_t pte;
@@@ -247,7 -246,7 +248,7 @@@ static inline unsigned long pud_bad(pud
   #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))       /* FIXME: is this
                                                    right? */
   #define pte_page(x)   pfn_to_page(pte_pfn(x))
- #define pte_pfn(x)  ((pte_val(x) >> PAGE_SHIFT) & __PHYSICAL_MASK)
+ #define pte_pfn(x)  ((pte_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
   
   static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
   {
@@@ -354,7 -353,7 +355,7 @@@ static inline pud_t *__pud_offset_k(pud
   #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
   #define       pmd_bad(x)      ((pmd_val(x) & (~PTE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE )
   #define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
- #define pmd_pfn(x)  ((pmd_val(x) >> PAGE_SHIFT) & __PHYSICAL_MASK)
+ #define pmd_pfn(x)  ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
   
   #define pte_to_pgoff(pte) ((pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
   #define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
diff --combined include/asm-x86_64/smp.h

index b9fb2173ef99ee5368860a298a2d9a5c06162c5c,cf8f969f90209941ff56f5d9e504b9d1895fa07b..d030409a8fb5c61a2663249894834384d106f7f1
--- 1/include/asm-x86_64/smp.h
--- 2/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@@ -47,7 -47,6 +47,6 @@@ extern void lock_ipi_call_lock(void)
   extern void unlock_ipi_call_lock(void);
   extern int smp_num_siblings;
   extern void smp_send_reschedule(int cpu);
- extern void zap_low_mappings(void);
   void smp_stop_cpu(void);
   extern int smp_call_function_single(int cpuid, void (*func) (void *info),
                                 void *info, int retry, int wait);
@@@ -82,6 -81,8 +81,8 @@@ extern int safe_smp_processor_id(void)
   extern int __cpu_disable(void);
   extern void __cpu_die(unsigned int cpu);
   extern void prefill_possible_map(void);
+ extern unsigned num_processors;
+ extern unsigned disabled_cpus;
   
   #endif /* !ASSEMBLY */
   
@@@ -135,11 -136,5 +136,11 @@@ static __inline int logical_smp_process
   }
   #endif
   
+ +#ifdef CONFIG_SMP
+ +#define cpu_physical_id(cpu)          x86_cpu_to_apicid[cpu]
+ +#else
+ +#define cpu_physical_id(cpu)          boot_cpu_id
+ +#endif
+ +
   #endif
   
diff --combined include/linux/gfp.h

index 23279d8f19b1a05cd5a09b9403bea9a17ed8baca,4351e6bb5a799033fb0218ba507bddfa774d64c1..313dfe9b443abb0d98b890aae74158e580db3ba8
--- 1/include/linux/gfp.h
--- 2/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@@ -14,6 -14,13 +14,13 @@@ struct vm_area_struct
   /* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low two bits) */
   #define __GFP_DMA     ((__force gfp_t)0x01u)
   #define __GFP_HIGHMEM ((__force gfp_t)0x02u)
+ #ifdef CONFIG_DMA_IS_DMA32
+ #define __GFP_DMA32   ((__force gfp_t)0x01)   /* ZONE_DMA is ZONE_DMA32 */
+ #elif BITS_PER_LONG < 64
+ #define __GFP_DMA32   ((__force gfp_t)0x00)   /* ZONE_NORMAL is ZONE_DMA32 */
+ #else
+ #define __GFP_DMA32   ((__force gfp_t)0x04)   /* Has own ZONE_DMA32 */
+ #endif
   
   /*
    * Action modifiers - doesn't change the zoning
@@@ -39,7 -46,8 +46,7 @@@
   #define __GFP_COMP    ((__force gfp_t)0x4000u)/* Add compound page metadata */
   #define __GFP_ZERO    ((__force gfp_t)0x8000u)/* Return zeroed page on success */
   #define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */
- -#define __GFP_NORECLAIM  ((__force gfp_t)0x20000u) /* No realy zone reclaim during allocation */
- -#define __GFP_HARDWALL   ((__force gfp_t)0x40000u) /* Enforce hardwall cpuset memory allocs */
+ +#define __GFP_HARDWALL   ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */
   
   #define __GFP_BITS_SHIFT 20   /* Room for 20 __GFP_FOO bits */
   #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
@@@ -48,7 -56,7 +55,7 @@@
   #define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \
                         __GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \
                         __GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \
- -                      __GFP_NOMEMALLOC|__GFP_NORECLAIM|__GFP_HARDWALL)
+ +                      __GFP_NOMEMALLOC|__GFP_HARDWALL)
   
   #define GFP_ATOMIC    (__GFP_HIGH)
   #define GFP_NOIO      (__GFP_WAIT)
@@@ -63,6 -71,10 +70,10 @@@
   
   #define GFP_DMA               __GFP_DMA
   
+ /* 4GB DMA on some platforms */
+ #define GFP_DMA32     __GFP_DMA32
+ 
+ 
   #define gfp_zone(mask) ((__force int)((mask) & (__force gfp_t)GFP_ZONEMASK))
   
   /*
diff --combined include/linux/mm.h

index 7b115feca4df23064dfd3ee80a66a9e638b0dcbd,23fad4dae23cd5973844573e0aaa0aa40ef4413d..1013a42d10b15c1e82d9a7cc5d30658e55284b19
--- 1/include/linux/mm.h
--- 2/include/linux/mm.h
+++ b/include/linux/mm.h
@@@ -206,12 -206,6 +206,6 @@@ struct vm_operations_struct 
   struct mmu_gather;
   struct inode;
   
- #ifdef ARCH_HAS_ATOMIC_UNSIGNED
- typedef unsigned page_flags_t;
- #else
- typedef unsigned long page_flags_t;
- #endif
- 
   /*
    * Each physical page in the system has a struct page associated with
    * it to keep track of whatever it is we are using the page for at the
@@@ -219,7 -213,7 +213,7 @@@
    * a page.
    */
   struct page {
-       page_flags_t flags;             /* Atomic flags, some possibly
+       unsigned long flags;            /* Atomic flags, some possibly
                                          * updated asynchronously */
         atomic_t _count;                /* Usage count, see below. */
         atomic_t _mapcount;             /* Count of ptes mapped in mms,
@@@ -435,7 -429,7 +429,7 @@@ static inline void put_page(struct pag
   #endif
   
   /* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */
- #define SECTIONS_PGOFF                ((sizeof(page_flags_t)*8) - SECTIONS_WIDTH)
+ #define SECTIONS_PGOFF                ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
   #define NODES_PGOFF           (SECTIONS_PGOFF - NODES_WIDTH)
   #define ZONES_PGOFF           (NODES_PGOFF - ZONES_WIDTH)
   
@@@ -932,13 -926,13 +926,13 @@@ int write_one_page(struct page *page, i
                                          * turning readahead off */
   
   int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
- -                      unsigned long offset, unsigned long nr_to_read);
+ +                      pgoff_t offset, unsigned long nr_to_read);
   int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
- -                      unsigned long offset, unsigned long nr_to_read);
- -unsigned long  page_cache_readahead(struct address_space *mapping,
+ +                      pgoff_t offset, unsigned long nr_to_read);
+ +unsigned long page_cache_readahead(struct address_space *mapping,
                           struct file_ra_state *ra,
                           struct file *filp,
- -                        unsigned long offset,
+ +                        pgoff_t offset,
                           unsigned long size);
   void handle_ra_miss(struct address_space *mapping, 
                     struct file_ra_state *ra, pgoff_t offset);
diff --combined include/linux/mmzone.h

index 6cfb114a0c34444756bda25bdc362dd673585dd2,f3cffc354deac72ae6e9d3a4aa5565696a93c5d7..2c8edad5dccf3796337dcb989f55815cd53faf51
--- 1/include/linux/mmzone.h
--- 2/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@@ -71,10 -71,11 +71,11 @@@ struct per_cpu_pageset 
   #endif
   
   #define ZONE_DMA              0
- #define ZONE_NORMAL           1
- #define ZONE_HIGHMEM          2
+ #define ZONE_DMA32            1
+ #define ZONE_NORMAL           2
+ #define ZONE_HIGHMEM          3
   
- #define MAX_NR_ZONES          3       /* Sync this with ZONES_SHIFT */
+ #define MAX_NR_ZONES          4       /* Sync this with ZONES_SHIFT */
   #define ZONES_SHIFT           2       /* ceil(log2(MAX_NR_ZONES)) */
   
   
@@@ -108,9 -109,10 +109,10 @@@
   
   /*
    * On machines where it is needed (eg PCs) we divide physical memory
-  * into multiple physical zones. On a PC we have 3 zones:
+  * into multiple physical zones. On a PC we have 4 zones:
    *
    * ZONE_DMA     < 16 MB       ISA DMA capable memory
+  * ZONE_DMA32      0 MB       Empty
    * ZONE_NORMAL        16-896 MB       direct mapped by the kernel
    * ZONE_HIGHMEM        > 896 MB       only page cache and user processes
    */
@@@ -329,7 -331,7 +331,7 @@@ void get_zone_counts(unsigned long *act
   void build_all_zonelists(void);
   void wakeup_kswapd(struct zone *zone, int order);
   int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- -              int alloc_type, int can_try_harder, gfp_t gfp_high);
+ +              int classzone_idx, int alloc_flags);
   
   #ifdef CONFIG_HAVE_MEMORY_PRESENT
   void memory_present(int nid, unsigned long start, unsigned long end);
@@@ -433,7 -435,9 +435,9 @@@ int lowmem_reserve_ratio_sysctl_handler
   
   #include <linux/topology.h>
   /* Returns the number of the current Node. */
+ #ifndef numa_node_id
   #define numa_node_id()                (cpu_to_node(raw_smp_processor_id()))
+ #endif
   
   #ifndef CONFIG_NEED_MULTIPLE_NODES
   
@@@ -453,12 -457,12 +457,12 @@@ extern struct pglist_data contig_page_d
   #include <asm/sparsemem.h>
   #endif
   
- #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED)
+ #if BITS_PER_LONG == 32
   /*
-  * with 32 bit page->flags field, we reserve 8 bits for node/zone info.
-  * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes.
+  * with 32 bit page->flags field, we reserve 9 bits for node/zone info.
+  * there are 4 zones (3 bits) and this leaves 9-3=6 bits for nodes.
    */
- #define FLAGS_RESERVED                8
+ #define FLAGS_RESERVED                9
   
   #elif BITS_PER_LONG == 64
   /*
diff --combined mm/page_alloc.c

index 3c5cf664abd2eca14d2613a57023dcfcac6b6fde,259a71bacca40f0995f31d92ba73ba9f1cdb20dc..104e69ca55e0117c5ed596e9c96a01a5336a3fd8
--- 1/mm/page_alloc.c
--- 2/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@@ -60,10 -60,14 +60,13 @@@ long nr_swap_pages
    *    NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
    *    HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
    *    HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
+  *
+  * TBD: should special case ZONE_DMA32 machines here - in those we normally
+  * don't need any ZONE_NORMAL reservation
    */
- int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
+ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
   
   EXPORT_SYMBOL(totalram_pages);
- -EXPORT_SYMBOL(nr_swap_pages);
   
   /*
    * Used by page_zone() to look up the address of the struct zone whose
@@@ -72,7 -76,7 +75,7 @@@
   struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
   EXPORT_SYMBOL(zone_table);
   
- static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
+ static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
   int min_free_kbytes = 1024;
   
   unsigned long __initdata nr_kernel_pages;
@@@ -124,7 -128,7 +127,7 @@@ static void bad_page(const char *functi
         printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
                 function, current->comm, page);
         printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
-               (int)(2*sizeof(page_flags_t)), (unsigned long)page->flags,
+               (int)(2*sizeof(unsigned long)), (unsigned long)page->flags,
                 page->mapping, page_mapcount(page), page_count(page));
         printk(KERN_EMERG "Backtrace:\n");
         dump_stack();
@@@ -732,7 -736,9 +735,7 @@@ buffered_rmqueue(struct zone *zone, in
                 }
                 local_irq_restore(flags);
                 put_cpu();
- -      }
- -
- -      if (page == NULL) {
+ +      } else {
                 spin_lock_irqsave(&zone->lock, flags);
                 page = __rmqueue(zone, order);
                 spin_unlock_irqrestore(&zone->lock, flags);
@@@ -752,25 -758,20 +755,25 @@@
         return page;
   }
   
+ +#define ALLOC_NO_WATERMARKS   0x01 /* don't check watermarks at all */
+ +#define ALLOC_HARDER          0x02 /* try to alloc harder */
+ +#define ALLOC_HIGH            0x04 /* __GFP_HIGH set */
+ +#define ALLOC_CPUSET          0x08 /* check for correct cpuset */
+ +
   /*
    * Return 1 if free pages are above 'mark'. This takes into account the order
    * of the allocation.
    */
   int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- -                    int classzone_idx, int can_try_harder, gfp_t gfp_high)
+ +                    int classzone_idx, int alloc_flags)
   {
         /* free_pages may go negative - that's OK */
         long min = mark, free_pages = z->free_pages - (1 << order) + 1;
         int o;
   
- -      if (gfp_high)
+ +      if (alloc_flags & ALLOC_HIGH)
                 min -= min / 2;
- -      if (can_try_harder)
+ +      if (alloc_flags & ALLOC_HARDER)
                 min -= min / 4;
   
         if (free_pages <= min + z->lowmem_reserve[classzone_idx])
@@@ -788,40 -789,14 +791,40 @@@
         return 1;
   }
   
- -static inline int
- -should_reclaim_zone(struct zone *z, gfp_t gfp_mask)
+ +/*
+ + * get_page_from_freelist goes through the zonelist trying to allocate
+ + * a page.
+ + */
+ +static struct page *
+ +get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
+ +              struct zonelist *zonelist, int alloc_flags)
   {
- -      if (!z->reclaim_pages)
- -              return 0;
- -      if (gfp_mask & __GFP_NORECLAIM)
- -              return 0;
- -      return 1;
+ +      struct zone **z = zonelist->zones;
+ +      struct page *page = NULL;
+ +      int classzone_idx = zone_idx(*z);
+ +
+ +      /*
+ +       * Go through the zonelist once, looking for a zone with enough free.
+ +       * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+ +       */
+ +      do {
+ +              if ((alloc_flags & ALLOC_CPUSET) &&
+ +                              !cpuset_zone_allowed(*z, gfp_mask))
+ +                      continue;
+ +
+ +              if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
+ +                      if (!zone_watermark_ok(*z, order, (*z)->pages_low,
+ +                                  classzone_idx, alloc_flags))
+ +                              continue;
+ +              }
+ +
+ +              page = buffered_rmqueue(*z, order, gfp_mask);
+ +              if (page) {
+ +                      zone_statistics(zonelist, *z);
+ +                      break;
+ +              }
+ +      } while (*(++z) != NULL);
+ +      return page;
   }
   
   /*
@@@ -832,75 -807,105 +835,75 @@@ __alloc_pages(gfp_t gfp_mask, unsigned 
                 struct zonelist *zonelist)
   {
         const gfp_t wait = gfp_mask & __GFP_WAIT;
- -      struct zone **zones, *z;
+ +      struct zone **z;
         struct page *page;
         struct reclaim_state reclaim_state;
         struct task_struct *p = current;
- -      int i;
- -      int classzone_idx;
         int do_retry;
- -      int can_try_harder;
+ +      int alloc_flags;
         int did_some_progress;
   
         might_sleep_if(wait);
   
- -      /*
- -       * The caller may dip into page reserves a bit more if the caller
- -       * cannot run direct reclaim, or is the caller has realtime scheduling
- -       * policy
- -       */
- -      can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
+ +      z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
   
- -      zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
- -
- -      if (unlikely(zones[0] == NULL)) {
+ +      if (unlikely(*z == NULL)) {
                 /* Should this ever happen?? */
                 return NULL;
         }
+ +restart:
+ +      page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+ +                              zonelist, ALLOC_CPUSET);
+ +      if (page)
+ +              goto got_pg;
   
- -      classzone_idx = zone_idx(zones[0]);
+ +      do
+ +              wakeup_kswapd(*z, order);
+ +      while (*(++z));
   
- -restart:
         /*
- -       * Go through the zonelist once, looking for a zone with enough free.
- -       * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+ +       * OK, we're below the kswapd watermark and have kicked background
+ +       * reclaim. Now things get more complex, so set up alloc_flags according
+ +       * to how we want to proceed.
+ +       *
+ +       * The caller may dip into page reserves a bit more if the caller
+ +       * cannot run direct reclaim, or if the caller has realtime scheduling
+ +       * policy.
          */
- -      for (i = 0; (z = zones[i]) != NULL; i++) {
- -              int do_reclaim = should_reclaim_zone(z, gfp_mask);
- -
- -              if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
- -                      continue;
- -
- -              /*
- -               * If the zone is to attempt early page reclaim then this loop
- -               * will try to reclaim pages and check the watermark a second
- -               * time before giving up and falling back to the next zone.
- -               */
- -zone_reclaim_retry:
- -              if (!zone_watermark_ok(z, order, z->pages_low,
- -                                     classzone_idx, 0, 0)) {
- -                      if (!do_reclaim)
- -                              continue;
- -                      else {
- -                              zone_reclaim(z, gfp_mask, order);
- -                              /* Only try reclaim once */
- -                              do_reclaim = 0;
- -                              goto zone_reclaim_retry;
- -                      }
- -              }
- -
- -              page = buffered_rmqueue(z, order, gfp_mask);
- -              if (page)
- -                      goto got_pg;
- -      }
- -
- -      for (i = 0; (z = zones[i]) != NULL; i++)
- -              wakeup_kswapd(z, order);
+ +      alloc_flags = 0;
+ +      if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
+ +              alloc_flags |= ALLOC_HARDER;
+ +      if (gfp_mask & __GFP_HIGH)
+ +              alloc_flags |= ALLOC_HIGH;
+ +      if (wait)
+ +              alloc_flags |= ALLOC_CPUSET;
   
         /*
          * Go through the zonelist again. Let __GFP_HIGH and allocations
- -       * coming from realtime tasks to go deeper into reserves
+ +       * coming from realtime tasks go deeper into reserves.
          *
          * This is the last chance, in general, before the goto nopage.
          * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
          * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
          */
- -      for (i = 0; (z = zones[i]) != NULL; i++) {
- -              if (!zone_watermark_ok(z, order, z->pages_min,
- -                                     classzone_idx, can_try_harder,
- -                                     gfp_mask & __GFP_HIGH))
- -                      continue;
- -
- -              if (wait && !cpuset_zone_allowed(z, gfp_mask))
- -                      continue;
- -
- -              page = buffered_rmqueue(z, order, gfp_mask);
- -              if (page)
- -                      goto got_pg;
- -      }
+ +      page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
+ +      if (page)
+ +              goto got_pg;
   
         /* This allocation should allow future memory freeing. */
   
         if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
                         && !in_interrupt()) {
                 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
+ +nofail_alloc:
                         /* go through the zonelist yet again, ignoring mins */
- -                      for (i = 0; (z = zones[i]) != NULL; i++) {
- -                              if (!cpuset_zone_allowed(z, gfp_mask))
- -                                      continue;
- -                              page = buffered_rmqueue(z, order, gfp_mask);
- -                              if (page)
- -                                      goto got_pg;
+ +                      page = get_page_from_freelist(gfp_mask, order,
+ +                              zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET);
+ +                      if (page)
+ +                              goto got_pg;
+ +                      if (gfp_mask & __GFP_NOFAIL) {
+ +                              blk_congestion_wait(WRITE, HZ/50);
+ +                              goto nofail_alloc;
                         }
                 }
                 goto nopage;
@@@ -918,7 -923,7 +921,7 @@@ rebalance
         reclaim_state.reclaimed_slab = 0;
         p->reclaim_state = &reclaim_state;
   
- -      did_some_progress = try_to_free_pages(zones, gfp_mask);
+ +      did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
   
         p->reclaim_state = NULL;
         p->flags &= ~PF_MEMALLOC;
@@@ -926,10 -931,19 +929,10 @@@
         cond_resched();
   
         if (likely(did_some_progress)) {
- -              for (i = 0; (z = zones[i]) != NULL; i++) {
- -                      if (!zone_watermark_ok(z, order, z->pages_min,
- -                                             classzone_idx, can_try_harder,
- -                                             gfp_mask & __GFP_HIGH))
- -                              continue;
- -
- -                      if (!cpuset_zone_allowed(z, gfp_mask))
- -                              continue;
- -
- -                      page = buffered_rmqueue(z, order, gfp_mask);
- -                      if (page)
- -                              goto got_pg;
- -              }
+ +              page = get_page_from_freelist(gfp_mask, order,
+ +                                              zonelist, alloc_flags);
+ +              if (page)
+ +                      goto got_pg;
         } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
                 /*
                  * Go through the zonelist yet one more time, keep
@@@ -937,10 -951,18 +940,10 @@@
                  * a parallel oom killing, we must fail if we're still
                  * under heavy pressure.
                  */
- -              for (i = 0; (z = zones[i]) != NULL; i++) {
- -                      if (!zone_watermark_ok(z, order, z->pages_high,
- -                                             classzone_idx, 0, 0))
- -                              continue;
- -
- -                      if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
- -                              continue;
- -
- -                      page = buffered_rmqueue(z, order, gfp_mask);
- -                      if (page)
- -                              goto got_pg;
- -              }
+ +              page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+ +                                              zonelist, ALLOC_CPUSET);
+ +              if (page)
+ +                      goto got_pg;
   
                 out_of_memory(gfp_mask, order);
                 goto restart;
@@@ -973,7 -995,9 +976,7 @@@ nopage
                 dump_stack();
                 show_mem();
         }
- -      return NULL;
   got_pg:
- -      zone_statistics(zonelist, z);
         return page;
   }
   
@@@ -1310,7 -1334,7 +1313,7 @@@ void show_free_areas(void
                 } else
                         printk("\n");
   
- -              for_each_cpu(cpu) {
+ +              for_each_online_cpu(cpu) {
                         struct per_cpu_pageset *pageset;
   
                         pageset = zone_pcp(zone, cpu);
@@@ -1421,6 -1445,10 +1424,10 @@@ static int __init build_zonelists_node(
                 zone = pgdat->node_zones + ZONE_NORMAL;
                 if (zone->present_pages)
                         zonelist->zones[j++] = zone;
+       case ZONE_DMA32:
+               zone = pgdat->node_zones + ZONE_DMA32;
+               if (zone->present_pages)
+                       zonelist->zones[j++] = zone;
         case ZONE_DMA:
                 zone = pgdat->node_zones + ZONE_DMA;
                 if (zone->present_pages)
@@@ -1435,6 -1463,8 +1442,8 @@@ static inline int highest_zone(int zone
         int res = ZONE_NORMAL;
         if (zone_bits & (__force int)__GFP_HIGHMEM)
                 res = ZONE_HIGHMEM;
+       if (zone_bits & (__force int)__GFP_DMA32)
+               res = ZONE_DMA32;
         if (zone_bits & (__force int)__GFP_DMA)
                 res = ZONE_DMA;
         return res;
@@@ -1846,11 -1876,10 +1855,10 @@@ static int __devinit pageset_cpuup_call
                         if (process_zones(cpu))
                                 ret = NOTIFY_BAD;
                         break;
- #ifdef CONFIG_HOTPLUG_CPU
+               case CPU_UP_CANCELED:
                 case CPU_DEAD:
                         free_zone_pagesets(cpu);
                         break;
- #endif
                 default:
                         break;
         }
@@@ -1955,7 -1984,7 +1963,7 @@@ static void __init free_area_init_core(
                 if (zholes_size)
                         realsize -= zholes_size[j];
   
-               if (j == ZONE_DMA || j == ZONE_NORMAL)
+               if (j < ZONE_HIGHMEM)
                         nr_kernel_pages += realsize;
                 nr_all_pages += realsize;
   
@@@ -2397,18 -2426,13 +2405,18 @@@ void setup_per_zone_pages_min(void
         }
   
         for_each_zone(zone) {
+ +              unsigned long tmp;
                 spin_lock_irqsave(&zone->lru_lock, flags);
+ +              tmp = (pages_min * zone->present_pages) / lowmem_pages;
                 if (is_highmem(zone)) {
                         /*
- -                       * Often, highmem doesn't need to reserve any pages.
- -                       * But the pages_min/low/high values are also used for
- -                       * batching up page reclaim activity so we need a
- -                       * decent value here.
+ +                       * __GFP_HIGH and PF_MEMALLOC allocations usually don't
+ +                       * need highmem pages, so cap pages_min to a small
+ +                       * value here.
+ +                       *
+ +                       * The (pages_high-pages_low) and (pages_low-pages_min)
+ +                       * deltas control asynchronous page reclaim, and so should
+ +                       * not be capped for highmem.
                          */
                         int min_pages;
   
@@@ -2419,15 -2443,19 +2427,15 @@@
                                 min_pages = 128;
                         zone->pages_min = min_pages;
                 } else {
- -                      /* if it's a lowmem zone, reserve a number of pages
+ +                      /*
+ +                       * If it's a lowmem zone, reserve a number of pages
                          * proportionate to the zone's size.
                          */
- -                      zone->pages_min = (pages_min * zone->present_pages) /
- -                                         lowmem_pages;
+ +                      zone->pages_min = tmp;
                 }
   
- -              /*
- -               * When interpreting these watermarks, just keep in mind that:
- -               * zone->pages_min == (zone->pages_min * 4) / 4;
- -               */
- -              zone->pages_low   = (zone->pages_min * 5) / 4;
- -              zone->pages_high  = (zone->pages_min * 6) / 4;
+ +              zone->pages_low   = zone->pages_min + tmp / 4;
+ +              zone->pages_high  = zone->pages_min + tmp / 2;
                 spin_unlock_irqrestore(&zone->lru_lock, flags);
         }
   }
author	Linus Torvalds <torvalds@g5.osdl.org>
	Tue, 15 Nov 2005 03:56:02 +0000 (19:56 -0800)
committer	Linus Torvalds <torvalds@g5.osdl.org>
	Tue, 15 Nov 2005 03:56:02 +0000 (19:56 -0800)
		1	2
arch/i386/kernel/cpu/common.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/i386/kernel/cpu/intel.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/i386/kernel/smpboot.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/ia64/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86_64/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86_64/Kconfig.debug	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86_64/kernel/i8259.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86_64/kernel/process.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86_64/kernel/smpboot.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/char/agp/amd64-agp.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/asm-i386/processor.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/asm-x86_64/desc.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/asm-x86_64/pgtable.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/asm-x86_64/smp.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/gfp.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/mm.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/mmzone.h	patch \|	diff1 \|	diff2 \|	blob \| history
mm/page_alloc.c	patch \|	diff1 \|	diff2 \|	blob \| history