struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
-extern void mcheck_init(struct cpuinfo_x86 *c);
-
extern int disable_pse;
static void default_init(struct cpuinfo_x86 * c)
cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
c->x86 = (tfms >> 8) & 15;
c->x86_model = (tfms >> 4) & 15;
- if (c->x86 == 0xf) {
+ if (c->x86 == 0xf)
c->x86 += (tfms >> 20) & 0xff;
+ if (c->x86 >= 0x6)
c->x86_model += ((tfms >> 16) & 0xF) << 4;
- }
c->x86_mask = tfms & 15;
if (cap0 & (1<<19))
c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
c->x86_model = c->x86_mask = 0; /* So far unknown... */
c->x86_vendor_id[0] = '\0'; /* Unset */
c->x86_model_id[0] = '\0'; /* Unset */
- c->x86_num_cores = 1;
+ c->x86_max_cores = 1;
memset(&c->x86_capability, 0, sizeof c->x86_capability);
if (!have_cpuid_p()) {
}
/* Init Machine Check Exception if available. */
-#ifdef CONFIG_X86_MCE
mcheck_init(c);
-#endif
+
if (c == &boot_cpu_data)
sysenter_setup();
enable_sep_cpu();
void __devinit detect_ht(struct cpuinfo_x86 *c)
{
u32 eax, ebx, ecx, edx;
- int index_msb, tmp;
+ int index_msb, core_bits;
int cpu = smp_processor_id();
+ cpuid(1, &eax, &ebx, &ecx, &edx);
+
+ c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
+
if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
return;
- cpuid(1, &eax, &ebx, &ecx, &edx);
smp_num_siblings = (ebx & 0xff0000) >> 16;
if (smp_num_siblings == 1) {
printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
} else if (smp_num_siblings > 1 ) {
- index_msb = 31;
if (smp_num_siblings > NR_CPUS) {
printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
smp_num_siblings = 1;
return;
}
- tmp = smp_num_siblings;
- while ((tmp & 0x80000000 ) == 0) {
- tmp <<=1 ;
- index_msb--;
- }
- if (smp_num_siblings & (smp_num_siblings - 1))
- index_msb++;
+
+ index_msb = get_count_order(smp_num_siblings);
phys_proc_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
phys_proc_id[cpu]);
- smp_num_siblings = smp_num_siblings / c->x86_num_cores;
+ smp_num_siblings = smp_num_siblings / c->x86_max_cores;
- tmp = smp_num_siblings;
- index_msb = 31;
- while ((tmp & 0x80000000) == 0) {
- tmp <<=1 ;
- index_msb--;
- }
+ index_msb = get_count_order(smp_num_siblings) ;
- if (smp_num_siblings & (smp_num_siblings - 1))
- index_msb++;
+ core_bits = get_count_order(c->x86_max_cores);
- cpu_core_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
+ cpu_core_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
+ ((1 << core_bits) - 1);
- if (c->x86_num_cores > 1)
+ if (c->x86_max_cores > 1)
printk(KERN_INFO "CPU: Processor Core ID: %d\n",
cpu_core_id[cpu]);
}
#include <linux/bitops.h>
#include <linux/smp.h>
#include <linux/thread_info.h>
+#include <linux/module.h>
#include <asm/processor.h>
#include <asm/msr.h>
if ( p )
strcpy(c->x86_model_id, p);
- c->x86_num_cores = num_cpu_cores(c);
+ c->x86_max_cores = num_cpu_cores(c);
detect_ht(c);
return 0;
}
+#ifndef CONFIG_X86_CMPXCHG
+unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
+{
+ u8 prev;
+ unsigned long flags;
+
+ /* Poor man's cmpxchg for 386. Unsuitable for SMP */
+ local_irq_save(flags);
+ prev = *(u8 *)ptr;
+ if (prev == old)
+ *(u8 *)ptr = new;
+ local_irq_restore(flags);
+ return prev;
+}
+EXPORT_SYMBOL(cmpxchg_386_u8);
+
+unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
+{
+ u16 prev;
+ unsigned long flags;
+
+ /* Poor man's cmpxchg for 386. Unsuitable for SMP */
+ local_irq_save(flags);
+ prev = *(u16 *)ptr;
+ if (prev == old)
+ *(u16 *)ptr = new;
+ local_irq_restore(flags);
+ return prev;
+}
+EXPORT_SYMBOL(cmpxchg_386_u16);
+
+unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
+{
+ u32 prev;
+ unsigned long flags;
+
+ /* Poor man's cmpxchg for 386. Unsuitable for SMP */
+ local_irq_save(flags);
+ prev = *(u32 *)ptr;
+ if (prev == old)
+ *(u32 *)ptr = new;
+ local_irq_restore(flags);
+ return prev;
+}
+EXPORT_SYMBOL(cmpxchg_386_u32);
+#endif
+
// arch_initcall(intel_cpu_init);
/* Package ID of each logical CPU */
int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
-EXPORT_SYMBOL(phys_proc_id);
/* Core ID of each logical CPU */
int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
-EXPORT_SYMBOL(cpu_core_id);
+ /* representing HT siblings of each logical CPU */
cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(cpu_sibling_map);
+ /* representing HT and core siblings of each logical CPU */
cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(cpu_core_map);
static int cpucount;
+ /* representing cpus for which sibling maps can be computed */
+ static cpumask_t cpu_sibling_setup_map;
+
static inline void
set_cpu_sibling_map(int cpu)
{
int i;
+ struct cpuinfo_x86 *c = cpu_data;
+
+ cpu_set(cpu, cpu_sibling_setup_map);
if (smp_num_siblings > 1) {
- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_isset(i, cpu_callout_map))
- continue;
- if (cpu_core_id[cpu] == cpu_core_id[i]) {
+ for_each_cpu_mask(i, cpu_sibling_setup_map) {
+ if (phys_proc_id[cpu] == phys_proc_id[i] &&
+ cpu_core_id[cpu] == cpu_core_id[i]) {
cpu_set(i, cpu_sibling_map[cpu]);
cpu_set(cpu, cpu_sibling_map[i]);
+ cpu_set(i, cpu_core_map[cpu]);
+ cpu_set(cpu, cpu_core_map[i]);
}
}
} else {
cpu_set(cpu, cpu_sibling_map[cpu]);
}
- if (current_cpu_data.x86_num_cores > 1) {
- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_isset(i, cpu_callout_map))
- continue;
- if (phys_proc_id[cpu] == phys_proc_id[i]) {
- cpu_set(i, cpu_core_map[cpu]);
- cpu_set(cpu, cpu_core_map[i]);
- }
- }
- } else {
+ if (current_cpu_data.x86_max_cores == 1) {
cpu_core_map[cpu] = cpu_sibling_map[cpu];
+ c[cpu].booted_cores = 1;
+ return;
+ }
+
+ for_each_cpu_mask(i, cpu_sibling_setup_map) {
+ if (phys_proc_id[cpu] == phys_proc_id[i]) {
+ cpu_set(i, cpu_core_map[cpu]);
+ cpu_set(cpu, cpu_core_map[i]);
+ /*
+ * Does this new cpu bringup a new core?
+ */
+ if (cpus_weight(cpu_sibling_map[cpu]) == 1) {
+ /*
+ * for each core in package, increment
+ * the booted_cores for this new cpu
+ */
+ if (first_cpu(cpu_sibling_map[i]) == i)
+ c[cpu].booted_cores++;
+ /*
+ * increment the core count for all
+ * the other cpus in this package
+ */
+ if (i != cpu)
+ c[i].booted_cores++;
+ } else if (i != cpu && !c[cpu].booted_cores)
+ c[cpu].booted_cores = c[i].booted_cores;
+ }
}
}
* things done here to the most necessary things.
*/
cpu_init();
+ preempt_disable();
smp_callin();
while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
rep_nop();
printk("Inquiring remote APIC #%d...\n", apicid);
- for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) {
+ for (i = 0; i < ARRAY_SIZE(regs); i++) {
printk("... APIC #%d %s: ", apicid, names[i]);
/*
current_thread_info()->cpu = 0;
smp_tune_scheduling();
- cpus_clear(cpu_sibling_map[0]);
- cpu_set(0, cpu_sibling_map[0]);
- cpus_clear(cpu_core_map[0]);
- cpu_set(0, cpu_core_map[0]);
+ set_cpu_sibling_map(0);
/*
* If we couldn't find an SMP configuration at boot time,
remove_siblinginfo(int cpu)
{
int sibling;
+ struct cpuinfo_x86 *c = cpu_data;
+ for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
+ cpu_clear(cpu, cpu_core_map[sibling]);
+ /*
+ * last thread sibling in this cpu core going down
+ */
+ if (cpus_weight(cpu_sibling_map[cpu]) == 1)
+ c[sibling].booted_cores--;
+ }
+
for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
cpu_clear(cpu, cpu_sibling_map[sibling]);
- for_each_cpu_mask(sibling, cpu_core_map[cpu])
- cpu_clear(cpu, cpu_core_map[sibling]);
cpus_clear(cpu_sibling_map[cpu]);
cpus_clear(cpu_core_map[cpu]);
phys_proc_id[cpu] = BAD_APICID;
cpu_core_id[cpu] = BAD_APICID;
+ cpu_clear(cpu, cpu_sibling_setup_map);
}
int __cpu_disable(void)
bool
select GENERIC_ALLOCATOR
+ config ZONE_DMA_IS_DMA32
+ bool
+ default y
+
choice
prompt "System type"
default IA64_GENERIC
endchoice
+choice
+ prompt "Page Table Levels"
+ default PGTABLE_3
+
+config PGTABLE_3
+ bool "3 Levels"
+
+config PGTABLE_4
+ depends on !IA64_PAGE_SIZE_64KB
+ bool "4 Levels"
+
+endchoice
+
source kernel/Kconfig.hz
config IA64_BRL_EMU
config IA64_SGI_SN_XP
tristate "Support communication between SGI SSIs"
+ depends on IA64_GENERIC || IA64_SGI_SN2
select IA64_UNCACHED_ALLOCATOR
help
An SGI machine can be divided into multiple Single System
source "arch/ia64/hp/sim/Kconfig"
+menu "Instrumentation Support"
+ depends on EXPERIMENTAL
+
source "arch/ia64/oprofile/Kconfig"
+config KPROBES
+ bool "Kprobes (EXPERIMENTAL)"
+ help
+ Kprobes allows you to trap at almost any kernel address and
+ execute a callback function. register_kprobe() establishes
+ a probepoint and specifies the callback. Kprobes is useful
+ for kernel debugging, non-intrusive instrumentation and testing.
+ If in doubt, say "N".
+endmenu
+
source "arch/ia64/Kconfig.debug"
source "security/Kconfig"
source "kernel/Kconfig.preempt"
- config K8_NUMA
- bool "K8 NUMA support"
- select NUMA
+ config NUMA
+ bool "Non Uniform Memory Access (NUMA) Support"
depends on SMP
help
- Enable NUMA (Non Unified Memory Architecture) support for
- AMD Opteron Multiprocessor systems. The kernel will try to allocate
- memory used by a CPU on the local memory controller of the CPU
- and add some more NUMA awareness to the kernel.
- This code is recommended on all multiprocessor Opteron systems
- and normally doesn't hurt on others.
+ Enable NUMA (Non Uniform Memory Access) support. The kernel
+ will try to allocate memory used by a CPU on the local memory
+ controller of the CPU and add some more NUMA awareness to the kernel.
+ This code is recommended on all multiprocessor Opteron systems.
+ If the system is EM64T, you should say N unless your system is EM64T
+ NUMA.
+
+ config K8_NUMA
+ bool "Old style AMD Opteron NUMA detection"
+ depends on NUMA
+ default y
+ help
+ Enable K8 NUMA node topology detection. You should say Y here if
+ you have a multi processor AMD K8 system. This uses an old
+ method to read the NUMA configurtion directly from the builtin
+ Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
+ instead, which also takes priority if both are compiled in.
+
+ # Dummy CONFIG option to select ACPI_NUMA from drivers/acpi/Kconfig.
+
+ config X86_64_ACPI_NUMA
+ bool "ACPI NUMA detection"
+ depends on NUMA
+ select ACPI
+ select ACPI_NUMA
+ default y
+ help
+ Enable ACPI SRAT based node topology detection.
config NUMA_EMU
- bool "NUMA emulation support"
- select NUMA
- depends on SMP
+ bool "NUMA emulation"
+ depends on NUMA
help
Enable NUMA emulation. A flat machine will be split
into virtual nodes when booted with "numa=fake=N", where N is the
depends on NUMA
default y
- config NUMA
- bool
- default n
config ARCH_DISCONTIGMEM_ENABLE
def_bool y
Additional support for intel specific MCE features such as
the thermal monitor.
+ config X86_MCE_AMD
+ bool "AMD MCE features"
+ depends on X86_MCE && X86_LOCAL_APIC
+ default y
+ help
+ Additional support for AMD specific MCE features such as
+ the DRAM Error Threshold.
+
config PHYSICAL_START
hex "Physical address where the kernel is loaded" if EMBEDDED
default "0x100000"
left.
config IA32_AOUT
- bool "IA32 a.out support"
+ tristate "IA32 a.out support"
depends on IA32_EMULATION
help
Support old a.out binaries in the 32bit emulation.
source fs/Kconfig
+menu "Instrumentation Support"
+ depends on EXPERIMENTAL
+
source "arch/x86_64/oprofile/Kconfig"
+config KPROBES
+ bool "Kprobes (EXPERIMENTAL)"
+ help
+ Kprobes allows you to trap at almost any kernel address and
+ execute a callback function. register_kprobe() establishes
+ a probepoint and specifies the callback. Kprobes is useful
+ for kernel debugging, non-intrusive instrumentation and testing.
+ If in doubt, say "N".
+endmenu
+
source "arch/x86_64/Kconfig.debug"
source "security/Kconfig"
source "lib/Kconfig.debug"
- # !SMP for now because the context switch early causes GPF in segment reloading
- # and the GS base checking does the wrong thing then, causing a hang.
- config CHECKING
- bool "Additional run-time checks"
- depends on DEBUG_KERNEL && !SMP
- help
- Enables some internal consistency checks for kernel debugging.
- You should normally say N.
-
config INIT_DEBUG
bool "Debug __init statements"
depends on DEBUG_KERNEL
options. See Documentation/x86_64/boot-options.txt for more
details.
-config KPROBES
- bool "Kprobes"
- depends on DEBUG_KERNEL
- help
- Kprobes allows you to trap at almost any kernel address and
- execute a callback function. register_kprobe() establishes
- a probepoint and specifies the callback. Kprobes is useful
- for kernel debugging, non-intrusive instrumentation and testing.
- If in doubt, say "N".
-
config IOMMU_LEAK
bool "IOMMU leak tracing"
depends on DEBUG_KERNEL
void invalidate_interrupt6(void);
void invalidate_interrupt7(void);
void thermal_interrupt(void);
+ void threshold_interrupt(void);
void i8254_timer_resume(void);
static void setup_timer_hardware(void)
}
static struct sysdev_class timer_sysclass = {
- set_kset_name("timer"),
+ set_kset_name("timer_pit"),
.resume = timer_resume,
};
set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
#endif
set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+ set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
#ifdef CONFIG_X86_LOCAL_APIC
/* self generated IPI for local APIC timer */
*/
void default_idle(void)
{
+ local_irq_enable();
+
if (!atomic_read(&hlt_counter)) {
- local_irq_disable();
- if (!need_resched())
- safe_halt();
- else
- local_irq_enable();
+ clear_thread_flag(TIF_POLLING_NRFLAG);
+ smp_mb__after_clear_bit();
+ while (!need_resched()) {
+ local_irq_disable();
+ if (!need_resched())
+ safe_halt();
+ else
+ local_irq_enable();
+ }
+ set_thread_flag(TIF_POLLING_NRFLAG);
+ } else {
+ while (!need_resched())
+ cpu_relax();
}
}
*/
static void poll_idle (void)
{
- int oldval;
-
local_irq_enable();
- /*
- * Deal with another CPU just having chosen a thread to
- * run here:
- */
- oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
-
- if (!oldval) {
- set_thread_flag(TIF_POLLING_NRFLAG);
- asm volatile(
- "2:"
- "testl %0,%1;"
- "rep; nop;"
- "je 2b;"
- : :
- "i" (_TIF_NEED_RESCHED),
- "m" (current_thread_info()->flags));
- clear_thread_flag(TIF_POLLING_NRFLAG);
- } else {
- set_need_resched();
- }
+ asm volatile(
+ "2:"
+ "testl %0,%1;"
+ "rep; nop;"
+ "je 2b;"
+ : :
+ "i" (_TIF_NEED_RESCHED),
+ "m" (current_thread_info()->flags));
}
void cpu_idle_wait(void)
do {
ssleep(1);
for_each_online_cpu(cpu) {
- if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
+ if (cpu_isset(cpu, map) &&
+ !per_cpu(cpu_idle_state, cpu))
cpu_clear(cpu, map);
}
cpus_and(map, map, cpu_online_map);
*/
void cpu_idle (void)
{
+ set_thread_flag(TIF_POLLING_NRFLAG);
+
/* endless idle loop with no priority at all */
while (1) {
while (!need_resched()) {
idle();
}
+ preempt_enable_no_resched();
schedule();
+ preempt_disable();
}
}
{
local_irq_enable();
- if (!need_resched()) {
- set_thread_flag(TIF_POLLING_NRFLAG);
- do {
- __monitor((void *)¤t_thread_info()->flags, 0, 0);
- if (need_resched())
- break;
- __mwait(0, 0);
- } while (!need_resched());
- clear_thread_flag(TIF_POLLING_NRFLAG);
+ while (!need_resched()) {
+ __monitor((void *)¤t_thread_info()->flags, 0, 0);
+ smp_mb();
+ if (need_resched())
+ break;
+ __mwait(0, 0);
}
}
system_utsname.version);
printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
printk_address(regs->rip);
- printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags);
+ printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
+ regs->eflags);
printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
regs->rax, regs->rbx, regs->rcx);
printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
struct pt_regs * childregs;
struct task_struct *me = current;
- childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
-
+ childregs = ((struct pt_regs *)
+ (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
*childregs = *regs;
childregs->rax = 0;
childregs->rsp = rsp;
- if (rsp == ~0UL) {
+ if (rsp == ~0UL)
childregs->rsp = (unsigned long)childregs;
- }
p->thread.rsp = (unsigned long) childregs;
p->thread.rsp0 = (unsigned long) (childregs+1);
p->thread.io_bitmap_max = 0;
return -ENOMEM;
}
- memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES);
+ memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
+ IO_BITMAP_BYTES);
}
/*
* - fold all the options into a flag word and test it with a single test.
* - could test fs/gs bitsliced
*/
- struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+ struct task_struct *
+ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
prev->userrsp = read_pda(oldrsp);
write_pda(oldrsp, next->userrsp);
write_pda(pcurrent, next_p);
- write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);
+ write_pda(kernelstack,
+ (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);
/*
* Now maybe reload the debug registers
return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}
- asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
+ asmlinkage long
+ sys_clone(unsigned long clone_flags, unsigned long newsp,
+ void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
if (!newsp)
newsp = regs->rsp;
return 0;
fp = *(u64 *)(p->thread.rsp);
do {
- if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE)
+ if (fp < (unsigned long)stack ||
+ fp > (unsigned long)stack+THREAD_SIZE)
return 0;
rip = *(u64 *)(fp+8);
if (!in_sched_functions(rip))
task->thread.gsindex = 0;
task->thread.gs = addr;
if (doit) {
- load_gs_index(0);
- ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
+ load_gs_index(0);
+ ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
}
}
put_cpu();
set_32bit_tls(task, FS_TLS, addr);
if (doit) {
load_TLS(&task->thread, cpu);
- asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL));
+ asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
}
task->thread.fsindex = FS_TLS_SEL;
task->thread.fs = 0;
if (doit) {
/* set the selector to 0 to not confuse
__switch_to */
- asm volatile("movl %0,%%fs" :: "r" (0));
- ret = checking_wrmsrl(MSR_FS_BASE, addr);
+ asm volatile("movl %0,%%fs" :: "r" (0));
+ ret = checking_wrmsrl(MSR_FS_BASE, addr);
}
}
put_cpu();
unsigned long base;
if (task->thread.fsindex == FS_TLS_SEL)
base = read_32bit_tls(task, FS_TLS);
- else if (doit) {
+ else if (doit)
rdmsrl(MSR_FS_BASE, base);
- } else
+ else
base = task->thread.fs;
ret = put_user(base, (unsigned long __user *)addr);
break;
unsigned long base;
if (task->thread.gsindex == GS_TLS_SEL)
base = read_32bit_tls(task, GS_TLS);
- else if (doit) {
+ else if (doit)
rdmsrl(MSR_KERNEL_GS_BASE, base);
- } else
+ else
base = task->thread.gs;
ret = put_user(base, (unsigned long __user *)addr);
break;
int smp_num_siblings = 1;
/* Package ID of each logical CPU */
u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
+ /* core ID of each logical CPU */
u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
-EXPORT_SYMBOL(phys_proc_id);
-EXPORT_SYMBOL(cpu_core_id);
/* Bitmask of currently online CPUs */
cpumask_t cpu_online_map __read_mostly;
/* Set when the idlers are all forked */
int smp_threads_ready;
+ /* representing HT siblings of each logical CPU */
cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
+
+ /* representing HT and core siblings of each logical CPU */
cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(cpu_core_map);
cpu_set(cpuid, cpu_callin_map);
}
+ /* representing cpus for which sibling maps can be computed */
+ static cpumask_t cpu_sibling_setup_map;
+
static inline void set_cpu_sibling_map(int cpu)
{
int i;
+ struct cpuinfo_x86 *c = cpu_data;
+
+ cpu_set(cpu, cpu_sibling_setup_map);
if (smp_num_siblings > 1) {
- for_each_cpu(i) {
- if (cpu_core_id[cpu] == cpu_core_id[i]) {
+ for_each_cpu_mask(i, cpu_sibling_setup_map) {
+ if (phys_proc_id[cpu] == phys_proc_id[i] &&
+ cpu_core_id[cpu] == cpu_core_id[i]) {
cpu_set(i, cpu_sibling_map[cpu]);
cpu_set(cpu, cpu_sibling_map[i]);
+ cpu_set(i, cpu_core_map[cpu]);
+ cpu_set(cpu, cpu_core_map[i]);
}
}
} else {
cpu_set(cpu, cpu_sibling_map[cpu]);
}
- if (current_cpu_data.x86_num_cores > 1) {
- for_each_cpu(i) {
- if (phys_proc_id[cpu] == phys_proc_id[i]) {
- cpu_set(i, cpu_core_map[cpu]);
- cpu_set(cpu, cpu_core_map[i]);
- }
- }
- } else {
+ if (current_cpu_data.x86_max_cores == 1) {
cpu_core_map[cpu] = cpu_sibling_map[cpu];
+ c[cpu].booted_cores = 1;
+ return;
+ }
+
+ for_each_cpu_mask(i, cpu_sibling_setup_map) {
+ if (phys_proc_id[cpu] == phys_proc_id[i]) {
+ cpu_set(i, cpu_core_map[cpu]);
+ cpu_set(cpu, cpu_core_map[i]);
+ /*
+ * Does this new cpu bringup a new core?
+ */
+ if (cpus_weight(cpu_sibling_map[cpu]) == 1) {
+ /*
+ * for each core in package, increment
+ * the booted_cores for this new cpu
+ */
+ if (first_cpu(cpu_sibling_map[i]) == i)
+ c[cpu].booted_cores++;
+ /*
+ * increment the core count for all
+ * the other cpus in this package
+ */
+ if (i != cpu)
+ c[i].booted_cores++;
+ } else if (i != cpu && !c[cpu].booted_cores)
+ c[cpu].booted_cores = c[i].booted_cores;
+ }
}
}
* things done here to the most necessary things.
*/
cpu_init();
+ preempt_disable();
smp_callin();
/* otherwise gcc will move up the smp_processor_id before the cpu_init */
}
#ifdef CONFIG_HOTPLUG_CPU
+
+ int additional_cpus __initdata = -1;
+
/*
* cpu_possible_map should be static, it cannot change as cpu's
* are onlined, or offlined. The reason is per-cpu data-structures
* cpu_present_map on the other hand can change dynamically.
* In case when cpu_hotplug is not compiled, then we resort to current
* behaviour, which is cpu_possible == cpu_present.
- * If cpu-hotplug is supported, then we need to preallocate for all
- * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range.
* - Ashok Raj
+ *
+ * Three ways to find out the number of additional hotplug CPUs:
+ * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
+ * - otherwise use half of the available CPUs or 2, whatever is more.
+ * - The user can overwrite it with additional_cpus=NUM
+ * We do this because additional CPUs waste a lot of memory.
+ * -AK
*/
__init void prefill_possible_map(void)
{
int i;
- for (i = 0; i < NR_CPUS; i++)
+ int possible;
+
+ if (additional_cpus == -1) {
+ if (disabled_cpus > 0) {
+ additional_cpus = disabled_cpus;
+ } else {
+ additional_cpus = num_processors / 2;
+ if (additional_cpus == 0)
+ additional_cpus = 2;
+ }
+ }
+ possible = num_processors + additional_cpus;
+ if (possible > NR_CPUS)
+ possible = NR_CPUS;
+
+ printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
+ possible,
+ max_t(int, possible - num_processors, 0));
+
+ for (i = 0; i < possible; i++)
cpu_set(i, cpu_possible_map);
}
#endif
nmi_watchdog_default();
current_cpu_data = boot_cpu_data;
current_thread_info()->cpu = 0; /* needed? */
+ set_cpu_sibling_map(0);
if (smp_sanity_check(max_cpus) < 0) {
printk(KERN_INFO "SMP disabled\n");
int me = smp_processor_id();
cpu_set(me, cpu_online_map);
cpu_set(me, cpu_callout_map);
- cpu_set(0, cpu_sibling_map[0]);
- cpu_set(0, cpu_core_map[0]);
per_cpu(cpu_state, me) = CPU_ONLINE;
}
*/
void __init smp_cpus_done(unsigned int max_cpus)
{
- #ifndef CONFIG_HOTPLUG_CPU
- zap_low_mappings();
- #endif
smp_cleanup_boot();
#ifdef CONFIG_X86_IO_APIC
static void remove_siblinginfo(int cpu)
{
int sibling;
+ struct cpuinfo_x86 *c = cpu_data;
+ for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
+ cpu_clear(cpu, cpu_core_map[sibling]);
+ /*
+ * last thread sibling in this cpu core going down
+ */
+ if (cpus_weight(cpu_sibling_map[cpu]) == 1)
+ c[sibling].booted_cores--;
+ }
+
for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
cpu_clear(cpu, cpu_sibling_map[sibling]);
- for_each_cpu_mask(sibling, cpu_core_map[cpu])
- cpu_clear(cpu, cpu_core_map[sibling]);
cpus_clear(cpu_sibling_map[cpu]);
cpus_clear(cpu_core_map[cpu]);
phys_proc_id[cpu] = BAD_APICID;
cpu_core_id[cpu] = BAD_APICID;
+ cpu_clear(cpu, cpu_sibling_setup_map);
}
void remove_cpu_from_maps(void)
printk(KERN_ERR "CPU %u didn't die...\n", cpu);
}
+ static __init int setup_additional_cpus(char *s)
+ {
+ return get_option(&s, &additional_cpus);
+ }
+ __setup("additional_cpus=", setup_additional_cpus);
+
#else /* ... !CONFIG_HOTPLUG_CPU */
int __cpu_disable(void)
#include <linux/pci.h>
#include <linux/init.h>
#include <linux/agp_backend.h>
+#include <linux/mmzone.h>
#include <asm/page.h> /* PAGE_SIZE */
#include "agp.h"
static struct pci_dev * hammers[MAX_HAMMER_GARTS];
static struct resource *aperture_resource;
- static int __initdata agp_try_unsupported;
+ static int __initdata agp_try_unsupported = 1;
- static int gart_iterator;
#define for_each_nb() for(gart_iterator=0;gart_iterator<nr_garts;gart_iterator++)
static void flush_amd64_tlb(struct pci_dev *dev)
static void amd64_tlbflush(struct agp_memory *temp)
{
+ int gart_iterator;
for_each_nb()
flush_amd64_tlb(hammers[gart_iterator]);
}
static int amd_8151_configure(void)
{
unsigned long gatt_bus = virt_to_gart(agp_bridge->gatt_table_real);
+ int gart_iterator;
/* Configure AGP regs in each x86-64 host bridge. */
for_each_nb() {
static void amd64_cleanup(void)
{
u32 tmp;
-
+ int gart_iterator;
for_each_nb() {
/* disable gart translation */
pci_read_config_dword (hammers[gart_iterator], AMD64_GARTAPERTURECTL, &tmp);
.subvendor = PCI_ANY_ID,
.subdevice = PCI_ANY_ID,
},
+ /* ALI/ULI M1695 */
+ {
+ .class = (PCI_CLASS_BRIDGE_HOST << 8),
+ .class_mask = ~0,
+ .vendor = PCI_VENDOR_ID_AL,
+ .device = 0x1689,
+ .subvendor = PCI_ANY_ID,
+ .subdevice = PCI_ANY_ID,
+ },
+
{ }
};
int f00f_bug;
int coma_bug;
unsigned long loops_per_jiffy;
- unsigned char x86_num_cores;
+ unsigned char x86_max_cores; /* cpuid returned max cores value */
+ unsigned char booted_cores; /* number of cores as seen by OS */
+ unsigned char apicid;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
#define X86_VENDOR_INTEL 0
#define mtrr_bp_init() do {} while (0)
#endif
+#ifdef CONFIG_X86_MCE
+extern void mcheck_init(struct cpuinfo_x86 *c);
+#else
+#define mcheck_init(c) do {} while(0)
+#endif
+
#endif /* __ASM_I386_PROCESSOR_H */
static inline void set_intr_gate(int nr, void *func)
{
+ BUG_ON((unsigned)nr > 0xFF);
_set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0);
}
static inline void set_intr_gate_ist(int nr, void *func, unsigned ist)
{
+ BUG_ON((unsigned)nr > 0xFF);
_set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist);
}
static inline void set_system_gate(int nr, void *func)
{
+ BUG_ON((unsigned)nr > 0xFF);
_set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0);
}
static inline void set_tss_desc(unsigned cpu, void *addr)
{
- set_tssldt_descriptor(&cpu_gdt_table[cpu][GDT_ENTRY_TSS], (unsigned long)addr,
- DESC_TSS,
- sizeof(struct tss_struct) - 1);
+ /*
+ * sizeof(unsigned long) coming from an extra "long" at the end
+ * of the iobitmap. See tss_struct definition in processor.h
+ *
+ * -1? seg base+limit should be pointing to the address of the
+ * last valid byte
+ */
+ set_tssldt_descriptor(&cpu_gdt_table[cpu][GDT_ENTRY_TSS],
+ (unsigned long)addr, DESC_TSS,
+ IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
}
static inline void set_ldt_desc(unsigned cpu, void *addr, int size)
extern pud_t level3_ident_pgt[512];
extern pmd_t level2_kernel_pgt[512];
extern pgd_t init_level4_pgt[];
+ extern pgd_t boot_level4_pgt[];
extern unsigned long __supported_pte_mask;
#define swapper_pg_dir init_level4_pgt
#define ptep_get_and_clear(mm,addr,xp) __pte(xchg(&(xp)->pte, 0))
+struct mm_struct;
+
static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
{
pte_t pte;
#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) /* FIXME: is this
right? */
#define pte_page(x) pfn_to_page(pte_pfn(x))
- #define pte_pfn(x) ((pte_val(x) >> PAGE_SHIFT) & __PHYSICAL_MASK)
+ #define pte_pfn(x) ((pte_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
{
#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
#define pmd_bad(x) ((pmd_val(x) & (~PTE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE )
#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
- #define pmd_pfn(x) ((pmd_val(x) >> PAGE_SHIFT) & __PHYSICAL_MASK)
+ #define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
#define pte_to_pgoff(pte) ((pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
extern void unlock_ipi_call_lock(void);
extern int smp_num_siblings;
extern void smp_send_reschedule(int cpu);
- extern void zap_low_mappings(void);
void smp_stop_cpu(void);
extern int smp_call_function_single(int cpuid, void (*func) (void *info),
void *info, int retry, int wait);
extern int __cpu_disable(void);
extern void __cpu_die(unsigned int cpu);
extern void prefill_possible_map(void);
+ extern unsigned num_processors;
+ extern unsigned disabled_cpus;
#endif /* !ASSEMBLY */
}
#endif
+#ifdef CONFIG_SMP
+#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu]
+#else
+#define cpu_physical_id(cpu) boot_cpu_id
+#endif
+
#endif
/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low two bits) */
#define __GFP_DMA ((__force gfp_t)0x01u)
#define __GFP_HIGHMEM ((__force gfp_t)0x02u)
+ #ifdef CONFIG_DMA_IS_DMA32
+ #define __GFP_DMA32 ((__force gfp_t)0x01) /* ZONE_DMA is ZONE_DMA32 */
+ #elif BITS_PER_LONG < 64
+ #define __GFP_DMA32 ((__force gfp_t)0x00) /* ZONE_NORMAL is ZONE_DMA32 */
+ #else
+ #define __GFP_DMA32 ((__force gfp_t)0x04) /* Has own ZONE_DMA32 */
+ #endif
/*
* Action modifiers - doesn't change the zoning
#define __GFP_COMP ((__force gfp_t)0x4000u)/* Add compound page metadata */
#define __GFP_ZERO ((__force gfp_t)0x8000u)/* Return zeroed page on success */
#define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */
-#define __GFP_NORECLAIM ((__force gfp_t)0x20000u) /* No realy zone reclaim during allocation */
-#define __GFP_HARDWALL ((__force gfp_t)0x40000u) /* Enforce hardwall cpuset memory allocs */
+#define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */
#define __GFP_BITS_SHIFT 20 /* Room for 20 __GFP_FOO bits */
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
#define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \
__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \
__GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \
- __GFP_NOMEMALLOC|__GFP_NORECLAIM|__GFP_HARDWALL)
+ __GFP_NOMEMALLOC|__GFP_HARDWALL)
#define GFP_ATOMIC (__GFP_HIGH)
#define GFP_NOIO (__GFP_WAIT)
#define GFP_DMA __GFP_DMA
+ /* 4GB DMA on some platforms */
+ #define GFP_DMA32 __GFP_DMA32
+
+
#define gfp_zone(mask) ((__force int)((mask) & (__force gfp_t)GFP_ZONEMASK))
/*
struct mmu_gather;
struct inode;
- #ifdef ARCH_HAS_ATOMIC_UNSIGNED
- typedef unsigned page_flags_t;
- #else
- typedef unsigned long page_flags_t;
- #endif
-
/*
* Each physical page in the system has a struct page associated with
* it to keep track of whatever it is we are using the page for at the
* a page.
*/
struct page {
- page_flags_t flags; /* Atomic flags, some possibly
+ unsigned long flags; /* Atomic flags, some possibly
* updated asynchronously */
atomic_t _count; /* Usage count, see below. */
atomic_t _mapcount; /* Count of ptes mapped in mms,
#endif
/* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */
- #define SECTIONS_PGOFF ((sizeof(page_flags_t)*8) - SECTIONS_WIDTH)
+ #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH)
#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
* turning readahead off */
int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
- unsigned long offset, unsigned long nr_to_read);
+ pgoff_t offset, unsigned long nr_to_read);
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
- unsigned long offset, unsigned long nr_to_read);
-unsigned long page_cache_readahead(struct address_space *mapping,
+ pgoff_t offset, unsigned long nr_to_read);
+unsigned long page_cache_readahead(struct address_space *mapping,
struct file_ra_state *ra,
struct file *filp,
- unsigned long offset,
+ pgoff_t offset,
unsigned long size);
void handle_ra_miss(struct address_space *mapping,
struct file_ra_state *ra, pgoff_t offset);
#endif
#define ZONE_DMA 0
- #define ZONE_NORMAL 1
- #define ZONE_HIGHMEM 2
+ #define ZONE_DMA32 1
+ #define ZONE_NORMAL 2
+ #define ZONE_HIGHMEM 3
- #define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */
+ #define MAX_NR_ZONES 4 /* Sync this with ZONES_SHIFT */
#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */
/*
* On machines where it is needed (eg PCs) we divide physical memory
- * into multiple physical zones. On a PC we have 3 zones:
+ * into multiple physical zones. On a PC we have 4 zones:
*
* ZONE_DMA < 16 MB ISA DMA capable memory
+ * ZONE_DMA32 0 MB Empty
* ZONE_NORMAL 16-896 MB direct mapped by the kernel
* ZONE_HIGHMEM > 896 MB only page cache and user processes
*/
void build_all_zonelists(void);
void wakeup_kswapd(struct zone *zone, int order);
int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- int alloc_type, int can_try_harder, gfp_t gfp_high);
+ int classzone_idx, int alloc_flags);
#ifdef CONFIG_HAVE_MEMORY_PRESENT
void memory_present(int nid, unsigned long start, unsigned long end);
#include <linux/topology.h>
/* Returns the number of the current Node. */
+ #ifndef numa_node_id
#define numa_node_id() (cpu_to_node(raw_smp_processor_id()))
+ #endif
#ifndef CONFIG_NEED_MULTIPLE_NODES
#include <asm/sparsemem.h>
#endif
- #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED)
+ #if BITS_PER_LONG == 32
/*
- * with 32 bit page->flags field, we reserve 8 bits for node/zone info.
- * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes.
+ * with 32 bit page->flags field, we reserve 9 bits for node/zone info.
+ * there are 4 zones (3 bits) and this leaves 9-3=6 bits for nodes.
*/
- #define FLAGS_RESERVED 8
+ #define FLAGS_RESERVED 9
#elif BITS_PER_LONG == 64
/*
* NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
* HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
* HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
+ *
+ * TBD: should special case ZONE_DMA32 machines here - in those we normally
+ * don't need any ZONE_NORMAL reservation
*/
- int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
+ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
EXPORT_SYMBOL(totalram_pages);
-EXPORT_SYMBOL(nr_swap_pages);
/*
* Used by page_zone() to look up the address of the struct zone whose
struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
EXPORT_SYMBOL(zone_table);
- static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
+ static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
int min_free_kbytes = 1024;
unsigned long __initdata nr_kernel_pages;
printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
function, current->comm, page);
printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
- (int)(2*sizeof(page_flags_t)), (unsigned long)page->flags,
+ (int)(2*sizeof(unsigned long)), (unsigned long)page->flags,
page->mapping, page_mapcount(page), page_count(page));
printk(KERN_EMERG "Backtrace:\n");
dump_stack();
}
local_irq_restore(flags);
put_cpu();
- }
-
- if (page == NULL) {
+ } else {
spin_lock_irqsave(&zone->lock, flags);
page = __rmqueue(zone, order);
spin_unlock_irqrestore(&zone->lock, flags);
return page;
}
+#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
+#define ALLOC_HARDER 0x02 /* try to alloc harder */
+#define ALLOC_HIGH 0x04 /* __GFP_HIGH set */
+#define ALLOC_CPUSET 0x08 /* check for correct cpuset */
+
/*
* Return 1 if free pages are above 'mark'. This takes into account the order
* of the allocation.
*/
int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int can_try_harder, gfp_t gfp_high)
+ int classzone_idx, int alloc_flags)
{
/* free_pages my go negative - that's OK */
long min = mark, free_pages = z->free_pages - (1 << order) + 1;
int o;
- if (gfp_high)
+ if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
- if (can_try_harder)
+ if (alloc_flags & ALLOC_HARDER)
min -= min / 4;
if (free_pages <= min + z->lowmem_reserve[classzone_idx])
return 1;
}
-static inline int
-should_reclaim_zone(struct zone *z, gfp_t gfp_mask)
+/*
+ * get_page_from_freeliest goes through the zonelist trying to allocate
+ * a page.
+ */
+static struct page *
+get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, int alloc_flags)
{
- if (!z->reclaim_pages)
- return 0;
- if (gfp_mask & __GFP_NORECLAIM)
- return 0;
- return 1;
+ struct zone **z = zonelist->zones;
+ struct page *page = NULL;
+ int classzone_idx = zone_idx(*z);
+
+ /*
+ * Go through the zonelist once, looking for a zone with enough free.
+ * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+ */
+ do {
+ if ((alloc_flags & ALLOC_CPUSET) &&
+ !cpuset_zone_allowed(*z, gfp_mask))
+ continue;
+
+ if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
+ if (!zone_watermark_ok(*z, order, (*z)->pages_low,
+ classzone_idx, alloc_flags))
+ continue;
+ }
+
+ page = buffered_rmqueue(*z, order, gfp_mask);
+ if (page) {
+ zone_statistics(zonelist, *z);
+ break;
+ }
+ } while (*(++z) != NULL);
+ return page;
}
/*
struct zonelist *zonelist)
{
const gfp_t wait = gfp_mask & __GFP_WAIT;
- struct zone **zones, *z;
+ struct zone **z;
struct page *page;
struct reclaim_state reclaim_state;
struct task_struct *p = current;
- int i;
- int classzone_idx;
int do_retry;
- int can_try_harder;
+ int alloc_flags;
int did_some_progress;
might_sleep_if(wait);
- /*
- * The caller may dip into page reserves a bit more if the caller
- * cannot run direct reclaim, or is the caller has realtime scheduling
- * policy
- */
- can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
+ z = zonelist->zones; /* the list of zones suitable for gfp_mask */
- zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
-
- if (unlikely(zones[0] == NULL)) {
+ if (unlikely(*z == NULL)) {
/* Should this ever happen?? */
return NULL;
}
+restart:
+ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+ zonelist, ALLOC_CPUSET);
+ if (page)
+ goto got_pg;
- classzone_idx = zone_idx(zones[0]);
+ do
+ wakeup_kswapd(*z, order);
+ while (*(++z));
-restart:
/*
- * Go through the zonelist once, looking for a zone with enough free.
- * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+ * OK, we're below the kswapd watermark and have kicked background
+ * reclaim. Now things get more complex, so set up alloc_flags according
+ * to how we want to proceed.
+ *
+ * The caller may dip into page reserves a bit more if the caller
+ * cannot run direct reclaim, or if the caller has realtime scheduling
+ * policy.
*/
- for (i = 0; (z = zones[i]) != NULL; i++) {
- int do_reclaim = should_reclaim_zone(z, gfp_mask);
-
- if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
- continue;
-
- /*
- * If the zone is to attempt early page reclaim then this loop
- * will try to reclaim pages and check the watermark a second
- * time before giving up and falling back to the next zone.
- */
-zone_reclaim_retry:
- if (!zone_watermark_ok(z, order, z->pages_low,
- classzone_idx, 0, 0)) {
- if (!do_reclaim)
- continue;
- else {
- zone_reclaim(z, gfp_mask, order);
- /* Only try reclaim once */
- do_reclaim = 0;
- goto zone_reclaim_retry;
- }
- }
-
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
-
- for (i = 0; (z = zones[i]) != NULL; i++)
- wakeup_kswapd(z, order);
+ alloc_flags = 0;
+ if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
+ alloc_flags |= ALLOC_HARDER;
+ if (gfp_mask & __GFP_HIGH)
+ alloc_flags |= ALLOC_HIGH;
+ if (wait)
+ alloc_flags |= ALLOC_CPUSET;
/*
* Go through the zonelist again. Let __GFP_HIGH and allocations
- * coming from realtime tasks to go deeper into reserves
+ * coming from realtime tasks go deeper into reserves.
*
* This is the last chance, in general, before the goto nopage.
* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
- for (i = 0; (z = zones[i]) != NULL; i++) {
- if (!zone_watermark_ok(z, order, z->pages_min,
- classzone_idx, can_try_harder,
- gfp_mask & __GFP_HIGH))
- continue;
-
- if (wait && !cpuset_zone_allowed(z, gfp_mask))
- continue;
-
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
+ page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
+ if (page)
+ goto got_pg;
/* This allocation should allow future memory freeing. */
if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
&& !in_interrupt()) {
if (!(gfp_mask & __GFP_NOMEMALLOC)) {
+nofail_alloc:
/* go through the zonelist yet again, ignoring mins */
- for (i = 0; (z = zones[i]) != NULL; i++) {
- if (!cpuset_zone_allowed(z, gfp_mask))
- continue;
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
+ page = get_page_from_freelist(gfp_mask, order,
+ zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET);
+ if (page)
+ goto got_pg;
+ if (gfp_mask & __GFP_NOFAIL) {
+ blk_congestion_wait(WRITE, HZ/50);
+ goto nofail_alloc;
}
}
goto nopage;
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- did_some_progress = try_to_free_pages(zones, gfp_mask);
+ did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC;
cond_resched();
if (likely(did_some_progress)) {
- for (i = 0; (z = zones[i]) != NULL; i++) {
- if (!zone_watermark_ok(z, order, z->pages_min,
- classzone_idx, can_try_harder,
- gfp_mask & __GFP_HIGH))
- continue;
-
- if (!cpuset_zone_allowed(z, gfp_mask))
- continue;
-
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
+ page = get_page_from_freelist(gfp_mask, order,
+ zonelist, alloc_flags);
+ if (page)
+ goto got_pg;
} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
/*
* Go through the zonelist yet one more time, keep
* a parallel oom killing, we must fail if we're still
* under heavy pressure.
*/
- for (i = 0; (z = zones[i]) != NULL; i++) {
- if (!zone_watermark_ok(z, order, z->pages_high,
- classzone_idx, 0, 0))
- continue;
-
- if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
- continue;
-
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
+ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+ zonelist, ALLOC_CPUSET);
+ if (page)
+ goto got_pg;
out_of_memory(gfp_mask, order);
goto restart;
dump_stack();
show_mem();
}
- return NULL;
got_pg:
- zone_statistics(zonelist, z);
return page;
}
} else
printk("\n");
- for_each_cpu(cpu) {
+ for_each_online_cpu(cpu) {
struct per_cpu_pageset *pageset;
pageset = zone_pcp(zone, cpu);
zone = pgdat->node_zones + ZONE_NORMAL;
if (zone->present_pages)
zonelist->zones[j++] = zone;
+ case ZONE_DMA32:
+ zone = pgdat->node_zones + ZONE_DMA32;
+ if (zone->present_pages)
+ zonelist->zones[j++] = zone;
case ZONE_DMA:
zone = pgdat->node_zones + ZONE_DMA;
if (zone->present_pages)
int res = ZONE_NORMAL;
if (zone_bits & (__force int)__GFP_HIGHMEM)
res = ZONE_HIGHMEM;
+ if (zone_bits & (__force int)__GFP_DMA32)
+ res = ZONE_DMA32;
if (zone_bits & (__force int)__GFP_DMA)
res = ZONE_DMA;
return res;
if (process_zones(cpu))
ret = NOTIFY_BAD;
break;
- #ifdef CONFIG_HOTPLUG_CPU
+ case CPU_UP_CANCELED:
case CPU_DEAD:
free_zone_pagesets(cpu);
break;
- #endif
default:
break;
}
if (zholes_size)
realsize -= zholes_size[j];
- if (j == ZONE_DMA || j == ZONE_NORMAL)
+ if (j < ZONE_HIGHMEM)
nr_kernel_pages += realsize;
nr_all_pages += realsize;
}
for_each_zone(zone) {
+ unsigned long tmp;
spin_lock_irqsave(&zone->lru_lock, flags);
+ tmp = (pages_min * zone->present_pages) / lowmem_pages;
if (is_highmem(zone)) {
/*
- * Often, highmem doesn't need to reserve any pages.
- * But the pages_min/low/high values are also used for
- * batching up page reclaim activity so we need a
- * decent value here.
+ * __GFP_HIGH and PF_MEMALLOC allocations usually don't
+ * need highmem pages, so cap pages_min to a small
+ * value here.
+ *
+ * The (pages_high-pages_low) and (pages_low-pages_min)
+ * deltas controls asynch page reclaim, and so should
+ * not be capped for highmem.
*/
int min_pages;
min_pages = 128;
zone->pages_min = min_pages;
} else {
- /* if it's a lowmem zone, reserve a number of pages
+ /*
+ * If it's a lowmem zone, reserve a number of pages
* proportionate to the zone's size.
*/
- zone->pages_min = (pages_min * zone->present_pages) /
- lowmem_pages;
+ zone->pages_min = tmp;
}
- /*
- * When interpreting these watermarks, just keep in mind that:
- * zone->pages_min == (zone->pages_min * 4) / 4;
- */
- zone->pages_low = (zone->pages_min * 5) / 4;
- zone->pages_high = (zone->pages_min * 6) / 4;
+ zone->pages_low = zone->pages_min + tmp / 4;
+ zone->pages_high = zone->pages_min + tmp / 2;
spin_unlock_irqrestore(&zone->lru_lock, flags);
}
}