[linux-block.git] / arch / x86_64 / mm / fault.c

/*
 *  linux/arch/x86-64/mm/fault.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/kprobes.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm/kdebug.h>
#include <asm-generic/sections.h>

/*
 * Page fault error code bits.
 *
 * These name the individual bits of the error_code argument that
 * do_page_fault() receives.
 */
#define PF_PROT	(1<<0)		/* set: protection fault on a present page; clear: no page found */
#define PF_WRITE	(1<<1)		/* set: write access; clear: read access */
#define PF_USER	(1<<2)		/* set: fault came from user mode; clear: kernel mode */
#define PF_RSVD	(1<<3)		/* do_page_fault() reports this as a corrupted page table */
#define PF_INSTR	(1<<4)		/* fault was an instruction fetch (exec fault) */

/* Notifier chain invoked from do_page_fault() via notify_page_fault(). */
static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);

/*
 * Hook to register for page fault notifications.
 *
 * @nb: notifier block to add to the page fault chain
 *
 * vmalloc_sync_all() is run before the block is added.
 * NOTE(review): presumably this is so that a notifier living in
 * vmalloc'ed (module) memory cannot itself take a vmalloc fault while
 * the chain runs from the page fault handler -- confirm.
 *
 * Returns whatever atomic_notifier_chain_register() returns.
 */
int register_page_fault_notifier(struct notifier_block *nb)
{
	vmalloc_sync_all();
	return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
}
EXPORT_SYMBOL_GPL(register_page_fault_notifier);

/*
 * Remove a notifier block from the page fault notification chain.
 *
 * @nb: notifier block previously passed to register_page_fault_notifier()
 *
 * Returns whatever atomic_notifier_chain_unregister() returns.
 */
int unregister_page_fault_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
}
EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);

/*
 * Run the page fault notifier chain.
 *
 * The arguments are packed into a struct die_args and handed to every
 * registered notifier together with @val.
 *
 * @val:  event identifier passed to the notifiers
 * @str:  description string stored in die_args.str
 * @regs: register state at the time of the fault
 * @err:  fault error code
 * @trap: trap number
 * @sig:  signal number associated with the fault
 *
 * Returns the result of atomic_notifier_call_chain(); do_page_fault()
 * compares it against NOTIFY_STOP.
 */
static inline int notify_page_fault(enum die_val val, const char *str,
			struct pt_regs *regs, long err, int trap, int sig)
{
	/* Fields not named here are zero-initialized by the designated initializer. */
	struct die_args args = {
		.regs = regs,
		.str = str,
		.err = err,
		.trapnr = trap,
		.signr = sig
	};
	return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
}

void bust_spinlocks(int yes)
{
	int loglevel_save = console_loglevel;
	if (yes) {
		oops_in_progress = 1;
	} else {
#ifdef CONFIG_VT
		unblank_screen();
#endif
		oops_in_progress = 0;
		/*
		 * OK, the message is on the console.  Now we call printk()
		 * without oops_in_progress set so that printk will give klogd
		 * a poke.  Hold onto your hats...
		 */
		console_loglevel = 15;		/* NMI oopser may have shut the console up */
		printk(" ");
		console_loglevel = loglevel_save;
	}
}

/*
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore.
 * Opcode checker based on code by Richard Brunner.
 *
 * Reads the instruction bytes at the faulting RIP, skips over prefix
 * bytes, and reports whether the opcode that follows is a prefetch
 * (0x0F 0x0D or 0x0F 0x18).
 *
 * @regs:       register state at the time of the fault
 * @addr:       faulting address (not used by this function)
 * @error_code: page fault error code (PF_* bits)
 *
 * Returns 1 if the faulting instruction is a prefetch, 0 otherwise
 * (including when an instruction byte cannot be read).
 */
static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
				unsigned long error_code)
{ 
	unsigned char __user *instr;
	int scan_more = 1;
	int prefetch = 0; 
	unsigned char *max_instr;

	/* If it was an exec fault, ignore: the fetch itself faulted */
	if (error_code & PF_INSTR)
		return 0;
	
	instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
	/* Scan at most 15 bytes starting at the instruction pointer */
	max_instr = instr + 15;

	/* A user-mode fault whose RIP lies at or above TASK_SIZE is not examined */
	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) { 
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		/* Stop quietly if the instruction byte cannot be read */
		if (__get_user(opcode, (char __user *)instr))
			break; 

		/* Dispatch on the high nibble, refine with the low nibble */
		instr_hi = opcode & 0xf0; 
		instr_lo = opcode & 0x0f; 
		instr++;

		switch (instr_hi) { 
		case 0x20:
		case 0x30:
			/* Values 0x26,0x2E,0x36,0x3E are valid x86
			   prefixes.  In long mode, the CPU will signal
			   invalid opcode if some of these prefixes are
			   present so we will never get here anyway */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
			
		case 0x40:
			/* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
			   Need to figure out under what instruction mode the
			   instruction was issued ... */
			/* Could check the LDT for lm, but for now it's good
			   enough to assume that long mode only uses well known
			   segments or kernel. */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
			
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;		
		case 0xF0:
			/* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;			
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;
			/* Fetch the second opcode byte; give up if it is unreadable */
			if (__get_user(opcode, (char __user *)instr))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;			
		default:
			/* Neither a recognized prefix nor a two-byte opcode escape: stop */
			scan_more = 0;
			break;
		} 
	}
	return prefetch;
}

/*
 * Probe whether an unsigned long can be read at @p.
 *
 * Returns the result of __get_user(): zero when the read worked,
 * non-zero when it faulted.  The value that was read is thrown away.
 */
static int bad_address(void *p)
{
	unsigned long scratch;
	int faulted;

	faulted = __get_user(scratch, (unsigned long __user *)p);
	return faulted;
}

void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	asm("movq %%cr3,%0" : "=r" (pgd));

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); 
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret; 

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud))	goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd))	goto ret;	 

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte)); 
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}

/*
 * Warning text that is_errata93() prints the first time it applies its
 * workaround.  Each line carries its own KERN_ERR level prefix.
 */
static const char errata93_warning[] = 
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8. 
   A lot of BIOS that didn't get tested properly miss this. 
   The OS sees this as a page fault with the upper 32bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here. */

static int is_errata93(struct pt_regs *regs, unsigned long address) 
{
	static int warned;
	if (address != regs->rip)
		return 0;
	if ((address >> 32) != 0) 
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) || 
	    (address >= MODULES_VADDR && address <= MODULES_END)) { 
		if (!warned) {
			printk(errata93_warning); 		
			warned = 1;
		}
		regs->rip = address;
		return 1;
	}
	return 0;
} 

/*
 * Tell whether delivering @sig to @tsk would go unhandled.
 *
 * Returns 1 for the init task, 0 for a task being ptraced, and
 * otherwise 1 exactly when the task's action for @sig is SIG_IGN or
 * SIG_DFL.  do_page_fault() uses this to decide whether a segfault is
 * worth logging.
 */
int unhandled_signal(struct task_struct *tsk, int sig)
{
	if (is_init(tsk))
		return 1;

	if (tsk->ptrace & PT_PTRACED)
		return 0;

	if (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN)
		return 1;

	return tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL;
}

/*
 * Report a corrupted page table and kill the current task.
 *
 * Logs the task name and faulting address, dumps the page table entries
 * for @address, records the fault details in the task's thread struct,
 * prints the oops through __die() and finally calls do_exit(SIGKILL).
 *
 * @address:    faulting address
 * @regs:       register state at the time of the fault
 * @error_code: page fault error code
 */
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	struct task_struct *tsk = current;
	unsigned long flags;

	flags = oops_begin();

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       tsk->comm, address);
	dump_pagetable(address);

	/* Record the fault in the thread state before the oops is printed */
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;

	__die("Bad pagetable", regs, error_code);
	oops_end(flags);
	do_exit(SIGKILL);
}

/*
 * Handle a fault on the vmalloc area.
 *
 * Compares the faulting task's page tables against the reference kernel
 * tables (those reached through pgd_offset_k()) and copies the top-level
 * entry over when the task's table lacks it.
 *
 * This assumes no large pages in there.
 *
 * Returns 0 when the mapping exists in the reference tables and the
 * task's tables agree with it, -1 when the reference tables have no
 * mapping for @address.  Any disagreement below the top level is a BUG.
 */
static int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;
	pte_t *pte, *pte_k;

	/* Top level: the only level at which an entry may need copying. */
	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_k = pgd_offset_k(address);
	if (pgd_none(*pgd_k))
		return -1;
	if (!pgd_none(*pgd)) {
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k));
	} else {
		set_pgd(pgd, *pgd_k);
	}

	/* The lower tables are shared, so from here on any mismatch
	   is a bug. */
	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (pud_none(*pud_k))
		return -1;
	BUG_ON(pud_none(*pud) ||
	       pud_page_vaddr(*pud) != pud_page_vaddr(*pud_k));

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (pmd_none(*pmd_k))
		return -1;
	BUG_ON(pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_k));

	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;
	pte = pte_offset_kernel(pmd, address);

	/* Compare page frame numbers rather than using pte_page: the
	   mappings can point outside mem_map, and the NUMA hash lookup
	   cannot handle that. */
	BUG_ON(!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_k));

	return 0;
}

/* When nonzero, do_page_fault() logs a line for each fault on an address
   below TASK_SIZE64.  Set by the "pagefaulttrace" boot option. */
int page_fault_trace = 0;
/* When nonzero, do_page_fault() logs user-mode segfaults for which
   unhandled_signal() returns true. */
int exception_trace = 1;

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * @regs:       register state saved at the time of the fault
 * @error_code: page fault error code; see the PF_* bits
 *
 * Outline:
 *  - The faulting address is read from CR2.
 *  - Addresses at or above TASK_SIZE64 are never looked up in the mm.
 *    A not-present kernel-mode fault inside the vmalloc range is first
 *    offered to vmalloc_fault(); everything else is shown to the page
 *    fault notifiers and then takes the bad-area path without touching
 *    mmap_sem.
 *  - All other faults are shown to the notifiers, looked up in the mm
 *    under mmap_sem and handed to handle_mm_fault().
 *  - A bad access from user mode is answered with SIGSEGV (or SIGBUS).
 *    A bad access from kernel mode is resolved through the exception
 *    table or the CPU/BIOS bug workarounds, or else ends in an oops.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct * vma;
	unsigned long address;
	const struct exception_table_entry *fixup;
	int write;
	unsigned long flags;
	siginfo_t info;

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	__asm__("movq %%cr2,%0":"=r" (address));

	/* Default signal code; changed to SEGV_ACCERR once a vma is found */
	info.si_code = SEGV_MAPERR;


	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (PF_USER clear), and that the fault was not a protection
	 * or reserved-bit error (PF_PROT and PF_RSVD clear).
	 */
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		      ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) >= 0)
				return;
		}
		if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
						SIGSEGV) == NOTIFY_STOP)
			return;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

	if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
					SIGSEGV) == NOTIFY_STOP)
		return;

	/* Re-enable interrupts if they were enabled where the fault happened */
	if (likely(regs->eflags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(page_fault_trace))
		printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
		       regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

 again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->rip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	/* The address lies below the vma: only a grows-down vma may be extended */
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/* Allow userspace just enough access below the stack pointer
		 * to let the 'enter' instruction work.
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	info.si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
		default:	/* 3: write, present */
			/* fall through */
		case PF_WRITE:		/* write, not present */
			if (!(vma->vm_flags & VM_WRITE))
				goto bad_area;
			write++;
			break;
		case PF_PROT:		/* read, present */
			goto bad_area;
		case 0:			/* read, not present */
			if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
				goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	switch (handle_mm_fault(mm, vma, address, write)) {
	case VM_FAULT_MINOR:
		tsk->min_flt++;
		break;
	case VM_FAULT_MAJOR:
		tsk->maj_flt++;
		break;
	case VM_FAULT_SIGBUS:
		goto do_sigbus;
	default:
		goto out_of_memory;
	}

	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		if (is_prefetch(regs, address, error_code))
			return;

		/* Work around K8 erratum #100 K8 in compat mode
		   occasionally jumps to illegal addresses >4GB.  We
		   catch this here in the page fault handler because
		   these addresses are not reachable. Just detect this
		   case and return.  Any code segment in LDT is
		   compatibility mode. */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;

		if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
			printk(
		       "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
					tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
					tsk->comm, tsk->pid, address, regs->rip,
					regs->rsp, error_code);
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		info.si_signo = SIGSEGV;
		info.si_errno = 0;
		/* info.si_code has been set above */
		info.si_addr = (void __user *)address;
		force_sig_info(SIGSEGV, &info, tsk);
		return;
	}

no_context:

	/* Are we prepared to handle this kernel fault?  */
	fixup = search_exception_tables(regs->rip);
	if (fixup) {
		regs->rip = fixup->fixup;
		return;
	}

	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */

	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */

	flags = oops_begin();

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT,address);
	printk_address(regs->rip);
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	__die("Oops", regs, error_code);
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags);
	do_exit(SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	/* init is never killed here: yield and retry the fault instead */
	if (is_init(current)) {
		yield();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	info.si_signo = SIGBUS;
	info.si_errno = 0;
	info.si_code = BUS_ADRERR;
	info.si_addr = (void __user *)address;
	force_sig_info(SIGBUS, &info, tsk);
	return;
}

/* Held by vmalloc_sync_all() while it walks pgd_list. */
DEFINE_SPINLOCK(pgd_lock);
/* Head of the list of PGD pages; vmalloc_sync_all() follows it through
   page->index. */
struct page *pgd_list;

/*
 * Copy the vmalloc-range top-level (PGD) entries of the reference kernel
 * page table into every PGD page on pgd_list.
 *
 * Slots that have been synchronized are remembered in a static bitmap,
 * so repeated calls only revisit slots whose reference entry was still
 * empty the last time.
 */
void vmalloc_sync_all(void)
{
	/* Note that races in the updates of insync and start aren't 
	   problematic:
	   insync can only get set bits added, and updates to start are only
	   improving performance (without affecting correctness if undone). */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	/* One iteration per PGD slot covering the vmalloc range */
	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			/* Nothing to copy yet.  This also skips the advance of
			   'start' below, so the slot is looked at again on
			   the next call. */
			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			/* pgd_list is chained through page->index */
			for (page = pgd_list; page;
			     page = (struct page *)page->index) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				/* Fill in a missing entry; an entry that is
				   already there must match the reference. */
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		/* The leading slot is in sync: let later calls start after it */
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == 
				(__START_KERNEL & PGDIR_MASK)));
}

/*
 * Handler for the "pagefaulttrace" boot option: turns on
 * page_fault_trace.  The option's argument string is ignored.
 * Always returns 1.
 */
static int __init enable_pagefaulttrace(char *str)
{
	page_fault_trace = 1;
	return 1;
}
__setup("pagefaulttrace", enable_pagefaulttrace);
Commit	Line	Data
1da177e4 LT	1	/*
	2	* linux/arch/x86-64/mm/fault.c
	3	*
	4	* Copyright (C) 1995 Linus Torvalds
	5	* Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
	6	*/
	7
1da177e4 LT	8	#include <linux/signal.h>
	9	#include <linux/sched.h>
	10	#include <linux/kernel.h>
	11	#include <linux/errno.h>
	12	#include <linux/string.h>
	13	#include <linux/types.h>
	14	#include <linux/ptrace.h>
	15	#include <linux/mman.h>
	16	#include <linux/mm.h>
	17	#include <linux/smp.h>
	18	#include <linux/smp_lock.h>
	19	#include <linux/interrupt.h>
	20	#include <linux/init.h>
	21	#include <linux/tty.h>
	22	#include <linux/vt_kern.h> /* For unblank_screen() */
	23	#include <linux/compiler.h>
	24	#include <linux/module.h>
0f2fbdcb	25	#include <linux/kprobes.h>
1da177e4 LT	26
	27	#include <asm/system.h>
	28	#include <asm/uaccess.h>
	29	#include <asm/pgalloc.h>
	30	#include <asm/smp.h>
	31	#include <asm/tlbflush.h>
	32	#include <asm/proto.h>
	33	#include <asm/kdebug.h>
	34	#include <asm-generic/sections.h>
1da177e4	35
66c58156 AK	36	/* Page fault error code bits */
	37	#define PF_PROT (1<<0) /* or no page found */
	38	#define PF_WRITE (1<<1)
	39	#define PF_USER (1<<2)
	40	#define PF_RSVD (1<<3)
	41	#define PF_INSTR (1<<4)
	42
273819a2	43	static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
1bd858a5 AK	44
	45	/* Hook to register for page fault notifications */
	46	int register_page_fault_notifier(struct notifier_block *nb)
	47	{
	48	vmalloc_sync_all();
	49	return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
	50	}
273819a2	51	EXPORT_SYMBOL_GPL(register_page_fault_notifier);
1bd858a5 AK	52
	53	int unregister_page_fault_notifier(struct notifier_block *nb)
	54	{
	55	return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
	56	}
273819a2	57	EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
1bd858a5 AK	58
	59	static inline int notify_page_fault(enum die_val val, const char *str,
	60	struct pt_regs *regs, long err, int trap, int sig)
	61	{
	62	struct die_args args = {
	63	.regs = regs,
	64	.str = str,
	65	.err = err,
	66	.trapnr = trap,
	67	.signr = sig
	68	};
	69	return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
	70	}
1bd858a5	71
1da177e4 LT	72	void bust_spinlocks(int yes)
	73	{
	74	int loglevel_save = console_loglevel;
	75	if (yes) {
	76	oops_in_progress = 1;
	77	} else {
	78	#ifdef CONFIG_VT
	79	unblank_screen();
	80	#endif
	81	oops_in_progress = 0;
	82	/*
	83	* OK, the message is on the console. Now we call printk()
	84	* without oops_in_progress set so that printk will give klogd
	85	* a poke. Hold onto your hats...
	86	*/
	87	console_loglevel = 15; /* NMI oopser may have shut the console up */
	88	printk(" ");
	89	console_loglevel = loglevel_save;
	90	}
	91	}
	92
	93	/* Sometimes the CPU reports invalid exceptions on prefetch.
	94	Check that here and ignore.
	95	Opcode checker based on code by Richard Brunner */
	96	static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
	97	unsigned long error_code)
	98	{
dd2994f6	99	unsigned char __user *instr;
1da177e4 LT	100	int scan_more = 1;
1da177e4 LT	101	int prefetch = 0;
f1290ec9	102	unsigned char *max_instr;
1da177e4 LT	103
1da177e4 LT	104	/* If it was a exec fault ignore */
66c58156	105	if (error_code & PF_INSTR)
1da177e4 LT	106	return 0;
1da177e4 LT	107
dd2994f6	108	instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
f1290ec9	109	max_instr = instr + 15;
1da177e4	110
76381fee	111	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
1da177e4 LT	112	return 0;
	113
	114	while (scan_more && instr < max_instr) {
	115	unsigned char opcode;
	116	unsigned char instr_hi;
	117	unsigned char instr_lo;
	118
dd2994f6	119	if (__get_user(opcode, (char __user *)instr))
1da177e4 LT	120	break;
	121
	122	instr_hi = opcode & 0xf0;
	123	instr_lo = opcode & 0x0f;
	124	instr++;
	125
	126	switch (instr_hi) {
	127	case 0x20:
	128	case 0x30:
	129	/* Values 0x26,0x2E,0x36,0x3E are valid x86
	130	prefixes. In long mode, the CPU will signal
	131	invalid opcode if some of these prefixes are
	132	present so we will never get here anyway */
	133	scan_more = ((instr_lo & 7) == 0x6);
	134	break;
	135
	136	case 0x40:
	137	/* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
	138	Need to figure out under what instruction mode the
	139	instruction was issued ... */
	140	/* Could check the LDT for lm, but for now it's good
	141	enough to assume that long mode only uses well known
	142	segments or kernel. */
76381fee	143	scan_more = (!user_mode(regs)) \|\| (regs->cs == __USER_CS);
1da177e4 LT	144	break;
	145
	146	case 0x60:
	147	/* 0x64 thru 0x67 are valid prefixes in all modes. */
	148	scan_more = (instr_lo & 0xC) == 0x4;
	149	break;
	150	case 0xF0:
	151	/* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
	152	scan_more = !instr_lo \|\| (instr_lo>>1) == 1;
	153	break;
	154	case 0x00:
	155	/* Prefetch instruction is 0x0F0D or 0x0F18 */
	156	scan_more = 0;
dd2994f6	157	if (__get_user(opcode, (char __user *)instr))
1da177e4 LT	158	break;
	159	prefetch = (instr_lo == 0xF) &&
	160	(opcode == 0x0D \|\| opcode == 0x18);
	161	break;
	162	default:
	163	scan_more = 0;
	164	break;
	165	}
	166	}
	167	return prefetch;
	168	}
	169
	170	static int bad_address(void *p)
	171	{
	172	unsigned long dummy;
dd2994f6	173	return __get_user(dummy, (unsigned long __user *)p);
1da177e4 LT	174	}
	175
	176	void dump_pagetable(unsigned long address)
	177	{
	178	pgd_t *pgd;
	179	pud_t *pud;
	180	pmd_t *pmd;
	181	pte_t *pte;
	182
	183	asm("movq %%cr3,%0" : "=r" (pgd));
	184
	185	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	186	pgd += pgd_index(address);
1da177e4	187	if (bad_address(pgd)) goto bad;
d646bce4	188	printk("PGD %lx ", pgd_val(*pgd));
1da177e4 LT	189	if (!pgd_present(*pgd)) goto ret;
1da177e4 LT	190
d2ae5b5f	191	pud = pud_offset(pgd, address);
1da177e4 LT	192	if (bad_address(pud)) goto bad;
	193	printk("PUD %lx ", pud_val(*pud));
	194	if (!pud_present(*pud)) goto ret;
	195
	196	pmd = pmd_offset(pud, address);
	197	if (bad_address(pmd)) goto bad;
	198	printk("PMD %lx ", pmd_val(*pmd));
	199	if (!pmd_present(*pmd)) goto ret;
	200
	201	pte = pte_offset_kernel(pmd, address);
	202	if (bad_address(pte)) goto bad;
	203	printk("PTE %lx", pte_val(*pte));
	204	ret:
	205	printk("\n");
	206	return;
	207	bad:
	208	printk("BAD\n");
	209	}
	210
	211	static const char errata93_warning[] =
	212	KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
	213	KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
	214	KERN_ERR "******* Please consider a BIOS update.\n"
	215	KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
	216
	217	/* Workaround for K8 erratum #93 & buggy BIOS.
	218	BIOS SMM functions are required to use a specific workaround
	219	to avoid corruption of the 64bit RIP register on C stepping K8.
	220	A lot of BIOS that didn't get tested properly miss this.
	221	The OS sees this as a page fault with the upper 32bits of RIP cleared.
	222	Try to work around it here.
	223	Note we only handle faults in kernel here. */
	224
	225	static int is_errata93(struct pt_regs *regs, unsigned long address)
	226	{
	227	static int warned;
	228	if (address != regs->rip)
	229	return 0;
	230	if ((address >> 32) != 0)
	231	return 0;
	232	address \|= 0xffffffffUL << 32;
	233	if ((address >= (u64)_stext && address <= (u64)_etext) \|\|
	234	(address >= MODULES_VADDR && address <= MODULES_END)) {
	235	if (!warned) {
	236	printk(errata93_warning);
	237	warned = 1;
	238	}
	239	regs->rip = address;
	240	return 1;
	241	}
	242	return 0;
	243	}
	244
	245	int unhandled_signal(struct task_struct *tsk, int sig)
	246	{
f400e198	247	if (is_init(tsk))
1da177e4	248	return 1;
5e5ec104	249	if (tsk->ptrace & PT_PTRACED)
1da177e4 LT	250	return 0;
	251	return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) \|\|
	252	(tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
	253	}
	254
	255	static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
	256	unsigned long error_code)
	257	{
1209140c	258	unsigned long flags = oops_begin();
6e3f3617	259	struct task_struct *tsk;
1209140c	260
1da177e4 LT	261	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	262	current->comm, address);
	263	dump_pagetable(address);
6e3f3617 JB	264	tsk = current;
	265	tsk->thread.cr2 = address;
	266	tsk->thread.trap_no = 14;
	267	tsk->thread.error_code = error_code;
1da177e4	268	__die("Bad pagetable", regs, error_code);
1209140c	269	oops_end(flags);
1da177e4 LT	270	do_exit(SIGKILL);
	271	}
	272
	273	/*
f95190b2	274	* Handle a fault on the vmalloc area
3b9ba4d5 AK	275	*
3b9ba4d5 AK	276	* This assumes no large pages in there.
1da177e4 LT	277	*/
	278	static int vmalloc_fault(unsigned long address)
	279	{
	280	pgd_t pgd, pgd_ref;
	281	pud_t pud, pud_ref;
	282	pmd_t pmd, pmd_ref;
	283	pte_t pte, pte_ref;
	284
	285	/* Copy kernel mappings over when needed. This can also
	286	happen within a race in page table update. In the later
	287	case just flush. */
	288
	289	pgd = pgd_offset(current->mm ?: &init_mm, address);
	290	pgd_ref = pgd_offset_k(address);
	291	if (pgd_none(*pgd_ref))
	292	return -1;
	293	if (pgd_none(*pgd))
	294	set_pgd(pgd, *pgd_ref);
8c914cb7	295	else
46a82b2d	296	BUG_ON(pgd_page_vaddr(pgd) != pgd_page_vaddr(pgd_ref));
1da177e4 LT	297
	298	/* Below here mismatches are bugs because these lower tables
	299	are shared */
	300
	301	pud = pud_offset(pgd, address);
	302	pud_ref = pud_offset(pgd_ref, address);
	303	if (pud_none(*pud_ref))
	304	return -1;
46a82b2d	305	if (pud_none(pud) \|\| pud_page_vaddr(pud) != pud_page_vaddr(*pud_ref))
1da177e4 LT	306	BUG();
	307	pmd = pmd_offset(pud, address);
	308	pmd_ref = pmd_offset(pud_ref, address);
	309	if (pmd_none(*pmd_ref))
	310	return -1;
	311	if (pmd_none(pmd) \|\| pmd_page(pmd) != pmd_page(*pmd_ref))
	312	BUG();
	313	pte_ref = pte_offset_kernel(pmd_ref, address);
	314	if (!pte_present(*pte_ref))
	315	return -1;
	316	pte = pte_offset_kernel(pmd, address);
3b9ba4d5 AK	317	/* Don't use pte_page here, because the mappings can point
	318	outside mem_map, and the NUMA hash lookup cannot handle
	319	that. */
	320	if (!pte_present(pte) \|\| pte_pfn(pte) != pte_pfn(*pte_ref))
1da177e4	321	BUG();
1da177e4 LT	322	return 0;
	323	}
	324
	325	int page_fault_trace = 0;
	326	int exception_trace = 1;
	327
	328	/*
	329	* This routine handles page faults. It determines the address,
	330	* and the problem, and then passes it off to one of the appropriate
	331	* routines.
1da177e4	332	*/
0f2fbdcb PP	333	asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
0f2fbdcb PP	334	unsigned long error_code)
1da177e4 LT	335	{
	336	struct task_struct *tsk;
	337	struct mm_struct *mm;
	338	struct vm_area_struct * vma;
	339	unsigned long address;
	340	const struct exception_table_entry *fixup;
	341	int write;
1209140c	342	unsigned long flags;
1da177e4 LT	343	siginfo_t info;
1da177e4 LT	344
a9ba9a3b AV	345	tsk = current;
	346	mm = tsk->mm;
	347	prefetchw(&mm->mmap_sem);
	348
1da177e4 LT	349	/* get the address */
1da177e4 LT	350	__asm__("movq %%cr2,%0":"=r" (address));
1da177e4	351
1da177e4 LT	352	info.si_code = SEGV_MAPERR;
	353
	354
	355	/*
	356	* We fault-in kernel-space virtual memory on-demand. The
	357	* 'reference' page table is init_mm.pgd.
	358	*
	359	* NOTE! We MUST NOT take any locks for this case. We may
	360	* be in an interrupt or a critical region, and should
	361	* only copy the information from the master page table,
	362	* nothing more.
	363	*
	364	* This verifies that the fault happens in kernel space
	365	* (error_code & 4) == 0, and that the fault was not a
8b1bde93	366	* protection error (error_code & 9) == 0.
1da177e4	367	*/
84929801	368	if (unlikely(address >= TASK_SIZE64)) {
f95190b2 AK	369	/*
	370	* Don't check for the module range here: its PML4
	371	* is always initialized because it's shared with the main
	372	* kernel text. Only vmalloc may need PML4 syncups.
	373	*/
66c58156	374	if (!(error_code & (PF_RSVD\|PF_USER\|PF_PROT)) &&
f95190b2	375	((address >= VMALLOC_START && address < VMALLOC_END))) {
8c914cb7 JB	376	if (vmalloc_fault(address) >= 0)
8c914cb7 JB	377	return;
1da177e4	378	}
1bd858a5	379	if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
8c914cb7 JB	380	SIGSEGV) == NOTIFY_STOP)
8c914cb7 JB	381	return;
1da177e4 LT	382	/*
	383	* Don't take the mm semaphore here. If we fixup a prefetch
	384	* fault we could otherwise deadlock.
	385	*/
	386	goto bad_area_nosemaphore;
	387	}
	388
1bd858a5	389	if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
8c914cb7 JB	390	SIGSEGV) == NOTIFY_STOP)
	391	return;
	392
	393	if (likely(regs->eflags & X86_EFLAGS_IF))
	394	local_irq_enable();
	395
	396	if (unlikely(page_fault_trace))
	397	printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
	398	regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
	399
66c58156	400	if (unlikely(error_code & PF_RSVD))
1da177e4 LT	401	pgtable_bad(address, regs, error_code);
	402
	403	/*
	404	* If we're in an interrupt or have no user
	405	* context, we must not take the fault..
	406	*/
	407	if (unlikely(in_atomic() \|\| !mm))
	408	goto bad_area_nosemaphore;
	409
	410	again:
	411	/* When running in the kernel we expect faults to occur only to
	412	* addresses in user space. All other faults represent errors in the
	413	* kernel and should generate an OOPS. Unfortunatly, in the case of an
80f7228b	414	* erroneous fault occurring in a code path which already holds mmap_sem
1da177e4 LT	415	* we will deadlock attempting to validate the fault against the
	416	* address space. Luckily the kernel only validly references user
	417	* space from well defined areas of code, which are listed in the
	418	* exceptions table.
	419	*
	420	* As the vast majority of faults will be valid we will only perform
	421	* the source reference check when there is a possibilty of a deadlock.
	422	* Attempt to lock the address space, if we cannot we then validate the
	423	* source. If this is invalid we can skip the address space check,
	424	* thus avoiding the deadlock.
	425	*/
	426	if (!down_read_trylock(&mm->mmap_sem)) {
66c58156	427	if ((error_code & PF_USER) == 0 &&
1da177e4 LT	428	!search_exception_tables(regs->rip))
	429	goto bad_area_nosemaphore;
	430	down_read(&mm->mmap_sem);
	431	}
	432
	433	vma = find_vma(mm, address);
	434	if (!vma)
	435	goto bad_area;
	436	if (likely(vma->vm_start <= address))
	437	goto good_area;
	438	if (!(vma->vm_flags & VM_GROWSDOWN))
	439	goto bad_area;
	440	if (error_code & 4) {
03fdc2c2 CE	441	/* Allow userspace just enough access below the stack pointer
	442	* to let the 'enter' instruction work.
	443	*/
	444	if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
1da177e4 LT	445	goto bad_area;
	446	}
	447	if (expand_stack(vma, address))
	448	goto bad_area;
	449	/*
	450	* Ok, we have a good vm_area for this memory access, so
	451	* we can handle it..
	452	*/
	453	good_area:
	454	info.si_code = SEGV_ACCERR;
	455	write = 0;
66c58156	456	switch (error_code & (PF_PROT\|PF_WRITE)) {
1da177e4 LT	457	default: /* 3: write, present */
1da177e4 LT	458	/* fall through */
66c58156	459	case PF_WRITE: /* write, not present */
1da177e4 LT	460	if (!(vma->vm_flags & VM_WRITE))
	461	goto bad_area;
	462	write++;
	463	break;
66c58156	464	case PF_PROT: /* read, present */
1da177e4	465	goto bad_area;
66c58156	466	case 0: /* read, not present */
df67b3da	467	if (!(vma->vm_flags & (VM_READ \| VM_EXEC \| VM_WRITE)))
1da177e4 LT	468	goto bad_area;
	469	}
	470
	471	/*
	472	* If for any reason at all we couldn't handle the fault,
	473	* make sure we exit gracefully rather than endlessly redo
	474	* the fault.
	475	*/
	476	switch (handle_mm_fault(mm, vma, address, write)) {
96800216	477	case VM_FAULT_MINOR:
1da177e4 LT	478	tsk->min_flt++;
1da177e4 LT	479	break;
96800216	480	case VM_FAULT_MAJOR:
1da177e4 LT	481	tsk->maj_flt++;
1da177e4 LT	482	break;
96800216	483	case VM_FAULT_SIGBUS:
1da177e4 LT	484	goto do_sigbus;
	485	default:
	486	goto out_of_memory;
	487	}
	488
	489	up_read(&mm->mmap_sem);
	490	return;
	491
	492	/*
	493	* Something tried to access memory that isn't in our memory map..
	494	* Fix it, but check if it's kernel or user first..
	495	*/
	496	bad_area:
	497	up_read(&mm->mmap_sem);
	498
	499	bad_area_nosemaphore:
1da177e4	500	/* User mode accesses just cause a SIGSEGV */
66c58156	501	if (error_code & PF_USER) {
1da177e4 LT	502	if (is_prefetch(regs, address, error_code))
	503	return;
	504
	505	/* Work around K8 erratum #100 K8 in compat mode
	506	occasionally jumps to illegal addresses >4GB. We
	507	catch this here in the page fault handler because
	508	these addresses are not reachable. Just detect this
	509	case and return. Any code segment in LDT is
	510	compatibility mode. */
	511	if ((regs->cs == __USER32_CS \|\| (regs->cs & (1<<2))) &&
	512	(address >> 32))
	513	return;
	514
	515	if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
	516	printk(
	517	"%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
	518	tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
	519	tsk->comm, tsk->pid, address, regs->rip,
	520	regs->rsp, error_code);
	521	}
	522
	523	tsk->thread.cr2 = address;
	524	/* Kernel addresses are always protection faults */
	525	tsk->thread.error_code = error_code \| (address >= TASK_SIZE);
	526	tsk->thread.trap_no = 14;
	527	info.si_signo = SIGSEGV;
	528	info.si_errno = 0;
	529	/* info.si_code has been set above */
	530	info.si_addr = (void __user *)address;
	531	force_sig_info(SIGSEGV, &info, tsk);
	532	return;
	533	}
	534
	535	no_context:
	536
	537	/* Are we prepared to handle this kernel fault? */
	538	fixup = search_exception_tables(regs->rip);
	539	if (fixup) {
	540	regs->rip = fixup->fixup;
	541	return;
	542	}
	543
	544	/*
	545	* Hall of shame of CPU/BIOS bugs.
	546	*/
	547
	548	if (is_prefetch(regs, address, error_code))
	549	return;
	550
	551	if (is_errata93(regs, address))
	552	return;
	553
	554	/*
	555	* Oops. The kernel tried to access some bad page. We'll have to
	556	* terminate things with extreme prejudice.
	557	*/
	558
1209140c	559	flags = oops_begin();
1da177e4 LT	560
	561	if (address < PAGE_SIZE)
	562	printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	563	else
	564	printk(KERN_ALERT "Unable to handle kernel paging request");
	565	printk(" at %016lx RIP: \n" KERN_ALERT,address);
	566	printk_address(regs->rip);
1da177e4	567	dump_pagetable(address);
6e3f3617 JB	568	tsk->thread.cr2 = address;
	569	tsk->thread.trap_no = 14;
	570	tsk->thread.error_code = error_code;
1da177e4 LT	571	__die("Oops", regs, error_code);
	572	/* Executive summary in case the body of the oops scrolled away */
	573	printk(KERN_EMERG "CR2: %016lx\n", address);
1209140c	574	oops_end(flags);
1da177e4 LT	575	do_exit(SIGKILL);
	576
	577	/*
	578	* We ran out of memory, or some other thing happened to us that made
	579	* us unable to handle the page fault gracefully.
	580	*/
	581	out_of_memory:
	582	up_read(&mm->mmap_sem);
f400e198	583	if (is_init(current)) {
1da177e4 LT	584	yield();
	585	goto again;
	586	}
	587	printk("VM: killing process %s\n", tsk->comm);
	588	if (error_code & 4)
	589	do_exit(SIGKILL);
	590	goto no_context;
	591
	592	do_sigbus:
	593	up_read(&mm->mmap_sem);
	594
	595	/* Kernel mode? Handle exceptions or die */
66c58156	596	if (!(error_code & PF_USER))
1da177e4 LT	597	goto no_context;
	598
	599	tsk->thread.cr2 = address;
	600	tsk->thread.error_code = error_code;
	601	tsk->thread.trap_no = 14;
	602	info.si_signo = SIGBUS;
	603	info.si_errno = 0;
	604	info.si_code = BUS_ADRERR;
	605	info.si_addr = (void __user *)address;
	606	force_sig_info(SIGBUS, &info, tsk);
	607	return;
	608	}
9e43e1b7	609
8c914cb7 JB	610	DEFINE_SPINLOCK(pgd_lock);
	611	struct page *pgd_list;
	612
	613	void vmalloc_sync_all(void)
	614	{
	615	/* Note that races in the updates of insync and start aren't
	616	problematic:
	617	insync can only get set bits added, and updates to start are only
	618	improving performance (without affecting correctness if undone). */
	619	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	620	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	621	unsigned long address;
	622
	623	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
	624	if (!test_bit(pgd_index(address), insync)) {
	625	const pgd_t *pgd_ref = pgd_offset_k(address);
	626	struct page *page;
	627
	628	if (pgd_none(*pgd_ref))
	629	continue;
	630	spin_lock(&pgd_lock);
	631	for (page = pgd_list; page;
	632	page = (struct page *)page->index) {
	633	pgd_t *pgd;
	634	pgd = (pgd_t *)page_address(page) + pgd_index(address);
	635	if (pgd_none(*pgd))
	636	set_pgd(pgd, *pgd_ref);
	637	else
46a82b2d	638	BUG_ON(pgd_page_vaddr(pgd) != pgd_page_vaddr(pgd_ref));
8c914cb7 JB	639	}
	640	spin_unlock(&pgd_lock);
	641	set_bit(pgd_index(address), insync);
	642	}
	643	if (address == start)
	644	start = address + PGDIR_SIZE;
	645	}
	646	/* Check that there is no need to do the same for the modules area. */
	647	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	648	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
	649	(__START_KERNEL & PGDIR_MASK)));
	650	}
	651
9e43e1b7 AK	652	static int __init enable_pagefaulttrace(char *str)
	653	{
	654	page_fault_trace = 1;
9b41046c	655	return 1;
9e43e1b7 AK	656	}
9e43e1b7 AK	657	__setup("pagefaulttrace", enable_pagefaulttrace);