/*
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mmiotrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>	/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/bootmem.h>	/* for max_low_pfn */
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/magic.h>

#include <asm/system.h>
#include <asm/desc.h>
#include <asm/segment.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>
#include <asm/traps.h>

/*
 * Page fault error code bits
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)

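/*
 * Example decodings of the error code bits above: error_code == 0x6
 * (PF_USER|PF_WRITE) is a user-mode write to a not-present page,
 * 0x5 (PF_USER|PF_PROT) is a user-mode read that hit a protection
 * fault, and 0x0 is a kernel-mode read of a not-present page.
 */
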
static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
#ifdef CONFIG_MMIOTRACE
	if (unlikely(is_kmmio_active()))
		if (kmmio_handler(regs, addr) == 1)
			return -1;
#endif
	return 0;
}

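/*
 * Give kprobes a chance to claim the fault first: if a registered kprobe
 * fault handler fixes up the access (trap 14), do_page_fault() returns
 * without touching any mm state.
 */
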
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode_vm(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}

/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 */
static int is_prefetch(struct pt_regs *regs, unsigned long error_code,
		       unsigned long addr)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

	/*
	 * If it was an exec (instruction fetch) fault on an NX page, then
	 * do not ignore the fault:
	 */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;

			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}
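
/*
 * Example of what the scanner above recognizes: the AMD 3DNow!
 * PREFETCH/PREFETCHW instructions encode as 0F 0D /r and the SSE
 * prefetch family (prefetchnta/t0/t1/t2) as 0F 18 /r, so a spurious
 * fault reported on, say, "prefetchnta (%rax)" is detected and
 * silently ignored instead of becoming a SIGSEGV or an oops.
 */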

static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}

#ifdef CONFIG_X86_64
static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}
#endif

static void dump_pagetable(unsigned long address)
{
#ifdef CONFIG_X86_32
	__typeof__(pte_val(__pte(0))) page;

	page = read_cr3();
	page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
	printk("*pdpt = %016Lx ", page);
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && page & _PAGE_PRESENT) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
							 & (PTRS_PER_PMD - 1)];
		printk(KERN_CONT "*pde = %016Lx ", page);
		page &= ~_PAGE_NX;
	}
#else
	printk("*pde = %08lx ", page);
#endif

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already.
	 */
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && (page & _PAGE_PRESENT)
	    && !(page & _PAGE_PSE)) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
							 & (PTRS_PER_PTE - 1)];
		printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
	}

	printk("\n");
#else /* CONFIG_X86_64 */
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd))
		goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd))
		goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud))
		goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud) || pud_large(*pud))
		goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd))
		goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd))
		goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte))
		goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
#endif
}
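
/*
 * On 64-bit the dump above produces a single line such as
 * "PGD 1a0063 PUD 1a1063 PMD 0", stopping at the first level that is
 * not present (the values shown here are only illustrative).
 */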

#ifdef CONFIG_X86_32
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */

	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;
	if (!pmd_present(*pmd)) {
		set_pmd(pmd, *pmd_k);
		arch_flush_lazy_mmu_mode();
	} else
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
	return pmd_k;
}
#endif
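
/*
 * vmalloc_sync_one() copies the kernel PMD entry covering 'address'
 * from the reference page table (init_mm.pgd) into the given page
 * directory. It is used by vmalloc_fault() for the faulting context's
 * page directory and by vmalloc_sync_all() for every pgd on pgd_list.
 */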

#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/*
 * Workaround for K8 erratum #93 & buggy BIOS.
 * BIOS SMM functions are required to use a specific workaround
 * to avoid corruption of the 64-bit RIP register on C stepping K8.
 * A lot of BIOSes that didn't get tested properly miss this.
 * The OS sees this as a page fault with the upper 32 bits of RIP cleared.
 * Try to work around it here.
 * Note we only handle faults in kernel here.
 * Does nothing for X86_32.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	static int warned;
	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}

/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps to
 * illegal addresses >4GB. We catch this in the page fault handler because
 * these addresses are not reachable. Just detect this case and return.
 * Any code segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
	    (address >> 32))
		return 1;
#endif
	return 0;
}

static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	unsigned long nr;
	/*
	 * Pentium F0 0F C7 C8 bug workaround.
	 */
	if (boot_cpu_data.f00f_bug) {
		nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return 1;
		}
	}
#endif
	return 0;
}

static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
			    unsigned long address)
{
#ifdef CONFIG_X86_32
	if (!oops_may_print())
		return;
#endif

#ifdef CONFIG_X86_PAE
	if (error_code & PF_INSTR) {
		unsigned int level;
		pte_t *pte = lookup_address(address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			printk(KERN_CRIT "kernel tried to execute "
				"NX-protected page - exploit attempt? "
				"(uid: %d)\n", current_uid());
	}
#endif

	printk(KERN_ALERT "BUG: unable to handle kernel ");
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");
	printk(KERN_CONT " at %p\n", (void *) address);
	printk(KERN_ALERT "IP:");
	printk_address(regs->ip, 1);
	dump_pagetable(address);
}

#ifdef CONFIG_X86_64
static noinline void pgtable_bad(struct pt_regs *regs,
			unsigned long error_code, unsigned long address)
{
	unsigned long flags = oops_begin();
	int sig = SIGKILL;
	struct task_struct *tsk = current;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       tsk->comm, address);
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		sig = 0;
	oops_end(flags, regs, sig);
}
#endif

static noinline void no_context(struct pt_regs *regs,
			unsigned long error_code, unsigned long address)
{
	struct task_struct *tsk = current;
	unsigned long *stackend;

#ifdef CONFIG_X86_64
	unsigned long flags;
	int sig;
#endif

	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;

	/*
	 * X86_32
	 * Valid to do another page fault here, because if this fault
	 * had been triggered by is_prefetch fixup_exception would have
	 * handled it.
	 *
	 * X86_64
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, error_code, address))
		return;

	if (is_errata93(regs, address))
		return;

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */
#ifdef CONFIG_X86_32
	bust_spinlocks(1);
#else
	flags = oops_begin();
#endif

	show_fault_oops(regs, error_code, address);

	stackend = end_of_stack(tsk);
	if (*stackend != STACK_END_MAGIC)
		printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");

	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;

#ifdef CONFIG_X86_32
	die("Oops", regs, error_code);
	bust_spinlocks(0);
	do_exit(SIGKILL);
#else
	sig = SIGKILL;
	if (__die("Oops", regs, error_code))
		sig = 0;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, sig);
#endif
}

static void __bad_area_nosemaphore(struct pt_regs *regs,
			unsigned long error_code, unsigned long address,
			int si_code)
{
	struct task_struct *tsk = current;

	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		/*
		 * Valid to do another page fault here because this one came
		 * from user space.
		 */
		if (is_prefetch(regs, error_code, address))
			return;

		if (is_errata100(regs, address))
			return;

		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
			"%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
			task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
			tsk->comm, task_pid_nr(tsk), address,
			(void *) regs->ip, (void *) regs->sp, error_code);
			print_vma_addr(" in ", regs->ip);
			printk("\n");
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}

	if (is_f00f_bug(regs, address))
		return;

	no_context(regs, error_code, address);
}

static noinline void bad_area_nosemaphore(struct pt_regs *regs,
			unsigned long error_code, unsigned long address)
{
	__bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
}

static void __bad_area(struct pt_regs *regs,
			unsigned long error_code, unsigned long address,
			int si_code)
{
	struct mm_struct *mm = current->mm;

	/*
	 * Something tried to access memory that isn't in our memory map.
	 * Fix it, but check if it's kernel or user first.
	 */
	up_read(&mm->mmap_sem);

	__bad_area_nosemaphore(regs, error_code, address, si_code);
}

static noinline void bad_area(struct pt_regs *regs,
			unsigned long error_code, unsigned long address)
{
	__bad_area(regs, error_code, address, SEGV_MAPERR);
}

static noinline void bad_area_access_error(struct pt_regs *regs,
			unsigned long error_code, unsigned long address)
{
	__bad_area(regs, error_code, address, SEGV_ACCERR);
}

/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
static void out_of_memory(struct pt_regs *regs,
			unsigned long error_code, unsigned long address)
{
	/*
	 * We ran out of memory, call the OOM killer, and return to userspace
	 * (which will retry the fault, or kill us if we got oom-killed).
	 */
	up_read(&current->mm->mmap_sem);
	pagefault_out_of_memory();
}

static void do_sigbus(struct pt_regs *regs,
			unsigned long error_code, unsigned long address)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;

	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		no_context(regs, error_code, address);
#ifdef CONFIG_X86_32
	/* User space => ok to do another page fault */
	if (is_prefetch(regs, error_code, address))
		return;
#endif
	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}

static noinline void mm_fault_error(struct pt_regs *regs,
		unsigned long error_code, unsigned long address, unsigned int fault)
{
	if (fault & VM_FAULT_OOM)
		out_of_memory(regs, error_code, address);
	else if (fault & VM_FAULT_SIGBUS)
		do_sigbus(regs, error_code, address);
	else
		BUG();
}

static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
	if ((error_code & PF_WRITE) && !pte_write(*pte))
		return 0;
	if ((error_code & PF_INSTR) && !pte_exec(*pte))
		return 0;

	return 1;
}

/*
 * Handle a spurious fault caused by a stale TLB entry. This allows
 * us to lazily refresh the TLB when increasing the permissions of a
 * kernel page (RO -> RW or NX -> X). Doing it eagerly is very
 * expensive since that implies doing a full cross-processor TLB
 * flush, even if no stale TLB entries exist on other processors.
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 */
static noinline int spurious_fault(unsigned long error_code,
				unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Reserved-bit violation or user access to kernel space? */
	if (error_code & (PF_USER | PF_RSVD))
		return 0;

	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return 0;

	if (pud_large(*pud))
		return spurious_fault_check(error_code, (pte_t *) pud);

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return spurious_fault_check(error_code, (pte_t *) pmd);

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;

	return spurious_fault_check(error_code, pte);
}
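
/*
 * Example of a spurious fault: a kernel mapping is changed from RO to RW
 * (e.g. by the page-attribute code) without a global TLB flush; a CPU
 * that still caches the old RO entry then faults on a write. The page
 * tables already permit the access, so spurious_fault() returns 1, the
 * handler returns, and the access is retried with a freshly walked entry.
 */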

/*
 * X86_32
 * Handle a fault on the vmalloc or module mapping area
 *
 * X86_64
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static noinline int vmalloc_fault(unsigned long address)
{
#ifdef CONFIG_X86_32
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;

	/* Make sure we are in vmalloc area */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;
	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;
	return 0;
#else
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Make sure we are in vmalloc area */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/*
	 * Copy kernel mappings over when needed. This can also
	 * happen within a race in page table update. In the latter
	 * case just flush.
	 */

	pgd = pgd_offset(current->active_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/*
	 * Below here mismatches are bugs because these lower tables
	 * are shared.
	 */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/*
	 * Don't use pte_page here, because the mappings can point
	 * outside mem_map, and the NUMA hash lookup cannot handle
	 * that.
	 */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
#endif
}
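
/*
 * Typical vmalloc_fault() scenario: a driver vmalloc()s or ioremap()s a
 * region, which installs the new mappings only in the reference page
 * table (init_mm). The first touch from a context whose page directory
 * has not yet picked up the new top-level entry faults here, and the
 * missing entry is copied over without taking any locks.
 */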

int show_unhandled_signals = 1;

static inline int access_error(unsigned long error_code, int write,
			       struct vm_area_struct *vma)
{
	if (write) {
		/* write, present and write, not present */
		if (unlikely(!(vma->vm_flags & VM_WRITE)))
			return 1;
	} else if (unlikely(error_code & PF_PROT)) {
		/* read, present */
		return 1;
	} else {
		/* read, not present */
		if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
			return 1;
	}

	return 0;
}
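
/*
 * For example, a store to a PROT_READ-only mapping arrives with PF_WRITE
 * set while the vma lacks VM_WRITE, so access_error() returns 1 and the
 * fault is reported to the task as SIGSEGV with si_code SEGV_ACCERR.
 */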

static int fault_in_kernel_space(unsigned long address)
{
#ifdef CONFIG_X86_32
	return address >= TASK_SIZE;
#else /* !CONFIG_X86_32 */
	return address >= TASK_SIZE64;
#endif /* CONFIG_X86_32 */
}
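
/*
 * With the usual 3G/1G split on 32-bit, TASK_SIZE is 0xC0000000; on
 * 64-bit, TASK_SIZE64 sits just below the top of the 47-bit user address
 * range. Anything at or above these boundaries is handled as a
 * kernel-space fault (the exact values depend on the configuration).
 */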

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
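/*
 * Handling order, as implemented below: kernel-space addresses are first
 * tried as vmalloc faults, then as spurious (stale-TLB) faults, then
 * offered to kprobes; anything left over is a bug. For user-space
 * addresses the fault is validated against the task's VMAs under
 * mmap_sem and passed to handle_mm_fault(), with the bad_area_*()
 * helpers generating signals for everything that does not resolve.
 */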
#ifdef CONFIG_X86_64
asmlinkage
#endif
void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	unsigned long address;
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int write;
	int fault;

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	if (unlikely(kmmio_fault(regs, address)))
		return;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
	if (unlikely(fault_in_kernel_space(address))) {
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    vmalloc_fault(address) >= 0)
			return;

		/* Can handle a stale RO->RW TLB */
		if (spurious_fault(error_code, address))
			return;

		/* kprobes don't want to hook the spurious faults. */
		if (notify_page_fault(regs))
			return;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		bad_area_nosemaphore(regs, error_code, address);
		return;
	}

	if (unlikely(notify_page_fault(regs)))
		return;
	/*
	 * It's safe to allow irq's after cr2 has been saved and the
	 * vmalloc fault has been handled.
	 *
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs)) {
		local_irq_enable();
		error_code |= PF_USER;
	} else if (regs->flags & X86_EFLAGS_IF)
		local_irq_enable();

#ifdef CONFIG_X86_64
	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(regs, error_code, address);
#endif

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm)) {
		bad_area_nosemaphore(regs, error_code, address);
		return;
	}

	/*
	 * When running in the kernel we expect faults to occur only to
	 * addresses in user space. All other faults represent errors in the
	 * kernel and should generate an OOPS. Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space. Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source. If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip)) {
			bad_area_nosemaphore(regs, error_code, address);
			return;
		}
		down_read(&mm->mmap_sem);
	} else {
		/*
		 * The above down_read_trylock() might have succeeded in which
		 * case we'll have missed the might_sleep() from down_read().
		 */
		might_sleep();
	}

	vma = find_vma(mm, address);
	if (unlikely(!vma)) {
		bad_area(regs, error_code, address);
		return;
	}
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
		bad_area(regs, error_code, address);
		return;
	}
	if (error_code & PF_USER) {
		/*
		 * Accessing the stack below %sp is always a bug.
		 * The large cushion allows instructions like enter
		 * and pusha to work. ("enter $65535,$31" pushes
		 * 32 pointers and then decrements %sp by 65535.)
		 */
		if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
			bad_area(regs, error_code, address);
			return;
		}
	}
	if (unlikely(expand_stack(vma, address))) {
		bad_area(regs, error_code, address);
		return;
	}

	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */
good_area:
	write = error_code & PF_WRITE;
	if (unlikely(access_error(error_code, write, vma))) {
		bad_area_access_error(regs, error_code, address);
		return;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		mm_fault_error(regs, error_code, address, fault);
		return;
	}
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;

#ifdef CONFIG_X86_32
	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
	 */
	if (v8086_mode(regs)) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}
#endif
	up_read(&mm->mmap_sem);
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

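/*
 * Sync all per-process page directories with the kernel's reference
 * page table for the whole vmalloc region. Callers use this so that
 * code which must not take a vmalloc fault (NMI or notifier paths,
 * for instance) can safely touch freshly vmalloc()ed memory on any CPU.
 */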
void vmalloc_sync_all(void)
{
	unsigned long address;

#ifdef CONFIG_X86_32
	if (SHARED_KERNEL_PMD)
		return;

	for (address = VMALLOC_START & PMD_MASK;
	     address >= TASK_SIZE && address < FIXADDR_TOP;
	     address += PMD_SIZE) {
		unsigned long flags;
		struct page *page;

		spin_lock_irqsave(&pgd_lock, flags);
		list_for_each_entry(page, &pgd_list, lru) {
			if (!vmalloc_sync_one(page_address(page),
					      address))
				break;
		}
		spin_unlock_irqrestore(&pgd_lock, flags);
	}
#else /* CONFIG_X86_64 */
	for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
	     address += PGDIR_SIZE) {
		const pgd_t *pgd_ref = pgd_offset_k(address);
		unsigned long flags;
		struct page *page;

		if (pgd_none(*pgd_ref))
			continue;
		spin_lock_irqsave(&pgd_lock, flags);
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			pgd = (pgd_t *)page_address(page) + pgd_index(address);
			if (pgd_none(*pgd))
				set_pgd(pgd, *pgd_ref);
			else
				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
		}
		spin_unlock_irqrestore(&pgd_lock, flags);
	}
#endif
}