arch/x86/mm/fault.c

/*
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mmiotrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>	/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/bootmem.h>	/* for max_low_pfn */
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/magic.h>

#include <asm/system.h>
#include <asm/desc.h>
#include <asm/segment.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>
#include <asm/traps.h>

/*
 * Page fault error code bits
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
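
/*
 * Illustrative example (editorial note, not part of the original source):
 * a user-mode write to an unmapped page arrives here with
 * error_code == (PF_USER | PF_WRITE) == 0x6; PF_PROT is clear because
 * no page was present at all.
 */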

static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
#ifdef CONFIG_MMIOTRACE
	if (unlikely(is_kmmio_active()))
		if (kmmio_handler(regs, addr) == 1)
			return -1;
#endif
	return 0;
}

static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode_vm(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}

/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 */
static int is_prefetch(struct pt_regs *regs, unsigned long error_code,
		       unsigned long addr)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

	/*
	 * If it was an exec (instruction fetch) fault on an NX page, then
	 * do not ignore the fault:
	 */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;

			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}

static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}

#ifdef CONFIG_X86_64
static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}
#endif

static void dump_pagetable(unsigned long address)
{
#ifdef CONFIG_X86_32
	__typeof__(pte_val(__pte(0))) page;

	page = read_cr3();
	page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
	printk("*pdpt = %016Lx ", page);
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && page & _PAGE_PRESENT) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
							 & (PTRS_PER_PMD - 1)];
		printk(KERN_CONT "*pde = %016Lx ", page);
		page &= ~_PAGE_NX;
	}
#else
	printk("*pde = %08lx ", page);
#endif

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already.
	 */
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && (page & _PAGE_PRESENT)
	    && !(page & _PAGE_PSE)) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
							 & (PTRS_PER_PTE - 1)];
		printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
	}

	printk("\n");
#else /* CONFIG_X86_64 */
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud) || pud_large(*pud))
		goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
#endif
}

#ifdef CONFIG_X86_32
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */

	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;
	if (!pmd_present(*pmd)) {
		set_pmd(pmd, *pmd_k);
		arch_flush_lazy_mmu_mode();
	} else
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
	return pmd_k;
}
#endif

#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here.
   Does nothing for X86_32
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	static int warned;
	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}

/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps to illegal
 * addresses >4GB. We catch this in the page fault handler because these
 * addresses are not reachable. Just detect this case and return. Any code
 * segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	/* A selector with bit 2 (the TI bit) set refers to the LDT. */
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
	    (address >> 32))
		return 1;
#endif
	return 0;
}

static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	unsigned long nr;
	/*
	 * Pentium F0 0F C7 C8 bug workaround.
	 */
	if (boot_cpu_data.f00f_bug) {
		nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return 1;
		}
	}
#endif
	return 0;
}

static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
			    unsigned long address)
{
#ifdef CONFIG_X86_32
	if (!oops_may_print())
		return;
#endif

#ifdef CONFIG_X86_PAE
	if (error_code & PF_INSTR) {
		unsigned int level;
		pte_t *pte = lookup_address(address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			printk(KERN_CRIT "kernel tried to execute "
				"NX-protected page - exploit attempt? "
				"(uid: %d)\n", current_uid());
	}
#endif

	printk(KERN_ALERT "BUG: unable to handle kernel ");
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");
	printk(KERN_CONT " at %p\n", (void *) address);
	printk(KERN_ALERT "IP:");
	printk_address(regs->ip, 1);
	dump_pagetable(address);
}

#ifdef CONFIG_X86_64
static noinline void pgtable_bad(struct pt_regs *regs,
			unsigned long error_code, unsigned long address)
{
	unsigned long flags = oops_begin();
	int sig = SIGKILL;
	struct task_struct *tsk = current;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       tsk->comm, address);
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		sig = 0;
	oops_end(flags, regs, sig);
}
#endif

static noinline void no_context(struct pt_regs *regs,
			unsigned long error_code, unsigned long address)
{
	struct task_struct *tsk = current;
	unsigned long *stackend;

#ifdef CONFIG_X86_64
	unsigned long flags;
	int sig;
#endif

	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;

	/*
	 * X86_32
	 * Valid to do another page fault here, because if this fault
	 * had been triggered by is_prefetch fixup_exception would have
	 * handled it.
	 *
	 * X86_64
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, error_code, address))
		return;

	if (is_errata93(regs, address))
		return;

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */
#ifdef CONFIG_X86_32
	bust_spinlocks(1);
#else
	flags = oops_begin();
#endif

	show_fault_oops(regs, error_code, address);

	stackend = end_of_stack(tsk);
	if (*stackend != STACK_END_MAGIC)
		printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");

	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;

#ifdef CONFIG_X86_32
	die("Oops", regs, error_code);
	bust_spinlocks(0);
	do_exit(SIGKILL);
#else
	sig = SIGKILL;
	if (__die("Oops", regs, error_code))
		sig = 0;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, sig);
#endif
}

static void __bad_area_nosemaphore(struct pt_regs *regs,
			unsigned long error_code, unsigned long address,
			int si_code)
{
	struct task_struct *tsk = current;

	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		/*
		 * Valid to do another page fault here because this one came
		 * from user space.
		 */
		if (is_prefetch(regs, error_code, address))
			return;

		if (is_errata100(regs, address))
			return;

		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
			"%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
			task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
			tsk->comm, task_pid_nr(tsk), address,
			(void *) regs->ip, (void *) regs->sp, error_code);
			print_vma_addr(" in ", regs->ip);
			printk("\n");
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}

	if (is_f00f_bug(regs, address))
		return;

	no_context(regs, error_code, address);
}

static noinline void bad_area_nosemaphore(struct pt_regs *regs,
			unsigned long error_code, unsigned long address)
{
	__bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
}

static void __bad_area(struct pt_regs *regs,
			unsigned long error_code, unsigned long address,
			int si_code)
{
	struct mm_struct *mm = current->mm;

	/*
	 * Something tried to access memory that isn't in our memory map..
	 * Fix it, but check if it's kernel or user first..
	 */
	up_read(&mm->mmap_sem);

	__bad_area_nosemaphore(regs, error_code, address, si_code);
}

static noinline void bad_area(struct pt_regs *regs,
			unsigned long error_code, unsigned long address)
{
	__bad_area(regs, error_code, address, SEGV_MAPERR);
}

static noinline void bad_area_access_error(struct pt_regs *regs,
			unsigned long error_code, unsigned long address)
{
	__bad_area(regs, error_code, address, SEGV_ACCERR);
}

/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
static void out_of_memory(struct pt_regs *regs,
			unsigned long error_code, unsigned long address)
{
	/*
	 * We ran out of memory, call the OOM killer, and return to userspace
	 * (which will retry the fault, or kill us if we got oom-killed).
	 */
	up_read(&current->mm->mmap_sem);
	pagefault_out_of_memory();
}

static void do_sigbus(struct pt_regs *regs,
			unsigned long error_code, unsigned long address)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;

	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		no_context(regs, error_code, address);
#ifdef CONFIG_X86_32
	/* User space => ok to do another page fault */
	if (is_prefetch(regs, error_code, address))
		return;
#endif
	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}

static noinline void mm_fault_error(struct pt_regs *regs,
		unsigned long error_code, unsigned long address, unsigned int fault)
{
	if (fault & VM_FAULT_OOM)
		out_of_memory(regs, error_code, address);
	else if (fault & VM_FAULT_SIGBUS)
		do_sigbus(regs, error_code, address);
	else
		BUG();
}

static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
	if ((error_code & PF_WRITE) && !pte_write(*pte))
		return 0;
	if ((error_code & PF_INSTR) && !pte_exec(*pte))
		return 0;

	return 1;
}

/*
 * Handle a spurious fault caused by a stale TLB entry. This allows
 * us to lazily refresh the TLB when increasing the permissions of a
 * kernel page (RO -> RW or NX -> X). Doing it eagerly is very
 * expensive since that implies doing a full cross-processor TLB
 * flush, even if no stale TLB entries exist on other processors.
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 */
static noinline int spurious_fault(unsigned long error_code,
			unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Reserved-bit violation or user access to kernel space? */
	if (error_code & (PF_USER | PF_RSVD))
		return 0;

	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return 0;

	if (pud_large(*pud))
		return spurious_fault_check(error_code, (pte_t *) pud);

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return spurious_fault_check(error_code, (pte_t *) pmd);

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;

	return spurious_fault_check(error_code, pte);
}

/*
 * X86_32
 * Handle a fault on the vmalloc or module mapping area
 *
 * X86_64
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static noinline int vmalloc_fault(unsigned long address)
{
#ifdef CONFIG_X86_32
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;

	/* Make sure we are in vmalloc area */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;
	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;
	return 0;
#else
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Make sure we are in vmalloc area */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */

	pgd = pgd_offset(current->active_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
#endif
}

int show_unhandled_signals = 1;

static inline int access_error(unsigned long error_code, int write,
			struct vm_area_struct *vma)
{
	if (write) {
		/* write, present and write, not present */
		if (unlikely(!(vma->vm_flags & VM_WRITE)))
			return 1;
	} else if (unlikely(error_code & PF_PROT)) {
		/* read, present */
		return 1;
	} else {
		/* read, not present */
		if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
			return 1;
	}

	return 0;
}
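
/*
 * Illustrative example (editorial note, not part of the original source):
 * a user-space write to a PROT_READ mapping reaches access_error() with
 * PF_WRITE set but VM_WRITE clear, so it returns 1 and the fault is
 * reported as a SIGSEGV with si_code SEGV_ACCERR via
 * bad_area_access_error().
 */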

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
#ifdef CONFIG_X86_64
asmlinkage
#endif
void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	unsigned long address;
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int write;
	int fault;

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	if (unlikely(notify_page_fault(regs)))
		return;
	if (unlikely(kmmio_fault(regs, address)))
		return;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
#ifdef CONFIG_X86_32
	if (unlikely(address >= TASK_SIZE)) {
#else
	if (unlikely(address >= TASK_SIZE64)) {
#endif
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    vmalloc_fault(address) >= 0)
			return;

		/* Can handle a stale RO->RW TLB */
		if (spurious_fault(error_code, address))
			return;

		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		bad_area_nosemaphore(regs, error_code, address);
		return;
	}

	/*
	 * It's safe to allow irq's after cr2 has been saved and the
	 * vmalloc fault has been handled.
	 *
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs)) {
		local_irq_enable();
		error_code |= PF_USER;
	} else if (regs->flags & X86_EFLAGS_IF)
		local_irq_enable();

#ifdef CONFIG_X86_64
	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(regs, error_code, address);
#endif

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm)) {
		bad_area_nosemaphore(regs, error_code, address);
		return;
	}

	/*
	 * When running in the kernel we expect faults to occur only to
	 * addresses in user space. All other faults represent errors in the
	 * kernel and should generate an OOPS. Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space. Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source. If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip)) {
			bad_area_nosemaphore(regs, error_code, address);
			return;
		}
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (unlikely(!vma)) {
		bad_area(regs, error_code, address);
		return;
	}
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
		bad_area(regs, error_code, address);
		return;
	}
	if (error_code & PF_USER) {
		/*
		 * Accessing the stack below %sp is always a bug.
		 * The large cushion allows instructions like enter
		 * and pusha to work. ("enter $65535,$31" pushes
		 * 32 pointers and then decrements %sp by 65535.)
		 */
		if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
			bad_area(regs, error_code, address);
			return;
		}
	}
	if (unlikely(expand_stack(vma, address))) {
		bad_area(regs, error_code, address);
		return;
	}

	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */
good_area:
	write = error_code & PF_WRITE;
	if (unlikely(access_error(error_code, write, vma))) {
		bad_area_access_error(regs, error_code, address);
		return;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		mm_fault_error(regs, error_code, address, fault);
		return;
	}
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;

#ifdef CONFIG_X86_32
	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
	 */
	if (v8086_mode(regs)) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}
#endif
	up_read(&mm->mmap_sem);
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

void vmalloc_sync_all(void)
{
	unsigned long address;

#ifdef CONFIG_X86_32
	if (SHARED_KERNEL_PMD)
		return;

	for (address = VMALLOC_START & PMD_MASK;
	     address >= TASK_SIZE && address < FIXADDR_TOP;
	     address += PMD_SIZE) {
		unsigned long flags;
		struct page *page;

		spin_lock_irqsave(&pgd_lock, flags);
		list_for_each_entry(page, &pgd_list, lru) {
			if (!vmalloc_sync_one(page_address(page),
					      address))
				break;
		}
		spin_unlock_irqrestore(&pgd_lock, flags);
	}
#else /* CONFIG_X86_64 */
	for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
	     address += PGDIR_SIZE) {
		const pgd_t *pgd_ref = pgd_offset_k(address);
		unsigned long flags;
		struct page *page;

		if (pgd_none(*pgd_ref))
			continue;
		spin_lock_irqsave(&pgd_lock, flags);
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			pgd = (pgd_t *)page_address(page) + pgd_index(address);
			if (pgd_none(*pgd))
				set_pgd(pgd, *pgd_ref);
			else
				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
		}
		spin_unlock_irqrestore(&pgd_lock, flags);
	}
#endif
}