stackprotector: use canary at end of stack to indicate overruns at oops time
[linux-2.6-block.git] arch/x86/mm/fault.c

/*
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>	/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/bootmem.h>	/* for max_low_pfn */
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/magic.h>

#include <asm/system.h>
#include <asm/desc.h>
#include <asm/segment.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>

/*
 * Page fault error code bits
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)

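/*
 * For example, a user-mode write to a present but read-only page arrives
 * here with error_code == (PF_PROT | PF_WRITE | PF_USER), while a
 * kernel-mode read of a not-present page arrives with error_code == 0.
 */
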
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
#ifdef CONFIG_X86_32
	if (!user_mode_vm(regs)) {
#else
	if (!user_mode(regs)) {
#endif
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}

/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

	/*
	 * If it was an exec (instruction fetch) fault on an NX page, then
	 * do not ignore the fault:
	 */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;

			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}

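/*
 * Fill in a siginfo_t describing the faulting address and force-deliver
 * the signal (SIGSEGV or SIGBUS) to the task, even if that signal is
 * currently blocked or ignored.
 */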
static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}

#ifdef CONFIG_X86_64
static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}
#endif

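/*
 * Dump the page-table walk for the faulting address as part of the oops,
 * e.g. "PGD 1a2067 PUD 0" (values illustrative only) when the 64-bit walk
 * stops at a not-present PUD.
 */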
static void dump_pagetable(unsigned long address)
{
#ifdef CONFIG_X86_32
	__typeof__(pte_val(__pte(0))) page;

	page = read_cr3();
	page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
	printk("*pdpt = %016Lx ", page);
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && page & _PAGE_PRESENT) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
							 & (PTRS_PER_PMD - 1)];
		printk(KERN_CONT "*pde = %016Lx ", page);
		page &= ~_PAGE_NX;
	}
#else
	printk("*pde = %08lx ", page);
#endif

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already.
	 */
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && (page & _PAGE_PRESENT)
	    && !(page & _PAGE_PSE)) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
							 & (PTRS_PER_PTE - 1)];
		printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
	}

	printk("\n");
#else /* CONFIG_X86_64 */
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud) || pud_large(*pud))
		goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
#endif
}

#ifdef CONFIG_X86_32
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */

	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;
	if (!pmd_present(*pmd)) {
		set_pmd(pmd, *pmd_k);
		arch_flush_lazy_mmu_mode();
	} else
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
	return pmd_k;
}
#endif

#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOS that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here.
   Does nothing for X86_32
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	static int warned;
	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}

/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps to
 * illegal addresses >4GB. We catch this in the page fault handler because
 * these addresses are not reachable. Just detect this case and return.
 * Any code segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
	    (address >> 32))
		return 1;
#endif
	return 0;
}

void do_invalid_op(struct pt_regs *, unsigned long);

static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	unsigned long nr;
	/*
	 * Pentium F0 0F C7 C8 bug workaround.
	 */
	if (boot_cpu_data.f00f_bug) {
		nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return 1;
		}
	}
#endif
	return 0;
}

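/*
 * Print the "BUG: unable to handle kernel ..." banner for a kernel-mode
 * fault: NULL pointer dereference vs. bad paging request, the faulting IP,
 * and the page-table walk. On PAE kernels an attempt to execute an
 * NX-protected page is additionally flagged as a possible exploit attempt.
 */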
static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
			    unsigned long address)
{
#ifdef CONFIG_X86_32
	if (!oops_may_print())
		return;
#endif

#ifdef CONFIG_X86_PAE
	if (error_code & PF_INSTR) {
		unsigned int level;
		pte_t *pte = lookup_address(address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			printk(KERN_CRIT "kernel tried to execute "
				"NX-protected page - exploit attempt? "
				"(uid: %d)\n", current->uid);
	}
#endif

	printk(KERN_ALERT "BUG: unable to handle kernel ");
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");
#ifdef CONFIG_X86_32
	printk(KERN_CONT " at %08lx\n", address);
#else
	printk(KERN_CONT " at %016lx\n", address);
#endif
	printk(KERN_ALERT "IP:");
	printk_address(regs->ip, 1);
	dump_pagetable(address);
}

#ifdef CONFIG_X86_64
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}
#endif

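/*
 * Check whether the permissions recorded in a (possibly large-page) pte are
 * already sufficient for the access described by error_code; if they are,
 * the fault was spurious and only a stale TLB entry needs to be refreshed.
 */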
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
	if ((error_code & PF_WRITE) && !pte_write(*pte))
		return 0;
	if ((error_code & PF_INSTR) && !pte_exec(*pte))
		return 0;

	return 1;
}

/*
 * Handle a spurious fault caused by a stale TLB entry. This allows
 * us to lazily refresh the TLB when increasing the permissions of a
 * kernel page (RO -> RW or NX -> X). Doing it eagerly is very
 * expensive since that implies doing a full cross-processor TLB
 * flush, even if no stale TLB entries exist on other processors.
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 */
static int spurious_fault(unsigned long address,
			  unsigned long error_code)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Reserved-bit violation or user access to kernel space? */
	if (error_code & (PF_USER | PF_RSVD))
		return 0;

	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return 0;

	if (pud_large(*pud))
		return spurious_fault_check(error_code, (pte_t *) pud);

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return spurious_fault_check(error_code, (pte_t *) pmd);

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;

	return spurious_fault_check(error_code, pte);
}

/*
 * X86_32
 * Handle a fault on the vmalloc or module mapping area
 *
 * X86_64
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
#ifdef CONFIG_X86_32
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;
	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;
	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;
	return 0;
#else
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Make sure we are in vmalloc area */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */

	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
#endif
}

int show_unhandled_signals = 1;

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
#ifdef CONFIG_X86_64
asmlinkage
#endif
void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int write, si_code;
	int fault;
	unsigned long *stackend;

#ifdef CONFIG_X86_64
	unsigned long flags;
#endif

	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
	 */
	trace_hardirqs_fixup();

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	si_code = SEGV_MAPERR;

	if (notify_page_fault(regs))
		return;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
#ifdef CONFIG_X86_32
	if (unlikely(address >= TASK_SIZE)) {
#else
	if (unlikely(address >= TASK_SIZE64)) {
#endif
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    vmalloc_fault(address) >= 0)
			return;

		/* Can handle a stale RO->RW TLB */
		if (spurious_fault(address, error_code))
			return;

		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

#ifdef CONFIG_X86_32
	/* It's safe to allow irq's after cr2 has been saved and the vmalloc
	   fault has been handled. */
	if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
		local_irq_enable();

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (in_atomic() || !mm)
		goto bad_area_nosemaphore;
#else /* CONFIG_X86_64 */
	if (likely(regs->flags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;
again:
#endif
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space. All other faults represent errors in the
	 * kernel and should generate an OOPS. Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space. Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source. If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (vma->vm_start <= address)
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/*
		 * Accessing the stack below %sp is always a bug.
		 * The large cushion allows instructions like enter
		 * and pusha to work. ("enter $65535,$31" pushes
		 * 32 pointers and then decrements %sp by 65535.)
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:		/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:		/* read, present */
		goto bad_area;
	case 0:			/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}

#ifdef CONFIG_X86_32
survive:
#endif
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;

#ifdef CONFIG_X86_32
	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
	 */
	if (v8086_mode(regs)) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}
#endif
	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		/*
		 * Valid to do another page fault here because this one came
		 * from user space.
		 */
		if (is_prefetch(regs, address, error_code))
			return;

		if (is_errata100(regs, address))
			return;

		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
#ifdef CONFIG_X86_32
			"%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
#else
			"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
#endif
			task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
			tsk->comm, task_pid_nr(tsk), address, regs->ip,
			regs->sp, error_code);
			print_vma_addr(" in ", regs->ip);
			printk("\n");
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}

	if (is_f00f_bug(regs, address))
		return;

no_context:
	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;

	/*
	 * X86_32
	 * Valid to do another page fault here, because if this fault
	 * had been triggered by is_prefetch fixup_exception would have
	 * handled it.
	 *
	 * X86_64
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */
#ifdef CONFIG_X86_32
	bust_spinlocks(1);
#else
	flags = oops_begin();
#endif

	show_fault_oops(regs, error_code, address);

	stackend = end_of_stack(tsk);
	if (*stackend != STACK_END_MAGIC)
		printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");

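	/*
	 * end_of_stack() points just past the task's thread_info, at the
	 * lowest word of the usable kernel stack; that word is set to
	 * STACK_END_MAGIC (from linux/magic.h) when the task's stack is set
	 * up. If the canary no longer matches by the time we oops, the
	 * stack has almost certainly been overrun, so say so in the oops.
	 */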
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;

#ifdef CONFIG_X86_32
	die("Oops", regs, error_code);
	bust_spinlocks(0);
	do_exit(SIGKILL);
#else
	if (__die("Oops", regs, error_code))
		regs = NULL;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, SIGKILL);
#endif

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_global_init(tsk)) {
		yield();
#ifdef CONFIG_X86_32
		down_read(&mm->mmap_sem);
		goto survive;
#else
		goto again;
#endif
	}

	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_group_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;
#ifdef CONFIG_X86_32
	/* User space => ok to do another page fault */
	if (is_prefetch(regs, address, error_code))
		return;
#endif
	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

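/*
 * Bring the kernel mappings for the vmalloc range in every process page
 * table (tracked on pgd_list) up to date with the reference page table in
 * init_mm, so those entries no longer have to be faulted in lazily.
 */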
void vmalloc_sync_all(void)
{
#ifdef CONFIG_X86_32
	/*
	 * Note that races in the updates of insync and start aren't
	 * problematic: insync can only get set bits added, and updates to
	 * start are only improving performance (without affecting correctness
	 * if undone).
	 */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = TASK_SIZE;
	unsigned long address;

	if (SHARED_KERNEL_PMD)
		return;

	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			unsigned long flags;
			struct page *page;

			spin_lock_irqsave(&pgd_lock, flags);
			list_for_each_entry(page, &pgd_list, lru) {
				if (!vmalloc_sync_one(page_address(page),
						      address))
					break;
			}
			spin_unlock_irqrestore(&pgd_lock, flags);
			if (!page)
				set_bit(pgd_index(address), insync);
		}
		if (address == start && test_bit(pgd_index(address), insync))
			start = address + PGDIR_SIZE;
	}
#else /* CONFIG_X86_64 */
	/*
	 * Note that races in the updates of insync and start aren't
	 * problematic: insync can only get set bits added, and updates to
	 * start are only improving performance (without affecting correctness
	 * if undone).
	 */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			unsigned long flags;
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock_irqsave(&pgd_lock, flags);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock_irqrestore(&pgd_lock, flags);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
#endif
}