arch/x86/mm/fault.c
/*
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/bootmem.h>		/* for max_low_pfn */
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/desc.h>
#include <asm/segment.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>

/*
 * Page fault error code bits
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)

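/*
 * Worked example of decoding error_code with the bits above:
 * 0x7 (PF_PROT|PF_WRITE|PF_USER) is a user-mode write to a page that is
 * present but not writable, e.g. the copy-on-write break after fork();
 * 0x14 (PF_USER|PF_INSTR) is a user-mode instruction fetch from a page
 * that is not present.
 */
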
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
#ifdef CONFIG_X86_32
	if (!user_mode_vm(regs)) {
#else
	if (!user_mode(regs)) {
#endif
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}

/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 */
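/*
 * The scan below starts at the faulting instruction pointer and decodes
 * at most 15 bytes (the maximum x86 instruction length).  It keeps
 * scanning only while it sees valid prefix bytes, and reports a prefetch
 * when it reaches the two-byte opcodes 0F 0D (prefetch/prefetchw) or
 * 0F 18 (prefetchnta/t0/t1/t2).
 */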
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

#ifdef CONFIG_X86_32
	if (!(__supported_pte_mask & _PAGE_NX))
		return 0;
#endif

	/* If it was an exec fault on an NX page, ignore it */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;

			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}

static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}

#ifdef CONFIG_X86_64
static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}
#endif

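/*
 * dump_pagetable() prints a one-line summary of the page-table walk for
 * the given address ("PGD ... PUD ... PMD ... PTE ..." on 64-bit,
 * *pdpt/*pde/*pte on 32-bit), stopping early when an entry is not
 * present and printing "BAD" when an entry cannot even be read.
 */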
void dump_pagetable(unsigned long address)
{
#ifdef CONFIG_X86_32
	__typeof__(pte_val(__pte(0))) page;

	page = read_cr3();
	page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
	printk("*pdpt = %016Lx ", page);
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && page & _PAGE_PRESENT) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
		                                         & (PTRS_PER_PMD - 1)];
		printk(KERN_CONT "*pde = %016Lx ", page);
		page &= ~_PAGE_NX;
	}
#else
	printk("*pde = %08lx ", page);
#endif

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already.
	 */
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && (page & _PAGE_PRESENT)
	    && !(page & _PAGE_PSE)) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
		                                         & (PTRS_PER_PTE - 1)];
		printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
	}

	printk("\n");
#else /* CONFIG_X86_64 */
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud) || pud_large(*pud))
		goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
#endif
}

#ifdef CONFIG_X86_32
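/*
 * Copy the kernel's pmd entry covering @address from the reference page
 * table (init_mm.pgd) into @pgd's tree.  Returns the kernel pmd on
 * success, or NULL when the reference entry itself is not populated.
 */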
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */

	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;
	if (!pmd_present(*pmd)) {
		set_pmd(pmd, *pmd_k);
		arch_flush_lazy_mmu_mode();
	} else
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
	return pmd_k;
}
#endif

#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOS that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here.
   Does nothing for X86_32
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	static int warned;
	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}

/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps to
 * illegal addresses >4GB.  We catch this in the page fault handler
 * because these addresses are not reachable.  Just detect this case and
 * return.  Any code segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
	    (address >> 32))
		return 1;
#endif
	return 0;
}

void do_invalid_op(struct pt_regs *, unsigned long);

static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	unsigned long nr;
	/*
	 * Pentium F0 0F C7 C8 bug workaround.
	 */
	if (boot_cpu_data.f00f_bug) {
		nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return 1;
		}
	}
#endif
	return 0;
}

static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
			    unsigned long address)
{
#ifdef CONFIG_X86_32
	if (!oops_may_print())
		return;
#endif

#ifdef CONFIG_X86_PAE
	if (error_code & PF_INSTR) {
		unsigned int level;
		pte_t *pte = lookup_address(address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			printk(KERN_CRIT "kernel tried to execute "
				"NX-protected page - exploit attempt? "
				"(uid: %d)\n", current->uid);
	}
#endif

	printk(KERN_ALERT "BUG: unable to handle kernel ");
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");
#ifdef CONFIG_X86_32
	printk(KERN_CONT " at %08lx\n", address);
#else
	printk(KERN_CONT " at %016lx\n", address);
#endif
	printk(KERN_ALERT "IP:");
	printk_address(regs->ip, 1);
	dump_pagetable(address);
}

#ifdef CONFIG_X86_64
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}
#endif

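/*
 * Used by spurious_fault() below: the fault is only spurious when the
 * current PTE (or huge PMD/PUD) already grants the attempted access, in
 * which case the exception came from a stale, more restrictive TLB entry.
 */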
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
	if ((error_code & PF_WRITE) && !pte_write(*pte))
		return 0;
	if ((error_code & PF_INSTR) && !pte_exec(*pte))
		return 0;

	return 1;
}

/*
 * Handle a spurious fault caused by a stale TLB entry.  This allows
 * us to lazily refresh the TLB when increasing the permissions of a
 * kernel page (RO -> RW or NX -> X).  Doing it eagerly is very
 * expensive since that implies doing a full cross-processor TLB
 * flush, even if no stale TLB entries exist on other processors.
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 */
static int spurious_fault(unsigned long address,
			  unsigned long error_code)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Reserved-bit violation or user access to kernel space? */
	if (error_code & (PF_USER | PF_RSVD))
		return 0;

	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return 0;

	if (pud_large(*pud))
		return spurious_fault_check(error_code, (pte_t *) pud);

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return spurious_fault_check(error_code, (pte_t *) pmd);

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;

	return spurious_fault_check(error_code, pte);
}

/*
 * X86_32
 * Handle a fault on the vmalloc or module mapping area
 *
 * X86_64
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
#ifdef CONFIG_X86_32
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;
	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;
	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;
	return 0;
#else
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Make sure we are in vmalloc area */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */

	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
#endif
}

int show_unhandled_signals = 1;

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
#ifdef CONFIG_X86_64
asmlinkage
#endif
void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int write, si_code;
	int fault;
#ifdef CONFIG_X86_64
	unsigned long flags;
#endif

	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
	 */
	trace_hardirqs_fixup();

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	si_code = SEGV_MAPERR;

	if (notify_page_fault(regs))
		return;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
#ifdef CONFIG_X86_32
	if (unlikely(address >= TASK_SIZE)) {
#else
	if (unlikely(address >= TASK_SIZE64)) {
#endif
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    vmalloc_fault(address) >= 0)
			return;

		/* Can handle a stale RO->RW TLB */
		if (spurious_fault(address, error_code))
			return;

		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

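	/*
	 * From here on the faulting address lies in the user portion of the
	 * address space, so the fault is validated against the current
	 * process's VMAs.
	 */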
#ifdef CONFIG_X86_32
	/* It's safe to allow irq's after cr2 has been saved and the vmalloc
	   fault has been handled. */
	if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
		local_irq_enable();

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (in_atomic() || !mm)
		goto bad_area_nosemaphore;
#else /* CONFIG_X86_64 */
	if (likely(regs->flags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;
again:
#endif
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (vma->vm_start <= address)
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/*
		 * Accessing the stack below %sp is always a bug.
		 * The large cushion allows instructions like enter
		 * and pusha to work.  ("enter $65535,$31" pushes
		 * 32 pointers and then decrements %sp by 65535.)
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:		/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:		/* read, present */
		goto bad_area;
	case 0:			/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}

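	/*
	 * The write flag computed above is passed to handle_mm_fault() below
	 * so the generic VM code resolves the access as a write (e.g.
	 * breaking COW or marking the page dirty) rather than a read.
	 */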
#ifdef CONFIG_X86_32
survive:
#endif
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;

#ifdef CONFIG_X86_32
	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
	 */
	if (v8086_mode(regs)) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}
#endif
	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		/*
		 * Valid to do another page fault here because this one came
		 * from user space.
		 */
		if (is_prefetch(regs, address, error_code))
			return;

		if (is_errata100(regs, address))
			return;

		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
#ifdef CONFIG_X86_32
			"%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
#else
			"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
#endif
			task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
			tsk->comm, task_pid_nr(tsk), address, regs->ip,
			regs->sp, error_code);
			print_vma_addr(" in ", regs->ip);
			printk("\n");
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}

	if (is_f00f_bug(regs, address))
		return;

no_context:
	/* Are we prepared to handle this kernel fault?  */
	if (fixup_exception(regs))
		return;

	/*
	 * X86_32
	 * Valid to do another page fault here, because if this fault
	 * had been triggered by is_prefetch, fixup_exception would have
	 * handled it.
	 *
	 * X86_64
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */
#ifdef CONFIG_X86_32
	bust_spinlocks(1);
#else
	flags = oops_begin();
#endif

	show_fault_oops(regs, error_code, address);

	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;

#ifdef CONFIG_X86_32
	die("Oops", regs, error_code);
	bust_spinlocks(0);
	do_exit(SIGKILL);
#else
	if (__die("Oops", regs, error_code))
		regs = NULL;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, SIGKILL);
#endif

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_global_init(tsk)) {
		yield();
#ifdef CONFIG_X86_32
		down_read(&mm->mmap_sem);
		goto survive;
#else
		goto again;
#endif
	}

	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_group_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;
#ifdef CONFIG_X86_32
	/* User space => ok to do another page fault */
	if (is_prefetch(regs, address, error_code))
		return;
#endif
	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

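/*
 * pgd_list links the top-level page-table pages of all processes and is
 * protected by pgd_lock.  vmalloc_sync_all() walks that list and copies
 * kernel-space entries that are missing from a process's page table out
 * of the reference page table, so every task sees current vmalloc
 * mappings without relying on the lazy vmalloc_fault() path.
 */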
void vmalloc_sync_all(void)
{
#ifdef CONFIG_X86_32
	/*
	 * Note that races in the updates of insync and start aren't
	 * problematic: insync can only get set bits added, and updates to
	 * start are only improving performance (without affecting correctness
	 * if undone).
	 */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = TASK_SIZE;
	unsigned long address;

	if (SHARED_KERNEL_PMD)
		return;

	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			unsigned long flags;
			struct page *page;

			spin_lock_irqsave(&pgd_lock, flags);
			list_for_each_entry(page, &pgd_list, lru) {
				if (!vmalloc_sync_one(page_address(page),
						      address))
					break;
			}
			spin_unlock_irqrestore(&pgd_lock, flags);
			if (!page)
				set_bit(pgd_index(address), insync);
		}
		if (address == start && test_bit(pgd_index(address), insync))
			start = address + PGDIR_SIZE;
	}
#else /* CONFIG_X86_64 */
	/*
	 * Note that races in the updates of insync and start aren't
	 * problematic: insync can only get set bits added, and updates to
	 * start are only improving performance (without affecting correctness
	 * if undone).
	 */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			unsigned long flags;
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock_irqsave(&pgd_lock, flags);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock_irqrestore(&pgd_lock, flags);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
		       (__START_KERNEL & PGDIR_MASK)));
#endif
}