/*
 *  linux/arch/x86_64/entry.S
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *
 * entry.S contains the system-call and fault low-level handling routines.
 *
 * NOTE: This code handles signal recognition, which happens every time
 * after an interrupt and after each system call.
 *
 * Normal syscalls and interrupts don't save a full stack frame; this is
 * only done for syscall tracing, signals or fork/exec et al.
 *
 * A note on terminology:
 * - top of stack: architecture-defined interrupt frame from SS to RIP
 *   at the top of the kernel process stack.
 * - partial stack frame: partially saved registers up to R11.
 * - full stack frame: like a partial stack frame, but all registers saved.
 *
 * Some macro usage:
 * - CFI macros are used to generate dwarf2 unwind information for better
 *   backtraces. They don't change any code.
 * - SAVE_ALL/RESTORE_ALL - save/restore all registers
 * - SAVE_ARGS/RESTORE_ARGS - save/restore registers that C functions modify.
 *   There are unfortunately lots of special cases where some registers are
 *   not touched. The macro is a big mess that should be cleaned up.
 * - SAVE_REST/RESTORE_REST - handle the registers not saved by SAVE_ARGS.
 *   Gives a full stack frame.
 * - ENTRY/END - define functions in the symbol table.
 * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - fix up the hardware stack
 *   frame that is otherwise undefined after a SYSCALL.
 * - TRACE_IRQ_* - trace hard interrupt state for lock debugging.
 * - errorentry/paranoidentry/zeroentry - define exception entry points.
 */
40 #include <linux/linkage.h>
41 #include <asm/segment.h>
42 #include <asm/cache.h>
43 #include <asm/errno.h>
44 #include <asm/dwarf2.h>
45 #include <asm/calling.h>
46 #include <asm/asm-offsets.h>
48 #include <asm/unistd.h>
49 #include <asm/thread_info.h>
50 #include <asm/hw_irq.h>
52 #include <asm/irqflags.h>
53 #include <asm/paravirt.h>
54 #include <asm/ftrace.h>
56 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
57 #include <linux/elf-em.h>
58 #define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
59 #define __AUDIT_ARCH_64BIT 0x80000000
60 #define __AUDIT_ARCH_LE 0x40000000
64 #ifdef CONFIG_FUNCTION_TRACER
65 #ifdef CONFIG_DYNAMIC_FTRACE
72 /* taken from glibc */
84 subq $MCOUNT_INSN_SIZE, %rdi
104 #else /* ! CONFIG_DYNAMIC_FTRACE */
106 cmpq $ftrace_stub, ftrace_trace_function
113 /* taken from glibc */
123 movq 0x38(%rsp), %rdi
125 subq $MCOUNT_INSN_SIZE, %rdi
127 call *ftrace_trace_function
140 #endif /* CONFIG_DYNAMIC_FTRACE */
141 #endif /* CONFIG_FUNCTION_TRACER */
143 #ifndef CONFIG_PREEMPT
144 #define retint_kernel retint_restore_args
147 #ifdef CONFIG_PARAVIRT
148 ENTRY(native_usergs_sysret64)
151 #endif /* CONFIG_PARAVIRT */
154 .macro TRACE_IRQS_IRETQ offset=ARGOFFSET
155 #ifdef CONFIG_TRACE_IRQFLAGS
156 bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
164 * C code is not supposed to know about undefined top of stack. Every time
165 * a C function with an pt_regs argument is called from the SYSCALL based
166 * fast path FIXUP_TOP_OF_STACK is needed.
167 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
171 /* %rsp:at FRAMEEND */
/*
 * FIXUP_TOP_OF_STACK: rebuild the hardware iret frame at the top of the
 * stack after a SYSCALL entry (SYSCALL leaves SS/RSP/EFLAGS slots of
 * pt_regs undefined; the user RSP was stashed in %gs:pda_oldrsp).
 * \tmp is a caller-chosen scratch register and is clobbered.
 * %rsp: at FRAMEEND.
 */
	.macro FIXUP_TOP_OF_STACK tmp
	movq	%gs:pda_oldrsp,\tmp		/* user RSP saved at syscall entry */
	movq	\tmp,RSP(%rsp)
	movq	$__USER_DS,SS(%rsp)
	movq	$__USER_CS,CS(%rsp)
	movq	$-1,RCX(%rsp)			/* RCX was clobbered by SYSCALL */
	movq	R11(%rsp),\tmp			/* get eflags (SYSCALL put them in r11) */
	movq	\tmp,EFLAGS(%rsp)
	.endm
/*
 * RESTORE_TOP_OF_STACK: sync syscall return state after C code (ptrace
 * et al.) may have modified pt_regs -- copy the frame's RSP back into
 * %gs:pda_oldrsp and its EFLAGS into the R11 slot that SYSRET consumes.
 * \tmp is a scratch register and is clobbered.
 */
	.macro RESTORE_TOP_OF_STACK tmp,offset=0
	movq	RSP-\offset(%rsp),\tmp
	movq	\tmp,%gs:pda_oldrsp
	movq	EFLAGS-\offset(%rsp),\tmp
	movq	\tmp,R11-\offset(%rsp)
	.endm
/*
 * FAKE_STACK_FRAME: build a fake hardware interrupt frame plus an
 * orig_rax slot so a kernel thread looks as if it entered the kernel
 * via an interrupt.  \child_rip becomes the saved RIP.
 * Clobbers %rax (zeroed; reused for the rsp and orig_rax slots).
 *
 * NOTE(review): the rsp-push and rip CFI lines were missing from this
 * garbled excerpt and were restored from the upstream entry.S -- confirm.
 */
	.macro FAKE_STACK_FRAME child_rip
	/* push in order ss, rsp, eflags, cs, rip */
	xorl	%eax, %eax
	pushq	$__KERNEL_DS			/* ss */
	CFI_ADJUST_CFA_OFFSET	8
	/*CFI_REL_OFFSET	ss,0*/
	pushq	%rax				/* rsp */
	CFI_ADJUST_CFA_OFFSET	8
	CFI_REL_OFFSET	rsp,0
	pushq	$(1<<9)				/* eflags - interrupts on */
	CFI_ADJUST_CFA_OFFSET	8
	/*CFI_REL_OFFSET	rflags,0*/
	pushq	$__KERNEL_CS			/* cs */
	CFI_ADJUST_CFA_OFFSET	8
	/*CFI_REL_OFFSET	cs,0*/
	pushq	\child_rip			/* rip */
	CFI_ADJUST_CFA_OFFSET	8
	CFI_REL_OFFSET	rip,0
	pushq	%rax				/* orig rax */
	CFI_ADJUST_CFA_OFFSET	8
	.endm
/*
 * UNFAKE_STACK_FRAME: undo FAKE_STACK_FRAME by dropping its six pushed
 * slots (ss, rsp, eflags, cs, rip, orig_rax) without restoring anything.
 */
	.macro UNFAKE_STACK_FRAME
	addq	$8*6, %rsp
	CFI_ADJUST_CFA_OFFSET	-(6*8)
	.endm
/*
 * NOTE(review): this excerpt is garbled -- the leading numerals on each
 * line are line-number residue from a numbered listing, and several
 * lines of this macro are missing (the .if \start / CFI_STARTPROC
 * header, the r9/r8 annotation rows, and the closing .endm).  Confirm
 * against the original arch/x86_64/entry.S before assembling.
 *
 * Purpose (from the visible lines): declare the default dwarf2 unwind
 * state for a full pt_regs stack frame -- CFA offset at SS+8 and a
 * CFI_REL_OFFSET annotation for each register slot saved in pt_regs;
 * cs/rflags/ss annotations are commented out, as elsewhere in the file.
 */
216 .macro CFI_DEFAULT_STACK start=1
222 CFI_DEF_CFA_OFFSET SS+8
224 CFI_REL_OFFSET r15,R15
225 CFI_REL_OFFSET r14,R14
226 CFI_REL_OFFSET r13,R13
227 CFI_REL_OFFSET r12,R12
228 CFI_REL_OFFSET rbp,RBP
229 CFI_REL_OFFSET rbx,RBX
230 CFI_REL_OFFSET r11,R11
231 CFI_REL_OFFSET r10,R10
234 CFI_REL_OFFSET rax,RAX
235 CFI_REL_OFFSET rcx,RCX
236 CFI_REL_OFFSET rdx,RDX
237 CFI_REL_OFFSET rsi,RSI
238 CFI_REL_OFFSET rdi,RDI
239 CFI_REL_OFFSET rip,RIP
240 /*CFI_REL_OFFSET cs,CS*/
241 /*CFI_REL_OFFSET rflags,EFLAGS*/
242 CFI_REL_OFFSET rsp,RSP
243 /*CFI_REL_OFFSET ss,SS*/
246 * A newly forked process directly context switches into this.
251 push kernel_eflags(%rip)
252 CFI_ADJUST_CFA_OFFSET 8
253 popf # reset kernel eflags
254 CFI_ADJUST_CFA_OFFSET -8
256 GET_THREAD_INFO(%rcx)
257 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
262 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
263 je int_ret_from_sys_call
264 testl $_TIF_IA32,TI_flags(%rcx)
265 jnz int_ret_from_sys_call
266 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
267 jmp ret_from_sys_call
271 call syscall_trace_leave
272 GET_THREAD_INFO(%rcx)
 * System call entry. Up to 6 arguments in registers are supported.
 *
 * SYSCALL does not save anything on the stack and does not change the
 * stack pointer.
 *
 * Register setup:
 * rax  system call number
 * rcx  return address for syscall/sysret, C arg3
 * r10  arg3 (--> moved to rcx for C)
 * r11  eflags for syscall/sysret, temporary for C
 * r12-r15, rbp, rbx  saved by C code, not touched.
 *
 * Interrupts are off on entry.
 * Only called from user space.
 *
 * XXX: if we had a free scratch register we could save the RSP into the
 * stack frame and report it properly in ps. Unfortunately we haven't.
 *
 * When the user can change the frames, always force IRET. That is because
 * it deals with uncanonical addresses better. SYSRET has trouble
 * with them due to bugs in both AMD and Intel CPUs.
311 CFI_DEF_CFA rsp,PDA_STACKOFFSET
313 /*CFI_REGISTER rflags,r11*/
316 * A hypervisor implementation might want to use a label
317 * after the swapgs, so that it can do the swapgs
318 * for the guest and jump here on syscall.
320 ENTRY(system_call_after_swapgs)
322 movq %rsp,%gs:pda_oldrsp
323 movq %gs:pda_kernelstack,%rsp
325 * No need to follow this irqs off/on section - it's straight
328 ENABLE_INTERRUPTS(CLBR_NONE)
330 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
331 movq %rcx,RIP-ARGOFFSET(%rsp)
332 CFI_REL_OFFSET rip,RIP-ARGOFFSET
333 GET_THREAD_INFO(%rcx)
334 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
336 system_call_fastpath:
337 cmpq $__NR_syscall_max,%rax
340 call *sys_call_table(,%rax,8) # XXX: rip relative
341 movq %rax,RAX-ARGOFFSET(%rsp)
343 * Syscall return path ending with SYSRET (fast path)
344 * Has incomplete stack frame and undefined top of stack.
347 movl $_TIF_ALLWORK_MASK,%edi
351 GET_THREAD_INFO(%rcx)
352 DISABLE_INTERRUPTS(CLBR_NONE)
354 movl TI_flags(%rcx),%edx
359 * sysretq will re-enable interrupts:
362 movq RIP-ARGOFFSET(%rsp),%rcx
364 RESTORE_ARGS 0,-ARG_SKIP,1
365 /*CFI_REGISTER rflags,r11*/
366 movq %gs:pda_oldrsp, %rsp
370 /* Handle reschedules */
371 /* edx: work, edi: workmask */
373 bt $TIF_NEED_RESCHED,%edx
376 ENABLE_INTERRUPTS(CLBR_NONE)
378 CFI_ADJUST_CFA_OFFSET 8
381 CFI_ADJUST_CFA_OFFSET -8
384 /* Handle a signal */
387 ENABLE_INTERRUPTS(CLBR_NONE)
388 #ifdef CONFIG_AUDITSYSCALL
389 bt $TIF_SYSCALL_AUDIT,%edx
392 /* edx: work flags (arg3) */
393 leaq do_notify_resume(%rip),%rax
394 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
395 xorl %esi,%esi # oldset -> arg2
396 call ptregscall_common
397 movl $_TIF_WORK_MASK,%edi
398 /* Use IRET because user could have changed frame. This
399 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
400 DISABLE_INTERRUPTS(CLBR_NONE)
405 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
406 jmp ret_from_sys_call
408 #ifdef CONFIG_AUDITSYSCALL
410 * Fast path for syscall audit without full syscall trace.
411 * We just call audit_syscall_entry() directly, and then
412 * jump back to the normal fast path.
415 movq %r10,%r9 /* 6th arg: 4th syscall arg */
416 movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
417 movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
418 movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
419 movq %rax,%rsi /* 2nd arg: syscall number */
420 movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
421 call audit_syscall_entry
422 LOAD_ARGS 0 /* reload call-clobbered registers */
423 jmp system_call_fastpath
426 * Return fast path for syscall audit. Call audit_syscall_exit()
427 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
431 movq %rax,%rsi /* second arg, syscall return value */
432 cmpq $0,%rax /* is it < 0? */
433 setl %al /* 1 if so, 0 if not */
434 movzbl %al,%edi /* zero-extend that into %edi */
435 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
436 call audit_syscall_exit
437 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
439 #endif /* CONFIG_AUDITSYSCALL */
441 /* Do syscall tracing */
443 #ifdef CONFIG_AUDITSYSCALL
444 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
448 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
449 FIXUP_TOP_OF_STACK %rdi
451 call syscall_trace_enter
453 * Reload arg registers from stack in case ptrace changed them.
454 * We don't reload %rax because syscall_trace_enter() returned
455 * the value it wants us to use in the table lookup.
457 LOAD_ARGS ARGOFFSET, 1
459 cmpq $__NR_syscall_max,%rax
460 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
461 movq %r10,%rcx /* fixup for C */
462 call *sys_call_table(,%rax,8)
463 movq %rax,RAX-ARGOFFSET(%rsp)
464 /* Use IRET because user could have changed frame */
467 * Syscall return path ending with IRET.
468 * Has correct top of stack, but partial stack frame.
470 .globl int_ret_from_sys_call
471 .globl int_with_check
472 int_ret_from_sys_call:
473 DISABLE_INTERRUPTS(CLBR_NONE)
475 testl $3,CS-ARGOFFSET(%rsp)
476 je retint_restore_args
477 movl $_TIF_ALLWORK_MASK,%edi
478 /* edi: mask to check */
481 GET_THREAD_INFO(%rcx)
482 movl TI_flags(%rcx),%edx
485 andl $~TS_COMPAT,TI_status(%rcx)
488 /* Either reschedule or signal or syscall exit tracking needed. */
489 /* First do a reschedule test. */
490 /* edx: work, edi: workmask */
492 bt $TIF_NEED_RESCHED,%edx
495 ENABLE_INTERRUPTS(CLBR_NONE)
497 CFI_ADJUST_CFA_OFFSET 8
500 CFI_ADJUST_CFA_OFFSET -8
501 DISABLE_INTERRUPTS(CLBR_NONE)
505 /* handle signals and tracing -- both require a full stack frame */
508 ENABLE_INTERRUPTS(CLBR_NONE)
510 /* Check for syscall exit trace */
511 testl $_TIF_WORK_SYSCALL_EXIT,%edx
514 CFI_ADJUST_CFA_OFFSET 8
515 leaq 8(%rsp),%rdi # &ptregs -> arg1
516 call syscall_trace_leave
518 CFI_ADJUST_CFA_OFFSET -8
519 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
523 testl $_TIF_DO_NOTIFY_MASK,%edx
525 movq %rsp,%rdi # &ptregs -> arg1
526 xorl %esi,%esi # oldset -> arg2
527 call do_notify_resume
528 1: movl $_TIF_WORK_MASK,%edi
531 DISABLE_INTERRUPTS(CLBR_NONE)
538 * Certain special system calls that need to save a complete full stack frame.
/*
 * NOTE(review): garbled excerpt -- leading numerals are line-number
 * residue, and the macro body is missing lines (the .globl/\label:
 * definition and the closing END(\label)/.endm).  Confirm against the
 * original entry.S before assembling.
 *
 * PTREGSCALL: stub generator for syscalls that need a full pt_regs
 * frame.  The visible body loads the real handler address into %rax,
 * points \arg at the pt_regs area (+8 skips the return address), and
 * tail-jumps into ptregscall_common, which builds the full frame.
 */
541 .macro PTREGSCALL label,func,arg
544 leaq \func(%rip),%rax
545 leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
546 jmp ptregscall_common
/* Instantiations: each stub passes pt_regs in a different C argument slot. */
552 PTREGSCALL stub_clone, sys_clone, %r8
553 PTREGSCALL stub_fork, sys_fork, %rdi
554 PTREGSCALL stub_vfork, sys_vfork, %rdi
555 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
556 PTREGSCALL stub_iopl, sys_iopl, %rsi
558 ENTRY(ptregscall_common)
560 CFI_ADJUST_CFA_OFFSET -8
561 CFI_REGISTER rip, r11
564 CFI_REGISTER rip, r15
565 FIXUP_TOP_OF_STACK %r11
567 RESTORE_TOP_OF_STACK %r11
569 CFI_REGISTER rip, r11
572 CFI_ADJUST_CFA_OFFSET 8
573 CFI_REL_OFFSET rip, 0
576 END(ptregscall_common)
581 CFI_ADJUST_CFA_OFFSET -8
582 CFI_REGISTER rip, r11
584 FIXUP_TOP_OF_STACK %r11
587 RESTORE_TOP_OF_STACK %r11
590 jmp int_ret_from_sys_call
595 * sigreturn is special because it needs to restore all registers on return.
596 * This cannot be done with SYSRET, so use the IRET return path instead.
598 ENTRY(stub_rt_sigreturn)
601 CFI_ADJUST_CFA_OFFSET -8
604 FIXUP_TOP_OF_STACK %r11
605 call sys_rt_sigreturn
606 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
608 jmp int_ret_from_sys_call
610 END(stub_rt_sigreturn)
613 * initial frame state for interrupts and exceptions
618 CFI_DEF_CFA rsp,SS+8-\ref
619 /*CFI_REL_OFFSET ss,SS-\ref*/
620 CFI_REL_OFFSET rsp,RSP-\ref
621 /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
622 /*CFI_REL_OFFSET cs,CS-\ref*/
623 CFI_REL_OFFSET rip,RIP-\ref
626 /* initial frame state for interrupts (and exceptions without error code) */
627 #define INTR_FRAME _frame RIP
628 /* initial frame state for exceptions with error code (and interrupts with
629 vector already pushed) */
630 #define XCPT_FRAME _frame ORIG_RAX
633 * Interrupt entry/exit.
635 * Interrupt entry points save only callee clobbered registers in fast path.
637 * Entry runs with interrupts off.
640 /* 0(%rsp): interrupt number */
641 .macro interrupt func
644 leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler
647 * Save rbp twice: One is for marking the stack frame, as usual, and the
648 * other, to fill pt_regs properly. This is because bx comes right
649 * before the last saved register in that structure, and not bp. If the
650 * base pointer were in the place bx is today, this would not be needed.
653 CFI_ADJUST_CFA_OFFSET 8
654 CFI_REL_OFFSET rbp, 0
656 CFI_DEF_CFA_REGISTER rbp
660 /* irqcount is used to check if a CPU is already on an interrupt
661 stack or not. While this is essentially redundant with preempt_count
662 it is a little cheaper to use a separate counter in the PDA
663 (short of moving irq_enter into assembly, which would be too
665 1: incl %gs:pda_irqcount
666 cmoveq %gs:pda_irqstackptr,%rsp
667 push %rbp # backlink for old unwinder
669 * We entered an interrupt context - irqs are off:
675 ENTRY(common_interrupt)
678 /* 0(%rsp): oldrsp-ARGOFFSET */
680 DISABLE_INTERRUPTS(CLBR_NONE)
682 decl %gs:pda_irqcount
684 CFI_DEF_CFA_REGISTER rsp
685 CFI_ADJUST_CFA_OFFSET -8
687 GET_THREAD_INFO(%rcx)
688 testl $3,CS-ARGOFFSET(%rsp)
691 /* Interrupt came from user space */
693 * Has a correct top of stack, but a partial stack frame
694 * %rcx: thread info. Interrupts off.
696 retint_with_reschedule:
697 movl $_TIF_WORK_MASK,%edi
700 movl TI_flags(%rcx),%edx
705 retint_swapgs: /* return to user-space */
707 * The iretq could re-enable interrupts:
709 DISABLE_INTERRUPTS(CLBR_ANY)
714 retint_restore_args: /* return to kernel space */
715 DISABLE_INTERRUPTS(CLBR_ANY)
717 * The iretq could re-enable interrupts:
726 .section __ex_table, "a"
727 .quad irq_return, bad_iret
730 #ifdef CONFIG_PARAVIRT
734 .section __ex_table,"a"
735 .quad native_iret, bad_iret
742 * The iret traps when the %cs or %ss being restored is bogus.
743 * We've lost the original trap vector and error code.
744 * #GPF is the most likely one to get for an invalid selector.
745 * So pretend we completed the iret and took the #GPF in user mode.
747 * We are now running with the kernel GS after exception recovery.
748 * But error_entry expects us to have user GS to match the user %cs,
754 jmp general_protection
758 /* edi: workmask, edx: work */
761 bt $TIF_NEED_RESCHED,%edx
764 ENABLE_INTERRUPTS(CLBR_NONE)
766 CFI_ADJUST_CFA_OFFSET 8
769 CFI_ADJUST_CFA_OFFSET -8
770 GET_THREAD_INFO(%rcx)
771 DISABLE_INTERRUPTS(CLBR_NONE)
776 testl $_TIF_DO_NOTIFY_MASK,%edx
779 ENABLE_INTERRUPTS(CLBR_NONE)
781 movq $-1,ORIG_RAX(%rsp)
782 xorl %esi,%esi # oldset
783 movq %rsp,%rdi # &pt_regs
784 call do_notify_resume
786 DISABLE_INTERRUPTS(CLBR_NONE)
788 GET_THREAD_INFO(%rcx)
789 jmp retint_with_reschedule
791 #ifdef CONFIG_PREEMPT
792 /* Returning to kernel space. Check if we need preemption */
793 /* rcx: threadinfo. interrupts off. */
795 cmpl $0,TI_preempt_count(%rcx)
796 jnz retint_restore_args
797 bt $TIF_NEED_RESCHED,TI_flags(%rcx)
798 jnc retint_restore_args
799 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
800 jnc retint_restore_args
801 call preempt_schedule_irq
806 END(common_interrupt)
811 .macro apicinterrupt num,func
814 CFI_ADJUST_CFA_OFFSET 8
/* Local APIC thermal-status interrupt -> smp_thermal_interrupt. */
ENTRY(thermal_interrupt)
	apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
END(thermal_interrupt)

/* Machine-check threshold interrupt -> mce_threshold_interrupt. */
ENTRY(threshold_interrupt)
	apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
END(threshold_interrupt)

/* SMP reschedule IPI -> smp_reschedule_interrupt. */
ENTRY(reschedule_interrupt)
	apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
END(reschedule_interrupt)
833 .macro INVALIDATE_ENTRY num
834 ENTRY(invalidate_interrupt\num)
835 apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
836 END(invalidate_interrupt\num)
/* Cross-CPU function-call IPI -> smp_call_function_interrupt. */
ENTRY(call_function_interrupt)
	apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
END(call_function_interrupt)

/* Single-target function-call IPI -> smp_call_function_single_interrupt. */
ENTRY(call_function_single_interrupt)
	apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
END(call_function_single_interrupt)

/* IRQ-migration cleanup IPI -> smp_irq_move_cleanup_interrupt. */
ENTRY(irq_move_cleanup_interrupt)
	apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
END(irq_move_cleanup_interrupt)

/* Local APIC timer tick -> smp_apic_timer_interrupt. */
ENTRY(apic_timer_interrupt)
	apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
END(apic_timer_interrupt)
/* SGI UV BAU message interrupt (hard-wired vector 220). */
ENTRY(uv_bau_message_intr1)
	apicinterrupt 220,uv_bau_message_interrupt
END(uv_bau_message_intr1)

/*
 * Local APIC error interrupt -> smp_error_interrupt.
 * (The END() line was missing from this garbled excerpt; restored.)
 */
ENTRY(error_interrupt)
	apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
END(error_interrupt)

/* Local APIC spurious interrupt -> smp_spurious_interrupt. */
ENTRY(spurious_interrupt)
	apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
END(spurious_interrupt)
876 * Exception entry points.
880 PARAVIRT_ADJUST_EXCEPTION_FRAME
881 pushq $0 /* push error code/oldrax */
882 CFI_ADJUST_CFA_OFFSET 8
883 pushq %rax /* push real oldrax to the rdi slot */
884 CFI_ADJUST_CFA_OFFSET 8
891 .macro errorentry sym
893 PARAVIRT_ADJUST_EXCEPTION_FRAME
895 CFI_ADJUST_CFA_OFFSET 8
902 /* error code is on the stack already */
903 /* handle NMI like exceptions that can happen everywhere */
904 .macro paranoidentry sym, ist=0, irqtrace=1
908 movl $MSR_GS_BASE,%ecx
916 movq %gs:pda_data_offset, %rbp
922 movq ORIG_RAX(%rsp),%rsi
923 movq $-1,ORIG_RAX(%rsp)
925 subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
929 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
931 DISABLE_INTERRUPTS(CLBR_NONE)
938 * "Paranoid" exit path from exception stack.
939 * Paranoid because this is used by NMIs and cannot take
940 * any kernel state for granted.
941 * We don't do kernel preemption checks here, because only
942 * NMI should be common and it does not enable IRQs and
943 * cannot get reschedule ticks.
945 * "trace" is 0 for the NMI handler only, because irq-tracing
946 * is fundamentally NMI-unsafe. (we cannot change the soft and
947 * hard flags at once, atomically)
949 .macro paranoidexit trace=1
950 /* ebx: no swapgs flag */
952 testl %ebx,%ebx /* swapgs needed? */
953 jnz paranoid_restore\trace
955 jnz paranoid_userspace\trace
956 paranoid_swapgs\trace:
961 paranoid_restore\trace:
964 paranoid_userspace\trace:
965 GET_THREAD_INFO(%rcx)
966 movl TI_flags(%rcx),%ebx
967 andl $_TIF_WORK_MASK,%ebx
968 jz paranoid_swapgs\trace
969 movq %rsp,%rdi /* &pt_regs */
971 movq %rax,%rsp /* switch stack for scheduling */
972 testl $_TIF_NEED_RESCHED,%ebx
973 jnz paranoid_schedule\trace
974 movl %ebx,%edx /* arg3: thread flags */
978 ENABLE_INTERRUPTS(CLBR_NONE)
979 xorl %esi,%esi /* arg2: oldset */
980 movq %rsp,%rdi /* arg1: &pt_regs */
981 call do_notify_resume
982 DISABLE_INTERRUPTS(CLBR_NONE)
986 jmp paranoid_userspace\trace
987 paranoid_schedule\trace:
991 ENABLE_INTERRUPTS(CLBR_ANY)
993 DISABLE_INTERRUPTS(CLBR_ANY)
997 jmp paranoid_userspace\trace
1002 * Exception entry point. This expects an error code/orig_rax on the stack
1003 * and the exception handler in %rax.
1005 KPROBE_ENTRY(error_entry)
1007 CFI_REL_OFFSET rax,0
1008 /* rdi slot contains rax, oldrax contains error code */
1011 CFI_ADJUST_CFA_OFFSET (14*8)
1012 movq %rsi,13*8(%rsp)
1013 CFI_REL_OFFSET rsi,RSI
1014 movq 14*8(%rsp),%rsi /* load rax from rdi slot */
1015 CFI_REGISTER rax,rsi
1016 movq %rdx,12*8(%rsp)
1017 CFI_REL_OFFSET rdx,RDX
1018 movq %rcx,11*8(%rsp)
1019 CFI_REL_OFFSET rcx,RCX
1020 movq %rsi,10*8(%rsp) /* store rax */
1021 CFI_REL_OFFSET rax,RAX
1023 CFI_REL_OFFSET r8,R8
1025 CFI_REL_OFFSET r9,R9
1027 CFI_REL_OFFSET r10,R10
1029 CFI_REL_OFFSET r11,R11
1031 CFI_REL_OFFSET rbx,RBX
1033 CFI_REL_OFFSET rbp,RBP
1035 CFI_REL_OFFSET r12,R12
1037 CFI_REL_OFFSET r13,R13
1039 CFI_REL_OFFSET r14,R14
1041 CFI_REL_OFFSET r15,R15
1044 je error_kernelspace
1050 CFI_REL_OFFSET rdi,RDI
1052 movq ORIG_RAX(%rsp),%rsi /* get error code */
1053 movq $-1,ORIG_RAX(%rsp)
1055 /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
1059 DISABLE_INTERRUPTS(CLBR_NONE)
1061 GET_THREAD_INFO(%rcx)
1064 LOCKDEP_SYS_EXIT_IRQ
1065 movl TI_flags(%rcx),%edx
1066 movl $_TIF_WORK_MASK,%edi
	/* There are two places in the kernel that can potentially fault with
	   usergs. Handle them here. The exception handlers after
	   iret run with kernel gs again, so don't set the user space flag.
	   B-stepping K8s sometimes report a truncated RIP for IRET
	   exceptions returning to compat mode. Check for these here too. */
1079 leaq irq_return(%rip),%rcx
1082 movl %ecx,%ecx /* zero extend */
1085 cmpq $gs_change,RIP(%rsp)
1088 KPROBE_END(error_entry)
1090 /* Reload gs selector with exception handling */
1091 /* edi: new selector */
1092 ENTRY(native_load_gs_index)
1095 CFI_ADJUST_CFA_OFFSET 8
1096 DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI))
1100 2: mfence /* workaround */
1103 CFI_ADJUST_CFA_OFFSET -8
1106 ENDPROC(native_load_gs_index)
1108 .section __ex_table,"a"
1110 .quad gs_change,bad_gs
1112 .section .fixup,"ax"
1113 /* running with kernelgs */
1115 SWAPGS /* switch back to user gs */
1122 * Create a kernel thread.
1124 * C extern interface:
1125 * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
1127 * asm input arguments:
1128 * rdi: fn, rsi: arg, rdx: flags
1130 ENTRY(kernel_thread)
1132 FAKE_STACK_FRAME $child_rip
1135 # rdi: flags, rsi: usp, rdx: will be &pt_regs
1137 orq kernel_thread_flags(%rip),%rdi
1150 * It isn't worth to check for reschedule here,
1151 * so internally to the x86_64 port you can rely on kernel_thread()
1152 * not to reschedule the child before returning, this avoids the need
1153 * of hacks for example to fork off the per-CPU idle tasks.
1154 * [Hopefully no generic code relies on the reschedule -AK]
1160 ENDPROC(kernel_thread)
1163 pushq $0 # fake return address
1166 * Here we are in the child and the registers are set as they were
1167 * at kernel_thread() invocation in the parent.
1175 ud2 # padding for call trace
1180 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
1182 * C extern interface:
1183 * extern long execve(char *name, char **argv, char **envp)
1185 * asm input arguments:
1186 * rdi: name, rsi: argv, rdx: envp
1188 * We want to fallback into:
1189 * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs)
1191 * do_sys_execve asm fallback arguments:
1192 * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
1194 ENTRY(kernel_execve)
1200 movq %rax, RAX(%rsp)
1203 je int_ret_from_sys_call
1208 ENDPROC(kernel_execve)
/* Page fault: CPU pushes an error code -> errorentry; kprobes-safe section. */
KPROBE_ENTRY(page_fault)
	errorentry do_page_fault
KPROBE_END(page_fault)

/* x87 coprocessor error: no error code -> zeroentry. */
ENTRY(coprocessor_error)
	zeroentry do_coprocessor_error
END(coprocessor_error)

/* SIMD floating-point exception: no error code -> zeroentry. */
ENTRY(simd_coprocessor_error)
	zeroentry do_simd_coprocessor_error
END(simd_coprocessor_error)

/* Device-not-available (FPU) trap: no error code -> zeroentry. */
ENTRY(device_not_available)
	zeroentry do_device_not_available
END(device_not_available)
1226 /* runs on exception stack */
1229 PARAVIRT_ADJUST_EXCEPTION_FRAME
1231 CFI_ADJUST_CFA_OFFSET 8
1232 paranoidentry do_debug, DEBUG_STACK
1236 /* runs on exception stack */
1239 PARAVIRT_ADJUST_EXCEPTION_FRAME
1241 CFI_ADJUST_CFA_OFFSET 8
1242 paranoidentry do_nmi, 0, 0
1243 #ifdef CONFIG_TRACE_IRQFLAGS
1253 PARAVIRT_ADJUST_EXCEPTION_FRAME
1255 CFI_ADJUST_CFA_OFFSET 8
1256 paranoidentry do_int3, DEBUG_STACK
1262 zeroentry do_overflow
1270 zeroentry do_invalid_op
1273 ENTRY(coprocessor_segment_overrun)
1274 zeroentry do_coprocessor_segment_overrun
1275 END(coprocessor_segment_overrun)
1277 /* runs on exception stack */
1280 PARAVIRT_ADJUST_EXCEPTION_FRAME
1281 paranoidentry do_double_fault
1287 errorentry do_invalid_TSS
/* Segment-not-present fault: CPU pushes an error code -> errorentry. */
ENTRY(segment_not_present)
	errorentry do_segment_not_present
END(segment_not_present)
1294 /* runs on exception stack */
1295 ENTRY(stack_segment)
1297 PARAVIRT_ADJUST_EXCEPTION_FRAME
1298 paranoidentry do_stack_segment
/* General protection fault: error code on stack -> errorentry; kprobes-safe. */
KPROBE_ENTRY(general_protection)
	errorentry do_general_protection
KPROBE_END(general_protection)

/* Alignment-check fault: error code on stack -> errorentry. */
ENTRY(alignment_check)
	errorentry do_alignment_check
END(alignment_check)
1312 zeroentry do_divide_error
/* Spurious-interrupt-bug trap: no error code -> zeroentry. */
ENTRY(spurious_interrupt_bug)
	zeroentry do_spurious_interrupt_bug
END(spurious_interrupt_bug)
1319 #ifdef CONFIG_X86_MCE
1320 /* runs on exception stack */
1321 ENTRY(machine_check)
1323 PARAVIRT_ADJUST_EXCEPTION_FRAME
1325 CFI_ADJUST_CFA_OFFSET 8
1326 paranoidentry do_machine_check
1332 /* Call softirq on interrupt stack. Interrupts are off. */
1336 CFI_ADJUST_CFA_OFFSET 8
1337 CFI_REL_OFFSET rbp,0
1339 CFI_DEF_CFA_REGISTER rbp
1340 incl %gs:pda_irqcount
1341 cmove %gs:pda_irqstackptr,%rsp
1342 push %rbp # backlink for old unwinder
1345 CFI_DEF_CFA_REGISTER rsp
1346 CFI_ADJUST_CFA_OFFSET -8
1347 decl %gs:pda_irqcount
1350 ENDPROC(call_softirq)
1352 KPROBE_ENTRY(ignore_sysret)
1357 ENDPROC(ignore_sysret)
/* Xen event-channel upcall: no error code -> zeroentry into the C dispatcher. */
ENTRY(xen_hypervisor_callback)
	zeroentry xen_do_hypervisor_callback
END(xen_hypervisor_callback)
# A note on the "critical region" in our callback handler.
# We want to avoid stacking callback handlers due to events occurring
# during handling of the last event. To do this, we keep events disabled
# until we've done all processing. HOWEVER, we must enable events before
# popping the stack frame (can't be done atomically) and so it would still
# be possible to get enough handler activations to overflow the stack.
# Although unlikely, bugs of that kind are hard to track down, so we'd
# like to avoid the possibility.
# So, on entry to the handler we detect whether we interrupted an
# existing activation in its critical region -- if so, we pop the current
# activation and restart the handler using the previous one.
1377 ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1379 /* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
1380 see the correct pointer to the pt_regs */
1381 movq %rdi, %rsp # we don't return, adjust the stack frame
1384 11: incl %gs:pda_irqcount
1386 CFI_DEF_CFA_REGISTER rbp
1387 cmovzq %gs:pda_irqstackptr,%rsp
1388 pushq %rbp # backlink for old unwinder
1389 call xen_evtchn_do_upcall
1391 CFI_DEF_CFA_REGISTER rsp
1392 decl %gs:pda_irqcount
1395 END(do_hypervisor_callback)
# The hypervisor uses this for application faults while it executes.
# We get here for two reasons:
#  1. Fault while reloading DS, ES, FS or GS
#  2. Fault while executing IRET
# Category 1 we do not need to fix up as Xen has already reloaded all segment
# registers that could be reloaded and zeroed the others.
# Category 2 we fix up by killing the current process. We cannot use the
# normal Linux return path in this case because if we use the IRET hypercall
# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
# We distinguish between categories by comparing each saved segment register
# with its current contents: any discrepancy means we are in category 1.
1410 ENTRY(xen_failsafe_callback)
1411 framesz = (RIP-0x30) /* workaround buggy gas */
1413 CFI_REL_OFFSET rcx, 0
1414 CFI_REL_OFFSET r11, 8
1428 /* All segments match their saved values => Category 2 (Bad IRET). */
1434 CFI_ADJUST_CFA_OFFSET -0x30
1436 CFI_ADJUST_CFA_OFFSET 8
1438 CFI_ADJUST_CFA_OFFSET 8
1440 CFI_ADJUST_CFA_OFFSET 8
1441 jmp general_protection
1443 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
1449 CFI_ADJUST_CFA_OFFSET -0x30
1451 CFI_ADJUST_CFA_OFFSET 8
1455 END(xen_failsafe_callback)
1457 #endif /* CONFIG_XEN */