x86/debug: Remove perpetually broken, unmaintainable dwarf annotations
[linux-2.6-block.git] / arch/x86/kernel/entry_64.S
/*
 *  linux/arch/x86_64/entry.S
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 */

/*
 * entry.S contains the system-call and fault low-level handling routines.
 *
 * Some of this is documented in Documentation/x86/entry_64.txt
 *
 * NOTE: This code handles signal-recognition, which happens every time
 * after an interrupt and after each system call.
 *
 * A note on terminology:
 * - iret frame: Architecture defined interrupt frame from SS to RIP
 *   at the top of the kernel process stack.
 *
 * Some macro usage:
 * - ENTRY/END	  Define functions in the symbol table.
 * - TRACE_IRQ_*  Trace hard interrupt state for lock debugging.
 * - idtentry	  Define exception entry points.
 */

#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/calling.h>
#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/unistd.h>
#include <asm/thread_info.h>
#include <asm/hw_irq.h>
#include <asm/page_types.h>
#include <asm/irqflags.h>
#include <asm/paravirt.h>
#include <asm/percpu.h>
#include <asm/asm.h>
#include <asm/context_tracking.h>
#include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <linux/err.h>

/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
#define AUDIT_ARCH_X86_64	(EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
#define __AUDIT_ARCH_64BIT	0x80000000
#define __AUDIT_ARCH_LE		0x40000000

	.code64
	.section .entry.text, "ax"


#ifdef CONFIG_PARAVIRT
ENTRY(native_usergs_sysret64)
	swapgs
	sysretq
ENDPROC(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT */


.macro TRACE_IRQS_IRETQ
#ifdef CONFIG_TRACE_IRQFLAGS
	bt	$9,EFLAGS(%rsp)	/* interrupts off? */
	jnc	1f
	TRACE_IRQS_ON
1:
#endif
.endm

/*
 * When dynamic function tracer is enabled it will add a breakpoint
 * to all locations that it is about to modify, sync CPUs, update
 * all the code, sync CPUs, then remove the breakpoints. In this time
 * if lockdep is enabled, it might jump back into the debug handler
 * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF).
 *
 * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
 * make sure the stack pointer does not get reset back to the top
 * of the debug stack, and instead just reuses the current stack.
 */
#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)

.macro TRACE_IRQS_OFF_DEBUG
	call	debug_stack_set_zero
	TRACE_IRQS_OFF
	call	debug_stack_reset
.endm

.macro TRACE_IRQS_ON_DEBUG
	call	debug_stack_set_zero
	TRACE_IRQS_ON
	call	debug_stack_reset
.endm

.macro TRACE_IRQS_IRETQ_DEBUG
	bt	$9,EFLAGS(%rsp)	/* interrupts off? */
	jnc	1f
	TRACE_IRQS_ON_DEBUG
1:
.endm

#else
# define TRACE_IRQS_OFF_DEBUG		TRACE_IRQS_OFF
# define TRACE_IRQS_ON_DEBUG		TRACE_IRQS_ON
# define TRACE_IRQS_IRETQ_DEBUG		TRACE_IRQS_IRETQ
#endif

/*
 * 64bit SYSCALL instruction entry. Up to 6 arguments in registers.
 *
 * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
 * then loads new ss, cs, and rip from previously programmed MSRs.
 * rflags gets masked by a value from another MSR (so CLD and CLAC
 * are not needed). SYSCALL does not save anything on the stack
 * and does not change rsp.
 *
 * Registers on entry:
 * rax  system call number
 * rcx  return address
 * r11  saved rflags (note: r11 is callee-clobbered register in C ABI)
 * rdi  arg0
 * rsi  arg1
 * rdx  arg2
 * r10  arg3 (needs to be moved to rcx to conform to C ABI)
 * r8   arg4
 * r9   arg5
 * (note: r12-r15,rbp,rbx are callee-preserved in C ABI)
 *
 * Only called from user space.
 *
 * When the user can change pt_regs->foo, always force IRET. That is because
 * IRET deals with non-canonical addresses better. SYSRET has trouble
 * with them due to bugs in both AMD and Intel CPUs.
 */
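/*
 * For illustration only (not part of the kernel): a minimal user-space
 * sketch of the convention described above, using write(2) as an example.
 * The buffer symbol and length below are made up; only the register roles
 * matter:
 *
 *	movl	$__NR_write, %eax	# system call number
 *	movl	$1, %edi		# arg0: fd
 *	leaq	msg(%rip), %rsi		# arg1: buf (hypothetical symbol)
 *	movl	$14, %edx		# arg2: count (made-up length)
 *	syscall				# CPU: rcx := rip, r11 := rflags
 *	# on return, rax holds the result; rcx/r11 are clobbered as noted
 */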

ENTRY(system_call)
	/*
	 * Interrupts are off on entry.
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */
	SWAPGS_UNSAFE_STACK
	/*
	 * A hypervisor implementation might want to use a label
	 * after the swapgs, so that it can do the swapgs
	 * for the guest and jump here on syscall.
	 */
GLOBAL(system_call_after_swapgs)

	movq	%rsp,PER_CPU_VAR(rsp_scratch)
	movq	PER_CPU_VAR(cpu_current_top_of_stack),%rsp

	/* Construct struct pt_regs on stack */
	pushq	$__USER_DS			/* pt_regs->ss */
	pushq	PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
	/*
	 * Re-enable interrupts.
	 * We use 'rsp_scratch' as a scratch space, hence irq-off block above
	 * must execute atomically in the face of possible interrupt-driven
	 * task preemption. We must enable interrupts only after we're done
	 * with using rsp_scratch:
	 */
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq	%r11				/* pt_regs->flags */
	pushq	$__USER_CS			/* pt_regs->cs */
	pushq	%rcx				/* pt_regs->ip */
	pushq	%rax				/* pt_regs->orig_ax */
	pushq	%rdi				/* pt_regs->di */
	pushq	%rsi				/* pt_regs->si */
	pushq	%rdx				/* pt_regs->dx */
	pushq	%rcx				/* pt_regs->cx */
	pushq	$-ENOSYS			/* pt_regs->ax */
	pushq	%r8				/* pt_regs->r8 */
	pushq	%r9				/* pt_regs->r9 */
	pushq	%r10				/* pt_regs->r10 */
	pushq	%r11				/* pt_regs->r11 */
	sub	$(6*8),%rsp			/* pt_regs->bp,bx,r12-15 not saved */
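	/*
	 * Illustration (derived from the pushes above, 8-byte slots): the
	 * frame now matches struct pt_regs, with %rsp pointing at the r15
	 * slot.  0..40(%rsp) are the six unwritten extra-reg slots, then
	 * r11,r10,r9,r8, ax (-ENOSYS), cx,dx,si,di, orig_ax (syscall nr),
	 * ip (user rcx), cs (__USER_CS), flags (user r11), sp, and finally
	 * ss at 160(%rsp).
	 */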

	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	tracesys
system_call_fastpath:
#if __SYSCALL_MASK == ~0
	cmpq	$__NR_syscall_max,%rax
#else
	andl	$__SYSCALL_MASK,%eax
	cmpl	$__NR_syscall_max,%eax
#endif
	ja	1f	/* return -ENOSYS (already in pt_regs->ax) */
	movq	%r10,%rcx
	call	*sys_call_table(,%rax,8)
	movq	%rax,RAX(%rsp)
1:
/*
 * Syscall return path ending with SYSRET (fast path).
 * Has incompletely filled pt_regs.
 */
	LOCKDEP_SYS_EXIT
	/*
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */
	DISABLE_INTERRUPTS(CLBR_NONE)

	/*
	 * We must check ti flags with interrupts (or at least preemption)
	 * off because we must *never* return to userspace without
	 * processing exit work that is enqueued if we're preempted here.
	 * In particular, returning to userspace with any of the one-shot
	 * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
	 * very bad.
	 */
	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	int_ret_from_sys_call_irqs_off	/* Go to the slow path */

	RESTORE_C_REGS_EXCEPT_RCX_R11
	movq	RIP(%rsp),%rcx
	movq	EFLAGS(%rsp),%r11
	movq	RSP(%rsp),%rsp
	/*
	 * 64bit SYSRET restores rip from rcx,
	 * rflags from r11 (but RF and VM bits are forced to 0),
	 * cs and ss are loaded from MSRs.
	 * Restoration of rflags re-enables interrupts.
	 *
	 * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
	 * descriptor is not reinitialized. This means that we should
	 * avoid SYSRET with SS == NULL, which could happen if we schedule,
	 * exit the kernel, and re-enter using an interrupt vector. (All
	 * interrupt entries on x86_64 set SS to NULL.) We prevent that
	 * from happening by reloading SS in __switch_to. (Actually
	 * detecting the failure in 64-bit userspace is tricky but can be
	 * done.)
	 */
	USERGS_SYSRET64

	/* Do syscall entry tracing */
tracesys:
	movq	%rsp, %rdi
	movl	$AUDIT_ARCH_X86_64, %esi
	call	syscall_trace_enter_phase1
	test	%rax, %rax
	jnz	tracesys_phase2		/* if needed, run the slow path */
	RESTORE_C_REGS_EXCEPT_RAX	/* else restore clobbered regs */
	movq	ORIG_RAX(%rsp), %rax
	jmp	system_call_fastpath	/* and return to the fast path */

tracesys_phase2:
	SAVE_EXTRA_REGS
	movq	%rsp, %rdi
	movl	$AUDIT_ARCH_X86_64, %esi
	movq	%rax,%rdx
	call	syscall_trace_enter_phase2

	/*
	 * Reload registers from stack in case ptrace changed them.
	 * We don't reload %rax because syscall_trace_enter_phase2() returned
	 * the value it wants us to use in the table lookup.
	 */
	RESTORE_C_REGS_EXCEPT_RAX
	RESTORE_EXTRA_REGS
#if __SYSCALL_MASK == ~0
	cmpq	$__NR_syscall_max,%rax
#else
	andl	$__SYSCALL_MASK,%eax
	cmpl	$__NR_syscall_max,%eax
#endif
	ja	1f	/* return -ENOSYS (already in pt_regs->ax) */
	movq	%r10,%rcx	/* fixup for C */
	call	*sys_call_table(,%rax,8)
	movq	%rax,RAX(%rsp)
1:
	/* Use IRET because user could have changed pt_regs->foo */

/*
 * Syscall return path ending with IRET.
 * Has correct iret frame.
 */
GLOBAL(int_ret_from_sys_call)
	DISABLE_INTERRUPTS(CLBR_NONE)
int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */
	TRACE_IRQS_OFF
	movl	$_TIF_ALLWORK_MASK,%edi
	/* edi: mask to check */
GLOBAL(int_with_check)
	LOCKDEP_SYS_EXIT_IRQ
	GET_THREAD_INFO(%rcx)
	movl	TI_flags(%rcx),%edx
	andl	%edi,%edx
	jnz	int_careful
	andl	$~TS_COMPAT,TI_status(%rcx)
	jmp	syscall_return

	/* Either reschedule or signal or syscall exit tracking needed. */
	/* First do a reschedule test. */
	/* edx: work, edi: workmask */
int_careful:
	bt	$TIF_NEED_RESCHED,%edx
	jnc	int_very_careful
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq	%rdi
	SCHEDULE_USER
	popq	%rdi
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp	int_with_check

	/* handle signals and tracing -- both require a full pt_regs */
int_very_careful:
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	SAVE_EXTRA_REGS
	/* Check for syscall exit trace */
	testl	$_TIF_WORK_SYSCALL_EXIT,%edx
	jz	int_signal
	pushq	%rdi
	leaq	8(%rsp),%rdi		# &ptregs -> arg1
	call	syscall_trace_leave
	popq	%rdi
	andl	$~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
	jmp	int_restore_rest

int_signal:
	testl	$_TIF_DO_NOTIFY_MASK,%edx
	jz	1f
	movq	%rsp,%rdi		# &ptregs -> arg1
	xorl	%esi,%esi		# oldset -> arg2
	call	do_notify_resume
1:	movl	$_TIF_WORK_MASK,%edi
int_restore_rest:
	RESTORE_EXTRA_REGS
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp	int_with_check

syscall_return:
	/* The IRETQ could re-enable interrupts: */
	DISABLE_INTERRUPTS(CLBR_ANY)
	TRACE_IRQS_IRETQ

	/*
	 * Try to use SYSRET instead of IRET if we're returning to
	 * a completely clean 64-bit userspace context.
	 */
	movq	RCX(%rsp),%rcx
	movq	RIP(%rsp),%r11
	cmpq	%rcx,%r11	/* RCX == RIP */
	jne	opportunistic_sysret_failed

	/*
	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
	 * in kernel space. This essentially lets the user take over
	 * the kernel, since userspace controls RSP.
	 *
	 * If width of "canonical tail" ever becomes variable, this will need
	 * to be updated to remain correct on both old and new CPUs.
	 */
	.ifne __VIRTUAL_MASK_SHIFT - 47
	.error "virtual address width changed -- SYSRET checks need update"
	.endif
	/* Change top 16 bits to be the sign-extension of 47th bit */
	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
	/* If this changed %rcx, it was not canonical */
	cmpq	%rcx, %r11
	jne	opportunistic_sysret_failed
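	/*
	 * Worked example of the check above (for illustration only): with
	 * __VIRTUAL_MASK_SHIFT == 47, shl/sar shift by 16.  A canonical
	 * return address such as 0x00007f0000001000 survives the shl+sar
	 * round trip unchanged, so the cmpq passes.  A non-canonical value
	 * such as 0x0000800000000000 becomes 0xffff800000000000, the
	 * compare fails, and we fall back to IRET.
	 */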

	cmpq	$__USER_CS,CS(%rsp)	/* CS must match SYSRET */
	jne	opportunistic_sysret_failed

	movq	R11(%rsp),%r11
	cmpq	%r11,EFLAGS(%rsp)	/* R11 == RFLAGS */
	jne	opportunistic_sysret_failed

	/*
	 * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
	 * restoring TF results in a trap from userspace immediately after
	 * SYSRET. This would cause an infinite loop whenever #DB happens
	 * with register state that satisfies the opportunistic SYSRET
	 * conditions. For example, single-stepping this user code:
	 *
	 *           movq $stuck_here,%rcx
	 *           pushfq
	 *           popq %r11
	 *   stuck_here:
	 *
	 * would never get past 'stuck_here'.
	 */
	testq	$(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
	jnz	opportunistic_sysret_failed

	/* nothing to check for RSP */

	cmpq	$__USER_DS,SS(%rsp)	/* SS must match SYSRET */
	jne	opportunistic_sysret_failed

	/*
	 * We win! This label is here just for ease of understanding
	 * perf profiles. Nothing jumps here.
	 */
syscall_return_via_sysret:
	/* rcx and r11 are already restored (see code above) */
	RESTORE_C_REGS_EXCEPT_RCX_R11
	movq	RSP(%rsp),%rsp
	USERGS_SYSRET64

opportunistic_sysret_failed:
	SWAPGS
	jmp	restore_c_regs_and_iret
END(system_call)


	.macro FORK_LIKE func
ENTRY(stub_\func)
	SAVE_EXTRA_REGS 8
	jmp	sys_\func
END(stub_\func)
	.endm

	FORK_LIKE  clone
	FORK_LIKE  fork
	FORK_LIKE  vfork

ENTRY(stub_execve)
	call	sys_execve
return_from_execve:
	testl	%eax, %eax
	jz	1f
	/* exec failed, can use fast SYSRET code path in this case */
	ret
1:
	/* must use IRET code path (pt_regs->cs may have changed) */
	addq	$8, %rsp
	ZERO_EXTRA_REGS
	movq	%rax,RAX(%rsp)
	jmp	int_ret_from_sys_call
END(stub_execve)
/*
 * Remaining execve stubs are only 7 bytes long.
 * ENTRY() often aligns to 16 bytes, which in this case has no benefits.
 */
	.align	8
GLOBAL(stub_execveat)
	call	sys_execveat
	jmp	return_from_execve
END(stub_execveat)

#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION)
	.align	8
GLOBAL(stub_x32_execve)
GLOBAL(stub32_execve)
	call	compat_sys_execve
	jmp	return_from_execve
END(stub32_execve)
END(stub_x32_execve)
	.align	8
GLOBAL(stub_x32_execveat)
GLOBAL(stub32_execveat)
	call	compat_sys_execveat
	jmp	return_from_execve
END(stub32_execveat)
END(stub_x32_execveat)
#endif

/*
 * sigreturn is special because it needs to restore all registers on return.
 * This cannot be done with SYSRET, so use the IRET return path instead.
 */
ENTRY(stub_rt_sigreturn)
	/*
	 * SAVE_EXTRA_REGS result is not normally needed:
	 * sigreturn overwrites all pt_regs->GPREGS.
	 * But sigreturn can fail (!), and there is no easy way to detect that.
	 * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error,
	 * we SAVE_EXTRA_REGS here.
	 */
	SAVE_EXTRA_REGS 8
	call	sys_rt_sigreturn
return_from_stub:
	addq	$8, %rsp
	RESTORE_EXTRA_REGS
	movq	%rax,RAX(%rsp)
	jmp	int_ret_from_sys_call
END(stub_rt_sigreturn)

#ifdef CONFIG_X86_X32_ABI
ENTRY(stub_x32_rt_sigreturn)
	SAVE_EXTRA_REGS 8
	call	sys32_x32_rt_sigreturn
	jmp	return_from_stub
END(stub_x32_rt_sigreturn)
#endif

/*
 * A newly forked process directly context switches into this address.
 *
 * rdi: prev task we switched from
 */
ENTRY(ret_from_fork)

	LOCK ; btr $TIF_FORK,TI_flags(%r8)

	pushq	$0x0002
	popfq				# reset kernel eflags

	call	schedule_tail		# rdi: 'prev' task parameter

	RESTORE_EXTRA_REGS

	testb	$3, CS(%rsp)		# from kernel_thread?

	/*
	 * By the time we get here, we have no idea whether our pt_regs,
	 * ti flags, and ti status came from the 64-bit SYSCALL fast path,
	 * the slow path, or one of the ia32entry paths.
	 * Use IRET code path to return, since it can safely handle
	 * all of the above.
	 */
	jnz	int_ret_from_sys_call

	/* We came from kernel_thread */
	/* nb: we depend on RESTORE_EXTRA_REGS above */
	movq	%rbp, %rdi
	call	*%rbx
	movl	$0, RAX(%rsp)
	RESTORE_EXTRA_REGS
	jmp	int_ret_from_sys_call
END(ret_from_fork)

/*
 * Build the entry stubs with some assembler magic.
 * We pack 1 stub into every 8-byte block.
 */
	.align 8
ENTRY(irq_entries_start)
    vector=FIRST_EXTERNAL_VECTOR
    .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
	pushq	$(~vector+0x80)	/* Note: always in signed byte range */
    vector=vector+1
	jmp	common_interrupt
	.align	8
    .endr
END(irq_entries_start)
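/*
 * For illustration: the odd (~vector+0x80) encoding keeps every pushq a
 * two-byte push-imm8, so each stub stays within its 8-byte slot.  With
 * vector 0x21, for example, ~0x21+0x80 = 0x5e is pushed here, and
 * common_interrupt's "addq $-0x80,(%rsp)" turns it into 0x5e-0x80 = -34,
 * i.e. ~0x21, which the C handler can invert (~x) to recover the vector.
 */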

/*
 * Interrupt entry/exit.
 *
 * Interrupt entry points save only callee clobbered registers in fast path.
 *
 * Entry runs with interrupts off.
 */

/* 0(%rsp): ~(interrupt number) */
	.macro interrupt func
	cld
	/*
	 * Since nothing in interrupt handling code touches r12...r15 members
	 * of "struct pt_regs", and since interrupts can nest, we can save
	 * four stack slots and simultaneously provide
	 * an unwind-friendly stack layout by saving "truncated" pt_regs
	 * exactly up to rbp slot, without these members.
	 */
	ALLOC_PT_GPREGS_ON_STACK -RBP
	SAVE_C_REGS -RBP
	/* this goes to 0(%rsp) for unwinder, not for saving the value: */
	SAVE_EXTRA_REGS_RBP -RBP

	leaq	-RBP(%rsp),%rdi	/* arg1 for \func (pointer to pt_regs) */

	testb	$3, CS-RBP(%rsp)
	jz	1f
	SWAPGS
1:
	/*
	 * Save previous stack pointer, optionally switch to interrupt stack.
	 * irq_count is used to check if a CPU is already on an interrupt stack
	 * or not. While this is essentially redundant with preempt_count it is
	 * a little cheaper to use a separate counter in the PDA (short of
	 * moving irq_enter into assembly, which would be too much work)
	 */
	movq	%rsp, %rsi
	incl	PER_CPU_VAR(irq_count)
	cmovzq	PER_CPU_VAR(irq_stack_ptr),%rsp
	pushq	%rsi
	/* We entered an interrupt context - irqs are off: */
	TRACE_IRQS_OFF

	call	\func
	.endm
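/*
 * A note on the incl/cmovzq pair above (this assumes the per-CPU irq_count
 * is initialized to -1, as elsewhere in the x86 code of this era): the
 * first, non-nested entry increments it to 0, which sets ZF, so cmovzq
 * switches %rsp to the per-CPU interrupt stack; nested interrupts see a
 * non-zero result and keep running on the stack they arrived on.
 */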

	/*
	 * The interrupt stubs push (~vector+0x80) onto the stack and
	 * then jump to common_interrupt.
	 */
	.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
	ASM_CLAC
	addq	$-0x80,(%rsp)		/* Adjust vector to [-256,-1] range */
	interrupt do_IRQ
	/* 0(%rsp): old RSP */
ret_from_intr:
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	decl	PER_CPU_VAR(irq_count)

	/* Restore saved previous stack */
	popq	%rsi
	/* return code expects complete pt_regs - adjust rsp accordingly: */
	leaq	-RBP(%rsi),%rsp

	testb	$3, CS(%rsp)
	jz	retint_kernel
	/* Interrupt came from user space */

	GET_THREAD_INFO(%rcx)
	/*
	 * %rcx: thread info. Interrupts off.
	 */
retint_with_reschedule:
	movl	$_TIF_WORK_MASK,%edi
retint_check:
	LOCKDEP_SYS_EXIT_IRQ
	movl	TI_flags(%rcx),%edx
	andl	%edi,%edx
	jnz	retint_careful

retint_swapgs:		/* return to user-space */
	/*
	 * The iretq could re-enable interrupts:
	 */
	DISABLE_INTERRUPTS(CLBR_ANY)
	TRACE_IRQS_IRETQ

	SWAPGS
	jmp	restore_c_regs_and_iret

/* Returning to kernel space */
retint_kernel:
#ifdef CONFIG_PREEMPT
	/* Interrupts are off */
	/* Check if we need preemption */
	bt	$9,EFLAGS(%rsp)	/* interrupts were off? */
	jnc	1f
0:	cmpl	$0,PER_CPU_VAR(__preempt_count)
	jnz	1f
	call	preempt_schedule_irq
	jmp	0b
1:
#endif
	/*
	 * The iretq could re-enable interrupts:
	 */
	TRACE_IRQS_IRETQ

/*
 * At this label, code paths which return to kernel and to user,
 * which come from interrupts/exception and from syscalls, merge.
 */
restore_c_regs_and_iret:
	RESTORE_C_REGS
	REMOVE_PT_GPREGS_FROM_STACK 8

irq_return:
	INTERRUPT_RETURN

ENTRY(native_iret)
	/*
	 * Are we returning to a stack segment from the LDT? Note: in
	 * 64-bit mode SS:RSP on the exception stack is always valid.
	 */
#ifdef CONFIG_X86_ESPFIX64
	testb	$4,(SS-RIP)(%rsp)
	jnz	native_irq_return_ldt
#endif

.global native_irq_return_iret
native_irq_return_iret:
	/*
	 * This may fault. Non-paranoid faults on return to userspace are
	 * handled by fixup_bad_iret. These include #SS, #GP, and #NP.
	 * Double-faults due to espfix64 are handled in do_double_fault.
	 * Other faults here are fatal.
	 */
	iretq

#ifdef CONFIG_X86_ESPFIX64
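/*
 * Illustrative summary of the sequence below (the espfix64 workaround):
 * the five-word iret frame is copied onto this CPU's "espfix" stack via
 * its writable alias espfix_waddr, %rsp is pointed at that copy, and the
 * iret is retried from there.  That way the high bits of the kernel %rsp
 * that an iret to a 16-bit LDT SS would leak to userspace point into the
 * espfix area rather than the real kernel stack.
 */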
native_irq_return_ldt:
	pushq	%rax
	pushq	%rdi
	SWAPGS
	movq	PER_CPU_VAR(espfix_waddr),%rdi
	movq	%rax,(0*8)(%rdi)	/* RAX */
	movq	(2*8)(%rsp),%rax	/* RIP */
	movq	%rax,(1*8)(%rdi)
	movq	(3*8)(%rsp),%rax	/* CS */
	movq	%rax,(2*8)(%rdi)
	movq	(4*8)(%rsp),%rax	/* RFLAGS */
	movq	%rax,(3*8)(%rdi)
	movq	(6*8)(%rsp),%rax	/* SS */
	movq	%rax,(5*8)(%rdi)
	movq	(5*8)(%rsp),%rax	/* RSP */
	movq	%rax,(4*8)(%rdi)
	andl	$0xffff0000,%eax
	popq	%rdi
	orq	PER_CPU_VAR(espfix_stack),%rax
	SWAPGS
	movq	%rax,%rsp
	popq	%rax
	jmp	native_irq_return_iret
#endif

	/* edi: workmask, edx: work */
retint_careful:
	bt	$TIF_NEED_RESCHED,%edx
	jnc	retint_signal
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq	%rdi
	SCHEDULE_USER
	popq	%rdi
	GET_THREAD_INFO(%rcx)
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp	retint_check

retint_signal:
	testl	$_TIF_DO_NOTIFY_MASK,%edx
	jz	retint_swapgs
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	SAVE_EXTRA_REGS
	movq	$-1,ORIG_RAX(%rsp)
	xorl	%esi,%esi		# oldset
	movq	%rsp,%rdi		# &pt_regs
	call	do_notify_resume
	RESTORE_EXTRA_REGS
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	GET_THREAD_INFO(%rcx)
	jmp	retint_with_reschedule

END(common_interrupt)

/*
 * APIC interrupts.
 */
.macro apicinterrupt3 num sym do_sym
ENTRY(\sym)
	ASM_CLAC
	pushq	$~(\num)
.Lcommon_\sym:
	interrupt \do_sym
	jmp	ret_from_intr
END(\sym)
.endm

#ifdef CONFIG_TRACING
#define trace(sym) trace_##sym
#define smp_trace(sym) smp_trace_##sym

.macro trace_apicinterrupt num sym
apicinterrupt3 \num trace(\sym) smp_trace(\sym)
.endm
#else
.macro trace_apicinterrupt num sym do_sym
.endm
#endif

.macro apicinterrupt num sym do_sym
apicinterrupt3 \num \sym \do_sym
trace_apicinterrupt \num \sym
.endm

#ifdef CONFIG_SMP
apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR \
	irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
apicinterrupt3 REBOOT_VECTOR \
	reboot_interrupt smp_reboot_interrupt
#endif

#ifdef CONFIG_X86_UV
apicinterrupt3 UV_BAU_MESSAGE \
	uv_bau_message_intr1 uv_bau_message_interrupt
#endif
apicinterrupt LOCAL_TIMER_VECTOR \
	apic_timer_interrupt smp_apic_timer_interrupt
apicinterrupt X86_PLATFORM_IPI_VECTOR \
	x86_platform_ipi smp_x86_platform_ipi

#ifdef CONFIG_HAVE_KVM
apicinterrupt3 POSTED_INTR_VECTOR \
	kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
#endif

#ifdef CONFIG_X86_MCE_THRESHOLD
apicinterrupt THRESHOLD_APIC_VECTOR \
	threshold_interrupt smp_threshold_interrupt
#endif

#ifdef CONFIG_X86_THERMAL_VECTOR
apicinterrupt THERMAL_APIC_VECTOR \
	thermal_interrupt smp_thermal_interrupt
#endif

#ifdef CONFIG_SMP
apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
	call_function_single_interrupt smp_call_function_single_interrupt
apicinterrupt CALL_FUNCTION_VECTOR \
	call_function_interrupt smp_call_function_interrupt
apicinterrupt RESCHEDULE_VECTOR \
	reschedule_interrupt smp_reschedule_interrupt
#endif

apicinterrupt ERROR_APIC_VECTOR \
	error_interrupt smp_error_interrupt
apicinterrupt SPURIOUS_APIC_VECTOR \
	spurious_interrupt smp_spurious_interrupt

#ifdef CONFIG_IRQ_WORK
apicinterrupt IRQ_WORK_VECTOR \
	irq_work_interrupt smp_irq_work_interrupt
#endif

/*
 * Exception entry points.
 */
#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)

.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
	/* Sanity check */
	.if \shift_ist != -1 && \paranoid == 0
	.error "using shift_ist requires paranoid=1"
	.endif

	ASM_CLAC
	PARAVIRT_ADJUST_EXCEPTION_FRAME

	.ifeq \has_error_code
	pushq	$-1			/* ORIG_RAX: no syscall to restart */
	.endif

	ALLOC_PT_GPREGS_ON_STACK

	.if \paranoid
	.if \paranoid == 1
	testb	$3, CS(%rsp)		/* If coming from userspace, switch */
	jnz	1f			/* stacks. */
	.endif
	call	paranoid_entry
	.else
	call	error_entry
	.endif
	/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */

	.if \paranoid
	.if \shift_ist != -1
	TRACE_IRQS_OFF_DEBUG		/* reload IDT in case of recursion */
	.else
	TRACE_IRQS_OFF
	.endif
	.endif

	movq	%rsp,%rdi		/* pt_regs pointer */

	.if \has_error_code
	movq	ORIG_RAX(%rsp),%rsi	/* get error code */
	movq	$-1,ORIG_RAX(%rsp)	/* no syscall to restart */
	.else
	xorl	%esi,%esi		/* no error code */
	.endif

	.if \shift_ist != -1
	subq	$EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
	.endif

	call	\do_sym

	.if \shift_ist != -1
	addq	$EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
	.endif

	/* these procedures expect "no swapgs" flag in ebx */
	.if \paranoid
	jmp	paranoid_exit
	.else
	jmp	error_exit
	.endif

	.if \paranoid == 1
	/*
	 * Paranoid entry from userspace. Switch stacks and treat it
	 * as a normal entry. This means that paranoid handlers
	 * run in real process context if user_mode(regs).
	 */
1:
	call	error_entry


	movq	%rsp,%rdi		/* pt_regs pointer */
	call	sync_regs
	movq	%rax,%rsp		/* switch stack */

	movq	%rsp,%rdi		/* pt_regs pointer */

	.if \has_error_code
	movq	ORIG_RAX(%rsp),%rsi	/* get error code */
	movq	$-1,ORIG_RAX(%rsp)	/* no syscall to restart */
	.else
	xorl	%esi,%esi		/* no error code */
	.endif

	call	\do_sym

	jmp	error_exit		/* %ebx: no swapgs flag */
	.endif
END(\sym)
.endm

#ifdef CONFIG_TRACING
.macro trace_idtentry sym do_sym has_error_code:req
idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code
idtentry \sym \do_sym has_error_code=\has_error_code
.endm
#else
.macro trace_idtentry sym do_sym has_error_code:req
idtentry \sym \do_sym has_error_code=\has_error_code
.endm
#endif

idtentry divide_error do_divide_error has_error_code=0
idtentry overflow do_overflow has_error_code=0
idtentry bounds do_bounds has_error_code=0
idtentry invalid_op do_invalid_op has_error_code=0
idtentry device_not_available do_device_not_available has_error_code=0
idtentry double_fault do_double_fault has_error_code=1 paranoid=2
idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
idtentry invalid_TSS do_invalid_TSS has_error_code=1
idtentry segment_not_present do_segment_not_present has_error_code=1
idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0
idtentry coprocessor_error do_coprocessor_error has_error_code=0
idtentry alignment_check do_alignment_check has_error_code=1
idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0


	/* Reload gs selector with exception handling */
	/* edi:  new selector */
ENTRY(native_load_gs_index)
	pushfq
	DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
	SWAPGS
gs_change:
	movl	%edi,%gs
2:	mfence		/* workaround */
	SWAPGS
	popfq
	ret
END(native_load_gs_index)

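/*
 * A short note on the fixup below: _ASM_EXTABLE records an exception
 * table entry for the "movl %edi,%gs" at gs_change, so a fault there
 * (e.g. #GP on a bogus selector) resumes at bad_gs, which falls back to
 * loading a null %gs selector and then rejoins the normal path at 2b.
 */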
	_ASM_EXTABLE(gs_change,bad_gs)
	.section .fixup,"ax"
	/* running with kernelgs */
bad_gs:
	SWAPGS			/* switch back to user gs */
	xorl	%eax,%eax
	movl	%eax,%gs
	jmp	2b
	.previous

/* Call softirq on interrupt stack. Interrupts are off. */
ENTRY(do_softirq_own_stack)
	pushq	%rbp
	mov	%rsp,%rbp
	incl	PER_CPU_VAR(irq_count)
	cmove	PER_CPU_VAR(irq_stack_ptr),%rsp
	push	%rbp		# backlink for old unwinder
	call	__do_softirq
	leaveq
	decl	PER_CPU_VAR(irq_count)
	ret
END(do_softirq_own_stack)

#ifdef CONFIG_XEN
idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0

/*
 * A note on the "critical region" in our callback handler.
 * We want to avoid stacking callback handlers due to events occurring
 * during handling of the last event. To do this, we keep events disabled
 * until we've done all processing. HOWEVER, we must enable events before
 * popping the stack frame (can't be done atomically) and so it would still
 * be possible to get enough handler activations to overflow the stack.
 * Although unlikely, bugs of that kind are hard to track down, so we'd
 * like to avoid the possibility.
 * So, on entry to the handler we detect whether we interrupted an
 * existing activation in its critical region -- if so, we pop the current
 * activation and restart the handler using the previous one.
 */
ENTRY(xen_do_hypervisor_callback)	# do_hypervisor_callback(struct *pt_regs)
/*
 * Since we don't modify %rdi, xen_evtchn_do_upcall(struct *pt_regs) will
 * see the correct pointer to the pt_regs
 */
	movq	%rdi, %rsp		# we don't return, adjust the stack frame
11:	incl	PER_CPU_VAR(irq_count)
	movq	%rsp,%rbp
	cmovzq	PER_CPU_VAR(irq_stack_ptr),%rsp
	pushq	%rbp			# backlink for old unwinder
	call	xen_evtchn_do_upcall
	popq	%rsp
	decl	PER_CPU_VAR(irq_count)
#ifndef CONFIG_PREEMPT
	call	xen_maybe_preempt_hcall
#endif
	jmp	error_exit
END(xen_do_hypervisor_callback)

/*
 * Hypervisor uses this for application faults while it executes.
 * We get here for two reasons:
 *  1. Fault while reloading DS, ES, FS or GS
 *  2. Fault while executing IRET
 * Category 1 we do not need to fix up as Xen has already reloaded all segment
 * registers that could be reloaded and zeroed the others.
 * Category 2 we fix up by killing the current process. We cannot use the
 * normal Linux return path in this case because if we use the IRET hypercall
 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
 * We distinguish between categories by comparing each saved segment register
 * with its current contents: any discrepancy means we are in category 1.
 */
ENTRY(xen_failsafe_callback)
	movl	%ds,%ecx
	cmpw	%cx,0x10(%rsp)
	jne	1f
	movl	%es,%ecx
	cmpw	%cx,0x18(%rsp)
	jne	1f
	movl	%fs,%ecx
	cmpw	%cx,0x20(%rsp)
	jne	1f
	movl	%gs,%ecx
	cmpw	%cx,0x28(%rsp)
	jne	1f
	/* All segments match their saved values => Category 2 (Bad IRET). */
	movq	(%rsp),%rcx
	movq	8(%rsp),%r11
	addq	$0x30,%rsp
	pushq	$0			/* RIP */
	pushq	%r11
	pushq	%rcx
	jmp	general_protection
1:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
	movq	(%rsp),%rcx
	movq	8(%rsp),%r11
	addq	$0x30,%rsp
	pushq	$-1			/* orig_ax = -1 => not a system call */
	ALLOC_PT_GPREGS_ON_STACK
	SAVE_C_REGS
	SAVE_EXTRA_REGS
	jmp	error_exit
END(xen_failsafe_callback)

apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
	xen_hvm_callback_vector xen_evtchn_do_upcall

#endif /* CONFIG_XEN */

#if IS_ENABLED(CONFIG_HYPERV)
apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
	hyperv_callback_vector hyperv_vector_handler
#endif /* CONFIG_HYPERV */

idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
idtentry stack_segment do_stack_segment has_error_code=1
#ifdef CONFIG_XEN
idtentry xen_debug do_debug has_error_code=0
idtentry xen_int3 do_int3 has_error_code=0
idtentry xen_stack_segment do_stack_segment has_error_code=1
#endif
idtentry general_protection do_general_protection has_error_code=1
trace_idtentry page_fault do_page_fault has_error_code=1
#ifdef CONFIG_KVM_GUEST
idtentry async_page_fault do_async_page_fault has_error_code=1
#endif
#ifdef CONFIG_X86_MCE
idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
#endif

/*
 * Save all registers in pt_regs, and switch gs if needed.
 * Use slow, but surefire "are we in kernel?" check.
 * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
 */
ENTRY(paranoid_entry)
	cld
	SAVE_C_REGS 8
	SAVE_EXTRA_REGS 8
	movl	$1,%ebx
	movl	$MSR_GS_BASE,%ecx
	rdmsr
	testl	%edx,%edx
	js	1f	/* negative -> in kernel */
	SWAPGS
	xorl	%ebx,%ebx
1:	ret
END(paranoid_entry)
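/*
 * For illustration: rdmsr of MSR_GS_BASE returns the base in edx:eax.
 * If the kernel already owns GS, the base is a kernel address such as
 * (hypothetically) 0xffff880000000000, so %edx = 0xffff8800 has its sign
 * bit set and the js above skips SWAPGS; a user GS base is a canonical
 * user address, %edx is non-negative, and we swap.
 */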

/*
 * "Paranoid" exit path from exception stack. This is invoked
 * only on return from non-NMI IST interrupts that came
 * from kernel space.
 *
 * We may be returning to very strange contexts (e.g. very early
 * in syscall entry), so checking for preemption here would
 * be complicated. Fortunately, there's no good reason
 * to try to handle preemption here.
 */
/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
ENTRY(paranoid_exit)
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF_DEBUG
	testl	%ebx,%ebx			/* swapgs needed? */
	jnz	paranoid_exit_no_swapgs
	TRACE_IRQS_IRETQ
	SWAPGS_UNSAFE_STACK
	jmp	paranoid_exit_restore
paranoid_exit_no_swapgs:
	TRACE_IRQS_IRETQ_DEBUG
paranoid_exit_restore:
	RESTORE_EXTRA_REGS
	RESTORE_C_REGS
	REMOVE_PT_GPREGS_FROM_STACK 8
	INTERRUPT_RETURN
END(paranoid_exit)

/*
 * Save all registers in pt_regs, and switch gs if needed.
 * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
 */
ENTRY(error_entry)
	cld
	SAVE_C_REGS 8
	SAVE_EXTRA_REGS 8
	xorl	%ebx,%ebx
	testb	$3, CS+8(%rsp)
	jz	error_kernelspace
error_swapgs:
	SWAPGS
error_sti:
	TRACE_IRQS_OFF
	ret

	/*
	 * There are two places in the kernel that can potentially fault with
	 * usergs. Handle them here. B stepping K8s sometimes report a
	 * truncated RIP for IRET exceptions returning to compat mode. Check
	 * for these here too.
	 */
error_kernelspace:
	incl	%ebx
	leaq	native_irq_return_iret(%rip),%rcx
	cmpq	%rcx,RIP+8(%rsp)
	je	error_bad_iret
	movl	%ecx,%eax	/* zero extend */
	cmpq	%rax,RIP+8(%rsp)
	je	bstep_iret
	cmpq	$gs_change,RIP+8(%rsp)
	je	error_swapgs
	jmp	error_sti

bstep_iret:
	/* Fix truncated RIP */
	movq	%rcx,RIP+8(%rsp)
	/* fall through */

error_bad_iret:
	SWAPGS
	mov	%rsp,%rdi
	call	fixup_bad_iret
	mov	%rax,%rsp
	decl	%ebx	/* Return to usergs */
	jmp	error_sti
END(error_entry)


/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
ENTRY(error_exit)
	movl	%ebx,%eax
	RESTORE_EXTRA_REGS
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	GET_THREAD_INFO(%rcx)
	testl	%eax,%eax
	jnz	retint_kernel
	LOCKDEP_SYS_EXIT_IRQ
	movl	TI_flags(%rcx),%edx
	movl	$_TIF_WORK_MASK,%edi
	andl	%edi,%edx
	jnz	retint_careful
	jmp	retint_swapgs
END(error_exit)

/* Runs on exception stack */
ENTRY(nmi)
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	/*
	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
	 * the iretq it performs will take us out of NMI context.
	 * This means that we can have nested NMIs where the next
	 * NMI is using the top of the stack of the previous NMI. We
	 * can't let it execute because the nested NMI will corrupt the
	 * stack of the previous NMI. NMI handlers are not re-entrant
	 * anyway.
	 *
	 * To handle this case we do the following:
	 * Check a special location on the stack that contains
	 * a variable that is set when NMIs are executing.
	 * The interrupted task's stack is also checked to see if it
	 * is an NMI stack.
	 * If the variable is not set and the stack is not the NMI
	 * stack then:
	 *   o Set the special variable on the stack
	 *   o Copy the interrupt frame into a "saved" location on the stack
	 *   o Copy the interrupt frame into a "copy" location on the stack
	 *   o Continue processing the NMI
	 * If the variable is set or the previous stack is the NMI stack:
	 *   o Modify the "copy" location to jump to repeat_nmi
	 *   o Return back to the first NMI
	 *
	 * Now on exit of the first NMI, we first clear the stack variable.
	 * The NMI stack will tell any nested NMIs at that point that it is
	 * nested. Then we pop the stack normally with iret, and if there was
	 * a nested NMI that updated the copy interrupt stack frame, a
	 * jump will be made to the repeat_nmi code that will handle the second
	 * NMI.
	 */

	/* Use %rdx as our temp variable throughout */
	pushq	%rdx

	/*
	 * If %cs was not the kernel segment, then the NMI triggered in user
	 * space, which means it is definitely not nested.
	 */
	cmpl	$__KERNEL_CS, 16(%rsp)
	jne	first_nmi

	/*
	 * Check the special variable on the stack to see if NMIs are
	 * executing.
	 */
	cmpl	$1, -8(%rsp)
	je	nested_nmi

	/*
	 * Now test if the previous stack was an NMI stack.
	 * We need the double check. We check the NMI stack to satisfy the
	 * race when the first NMI clears the variable before returning.
	 * We check the variable because the first NMI could be in a
	 * breakpoint routine using a breakpoint stack.
	 */
	lea	6*8(%rsp), %rdx
	/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
	cmpq	%rdx, 4*8(%rsp)
	/* If the stack pointer is above the NMI stack, this is a normal NMI */
	ja	first_nmi
	subq	$EXCEPTION_STKSZ, %rdx
	cmpq	%rdx, 4*8(%rsp)
	/* If it is below the NMI stack, it is a normal NMI */
	jb	first_nmi
	/* Ah, it is within the NMI stack, treat it as nested */

nested_nmi:
	/*
	 * Do nothing if we interrupted the fixup in repeat_nmi.
	 * It's about to repeat the NMI handler, so we are fine
	 * with ignoring this one.
	 */
	movq	$repeat_nmi, %rdx
	cmpq	8(%rsp), %rdx
	ja	1f
	movq	$end_repeat_nmi, %rdx
	cmpq	8(%rsp), %rdx
	ja	nested_nmi_out

1:
	/* Set up the interrupted NMI's stack to jump to repeat_nmi */
	leaq	-1*8(%rsp), %rdx
	movq	%rdx, %rsp
	leaq	-10*8(%rsp), %rdx
	pushq	$__KERNEL_DS
	pushq	%rdx
	pushfq
	pushq	$__KERNEL_CS
	pushq	$repeat_nmi

	/* Put stack back */
	addq	$(6*8), %rsp

nested_nmi_out:
	popq	%rdx

	/* No need to check faults here */
	INTERRUPT_RETURN

first_nmi:
	/*
	 * Because nested NMIs will use the pushed location that we
	 * stored in rdx, we must keep that space available.
	 * Here's what our stack frame will look like:
	 * +-------------------------+
	 * | original SS             |
	 * | original Return RSP     |
	 * | original RFLAGS         |
	 * | original CS             |
	 * | original RIP            |
	 * +-------------------------+
	 * | temp storage for rdx    |
	 * +-------------------------+
	 * | NMI executing variable  |
	 * +-------------------------+
	 * | copied SS               |
	 * | copied Return RSP       |
	 * | copied RFLAGS           |
	 * | copied CS               |
	 * | copied RIP              |
	 * +-------------------------+
	 * | Saved SS                |
	 * | Saved Return RSP        |
	 * | Saved RFLAGS            |
	 * | Saved CS                |
	 * | Saved RIP               |
	 * +-------------------------+
	 * | pt_regs                 |
	 * +-------------------------+
	 *
	 * The saved stack frame is used to fix up the copied stack frame
	 * that a nested NMI may change to make the interrupted NMI iret jump
	 * to repeat_nmi. The original stack frame and the temp storage
	 * are also used by nested NMIs and can not be trusted on exit.
	 */
	/* Do not pop rdx, nested NMIs will corrupt that part of the stack */
	movq	(%rsp), %rdx

	/* Set the NMI executing variable on the stack. */
	pushq	$1

	/*
	 * Leave room for the "copied" frame
	 */
	subq	$(5*8), %rsp

	/* Copy the stack frame to the Saved frame */
	.rept 5
	pushq	11*8(%rsp)
	.endr

	/* Everything up to here is safe from nested NMIs */

	/*
	 * If there was a nested NMI, the first NMI's iret will return
	 * here. But NMIs are still enabled and we can take another
	 * nested NMI. The nested NMI checks the interrupted RIP to see
	 * if it is between repeat_nmi and end_repeat_nmi, and if so
	 * it will just return, as we are about to repeat an NMI anyway.
	 * This makes it safe to copy to the stack frame that a nested
	 * NMI will update.
	 */
repeat_nmi:
	/*
	 * Update the stack variable to say we are still in NMI (the update
	 * is benign for the non-repeat case, where 1 was pushed just above
	 * to this very stack slot).
	 */
	movq	$1, 10*8(%rsp)

	/* Make another copy, this one may be modified by nested NMIs */
	addq	$(10*8), %rsp
	.rept 5
	pushq	-6*8(%rsp)
	.endr
	subq	$(5*8), %rsp
end_repeat_nmi:

	/*
	 * Everything below this point can be preempted by a nested
	 * NMI if the first NMI took an exception and reset our iret stack
	 * so that we repeat another NMI.
	 */
	pushq	$-1		/* ORIG_RAX: no syscall to restart */
	ALLOC_PT_GPREGS_ON_STACK

	/*
	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
	 * as we should not be calling schedule in NMI context.
	 * Even with normal interrupts enabled. An NMI should not be
	 * setting NEED_RESCHED or anything that normal interrupts and
	 * exceptions might do.
	 */
	call	paranoid_entry

	/*
	 * Save off the CR2 register. If we take a page fault in the NMI then
	 * it could corrupt the CR2 value. If the NMI preempts a page fault
	 * handler before it was able to read the CR2 register, and then the
	 * NMI itself takes a page fault, the page fault that was preempted
	 * will read the information from the NMI page fault and not the
	 * original fault. Save it off and restore it if it changes.
	 * Use the r12 callee-saved register.
	 */
	movq	%cr2, %r12

	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
	movq	%rsp,%rdi
	movq	$-1,%rsi
	call	do_nmi

	/* Did the NMI take a page fault? Restore cr2 if it did */
	movq	%cr2, %rcx
	cmpq	%rcx, %r12
	je	1f
	movq	%r12, %cr2
1:
	testl	%ebx,%ebx			/* swapgs needed? */
	jnz	nmi_restore
nmi_swapgs:
	SWAPGS_UNSAFE_STACK
nmi_restore:
	RESTORE_EXTRA_REGS
	RESTORE_C_REGS
	/* Pop the extra iret frame at once */
	REMOVE_PT_GPREGS_FROM_STACK 6*8

	/* Clear the NMI executing stack variable */
	movq	$0, 5*8(%rsp)
	jmp	irq_return
END(nmi)

ENTRY(ignore_sysret)
	mov	$-ENOSYS,%eax
	sysret
END(ignore_sysret)