/* SPDX-License-Identifier: GPL-2.0 */
/*
 *  linux/arch/x86_64/entry.S
 *
 * Copyright (C) 1991, 1992  Linus Torvalds
 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
 *
 * entry.S contains the system-call and fault low-level handling routines.
 *
 * Some of this is documented in Documentation/x86/entry_64.rst
 *
 * A note on terminology:
 * - iret frame:  Architecture defined interrupt frame from SS to RIP
 *                at the top of the kernel process stack.
 *
 * Some macro usage:
 * - SYM_FUNC_START/END: Define functions in the symbol table.
 * - idtentry:           Define exception entry points.
 */
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/unistd.h>
#include <asm/thread_info.h>
#include <asm/hw_irq.h>
#include <asm/page_types.h>
#include <asm/irqflags.h>
#include <asm/paravirt.h>
#include <asm/percpu.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <asm/export.h>
#include <asm/frame.h>
#include <asm/trapnr.h>
#include <asm/nospec-branch.h>
#include <asm/fsgsbase.h>
#include <linux/err.h>

#include "calling.h"

.code64
.section .entry.text, "ax"

/*
 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
 *
 * This is the only entry point used for 64-bit system calls.  The
 * hardware interface is reasonably well designed and the register to
 * argument mapping Linux uses fits well with the registers that are
 * available when SYSCALL is used.
 *
 * SYSCALL instructions can be found inlined in libc implementations as
 * well as some other programs and libraries.  There are also a handful
 * of SYSCALL instructions in the vDSO used, for example, as a
 * clock_gettimeofday fallback.
 *
 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
 * then loads new ss, cs, and rip from previously programmed MSRs.
 * rflags gets masked by a value from another MSR (so CLD and CLAC
 * are not needed). SYSCALL does not save anything on the stack
 * and does not change rsp.
 *
 * Registers on entry:
 * rax  system call number
 * rcx  return address
 * r11  saved rflags (note: r11 is callee-clobbered register in C ABI)
 * rdi  arg0
 * rsi  arg1
 * rdx  arg2
 * r10  arg3 (needs to be moved to rcx to conform to C ABI)
 * r8   arg4
 * r9   arg5
 * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
 *
 * Only called from user space.
 *
 * When the user can change pt_regs->foo, always force IRET.  That is because
 * IRET deals with non-canonical addresses better.  SYSRET has trouble
 * with them due to bugs in both AMD and Intel CPUs.
 */

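/*
 * Illustrative example (added note, not used by the code below): a plain
 * write(fd, buf, count) from user space arrives here with
 *
 *	rax = __NR_write (1), rdi = fd, rsi = buf, rdx = count
 *
 * while rcx and r11 already hold the return RIP and RFLAGS that the
 * SYSCALL instruction itself saved, which is why a 4th syscall argument
 * has to travel in r10 rather than rcx.
 */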
SYM_CODE_START(entry_SYSCALL_64)
        UNWIND_HINT_EMPTY
        ENDBR

        swapgs
        /* tss.sp2 is scratch space. */
        movq    %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
        SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
        movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp

SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
        ANNOTATE_NOENDBR

        /* Construct struct pt_regs on stack */
        pushq   $__USER_DS                              /* pt_regs->ss */
        pushq   PER_CPU_VAR(cpu_tss_rw + TSS_sp2)       /* pt_regs->sp */
        pushq   %r11                                    /* pt_regs->flags */
        pushq   $__USER_CS                              /* pt_regs->cs */
        pushq   %rcx                                    /* pt_regs->ip */
SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
        pushq   %rax                                    /* pt_regs->orig_ax */

        PUSH_AND_CLEAR_REGS rax=$-ENOSYS

        /* IRQs are off. */
        movq    %rsp, %rdi
        /* Sign extend the lower 32bit as syscall numbers are treated as int */
        movslq  %eax, %rsi

        call    do_syscall_64           /* returns with IRQs disabled */

        /*
         * Try to use SYSRET instead of IRET if we're returning to
         * a completely clean 64-bit userspace context.  If we're not,
         * go to the slow exit path.
         * In the Xen PV case we must use iret anyway.
         */

        ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \
                X86_FEATURE_XENPV

        movq    RCX(%rsp), %rcx
        movq    RIP(%rsp), %r11

        cmpq    %rcx, %r11      /* SYSRET requires RCX == RIP */
        jne     swapgs_restore_regs_and_return_to_usermode

        /*
         * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
         * in kernel space.  This essentially lets the user take over
         * the kernel, since userspace controls RSP.
         *
         * If width of "canonical tail" ever becomes variable, this will need
         * to be updated to remain correct on both old and new CPUs.
         *
         * Change top bits to match most significant bit (47th or 56th bit
         * depending on paging mode) in the address.
         */
#ifdef CONFIG_X86_5LEVEL
        ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
                "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
#else
        shl     $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
        sar     $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
#endif

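        /*
         * Worked illustration of the check below (added note, assuming
         * 4-level paging, i.e. a 48-bit canonical width): shifting left
         * and then arithmetically right by (64 - 48) replicates bit 47
         * into bits 63:48.  A canonical RCX such as 0x00007fffffffe000
         * is unchanged, while 0x0000800000000000 becomes
         * 0xffff800000000000, so the cmpq/jne below takes the IRET path.
         */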
        /* If this changed %rcx, it was not canonical */
        cmpq    %rcx, %r11
        jne     swapgs_restore_regs_and_return_to_usermode

        cmpq    $__USER_CS, CS(%rsp)            /* CS must match SYSRET */
        jne     swapgs_restore_regs_and_return_to_usermode

        movq    R11(%rsp), %r11
        cmpq    %r11, EFLAGS(%rsp)              /* R11 == RFLAGS */
        jne     swapgs_restore_regs_and_return_to_usermode

        /*
         * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
         * restore RF properly. If the slowpath sets it for whatever reason, we
         * need to restore it correctly.
         *
         * SYSRET can restore TF, but unlike IRET, restoring TF results in a
         * trap from userspace immediately after SYSRET.  This would cause an
         * infinite loop whenever #DB happens with register state that satisfies
         * the opportunistic SYSRET conditions.  For example, single-stepping
         * this user code:
         *
         *           movq       $stuck_here, %rcx
         *           pushfq
         *           popq %r11
         *   stuck_here:
         *
         * would never get past 'stuck_here'.
         */
        testq   $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
        jnz     swapgs_restore_regs_and_return_to_usermode

        /* nothing to check for RSP */

        cmpq    $__USER_DS, SS(%rsp)            /* SS must match SYSRET */
        jne     swapgs_restore_regs_and_return_to_usermode

        /*
         * We win! This label is here just for ease of understanding
         * perf profiles. Nothing jumps here.
         */
syscall_return_via_sysret:
        /* rcx and r11 are already restored (see code above) */
        POP_REGS pop_rdi=0 skip_r11rcx=1

        /*
         * Now all regs are restored except RSP and RDI.
         * Save old stack pointer and switch to trampoline stack.
         */
        movq    %rsp, %rdi
        movq    PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
        UNWIND_HINT_EMPTY

        pushq   RSP-RDI(%rdi)   /* RSP */
        pushq   (%rdi)          /* RDI */

        /*
         * We are on the trampoline stack.  All regs except RDI are live.
         * We can do future final exit work right here.
         */
        STACKLEAK_ERASE_NOCLOBBER

        SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi

        popq    %rdi
        popq    %rsp
SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL)
        ANNOTATE_NOENDBR
        swapgs
        sysretq
SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
        ANNOTATE_NOENDBR
        int3
SYM_CODE_END(entry_SYSCALL_64)

/*
 * %rdi: prev task
 * %rsi: next task
 */
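/*
 * (Added note, assuming the usual declaration in <asm/switch_to.h>:)
 * the C side sees this as
 *
 *	struct task_struct *__switch_to_asm(struct task_struct *prev,
 *					    struct task_struct *next);
 *
 * invoked from the switch_to() macro; after swapping stacks it tail-calls
 * __switch_to(), whose return value (the previous task) comes back in rax.
 */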
.pushsection .text, "ax"
SYM_FUNC_START(__switch_to_asm)
        /*
         * Save callee-saved registers
         * This must match the order in inactive_task_frame
         */
        pushq   %rbp
        pushq   %rbx
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15

        /* switch stack */
        movq    %rsp, TASK_threadsp(%rdi)
        movq    TASK_threadsp(%rsi), %rsp

#ifdef CONFIG_STACKPROTECTOR
        movq    TASK_stack_canary(%rsi), %rbx
        movq    %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset
#endif

#ifdef CONFIG_RETPOLINE
        /*
         * When switching from a shallower to a deeper call stack
         * the RSB may either underflow or use entries populated
         * with userspace addresses. On CPUs where those concerns
         * exist, overwrite the RSB with entries which capture
         * speculative execution to prevent attack.
         */
        FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
#endif

        /* restore callee-saved registers */
        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbx
        popq    %rbp

        jmp     __switch_to
SYM_FUNC_END(__switch_to_asm)
.popsection

/*
 * A newly forked process directly context switches into this address.
 *
 * rax: prev task we switched from
 * rbx: kernel thread func (NULL for user thread)
 * r12: kernel thread arg
 */
.pushsection .text, "ax"
SYM_CODE_START(ret_from_fork)
        UNWIND_HINT_EMPTY
        ANNOTATE_NOENDBR // copy_thread
        movq    %rax, %rdi
        call    schedule_tail                   /* rdi: 'prev' task parameter */

        testq   %rbx, %rbx                      /* from kernel_thread? */
        jnz     1f                              /* kernel threads are uncommon */

2:
        UNWIND_HINT_REGS
        movq    %rsp, %rdi
        call    syscall_exit_to_user_mode       /* returns with IRQs disabled */
        jmp     swapgs_restore_regs_and_return_to_usermode

1:
        /* kernel thread */
        UNWIND_HINT_EMPTY
        movq    %r12, %rdi
        CALL_NOSPEC rbx
        /*
         * A kernel thread is allowed to return here after successfully
         * calling kernel_execve().  Exit to userspace to complete the execve()
         * syscall.
         */
        movq    $0, RAX(%rsp)
        jmp     2b
SYM_CODE_END(ret_from_fork)
.popsection

.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
#ifdef CONFIG_DEBUG_ENTRY
        pushq   %rax
        SAVE_FLAGS
        testl   $X86_EFLAGS_IF, %eax
        jz      .Lokay_\@
        ud2
.Lokay_\@:
        popq    %rax
#endif
.endm

/**
 * idtentry_body - Macro to emit code calling the C function
 * @cfunc:              C function to be called
 * @has_error_code:     Hardware pushed error code on stack
 */
.macro idtentry_body cfunc has_error_code:req

        call    error_entry
        UNWIND_HINT_REGS

        movq    %rsp, %rdi                      /* pt_regs pointer into 1st argument*/

        .if \has_error_code == 1
                movq    ORIG_RAX(%rsp), %rsi    /* get error code into 2nd argument*/
                movq    $-1, ORIG_RAX(%rsp)     /* no syscall to restart */
        .endif

        call    \cfunc

        jmp     error_return
.endm

/**
 * idtentry - Macro to generate entry stubs for simple IDT entries
 * @vector:             Vector number
 * @asmsym:             ASM symbol for the entry point
 * @cfunc:              C function to be called
 * @has_error_code:     Hardware pushed error code on stack
 *
 * The macro emits code to set up the kernel context for straightforward
 * and simple IDT entries. No IST stack, no paranoid entry checks.
 */
.macro idtentry vector asmsym cfunc has_error_code:req
SYM_CODE_START(\asmsym)
        UNWIND_HINT_IRET_REGS offset=\has_error_code*8
        ENDBR
        ASM_CLAC

        .if \has_error_code == 0
                pushq   $-1                     /* ORIG_RAX: no syscall to restart */
        .endif

        .if \vector == X86_TRAP_BP
                /*
                 * If coming from kernel space, create a 6-word gap to allow the
                 * int3 handler to emulate a call instruction.
                 */
                testb   $3, CS-ORIG_RAX(%rsp)
                jnz     .Lfrom_usermode_no_gap_\@
                .rept   6
                pushq   5*8(%rsp)
                .endr
                UNWIND_HINT_IRET_REGS offset=8
.Lfrom_usermode_no_gap_\@:
        .endif

        idtentry_body \cfunc \has_error_code

_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm
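/*
 * For illustration (added note): <asm/idtentry.h> instantiates this macro
 * through its DECLARE_IDTENTRY*() wrappers; e.g. the divide error stub is
 * roughly equivalent to
 *
 *	idtentry X86_TRAP_DE asm_exc_divide_error exc_divide_error has_error_code=0
 *
 * which emits asm_exc_divide_error() and makes it call exc_divide_error().
 */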
387
0bf7c314
TG
388/*
389 * Interrupt entry/exit.
390 *
391 + The interrupt stubs push (vector) onto the stack, which is the error_code
392 * position of idtentry exceptions, and jump to one of the two idtentry points
393 * (common/spurious).
394 *
395 * common_interrupt is a hotpath, align it to a cache line
396 */
397.macro idtentry_irq vector cfunc
398 .p2align CONFIG_X86_L1_CACHE_SHIFT
399 idtentry \vector asm_\cfunc \cfunc has_error_code=1
400.endm
401
6368558c
TG
402/*
403 * System vectors which invoke their handlers directly and are not
404 * going through the regular common device interrupt handling code.
405 */
406.macro idtentry_sysvec vector cfunc
407 idtentry \vector asm_\cfunc \cfunc has_error_code=0
408.endm
409
cfa82a00
TG
410/**
411 * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB
412 * @vector: Vector number
413 * @asmsym: ASM symbol for the entry point
414 * @cfunc: C function to be called
415 *
416 * The macro emits code to set up the kernel context for #MC and #DB
417 *
418 * If the entry comes from user space it uses the normal entry path
419 * including the return to user space work and preemption checks on
420 * exit.
421 *
422 * If hits in kernel mode then it needs to go through the paranoid
423 * entry as the exception can hit any random state. No preemption
424 * check on exit to keep the paranoid path simple.
cfa82a00
TG
425 */
426.macro idtentry_mce_db vector asmsym cfunc
427SYM_CODE_START(\asmsym)
428 UNWIND_HINT_IRET_REGS
8f93402b 429 ENDBR
cfa82a00
TG
430 ASM_CLAC
431
432 pushq $-1 /* ORIG_RAX: no syscall to restart */
433
434 /*
435 * If the entry is from userspace, switch stacks and treat it as
436 * a normal entry.
437 */
438 testb $3, CS-ORIG_RAX(%rsp)
439 jnz .Lfrom_usermode_switch_stack_\@
440
c82965f9 441 /* paranoid_entry returns GS information for paranoid_exit in EBX. */
cfa82a00
TG
442 call paranoid_entry
443
444 UNWIND_HINT_REGS
445
cfa82a00 446 movq %rsp, %rdi /* pt_regs pointer */
cfa82a00 447
cfa82a00
TG
448 call \cfunc
449
cfa82a00
TG
450 jmp paranoid_exit
451
452 /* Switch to the regular task stack and use the noist entry point */
453.Lfrom_usermode_switch_stack_\@:
e2dcb5f1 454 idtentry_body noist_\cfunc, has_error_code=0
cfa82a00
TG
455
456_ASM_NOKPROBE(\asmsym)
457SYM_CODE_END(\asmsym)
458.endm
459
a13644f3
JR
460#ifdef CONFIG_AMD_MEM_ENCRYPT
461/**
462 * idtentry_vc - Macro to generate entry stub for #VC
463 * @vector: Vector number
464 * @asmsym: ASM symbol for the entry point
465 * @cfunc: C function to be called
466 *
467 * The macro emits code to set up the kernel context for #VC. The #VC handler
468 * runs on an IST stack and needs to be able to cause nested #VC exceptions.
469 *
470 * To make this work the #VC entry code tries its best to pretend it doesn't use
471 * an IST stack by switching to the task stack if coming from user-space (which
472 * includes early SYSCALL entry path) or back to the stack in the IRET frame if
473 * entered from kernel-mode.
474 *
475 * If entered from kernel-mode the return stack is validated first, and if it is
476 * not safe to use (e.g. because it points to the entry stack) the #VC handler
477 * will switch to a fall-back stack (VC2) and call a special handler function.
478 *
479 * The macro is only used for one vector, but it is planned to be extended in
480 * the future for the #HV exception.
481 */
482.macro idtentry_vc vector asmsym cfunc
483SYM_CODE_START(\asmsym)
484 UNWIND_HINT_IRET_REGS
8f93402b 485 ENDBR
a13644f3
JR
486 ASM_CLAC
487
488 /*
489 * If the entry is from userspace, switch stacks and treat it as
490 * a normal entry.
491 */
492 testb $3, CS-ORIG_RAX(%rsp)
493 jnz .Lfrom_usermode_switch_stack_\@
494
495 /*
496 * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
497 * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
498 */
499 call paranoid_entry
500
501 UNWIND_HINT_REGS
502
503 /*
504 * Switch off the IST stack to make it free for nested exceptions. The
505 * vc_switch_off_ist() function will switch back to the interrupted
506 * stack if it is safe to do so. If not it switches to the VC fall-back
507 * stack.
508 */
509 movq %rsp, %rdi /* pt_regs pointer */
510 call vc_switch_off_ist
511 movq %rax, %rsp /* Switch to new stack */
512
c42b1451 513 ENCODE_FRAME_POINTER
a13644f3
JR
514 UNWIND_HINT_REGS
515
516 /* Update pt_regs */
517 movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
518 movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
519
520 movq %rsp, %rdi /* pt_regs pointer */
521
be1a5408 522 call kernel_\cfunc
a13644f3
JR
523
524 /*
525 * No need to switch back to the IST stack. The current stack is either
526 * identical to the stack in the IRET frame or the VC fall-back stack,
163b0991 527 * so it is definitely mapped even with PTI enabled.
a13644f3
JR
528 */
529 jmp paranoid_exit
530
531 /* Switch to the regular task stack */
532.Lfrom_usermode_switch_stack_\@:
be1a5408 533 idtentry_body user_\cfunc, has_error_code=1
a13644f3
JR
534
535_ASM_NOKPROBE(\asmsym)
536SYM_CODE_END(\asmsym)
537.endm
538#endif

/*
 * Double fault entry. Straight paranoid. No checks from which context
 * this comes because for the espfix induced #DF this would do the wrong
 * thing.
 */
.macro idtentry_df vector asmsym cfunc
SYM_CODE_START(\asmsym)
        UNWIND_HINT_IRET_REGS offset=8
        ENDBR
        ASM_CLAC

        /* paranoid_entry returns GS information for paranoid_exit in EBX. */
        call    paranoid_entry
        UNWIND_HINT_REGS

        movq    %rsp, %rdi              /* pt_regs pointer into first argument */
        movq    ORIG_RAX(%rsp), %rsi    /* get error code into 2nd argument*/
        movq    $-1, ORIG_RAX(%rsp)     /* no syscall to restart */
        call    \cfunc

        /* For some configurations \cfunc ends up being a noreturn. */
        REACHABLE

        jmp     paranoid_exit

_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm

/*
 * Include the defines which emit the idt entries which are shared
 * between 32 and 64 bit and emit the __irqentry_text_* markers
 * so the stacktrace boundary checks work.
 */
        .align 16
        .globl __irqentry_text_start
__irqentry_text_start:

#include <asm/idtentry.h>

        .align 16
        .globl __irqentry_text_end
__irqentry_text_end:
        ANNOTATE_NOENDBR

SYM_CODE_START_LOCAL(common_interrupt_return)
SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
#ifdef CONFIG_DEBUG_ENTRY
        /* Assert that pt_regs indicates user mode. */
        testb   $3, CS(%rsp)
        jnz     1f
        ud2
1:
#endif
#ifdef CONFIG_XEN_PV
        ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
#endif

        POP_REGS pop_rdi=0

        /*
         * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
         * Save old stack pointer and switch to trampoline stack.
         */
        movq    %rsp, %rdi
        movq    PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
        UNWIND_HINT_EMPTY

        /* Copy the IRET frame to the trampoline stack. */
        pushq   6*8(%rdi)       /* SS */
        pushq   5*8(%rdi)       /* RSP */
        pushq   4*8(%rdi)       /* EFLAGS */
        pushq   3*8(%rdi)       /* CS */
        pushq   2*8(%rdi)       /* RIP */

        /* Push user RDI on the trampoline stack. */
        pushq   (%rdi)

        /*
         * We are on the trampoline stack.  All regs except RDI are live.
         * We can do future final exit work right here.
         */
        STACKLEAK_ERASE_NOCLOBBER

        SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi

        /* Restore RDI. */
        popq    %rdi
        swapgs
        jmp     .Lnative_iret


SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
#ifdef CONFIG_DEBUG_ENTRY
        /* Assert that pt_regs indicates kernel mode. */
        testb   $3, CS(%rsp)
        jz      1f
        ud2
1:
#endif
        POP_REGS
        addq    $8, %rsp        /* skip regs->orig_ax */
        /*
         * ARCH_HAS_MEMBARRIER_SYNC_CORE relies on IRET core serialization
         * when returning from IPI handler.
         */
#ifdef CONFIG_XEN_PV
SYM_INNER_LABEL(early_xen_iret_patch, SYM_L_GLOBAL)
        ANNOTATE_NOENDBR
        .byte 0xe9
        .long .Lnative_iret - (. + 4)
#endif

.Lnative_iret:
        UNWIND_HINT_IRET_REGS
        /*
         * Are we returning to a stack segment from the LDT?  Note: in
         * 64-bit mode SS:RSP on the exception stack is always valid.
         */
#ifdef CONFIG_X86_ESPFIX64
        testb   $4, (SS-RIP)(%rsp)
        jnz     native_irq_return_ldt
#endif

SYM_INNER_LABEL(native_irq_return_iret, SYM_L_GLOBAL)
        ANNOTATE_NOENDBR // exc_double_fault
        /*
         * This may fault.  Non-paranoid faults on return to userspace are
         * handled by fixup_bad_iret.  These include #SS, #GP, and #NP.
         * Double-faults due to espfix64 are handled in exc_double_fault.
         * Other faults here are fatal.
         */
        iretq

#ifdef CONFIG_X86_ESPFIX64
native_irq_return_ldt:
        /*
         * We are running with user GSBASE.  All GPRs contain their user
         * values.  We have a percpu ESPFIX stack that is eight slots
         * long (see ESPFIX_STACK_SIZE).  espfix_waddr points to the bottom
         * of the ESPFIX stack.
         *
         * We clobber RAX and RDI in this code.  We stash RDI on the
         * normal stack and RAX on the ESPFIX stack.
         *
         * The ESPFIX stack layout we set up looks like this:
         *
         * --- top of ESPFIX stack ---
         * SS
         * RSP
         * RFLAGS
         * CS
         * RIP  <-- RSP points here when we're done
         * RAX  <-- espfix_waddr points here
         * --- bottom of ESPFIX stack ---
         */

        pushq   %rdi                            /* Stash user RDI */
        swapgs                                  /* to kernel GS */
        SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi   /* to kernel CR3 */

        movq    PER_CPU_VAR(espfix_waddr), %rdi
        movq    %rax, (0*8)(%rdi)               /* user RAX */
        movq    (1*8)(%rsp), %rax               /* user RIP */
        movq    %rax, (1*8)(%rdi)
        movq    (2*8)(%rsp), %rax               /* user CS */
        movq    %rax, (2*8)(%rdi)
        movq    (3*8)(%rsp), %rax               /* user RFLAGS */
        movq    %rax, (3*8)(%rdi)
        movq    (5*8)(%rsp), %rax               /* user SS */
        movq    %rax, (5*8)(%rdi)
        movq    (4*8)(%rsp), %rax               /* user RSP */
        movq    %rax, (4*8)(%rdi)
        /* Now RAX == RSP. */

        andl    $0xffff0000, %eax               /* RAX = (RSP & 0xffff0000) */

        /*
         * espfix_stack[31:16] == 0.  The page tables are set up such that
         * (espfix_stack | (X & 0xffff0000)) points to a read-only alias of
         * espfix_waddr for any X.  That is, there are 65536 RO aliases of
         * the same page.  Set up RSP so that RSP[31:16] contains the
         * respective 16 bits of the /userspace/ RSP and RSP nonetheless
         * still points to an RO alias of the ESPFIX stack.
         */
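        /*
         * Illustration (added note): if the user RSP was 0x00007ffdabcd1234,
         * RAX now holds 0xabcd0000 (the 32-bit AND above also cleared the
         * upper half), and the OR below yields espfix_stack | 0xabcd0000:
         * bits 31:16 of the new RSP echo the user RSP while the remaining
         * bits still address the read-only alias.
         */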
        orq     PER_CPU_VAR(espfix_stack), %rax

        SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
        swapgs                                  /* to user GS */
        popq    %rdi                            /* Restore user RDI */

        movq    %rax, %rsp
        UNWIND_HINT_IRET_REGS offset=8

        /*
         * At this point, we cannot write to the stack any more, but we can
         * still read.
         */
        popq    %rax                            /* Restore user RAX */

        /*
         * RSP now points to an ordinary IRET frame, except that the page
         * is read-only and RSP[31:16] are preloaded with the userspace
         * values.  We can now IRET back to userspace.
         */
        jmp     native_irq_return_iret
#endif
SYM_CODE_END(common_interrupt_return)
_ASM_NOKPROBE(common_interrupt_return)

/*
 * Reload gs selector with exception handling
 * edi: new selector
 *
 * Is in entry.text as it shouldn't be instrumented.
 */
SYM_FUNC_START(asm_load_gs_index)
        FRAME_BEGIN
        swapgs
.Lgs_change:
        ANNOTATE_NOENDBR // error_entry
        movl    %edi, %gs
2:      ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
        swapgs
        FRAME_END
        RET

        /* running with kernelgs */
.Lbad_gs:
        swapgs                                  /* switch back to user gs */
.macro ZAP_GS
        /* This can't be a string because the preprocessor needs to see it. */
        movl $__USER_DS, %eax
        movl %eax, %gs
.endm
        ALTERNATIVE "", "ZAP_GS", X86_BUG_NULL_SEG
        xorl    %eax, %eax
        movl    %eax, %gs
        jmp     2b

        _ASM_EXTABLE(.Lgs_change, .Lbad_gs)

SYM_FUNC_END(asm_load_gs_index)
EXPORT_SYMBOL(asm_load_gs_index)

#ifdef CONFIG_XEN_PV
/*
 * A note on the "critical region" in our callback handler.
 * We want to avoid stacking callback handlers due to events occurring
 * during handling of the last event. To do this, we keep events disabled
 * until we've done all processing. HOWEVER, we must enable events before
 * popping the stack frame (can't be done atomically) and so it would still
 * be possible to get enough handler activations to overflow the stack.
 * Although unlikely, bugs of that kind are hard to track down, so we'd
 * like to avoid the possibility.
 * So, on entry to the handler we detect whether we interrupted an
 * existing activation in its critical region -- if so, we pop the current
 * activation and restart the handler using the previous one.
 *
 * C calling convention: exc_xen_hypervisor_callback(struct pt_regs *)
 */
SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback)

/*
 * Since we don't modify %rdi, evtchn_do_upcall(struct pt_regs *) will
 * see the correct pointer to the pt_regs
 */
        UNWIND_HINT_FUNC
        movq    %rdi, %rsp                      /* we don't return, adjust the stack frame */
        UNWIND_HINT_REGS

        call    xen_pv_evtchn_do_upcall

        jmp     error_return
SYM_CODE_END(exc_xen_hypervisor_callback)

/*
 * Hypervisor uses this for application faults while it executes.
 * We get here for two reasons:
 *  1. Fault while reloading DS, ES, FS or GS
 *  2. Fault while executing IRET
 * Category 1 we do not need to fix up as Xen has already reloaded all segment
 * registers that could be reloaded and zeroed the others.
 * Category 2 we fix up by killing the current process. We cannot use the
 * normal Linux return path in this case because if we use the IRET hypercall
 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
 * We distinguish between categories by comparing each saved segment register
 * with its current contents: any discrepancy means we are in category 1.
 */
SYM_CODE_START(xen_failsafe_callback)
        UNWIND_HINT_EMPTY
        ENDBR
        movl    %ds, %ecx
        cmpw    %cx, 0x10(%rsp)
        jne     1f
        movl    %es, %ecx
        cmpw    %cx, 0x18(%rsp)
        jne     1f
        movl    %fs, %ecx
        cmpw    %cx, 0x20(%rsp)
        jne     1f
        movl    %gs, %ecx
        cmpw    %cx, 0x28(%rsp)
        jne     1f
        /* All segments match their saved values => Category 2 (Bad IRET). */
        movq    (%rsp), %rcx
        movq    8(%rsp), %r11
        addq    $0x30, %rsp
        pushq   $0                              /* RIP */
        UNWIND_HINT_IRET_REGS offset=8
        jmp     asm_exc_general_protection
1:      /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
        movq    (%rsp), %rcx
        movq    8(%rsp), %r11
        addq    $0x30, %rsp
        UNWIND_HINT_IRET_REGS
        pushq   $-1                             /* orig_ax = -1 => not a system call */
        PUSH_AND_CLEAR_REGS
        ENCODE_FRAME_POINTER
        jmp     error_return
SYM_CODE_END(xen_failsafe_callback)
#endif /* CONFIG_XEN_PV */

/*
 * Save all registers in pt_regs. Return GSBASE related information
 * in EBX depending on the availability of the FSGSBASE instructions:
 *
 * FSGSBASE     R/EBX
 *     N        0 -> SWAPGS on exit
 *              1 -> no SWAPGS on exit
 *
 *     Y        GSBASE value at entry, must be restored in paranoid_exit
 */
SYM_CODE_START_LOCAL(paranoid_entry)
        UNWIND_HINT_FUNC
        cld
        PUSH_AND_CLEAR_REGS save_ret=1
        ENCODE_FRAME_POINTER 8

        /*
         * Always stash CR3 in %r14.  This value will be restored,
         * verbatim, at exit.  Needed if paranoid_entry interrupted
         * another entry that already switched to the user CR3 value
         * but has not yet returned to userspace.
         *
         * This is also why CS (stashed in the "iret frame" by the
         * hardware at entry) can not be used: this may be a return
         * to kernel code, but with a user CR3 value.
         *
         * Switching CR3 does not depend on kernel GSBASE so it can
         * be done before switching to the kernel GSBASE. This is
         * required for FSGSBASE because the kernel GSBASE has to
         * be retrieved from a kernel internal table.
         */
        SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14

        /*
         * Handling GSBASE depends on the availability of FSGSBASE.
         *
         * Without FSGSBASE the kernel enforces that negative GSBASE
         * values indicate kernel GSBASE. With FSGSBASE no assumptions
         * can be made about the GSBASE value when entering from user
         * space.
         */
        ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE

        /*
         * Read the current GSBASE and store it in %rbx unconditionally,
         * retrieve and set the current CPU's kernel GSBASE. The stored value
         * has to be restored in paranoid_exit unconditionally.
         *
         * The unconditional write to GS base below ensures that no subsequent
         * loads based on a mispredicted GS base can happen, therefore no LFENCE
         * is needed here.
         */
        SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx
        RET

.Lparanoid_entry_checkgs:
        /* EBX = 1 -> kernel GSBASE active, no restore required */
        movl    $1, %ebx

        /*
         * The kernel-enforced convention is a negative GSBASE indicates
         * a kernel value. No SWAPGS needed on entry and exit.
         */
        movl    $MSR_GS_BASE, %ecx
        rdmsr
        testl   %edx, %edx
        js      .Lparanoid_kernel_gsbase

        /* EBX = 0 -> SWAPGS required on exit */
        xorl    %ebx, %ebx
        swapgs
.Lparanoid_kernel_gsbase:

        FENCE_SWAPGS_KERNEL_ENTRY
        RET
SYM_CODE_END(paranoid_entry)

/*
 * "Paranoid" exit path from exception stack.  This is invoked
 * only on return from non-NMI IST interrupts that came
 * from kernel space.
 *
 * We may be returning to very strange contexts (e.g. very early
 * in syscall entry), so checking for preemption here would
 * be complicated.  Fortunately, there's no good reason to try
 * to handle preemption here.
 *
 * R/EBX contains the GSBASE related information depending on the
 * availability of the FSGSBASE instructions:
 *
 * FSGSBASE     R/EBX
 *     N        0 -> SWAPGS on exit
 *              1 -> no SWAPGS on exit
 *
 *     Y        User space GSBASE, must be restored unconditionally
 */
SYM_CODE_START_LOCAL(paranoid_exit)
        UNWIND_HINT_REGS
        /*
         * The order of operations is important. RESTORE_CR3 requires
         * kernel GSBASE.
         *
         * NB to anyone tempted to optimize this code: this code does
         * not execute at all for exceptions from user mode. Those
         * exceptions go through error_exit instead.
         */
        RESTORE_CR3     scratch_reg=%rax save_reg=%r14

        /* Handle the three GSBASE cases */
        ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE

        /* With FSGSBASE enabled, unconditionally restore GSBASE */
        wrgsbase        %rbx
        jmp             restore_regs_and_return_to_kernel

.Lparanoid_exit_checkgs:
        /* On non-FSGSBASE systems, conditionally do SWAPGS */
        testl           %ebx, %ebx
        jnz             restore_regs_and_return_to_kernel

        /* We are returning to a context with user GSBASE */
        swapgs
        jmp             restore_regs_and_return_to_kernel
SYM_CODE_END(paranoid_exit)

/*
 * Save all registers in pt_regs, and switch GS if needed.
 */
SYM_CODE_START_LOCAL(error_entry)
        UNWIND_HINT_FUNC
        cld
        PUSH_AND_CLEAR_REGS save_ret=1
        ENCODE_FRAME_POINTER 8
        testb   $3, CS+8(%rsp)
        jz      .Lerror_kernelspace

        /*
         * We entered from user mode or we're pretending to have entered
         * from user mode due to an IRET fault.
         */
        SWAPGS
        FENCE_SWAPGS_USER_ENTRY
        /* We have user CR3.  Change to kernel CR3. */
        SWITCH_TO_KERNEL_CR3 scratch_reg=%rax

.Lerror_entry_from_usermode_after_swapgs:
        /* Put us onto the real thread stack. */
        popq    %r12                            /* save return addr in %r12 */
        movq    %rsp, %rdi                      /* arg0 = pt_regs pointer */
        call    sync_regs
        movq    %rax, %rsp                      /* switch stack */
        ENCODE_FRAME_POINTER
        pushq   %r12
        RET

        /*
         * There are two places in the kernel that can potentially fault with
         * usergs. Handle them here.  B stepping K8s sometimes report a
         * truncated RIP for IRET exceptions returning to compat mode. Check
         * for these here too.
         */
.Lerror_kernelspace:
        leaq    native_irq_return_iret(%rip), %rcx
        cmpq    %rcx, RIP+8(%rsp)
        je      .Lerror_bad_iret
        movl    %ecx, %eax                      /* zero extend */
        cmpq    %rax, RIP+8(%rsp)
        je      .Lbstep_iret
        cmpq    $.Lgs_change, RIP+8(%rsp)
        jne     .Lerror_entry_done_lfence

        /*
         * hack: .Lgs_change can fail with user gsbase.  If this happens, fix up
         * gsbase and proceed.  We'll fix up the exception and land in
         * .Lgs_change's error handler with kernel gsbase.
         */
        SWAPGS

        /*
         * Issue an LFENCE to prevent GS speculation, regardless of whether it is a
         * kernel or user gsbase.
         */
.Lerror_entry_done_lfence:
        FENCE_SWAPGS_KERNEL_ENTRY
        RET

.Lbstep_iret:
        /* Fix truncated RIP */
        movq    %rcx, RIP+8(%rsp)
        /* fall through */

.Lerror_bad_iret:
        /*
         * We came from an IRET to user mode, so we have user
         * gsbase and CR3.  Switch to kernel gsbase and CR3:
         */
        SWAPGS
        FENCE_SWAPGS_USER_ENTRY
        SWITCH_TO_KERNEL_CR3 scratch_reg=%rax

        /*
         * Pretend that the exception came from user mode: set up pt_regs
         * as if we faulted immediately after IRET.
         */
        mov     %rsp, %rdi
        call    fixup_bad_iret
        mov     %rax, %rsp
        jmp     .Lerror_entry_from_usermode_after_swapgs
SYM_CODE_END(error_entry)

SYM_CODE_START_LOCAL(error_return)
        UNWIND_HINT_REGS
        DEBUG_ENTRY_ASSERT_IRQS_OFF
        testb   $3, CS(%rsp)
        jz      restore_regs_and_return_to_kernel
        jmp     swapgs_restore_regs_and_return_to_usermode
SYM_CODE_END(error_return)

/*
 * Runs on exception stack.  Xen PV does not go through this path at all,
 * so we can use real assembly here.
 *
 * Registers:
 *      %r14: Used to save/restore the CR3 of the interrupted context
 *            when PAGE_TABLE_ISOLATION is in use.  Do not clobber.
 */
SYM_CODE_START(asm_exc_nmi)
        UNWIND_HINT_IRET_REGS
        ENDBR

        /*
         * We allow breakpoints in NMIs. If a breakpoint occurs, then
         * the iretq it performs will take us out of NMI context.
         * This means that we can have nested NMIs where the next
         * NMI is using the top of the stack of the previous NMI. We
         * can't let it execute because the nested NMI will corrupt the
         * stack of the previous NMI. NMI handlers are not re-entrant
         * anyway.
         *
         * To handle this case we do the following:
         *  Check a special location on the stack that contains
         *  a variable that is set when NMIs are executing.
         *  The interrupted task's stack is also checked to see if it
         *  is an NMI stack.
         *  If the variable is not set and the stack is not the NMI
         *  stack then:
         *    o Set the special variable on the stack
         *    o Copy the interrupt frame into an "outermost" location on the
         *      stack
         *    o Copy the interrupt frame into an "iret" location on the stack
         *    o Continue processing the NMI
         *  If the variable is set or the previous stack is the NMI stack:
         *    o Modify the "iret" location to jump to the repeat_nmi
         *    o return back to the first NMI
         *
         * Now on exit of the first NMI, we first clear the stack variable.
         * The NMI stack will tell any nested NMIs at that point that it is
         * nested. Then we pop the stack normally with iret, and if there was
         * a nested NMI that updated the copy interrupt stack frame, a
         * jump will be made to the repeat_nmi code that will handle the second
         * NMI.
         *
         * However, espfix prevents us from directly returning to userspace
         * with a single IRET instruction.  Similarly, IRET to user mode
         * can fault.  We therefore handle NMIs from user space like
         * other IST entries.
         */

        ASM_CLAC

        /* Use %rdx as our temp variable throughout */
        pushq   %rdx

        testb   $3, CS-RIP+8(%rsp)
        jz      .Lnmi_from_kernel

        /*
         * NMI from user mode.  We need to run on the thread stack, but we
         * can't go through the normal entry paths: NMIs are masked, and
         * we don't want to enable interrupts, because then we'll end
         * up in an awkward situation in which IRQs are on but NMIs
         * are off.
         *
         * We also must not push anything to the stack before switching
         * stacks lest we corrupt the "NMI executing" variable.
         */

        swapgs
        cld
        FENCE_SWAPGS_USER_ENTRY
        SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
        movq    %rsp, %rdx
        movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
        UNWIND_HINT_IRET_REGS base=%rdx offset=8
        pushq   5*8(%rdx)       /* pt_regs->ss */
        pushq   4*8(%rdx)       /* pt_regs->rsp */
        pushq   3*8(%rdx)       /* pt_regs->flags */
        pushq   2*8(%rdx)       /* pt_regs->cs */
        pushq   1*8(%rdx)       /* pt_regs->rip */
        UNWIND_HINT_IRET_REGS
        pushq   $-1             /* pt_regs->orig_ax */
        PUSH_AND_CLEAR_REGS rdx=(%rdx)
        ENCODE_FRAME_POINTER

        /*
         * At this point we no longer need to worry about stack damage
         * due to nesting -- we're on the normal thread stack and we're
         * done with the NMI stack.
         */

        movq    %rsp, %rdi
        movq    $-1, %rsi
        call    exc_nmi

        /*
         * Return back to user mode.  We must *not* do the normal exit
         * work, because we don't want to enable interrupts.
         */
        jmp     swapgs_restore_regs_and_return_to_usermode

.Lnmi_from_kernel:
        /*
         * Here's what our stack frame will look like:
         * +---------------------------------------------------------+
         * | original SS                                              |
         * | original Return RSP                                      |
         * | original RFLAGS                                          |
         * | original CS                                              |
         * | original RIP                                             |
         * +---------------------------------------------------------+
         * | temp storage for rdx                                     |
         * +---------------------------------------------------------+
         * | "NMI executing" variable                                 |
         * +---------------------------------------------------------+
         * | iret SS          } Copied from "outermost" frame         |
         * | iret Return RSP  } on each loop iteration; overwritten   |
         * | iret RFLAGS      } by a nested NMI to force another      |
         * | iret CS          } iteration if needed.                  |
         * | iret RIP         }                                       |
         * +---------------------------------------------------------+
         * | outermost SS          } initialized in first_nmi;        |
         * | outermost Return RSP  } will not be changed before       |
         * | outermost RFLAGS      } NMI processing is done.          |
         * | outermost CS          } Copied to "iret" frame on each   |
         * | outermost RIP         } iteration.                       |
         * +---------------------------------------------------------+
         * | pt_regs                                                  |
         * +---------------------------------------------------------+
         *
         * The "original" frame is used by hardware.  Before re-enabling
         * NMIs, we need to be done with it, and we need to leave enough
         * space for the asm code here.
         *
         * We return by executing IRET while RSP points to the "iret" frame.
         * That will either return for real or it will loop back into NMI
         * processing.
         *
         * The "outermost" frame is copied to the "iret" frame on each
         * iteration of the loop, so each iteration starts with the "iret"
         * frame pointing to the final return target.
         */

        /*
         * Determine whether we're a nested NMI.
         *
         * If we interrupted kernel code between repeat_nmi and
         * end_repeat_nmi, then we are a nested NMI.  We must not
         * modify the "iret" frame because it's being written by
         * the outer NMI.  That's okay; the outer NMI handler is
         * about to call exc_nmi() anyway, so we can just
         * resume the outer NMI.
         */

        movq    $repeat_nmi, %rdx
        cmpq    8(%rsp), %rdx
        ja      1f
        movq    $end_repeat_nmi, %rdx
        cmpq    8(%rsp), %rdx
        ja      nested_nmi_out
1:

        /*
         * Now check "NMI executing".  If it's set, then we're nested.
         * This will not detect if we interrupted an outer NMI just
         * before IRET.
         */
        cmpl    $1, -8(%rsp)
        je      nested_nmi

        /*
         * Now test if the previous stack was an NMI stack.  This covers
         * the case where we interrupt an outer NMI after it clears
         * "NMI executing" but before IRET.  We need to be careful, though:
         * there is one case in which RSP could point to the NMI stack
         * despite there being no NMI active: naughty userspace controls
         * RSP at the very beginning of the SYSCALL targets.  We can
         * pull a fast one on naughty userspace, though: we program
         * SYSCALL to mask DF, so userspace cannot cause DF to be set
         * if it controls the kernel's RSP.  We set DF before we clear
         * "NMI executing".
         */
        lea     6*8(%rsp), %rdx
        /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
        cmpq    %rdx, 4*8(%rsp)
        /* If the stack pointer is above the NMI stack, this is a normal NMI */
        ja      first_nmi

        subq    $EXCEPTION_STKSZ, %rdx
        cmpq    %rdx, 4*8(%rsp)
        /* If it is below the NMI stack, it is a normal NMI */
        jb      first_nmi

        /* Ah, it is within the NMI stack. */

        testb   $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
        jz      first_nmi       /* RSP was user controlled. */

        /* This is a nested NMI. */

nested_nmi:
        /*
         * Modify the "iret" frame to point to repeat_nmi, forcing another
         * iteration of NMI handling.
         */
        subq    $8, %rsp
        leaq    -10*8(%rsp), %rdx
        pushq   $__KERNEL_DS
        pushq   %rdx
        pushfq
        pushq   $__KERNEL_CS
        pushq   $repeat_nmi

        /* Put stack back */
        addq    $(6*8), %rsp

nested_nmi_out:
        popq    %rdx

        /* We are returning to kernel mode, so this cannot result in a fault. */
        iretq

first_nmi:
        /* Restore rdx. */
        movq    (%rsp), %rdx

        /* Make room for "NMI executing". */
        pushq   $0

        /* Leave room for the "iret" frame */
        subq    $(5*8), %rsp

        /* Copy the "original" frame to the "outermost" frame */
        .rept 5
        pushq   11*8(%rsp)
        .endr
        UNWIND_HINT_IRET_REGS

        /* Everything up to here is safe from nested NMIs */

#ifdef CONFIG_DEBUG_ENTRY
        /*
         * For ease of testing, unmask NMIs right away.  Disabled by
         * default because IRET is very expensive.
         */
        pushq   $0              /* SS */
        pushq   %rsp            /* RSP (minus 8 because of the previous push) */
        addq    $8, (%rsp)      /* Fix up RSP */
        pushfq                  /* RFLAGS */
        pushq   $__KERNEL_CS    /* CS */
        pushq   $1f             /* RIP */
        iretq                   /* continues at repeat_nmi below */
        UNWIND_HINT_IRET_REGS
1:
#endif

repeat_nmi:
        ANNOTATE_NOENDBR // this code
        /*
         * If there was a nested NMI, the first NMI's iret will return
         * here.  But NMIs are still enabled and we can take another
         * nested NMI.  The nested NMI checks the interrupted RIP to see
         * if it is between repeat_nmi and end_repeat_nmi, and if so
         * it will just return, as we are about to repeat an NMI anyway.
         * This makes it safe to copy to the stack frame that a nested
         * NMI will update.
         *
         * RSP is pointing to "outermost RIP".  gsbase is unknown, but, if
         * we're repeating an NMI, gsbase has the same value that it had on
         * the first iteration.  paranoid_entry will load the kernel
         * gsbase if needed before we call exc_nmi().  "NMI executing"
         * is zero.
         */
        movq    $1, 10*8(%rsp)          /* Set "NMI executing". */

        /*
         * Copy the "outermost" frame to the "iret" frame.  NMIs that nest
         * here must not modify the "iret" frame while we're writing to
         * it or it will end up containing garbage.
         */
        addq    $(10*8), %rsp
        .rept 5
        pushq   -6*8(%rsp)
        .endr
        subq    $(5*8), %rsp
end_repeat_nmi:
        ANNOTATE_NOENDBR // this code

        /*
         * Everything below this point can be preempted by a nested NMI.
         * If this happens, then the inner NMI will change the "iret"
         * frame to point back to repeat_nmi.
         */
        pushq   $-1                             /* ORIG_RAX: no syscall to restart */

        /*
         * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
         * as we should not be calling schedule in NMI context.
         * Even with normal interrupts enabled. An NMI should not be
         * setting NEED_RESCHED or anything that normal interrupts and
         * exceptions might do.
         */
        call    paranoid_entry
        UNWIND_HINT_REGS

        movq    %rsp, %rdi
        movq    $-1, %rsi
        call    exc_nmi

        /* Always restore stashed CR3 value (see paranoid_entry) */
        RESTORE_CR3 scratch_reg=%r15 save_reg=%r14

        /*
         * The above invocation of paranoid_entry stored the GSBASE
         * related information in R/EBX depending on the availability
         * of FSGSBASE.
         *
         * If FSGSBASE is enabled, restore the saved GSBASE value
         * unconditionally, otherwise take the conditional SWAPGS path.
         */
        ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE

        wrgsbase        %rbx
        jmp     nmi_restore

nmi_no_fsgsbase:
        /* EBX == 0 -> invoke SWAPGS */
        testl   %ebx, %ebx
        jnz     nmi_restore

nmi_swapgs:
        swapgs

nmi_restore:
        POP_REGS

        /*
         * Skip orig_ax and the "outermost" frame to point RSP at the
         * "iret" frame.
         */
        addq    $6*8, %rsp

        /*
         * Clear "NMI executing".  Set DF first so that we can easily
         * distinguish the remaining code between here and IRET from
         * the SYSCALL entry and exit paths.
         *
         * We arguably should just inspect RIP instead, but I (Andy) wrote
         * this code when I had the misapprehension that Xen PV supported
         * NMIs, and Xen PV would break that approach.
         */
        std
        movq    $0, 5*8(%rsp)           /* clear "NMI executing" */

        /*
         * iretq reads the "iret" frame and exits the NMI stack in a
         * single instruction.  We are returning to kernel mode, so this
         * cannot result in a fault.  Similarly, we don't need to worry
         * about espfix64 on the way back to kernel mode.
         */
        iretq
SYM_CODE_END(asm_exc_nmi)

#ifndef CONFIG_IA32_EMULATION
/*
 * This handles SYSCALL from 32-bit code.  There is no way to program
 * MSRs to fully disable 32-bit SYSCALL.
 */
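/*
 * (Added note: in this configuration syscall_init() is expected to point
 * MSR_CSTAR at this stub, so a 32-bit SYSCALL simply returns -ENOSYS.)
 */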
SYM_CODE_START(ignore_sysret)
        UNWIND_HINT_EMPTY
        ENDBR
        mov     $-ENOSYS, %eax
        sysretl
SYM_CODE_END(ignore_sysret)
#endif

.pushsection .text, "ax"
SYM_CODE_START(rewind_stack_and_make_dead)
        UNWIND_HINT_FUNC
        /* Prevent any naive code from trying to unwind to our caller. */
        xorl    %ebp, %ebp

        movq    PER_CPU_VAR(cpu_current_top_of_stack), %rax
        leaq    -PTREGS_SIZE(%rax), %rsp
        UNWIND_HINT_REGS

        call    make_task_dead
SYM_CODE_END(rewind_stack_and_make_dead)
.popsection