/* SPDX-License-Identifier: GPL-2.0 */
/*
 *  linux/arch/x86_64/entry.S
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *
 * entry.S contains the system-call and fault low-level handling routines.
 *
 * Some of this is documented in Documentation/x86/entry_64.rst
 *
 * A note on terminology:
 * - iret frame:	Architecture defined interrupt frame from SS to RIP
 *			at the top of the kernel process stack.
 *
 * Some macro usage:
 * - SYM_FUNC_START/END:Define functions in the symbol table.
 * - idtentry:		Define exception entry points.
 */
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/unistd.h>
#include <asm/thread_info.h>
#include <asm/hw_irq.h>
#include <asm/page_types.h>
#include <asm/irqflags.h>
#include <asm/paravirt.h>
#include <asm/percpu.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <asm/export.h>
#include <asm/frame.h>
#include <asm/trapnr.h>
#include <asm/nospec-branch.h>
#include <asm/fsgsbase.h>
#include <linux/err.h>

#include "calling.h"

.code64
.section .entry.text, "ax"

#ifdef CONFIG_PARAVIRT_XXL
SYM_CODE_START(native_usergs_sysret64)
	UNWIND_HINT_EMPTY
	swapgs
	sysretq
SYM_CODE_END(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT_XXL */

/*
 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
 *
 * This is the only entry point used for 64-bit system calls.  The
 * hardware interface is reasonably well designed and the register to
 * argument mapping Linux uses fits well with the registers that are
 * available when SYSCALL is used.
 *
 * SYSCALL instructions can be found inlined in libc implementations as
 * well as some other programs and libraries.  There are also a handful
 * of SYSCALL instructions in the vDSO used, for example, as a
 * clock_gettimeofday fallback.
 *
 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
 * then loads new ss, cs, and rip from previously programmed MSRs.
 * rflags gets masked by a value from another MSR (so CLD and CLAC
 * are not needed). SYSCALL does not save anything on the stack
 * and does not change rsp.
 *
 * Registers on entry:
 * rax  system call number
 * rcx  return address
 * r11  saved rflags (note: r11 is callee-clobbered register in C ABI)
 * rdi  arg0
 * rsi  arg1
 * rdx  arg2
 * r10  arg3 (needs to be moved to rcx to conform to C ABI)
 * r8   arg4
 * r9   arg5
 * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
 *
 * Only called from user space.
 *
 * When user can change pt_regs->foo always force IRET. That is because
 * it deals with uncanonical addresses better. SYSRET has trouble
 * with them due to bugs in both AMD and Intel CPUs.
 */

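/*
 * Illustrative sketch only, not part of this file: the register
 * convention documented above, exercised from userspace with a raw
 * SYSCALL via inline asm.  The helper name raw_syscall6() is made up
 * for the example; rcx and r11 must be listed as clobbers because the
 * instruction overwrites them with the return RIP and RFLAGS.
 *
 *	#include <sys/syscall.h>
 *
 *	static long raw_syscall6(long nr, long a0, long a1, long a2,
 *				 long a3, long a4, long a5)
 *	{
 *		register long r10 asm("r10") = a3;
 *		register long r8  asm("r8")  = a4;
 *		register long r9  asm("r9")  = a5;
 *		long ret;
 *
 *		asm volatile ("syscall"
 *			      : "=a" (ret)
 *			      : "0" (nr), "D" (a0), "S" (a1), "d" (a2),
 *				"r" (r10), "r" (r8), "r" (r9)
 *			      : "rcx", "r11", "memory");
 *		return ret;
 *	}
 *
 *	// e.g. raw_syscall6(SYS_write, 1, (long)"hi\n", 3, 0, 0, 0);
 */
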
SYM_CODE_START(entry_SYSCALL_64)
	UNWIND_HINT_EMPTY

	swapgs
	/* tss.sp2 is scratch space. */
	movq	%rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

	/* Construct struct pt_regs on stack */
	pushq	$__USER_DS				/* pt_regs->ss */
	pushq	PER_CPU_VAR(cpu_tss_rw + TSS_sp2)	/* pt_regs->sp */
	pushq	%r11					/* pt_regs->flags */
	pushq	$__USER_CS				/* pt_regs->cs */
	pushq	%rcx					/* pt_regs->ip */
SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
	pushq	%rax					/* pt_regs->orig_ax */

	PUSH_AND_CLEAR_REGS rax=$-ENOSYS

	/* IRQs are off. */
	movq	%rax, %rdi
	movq	%rsp, %rsi
	call	do_syscall_64		/* returns with IRQs disabled */

	/*
	 * Try to use SYSRET instead of IRET if we're returning to
	 * a completely clean 64-bit userspace context.  If we're not,
	 * go to the slow exit path.
	 */
	movq	RCX(%rsp), %rcx
	movq	RIP(%rsp), %r11

	cmpq	%rcx, %r11	/* SYSRET requires RCX == RIP */
	jne	swapgs_restore_regs_and_return_to_usermode

	/*
	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
	 * in kernel space.  This essentially lets the user take over
	 * the kernel, since userspace controls RSP.
	 *
	 * If width of "canonical tail" ever becomes variable, this will need
	 * to be updated to remain correct on both old and new CPUs.
	 *
	 * Change top bits to match most significant bit (47th or 56th bit
	 * depending on paging mode) in the address.
	 */
#ifdef CONFIG_X86_5LEVEL
	ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
		"shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
#else
	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
#endif
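
/*
 * Userspace sketch of what the shl/sar pair above does (assumption:
 * illustrative only; 4-level paging with 48-bit virtual addresses, so
 * the shift count is 64 - 48 = 16; relies on arithmetic right shift of
 * signed values, which holds for the compilers the kernel supports):
 *
 *	#include <stdbool.h>
 *	#include <stdint.h>
 *
 *	static bool is_canonical_48(uint64_t addr)
 *	{
 *		// shifting up and arithmetically back down sign-extends
 *		// bit 47 into bits 63:48
 *		return (uint64_t)((int64_t)(addr << 16) >> 16) == addr;
 *	}
 *
 * The assembly keeps the possibly-modified value in %rcx and compares
 * it with the saved copy below; any difference means the address was
 * not canonical.
 */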

	/* If this changed %rcx, it was not canonical */
	cmpq	%rcx, %r11
	jne	swapgs_restore_regs_and_return_to_usermode

	cmpq	$__USER_CS, CS(%rsp)		/* CS must match SYSRET */
	jne	swapgs_restore_regs_and_return_to_usermode

	movq	R11(%rsp), %r11
	cmpq	%r11, EFLAGS(%rsp)		/* R11 == RFLAGS */
	jne	swapgs_restore_regs_and_return_to_usermode

	/*
	 * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
	 * restore RF properly. If the slowpath sets it for whatever reason, we
	 * need to restore it correctly.
	 *
	 * SYSRET can restore TF, but unlike IRET, restoring TF results in a
	 * trap from userspace immediately after SYSRET.  This would cause an
	 * infinite loop whenever #DB happens with register state that satisfies
	 * the opportunistic SYSRET conditions.  For example, single-stepping
	 * this user code:
	 *
	 *           movq	$stuck_here, %rcx
	 *           pushfq
	 *           popq %r11
	 *   stuck_here:
	 *
	 * would never get past 'stuck_here'.
	 */
	testq	$(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
	jnz	swapgs_restore_regs_and_return_to_usermode

	/* nothing to check for RSP */

	cmpq	$__USER_DS, SS(%rsp)		/* SS must match SYSRET */
	jne	swapgs_restore_regs_and_return_to_usermode

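	/*
	 * For reference, a compact C-style restatement of the opportunistic
	 * SYSRET conditions checked above (sketch only; the names below are
	 * made up for the example, and the canonical-RCX test is shown
	 * separately before this block):
	 *
	 *	#include <stdbool.h>
	 *	#include <stdint.h>
	 *
	 *	struct sysret_candidate {
	 *		uint64_t rcx, rip, r11, rflags;
	 *		uint16_t cs, ss;
	 *	};
	 *
	 *	static bool sysret_ok(const struct sysret_candidate *c,
	 *			      uint16_t user_cs, uint16_t user_ds)
	 *	{
	 *		return c->rcx == c->rip &&	// SYSRET reloads RIP from RCX
	 *		       c->r11 == c->rflags &&	// ... and RFLAGS from R11
	 *		       !(c->rflags & ((1u << 16) | (1u << 8))) && // no RF/TF
	 *		       c->cs == user_cs && c->ss == user_ds;
	 *	}
	 *
	 * Anything that fails these checks takes the IRET path via
	 * swapgs_restore_regs_and_return_to_usermode.
	 */
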
	/*
	 * We win! This label is here just for ease of understanding
	 * perf profiles. Nothing jumps here.
	 */
syscall_return_via_sysret:
	/* rcx and r11 are already restored (see code above) */
	POP_REGS pop_rdi=0 skip_r11rcx=1

	/*
	 * Now all regs are restored except RSP and RDI.
	 * Save old stack pointer and switch to trampoline stack.
	 */
	movq	%rsp, %rdi
	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
	UNWIND_HINT_EMPTY

	pushq	RSP-RDI(%rdi)	/* RSP */
	pushq	(%rdi)		/* RDI */

	/*
	 * We are on the trampoline stack.  All regs except RDI are live.
	 * We can do future final exit work right here.
	 */
	STACKLEAK_ERASE_NOCLOBBER

	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi

	popq	%rdi
	popq	%rsp
	USERGS_SYSRET64
SYM_CODE_END(entry_SYSCALL_64)

/*
 * %rdi: prev task
 * %rsi: next task
 */
.pushsection .text, "ax"
SYM_FUNC_START(__switch_to_asm)
	/*
	 * Save callee-saved registers
	 * This must match the order in inactive_task_frame
	 */
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	/* switch stack */
	movq	%rsp, TASK_threadsp(%rdi)
	movq	TASK_threadsp(%rsi), %rsp

#ifdef CONFIG_STACKPROTECTOR
	movq	TASK_stack_canary(%rsi), %rbx
	movq	%rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset
#endif

#ifdef CONFIG_RETPOLINE
	/*
	 * When switching from a shallower to a deeper call stack
	 * the RSB may either underflow or use entries populated
	 * with userspace addresses. On CPUs where those concerns
	 * exist, overwrite the RSB with entries which capture
	 * speculative execution to prevent attack.
	 */
	FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
#endif

	/* restore callee-saved registers */
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp

	jmp	__switch_to
SYM_FUNC_END(__switch_to_asm)
.popsection
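
/*
 * Sketch only, not the kernel's own definition (which lives in
 * <asm/switch_to.h> as struct inactive_task_frame): the stack pointer
 * saved in TASK_threadsp above points at a frame laid out the way the
 * push order implies, with the return address that 'jmp __switch_to'
 * eventually returns through on top:
 *
 *	#include <stdint.h>
 *
 *	struct switch_frame_sketch {
 *		uint64_t r15;		// lowest address == saved RSP
 *		uint64_t r14;
 *		uint64_t r13;
 *		uint64_t r12;
 *		uint64_t rbx;
 *		uint64_t rbp;
 *		uint64_t ret_addr;	// where the next task resumes
 *	};
 */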

/*
 * A newly forked process directly context switches into this address.
 *
 * rax: prev task we switched from
 * rbx: kernel thread func (NULL for user thread)
 * r12: kernel thread arg
 */
.pushsection .text, "ax"
SYM_CODE_START(ret_from_fork)
	UNWIND_HINT_EMPTY
	movq	%rax, %rdi
	call	schedule_tail			/* rdi: 'prev' task parameter */

	testq	%rbx, %rbx			/* from kernel_thread? */
	jnz	1f				/* kernel threads are uncommon */

2:
	UNWIND_HINT_REGS
	movq	%rsp, %rdi
	call	syscall_exit_to_user_mode	/* returns with IRQs disabled */
	jmp	swapgs_restore_regs_and_return_to_usermode

1:
	/* kernel thread */
	UNWIND_HINT_EMPTY
	movq	%r12, %rdi
	CALL_NOSPEC rbx
	/*
	 * A kernel thread is allowed to return here after successfully
	 * calling kernel_execve().  Exit to userspace to complete the execve()
	 * syscall.
	 */
	movq	$0, RAX(%rsp)
	jmp	2b
SYM_CODE_END(ret_from_fork)
.popsection

.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
#ifdef CONFIG_DEBUG_ENTRY
	pushq	%rax
	SAVE_FLAGS(CLBR_RAX)
	testl	$X86_EFLAGS_IF, %eax
	jz	.Lokay_\@
	ud2
.Lokay_\@:
	popq	%rax
#endif
.endm

/**
 * idtentry_body - Macro to emit code calling the C function
 * @cfunc:		C function to be called
 * @has_error_code:	Hardware pushed error code on stack
 */
.macro idtentry_body cfunc has_error_code:req

	call	error_entry
	UNWIND_HINT_REGS

	movq	%rsp, %rdi			/* pt_regs pointer into 1st argument */

	.if \has_error_code == 1
		movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument */
		movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
	.endif

	call	\cfunc

	jmp	error_return
.endm

/**
 * idtentry - Macro to generate entry stubs for simple IDT entries
 * @vector:		Vector number
 * @asmsym:		ASM symbol for the entry point
 * @cfunc:		C function to be called
 * @has_error_code:	Hardware pushed error code on stack
 *
 * The macro emits code to set up the kernel context for straight forward
 * and simple IDT entries. No IST stack, no paranoid entry checks.
 */
.macro idtentry vector asmsym cfunc has_error_code:req
SYM_CODE_START(\asmsym)
	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
	ASM_CLAC

	.if \has_error_code == 0
		pushq	$-1			/* ORIG_RAX: no syscall to restart */
	.endif

	.if \vector == X86_TRAP_BP
		/*
		 * If coming from kernel space, create a 6-word gap to allow the
		 * int3 handler to emulate a call instruction.
		 */
		testb	$3, CS-ORIG_RAX(%rsp)
		jnz	.Lfrom_usermode_no_gap_\@
		.rept	6
		pushq	5*8(%rsp)
		.endr
		UNWIND_HINT_IRET_REGS offset=8
.Lfrom_usermode_no_gap_\@:
	.endif

	idtentry_body \cfunc \has_error_code

_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm

/*
 * Interrupt entry/exit.
 *
 * The interrupt stubs push (vector) onto the stack, which is the error_code
 * position of idtentry exceptions, and jump to one of the two idtentry points
 * (common/spurious).
 *
 * common_interrupt is a hotpath, align it to a cache line
 */
.macro idtentry_irq vector cfunc
	.p2align CONFIG_X86_L1_CACHE_SHIFT
	idtentry \vector asm_\cfunc \cfunc has_error_code=1
.endm

/*
 * System vectors which invoke their handlers directly and are not
 * going through the regular common device interrupt handling code.
 */
.macro idtentry_sysvec vector cfunc
	idtentry \vector asm_\cfunc \cfunc has_error_code=0
.endm

/**
 * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB
 * @vector:		Vector number
 * @asmsym:		ASM symbol for the entry point
 * @cfunc:		C function to be called
 *
 * The macro emits code to set up the kernel context for #MC and #DB
 *
 * If the entry comes from user space it uses the normal entry path
 * including the return to user space work and preemption checks on
 * exit.
 *
 * If it hits in kernel mode then it needs to go through the paranoid
 * entry as the exception can hit any random state. No preemption
 * check on exit to keep the paranoid path simple.
 */
.macro idtentry_mce_db vector asmsym cfunc
SYM_CODE_START(\asmsym)
	UNWIND_HINT_IRET_REGS
	ASM_CLAC

	pushq	$-1			/* ORIG_RAX: no syscall to restart */

	/*
	 * If the entry is from userspace, switch stacks and treat it as
	 * a normal entry.
	 */
	testb	$3, CS-ORIG_RAX(%rsp)
	jnz	.Lfrom_usermode_switch_stack_\@

	/* paranoid_entry returns GS information for paranoid_exit in EBX. */
	call	paranoid_entry

	UNWIND_HINT_REGS

	movq	%rsp, %rdi		/* pt_regs pointer */

	call	\cfunc

	jmp	paranoid_exit

	/* Switch to the regular task stack and use the noist entry point */
.Lfrom_usermode_switch_stack_\@:
	idtentry_body noist_\cfunc, has_error_code=0

_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm

/*
 * Double fault entry. Straight paranoid. No checks from which context
 * this comes because for the espfix induced #DF this would do the wrong
 * thing.
 */
.macro idtentry_df vector asmsym cfunc
SYM_CODE_START(\asmsym)
	UNWIND_HINT_IRET_REGS offset=8
	ASM_CLAC

	/* paranoid_entry returns GS information for paranoid_exit in EBX. */
	call	paranoid_entry
	UNWIND_HINT_REGS

	movq	%rsp, %rdi		/* pt_regs pointer into first argument */
	movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument */
	movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
	call	\cfunc

	jmp	paranoid_exit

_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm

/*
 * Include the defines which emit the idt entries which are shared
 * between 32 and 64 bit and emit the __irqentry_text_* markers
 * so the stacktrace boundary checks work.
 */
	.align 16
	.globl __irqentry_text_start
__irqentry_text_start:

#include <asm/idtentry.h>

	.align 16
	.globl __irqentry_text_end
__irqentry_text_end:

SYM_CODE_START_LOCAL(common_interrupt_return)
SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
#ifdef CONFIG_DEBUG_ENTRY
	/* Assert that pt_regs indicates user mode. */
	testb	$3, CS(%rsp)
	jnz	1f
	ud2
1:
#endif
	POP_REGS pop_rdi=0

	/*
	 * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
	 * Save old stack pointer and switch to trampoline stack.
	 */
	movq	%rsp, %rdi
	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
	UNWIND_HINT_EMPTY

	/* Copy the IRET frame to the trampoline stack. */
	pushq	6*8(%rdi)	/* SS */
	pushq	5*8(%rdi)	/* RSP */
	pushq	4*8(%rdi)	/* EFLAGS */
	pushq	3*8(%rdi)	/* CS */
	pushq	2*8(%rdi)	/* RIP */

	/* Push user RDI on the trampoline stack. */
	pushq	(%rdi)

	/*
	 * We are on the trampoline stack.  All regs except RDI are live.
	 * We can do future final exit work right here.
	 */
	STACKLEAK_ERASE_NOCLOBBER

	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi

	/* Restore RDI. */
	popq	%rdi
	SWAPGS
	INTERRUPT_RETURN


SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
#ifdef CONFIG_DEBUG_ENTRY
	/* Assert that pt_regs indicates kernel mode. */
	testb	$3, CS(%rsp)
	jz	1f
	ud2
1:
#endif
	POP_REGS
	addq	$8, %rsp	/* skip regs->orig_ax */
	/*
	 * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
	 * when returning from IPI handler.
	 */
	INTERRUPT_RETURN

SYM_INNER_LABEL_ALIGN(native_iret, SYM_L_GLOBAL)
	UNWIND_HINT_IRET_REGS
	/*
	 * Are we returning to a stack segment from the LDT?  Note: in
	 * 64-bit mode SS:RSP on the exception stack is always valid.
	 */
#ifdef CONFIG_X86_ESPFIX64
	testb	$4, (SS-RIP)(%rsp)
	jnz	native_irq_return_ldt
#endif

SYM_INNER_LABEL(native_irq_return_iret, SYM_L_GLOBAL)
	/*
	 * This may fault.  Non-paranoid faults on return to userspace are
	 * handled by fixup_bad_iret.  These include #SS, #GP, and #NP.
	 * Double-faults due to espfix64 are handled in exc_double_fault.
	 * Other faults here are fatal.
	 */
	iretq

#ifdef CONFIG_X86_ESPFIX64
native_irq_return_ldt:
	/*
	 * We are running with user GSBASE.  All GPRs contain their user
	 * values.  We have a percpu ESPFIX stack that is eight slots
	 * long (see ESPFIX_STACK_SIZE).  espfix_waddr points to the bottom
	 * of the ESPFIX stack.
	 *
	 * We clobber RAX and RDI in this code.  We stash RDI on the
	 * normal stack and RAX on the ESPFIX stack.
	 *
	 * The ESPFIX stack layout we set up looks like this:
	 *
	 * --- top of ESPFIX stack ---
	 * SS
	 * RSP
	 * RFLAGS
	 * CS
	 * RIP  <-- RSP points here when we're done
	 * RAX  <-- espfix_waddr points here
	 * --- bottom of ESPFIX stack ---
	 */

	pushq	%rdi				/* Stash user RDI */
	SWAPGS					/* to kernel GS */
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi	/* to kernel CR3 */

	movq	PER_CPU_VAR(espfix_waddr), %rdi
	movq	%rax, (0*8)(%rdi)		/* user RAX */
	movq	(1*8)(%rsp), %rax		/* user RIP */
	movq	%rax, (1*8)(%rdi)
	movq	(2*8)(%rsp), %rax		/* user CS */
	movq	%rax, (2*8)(%rdi)
	movq	(3*8)(%rsp), %rax		/* user RFLAGS */
	movq	%rax, (3*8)(%rdi)
	movq	(5*8)(%rsp), %rax		/* user SS */
	movq	%rax, (5*8)(%rdi)
	movq	(4*8)(%rsp), %rax		/* user RSP */
	movq	%rax, (4*8)(%rdi)
	/* Now RAX == RSP. */

	andl	$0xffff0000, %eax		/* RAX = (RSP & 0xffff0000) */

	/*
	 * espfix_stack[31:16] == 0.  The page tables are set up such that
	 * (espfix_stack | (X & 0xffff0000)) points to a read-only alias of
	 * espfix_waddr for any X.  That is, there are 65536 RO aliases of
	 * the same page.  Set up RSP so that RSP[31:16] contains the
	 * respective 16 bits of the /userspace/ RSP and RSP nonetheless
	 * still points to an RO alias of the ESPFIX stack.
	 */
	orq	PER_CPU_VAR(espfix_stack), %rax

	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
	SWAPGS					/* to user GS */
	popq	%rdi				/* Restore user RDI */

	movq	%rax, %rsp
	UNWIND_HINT_IRET_REGS offset=8

	/*
	 * At this point, we cannot write to the stack any more, but we can
	 * still read.
	 */
	popq	%rax				/* Restore user RAX */

	/*
	 * RSP now points to an ordinary IRET frame, except that the page
	 * is read-only and RSP[31:16] are preloaded with the userspace
	 * values.  We can now IRET back to userspace.
	 */
	jmp	native_irq_return_iret
#endif
SYM_CODE_END(common_interrupt_return)
_ASM_NOKPROBE(common_interrupt_return)
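
/*
 * Sketch only (illustrative, not kernel code) of the two espfix64
 * computations used above: the SS Table-Indicator check that selects
 * the LDT path, and how the per-cpu read-only alias address is formed
 * from the 16 interesting bits of the user RSP.  The helper names are
 * made up for the example.
 *
 *	#include <stdbool.h>
 *	#include <stdint.h>
 *
 *	static bool ss_is_ldt(uint16_t ss)
 *	{
 *		return ss & 0x4;	// selector bit 2 = TI, 1 => LDT
 *	}
 *
 *	static uint64_t espfix_alias(uint64_t espfix_stack, uint64_t user_rsp)
 *	{
 *		// RSP[31:16] take the user value; everything else points
 *		// at the RO alias of the per-cpu ESPFIX stack.
 *		return espfix_stack | (user_rsp & 0xffff0000u);
 *	}
 */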

/*
 * Reload gs selector with exception handling
 * edi:  new selector
 *
 * Is in entry.text as it shouldn't be instrumented.
 */
SYM_FUNC_START(asm_load_gs_index)
	FRAME_BEGIN
	swapgs
.Lgs_change:
	movl	%edi, %gs
2:	ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
	swapgs
	FRAME_END
	ret
SYM_FUNC_END(asm_load_gs_index)
EXPORT_SYMBOL(asm_load_gs_index)

	_ASM_EXTABLE(.Lgs_change, .Lbad_gs)
	.section .fixup, "ax"
	/* running with kernelgs */
SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs)
	swapgs					/* switch back to user gs */
.macro ZAP_GS
	/* This can't be a string because the preprocessor needs to see it. */
	movl $__USER_DS, %eax
	movl %eax, %gs
.endm
	ALTERNATIVE "", "ZAP_GS", X86_BUG_NULL_SEG
	xorl	%eax, %eax
	movl	%eax, %gs
	jmp	2b
SYM_CODE_END(.Lbad_gs)
	.previous

/*
 * rdi: New stack pointer points to the top word of the stack
 * rsi: Function pointer
 * rdx: Function argument (can be NULL if none)
 */
SYM_FUNC_START(asm_call_on_stack)
SYM_INNER_LABEL(asm_call_sysvec_on_stack, SYM_L_GLOBAL)
SYM_INNER_LABEL(asm_call_irq_on_stack, SYM_L_GLOBAL)
	/*
	 * Save the frame pointer unconditionally. This allows the ORC
	 * unwinder to handle the stack switch.
	 */
	pushq	%rbp
	mov	%rsp, %rbp

	/*
	 * The unwinder relies on the word at the top of the new stack
	 * page linking back to the previous RSP.
	 */
	mov	%rsp, (%rdi)
	mov	%rdi, %rsp
	/* Move the argument to the right place */
	mov	%rdx, %rdi

1:
	.pushsection .discard.instr_begin
	.long 1b - .
	.popsection

	CALL_NOSPEC rsi

2:
	.pushsection .discard.instr_end
	.long 2b - .
	.popsection

	/* Restore the previous stack pointer from RBP. */
	leaveq
	ret
SYM_FUNC_END(asm_call_on_stack)

#ifdef CONFIG_XEN_PV
/*
 * A note on the "critical region" in our callback handler.
 * We want to avoid stacking callback handlers due to events occurring
 * during handling of the last event. To do this, we keep events disabled
 * until we've done all processing. HOWEVER, we must enable events before
 * popping the stack frame (can't be done atomically) and so it would still
 * be possible to get enough handler activations to overflow the stack.
 * Although unlikely, bugs of that kind are hard to track down, so we'd
 * like to avoid the possibility.
 * So, on entry to the handler we detect whether we interrupted an
 * existing activation in its critical region -- if so, we pop the current
 * activation and restart the handler using the previous one.
 *
 * C calling convention: exc_xen_hypervisor_callback(struct pt_regs *)
 */
SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback)

/*
 * Since we don't modify %rdi, evtchn_do_upcall(struct pt_regs *) will
 * see the correct pointer to the pt_regs
 */
	UNWIND_HINT_FUNC
	movq	%rdi, %rsp			/* we don't return, adjust the stack frame */
	UNWIND_HINT_REGS

	call	xen_pv_evtchn_do_upcall

	jmp	error_return
SYM_CODE_END(exc_xen_hypervisor_callback)

/*
 * Hypervisor uses this for application faults while it executes.
 * We get here for two reasons:
 *  1. Fault while reloading DS, ES, FS or GS
 *  2. Fault while executing IRET
 * Category 1 we do not need to fix up as Xen has already reloaded all segment
 * registers that could be reloaded and zeroed the others.
 * Category 2 we fix up by killing the current process. We cannot use the
 * normal Linux return path in this case because if we use the IRET hypercall
 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
 * We distinguish between categories by comparing each saved segment register
 * with its current contents: any discrepancy means we are in category 1.
 */
SYM_CODE_START(xen_failsafe_callback)
	UNWIND_HINT_EMPTY
	movl	%ds, %ecx
	cmpw	%cx, 0x10(%rsp)
	jne	1f
	movl	%es, %ecx
	cmpw	%cx, 0x18(%rsp)
	jne	1f
	movl	%fs, %ecx
	cmpw	%cx, 0x20(%rsp)
	jne	1f
	movl	%gs, %ecx
	cmpw	%cx, 0x28(%rsp)
	jne	1f
	/* All segments match their saved values => Category 2 (Bad IRET). */
	movq	(%rsp), %rcx
	movq	8(%rsp), %r11
	addq	$0x30, %rsp
	pushq	$0				/* RIP */
	UNWIND_HINT_IRET_REGS offset=8
	jmp	asm_exc_general_protection
1:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
	movq	(%rsp), %rcx
	movq	8(%rsp), %r11
	addq	$0x30, %rsp
	UNWIND_HINT_IRET_REGS
	pushq	$-1 /* orig_ax = -1 => not a system call */
	PUSH_AND_CLEAR_REGS
	ENCODE_FRAME_POINTER
	jmp	error_return
SYM_CODE_END(xen_failsafe_callback)
#endif /* CONFIG_XEN_PV */

/*
 * Save all registers in pt_regs.  Return GSBASE related information
 * in EBX depending on the availability of the FSGSBASE instructions:
 *
 * FSGSBASE	R/EBX
 *     N        0 -> SWAPGS on exit
 *              1 -> no SWAPGS on exit
 *
 *     Y        GSBASE value at entry, must be restored in paranoid_exit
 */
SYM_CODE_START_LOCAL(paranoid_entry)
	UNWIND_HINT_FUNC
	cld
	PUSH_AND_CLEAR_REGS save_ret=1
	ENCODE_FRAME_POINTER 8

	/*
	 * Always stash CR3 in %r14.  This value will be restored,
	 * verbatim, at exit.  Needed if paranoid_entry interrupted
	 * another entry that already switched to the user CR3 value
	 * but has not yet returned to userspace.
	 *
	 * This is also why CS (stashed in the "iret frame" by the
	 * hardware at entry) can not be used: this may be a return
	 * to kernel code, but with a user CR3 value.
	 *
	 * Switching CR3 does not depend on kernel GSBASE so it can
	 * be done before switching to the kernel GSBASE. This is
	 * required for FSGSBASE because the kernel GSBASE has to
	 * be retrieved from a kernel internal table.
	 */
	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14

	/*
	 * Handling GSBASE depends on the availability of FSGSBASE.
	 *
	 * Without FSGSBASE the kernel enforces that negative GSBASE
	 * values indicate kernel GSBASE. With FSGSBASE no assumptions
	 * can be made about the GSBASE value when entering from user
	 * space.
	 */
	ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE

	/*
	 * Read the current GSBASE and store it in %rbx unconditionally,
	 * retrieve and set the current CPUs kernel GSBASE. The stored value
	 * has to be restored in paranoid_exit unconditionally.
	 *
	 * The unconditional write to GS base below ensures that no subsequent
	 * loads based on a mispredicted GS base can happen, therefore no LFENCE
	 * is needed here.
	 */
	SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx
	ret

.Lparanoid_entry_checkgs:
	/* EBX = 1 -> kernel GSBASE active, no restore required */
	movl	$1, %ebx
	/*
	 * The kernel-enforced convention is a negative GSBASE indicates
	 * a kernel value. No SWAPGS needed on entry and exit.
	 */
	movl	$MSR_GS_BASE, %ecx
	rdmsr
	testl	%edx, %edx
	jns	.Lparanoid_entry_swapgs
	ret

.Lparanoid_entry_swapgs:
	SWAPGS

	/*
	 * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
	 * unconditional CR3 write, even in the PTI case.  So do an lfence
	 * to prevent GS speculation, regardless of whether PTI is enabled.
	 */
	FENCE_SWAPGS_KERNEL_ENTRY

	/* EBX = 0 -> SWAPGS required on exit */
	xorl	%ebx, %ebx
	ret
SYM_CODE_END(paranoid_entry)
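
/*
 * Sketch only, restating the non-FSGSBASE convention used above: the
 * kernel GSBASE always lives in the top half of the canonical address
 * space, so the sign bit of the MSR value (the sign of %edx after
 * 'rdmsr' of MSR_GS_BASE) distinguishes kernel from user GSBASE.
 *
 *	#include <stdbool.h>
 *	#include <stdint.h>
 *
 *	static bool gsbase_is_kernel(uint64_t gsbase)
 *	{
 *		// negative => kernel GSBASE, no SWAPGS needed
 *		return (int64_t)gsbase < 0;
 *	}
 */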

/*
 * "Paranoid" exit path from exception stack.  This is invoked
 * only on return from non-NMI IST interrupts that came
 * from kernel space.
 *
 * We may be returning to very strange contexts (e.g. very early
 * in syscall entry), so checking for preemption here would
 * be complicated.  Fortunately, there's no good reason to try
 * to handle preemption here.
 *
 * R/EBX contains the GSBASE related information depending on the
 * availability of the FSGSBASE instructions:
 *
 * FSGSBASE	R/EBX
 *     N        0 -> SWAPGS on exit
 *              1 -> no SWAPGS on exit
 *
 *     Y        User space GSBASE, must be restored unconditionally
 */
SYM_CODE_START_LOCAL(paranoid_exit)
	UNWIND_HINT_REGS
	/*
	 * The order of operations is important. RESTORE_CR3 requires
	 * kernel GSBASE.
	 *
	 * NB to anyone to try to optimize this code: this code does
	 * not execute at all for exceptions from user mode. Those
	 * exceptions go through error_exit instead.
	 */
	RESTORE_CR3	scratch_reg=%rax save_reg=%r14

	/* Handle the three GSBASE cases */
	ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE

	/* With FSGSBASE enabled, unconditionally restore GSBASE */
	wrgsbase	%rbx
	jmp	restore_regs_and_return_to_kernel

.Lparanoid_exit_checkgs:
	/* On non-FSGSBASE systems, conditionally do SWAPGS */
	testl	%ebx, %ebx
	jnz	restore_regs_and_return_to_kernel

	/* We are returning to a context with user GSBASE */
	SWAPGS_UNSAFE_STACK
	jmp	restore_regs_and_return_to_kernel
SYM_CODE_END(paranoid_exit)

/*
 * Save all registers in pt_regs, and switch GS if needed.
 */
SYM_CODE_START_LOCAL(error_entry)
	UNWIND_HINT_FUNC
	cld
	PUSH_AND_CLEAR_REGS save_ret=1
	ENCODE_FRAME_POINTER 8
	testb	$3, CS+8(%rsp)
	jz	.Lerror_kernelspace

	/*
	 * We entered from user mode or we're pretending to have entered
	 * from user mode due to an IRET fault.
	 */
	SWAPGS
	FENCE_SWAPGS_USER_ENTRY
	/* We have user CR3.  Change to kernel CR3. */
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax

.Lerror_entry_from_usermode_after_swapgs:
	/* Put us onto the real thread stack. */
	popq	%r12				/* save return addr in %r12 */
	movq	%rsp, %rdi			/* arg0 = pt_regs pointer */
	call	sync_regs
	movq	%rax, %rsp			/* switch stack */
	ENCODE_FRAME_POINTER
	pushq	%r12
	ret

.Lerror_entry_done_lfence:
	FENCE_SWAPGS_KERNEL_ENTRY
.Lerror_entry_done:
	ret

	/*
	 * There are two places in the kernel that can potentially fault with
	 * usergs. Handle them here.  B stepping K8s sometimes report a
	 * truncated RIP for IRET exceptions returning to compat mode. Check
	 * for these here too.
	 */
.Lerror_kernelspace:
	leaq	native_irq_return_iret(%rip), %rcx
	cmpq	%rcx, RIP+8(%rsp)
	je	.Lerror_bad_iret
	movl	%ecx, %eax			/* zero extend */
	cmpq	%rax, RIP+8(%rsp)
	je	.Lbstep_iret
	cmpq	$.Lgs_change, RIP+8(%rsp)
	jne	.Lerror_entry_done_lfence

	/*
	 * hack: .Lgs_change can fail with user gsbase.  If this happens, fix up
	 * gsbase and proceed.  We'll fix up the exception and land in
	 * .Lgs_change's error handler with kernel gsbase.
	 */
	SWAPGS
	FENCE_SWAPGS_USER_ENTRY
	jmp .Lerror_entry_done

.Lbstep_iret:
	/* Fix truncated RIP */
	movq	%rcx, RIP+8(%rsp)
	/* fall through */

.Lerror_bad_iret:
	/*
	 * We came from an IRET to user mode, so we have user
	 * gsbase and CR3.  Switch to kernel gsbase and CR3:
	 */
	SWAPGS
	FENCE_SWAPGS_USER_ENTRY
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax

	/*
	 * Pretend that the exception came from user mode: set up pt_regs
	 * as if we faulted immediately after IRET.
	 */
	mov	%rsp, %rdi
	call	fixup_bad_iret
	mov	%rax, %rsp
	jmp	.Lerror_entry_from_usermode_after_swapgs
SYM_CODE_END(error_entry)

SYM_CODE_START_LOCAL(error_return)
	UNWIND_HINT_REGS
	DEBUG_ENTRY_ASSERT_IRQS_OFF
	testb	$3, CS(%rsp)
	jz	restore_regs_and_return_to_kernel
	jmp	swapgs_restore_regs_and_return_to_usermode
SYM_CODE_END(error_return)

/*
 * Runs on exception stack.  Xen PV does not go through this path at all,
 * so we can use real assembly here.
 *
 * Registers:
 *	%r14: Used to save/restore the CR3 of the interrupted context
 *	      when PAGE_TABLE_ISOLATION is in use.  Do not clobber.
 */
SYM_CODE_START(asm_exc_nmi)
	UNWIND_HINT_IRET_REGS

	/*
	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
	 * the iretq it performs will take us out of NMI context.
	 * This means that we can have nested NMIs where the next
	 * NMI is using the top of the stack of the previous NMI. We
	 * can't let it execute because the nested NMI will corrupt the
	 * stack of the previous NMI. NMI handlers are not re-entrant
	 * anyway.
	 *
	 * To handle this case we do the following:
	 *  Check a special location on the stack that contains
	 *  a variable that is set when NMIs are executing.
	 *  The interrupted task's stack is also checked to see if it
	 *  is an NMI stack.
	 *  If the variable is not set and the stack is not the NMI
	 *  stack then:
	 *    o Set the special variable on the stack
	 *    o Copy the interrupt frame into an "outermost" location on the
	 *      stack
	 *    o Copy the interrupt frame into an "iret" location on the stack
	 *    o Continue processing the NMI
	 *  If the variable is set or the previous stack is the NMI stack:
	 *    o Modify the "iret" location to jump to the repeat_nmi
	 *    o return back to the first NMI
	 *
	 * Now on exit of the first NMI, we first clear the stack variable
	 * The NMI stack will tell any nested NMIs at that point that it is
	 * nested. Then we pop the stack normally with iret, and if there was
	 * a nested NMI that updated the copy interrupt stack frame, a
	 * jump will be made to the repeat_nmi code that will handle the second
	 * NMI.
	 *
	 * However, espfix prevents us from directly returning to userspace
	 * with a single IRET instruction.  Similarly, IRET to user mode
	 * can fault.  We therefore handle NMIs from user space like
	 * other IST entries.
	 */

	ASM_CLAC

	/* Use %rdx as our temp variable throughout */
	pushq	%rdx

	testb	$3, CS-RIP+8(%rsp)
	jz	.Lnmi_from_kernel

	/*
	 * NMI from user mode.  We need to run on the thread stack, but we
	 * can't go through the normal entry paths: NMIs are masked, and
	 * we don't want to enable interrupts, because then we'll end
	 * up in an awkward situation in which IRQs are on but NMIs
	 * are off.
	 *
	 * We also must not push anything to the stack before switching
	 * stacks lest we corrupt the "NMI executing" variable.
	 */

	swapgs
	cld
	FENCE_SWAPGS_USER_ENTRY
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
	movq	%rsp, %rdx
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
	UNWIND_HINT_IRET_REGS base=%rdx offset=8
	pushq	5*8(%rdx)	/* pt_regs->ss */
	pushq	4*8(%rdx)	/* pt_regs->rsp */
	pushq	3*8(%rdx)	/* pt_regs->flags */
	pushq	2*8(%rdx)	/* pt_regs->cs */
	pushq	1*8(%rdx)	/* pt_regs->rip */
	UNWIND_HINT_IRET_REGS
	pushq	$-1		/* pt_regs->orig_ax */
	PUSH_AND_CLEAR_REGS rdx=(%rdx)
	ENCODE_FRAME_POINTER

	/*
	 * At this point we no longer need to worry about stack damage
	 * due to nesting -- we're on the normal thread stack and we're
	 * done with the NMI stack.
	 */

	movq	%rsp, %rdi
	movq	$-1, %rsi
	call	exc_nmi

	/*
	 * Return back to user mode.  We must *not* do the normal exit
	 * work, because we don't want to enable interrupts.
	 */
	jmp	swapgs_restore_regs_and_return_to_usermode

.Lnmi_from_kernel:
	/*
	 * Here's what our stack frame will look like:
	 * +---------------------------------------------------------+
	 * | original SS                                             |
	 * | original Return RSP                                     |
	 * | original RFLAGS                                         |
	 * | original CS                                             |
	 * | original RIP                                            |
	 * +---------------------------------------------------------+
	 * | temp storage for rdx                                    |
	 * +---------------------------------------------------------+
	 * | "NMI executing" variable                                |
	 * +---------------------------------------------------------+
	 * | iret SS          } Copied from "outermost" frame        |
	 * | iret Return RSP  } on each loop iteration; overwritten  |
	 * | iret RFLAGS      } by a nested NMI to force another     |
	 * | iret CS          } iteration if needed.                 |
	 * | iret RIP         }                                      |
	 * +---------------------------------------------------------+
	 * | outermost SS          } initialized in first_nmi;       |
	 * | outermost Return RSP  } will not be changed before      |
	 * | outermost RFLAGS      } NMI processing is done.         |
	 * | outermost CS          } Copied to "iret" frame on each  |
	 * | outermost RIP         } iteration.                      |
	 * +---------------------------------------------------------+
	 * | pt_regs                                                 |
	 * +---------------------------------------------------------+
	 *
	 * The "original" frame is used by hardware.  Before re-enabling
	 * NMIs, we need to be done with it, and we need to leave enough
	 * space for the asm code here.
	 *
	 * We return by executing IRET while RSP points to the "iret" frame.
	 * That will either return for real or it will loop back into NMI
	 * processing.
	 *
	 * The "outermost" frame is copied to the "iret" frame on each
	 * iteration of the loop, so each iteration starts with the "iret"
	 * frame pointing to the final return target.
	 */

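	/*
	 * Simplified sketch (illustrative model only; the RSP-within-
	 * NMI-stack check and the DF trick below are omitted, and the
	 * function name is made up) of how the incoming NMI is
	 * classified by the code that follows:
	 *
	 *	#include <stdint.h>
	 *
	 *	enum nmi_kind { NMI_FIRST, NMI_NESTED, NMI_RESUME_OUTER };
	 *
	 *	static enum nmi_kind classify_nmi(uint64_t rip,
	 *					  uint64_t nmi_executing,
	 *					  uint64_t repeat_nmi,
	 *					  uint64_t end_repeat_nmi)
	 *	{
	 *		// Interrupted the outer NMI's copy loop: just return,
	 *		// the outer handler will call exc_nmi() anyway.
	 *		if (rip >= repeat_nmi && rip < end_repeat_nmi)
	 *			return NMI_RESUME_OUTER;
	 *		// "NMI executing" set: rewrite the "iret" frame so the
	 *		// outer NMI repeats itself.
	 *		if (nmi_executing)
	 *			return NMI_NESTED;
	 *		return NMI_FIRST;
	 *	}
	 */
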
	/*
	 * Determine whether we're a nested NMI.
	 *
	 * If we interrupted kernel code between repeat_nmi and
	 * end_repeat_nmi, then we are a nested NMI.  We must not
	 * modify the "iret" frame because it's being written by
	 * the outer NMI.  That's okay; the outer NMI handler is
	 * about to call exc_nmi() anyway, so we can just
	 * resume the outer NMI.
	 */

	movq	$repeat_nmi, %rdx
	cmpq	8(%rsp), %rdx
	ja	1f
	movq	$end_repeat_nmi, %rdx
	cmpq	8(%rsp), %rdx
	ja	nested_nmi_out
1:

	/*
	 * Now check "NMI executing".  If it's set, then we're nested.
	 * This will not detect if we interrupted an outer NMI just
	 * before IRET.
	 */
	cmpl	$1, -8(%rsp)
	je	nested_nmi

	/*
	 * Now test if the previous stack was an NMI stack.  This covers
	 * the case where we interrupt an outer NMI after it clears
	 * "NMI executing" but before IRET.  We need to be careful, though:
	 * there is one case in which RSP could point to the NMI stack
	 * despite there being no NMI active: naughty userspace controls
	 * RSP at the very beginning of the SYSCALL targets.  We can
	 * pull a fast one on naughty userspace, though: we program
	 * SYSCALL to mask DF, so userspace cannot cause DF to be set
	 * if it controls the kernel's RSP.  We set DF before we clear
	 * "NMI executing".
	 */
	lea	6*8(%rsp), %rdx
	/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
	cmpq	%rdx, 4*8(%rsp)
	/* If the stack pointer is above the NMI stack, this is a normal NMI */
	ja	first_nmi

	subq	$EXCEPTION_STKSZ, %rdx
	cmpq	%rdx, 4*8(%rsp)
	/* If it is below the NMI stack, it is a normal NMI */
	jb	first_nmi

	/* Ah, it is within the NMI stack. */

	testb	$(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
	jz	first_nmi	/* RSP was user controlled. */

	/* This is a nested NMI. */

nested_nmi:
	/*
	 * Modify the "iret" frame to point to repeat_nmi, forcing another
	 * iteration of NMI handling.
	 */
	subq	$8, %rsp
	leaq	-10*8(%rsp), %rdx
	pushq	$__KERNEL_DS
	pushq	%rdx
	pushfq
	pushq	$__KERNEL_CS
	pushq	$repeat_nmi

	/* Put stack back */
	addq	$(6*8), %rsp

nested_nmi_out:
	popq	%rdx

	/* We are returning to kernel mode, so this cannot result in a fault. */
	iretq

first_nmi:
	/* Restore rdx. */
	movq	(%rsp), %rdx

	/* Make room for "NMI executing". */
	pushq	$0

	/* Leave room for the "iret" frame */
	subq	$(5*8), %rsp

	/* Copy the "original" frame to the "outermost" frame */
	.rept 5
	pushq	11*8(%rsp)
	.endr
	UNWIND_HINT_IRET_REGS

	/* Everything up to here is safe from nested NMIs */

#ifdef CONFIG_DEBUG_ENTRY
	/*
	 * For ease of testing, unmask NMIs right away.  Disabled by
	 * default because IRET is very expensive.
	 */
	pushq	$0		/* SS */
	pushq	%rsp		/* RSP (minus 8 because of the previous push) */
	addq	$8, (%rsp)	/* Fix up RSP */
	pushfq			/* RFLAGS */
	pushq	$__KERNEL_CS	/* CS */
	pushq	$1f		/* RIP */
	iretq			/* continues at repeat_nmi below */
	UNWIND_HINT_IRET_REGS
1:
#endif

repeat_nmi:
	/*
	 * If there was a nested NMI, the first NMI's iret will return
	 * here. But NMIs are still enabled and we can take another
	 * nested NMI. The nested NMI checks the interrupted RIP to see
	 * if it is between repeat_nmi and end_repeat_nmi, and if so
	 * it will just return, as we are about to repeat an NMI anyway.
	 * This makes it safe to copy to the stack frame that a nested
	 * NMI will update.
	 *
	 * RSP is pointing to "outermost RIP".  gsbase is unknown, but, if
	 * we're repeating an NMI, gsbase has the same value that it had on
	 * the first iteration.  paranoid_entry will load the kernel
	 * gsbase if needed before we call exc_nmi().  "NMI executing"
	 * is zero.
	 */
	movq	$1, 10*8(%rsp)		/* Set "NMI executing". */

	/*
	 * Copy the "outermost" frame to the "iret" frame.  NMIs that nest
	 * here must not modify the "iret" frame while we're writing to
	 * it or it will end up containing garbage.
	 */
	addq	$(10*8), %rsp
	.rept 5
	pushq	-6*8(%rsp)
	.endr
	subq	$(5*8), %rsp
end_repeat_nmi:

	/*
	 * Everything below this point can be preempted by a nested NMI.
	 * If this happens, then the inner NMI will change the "iret"
	 * frame to point back to repeat_nmi.
	 */
	pushq	$-1				/* ORIG_RAX: no syscall to restart */

	/*
	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
	 * as we should not be calling schedule in NMI context.
	 * Even with normal interrupts enabled. An NMI should not be
	 * setting NEED_RESCHED or anything that normal interrupts and
	 * exceptions might do.
	 */
	call	paranoid_entry
	UNWIND_HINT_REGS

	movq	%rsp, %rdi
	movq	$-1, %rsi
	call	exc_nmi

	/* Always restore stashed CR3 value (see paranoid_entry) */
	RESTORE_CR3 scratch_reg=%r15 save_reg=%r14

	/*
	 * The above invocation of paranoid_entry stored the GSBASE
	 * related information in R/EBX depending on the availability
	 * of FSGSBASE.
	 *
	 * If FSGSBASE is enabled, restore the saved GSBASE value
	 * unconditionally, otherwise take the conditional SWAPGS path.
	 */
	ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE

	wrgsbase	%rbx
	jmp	nmi_restore

nmi_no_fsgsbase:
	/* EBX == 0 -> invoke SWAPGS */
	testl	%ebx, %ebx
	jnz	nmi_restore

nmi_swapgs:
	SWAPGS_UNSAFE_STACK

nmi_restore:
	POP_REGS

	/*
	 * Skip orig_ax and the "outermost" frame to point RSP at the "iret"
	 * frame.
	 */
	addq	$6*8, %rsp

	/*
	 * Clear "NMI executing".  Set DF first so that we can easily
	 * distinguish the remaining code between here and IRET from
	 * the SYSCALL entry and exit paths.
	 *
	 * We arguably should just inspect RIP instead, but I (Andy) wrote
	 * this code when I had the misapprehension that Xen PV supported
	 * NMIs, and Xen PV would break that approach.
	 */
	std
	movq	$0, 5*8(%rsp)		/* clear "NMI executing" */

	/*
	 * iretq reads the "iret" frame and exits the NMI stack in a
	 * single instruction.  We are returning to kernel mode, so this
	 * cannot result in a fault.  Similarly, we don't need to worry
	 * about espfix64 on the way back to kernel mode.
	 */
	iretq
SYM_CODE_END(asm_exc_nmi)

#ifndef CONFIG_IA32_EMULATION
/*
 * This handles SYSCALL from 32-bit code.  There is no way to program
 * MSRs to fully disable 32-bit SYSCALL.
 */
SYM_CODE_START(ignore_sysret)
	UNWIND_HINT_EMPTY
	mov	$-ENOSYS, %eax
	sysretl
SYM_CODE_END(ignore_sysret)
#endif

.pushsection .text, "ax"
SYM_CODE_START(rewind_stack_do_exit)
	UNWIND_HINT_FUNC
	/* Prevent any naive code from trying to unwind to our caller. */
	xorl	%ebp, %ebp

	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rax
	leaq	-PTREGS_SIZE(%rax), %rsp
	UNWIND_HINT_REGS

	call	do_exit
SYM_CODE_END(rewind_stack_do_exit)
.popsection