// SPDX-License-Identifier: GPL-2.0-only
#include <linux/extable.h>
#include <linux/uaccess.h>
#include <linux/sched/debug.h>
#include <linux/bitfield.h>
#include <xen/xen.h>

#include <asm/fpu/api.h>
#include <asm/fred.h>
#include <asm/sev.h>
#include <asm/traps.h>
#include <asm/kdebug.h>
#include <asm/insn-eval.h>
#include <asm/sgx.h>

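/*
 * Map an EX_DATA_REG_* register number to its slot in struct pt_regs.
 * An out-of-range number means a buggy extable entry; warn once and hand
 * back a dummy slot so the fixup cannot scribble over pt_regs.
 */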
static inline unsigned long *pt_regs_nr(struct pt_regs *regs, int nr)
{
	int reg_offset = pt_regs_offset(regs, nr);
	static unsigned long __dummy;

	if (WARN_ON_ONCE(reg_offset < 0))
		return &__dummy;

	return (unsigned long *)((unsigned long)regs + reg_offset);
}

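/*
 * Exception table entries store the fixup target as an offset relative to
 * the entry itself; convert it back to an absolute address.
 */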
static inline unsigned long
ex_fixup_addr(const struct exception_table_entry *x)
{
	return (unsigned long)&x->fixup + x->fixup;
}

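/*
 * Default fixup: clear %rax and/or %rdx if the entry's flags ask for it,
 * then resume execution at the fixup address.
 */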
static bool ex_handler_default(const struct exception_table_entry *e,
			       struct pt_regs *regs)
{
	if (e->data & EX_FLAG_CLEAR_AX)
		regs->ax = 0;
	if (e->data & EX_FLAG_CLEAR_DX)
		regs->dx = 0;

	regs->ip = ex_fixup_addr(e);
	return true;
}

/*
 * This is the *very* rare case where we do a "load_unaligned_zeropad()"
 * and it's a page crosser into a non-existent page.
 *
 * This happens when we optimistically load a pathname a word-at-a-time
 * and the name is less than the full word and the next page is not
 * mapped. Typically that only happens for CONFIG_DEBUG_PAGEALLOC.
 *
 * NOTE! The faulting address is always a 'mov mem,reg' type instruction
 * of size 'long', and the exception fixup must always point to right
 * after the instruction.
 */
static bool ex_handler_zeropad(const struct exception_table_entry *e,
			       struct pt_regs *regs,
			       unsigned long fault_addr)
{
	struct insn insn;
	const unsigned long mask = sizeof(long) - 1;
	unsigned long offset, addr, next_ip, len;
	unsigned long *reg;

	next_ip = ex_fixup_addr(e);
	len = next_ip - regs->ip;
	if (len > MAX_INSN_SIZE)
		return false;

	if (insn_decode(&insn, (void *) regs->ip, len, INSN_MODE_KERN))
		return false;
	if (insn.length != len)
		return false;

	if (insn.opcode.bytes[0] != 0x8b)
		return false;
	if (insn.opnd_bytes != sizeof(long))
		return false;

	addr = (unsigned long) insn_get_addr_ref(&insn, regs);
	if (addr == ~0ul)
		return false;

	offset = addr & mask;
	addr = addr & ~mask;
	if (fault_addr != addr + sizeof(long))
		return false;

	reg = insn_get_modrm_reg_ptr(&insn, regs);
	if (!reg)
		return false;

	*reg = *(unsigned long *)addr >> (offset * 8);
	return ex_handler_default(e, regs);
}

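/* Like the default fixup, but also report the trap number back in %rax. */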
static bool ex_handler_fault(const struct exception_table_entry *fixup,
			     struct pt_regs *regs, int trapnr)
{
	regs->ax = trapnr;
	return ex_handler_default(fixup, regs);
}

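/*
 * ENCLS faults are reported back in %rax as the trap number with
 * SGX_ENCLS_FAULT_FLAG set, so callers can tell a fault apart from a
 * normal ENCLS error code.
 */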
static bool ex_handler_sgx(const struct exception_table_entry *fixup,
			   struct pt_regs *regs, int trapnr)
{
	regs->ax = trapnr | SGX_ENCLS_FAULT_FLAG;
	return ex_handler_default(fixup, regs);
}

/*
 * Handler for when we fail to restore a task's FPU state. We should never get
 * here because the FPU state of a task using the FPU (task->thread.fpu.state)
 * should always be valid. However, past bugs have allowed userspace to set
 * reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn().
 * These caused XRSTOR to fail when switching to the task, leaking the FPU
 * registers of the task previously executing on the CPU. Mitigate this class
 * of vulnerability by restoring from the initial state (essentially, zeroing
 * out all the FPU registers) if we can't restore from the task's FPU state.
 */
static bool ex_handler_fprestore(const struct exception_table_entry *fixup,
				 struct pt_regs *regs)
{
	regs->ip = ex_fixup_addr(fixup);

	WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.",
		  (void *)instruction_pointer(regs));

	fpu_reset_from_exception_fixup();
	return true;
}

/*
 * On x86-64, we end up being imprecise with 'access_ok()', and allow
 * non-canonical user addresses to make the range comparisons simpler,
 * and to not have to worry about LAM being enabled.
 *
 * In fact, we allow up to one page of "slop" at the sign boundary,
 * which means that we can do access_ok() by just checking the sign
 * of the pointer for the common case of having a small access size.
 */
static bool gp_fault_address_ok(unsigned long fault_address)
{
#ifdef CONFIG_X86_64
	/* Is it in the "user space" part of the non-canonical space? */
	if (valid_user_address(fault_address))
		return true;

	/* .. or just above it? */
	fault_address -= PAGE_SIZE;
	if (valid_user_address(fault_address))
		return true;
#endif
	return false;
}

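/*
 * Warn about a #GP during a user access unless the faulting address falls
 * in the region access_ok() deliberately tolerates (the user half plus one
 * page of slop at the sign boundary).
 */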
static bool ex_handler_uaccess(const struct exception_table_entry *fixup,
			       struct pt_regs *regs, int trapnr,
			       unsigned long fault_address)
{
	WARN_ONCE(trapnr == X86_TRAP_GP && !gp_fault_address_ok(fault_address),
		  "General protection fault in user access. Non-canonical address?");
	return ex_handler_default(fixup, regs);
}

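/*
 * Fixup for a faulting RDMSR/WRMSR. Unchecked (non-_safe) accesses log the
 * offending MSR and a stack trace once; reads are faked as returning 0,
 * and the _safe variants get -EIO in the caller-designated register.
 */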
static bool ex_handler_msr(const struct exception_table_entry *fixup,
			   struct pt_regs *regs, bool wrmsr, bool safe, int reg)
{
	if (__ONCE_LITE_IF(!safe && wrmsr)) {
		pr_warn("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
			(unsigned int)regs->cx, (unsigned int)regs->dx,
			(unsigned int)regs->ax, regs->ip, (void *)regs->ip);
		show_stack_regs(regs);
	}

	if (__ONCE_LITE_IF(!safe && !wrmsr)) {
		pr_warn("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
			(unsigned int)regs->cx, regs->ip, (void *)regs->ip);
		show_stack_regs(regs);
	}

	if (!wrmsr) {
		/* Pretend that the read succeeded and returned 0. */
		regs->ax = 0;
		regs->dx = 0;
	}

	if (safe)
		*pt_regs_nr(regs, reg) = -EIO;

	return ex_handler_default(fixup, regs);
}

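/*
 * Reload %fs with a sane selector after a faulting segment load. CPUs with
 * X86_BUG_NULL_SEG do not clear the segment base when a NULL selector is
 * written, so load __USER_DS first on those before loading 0.
 */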
static bool ex_handler_clear_fs(const struct exception_table_entry *fixup,
				struct pt_regs *regs)
{
	if (static_cpu_has(X86_BUG_NULL_SEG))
		asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS));
	asm volatile ("mov %0, %%fs" : : "rm" (0));
	return ex_handler_default(fixup, regs);
}

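/* Load the sign-extended immediate from the extable entry into a register. */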
static bool ex_handler_imm_reg(const struct exception_table_entry *fixup,
			       struct pt_regs *regs, int reg, int imm)
{
	*pt_regs_nr(regs, reg) = (long)imm;
	return ex_handler_default(fixup, regs);
}

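/*
 * Fixup for user-copy helpers that count down in a register: rebuild the
 * remaining length in %rcx as imm * %rcx plus the designated register,
 * then hand off to the regular uaccess fixup.
 */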
static bool ex_handler_ucopy_len(const struct exception_table_entry *fixup,
				 struct pt_regs *regs, int trapnr,
				 unsigned long fault_address,
				 int reg, int imm)
{
	regs->cx = imm * regs->cx + *pt_regs_nr(regs, reg);
	return ex_handler_uaccess(fixup, regs, trapnr, fault_address);
}

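/*
 * Fixup for a faulting ERETU, the FRED return-to-user-space instruction:
 * sync the event information into the ERETU return frame below so the
 * fault can be delivered against the user context rather than retried at
 * the kernel ERETU site.
 */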
#ifdef CONFIG_X86_FRED
static bool ex_handler_eretu(const struct exception_table_entry *fixup,
			     struct pt_regs *regs, unsigned long error_code)
{
	struct pt_regs *uregs = (struct pt_regs *)(regs->sp - offsetof(struct pt_regs, orig_ax));
	unsigned short ss = uregs->ss;
	unsigned short cs = uregs->cs;

	/*
	 * Move the NMI bit from the invalid stack frame, which caused ERETU
	 * to fault, to the fault handler's stack frame, thus to unblock NMI
	 * with the fault handler's ERETS instruction ASAP if NMI is blocked.
	 */
	regs->fred_ss.nmi = uregs->fred_ss.nmi;

	/*
	 * Sync event information to uregs, i.e., the ERETU return frame, but
	 * is it safe to write to the ERETU return frame which is just above
	 * current event stack frame?
	 *
	 * The RSP used by FRED to push a stack frame is not the value in %rsp,
	 * it is calculated from %rsp with the following 2 steps:
	 * 1) RSP = %rsp - (IA32_FRED_CONFIG & 0x1c0)	// Reserve N*64 bytes
	 * 2) RSP = RSP & ~0x3f				// Align to a 64-byte cache line
	 * when an event delivery doesn't trigger a stack level change.
	 *
	 * Here is an example with N*64 (N=1) bytes reserved:
	 *
	 *  64-byte cache line ==>  ______________
	 *                         |___Reserved___|
	 *                         |__Event_data__|
	 *                         |_____SS_______|
	 *                         |_____RSP______|
	 *                         |_____FLAGS____|
	 *                         |_____CS_______|
	 *                         |_____IP_______|
	 *  64-byte cache line ==> |__Error_code__| <== ERETU return frame
	 *                         |______________|
	 *                         |______________|
	 *                         |______________|
	 *                         |______________|
	 *                         |______________|
	 *                         |______________|
	 *                         |______________|
	 *  64-byte cache line ==> |______________| <== RSP after step 1) and 2)
	 *                         |___Reserved___|
	 *                         |__Event_data__|
	 *                         |_____SS_______|
	 *                         |_____RSP______|
	 *                         |_____FLAGS____|
	 *                         |_____CS_______|
	 *                         |_____IP_______|
	 *  64-byte cache line ==> |__Error_code__| <== ERETS return frame
	 *
	 * Thus a new FRED stack frame will always be pushed below a previous
	 * FRED stack frame ((N*64) bytes may be reserved between), and it is
	 * safe to write to a previous FRED stack frame as they never overlap.
	 */
	fred_info(uregs)->edata = fred_event_data(regs);
	uregs->ssx = regs->ssx;
	uregs->fred_ss.ss = ss;
	/* The NMI bit was moved away above */
	uregs->fred_ss.nmi = 0;
	uregs->csx = regs->csx;
	uregs->fred_cs.sl = 0;
	uregs->fred_cs.wfe = 0;
	uregs->cs = cs;
	uregs->orig_ax = error_code;

	return ex_handler_default(fixup, regs);
}
#endif

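/* Return the EX_TYPE_* of the fixup covering @ip, or EX_TYPE_NONE if none. */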
int ex_get_fixup_type(unsigned long ip)
{
	const struct exception_table_entry *e = search_exception_tables(ip);

	return e ? FIELD_GET(EX_DATA_TYPE_MASK, e->data) : EX_TYPE_NONE;
}

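/*
 * Main fixup dispatcher: look up the exception table entry covering the
 * faulting IP and invoke the handler encoded in its type field. Returns
 * non-zero if the fault was fixed up.
 */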
int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
		    unsigned long fault_addr)
{
	const struct exception_table_entry *e;
	int type, reg, imm;

#ifdef CONFIG_PNPBIOS
	if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
		extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
		extern u32 pnp_bios_is_utter_crap;
		pnp_bios_is_utter_crap = 1;
		printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
		__asm__ volatile(
			"movl %0, %%esp\n\t"
			"jmp *%1\n\t"
			: : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
		panic("do_trap: can't hit this");
	}
#endif

	e = search_exception_tables(regs->ip);
	if (!e)
		return 0;

	type = FIELD_GET(EX_DATA_TYPE_MASK, e->data);
	reg  = FIELD_GET(EX_DATA_REG_MASK,  e->data);
	imm  = FIELD_GET(EX_DATA_IMM_MASK,  e->data);

	switch (type) {
	case EX_TYPE_DEFAULT:
	case EX_TYPE_DEFAULT_MCE_SAFE:
		return ex_handler_default(e, regs);
	case EX_TYPE_FAULT:
	case EX_TYPE_FAULT_MCE_SAFE:
		return ex_handler_fault(e, regs, trapnr);
	case EX_TYPE_UACCESS:
		return ex_handler_uaccess(e, regs, trapnr, fault_addr);
	case EX_TYPE_CLEAR_FS:
		return ex_handler_clear_fs(e, regs);
	case EX_TYPE_FPU_RESTORE:
		return ex_handler_fprestore(e, regs);
	case EX_TYPE_BPF:
		return ex_handler_bpf(e, regs);
	case EX_TYPE_WRMSR:
		return ex_handler_msr(e, regs, true, false, reg);
	case EX_TYPE_RDMSR:
		return ex_handler_msr(e, regs, false, false, reg);
	case EX_TYPE_WRMSR_SAFE:
		return ex_handler_msr(e, regs, true, true, reg);
	case EX_TYPE_RDMSR_SAFE:
		return ex_handler_msr(e, regs, false, true, reg);
	case EX_TYPE_WRMSR_IN_MCE:
		ex_handler_msr_mce(regs, true);
		break;
	case EX_TYPE_RDMSR_IN_MCE:
		ex_handler_msr_mce(regs, false);
		break;
	case EX_TYPE_POP_REG:
		regs->sp += sizeof(long);
		fallthrough;
	case EX_TYPE_IMM_REG:
		return ex_handler_imm_reg(e, regs, reg, imm);
	case EX_TYPE_FAULT_SGX:
		return ex_handler_sgx(e, regs, trapnr);
	case EX_TYPE_UCOPY_LEN:
		return ex_handler_ucopy_len(e, regs, trapnr, fault_addr, reg, imm);
	case EX_TYPE_ZEROPAD:
		return ex_handler_zeropad(e, regs, fault_addr);
#ifdef CONFIG_X86_FRED
	case EX_TYPE_ERETU:
		return ex_handler_eretu(e, regs, error_code);
#endif
	}
	BUG();
}

extern unsigned int early_recursion_flag;

/* Restricted version used during very early boot */
void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
{
	/* Ignore early NMIs. */
	if (trapnr == X86_TRAP_NMI)
		return;

	if (early_recursion_flag > 2)
		goto halt_loop;

	/*
	 * Old CPUs leave the high bits of CS on the stack
	 * undefined. I'm not sure which CPUs do this, but at least
	 * the 486 DX works this way.
	 * Xen pv domains are not using the default __KERNEL_CS.
	 */
	if (!xen_pv_domain() && regs->cs != __KERNEL_CS)
		goto fail;

	/*
	 * The full exception fixup machinery is available as soon as
	 * the early IDT is loaded. This means that it is the
	 * responsibility of extable users to either function correctly
	 * when handlers are invoked early or to simply avoid causing
	 * exceptions before they're ready to handle them.
	 *
	 * This is better than filtering which handlers can be used,
	 * because refusing to call a handler here is guaranteed to
	 * result in a hard-to-debug panic.
	 *
	 * Keep in mind that not all vectors actually get here. Early
	 * page faults, for example, are special.
	 */
	if (fixup_exception(regs, trapnr, regs->orig_ax, 0))
		return;

	if (trapnr == X86_TRAP_UD) {
		if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
			/* Skip the ud2. */
			regs->ip += LEN_UD2;
			return;
		}

		/*
		 * If this was a BUG and report_bug returns or if this
		 * was just a normal #UD, we want to continue onward and
		 * crash.
		 */
	}

fail:
	early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n",
		     (unsigned)trapnr, (unsigned long)regs->cs, regs->ip,
		     regs->orig_ax, read_cr2());

	show_regs(regs);

halt_loop:
	while (true)
		halt();
}