x86/tdx: Fix early #VE handling
arch/x86/coco/tdx/tdx.c

// SPDX-License-Identifier: GPL-2.0
/* Copyright (C) 2021-2022 Intel Corporation */

#undef pr_fmt
#define pr_fmt(fmt) "tdx: " fmt

#include <linux/cpufeature.h>
#include <asm/coco.h>
#include <asm/tdx.h>
#include <asm/vmx.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/pgtable.h>

/* TDX module Call Leaf IDs */
#define TDX_GET_INFO 1
#define TDX_GET_VEINFO 3
#define TDX_ACCEPT_PAGE 6

/* TDX hypercall Leaf IDs */
#define TDVMCALL_MAP_GPA 0x10001

/* MMIO direction */
#define EPT_READ 0
#define EPT_WRITE 1

/* Port I/O direction */
#define PORT_READ 0
#define PORT_WRITE 1

/* See Exit Qualification for I/O Instructions in VMX documentation */
#define VE_IS_IO_IN(e) ((e) & BIT(3))
#define VE_GET_IO_SIZE(e) (((e) & GENMASK(2, 0)) + 1)
#define VE_GET_PORT_NUM(e) ((e) >> 16)
#define VE_IS_IO_STRING(e) ((e) & BIT(4))

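/*
 * Note: per the Intel SDM's exit qualification for I/O instructions, bits 2:0
 * encode the access size minus one, bit 3 the direction (1 = IN), bit 4 the
 * string-instruction flag and bits 31:16 the port number, which is what the
 * helpers above extract.
 */
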
/*
 * Wrapper for standard use of __tdx_hypercall with no output aside from
 * return code.
 */
static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15)
{
        struct tdx_hypercall_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = fn,
                .r12 = r12,
                .r13 = r13,
                .r14 = r14,
                .r15 = r15,
        };

        return __tdx_hypercall(&args, 0);
}
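
/*
 * Note: callers that also need output registers (the MSR read, CPUID, MMIO
 * and port I/O read handlers below) call __tdx_hypercall() directly with
 * TDX_HCALL_HAS_OUTPUT instead of going through this wrapper.
 */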

/* Called from __tdx_hypercall() for unrecoverable failure */
void __tdx_hypercall_failed(void)
{
        panic("TDVMCALL failed. TDX module bug?");
}

/*
 * The TDG.VP.VMCALL-Instruction-execution sub-functions are defined
 * independently from but are currently matched 1:1 with VMX EXIT_REASONs.
 * Reusing the KVM EXIT_REASON macros makes it easier to connect the host and
 * guest sides of these calls.
 */
static u64 hcall_func(u64 exit_reason)
{
        return exit_reason;
}

#ifdef CONFIG_KVM_GUEST
long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2,
                       unsigned long p3, unsigned long p4)
{
        struct tdx_hypercall_args args = {
                .r10 = nr,
                .r11 = p1,
                .r12 = p2,
                .r13 = p3,
                .r14 = p4,
        };

        return __tdx_hypercall(&args, 0);
}
EXPORT_SYMBOL_GPL(tdx_kvm_hypercall);
#endif

/*
 * Used for TDX guests to make calls directly to the TD module. This
 * should only be used for calls that have no legitimate reason to fail
 * or where the kernel cannot survive the call failing.
 */
static inline void tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
                                   struct tdx_module_output *out)
{
        if (__tdx_module_call(fn, rcx, rdx, r8, r9, out))
                panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
}

static u64 get_cc_mask(void)
{
        struct tdx_module_output out;
        unsigned int gpa_width;

        /*
         * The TDINFO TDX module call is used to get the TD execution
         * environment information like GPA width, number of available vcpus,
         * debug mode information, etc. More details about the ABI can be
         * found in TDX Guest-Host-Communication Interface (GHCI), section
         * 2.4.2 TDCALL [TDG.VP.INFO].
         *
         * The GPA width that comes out of this call is critical. TDX guests
         * cannot meaningfully run without it.
         */
        tdx_module_call(TDX_GET_INFO, 0, 0, 0, 0, &out);

        gpa_width = out.rcx & GENMASK(5, 0);

        /*
         * The highest bit of a guest physical address is the "sharing" bit.
         * Set it for shared pages and clear it for private pages.
         */
        return BIT_ULL(gpa_width - 1);
}

static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti)
{
        struct tdx_hypercall_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = hcall_func(EXIT_REASON_HLT),
                .r12 = irq_disabled,
        };

        /*
         * Emulate HLT operation via hypercall. More info about ABI
         * can be found in TDX Guest-Host-Communication Interface
         * (GHCI), section 3.8 TDG.VP.VMCALL<Instruction.HLT>.
         *
         * The VMM uses the "IRQ disabled" param to understand IRQ
         * enabled status (RFLAGS.IF) of the TD guest and to determine
         * whether or not it should schedule the halted vCPU if an
         * IRQ becomes pending. E.g. if IRQs are disabled, the VMM
         * can keep the vCPU in virtual HLT, even if an IRQ is
         * pending, without hanging/breaking the guest.
         */
        return __tdx_hypercall(&args, do_sti ? TDX_HCALL_ISSUE_STI : 0);
}

static bool handle_halt(void)
{
        /*
         * Since the non-safe halt is mainly used in CPU offlining
         * and the guest will always stay in the halt state, don't
         * call the STI instruction (set do_sti to false).
         */
        const bool irq_disabled = irqs_disabled();
        const bool do_sti = false;

        if (__halt(irq_disabled, do_sti))
                return false;

        return true;
}

void __cpuidle tdx_safe_halt(void)
{
        /*
         * For the do_sti=true case, __tdx_hypercall() enables
         * interrupts using the STI instruction before the TDCALL. So
         * set irq_disabled to false.
         */
        const bool irq_disabled = false;
        const bool do_sti = true;

        /*
         * Use WARN_ONCE() to report the failure.
         */
        if (__halt(irq_disabled, do_sti))
                WARN_ONCE(1, "HLT instruction emulation failed\n");
}

static bool read_msr(struct pt_regs *regs)
{
        struct tdx_hypercall_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = hcall_func(EXIT_REASON_MSR_READ),
                .r12 = regs->cx,
        };

        /*
         * Emulate the MSR read via hypercall. More info about ABI
         * can be found in TDX Guest-Host-Communication Interface
         * (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
         */
        if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
                return false;

        regs->ax = lower_32_bits(args.r11);
        regs->dx = upper_32_bits(args.r11);
        return true;
}

static bool write_msr(struct pt_regs *regs)
{
        struct tdx_hypercall_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = hcall_func(EXIT_REASON_MSR_WRITE),
                .r12 = regs->cx,
                .r13 = (u64)regs->dx << 32 | regs->ax,
        };

        /*
         * Emulate the MSR write via hypercall. More info about ABI
         * can be found in TDX Guest-Host-Communication Interface
         * (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>".
         */
        return !__tdx_hypercall(&args, 0);
}

static bool handle_cpuid(struct pt_regs *regs)
{
        struct tdx_hypercall_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = hcall_func(EXIT_REASON_CPUID),
                .r12 = regs->ax,
                .r13 = regs->cx,
        };

        /*
         * Only allow the VMM to control the range reserved for hypervisor
         * communication.
         *
         * Return all-zeros for any CPUID outside the range. This matches
         * CPU behaviour for an unsupported leaf.
         */
        if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) {
                regs->ax = regs->bx = regs->cx = regs->dx = 0;
                return true;
        }

        /*
         * Emulate the CPUID instruction via a hypercall. More info about
         * ABI can be found in TDX Guest-Host-Communication Interface
         * (GHCI), section titled "VP.VMCALL<Instruction.CPUID>".
         */
        if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
                return false;

        /*
         * As per the TDX GHCI CPUID ABI, the r12-r15 registers contain the
         * contents of EAX, EBX, ECX and EDX after CPUID instruction execution.
         * So copy the register contents back to pt_regs.
         */
        regs->ax = args.r12;
        regs->bx = args.r13;
        regs->cx = args.r14;
        regs->dx = args.r15;

        return true;
}

static bool mmio_read(int size, unsigned long addr, unsigned long *val)
{
        struct tdx_hypercall_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = hcall_func(EXIT_REASON_EPT_VIOLATION),
                .r12 = size,
                .r13 = EPT_READ,
                .r14 = addr,
                .r15 = *val,
        };

        if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
                return false;
        *val = args.r11;
        return true;
}

static bool mmio_write(int size, unsigned long addr, unsigned long val)
{
        return !_tdx_hypercall(hcall_func(EXIT_REASON_EPT_VIOLATION), size,
                               EPT_WRITE, addr, val);
}

static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve)
{
        char buffer[MAX_INSN_SIZE];
        unsigned long *reg, val;
        struct insn insn = {};
        enum mmio_type mmio;
        int size, extend_size;
        u8 extend_val = 0;

        /* Only in-kernel MMIO is supported */
        if (WARN_ON_ONCE(user_mode(regs)))
                return false;

        if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE))
                return false;

        if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64))
                return false;

        mmio = insn_decode_mmio(&insn, &size);
        if (WARN_ON_ONCE(mmio == MMIO_DECODE_FAILED))
                return false;

        if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) {
                reg = insn_get_modrm_reg_ptr(&insn, regs);
                if (!reg)
                        return false;
        }

        ve->instr_len = insn.length;

        /* Handle writes first */
        switch (mmio) {
        case MMIO_WRITE:
                memcpy(&val, reg, size);
                return mmio_write(size, ve->gpa, val);
        case MMIO_WRITE_IMM:
                val = insn.immediate.value;
                return mmio_write(size, ve->gpa, val);
        case MMIO_READ:
        case MMIO_READ_ZERO_EXTEND:
        case MMIO_READ_SIGN_EXTEND:
                /* Reads are handled below */
                break;
        case MMIO_MOVS:
        case MMIO_DECODE_FAILED:
                /*
                 * MMIO was accessed with an instruction that could not be
                 * decoded or handled properly. It was likely not using io.h
                 * helpers or it accessed MMIO accidentally.
                 */
                return false;
        default:
                WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?");
                return false;
        }

        /* Handle reads */
        if (!mmio_read(size, ve->gpa, &val))
                return false;

        switch (mmio) {
        case MMIO_READ:
                /* Zero-extend for 32-bit operation */
                extend_size = size == 4 ? sizeof(*reg) : 0;
                break;
        case MMIO_READ_ZERO_EXTEND:
                /* Zero extend based on operand size */
                extend_size = insn.opnd_bytes;
                break;
        case MMIO_READ_SIGN_EXTEND:
                /* Sign extend based on operand size */
                extend_size = insn.opnd_bytes;
                if (size == 1 && val & BIT(7))
                        extend_val = 0xFF;
                else if (size > 1 && val & BIT(15))
                        extend_val = 0xFF;
                break;
        default:
                /* All other cases have to be covered by the first switch() */
                WARN_ON_ONCE(1);
                return false;
        }

        if (extend_size)
                memset(reg, extend_val, extend_size);
        memcpy(reg, &val, size);
        return true;
}

static bool handle_in(struct pt_regs *regs, int size, int port)
{
        struct tdx_hypercall_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION),
                .r12 = size,
                .r13 = PORT_READ,
                .r14 = port,
        };
        u64 mask = GENMASK(BITS_PER_BYTE * size, 0);
        bool success;

        /*
         * Emulate the I/O read via hypercall. More info about ABI can be found
         * in TDX Guest-Host-Communication Interface (GHCI) section titled
         * "TDG.VP.VMCALL<Instruction.IO>".
         */
        success = !__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT);

        /* Update part of the register affected by the emulated instruction */
        regs->ax &= ~mask;
        if (success)
                regs->ax |= args.r11 & mask;

        return success;
}

static bool handle_out(struct pt_regs *regs, int size, int port)
{
        u64 mask = GENMASK(BITS_PER_BYTE * size, 0);

        /*
         * Emulate the I/O write via hypercall. More info about ABI can be found
         * in TDX Guest-Host-Communication Interface (GHCI) section titled
         * "TDG.VP.VMCALL<Instruction.IO>".
         */
        return !_tdx_hypercall(hcall_func(EXIT_REASON_IO_INSTRUCTION), size,
                               PORT_WRITE, port, regs->ax & mask);
}

/*
 * Emulate I/O using hypercall.
 *
 * Assumes the IO instruction was using ax, which is enforced
 * by the standard io.h macros.
 *
 * Return True on success or False on failure.
 */
static bool handle_io(struct pt_regs *regs, u32 exit_qual)
{
        int size, port;
        bool in;

        if (VE_IS_IO_STRING(exit_qual))
                return false;

        in = VE_IS_IO_IN(exit_qual);
        size = VE_GET_IO_SIZE(exit_qual);
        port = VE_GET_PORT_NUM(exit_qual);

        if (in)
                return handle_in(regs, size, port);
        else
                return handle_out(regs, size, port);
}

/*
 * Early #VE exception handler. Only handles a subset of port I/O.
 * Intended only for earlyprintk. If failed, return false.
 */
__init bool tdx_early_handle_ve(struct pt_regs *regs)
{
        struct ve_info ve;
        bool ret;

        tdx_get_ve_info(&ve);

        if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION)
                return false;

        ret = handle_io(regs, ve.exit_qual);
        if (ret)
                regs->ip += ve.instr_len;
        return ret;
}

void tdx_get_ve_info(struct ve_info *ve)
{
        struct tdx_module_output out;

        /*
         * Called during #VE handling to retrieve the #VE info from the
         * TDX module.
         *
         * This has to be called early in #VE handling. A "nested" #VE which
         * occurs before this will raise a #DF and is not recoverable.
         *
         * The call retrieves the #VE info from the TDX module, which also
         * clears the "#VE valid" flag. This must be done before anything else
         * because any #VE that occurs while the valid flag is set will lead to
         * #DF.
         *
         * Note, the TDX module treats virtual NMIs as inhibited if the #VE
         * valid flag is set. It means that NMI=>#VE will not result in a #DF.
         */
        tdx_module_call(TDX_GET_VEINFO, 0, 0, 0, 0, &out);

        /* Transfer the output parameters */
        ve->exit_reason = out.rcx;
        ve->exit_qual = out.rdx;
        ve->gla = out.r8;
        ve->gpa = out.r9;
        ve->instr_len = lower_32_bits(out.r10);
        ve->instr_info = upper_32_bits(out.r10);
}

/* Handle the user initiated #VE */
static bool virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
{
        switch (ve->exit_reason) {
        case EXIT_REASON_CPUID:
                return handle_cpuid(regs);
        default:
                pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
                return false;
        }
}

/* Handle the kernel #VE */
static bool virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
{
        switch (ve->exit_reason) {
        case EXIT_REASON_HLT:
                return handle_halt();
        case EXIT_REASON_MSR_READ:
                return read_msr(regs);
        case EXIT_REASON_MSR_WRITE:
                return write_msr(regs);
        case EXIT_REASON_CPUID:
                return handle_cpuid(regs);
        case EXIT_REASON_EPT_VIOLATION:
                return handle_mmio(regs, ve);
        case EXIT_REASON_IO_INSTRUCTION:
                return handle_io(regs, ve->exit_qual);
        default:
                pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
                return false;
        }
}

bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
{
        bool ret;

        if (user_mode(regs))
                ret = virt_exception_user(regs, ve);
        else
                ret = virt_exception_kernel(regs, ve);

        /* After successful #VE handling, move the IP */
        if (ret)
                regs->ip += ve->instr_len;

        return ret;
}

static bool tdx_tlb_flush_required(bool private)
{
        /*
         * TDX guest is responsible for flushing TLB on private->shared
         * transition. VMM is responsible for flushing on shared->private.
         *
         * The VMM _can't_ flush private addresses as it can't generate PAs
         * with the guest's HKID. Shared memory isn't subject to integrity
         * checking, i.e. the VMM doesn't need to flush for its own protection.
         *
         * There's no need to flush when converting from shared to private,
         * as flushing is the VMM's responsibility in this case, e.g. it must
         * flush to avoid integrity failures in the face of a buggy or
         * malicious guest.
         */
        return !private;
}

static bool tdx_cache_flush_required(void)
{
        /*
         * AMD SME/SEV can avoid cache flushing if HW enforces cache coherence.
         * TDX doesn't have such capability.
         *
         * Flush cache unconditionally.
         */
        return true;
}

static bool try_accept_one(phys_addr_t *start, unsigned long len,
                           enum pg_level pg_level)
{
        unsigned long accept_size = page_level_size(pg_level);
        u64 tdcall_rcx;
        u8 page_size;

        if (!IS_ALIGNED(*start, accept_size))
                return false;

        if (len < accept_size)
                return false;

        /*
         * Pass the page physical address to the TDX module to accept the
         * pending, private page.
         *
         * Bits 2:0 of RCX encode page size: 0 - 4K, 1 - 2M, 2 - 1G.
         */
        switch (pg_level) {
        case PG_LEVEL_4K:
                page_size = 0;
                break;
        case PG_LEVEL_2M:
                page_size = 1;
                break;
        case PG_LEVEL_1G:
                page_size = 2;
                break;
        default:
                return false;
        }

        tdcall_rcx = *start | page_size;
        if (__tdx_module_call(TDX_ACCEPT_PAGE, tdcall_rcx, 0, 0, 0, NULL))
                return false;

        *start += accept_size;
        return true;
}

/*
 * Inform the VMM of the guest's intent for this physical page: shared with
 * the VMM or private to the guest. The VMM is expected to change its mapping
 * of the page in response.
 */
static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
{
        phys_addr_t start = __pa(vaddr);
        phys_addr_t end = __pa(vaddr + numpages * PAGE_SIZE);

        if (!enc) {
                /* Set the shared (decrypted) bits: */
                start |= cc_mkdec(0);
                end |= cc_mkdec(0);
        }

        /*
         * Notify the VMM about page mapping conversion. More info about ABI
         * can be found in TDX Guest-Host-Communication Interface (GHCI),
         * section "TDG.VP.VMCALL<MapGPA>".
         */
        if (_tdx_hypercall(TDVMCALL_MAP_GPA, start, end - start, 0, 0))
                return false;

        /* A private->shared conversion requires only the MapGPA call */
        if (!enc)
                return true;

        /*
         * For a shared->private conversion, accept the page using the
         * TDX_ACCEPT_PAGE TDX module call.
         */
        while (start < end) {
                unsigned long len = end - start;

                /*
                 * Try larger accepts first. It gives the VMM a chance to keep
                 * 1G/2M SEPT entries where possible and speeds up the process
                 * by cutting the number of hypercalls (if successful).
                 */
                if (try_accept_one(&start, len, PG_LEVEL_1G))
                        continue;

                if (try_accept_one(&start, len, PG_LEVEL_2M))
                        continue;

                if (!try_accept_one(&start, len, PG_LEVEL_4K))
                        return false;
        }

        return true;
}

void __init tdx_early_init(void)
{
        u64 cc_mask;
        u32 eax, sig[3];

        cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]);

        if (memcmp(TDX_IDENT, sig, sizeof(sig)))
                return;

        setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);

        cc_set_vendor(CC_VENDOR_INTEL);
        cc_mask = get_cc_mask();
        cc_set_mask(cc_mask);

        /*
         * All bits above the GPA width are reserved and the kernel treats the
         * shared bit as a flag, not as part of the physical address.
         *
         * Adjust the physical mask to only cover valid GPA bits.
         */
        physical_mask &= cc_mask - 1;

        x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required;
        x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required;
        x86_platform.guest.enc_status_change_finish = tdx_enc_status_changed;

        pr_info("Guest detected\n");
}