// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/highmem.h>
#include <linux/hrtimer.h>
#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mod_devicetable.h>
#include <linux/mm.h>
#include <linux/objtool.h>
#include <linux/sched.h>
#include <linux/sched/smt.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/trace_events.h>
#include <linux/entry-kvm.h>

#include <asm/apic.h>
#include <asm/asm.h>
#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/fpu/api.h>
#include <asm/fpu/xstate.h>
#include <asm/idtentry.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
#include <asm/kexec.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
#include <asm/mshyperv.h>
#include <asm/mwait.h>
#include <asm/spec-ctrl.h>
#include <asm/virtext.h>
#include <asm/vmx.h>

#include "capabilities.h"
#include "cpuid.h"
#include "hyperv.h"
#include "kvm_onhyperv.h"
#include "irq.h"
#include "kvm_cache_regs.h"
#include "lapic.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "sgx.h"
#include "trace.h"
#include "vmcs.h"
#include "vmcs12.h"
#include "vmx.h"
#include "x86.h"
#include "smm.h"

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

#ifdef MODULE
static const struct x86_cpu_id vmx_cpu_id[] = {
	X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
#endif

bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);

static bool __read_mostly enable_vnmi = 1;
module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);

bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);

bool __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, S_IRUGO);

bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
			enable_unrestricted_guest, bool, S_IRUGO);

bool __read_mostly enable_ept_ad_bits = 1;
module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);

static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);

static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);

module_param(enable_apicv, bool, S_IRUGO);

bool __read_mostly enable_ipiv = true;
module_param(enable_ipiv, bool, 0444);

/*
 * If nested=1, nested virtualization is supported, i.e., guests may use
 * VMX and be a hypervisor for its own guests. If nested=0, guests may not
 * use VMX instructions.
 */
static bool __read_mostly nested = 1;
module_param(nested, bool, S_IRUGO);

bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);

static bool __read_mostly error_on_inconsistent_vmcs_config = true;
module_param(error_on_inconsistent_vmcs_config, bool, 0444);

static bool __read_mostly dump_invalid_vmcs = 0;
module_param(dump_invalid_vmcs, bool, 0644);

#define MSR_BITMAP_MODE_X2APIC		1
#define MSR_BITMAP_MODE_X2APIC_APICV	2

#define KVM_VMX_TSC_MULTIPLIER_MAX	0xffffffffffffffffULL

/* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
static int __read_mostly cpu_preemption_timer_multi;
static bool __read_mostly enable_preemption_timer = 1;
#ifdef CONFIG_X86_64
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif

extern bool __read_mostly allow_smaller_maxphyaddr;
module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);

#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON				\
	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)

#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
	RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
	RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
	RTIT_STATUS_BYTECNT))

/*
 * List of MSRs that can be directly passed to the guest.
 * In addition to these x2apic and PT MSRs are handled specially.
 */
static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
	MSR_IA32_SPEC_CTRL,
	MSR_IA32_PRED_CMD,
	MSR_IA32_TSC,
#ifdef CONFIG_X86_64
	MSR_FS_BASE,
	MSR_GS_BASE,
	MSR_KERNEL_GS_BASE,
	MSR_IA32_XFD,
	MSR_IA32_XFD_ERR,
#endif
	MSR_IA32_SYSENTER_CS,
	MSR_IA32_SYSENTER_ESP,
	MSR_IA32_SYSENTER_EIP,
	MSR_CORE_C1_RES,
	MSR_CORE_C3_RESIDENCY,
	MSR_CORE_C6_RESIDENCY,
	MSR_CORE_C7_RESIDENCY,
};

/*
 * These 2 parameters are used to configure the controls for Pause-Loop Exiting:
 * ple_gap:    upper bound on the amount of time between two successive
 *             executions of PAUSE in a loop. Also indicates whether PLE is
 *             enabled. According to testing, this time is usually smaller
 *             than 128 cycles.
 * ple_window: upper bound on the amount of time a guest is allowed to execute
 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
 *             less than 2^12 cycles.
 * Time is measured based on a counter that runs at the same rate as the TSC,
 * refer to SDM volume 3b sections 21.6.13 & 22.1.3.
 */
static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
module_param(ple_gap, uint, 0444);

static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, uint, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(ple_window_grow, uint, 0444);

/* Default resets per-vcpu window every exit to ple_window. */
static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(ple_window_shrink, uint, 0444);

/* Default is to compute the maximum so we can never overflow. */
static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);

/* Default is SYSTEM mode, 1 for host-guest mode */
int __read_mostly pt_mode = PT_MODE_SYSTEM;
module_param(pt_mode, int, S_IRUGO);

static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
static DEFINE_MUTEX(vmx_l1d_flush_mutex);

/* Storage for pre module init parameter parsing */
static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;

static const struct {
	const char *option;
	bool for_parse;
} vmentry_l1d_param[] = {
	[VMENTER_L1D_FLUSH_AUTO]	 = {"auto", true},
	[VMENTER_L1D_FLUSH_NEVER]	 = {"never", true},
	[VMENTER_L1D_FLUSH_COND]	 = {"cond", true},
	[VMENTER_L1D_FLUSH_ALWAYS]	 = {"always", true},
	[VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
	[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
};

#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;

/* Control for disabling CPU Fill buffer clear */
static bool __read_mostly vmx_fb_clear_ctrl_available;

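/*
 * Select the L1D flush mode from the module parameter and the system-wide
 * L1TF mitigation, allocate the software flush pages when they are needed,
 * and flip the static branches that gate the flush on VM-Entry.
 */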
static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
	struct page *page;
	unsigned int i;

	if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
		return 0;
	}

	if (!enable_ept) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
		return 0;
	}

	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
		u64 msr;

		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
		if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
			l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
			return 0;
		}
	}

	/* If set to auto use the default l1tf mitigation method */
	if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
		switch (l1tf_mitigation) {
		case L1TF_MITIGATION_OFF:
			l1tf = VMENTER_L1D_FLUSH_NEVER;
			break;
		case L1TF_MITIGATION_FLUSH_NOWARN:
		case L1TF_MITIGATION_FLUSH:
		case L1TF_MITIGATION_FLUSH_NOSMT:
			l1tf = VMENTER_L1D_FLUSH_COND;
			break;
		case L1TF_MITIGATION_FULL:
		case L1TF_MITIGATION_FULL_FORCE:
			l1tf = VMENTER_L1D_FLUSH_ALWAYS;
			break;
		}
	} else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
		l1tf = VMENTER_L1D_FLUSH_ALWAYS;
	}

	if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
	    !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
		/*
		 * This allocation for vmx_l1d_flush_pages is not tied to a VM
		 * lifetime and so should not be charged to a memcg.
		 */
		page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
		if (!page)
			return -ENOMEM;
		vmx_l1d_flush_pages = page_address(page);

		/*
		 * Initialize each page with a different pattern in
		 * order to protect against KSM in the nested
		 * virtualization case.
		 */
		for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
			memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
			       PAGE_SIZE);
		}
	}

	l1tf_vmx_mitigation = l1tf;

	if (l1tf != VMENTER_L1D_FLUSH_NEVER)
		static_branch_enable(&vmx_l1d_should_flush);
	else
		static_branch_disable(&vmx_l1d_should_flush);

	if (l1tf == VMENTER_L1D_FLUSH_COND)
		static_branch_enable(&vmx_l1d_flush_cond);
	else
		static_branch_disable(&vmx_l1d_flush_cond);
	return 0;
}

static int vmentry_l1d_flush_parse(const char *s)
{
	unsigned int i;

	if (s) {
		for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
			if (vmentry_l1d_param[i].for_parse &&
			    sysfs_streq(s, vmentry_l1d_param[i].option))
				return i;
		}
	}
	return -EINVAL;
}

static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
{
	int l1tf, ret;

	l1tf = vmentry_l1d_flush_parse(s);
	if (l1tf < 0)
		return l1tf;

	if (!boot_cpu_has(X86_BUG_L1TF))
		return 0;

	/*
	 * Has vmx_init() run already? If not then this is the pre init
	 * parameter parsing. In that case just store the value and let
	 * vmx_init() do the proper setup after enable_ept has been
	 * established.
	 */
	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
		vmentry_l1d_flush_param = l1tf;
		return 0;
	}

	mutex_lock(&vmx_l1d_flush_mutex);
	ret = vmx_setup_l1d_flush(l1tf);
	mutex_unlock(&vmx_l1d_flush_mutex);
	return ret;
}

static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
{
	if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
		return sprintf(s, "???\n");

	return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}

static void vmx_setup_fb_clear_ctrl(void)
{
	u64 msr;

	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) &&
	    !boot_cpu_has_bug(X86_BUG_MDS) &&
	    !boot_cpu_has_bug(X86_BUG_TAA)) {
		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
		if (msr & ARCH_CAP_FB_CLEAR_CTRL)
			vmx_fb_clear_ctrl_available = true;
	}
}

static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
{
	u64 msr;

	if (!vmx->disable_fb_clear)
		return;

	msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL);
	msr |= FB_CLEAR_DIS;
	native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
	/* Cache the MSR value to avoid reading it later */
	vmx->msr_ia32_mcu_opt_ctrl = msr;
}

static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
{
	if (!vmx->disable_fb_clear)
		return;

	vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
	native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
}

static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
{
	vmx->disable_fb_clear = vmx_fb_clear_ctrl_available;

	/*
	 * If the guest will not execute VERW, there is no need to set
	 * FB_CLEAR_DIS at VM-Entry. Skip the MSR read/write when a guest has
	 * no use case to execute VERW.
	 */
	if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
	    ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
	     (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
	     (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
	     (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
	     (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
		vmx->disable_fb_clear = false;
}

static const struct kernel_param_ops vmentry_l1d_flush_ops = {
	.set = vmentry_l1d_flush_set,
	.get = vmentry_l1d_flush_get,
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);

static u32 vmx_segment_access_rights(struct kvm_segment *var);

void vmx_vmexit(void);

#define vmx_insn_failed(fmt...)		\
do {					\
	WARN_ONCE(1, fmt);		\
	pr_warn_ratelimited(fmt);	\
} while (0)

void vmread_error(unsigned long field, bool fault)
{
	if (fault)
		kvm_spurious_fault();
	else
		vmx_insn_failed("vmread failed: field=%lx\n", field);
}

noinline void vmwrite_error(unsigned long field, unsigned long value)
{
	vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n",
			field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
}

noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
{
	vmx_insn_failed("vmclear failed: %p/%llx err=%u\n",
			vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}

noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
{
	vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n",
			vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}

noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
{
	vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
			ext, vpid, gva);
}

noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
{
	vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
			ext, eptp, gpa);
}

static DEFINE_PER_CPU(struct vmcs *, vmxarea);
DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
 */
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);

static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);

struct vmcs_config vmcs_config __ro_after_init;
struct vmx_capability vmx_capability __ro_after_init;

#define VMX_SEGMENT_FIELD(seg)				\
	[VCPU_SREG_##seg] = {				\
		.selector = GUEST_##seg##_SELECTOR,	\
		.base = GUEST_##seg##_BASE,		\
		.limit = GUEST_##seg##_LIMIT,		\
		.ar_bytes = GUEST_##seg##_AR_BYTES,	\
	}

static const struct kvm_vmx_segment_field {
	unsigned selector;
	unsigned base;
	unsigned limit;
	unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
	VMX_SEGMENT_FIELD(CS),
	VMX_SEGMENT_FIELD(DS),
	VMX_SEGMENT_FIELD(ES),
	VMX_SEGMENT_FIELD(FS),
	VMX_SEGMENT_FIELD(GS),
	VMX_SEGMENT_FIELD(SS),
	VMX_SEGMENT_FIELD(TR),
	VMX_SEGMENT_FIELD(LDTR),
};

static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
{
	vmx->segment_cache.bitmask = 0;
}

static unsigned long host_idt_base;

#if IS_ENABLED(CONFIG_HYPERV)
static struct kvm_x86_ops vmx_x86_ops __initdata;

static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);

static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
{
	struct hv_enlightened_vmcs *evmcs;
	struct hv_partition_assist_pg **p_hv_pa_pg =
			&to_kvm_hv(vcpu->kvm)->hv_pa_pg;
	/*
	 * Synthetic VM-Exit is not enabled in current code and so all
	 * eVMCS in a single VM share the same assist page.
	 */
	if (!*p_hv_pa_pg)
		*p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);

	if (!*p_hv_pa_pg)
		return -ENOMEM;

	evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;

	evmcs->partition_assist_page =
		__pa(*p_hv_pa_pg);
	evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
	evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;

	return 0;
}

static __init void hv_init_evmcs(void)
{
	int cpu;

	if (!enlightened_vmcs)
		return;

	/*
	 * Enlightened VMCS usage should be recommended and the host needs
	 * to support eVMCS v1 or above.
	 */
	if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
	    (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
	     KVM_EVMCS_VERSION) {

		/* Check that we have assist pages on all online CPUs */
		for_each_online_cpu(cpu) {
			if (!hv_get_vp_assist_page(cpu)) {
				enlightened_vmcs = false;
				break;
			}
		}

		if (enlightened_vmcs) {
			pr_info("Using Hyper-V Enlightened VMCS\n");
			static_branch_enable(&enable_evmcs);
		}

		if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
			vmx_x86_ops.enable_l2_tlb_flush
				= hv_enable_l2_tlb_flush;

	} else {
		enlightened_vmcs = false;
	}
}

static void hv_reset_evmcs(void)
{
	struct hv_vp_assist_page *vp_ap;

	if (!static_branch_unlikely(&enable_evmcs))
		return;

	/*
	 * KVM should enable eVMCS if and only if all CPUs have a VP assist
	 * page, and should reject CPU onlining if eVMCS is enabled and the
	 * CPU doesn't have a VP assist page allocated.
	 */
	vp_ap = hv_get_vp_assist_page(smp_processor_id());
	if (WARN_ON_ONCE(!vp_ap))
		return;

	/*
	 * Reset everything to support using non-enlightened VMCS access later
	 * (e.g. when we reload the module with enlightened_vmcs=0)
	 */
	vp_ap->nested_control.features.directhypercall = 0;
	vp_ap->current_nested_vmcs = 0;
	vp_ap->enlighten_vmentry = 0;
}

#else /* IS_ENABLED(CONFIG_HYPERV) */
static void hv_init_evmcs(void) {}
static void hv_reset_evmcs(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */

/*
 * Comment's format: document - errata name - stepping - processor name.
 * Refer from
 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
 */
static u32 vmx_preemption_cpu_tfms[] = {
/* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
0x000206E6,
/* 323056.pdf - AAX65  - C2 - Xeon L3406 */
/* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
/* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020652,
/* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020655,
/* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
/* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
/*
 * 320767.pdf - AAP86  - B1 -
 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
 */
0x000106E5,
/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
0x000106A0,
/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
0x000106A1,
/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
0x000106A4,
/* 321333.pdf - AAM126 - D0 - Xeon 3500 */
/* 321324.pdf - AAK139 - D0 - Xeon 5500 */
/* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
0x000106A5,
/* Xeon E3-1220 V2 */
0x000306A8,
};

static inline bool cpu_has_broken_vmx_preemption_timer(void)
{
	u32 eax = cpuid_eax(0x00000001), i;

	/* Clear the reserved bits */
	eax &= ~(0x3U << 14 | 0xfU << 28);
	for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
		if (eax == vmx_preemption_cpu_tfms[i])
			return true;

	return false;
}

static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
{
	return flexpriority_enabled && lapic_in_kernel(vcpu);
}

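/*
 * Return the index of @msr in vmx_possible_passthrough_msrs[], or -ENOENT if
 * the MSR is not in the possible-passthrough list.
 */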
static int possible_passthrough_msr_slot(u32 msr)
{
	u32 i;

	for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++)
		if (vmx_possible_passthrough_msrs[i] == msr)
			return i;

	return -ENOENT;
}

static bool is_valid_passthrough_msr(u32 msr)
{
	bool r;

	switch (msr) {
	case 0x800 ... 0x8ff:
		/* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
		return true;
	case MSR_IA32_RTIT_STATUS:
	case MSR_IA32_RTIT_OUTPUT_BASE:
	case MSR_IA32_RTIT_OUTPUT_MASK:
	case MSR_IA32_RTIT_CR3_MATCH:
	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
		/* PT MSRs. These are handled in pt_update_intercept_for_msr() */
	case MSR_LBR_SELECT:
	case MSR_LBR_TOS:
	case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31:
	case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31:
	case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:
	case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
	case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
		/* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
		return true;
	}

	r = possible_passthrough_msr_slot(msr) != -ENOENT;

	WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);

	return r;
}

struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
	int i;

	i = kvm_find_user_return_msr(msr);
	if (i >= 0)
		return &vmx->guest_uret_msrs[i];
	return NULL;
}

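/*
 * Update a guest user-return MSR: write the new value via the user-return MSR
 * framework when the MSR is currently loaded into hardware, and cache the
 * value on success.
 */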
static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
				  struct vmx_uret_msr *msr, u64 data)
{
	unsigned int slot = msr - vmx->guest_uret_msrs;
	int ret = 0;

	if (msr->load_into_hardware) {
		preempt_disable();
		ret = kvm_set_user_return_msr(slot, data, msr->mask);
		preempt_enable();
	}
	if (!ret)
		msr->data = data;
	return ret;
}

#ifdef CONFIG_KEXEC_CORE
static void crash_vmclear_local_loaded_vmcss(void)
{
	int cpu = raw_smp_processor_id();
	struct loaded_vmcs *v;

	list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
			    loaded_vmcss_on_cpu_link)
		vmcs_clear(v->vmcs);
}
#endif /* CONFIG_KEXEC_CORE */

static void __loaded_vmcs_clear(void *arg)
{
	struct loaded_vmcs *loaded_vmcs = arg;
	int cpu = raw_smp_processor_id();

	if (loaded_vmcs->cpu != cpu)
		return; /* vcpu migration can race with cpu offline */
	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
		per_cpu(current_vmcs, cpu) = NULL;

	vmcs_clear(loaded_vmcs->vmcs);
	if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
		vmcs_clear(loaded_vmcs->shadow_vmcs);

	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);

	/*
	 * Ensure all writes to loaded_vmcs, including deleting it from its
	 * current percpu list, complete before setting loaded_vmcs->cpu to
	 * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
	 * and add loaded_vmcs to its percpu list before it's deleted from this
	 * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
	 */
	smp_wmb();

	loaded_vmcs->cpu = -1;
	loaded_vmcs->launched = 0;
}

void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
{
	int cpu = loaded_vmcs->cpu;

	if (cpu != -1)
		smp_call_function_single(cpu,
					 __loaded_vmcs_clear, loaded_vmcs, 1);
}

static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
				       unsigned field)
{
	bool ret;
	u32 mask = 1 << (seg * SEG_FIELD_NR + field);

	if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
		kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
		vmx->segment_cache.bitmask = 0;
	}
	ret = vmx->segment_cache.bitmask & mask;
	vmx->segment_cache.bitmask |= mask;
	return ret;
}

static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
{
	u16 *p = &vmx->segment_cache.seg[seg].selector;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
		*p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
	return *p;
}

static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
{
	ulong *p = &vmx->segment_cache.seg[seg].base;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
		*p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
	return *p;
}

static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
{
	u32 *p = &vmx->segment_cache.seg[seg].limit;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
		*p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
	return *p;
}

static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
{
	u32 *p = &vmx->segment_cache.seg[seg].ar;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
		*p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
	return *p;
}

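/*
 * Recompute the set of exceptions KVM intercepts for this vCPU, including any
 * exceptions L1 wants intercepted while running L2, and update the VMCS
 * exception bitmap and #PF error-code mask/match fields accordingly.
 */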
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
{
	u32 eb;

	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
	     (1u << DB_VECTOR) | (1u << AC_VECTOR);
	/*
	 * Guest access to VMware backdoor ports could legitimately
	 * trigger #GP because of TSS I/O permission bitmap.
	 * We intercept those #GP and allow access to them anyway
	 * as VMware does.
	 */
	if (enable_vmware_backdoor)
		eb |= (1u << GP_VECTOR);
	if ((vcpu->guest_debug &
	     (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
	    (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
		eb |= 1u << BP_VECTOR;
	if (to_vmx(vcpu)->rmode.vm86_active)
		eb = ~0;
	if (!vmx_need_pf_intercept(vcpu))
		eb &= ~(1u << PF_VECTOR);

	/* When we are running a nested L2 guest and L1 specified for it a
	 * certain exception bitmap, we must trap the same exceptions and pass
	 * them to L1. When running L2, we will only handle the exceptions
	 * specified above if L1 did not want them.
	 */
	if (is_guest_mode(vcpu))
		eb |= get_vmcs12(vcpu)->exception_bitmap;
	else {
		int mask = 0, match = 0;

		if (enable_ept && (eb & (1u << PF_VECTOR))) {
			/*
			 * If EPT is enabled, #PF is currently only intercepted
			 * if MAXPHYADDR is smaller on the guest than on the
			 * host.  In that case we only care about present,
			 * non-reserved faults.  For vmcs02, however, PFEC_MASK
			 * and PFEC_MATCH are set in prepare_vmcs02_rare.
			 */
			mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK;
			match = PFERR_PRESENT_MASK;
		}
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
	}

	/*
	 * Disabling xfd interception indicates that dynamic xfeatures
	 * might be used in the guest. Always trap #NM in this case
	 * to save guest xfd_err timely.
	 */
	if (vcpu->arch.xfd_no_write_intercept)
		eb |= (1u << NM_VECTOR);

	vmcs_write32(EXCEPTION_BITMAP, eb);
}

/*
 * Check if MSR is intercepted for currently loaded MSR bitmap.
 */
static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
{
	if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
		return true;

	return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr);
}

JP
917unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
918{
919 unsigned int flags = 0;
920
921 if (vmx->loaded_vmcs->launched)
922 flags |= VMX_RUN_VMRESUME;
923
fc02735b
JP
924 /*
925 * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free
926 * to change it directly without causing a vmexit. In that case read
927 * it after vmexit and store it in vmx->spec_ctrl.
928 */
4f209989 929 if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
fc02735b
JP
930 flags |= VMX_RUN_SAVE_SPEC_CTRL;
931
bb066506
JP
932 return flags;
933}
934
ee087b4d 935static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
2961e876 936 unsigned long entry, unsigned long exit)
8bf00a52 937{
2961e876
GN
938 vm_entry_controls_clearbit(vmx, entry);
939 vm_exit_controls_clearbit(vmx, exit);
8bf00a52
GN
940}
941
a128a934 942int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
ca83b4a7
KRW
943{
944 unsigned int i;
945
946 for (i = 0; i < m->nr; ++i) {
947 if (m->val[i].index == msr)
948 return i;
949 }
950 return -ENOENT;
951}
952
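/*
 * Remove @msr from the VM-Entry and VM-Exit MSR-load (autoload) lists, or, for
 * MSRs with dedicated VMCS support (EFER, PERF_GLOBAL_CTRL), clear the
 * corresponding VM-Entry/VM-Exit load controls instead.
 */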
static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
{
	int i;
	struct msr_autoload *m = &vmx->msr_autoload;

	switch (msr) {
	case MSR_EFER:
		if (cpu_has_load_ia32_efer()) {
			clear_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_EFER,
					VM_EXIT_LOAD_IA32_EFER);
			return;
		}
		break;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (cpu_has_load_perf_global_ctrl()) {
			clear_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
			return;
		}
		break;
	}
	i = vmx_find_loadstore_msr_slot(&m->guest, msr);
	if (i < 0)
		goto skip_guest;
	--m->guest.nr;
	m->guest.val[i] = m->guest.val[m->guest.nr];
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);

skip_guest:
	i = vmx_find_loadstore_msr_slot(&m->host, msr);
	if (i < 0)
		return;

	--m->host.nr;
	m->host.val[i] = m->host.val[m->host.nr];
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
}

static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit,
		unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
		u64 guest_val, u64 host_val)
{
	vmcs_write64(guest_val_vmcs, guest_val);
	if (host_val_vmcs != HOST_IA32_EFER)
		vmcs_write64(host_val_vmcs, host_val);
	vm_entry_controls_setbit(vmx, entry);
	vm_exit_controls_setbit(vmx, exit);
}

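/*
 * Add @msr to the atomic switch (autoload) lists so that hardware loads
 * @guest_val on VM-Entry and, unless @entry_only, @host_val on VM-Exit.
 */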
static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
				  u64 guest_val, u64 host_val, bool entry_only)
{
	int i, j = 0;
	struct msr_autoload *m = &vmx->msr_autoload;

	switch (msr) {
	case MSR_EFER:
		if (cpu_has_load_ia32_efer()) {
			add_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_EFER,
					VM_EXIT_LOAD_IA32_EFER,
					GUEST_IA32_EFER,
					HOST_IA32_EFER,
					guest_val, host_val);
			return;
		}
		break;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (cpu_has_load_perf_global_ctrl()) {
			add_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
					GUEST_IA32_PERF_GLOBAL_CTRL,
					HOST_IA32_PERF_GLOBAL_CTRL,
					guest_val, host_val);
			return;
		}
		break;
	case MSR_IA32_PEBS_ENABLE:
		/* PEBS needs a quiescent period after being disabled (to write
		 * a record).  Disabling PEBS through VMX MSR swapping doesn't
		 * provide that period, so a CPU could write host's record into
		 * guest's memory.
		 */
		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
	}

	i = vmx_find_loadstore_msr_slot(&m->guest, msr);
	if (!entry_only)
		j = vmx_find_loadstore_msr_slot(&m->host, msr);

	if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
	    (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
		printk_once(KERN_WARNING "Not enough msr switch entries. "
				"Can't add msr %x\n", msr);
		return;
	}
	if (i < 0) {
		i = m->guest.nr++;
		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
	}
	m->guest.val[i].index = msr;
	m->guest.val[i].value = guest_val;

	if (entry_only)
		return;

	if (j < 0) {
		j = m->host.nr++;
		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
	}
	m->host.val[j].index = msr;
	m->host.val[j].value = host_val;
}

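/*
 * Decide how guest EFER is switched across VM transitions: via the atomic
 * autoload lists or dedicated VMCS controls (returns false), or via the
 * user-return MSR mechanism (returns true).
 */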
static bool update_transition_efer(struct vcpu_vmx *vmx)
{
	u64 guest_efer = vmx->vcpu.arch.efer;
	u64 ignore_bits = 0;
	int i;

	/* Shadow paging assumes NX to be available.  */
	if (!enable_ept)
		guest_efer |= EFER_NX;

	/*
	 * LMA and LME handled by hardware; SCE meaningless outside long mode.
	 */
	ignore_bits |= EFER_SCE;
#ifdef CONFIG_X86_64
	ignore_bits |= EFER_LMA | EFER_LME;
	/* SCE is meaningful only in long mode on Intel */
	if (guest_efer & EFER_LMA)
		ignore_bits &= ~(u64)EFER_SCE;
#endif

	/*
	 * On EPT, we can't emulate NX, so we must switch EFER atomically.
	 * On CPUs that support "load IA32_EFER", always switch EFER
	 * atomically, since it's faster than switching it manually.
	 */
	if (cpu_has_load_ia32_efer() ||
	    (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
		if (!(guest_efer & EFER_LMA))
			guest_efer &= ~EFER_LME;
		if (guest_efer != host_efer)
			add_atomic_switch_msr(vmx, MSR_EFER,
					      guest_efer, host_efer, false);
		else
			clear_atomic_switch_msr(vmx, MSR_EFER);
		return false;
	}

	i = kvm_find_user_return_msr(MSR_EFER);
	if (i < 0)
		return false;

	clear_atomic_switch_msr(vmx, MSR_EFER);

	guest_efer &= ~ignore_bits;
	guest_efer |= host_efer & ignore_bits;

	vmx->guest_uret_msrs[i].data = guest_efer;
	vmx->guest_uret_msrs[i].mask = ~ignore_bits;

	return true;
}

#ifdef CONFIG_X86_32
/*
 * On 32-bit kernels, VM exits still load the FS and GS bases from the
 * VMCS rather than the segment table.  KVM uses this helper to figure
 * out the current bases to poke them into the VMCS before entry.
 */
static unsigned long segment_base(u16 selector)
{
	struct desc_struct *table;
	unsigned long v;

	if (!(selector & ~SEGMENT_RPL_MASK))
		return 0;

	table = get_current_gdt_ro();

	if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
		u16 ldt_selector = kvm_read_ldt();

		if (!(ldt_selector & ~SEGMENT_RPL_MASK))
			return 0;

		table = (struct desc_struct *)segment_base(ldt_selector);
	}
	v = get_desc_base(&table[selector >> 3]);
	return v;
}
#endif

static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
{
	return vmx_pt_mode_is_host_guest() &&
	       !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
}

static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
{
	/* The base must be 128-byte aligned and a legal physical address. */
	return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
}

static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
{
	u32 i;

	wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
	wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
	wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
	wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
	for (i = 0; i < addr_range; i++) {
		wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
		wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
	}
}

static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
{
	u32 i;

	rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
	rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
	rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
	rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
	for (i = 0; i < addr_range; i++) {
		rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
		rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
	}
}

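/*
 * Save the host's Intel PT MSR context and load the guest's before VM-Entry
 * (pt_guest_exit() does the reverse on VM-Exit).
 */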
static void pt_guest_enter(struct vcpu_vmx *vmx)
{
	if (vmx_pt_mode_is_system())
		return;

	/*
	 * GUEST_IA32_RTIT_CTL is already set in the VMCS.
	 * Save host state before VM entry.
	 */
	rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
		wrmsrl(MSR_IA32_RTIT_CTL, 0);
		pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
		pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
	}
}

static void pt_guest_exit(struct vcpu_vmx *vmx)
{
	if (vmx_pt_mode_is_system())
		return;

	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
		pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
		pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
	}

	/*
	 * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
	 * i.e. RTIT_CTL is always cleared on VM-Exit.  Restore it if necessary.
	 */
	if (vmx->pt_desc.host.ctl)
		wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
}

void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
			unsigned long fs_base, unsigned long gs_base)
{
	if (unlikely(fs_sel != host->fs_sel)) {
		if (!(fs_sel & 7))
			vmcs_write16(HOST_FS_SELECTOR, fs_sel);
		else
			vmcs_write16(HOST_FS_SELECTOR, 0);
		host->fs_sel = fs_sel;
	}
	if (unlikely(gs_sel != host->gs_sel)) {
		if (!(gs_sel & 7))
			vmcs_write16(HOST_GS_SELECTOR, gs_sel);
		else
			vmcs_write16(HOST_GS_SELECTOR, 0);
		host->gs_sel = gs_sel;
	}
	if (unlikely(fs_base != host->fs_base)) {
		vmcs_writel(HOST_FS_BASE, fs_base);
		host->fs_base = fs_base;
	}
	if (unlikely(gs_base != host->gs_base)) {
		vmcs_writel(HOST_GS_BASE, gs_base);
		host->gs_base = gs_base;
	}
}

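/*
 * Load the guest's user-return MSRs into hardware and save the host's
 * segment, LDT and kernel GS base state into the VMCS host-state area so it
 * can be restored after VM-Exit.
 */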
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs_host_state *host_state;
#ifdef CONFIG_X86_64
	int cpu = raw_smp_processor_id();
#endif
	unsigned long fs_base, gs_base;
	u16 fs_sel, gs_sel;
	int i;

	vmx->req_immediate_exit = false;

	/*
	 * Note that guest MSRs to be saved/restored can also be changed
	 * when guest state is loaded. This happens when guest transitions
	 * to/from long-mode by setting MSR_EFER.LMA.
	 */
	if (!vmx->guest_uret_msrs_loaded) {
		vmx->guest_uret_msrs_loaded = true;
		for (i = 0; i < kvm_nr_uret_msrs; ++i) {
			if (!vmx->guest_uret_msrs[i].load_into_hardware)
				continue;

			kvm_set_user_return_msr(i,
						vmx->guest_uret_msrs[i].data,
						vmx->guest_uret_msrs[i].mask);
		}
	}

	if (vmx->nested.need_vmcs12_to_shadow_sync)
		nested_sync_vmcs12_to_shadow(vcpu);

	if (vmx->guest_state_loaded)
		return;

	host_state = &vmx->loaded_vmcs->host_state;

	/*
	 * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
	 * allow segment selectors with cpl > 0 or ti == 1.
	 */
	host_state->ldt_sel = kvm_read_ldt();

#ifdef CONFIG_X86_64
	savesegment(ds, host_state->ds_sel);
	savesegment(es, host_state->es_sel);

	gs_base = cpu_kernelmode_gs_base(cpu);
	if (likely(is_64bit_mm(current->mm))) {
		current_save_fsgs();
		fs_sel = current->thread.fsindex;
		gs_sel = current->thread.gsindex;
		fs_base = current->thread.fsbase;
		vmx->msr_host_kernel_gs_base = current->thread.gsbase;
	} else {
		savesegment(fs, fs_sel);
		savesegment(gs, gs_sel);
		fs_base = read_msr(MSR_FS_BASE);
		vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
	}

	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
	savesegment(fs, fs_sel);
	savesegment(gs, gs_sel);
	fs_base = segment_base(fs_sel);
	gs_base = segment_base(gs_sel);
#endif

	vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
	vmx->guest_state_loaded = true;
}

static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
{
	struct vmcs_host_state *host_state;

	if (!vmx->guest_state_loaded)
		return;

	host_state = &vmx->loaded_vmcs->host_state;

	++vmx->vcpu.stat.host_state_reload;

#ifdef CONFIG_X86_64
	rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
	if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
		kvm_load_ldt(host_state->ldt_sel);
#ifdef CONFIG_X86_64
		load_gs_index(host_state->gs_sel);
#else
		loadsegment(gs, host_state->gs_sel);
#endif
	}
	if (host_state->fs_sel & 7)
		loadsegment(fs, host_state->fs_sel);
#ifdef CONFIG_X86_64
	if (unlikely(host_state->ds_sel | host_state->es_sel)) {
		loadsegment(ds, host_state->ds_sel);
		loadsegment(es, host_state->es_sel);
	}
#endif
	invalidate_tss_limit();
#ifdef CONFIG_X86_64
	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
#endif
	load_fixmap_gdt(raw_smp_processor_id());
	vmx->guest_state_loaded = false;
	vmx->guest_uret_msrs_loaded = false;
}

#ifdef CONFIG_X86_64
static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
{
	preempt_disable();
	if (vmx->guest_state_loaded)
		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
	preempt_enable();
	return vmx->msr_guest_kernel_gs_base;
}

static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
{
	preempt_disable();
	if (vmx->guest_state_loaded)
		wrmsrl(MSR_KERNEL_GS_BASE, data);
	preempt_enable();
	vmx->msr_guest_kernel_gs_base = data;
}
#endif

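/*
 * Load the vCPU's VMCS on @cpu: migrate it onto this CPU's loaded-VMCS list
 * if needed, make it the current VMCS, and refresh the per-CPU host state
 * fields (TR/GDT bases, SYSENTER_ESP).
 */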
void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
			struct loaded_vmcs *buddy)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
	struct vmcs *prev;

	if (!already_loaded) {
		loaded_vmcs_clear(vmx->loaded_vmcs);
		local_irq_disable();

		/*
		 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
		 * this cpu's percpu list, otherwise it may not yet be deleted
		 * from its previous cpu's percpu list.  Pairs with the
		 * smp_wmb() in __loaded_vmcs_clear().
		 */
		smp_rmb();

		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
			 &per_cpu(loaded_vmcss_on_cpu, cpu));
		local_irq_enable();
	}

	prev = per_cpu(current_vmcs, cpu);
	if (prev != vmx->loaded_vmcs->vmcs) {
		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
		vmcs_load(vmx->loaded_vmcs->vmcs);

		/*
		 * No indirect branch prediction barrier needed when switching
		 * the active VMCS within a vCPU, unless IBRS is advertised to
		 * the vCPU.  To minimize the number of IBPBs executed, KVM
		 * performs IBPB on nested VM-Exit (a single nested transition
		 * may switch the active VMCS multiple times).
		 */
		if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
			indirect_branch_prediction_barrier();
	}

	if (!already_loaded) {
		void *gdt = get_current_gdt_ro();

		/*
		 * Flush all EPTP/VPID contexts, the new pCPU may have stale
		 * TLB entries from its previous association with the vCPU.
		 */
		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);

		/*
		 * Linux uses per-cpu TSS and GDT, so set these when switching
		 * processors.  See 22.2.4.
		 */
		vmcs_writel(HOST_TR_BASE,
			    (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
		vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */

		if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
			/* 22.2.3 */
			vmcs_writel(HOST_IA32_SYSENTER_ESP,
				    (unsigned long)(cpu_entry_stack(cpu) + 1));
		}

		vmx->loaded_vmcs->cpu = cpu;
	}
}

/*
 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
 * vcpu mutex is already taken.
 */
static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	vmx_vcpu_load_vmcs(vcpu, cpu, NULL);

	vmx_vcpu_pi_load(vcpu, cpu);

	vmx->host_debugctlmsr = get_debugctlmsr();
}

static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
	vmx_vcpu_pi_put(vcpu);

	vmx_prepare_switch_to_host(to_vmx(vcpu));
}

bool vmx_emulation_required(struct kvm_vcpu *vcpu)
{
	return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
}

unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long rflags, save_rflags;

	if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
		kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
		rflags = vmcs_readl(GUEST_RFLAGS);
		if (vmx->rmode.vm86_active) {
			rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
			save_rflags = vmx->rmode.save_rflags;
			rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
		}
		vmx->rflags = rflags;
	}
	return vmx->rflags;
}

void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long old_rflags;

	if (is_unrestricted_guest(vcpu)) {
		kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
		vmx->rflags = rflags;
		vmcs_writel(GUEST_RFLAGS, rflags);
		return;
	}

	old_rflags = vmx_get_rflags(vcpu);
	vmx->rflags = rflags;
	if (vmx->rmode.vm86_active) {
		vmx->rmode.save_rflags = rflags;
		rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
	}
	vmcs_writel(GUEST_RFLAGS, rflags);

	if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
		vmx->emulation_required = vmx_emulation_required(vcpu);
}

static bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
{
	return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
}

u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
	u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	int ret = 0;

	if (interruptibility & GUEST_INTR_STATE_STI)
		ret |= KVM_X86_SHADOW_INT_STI;
	if (interruptibility & GUEST_INTR_STATE_MOV_SS)
		ret |= KVM_X86_SHADOW_INT_MOV_SS;

	return ret;
}

void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
	u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	u32 interruptibility = interruptibility_old;

	interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);

	if (mask & KVM_X86_SHADOW_INT_MOV_SS)
		interruptibility |= GUEST_INTR_STATE_MOV_SS;
	else if (mask & KVM_X86_SHADOW_INT_STI)
		interruptibility |= GUEST_INTR_STATE_STI;

	if ((interruptibility != interruptibility_old))
		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
}

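/*
 * Validate a guest write to IA32_RTIT_CTL against the PT capabilities exposed
 * to the guest.  Returns 1 if the write should #GP, 0 if it is legal.
 */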
bf8c55d8
CP
1558static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
1559{
1560 struct vcpu_vmx *vmx = to_vmx(vcpu);
1561 unsigned long value;
1562
1563 /*
1564 * Any MSR write that attempts to change bits marked reserved will
1565 * case a #GP fault.
1566 */
1567 if (data & vmx->pt_desc.ctl_bitmask)
1568 return 1;
1569
1570 /*
1571 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
1572 * result in a #GP unless the same write also clears TraceEn.
1573 */
1574 if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
1575 ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
1576 return 1;
1577
1578 /*
1579 * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit
1580 * and FabricEn would cause #GP, if
1581 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
1582 */
1583 if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
1584 !(data & RTIT_CTL_FABRIC_EN) &&
1585 !intel_pt_validate_cap(vmx->pt_desc.caps,
1586 PT_CAP_single_range_output))
1587 return 1;
1588
1589 /*
1590 * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that
d9f6e12f 1591 * utilize encodings marked reserved will cause a #GP fault.
bf8c55d8
CP
1592 */
1593 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
1594 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
1595 !test_bit((data & RTIT_CTL_MTC_RANGE) >>
1596 RTIT_CTL_MTC_RANGE_OFFSET, &value))
1597 return 1;
1598 value = intel_pt_validate_cap(vmx->pt_desc.caps,
1599 PT_CAP_cycle_thresholds);
1600 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1601 !test_bit((data & RTIT_CTL_CYC_THRESH) >>
1602 RTIT_CTL_CYC_THRESH_OFFSET, &value))
1603 return 1;
1604 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
1605 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1606 !test_bit((data & RTIT_CTL_PSB_FREQ) >>
1607 RTIT_CTL_PSB_FREQ_OFFSET, &value))
1608 return 1;
1609
1610 /*
1611 * If ADDRx_CFG is reserved or the encodings is >2 will
1612 * cause a #GP fault.
1613 */
1614 value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
f4d3a902 1615 if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2))
bf8c55d8
CP
1616 return 1;
1617 value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
f4d3a902 1618 if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2))
bf8c55d8
CP
1619 return 1;
1620 value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
f4d3a902 1621 if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2))
bf8c55d8
CP
1622 return 1;
1623 value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
f4d3a902 1624 if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2))
bf8c55d8
CP
1625 return 1;
1626
1627 return 0;
1628}
1629
4d31d9ef
SC
1630static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
1631 void *insn, int insn_len)
09e3e2a1 1632{
3c0c2ad1
SC
1633 /*
1634 * Emulation of instructions in SGX enclaves is impossible as RIP does
4d31d9ef 1635 * not point at the failing instruction, and even if it did, the code
3c0c2ad1
SC
1636 * stream is inaccessible. Inject #UD instead of exiting to userspace
1637 * so that guest userspace can't DoS the guest simply by triggering
1638 * emulation (enclaves are CPL3 only).
1639 */
1640 if (to_vmx(vcpu)->exit_reason.enclave_mode) {
1641 kvm_queue_exception(vcpu, UD_VECTOR);
1642 return false;
1643 }
09e3e2a1
SC
1644 return true;
1645}
1646
1957aa63 1647static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
6aa8b732 1648{
3c0c2ad1 1649 union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
fede8076 1650 unsigned long rip, orig_rip;
3c0c2ad1 1651 u32 instr_len;
6aa8b732 1652
1957aa63
SC
1653 /*
1654 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
1655 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
1656 * set when EPT misconfig occurs. In practice, real hardware updates
1657 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
1658 * (namely Hyper-V) don't set it due to it being undefined behavior,
1659 * i.e. we end up advancing IP with some random value.
1660 */
1661 if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
3c0c2ad1
SC
1662 exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
1663 instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1664
1665 /*
1666 * Emulating an enclave's instructions isn't supported as KVM
1667 * cannot access the enclave's memory or its true RIP, e.g. the
1668 * vmcs.GUEST_RIP points at the exit point of the enclave, not
1669 * the RIP that actually triggered the VM-Exit. But, because
1670 * most instructions that cause VM-Exit will #UD in an enclave,
1671 * most instruction-based VM-Exits simply do not occur.
1672 *
1673 * There are a few exceptions, notably the debug instructions
1674 * INT1ICEBRK and INT3, as they are allowed in debug enclaves
1675 * and generate #DB/#BP as expected, which KVM might intercept.
1676 * But again, the CPU does the dirty work and saves an instr
1677 * length of zero so VMMs don't shoot themselves in the foot.
1678 * WARN if KVM tries to skip a non-zero length instruction on
1679 * a VM-Exit from an enclave.
1680 */
1681 if (!instr_len)
1682 goto rip_updated;
1683
8d20bd63
SC
1684 WARN_ONCE(exit_reason.enclave_mode,
1685 "skipping instruction after SGX enclave VM-Exit");
3c0c2ad1 1686
fede8076 1687 orig_rip = kvm_rip_read(vcpu);
3c0c2ad1 1688 rip = orig_rip + instr_len;
fede8076
PB
1689#ifdef CONFIG_X86_64
1690 /*
1691 * We need to mask out the high 32 bits of RIP if not in 64-bit
1692 * mode, but just finding out that we are in 64-bit mode is
1693 * quite expensive. Only do it if there was a carry.
1694 */
1695 if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
1696 rip = (u32)rip;
1697#endif
1957aa63
SC
1698 kvm_rip_write(vcpu, rip);
1699 } else {
1700 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
1701 return 0;
1702 }
6aa8b732 1703
3c0c2ad1 1704rip_updated:
2809f5d2
GC
1705 /* skipping an emulated instruction also counts */
1706 vmx_set_interrupt_shadow(vcpu, 0);
f8ea7c60 1707
60fc3d02 1708 return 1;
f8ea7c60
VK
1709}
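/*
 * Illustrative note, not part of the original file: the
 * "((rip ^ orig_rip) >> 31) == 3" test above is true only when bit 31
 * flipped and bit 32 was set by the addition, i.e. when the increment
 * carried out of the low 32 bits.  With hypothetical values:
 *
 *	orig_rip = 0x00000000fffffffe, instr_len = 3
 *	rip      = 0x0000000100000001
 *	(rip ^ orig_rip) >> 31 == 0x3
 *
 * so RIP is truncated to its low 32 bits unless the vCPU really is in
 * 64-bit mode.
 */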
1710
5ef8acbd
OU
1711/*
1712 * Recognizes a pending MTF VM-exit and records the nested state for later
1713 * delivery.
1714 */
1715static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
1716{
1717 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1718 struct vcpu_vmx *vmx = to_vmx(vcpu);
1719
1720 if (!is_guest_mode(vcpu))
1721 return;
1722
1723 /*
1724 * Per the SDM, MTF takes priority over debug-trap exceptions besides
65ec8f01
SC
1725 * TSS T-bit traps and ICEBP (INT1). KVM doesn't emulate T-bit traps
1726 * or ICEBP (in the emulator proper), and skipping of ICEBP after an
1727 * intercepted #DB deliberately avoids single-step #DB and MTF updates
1728 * as ICEBP is higher priority than both. As instruction emulation is
1729 * completed at this point (i.e. KVM is at the instruction boundary),
1730 * any #DB exception pending delivery must be a debug-trap of lower
1731 * priority than MTF. Record the pending MTF state to be delivered in
5ef8acbd
OU
1732 * vmx_check_nested_events().
1733 */
1734 if (nested_cpu_has_mtf(vmcs12) &&
1735 (!vcpu->arch.exception.pending ||
7709aba8
SC
1736 vcpu->arch.exception.vector == DB_VECTOR) &&
1737 (!vcpu->arch.exception_vmexit.pending ||
2ea89c7f 1738 vcpu->arch.exception_vmexit.vector == DB_VECTOR)) {
5ef8acbd 1739 vmx->nested.mtf_pending = true;
2ea89c7f
SC
1740 kvm_make_request(KVM_REQ_EVENT, vcpu);
1741 } else {
5ef8acbd 1742 vmx->nested.mtf_pending = false;
2ea89c7f 1743 }
5ef8acbd
OU
1744}
1745
1746static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
1747{
1748 vmx_update_emulated_instruction(vcpu);
1749 return skip_emulated_instruction(vcpu);
1750}
1751
caa057a2
WL
1752static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1753{
1754 /*
1755 * Ensure that we clear the HLT state in the VMCS. We don't need to
1756 * explicitly skip the instruction because if the HLT state is set,
1757 * then the instruction is already executing and RIP has already been
1758 * advanced.
1759 */
1760 if (kvm_hlt_in_guest(vcpu->kvm) &&
1761 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1762 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1763}
1764
6ad75c5c 1765static void vmx_inject_exception(struct kvm_vcpu *vcpu)
298101da 1766{
d4963e31
SC
1767 struct kvm_queued_exception *ex = &vcpu->arch.exception;
1768 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
77ab6db0
JK
1769 struct vcpu_vmx *vmx = to_vmx(vcpu);
1770
d4963e31 1771 kvm_deliver_exception_payload(vcpu, ex);
da998b46 1772
d4963e31 1773 if (ex->has_error_code) {
eba9799b
SC
1774 /*
1775 * Despite the error code being architecturally defined as 32
1776 * bits, and the VMCS field being 32 bits, Intel CPUs and thus
 1777 		 * VMX don't actually support setting bits 31:16. Hardware
 1778 		 * will (should) never provide a bogus error code, but AMD CPUs
 1779 		 * do generate error codes with bits 31:16 set, and so KVM's
 1780 		 * ABI lets userspace shove in arbitrary 32-bit values. Drop
 1781 		 * the upper bits to avoid VM-Fail; losing information that
 1782 		 * doesn't really exist is preferable to killing the VM.
1783 */
d4963e31 1784 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code);
8ab2d2e2
JK
1785 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
1786 }
77ab6db0 1787
7ffd92c5 1788 if (vmx->rmode.vm86_active) {
71f9833b 1789 int inc_eip = 0;
d4963e31 1790 if (kvm_exception_is_soft(ex->vector))
71f9833b 1791 inc_eip = vcpu->arch.event_exit_inst_len;
d4963e31 1792 kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip);
77ab6db0
JK
1793 return;
1794 }
1795
add5ff7a
SC
1796 WARN_ON_ONCE(vmx->emulation_required);
1797
d4963e31 1798 if (kvm_exception_is_soft(ex->vector)) {
66fd3f7f
GN
1799 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
1800 vmx->vcpu.arch.event_exit_inst_len);
8ab2d2e2
JK
1801 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
1802 } else
1803 intr_info |= INTR_TYPE_HARD_EXCEPTION;
1804
1805 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
caa057a2
WL
1806
1807 vmx_clear_hlt(vcpu);
298101da
AK
1808}
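/*
 * Illustrative note, not part of the original file: the (u16) cast on the
 * error code in vmx_inject_exception() means that, e.g., a userspace-provided
 * error code of 0x8000000b is written to the VMCS as 0x000b; the upper bits
 * are dropped rather than risking VM-Fail at VM-Entry.
 */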
1809
ee9d22e0
SC
1810static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
1811 bool load_into_hardware)
a75beee6 1812{
ee9d22e0 1813 struct vmx_uret_msr *uret_msr;
a2fa3e9f 1814
ee9d22e0
SC
1815 uret_msr = vmx_find_uret_msr(vmx, msr);
1816 if (!uret_msr)
bd65ba82 1817 return;
a2fa3e9f 1818
ee9d22e0 1819 uret_msr->load_into_hardware = load_into_hardware;
a75beee6
ED
1820}
1821
e38aea3e 1822/*
400dd54b
SC
 1823 * Configure the user return MSRs to automatically save, load, and restore MSRs
1824 * that need to be shoved into hardware when running the guest. Note, omitting
1825 * an MSR here does _NOT_ mean it's not emulated, only that it will not be
1826 * loaded into hardware when running the guest.
e38aea3e 1827 */
400dd54b 1828static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
e38aea3e 1829{
a75beee6 1830#ifdef CONFIG_X86_64
ee9d22e0
SC
1831 bool load_syscall_msrs;
1832
84c8c5b8
JM
1833 /*
1834 * The SYSCALL MSRs are only needed on long mode guests, and only
1835 * when EFER.SCE is set.
1836 */
ee9d22e0
SC
1837 load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
1838 (vmx->vcpu.arch.efer & EFER_SCE);
1839
1840 vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
1841 vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
1842 vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
a75beee6 1843#endif
ee9d22e0 1844 vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
e38aea3e 1845
ee9d22e0
SC
1846 vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
1847 guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
1848 guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));
bd65ba82 1849
5e17c624
SC
1850 /*
1851 * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
1852 * kernel and old userspace. If those guests run on a tsx=off host, do
1853 * allow guests to use TSX_CTRL, but don't change the value in hardware
1854 * so that TSX remains always disabled.
1855 */
1856 vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
5897297b 1857
ee9d22e0
SC
1858 /*
 1859 	 * The set of MSRs to load may have changed; reload MSRs before the
1860 * next VM-Enter.
1861 */
1862 vmx->guest_uret_msrs_loaded = false;
e38aea3e
AK
1863}
1864
307a94c7
IS
1865u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
1866{
1867 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1868
1869 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING))
1870 return vmcs12->tsc_offset;
1871
1872 return 0;
1873}
1874
1875u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
1876{
1877 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1878
1879 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) &&
1880 nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
1881 return vmcs12->tsc_multiplier;
1882
938c8745 1883 return kvm_caps.default_tsc_scaling_ratio;
307a94c7
IS
1884}
1885
edcfe540 1886static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
6aa8b732 1887{
edcfe540 1888 vmcs_write64(TSC_OFFSET, offset);
6aa8b732
AK
1889}
1890
1ab9287a
IS
1891static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
1892{
1893 vmcs_write64(TSC_MULTIPLIER, multiplier);
1894}
1895
801d3424
NHE
1896/*
1897 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
1898 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
1899 * all guests if the "nested" module option is off, and can also be disabled
1900 * for a single guest by disabling its VMX cpuid bit.
1901 */
7c97fcb3 1902bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
801d3424 1903{
d6321d49 1904 return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
801d3424
NHE
1905}
1906
d2a00af2
SC
1907/*
1908 * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of
1909 * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain
1910 * backwards compatibility even though KVM doesn't support emulating SMX. And
1911 * because userspace set "VMX in SMX", the guest must also be allowed to set it,
1912 * e.g. if the MSR is left unlocked and the guest does a RMW operation.
1913 */
1914#define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \
1915 FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \
1916 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \
1917 FEAT_CTL_SGX_LC_ENABLED | \
1918 FEAT_CTL_SGX_ENABLED | \
1919 FEAT_CTL_LMCE_ENABLED)
1920
2d6cd686
SC
1921static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
1922 struct msr_data *msr)
62cc6b9d 1923{
d2a00af2
SC
1924 uint64_t valid_bits;
1925
1926 /*
1927 * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are
1928 * exposed to the guest.
1929 */
1930 WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits &
1931 ~KVM_SUPPORTED_FEATURE_CONTROL);
1932
2d6cd686
SC
1933 if (!msr->host_initiated &&
1934 (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED))
1935 return false;
1936
d2a00af2
SC
1937 if (msr->host_initiated)
1938 valid_bits = KVM_SUPPORTED_FEATURE_CONTROL;
1939 else
1940 valid_bits = vmx->msr_ia32_feature_control_valid_bits;
62cc6b9d 1941
d2a00af2 1942 return !(msr->data & ~valid_bits);
62cc6b9d
DM
1943}
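/*
 * Illustrative note, not part of the original file: once FEAT_CTL_LOCKED is
 * set, a guest WRMSR is rejected outright (matching real hardware), while a
 * host-initiated write may still set any bit in KVM_SUPPORTED_FEATURE_CONTROL;
 * a guest write to an unlocked MSR is limited to
 * msr_ia32_feature_control_valid_bits.
 */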
1944
55d2375e 1945static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
62cc6b9d 1946{
55d2375e
SC
1947 switch (msr->index) {
1948 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
1949 if (!nested)
1950 return 1;
1951 return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
1952 default:
12bc2132 1953 return KVM_MSR_RET_INVALID;
55d2375e 1954 }
62cc6b9d
DM
1955}
1956
55d2375e 1957/*
fe26f91d 1958 * Reads an msr value (of 'msr_info->index') into 'msr_info->data'.
55d2375e
SC
1959 * Returns 0 on success, non-0 otherwise.
1960 * Assumes vcpu_load() was already called.
1961 */
1962static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
62cc6b9d 1963{
55d2375e 1964 struct vcpu_vmx *vmx = to_vmx(vcpu);
eb3db1b1 1965 struct vmx_uret_msr *msr;
bf8c55d8 1966 u32 index;
62cc6b9d 1967
55d2375e
SC
1968 switch (msr_info->index) {
1969#ifdef CONFIG_X86_64
1970 case MSR_FS_BASE:
1971 msr_info->data = vmcs_readl(GUEST_FS_BASE);
62cc6b9d 1972 break;
55d2375e
SC
1973 case MSR_GS_BASE:
1974 msr_info->data = vmcs_readl(GUEST_GS_BASE);
62cc6b9d 1975 break;
55d2375e
SC
1976 case MSR_KERNEL_GS_BASE:
1977 msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
62cc6b9d 1978 break;
55d2375e
SC
1979#endif
1980 case MSR_EFER:
1981 return kvm_get_msr_common(vcpu, msr_info);
c11f83e0
PB
1982 case MSR_IA32_TSX_CTRL:
1983 if (!msr_info->host_initiated &&
1984 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
1985 return 1;
eb3db1b1 1986 goto find_uret_msr;
6e3ba4ab
TX
1987 case MSR_IA32_UMWAIT_CONTROL:
1988 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
1989 return 1;
1990
1991 msr_info->data = vmx->msr_ia32_umwait_control;
1992 break;
55d2375e
SC
1993 case MSR_IA32_SPEC_CTRL:
1994 if (!msr_info->host_initiated &&
39485ed9 1995 !guest_has_spec_ctrl_msr(vcpu))
55d2375e
SC
1996 return 1;
1997
1998 msr_info->data = to_vmx(vcpu)->spec_ctrl;
62cc6b9d 1999 break;
6aa8b732 2000 case MSR_IA32_SYSENTER_CS:
609e36d3 2001 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
6aa8b732
AK
2002 break;
2003 case MSR_IA32_SYSENTER_EIP:
609e36d3 2004 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
6aa8b732
AK
2005 break;
2006 case MSR_IA32_SYSENTER_ESP:
609e36d3 2007 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
6aa8b732 2008 break;
0dd376e7 2009 case MSR_IA32_BNDCFGS:
691bd434 2010 if (!kvm_mpx_supported() ||
d6321d49
RK
2011 (!msr_info->host_initiated &&
2012 !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
93c4adc7 2013 return 1;
609e36d3 2014 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
0dd376e7 2015 break;
c45dcc71
AR
2016 case MSR_IA32_MCG_EXT_CTL:
2017 if (!msr_info->host_initiated &&
a6cb099a 2018 !(vmx->msr_ia32_feature_control &
32ad73db 2019 FEAT_CTL_LMCE_ENABLED))
cae50139 2020 return 1;
c45dcc71
AR
2021 msr_info->data = vcpu->arch.mcg_ext_ctl;
2022 break;
32ad73db 2023 case MSR_IA32_FEAT_CTL:
a6cb099a 2024 msr_info->data = vmx->msr_ia32_feature_control;
cae50139 2025 break;
8f102445
SC
2026 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
2027 if (!msr_info->host_initiated &&
2028 !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
2029 return 1;
2030 msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
2031 [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
2032 break;
cae50139
JK
2033 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2034 if (!nested_vmx_allowed(vcpu))
2035 return 1;
31de3d25
VK
2036 if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
2037 &msr_info->data))
2038 return 1;
2039 /*
8d68bad6
VK
2040 * Enlightened VMCS v1 doesn't have certain VMCS fields but
2041 * instead of just ignoring the features, different Hyper-V
2042 * versions are either trying to use them and fail or do some
2043 * sanity checking and refuse to boot. Filter all unsupported
2044 * features out.
31de3d25 2045 */
85ab071a 2046 if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu))
4da77090 2047 nested_evmcs_filter_control_msr(vcpu, msr_info->index,
31de3d25
VK
2048 &msr_info->data);
2049 break;
bf8c55d8 2050 case MSR_IA32_RTIT_CTL:
2ef7619d 2051 if (!vmx_pt_mode_is_host_guest())
bf8c55d8
CP
2052 return 1;
2053 msr_info->data = vmx->pt_desc.guest.ctl;
2054 break;
2055 case MSR_IA32_RTIT_STATUS:
2ef7619d 2056 if (!vmx_pt_mode_is_host_guest())
bf8c55d8
CP
2057 return 1;
2058 msr_info->data = vmx->pt_desc.guest.status;
2059 break;
2060 case MSR_IA32_RTIT_CR3_MATCH:
2ef7619d 2061 if (!vmx_pt_mode_is_host_guest() ||
bf8c55d8
CP
2062 !intel_pt_validate_cap(vmx->pt_desc.caps,
2063 PT_CAP_cr3_filtering))
2064 return 1;
2065 msr_info->data = vmx->pt_desc.guest.cr3_match;
2066 break;
2067 case MSR_IA32_RTIT_OUTPUT_BASE:
2ef7619d 2068 if (!vmx_pt_mode_is_host_guest() ||
bf8c55d8
CP
2069 (!intel_pt_validate_cap(vmx->pt_desc.caps,
2070 PT_CAP_topa_output) &&
2071 !intel_pt_validate_cap(vmx->pt_desc.caps,
2072 PT_CAP_single_range_output)))
2073 return 1;
2074 msr_info->data = vmx->pt_desc.guest.output_base;
2075 break;
2076 case MSR_IA32_RTIT_OUTPUT_MASK:
2ef7619d 2077 if (!vmx_pt_mode_is_host_guest() ||
bf8c55d8
CP
2078 (!intel_pt_validate_cap(vmx->pt_desc.caps,
2079 PT_CAP_topa_output) &&
2080 !intel_pt_validate_cap(vmx->pt_desc.caps,
2081 PT_CAP_single_range_output)))
2082 return 1;
2083 msr_info->data = vmx->pt_desc.guest.output_mask;
2084 break;
2085 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2086 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2ef7619d 2087 if (!vmx_pt_mode_is_host_guest() ||
f4d3a902 2088 (index >= 2 * vmx->pt_desc.num_address_ranges))
bf8c55d8
CP
2089 return 1;
2090 if (index % 2)
2091 msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
2092 else
2093 msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
2094 break;
d855066f
LX
2095 case MSR_IA32_DEBUGCTLMSR:
2096 msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
2097 break;
6aa8b732 2098 default:
eb3db1b1 2099 find_uret_msr:
d85a8034 2100 msr = vmx_find_uret_msr(vmx, msr_info->index);
3bab1f5d 2101 if (msr) {
609e36d3 2102 msr_info->data = msr->data;
3bab1f5d 2103 break;
6aa8b732 2104 }
609e36d3 2105 return kvm_get_msr_common(vcpu, msr_info);
6aa8b732
AK
2106 }
2107
6aa8b732
AK
2108 return 0;
2109}
2110
2408500d
SC
2111static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
2112 u64 data)
2113{
2114#ifdef CONFIG_X86_64
2115 if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
2116 return (u32)data;
2117#endif
2118 return (unsigned long)data;
2119}
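/*
 * Illustrative note, not part of the original file: for a guest without
 * X86_FEATURE_LM, a hypothetical SYSENTER_EIP write of 0x1234567890abcdef is
 * truncated to 0x90abcdef before being reflected into vmcs12, mirroring the
 * 32-bit width of the MSR on such CPUs.
 */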
2120
b333b8eb 2121static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
c6462363 2122{
18e897d2 2123 u64 debugctl = 0;
c6462363 2124
18e897d2 2125 if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
b333b8eb 2126 (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
18e897d2 2127 debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
c6462363 2128
bec46859 2129 if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) &&
b333b8eb 2130 (host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
18e897d2 2131 debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
76ea438b 2132
c6462363
LX
2133 return debugctl;
2134}
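/*
 * Illustrative note, not part of the original file: the returned mask is what
 * the caller treats as supported.  E.g. on a host with
 * X86_FEATURE_BUS_LOCK_DETECT but a guest whose CPUID does not advertise it,
 * a host-initiated access is allowed DEBUGCTLMSR_BUS_LOCK_DETECT while a
 * guest WRMSR setting that bit is rejected by the "invalid" check in
 * vmx_set_msr().
 */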
2135
6aa8b732 2136/*
311497e0 2137 * Writes msr value into the appropriate "register".
6aa8b732
AK
2138 * Returns 0 on success, non-0 otherwise.
2139 * Assumes vcpu_load() was already called.
2140 */
8fe8ab46 2141static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
6aa8b732 2142{
a2fa3e9f 2143 struct vcpu_vmx *vmx = to_vmx(vcpu);
eb3db1b1 2144 struct vmx_uret_msr *msr;
2cc51560 2145 int ret = 0;
8fe8ab46
WA
2146 u32 msr_index = msr_info->index;
2147 u64 data = msr_info->data;
bf8c55d8 2148 u32 index;
2cc51560 2149
6aa8b732 2150 switch (msr_index) {
3bab1f5d 2151 case MSR_EFER:
8fe8ab46 2152 ret = kvm_set_msr_common(vcpu, msr_info);
2cc51560 2153 break;
16175a79 2154#ifdef CONFIG_X86_64
6aa8b732 2155 case MSR_FS_BASE:
2fb92db1 2156 vmx_segment_cache_clear(vmx);
6aa8b732
AK
2157 vmcs_writel(GUEST_FS_BASE, data);
2158 break;
2159 case MSR_GS_BASE:
2fb92db1 2160 vmx_segment_cache_clear(vmx);
6aa8b732
AK
2161 vmcs_writel(GUEST_GS_BASE, data);
2162 break;
44ea2b17 2163 case MSR_KERNEL_GS_BASE:
678e315e 2164 vmx_write_guest_kernel_gs_base(vmx, data);
44ea2b17 2165 break;
ec5be88a
JL
2166 case MSR_IA32_XFD:
2167 ret = kvm_set_msr_common(vcpu, msr_info);
b5274b1b
KT
2168 /*
 2169 		 * Always intercepting WRMSR could incur non-negligible
 2170 		 * overhead given that XFD may be changed frequently on
 2171 		 * guest context switches. Disable write interception
 2172 		 * upon the first write with a non-zero value (indicating
 2173 		 * potential usage of dynamic xfeatures). Also update the
 2174 		 * exception bitmap to trap #NM for proper virtualization
 2175 		 * of guest xfd_err.
2176 */
2177 if (!ret && data) {
2178 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
2179 MSR_TYPE_RW);
2180 vcpu->arch.xfd_no_write_intercept = true;
ec5be88a 2181 vmx_update_exception_bitmap(vcpu);
b5274b1b 2182 }
ec5be88a 2183 break;
6aa8b732
AK
2184#endif
2185 case MSR_IA32_SYSENTER_CS:
de70d279
SC
2186 if (is_guest_mode(vcpu))
2187 get_vmcs12(vcpu)->guest_sysenter_cs = data;
6aa8b732
AK
2188 vmcs_write32(GUEST_SYSENTER_CS, data);
2189 break;
2190 case MSR_IA32_SYSENTER_EIP:
2408500d
SC
2191 if (is_guest_mode(vcpu)) {
2192 data = nested_vmx_truncate_sysenter_addr(vcpu, data);
de70d279 2193 get_vmcs12(vcpu)->guest_sysenter_eip = data;
2408500d 2194 }
f5b42c33 2195 vmcs_writel(GUEST_SYSENTER_EIP, data);
6aa8b732
AK
2196 break;
2197 case MSR_IA32_SYSENTER_ESP:
2408500d
SC
2198 if (is_guest_mode(vcpu)) {
2199 data = nested_vmx_truncate_sysenter_addr(vcpu, data);
de70d279 2200 get_vmcs12(vcpu)->guest_sysenter_esp = data;
2408500d 2201 }
f5b42c33 2202 vmcs_writel(GUEST_SYSENTER_ESP, data);
6aa8b732 2203 break;
d855066f 2204 case MSR_IA32_DEBUGCTLMSR: {
b333b8eb
SC
2205 u64 invalid;
2206
2207 invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
d855066f 2208 if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
e76ae527 2209 kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
d855066f
LX
2210 data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
2211 invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
2212 }
2213
2214 if (invalid)
2215 return 1;
2216
699a1ac2
SC
2217 if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
2218 VM_EXIT_SAVE_DEBUG_CONTROLS)
2219 get_vmcs12(vcpu)->guest_ia32_debugctl = data;
2220
d855066f 2221 vmcs_write64(GUEST_IA32_DEBUGCTL, data);
8e12911b
LX
2222 if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
2223 (data & DEBUGCTLMSR_LBR))
2224 intel_pmu_create_guest_lbr_event(vcpu);
d855066f
LX
2225 return 0;
2226 }
0dd376e7 2227 case MSR_IA32_BNDCFGS:
691bd434 2228 if (!kvm_mpx_supported() ||
d6321d49
RK
2229 (!msr_info->host_initiated &&
2230 !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
93c4adc7 2231 return 1;
fd8cb433 2232 if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
4531662d 2233 (data & MSR_IA32_BNDCFGS_RSVD))
93c4adc7 2234 return 1;
913d6c9b
SC
2235
2236 if (is_guest_mode(vcpu) &&
2237 ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) ||
2238 (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS)))
2239 get_vmcs12(vcpu)->guest_bndcfgs = data;
2240
0dd376e7
LJ
2241 vmcs_write64(GUEST_BNDCFGS, data);
2242 break;
6e3ba4ab
TX
2243 case MSR_IA32_UMWAIT_CONTROL:
2244 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
2245 return 1;
2246
 2247 		/* Bit 1 is reserved and bits 63:32 must be zero. */
2248 if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
2249 return 1;
2250
2251 vmx->msr_ia32_umwait_control = data;
2252 break;
d28b387f
KA
2253 case MSR_IA32_SPEC_CTRL:
2254 if (!msr_info->host_initiated &&
39485ed9 2255 !guest_has_spec_ctrl_msr(vcpu))
d28b387f
KA
2256 return 1;
2257
841c2be0 2258 if (kvm_spec_ctrl_test_value(data))
d28b387f
KA
2259 return 1;
2260
2261 vmx->spec_ctrl = data;
d28b387f
KA
2262 if (!data)
2263 break;
2264
2265 /*
2266 * For non-nested:
2267 * When it's written (to non-zero) for the first time, pass
2268 * it through.
2269 *
2270 * For nested:
2271 * The handling of the MSR bitmap for L2 guests is done in
4d516fe7 2272 * nested_vmx_prepare_msr_bitmap. We should not touch the
d28b387f
KA
2273 * vmcs02.msr_bitmap here since it gets completely overwritten
2274 * in the merging. We update the vmcs01 here for L1 as well
2275 * since it will end up touching the MSR anyway now.
2276 */
476c9bd8 2277 vmx_disable_intercept_for_msr(vcpu,
d28b387f
KA
2278 MSR_IA32_SPEC_CTRL,
2279 MSR_TYPE_RW);
2280 break;
c11f83e0
PB
2281 case MSR_IA32_TSX_CTRL:
2282 if (!msr_info->host_initiated &&
2283 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
2284 return 1;
2285 if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
2286 return 1;
eb3db1b1 2287 goto find_uret_msr;
15d45071
AR
2288 case MSR_IA32_PRED_CMD:
2289 if (!msr_info->host_initiated &&
39485ed9 2290 !guest_has_pred_cmd_msr(vcpu))
15d45071
AR
2291 return 1;
2292
2293 if (data & ~PRED_CMD_IBPB)
2294 return 1;
39485ed9 2295 if (!boot_cpu_has(X86_FEATURE_IBPB))
6441fa61 2296 return 1;
15d45071
AR
2297 if (!data)
2298 break;
2299
2300 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
2301
2302 /*
2303 * For non-nested:
2304 * When it's written (to non-zero) for the first time, pass
2305 * it through.
2306 *
2307 * For nested:
2308 * The handling of the MSR bitmap for L2 guests is done in
4d516fe7 2309 * nested_vmx_prepare_msr_bitmap. We should not touch the
15d45071
AR
2310 * vmcs02.msr_bitmap here since it gets completely overwritten
2311 * in the merging.
2312 */
476c9bd8 2313 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W);
15d45071 2314 break;
468d472f 2315 case MSR_IA32_CR_PAT:
d28f4290
SC
2316 if (!kvm_pat_valid(data))
2317 return 1;
2318
142e4be7
SC
2319 if (is_guest_mode(vcpu) &&
2320 get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
2321 get_vmcs12(vcpu)->guest_ia32_pat = data;
2322
468d472f
SY
2323 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2324 vmcs_write64(GUEST_IA32_PAT, data);
2325 vcpu->arch.pat = data;
2326 break;
2327 }
8fe8ab46 2328 ret = kvm_set_msr_common(vcpu, msr_info);
4e47c7a6 2329 break;
c45dcc71
AR
2330 case MSR_IA32_MCG_EXT_CTL:
2331 if ((!msr_info->host_initiated &&
2332 !(to_vmx(vcpu)->msr_ia32_feature_control &
32ad73db 2333 FEAT_CTL_LMCE_ENABLED)) ||
c45dcc71
AR
2334 (data & ~MCG_EXT_CTL_LMCE_EN))
2335 return 1;
2336 vcpu->arch.mcg_ext_ctl = data;
2337 break;
32ad73db 2338 case MSR_IA32_FEAT_CTL:
2d6cd686 2339 if (!is_vmx_feature_control_msr_valid(vmx, msr_info))
cae50139 2340 return 1;
2d6cd686 2341
3b84080b 2342 vmx->msr_ia32_feature_control = data;
cae50139
JK
2343 if (msr_info->host_initiated && data == 0)
2344 vmx_leave_nested(vcpu);
72add915
SC
2345
2346 /* SGX may be enabled/disabled by guest's firmware */
2347 vmx_write_encls_bitmap(vcpu, NULL);
cae50139 2348 break;
8f102445
SC
2349 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
2350 /*
2351 * On real hardware, the LE hash MSRs are writable before
2352 * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
2353 * at which point SGX related bits in IA32_FEATURE_CONTROL
2354 * become writable.
2355 *
2356 * KVM does not emulate SGX activation for simplicity, so
2357 * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
2358 * is unlocked. This is technically not architectural
2359 * behavior, but it's close enough.
2360 */
2361 if (!msr_info->host_initiated &&
2362 (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
2363 ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
2364 !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
2365 return 1;
2366 vmx->msr_ia32_sgxlepubkeyhash
2367 [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
cae50139
JK
2368 break;
2369 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
62cc6b9d
DM
2370 if (!msr_info->host_initiated)
2371 return 1; /* they are read-only */
2372 if (!nested_vmx_allowed(vcpu))
2373 return 1;
2374 return vmx_set_vmx_msr(vcpu, msr_index, data);
bf8c55d8 2375 case MSR_IA32_RTIT_CTL:
2ef7619d 2376 if (!vmx_pt_mode_is_host_guest() ||
ee85dec2
LK
2377 vmx_rtit_ctl_check(vcpu, data) ||
2378 vmx->nested.vmxon)
bf8c55d8
CP
2379 return 1;
2380 vmcs_write64(GUEST_IA32_RTIT_CTL, data);
2381 vmx->pt_desc.guest.ctl = data;
476c9bd8 2382 pt_update_intercept_for_msr(vcpu);
bf8c55d8
CP
2383 break;
2384 case MSR_IA32_RTIT_STATUS:
e348ac7c
SC
2385 if (!pt_can_write_msr(vmx))
2386 return 1;
2387 if (data & MSR_IA32_RTIT_STATUS_MASK)
bf8c55d8
CP
2388 return 1;
2389 vmx->pt_desc.guest.status = data;
2390 break;
2391 case MSR_IA32_RTIT_CR3_MATCH:
e348ac7c
SC
2392 if (!pt_can_write_msr(vmx))
2393 return 1;
2394 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2395 PT_CAP_cr3_filtering))
bf8c55d8
CP
2396 return 1;
2397 vmx->pt_desc.guest.cr3_match = data;
2398 break;
2399 case MSR_IA32_RTIT_OUTPUT_BASE:
e348ac7c
SC
2400 if (!pt_can_write_msr(vmx))
2401 return 1;
2402 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2403 PT_CAP_topa_output) &&
2404 !intel_pt_validate_cap(vmx->pt_desc.caps,
2405 PT_CAP_single_range_output))
2406 return 1;
1cc6cbc3 2407 if (!pt_output_base_valid(vcpu, data))
bf8c55d8
CP
2408 return 1;
2409 vmx->pt_desc.guest.output_base = data;
2410 break;
2411 case MSR_IA32_RTIT_OUTPUT_MASK:
e348ac7c
SC
2412 if (!pt_can_write_msr(vmx))
2413 return 1;
2414 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2415 PT_CAP_topa_output) &&
2416 !intel_pt_validate_cap(vmx->pt_desc.caps,
2417 PT_CAP_single_range_output))
bf8c55d8
CP
2418 return 1;
2419 vmx->pt_desc.guest.output_mask = data;
2420 break;
2421 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
e348ac7c
SC
2422 if (!pt_can_write_msr(vmx))
2423 return 1;
bf8c55d8 2424 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
f4d3a902 2425 if (index >= 2 * vmx->pt_desc.num_address_ranges)
bf8c55d8 2426 return 1;
fe6ed369 2427 if (is_noncanonical_address(data, vcpu))
bf8c55d8
CP
2428 return 1;
2429 if (index % 2)
2430 vmx->pt_desc.guest.addr_b[index / 2] = data;
2431 else
2432 vmx->pt_desc.guest.addr_a[index / 2] = data;
2433 break;
9c9520ce
PB
2434 case MSR_IA32_PERF_CAPABILITIES:
2435 if (data && !vcpu_to_pmu(vcpu)->version)
2436 return 1;
2437 if (data & PMU_CAP_LBR_FMT) {
2438 if ((data & PMU_CAP_LBR_FMT) !=
bec46859 2439 (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT))
9c9520ce 2440 return 1;
59cc99f6 2441 if (!cpuid_model_is_consistent(vcpu))
9c9520ce
PB
2442 return 1;
2443 }
cf8e55fe
LX
2444 if (data & PERF_CAP_PEBS_FORMAT) {
2445 if ((data & PERF_CAP_PEBS_MASK) !=
bec46859 2446 (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
cf8e55fe
LX
2447 return 1;
2448 if (!guest_cpuid_has(vcpu, X86_FEATURE_DS))
2449 return 1;
2450 if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64))
2451 return 1;
2452 if (!cpuid_model_is_consistent(vcpu))
9c9520ce
PB
2453 return 1;
2454 }
2455 ret = kvm_set_msr_common(vcpu, msr_info);
2456 break;
c11f83e0 2457
6aa8b732 2458 default:
eb3db1b1 2459 find_uret_msr:
d85a8034 2460 msr = vmx_find_uret_msr(vmx, msr_index);
b07a5c53 2461 if (msr)
7bf662bb 2462 ret = vmx_set_guest_uret_msr(vmx, msr, data);
b07a5c53
PB
2463 else
2464 ret = kvm_set_msr_common(vcpu, msr_info);
6aa8b732
AK
2465 }
2466
027bbb88
PG
2467 /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
2468 if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
2469 vmx_update_fb_clear_dis(vcpu, vmx);
2470
2cc51560 2471 return ret;
6aa8b732
AK
2472}
2473
5fdbf976 2474static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
6aa8b732 2475{
f98c1e77
SC
2476 unsigned long guest_owned_bits;
2477
cb3c1e2f
SC
2478 kvm_register_mark_available(vcpu, reg);
2479
5fdbf976
MT
2480 switch (reg) {
2481 case VCPU_REGS_RSP:
2482 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
2483 break;
2484 case VCPU_REGS_RIP:
2485 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
2486 break;
6de4f3ad
AK
2487 case VCPU_EXREG_PDPTR:
2488 if (enable_ept)
2489 ept_save_pdptrs(vcpu);
2490 break;
bd31fe49
SC
2491 case VCPU_EXREG_CR0:
2492 guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
2493
2494 vcpu->arch.cr0 &= ~guest_owned_bits;
2495 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
2496 break;
34059c25 2497 case VCPU_EXREG_CR3:
81ca0e73
SC
2498 /*
 2499 		 * When intercepting CR3 loads, e.g. for shadow paging, KVM's
2500 * CR3 is loaded into hardware, not the guest's CR3.
2501 */
2502 if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING))
34059c25
SC
2503 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
2504 break;
f98c1e77
SC
2505 case VCPU_EXREG_CR4:
2506 guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
2507
2508 vcpu->arch.cr4 &= ~guest_owned_bits;
2509 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
2510 break;
5fdbf976 2511 default:
67369273 2512 KVM_BUG_ON(1, vcpu->kvm);
5fdbf976
MT
2513 break;
2514 }
6aa8b732
AK
2515}
2516
7a57c09b
SC
2517/*
2518 * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
2519 * directly instead of going through cpu_has(), to ensure KVM is trapping
2520 * ENCLS whenever it's supported in hardware. It does not matter whether
2521 * the host OS supports or has enabled SGX.
2522 */
2523static bool cpu_has_sgx(void)
2524{
2525 return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
2526}
2527
9d78d6fb
VK
2528/*
2529 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
2530 * can't be used due to errata where VM Exit may incorrectly clear
2531 * IA32_PERF_GLOBAL_CTRL[34:32]. Work around the errata by using the
2532 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
2533 */
2534static bool cpu_has_perf_global_ctrl_bug(void)
2535{
2536 if (boot_cpu_data.x86 == 0x6) {
2537 switch (boot_cpu_data.x86_model) {
2538 case INTEL_FAM6_NEHALEM_EP: /* AAK155 */
2539 case INTEL_FAM6_NEHALEM: /* AAP115 */
2540 case INTEL_FAM6_WESTMERE: /* AAT100 */
2541 case INTEL_FAM6_WESTMERE_EP: /* BC86,AAY89,BD102 */
2542 case INTEL_FAM6_NEHALEM_EX: /* BA97 */
2543 return true;
2544 default:
2545 break;
2546 }
2547 }
2548
2549 return false;
2550}
2551
d83420c2 2552static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
1c3d14fe
YS
2553{
2554 u32 vmx_msr_low, vmx_msr_high;
2555 u32 ctl = ctl_min | ctl_opt;
2556
2557 rdmsr(msr, vmx_msr_low, vmx_msr_high);
2558
2559 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
2560 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
2561
2562 /* Ensure minimum (required) set of control bits are supported. */
2563 if (ctl_min & ~ctl)
002c7f7c 2564 return -EIO;
1c3d14fe
YS
2565
2566 *result = ctl;
2567 return 0;
2568}
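/*
 * Illustrative note, not part of the original file: the capability MSR's low
 * word holds the allowed-0 settings (bits that must be 1) and the high word
 * the allowed-1 settings (bits that may be 1).  With hypothetical values
 * vmx_msr_low = 0x0000000a, vmx_msr_high = 0x000000ff, ctl_min = 0x01 and
 * ctl_opt = 0x30:
 *
 *	ctl = (0x31 & 0xff) | 0x0a = 0x3b
 *
 * and the function fails only if the mask cleared a ctl_min bit.
 */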
2569
d83420c2 2570static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
1ad4e543
RH
2571{
2572 u64 allowed;
2573
2574 rdmsrl(msr, allowed);
2575
2576 return ctl_opt & allowed;
2577}
2578
d83420c2
SC
2579static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
2580 struct vmx_capability *vmx_cap)
6aa8b732
AK
2581{
2582 u32 vmx_msr_low, vmx_msr_high;
1c3d14fe
YS
2583 u32 _pin_based_exec_control = 0;
2584 u32 _cpu_based_exec_control = 0;
f78e0e2e 2585 u32 _cpu_based_2nd_exec_control = 0;
1ad4e543 2586 u64 _cpu_based_3rd_exec_control = 0;
1c3d14fe
YS
2587 u32 _vmexit_control = 0;
2588 u32 _vmentry_control = 0;
0809d9b0 2589 u64 misc_msr;
f5a81d0e
SC
2590 int i;
2591
2592 /*
2593 * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory.
2594 * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always
2595 * intercepts writes to PAT and EFER, i.e. never enables those controls.
2596 */
2597 struct {
2598 u32 entry_control;
2599 u32 exit_control;
2600 } const vmcs_entry_exit_pairs[] = {
2601 { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL },
2602 { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT },
2603 { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER },
2604 { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS },
2605 { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL },
2606 };
1c3d14fe 2607
1389309c 2608 memset(vmcs_conf, 0, sizeof(*vmcs_conf));
ee087b4d
VK
2609
2610 if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL,
2611 KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL,
2612 MSR_IA32_VMX_PROCBASED_CTLS,
2613 &_cpu_based_exec_control))
002c7f7c 2614 return -EIO;
f78e0e2e 2615 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
ee087b4d
VK
2616 if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL,
2617 KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL,
d56f546d 2618 MSR_IA32_VMX_PROCBASED_CTLS2,
ee087b4d 2619 &_cpu_based_2nd_exec_control))
f78e0e2e
SY
2620 return -EIO;
2621 }
2622#ifndef CONFIG_X86_64
2623 if (!(_cpu_based_2nd_exec_control &
2624 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2625 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
2626#endif
83d4c286
YZ
2627
2628 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2629 _cpu_based_2nd_exec_control &= ~(
8d14695f 2630 SECONDARY_EXEC_APIC_REGISTER_VIRT |
c7c9c56c
YZ
2631 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2632 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
83d4c286 2633
61f1dd90 2634 rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
7caaa711 2635 &vmx_cap->ept, &vmx_cap->vpid);
61f1dd90 2636
64f80ea7
SC
2637 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
2638 vmx_cap->ept) {
61f1dd90
WL
 2639 		pr_warn_once("EPT capabilities should not be reported when the "
 2640 			"'enable EPT' VM-execution control is not supported\n");
3dbec44d
SC
2641
2642 if (error_on_inconsistent_vmcs_config)
2643 return -EIO;
2644
2645 vmx_cap->ept = 0;
61f1dd90
WL
2646 }
2647 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
3dbec44d 2648 vmx_cap->vpid) {
61f1dd90
WL
 2649 		pr_warn_once("VPID capabilities should not be reported when the "
 2650 			"'enable VPID' VM-execution control is not supported\n");
3dbec44d
SC
2651
2652 if (error_on_inconsistent_vmcs_config)
2653 return -EIO;
2654
2655 vmx_cap->vpid = 0;
d56f546d 2656 }
1c3d14fe 2657
1dae2765
VK
2658 if (!cpu_has_sgx())
2659 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING;
2660
ee087b4d
VK
2661 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
2662 _cpu_based_3rd_exec_control =
2663 adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL,
1ad4e543 2664 MSR_IA32_VMX_PROCBASED_CTLS3);
1c3d14fe 2665
ee087b4d
VK
2666 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS,
2667 KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS,
2668 MSR_IA32_VMX_EXIT_CTLS,
2669 &_vmexit_control))
002c7f7c 2670 return -EIO;
1c3d14fe 2671
ee087b4d
VK
2672 if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL,
2673 KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL,
2674 MSR_IA32_VMX_PINBASED_CTLS,
2675 &_pin_based_exec_control))
01e439be
YZ
2676 return -EIO;
2677
1c17c3e6
PB
2678 if (cpu_has_broken_vmx_preemption_timer())
2679 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
01e439be 2680 if (!(_cpu_based_2nd_exec_control &
91fa0f8e 2681 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
01e439be
YZ
2682 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
2683
ee087b4d
VK
2684 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS,
2685 KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS,
2686 MSR_IA32_VMX_ENTRY_CTLS,
2687 &_vmentry_control))
002c7f7c 2688 return -EIO;
6aa8b732 2689
f5a81d0e
SC
2690 for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) {
2691 u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control;
2692 u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control;
2693
2694 if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl))
2695 continue;
2696
2697 pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n",
2698 _vmentry_control & n_ctrl, _vmexit_control & x_ctrl);
2699
3dbec44d
SC
2700 if (error_on_inconsistent_vmcs_config)
2701 return -EIO;
2702
f5a81d0e
SC
2703 _vmentry_control &= ~n_ctrl;
2704 _vmexit_control &= ~x_ctrl;
2705 }
2706
c68876fd 2707 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
1c3d14fe
YS
2708
2709 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
2710 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
002c7f7c 2711 return -EIO;
1c3d14fe
YS
2712
2713#ifdef CONFIG_X86_64
2714 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
2715 if (vmx_msr_high & (1u<<16))
002c7f7c 2716 return -EIO;
1c3d14fe
YS
2717#endif
2718
2719 /* Require Write-Back (WB) memory type for VMCS accesses. */
2720 if (((vmx_msr_high >> 18) & 15) != 6)
002c7f7c 2721 return -EIO;
1c3d14fe 2722
0809d9b0
VK
2723 rdmsrl(MSR_IA32_VMX_MISC, misc_msr);
2724
002c7f7c 2725 vmcs_conf->size = vmx_msr_high & 0x1fff;
9ac7e3e8 2726 vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
773e8a04 2727
2307af1c 2728 vmcs_conf->revision_id = vmx_msr_low;
1c3d14fe 2729
002c7f7c
YS
2730 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
2731 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
f78e0e2e 2732 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
1ad4e543 2733 vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control;
002c7f7c
YS
2734 vmcs_conf->vmexit_ctrl = _vmexit_control;
2735 vmcs_conf->vmentry_ctrl = _vmentry_control;
0809d9b0 2736 vmcs_conf->misc = misc_msr;
1c3d14fe 2737
80edc49f
VK
2738#if IS_ENABLED(CONFIG_HYPERV)
2739 if (enlightened_vmcs)
2740 evmcs_sanitize_exec_ctrls(vmcs_conf);
2741#endif
2742
1c3d14fe 2743 return 0;
c68876fd 2744}
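/*
 * Illustrative note, not part of the original file: the MSR_IA32_VMX_BASIC
 * checks above decode the layout documented in the SDM -- bits 44:32
 * (vmx_msr_high & 0x1fff) give the VMCS region size, bit 48 (bit 16 of the
 * high word) must be zero on 64-bit CPUs, and bits 53:50
 * ((vmx_msr_high >> 18) & 15) encode the VMCS memory type, where 6 means
 * Write-Back.
 */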
6aa8b732 2745
d83420c2 2746static bool kvm_is_vmx_supported(void)
8504ef21 2747{
c82a5c5c
CG
2748 int cpu = raw_smp_processor_id();
2749
8504ef21 2750 if (!cpu_has_vmx()) {
c82a5c5c 2751 pr_err("VMX not supported by CPU %d\n", cpu);
8504ef21
SC
2752 return false;
2753 }
2754
2755 if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
2756 !this_cpu_has(X86_FEATURE_VMX)) {
c82a5c5c 2757 pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu);
8504ef21
SC
2758 return false;
2759 }
2760
2761 return true;
2762}
2763
d83420c2 2764static int vmx_check_processor_compat(void)
8504ef21 2765{
c82a5c5c 2766 int cpu = raw_smp_processor_id();
8504ef21
SC
2767 struct vmcs_config vmcs_conf;
2768 struct vmx_capability vmx_cap;
2769
2770 if (!kvm_is_vmx_supported())
2771 return -EIO;
2772
c82a5c5c
CG
2773 if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
2774 pr_err("Failed to setup VMCS config on CPU %d\n", cpu);
8504ef21 2775 return -EIO;
c82a5c5c 2776 }
8504ef21
SC
2777 if (nested)
2778 nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
c82a5c5c
CG
2779 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) {
2780 pr_err("Inconsistent VMCS config on CPU %d\n", cpu);
8504ef21
SC
2781 return -EIO;
2782 }
2783 return 0;
2784}
2785
2786static int kvm_cpu_vmxon(u64 vmxon_pointer)
2787{
2788 u64 msr;
2789
2790 cr4_set_bits(X86_CR4_VMXE);
2791
2792 asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
2793 _ASM_EXTABLE(1b, %l[fault])
2794 : : [vmxon_pointer] "m"(vmxon_pointer)
2795 : : fault);
2796 return 0;
2797
2798fault:
2799 WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
2800 rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
2801 cr4_clear_bits(X86_CR4_VMXE);
2802
2803 return -EFAULT;
2804}
2805
2806static int vmx_hardware_enable(void)
2807{
2808 int cpu = raw_smp_processor_id();
2809 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
2810 int r;
2811
2812 if (cr4_read_shadow() & X86_CR4_VMXE)
2813 return -EBUSY;
2814
2815 /*
2816 * This can happen if we hot-added a CPU but failed to allocate
2817 * VP assist page for it.
2818 */
2819 if (static_branch_unlikely(&enable_evmcs) &&
2820 !hv_get_vp_assist_page(cpu))
2821 return -EFAULT;
2822
2823 intel_pt_handle_vmx(1);
2824
2825 r = kvm_cpu_vmxon(phys_addr);
2826 if (r) {
2827 intel_pt_handle_vmx(0);
2828 return r;
2829 }
2830
2831 if (enable_ept)
2832 ept_sync_global();
2833
2834 return 0;
2835}
2836
2837static void vmclear_local_loaded_vmcss(void)
2838{
2839 int cpu = raw_smp_processor_id();
2840 struct loaded_vmcs *v, *n;
2841
2842 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
2843 loaded_vmcss_on_cpu_link)
2844 __loaded_vmcs_clear(v);
2845}
2846
2847static void vmx_hardware_disable(void)
2848{
2849 vmclear_local_loaded_vmcss();
2850
2851 if (cpu_vmxoff())
2852 kvm_spurious_fault();
2853
2854 hv_reset_evmcs();
2855
2856 intel_pt_handle_vmx(0);
2857}
2858
41836839 2859struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
6aa8b732
AK
2860{
2861 int node = cpu_to_node(cpu);
2862 struct page *pages;
2863 struct vmcs *vmcs;
2864
519669cc 2865 pages = __alloc_pages_node(node, flags, 0);
6aa8b732
AK
2866 if (!pages)
2867 return NULL;
2868 vmcs = page_address(pages);
1c3d14fe 2869 memset(vmcs, 0, vmcs_config.size);
2307af1c
LA
2870
2871 /* KVM supports Enlightened VMCS v1 only */
2872 if (static_branch_unlikely(&enable_evmcs))
392b2f25 2873 vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
2307af1c 2874 else
392b2f25 2875 vmcs->hdr.revision_id = vmcs_config.revision_id;
2307af1c 2876
491a6038
LA
2877 if (shadow)
2878 vmcs->hdr.shadow_vmcs = 1;
6aa8b732
AK
2879 return vmcs;
2880}
2881
89b0c9f5 2882void free_vmcs(struct vmcs *vmcs)
6aa8b732 2883{
519669cc 2884 free_page((unsigned long)vmcs);
6aa8b732
AK
2885}
2886
d462b819
NHE
2887/*
2888 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
2889 */
89b0c9f5 2890void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
d462b819
NHE
2891{
2892 if (!loaded_vmcs->vmcs)
2893 return;
2894 loaded_vmcs_clear(loaded_vmcs);
2895 free_vmcs(loaded_vmcs->vmcs);
2896 loaded_vmcs->vmcs = NULL;
904e14fb
PB
2897 if (loaded_vmcs->msr_bitmap)
2898 free_page((unsigned long)loaded_vmcs->msr_bitmap);
355f4fb1 2899 WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
d462b819
NHE
2900}
2901
89b0c9f5 2902int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
f21f165e 2903{
491a6038 2904 loaded_vmcs->vmcs = alloc_vmcs(false);
f21f165e
PB
2905 if (!loaded_vmcs->vmcs)
2906 return -ENOMEM;
2907
d260f9ef
SC
2908 vmcs_clear(loaded_vmcs->vmcs);
2909
f21f165e 2910 loaded_vmcs->shadow_vmcs = NULL;
804939ea 2911 loaded_vmcs->hv_timer_soft_disabled = false;
d260f9ef
SC
2912 loaded_vmcs->cpu = -1;
2913 loaded_vmcs->launched = 0;
904e14fb
PB
2914
2915 if (cpu_has_vmx_msr_bitmap()) {
41836839
BG
2916 loaded_vmcs->msr_bitmap = (unsigned long *)
2917 __get_free_page(GFP_KERNEL_ACCOUNT);
904e14fb
PB
2918 if (!loaded_vmcs->msr_bitmap)
2919 goto out_vmcs;
2920 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
2921 }
d7ee039e
SC
2922
2923 memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
3af80fec
SC
2924 memset(&loaded_vmcs->controls_shadow, 0,
2925 sizeof(struct vmcs_controls_shadow));
d7ee039e 2926
f21f165e 2927 return 0;
904e14fb
PB
2928
2929out_vmcs:
2930 free_loaded_vmcs(loaded_vmcs);
2931 return -ENOMEM;
f21f165e
PB
2932}
2933
39959588 2934static void free_kvm_area(void)
6aa8b732
AK
2935{
2936 int cpu;
2937
3230bb47 2938 for_each_possible_cpu(cpu) {
6aa8b732 2939 free_vmcs(per_cpu(vmxarea, cpu));
3230bb47
ZA
2940 per_cpu(vmxarea, cpu) = NULL;
2941 }
6aa8b732
AK
2942}
2943
6aa8b732
AK
2944static __init int alloc_kvm_area(void)
2945{
2946 int cpu;
2947
3230bb47 2948 for_each_possible_cpu(cpu) {
6aa8b732
AK
2949 struct vmcs *vmcs;
2950
41836839 2951 vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
6aa8b732
AK
2952 if (!vmcs) {
2953 free_kvm_area();
2954 return -ENOMEM;
2955 }
2956
2307af1c
LA
2957 /*
2958 * When eVMCS is enabled, alloc_vmcs_cpu() sets
2959 * vmcs->revision_id to KVM_EVMCS_VERSION instead of
2960 * revision_id reported by MSR_IA32_VMX_BASIC.
2961 *
312a4661 2962 * However, even though not explicitly documented by
2307af1c
LA
 2963 		 * the TLFS, the VMXON region passed to VMXON should
 2964 		 * still be marked with the revision_id reported by the
 2965 		 * physical CPU.
2966 */
2967 if (static_branch_unlikely(&enable_evmcs))
392b2f25 2968 vmcs->hdr.revision_id = vmcs_config.revision_id;
2307af1c 2969
6aa8b732
AK
2970 per_cpu(vmxarea, cpu) = vmcs;
2971 }
2972 return 0;
2973}
2974
91b0aa2c 2975static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
d99e4152 2976 struct kvm_segment *save)
6aa8b732 2977{
d99e4152
GN
2978 if (!emulate_invalid_guest_state) {
2979 /*
2980 * CS and SS RPL should be equal during guest entry according
2981 * to VMX spec, but in reality it is not always so. Since vcpu
2982 * is in the middle of the transition from real mode to
2983 * protected mode it is safe to assume that RPL 0 is a good
2984 * default value.
2985 */
2986 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
b32a9918
NA
2987 save->selector &= ~SEGMENT_RPL_MASK;
2988 save->dpl = save->selector & SEGMENT_RPL_MASK;
d99e4152 2989 save->s = 1;
6aa8b732 2990 }
1dd7a4f1 2991 __vmx_set_segment(vcpu, save, seg);
6aa8b732
AK
2992}
2993
2994static void enter_pmode(struct kvm_vcpu *vcpu)
2995{
2996 unsigned long flags;
a89a8fb9 2997 struct vcpu_vmx *vmx = to_vmx(vcpu);
6aa8b732 2998
d99e4152 2999 /*
d9f6e12f 3000 * Update real mode segment cache. It may be not up-to-date if segment
d99e4152
GN
3001 * register was written while vcpu was in a guest mode.
3002 */
3003 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3004 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3005 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3006 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3007 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3008 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3009
7ffd92c5 3010 vmx->rmode.vm86_active = 0;
6aa8b732 3011
1dd7a4f1 3012 __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
6aa8b732
AK
3013
3014 flags = vmcs_readl(GUEST_RFLAGS);
78ac8b47
AK
3015 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
3016 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
6aa8b732
AK
3017 vmcs_writel(GUEST_RFLAGS, flags);
3018
66aee91a
RR
3019 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
3020 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
6aa8b732 3021
b6a7cc35 3022 vmx_update_exception_bitmap(vcpu);
6aa8b732 3023
91b0aa2c
GN
3024 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3025 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3026 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3027 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3028 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3029 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
6aa8b732
AK
3030}
3031
f5f7b2fe 3032static void fix_rmode_seg(int seg, struct kvm_segment *save)
6aa8b732 3033{
772e0318 3034 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
d99e4152
GN
3035 struct kvm_segment var = *save;
3036
3037 var.dpl = 0x3;
3038 if (seg == VCPU_SREG_CS)
3039 var.type = 0x3;
3040
3041 if (!emulate_invalid_guest_state) {
3042 var.selector = var.base >> 4;
3043 var.base = var.base & 0xffff0;
3044 var.limit = 0xffff;
3045 var.g = 0;
3046 var.db = 0;
3047 var.present = 1;
3048 var.s = 1;
3049 var.l = 0;
3050 var.unusable = 0;
3051 var.type = 0x3;
3052 var.avl = 0;
3053 if (save->base & 0xf)
8d20bd63
SC
3054 pr_warn_once("segment base is not paragraph aligned "
3055 "when entering protected mode (seg=%d)", seg);
d99e4152 3056 }
6aa8b732 3057
d99e4152 3058 vmcs_write16(sf->selector, var.selector);
96794e4e 3059 vmcs_writel(sf->base, var.base);
d99e4152
GN
3060 vmcs_write32(sf->limit, var.limit);
3061 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
6aa8b732
AK
3062}
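/*
 * Illustrative note, not part of the original file: the selector/base fixup
 * above applies the real-mode rule "base = selector << 4".  A hypothetical
 * cached base of 0xb8765 becomes selector 0xb876 and base 0xb8760; the low
 * nibble that cannot be represented is what the "not paragraph aligned"
 * warning above complains about.
 */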
3063
3064static void enter_rmode(struct kvm_vcpu *vcpu)
3065{
3066 unsigned long flags;
a89a8fb9 3067 struct vcpu_vmx *vmx = to_vmx(vcpu);
40bbb9d0 3068 struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
6aa8b732 3069
f5f7b2fe
AK
3070 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3071 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3072 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3073 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3074 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
c6ad1153
GN
3075 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3076 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
f5f7b2fe 3077
7ffd92c5 3078 vmx->rmode.vm86_active = 1;
6aa8b732 3079
776e58ea
GN
3080 /*
3081 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
4918c6ca 3082 * vcpu. Warn the user that an update is overdue.
776e58ea 3083 */
40bbb9d0 3084 if (!kvm_vmx->tss_addr)
8d20bd63 3085 pr_warn_once("KVM_SET_TSS_ADDR needs to be called before running vCPU\n");
776e58ea 3086
2fb92db1
AK
3087 vmx_segment_cache_clear(vmx);
3088
40bbb9d0 3089 vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
6aa8b732 3090 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
6aa8b732
AK
3091 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
3092
3093 flags = vmcs_readl(GUEST_RFLAGS);
78ac8b47 3094 vmx->rmode.save_rflags = flags;
6aa8b732 3095
053de044 3096 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
6aa8b732
AK
3097
3098 vmcs_writel(GUEST_RFLAGS, flags);
66aee91a 3099 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
b6a7cc35 3100 vmx_update_exception_bitmap(vcpu);
6aa8b732 3101
d99e4152
GN
3102 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3103 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3104 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3105 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3106 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3107 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
6aa8b732
AK
3108}
3109
72f211ec 3110int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
401d10de
AS
3111{
3112 struct vcpu_vmx *vmx = to_vmx(vcpu);
26bb0981 3113
72f211ec 3114 /* Nothing to do if hardware doesn't support EFER. */
b76edfe9 3115 if (!vmx_find_uret_msr(vmx, MSR_EFER))
72f211ec 3116 return 0;
401d10de 3117
f6801dff 3118 vcpu->arch.efer = efer;
ebb3c8d4 3119#ifdef CONFIG_X86_64
b76edfe9
ZD
3120 if (efer & EFER_LMA)
3121 vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE);
3122 else
3123 vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE);
ebb3c8d4
SC
3124#else
3125 if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm))
3126 return 1;
3127#endif
401d10de 3128
400dd54b 3129 vmx_setup_uret_msrs(vmx);
72f211ec 3130 return 0;
401d10de
AS
3131}
3132
05b3e0c2 3133#ifdef CONFIG_X86_64
6aa8b732
AK
3134
3135static void enter_lmode(struct kvm_vcpu *vcpu)
3136{
3137 u32 guest_tr_ar;
3138
2fb92db1
AK
3139 vmx_segment_cache_clear(to_vmx(vcpu));
3140
6aa8b732 3141 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
4d283ec9 3142 if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
bd80158a
JK
3143 pr_debug_ratelimited("%s: tss fixup for long mode. \n",
3144 __func__);
6aa8b732 3145 vmcs_write32(GUEST_TR_AR_BYTES,
4d283ec9
AL
3146 (guest_tr_ar & ~VMX_AR_TYPE_MASK)
3147 | VMX_AR_TYPE_BUSY_64_TSS);
6aa8b732 3148 }
da38f438 3149 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
6aa8b732
AK
3150}
3151
3152static void exit_lmode(struct kvm_vcpu *vcpu)
3153{
da38f438 3154 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
6aa8b732
AK
3155}
3156
3157#endif
3158
7780938c 3159static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
5058b692
SC
3160{
3161 struct vcpu_vmx *vmx = to_vmx(vcpu);
3162
3163 /*
7780938c
SC
3164 * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
3165 * the CPU is not required to invalidate guest-physical mappings on
3166 * VM-Entry, even if VPID is disabled. Guest-physical mappings are
3167 * associated with the root EPT structure and not any particular VPID
3168 * (INVVPID also isn't required to invalidate guest-physical mappings).
5058b692
SC
3169 */
3170 if (enable_ept) {
3171 ept_sync_global();
3172 } else if (enable_vpid) {
3173 if (cpu_has_vmx_invvpid_global()) {
3174 vpid_sync_vcpu_global();
3175 } else {
3176 vpid_sync_vcpu_single(vmx->vpid);
3177 vpid_sync_vcpu_single(vmx->nested.vpid02);
3178 }
3179 }
3180}
3181
2b4a5a5d
SC
3182static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
3183{
3184 if (is_guest_mode(vcpu))
3185 return nested_get_vpid02(vcpu);
3186 return to_vmx(vcpu)->vpid;
3187}
3188
33d19ec9
SC
3189static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
3190{
2a40b900 3191 struct kvm_mmu *mmu = vcpu->arch.mmu;
b9e5603c 3192 u64 root_hpa = mmu->root.hpa;
33d19ec9
SC
3193
3194 /* No flush required if the current context is invalid. */
3195 if (!VALID_PAGE(root_hpa))
3196 return;
3197
3198 if (enable_ept)
2a40b900 3199 ept_sync_context(construct_eptp(vcpu, root_hpa,
a972e29c 3200 mmu->root_role.level));
33d19ec9 3201 else
2b4a5a5d 3202 vpid_sync_context(vmx_get_current_vpid(vcpu));
33d19ec9
SC
3203}
3204
faff8758
JS
3205static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
3206{
faff8758 3207 /*
2b4a5a5d 3208 * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
ad104b5e 3209 * vmx_flush_tlb_guest() for an explanation of why this is ok.
faff8758 3210 */
2b4a5a5d 3211 vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
faff8758
JS
3212}
3213
e64419d9
SC
3214static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
3215{
3216 /*
2b4a5a5d
SC
3217 * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
3218 * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are
3219 * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
e64419d9
SC
3220 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
3221 * i.e. no explicit INVVPID is necessary.
3222 */
2b4a5a5d 3223 vpid_sync_context(vmx_get_current_vpid(vcpu));
e64419d9
SC
3224}
3225
43fea4e4 3226void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
1439442c 3227{
d0d538b9
GN
3228 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3229
cb3c1e2f 3230 if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
6de4f3ad
AK
3231 return;
3232
bf03d4f9 3233 if (is_pae_paging(vcpu)) {
d0d538b9
GN
3234 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
3235 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
3236 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
3237 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
1439442c
SY
3238 }
3239}
3240
97b7ead3 3241void ept_save_pdptrs(struct kvm_vcpu *vcpu)
8f5d549f 3242{
d0d538b9
GN
3243 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3244
9932b49e
SC
3245 if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
3246 return;
3247
3248 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
3249 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
3250 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
3251 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
6de4f3ad 3252
c0d6956e 3253 kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR);
8f5d549f
AK
3254}
3255
470750b3
SC
3256#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
3257 CPU_BASED_CR3_STORE_EXITING)
3258
97b7ead3 3259void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
6aa8b732 3260{
7ffd92c5 3261 struct vcpu_vmx *vmx = to_vmx(vcpu);
32437c2a 3262 unsigned long hw_cr0, old_cr0_pg;
470750b3 3263 u32 tmp;
3a624e29 3264
32437c2a
SC
3265 old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);
3266
3de6347b 3267 hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
bddd82d1 3268 if (is_unrestricted_guest(vcpu))
5037878e 3269 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
218e763f 3270 else {
5037878e 3271 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
ee5a5584
SC
3272 if (!enable_ept)
3273 hw_cr0 |= X86_CR0_WP;
1439442c 3274
218e763f
GN
3275 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
3276 enter_pmode(vcpu);
6aa8b732 3277
218e763f
GN
3278 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
3279 enter_rmode(vcpu);
3280 }
6aa8b732 3281
32437c2a
SC
3282 vmcs_writel(CR0_READ_SHADOW, cr0);
3283 vmcs_writel(GUEST_CR0, hw_cr0);
3284 vcpu->arch.cr0 = cr0;
3285 kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
3286
05b3e0c2 3287#ifdef CONFIG_X86_64
f6801dff 3288 if (vcpu->arch.efer & EFER_LME) {
32437c2a 3289 if (!old_cr0_pg && (cr0 & X86_CR0_PG))
6aa8b732 3290 enter_lmode(vcpu);
32437c2a 3291 else if (old_cr0_pg && !(cr0 & X86_CR0_PG))
6aa8b732
AK
3292 exit_lmode(vcpu);
3293 }
3294#endif
3295
c834fd7f 3296 if (enable_ept && !is_unrestricted_guest(vcpu)) {
470750b3
SC
3297 /*
3298 * Ensure KVM has an up-to-date snapshot of the guest's CR3. If
3299 * the below code _enables_ CR3 exiting, vmx_cache_reg() will
3300 * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
3301 * KVM's CR3 is installed.
3302 */
c834fd7f
SC
3303 if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
3304 vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
470750b3
SC
3305
3306 /*
3307 * When running with EPT but not unrestricted guest, KVM must
3308 * intercept CR3 accesses when paging is _disabled_. This is
3309 * necessary because restricted guests can't actually run with
3310 * paging disabled, and so KVM stuffs its own CR3 in order to
3311 * run the guest using its identity mapped page tables.
3312 *
3313 * Do _NOT_ check the old CR0.PG, e.g. to optimize away the
3314 * update; it may be stale with respect to CR3 interception,
3315 * e.g. after nested VM-Enter.
3316 *
3317 * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
3318 * stores to forward them to L1, even if KVM does not need to
3319 * intercept them to preserve its identity mapped page tables.
3320 */
c834fd7f 3321 if (!(cr0 & X86_CR0_PG)) {
470750b3
SC
3322 exec_controls_setbit(vmx, CR3_EXITING_BITS);
3323 } else if (!is_guest_mode(vcpu)) {
3324 exec_controls_clearbit(vmx, CR3_EXITING_BITS);
3325 } else {
3326 tmp = exec_controls_get(vmx);
3327 tmp &= ~CR3_EXITING_BITS;
3328 tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
3329 exec_controls_set(vmx, tmp);
3330 }
3331
32437c2a
SC
3332 /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
3333 if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
c834fd7f 3334 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
5b61178c
LJ
3335
3336 /*
3337 * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but
3338 * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
3339 */
3340 if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
3341 kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
c834fd7f 3342 }
1439442c 3343
14168786 3344 /* depends on vcpu->arch.cr0 to be set to a new value */
dbab610a 3345 vmx->emulation_required = vmx_emulation_required(vcpu);
6aa8b732
AK
3346}
3347
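The nested branch in vmx_set_cr0() merges KVM's CR3-exiting requirements with L1's: it clears both CR3 exiting bits and then ORs in whatever vmcs12 requests. A minimal standalone sketch of that merge-under-a-mask pattern, with illustrative bit positions rather than the kernel's VMX definitions:

#include <stdint.h>
#include <stdio.h>

/* Illustrative bit positions; the kernel takes these from its VMX headers. */
#define CR3_LOAD_EXITING   (1u << 15)
#define CR3_STORE_EXITING  (1u << 16)
#define CR3_EXITING_BITS   (CR3_LOAD_EXITING | CR3_STORE_EXITING)

/* Keep every non-CR3 bit of exec_control, take the CR3 bits from l1_wants. */
static uint32_t merge_cr3_exiting(uint32_t exec_control, uint32_t l1_wants)
{
        exec_control &= ~CR3_EXITING_BITS;
        exec_control |= l1_wants & CR3_EXITING_BITS;
        return exec_control;
}

int main(void)
{
        uint32_t ctl = CR3_EXITING_BITS | (1u << 7);    /* currently exiting on both */
        uint32_t l1  = CR3_LOAD_EXITING;                /* L1 only wants load exits */

        printf("merged controls: %#x\n", merge_cr3_exiting(ctl, l1));
        return 0;
}
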
d468d94b 3348static int vmx_get_max_tdp_level(void)
855feb67 3349{
d468d94b 3350 if (cpu_has_vmx_ept_5levels())
855feb67
YZ
3351 return 5;
3352 return 4;
3353}
3354
e83bc09c 3355u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
1439442c 3356{
855feb67
YZ
3357 u64 eptp = VMX_EPTP_MT_WB;
3358
2a40b900 3359 eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
1439442c 3360
995f00a6
PF
3361 if (enable_ept_ad_bits &&
3362 (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
bb97a016 3363 eptp |= VMX_EPTP_AD_ENABLE_BIT;
e83bc09c 3364 eptp |= root_hpa;
1439442c
SY
3365
3366 return eptp;
3367}
3368
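construct_eptp() packs the EPT memory type, the page-walk length (encoded as level minus 1 in bits 5:3), the optional accessed/dirty enable bit, and the page-aligned root HPA into one 64-bit EPTP. A standalone sketch of the same packing; the macro names below only mirror, and are not taken from, the kernel's VMX_EPTP_* definitions:

#include <stdint.h>
#include <stdio.h>

#define EPTP_MT_WB        6ULL                              /* write-back memory type */
#define EPTP_PWL(level)   ((uint64_t)((level) - 1) << 3)    /* page-walk length field */
#define EPTP_AD_ENABLE    (1ULL << 6)                       /* A/D flags enable */

static uint64_t make_eptp(uint64_t root_hpa, int root_level, int enable_ad)
{
        uint64_t eptp = EPTP_MT_WB;

        eptp |= EPTP_PWL(root_level);   /* 4-level walk -> 3, 5-level walk -> 4 */
        if (enable_ad)
                eptp |= EPTP_AD_ENABLE;
        eptp |= root_hpa;               /* root must be page aligned */
        return eptp;
}

int main(void)
{
        printf("eptp = %#llx\n",
               (unsigned long long)make_eptp(0x12345000ULL, 4, 1));
        return 0;
}
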
e83bc09c
SC
3369static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
3370 int root_level)
6aa8b732 3371{
877ad952 3372 struct kvm *kvm = vcpu->kvm;
04f11ef4 3373 bool update_guest_cr3 = true;
1439442c
SY
3374 unsigned long guest_cr3;
3375 u64 eptp;
3376
089d034e 3377 if (enable_ept) {
e83bc09c 3378 eptp = construct_eptp(vcpu, root_hpa, root_level);
1439442c 3379 vmcs_write64(EPT_POINTER, eptp);
877ad952 3380
3c86c0d3 3381 hv_track_root_tdp(vcpu, root_hpa);
877ad952 3382
df7e0681 3383 if (!enable_unrestricted_guest && !is_paging(vcpu))
877ad952 3384 guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
c62c7bd4 3385 else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3))
b17b7436 3386 guest_cr3 = vcpu->arch.cr3;
c62c7bd4 3387 else /* vmcs.GUEST_CR3 is already up-to-date. */
b17b7436 3388 update_guest_cr3 = false;
43fea4e4 3389 vmx_ept_load_pdptrs(vcpu);
be100ef1 3390 } else {
e83bc09c 3391 guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu);
1439442c
SY
3392 }
3393
04f11ef4
SC
3394 if (update_guest_cr3)
3395 vmcs_writel(GUEST_CR3, guest_cr3);
6aa8b732
AK
3396}
3397
405329fc 3398
c2fe3cd4
SC
3399static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3400{
3401 /*
3402 * We operate under the default treatment of SMM, so VMX cannot be
c33f6f22
SC
3403 * enabled under SMM. Note, whether or not VMXE is allowed at all,
3404 * i.e. is a reserved bit, is handled by common x86 code.
c2fe3cd4
SC
3405 */
3406 if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
3407 return false;
3408
3409 if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
3410 return false;
3411
3412 return true;
3413}
3414
3415void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
6aa8b732 3416{
2259c17f 3417 unsigned long old_cr4 = vcpu->arch.cr4;
fe7f895d 3418 struct vcpu_vmx *vmx = to_vmx(vcpu);
085e68ee
BS
3419 /*
3420 * Pass through host's Machine Check Enable value to hw_cr4, which
3421 * is in force while we are in guest mode. Do not let guests control
3422 * this bit, even if host CR4.MCE == 0.
3423 */
5dc1f044
SC
3424 unsigned long hw_cr4;
3425
3426 hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
bddd82d1 3427 if (is_unrestricted_guest(vcpu))
5dc1f044 3428 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
fe7f895d 3429 else if (vmx->rmode.vm86_active)
5dc1f044
SC
3430 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
3431 else
3432 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
1439442c 3433
64f7a115
SC
3434 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
3435 if (cr4 & X86_CR4_UMIP) {
fe7f895d 3436 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
64f7a115
SC
3437 hw_cr4 &= ~X86_CR4_UMIP;
3438 } else if (!is_guest_mode(vcpu) ||
fe7f895d
SC
3439 !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
3440 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
3441 }
64f7a115 3442 }
0367f205 3443
ad312c7c 3444 vcpu->arch.cr4 = cr4;
f98c1e77 3445 kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
5dc1f044 3446
bddd82d1 3447 if (!is_unrestricted_guest(vcpu)) {
5dc1f044
SC
3448 if (enable_ept) {
3449 if (!is_paging(vcpu)) {
3450 hw_cr4 &= ~X86_CR4_PAE;
3451 hw_cr4 |= X86_CR4_PSE;
3452 } else if (!(cr4 & X86_CR4_PAE)) {
3453 hw_cr4 &= ~X86_CR4_PAE;
3454 }
bc23008b 3455 }
1439442c 3456
656ec4a4 3457 /*
ddba2628
HH
3458 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
3459 * hardware. To emulate this behavior, SMEP/SMAP/PKU needs
3460 * to be manually disabled when guest switches to non-paging
3461 * mode.
3462 *
3463 * If !enable_unrestricted_guest, the CPU is always running
3464 * with CR0.PG=1 and CR4 needs to be modified.
3465 * If enable_unrestricted_guest, the CPU automatically
3466 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
656ec4a4 3467 */
5dc1f044
SC
3468 if (!is_paging(vcpu))
3469 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
3470 }
656ec4a4 3471
1439442c
SY
3472 vmcs_writel(CR4_READ_SHADOW, cr4);
3473 vmcs_writel(GUEST_CR4, hw_cr4);
2259c17f
JM
3474
3475 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
3476 kvm_update_cpuid_runtime(vcpu);
6aa8b732
AK
3477}
3478
97b7ead3 3479void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
6aa8b732 3480{
a9179499 3481 struct vcpu_vmx *vmx = to_vmx(vcpu);
6aa8b732
AK
3482 u32 ar;
3483
c6ad1153 3484 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
f5f7b2fe 3485 *var = vmx->rmode.segs[seg];
a9179499 3486 if (seg == VCPU_SREG_TR
2fb92db1 3487 || var->selector == vmx_read_guest_seg_selector(vmx, seg))
f5f7b2fe 3488 return;
1390a28b
AK
3489 var->base = vmx_read_guest_seg_base(vmx, seg);
3490 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3491 return;
a9179499 3492 }
2fb92db1
AK
3493 var->base = vmx_read_guest_seg_base(vmx, seg);
3494 var->limit = vmx_read_guest_seg_limit(vmx, seg);
3495 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3496 ar = vmx_read_guest_seg_ar(vmx, seg);
03617c18 3497 var->unusable = (ar >> 16) & 1;
6aa8b732
AK
3498 var->type = ar & 15;
3499 var->s = (ar >> 4) & 1;
3500 var->dpl = (ar >> 5) & 3;
03617c18
GN
3501 /*
3502 * Some userspaces do not preserve the unusable property. Since a usable
3503 * segment has to be present according to the VMX spec, we can use the
3504 * present property to work around the userspace bug by making an unusable
3505 * segment always nonpresent. vmx_segment_access_rights() already marks a
3506 * nonpresent segment as unusable.
3507 */
3508 var->present = !var->unusable;
6aa8b732
AK
3509 var->avl = (ar >> 12) & 1;
3510 var->l = (ar >> 13) & 1;
3511 var->db = (ar >> 14) & 1;
3512 var->g = (ar >> 15) & 1;
6aa8b732
AK
3513}
3514
a9179499
AK
3515static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
3516{
a9179499
AK
3517 struct kvm_segment s;
3518
3519 if (to_vmx(vcpu)->rmode.vm86_active) {
3520 vmx_get_segment(vcpu, &s, seg);
3521 return s.base;
3522 }
2fb92db1 3523 return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
a9179499
AK
3524}
3525
97b7ead3 3526int vmx_get_cpl(struct kvm_vcpu *vcpu)
2e4d2653 3527{
b09408d0
MT
3528 struct vcpu_vmx *vmx = to_vmx(vcpu);
3529
ae9fedc7 3530 if (unlikely(vmx->rmode.vm86_active))
2e4d2653 3531 return 0;
ae9fedc7
PB
3532 else {
3533 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
4d283ec9 3534 return VMX_AR_DPL(ar);
69c73028 3535 }
69c73028
AK
3536}
3537
653e3108 3538static u32 vmx_segment_access_rights(struct kvm_segment *var)
6aa8b732 3539{
6aa8b732
AK
3540 u32 ar;
3541
f0495f9b 3542 if (var->unusable || !var->present)
6aa8b732
AK
3543 ar = 1 << 16;
3544 else {
3545 ar = var->type & 15;
3546 ar |= (var->s & 1) << 4;
3547 ar |= (var->dpl & 3) << 5;
3548 ar |= (var->present & 1) << 7;
3549 ar |= (var->avl & 1) << 12;
3550 ar |= (var->l & 1) << 13;
3551 ar |= (var->db & 1) << 14;
3552 ar |= (var->g & 1) << 15;
3553 }
653e3108
AK
3554
3555 return ar;
3556}
3557
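vmx_get_segment() and vmx_segment_access_rights() are inverse transforms between a cached segment descriptor and the VMX access-rights encoding (type in bits 3:0, S in bit 4, DPL in bits 6:5, P in bit 7, AVL/L/D.B/G in bits 12-15, and an "unusable" flag in bit 16). A round-trip sketch of that packing, using a simplified struct in place of kvm_segment:

#include <assert.h>
#include <stdint.h>

struct seg {
        unsigned type : 4, s : 1, dpl : 2, present : 1;
        unsigned avl : 1, l : 1, db : 1, g : 1, unusable : 1;
};

static uint32_t pack_ar(const struct seg *v)
{
        if (v->unusable || !v->present)
                return 1u << 16;                 /* "unusable" segment */
        return v->type | (v->s << 4) | (v->dpl << 5) | (v->present << 7) |
               (v->avl << 12) | (v->l << 13) | (v->db << 14) | (v->g << 15);
}

static struct seg unpack_ar(uint32_t ar)
{
        struct seg v = {
                .unusable = (ar >> 16) & 1,
                .type = ar & 15,      .s  = (ar >> 4) & 1, .dpl = (ar >> 5) & 3,
                .avl  = (ar >> 12) & 1, .l = (ar >> 13) & 1,
                .db   = (ar >> 14) & 1, .g = (ar >> 15) & 1,
        };
        v.present = !v.unusable;                 /* mirrors the userspace workaround */
        return v;
}

int main(void)
{
        struct seg cs = { .type = 11, .s = 1, .dpl = 0, .present = 1,
                          .l = 1, .g = 1 };
        struct seg back = unpack_ar(pack_ar(&cs));

        assert(back.type == cs.type && back.dpl == cs.dpl && back.l == cs.l);
        return 0;
}
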
816be9e9 3558void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
653e3108 3559{
7ffd92c5 3560 struct vcpu_vmx *vmx = to_vmx(vcpu);
772e0318 3561 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
653e3108 3562
2fb92db1
AK
3563 vmx_segment_cache_clear(vmx);
3564
1ecd50a9
GN
3565 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3566 vmx->rmode.segs[seg] = *var;
3567 if (seg == VCPU_SREG_TR)
3568 vmcs_write16(sf->selector, var->selector);
3569 else if (var->s)
3570 fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
1dd7a4f1 3571 return;
653e3108 3572 }
1ecd50a9 3573
653e3108
AK
3574 vmcs_writel(sf->base, var->base);
3575 vmcs_write32(sf->limit, var->limit);
3576 vmcs_write16(sf->selector, var->selector);
3a624e29
NK
3577
3578 /*
3579 * Fix the "Accessed" bit in AR field of segment registers for older
3580 * qemu binaries.
3581 * The IA32 architecture specifies that at processor reset the
3582 * "Accessed" bit in the AR field of segment registers is 1, but qemu
0fa06071 3583 * sets it to 0 in its userland code. This causes an invalid guest
3a624e29
NK
3584 * state vmexit when "unrestricted guest" mode is turned on.
3585 * Fix for this setup issue in cpu_reset is being pushed in the qemu
3586 * tree. Newer qemu binaries with that qemu fix would not need this
3587 * kvm hack.
3588 */
bddd82d1 3589 if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
f924d66d 3590 var->type |= 0x1; /* Accessed */
3a624e29 3591
f924d66d 3592 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
1dd7a4f1 3593}
d99e4152 3594
816be9e9 3595static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
1dd7a4f1
SC
3596{
3597 __vmx_set_segment(vcpu, var, seg);
3598
dbab610a 3599 to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
6aa8b732
AK
3600}
3601
6aa8b732
AK
3602static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3603{
2fb92db1 3604 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
6aa8b732
AK
3605
3606 *db = (ar >> 14) & 1;
3607 *l = (ar >> 13) & 1;
3608}
3609
89a27f4d 3610static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 3611{
89a27f4d
GN
3612 dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
3613 dt->address = vmcs_readl(GUEST_IDTR_BASE);
6aa8b732
AK
3614}
3615
89a27f4d 3616static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 3617{
89a27f4d
GN
3618 vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
3619 vmcs_writel(GUEST_IDTR_BASE, dt->address);
6aa8b732
AK
3620}
3621
89a27f4d 3622static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 3623{
89a27f4d
GN
3624 dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
3625 dt->address = vmcs_readl(GUEST_GDTR_BASE);
6aa8b732
AK
3626}
3627
89a27f4d 3628static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 3629{
89a27f4d
GN
3630 vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
3631 vmcs_writel(GUEST_GDTR_BASE, dt->address);
6aa8b732
AK
3632}
3633
648dfaa7
MG
3634static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3635{
3636 struct kvm_segment var;
3637 u32 ar;
3638
3639 vmx_get_segment(vcpu, &var, seg);
07f42f5f 3640 var.dpl = 0x3;
0647f4aa
GN
3641 if (seg == VCPU_SREG_CS)
3642 var.type = 0x3;
648dfaa7
MG
3643 ar = vmx_segment_access_rights(&var);
3644
3645 if (var.base != (var.selector << 4))
3646 return false;
89efbed0 3647 if (var.limit != 0xffff)
648dfaa7 3648 return false;
07f42f5f 3649 if (ar != 0xf3)
648dfaa7
MG
3650 return false;
3651
3652 return true;
3653}
3654
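rmode_segment_valid() checks that an emulated real-mode segment has the shape KVM expects: base equal to selector << 4, a 64 KiB limit, and access rights that collapse to 0xf3 (present, S=1, DPL 3, type 3 after the fixups above). A tiny sketch of that check against plain values; the struct and field names are illustrative:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct rmode_seg {
        uint16_t selector;
        uint32_t base;
        uint32_t limit;
        uint32_t ar;            /* packed VMX access rights */
};

static bool rmode_seg_ok(const struct rmode_seg *s)
{
        return s->base == ((uint32_t)s->selector << 4) &&
               s->limit == 0xffff &&
               s->ar == 0xf3;
}

int main(void)
{
        struct rmode_seg ds = { .selector = 0x1234, .base = 0x12340,
                                .limit = 0xffff, .ar = 0xf3 };

        printf("valid: %d\n", rmode_seg_ok(&ds));
        return 0;
}
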
3655static bool code_segment_valid(struct kvm_vcpu *vcpu)
3656{
3657 struct kvm_segment cs;
3658 unsigned int cs_rpl;
3659
3660 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
b32a9918 3661 cs_rpl = cs.selector & SEGMENT_RPL_MASK;
648dfaa7 3662
1872a3f4
AK
3663 if (cs.unusable)
3664 return false;
4d283ec9 3665 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
648dfaa7
MG
3666 return false;
3667 if (!cs.s)
3668 return false;
4d283ec9 3669 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
648dfaa7
MG
3670 if (cs.dpl > cs_rpl)
3671 return false;
1872a3f4 3672 } else {
648dfaa7
MG
3673 if (cs.dpl != cs_rpl)
3674 return false;
3675 }
3676 if (!cs.present)
3677 return false;
3678
3679 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
3680 return true;
3681}
3682
3683static bool stack_segment_valid(struct kvm_vcpu *vcpu)
3684{
3685 struct kvm_segment ss;
3686 unsigned int ss_rpl;
3687
3688 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
b32a9918 3689 ss_rpl = ss.selector & SEGMENT_RPL_MASK;
648dfaa7 3690
1872a3f4
AK
3691 if (ss.unusable)
3692 return true;
3693 if (ss.type != 3 && ss.type != 7)
648dfaa7
MG
3694 return false;
3695 if (!ss.s)
3696 return false;
3697 if (ss.dpl != ss_rpl) /* DPL != RPL */
3698 return false;
3699 if (!ss.present)
3700 return false;
3701
3702 return true;
3703}
3704
3705static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
3706{
3707 struct kvm_segment var;
3708 unsigned int rpl;
3709
3710 vmx_get_segment(vcpu, &var, seg);
b32a9918 3711 rpl = var.selector & SEGMENT_RPL_MASK;
648dfaa7 3712
1872a3f4
AK
3713 if (var.unusable)
3714 return true;
648dfaa7
MG
3715 if (!var.s)
3716 return false;
3717 if (!var.present)
3718 return false;
4d283ec9 3719 if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
648dfaa7
MG
3720 if (var.dpl < rpl) /* DPL < RPL */
3721 return false;
3722 }
3723
3724 /* TODO: Add other members to kvm_segment_field to allow checking for other access
3725 * rights flags
3726 */
3727 return true;
3728}
3729
3730static bool tr_valid(struct kvm_vcpu *vcpu)
3731{
3732 struct kvm_segment tr;
3733
3734 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
3735
1872a3f4
AK
3736 if (tr.unusable)
3737 return false;
b32a9918 3738 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */
648dfaa7 3739 return false;
1872a3f4 3740 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
648dfaa7
MG
3741 return false;
3742 if (!tr.present)
3743 return false;
3744
3745 return true;
3746}
3747
3748static bool ldtr_valid(struct kvm_vcpu *vcpu)
3749{
3750 struct kvm_segment ldtr;
3751
3752 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
3753
1872a3f4
AK
3754 if (ldtr.unusable)
3755 return true;
b32a9918 3756 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */
648dfaa7
MG
3757 return false;
3758 if (ldtr.type != 2)
3759 return false;
3760 if (!ldtr.present)
3761 return false;
3762
3763 return true;
3764}
3765
3766static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
3767{
3768 struct kvm_segment cs, ss;
3769
3770 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3771 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3772
b32a9918
NA
3773 return ((cs.selector & SEGMENT_RPL_MASK) ==
3774 (ss.selector & SEGMENT_RPL_MASK));
648dfaa7
MG
3775}
3776
3777/*
3778 * Check if guest state is valid. Returns true if valid, false if
3779 * not.
3780 * We assume that registers are always usable
3781 */
2ba4493a 3782bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
648dfaa7
MG
3783{
3784 /* real mode guest state checks */
f13882d8 3785 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
648dfaa7
MG
3786 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
3787 return false;
3788 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
3789 return false;
3790 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
3791 return false;
3792 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
3793 return false;
3794 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
3795 return false;
3796 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
3797 return false;
3798 } else {
3799 /* protected mode guest state checks */
3800 if (!cs_ss_rpl_check(vcpu))
3801 return false;
3802 if (!code_segment_valid(vcpu))
3803 return false;
3804 if (!stack_segment_valid(vcpu))
3805 return false;
3806 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
3807 return false;
3808 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
3809 return false;
3810 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
3811 return false;
3812 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
3813 return false;
3814 if (!tr_valid(vcpu))
3815 return false;
3816 if (!ldtr_valid(vcpu))
3817 return false;
3818 }
3819 /* TODO:
3820 * - Add checks on RIP
3821 * - Add checks on RFLAGS
3822 */
3823
3824 return true;
3825}
3826
ff5a983c 3827static int init_rmode_tss(struct kvm *kvm, void __user *ua)
6aa8b732 3828{
ff5a983c
PX
3829 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3830 u16 data;
3831 int i;
3832
3833 for (i = 0; i < 3; i++) {
3834 if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE))
3835 return -EFAULT;
3836 }
6aa8b732 3837
195aefde 3838 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
ff5a983c
PX
3839 if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16)))
3840 return -EFAULT;
3841
195aefde 3842 data = ~0;
ff5a983c
PX
3843 if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8)))
3844 return -EFAULT;
3845
3846 return 0;
6aa8b732
AK
3847}
3848
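init_rmode_tss() lays out a minimal real-mode TSS: zeroed pages, an I/O map base field pointing just past the TSS plus interrupt-redirection area, and a final 0xff byte terminating the I/O bitmap. A userspace sketch of the same layout into a local buffer; the sizes used (104-byte base TSS, 32-byte redirection map, 8 KiB I/O bitmap, I/O map base field at offset 0x66) follow the architectural 32-bit TSS format and only approximate the kernel's TSS_* constants:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define TSS_IOPB_BASE_OFFSET   0x66              /* I/O map base field in the TSS */
#define TSS_BASE_SIZE          0x68              /* 104-byte 32-bit TSS */
#define TSS_REDIRECTION_SIZE   (256 / 8)         /* interrupt redirection bitmap */
#define RMODE_TSS_SIZE         (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + 65536 / 8 + 1)

static void build_rmode_tss(uint8_t *tss)
{
        uint16_t iopb_base = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;

        memset(tss, 0, RMODE_TSS_SIZE);
        memcpy(tss + TSS_IOPB_BASE_OFFSET, &iopb_base, sizeof(iopb_base));
        tss[RMODE_TSS_SIZE - 1] = 0xff;          /* terminate the I/O bitmap */
}

int main(void)
{
        static uint8_t tss[RMODE_TSS_SIZE];

        build_rmode_tss(tss);
        printf("iopb base = %#x, last byte = %#x\n",
               tss[TSS_IOPB_BASE_OFFSET] | (tss[TSS_IOPB_BASE_OFFSET + 1] << 8),
               tss[RMODE_TSS_SIZE - 1]);
        return 0;
}
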
b7ebfb05
SY
3849static int init_rmode_identity_map(struct kvm *kvm)
3850{
40bbb9d0 3851 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
2a5755bb 3852 int i, r = 0;
ff5a983c 3853 void __user *uaddr;
b7ebfb05
SY
3854 u32 tmp;
3855
40bbb9d0 3856 /* Protect kvm_vmx->ept_identity_pagetable_done. */
a255d479
TC
3857 mutex_lock(&kvm->slots_lock);
3858
40bbb9d0 3859 if (likely(kvm_vmx->ept_identity_pagetable_done))
2a5755bb 3860 goto out;
a255d479 3861
40bbb9d0
SC
3862 if (!kvm_vmx->ept_identity_map_addr)
3863 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
a255d479 3864
ff5a983c
PX
3865 uaddr = __x86_set_memory_region(kvm,
3866 IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
3867 kvm_vmx->ept_identity_map_addr,
3868 PAGE_SIZE);
3869 if (IS_ERR(uaddr)) {
3870 r = PTR_ERR(uaddr);
2a5755bb 3871 goto out;
ff5a983c 3872 }
a255d479 3873
b7ebfb05 3874 /* Set up identity-mapping pagetable for EPT in real mode */
1ae20e0b 3875 for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) {
b7ebfb05
SY
3876 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
3877 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
ff5a983c
PX
3878 if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) {
3879 r = -EFAULT;
b7ebfb05 3880 goto out;
ff5a983c 3881 }
b7ebfb05 3882 }
40bbb9d0 3883 kvm_vmx->ept_identity_pagetable_done = true;
f51770ed 3884
b7ebfb05 3885out:
a255d479 3886 mutex_unlock(&kvm->slots_lock);
f51770ed 3887 return r;
b7ebfb05
SY
3888}
3889
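The loop above fills one page with 4 MiB PSE entries that identity-map the low 4 GiB: entry i maps physical address i << 22 with the present/write/user/accessed/dirty/PSE bits set. A standalone sketch of the same table construction using the standard x86 PTE bit values:

#include <stdint.h>
#include <stdio.h>

#define PTE_PRESENT  (1u << 0)
#define PTE_RW       (1u << 1)
#define PTE_USER     (1u << 2)
#define PTE_ACCESSED (1u << 5)
#define PTE_DIRTY    (1u << 6)
#define PTE_PSE      (1u << 7)          /* 4 MiB page */

#define IDENTITY_FLAGS (PTE_PRESENT | PTE_RW | PTE_USER | \
                        PTE_ACCESSED | PTE_DIRTY | PTE_PSE)

int main(void)
{
        static uint32_t pgtable[1024];  /* one page of 32-bit PSE entries */
        unsigned int i;

        for (i = 0; i < 1024; i++)
                pgtable[i] = (i << 22) | IDENTITY_FLAGS;

        /* Entry 1 should map 4 MiB..8 MiB onto itself. */
        printf("pgtable[1] = %#x\n", pgtable[1]);
        return 0;
}
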
6aa8b732
AK
3890static void seg_setup(int seg)
3891{
772e0318 3892 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3a624e29 3893 unsigned int ar;
6aa8b732
AK
3894
3895 vmcs_write16(sf->selector, 0);
3896 vmcs_writel(sf->base, 0);
3897 vmcs_write32(sf->limit, 0xffff);
d54d07b2
GN
3898 ar = 0x93;
3899 if (seg == VCPU_SREG_CS)
3900 ar |= 0x08; /* code segment */
3a624e29
NK
3901
3902 vmcs_write32(sf->ar_bytes, ar);
6aa8b732
AK
3903}
3904
97b7ead3 3905int allocate_vpid(void)
2384d2b3
SY
3906{
3907 int vpid;
3908
919818ab 3909 if (!enable_vpid)
991e7a0e 3910 return 0;
2384d2b3
SY
3911 spin_lock(&vmx_vpid_lock);
3912 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
991e7a0e 3913 if (vpid < VMX_NR_VPIDS)
2384d2b3 3914 __set_bit(vpid, vmx_vpid_bitmap);
991e7a0e
WL
3915 else
3916 vpid = 0;
2384d2b3 3917 spin_unlock(&vmx_vpid_lock);
991e7a0e 3918 return vpid;
2384d2b3
SY
3919}
3920
97b7ead3 3921void free_vpid(int vpid)
cdbecfc3 3922{
991e7a0e 3923 if (!enable_vpid || vpid == 0)
cdbecfc3
LJ
3924 return;
3925 spin_lock(&vmx_vpid_lock);
991e7a0e 3926 __clear_bit(vpid, vmx_vpid_bitmap);
cdbecfc3
LJ
3927 spin_unlock(&vmx_vpid_lock);
3928}
3929
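allocate_vpid()/free_vpid() implement a simple bitmap allocator under a spinlock: find the first clear bit, set it, and fall back to VPID 0 (meaning "no VPID") when the space is exhausted. A single-threaded sketch of the same policy over a deliberately small bitmap; the helpers are illustrative, not the kernel bitops API:

#include <stdint.h>
#include <stdio.h>

#define NR_VPIDS 64                     /* illustratively small; hardware allows 65536 */

static uint64_t vpid_bitmap = 1;        /* bit 0 reserved: vpid 0 means "no vpid" */

static int alloc_vpid(void)
{
        int vpid;

        for (vpid = 1; vpid < NR_VPIDS; vpid++) {
                if (!(vpid_bitmap & (1ULL << vpid))) {
                        vpid_bitmap |= 1ULL << vpid;
                        return vpid;
                }
        }
        return 0;                       /* exhausted: fall back to "no vpid" */
}

static void free_vpid(int vpid)
{
        if (vpid)
                vpid_bitmap &= ~(1ULL << vpid);
}

int main(void)
{
        int a = alloc_vpid(), b = alloc_vpid();

        printf("a=%d b=%d\n", a, b);            /* 1 and 2 */
        free_vpid(a);
        printf("re-alloc=%d\n", alloc_vpid());  /* 1 again */
        return 0;
}
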
b84155c3
VK
3930static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
3931{
3932 /*
3933 * When KVM is a nested hypervisor on top of Hyper-V and uses
3934 * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
3935 * bitmap has changed.
3936 */
3937 if (static_branch_unlikely(&enable_evmcs))
3938 evmcs_touch_msr_bitmap();
ed2a4800
VK
3939
3940 vmx->nested.force_msr_bitmap_recalc = true;
b84155c3
VK
3941}
3942
e23f6d49 3943void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
25c5f225 3944{
476c9bd8
AL
3945 struct vcpu_vmx *vmx = to_vmx(vcpu);
3946 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
25c5f225
SY
3947
3948 if (!cpu_has_vmx_msr_bitmap())
3949 return;
3950
b84155c3 3951 vmx_msr_bitmap_l01_changed(vmx);
ceef7d10 3952
25c5f225 3953 /*
3eb90017
AG
3954 * Mark the desired intercept state in the shadow bitmap; this is needed
3955 * for resync when the MSR filters change.
3956 */
3957 if (is_valid_passthrough_msr(msr)) {
3958 int idx = possible_passthrough_msr_slot(msr);
3959
3960 if (idx != -ENOENT) {
3961 if (type & MSR_TYPE_R)
3962 clear_bit(idx, vmx->shadow_msr_intercept.read);
3963 if (type & MSR_TYPE_W)
3964 clear_bit(idx, vmx->shadow_msr_intercept.write);
3965 }
3966 }
8d14695f 3967
3eb90017
AG
3968 if ((type & MSR_TYPE_R) &&
3969 !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
3970 vmx_set_msr_bitmap_read(msr_bitmap, msr);
3971 type &= ~MSR_TYPE_R;
3972 }
8d14695f 3973
3eb90017
AG
3974 if ((type & MSR_TYPE_W) &&
3975 !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) {
3976 vmx_set_msr_bitmap_write(msr_bitmap, msr);
3977 type &= ~MSR_TYPE_W;
3978 }
8d14695f 3979
3eb90017
AG
3980 if (type & MSR_TYPE_R)
3981 vmx_clear_msr_bitmap_read(msr_bitmap, msr);
8d14695f 3982
3eb90017
AG
3983 if (type & MSR_TYPE_W)
3984 vmx_clear_msr_bitmap_write(msr_bitmap, msr);
8d14695f
YZ
3985}
3986
e23f6d49 3987void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
904e14fb 3988{
476c9bd8
AL
3989 struct vcpu_vmx *vmx = to_vmx(vcpu);
3990 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
904e14fb
PB
3991
3992 if (!cpu_has_vmx_msr_bitmap())
3993 return;
3994
b84155c3 3995 vmx_msr_bitmap_l01_changed(vmx);
ceef7d10 3996
904e14fb 3997 /*
3eb90017
AG
3998 * Mark the desired intercept state in the shadow bitmap; this is needed
3999 * for resync when the MSR filter changes.
4000 */
4001 if (is_valid_passthrough_msr(msr)) {
4002 int idx = possible_passthrough_msr_slot(msr);
4003
4004 if (idx != -ENOENT) {
4005 if (type & MSR_TYPE_R)
4006 set_bit(idx, vmx->shadow_msr_intercept.read);
4007 if (type & MSR_TYPE_W)
4008 set_bit(idx, vmx->shadow_msr_intercept.write);
4009 }
4010 }
904e14fb 4011
3eb90017
AG
4012 if (type & MSR_TYPE_R)
4013 vmx_set_msr_bitmap_read(msr_bitmap, msr);
904e14fb 4014
3eb90017
AG
4015 if (type & MSR_TYPE_W)
4016 vmx_set_msr_bitmap_write(msr_bitmap, msr);
904e14fb
PB
4017}
4018
9389b9d5 4019static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode)
8d14695f 4020{
9389b9d5
SC
4021 unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
4022 unsigned long read_intercept;
904e14fb
PB
4023 int msr;
4024
9389b9d5
SC
4025 read_intercept = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
4026
904e14fb 4027 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
9389b9d5
SC
4028 unsigned int read_idx = msr / BITS_PER_LONG;
4029 unsigned int write_idx = read_idx + (0x800 / sizeof(long));
3eb90017 4030
9389b9d5
SC
4031 msr_bitmap[read_idx] = read_intercept;
4032 msr_bitmap[write_idx] = ~0ul;
904e14fb 4033 }
9389b9d5 4034}
904e14fb 4035
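vmx_reset_x2apic_msrs() relies on the VMX MSR-bitmap layout: a 4 KiB page whose first 2 KiB hold the read bitmaps and whose second 2 KiB hold the write bitmaps, each half split between the low and high MSR ranges, so the write word for an x2APIC MSR sits 0x800 bytes after its read word. A sketch of the same word-index arithmetic, assuming 64-bit longs:

#include <stdint.h>
#include <stdio.h>

#define BITS_PER_LONG     64
#define MSR_BITMAP_LONGS  (4096 / sizeof(uint64_t))

static void reset_x2apic_msrs(uint64_t *msr_bitmap, int apicv)
{
        uint64_t read_intercept = apicv ? 0 : ~0ULL;
        uint32_t msr;

        for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
                unsigned int read_idx  = msr / BITS_PER_LONG;
                unsigned int write_idx = read_idx + 0x800 / sizeof(uint64_t);

                msr_bitmap[read_idx]  = read_intercept;  /* pass reads through iff APICv */
                msr_bitmap[write_idx] = ~0ULL;           /* start with writes intercepted */
        }
}

int main(void)
{
        static uint64_t bitmap[MSR_BITMAP_LONGS];
        unsigned int read_idx = 0x800 / BITS_PER_LONG;

        reset_x2apic_msrs(bitmap, 1);
        printf("read word %u = %#llx, write word %u = %#llx\n",
               read_idx, (unsigned long long)bitmap[read_idx],
               read_idx + 256, (unsigned long long)bitmap[read_idx + 256]);
        return 0;
}
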
84ec8d2d 4036static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
9389b9d5 4037{
84ec8d2d
SC
4038 struct vcpu_vmx *vmx = to_vmx(vcpu);
4039 u8 mode;
4040
9389b9d5
SC
4041 if (!cpu_has_vmx_msr_bitmap())
4042 return;
4043
84ec8d2d
SC
4044 if (cpu_has_secondary_exec_ctrls() &&
4045 (secondary_exec_controls_get(vmx) &
4046 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
4047 mode = MSR_BITMAP_MODE_X2APIC;
4048 if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
4049 mode |= MSR_BITMAP_MODE_X2APIC_APICV;
4050 } else {
4051 mode = 0;
4052 }
4053
4054 if (mode == vmx->x2apic_msr_bitmap_mode)
4055 return;
4056
4057 vmx->x2apic_msr_bitmap_mode = mode;
4058
9389b9d5
SC
4059 vmx_reset_x2apic_msrs(vcpu, mode);
4060
4061 /*
4062 * TPR reads and writes can be virtualized even if virtual interrupt
4063 * delivery is not in use.
4064 */
4065 vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW,
4066 !(mode & MSR_BITMAP_MODE_X2APIC));
4067
4068 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
4069 vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
4070 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
4071 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
d588bb9b
CG
4072 if (enable_ipiv)
4073 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW);
f6e90f9e 4074 }
5897297b
AK
4075}
4076
476c9bd8 4077void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
b08c2896 4078{
476c9bd8 4079 struct vcpu_vmx *vmx = to_vmx(vcpu);
b08c2896
CP
4080 bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
4081 u32 i;
4082
476c9bd8
AL
4083 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag);
4084 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
4085 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
4086 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
f4d3a902 4087 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) {
476c9bd8
AL
4088 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
4089 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
b08c2896
CP
4090 }
4091}
4092
e6c67d8c
LA
4093static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
4094{
4095 struct vcpu_vmx *vmx = to_vmx(vcpu);
4096 void *vapic_page;
4097 u32 vppr;
4098 int rvi;
4099
4100 if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
4101 !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
96c66e87 4102 WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
e6c67d8c
LA
4103 return false;
4104
7e712684 4105 rvi = vmx_get_rvi();
e6c67d8c 4106
96c66e87 4107 vapic_page = vmx->nested.virtual_apic_map.hva;
e6c67d8c 4108 vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
e6c67d8c
LA
4109
4110 return ((rvi & 0xf0) > (vppr & 0xf0));
4111}
4112
3eb90017
AG
4113static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
4114{
4115 struct vcpu_vmx *vmx = to_vmx(vcpu);
4116 u32 i;
4117
4118 /*
d895f28e
SC
4119 * Redo intercept permissions for MSRs that KVM is passing through to
4120 * the guest. Disabling interception will check the new MSR filter and
4121 * ensure that KVM enables interception if userspace wants to filter
4122 * the MSR. MSRs that KVM is already intercepting don't need to be
4123 * refreshed since KVM is going to intercept them regardless of what
4124 * userspace wants.
3eb90017
AG
4125 */
4126 for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
4127 u32 msr = vmx_possible_passthrough_msrs[i];
3eb90017 4128
d895f28e
SC
4129 if (!test_bit(i, vmx->shadow_msr_intercept.read))
4130 vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R);
4131
4132 if (!test_bit(i, vmx->shadow_msr_intercept.write))
4133 vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W);
3eb90017
AG
4134 }
4135
b184b35d
SC
4136 /* PT MSRs can be passed through iff PT is exposed to the guest. */
4137 if (vmx_pt_mode_is_host_guest())
4138 pt_update_intercept_for_msr(vcpu);
3eb90017
AG
4139}
4140
ccf8d687 4141static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
296aa266 4142 int pi_vec)
21bc8dc5
RK
4143{
4144#ifdef CONFIG_SMP
4145 if (vcpu->mode == IN_GUEST_MODE) {
28b835d6 4146 /*
9b44423b
WL
4147 * The vector of the virtual interrupt has already been set in the PIR.
4148 * Send a notification event to deliver the virtual interrupt
4149 * unless the vCPU is the currently running vCPU, i.e. the
4150 * event is being sent from a fastpath VM-Exit handler, in
4151 * which case the PIR will be synced to the vIRR before
4152 * re-entering the guest.
5753743f 4153 *
9b44423b
WL
4154 * When the target is not the running vCPU, the following
4155 * possibilities emerge:
5753743f 4156 *
9b44423b
WL
4157 * Case 1: vCPU stays in non-root mode. Sending a notification
4158 * event posts the interrupt to the vCPU.
5753743f 4159 *
9b44423b
WL
4160 * Case 2: vCPU exits to root mode and is still runnable. The
4161 * PIR will be synced to the vIRR before re-entering the guest.
4162 * Sending a notification event is ok as the host IRQ handler
4163 * will ignore the spurious event.
28b835d6 4164 *
9b44423b
WL
4165 * Case 3: vCPU exits to root mode and is blocked. vcpu_block()
4166 * has already synced PIR to vIRR and never blocks the vCPU if
4167 * the vIRR is not empty. Therefore, a blocked vCPU here does
4168 * not wait for any requested interrupts in PIR, and sending a
4169 * notification event also results in a benign, spurious event.
28b835d6 4170 */
28b835d6 4171
9b44423b
WL
4172 if (vcpu != kvm_get_running_vcpu())
4173 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
ccf8d687 4174 return;
21bc8dc5
RK
4175 }
4176#endif
ccf8d687
SC
4177 /*
4178 * The vCPU isn't in the guest; wake the vCPU in case it is blocking,
4179 * otherwise do nothing as KVM will grab the highest priority pending
4180 * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
4181 */
4182 kvm_vcpu_wake_up(vcpu);
21bc8dc5
RK
4183}
4184
705699a1
WV
4185static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
4186 int vector)
4187{
4188 struct vcpu_vmx *vmx = to_vmx(vcpu);
4189
4190 if (is_guest_mode(vcpu) &&
4191 vector == vmx->nested.posted_intr_nv) {
705699a1
WV
4192 /*
4193 * If a posted intr is not recognized by hardware,
4194 * we will accomplish it in the next vmentry.
4195 */
4196 vmx->nested.pi_pending = true;
4197 kvm_make_request(KVM_REQ_EVENT, vcpu);
83c98007
SC
4198
4199 /*
4200 * This pairs with the smp_mb_*() after setting vcpu->mode in
4201 * vcpu_enter_guest() to guarantee the vCPU sees the event
4202 * request if triggering a posted interrupt "fails" because
4203 * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as
4204 * the smb_wmb() in kvm_make_request() only ensures everything
4205 * done before making the request is visible when the request
4206 * is visible, it doesn't ensure ordering between the store to
4207 * vcpu->requests and the load from vcpu->mode.
4208 */
4209 smp_mb__after_atomic();
4210
6b697711 4211 /* the PIR and ON have been set by L1. */
ccf8d687 4212 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
705699a1
WV
4213 return 0;
4214 }
4215 return -1;
4216}
a20ed54d
YZ
4217/*
4218 * Send an interrupt to the vcpu via posted interrupt.
4219 * 1. If the target vcpu is running (non-root mode), send a posted interrupt
4220 * notification and hardware will sync the PIR to the vIRR atomically.
4221 * 2. If the target vcpu isn't running (root mode), kick it to pick up the
4222 * interrupt from the PIR on the next vmentry.
4223 */
91a5f413 4224static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
a20ed54d
YZ
4225{
4226 struct vcpu_vmx *vmx = to_vmx(vcpu);
4227 int r;
4228
705699a1
WV
4229 r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
4230 if (!r)
91a5f413
VK
4231 return 0;
4232
ce0a58f4
SC
4233 /* Note, this is called iff the local APIC is in-kernel. */
4234 if (!vcpu->arch.apic->apicv_active)
91a5f413 4235 return -1;
705699a1 4236
a20ed54d 4237 if (pi_test_and_set_pir(vector, &vmx->pi_desc))
91a5f413 4238 return 0;
a20ed54d 4239
b95234c8
PB
4240 /* If a previous notification has sent the IPI, nothing to do. */
4241 if (pi_test_and_set_on(&vmx->pi_desc))
91a5f413 4242 return 0;
b95234c8 4243
83c98007
SC
4244 /*
4245 * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*()
4246 * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is
4247 * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
4248 * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
4249 */
ccf8d687 4250 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
91a5f413 4251 return 0;
a20ed54d
YZ
4252}
4253
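The delivery path above depends on two atomic updates to the posted-interrupt descriptor: set the vector's bit in the PIR, then set the outstanding-notification (ON) bit, and send the notification IPI only when ON transitions from 0 to 1. A userspace sketch of that ordering using GCC/Clang atomic builtins; the struct is a simplification of the real 64-byte descriptor:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct pi_desc {
        uint64_t pir[4];        /* 256-bit posted-interrupt request bitmap */
        uint32_t control;       /* bit 0: ON (outstanding notification) */
};

static bool test_and_set_bit64(uint64_t *word, unsigned int bit)
{
        uint64_t old = __atomic_fetch_or(word, 1ULL << bit, __ATOMIC_SEQ_CST);

        return old & (1ULL << bit);
}

/* Returns true if a notification IPI should be sent for this vector. */
static bool post_interrupt(struct pi_desc *pi, uint8_t vector)
{
        /* If the vector was already pending, the earlier poster notified. */
        if (test_and_set_bit64(&pi->pir[vector / 64], vector % 64))
                return false;

        /* Only the poster that flips ON from 0 to 1 sends the notification IPI. */
        return !(__atomic_fetch_or(&pi->control, 1u, __ATOMIC_SEQ_CST) & 1u);
}

int main(void)
{
        struct pi_desc pi = { { 0 }, 0 };

        printf("first post sends IPI:  %d\n", post_interrupt(&pi, 0x31));
        printf("second post sends IPI: %d\n", post_interrupt(&pi, 0x32));
        return 0;
}
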
57dfd7b5
SC
4254static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
4255 int trig_mode, int vector)
4256{
4257 struct kvm_vcpu *vcpu = apic->vcpu;
4258
4259 if (vmx_deliver_posted_interrupt(vcpu, vector)) {
4260 kvm_lapic_set_irr(vector, apic);
4261 kvm_make_request(KVM_REQ_EVENT, vcpu);
4262 kvm_vcpu_kick(vcpu);
4263 } else {
4264 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
4265 trig_mode, vector);
4266 }
4267}
4268
a3a8ff8e
NHE
4269/*
4270 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
4271 * will not change in the lifetime of the guest.
4272 * Note that host-state that does change is set elsewhere. E.g., host-state
4273 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
4274 */
97b7ead3 4275void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
a3a8ff8e
NHE
4276{
4277 u32 low32, high32;
4278 unsigned long tmpl;
d6e41f11 4279 unsigned long cr0, cr3, cr4;
a3a8ff8e 4280
04ac88ab
AL
4281 cr0 = read_cr0();
4282 WARN_ON(cr0 & X86_CR0_TS);
4283 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
d6e41f11
AL
4284
4285 /*
4286 * Save the most likely value for this task's CR3 in the VMCS.
4287 * We can't use __get_current_cr3_fast() because we're not atomic.
4288 */
6c690ee1 4289 cr3 = __read_cr3();
d6e41f11 4290 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
d7ee039e 4291 vmx->loaded_vmcs->host_state.cr3 = cr3;
a3a8ff8e 4292
d974baa3 4293 /* Save the most likely value for this task's CR4 in the VMCS. */
1e02ce4c 4294 cr4 = cr4_read_shadow();
d974baa3 4295 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
d7ee039e 4296 vmx->loaded_vmcs->host_state.cr4 = cr4;
d974baa3 4297
a3a8ff8e 4298 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
b2da15ac
AK
4299#ifdef CONFIG_X86_64
4300 /*
4301 * Load null selectors, so we can avoid reloading them in
6d6095bd
SC
4302 * vmx_prepare_switch_to_host(), in case userspace uses
4303 * the null selectors too (the expected case).
b2da15ac
AK
4304 */
4305 vmcs_write16(HOST_DS_SELECTOR, 0);
4306 vmcs_write16(HOST_ES_SELECTOR, 0);
4307#else
a3a8ff8e
NHE
4308 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
4309 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
b2da15ac 4310#endif
a3a8ff8e
NHE
4311 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
4312 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
4313
2342080c 4314 vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */
a3a8ff8e 4315
453eafbe 4316 vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */
a3a8ff8e
NHE
4317
4318 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
4319 vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
6ab8a405
LJ
4320
4321 /*
94fea1d8
SC
4322 * SYSENTER is used for 32-bit system calls on either 32-bit or
4323 * 64-bit kernels. It is always zero If neither is allowed, otherwise
4324 * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may
4325 * have already done so!).
6ab8a405 4326 */
94fea1d8
SC
4327 if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
4328 vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
4329
a3a8ff8e
NHE
4330 rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
4331 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
4332
4333 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
4334 rdmsr(MSR_IA32_CR_PAT, low32, high32);
4335 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
4336 }
5a5e8a15 4337
c73da3fc 4338 if (cpu_has_load_ia32_efer())
5a5e8a15 4339 vmcs_write64(HOST_IA32_EFER, host_efer);
a3a8ff8e
NHE
4340}
4341
97b7ead3 4342void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
bf8179a0 4343{
2ed41aa6
SC
4344 struct kvm_vcpu *vcpu = &vmx->vcpu;
4345
4346 vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &
4347 ~vcpu->arch.cr4_guest_rsvd_bits;
a37ebdce 4348 if (!enable_ept) {
5ec60aad 4349 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS;
a37ebdce
LJ
4350 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS;
4351 }
fe3ef05c 4352 if (is_guest_mode(&vmx->vcpu))
2ed41aa6
SC
4353 vcpu->arch.cr4_guest_owned_bits &=
4354 ~get_vmcs12(vcpu)->cr4_guest_host_mask;
4355 vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);
bf8179a0
NHE
4356}
4357
2fba4fc1 4358static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
01e439be
YZ
4359{
4360 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
4361
d62caabb 4362 if (!kvm_vcpu_apicv_active(&vmx->vcpu))
01e439be 4363 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
d02fcf50
PB
4364
4365 if (!enable_vnmi)
4366 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
4367
804939ea
SC
4368 if (!enable_preemption_timer)
4369 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
4370
01e439be
YZ
4371 return pin_based_exec_ctrl;
4372}
4373
2fba4fc1
SC
4374static u32 vmx_vmentry_ctrl(void)
4375{
4376 u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
4377
4378 if (vmx_pt_mode_is_system())
4379 vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
4380 VM_ENTRY_LOAD_IA32_RTIT_CTL);
ffaaf591
VK
4381 /*
4382 * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically.
4383 */
4384 vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
4385 VM_ENTRY_LOAD_IA32_EFER |
4386 VM_ENTRY_IA32E_MODE);
4387
9d78d6fb
VK
4388 if (cpu_has_perf_global_ctrl_bug())
4389 vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4390
ffaaf591 4391 return vmentry_ctrl;
2fba4fc1
SC
4392}
4393
4394static u32 vmx_vmexit_ctrl(void)
4395{
4396 u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
4397
f16e4742
VK
4398 /*
4399 * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for
4400 * nested virtualization and thus allowed to be set in vmcs12.
4401 */
4402 vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER |
4403 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
4404
2fba4fc1
SC
4405 if (vmx_pt_mode_is_system())
4406 vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
4407 VM_EXIT_CLEAR_IA32_RTIT_CTL);
9d78d6fb
VK
4408
4409 if (cpu_has_perf_global_ctrl_bug())
4410 vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
4411
2fba4fc1
SC
4412 /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
4413 return vmexit_ctrl &
4414 ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
4415}
4416
d62caabb
AS
4417static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
4418{
4419 struct vcpu_vmx *vmx = to_vmx(vcpu);
4420
7c69661e
SC
4421 if (is_guest_mode(vcpu)) {
4422 vmx->nested.update_vmcs01_apicv_status = true;
4423 return;
4424 }
4425
c5f2c766 4426 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
f08a06c9 4427
d588bb9b 4428 if (kvm_vcpu_apicv_active(vcpu)) {
f08a06c9
ZG
4429 secondary_exec_controls_setbit(vmx,
4430 SECONDARY_EXEC_APIC_REGISTER_VIRT |
4431 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
d588bb9b
CG
4432 if (enable_ipiv)
4433 tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT);
4434 } else {
f08a06c9
ZG
4435 secondary_exec_controls_clearbit(vmx,
4436 SECONDARY_EXEC_APIC_REGISTER_VIRT |
4437 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
d588bb9b
CG
4438 if (enable_ipiv)
4439 tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT);
3ce424e4
RK
4440 }
4441
84ec8d2d 4442 vmx_update_msr_bitmap_x2apic(vcpu);
d62caabb
AS
4443}
4444
2fba4fc1 4445static u32 vmx_exec_control(struct vcpu_vmx *vmx)
89b0c9f5
SC
4446{
4447 u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
4448
a83bea73
VK
4449 /*
4450 * Not used by KVM, but fully supported for nesting, i.e. are allowed in
4451 * vmcs12 and propagated to vmcs02 when set in vmcs12.
4452 */
4453 exec_control &= ~(CPU_BASED_RDTSC_EXITING |
4454 CPU_BASED_USE_IO_BITMAPS |
4455 CPU_BASED_MONITOR_TRAP_FLAG |
4456 CPU_BASED_PAUSE_EXITING);
4457
378c4c18
VK
4458 /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */
4459 exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING |
4460 CPU_BASED_NMI_WINDOW_EXITING);
4461
89b0c9f5
SC
4462 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
4463 exec_control &= ~CPU_BASED_MOV_DR_EXITING;
4464
e89e1e23 4465 if (!cpu_need_tpr_shadow(&vmx->vcpu))
89b0c9f5 4466 exec_control &= ~CPU_BASED_TPR_SHADOW;
e89e1e23 4467
89b0c9f5 4468#ifdef CONFIG_X86_64
e89e1e23
VK
4469 if (exec_control & CPU_BASED_TPR_SHADOW)
4470 exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING |
4471 CPU_BASED_CR8_STORE_EXITING);
4472 else
89b0c9f5
SC
4473 exec_control |= CPU_BASED_CR8_STORE_EXITING |
4474 CPU_BASED_CR8_LOAD_EXITING;
4475#endif
64f80ea7
SC
4476 /* No need to intercept CR3 access or INVLPG when using EPT. */
4477 if (enable_ept)
4478 exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
4479 CPU_BASED_CR3_STORE_EXITING |
4480 CPU_BASED_INVLPG_EXITING);
89b0c9f5
SC
4481 if (kvm_mwait_in_guest(vmx->vcpu.kvm))
4482 exec_control &= ~(CPU_BASED_MWAIT_EXITING |
4483 CPU_BASED_MONITOR_EXITING);
4484 if (kvm_hlt_in_guest(vmx->vcpu.kvm))
4485 exec_control &= ~CPU_BASED_HLT_EXITING;
4486 return exec_control;
4487}
4488
1ad4e543
RH
4489static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx)
4490{
d588bb9b
CG
4491 u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl;
4492
4493 /*
4494 * IPI virtualization relies on APICv. Disable IPI virtualization if
4495 * APICv is inhibited.
4496 */
4497 if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu))
4498 exec_control &= ~TERTIARY_EXEC_IPI_VIRT;
4499
4500 return exec_control;
1ad4e543
RH
4501}
4502
8b50b92f
SC
4503/*
4504 * Adjust a single secondary execution control bit to intercept/allow an
4505 * instruction in the guest. This is usually done based on whether or not a
4506 * feature has been exposed to the guest in order to correctly emulate faults.
4507 */
4508static inline void
4509vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
4510 u32 control, bool enabled, bool exiting)
4511{
4512 /*
4513 * If the control is for an opt-in feature, clear the control if the
4514 * feature is not exposed to the guest, i.e. not enabled. If the
4515 * control is opt-out, i.e. an exiting control, clear the control if
4516 * the feature _is_ exposed to the guest, i.e. exiting/interception is
4517 * disabled for the associated instruction. Note, the caller is
4518 * responsible presetting exec_control to set all supported bits.
4519 */
4520 if (enabled == exiting)
4521 *exec_control &= ~control;
4522
4523 /*
4524 * Update the nested MSR settings so that a nested VMM can/can't set
4525 * controls for features that are/aren't exposed to the guest.
4526 */
4527 if (nested) {
a0860d68
SC
4528 /*
4529 * All features that can be added or removed to VMX MSRs must
4530 * be supported in the first place for nested virtualization.
4531 */
4532 if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control)))
4533 enabled = false;
4534
8b50b92f
SC
4535 if (enabled)
4536 vmx->nested.msrs.secondary_ctls_high |= control;
4537 else
4538 vmx->nested.msrs.secondary_ctls_high &= ~control;
4539 }
4540}
4541
4542/*
4543 * Wrapper macro for the common case of adjusting a secondary execution control
4544 * based on a single guest CPUID bit, with a dedicated feature bit. This also
4545 * verifies that the control is actually supported by KVM and hardware.
4546 */
4547#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
4548({ \
4549 bool __enabled; \
4550 \
4551 if (cpu_has_vmx_##name()) { \
4552 __enabled = guest_cpuid_has(&(vmx)->vcpu, \
4553 X86_FEATURE_##feat_name); \
4554 vmx_adjust_secondary_exec_control(vmx, exec_control, \
4555 SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \
4556 } \
4557})
4558
4559/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
4560#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
4561 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)
4562
4563#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
4564 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
89b0c9f5 4565
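The helper above reduces to one rule: for an opt-in ("enable") control, clear it when the feature is not exposed to the guest; for an opt-out ("exiting") control, clear it when the feature is exposed, since exposing the instruction means not intercepting it. Both cases collapse to "clear when enabled == exiting". A minimal sketch with illustrative control bits:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static void adjust_sec_exec(uint32_t *exec_control, uint32_t control,
                            bool enabled, bool exiting)
{
        /*
         * Opt-in (exiting == false): keep the control only if the feature is
         * enabled for the guest.  Opt-out (exiting == true): keep the control
         * only if the feature is *not* enabled, i.e. still intercepted.
         */
        if (enabled == exiting)
                *exec_control &= ~control;
}

int main(void)
{
        uint32_t ctl = 0xffffffffu;
        uint32_t enable_rdtscp = 1u << 3, rdrand_exiting = 1u << 11; /* illustrative */

        adjust_sec_exec(&ctl, enable_rdtscp, /*enabled=*/false, /*exiting=*/false);
        adjust_sec_exec(&ctl, rdrand_exiting, /*enabled=*/true,  /*exiting=*/true);
        printf("ctl = %#x\n", ctl);     /* both bits cleared */
        return 0;
}
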
2fba4fc1 4566static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
bf8179a0 4567{
80154d77
PB
4568 struct kvm_vcpu *vcpu = &vmx->vcpu;
4569
bf8179a0 4570 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
0367f205 4571
2ef7619d 4572 if (vmx_pt_mode_is_system())
f99e3daf 4573 exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
80154d77 4574 if (!cpu_need_virtualize_apic_accesses(vcpu))
bf8179a0
NHE
4575 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
4576 if (vmx->vpid == 0)
4577 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
4578 if (!enable_ept) {
4579 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
4580 enable_unrestricted_guest = 0;
4581 }
4582 if (!enable_unrestricted_guest)
4583 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
b31c114b 4584 if (kvm_pause_in_guest(vmx->vcpu.kvm))
bf8179a0 4585 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
80154d77 4586 if (!kvm_vcpu_apicv_active(vcpu))
c7c9c56c
YZ
4587 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
4588 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
8d14695f 4589 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
0367f205
PB
4590
4591 /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
4592 * in vmx_set_cr4. */
4593 exec_control &= ~SECONDARY_EXEC_DESC;
4594
abc4fc58
AG
4595 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
4596 (handle_vmptrld).
4597 We can NOT enable shadow_vmcs here because we don't yet have
4598 a current VMCS12.
4599 */
4600 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
a3eaa864 4601
a85863c2
MS
4602 /*
4603 * PML is enabled/disabled when dirty logging of memslots changes, but
4604 * it needs to be set here when dirty logging is already active, e.g.
4605 * if this vCPU was created after dirty logging was enabled.
4606 */
4607 if (!vcpu->kvm->arch.cpu_dirty_logging_count)
a3eaa864 4608 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
843e4330 4609
becdad85 4610 if (cpu_has_vmx_xsaves()) {
3db13480
PB
4611 /* Exposing XSAVES only when XSAVE is exposed */
4612 bool xsaves_enabled =
96be4e06 4613 boot_cpu_has(X86_FEATURE_XSAVE) &&
3db13480
PB
4614 guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
4615 guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
4616
7204160e
AL
4617 vcpu->arch.xsaves_enabled = xsaves_enabled;
4618
8b50b92f
SC
4619 vmx_adjust_secondary_exec_control(vmx, &exec_control,
4620 SECONDARY_EXEC_XSAVES,
4621 xsaves_enabled, false);
45ec368c
JM
4622 }
4623
36fa06f9
SC
4624 /*
4625 * RDPID is also gated by ENABLE_RDTSCP; turn on the control if either
4626 * feature is exposed to the guest. This creates a virtualization hole
4627 * if both are supported in hardware but only one is exposed to the
4628 * guest, but letting the guest execute RDTSCP or RDPID when either one
4629 * is advertised is preferable to emulating the advertised instruction
4630 * in KVM on #UD, and obviously better than incorrectly injecting #UD.
4631 */
4632 if (cpu_has_vmx_rdtscp()) {
4633 bool rdpid_or_rdtscp_enabled =
4634 guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
4635 guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
4636
4637 vmx_adjust_secondary_exec_control(vmx, &exec_control,
4638 SECONDARY_EXEC_ENABLE_RDTSCP,
4639 rdpid_or_rdtscp_enabled, false);
4640 }
8b50b92f 4641 vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
75f4fc8d 4642
8b50b92f
SC
4643 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
4644 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);
e69e72fa 4645
8b50b92f
SC
4646 vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
4647 ENABLE_USR_WAIT_PAUSE, false);
e69e72fa 4648
fe6b6bc8
CQ
4649 if (!vcpu->kvm->arch.bus_lock_detection_enabled)
4650 exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
4651
2f4073e0
TX
4652 if (!kvm_notify_vmexit_enabled(vcpu->kvm))
4653 exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;
4654
b6247686 4655 return exec_control;
bf8179a0
NHE
4656}
4657
d588bb9b
CG
4658static inline int vmx_get_pid_table_order(struct kvm *kvm)
4659{
4660 return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table));
4661}
4662
4663static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
4664{
4665 struct page *pages;
4666 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
4667
4668 if (!irqchip_in_kernel(kvm) || !enable_ipiv)
4669 return 0;
4670
4671 if (kvm_vmx->pid_table)
4672 return 0;
4673
4674 pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm));
4675 if (!pages)
4676 return -ENOMEM;
4677
4678 kvm_vmx->pid_table = (void *)page_address(pages);
4679 return 0;
4680}
4681
4682static int vmx_vcpu_precreate(struct kvm *kvm)
4683{
4684 return vmx_alloc_ipiv_pid_table(kvm);
4685}
4686
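vmx_get_pid_table_order() converts the IPI-virtualization PID-pointer table size (one 8-byte entry per possible vCPU ID) into a page allocation order, i.e. the smallest power-of-two page count that covers it. A standalone sketch of get_order()-style arithmetic; this reimplements the math for illustration and is not the kernel helper:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096u

/* Smallest order such that (PAGE_SIZE << order) >= size. */
static unsigned int page_order(size_t size)
{
        unsigned int order = 0;

        while ((PAGE_SIZE << order) < size)
                order++;
        return order;
}

int main(void)
{
        size_t max_vcpu_ids = 4096;                             /* example VM limit */
        size_t table_bytes = max_vcpu_ids * sizeof(uint64_t);   /* 8-byte entries */

        printf("PID table: %zu bytes -> order %u (%u pages)\n",
               table_bytes, page_order(table_bytes),
               1u << page_order(table_bytes));
        return 0;
}
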
f53cd63c 4687#define VMX_XSS_EXIT_BITMAP 0
6aa8b732 4688
1b84292b 4689static void init_vmcs(struct vcpu_vmx *vmx)
944c3464 4690{
d588bb9b
CG
4691 struct kvm *kvm = vmx->vcpu.kvm;
4692 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
4693
944c3464 4694 if (nested)
1b84292b 4695 nested_vmx_set_vmcs_shadowing_bitmap();
944c3464 4696
25c5f225 4697 if (cpu_has_vmx_msr_bitmap())
904e14fb 4698 vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
25c5f225 4699
64c78508 4700 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */
6aa8b732 4701
6aa8b732 4702 /* Control */
3af80fec 4703 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
6e5d865c 4704
3af80fec 4705 exec_controls_set(vmx, vmx_exec_control(vmx));
6aa8b732 4706
b6247686
SC
4707 if (cpu_has_secondary_exec_ctrls())
4708 secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
f78e0e2e 4709
1ad4e543
RH
4710 if (cpu_has_tertiary_exec_ctrls())
4711 tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
4712
1421211a 4713 if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
c7c9c56c
YZ
4714 vmcs_write64(EOI_EXIT_BITMAP0, 0);
4715 vmcs_write64(EOI_EXIT_BITMAP1, 0);
4716 vmcs_write64(EOI_EXIT_BITMAP2, 0);
4717 vmcs_write64(EOI_EXIT_BITMAP3, 0);
4718
4719 vmcs_write16(GUEST_INTR_STATUS, 0);
01e439be 4720
0bcf261c 4721 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
01e439be 4722 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
c7c9c56c
YZ
4723 }
4724
d588bb9b
CG
4725 if (vmx_can_use_ipiv(&vmx->vcpu)) {
4726 vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table));
4727 vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1);
4728 }
4729
4730 if (!kvm_pause_in_guest(kvm)) {
4b8d54f9 4731 vmcs_write32(PLE_GAP, ple_gap);
a7653ecd
RK
4732 vmx->ple_window = ple_window;
4733 vmx->ple_window_dirty = true;
4b8d54f9
ZE
4734 }
4735
2f4073e0
TX
4736 if (kvm_notify_vmexit_enabled(kvm))
4737 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
4738
c3707958
XG
4739 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
4740 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
6aa8b732
AK
4741 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
4742
9581d442
AK
4743 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
4744 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
a547c6db 4745 vmx_set_constant_host_state(vmx);
6aa8b732
AK
4746 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
4747 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
6aa8b732 4748
2a499e49
BD
4749 if (cpu_has_vmx_vmfunc())
4750 vmcs_write64(VM_FUNCTION_CONTROL, 0);
4751
2cc51560
ED
4752 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
4753 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
33966dd6 4754 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
2cc51560 4755 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
33966dd6 4756 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
6aa8b732 4757
74545705
RK
4758 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
4759 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
468d472f 4760
3af80fec 4761 vm_exit_controls_set(vmx, vmx_vmexit_ctrl());
6aa8b732
AK
4762
4763 /* 22.2.1, 20.8.1 */
3af80fec 4764 vm_entry_controls_set(vmx, vmx_vmentry_ctrl());
1c3d14fe 4765
fa71e952
SC
4766 vmx->vcpu.arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
4767 vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);
bd7e5b08 4768
bf8179a0 4769 set_cr4_guest_host_mask(vmx);
e00c8cf2 4770
35fbe0d4
XL
4771 if (vmx->vpid != 0)
4772 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
4773
becdad85 4774 if (cpu_has_vmx_xsaves())
f53cd63c
WL
4775 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
4776
4e59516a 4777 if (enable_pml) {
4e59516a
PF
4778 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
4779 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
4780 }
0b665d30 4781
72add915 4782 vmx_write_encls_bitmap(&vmx->vcpu, NULL);
2ef444f1 4783
2ef7619d 4784 if (vmx_pt_mode_is_host_guest()) {
2ef444f1
CP
4785 memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
4786 /* Bits 6:0 are forced to 1, writes are ignored. */
4787 vmx->pt_desc.guest.output_mask = 0x7F;
4788 vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
4789 }
c5c9f920 4790
e5494940
SC
4791 vmcs_write32(GUEST_SYSENTER_CS, 0);
4792 vmcs_writel(GUEST_SYSENTER_ESP, 0);
4793 vmcs_writel(GUEST_SYSENTER_EIP, 0);
4794 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4795
4796 if (cpu_has_vmx_tpr_shadow()) {
4797 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
4798 if (cpu_need_tpr_shadow(&vmx->vcpu))
4799 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
4800 __pa(vmx->vcpu.arch.apic->regs));
4801 vmcs_write32(TPR_THRESHOLD, 0);
4802 }
4803
c5c9f920 4804 vmx_setup_uret_msrs(vmx);
e00c8cf2
AK
4805}
4806
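/*
 * One-time vCPU setup, performed only on the initial RESET (vmx_vcpu_reset()
 * skips this path for INIT): build the vCPU's VMCS, snapshot the nested VMX
 * MSR configuration, and put the nested and posted-interrupt state into
 * their "nothing in flight" defaults.
 */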
06692e4b
SC
4807static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4808{
4809 struct vcpu_vmx *vmx = to_vmx(vcpu);
4810
4811 init_vmcs(vmx);
4812
4813 if (nested)
4814 memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
4815
4816 vcpu_setup_sgx_lepubkeyhash(vcpu);
4817
4818 vmx->nested.posted_intr_nv = -1;
4819 vmx->nested.vmxon_ptr = INVALID_GPA;
4820 vmx->nested.current_vmptr = INVALID_GPA;
4821 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
4822
4823 vcpu->arch.microcode_version = 0x100000000ULL;
4824 vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
4825
4826 /*
4827 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
4828 * or POSTED_INTR_WAKEUP_VECTOR.
4829 */
4830 vmx->pi_desc.nv = POSTED_INTR_VECTOR;
4831 vmx->pi_desc.sn = 1;
4832}
4833
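/*
 * Emulate RESET/INIT for the vCPU.  The segment, descriptor-table and
 * activity-state values written below mirror the architectural power-up
 * state (e.g. CS selector 0xf000 with base 0xffff0000); state common to
 * RESET and INIT is handled here, RESET-only setup in __vmx_vcpu_reset().
 */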
d28bc9dd 4834static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
e00c8cf2
AK
4835{
4836 struct vcpu_vmx *vmx = to_vmx(vcpu);
e00c8cf2 4837
06692e4b
SC
4838 if (!init_event)
4839 __vmx_vcpu_reset(vcpu);
4840
7ffd92c5 4841 vmx->rmode.vm86_active = 0;
d28b387f 4842 vmx->spec_ctrl = 0;
e00c8cf2 4843
6e3ba4ab
TX
4844 vmx->msr_ia32_umwait_control = 0;
4845
95c06540 4846 vmx->hv_deadline_tsc = -1;
d28bc9dd
NA
4847 kvm_set_cr8(vcpu, 0);
4848
2fb92db1 4849 vmx_segment_cache_clear(vmx);
ff8828c8 4850 kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);
2fb92db1 4851
5706be0d 4852 seg_setup(VCPU_SREG_CS);
66450a21 4853 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
f3531054 4854 vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
e00c8cf2
AK
4855
4856 seg_setup(VCPU_SREG_DS);
4857 seg_setup(VCPU_SREG_ES);
4858 seg_setup(VCPU_SREG_FS);
4859 seg_setup(VCPU_SREG_GS);
4860 seg_setup(VCPU_SREG_SS);
4861
4862 vmcs_write16(GUEST_TR_SELECTOR, 0);
4863 vmcs_writel(GUEST_TR_BASE, 0);
4864 vmcs_write32(GUEST_TR_LIMIT, 0xffff);
4865 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
4866
4867 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
4868 vmcs_writel(GUEST_LDTR_BASE, 0);
4869 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
4870 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
4871
e00c8cf2
AK
4872 vmcs_writel(GUEST_GDTR_BASE, 0);
4873 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
4874
4875 vmcs_writel(GUEST_IDTR_BASE, 0);
4876 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
4877
443381a8 4878 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
e00c8cf2 4879 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
f3531054 4880 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
a554d207
WL
4881 if (kvm_mpx_supported())
4882 vmcs_write64(GUEST_BNDCFGS, 0);
e00c8cf2 4883
6aa8b732
AK
4884 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
4885
a73896cb 4886 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
6aa8b732 4887
dd5f5341 4888 vpid_sync_context(vmx->vpid);
027bbb88
PG
4889
4890 vmx_update_fb_clear_dis(vcpu, vmx);
6aa8b732
AK
4891}
4892
b6a7cc35 4893static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
3b86cd99 4894{
9dadc2f9 4895 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
3b86cd99
JK
4896}
4897
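/*
 * Request a VM-Exit as soon as an NMI can be injected.  If virtual NMIs are
 * disabled, or the guest is in an STI shadow, fall back to requesting an
 * IRQ window; otherwise arm NMI-window exiting directly.
 */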
b6a7cc35 4898static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
3b86cd99 4899{
d02fcf50 4900 if (!enable_vnmi ||
8a1b4392 4901 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
b6a7cc35 4902 vmx_enable_irq_window(vcpu);
c9a7953f
JK
4903 return;
4904 }
3b86cd99 4905
4e2a0bc5 4906 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
3b86cd99
JK
4907}
4908
2d613912 4909static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
85f455f7 4910{
9c8cba37 4911 struct vcpu_vmx *vmx = to_vmx(vcpu);
66fd3f7f
GN
4912 uint32_t intr;
4913 int irq = vcpu->arch.interrupt.nr;
9c8cba37 4914
2d613912 4915 trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected);
2714d1d3 4916
fa89a817 4917 ++vcpu->stat.irq_injections;
7ffd92c5 4918 if (vmx->rmode.vm86_active) {
71f9833b
SH
4919 int inc_eip = 0;
4920 if (vcpu->arch.interrupt.soft)
4921 inc_eip = vcpu->arch.event_exit_inst_len;
9497e1f2 4922 kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
85f455f7
ED
4923 return;
4924 }
66fd3f7f
GN
4925 intr = irq | INTR_INFO_VALID_MASK;
4926 if (vcpu->arch.interrupt.soft) {
4927 intr |= INTR_TYPE_SOFT_INTR;
4928 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
4929 vmx->vcpu.arch.event_exit_inst_len);
4930 } else
4931 intr |= INTR_TYPE_EXT_INTR;
4932 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
caa057a2
WL
4933
4934 vmx_clear_hlt(vcpu);
85f455f7
ED
4935}
4936
f08864b4
SY
4937static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
4938{
66a5a347
JK
4939 struct vcpu_vmx *vmx = to_vmx(vcpu);
4940
d02fcf50 4941 if (!enable_vnmi) {
8a1b4392
PB
4942 /*
4943 * Tracking the NMI-blocked state in software is built upon
4944 * finding the next open IRQ window. This, in turn, depends on
4945 * well-behaving guests: They have to keep IRQs disabled at
4946 * least as long as the NMI handler runs. Otherwise we may
4947 * cause NMI nesting, maybe breaking the guest. But as this is
4948 * highly unlikely, we can live with the residual risk.
4949 */
4950 vmx->loaded_vmcs->soft_vnmi_blocked = 1;
4951 vmx->loaded_vmcs->vnmi_blocked_time = 0;
4952 }
4953
4c4a6f79
PB
4954 ++vcpu->stat.nmi_injections;
4955 vmx->loaded_vmcs->nmi_known_unmasked = false;
3b86cd99 4956
7ffd92c5 4957 if (vmx->rmode.vm86_active) {
9497e1f2 4958 kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
66a5a347
JK
4959 return;
4960 }
c5a6d5f7 4961
f08864b4
SY
4962 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
4963 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
caa057a2
WL
4964
4965 vmx_clear_hlt(vcpu);
f08864b4
SY
4966}
4967
97b7ead3 4968bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
3cfc3092 4969{
4c4a6f79
PB
4970 struct vcpu_vmx *vmx = to_vmx(vcpu);
4971 bool masked;
4972
d02fcf50 4973 if (!enable_vnmi)
8a1b4392 4974 return vmx->loaded_vmcs->soft_vnmi_blocked;
4c4a6f79 4975 if (vmx->loaded_vmcs->nmi_known_unmasked)
9d58b931 4976 return false;
4c4a6f79
PB
4977 masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
4978 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
4979 return masked;
3cfc3092
JK
4980}
4981
97b7ead3 4982void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3cfc3092
JK
4983{
4984 struct vcpu_vmx *vmx = to_vmx(vcpu);
4985
d02fcf50 4986 if (!enable_vnmi) {
8a1b4392
PB
4987 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
4988 vmx->loaded_vmcs->soft_vnmi_blocked = masked;
4989 vmx->loaded_vmcs->vnmi_blocked_time = 0;
4990 }
4991 } else {
4992 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
4993 if (masked)
4994 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
4995 GUEST_INTR_STATE_NMI);
4996 else
4997 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
4998 GUEST_INTR_STATE_NMI);
4999 }
3cfc3092
JK
5000}
5001
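/*
 * Returns true if NMI injection is currently blocked, i.e. if the guest is
 * in an NMI/STI/MOV-SS blocking window (or, without virtual NMIs, if the
 * software-tracked vNMI blocking is active).  Blocking is irrelevant if the
 * NMI would simply trigger a VM-Exit from L2 to L1.
 */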
1b660b6b
SC
5002bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
5003{
5004 if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
5005 return false;
5006
5007 if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
5008 return true;
5009
5010 return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5011 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
5012 GUEST_INTR_STATE_NMI));
5013}
5014
c9d40913 5015static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
2505dc9f 5016{
b6b8a145 5017 if (to_vmx(vcpu)->nested.nested_run_pending)
c9d40913 5018 return -EBUSY;
ea8ceb83 5019
c300ab9f
PB
5020 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
5021 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
c9d40913 5022 return -EBUSY;
c300ab9f 5023
1b660b6b
SC
5024 return !vmx_nmi_blocked(vcpu);
5025}
429ab576 5026
1b660b6b
SC
5027bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
5028{
5029 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
88c604b6 5030 return false;
8a1b4392 5031
7ab0abdb 5032 return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
1b660b6b
SC
5033 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5034 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
2505dc9f
JK
5035}
5036
c9d40913 5037static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
78646121 5038{
a1c77abb 5039 if (to_vmx(vcpu)->nested.nested_run_pending)
c9d40913 5040 return -EBUSY;
a1c77abb 5041
c300ab9f
PB
5042 /*
5043 * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
5044 * e.g. if the IRQ arrived asynchronously after checking nested events.
5045 */
5046 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
c9d40913 5047 return -EBUSY;
c300ab9f 5048
1b660b6b 5049 return !vmx_interrupt_blocked(vcpu);
78646121
GN
5050}
5051
cbc94022
IE
5052static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
5053{
ff5a983c 5054 void __user *ret;
cbc94022 5055
f7eaeb0a
SC
5056 if (enable_unrestricted_guest)
5057 return 0;
5058
6a3c623b
PX
5059 mutex_lock(&kvm->slots_lock);
5060 ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
5061 PAGE_SIZE * 3);
5062 mutex_unlock(&kvm->slots_lock);
5063
ff5a983c
PX
5064 if (IS_ERR(ret))
5065 return PTR_ERR(ret);
5066
40bbb9d0 5067 to_kvm_vmx(kvm)->tss_addr = addr;
ff5a983c
PX
5068
5069 return init_rmode_tss(kvm, ret);
cbc94022
IE
5070}
5071
2ac52ab8
SC
5072static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
5073{
40bbb9d0 5074 to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
2ac52ab8
SC
5075 return 0;
5076}
5077
0ca1b4f4 5078static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
6aa8b732 5079{
77ab6db0 5080 switch (vec) {
77ab6db0 5081 case BP_VECTOR:
c573cd22
JK
5082 /*
5083 * Update instruction length as we may reinject the exception
5084 * from user space while in guest debugging mode.
5085 */
5086 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
5087 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
d0bfb940 5088 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
0ca1b4f4 5089 return false;
df561f66 5090 fallthrough;
0ca1b4f4 5091 case DB_VECTOR:
a8cfbae5
ML
5092 return !(vcpu->guest_debug &
5093 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP));
d0bfb940 5094 case DE_VECTOR:
77ab6db0
JK
5095 case OF_VECTOR:
5096 case BR_VECTOR:
5097 case UD_VECTOR:
5098 case DF_VECTOR:
5099 case SS_VECTOR:
5100 case GP_VECTOR:
5101 case MF_VECTOR:
0ca1b4f4 5102 return true;
77ab6db0 5103 }
0ca1b4f4
GN
5104 return false;
5105}
5106
5107static int handle_rmode_exception(struct kvm_vcpu *vcpu,
5108 int vec, u32 err_code)
5109{
5110 /*
5111 * An instruction with the address size override prefix (opcode 0x67)
5112 * causes a #SS fault with a zero error code in VM86 mode.
5113 */
5114 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
60fc3d02 5115 if (kvm_emulate_instruction(vcpu, 0)) {
0ca1b4f4
GN
5116 if (vcpu->arch.halt_request) {
5117 vcpu->arch.halt_request = 0;
1460179d 5118 return kvm_emulate_halt_noskip(vcpu);
0ca1b4f4
GN
5119 }
5120 return 1;
5121 }
5122 return 0;
5123 }
5124
5125 /*
5126 * Forward all other exceptions that are valid in real mode.
5127 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
5128 * the required debugging infrastructure rework.
5129 */
5130 kvm_queue_exception(vcpu, vec);
5131 return 1;
6aa8b732
AK
5132}
5133
851ba692 5134static int handle_machine_check(struct kvm_vcpu *vcpu)
a0861c02 5135{
95b5a48c 5136 /* handled by vmx_vcpu_run() */
a0861c02
AK
5137 return 1;
5138}
5139
e6f8b6c1
XL
5140/*
5141 * If the host has split lock detection disabled, then #AC is
5142 * unconditionally injected into the guest, which is the pre split lock
5143 * detection behaviour.
5144 *
5145 * If the host has split lock detection enabled then #AC is
5146 * only injected into the guest when:
5147 * - Guest CPL == 3 (user mode)
5148 * - Guest has #AC detection enabled in CR0
5149 * - Guest EFLAGS has AC bit set
5150 */
b33bb78a 5151bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
e6f8b6c1
XL
5152{
5153 if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
5154 return true;
5155
5156 return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) &&
5157 (kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
5158}
5159
95b5a48c 5160static int handle_exception_nmi(struct kvm_vcpu *vcpu)
6aa8b732 5161{
1155f76a 5162 struct vcpu_vmx *vmx = to_vmx(vcpu);
851ba692 5163 struct kvm_run *kvm_run = vcpu->run;
d0bfb940 5164 u32 intr_info, ex_no, error_code;
e87e46d5 5165 unsigned long cr2, dr6;
6aa8b732 5166 u32 vect_info;
6aa8b732 5167
1155f76a 5168 vect_info = vmx->idt_vectoring_info;
f27ad73a 5169 intr_info = vmx_get_intr_info(vcpu);
6aa8b732 5170
2ea72039 5171 if (is_machine_check(intr_info) || is_nmi(intr_info))
95b5a48c 5172 return 1; /* handled by handle_exception_nmi_irqoff() */
2ab455cc 5173
ec5be88a
JL
5174 /*
5175 * Queue the exception here instead of in handle_nm_fault_irqoff().
5176 * This ensures the nested_vmx check is not skipped so vmexit can
5177 * be reflected to L1 (when it intercepts #NM) before reaching this
5178 * point.
5179 */
5180 if (is_nm_fault(intr_info)) {
5181 kvm_queue_exception(vcpu, NM_VECTOR);
5182 return 1;
5183 }
5184
082d06ed
WL
5185 if (is_invalid_opcode(intr_info))
5186 return handle_ud(vcpu);
7aa81cc0 5187
6aa8b732 5188 error_code = 0;
2e11384c 5189 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
6aa8b732 5190 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
bf4ca23e 5191
9e869480
LA
5192 if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
5193 WARN_ON_ONCE(!enable_vmware_backdoor);
a6c6ed1e
SC
5194
5195 /*
5196 * VMware backdoor emulation on #GP interception only handles
5197 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
5198 * error code on #GP.
5199 */
5200 if (error_code) {
5201 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
5202 return 1;
5203 }
60fc3d02 5204 return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
9e869480
LA
5205 }
5206
bf4ca23e
XG
5207 /*
5208 * A #PF with PFEC.RSVD = 1 indicates the guest is accessing
5209 * MMIO, so it is better to report an internal error.
5210 * See the comments in vmx_handle_exit.
5211 */
5212 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
5213 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
5214 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5215 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
1aa561b1 5216 vcpu->run->internal.ndata = 4;
bf4ca23e
XG
5217 vcpu->run->internal.data[0] = vect_info;
5218 vcpu->run->internal.data[1] = intr_info;
80f0e95d 5219 vcpu->run->internal.data[2] = error_code;
8a14fe4f 5220 vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
bf4ca23e
XG
5221 return 0;
5222 }
5223
6aa8b732 5224 if (is_page_fault(intr_info)) {
5addc235 5225 cr2 = vmx_get_exit_qual(vcpu);
1dbf5d68
MG
5226 if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
5227 /*
5228 * EPT will cause page fault only if we need to
5229 * detect illegal GPAs.
5230 */
b96e6506 5231 WARN_ON_ONCE(!allow_smaller_maxphyaddr);
1dbf5d68
MG
5232 kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
5233 return 1;
5234 } else
5235 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
6aa8b732
AK
5236 }
5237
d0bfb940 5238 ex_no = intr_info & INTR_INFO_VECTOR_MASK;
0ca1b4f4
GN
5239
5240 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
5241 return handle_rmode_exception(vcpu, ex_no, error_code);
5242
42dbaa5a
JK
5243 switch (ex_no) {
5244 case DB_VECTOR:
5addc235 5245 dr6 = vmx_get_exit_qual(vcpu);
42dbaa5a
JK
5246 if (!(vcpu->guest_debug &
5247 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
b9bed78e
SC
5248 /*
5249 * If the #DB was due to ICEBP, a.k.a. INT1, skip the
5250 * instruction. ICEBP generates a trap-like #DB, but
5251 * despite its interception control being tied to #DB,
5252 * is an instruction intercept, i.e. the VM-Exit occurs
65ec8f01
SC
5253 * on the ICEBP itself. Use the inner "skip" helper to
5254 * avoid single-step #DB and MTF updates, as ICEBP is
5255 * higher priority. Note, skipping ICEBP still clears
5256 * STI and MOVSS blocking.
b9bed78e
SC
5257 *
5258 * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
5259 * if single-step is enabled in RFLAGS and STI or MOVSS
5260 * blocking is active, as the CPU doesn't set the bit
5261 * on VM-Exit due to #DB interception. VM-Entry has a
5262 * consistency check that a single-step #DB is pending
5263 * in this scenario as the previous instruction cannot
5264 * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
5265 * don't modify RFLAGS), therefore the one instruction
5266 * delay when activating single-step breakpoints must
5267 * have already expired. Note, the CPU sets/clears BS
5268 * as appropriate for all other VM-Exits types.
5269 */
32d43cd3 5270 if (is_icebp(intr_info))
1957aa63 5271 WARN_ON(!skip_emulated_instruction(vcpu));
b9bed78e
SC
5272 else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
5273 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5274 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
5275 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
5276 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
fd2a445a 5277
4d5523cf 5278 kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
42dbaa5a
JK
5279 return 1;
5280 }
9a3ecd5e 5281 kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
42dbaa5a 5282 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
df561f66 5283 fallthrough;
42dbaa5a 5284 case BP_VECTOR:
c573cd22
JK
5285 /*
5286 * Update instruction length as we may reinject #BP from
5287 * user space while in guest debugging mode. Reading it for
5288 * #DB as well causes no harm, it is not used in that case.
5289 */
5290 vmx->vcpu.arch.event_exit_inst_len =
5291 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
6aa8b732 5292 kvm_run->exit_reason = KVM_EXIT_DEBUG;
e87e46d5 5293 kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
d0bfb940 5294 kvm_run->debug.arch.exception = ex_no;
42dbaa5a 5295 break;
e6f8b6c1 5296 case AC_VECTOR:
b33bb78a 5297 if (vmx_guest_inject_ac(vcpu)) {
e6f8b6c1
XL
5298 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
5299 return 1;
5300 }
5301
5302 /*
5303 * Handle split lock. Depending on detection mode this will
5304 * either warn and disable split lock detection for this
5305 * task or force SIGBUS on it.
5306 */
5307 if (handle_guest_split_lock(kvm_rip_read(vcpu)))
5308 return 1;
5309 fallthrough;
42dbaa5a 5310 default:
d0bfb940
JK
5311 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
5312 kvm_run->ex.exception = ex_no;
5313 kvm_run->ex.error_code = error_code;
42dbaa5a 5314 break;
6aa8b732 5315 }
6aa8b732
AK
5316 return 0;
5317}
5318
f399e60c 5319static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
6aa8b732 5320{
1165f5fe 5321 ++vcpu->stat.irq_exits;
6aa8b732
AK
5322 return 1;
5323}
5324
851ba692 5325static int handle_triple_fault(struct kvm_vcpu *vcpu)
988ad74f 5326{
851ba692 5327 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
bbeac283 5328 vcpu->mmio_needed = 0;
988ad74f
AK
5329 return 0;
5330}
6aa8b732 5331
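/*
 * I/O instruction VM-Exit.  Decode the exit qualification (per the SDM's
 * encoding): bits 2:0 hold the access size minus one, bit 3 the direction
 * (set for IN), bit 4 the string-instruction flag, and bits 31:16 the port
 * number.  String ops go through the emulator, everything else takes the
 * fast PIO path.
 */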
851ba692 5332static int handle_io(struct kvm_vcpu *vcpu)
6aa8b732 5333{
bfdaab09 5334 unsigned long exit_qualification;
dca7f128 5335 int size, in, string;
039576c0 5336 unsigned port;
6aa8b732 5337
5addc235 5338 exit_qualification = vmx_get_exit_qual(vcpu);
039576c0 5339 string = (exit_qualification & 16) != 0;
e70669ab 5340
cf8f70bf 5341 ++vcpu->stat.io_exits;
e70669ab 5342
432baf60 5343 if (string)
60fc3d02 5344 return kvm_emulate_instruction(vcpu, 0);
e70669ab 5345
cf8f70bf
GN
5346 port = exit_qualification >> 16;
5347 size = (exit_qualification & 7) + 1;
432baf60 5348 in = (exit_qualification & 8) != 0;
cf8f70bf 5349
dca7f128 5350 return kvm_fast_pio(vcpu, size, port, in);
6aa8b732
AK
5351}
5352
102d8325
IM
5353static void
5354vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
5355{
5356 /*
5357 * Patch in the VMCALL instruction:
5358 */
5359 hypercall[0] = 0x0f;
5360 hypercall[1] = 0x01;
5361 hypercall[2] = 0xc1;
102d8325
IM
5362}
5363
0fa06071 5364/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
eeadf9e7
NHE
5365static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
5366{
eeadf9e7 5367 if (is_guest_mode(vcpu)) {
1a0d74e6
JK
5368 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5369 unsigned long orig_val = val;
5370
eeadf9e7
NHE
5371 /*
5372 * We get here when L2 changed cr0 in a way that did not change
5373 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
1a0d74e6
JK
5374 * but did change L0 shadowed bits. So we first calculate the
5375 * effective cr0 value that L1 would like to write into the
5376 * hardware. It consists of the L2-owned bits from the new
5377 * value combined with the L1-owned bits from L1's guest_cr0.
eeadf9e7 5378 */
1a0d74e6
JK
5379 val = (val & ~vmcs12->cr0_guest_host_mask) |
5380 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
5381
3899152c 5382 if (!nested_guest_cr0_valid(vcpu, val))
eeadf9e7 5383 return 1;
1a0d74e6
JK
5384
5385 if (kvm_set_cr0(vcpu, val))
5386 return 1;
5387 vmcs_writel(CR0_READ_SHADOW, orig_val);
eeadf9e7 5388 return 0;
1a0d74e6
JK
5389 } else {
5390 if (to_vmx(vcpu)->nested.vmxon &&
3899152c 5391 !nested_host_cr0_valid(vcpu, val))
1a0d74e6 5392 return 1;
3899152c 5393
eeadf9e7 5394 return kvm_set_cr0(vcpu, val);
1a0d74e6 5395 }
eeadf9e7
NHE
5396}
5397
5398static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
5399{
5400 if (is_guest_mode(vcpu)) {
1a0d74e6
JK
5401 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5402 unsigned long orig_val = val;
5403
5404 /* analogously to handle_set_cr0 */
5405 val = (val & ~vmcs12->cr4_guest_host_mask) |
5406 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
5407 if (kvm_set_cr4(vcpu, val))
eeadf9e7 5408 return 1;
1a0d74e6 5409 vmcs_writel(CR4_READ_SHADOW, orig_val);
eeadf9e7
NHE
5410 return 0;
5411 } else
5412 return kvm_set_cr4(vcpu, val);
5413}
5414
0367f205
PB
5415static int handle_desc(struct kvm_vcpu *vcpu)
5416{
5417 WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
60fc3d02 5418 return kvm_emulate_instruction(vcpu, 0);
0367f205
PB
5419}
5420
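/*
 * Control-register access VM-Exit.  The exit qualification encodes the CR
 * number (bits 3:0), the access type (bits 5:4: MOV to CR, MOV from CR,
 * CLTS, LMSW), the GPR operand (bits 11:8) and, for LMSW, the source data
 * starting at LMSW_SOURCE_DATA_SHIFT.  CLTS should never get here because
 * the guest always owns CR0.TS.
 */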
851ba692 5421static int handle_cr(struct kvm_vcpu *vcpu)
6aa8b732 5422{
229456fc 5423 unsigned long exit_qualification, val;
6aa8b732
AK
5424 int cr;
5425 int reg;
49a9b07e 5426 int err;
6affcbed 5427 int ret;
6aa8b732 5428
5addc235 5429 exit_qualification = vmx_get_exit_qual(vcpu);
6aa8b732
AK
5430 cr = exit_qualification & 15;
5431 reg = (exit_qualification >> 8) & 15;
5432 switch ((exit_qualification >> 4) & 3) {
5433 case 0: /* mov to cr */
27b4a9c4 5434 val = kvm_register_read(vcpu, reg);
229456fc 5435 trace_kvm_cr_write(cr, val);
6aa8b732
AK
5436 switch (cr) {
5437 case 0:
eeadf9e7 5438 err = handle_set_cr0(vcpu, val);
6affcbed 5439 return kvm_complete_insn_gp(vcpu, err);
6aa8b732 5440 case 3:
e1de91cc 5441 WARN_ON_ONCE(enable_unrestricted_guest);
67369273 5442
2390218b 5443 err = kvm_set_cr3(vcpu, val);
6affcbed 5444 return kvm_complete_insn_gp(vcpu, err);
6aa8b732 5445 case 4:
eeadf9e7 5446 err = handle_set_cr4(vcpu, val);
6affcbed 5447 return kvm_complete_insn_gp(vcpu, err);
0a5fff19
GN
5448 case 8: {
5449 u8 cr8_prev = kvm_get_cr8(vcpu);
1e32c079 5450 u8 cr8 = (u8)val;
eea1cff9 5451 err = kvm_set_cr8(vcpu, cr8);
6affcbed 5452 ret = kvm_complete_insn_gp(vcpu, err);
35754c98 5453 if (lapic_in_kernel(vcpu))
6affcbed 5454 return ret;
0a5fff19 5455 if (cr8_prev <= cr8)
6affcbed
KH
5456 return ret;
5457 /*
5458 * TODO: we might be squashing a
5459 * KVM_GUESTDBG_SINGLESTEP-triggered
5460 * KVM_EXIT_DEBUG here.
5461 */
851ba692 5462 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
0a5fff19
GN
5463 return 0;
5464 }
4b8073e4 5465 }
6aa8b732 5466 break;
25c4c276 5467 case 2: /* clts */
67369273
SC
5468 KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS");
5469 return -EIO;
6aa8b732
AK
5470 case 1: /*mov from cr*/
5471 switch (cr) {
5472 case 3:
e1de91cc 5473 WARN_ON_ONCE(enable_unrestricted_guest);
67369273 5474
9f8fe504
AK
5475 val = kvm_read_cr3(vcpu);
5476 kvm_register_write(vcpu, reg, val);
5477 trace_kvm_cr_read(cr, val);
6affcbed 5478 return kvm_skip_emulated_instruction(vcpu);
6aa8b732 5479 case 8:
229456fc
MT
5480 val = kvm_get_cr8(vcpu);
5481 kvm_register_write(vcpu, reg, val);
5482 trace_kvm_cr_read(cr, val);
6affcbed 5483 return kvm_skip_emulated_instruction(vcpu);
6aa8b732
AK
5484 }
5485 break;
5486 case 3: /* lmsw */
a1f83a74 5487 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
4d4ec087 5488 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
a1f83a74 5489 kvm_lmsw(vcpu, val);
6aa8b732 5490
6affcbed 5491 return kvm_skip_emulated_instruction(vcpu);
6aa8b732
AK
5492 default:
5493 break;
5494 }
851ba692 5495 vcpu->run->exit_reason = 0;
a737f256 5496 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
6aa8b732
AK
5497 (int)(exit_qualification >> 4) & 3, cr);
5498 return 0;
5499}
5500
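/*
 * Debug-register access VM-Exit.  The exit qualification supplies the DR
 * number, the direction (MOV from/to DR) and the GPR operand.  A #DB is
 * synthesized if DR7.GD is set; otherwise, once the host is no longer
 * debugging the guest, MOV-DR exiting is turned off and the debug registers
 * are reloaded lazily on the next exit.
 */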
851ba692 5501static int handle_dr(struct kvm_vcpu *vcpu)
6aa8b732 5502{
bfdaab09 5503 unsigned long exit_qualification;
16f8a6f9 5504 int dr, dr7, reg;
996ff542 5505 int err = 1;
16f8a6f9 5506
5addc235 5507 exit_qualification = vmx_get_exit_qual(vcpu);
16f8a6f9
NA
5508 dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
5509
5510 /* First, if DR does not exist, trigger UD */
5511 if (!kvm_require_dr(vcpu, dr))
5512 return 1;
6aa8b732 5513
ef2d488c 5514 if (vmx_get_cpl(vcpu) > 0)
996ff542
PB
5515 goto out;
5516
16f8a6f9
NA
5517 dr7 = vmcs_readl(GUEST_DR7);
5518 if (dr7 & DR7_GD) {
42dbaa5a
JK
5519 /*
5520 * As the vm-exit takes precedence over the debug trap, we
5521 * need to emulate the latter, either for the host or the
5522 * guest debugging itself.
5523 */
5524 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
9a3ecd5e 5525 vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW;
16f8a6f9 5526 vcpu->run->debug.arch.dr7 = dr7;
82b32774 5527 vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
851ba692
AK
5528 vcpu->run->debug.arch.exception = DB_VECTOR;
5529 vcpu->run->exit_reason = KVM_EXIT_DEBUG;
42dbaa5a
JK
5530 return 0;
5531 } else {
4d5523cf 5532 kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD);
42dbaa5a
JK
5533 return 1;
5534 }
5535 }
5536
81908bf4 5537 if (vcpu->guest_debug == 0) {
2183f564 5538 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
81908bf4
PB
5539
5540 /*
5541 * No more DR vmexits; force a reload of the debug registers
5542 * and reenter on this instruction. The next vmexit will
5543 * retrieve the full state of the debug registers.
5544 */
5545 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
5546 return 1;
5547 }
5548
42dbaa5a
JK
5549 reg = DEBUG_REG_ACCESS_REG(exit_qualification);
5550 if (exit_qualification & TYPE_MOV_FROM_DR) {
020df079 5551 unsigned long val;
4c4d563b 5552
29d6ca41 5553 kvm_get_dr(vcpu, dr, &val);
4c4d563b 5554 kvm_register_write(vcpu, reg, val);
996ff542
PB
5555 err = 0;
5556 } else {
27b4a9c4 5557 err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
996ff542 5558 }
4c4d563b 5559
996ff542
PB
5560out:
5561 return kvm_complete_insn_gp(vcpu, err);
6aa8b732
AK
5562}
5563
81908bf4
PB
5564static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
5565{
81908bf4
PB
5566 get_debugreg(vcpu->arch.db[0], 0);
5567 get_debugreg(vcpu->arch.db[1], 1);
5568 get_debugreg(vcpu->arch.db[2], 2);
5569 get_debugreg(vcpu->arch.db[3], 3);
5570 get_debugreg(vcpu->arch.dr6, 6);
5571 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
5572
5573 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
2183f564 5574 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
1ccb6f98
PB
5575
5576 /*
5577 * exc_debug expects dr6 to be cleared after it runs, avoid that it sees
5578 * a stale dr6 from the guest.
5579 */
5580 set_debugreg(DR6_RESERVED, 6);
81908bf4
PB
5581}
5582
020df079
GN
5583static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
5584{
5585 vmcs_writel(GUEST_DR7, val);
5586}
5587
851ba692 5588static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
6e5d865c 5589{
eb90f341 5590 kvm_apic_update_ppr(vcpu);
6e5d865c
YS
5591 return 1;
5592}
5593
851ba692 5594static int handle_interrupt_window(struct kvm_vcpu *vcpu)
6aa8b732 5595{
9dadc2f9 5596 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
2714d1d3 5597
3842d135
AK
5598 kvm_make_request(KVM_REQ_EVENT, vcpu);
5599
a26bf12a 5600 ++vcpu->stat.irq_window_exits;
6aa8b732
AK
5601 return 1;
5602}
5603
851ba692 5604static int handle_invlpg(struct kvm_vcpu *vcpu)
a7052897 5605{
5addc235 5606 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
a7052897
MT
5607
5608 kvm_mmu_invlpg(vcpu, exit_qualification);
6affcbed 5609 return kvm_skip_emulated_instruction(vcpu);
a7052897
MT
5610}
5611
851ba692 5612static int handle_apic_access(struct kvm_vcpu *vcpu)
f78e0e2e 5613{
58fbbf26 5614 if (likely(fasteoi)) {
5addc235 5615 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
58fbbf26
KT
5616 int access_type, offset;
5617
5618 access_type = exit_qualification & APIC_ACCESS_TYPE;
5619 offset = exit_qualification & APIC_ACCESS_OFFSET;
5620 /*
5621 * A sane guest uses MOV to write the EOI register, and the value
5622 * written is ignored.  Short-circuit that common case here to avoid
5623 * heavy instruction emulation.
5624 */
5625 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
5626 (offset == APIC_EOI)) {
5627 kvm_lapic_set_eoi(vcpu);
6affcbed 5628 return kvm_skip_emulated_instruction(vcpu);
58fbbf26
KT
5629 }
5630 }
60fc3d02 5631 return kvm_emulate_instruction(vcpu, 0);
f78e0e2e
SY
5632}
5633
c7c9c56c
YZ
5634static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
5635{
5addc235 5636 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
c7c9c56c
YZ
5637 int vector = exit_qualification & 0xff;
5638
5639 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
5640 kvm_apic_set_eoi_accelerated(vcpu, vector);
5641 return 1;
5642}
5643
83d4c286
YZ
5644static int handle_apic_write(struct kvm_vcpu *vcpu)
5645{
5addc235 5646 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
83d4c286 5647
b5ede3df
SC
5648 /*
5649 * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and
5650 * hardware has done any necessary aliasing, offset adjustments, etc...
5651 * for the access. I.e. the correct value has already been written to
5652 * the vAPIC page for the correct 16-byte chunk. KVM needs only to
5653 * retrieve the register value and emulate the access.
5654 */
5655 u32 offset = exit_qualification & 0xff0;
5656
83d4c286
YZ
5657 kvm_apic_write_nodecode(vcpu, offset);
5658 return 1;
5659}
5660
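/*
 * Task-switch VM-Exit.  Bits 15:0 of the exit qualification hold the new
 * TSS selector and bits 31:30 the task-switch source; any event that was
 * being delivered through the IDT when the switch triggered is uninjected
 * before the switch itself is emulated by kvm_task_switch().
 */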
851ba692 5661static int handle_task_switch(struct kvm_vcpu *vcpu)
37817f29 5662{
60637aac 5663 struct vcpu_vmx *vmx = to_vmx(vcpu);
37817f29 5664 unsigned long exit_qualification;
e269fb21
JK
5665 bool has_error_code = false;
5666 u32 error_code = 0;
37817f29 5667 u16 tss_selector;
7f3d35fd 5668 int reason, type, idt_v, idt_index;
64a7ec06
GN
5669
5670 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
7f3d35fd 5671 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
64a7ec06 5672 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
37817f29 5673
5addc235 5674 exit_qualification = vmx_get_exit_qual(vcpu);
37817f29
IE
5675
5676 reason = (u32)exit_qualification >> 30;
64a7ec06
GN
5677 if (reason == TASK_SWITCH_GATE && idt_v) {
5678 switch (type) {
5679 case INTR_TYPE_NMI_INTR:
5680 vcpu->arch.nmi_injected = false;
654f06fc 5681 vmx_set_nmi_mask(vcpu, true);
64a7ec06
GN
5682 break;
5683 case INTR_TYPE_EXT_INTR:
66fd3f7f 5684 case INTR_TYPE_SOFT_INTR:
64a7ec06
GN
5685 kvm_clear_interrupt_queue(vcpu);
5686 break;
5687 case INTR_TYPE_HARD_EXCEPTION:
e269fb21
JK
5688 if (vmx->idt_vectoring_info &
5689 VECTORING_INFO_DELIVER_CODE_MASK) {
5690 has_error_code = true;
5691 error_code =
5692 vmcs_read32(IDT_VECTORING_ERROR_CODE);
5693 }
df561f66 5694 fallthrough;
64a7ec06
GN
5695 case INTR_TYPE_SOFT_EXCEPTION:
5696 kvm_clear_exception_queue(vcpu);
5697 break;
5698 default:
5699 break;
5700 }
60637aac 5701 }
37817f29
IE
5702 tss_selector = exit_qualification;
5703
64a7ec06
GN
5704 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
5705 type != INTR_TYPE_EXT_INTR &&
5706 type != INTR_TYPE_NMI_INTR))
1957aa63 5707 WARN_ON(!skip_emulated_instruction(vcpu));
64a7ec06 5708
42dbaa5a
JK
5709 /*
5710 * TODO: What about debug traps on tss switch?
5711 * Are we supposed to inject them and update dr6?
5712 */
1051778f
SC
5713 return kvm_task_switch(vcpu, tss_selector,
5714 type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
60fc3d02 5715 reason, has_error_code, error_code);
37817f29
IE
5716}
5717
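/*
 * EPT violation VM-Exit.  The access and permission bits of the exit
 * qualification are translated into the PFERR_* error code consumed by the
 * generic MMU: read/write/fetch become USER/WRITE/FETCH, the RWX mask maps
 * to PRESENT, and the "GVA translated" bit selects GUEST_FINAL vs.
 * GUEST_PAGE.
 */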
851ba692 5718static int handle_ept_violation(struct kvm_vcpu *vcpu)
1439442c 5719{
f9c617f6 5720 unsigned long exit_qualification;
1439442c 5721 gpa_t gpa;
eebed243 5722 u64 error_code;
1439442c 5723
5addc235 5724 exit_qualification = vmx_get_exit_qual(vcpu);
1439442c 5725
0be9c7a8
GN
5726 /*
5727 * EPT violation happened while executing iret from NMI,
5728 * "blocked by NMI" bit has to be set before next VM entry.
5729 * There are errata that may cause this bit to not be set:
5730 * AAK134, BY25.
5731 */
bcd1c294 5732 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
d02fcf50 5733 enable_vnmi &&
bcd1c294 5734 (exit_qualification & INTR_INFO_UNBLOCK_NMI))
0be9c7a8
GN
5735 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
5736
1439442c 5737 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
faa03b39 5738 trace_kvm_page_fault(vcpu, gpa, exit_qualification);
4f5982a5 5739
27959a44 5740 /* Is it a read fault? */
ab22a473 5741 error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
27959a44
JS
5742 ? PFERR_USER_MASK : 0;
5743 /* Is it a write fault? */
ab22a473 5744 error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
27959a44
JS
5745 ? PFERR_WRITE_MASK : 0;
5746 /* Is it a fetch fault? */
ab22a473 5747 error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
27959a44
JS
5748 ? PFERR_FETCH_MASK : 0;
5749 /* ept page table entry is present? */
ca2a7c22 5750 error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
27959a44 5751 ? PFERR_PRESENT_MASK : 0;
4f5982a5 5752
10835602 5753 error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
eebed243 5754 PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
25d92081 5755
25d92081 5756 vcpu->arch.exit_qualification = exit_qualification;
1dbf5d68
MG
5757
5758 /*
5759 * Check that the GPA doesn't exceed physical memory limits, as that is
5760 * a guest page fault. We have to emulate the instruction here, because
5761 * if the illegal address is that of a paging structure, then
5762 * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we
5763 * would also use advanced VM-exit information for EPT violations to
5764 * reconstruct the page fault error code.
5765 */
c0623f5e 5766 if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa)))
1dbf5d68
MG
5767 return kvm_emulate_instruction(vcpu, 0);
5768
4f5982a5 5769 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
1439442c
SY
5770}
5771
851ba692 5772static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
68f89400 5773{
68f89400
MT
5774 gpa_t gpa;
5775
4d31d9ef 5776 if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
3c0c2ad1
SC
5777 return 1;
5778
9034e6e8
PB
5779 /*
5780 * A nested guest cannot optimize MMIO vmexits, because we have an
5781 * nGPA here instead of the required GPA.
5782 */
68f89400 5783 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
9034e6e8
PB
5784 if (!is_guest_mode(vcpu) &&
5785 !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
931c33b1 5786 trace_kvm_fast_mmio(gpa);
1957aa63 5787 return kvm_skip_emulated_instruction(vcpu);
68c3b4d1 5788 }
68f89400 5789
c75d0edc 5790 return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
68f89400
MT
5791}
5792
851ba692 5793static int handle_nmi_window(struct kvm_vcpu *vcpu)
f08864b4 5794{
67369273
SC
5795 if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))
5796 return -EIO;
5797
4e2a0bc5 5798 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
f08864b4 5799 ++vcpu->stat.nmi_window_exits;
3842d135 5800 kvm_make_request(KVM_REQ_EVENT, vcpu);
f08864b4
SY
5801
5802 return 1;
5803}
5804
fc4fad79
SC
5805static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
5806{
5807 struct vcpu_vmx *vmx = to_vmx(vcpu);
5808
5809 return vmx->emulation_required && !vmx->rmode.vm86_active &&
7709aba8 5810 (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected);
fc4fad79
SC
5811}
5812
80ced186 5813static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
ea953ef0 5814{
8b3079a5 5815 struct vcpu_vmx *vmx = to_vmx(vcpu);
49e9d557 5816 bool intr_window_requested;
b8405c18 5817 unsigned count = 130;
49e9d557 5818
2183f564 5819 intr_window_requested = exec_controls_get(vmx) &
9dadc2f9 5820 CPU_BASED_INTR_WINDOW_EXITING;
ea953ef0 5821
98eb2f8b 5822 while (vmx->emulation_required && count-- != 0) {
db438592 5823 if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
49e9d557
AK
5824 return handle_interrupt_window(&vmx->vcpu);
5825
72875d8a 5826 if (kvm_test_request(KVM_REQ_EVENT, vcpu))
de87dcdd
AK
5827 return 1;
5828
60fc3d02 5829 if (!kvm_emulate_instruction(vcpu, 0))
8fff2710 5830 return 0;
1d5a4d9b 5831
fc4fad79 5832 if (vmx_emulation_required_with_pending_exception(vcpu)) {
e615e355 5833 kvm_prepare_emulation_failure_exit(vcpu);
8fff2710
SC
5834 return 0;
5835 }
ea953ef0 5836
8d76c49e
GN
5837 if (vcpu->arch.halt_request) {
5838 vcpu->arch.halt_request = 0;
1460179d 5839 return kvm_emulate_halt_noskip(vcpu);
8d76c49e
GN
5840 }
5841
8fff2710 5842 /*
72c3c0fe
TG
5843 * Note, return 1 and not 0, vcpu_run() will invoke
5844 * xfer_to_guest_mode() which will create a proper return
5845 * code.
8fff2710 5846 */
72c3c0fe 5847 if (__xfer_to_guest_mode_work_pending())
8fff2710 5848 return 1;
ea953ef0
MG
5849 }
5850
8fff2710 5851 return 1;
b4a2d31d
RK
5852}
5853
fc4fad79
SC
5854static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
5855{
5856 if (vmx_emulation_required_with_pending_exception(vcpu)) {
5857 kvm_prepare_emulation_failure_exit(vcpu);
5858 return 0;
5859 }
5860
5861 return 1;
5862}
5863
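/*
 * Dynamic PLE window sizing: the window is grown on PAUSE-loop exits and
 * shrunk again by shrink_ple_window(), bounded by ple_window and
 * ple_window_max.  Updates reach the VMCS lazily, via ple_window_dirty,
 * on the next VM-Enter.
 */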
b4a2d31d
RK
5864static void grow_ple_window(struct kvm_vcpu *vcpu)
5865{
5866 struct vcpu_vmx *vmx = to_vmx(vcpu);
c5c5d6fa 5867 unsigned int old = vmx->ple_window;
b4a2d31d 5868
c8e88717
BM
5869 vmx->ple_window = __grow_ple_window(old, ple_window,
5870 ple_window_grow,
5871 ple_window_max);
b4a2d31d 5872
4f75bcc3 5873 if (vmx->ple_window != old) {
b4a2d31d 5874 vmx->ple_window_dirty = true;
4f75bcc3
PX
5875 trace_kvm_ple_window_update(vcpu->vcpu_id,
5876 vmx->ple_window, old);
5877 }
b4a2d31d
RK
5878}
5879
5880static void shrink_ple_window(struct kvm_vcpu *vcpu)
5881{
5882 struct vcpu_vmx *vmx = to_vmx(vcpu);
c5c5d6fa 5883 unsigned int old = vmx->ple_window;
b4a2d31d 5884
c8e88717
BM
5885 vmx->ple_window = __shrink_ple_window(old, ple_window,
5886 ple_window_shrink,
5887 ple_window);
b4a2d31d 5888
4f75bcc3 5889 if (vmx->ple_window != old) {
b4a2d31d 5890 vmx->ple_window_dirty = true;
4f75bcc3
PX
5891 trace_kvm_ple_window_update(vcpu->vcpu_id,
5892 vmx->ple_window, old);
5893 }
b4a2d31d
RK
5894}
5895
4b8d54f9
ZE
5896/*
5897 * Indicate that a vCPU is busy-waiting on a spinlock.  We do not enable
5898 * plain PAUSE exiting, so we only get here on CPUs with PAUSE-Loop-Exiting.
5899 */
9fb41ba8 5900static int handle_pause(struct kvm_vcpu *vcpu)
4b8d54f9 5901{
b31c114b 5902 if (!kvm_pause_in_guest(vcpu->kvm))
b4a2d31d
RK
5903 grow_ple_window(vcpu);
5904
de63ad4c
LM
5905 /*
5906 * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting"
5907 * VM-execution control is ignored if CPL > 0. OTOH, KVM
5908 * never sets PAUSE_EXITING and only sets PLE if supported,
5909 * so the vcpu must be CPL=0 if it gets a PAUSE exit.
5910 */
5911 kvm_vcpu_on_spin(vcpu, true);
6affcbed 5912 return kvm_skip_emulated_instruction(vcpu);
4b8d54f9
ZE
5913}
5914
5f3d45e7
MD
5915static int handle_monitor_trap(struct kvm_vcpu *vcpu)
5916{
5917 return 1;
5918}
5919
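/*
 * INVPCID VM-Exit.  The invalidation type comes from the GPR encoded in the
 * VMX instruction info field, while the 16-byte descriptor (PCID + linear
 * address) is read from the guest memory operand; the actual invalidation
 * is done by the generic kvm_handle_invpcid() helper.
 */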
55d2375e 5920static int handle_invpcid(struct kvm_vcpu *vcpu)
19677e32 5921{
55d2375e
SC
5922 u32 vmx_instruction_info;
5923 unsigned long type;
55d2375e 5924 gva_t gva;
55d2375e
SC
5925 struct {
5926 u64 pcid;
5927 u64 gla;
5928 } operand;
329bd56c 5929 int gpr_index;
f9eb4af6 5930
55d2375e 5931 if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
19677e32
BD
5932 kvm_queue_exception(vcpu, UD_VECTOR);
5933 return 1;
5934 }
5935
55d2375e 5936 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
329bd56c
VS
5937 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
5938 type = kvm_register_read(vcpu, gpr_index);
f9eb4af6 5939
55d2375e
SC
5940 /* According to the Intel instruction reference, the memory operand
5941 * is read even if it isn't needed (e.g., for type==all)
5942 */
5addc235 5943 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
fdb28619
EK
5944 vmx_instruction_info, false,
5945 sizeof(operand), &gva))
3573e22c
BD
5946 return 1;
5947
9715092f 5948 return kvm_handle_invpcid(vcpu, type, gva);
e29acc55
JM
5949}
5950
55d2375e 5951static int handle_pml_full(struct kvm_vcpu *vcpu)
ec378aee 5952{
55d2375e 5953 unsigned long exit_qualification;
b3897a49 5954
55d2375e 5955 trace_kvm_pml_full(vcpu->vcpu_id);
b3897a49 5956
5addc235 5957 exit_qualification = vmx_get_exit_qual(vcpu);
cbf71279
RK
5958
5959 /*
55d2375e
SC
5960 * PML buffer FULL happened while executing iret from NMI,
5961 * "blocked by NMI" bit has to be set before next VM entry.
cbf71279 5962 */
55d2375e
SC
5963 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5964 enable_vnmi &&
5965 (exit_qualification & INTR_INFO_UNBLOCK_NMI))
5966 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
5967 GUEST_INTR_STATE_NMI);
e49fcb8b 5968
55d2375e
SC
5969 /*
5970 * PML buffer already flushed at beginning of VMEXIT. Nothing to do
5971 * here, and there's no userspace involvement needed for PML.
5972 */
ec378aee
NHE
5973 return 1;
5974}
5975
26efe2fd 5976static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
8ca44e88 5977{
804939ea
SC
5978 struct vcpu_vmx *vmx = to_vmx(vcpu);
5979
5980 if (!vmx->req_immediate_exit &&
26efe2fd 5981 !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
55d2375e 5982 kvm_lapic_expired_hv_timer(vcpu);
26efe2fd
WL
5983 return EXIT_FASTPATH_REENTER_GUEST;
5984 }
5985
5986 return EXIT_FASTPATH_NONE;
5987}
804939ea 5988
26efe2fd
WL
5989static int handle_preemption_timer(struct kvm_vcpu *vcpu)
5990{
5991 handle_fastpath_preemption_timer(vcpu);
55d2375e 5992 return 1;
8ca44e88
DM
5993}
5994
55d2375e
SC
5995/*
5996 * When nested=0, all VMX instruction VM Exits filter here. The handlers
5997 * are overwritten by nested_vmx_setup() when nested=1.
5998 */
5999static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
b8bbab92 6000{
55d2375e
SC
6001 kvm_queue_exception(vcpu, UD_VECTOR);
6002 return 1;
b8bbab92
VK
6003}
6004
9798adbc 6005#ifndef CONFIG_X86_SGX_KVM
55d2375e 6006static int handle_encls(struct kvm_vcpu *vcpu)
e7953d7f 6007{
55d2375e 6008 /*
9798adbc
SC
6009 * SGX virtualization is disabled. There is no software enable bit for
6010 * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
6011 * the guest from executing ENCLS (when SGX is supported by hardware).
55d2375e
SC
6012 */
6013 kvm_queue_exception(vcpu, UD_VECTOR);
6014 return 1;
e7953d7f 6015}
9798adbc 6016#endif /* CONFIG_X86_SGX_KVM */
e7953d7f 6017
fe6b6bc8
CQ
6018static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
6019{
d61863c6
HX
6020 /*
6021 * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK
6022 * VM-Exits. Unconditionally set the flag here and leave the handling to
6023 * vmx_handle_exit().
6024 */
6025 to_vmx(vcpu)->exit_reason.bus_lock_detected = true;
6026 return 1;
fe6b6bc8
CQ
6027}
6028
2f4073e0
TX
6029static int handle_notify(struct kvm_vcpu *vcpu)
6030{
6031 unsigned long exit_qual = vmx_get_exit_qual(vcpu);
6032 bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID;
6033
6034 ++vcpu->stat.notify_window_exits;
6035
6036 /*
6037 * Notify VM exit happened while executing iret from NMI,
6038 * "blocked by NMI" bit has to be set before next VM entry.
6039 */
6040 if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI))
6041 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6042 GUEST_INTR_STATE_NMI);
6043
6044 if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER ||
6045 context_invalid) {
6046 vcpu->run->exit_reason = KVM_EXIT_NOTIFY;
6047 vcpu->run->notify.flags = context_invalid ?
6048 KVM_NOTIFY_CONTEXT_INVALID : 0;
6049 return 0;
6050 }
6051
6052 return 1;
6053}
6054
ec378aee 6055/*
55d2375e
SC
6056 * The exit handlers return 1 if the exit was handled fully and guest execution
6057 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
6058 * to be done to userspace and return 0.
ec378aee 6059 */
55d2375e 6060static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
95b5a48c 6061 [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi,
55d2375e
SC
6062 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
6063 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
6064 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
6065 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
6066 [EXIT_REASON_CR_ACCESS] = handle_cr,
6067 [EXIT_REASON_DR_ACCESS] = handle_dr,
f399e60c
AA
6068 [EXIT_REASON_CPUID] = kvm_emulate_cpuid,
6069 [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr,
6070 [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr,
9dadc2f9 6071 [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window,
f399e60c 6072 [EXIT_REASON_HLT] = kvm_emulate_halt,
5ff3a351 6073 [EXIT_REASON_INVD] = kvm_emulate_invd,
55d2375e 6074 [EXIT_REASON_INVLPG] = handle_invlpg,
c483c454 6075 [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc,
5ff3a351 6076 [EXIT_REASON_VMCALL] = kvm_emulate_hypercall,
55d2375e
SC
6077 [EXIT_REASON_VMCLEAR] = handle_vmx_instruction,
6078 [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction,
6079 [EXIT_REASON_VMPTRLD] = handle_vmx_instruction,
6080 [EXIT_REASON_VMPTRST] = handle_vmx_instruction,
6081 [EXIT_REASON_VMREAD] = handle_vmx_instruction,
6082 [EXIT_REASON_VMRESUME] = handle_vmx_instruction,
6083 [EXIT_REASON_VMWRITE] = handle_vmx_instruction,
6084 [EXIT_REASON_VMOFF] = handle_vmx_instruction,
6085 [EXIT_REASON_VMON] = handle_vmx_instruction,
6086 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
6087 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
6088 [EXIT_REASON_APIC_WRITE] = handle_apic_write,
6089 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
5ff3a351 6090 [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd,
92f9895c 6091 [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv,
55d2375e
SC
6092 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
6093 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
6094 [EXIT_REASON_GDTR_IDTR] = handle_desc,
6095 [EXIT_REASON_LDTR_TR] = handle_desc,
6096 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
6097 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
6098 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
5ff3a351 6099 [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait,
55d2375e 6100 [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,
5ff3a351 6101 [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor,
55d2375e
SC
6102 [EXIT_REASON_INVEPT] = handle_vmx_instruction,
6103 [EXIT_REASON_INVVPID] = handle_vmx_instruction,
5ff3a351
SC
6104 [EXIT_REASON_RDRAND] = kvm_handle_invalid_op,
6105 [EXIT_REASON_RDSEED] = kvm_handle_invalid_op,
55d2375e
SC
6106 [EXIT_REASON_PML_FULL] = handle_pml_full,
6107 [EXIT_REASON_INVPCID] = handle_invpcid,
6108 [EXIT_REASON_VMFUNC] = handle_vmx_instruction,
6109 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
6110 [EXIT_REASON_ENCLS] = handle_encls,
fe6b6bc8 6111 [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit,
2f4073e0 6112 [EXIT_REASON_NOTIFY] = handle_notify,
55d2375e 6113};
b8bbab92 6114
55d2375e
SC
6115static const int kvm_vmx_max_exit_handlers =
6116 ARRAY_SIZE(kvm_vmx_exit_handlers);
ec378aee 6117
0a62a031
DE
6118static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
6119 u64 *info1, u64 *info2,
235ba74f 6120 u32 *intr_info, u32 *error_code)
ec378aee 6121{
235ba74f
SC
6122 struct vcpu_vmx *vmx = to_vmx(vcpu);
6123
0a62a031 6124 *reason = vmx->exit_reason.full;
5addc235 6125 *info1 = vmx_get_exit_qual(vcpu);
8e533240 6126 if (!(vmx->exit_reason.failed_vmentry)) {
235ba74f
SC
6127 *info2 = vmx->idt_vectoring_info;
6128 *intr_info = vmx_get_intr_info(vcpu);
6129 if (is_exception_with_error_code(*intr_info))
6130 *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
6131 else
6132 *error_code = 0;
6133 } else {
6134 *info2 = 0;
6135 *intr_info = 0;
6136 *error_code = 0;
6137 }
ec378aee
NHE
6138}
6139
55d2375e 6140static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
27d6c865 6141{
55d2375e
SC
6142 if (vmx->pml_pg) {
6143 __free_page(vmx->pml_pg);
6144 vmx->pml_pg = NULL;
b8bbab92 6145 }
27d6c865
NHE
6146}
6147
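/*
 * Harvest the Page-Modification Log: hardware records the GPA of each
 * dirtied page in the PML buffer, decrementing GUEST_PML_INDEX from
 * PML_ENTITY_NUM - 1 toward zero.  Walk the consumed entries, mark the
 * corresponding gfns dirty, then rewind the index so the buffer can be
 * reused.
 */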
55d2375e 6148static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
cd232ad0 6149{
55d2375e
SC
6150 struct vcpu_vmx *vmx = to_vmx(vcpu);
6151 u64 *pml_buf;
6152 u16 pml_idx;
cd232ad0 6153
55d2375e 6154 pml_idx = vmcs_read16(GUEST_PML_INDEX);
cd232ad0 6155
55d2375e
SC
6156 /* Do nothing if PML buffer is empty */
6157 if (pml_idx == (PML_ENTITY_NUM - 1))
6158 return;
cd232ad0 6159
55d2375e
SC
6160 /* PML index always points to next available PML buffer entity */
6161 if (pml_idx >= PML_ENTITY_NUM)
6162 pml_idx = 0;
6163 else
6164 pml_idx++;
945679e3 6165
55d2375e
SC
6166 pml_buf = page_address(vmx->pml_pg);
6167 for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
6168 u64 gpa;
945679e3 6169
55d2375e
SC
6170 gpa = pml_buf[pml_idx];
6171 WARN_ON(gpa & (PAGE_SIZE - 1));
6172 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
945679e3
VK
6173 }
6174
55d2375e
SC
6175 /* reset PML index */
6176 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
945679e3
VK
6177}
6178
55d2375e 6179static void vmx_dump_sel(char *name, uint32_t sel)
49f705c5 6180{
55d2375e
SC
6181 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
6182 name, vmcs_read16(sel),
6183 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
6184 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
6185 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
49f705c5
NHE
6186}
6187
55d2375e 6188static void vmx_dump_dtsel(char *name, uint32_t limit)
a8bc284e 6189{
55d2375e
SC
6190 pr_err("%s limit=0x%08x, base=0x%016lx\n",
6191 name, vmcs_read32(limit),
6192 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
a8bc284e
JM
6193}
6194
8486039a
DE
6195static void vmx_dump_msrs(char *name, struct vmx_msrs *m)
6196{
6197 unsigned int i;
6198 struct vmx_msr_entry *e;
6199
6200 pr_err("MSR %s:\n", name);
6201 for (i = 0, e = m->val; i < m->nr; ++i, ++e)
6202 pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);
6203}
6204
0702a3cb 6205void dump_vmcs(struct kvm_vcpu *vcpu)
63846663 6206{
0702a3cb 6207 struct vcpu_vmx *vmx = to_vmx(vcpu);
6f2f8453
PB
6208 u32 vmentry_ctl, vmexit_ctl;
6209 u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
0b85baa5 6210 u64 tertiary_exec_control;
6f2f8453 6211 unsigned long cr4;
0702a3cb 6212 int efer_slot;
63846663 6213
6f2f8453
PB
6214 if (!dump_invalid_vmcs) {
6215 pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
6216 return;
6217 }
6218
6219 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
6220 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
6221 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
6222 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
6223 cr4 = vmcs_readl(GUEST_CR4);
0b85baa5 6224
55d2375e
SC
6225 if (cpu_has_secondary_exec_ctrls())
6226 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
0b85baa5
RH
6227 else
6228 secondary_exec_control = 0;
6229
6230 if (cpu_has_tertiary_exec_ctrls())
6231 tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL);
6232 else
6233 tertiary_exec_control = 0;
14c07ad8 6234
18f63b15
JM
6235 pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
6236 vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
55d2375e
SC
6237 pr_err("*** Guest State ***\n");
6238 pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
6239 vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
6240 vmcs_readl(CR0_GUEST_HOST_MASK));
6241 pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
6242 cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
6243 pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
d9e46d34 6244 if (cpu_has_vmx_ept()) {
55d2375e
SC
6245 pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
6246 vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
6247 pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
6248 vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
e9ac033e 6249 }
55d2375e
SC
6250 pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
6251 vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
6252 pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
6253 vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
6254 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
6255 vmcs_readl(GUEST_SYSENTER_ESP),
6256 vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
6257 vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);
6258 vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);
6259 vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);
6260 vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);
6261 vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);
6262 vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);
6263 vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
6264 vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
6265 vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
6266 vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
0702a3cb 6267 efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
5518da62 6268 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
699e1b2e 6269 pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
0702a3cb
DE
6270 else if (efer_slot >= 0)
6271 pr_err("EFER= 0x%016llx (autoload)\n",
6272 vmx->msr_autoload.guest.val[efer_slot].value);
6273 else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
6274 pr_err("EFER= 0x%016llx (effective)\n",
6275 vcpu->arch.efer | (EFER_LMA | EFER_LME));
6276 else
6277 pr_err("EFER= 0x%016llx (effective)\n",
6278 vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
5518da62 6279 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
699e1b2e 6280 pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
55d2375e
SC
6281 pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
6282 vmcs_read64(GUEST_IA32_DEBUGCTL),
6283 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
6284 if (cpu_has_load_perf_global_ctrl() &&
6285 vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
6286 pr_err("PerfGlobCtl = 0x%016llx\n",
6287 vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
6288 if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
6289 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
6290 pr_err("Interruptibility = %08x ActivityState = %08x\n",
6291 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
6292 vmcs_read32(GUEST_ACTIVITY_STATE));
6293 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
6294 pr_err("InterruptStatus = %04x\n",
6295 vmcs_read16(GUEST_INTR_STATUS));
8486039a
DE
6296 if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
6297 vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
6298 if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
6299 vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);
ff651cb6 6300
55d2375e
SC
6301 pr_err("*** Host State ***\n");
6302 pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
6303 vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
6304 pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
6305 vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
6306 vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
6307 vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
6308 vmcs_read16(HOST_TR_SELECTOR));
6309 pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
6310 vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
6311 vmcs_readl(HOST_TR_BASE));
6312 pr_err("GDTBase=%016lx IDTBase=%016lx\n",
6313 vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
6314 pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
6315 vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
6316 vmcs_readl(HOST_CR4));
6317 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
6318 vmcs_readl(HOST_IA32_SYSENTER_ESP),
6319 vmcs_read32(HOST_IA32_SYSENTER_CS),
6320 vmcs_readl(HOST_IA32_SYSENTER_EIP));
699e1b2e
DE
6321 if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
6322 pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
6323 if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
6324 pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
55d2375e
SC
6325 if (cpu_has_load_perf_global_ctrl() &&
6326 vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
6327 pr_err("PerfGlobCtl = 0x%016llx\n",
6328 vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
8486039a
DE
6329 if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
6330 vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
ff651cb6 6331
55d2375e 6332 pr_err("*** Control State ***\n");
0b85baa5
RH
6333 pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
6334 cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control);
6335 pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n",
6336 pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl);
55d2375e
SC
6337 pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
6338 vmcs_read32(EXCEPTION_BITMAP),
6339 vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
6340 vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
6341 pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
6342 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
6343 vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
6344 vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
6345 pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
6346 vmcs_read32(VM_EXIT_INTR_INFO),
6347 vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
6348 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
6349 pr_err(" reason=%08x qualification=%016lx\n",
6350 vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
6351 pr_err("IDTVectoring: info=%08x errcode=%08x\n",
6352 vmcs_read32(IDT_VECTORING_INFO_FIELD),
6353 vmcs_read32(IDT_VECTORING_ERROR_CODE));
6354 pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
6355 if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
6356 pr_err("TSC Multiplier = 0x%016llx\n",
6357 vmcs_read64(TSC_MULTIPLIER));
9d609649
PB
6358 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
6359 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
6360 u16 status = vmcs_read16(GUEST_INTR_STATUS);
6361 pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
6362 }
d6a85c32 6363 pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
9d609649
PB
6364 if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
6365 pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
d6a85c32 6366 pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
9d609649 6367 }
55d2375e
SC
6368 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
6369 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
6370 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
6371 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
55d2375e
SC
6372 if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
6373 pr_err("PLE Gap=%08x Window=%08x\n",
6374 vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
6375 if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
6376 pr_err("Virtual processor ID = 0x%04x\n",
6377 vmcs_read16(VIRTUAL_PROCESSOR_ID));
ff651cb6
WV
6378}
6379
55d2375e
SC
6380/*
6381 * The guest has exited. See if we can fix it or if we need userspace
6382 * assistance.
6383 */
fe6b6bc8 6384static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
ff651cb6 6385{
55d2375e 6386 struct vcpu_vmx *vmx = to_vmx(vcpu);
8e533240 6387 union vmx_exit_reason exit_reason = vmx->exit_reason;
55d2375e 6388 u32 vectoring_info = vmx->idt_vectoring_info;
8e533240 6389 u16 exit_handler_index;
ff651cb6 6390
55d2375e
SC
6391 /*
 6392 	 * Flush the PML buffer of logged GPAs; this keeps dirty_bitmap up to
 6393 	 * date. Another benefit: in kvm_vm_ioctl_get_dirty_log, before
 6394 	 * querying dirty_bitmap, KVM only needs to kick all vCPUs out of
 6395 	 * guest mode, as a vCPU in root mode must have already flushed its
c3bb9a20
SC
 6396 	 * PML buffer. Note, PML is never enabled in hardware while
 6397 	 * running L2.
55d2375e 6398 */
c3bb9a20 6399 if (enable_pml && !is_guest_mode(vcpu))
55d2375e 6400 vmx_flush_pml_buffer(vcpu);
1dc35dac 6401
db438592 6402 /*
cd0e615c
SC
6403 * KVM should never reach this point with a pending nested VM-Enter.
6404 * More specifically, short-circuiting VM-Entry to emulate L2 due to
6405 * invalid guest state should never happen as that means KVM knowingly
6406 * allowed a nested VM-Enter with an invalid vmcs12. More below.
db438592 6407 */
67369273
SC
6408 if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
6409 return -EIO;
db438592 6410
96b100cd 6411 if (is_guest_mode(vcpu)) {
c3bb9a20
SC
6412 /*
6413 * PML is never enabled when running L2, bail immediately if a
6414 * PML full exit occurs as something is horribly wrong.
6415 */
6416 if (exit_reason.basic == EXIT_REASON_PML_FULL)
6417 goto unexpected_vmexit;
6418
96b100cd
PB
6419 /*
6420 * The host physical addresses of some pages of guest memory
6421 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
6422 * Page). The CPU may write to these pages via their host
6423 * physical address while L2 is running, bypassing any
6424 * address-translation-based dirty tracking (e.g. EPT write
6425 * protection).
6426 *
6427 * Mark them dirty on every exit from L2 to prevent them from
6428 * getting out of sync with dirty tracking.
6429 */
6430 nested_mark_vmcs12_pages_dirty(vcpu);
6431
cd0e615c
SC
6432 /*
6433 * Synthesize a triple fault if L2 state is invalid. In normal
6434 * operation, nested VM-Enter rejects any attempt to enter L2
6435 * with invalid state. However, those checks are skipped if
6436 * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If
6437 * L2 state is invalid, it means either L1 modified SMRAM state
6438 * or userspace provided bad state. Synthesize TRIPLE_FAULT as
6439 * doing so is architecturally allowed in the RSM case, and is
6440 * the least awful solution for the userspace case without
6441 * risking false positives.
6442 */
6443 if (vmx->emulation_required) {
6444 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
6445 return 1;
6446 }
6447
f47baaed 6448 if (nested_vmx_reflect_vmexit(vcpu))
789afc5c 6449 return 1;
96b100cd 6450 }
9ed38ffa 6451
cd0e615c
SC
6452 /* If guest state is invalid, start emulating. L2 is handled above. */
6453 if (vmx->emulation_required)
6454 return handle_invalid_guest_state(vcpu);
6455
8e533240 6456 if (exit_reason.failed_vmentry) {
0702a3cb 6457 dump_vmcs(vcpu);
55d2375e
SC
6458 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6459 vcpu->run->fail_entry.hardware_entry_failure_reason
8e533240 6460 = exit_reason.full;
8a14fe4f 6461 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
55d2375e 6462 return 0;
9ed38ffa
LP
6463 }
6464
55d2375e 6465 if (unlikely(vmx->fail)) {
0702a3cb 6466 dump_vmcs(vcpu);
55d2375e
SC
6467 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6468 vcpu->run->fail_entry.hardware_entry_failure_reason
6469 = vmcs_read32(VM_INSTRUCTION_ERROR);
8a14fe4f 6470 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
55d2375e
SC
6471 return 0;
6472 }
50c28f21 6473
55d2375e
SC
6474 /*
6475 * Note:
 6476 	 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by an
 6477 	 * event delivery, since that indicates the guest is accessing MMIO.
 6478 	 * The VM-Exit would be triggered again after returning to the guest,
 6479 	 * causing an infinite loop.
6480 */
6481 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
8e533240
SC
6482 (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
6483 exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
6484 exit_reason.basic != EXIT_REASON_PML_FULL &&
6485 exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
2f4073e0
TX
6486 exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
6487 exit_reason.basic != EXIT_REASON_NOTIFY)) {
04c4f2ee
RW
6488 int ndata = 3;
6489
55d2375e
SC
6490 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6491 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
55d2375e 6492 vcpu->run->internal.data[0] = vectoring_info;
8e533240 6493 vcpu->run->internal.data[1] = exit_reason.full;
55d2375e 6494 vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
8e533240 6495 if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
04c4f2ee 6496 vcpu->run->internal.data[ndata++] =
55d2375e
SC
6497 vmcs_read64(GUEST_PHYSICAL_ADDRESS);
6498 }
04c4f2ee
RW
6499 vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
6500 vcpu->run->internal.ndata = ndata;
55d2375e
SC
6501 return 0;
6502 }
50c28f21 6503
55d2375e
SC
6504 if (unlikely(!enable_vnmi &&
6505 vmx->loaded_vmcs->soft_vnmi_blocked)) {
db438592 6506 if (!vmx_interrupt_blocked(vcpu)) {
55d2375e
SC
6507 vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6508 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
6509 vcpu->arch.nmi_pending) {
6510 /*
 6511 			 * This CPU doesn't help us find the end of an
 6512 			 * NMI-blocked window if the guest runs with IRQs
 6513 			 * disabled. So pull the trigger after 1 s of
 6514 			 * futile waiting, but inform the user about it.
6515 */
6516 printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
6517 "state on VCPU %d after 1 s timeout\n",
6518 __func__, vcpu->vcpu_id);
6519 vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6520 }
6521 }
50c28f21 6522
404d5d7b 6523 if (exit_fastpath != EXIT_FASTPATH_NONE)
1e9e2622 6524 return 1;
c926f2f7 6525
8e533240 6526 if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
c926f2f7 6527 goto unexpected_vmexit;
4289d272 6528#ifdef CONFIG_RETPOLINE
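	/*
	 * With CONFIG_RETPOLINE, an indirect call through the exit handler
	 * table is relatively expensive, so the most common exit reasons are
	 * dispatched with direct comparisons to avoid the retpolined branch.
	 */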
8e533240 6529 if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
c926f2f7 6530 return kvm_emulate_wrmsr(vcpu);
8e533240 6531 else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
c926f2f7 6532 return handle_preemption_timer(vcpu);
8e533240 6533 else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
c926f2f7 6534 return handle_interrupt_window(vcpu);
8e533240 6535 else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
c926f2f7 6536 return handle_external_interrupt(vcpu);
8e533240 6537 else if (exit_reason.basic == EXIT_REASON_HLT)
c926f2f7 6538 return kvm_emulate_halt(vcpu);
8e533240 6539 else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
c926f2f7 6540 return handle_ept_misconfig(vcpu);
4289d272 6541#endif
c926f2f7 6542
8e533240
SC
6543 exit_handler_index = array_index_nospec((u16)exit_reason.basic,
6544 kvm_vmx_max_exit_handlers);
6545 if (!kvm_vmx_exit_handlers[exit_handler_index])
c926f2f7
MP
6546 goto unexpected_vmexit;
6547
8e533240 6548 return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
c926f2f7
MP
6549
6550unexpected_vmexit:
8e533240
SC
6551 vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
6552 exit_reason.full);
0702a3cb 6553 dump_vmcs(vcpu);
c926f2f7
MP
6554 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6555 vcpu->run->internal.suberror =
7396d337 6556 KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
1aa561b1 6557 vcpu->run->internal.ndata = 2;
8e533240 6558 vcpu->run->internal.data[0] = exit_reason.full;
8a14fe4f 6559 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
c926f2f7 6560 return 0;
9ed38ffa
LP
6561}
6562
fe6b6bc8
CQ
6563static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
6564{
6565 int ret = __vmx_handle_exit(vcpu, exit_fastpath);
6566
6567 /*
d61863c6
HX
 6568 	 * Exit to user space when a bus lock is detected, to inform userspace
 6569 	 * that there is a bus lock in the guest.
fe6b6bc8
CQ
6570 */
6571 if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
6572 if (ret > 0)
6573 vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
6574
6575 vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
6576 return 0;
6577 }
6578 return ret;
6579}
6580
efebf0aa 6581/*
55d2375e
SC
 6582 * Software-based L1D cache flush, used when the microcode providing
 6583 * the cache control MSR is not loaded.
efebf0aa 6584 *
55d2375e
SC
 6585 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but
 6586 * flushing it requires reading in 64 KiB because the replacement algorithm
 6587 * is not exactly LRU. This could be sized at runtime via topology
 6588 * information, but as all relevant affected CPUs have a 32 KiB L1D cache
 6589 * there is no point in doing so.
efebf0aa 6590 */
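/*
 * For reference, a rough sketch of the sizing math used below, assuming
 * L1D_CACHE_ORDER == 4 and a 4 KiB PAGE_SIZE:
 *
 *	size = PAGE_SIZE << L1D_CACHE_ORDER = 16 * 4 KiB = 64 KiB
 *
 * i.e. twice the 32 KiB L1D, read in 64-byte (cache line) strides, which is
 * intended to displace all previously cached lines despite the pseudo-LRU
 * replacement policy.
 */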
3ebccdf3 6591static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
fe3ef05c 6592{
55d2375e 6593 int size = PAGE_SIZE << L1D_CACHE_ORDER;
25a2e4fe
PB
6594
6595 /*
f7081834 6596 * This code is only executed when the flush mode is 'cond' or
55d2375e 6597 * 'always'
25a2e4fe 6598 */
55d2375e
SC
6599 if (static_branch_likely(&vmx_l1d_flush_cond)) {
6600 bool flush_l1d;
25a2e4fe 6601
55d2375e
SC
6602 /*
6603 * Clear the per-vcpu flush bit, it gets set again
6604 * either from vcpu_run() or from one of the unsafe
6605 * VMEXIT handlers.
6606 */
6607 flush_l1d = vcpu->arch.l1tf_flush_l1d;
6608 vcpu->arch.l1tf_flush_l1d = false;
25a2e4fe 6609
55d2375e
SC
6610 /*
6611 * Clear the per-cpu flush bit, it gets set again from
6612 * the interrupt handlers.
6613 */
6614 flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
6615 kvm_clear_cpu_l1tf_flush_l1d();
25a2e4fe 6616
55d2375e
SC
6617 if (!flush_l1d)
6618 return;
6619 }
09abe320 6620
55d2375e 6621 vcpu->stat.l1d_flush++;
25a2e4fe 6622
55d2375e 6623 if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
3ebccdf3 6624 native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
55d2375e
SC
6625 return;
6626 }
25a2e4fe 6627
55d2375e
SC
6628 asm volatile(
6629 /* First ensure the pages are in the TLB */
6630 "xorl %%eax, %%eax\n"
6631 ".Lpopulate_tlb:\n\t"
6632 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6633 "addl $4096, %%eax\n\t"
6634 "cmpl %%eax, %[size]\n\t"
6635 "jne .Lpopulate_tlb\n\t"
6636 "xorl %%eax, %%eax\n\t"
6637 "cpuid\n\t"
6638 /* Now fill the cache */
6639 "xorl %%eax, %%eax\n"
6640 ".Lfill_cache:\n"
6641 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6642 "addl $64, %%eax\n\t"
6643 "cmpl %%eax, %[size]\n\t"
6644 "jne .Lfill_cache\n\t"
6645 "lfence\n"
6646 :: [flush_pages] "r" (vmx_l1d_flush_pages),
6647 [size] "r" (size)
6648 : "eax", "ebx", "ecx", "edx");
09abe320 6649}
25a2e4fe 6650
b6a7cc35 6651static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
09abe320 6652{
55d2375e 6653 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
132f4f7e 6654 int tpr_threshold;
09abe320 6655
55d2375e
SC
6656 if (is_guest_mode(vcpu) &&
6657 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
6658 return;
25a2e4fe 6659
132f4f7e 6660 tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
02d496cf
LA
6661 if (is_guest_mode(vcpu))
6662 to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
6663 else
6664 vmcs_write32(TPR_THRESHOLD, tpr_threshold);
8665c3f9
PB
6665}
6666
55d2375e 6667void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
8665c3f9 6668{
fe7f895d 6669 struct vcpu_vmx *vmx = to_vmx(vcpu);
55d2375e 6670 u32 sec_exec_control;
8665c3f9 6671
55d2375e
SC
6672 if (!lapic_in_kernel(vcpu))
6673 return;
9314006d 6674
55d2375e
SC
6675 if (!flexpriority_enabled &&
6676 !cpu_has_vmx_virtualize_x2apic_mode())
6677 return;
705699a1 6678
55d2375e
SC
6679 /* Postpone execution until vmcs01 is the current VMCS. */
6680 if (is_guest_mode(vcpu)) {
fe7f895d 6681 vmx->nested.change_vmcs01_virtual_apic_mode = true;
55d2375e 6682 return;
6beb7bd5 6683 }
fe3ef05c 6684
fe7f895d 6685 sec_exec_control = secondary_exec_controls_get(vmx);
55d2375e
SC
6686 sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
6687 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
09abe320 6688
55d2375e
SC
6689 switch (kvm_get_apic_mode(vcpu)) {
6690 case LAPIC_MODE_INVALID:
6691 WARN_ONCE(true, "Invalid local APIC state");
551912d2 6692 break;
55d2375e
SC
6693 case LAPIC_MODE_DISABLED:
6694 break;
6695 case LAPIC_MODE_XAPIC:
6696 if (flexpriority_enabled) {
6697 sec_exec_control |=
6698 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
4de1f9d4
SC
6699 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
6700
6701 /*
6702 * Flush the TLB, reloading the APIC access page will
6703 * only do so if its physical address has changed, but
6704 * the guest may have inserted a non-APIC mapping into
6705 * the TLB while the APIC access page was disabled.
6706 */
6707 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
55d2375e
SC
6708 }
6709 break;
6710 case LAPIC_MODE_X2APIC:
6711 if (cpu_has_vmx_virtualize_x2apic_mode())
6712 sec_exec_control |=
6713 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
6714 break;
09abe320 6715 }
fe7f895d 6716 secondary_exec_controls_set(vmx, sec_exec_control);
09abe320 6717
84ec8d2d 6718 vmx_update_msr_bitmap_x2apic(vcpu);
55d2375e 6719}
0238ea91 6720
a4148b7c 6721static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
55d2375e 6722{
a4148b7c
SC
6723 struct page *page;
6724
1196cb97
SC
6725 /* Defer reload until vmcs01 is the current VMCS. */
6726 if (is_guest_mode(vcpu)) {
6727 to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
6728 return;
55d2375e 6729 }
1196cb97 6730
4de1f9d4
SC
6731 if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
6732 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
6733 return;
6734
a4148b7c
SC
6735 page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
6736 if (is_error_page(page))
6737 return;
6738
6739 vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
1196cb97 6740 vmx_flush_tlb_current(vcpu);
a4148b7c
SC
6741
6742 /*
 6743 	 * Do not pin the APIC access page in memory; the MMU notifier
 6744 	 * will call us again if it is migrated or swapped out.
6745 */
6746 put_page(page);
55d2375e 6747}
fe3ef05c 6748
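/*
 * The 16-bit guest interrupt status field packs RVI in bits 7:0 and SVI in
 * bits 15:8, hence the 16-bit read-modify-write of GUEST_INTR_STATUS in the
 * two helpers below.
 */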
d39850f5 6749static void vmx_hwapic_isr_update(int max_isr)
55d2375e
SC
6750{
6751 u16 status;
6752 u8 old;
32c7acf0 6753
55d2375e
SC
6754 if (max_isr == -1)
6755 max_isr = 0;
608406e2 6756
55d2375e
SC
6757 status = vmcs_read16(GUEST_INTR_STATUS);
6758 old = status >> 8;
6759 if (max_isr != old) {
6760 status &= 0xff;
6761 status |= max_isr << 8;
6762 vmcs_write16(GUEST_INTR_STATUS, status);
6763 }
6764}
6beb7bd5 6765
55d2375e
SC
6766static void vmx_set_rvi(int vector)
6767{
6768 u16 status;
6769 u8 old;
0b665d30 6770
55d2375e
SC
6771 if (vector == -1)
6772 vector = 0;
fe3ef05c 6773
55d2375e
SC
6774 status = vmcs_read16(GUEST_INTR_STATUS);
6775 old = (u8)status & 0xff;
6776 if ((u8)vector != old) {
6777 status &= ~0xff;
6778 status |= (u8)vector;
6779 vmcs_write16(GUEST_INTR_STATUS, status);
09abe320 6780 }
55d2375e 6781}
09abe320 6782
55d2375e
SC
6783static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
6784{
09abe320 6785 /*
55d2375e
SC
 6786 	 * When running L2, updating RVI is only relevant if vmcs12 has
 6787 	 * virtual-interrupt-delivery enabled.  However, that can only
 6788 	 * be enabled when L1 also intercepts external interrupts, and
 6789 	 * in that case we should not update vmcs02's RVI but instead
 6790 	 * intercept the interrupt.  Therefore, do nothing when running
 6791 	 * L2.
fe3ef05c 6792 */
55d2375e
SC
6793 if (!is_guest_mode(vcpu))
6794 vmx_set_rvi(max_irr);
6795}
fe3ef05c 6796
55d2375e
SC
6797static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
6798{
6799 struct vcpu_vmx *vmx = to_vmx(vcpu);
6800 int max_irr;
7e1901f6 6801 bool got_posted_interrupt;
a7c0b07d 6802
7e1901f6 6803 if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
67369273
SC
6804 return -EIO;
6805
55d2375e
SC
6806 if (pi_test_on(&vmx->pi_desc)) {
6807 pi_clear_on(&vmx->pi_desc);
6808 /*
d9ff2744 6809 * IOMMU can write to PID.ON, so the barrier matters even on UP.
55d2375e
SC
6810 * But on x86 this is just a compiler barrier anyway.
6811 */
6812 smp_mb__after_atomic();
7e1901f6 6813 got_posted_interrupt =
55d2375e 6814 kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
55d2375e
SC
6815 } else {
6816 max_irr = kvm_lapic_find_highest_irr(vcpu);
7e1901f6 6817 got_posted_interrupt = false;
a7c0b07d 6818 }
7e1901f6
PB
6819
6820 /*
6821 * Newly recognized interrupts are injected via either virtual interrupt
6822 * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is
6823 * disabled in two cases:
6824 *
6825 * 1) If L2 is running and the vCPU has a new pending interrupt. If L1
6826 * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
6827 * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected
6828 * into L2, but KVM doesn't use virtual interrupt delivery to inject
6829 * interrupts into L2, and so KVM_REQ_EVENT is again needed.
6830 *
6831 * 2) If APICv is disabled for this vCPU, assigned devices may still
6832 * attempt to post interrupts. The posted interrupt vector will cause
6833 * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
6834 */
6835 if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
6836 vmx_set_rvi(max_irr);
6837 else if (got_posted_interrupt)
6838 kvm_make_request(KVM_REQ_EVENT, vcpu);
6839
55d2375e
SC
6840 return max_irr;
6841}
a7c0b07d 6842
55d2375e
SC
6843static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
6844{
6845 if (!kvm_vcpu_apicv_active(vcpu))
6846 return;
25a2e4fe 6847
55d2375e
SC
6848 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
6849 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
6850 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
6851 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
8665c3f9
PB
6852}
6853
55d2375e 6854static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
8665c3f9
PB
6855{
6856 struct vcpu_vmx *vmx = to_vmx(vcpu);
9d1887ef 6857
55d2375e
SC
6858 pi_clear_on(&vmx->pi_desc);
6859 memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
6860}
8665c3f9 6861
535f7ef2
SC
6862void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
6863
a217a659
LJ
6864static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
6865 unsigned long entry)
1a5488ef 6866{
db215756
SC
6867 bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist;
6868
6869 kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ);
a217a659 6870 vmx_do_interrupt_nmi_irqoff(entry);
1a5488ef
SC
6871 kvm_after_interrupt(vcpu);
6872}
6873
ec5be88a
JL
6874static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
6875{
6876 /*
 6877 	 * Save xfd_err to guest_fpu before interrupts are enabled, so the
 6878 	 * MSR value is not clobbered by host activity before the guest
 6879 	 * has a chance to consume it.
6880 *
6881 * Do not blindly read xfd_err here, since this exception might
6882 * be caused by L1 interception on a platform which doesn't
6883 * support xfd at all.
6884 *
6885 * Do it conditionally upon guest_fpu::xfd. xfd_err matters
6886 * only when xfd contains a non-zero value.
6887 *
6888 * Queuing exception is done in vmx_handle_exit. See comment there.
6889 */
6890 if (vcpu->arch.guest_fpu.fpstate->xfd)
6891 rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
6892}
6893
95b5a48c 6894static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
55d2375e 6895{
a217a659 6896 const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
87915858 6897 u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
fe3ef05c 6898
55d2375e 6899 /* if exit due to PF check for async PF */
1a5488ef 6900 if (is_page_fault(intr_info))
68fd66f1 6901 vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
ec5be88a
JL
6902 /* if exit due to NM, handle before interrupts are enabled */
6903 else if (is_nm_fault(intr_info))
6904 handle_nm_fault_irqoff(&vmx->vcpu);
55d2375e 6905 /* Handle machine checks before interrupts are enabled */
1a5488ef 6906 else if (is_machine_check(intr_info))
55d2375e 6907 kvm_machine_check();
55d2375e 6908 /* We need to handle NMIs before interrupts are enabled */
1a5488ef 6909 else if (is_nmi(intr_info))
a217a659 6910 handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
55d2375e 6911}
fe3ef05c 6912
95b5a48c 6913static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
55d2375e 6914{
87915858 6915 u32 intr_info = vmx_get_intr_info(vcpu);
a217a659
LJ
6916 unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
6917 gate_desc *desc = (gate_desc *)host_idt_base + vector;
fe3ef05c 6918
67369273 6919 if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
8d20bd63 6920 "unexpected VM-Exit interrupt info: 0x%x", intr_info))
49def500
SC
6921 return;
6922
a217a659 6923 handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
6cd88243 6924 vcpu->arch.at_instruction_boundary = true;
55d2375e 6925}
95b5a48c 6926
a9ab13ff 6927static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
95b5a48c
SC
6928{
6929 struct vcpu_vmx *vmx = to_vmx(vcpu);
6930
81b4b56d
ML
6931 if (vmx->emulation_required)
6932 return;
6933
8e533240 6934 if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
95b5a48c 6935 handle_external_interrupt_irqoff(vcpu);
8e533240 6936 else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
95b5a48c
SC
6937 handle_exception_nmi_irqoff(vmx);
6938}
5a6a9748 6939
5719455f
TL
6940/*
6941 * The kvm parameter can be NULL (module initialization, or invocation before
6942 * VM creation). Be sure to check the kvm parameter before using it.
6943 */
6944static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
55d2375e
SC
6945{
6946 switch (index) {
6947 case MSR_IA32_SMBASE:
4b8e1b32
PB
6948 if (!IS_ENABLED(CONFIG_KVM_SMM))
6949 return false;
55d2375e
SC
6950 /*
6951 * We cannot do SMM unless we can run the guest in big
6952 * real mode.
6953 */
6954 return enable_unrestricted_guest || emulate_invalid_guest_state;
95c5c7c7
PB
6955 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
6956 return nested;
55d2375e 6957 case MSR_AMD64_VIRT_SPEC_CTRL:
5228eb96 6958 case MSR_AMD64_TSC_RATIO:
55d2375e
SC
6959 /* This is AMD only. */
6960 return false;
6961 default:
6962 return true;
3184a995 6963 }
55d2375e 6964}
2bb8cafe 6965
55d2375e
SC
6966static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
6967{
6968 u32 exit_intr_info;
6969 bool unblock_nmi;
6970 u8 vector;
6971 bool idtv_info_valid;
7ca29de2 6972
55d2375e 6973 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
feaf0c7d 6974
55d2375e
SC
6975 if (enable_vnmi) {
6976 if (vmx->loaded_vmcs->nmi_known_unmasked)
6977 return;
87915858
SC
6978
6979 exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
55d2375e
SC
6980 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
6981 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
6982 /*
6983 * SDM 3: 27.7.1.2 (September 2008)
6984 * Re-set bit "block by NMI" before VM entry if vmexit caused by
6985 * a guest IRET fault.
6986 * SDM 3: 23.2.2 (September 2008)
6987 * Bit 12 is undefined in any of the following cases:
6988 * If the VM exit sets the valid bit in the IDT-vectoring
6989 * information field.
6990 * If the VM exit is due to a double fault.
6991 */
6992 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
6993 vector != DF_VECTOR && !idtv_info_valid)
6994 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6995 GUEST_INTR_STATE_NMI);
6996 else
6997 vmx->loaded_vmcs->nmi_known_unmasked =
6998 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
6999 & GUEST_INTR_STATE_NMI);
7000 } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
7001 vmx->loaded_vmcs->vnmi_blocked_time +=
7002 ktime_to_ns(ktime_sub(ktime_get(),
7003 vmx->loaded_vmcs->entry_time));
fe3ef05c
NHE
7004}
7005
55d2375e
SC
7006static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
7007 u32 idt_vectoring_info,
7008 int instr_len_field,
7009 int error_code_field)
0c7f650e 7010{
55d2375e
SC
7011 u8 vector;
7012 int type;
7013 bool idtv_info_valid;
0c7f650e 7014
55d2375e 7015 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
0c7f650e 7016
55d2375e
SC
7017 vcpu->arch.nmi_injected = false;
7018 kvm_clear_exception_queue(vcpu);
7019 kvm_clear_interrupt_queue(vcpu);
27c42a1b 7020
55d2375e
SC
7021 if (!idtv_info_valid)
7022 return;
c7c2c709 7023
55d2375e 7024 kvm_make_request(KVM_REQ_EVENT, vcpu);
ca0bde28 7025
55d2375e
SC
7026 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
7027 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
64a919f7 7028
55d2375e
SC
7029 switch (type) {
7030 case INTR_TYPE_NMI_INTR:
7031 vcpu->arch.nmi_injected = true;
7032 /*
7033 * SDM 3: 27.7.1.2 (September 2008)
 7034 		 * Clear bit "block by NMI" before VM entry if an NMI
 7035 		 * delivery faulted.
7036 */
7037 vmx_set_nmi_mask(vcpu, false);
7038 break;
7039 case INTR_TYPE_SOFT_EXCEPTION:
7040 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
df561f66 7041 fallthrough;
55d2375e
SC
7042 case INTR_TYPE_HARD_EXCEPTION:
7043 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
7044 u32 err = vmcs_read32(error_code_field);
7045 kvm_requeue_exception_e(vcpu, vector, err);
7046 } else
7047 kvm_requeue_exception(vcpu, vector);
7048 break;
7049 case INTR_TYPE_SOFT_INTR:
7050 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
df561f66 7051 fallthrough;
55d2375e
SC
7052 case INTR_TYPE_EXT_INTR:
7053 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
7054 break;
7055 default:
7056 break;
0447378a 7057 }
ca0bde28
JM
7058}
7059
55d2375e 7060static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
f145d90d 7061{
55d2375e
SC
7062 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
7063 VM_EXIT_INSTRUCTION_LEN,
7064 IDT_VECTORING_ERROR_CODE);
f145d90d
LA
7065}
7066
55d2375e 7067static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
ca0bde28 7068{
55d2375e
SC
7069 __vmx_complete_interrupts(vcpu,
7070 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
7071 VM_ENTRY_INSTRUCTION_LEN,
7072 VM_ENTRY_EXCEPTION_ERROR_CODE);
f1b026a3 7073
55d2375e 7074 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
ca0bde28
JM
7075}
7076
55d2375e 7077static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
52017608 7078{
55d2375e
SC
7079 int i, nr_msrs;
7080 struct perf_guest_switch_msr *msrs;
39a4d779 7081 struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu);
7c177938 7082
85425032
LX
7083 pmu->host_cross_mapped_mask = 0;
7084 if (pmu->pebs_enable & pmu->global_ctrl)
7085 intel_pmu_cross_mapped_check(pmu);
7c177938 7086
c8e2fe13 7087 /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */
39a4d779 7088 msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu);
55d2375e
SC
7089 if (!msrs)
7090 return;
f1b026a3 7091
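	/*
	 * Use the VM-entry/VM-exit MSR autoload lists only for perf MSRs whose
	 * guest value differs from the host value; MSRs with identical values
	 * need no switching and are dropped from the lists.
	 */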
55d2375e
SC
7092 for (i = 0; i < nr_msrs; i++)
7093 if (msrs[i].host == msrs[i].guest)
7094 clear_atomic_switch_msr(vmx, msrs[i].msr);
7095 else
7096 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
7097 msrs[i].host, false);
ca0bde28 7098}
52017608 7099
55d2375e 7100static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
858e25c0
JM
7101{
7102 struct vcpu_vmx *vmx = to_vmx(vcpu);
55d2375e
SC
7103 u64 tscl;
7104 u32 delta_tsc;
52017608 7105
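	/*
	 * Three cases: an immediate-exit request programs the preemption timer
	 * to fire right away; an armed deadline programs the remaining TSC
	 * delta, scaled down by the timer's rate shift
	 * (cpu_preemption_timer_multi); otherwise the timer is "soft disabled"
	 * by programming the maximum value once.
	 */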
55d2375e 7106 if (vmx->req_immediate_exit) {
804939ea
SC
7107 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
7108 vmx->loaded_vmcs->hv_timer_soft_disabled = false;
7109 } else if (vmx->hv_deadline_tsc != -1) {
55d2375e
SC
7110 tscl = rdtsc();
7111 if (vmx->hv_deadline_tsc > tscl)
7112 /* set_hv_timer ensures the delta fits in 32-bits */
7113 delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
7114 cpu_preemption_timer_multi);
7115 else
7116 delta_tsc = 0;
858e25c0 7117
804939ea
SC
7118 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
7119 vmx->loaded_vmcs->hv_timer_soft_disabled = false;
7120 } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
7121 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
7122 vmx->loaded_vmcs->hv_timer_soft_disabled = true;
7f7f1ba3 7123 }
858e25c0
JM
7124}
7125
3ebccdf3 7126void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
ca0bde28 7127{
c09b03eb
SC
7128 if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
7129 vmx->loaded_vmcs->host_state.rsp = host_rsp;
7130 vmcs_writel(HOST_RSP, host_rsp);
7131 }
5ad6ece8 7132}
5f3d5799 7133
fc02735b
JP
7134void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
7135 unsigned int flags)
7136{
7137 u64 hostval = this_cpu_read(x86_spec_ctrl_current);
7138
7139 if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
7140 return;
7141
7142 if (flags & VMX_RUN_SAVE_SPEC_CTRL)
7143 vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL);
7144
7145 /*
7146 * If the guest/host SPEC_CTRL values differ, restore the host value.
bea7e31a
JP
7147 *
7148 * For legacy IBRS, the IBRS bit always needs to be written after
7149 * transitioning from a less privileged predictor mode, regardless of
7150 * whether the guest/host values differ.
fc02735b 7151 */
bea7e31a
JP
7152 if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
7153 vmx->spec_ctrl != hostval)
fc02735b
JP
7154 native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval);
7155
7156 barrier_nospec();
7157}
7158
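/*
 * Fastpath handling for exits that can be serviced while IRQs are still
 * disabled, without going through the full exit-handling path.
 */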
404d5d7b 7159static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
dcf068da 7160{
8e533240 7161 switch (to_vmx(vcpu)->exit_reason.basic) {
dcf068da
WL
7162 case EXIT_REASON_MSR_WRITE:
7163 return handle_fastpath_set_msr_irqoff(vcpu);
26efe2fd
WL
7164 case EXIT_REASON_PREEMPTION_TIMER:
7165 return handle_fastpath_preemption_timer(vcpu);
dcf068da
WL
7166 default:
7167 return EXIT_FASTPATH_NONE;
7168 }
7169}
7170
3ebccdf3 7171static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
bb066506
JP
7172 struct vcpu_vmx *vmx,
7173 unsigned long flags)
3ebccdf3 7174{
b2d2af7e 7175 guest_state_enter_irqoff();
3ebccdf3
TG
7176
7177 /* L1D Flush includes CPU buffer clear to mitigate MDS */
7178 if (static_branch_unlikely(&vmx_l1d_should_flush))
7179 vmx_l1d_flush(vcpu);
7180 else if (static_branch_unlikely(&mds_user_clear))
7181 mds_clear_cpu_buffers();
8cb861e9
PG
7182 else if (static_branch_unlikely(&mmio_stale_data_clear) &&
7183 kvm_arch_has_assigned_device(vcpu->kvm))
7184 mds_clear_cpu_buffers();
3ebccdf3 7185
027bbb88 7186 vmx_disable_fb_clear(vmx);
3ebccdf3 7187
2245d398
TG
7188 if (vcpu->arch.cr2 != native_read_cr2())
7189 native_write_cr2(vcpu->arch.cr2);
3ebccdf3
TG
7190
7191 vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
bb066506 7192 flags);
3ebccdf3 7193
2245d398 7194 vcpu->arch.cr2 = native_read_cr2();
3ebccdf3 7195
027bbb88
PG
7196 vmx_enable_fb_clear(vmx);
7197
b2d2af7e 7198 guest_state_exit_irqoff();
3ebccdf3
TG
7199}
7200
404d5d7b 7201static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
5ad6ece8
SC
7202{
7203 struct vcpu_vmx *vmx = to_vmx(vcpu);
1a715810 7204 unsigned long cr3, cr4;
5ad6ece8
SC
7205
7206 /* Record the guest's net vcpu time for enforced NMI injections. */
7207 if (unlikely(!enable_vnmi &&
7208 vmx->loaded_vmcs->soft_vnmi_blocked))
7209 vmx->loaded_vmcs->entry_time = ktime_get();
7210
c42dec14
ML
7211 /*
7212 * Don't enter VMX if guest state is invalid, let the exit handler
7213 * start emulation until we arrive back to a valid state. Synthesize a
7214 * consistency check VM-Exit due to invalid guest state and bail.
7215 */
7216 if (unlikely(vmx->emulation_required)) {
a80dfc02 7217 vmx->fail = 0;
c8607e4a 7218
c42dec14
ML
7219 vmx->exit_reason.full = EXIT_REASON_INVALID_STATE;
7220 vmx->exit_reason.failed_vmentry = 1;
7221 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
7222 vmx->exit_qualification = ENTRY_FAIL_DEFAULT;
7223 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
7224 vmx->exit_intr_info = 0;
a9ab13ff 7225 return EXIT_FASTPATH_NONE;
c42dec14 7226 }
5ad6ece8 7227
d95df951
LB
7228 trace_kvm_entry(vcpu);
7229
5ad6ece8
SC
7230 if (vmx->ple_window_dirty) {
7231 vmx->ple_window_dirty = false;
7232 vmcs_write32(PLE_WINDOW, vmx->ple_window);
7233 }
7234
c9dfd3fb 7235 /*
7236 * We did this in prepare_switch_to_guest, because it needs to
7237 * be within srcu_read_lock.
7238 */
7239 WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);
5ad6ece8 7240
cb3c1e2f 7241 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
5ad6ece8 7242 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
cb3c1e2f 7243 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
5ad6ece8 7244 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
41e68b69 7245 vcpu->arch.regs_dirty = 0;
5ad6ece8 7246
1a715810
SC
7247 /*
7248 * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately
7249 * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
7250 * it switches back to the current->mm, which can occur in KVM context
7251 * when switching to a temporary mm to patch kernel code, e.g. if KVM
7252 * toggles a static key while handling a VM-Exit.
7253 */
7254 cr3 = __get_current_cr3_fast();
7255 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
7256 vmcs_writel(HOST_CR3, cr3);
7257 vmx->loaded_vmcs->host_state.cr3 = cr3;
7258 }
7259
5ad6ece8
SC
7260 cr4 = cr4_read_shadow();
7261 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
7262 vmcs_writel(HOST_CR4, cr4);
7263 vmx->loaded_vmcs->host_state.cr4 = cr4;
7264 }
7265
375e28ff
PB
7266 /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
7267 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
7268 set_debugreg(vcpu->arch.dr6, 6);
7269
5ad6ece8
SC
7270 /* When single-stepping over STI and MOV SS, we must clear the
7271 * corresponding interruptibility bits in the guest state. Otherwise
7272 * vmentry fails as it then expects bit 14 (BS) in pending debug
7273 * exceptions being set, but that's not correct for the guest debugging
7274 * case. */
7275 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
7276 vmx_set_interrupt_shadow(vcpu, 0);
7277
139a12cf 7278 kvm_load_guest_xsave_state(vcpu);
1811d979 7279
5ad6ece8
SC
7280 pt_guest_enter(vmx);
7281
49097762 7282 atomic_switch_perf_msrs(vmx);
1b5ac322
LX
7283 if (intel_pmu_lbr_is_enabled(vcpu))
7284 vmx_passthrough_lbr_msrs(vcpu);
5ad6ece8 7285
804939ea
SC
7286 if (enable_preemption_timer)
7287 vmx_update_hv_timer(vcpu);
5ad6ece8 7288
010fd37f 7289 kvm_wait_lapic_expire(vcpu);
b6c4bc65 7290
3ebccdf3 7291 /* The actual VMENTER/EXIT is in the .noinstr.text section. */
bb066506 7292 vmx_vcpu_enter_exit(vcpu, vmx, __vmx_vcpu_run_flags(vmx));
b6b8a145 7293
55d2375e 7294 /* All fields are clean at this point */
9ff5e030 7295 if (static_branch_unlikely(&enable_evmcs)) {
55d2375e
SC
7296 current_evmcs->hv_clean_fields |=
7297 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
f4124500 7298
f2bc14b6 7299 current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
9ff5e030 7300 }
6f6a657c 7301
55d2375e
SC
7302 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
7303 if (vmx->host_debugctlmsr)
7304 update_debugctlmsr(vmx->host_debugctlmsr);
f4124500 7305
55d2375e
SC
7306#ifndef CONFIG_X86_64
7307 /*
7308 * The sysexit path does not restore ds/es, so we must set them to
7309 * a reasonable value ourselves.
7310 *
7311 * We can't defer this to vmx_prepare_switch_to_host() since that
 7312 	 * function may be executed in interrupt context, which saves and
 7313 	 * restores segments around it, nullifying its effect.
7314 */
7315 loadsegment(ds, __USER_DS);
7316 loadsegment(es, __USER_DS);
7317#endif
4704d0be 7318
41e68b69 7319 vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
7854cbca 7320
2ef444f1
CP
7321 pt_guest_exit(vmx);
7322
139a12cf 7323 kvm_load_host_xsave_state(vcpu);
1811d979 7324
b93af02c
KS
7325 if (is_guest_mode(vcpu)) {
7326 /*
7327 * Track VMLAUNCH/VMRESUME that have made past guest state
7328 * checking.
7329 */
7330 if (vmx->nested.nested_run_pending &&
7331 !vmx->exit_reason.failed_vmentry)
7332 ++vcpu->stat.nested_run;
7333
7334 vmx->nested.nested_run_pending = 0;
7335 }
7336
55d2375e 7337 vmx->idt_vectoring_info = 0;
119a9c01 7338
873e1da1 7339 if (unlikely(vmx->fail)) {
8e533240 7340 vmx->exit_reason.full = 0xdead;
a9ab13ff 7341 return EXIT_FASTPATH_NONE;
873e1da1
SC
7342 }
7343
8e533240
SC
7344 vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
7345 if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
beb8d93b
SC
7346 kvm_machine_check();
7347
f5c59b57
ML
7348 if (likely(!vmx->exit_reason.failed_vmentry))
7349 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
7350
0a62a031 7351 trace_kvm_exit(vcpu, KVM_ISA_VMX);
dcf068da 7352
8e533240 7353 if (unlikely(vmx->exit_reason.failed_vmentry))
a9ab13ff
WL
7354 return EXIT_FASTPATH_NONE;
7355
55d2375e 7356 vmx->loaded_vmcs->launched = 1;
c18911a2 7357
55d2375e
SC
7358 vmx_recover_nmi_blocking(vmx);
7359 vmx_complete_interrupts(vmx);
a9ab13ff 7360
dcf068da
WL
7361 if (is_guest_mode(vcpu))
7362 return EXIT_FASTPATH_NONE;
7363
d89d04ab 7364 return vmx_exit_handlers_fastpath(vcpu);
55d2375e 7365}
2996fca0 7366
58fccda4 7367static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
cf8b84f4 7368{
55d2375e 7369 struct vcpu_vmx *vmx = to_vmx(vcpu);
4704d0be 7370
55d2375e
SC
7371 if (enable_pml)
7372 vmx_destroy_pml_buffer(vmx);
7373 free_vpid(vmx->vpid);
55d2375e
SC
7374 nested_vmx_free_vcpu(vcpu);
7375 free_loaded_vmcs(vmx->loaded_vmcs);
55d2375e 7376}
4704d0be 7377
58fccda4 7378static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
55d2375e 7379{
8ea8b8d6 7380 struct vmx_uret_msr *tsx_ctrl;
41836839 7381 struct vcpu_vmx *vmx;
06692e4b 7382 int i, err;
4704d0be 7383
a9dd6f09
SC
7384 BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
7385 vmx = to_vmx(vcpu);
d9a710e5 7386
12a8eee5
SC
7387 INIT_LIST_HEAD(&vmx->pi_wakeup_list);
7388
55d2375e 7389 err = -ENOMEM;
b666a4b6 7390
55d2375e 7391 vmx->vpid = allocate_vpid();
7cdc2d62 7392
5f3d5799 7393 /*
55d2375e
SC
 7394 	 * If PML is turned on, a failure to enable PML simply results in a
 7395 	 * failure to create the vCPU.  This keeps the PML logic simple by
 7396 	 * avoiding cases such as PML being enabled on only some of the
67b0ae43 7397 	 * guest's vCPUs.
5f3d5799 7398 */
55d2375e 7399 if (enable_pml) {
41836839 7400 vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
55d2375e 7401 if (!vmx->pml_pg)
987b2594 7402 goto free_vpid;
55d2375e 7403 }
4704d0be 7404
d0656735 7405 for (i = 0; i < kvm_nr_uret_msrs; ++i)
8ea8b8d6 7406 vmx->guest_uret_msrs[i].mask = -1ull;
5e17c624 7407 if (boot_cpu_has(X86_FEATURE_RTM)) {
8ea8b8d6
SC
7408 /*
7409 * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
7410 * Keep the host value unchanged to avoid changing CPUID bits
7411 * under the host kernel's feet.
8ea8b8d6 7412 */
5e17c624
SC
7413 tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
7414 if (tsx_ctrl)
5c49d185 7415 tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
4be53410
XL
7416 }
7417
55d2375e
SC
7418 err = alloc_loaded_vmcs(&vmx->vmcs01);
7419 if (err < 0)
7d73710d 7420 goto free_pml;
cb61de2f 7421
250552b9
VK
7422 /*
7423 * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
7424 * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
7425 * feature only for vmcs01, KVM currently isn't equipped to realize any
7426 * performance benefits from enabling it for vmcs02.
7427 */
7428 if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) &&
7429 (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
7430 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
7431
7432 evmcs->hv_enlightenments_control.msr_bitmap = 1;
7433 }
7434
3eb90017
AG
7435 /* The MSR bitmap starts with all ones */
7436 bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
7437 bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
7438
476c9bd8 7439 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
dbdd096a 7440#ifdef CONFIG_X86_64
476c9bd8
AL
7441 vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
7442 vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
7443 vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
dbdd096a 7444#endif
476c9bd8
AL
7445 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
7446 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
7447 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
987b2594 7448 if (kvm_cstate_in_guest(vcpu->kvm)) {
476c9bd8
AL
7449 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
7450 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
7451 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
7452 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
b5170063 7453 }
4704d0be 7454
55d2375e 7455 vmx->loaded_vmcs = &vmx->vmcs01;
06692e4b 7456
34109c04 7457 if (cpu_need_virtualize_apic_accesses(vcpu)) {
c482f2ce 7458 err = kvm_alloc_apic_access_page(vcpu->kvm);
55d2375e
SC
7459 if (err)
7460 goto free_vmcs;
7461 }
7462
7463 if (enable_ept && !enable_unrestricted_guest) {
987b2594 7464 err = init_rmode_identity_map(vcpu->kvm);
55d2375e
SC
7465 if (err)
7466 goto free_vmcs;
7467 }
4704d0be 7468
d588bb9b
CG
7469 if (vmx_can_use_ipiv(vcpu))
7470 WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
7471 __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID);
7472
a9dd6f09 7473 return 0;
4704d0be 7474
55d2375e
SC
7475free_vmcs:
7476 free_loaded_vmcs(vmx->loaded_vmcs);
55d2375e
SC
7477free_pml:
7478 vmx_destroy_pml_buffer(vmx);
987b2594 7479free_vpid:
55d2375e 7480 free_vpid(vmx->vpid);
a9dd6f09 7481 return err;
55d2375e 7482}
36be0b9d 7483
65fd4cb6
TG
7484#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7485#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
21feb4eb 7486
55d2375e
SC
7487static int vmx_vm_init(struct kvm *kvm)
7488{
55d2375e
SC
7489 if (!ple_gap)
7490 kvm->arch.pause_in_guest = true;
3af18d9c 7491
55d2375e
SC
7492 if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
7493 switch (l1tf_mitigation) {
7494 case L1TF_MITIGATION_OFF:
7495 case L1TF_MITIGATION_FLUSH_NOWARN:
7496 /* 'I explicitly don't care' is set */
7497 break;
7498 case L1TF_MITIGATION_FLUSH:
7499 case L1TF_MITIGATION_FLUSH_NOSMT:
7500 case L1TF_MITIGATION_FULL:
7501 /*
7502 * Warn upon starting the first VM in a potentially
7503 * insecure environment.
7504 */
b284909a 7505 if (sched_smt_active())
55d2375e
SC
7506 pr_warn_once(L1TF_MSG_SMT);
7507 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
7508 pr_warn_once(L1TF_MSG_L1D);
7509 break;
7510 case L1TF_MITIGATION_FULL_FORCE:
7511 /* Flush is enforced */
7512 break;
7513 }
7514 }
7515 return 0;
4704d0be
NHE
7516}
7517
ba28401b 7518static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
bd18bffc 7519{
55d2375e 7520 u8 cache;
bd18bffc 7521
222f06e7
CW
7522 /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
7523 * memory aliases with conflicting memory types and sometimes MCEs.
 7524 	 * We have to be careful as to what is honored and when.
7525 *
7526 * For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to
7527 * UC. The effective memory type is UC or WC depending on guest PAT.
7528 * This was historically the source of MCEs and we want to be
7529 * conservative.
7530 *
7531 * When there is no need to deal with noncoherent DMA (e.g., no VT-d
7532 * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The
7533 * EPT memory type is set to WB. The effective memory type is forced
7534 * WB.
7535 *
 7536 	 * Otherwise, we trust the guest. Guest CD/MTRR/PAT are all honored. The
7537 * EPT memory type is used to emulate guest CD/MTRR.
bd18bffc 7538 */
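	/*
	 * For illustration, assuming the usual macro values (e.g.
	 * VMX_EPT_MT_EPTE_SHIFT == 3, VMX_EPT_IPAT_BIT == (1ull << 6)):
	 * MMIO yields UC (0 << 3), while the no-noncoherent-DMA case yields
	 * WB with "ignore PAT" set, i.e. (6 << 3) | (1 << 6) == 0x70.
	 */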
222f06e7 7539
fb43496c
BG
7540 if (is_mmio)
7541 return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
bd18bffc 7542
fb43496c
BG
7543 if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
7544 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
bd18bffc 7545
55d2375e 7546 if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
55d2375e
SC
7547 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
7548 cache = MTRR_TYPE_WRBACK;
7549 else
7550 cache = MTRR_TYPE_UNCACHABLE;
bd18bffc 7551
fb43496c
BG
7552 return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
7553 }
bd18bffc 7554
fb43496c 7555 return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
55d2375e 7556}
bd18bffc 7557
b6247686 7558static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
55d2375e 7559{
bd18bffc 7560 /*
55d2375e
SC
7561 * These bits in the secondary execution controls field
7562 * are dynamic, the others are mostly based on the hypervisor
7563 * architecture and the guest's CPUID. Do not touch the
7564 * dynamic bits.
bd18bffc 7565 */
55d2375e
SC
7566 u32 mask =
7567 SECONDARY_EXEC_SHADOW_VMCS |
7568 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
7569 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
7570 SECONDARY_EXEC_DESC;
bd18bffc 7571
fe7f895d 7572 u32 cur_ctl = secondary_exec_controls_get(vmx);
bd18bffc 7573
fe7f895d 7574 secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
bd18bffc
SC
7575}
7576
4704d0be 7577/*
55d2375e
SC
7578 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
7579 * (indicating "allowed-1") if they are supported in the guest's CPUID.
4704d0be 7580 */
55d2375e 7581static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
4704d0be
NHE
7582{
7583 struct vcpu_vmx *vmx = to_vmx(vcpu);
55d2375e 7584 struct kvm_cpuid_entry2 *entry;
4704d0be 7585
55d2375e
SC
7586 vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
7587 vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
e79f245d 7588
55d2375e
SC
7589#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \
7590 if (entry && (entry->_reg & (_cpuid_mask))) \
7591 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \
7592} while (0)
ff651cb6 7593
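/*
 * Each invocation below marks a CR4 bit as allowed-1 only if the guest's
 * CPUID advertises the corresponding feature; 'entry' is re-resolved before
 * each group of updates (leaf 0x1, then leaf 0x7).
 */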
277ad7d5 7594 entry = kvm_find_cpuid_entry(vcpu, 0x1);
87382003
SC
7595 cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME));
7596 cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME));
7597 cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC));
7598 cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE));
7599 cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE));
7600 cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE));
7601 cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE));
7602 cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE));
7603 cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR));
7604 cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
7605 cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX));
7606 cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX));
7607 cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID));
7608 cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE));
61ada748 7609
277ad7d5 7610 entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0);
87382003
SC
7611 cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE));
7612 cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP));
7613 cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP));
7614 cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU));
7615 cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP));
7616 cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57));
cf3215d9 7617
55d2375e
SC
7618#undef cr4_fixed1_update
7619}
36c3cc42 7620
6c0f0bba
LK
7621static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
7622{
7623 struct vcpu_vmx *vmx = to_vmx(vcpu);
7624 struct kvm_cpuid_entry2 *best = NULL;
7625 int i;
7626
7627 for (i = 0; i < PT_CPUID_LEAVES; i++) {
277ad7d5 7628 best = kvm_find_cpuid_entry_index(vcpu, 0x14, i);
6c0f0bba
LK
7629 if (!best)
7630 return;
7631 vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
7632 vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
7633 vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
7634 vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
7635 }
7636
7637 /* Get the number of configurable Address Ranges for filtering */
f4d3a902 7638 vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
6c0f0bba
LK
7639 PT_CAP_num_address_ranges);
7640
7641 /* Initialize and clear the no dependency bits */
7642 vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
e099f3eb
XL
7643 RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
7644 RTIT_CTL_BRANCH_EN);
6c0f0bba
LK
7645
7646 /*
 7647 	 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1, CR3Filter can be set; otherwise
 7648 	 * setting it will inject a #GP.
7649 */
7650 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
7651 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
7652
7653 /*
7654 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
7655 * PSBFreq can be set
7656 */
7657 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
7658 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
7659 RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
7660
7661 /*
e099f3eb 7662 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
6c0f0bba
LK
7663 */
7664 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
7665 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
e099f3eb 7666 RTIT_CTL_MTC_RANGE);
6c0f0bba
LK
7667
7668 /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
7669 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
7670 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
7671 RTIT_CTL_PTW_EN);
7672
7673 /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
7674 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
7675 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
7676
7677 /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
7678 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
7679 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
7680
d9f6e12f 7681 /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
6c0f0bba
LK
7682 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
7683 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
7684
7685 /* unmask address range configure area */
f4d3a902 7686 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
d14eff1b 7687 vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
6c0f0bba
LK
7688}
7689
7c1b761b 7690static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
55d2375e
SC
7691{
7692 struct vcpu_vmx *vmx = to_vmx(vcpu);
4704d0be 7693
7204160e
AL
7694 /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
7695 vcpu->arch.xsaves_enabled = false;
7696
432979b5
SC
7697 vmx_setup_uret_msrs(vmx);
7698
b6247686
SC
7699 if (cpu_has_secondary_exec_ctrls())
7700 vmcs_set_secondary_exec_control(vmx,
7701 vmx_secondary_exec_control(vmx));
4704d0be 7702
55d2375e 7703 if (nested_vmx_allowed(vcpu))
48ebd0cf 7704 vmx->msr_ia32_feature_control_valid_bits |=
32ad73db
SC
7705 FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7706 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
55d2375e 7707 else
48ebd0cf 7708 vmx->msr_ia32_feature_control_valid_bits &=
32ad73db
SC
7709 ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7710 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
4f350c6d 7711
8805875a 7712 if (nested_vmx_allowed(vcpu))
55d2375e 7713 nested_vmx_cr_fixed1_bits_update(vcpu);
6c0f0bba
LK
7714
7715 if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
7716 guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
7717 update_intel_pt_cfg(vcpu);
b07a5c53
PB
7718
7719 if (boot_cpu_has(X86_FEATURE_RTM)) {
eb3db1b1 7720 struct vmx_uret_msr *msr;
d85a8034 7721 msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
b07a5c53
PB
7722 if (msr) {
7723 bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
7bf662bb 7724 vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
b07a5c53
PB
7725 }
7726 }
a6337a35 7727
61f20813
JL
7728 if (kvm_cpu_cap_has(X86_FEATURE_XFD))
7729 vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
7730 !guest_cpuid_has(vcpu, X86_FEATURE_XFD));
7731
7732
2ed41aa6
SC
7733 set_cr4_guest_host_mask(vmx);
7734
72add915
SC
7735 vmx_write_encls_bitmap(vcpu, NULL);
7736 if (guest_cpuid_has(vcpu, X86_FEATURE_SGX))
7737 vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
7738 else
7739 vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
7740
7741 if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
7742 vmx->msr_ia32_feature_control_valid_bits |=
7743 FEAT_CTL_SGX_LC_ENABLED;
7744 else
7745 vmx->msr_ia32_feature_control_valid_bits &=
7746 ~FEAT_CTL_SGX_LC_ENABLED;
7747
a6337a35 7748 /* Refresh #PF interception to account for MAXPHYADDR changes. */
b6a7cc35 7749 vmx_update_exception_bitmap(vcpu);
55d2375e 7750}
09abb5e3 7751
bec46859
SC
7752static u64 vmx_get_perf_capabilities(void)
7753{
7754 u64 perf_cap = PMU_CAP_FW_WRITES;
7755 struct x86_pmu_lbr lbr;
7756 u64 host_perf_cap = 0;
7757
7758 if (!enable_pmu)
7759 return 0;
7760
7761 if (boot_cpu_has(X86_FEATURE_PDCM))
7762 rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
7763
7764 x86_perf_get_lbr(&lbr);
7765 if (lbr.nr)
7766 perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
7767
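	/*
	 * Expose the host's PEBS capabilities, but hide PEBS Baseline when the
	 * host PEBS record format is below 4, as Baseline relies on the newer
	 * adaptive record format.
	 */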
7768 if (vmx_pebs_supported()) {
7769 perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
7770 if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4)
7771 perf_cap &= ~PERF_CAP_PEBS_BASELINE;
7772 }
7773
7774 return perf_cap;
7775}
7776
3ec6fd8c 7777static __init void vmx_set_cpu_caps(void)
55d2375e 7778{
3ec6fd8c
SC
7779 kvm_set_cpu_caps();
7780
7781 /* CPUID 0x1 */
7782 if (nested)
7783 kvm_cpu_cap_set(X86_FEATURE_VMX);
7784
7785 /* CPUID 0x7 */
8721f5b0
SC
7786 if (kvm_mpx_supported())
7787 kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
e4203334
SC
7788 if (!cpu_has_vmx_invpcid())
7789 kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
8721f5b0
SC
7790 if (vmx_pt_mode_is_host_guest())
7791 kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
cf8e55fe
LX
7792 if (vmx_pebs_supported()) {
7793 kvm_cpu_cap_check_and_set(X86_FEATURE_DS);
7794 kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64);
7795 }
3ec6fd8c 7796
6ef25aa0
LX
7797 if (!enable_pmu)
7798 kvm_cpu_cap_clear(X86_FEATURE_PDCM);
bec46859 7799 kvm_caps.supported_perf_cap = vmx_get_perf_capabilities();
3ec6fd8c 7800
72add915
SC
7801 if (!enable_sgx) {
7802 kvm_cpu_cap_clear(X86_FEATURE_SGX);
7803 kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
7804 kvm_cpu_cap_clear(X86_FEATURE_SGX1);
7805 kvm_cpu_cap_clear(X86_FEATURE_SGX2);
7806 }
7807
90d2f60f
SC
7808 if (vmx_umip_emulated())
7809 kvm_cpu_cap_set(X86_FEATURE_UMIP);
7810
b3d895d5 7811 /* CPUID 0xD.1 */
938c8745 7812 kvm_caps.supported_xss = 0;
becdad85 7813 if (!cpu_has_vmx_xsaves())
b3d895d5
SC
7814 kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
7815
8aec21c0
SC
7816 /* CPUID 0x80000001 and 0x7 (RDPID) */
7817 if (!cpu_has_vmx_rdtscp()) {
3ec6fd8c 7818 kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
8aec21c0
SC
7819 kvm_cpu_cap_clear(X86_FEATURE_RDPID);
7820 }
0abcc8f6 7821
becdad85 7822 if (cpu_has_vmx_waitpkg())
0abcc8f6 7823 kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
4704d0be
NHE
7824}
7825
55d2375e 7826static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
42124925 7827{
55d2375e 7828 to_vmx(vcpu)->req_immediate_exit = true;
7c177938
NHE
7829}
7830
35a57134
OU
7831static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
7832 struct x86_instruction_info *info)
7833{
7834 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7835 unsigned short port;
7836 bool intercept;
7837 int size;
7838
7839 if (info->intercept == x86_intercept_in ||
7840 info->intercept == x86_intercept_ins) {
7841 port = info->src_val;
7842 size = info->dst_bytes;
7843 } else {
7844 port = info->dst_val;
7845 size = info->src_bytes;
7846 }
7847
7848 /*
7849 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
7850 * VM-exits depend on the 'unconditional IO exiting' VM-execution
7851 * control.
7852 *
7853 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
7854 */
7855 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
7856 intercept = nested_cpu_has(vmcs12,
7857 CPU_BASED_UNCOND_IO_EXITING);
7858 else
7859 intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
7860
86f7e90c 7861 /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
35a57134
OU
7862 return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
7863}
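
When the I/O bitmaps are in use, nested_vmx_check_io_bitmaps() above consults vmcs12's two 4 KiB bitmaps, one bit per port (bitmap A covering ports 0x0000-0x7fff, bitmap B covering 0x8000-0xffff), and the access intercepts if any port touched by the access has its bit set. The following is an illustrative user-space sketch of that lookup over plain buffers, not the kernel helper itself, and it ignores the wrap-around-0xffff corner case.

#include <stdbool.h>
#include <stdint.h>

/*
 * Illustrative only: decide whether an access of 'size' bytes starting at
 * 'port' is intercepted, given the two 4 KiB I/O bitmaps (one bit per port).
 */
bool io_access_intercepted(const uint8_t bitmap_a[4096],
                           const uint8_t bitmap_b[4096],
                           uint16_t port, int size)
{
        unsigned int p = port;
        int i;

        for (i = 0; i < size; i++, p++) {
                const uint8_t *bm = (p < 0x8000) ? bitmap_a : bitmap_b;
                unsigned int idx = p & 0x7fff;

                if (bm[idx / 8] & (1u << (idx % 8)))
                        return true;
        }
        return false;
}
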
7864
8a76d7f2
JR
7865static int vmx_check_intercept(struct kvm_vcpu *vcpu,
7866 struct x86_instruction_info *info,
21f1b8f2
SC
7867 enum x86_intercept_stage stage,
7868 struct x86_exception *exception)
8a76d7f2 7869{
fb6d4d34 7870 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
fb6d4d34 7871
35a57134 7872 switch (info->intercept) {
fb6d4d34
PB
7873 /*
7874 * RDPID causes #UD if disabled through secondary execution controls.
7875 * Because it is marked as EmulateOnUD, we need to intercept it here.
2183de41 7876 * Note, RDPID is hidden behind ENABLE_RDTSCP.
fb6d4d34 7877 */
2183de41 7878 case x86_intercept_rdpid:
7f3603b6 7879 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
21f1b8f2
SC
7880 exception->vector = UD_VECTOR;
7881 exception->error_code_valid = false;
35a57134
OU
7882 return X86EMUL_PROPAGATE_FAULT;
7883 }
7884 break;
7885
7886 case x86_intercept_in:
7887 case x86_intercept_ins:
7888 case x86_intercept_out:
7889 case x86_intercept_outs:
7890 return vmx_check_intercept_io(vcpu, info);
fb6d4d34 7891
86f7e90c
OU
7892 case x86_intercept_lgdt:
7893 case x86_intercept_lidt:
7894 case x86_intercept_lldt:
7895 case x86_intercept_ltr:
7896 case x86_intercept_sgdt:
7897 case x86_intercept_sidt:
7898 case x86_intercept_sldt:
7899 case x86_intercept_str:
7900 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
7901 return X86EMUL_CONTINUE;
7902
7903 /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
7904 break;
7905
fb6d4d34 7906 /* TODO: check more intercepts... */
35a57134
OU
7907 default:
7908 break;
7909 }
7910
07721fee 7911 return X86EMUL_UNHANDLEABLE;
8a76d7f2
JR
7912}
7913
64672c95
YJ
7914#ifdef CONFIG_X86_64
7915/* (a << shift) / divisor, return 1 if overflow otherwise 0 */
7916static inline int u64_shl_div_u64(u64 a, unsigned int shift,
7917 u64 divisor, u64 *result)
7918{
7919 u64 low = a << shift, high = a >> (64 - shift);
7920
 7922 /* Bail to avoid overflowing divq (the quotient must fit in 64 bits) */
7922 if (high >= divisor)
7923 return 1;
7924
 7925 /* Low holds the result, high holds the remainder, which is discarded */
7926 asm("divq %2\n\t" : "=a" (low), "=d" (high) :
7927 "rm" (divisor), "0" (low), "1" (high));
7928 *result = low;
7929
7930 return 0;
7931}
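
u64_shl_div_u64() above builds the 128-bit dividend (high:low) by hand and feeds it to divq, bailing out when high >= divisor because the quotient would then overflow 64 bits (and divq would fault). On compilers with unsigned __int128 the same operation can be written portably; a user-space sketch for comparison, assuming a non-zero divisor and a shift below 64 as in the caller:

#include <stdint.h>
#include <stdio.h>

/* Portable equivalent of u64_shl_div_u64(), assuming shift < 64, divisor != 0. */
static int shl_div_u64(uint64_t a, unsigned int shift,
                       uint64_t divisor, uint64_t *result)
{
        unsigned __int128 dividend = (unsigned __int128)a << shift;

        /* Same overflow rule as the asm version: quotient must fit in 64 bits. */
        if ((uint64_t)(dividend >> 64) >= divisor)
                return 1;

        *result = (uint64_t)(dividend / divisor);
        return 0;
}

int main(void)
{
        uint64_t q;

        /* e.g. scale a TSC delta by a 48-bit fixed-point ratio of 1.0. */
        if (!shl_div_u64(30000000ULL, 48, 1ULL << 48, &q))
                printf("%llu\n", (unsigned long long)q);
        return 0;
}
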
7932
f9927982
SC
7933static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
7934 bool *expired)
64672c95 7935{
386c6ddb 7936 struct vcpu_vmx *vmx;
c5ce8235 7937 u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
39497d76 7938 struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
386c6ddb 7939
386c6ddb
KA
7940 vmx = to_vmx(vcpu);
7941 tscl = rdtsc();
7942 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
7943 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
39497d76
SC
7944 lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
7945 ktimer->timer_advance_ns);
c5ce8235
WL
7946
7947 if (delta_tsc > lapic_timer_advance_cycles)
7948 delta_tsc -= lapic_timer_advance_cycles;
7949 else
7950 delta_tsc = 0;
64672c95
YJ
7951
7952 /* Convert to host delta tsc if tsc scaling is enabled */
938c8745 7953 if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio &&
0967fa1c 7954 delta_tsc && u64_shl_div_u64(delta_tsc,
938c8745 7955 kvm_caps.tsc_scaling_ratio_frac_bits,
805d705f 7956 vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
64672c95
YJ
7957 return -ERANGE;
7958
7959 /*
 7960 * If the delta tsc can't fit in 32 bits after the multiplier shift,
7961 * we can't use the preemption timer.
7962 * It's possible that it fits on later vmentries, but checking
7963 * on every vmentry is costly so we just use an hrtimer.
7964 */
7965 if (delta_tsc >> (cpu_preemption_timer_multi + 32))
7966 return -ERANGE;
7967
7968 vmx->hv_deadline_tsc = tscl + delta_tsc;
f9927982
SC
7969 *expired = !delta_tsc;
7970 return 0;
64672c95
YJ
7971}
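
The -ERANGE check above reflects how the hardware timer works: the VMX preemption timer is a 32-bit counter that effectively ticks at TSC >> rate, where rate is the value KVM reads into cpu_preemption_timer_multi from IA32_VMX_MISC, so a TSC delta is only usable if it still fits in 32 bits after that shift. A small illustrative sketch of the conversion (not KVM code; the rate and TSC numbers are made up):

#include <stdint.h>
#include <stdio.h>

/* Convert a guest TSC delta to a 32-bit preemption timer value, or fail. */
static int tsc_delta_to_timer(uint64_t delta_tsc, unsigned int rate,
                              uint32_t *timer_value)
{
        /* Same shape as the check above: reject deltas that can't be encoded. */
        if (delta_tsc >> (rate + 32))
                return -1;

        *timer_value = (uint32_t)(delta_tsc >> rate);
        return 0;
}

int main(void)
{
        uint32_t val;

        /* e.g. a 10 ms deadline on a 3 GHz TSC with a divide-by-32 rate. */
        if (!tsc_delta_to_timer(30000000ULL, 5, &val))
                printf("timer value: %u\n", val);
        return 0;
}
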
7972
7973static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
7974{
f459a707 7975 to_vmx(vcpu)->hv_deadline_tsc = -1;
64672c95
YJ
7976}
7977#endif
7978
48d89b92 7979static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
ae97a3b8 7980{
b31c114b 7981 if (!kvm_pause_in_guest(vcpu->kvm))
b4a2d31d 7982 shrink_ple_window(vcpu);
ae97a3b8
RK
7983}
7984
a85863c2
MS
7985void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
7986{
7987 struct vcpu_vmx *vmx = to_vmx(vcpu);
7988
7989 if (is_guest_mode(vcpu)) {
7990 vmx->nested.update_vmcs01_cpu_dirty_logging = true;
7991 return;
7992 }
7993
7994 /*
 7995 * Note, cpu_dirty_logging_count can change concurrently with this
7996 * code, but in that case another update request will be made and so
7997 * the guest will never run with a stale PML value.
7998 */
7999 if (vcpu->kvm->arch.cpu_dirty_logging_count)
8000 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
8001 else
8002 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
8003}
8004
c45dcc71
AR
8005static void vmx_setup_mce(struct kvm_vcpu *vcpu)
8006{
8007 if (vcpu->arch.mcg_cap & MCG_LMCE_P)
8008 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
32ad73db 8009 FEAT_CTL_LMCE_ENABLED;
c45dcc71
AR
8010 else
8011 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
32ad73db 8012 ~FEAT_CTL_LMCE_ENABLED;
c45dcc71
AR
8013}
8014
31e83e21 8015#ifdef CONFIG_KVM_SMM
c9d40913 8016static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
72d7b374 8017{
72e9cbdb
LP
8018 /* we need a nested vmexit to enter SMM, postpone if run is pending */
8019 if (to_vmx(vcpu)->nested.nested_run_pending)
c9d40913 8020 return -EBUSY;
a9fa7cb6 8021 return !is_smm(vcpu);
72d7b374
LP
8022}
8023
58c1d206 8024static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
0234bf88 8025{
72e9cbdb
LP
8026 struct vcpu_vmx *vmx = to_vmx(vcpu);
8027
5d76b1f8
SC
8028 /*
8029 * TODO: Implement custom flows for forcing the vCPU out/in of L2 on
 8030 * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong;
8031 * SMI and RSM only modify state that is saved and restored via SMRAM.
8032 * E.g. most MSRs are left untouched, but many are modified by VM-Exit
8033 * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM.
8034 */
72e9cbdb
LP
8035 vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
8036 if (vmx->nested.smm.guest_mode)
8037 nested_vmx_vmexit(vcpu, -1, 0, 0);
8038
8039 vmx->nested.smm.vmxon = vmx->nested.vmxon;
8040 vmx->nested.vmxon = false;
caa057a2 8041 vmx_clear_hlt(vcpu);
0234bf88
LP
8042 return 0;
8043}
8044
58c1d206 8045static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
0234bf88 8046{
72e9cbdb
LP
8047 struct vcpu_vmx *vmx = to_vmx(vcpu);
8048 int ret;
8049
8050 if (vmx->nested.smm.vmxon) {
8051 vmx->nested.vmxon = true;
8052 vmx->nested.smm.vmxon = false;
8053 }
8054
8055 if (vmx->nested.smm.guest_mode) {
a633e41e 8056 ret = nested_vmx_enter_non_root_mode(vcpu, false);
72e9cbdb
LP
8057 if (ret)
8058 return ret;
8059
759cbd59 8060 vmx->nested.nested_run_pending = 1;
72e9cbdb
LP
8061 vmx->nested.smm.guest_mode = false;
8062 }
0234bf88
LP
8063 return 0;
8064}
8065
b6a7cc35 8066static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
cc3d967f 8067{
c9d40913 8068 /* RSM will cause a vmexit anyway. */
cc3d967f 8069}
31e83e21 8070#endif
cc3d967f 8071
4b9852f4
LA
8072static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
8073{
1c96dcce 8074 return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
4b9852f4
LA
8075}
8076
93dff2fe
JM
8077static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
8078{
8079 if (is_guest_mode(vcpu)) {
8080 struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
8081
8082 if (hrtimer_try_to_cancel(timer) == 1)
8083 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
8084 }
8085}
8086
58fccda4 8087static void vmx_hardware_unsetup(void)
484014fa 8088{
ec5a4919
SC
8089 kvm_set_posted_intr_wakeup_handler(NULL);
8090
484014fa
SC
8091 if (nested)
8092 nested_vmx_hardware_unsetup();
8093
8094 free_kvm_area();
8095}
8096
b3f257a8
SC
8097#define VMX_REQUIRED_APICV_INHIBITS \
8098( \
8099 BIT(APICV_INHIBIT_REASON_DISABLE)| \
8100 BIT(APICV_INHIBIT_REASON_ABSENT) | \
8101 BIT(APICV_INHIBIT_REASON_HYPERV) | \
8102 BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
8103 BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
8104 BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
8105 BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) \
8106)
484014fa 8107
d588bb9b
CG
8108static void vmx_vm_destroy(struct kvm *kvm)
8109{
8110 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
8111
8112 free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
8113}
8114
e286ac0e 8115static struct kvm_x86_ops vmx_x86_ops __initdata = {
08a9d59c 8116 .name = KBUILD_MODNAME,
9dadfc4a 8117
d83420c2
SC
8118 .check_processor_compatibility = vmx_check_processor_compat,
8119
58fccda4 8120 .hardware_unsetup = vmx_hardware_unsetup,
484014fa 8121
58fccda4
SC
8122 .hardware_enable = vmx_hardware_enable,
8123 .hardware_disable = vmx_hardware_disable,
484014fa
SC
8124 .has_emulated_msr = vmx_has_emulated_msr,
8125
8126 .vm_size = sizeof(struct kvm_vmx),
8127 .vm_init = vmx_vm_init,
d588bb9b 8128 .vm_destroy = vmx_vm_destroy,
484014fa 8129
d588bb9b 8130 .vcpu_precreate = vmx_vcpu_precreate,
58fccda4
SC
8131 .vcpu_create = vmx_vcpu_create,
8132 .vcpu_free = vmx_vcpu_free,
484014fa
SC
8133 .vcpu_reset = vmx_vcpu_reset,
8134
e27bc044 8135 .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
484014fa
SC
8136 .vcpu_load = vmx_vcpu_load,
8137 .vcpu_put = vmx_vcpu_put,
8138
b6a7cc35 8139 .update_exception_bitmap = vmx_update_exception_bitmap,
484014fa
SC
8140 .get_msr_feature = vmx_get_msr_feature,
8141 .get_msr = vmx_get_msr,
8142 .set_msr = vmx_set_msr,
8143 .get_segment_base = vmx_get_segment_base,
8144 .get_segment = vmx_get_segment,
8145 .set_segment = vmx_set_segment,
8146 .get_cpl = vmx_get_cpl,
8147 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
484014fa 8148 .set_cr0 = vmx_set_cr0,
c2fe3cd4 8149 .is_valid_cr4 = vmx_is_valid_cr4,
484014fa
SC
8150 .set_cr4 = vmx_set_cr4,
8151 .set_efer = vmx_set_efer,
8152 .get_idt = vmx_get_idt,
8153 .set_idt = vmx_set_idt,
8154 .get_gdt = vmx_get_gdt,
8155 .set_gdt = vmx_set_gdt,
484014fa
SC
8156 .set_dr7 = vmx_set_dr7,
8157 .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
8158 .cache_reg = vmx_cache_reg,
8159 .get_rflags = vmx_get_rflags,
8160 .set_rflags = vmx_set_rflags,
c5063551 8161 .get_if_flag = vmx_get_if_flag,
484014fa 8162
e27bc044
SC
8163 .flush_tlb_all = vmx_flush_tlb_all,
8164 .flush_tlb_current = vmx_flush_tlb_current,
8165 .flush_tlb_gva = vmx_flush_tlb_gva,
8166 .flush_tlb_guest = vmx_flush_tlb_guest,
484014fa 8167
fc4fad79 8168 .vcpu_pre_run = vmx_vcpu_pre_run,
e27bc044 8169 .vcpu_run = vmx_vcpu_run,
484014fa
SC
8170 .handle_exit = vmx_handle_exit,
8171 .skip_emulated_instruction = vmx_skip_emulated_instruction,
8172 .update_emulated_instruction = vmx_update_emulated_instruction,
8173 .set_interrupt_shadow = vmx_set_interrupt_shadow,
8174 .get_interrupt_shadow = vmx_get_interrupt_shadow,
8175 .patch_hypercall = vmx_patch_hypercall,
e27bc044
SC
8176 .inject_irq = vmx_inject_irq,
8177 .inject_nmi = vmx_inject_nmi,
6ad75c5c 8178 .inject_exception = vmx_inject_exception,
484014fa
SC
8179 .cancel_injection = vmx_cancel_injection,
8180 .interrupt_allowed = vmx_interrupt_allowed,
8181 .nmi_allowed = vmx_nmi_allowed,
8182 .get_nmi_mask = vmx_get_nmi_mask,
8183 .set_nmi_mask = vmx_set_nmi_mask,
b6a7cc35
JB
8184 .enable_nmi_window = vmx_enable_nmi_window,
8185 .enable_irq_window = vmx_enable_irq_window,
8186 .update_cr8_intercept = vmx_update_cr8_intercept,
484014fa
SC
8187 .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
8188 .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
8189 .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
8190 .load_eoi_exitmap = vmx_load_eoi_exitmap,
8191 .apicv_post_state_restore = vmx_apicv_post_state_restore,
b3f257a8 8192 .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
484014fa
SC
8193 .hwapic_irr_update = vmx_hwapic_irr_update,
8194 .hwapic_isr_update = vmx_hwapic_isr_update,
8195 .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
8196 .sync_pir_to_irr = vmx_sync_pir_to_irr,
57dfd7b5 8197 .deliver_interrupt = vmx_deliver_interrupt,
8888cdd0 8198 .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
484014fa
SC
8199
8200 .set_tss_addr = vmx_set_tss_addr,
8201 .set_identity_map_addr = vmx_set_identity_map_addr,
484014fa
SC
8202 .get_mt_mask = vmx_get_mt_mask,
8203
8204 .get_exit_info = vmx_get_exit_info,
8205
7c1b761b 8206 .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
484014fa
SC
8207
8208 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
8209
307a94c7
IS
8210 .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
8211 .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
edcfe540 8212 .write_tsc_offset = vmx_write_tsc_offset,
1ab9287a 8213 .write_tsc_multiplier = vmx_write_tsc_multiplier,
484014fa
SC
8214
8215 .load_mmu_pgd = vmx_load_mmu_pgd,
8216
8217 .check_intercept = vmx_check_intercept,
8218 .handle_exit_irqoff = vmx_handle_exit_irqoff,
8219
8220 .request_immediate_exit = vmx_request_immediate_exit,
8221
8222 .sched_in = vmx_sched_in,
8223
6dd03800 8224 .cpu_dirty_log_size = PML_ENTITY_NUM,
a85863c2 8225 .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
484014fa 8226
33b22172 8227 .nested_ops = &vmx_nested_ops,
484014fa 8228
58fccda4 8229 .pi_update_irte = vmx_pi_update_irte,
e27bc044 8230 .pi_start_assignment = vmx_pi_start_assignment,
484014fa
SC
8231
8232#ifdef CONFIG_X86_64
8233 .set_hv_timer = vmx_set_hv_timer,
8234 .cancel_hv_timer = vmx_cancel_hv_timer,
8235#endif
8236
8237 .setup_mce = vmx_setup_mce,
8238
31e83e21 8239#ifdef CONFIG_KVM_SMM
484014fa 8240 .smi_allowed = vmx_smi_allowed,
ecc513e5
SC
8241 .enter_smm = vmx_enter_smm,
8242 .leave_smm = vmx_leave_smm,
b6a7cc35 8243 .enable_smi_window = vmx_enable_smi_window,
31e83e21 8244#endif
484014fa 8245
09e3e2a1 8246 .can_emulate_instruction = vmx_can_emulate_instruction,
484014fa 8247 .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
93dff2fe 8248 .migrate_timers = vmx_migrate_timers,
3eb90017
AG
8249
8250 .msr_filter_changed = vmx_msr_filter_changed,
f9a4d621 8251 .complete_emulated_msr = kvm_complete_insn_gp,
647daca2
TL
8252
8253 .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
484014fa
SC
8254};
8255
33271a9e
SC
8256static unsigned int vmx_handle_intel_pt_intr(void)
8257{
8258 struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
8259
8260 /* '0' on failure so that the !PT case can use a RET0 static call. */
ffd1925a 8261 if (!vcpu || !kvm_handling_nmi_from_guest(vcpu))
33271a9e
SC
8262 return 0;
8263
8264 kvm_make_request(KVM_REQ_PMI, vcpu);
8265 __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
8266 (unsigned long *)&vcpu->arch.pmu.global_status);
8267 return 1;
8268}
8269
b6194b94
SC
8270static __init void vmx_setup_user_return_msrs(void)
8271{
8ea8b8d6
SC
8273 /*
8274 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
8275 * will emulate SYSCALL in legacy mode if the vendor string in guest
8276 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
8277 * support this emulation, MSR_STAR is included in the list for i386,
8278 * but is never loaded into hardware. MSR_CSTAR is also never loaded
8279 * into hardware and is here purely for emulation purposes.
8280 */
8281 const u32 vmx_uret_msrs_list[] = {
8282 #ifdef CONFIG_X86_64
8283 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
8284 #endif
8285 MSR_EFER, MSR_TSC_AUX, MSR_STAR,
8286 MSR_IA32_TSX_CTRL,
8287 };
b6194b94
SC
8288 int i;
8289
8290 BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
8291
e5fda4bb
SC
8292 for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
8293 kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
b6194b94
SC
8294}
8295
3c5c3245
KH
8296static void __init vmx_setup_me_spte_mask(void)
8297{
8298 u64 me_mask = 0;
8299
8300 /*
8301 * kvm_get_shadow_phys_bits() returns shadow_phys_bits. Use
8302 * the former to avoid exposing shadow_phys_bits.
8303 *
 8304 * On pre-MKTME systems, boot_cpu_data.x86_phys_bits equals
 8305 * shadow_phys_bits. On MKTME and/or TDX capable systems,
 8306 * boot_cpu_data.x86_phys_bits holds the number of usable physical
 8307 * address bits w/o the KeyID bits, and shadow_phys_bits equals the
 8308 * MAXPHYADDR reported by CPUID. The bits in between are KeyID bits.
8309 */
8310 if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits())
8311 me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
8312 kvm_get_shadow_phys_bits() - 1);
8313 /*
 8314 * Unlike SME, the host kernel doesn't support setting up any
 8315 * MKTME KeyID on Intel platforms, so no memory encryption
 8316 * bits should be included in the SPTE.
8317 */
8318 kvm_mmu_set_me_spte_mask(0, me_mask);
8319}
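
The mask computed above is simply the contiguous run of physical-address bits between the usable address width and MAXPHYADDR, i.e. the MKTME KeyID bits. A stand-alone sketch of that arithmetic, with a local bit-range helper standing in for the kernel's rsvd_bits() and the bit widths chosen as a hypothetical example:

#include <stdint.h>
#include <stdio.h>

/* Local stand-in for rsvd_bits(): mask with bits lo..hi (inclusive) set. */
static uint64_t bit_range_mask(unsigned int lo, unsigned int hi)
{
        return (~0ULL >> (63 - hi)) & ~((1ULL << lo) - 1);
}

int main(void)
{
        /* Hypothetical MKTME system: 46 usable address bits, MAXPHYADDR = 52. */
        unsigned int phys_bits = 46, shadow_phys_bits = 52;
        uint64_t me_mask = 0;

        if (phys_bits != shadow_phys_bits)
                me_mask = bit_range_mask(phys_bits, shadow_phys_bits - 1);

        printf("KeyID bits mask: %#llx\n", (unsigned long long)me_mask);
        return 0;
}
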
8320
33271a9e
SC
8321static struct kvm_x86_init_ops vmx_init_ops __initdata;
8322
a3203381
SC
8323static __init int hardware_setup(void)
8324{
8325 unsigned long host_bndcfgs;
2342080c 8326 struct desc_ptr dt;
f8cd457f 8327 int r;
a3203381 8328
2342080c
SC
8329 store_idt(&dt);
8330 host_idt_base = dt.address;
8331
b6194b94 8332 vmx_setup_user_return_msrs();
a3203381
SC
8333
8334 if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
8335 return -EIO;
8336
9d78d6fb 8337 if (cpu_has_perf_global_ctrl_bug())
8d20bd63 8338 pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
9d78d6fb
VK
8339 "does not work properly. Using workaround\n");
8340
a3203381
SC
8341 if (boot_cpu_has(X86_FEATURE_NX))
8342 kvm_enable_efer_bits(EFER_NX);
8343
8344 if (boot_cpu_has(X86_FEATURE_MPX)) {
8345 rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
8d20bd63 8346 WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
a3203381
SC
8347 }
8348
7f5581f5 8349 if (!cpu_has_vmx_mpx())
938c8745
SC
8350 kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
8351 XFEATURE_MASK_BNDCSR);
cfc48181 8352
a3203381
SC
8353 if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
8354 !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
8355 enable_vpid = 0;
8356
8357 if (!cpu_has_vmx_ept() ||
8358 !cpu_has_vmx_ept_4levels() ||
8359 !cpu_has_vmx_ept_mt_wb() ||
8360 !cpu_has_vmx_invept_global())
8361 enable_ept = 0;
8362
23f079c2
SC
8363 /* NX support is required for shadow paging. */
8364 if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
8d20bd63 8365 pr_err_ratelimited("NX (Execute Disable) not supported\n");
23f079c2
SC
8366 return -EOPNOTSUPP;
8367 }
8368
a3203381
SC
8369 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
8370 enable_ept_ad_bits = 0;
8371
8372 if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
8373 enable_unrestricted_guest = 0;
8374
8375 if (!cpu_has_vmx_flexpriority())
8376 flexpriority_enabled = 0;
8377
8378 if (!cpu_has_virtual_nmis())
8379 enable_vnmi = 0;
8380
1c1a4149
EGE
8381#ifdef CONFIG_X86_SGX_KVM
8382 if (!cpu_has_vmx_encls_vmexit())
8383 enable_sgx = false;
8384#endif
8385
a3203381
SC
8386 /*
8387 * set_apic_access_page_addr() is used to reload apic access
8388 * page upon invalidation. No need to do anything if not
8389 * using the APIC_ACCESS_ADDR VMCS field.
8390 */
8391 if (!flexpriority_enabled)
72b0eaa9 8392 vmx_x86_ops.set_apic_access_page_addr = NULL;
a3203381
SC
8393
8394 if (!cpu_has_vmx_tpr_shadow())
72b0eaa9 8395 vmx_x86_ops.update_cr8_intercept = NULL;
a3203381
SC
8396
8397#if IS_ENABLED(CONFIG_HYPERV)
8398 if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
1f3a3e46 8399 && enable_ept) {
72b0eaa9
SC
8400 vmx_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
8401 vmx_x86_ops.tlb_remote_flush_with_range =
1f3a3e46
LT
8402 hv_remote_flush_tlb_with_range;
8403 }
a3203381
SC
8404#endif
8405
8406 if (!cpu_has_vmx_ple()) {
8407 ple_gap = 0;
8408 ple_window = 0;
8409 ple_window_grow = 0;
8410 ple_window_max = 0;
8411 ple_window_shrink = 0;
8412 }
8413
e90e51d5 8414 if (!cpu_has_vmx_apicv())
a3203381 8415 enable_apicv = 0;
e90e51d5 8416 if (!enable_apicv)
72b0eaa9 8417 vmx_x86_ops.sync_pir_to_irr = NULL;
a3203381 8418
d588bb9b
CG
8419 if (!enable_apicv || !cpu_has_vmx_ipiv())
8420 enable_ipiv = false;
8421
88099313 8422 if (cpu_has_vmx_tsc_scaling())
938c8745 8423 kvm_caps.has_tsc_control = true;
a3203381 8424
938c8745
SC
8425 kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
8426 kvm_caps.tsc_scaling_ratio_frac_bits = 48;
8427 kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
2f4073e0 8428 kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit();
fe6b6bc8 8429
a3203381
SC
8430 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
8431
8432 if (enable_ept)
e7b7bdea
SC
8433 kvm_mmu_set_ept_masks(enable_ept_ad_bits,
8434 cpu_has_vmx_ept_execute_only());
703c335d 8435
3c5c3245
KH
8436 /*
8437 * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID
8438 * bits to shadow_zero_check.
8439 */
8440 vmx_setup_me_spte_mask();
8441
746700d2 8442 kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
f8cd457f 8443 ept_caps_to_lpage_level(vmx_capability.ept));
a3203381 8444
a3203381
SC
8445 /*
8446 * Only enable PML when hardware supports PML feature, and both EPT
8447 * and EPT A/D bit features are enabled -- PML depends on them to work.
8448 */
8449 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
8450 enable_pml = 0;
8451
a018eba5 8452 if (!enable_pml)
6dd03800 8453 vmx_x86_ops.cpu_dirty_log_size = 0;
a3203381
SC
8454
8455 if (!cpu_has_vmx_preemption_timer())
804939ea 8456 enable_preemption_timer = false;
a3203381 8457
804939ea
SC
8458 if (enable_preemption_timer) {
8459 u64 use_timer_freq = 5000ULL * 1000 * 1000;
a3203381 8460
a3203381 8461 cpu_preemption_timer_multi =
0809d9b0 8462 vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
804939ea
SC
8463
8464 if (tsc_khz)
8465 use_timer_freq = (u64)tsc_khz * 1000;
8466 use_timer_freq >>= cpu_preemption_timer_multi;
8467
8468 /*
8469 * KVM "disables" the preemption timer by setting it to its max
8470 * value. Don't use the timer if it might cause spurious exits
8471 * at a rate faster than 0.1 Hz (of uninterrupted guest time).
8472 */
8473 if (use_timer_freq > 0xffffffffu / 10)
8474 enable_preemption_timer = false;
8475 }
8476
8477 if (!enable_preemption_timer) {
72b0eaa9
SC
8478 vmx_x86_ops.set_hv_timer = NULL;
8479 vmx_x86_ops.cancel_hv_timer = NULL;
8480 vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
a3203381
SC
8481 }
8482
938c8745 8483 kvm_caps.supported_mce_cap |= MCG_LMCE_P;
aebc3ca1 8484 kvm_caps.supported_mce_cap |= MCG_CMCI_P;
a3203381 8485
f99e3daf
CP
8486 if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
8487 return -EINVAL;
6ef25aa0 8488 if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt())
f99e3daf 8489 pt_mode = PT_MODE_SYSTEM;
33271a9e
SC
8490 if (pt_mode == PT_MODE_HOST_GUEST)
8491 vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
8492 else
8493 vmx_init_ops.handle_intel_pt_intr = NULL;
f99e3daf 8494
8f102445
SC
8495 setup_default_sgx_lepubkeyhash();
8496
a3203381 8497 if (nested) {
bcdf201f 8498 nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept);
3e8eaccc 8499
6c1c6e58 8500 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
a3203381
SC
8501 if (r)
8502 return r;
8503 }
8504
3ec6fd8c 8505 vmx_set_cpu_caps();
66a6950f 8506
a3203381 8507 r = alloc_kvm_area();
fbc2dfe5 8508 if (r && nested)
a3203381 8509 nested_vmx_hardware_unsetup();
ec5a4919
SC
8510
8511 kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
8512
a3203381
SC
8513 return r;
8514}
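
One of the guards in hardware_setup() deserves a worked example: because KVM "disables" the preemption timer by programming its maximum 32-bit value, the timer is only usable if even that maximum takes at least 10 seconds of guest time to expire (the 0.1 Hz bound in the comment above). Restated as stand-alone arithmetic, with the TSC frequency and rate divider as made-up inputs:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Usable iff a fully loaded 32-bit timer expires no faster than 0.1 Hz. */
static bool preemption_timer_usable(uint64_t tsc_khz, unsigned int rate)
{
        uint64_t timer_freq = (tsc_khz * 1000) >> rate; /* timer ticks per second */

        return timer_freq <= 0xffffffffu / 10;
}

int main(void)
{
        /* e.g. a 3 GHz TSC with a divide-by-32 rate: ~93.75 MHz timer clock. */
        printf("%s\n", preemption_timer_usable(3000000, 5) ? "usable" : "disabled");
        return 0;
}
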
8515
d008dfdb 8516static struct kvm_x86_init_ops vmx_init_ops __initdata = {
d008dfdb 8517 .hardware_setup = hardware_setup,
33271a9e 8518 .handle_intel_pt_intr = NULL,
57b119da 8519
d008dfdb 8520 .runtime_ops = &vmx_x86_ops,
34886e79 8521 .pmu_ops = &intel_pmu_ops,
6aa8b732
AK
8522};
8523
72c6d2db 8524static void vmx_cleanup_l1d_flush(void)
a47dd5f0
PB
8525{
8526 if (vmx_l1d_flush_pages) {
8527 free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
8528 vmx_l1d_flush_pages = NULL;
8529 }
72c6d2db
TG
8530 /* Restore state so sysfs ignores VMX */
8531 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
a399477e
KRW
8532}
8533
e32b1200 8534static void __vmx_exit(void)
a7b9020b 8535{
e32b1200
SC
8536 allow_smaller_maxphyaddr = false;
8537
a7b9020b
TG
8538#ifdef CONFIG_KEXEC_CORE
8539 RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
8540 synchronize_rcu();
8541#endif
e32b1200
SC
8542 vmx_cleanup_l1d_flush();
8543}
a7b9020b 8544
e32b1200
SC
8545static void vmx_exit(void)
8546{
a7b9020b 8547 kvm_exit();
4f8396b9 8548 kvm_x86_vendor_exit();
a7b9020b 8549
e32b1200 8550 __vmx_exit();
a7b9020b
TG
8551}
8552module_exit(vmx_exit);
8553
6aa8b732
AK
8554static int __init vmx_init(void)
8555{
dbef2808 8556 int r, cpu;
773e8a04 8557
d4193132
SC
8558 if (!kvm_is_vmx_supported())
8559 return -EOPNOTSUPP;
8560
773e8a04 8561 /*
451d39e8
SC
8562 * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing
8563 * to unwind if a later step fails.
773e8a04 8564 */
451d39e8 8565 hv_init_evmcs();
773e8a04 8566
4f8396b9
SC
8567 r = kvm_x86_vendor_init(&vmx_init_ops);
8568 if (r)
8569 return r;
8570
a7b9020b 8571 /*
4f8396b9 8572 * Must be called after common x86 init so enable_ept is properly set
7db92e16
TG
 8573 * up. Hand in the mitigation parameter value that was stored by the
 8574 * pre-module-init parser. If no parameter was given, it will
8575 * contain 'auto' which will be turned into the default 'cond'
8576 * mitigation mode.
8577 */
19a36d32 8578 r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
4f8396b9
SC
8579 if (r)
8580 goto err_l1d_flush;
25c5f225 8581
027bbb88
PG
8582 vmx_setup_fb_clear_ctrl();
8583
dbef2808
VK
8584 for_each_possible_cpu(cpu) {
8585 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
8888cdd0 8586
a3ff25fc 8587 pi_init_cpu(cpu);
dbef2808
VK
8588 }
8589
2965faa5 8590#ifdef CONFIG_KEXEC_CORE
8f536b76
ZY
8591 rcu_assign_pointer(crash_vmclear_loaded_vmcss,
8592 crash_vmclear_local_loaded_vmcss);
8593#endif
21ebf53b 8594 vmx_check_vmcs12_offsets();
8f536b76 8595
3edd6839 8596 /*
b96e6506
MG
8597 * Shadow paging doesn't have a (further) performance penalty
 8598 * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR, so enable it
 8599 * by default.
3edd6839 8600 */
b96e6506
MG
8601 if (!enable_ept)
8602 allow_smaller_maxphyaddr = true;
3edd6839 8603
e32b1200
SC
8604 /*
 8605 * Common KVM initialization _must_ come last; after this, /dev/kvm is
8606 * exposed to userspace!
8607 */
81a1cf9f
SC
8608 r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx),
8609 THIS_MODULE);
e32b1200
SC
8610 if (r)
8611 goto err_kvm_init;
8612
fdef3ad1 8613 return 0;
4f8396b9 8614
4f8396b9 8615err_kvm_init:
e32b1200
SC
8616 __vmx_exit();
8617err_l1d_flush:
4f8396b9
SC
8618 kvm_x86_vendor_exit();
8619 return r;
6aa8b732 8620}
a7b9020b 8621module_init(vmx_init);