// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/highmem.h>
#include <linux/hrtimer.h>
#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mod_devicetable.h>
#include <linux/mm.h>
#include <linux/objtool.h>
#include <linux/sched.h>
#include <linux/sched/smt.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/trace_events.h>
#include <linux/entry-kvm.h>

#include <asm/apic.h>
#include <asm/asm.h>
#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/fpu/api.h>
#include <asm/fpu/xstate.h>
#include <asm/idtentry.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
#include <asm/kexec.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
#include <asm/mshyperv.h>
#include <asm/mwait.h>
#include <asm/spec-ctrl.h>
#include <asm/virtext.h>
#include <asm/vmx.h>

#include "capabilities.h"
#include "cpuid.h"
#include "hyperv.h"
#include "kvm_onhyperv.h"
#include "irq.h"
#include "kvm_cache_regs.h"
#include "lapic.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "sgx.h"
#include "trace.h"
#include "vmcs.h"
#include "vmcs12.h"
#include "vmx.h"
#include "x86.h"
#include "smm.h"

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

#ifdef MODULE
static const struct x86_cpu_id vmx_cpu_id[] = {
	X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
#endif

bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);

static bool __read_mostly enable_vnmi = 1;
module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);

bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);

bool __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, S_IRUGO);

bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
			enable_unrestricted_guest, bool, S_IRUGO);

bool __read_mostly enable_ept_ad_bits = 1;
module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);

static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);

static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);

module_param(enable_apicv, bool, S_IRUGO);

bool __read_mostly enable_ipiv = true;
module_param(enable_ipiv, bool, 0444);

/*
 * If nested=1, nested virtualization is supported, i.e., guests may use
 * VMX and be a hypervisor for its own guests. If nested=0, guests may not
 * use VMX instructions.
 */
static bool __read_mostly nested = 1;
module_param(nested, bool, S_IRUGO);

bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);

static bool __read_mostly error_on_inconsistent_vmcs_config = true;
module_param(error_on_inconsistent_vmcs_config, bool, 0444);

static bool __read_mostly dump_invalid_vmcs = 0;
module_param(dump_invalid_vmcs, bool, 0644);

#define MSR_BITMAP_MODE_X2APIC		1
#define MSR_BITMAP_MODE_X2APIC_APICV	2

#define KVM_VMX_TSC_MULTIPLIER_MAX	0xffffffffffffffffULL

/* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
static int __read_mostly cpu_preemption_timer_multi;
static bool __read_mostly enable_preemption_timer = 1;
#ifdef CONFIG_X86_64
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif

extern bool __read_mostly allow_smaller_maxphyaddr;
module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);

#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON				\
	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)

#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
	RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
	RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
	RTIT_STATUS_BYTECNT))

/*
 * List of MSRs that can be directly passed to the guest.
 * In addition to these x2apic and PT MSRs are handled specially.
 */
static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
	MSR_IA32_SPEC_CTRL,
	MSR_IA32_PRED_CMD,
	MSR_IA32_TSC,
#ifdef CONFIG_X86_64
	MSR_FS_BASE,
	MSR_GS_BASE,
	MSR_KERNEL_GS_BASE,
	MSR_IA32_XFD,
	MSR_IA32_XFD_ERR,
#endif
	MSR_IA32_SYSENTER_CS,
	MSR_IA32_SYSENTER_ESP,
	MSR_IA32_SYSENTER_EIP,
	MSR_CORE_C1_RES,
	MSR_CORE_C3_RESIDENCY,
	MSR_CORE_C6_RESIDENCY,
	MSR_CORE_C7_RESIDENCY,
};

/*
 * These 2 parameters are used to configure the controls for Pause-Loop Exiting:
 * ple_gap:    upper bound on the amount of time between two successive
 *             executions of PAUSE in a loop. Also indicates whether PLE is
 *             enabled. According to tests, this time is usually smaller than
 *             128 cycles.
 * ple_window: upper bound on the amount of time a guest is allowed to execute
 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
 *             less than 2^12 cycles.
 * Time is measured based on a counter that runs at the same rate as the TSC,
 * refer to SDM volume 3b section 21.6.13 & 22.1.3.
 */
static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
module_param(ple_gap, uint, 0444);

static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, uint, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(ple_window_grow, uint, 0444);

/* Default resets per-vcpu window every exit to ple_window. */
static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(ple_window_shrink, uint, 0444);

/* Default is to compute the maximum so we can never overflow. */
static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);

/* Default is SYSTEM mode, 1 for host-guest mode */
int __read_mostly pt_mode = PT_MODE_SYSTEM;
module_param(pt_mode, int, S_IRUGO);

static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
static DEFINE_MUTEX(vmx_l1d_flush_mutex);

/* Storage for pre module init parameter parsing */
static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;

static const struct {
	const char *option;
	bool for_parse;
} vmentry_l1d_param[] = {
	[VMENTER_L1D_FLUSH_AUTO]	 = {"auto", true},
	[VMENTER_L1D_FLUSH_NEVER]	 = {"never", true},
	[VMENTER_L1D_FLUSH_COND]	 = {"cond", true},
	[VMENTER_L1D_FLUSH_ALWAYS]	 = {"always", true},
	[VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
	[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
};

#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;

/* Control for disabling CPU Fill buffer clear */
static bool __read_mostly vmx_fb_clear_ctrl_available;

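/*
 * Resolve the requested L1D flush mode (possibly "auto") against the global
 * L1TF mitigation state, allocate the software flush pages when the CPU lacks
 * the hardware L1D flush command (X86_FEATURE_FLUSH_L1D), and flip the static
 * keys that gate the flush on VM-entry. Called from vmx_init() and from the
 * vmentry_l1d_flush module parameter setter below.
 */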
static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
	struct page *page;
	unsigned int i;

	if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
		return 0;
	}

	if (!enable_ept) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
		return 0;
	}

	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
		u64 msr;

		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
		if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
			l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
			return 0;
		}
	}

	/* If set to auto use the default l1tf mitigation method */
	if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
		switch (l1tf_mitigation) {
		case L1TF_MITIGATION_OFF:
			l1tf = VMENTER_L1D_FLUSH_NEVER;
			break;
		case L1TF_MITIGATION_FLUSH_NOWARN:
		case L1TF_MITIGATION_FLUSH:
		case L1TF_MITIGATION_FLUSH_NOSMT:
			l1tf = VMENTER_L1D_FLUSH_COND;
			break;
		case L1TF_MITIGATION_FULL:
		case L1TF_MITIGATION_FULL_FORCE:
			l1tf = VMENTER_L1D_FLUSH_ALWAYS;
			break;
		}
	} else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
		l1tf = VMENTER_L1D_FLUSH_ALWAYS;
	}

	if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
	    !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
		/*
		 * This allocation for vmx_l1d_flush_pages is not tied to a VM
		 * lifetime and so should not be charged to a memcg.
		 */
		page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
		if (!page)
			return -ENOMEM;
		vmx_l1d_flush_pages = page_address(page);

		/*
		 * Initialize each page with a different pattern in
		 * order to protect against KSM in the nested
		 * virtualization case.
		 */
		for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
			memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
			       PAGE_SIZE);
		}
	}

	l1tf_vmx_mitigation = l1tf;

	if (l1tf != VMENTER_L1D_FLUSH_NEVER)
		static_branch_enable(&vmx_l1d_should_flush);
	else
		static_branch_disable(&vmx_l1d_should_flush);

	if (l1tf == VMENTER_L1D_FLUSH_COND)
		static_branch_enable(&vmx_l1d_flush_cond);
	else
		static_branch_disable(&vmx_l1d_flush_cond);
	return 0;
}

static int vmentry_l1d_flush_parse(const char *s)
{
	unsigned int i;

	if (s) {
		for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
			if (vmentry_l1d_param[i].for_parse &&
			    sysfs_streq(s, vmentry_l1d_param[i].option))
				return i;
		}
	}
	return -EINVAL;
}

static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
{
	int l1tf, ret;

	l1tf = vmentry_l1d_flush_parse(s);
	if (l1tf < 0)
		return l1tf;

	if (!boot_cpu_has(X86_BUG_L1TF))
		return 0;

	/*
	 * Has vmx_init() run already? If not then this is the pre init
	 * parameter parsing. In that case just store the value and let
	 * vmx_init() do the proper setup after enable_ept has been
	 * established.
	 */
	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
		vmentry_l1d_flush_param = l1tf;
		return 0;
	}

	mutex_lock(&vmx_l1d_flush_mutex);
	ret = vmx_setup_l1d_flush(l1tf);
	mutex_unlock(&vmx_l1d_flush_mutex);
	return ret;
}

static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
{
	if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
		return sprintf(s, "???\n");

	return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}

static void vmx_setup_fb_clear_ctrl(void)
{
	u64 msr;

	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) &&
	    !boot_cpu_has_bug(X86_BUG_MDS) &&
	    !boot_cpu_has_bug(X86_BUG_TAA)) {
		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
		if (msr & ARCH_CAP_FB_CLEAR_CTRL)
			vmx_fb_clear_ctrl_available = true;
	}
}

static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
{
	u64 msr;

	if (!vmx->disable_fb_clear)
		return;

	msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL);
	msr |= FB_CLEAR_DIS;
	native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
	/* Cache the MSR value to avoid reading it later */
	vmx->msr_ia32_mcu_opt_ctrl = msr;
}

static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
{
	if (!vmx->disable_fb_clear)
		return;

	vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
	native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
}

static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
{
	vmx->disable_fb_clear = vmx_fb_clear_ctrl_available;

	/*
	 * If the guest will not execute VERW, there is no need to set
	 * FB_CLEAR_DIS at VMEntry. Skip the MSR read/write when a guest has
	 * no use case to execute VERW.
	 */
	if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
	    ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
	     (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
	     (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
	     (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
	     (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
		vmx->disable_fb_clear = false;
}

static const struct kernel_param_ops vmentry_l1d_flush_ops = {
	.set = vmentry_l1d_flush_set,
	.get = vmentry_l1d_flush_get,
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);

static u32 vmx_segment_access_rights(struct kvm_segment *var);

void vmx_vmexit(void);

#define vmx_insn_failed(fmt...)		\
do {					\
	WARN_ONCE(1, fmt);		\
	pr_warn_ratelimited(fmt);	\
} while (0)

void vmread_error(unsigned long field, bool fault)
{
	if (fault)
		kvm_spurious_fault();
	else
		vmx_insn_failed("vmread failed: field=%lx\n", field);
}

noinline void vmwrite_error(unsigned long field, unsigned long value)
{
	vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n",
			field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
}

noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
{
	vmx_insn_failed("vmclear failed: %p/%llx err=%u\n",
			vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}

noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
{
	vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n",
			vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}

noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
{
	vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
			ext, vpid, gva);
}

noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
{
	vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
			ext, eptp, gpa);
}

static DEFINE_PER_CPU(struct vmcs *, vmxarea);
DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
 */
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);

static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);

struct vmcs_config vmcs_config __ro_after_init;
struct vmx_capability vmx_capability __ro_after_init;

#define VMX_SEGMENT_FIELD(seg)					\
	[VCPU_SREG_##seg] = {					\
		.selector = GUEST_##seg##_SELECTOR,		\
		.base = GUEST_##seg##_BASE,			\
		.limit = GUEST_##seg##_LIMIT,			\
		.ar_bytes = GUEST_##seg##_AR_BYTES,		\
	}

static const struct kvm_vmx_segment_field {
	unsigned selector;
	unsigned base;
	unsigned limit;
	unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
	VMX_SEGMENT_FIELD(CS),
	VMX_SEGMENT_FIELD(DS),
	VMX_SEGMENT_FIELD(ES),
	VMX_SEGMENT_FIELD(FS),
	VMX_SEGMENT_FIELD(GS),
	VMX_SEGMENT_FIELD(SS),
	VMX_SEGMENT_FIELD(TR),
	VMX_SEGMENT_FIELD(LDTR),
};

static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
{
	vmx->segment_cache.bitmask = 0;
}

static unsigned long host_idt_base;

#if IS_ENABLED(CONFIG_HYPERV)
static struct kvm_x86_ops vmx_x86_ops __initdata;

static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);

static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
{
	struct hv_enlightened_vmcs *evmcs;
	struct hv_partition_assist_pg **p_hv_pa_pg =
			&to_kvm_hv(vcpu->kvm)->hv_pa_pg;
	/*
	 * Synthetic VM-Exit is not enabled in current code and so all the
	 * eVMCSes in a single VM share the same assist page.
	 */
	if (!*p_hv_pa_pg)
		*p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);

	if (!*p_hv_pa_pg)
		return -ENOMEM;

	evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;

	evmcs->partition_assist_page =
		__pa(*p_hv_pa_pg);
	evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
	evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;

	return 0;
}

static __init void hv_init_evmcs(void)
{
	int cpu;

	if (!enlightened_vmcs)
		return;

	/*
	 * Enlightened VMCS usage should be recommended and the host needs
	 * to support eVMCS v1 or above.
	 */
	if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
	    (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
	     KVM_EVMCS_VERSION) {

		/* Check that we have assist pages on all online CPUs */
		for_each_online_cpu(cpu) {
			if (!hv_get_vp_assist_page(cpu)) {
				enlightened_vmcs = false;
				break;
			}
		}

		if (enlightened_vmcs) {
			pr_info("Using Hyper-V Enlightened VMCS\n");
			static_branch_enable(&__kvm_is_using_evmcs);
		}

		if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
			vmx_x86_ops.enable_l2_tlb_flush
				= hv_enable_l2_tlb_flush;

	} else {
		enlightened_vmcs = false;
	}
}

static void hv_reset_evmcs(void)
{
	struct hv_vp_assist_page *vp_ap;

	if (!kvm_is_using_evmcs())
		return;

	/*
	 * KVM should enable eVMCS if and only if all CPUs have a VP assist
	 * page, and should reject CPU onlining if eVMCS is enabled but the
	 * CPU doesn't have a VP assist page allocated.
	 */
	vp_ap = hv_get_vp_assist_page(smp_processor_id());
	if (WARN_ON_ONCE(!vp_ap))
		return;

	/*
	 * Reset everything to support using non-enlightened VMCS access later
	 * (e.g. when we reload the module with enlightened_vmcs=0)
	 */
	vp_ap->nested_control.features.directhypercall = 0;
	vp_ap->current_nested_vmcs = 0;
	vp_ap->enlighten_vmentry = 0;
}

#else /* IS_ENABLED(CONFIG_HYPERV) */
static void hv_init_evmcs(void) {}
static void hv_reset_evmcs(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */

/*
 * Comment's format: document - errata name - stepping - processor name.
 * Refer from
 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
 */
static u32 vmx_preemption_cpu_tfms[] = {
/* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
0x000206E6,
/* 323056.pdf - AAX65  - C2 - Xeon L3406 */
/* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
/* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020652,
/* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020655,
/* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
/* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
/*
 * 320767.pdf - AAP86  - B1 -
 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
 */
0x000106E5,
/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
0x000106A0,
/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
0x000106A1,
/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
0x000106A4,
/* 321333.pdf - AAM126 - D0 - Xeon 3500 */
/* 321324.pdf - AAK139 - D0 - Xeon 5500 */
/* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
0x000106A5,
/* Xeon E3-1220 V2 */
0x000306A8,
};

static inline bool cpu_has_broken_vmx_preemption_timer(void)
{
	u32 eax = cpuid_eax(0x00000001), i;

	/* Clear the reserved bits */
	eax &= ~(0x3U << 14 | 0xfU << 28);
	for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
		if (eax == vmx_preemption_cpu_tfms[i])
			return true;

	return false;
}

static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
{
	return flexpriority_enabled && lapic_in_kernel(vcpu);
}

static int possible_passthrough_msr_slot(u32 msr)
{
	u32 i;

	for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++)
		if (vmx_possible_passthrough_msrs[i] == msr)
			return i;

	return -ENOENT;
}

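/*
 * Sanity check that a given MSR may be passed through to the guest: it must
 * either be one of the specially handled ranges below (x2APIC, PT, LBR) or
 * have a slot in vmx_possible_passthrough_msrs[].
 */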
static bool is_valid_passthrough_msr(u32 msr)
{
	bool r;

	switch (msr) {
	case 0x800 ... 0x8ff:
		/* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
		return true;
	case MSR_IA32_RTIT_STATUS:
	case MSR_IA32_RTIT_OUTPUT_BASE:
	case MSR_IA32_RTIT_OUTPUT_MASK:
	case MSR_IA32_RTIT_CR3_MATCH:
	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
		/* PT MSRs. These are handled in pt_update_intercept_for_msr() */
	case MSR_LBR_SELECT:
	case MSR_LBR_TOS:
	case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31:
	case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31:
	case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:
	case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
	case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
		/* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
		return true;
	}

	r = possible_passthrough_msr_slot(msr) != -ENOENT;

	WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);

	return r;
}

struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
	int i;

	i = kvm_find_user_return_msr(msr);
	if (i >= 0)
		return &vmx->guest_uret_msrs[i];
	return NULL;
}

static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
				  struct vmx_uret_msr *msr, u64 data)
{
	unsigned int slot = msr - vmx->guest_uret_msrs;
	int ret = 0;

	if (msr->load_into_hardware) {
		preempt_disable();
		ret = kvm_set_user_return_msr(slot, data, msr->mask);
		preempt_enable();
	}
	if (!ret)
		msr->data = data;
	return ret;
}

#ifdef CONFIG_KEXEC_CORE
static void crash_vmclear_local_loaded_vmcss(void)
{
	int cpu = raw_smp_processor_id();
	struct loaded_vmcs *v;

	list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
			    loaded_vmcss_on_cpu_link)
		vmcs_clear(v->vmcs);
}
#endif /* CONFIG_KEXEC_CORE */

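/*
 * VMCLEAR the given loaded_vmcs on the current CPU and unlink it from this
 * CPU's loaded_vmcss_on_cpu list. Runs via IPI (see loaded_vmcs_clear()), so
 * it may race with vCPU migration and CPU offlining; hence the cpu check.
 */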
static void __loaded_vmcs_clear(void *arg)
{
	struct loaded_vmcs *loaded_vmcs = arg;
	int cpu = raw_smp_processor_id();

	if (loaded_vmcs->cpu != cpu)
		return; /* vcpu migration can race with cpu offline */
	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
		per_cpu(current_vmcs, cpu) = NULL;

	vmcs_clear(loaded_vmcs->vmcs);
	if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
		vmcs_clear(loaded_vmcs->shadow_vmcs);

	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);

	/*
	 * Ensure all writes to loaded_vmcs, including deleting it from its
	 * current percpu list, complete before setting loaded_vmcs->cpu to
	 * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
	 * and add loaded_vmcs to its percpu list before it's deleted from this
	 * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
	 */
	smp_wmb();

	loaded_vmcs->cpu = -1;
	loaded_vmcs->launched = 0;
}

void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
{
	int cpu = loaded_vmcs->cpu;

	if (cpu != -1)
		smp_call_function_single(cpu,
			 __loaded_vmcs_clear, loaded_vmcs, 1);
}

static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
				       unsigned field)
{
	bool ret;
	u32 mask = 1 << (seg * SEG_FIELD_NR + field);

	if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
		kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
		vmx->segment_cache.bitmask = 0;
	}
	ret = vmx->segment_cache.bitmask & mask;
	vmx->segment_cache.bitmask |= mask;
	return ret;
}

static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
{
	u16 *p = &vmx->segment_cache.seg[seg].selector;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
		*p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
	return *p;
}

static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
{
	ulong *p = &vmx->segment_cache.seg[seg].base;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
		*p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
	return *p;
}

static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
{
	u32 *p = &vmx->segment_cache.seg[seg].limit;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
		*p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
	return *p;
}

static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
{
	u32 *p = &vmx->segment_cache.seg[seg].ar;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
		*p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
	return *p;
}

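/*
 * Recompute the exception bitmap for @vcpu. KVM unconditionally intercepts
 * #UD, #MC, #DB and #AC, intercepts #PF only when vmx_need_pf_intercept()
 * says so, and adds further bits for guest_debug breakpoints, real-mode
 * emulation, the VMware backdoor, XFD usage and, when running a nested L2,
 * L1's own exception bitmap.
 */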
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
{
	u32 eb;

	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
	     (1u << DB_VECTOR) | (1u << AC_VECTOR);
	/*
	 * Guest access to VMware backdoor ports could legitimately
	 * trigger #GP because of TSS I/O permission bitmap.
	 * We intercept those #GP and allow access to them anyway
	 * as VMware does.
	 */
	if (enable_vmware_backdoor)
		eb |= (1u << GP_VECTOR);
	if ((vcpu->guest_debug &
	     (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
	    (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
		eb |= 1u << BP_VECTOR;
	if (to_vmx(vcpu)->rmode.vm86_active)
		eb = ~0;
	if (!vmx_need_pf_intercept(vcpu))
		eb &= ~(1u << PF_VECTOR);

	/* When we are running a nested L2 guest and L1 specified for it a
	 * certain exception bitmap, we must trap the same exceptions and pass
	 * them to L1. When running L2, we will only handle the exceptions
	 * specified above if L1 did not want them.
	 */
	if (is_guest_mode(vcpu))
		eb |= get_vmcs12(vcpu)->exception_bitmap;
	else {
		int mask = 0, match = 0;

		if (enable_ept && (eb & (1u << PF_VECTOR))) {
			/*
			 * If EPT is enabled, #PF is currently only intercepted
			 * if MAXPHYADDR is smaller on the guest than on the
			 * host.  In that case we only care about present,
			 * non-reserved faults.  For vmcs02, however, PFEC_MASK
			 * and PFEC_MATCH are set in prepare_vmcs02_rare.
			 */
			mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK;
			match = PFERR_PRESENT_MASK;
		}
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
	}

	/*
	 * Disabling xfd interception indicates that dynamic xfeatures
	 * might be used in the guest. Always trap #NM in this case
	 * to save guest xfd_err timely.
	 */
	if (vcpu->arch.xfd_no_write_intercept)
		eb |= (1u << NM_VECTOR);

	vmcs_write32(EXCEPTION_BITMAP, eb);
}

/*
 * Check if MSR is intercepted for currently loaded MSR bitmap.
 */
static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
{
	if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
		return true;

	return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr);
}

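/*
 * Compute the VMX_RUN_* flags consumed by the assembly VM-entry path:
 * whether to VMRESUME (vs. VMLAUNCH) and whether SPEC_CTRL must be saved
 * after VM-exit because the guest can write it without interception.
 */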
unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
{
	unsigned int flags = 0;

	if (vmx->loaded_vmcs->launched)
		flags |= VMX_RUN_VMRESUME;

	/*
	 * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free
	 * to change it directly without causing a vmexit.  In that case read
	 * it after vmexit and store it in vmx->spec_ctrl.
	 */
	if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
		flags |= VMX_RUN_SAVE_SPEC_CTRL;

	return flags;
}

static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit)
{
	vm_entry_controls_clearbit(vmx, entry);
	vm_exit_controls_clearbit(vmx, exit);
}

int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
{
	unsigned int i;

	for (i = 0; i < m->nr; ++i) {
		if (m->val[i].index == msr)
			return i;
	}
	return -ENOENT;
}

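/*
 * Stop atomically switching @msr across VM transitions: either clear the
 * dedicated VM-entry/VM-exit load controls (EFER, PERF_GLOBAL_CTRL) or drop
 * the MSR from the guest/host autoload lists.
 */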
static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
{
	int i;
	struct msr_autoload *m = &vmx->msr_autoload;

	switch (msr) {
	case MSR_EFER:
		if (cpu_has_load_ia32_efer()) {
			clear_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_EFER,
					VM_EXIT_LOAD_IA32_EFER);
			return;
		}
		break;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (cpu_has_load_perf_global_ctrl()) {
			clear_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
			return;
		}
		break;
	}
	i = vmx_find_loadstore_msr_slot(&m->guest, msr);
	if (i < 0)
		goto skip_guest;
	--m->guest.nr;
	m->guest.val[i] = m->guest.val[m->guest.nr];
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);

skip_guest:
	i = vmx_find_loadstore_msr_slot(&m->host, msr);
	if (i < 0)
		return;

	--m->host.nr;
	m->host.val[i] = m->host.val[m->host.nr];
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
}

static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit,
		unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
		u64 guest_val, u64 host_val)
{
	vmcs_write64(guest_val_vmcs, guest_val);
	if (host_val_vmcs != HOST_IA32_EFER)
		vmcs_write64(host_val_vmcs, host_val);
	vm_entry_controls_setbit(vmx, entry);
	vm_exit_controls_setbit(vmx, exit);
}

static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
				  u64 guest_val, u64 host_val, bool entry_only)
{
	int i, j = 0;
	struct msr_autoload *m = &vmx->msr_autoload;

	switch (msr) {
	case MSR_EFER:
		if (cpu_has_load_ia32_efer()) {
			add_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_EFER,
					VM_EXIT_LOAD_IA32_EFER,
					GUEST_IA32_EFER,
					HOST_IA32_EFER,
					guest_val, host_val);
			return;
		}
		break;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (cpu_has_load_perf_global_ctrl()) {
			add_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
					GUEST_IA32_PERF_GLOBAL_CTRL,
					HOST_IA32_PERF_GLOBAL_CTRL,
					guest_val, host_val);
			return;
		}
		break;
	case MSR_IA32_PEBS_ENABLE:
		/* PEBS needs a quiescent period after being disabled (to write
		 * a record).  Disabling PEBS through VMX MSR swapping doesn't
		 * provide that period, so a CPU could write host's record into
		 * guest's memory.
		 */
		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
	}

	i = vmx_find_loadstore_msr_slot(&m->guest, msr);
	if (!entry_only)
		j = vmx_find_loadstore_msr_slot(&m->host, msr);

	if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
	    (j < 0 &&  m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
		printk_once(KERN_WARNING "Not enough msr switch entries. "
				"Can't add msr %x\n", msr);
		return;
	}
	if (i < 0) {
		i = m->guest.nr++;
		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
	}
	m->guest.val[i].index = msr;
	m->guest.val[i].value = guest_val;

	if (entry_only)
		return;

	if (j < 0) {
		j = m->host.nr++;
		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
	}
	m->host.val[j].index = msr;
	m->host.val[j].value = host_val;
}

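/*
 * Decide how EFER is switched across VM transitions: via the VMCS load-EFER
 * controls or the atomic MSR-switch lists (return false), or lazily through
 * the user-return MSR mechanism (return true, with guest_uret_msrs[] updated).
 */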
static bool update_transition_efer(struct vcpu_vmx *vmx)
{
	u64 guest_efer = vmx->vcpu.arch.efer;
	u64 ignore_bits = 0;
	int i;

	/* Shadow paging assumes NX to be available.  */
	if (!enable_ept)
		guest_efer |= EFER_NX;

	/*
	 * LMA and LME handled by hardware; SCE meaningless outside long mode.
	 */
	ignore_bits |= EFER_SCE;
#ifdef CONFIG_X86_64
	ignore_bits |= EFER_LMA | EFER_LME;
	/* SCE is meaningful only in long mode on Intel */
	if (guest_efer & EFER_LMA)
		ignore_bits &= ~(u64)EFER_SCE;
#endif

	/*
	 * On EPT, we can't emulate NX, so we must switch EFER atomically.
	 * On CPUs that support "load IA32_EFER", always switch EFER
	 * atomically, since it's faster than switching it manually.
	 */
	if (cpu_has_load_ia32_efer() ||
	    (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
		if (!(guest_efer & EFER_LMA))
			guest_efer &= ~EFER_LME;
		if (guest_efer != host_efer)
			add_atomic_switch_msr(vmx, MSR_EFER,
					      guest_efer, host_efer, false);
		else
			clear_atomic_switch_msr(vmx, MSR_EFER);
		return false;
	}

	i = kvm_find_user_return_msr(MSR_EFER);
	if (i < 0)
		return false;

	clear_atomic_switch_msr(vmx, MSR_EFER);

	guest_efer &= ~ignore_bits;
	guest_efer |= host_efer & ignore_bits;

	vmx->guest_uret_msrs[i].data = guest_efer;
	vmx->guest_uret_msrs[i].mask = ~ignore_bits;

	return true;
}

#ifdef CONFIG_X86_32
/*
 * On 32-bit kernels, VM exits still load the FS and GS bases from the
 * VMCS rather than the segment table.  KVM uses this helper to figure
 * out the current bases to poke them into the VMCS before entry.
 */
static unsigned long segment_base(u16 selector)
{
	struct desc_struct *table;
	unsigned long v;

	if (!(selector & ~SEGMENT_RPL_MASK))
		return 0;

	table = get_current_gdt_ro();

	if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
		u16 ldt_selector = kvm_read_ldt();

		if (!(ldt_selector & ~SEGMENT_RPL_MASK))
			return 0;

		table = (struct desc_struct *)segment_base(ldt_selector);
	}
	v = get_desc_base(&table[selector >> 3]);
	return v;
}
#endif

static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
{
	return vmx_pt_mode_is_host_guest() &&
	       !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
}

static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
{
	/* The base must be 128-byte aligned and a legal physical address. */
	return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
}

static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
{
	u32 i;

	wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
	wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
	wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
	wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
	for (i = 0; i < addr_range; i++) {
		wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
		wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
	}
}

static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
{
	u32 i;

	rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
	rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
	rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
	rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
	for (i = 0; i < addr_range; i++) {
		rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
		rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
	}
}

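/*
 * Swap Intel PT state around VM-entry when PT is virtualized in host/guest
 * mode: the host's RTIT_CTL is always saved and, if the guest has tracing
 * enabled, host tracing is paused and the guest's PT context is loaded.
 * pt_guest_exit() performs the mirror-image operation after VM-exit.
 */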
static void pt_guest_enter(struct vcpu_vmx *vmx)
{
	if (vmx_pt_mode_is_system())
		return;

	/*
	 * GUEST_IA32_RTIT_CTL is already set in the VMCS.
	 * Save host state before VM entry.
	 */
	rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
		wrmsrl(MSR_IA32_RTIT_CTL, 0);
		pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
		pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
	}
}

static void pt_guest_exit(struct vcpu_vmx *vmx)
{
	if (vmx_pt_mode_is_system())
		return;

	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
		pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
		pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
	}

	/*
	 * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
	 * i.e. RTIT_CTL is always cleared on VM-Exit.  Restore it if necessary.
	 */
	if (vmx->pt_desc.host.ctl)
		wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
}

void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
			unsigned long fs_base, unsigned long gs_base)
{
	if (unlikely(fs_sel != host->fs_sel)) {
		if (!(fs_sel & 7))
			vmcs_write16(HOST_FS_SELECTOR, fs_sel);
		else
			vmcs_write16(HOST_FS_SELECTOR, 0);
		host->fs_sel = fs_sel;
	}
	if (unlikely(gs_sel != host->gs_sel)) {
		if (!(gs_sel & 7))
			vmcs_write16(HOST_GS_SELECTOR, gs_sel);
		else
			vmcs_write16(HOST_GS_SELECTOR, 0);
		host->gs_sel = gs_sel;
	}
	if (unlikely(fs_base != host->fs_base)) {
		vmcs_writel(HOST_FS_BASE, fs_base);
		host->fs_base = fs_base;
	}
	if (unlikely(gs_base != host->gs_base)) {
		vmcs_writel(HOST_GS_BASE, gs_base);
		host->gs_base = gs_base;
	}
}

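/*
 * Load guest state that is not handled by the VMCS (user-return MSRs, FS/GS
 * selectors and bases, kernel GS base) before entering the guest. The host
 * values are stashed in loaded_vmcs->host_state and restored later by
 * vmx_prepare_switch_to_host().
 */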
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs_host_state *host_state;
#ifdef CONFIG_X86_64
	int cpu = raw_smp_processor_id();
#endif
	unsigned long fs_base, gs_base;
	u16 fs_sel, gs_sel;
	int i;

	vmx->req_immediate_exit = false;

	/*
	 * Note that guest MSRs to be saved/restored can also be changed
	 * when guest state is loaded. This happens when guest transitions
	 * to/from long-mode by setting MSR_EFER.LMA.
	 */
	if (!vmx->guest_uret_msrs_loaded) {
		vmx->guest_uret_msrs_loaded = true;
		for (i = 0; i < kvm_nr_uret_msrs; ++i) {
			if (!vmx->guest_uret_msrs[i].load_into_hardware)
				continue;

			kvm_set_user_return_msr(i,
						vmx->guest_uret_msrs[i].data,
						vmx->guest_uret_msrs[i].mask);
		}
	}

	if (vmx->nested.need_vmcs12_to_shadow_sync)
		nested_sync_vmcs12_to_shadow(vcpu);

	if (vmx->guest_state_loaded)
		return;

	host_state = &vmx->loaded_vmcs->host_state;

	/*
	 * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
	 * allow segment selectors with cpl > 0 or ti == 1.
	 */
	host_state->ldt_sel = kvm_read_ldt();

#ifdef CONFIG_X86_64
	savesegment(ds, host_state->ds_sel);
	savesegment(es, host_state->es_sel);

	gs_base = cpu_kernelmode_gs_base(cpu);
	if (likely(is_64bit_mm(current->mm))) {
		current_save_fsgs();
		fs_sel = current->thread.fsindex;
		gs_sel = current->thread.gsindex;
		fs_base = current->thread.fsbase;
		vmx->msr_host_kernel_gs_base = current->thread.gsbase;
	} else {
		savesegment(fs, fs_sel);
		savesegment(gs, gs_sel);
		fs_base = read_msr(MSR_FS_BASE);
		vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
	}

	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
	savesegment(fs, fs_sel);
	savesegment(gs, gs_sel);
	fs_base = segment_base(fs_sel);
	gs_base = segment_base(gs_sel);
#endif

	vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
	vmx->guest_state_loaded = true;
}

static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
{
	struct vmcs_host_state *host_state;

	if (!vmx->guest_state_loaded)
		return;

	host_state = &vmx->loaded_vmcs->host_state;

	++vmx->vcpu.stat.host_state_reload;

#ifdef CONFIG_X86_64
	rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
	if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
		kvm_load_ldt(host_state->ldt_sel);
#ifdef CONFIG_X86_64
		load_gs_index(host_state->gs_sel);
#else
		loadsegment(gs, host_state->gs_sel);
#endif
	}
	if (host_state->fs_sel & 7)
		loadsegment(fs, host_state->fs_sel);
#ifdef CONFIG_X86_64
	if (unlikely(host_state->ds_sel | host_state->es_sel)) {
		loadsegment(ds, host_state->ds_sel);
		loadsegment(es, host_state->es_sel);
	}
#endif
	invalidate_tss_limit();
#ifdef CONFIG_X86_64
	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
#endif
	load_fixmap_gdt(raw_smp_processor_id());
	vmx->guest_state_loaded = false;
	vmx->guest_uret_msrs_loaded = false;
}

#ifdef CONFIG_X86_64
static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
{
	preempt_disable();
	if (vmx->guest_state_loaded)
		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
	preempt_enable();
	return vmx->msr_guest_kernel_gs_base;
}

static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
{
	preempt_disable();
	if (vmx->guest_state_loaded)
		wrmsrl(MSR_KERNEL_GS_BASE, data);
	preempt_enable();
	vmx->msr_guest_kernel_gs_base = data;
}
#endif

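/*
 * Bind the vCPU's current VMCS to this physical CPU: migrate it off its old
 * CPU if necessary, VMPTRLD it if it is not already the CPU's current VMCS,
 * and refresh the per-CPU host state (TSS, GDT, SYSENTER_ESP) in the VMCS.
 */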
void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
			struct loaded_vmcs *buddy)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
	struct vmcs *prev;

	if (!already_loaded) {
		loaded_vmcs_clear(vmx->loaded_vmcs);
		local_irq_disable();

		/*
		 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
		 * this cpu's percpu list, otherwise it may not yet be deleted
		 * from its previous cpu's percpu list.  Pairs with the
		 * smp_wmb() in __loaded_vmcs_clear().
		 */
		smp_rmb();

		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
			 &per_cpu(loaded_vmcss_on_cpu, cpu));
		local_irq_enable();
	}

	prev = per_cpu(current_vmcs, cpu);
	if (prev != vmx->loaded_vmcs->vmcs) {
		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
		vmcs_load(vmx->loaded_vmcs->vmcs);

		/*
		 * No indirect branch prediction barrier needed when switching
		 * the active VMCS within a vCPU, unless IBRS is advertised to
		 * the vCPU.  To minimize the number of IBPBs executed, KVM
		 * performs IBPB on nested VM-Exit (a single nested transition
		 * may switch the active VMCS multiple times).
		 */
		if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
			indirect_branch_prediction_barrier();
	}

	if (!already_loaded) {
		void *gdt = get_current_gdt_ro();

		/*
		 * Flush all EPTP/VPID contexts, the new pCPU may have stale
		 * TLB entries from its previous association with the vCPU.
		 */
		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);

		/*
		 * Linux uses per-cpu TSS and GDT, so set these when switching
		 * processors.  See 22.2.4.
		 */
		vmcs_writel(HOST_TR_BASE,
			    (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
		vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */

		if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
			/* 22.2.3 */
			vmcs_writel(HOST_IA32_SYSENTER_ESP,
				    (unsigned long)(cpu_entry_stack(cpu) + 1));
		}

		vmx->loaded_vmcs->cpu = cpu;
	}
}

/*
 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
 * vcpu mutex is already taken.
 */
static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	vmx_vcpu_load_vmcs(vcpu, cpu, NULL);

	vmx_vcpu_pi_load(vcpu, cpu);

	vmx->host_debugctlmsr = get_debugctlmsr();
}

static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
	vmx_vcpu_pi_put(vcpu);

	vmx_prepare_switch_to_host(to_vmx(vcpu));
}

bool vmx_emulation_required(struct kvm_vcpu *vcpu)
{
	return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
}

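/*
 * RFLAGS is cached in vmx->rflags; reads go through the register-availability
 * tracking so the VMCS is only consulted on the first access after a VM-exit.
 * During real-mode emulation the IOPL/VM bits that KVM forces into the guest
 * are hidden and the caller sees the saved real-mode values instead.
 */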
unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long rflags, save_rflags;

	if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
		kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
		rflags = vmcs_readl(GUEST_RFLAGS);
		if (vmx->rmode.vm86_active) {
			rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
			save_rflags = vmx->rmode.save_rflags;
			rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
		}
		vmx->rflags = rflags;
	}
	return vmx->rflags;
}

void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long old_rflags;

	if (is_unrestricted_guest(vcpu)) {
		kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
		vmx->rflags = rflags;
		vmcs_writel(GUEST_RFLAGS, rflags);
		return;
	}

	old_rflags = vmx_get_rflags(vcpu);
	vmx->rflags = rflags;
	if (vmx->rmode.vm86_active) {
		vmx->rmode.save_rflags = rflags;
		rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
	}
	vmcs_writel(GUEST_RFLAGS, rflags);

	if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
		vmx->emulation_required = vmx_emulation_required(vcpu);
}

static bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
{
	return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
}

u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
	u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	int ret = 0;

	if (interruptibility & GUEST_INTR_STATE_STI)
		ret |= KVM_X86_SHADOW_INT_STI;
	if (interruptibility & GUEST_INTR_STATE_MOV_SS)
		ret |= KVM_X86_SHADOW_INT_MOV_SS;

	return ret;
}

void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
	u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	u32 interruptibility = interruptibility_old;

	interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);

	if (mask & KVM_X86_SHADOW_INT_MOV_SS)
		interruptibility |= GUEST_INTR_STATE_MOV_SS;
	else if (mask & KVM_X86_SHADOW_INT_STI)
		interruptibility |= GUEST_INTR_STATE_STI;

	if ((interruptibility != interruptibility_old))
		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
}

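/*
 * Validate a guest WRMSR to IA32_RTIT_CTL against the PT capabilities exposed
 * to the guest. Returns 0 if the write is legal, 1 if it should #GP (reserved
 * bits, illegal changes while TraceEn is set, or out-of-range MTC/PSB/
 * cycle-threshold/ADDRx encodings).
 */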
static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long value;

	/*
	 * Any MSR write that attempts to change bits marked reserved will
	 * cause a #GP fault.
	 */
	if (data & vmx->pt_desc.ctl_bitmask)
		return 1;

	/*
	 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
	 * result in a #GP unless the same write also clears TraceEn.
	 */
	if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
	    ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
		return 1;

	/*
	 * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit
	 * and FabricEn would cause #GP, if
	 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
	 */
	if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
	    !(data & RTIT_CTL_FABRIC_EN) &&
	    !intel_pt_validate_cap(vmx->pt_desc.caps,
				   PT_CAP_single_range_output))
		return 1;

	/*
	 * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that
	 * utilizes encodings marked reserved will cause a #GP fault.
	 */
	value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
	    !test_bit((data & RTIT_CTL_MTC_RANGE) >>
		      RTIT_CTL_MTC_RANGE_OFFSET, &value))
		return 1;
	value = intel_pt_validate_cap(vmx->pt_desc.caps,
				      PT_CAP_cycle_thresholds);
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
	    !test_bit((data & RTIT_CTL_CYC_THRESH) >>
		      RTIT_CTL_CYC_THRESH_OFFSET, &value))
		return 1;
	value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
	    !test_bit((data & RTIT_CTL_PSB_FREQ) >>
		      RTIT_CTL_PSB_FREQ_OFFSET, &value))
		return 1;

	/*
	 * If ADDRx_CFG is reserved or its encoding is greater than 2, the
	 * write will cause a #GP fault.
	 */
	value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
	if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2))
		return 1;
	value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
	if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2))
		return 1;
	value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
	if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2))
		return 1;
	value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
	if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2))
		return 1;

	return 0;
}

static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
					void *insn, int insn_len)
{
	/*
	 * Emulation of instructions in SGX enclaves is impossible as RIP does
	 * not point at the failing instruction, and even if it did, the code
	 * stream is inaccessible.  Inject #UD instead of exiting to userspace
	 * so that guest userspace can't DoS the guest simply by triggering
	 * emulation (enclaves are CPL3 only).
	 */
	if (to_vmx(vcpu)->exit_reason.enclave_mode) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return false;
	}
	return true;
}

1957aa63 1647static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
6aa8b732 1648{
3c0c2ad1 1649 union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
fede8076 1650 unsigned long rip, orig_rip;
3c0c2ad1 1651 u32 instr_len;
6aa8b732 1652
1957aa63
SC
1653 /*
1654 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
1655 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
1656 * set when EPT misconfig occurs. In practice, real hardware updates
1657 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
1658 * (namely Hyper-V) don't set it due to it being undefined behavior,
1659 * i.e. we end up advancing IP with some random value.
1660 */
1661 if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
3c0c2ad1
SC
1662 exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
1663 instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1664
1665 /*
1666 * Emulating an enclave's instructions isn't supported as KVM
1667 * cannot access the enclave's memory or its true RIP, e.g. the
1668 * vmcs.GUEST_RIP points at the exit point of the enclave, not
1669 * the RIP that actually triggered the VM-Exit. But, because
1670 * most instructions that cause VM-Exit will #UD in an enclave,
1671 * most instruction-based VM-Exits simply do not occur.
1672 *
1673 * There are a few exceptions, notably the debug instructions
1674 * INT1ICEBRK and INT3, as they are allowed in debug enclaves
1675 * and generate #DB/#BP as expected, which KVM might intercept.
1676 * But again, the CPU does the dirty work and saves an instr
1677 * length of zero so VMMs don't shoot themselves in the foot.
1678 * WARN if KVM tries to skip a non-zero length instruction on
1679 * a VM-Exit from an enclave.
1680 */
1681 if (!instr_len)
1682 goto rip_updated;
1683
8d20bd63
SC
1684 WARN_ONCE(exit_reason.enclave_mode,
1685 "skipping instruction after SGX enclave VM-Exit");
3c0c2ad1 1686
fede8076 1687 orig_rip = kvm_rip_read(vcpu);
3c0c2ad1 1688 rip = orig_rip + instr_len;
fede8076
PB
1689#ifdef CONFIG_X86_64
1690 /*
1691 * We need to mask out the high 32 bits of RIP if not in 64-bit
1692 * mode, but just finding out that we are in 64-bit mode is
1693 * quite expensive. Only do it if there was a carry.
1694 */
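	/*
	 * Note: the XOR test equals 3 only when the old RIP fit in 32 bits
	 * and the addition carried out of bit 31, i.e. only when a 32-bit
	 * RIP wrapped and actually needs to be masked.
	 */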
1695 if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
1696 rip = (u32)rip;
1697#endif
1957aa63
SC
1698 kvm_rip_write(vcpu, rip);
1699 } else {
1700 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
1701 return 0;
1702 }
6aa8b732 1703
3c0c2ad1 1704rip_updated:
2809f5d2
GC
1705 /* skipping an emulated instruction also counts */
1706 vmx_set_interrupt_shadow(vcpu, 0);
f8ea7c60 1707
60fc3d02 1708 return 1;
f8ea7c60
VK
1709}
1710
5ef8acbd
OU
1711/*
1712 * Recognizes a pending MTF VM-exit and records the nested state for later
1713 * delivery.
1714 */
1715static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
1716{
1717 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1718 struct vcpu_vmx *vmx = to_vmx(vcpu);
1719
1720 if (!is_guest_mode(vcpu))
1721 return;
1722
1723 /*
1724 * Per the SDM, MTF takes priority over debug-trap exceptions besides
65ec8f01
SC
1725 * TSS T-bit traps and ICEBP (INT1). KVM doesn't emulate T-bit traps
1726 * or ICEBP (in the emulator proper), and skipping of ICEBP after an
1727 * intercepted #DB deliberately avoids single-step #DB and MTF updates
1728 * as ICEBP is higher priority than both. As instruction emulation is
1729 * completed at this point (i.e. KVM is at the instruction boundary),
1730 * any #DB exception pending delivery must be a debug-trap of lower
1731 * priority than MTF. Record the pending MTF state to be delivered in
5ef8acbd
OU
1732 * vmx_check_nested_events().
1733 */
1734 if (nested_cpu_has_mtf(vmcs12) &&
1735 (!vcpu->arch.exception.pending ||
7709aba8
SC
1736 vcpu->arch.exception.vector == DB_VECTOR) &&
1737 (!vcpu->arch.exception_vmexit.pending ||
2ea89c7f 1738 vcpu->arch.exception_vmexit.vector == DB_VECTOR)) {
5ef8acbd 1739 vmx->nested.mtf_pending = true;
2ea89c7f
SC
1740 kvm_make_request(KVM_REQ_EVENT, vcpu);
1741 } else {
5ef8acbd 1742 vmx->nested.mtf_pending = false;
2ea89c7f 1743 }
5ef8acbd
OU
1744}
1745
1746static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
1747{
1748 vmx_update_emulated_instruction(vcpu);
1749 return skip_emulated_instruction(vcpu);
1750}
1751
caa057a2
WL
1752static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1753{
1754 /*
1755 * Ensure that we clear the HLT state in the VMCS. We don't need to
1756 * explicitly skip the instruction because if the HLT state is set,
1757 * then the instruction is already executing and RIP has already been
1758 * advanced.
1759 */
1760 if (kvm_hlt_in_guest(vcpu->kvm) &&
1761 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1762 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1763}
1764
6ad75c5c 1765static void vmx_inject_exception(struct kvm_vcpu *vcpu)
298101da 1766{
d4963e31
SC
1767 struct kvm_queued_exception *ex = &vcpu->arch.exception;
1768 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
77ab6db0
JK
1769 struct vcpu_vmx *vmx = to_vmx(vcpu);
1770
d4963e31 1771 kvm_deliver_exception_payload(vcpu, ex);
da998b46 1772
d4963e31 1773 if (ex->has_error_code) {
eba9799b
SC
1774 /*
1775 * Despite the error code being architecturally defined as 32
1776 * bits, and the VMCS field being 32 bits, Intel CPUs and thus
 1777	 * VMX don't actually support setting bits 31:16. Hardware
1778 * will (should) never provide a bogus error code, but AMD CPUs
1779 * do generate error codes with bits 31:16 set, and so KVM's
1780 * ABI lets userspace shove in arbitrary 32-bit values. Drop
 1781	 * the upper bits to avoid VM-Fail; losing information that
 1782	 * doesn't really exist is preferable to killing the VM.
1783 */
d4963e31 1784 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code);
8ab2d2e2
JK
1785 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
1786 }
77ab6db0 1787
7ffd92c5 1788 if (vmx->rmode.vm86_active) {
71f9833b 1789 int inc_eip = 0;
d4963e31 1790 if (kvm_exception_is_soft(ex->vector))
71f9833b 1791 inc_eip = vcpu->arch.event_exit_inst_len;
d4963e31 1792 kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip);
77ab6db0
JK
1793 return;
1794 }
1795
add5ff7a
SC
1796 WARN_ON_ONCE(vmx->emulation_required);
1797
d4963e31 1798 if (kvm_exception_is_soft(ex->vector)) {
66fd3f7f
GN
1799 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
1800 vmx->vcpu.arch.event_exit_inst_len);
8ab2d2e2
JK
1801 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
1802 } else
1803 intr_info |= INTR_TYPE_HARD_EXCEPTION;
1804
1805 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
caa057a2
WL
1806
1807 vmx_clear_hlt(vcpu);
298101da
AK
1808}
1809
ee9d22e0
SC
1810static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
1811 bool load_into_hardware)
a75beee6 1812{
ee9d22e0 1813 struct vmx_uret_msr *uret_msr;
a2fa3e9f 1814
ee9d22e0
SC
1815 uret_msr = vmx_find_uret_msr(vmx, msr);
1816 if (!uret_msr)
bd65ba82 1817 return;
a2fa3e9f 1818
ee9d22e0 1819 uret_msr->load_into_hardware = load_into_hardware;
a75beee6
ED
1820}
1821
e38aea3e 1822/*
400dd54b
SC
 1823 * Configure the user return MSRs to automatically save, load, and restore MSRs
1824 * that need to be shoved into hardware when running the guest. Note, omitting
1825 * an MSR here does _NOT_ mean it's not emulated, only that it will not be
1826 * loaded into hardware when running the guest.
e38aea3e 1827 */
400dd54b 1828static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
e38aea3e 1829{
a75beee6 1830#ifdef CONFIG_X86_64
ee9d22e0
SC
1831 bool load_syscall_msrs;
1832
84c8c5b8
JM
1833 /*
1834 * The SYSCALL MSRs are only needed on long mode guests, and only
1835 * when EFER.SCE is set.
1836 */
ee9d22e0
SC
1837 load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
1838 (vmx->vcpu.arch.efer & EFER_SCE);
1839
1840 vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
1841 vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
1842 vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
a75beee6 1843#endif
ee9d22e0 1844 vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
e38aea3e 1845
ee9d22e0
SC
1846 vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
1847 guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
1848 guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));
bd65ba82 1849
5e17c624
SC
1850 /*
1851 * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
1852 * kernel and old userspace. If those guests run on a tsx=off host, do
1853 * allow guests to use TSX_CTRL, but don't change the value in hardware
1854 * so that TSX remains always disabled.
1855 */
1856 vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
5897297b 1857
ee9d22e0
SC
1858 /*
 1859	 * The set of MSRs to load may have changed; reload MSRs before the
1860 * next VM-Enter.
1861 */
1862 vmx->guest_uret_msrs_loaded = false;
e38aea3e
AK
1863}
1864
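/*
 * The two helpers below report the TSC offset/multiplier that L1 requested
 * for L2 via vmcs12; common x86 code folds these together with L0's own
 * offset/multiplier to produce the values used while L2 runs.
 */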
307a94c7
IS
1865u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
1866{
1867 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1868
1869 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING))
1870 return vmcs12->tsc_offset;
1871
1872 return 0;
1873}
1874
1875u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
1876{
1877 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1878
1879 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) &&
1880 nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
1881 return vmcs12->tsc_multiplier;
1882
938c8745 1883 return kvm_caps.default_tsc_scaling_ratio;
307a94c7
IS
1884}
1885
edcfe540 1886static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
6aa8b732 1887{
edcfe540 1888 vmcs_write64(TSC_OFFSET, offset);
6aa8b732
AK
1889}
1890
1ab9287a
IS
1891static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
1892{
1893 vmcs_write64(TSC_MULTIPLIER, multiplier);
1894}
1895
801d3424
NHE
1896/*
1897 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
1898 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
1899 * all guests if the "nested" module option is off, and can also be disabled
1900 * for a single guest by disabling its VMX cpuid bit.
1901 */
7c97fcb3 1902bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
801d3424 1903{
d6321d49 1904 return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
801d3424
NHE
1905}
1906
d2a00af2
SC
1907/*
1908 * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of
1909 * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain
1910 * backwards compatibility even though KVM doesn't support emulating SMX. And
 1911 * because userspace can set "VMX in SMX", the guest must also be allowed to set it,
1912 * e.g. if the MSR is left unlocked and the guest does a RMW operation.
1913 */
1914#define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \
1915 FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \
1916 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \
1917 FEAT_CTL_SGX_LC_ENABLED | \
1918 FEAT_CTL_SGX_ENABLED | \
1919 FEAT_CTL_LMCE_ENABLED)
1920
2d6cd686
SC
1921static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
1922 struct msr_data *msr)
62cc6b9d 1923{
d2a00af2
SC
1924 uint64_t valid_bits;
1925
1926 /*
1927 * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are
1928 * exposed to the guest.
1929 */
1930 WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits &
1931 ~KVM_SUPPORTED_FEATURE_CONTROL);
1932
2d6cd686
SC
1933 if (!msr->host_initiated &&
1934 (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED))
1935 return false;
1936
d2a00af2
SC
1937 if (msr->host_initiated)
1938 valid_bits = KVM_SUPPORTED_FEATURE_CONTROL;
1939 else
1940 valid_bits = vmx->msr_ia32_feature_control_valid_bits;
62cc6b9d 1941
d2a00af2 1942 return !(msr->data & ~valid_bits);
62cc6b9d
DM
1943}
1944
55d2375e 1945static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
62cc6b9d 1946{
55d2375e
SC
1947 switch (msr->index) {
1948 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
1949 if (!nested)
1950 return 1;
1951 return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
1952 default:
12bc2132 1953 return KVM_MSR_RET_INVALID;
55d2375e 1954 }
62cc6b9d
DM
1955}
1956
55d2375e 1957/*
fe26f91d 1958 * Reads an msr value (of 'msr_info->index') into 'msr_info->data'.
55d2375e
SC
1959 * Returns 0 on success, non-0 otherwise.
1960 * Assumes vcpu_load() was already called.
1961 */
1962static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
62cc6b9d 1963{
55d2375e 1964 struct vcpu_vmx *vmx = to_vmx(vcpu);
eb3db1b1 1965 struct vmx_uret_msr *msr;
bf8c55d8 1966 u32 index;
62cc6b9d 1967
55d2375e
SC
1968 switch (msr_info->index) {
1969#ifdef CONFIG_X86_64
1970 case MSR_FS_BASE:
1971 msr_info->data = vmcs_readl(GUEST_FS_BASE);
62cc6b9d 1972 break;
55d2375e
SC
1973 case MSR_GS_BASE:
1974 msr_info->data = vmcs_readl(GUEST_GS_BASE);
62cc6b9d 1975 break;
55d2375e
SC
1976 case MSR_KERNEL_GS_BASE:
1977 msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
62cc6b9d 1978 break;
55d2375e
SC
1979#endif
1980 case MSR_EFER:
1981 return kvm_get_msr_common(vcpu, msr_info);
c11f83e0
PB
1982 case MSR_IA32_TSX_CTRL:
1983 if (!msr_info->host_initiated &&
1984 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
1985 return 1;
eb3db1b1 1986 goto find_uret_msr;
6e3ba4ab
TX
1987 case MSR_IA32_UMWAIT_CONTROL:
1988 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
1989 return 1;
1990
1991 msr_info->data = vmx->msr_ia32_umwait_control;
1992 break;
55d2375e
SC
1993 case MSR_IA32_SPEC_CTRL:
1994 if (!msr_info->host_initiated &&
39485ed9 1995 !guest_has_spec_ctrl_msr(vcpu))
55d2375e
SC
1996 return 1;
1997
1998 msr_info->data = to_vmx(vcpu)->spec_ctrl;
62cc6b9d 1999 break;
6aa8b732 2000 case MSR_IA32_SYSENTER_CS:
609e36d3 2001 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
6aa8b732
AK
2002 break;
2003 case MSR_IA32_SYSENTER_EIP:
609e36d3 2004 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
6aa8b732
AK
2005 break;
2006 case MSR_IA32_SYSENTER_ESP:
609e36d3 2007 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
6aa8b732 2008 break;
0dd376e7 2009 case MSR_IA32_BNDCFGS:
691bd434 2010 if (!kvm_mpx_supported() ||
d6321d49
RK
2011 (!msr_info->host_initiated &&
2012 !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
93c4adc7 2013 return 1;
609e36d3 2014 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
0dd376e7 2015 break;
c45dcc71
AR
2016 case MSR_IA32_MCG_EXT_CTL:
2017 if (!msr_info->host_initiated &&
a6cb099a 2018 !(vmx->msr_ia32_feature_control &
32ad73db 2019 FEAT_CTL_LMCE_ENABLED))
cae50139 2020 return 1;
c45dcc71
AR
2021 msr_info->data = vcpu->arch.mcg_ext_ctl;
2022 break;
32ad73db 2023 case MSR_IA32_FEAT_CTL:
a6cb099a 2024 msr_info->data = vmx->msr_ia32_feature_control;
cae50139 2025 break;
8f102445
SC
2026 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
2027 if (!msr_info->host_initiated &&
2028 !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
2029 return 1;
2030 msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
2031 [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
2032 break;
cae50139
JK
2033 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2034 if (!nested_vmx_allowed(vcpu))
2035 return 1;
31de3d25
VK
2036 if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
2037 &msr_info->data))
2038 return 1;
2039 /*
8d68bad6
VK
2040 * Enlightened VMCS v1 doesn't have certain VMCS fields but
2041 * instead of just ignoring the features, different Hyper-V
 2042		 * versions either try to use them and fail, or do some
 2043		 * sanity checking and refuse to boot. Filter all unsupported
2044 * features out.
31de3d25 2045 */
85ab071a 2046 if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu))
4da77090 2047 nested_evmcs_filter_control_msr(vcpu, msr_info->index,
31de3d25
VK
2048 &msr_info->data);
2049 break;
bf8c55d8 2050 case MSR_IA32_RTIT_CTL:
2ef7619d 2051 if (!vmx_pt_mode_is_host_guest())
bf8c55d8
CP
2052 return 1;
2053 msr_info->data = vmx->pt_desc.guest.ctl;
2054 break;
2055 case MSR_IA32_RTIT_STATUS:
2ef7619d 2056 if (!vmx_pt_mode_is_host_guest())
bf8c55d8
CP
2057 return 1;
2058 msr_info->data = vmx->pt_desc.guest.status;
2059 break;
2060 case MSR_IA32_RTIT_CR3_MATCH:
2ef7619d 2061 if (!vmx_pt_mode_is_host_guest() ||
bf8c55d8
CP
2062 !intel_pt_validate_cap(vmx->pt_desc.caps,
2063 PT_CAP_cr3_filtering))
2064 return 1;
2065 msr_info->data = vmx->pt_desc.guest.cr3_match;
2066 break;
2067 case MSR_IA32_RTIT_OUTPUT_BASE:
2ef7619d 2068 if (!vmx_pt_mode_is_host_guest() ||
bf8c55d8
CP
2069 (!intel_pt_validate_cap(vmx->pt_desc.caps,
2070 PT_CAP_topa_output) &&
2071 !intel_pt_validate_cap(vmx->pt_desc.caps,
2072 PT_CAP_single_range_output)))
2073 return 1;
2074 msr_info->data = vmx->pt_desc.guest.output_base;
2075 break;
2076 case MSR_IA32_RTIT_OUTPUT_MASK:
2ef7619d 2077 if (!vmx_pt_mode_is_host_guest() ||
bf8c55d8
CP
2078 (!intel_pt_validate_cap(vmx->pt_desc.caps,
2079 PT_CAP_topa_output) &&
2080 !intel_pt_validate_cap(vmx->pt_desc.caps,
2081 PT_CAP_single_range_output)))
2082 return 1;
2083 msr_info->data = vmx->pt_desc.guest.output_mask;
2084 break;
2085 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2086 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2ef7619d 2087 if (!vmx_pt_mode_is_host_guest() ||
f4d3a902 2088 (index >= 2 * vmx->pt_desc.num_address_ranges))
bf8c55d8
CP
2089 return 1;
2090 if (index % 2)
2091 msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
2092 else
2093 msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
2094 break;
d855066f
LX
2095 case MSR_IA32_DEBUGCTLMSR:
2096 msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
2097 break;
6aa8b732 2098 default:
eb3db1b1 2099 find_uret_msr:
d85a8034 2100 msr = vmx_find_uret_msr(vmx, msr_info->index);
3bab1f5d 2101 if (msr) {
609e36d3 2102 msr_info->data = msr->data;
3bab1f5d 2103 break;
6aa8b732 2104 }
609e36d3 2105 return kvm_get_msr_common(vcpu, msr_info);
6aa8b732
AK
2106 }
2107
6aa8b732
AK
2108 return 0;
2109}
2110
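/*
 * The SYSENTER EIP/ESP MSRs hold natural-width values; if the vCPU doesn't
 * have Long Mode, only the lower 32 bits of a write are meaningful, so
 * truncate before the value is stashed in vmcs12.
 */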
2408500d
SC
2111static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
2112 u64 data)
2113{
2114#ifdef CONFIG_X86_64
2115 if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
2116 return (u32)data;
2117#endif
2118 return (unsigned long)data;
2119}
2120
b333b8eb 2121static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
c6462363 2122{
18e897d2 2123 u64 debugctl = 0;
c6462363 2124
18e897d2 2125 if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
b333b8eb 2126 (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
18e897d2 2127 debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
c6462363 2128
bec46859 2129 if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) &&
b333b8eb 2130 (host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
18e897d2 2131 debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
76ea438b 2132
c6462363
LX
2133 return debugctl;
2134}
2135
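/*
 * Common handling for write-only "command" MSRs (PRED_CMD, FLUSH_CMD): only
 * the single defined command bit may be set, a write of zero is a nop, and
 * a valid command is forwarded straight to hardware.
 */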
a807b78a
EGE
2136static int vmx_set_msr_ia32_cmd(struct kvm_vcpu *vcpu,
2137 struct msr_data *msr_info,
2138 bool guest_has_feat, u64 cmd,
2139 int x86_feature_bit)
2140{
2141 if (!msr_info->host_initiated && !guest_has_feat)
2142 return 1;
2143
 2144	if (msr_info->data & ~cmd)
2145 return 1;
2146 if (!boot_cpu_has(x86_feature_bit))
2147 return 1;
2148 if (!msr_info->data)
2149 return 0;
2150
2151 wrmsrl(msr_info->index, cmd);
2152
2153 /*
2154 * For non-nested:
2155 * When it's written (to non-zero) for the first time, pass
2156 * it through.
2157 *
2158 * For nested:
2159 * The handling of the MSR bitmap for L2 guests is done in
2160 * nested_vmx_prepare_msr_bitmap. We should not touch the
2161 * vmcs02.msr_bitmap here since it gets completely overwritten
2162 * in the merging.
2163 */
2164 vmx_disable_intercept_for_msr(vcpu, msr_info->index, MSR_TYPE_W);
2165
2166 return 0;
2167}
2168
6aa8b732 2169/*
311497e0 2170 * Writes msr value into the appropriate "register".
6aa8b732
AK
2171 * Returns 0 on success, non-0 otherwise.
2172 * Assumes vcpu_load() was already called.
2173 */
8fe8ab46 2174static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
6aa8b732 2175{
a2fa3e9f 2176 struct vcpu_vmx *vmx = to_vmx(vcpu);
eb3db1b1 2177 struct vmx_uret_msr *msr;
2cc51560 2178 int ret = 0;
8fe8ab46
WA
2179 u32 msr_index = msr_info->index;
2180 u64 data = msr_info->data;
bf8c55d8 2181 u32 index;
2cc51560 2182
6aa8b732 2183 switch (msr_index) {
3bab1f5d 2184 case MSR_EFER:
8fe8ab46 2185 ret = kvm_set_msr_common(vcpu, msr_info);
2cc51560 2186 break;
16175a79 2187#ifdef CONFIG_X86_64
6aa8b732 2188 case MSR_FS_BASE:
2fb92db1 2189 vmx_segment_cache_clear(vmx);
6aa8b732
AK
2190 vmcs_writel(GUEST_FS_BASE, data);
2191 break;
2192 case MSR_GS_BASE:
2fb92db1 2193 vmx_segment_cache_clear(vmx);
6aa8b732
AK
2194 vmcs_writel(GUEST_GS_BASE, data);
2195 break;
44ea2b17 2196 case MSR_KERNEL_GS_BASE:
678e315e 2197 vmx_write_guest_kernel_gs_base(vmx, data);
44ea2b17 2198 break;
ec5be88a
JL
2199 case MSR_IA32_XFD:
2200 ret = kvm_set_msr_common(vcpu, msr_info);
b5274b1b
KT
2201 /*
2202 * Always intercepting WRMSR could incur non-negligible
 2203		 * overhead given that XFD may be changed frequently on
 2204		 * guest context switches. Disable write interception
 2205		 * upon the first write with a non-zero value (indicating
 2206		 * potential use of dynamic xfeatures). Also update the
 2207		 * exception bitmap to trap #NM for proper virtualization
2208 * of guest xfd_err.
2209 */
2210 if (!ret && data) {
2211 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
2212 MSR_TYPE_RW);
2213 vcpu->arch.xfd_no_write_intercept = true;
ec5be88a 2214 vmx_update_exception_bitmap(vcpu);
b5274b1b 2215 }
ec5be88a 2216 break;
6aa8b732
AK
2217#endif
2218 case MSR_IA32_SYSENTER_CS:
de70d279
SC
2219 if (is_guest_mode(vcpu))
2220 get_vmcs12(vcpu)->guest_sysenter_cs = data;
6aa8b732
AK
2221 vmcs_write32(GUEST_SYSENTER_CS, data);
2222 break;
2223 case MSR_IA32_SYSENTER_EIP:
2408500d
SC
2224 if (is_guest_mode(vcpu)) {
2225 data = nested_vmx_truncate_sysenter_addr(vcpu, data);
de70d279 2226 get_vmcs12(vcpu)->guest_sysenter_eip = data;
2408500d 2227 }
f5b42c33 2228 vmcs_writel(GUEST_SYSENTER_EIP, data);
6aa8b732
AK
2229 break;
2230 case MSR_IA32_SYSENTER_ESP:
2408500d
SC
2231 if (is_guest_mode(vcpu)) {
2232 data = nested_vmx_truncate_sysenter_addr(vcpu, data);
de70d279 2233 get_vmcs12(vcpu)->guest_sysenter_esp = data;
2408500d 2234 }
f5b42c33 2235 vmcs_writel(GUEST_SYSENTER_ESP, data);
6aa8b732 2236 break;
d855066f 2237 case MSR_IA32_DEBUGCTLMSR: {
b333b8eb
SC
2238 u64 invalid;
2239
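		/*
		 * Unsupported BTF/LBR bits are dropped with a one-time warning
		 * rather than injecting #GP, to keep guests that set them
		 * unconditionally running; any other unsupported bit is still
		 * rejected.
		 */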
2240 invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
d855066f 2241 if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
e76ae527 2242 kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
d855066f
LX
2243 data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
2244 invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
2245 }
2246
2247 if (invalid)
2248 return 1;
2249
699a1ac2
SC
2250 if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
2251 VM_EXIT_SAVE_DEBUG_CONTROLS)
2252 get_vmcs12(vcpu)->guest_ia32_debugctl = data;
2253
d855066f 2254 vmcs_write64(GUEST_IA32_DEBUGCTL, data);
8e12911b
LX
2255 if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
2256 (data & DEBUGCTLMSR_LBR))
2257 intel_pmu_create_guest_lbr_event(vcpu);
d855066f
LX
2258 return 0;
2259 }
0dd376e7 2260 case MSR_IA32_BNDCFGS:
691bd434 2261 if (!kvm_mpx_supported() ||
d6321d49
RK
2262 (!msr_info->host_initiated &&
2263 !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
93c4adc7 2264 return 1;
fd8cb433 2265 if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
4531662d 2266 (data & MSR_IA32_BNDCFGS_RSVD))
93c4adc7 2267 return 1;
913d6c9b
SC
2268
2269 if (is_guest_mode(vcpu) &&
2270 ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) ||
2271 (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS)))
2272 get_vmcs12(vcpu)->guest_bndcfgs = data;
2273
0dd376e7
LJ
2274 vmcs_write64(GUEST_BNDCFGS, data);
2275 break;
6e3ba4ab
TX
2276 case MSR_IA32_UMWAIT_CONTROL:
2277 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
2278 return 1;
2279
 2280		/* Reserved bit 1 and the upper bits [63:32] must be zero */
2281 if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
2282 return 1;
2283
2284 vmx->msr_ia32_umwait_control = data;
2285 break;
d28b387f
KA
2286 case MSR_IA32_SPEC_CTRL:
2287 if (!msr_info->host_initiated &&
39485ed9 2288 !guest_has_spec_ctrl_msr(vcpu))
d28b387f
KA
2289 return 1;
2290
841c2be0 2291 if (kvm_spec_ctrl_test_value(data))
d28b387f
KA
2292 return 1;
2293
2294 vmx->spec_ctrl = data;
d28b387f
KA
2295 if (!data)
2296 break;
2297
2298 /*
2299 * For non-nested:
2300 * When it's written (to non-zero) for the first time, pass
2301 * it through.
2302 *
2303 * For nested:
2304 * The handling of the MSR bitmap for L2 guests is done in
4d516fe7 2305 * nested_vmx_prepare_msr_bitmap. We should not touch the
d28b387f
KA
2306 * vmcs02.msr_bitmap here since it gets completely overwritten
2307 * in the merging. We update the vmcs01 here for L1 as well
2308 * since it will end up touching the MSR anyway now.
2309 */
476c9bd8 2310 vmx_disable_intercept_for_msr(vcpu,
d28b387f
KA
2311 MSR_IA32_SPEC_CTRL,
2312 MSR_TYPE_RW);
2313 break;
c11f83e0
PB
2314 case MSR_IA32_TSX_CTRL:
2315 if (!msr_info->host_initiated &&
2316 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
2317 return 1;
2318 if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
2319 return 1;
eb3db1b1 2320 goto find_uret_msr;
15d45071 2321 case MSR_IA32_PRED_CMD:
a807b78a
EGE
2322 ret = vmx_set_msr_ia32_cmd(vcpu, msr_info,
2323 guest_has_pred_cmd_msr(vcpu),
2324 PRED_CMD_IBPB,
2325 X86_FEATURE_IBPB);
2326 break;
2327 case MSR_IA32_FLUSH_CMD:
2328 ret = vmx_set_msr_ia32_cmd(vcpu, msr_info,
2329 guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D),
2330 L1D_FLUSH,
2331 X86_FEATURE_FLUSH_L1D);
15d45071 2332 break;
468d472f 2333 case MSR_IA32_CR_PAT:
d28f4290
SC
2334 if (!kvm_pat_valid(data))
2335 return 1;
2336
142e4be7
SC
2337 if (is_guest_mode(vcpu) &&
2338 get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
2339 get_vmcs12(vcpu)->guest_ia32_pat = data;
2340
468d472f
SY
2341 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2342 vmcs_write64(GUEST_IA32_PAT, data);
2343 vcpu->arch.pat = data;
2344 break;
2345 }
8fe8ab46 2346 ret = kvm_set_msr_common(vcpu, msr_info);
4e47c7a6 2347 break;
c45dcc71
AR
2348 case MSR_IA32_MCG_EXT_CTL:
2349 if ((!msr_info->host_initiated &&
2350 !(to_vmx(vcpu)->msr_ia32_feature_control &
32ad73db 2351 FEAT_CTL_LMCE_ENABLED)) ||
c45dcc71
AR
2352 (data & ~MCG_EXT_CTL_LMCE_EN))
2353 return 1;
2354 vcpu->arch.mcg_ext_ctl = data;
2355 break;
32ad73db 2356 case MSR_IA32_FEAT_CTL:
2d6cd686 2357 if (!is_vmx_feature_control_msr_valid(vmx, msr_info))
cae50139 2358 return 1;
2d6cd686 2359
3b84080b 2360 vmx->msr_ia32_feature_control = data;
cae50139
JK
2361 if (msr_info->host_initiated && data == 0)
2362 vmx_leave_nested(vcpu);
72add915
SC
2363
2364 /* SGX may be enabled/disabled by guest's firmware */
2365 vmx_write_encls_bitmap(vcpu, NULL);
cae50139 2366 break;
8f102445
SC
2367 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
2368 /*
2369 * On real hardware, the LE hash MSRs are writable before
2370 * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
2371 * at which point SGX related bits in IA32_FEATURE_CONTROL
2372 * become writable.
2373 *
2374 * KVM does not emulate SGX activation for simplicity, so
2375 * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
2376 * is unlocked. This is technically not architectural
2377 * behavior, but it's close enough.
2378 */
2379 if (!msr_info->host_initiated &&
2380 (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
2381 ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
2382 !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
2383 return 1;
2384 vmx->msr_ia32_sgxlepubkeyhash
2385 [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
cae50139
JK
2386 break;
2387 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
62cc6b9d
DM
2388 if (!msr_info->host_initiated)
2389 return 1; /* they are read-only */
2390 if (!nested_vmx_allowed(vcpu))
2391 return 1;
2392 return vmx_set_vmx_msr(vcpu, msr_index, data);
bf8c55d8 2393 case MSR_IA32_RTIT_CTL:
2ef7619d 2394 if (!vmx_pt_mode_is_host_guest() ||
ee85dec2
LK
2395 vmx_rtit_ctl_check(vcpu, data) ||
2396 vmx->nested.vmxon)
bf8c55d8
CP
2397 return 1;
2398 vmcs_write64(GUEST_IA32_RTIT_CTL, data);
2399 vmx->pt_desc.guest.ctl = data;
476c9bd8 2400 pt_update_intercept_for_msr(vcpu);
bf8c55d8
CP
2401 break;
2402 case MSR_IA32_RTIT_STATUS:
e348ac7c
SC
2403 if (!pt_can_write_msr(vmx))
2404 return 1;
2405 if (data & MSR_IA32_RTIT_STATUS_MASK)
bf8c55d8
CP
2406 return 1;
2407 vmx->pt_desc.guest.status = data;
2408 break;
2409 case MSR_IA32_RTIT_CR3_MATCH:
e348ac7c
SC
2410 if (!pt_can_write_msr(vmx))
2411 return 1;
2412 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2413 PT_CAP_cr3_filtering))
bf8c55d8
CP
2414 return 1;
2415 vmx->pt_desc.guest.cr3_match = data;
2416 break;
2417 case MSR_IA32_RTIT_OUTPUT_BASE:
e348ac7c
SC
2418 if (!pt_can_write_msr(vmx))
2419 return 1;
2420 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2421 PT_CAP_topa_output) &&
2422 !intel_pt_validate_cap(vmx->pt_desc.caps,
2423 PT_CAP_single_range_output))
2424 return 1;
1cc6cbc3 2425 if (!pt_output_base_valid(vcpu, data))
bf8c55d8
CP
2426 return 1;
2427 vmx->pt_desc.guest.output_base = data;
2428 break;
2429 case MSR_IA32_RTIT_OUTPUT_MASK:
e348ac7c
SC
2430 if (!pt_can_write_msr(vmx))
2431 return 1;
2432 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2433 PT_CAP_topa_output) &&
2434 !intel_pt_validate_cap(vmx->pt_desc.caps,
2435 PT_CAP_single_range_output))
bf8c55d8
CP
2436 return 1;
2437 vmx->pt_desc.guest.output_mask = data;
2438 break;
2439 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
e348ac7c
SC
2440 if (!pt_can_write_msr(vmx))
2441 return 1;
bf8c55d8 2442 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
f4d3a902 2443 if (index >= 2 * vmx->pt_desc.num_address_ranges)
bf8c55d8 2444 return 1;
fe6ed369 2445 if (is_noncanonical_address(data, vcpu))
bf8c55d8
CP
2446 return 1;
2447 if (index % 2)
2448 vmx->pt_desc.guest.addr_b[index / 2] = data;
2449 else
2450 vmx->pt_desc.guest.addr_a[index / 2] = data;
2451 break;
9c9520ce
PB
2452 case MSR_IA32_PERF_CAPABILITIES:
2453 if (data && !vcpu_to_pmu(vcpu)->version)
2454 return 1;
2455 if (data & PMU_CAP_LBR_FMT) {
2456 if ((data & PMU_CAP_LBR_FMT) !=
bec46859 2457 (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT))
9c9520ce 2458 return 1;
59cc99f6 2459 if (!cpuid_model_is_consistent(vcpu))
9c9520ce
PB
2460 return 1;
2461 }
cf8e55fe
LX
2462 if (data & PERF_CAP_PEBS_FORMAT) {
2463 if ((data & PERF_CAP_PEBS_MASK) !=
bec46859 2464 (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
cf8e55fe
LX
2465 return 1;
2466 if (!guest_cpuid_has(vcpu, X86_FEATURE_DS))
2467 return 1;
2468 if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64))
2469 return 1;
2470 if (!cpuid_model_is_consistent(vcpu))
9c9520ce
PB
2471 return 1;
2472 }
2473 ret = kvm_set_msr_common(vcpu, msr_info);
2474 break;
c11f83e0 2475
6aa8b732 2476 default:
eb3db1b1 2477 find_uret_msr:
d85a8034 2478 msr = vmx_find_uret_msr(vmx, msr_index);
b07a5c53 2479 if (msr)
7bf662bb 2480 ret = vmx_set_guest_uret_msr(vmx, msr, data);
b07a5c53
PB
2481 else
2482 ret = kvm_set_msr_common(vcpu, msr_info);
6aa8b732
AK
2483 }
2484
027bbb88
PG
2485 /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
2486 if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
2487 vmx_update_fb_clear_dis(vcpu, vmx);
2488
2cc51560 2489 return ret;
6aa8b732
AK
2490}
2491
5fdbf976 2492static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
6aa8b732 2493{
f98c1e77
SC
2494 unsigned long guest_owned_bits;
2495
cb3c1e2f
SC
2496 kvm_register_mark_available(vcpu, reg);
2497
5fdbf976
MT
2498 switch (reg) {
2499 case VCPU_REGS_RSP:
2500 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
2501 break;
2502 case VCPU_REGS_RIP:
2503 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
2504 break;
6de4f3ad
AK
2505 case VCPU_EXREG_PDPTR:
2506 if (enable_ept)
2507 ept_save_pdptrs(vcpu);
2508 break;
bd31fe49
SC
2509 case VCPU_EXREG_CR0:
2510 guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
2511
2512 vcpu->arch.cr0 &= ~guest_owned_bits;
2513 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
2514 break;
34059c25 2515 case VCPU_EXREG_CR3:
81ca0e73
SC
2516 /*
 2517		 * When intercepting CR3 loads, e.g. for shadow paging, KVM's
2518 * CR3 is loaded into hardware, not the guest's CR3.
2519 */
2520 if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING))
34059c25
SC
2521 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
2522 break;
f98c1e77
SC
2523 case VCPU_EXREG_CR4:
2524 guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
2525
2526 vcpu->arch.cr4 &= ~guest_owned_bits;
2527 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
2528 break;
5fdbf976 2529 default:
67369273 2530 KVM_BUG_ON(1, vcpu->kvm);
5fdbf976
MT
2531 break;
2532 }
6aa8b732
AK
2533}
2534
7a57c09b
SC
2535/*
2536 * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
2537 * directly instead of going through cpu_has(), to ensure KVM is trapping
2538 * ENCLS whenever it's supported in hardware. It does not matter whether
2539 * the host OS supports or has enabled SGX.
2540 */
2541static bool cpu_has_sgx(void)
2542{
2543 return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
2544}
2545
9d78d6fb
VK
2546/*
2547 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
2548 * can't be used due to errata where VM Exit may incorrectly clear
2549 * IA32_PERF_GLOBAL_CTRL[34:32]. Work around the errata by using the
2550 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
2551 */
2552static bool cpu_has_perf_global_ctrl_bug(void)
2553{
2554 if (boot_cpu_data.x86 == 0x6) {
2555 switch (boot_cpu_data.x86_model) {
2556 case INTEL_FAM6_NEHALEM_EP: /* AAK155 */
2557 case INTEL_FAM6_NEHALEM: /* AAP115 */
2558 case INTEL_FAM6_WESTMERE: /* AAT100 */
2559 case INTEL_FAM6_WESTMERE_EP: /* BC86,AAY89,BD102 */
2560 case INTEL_FAM6_NEHALEM_EX: /* BA97 */
2561 return true;
2562 default:
2563 break;
2564 }
2565 }
2566
2567 return false;
2568}
2569
d83420c2 2570static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
1c3d14fe
YS
2571{
2572 u32 vmx_msr_low, vmx_msr_high;
2573 u32 ctl = ctl_min | ctl_opt;
2574
2575 rdmsr(msr, vmx_msr_low, vmx_msr_high);
2576
2577 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
2578 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
2579
2580 /* Ensure minimum (required) set of control bits are supported. */
2581 if (ctl_min & ~ctl)
002c7f7c 2582 return -EIO;
1c3d14fe
YS
2583
2584 *result = ctl;
2585 return 0;
2586}
2587
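/*
 * Unlike the 32-bit control MSRs, the tertiary-controls MSR is a plain
 * 64-bit allowed-1 mask with no required bits, so simply filter the
 * optional bits through whatever hardware reports.
 */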
d83420c2 2588static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
1ad4e543
RH
2589{
2590 u64 allowed;
2591
2592 rdmsrl(msr, allowed);
2593
2594 return ctl_opt & allowed;
2595}
2596
d83420c2
SC
2597static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
2598 struct vmx_capability *vmx_cap)
6aa8b732
AK
2599{
2600 u32 vmx_msr_low, vmx_msr_high;
1c3d14fe
YS
2601 u32 _pin_based_exec_control = 0;
2602 u32 _cpu_based_exec_control = 0;
f78e0e2e 2603 u32 _cpu_based_2nd_exec_control = 0;
1ad4e543 2604 u64 _cpu_based_3rd_exec_control = 0;
1c3d14fe
YS
2605 u32 _vmexit_control = 0;
2606 u32 _vmentry_control = 0;
0809d9b0 2607 u64 misc_msr;
f5a81d0e
SC
2608 int i;
2609
2610 /*
2611 * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory.
2612 * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always
2613 * intercepts writes to PAT and EFER, i.e. never enables those controls.
2614 */
2615 struct {
2616 u32 entry_control;
2617 u32 exit_control;
2618 } const vmcs_entry_exit_pairs[] = {
2619 { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL },
2620 { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT },
2621 { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER },
2622 { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS },
2623 { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL },
2624 };
1c3d14fe 2625
1389309c 2626 memset(vmcs_conf, 0, sizeof(*vmcs_conf));
ee087b4d
VK
2627
2628 if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL,
2629 KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL,
2630 MSR_IA32_VMX_PROCBASED_CTLS,
2631 &_cpu_based_exec_control))
002c7f7c 2632 return -EIO;
f78e0e2e 2633 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
ee087b4d
VK
2634 if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL,
2635 KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL,
d56f546d 2636 MSR_IA32_VMX_PROCBASED_CTLS2,
ee087b4d 2637 &_cpu_based_2nd_exec_control))
f78e0e2e
SY
2638 return -EIO;
2639 }
2640#ifndef CONFIG_X86_64
2641 if (!(_cpu_based_2nd_exec_control &
2642 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2643 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
2644#endif
83d4c286
YZ
2645
2646 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2647 _cpu_based_2nd_exec_control &= ~(
8d14695f 2648 SECONDARY_EXEC_APIC_REGISTER_VIRT |
c7c9c56c
YZ
2649 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2650 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
83d4c286 2651
61f1dd90 2652 rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
7caaa711 2653 &vmx_cap->ept, &vmx_cap->vpid);
61f1dd90 2654
64f80ea7
SC
2655 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
2656 vmx_cap->ept) {
61f1dd90
WL
2657 pr_warn_once("EPT CAP should not exist if not support "
2658 "1-setting enable EPT VM-execution control\n");
3dbec44d
SC
2659
2660 if (error_on_inconsistent_vmcs_config)
2661 return -EIO;
2662
2663 vmx_cap->ept = 0;
61f1dd90
WL
2664 }
2665 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
3dbec44d 2666 vmx_cap->vpid) {
61f1dd90
WL
2667 pr_warn_once("VPID CAP should not exist if not support "
2668 "1-setting enable VPID VM-execution control\n");
3dbec44d
SC
2669
2670 if (error_on_inconsistent_vmcs_config)
2671 return -EIO;
2672
2673 vmx_cap->vpid = 0;
d56f546d 2674 }
1c3d14fe 2675
1dae2765
VK
2676 if (!cpu_has_sgx())
2677 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING;
2678
ee087b4d
VK
2679 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
2680 _cpu_based_3rd_exec_control =
2681 adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL,
1ad4e543 2682 MSR_IA32_VMX_PROCBASED_CTLS3);
1c3d14fe 2683
ee087b4d
VK
2684 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS,
2685 KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS,
2686 MSR_IA32_VMX_EXIT_CTLS,
2687 &_vmexit_control))
002c7f7c 2688 return -EIO;
1c3d14fe 2689
ee087b4d
VK
2690 if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL,
2691 KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL,
2692 MSR_IA32_VMX_PINBASED_CTLS,
2693 &_pin_based_exec_control))
01e439be
YZ
2694 return -EIO;
2695
1c17c3e6
PB
2696 if (cpu_has_broken_vmx_preemption_timer())
2697 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
01e439be 2698 if (!(_cpu_based_2nd_exec_control &
91fa0f8e 2699 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
01e439be
YZ
2700 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
2701
ee087b4d
VK
2702 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS,
2703 KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS,
2704 MSR_IA32_VMX_ENTRY_CTLS,
2705 &_vmentry_control))
002c7f7c 2706 return -EIO;
6aa8b732 2707
f5a81d0e
SC
2708 for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) {
2709 u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control;
2710 u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control;
2711
2712 if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl))
2713 continue;
2714
2715 pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n",
2716 _vmentry_control & n_ctrl, _vmexit_control & x_ctrl);
2717
3dbec44d
SC
2718 if (error_on_inconsistent_vmcs_config)
2719 return -EIO;
2720
f5a81d0e
SC
2721 _vmentry_control &= ~n_ctrl;
2722 _vmexit_control &= ~x_ctrl;
2723 }
2724
c68876fd 2725 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
1c3d14fe
YS
2726
2727 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
2728 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
002c7f7c 2729 return -EIO;
1c3d14fe
YS
2730
2731#ifdef CONFIG_X86_64
2732 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
2733 if (vmx_msr_high & (1u<<16))
002c7f7c 2734 return -EIO;
1c3d14fe
YS
2735#endif
2736
2737 /* Require Write-Back (WB) memory type for VMCS accesses. */
2738 if (((vmx_msr_high >> 18) & 15) != 6)
002c7f7c 2739 return -EIO;
1c3d14fe 2740
0809d9b0
VK
2741 rdmsrl(MSR_IA32_VMX_MISC, misc_msr);
2742
002c7f7c 2743 vmcs_conf->size = vmx_msr_high & 0x1fff;
9ac7e3e8 2744 vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
773e8a04 2745
2307af1c 2746 vmcs_conf->revision_id = vmx_msr_low;
1c3d14fe 2747
002c7f7c
YS
2748 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
2749 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
f78e0e2e 2750 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
1ad4e543 2751 vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control;
002c7f7c
YS
2752 vmcs_conf->vmexit_ctrl = _vmexit_control;
2753 vmcs_conf->vmentry_ctrl = _vmentry_control;
0809d9b0 2754 vmcs_conf->misc = misc_msr;
1c3d14fe 2755
80edc49f
VK
2756#if IS_ENABLED(CONFIG_HYPERV)
2757 if (enlightened_vmcs)
2758 evmcs_sanitize_exec_ctrls(vmcs_conf);
2759#endif
2760
1c3d14fe 2761 return 0;
c68876fd 2762}
6aa8b732 2763
d83420c2 2764static bool kvm_is_vmx_supported(void)
8504ef21 2765{
c82a5c5c
CG
2766 int cpu = raw_smp_processor_id();
2767
8504ef21 2768 if (!cpu_has_vmx()) {
c82a5c5c 2769 pr_err("VMX not supported by CPU %d\n", cpu);
8504ef21
SC
2770 return false;
2771 }
2772
2773 if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
2774 !this_cpu_has(X86_FEATURE_VMX)) {
c82a5c5c 2775 pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu);
8504ef21
SC
2776 return false;
2777 }
2778
2779 return true;
2780}
2781
d83420c2 2782static int vmx_check_processor_compat(void)
8504ef21 2783{
c82a5c5c 2784 int cpu = raw_smp_processor_id();
8504ef21
SC
2785 struct vmcs_config vmcs_conf;
2786 struct vmx_capability vmx_cap;
2787
2788 if (!kvm_is_vmx_supported())
2789 return -EIO;
2790
c82a5c5c
CG
2791 if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
2792 pr_err("Failed to setup VMCS config on CPU %d\n", cpu);
8504ef21 2793 return -EIO;
c82a5c5c 2794 }
8504ef21
SC
2795 if (nested)
2796 nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
c82a5c5c
CG
2797 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) {
2798 pr_err("Inconsistent VMCS config on CPU %d\n", cpu);
8504ef21
SC
2799 return -EIO;
2800 }
2801 return 0;
2802}
2803
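/*
 * Enter VMX operation. VMXON can fault (e.g. #GP if VMX wasn't enabled in
 * IA32_FEAT_CTL by firmware); the exception fixup below turns such a fault
 * into -EFAULT instead of an oops, and the WARN dumps IA32_FEAT_CTL to help
 * diagnose why.
 */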
2804static int kvm_cpu_vmxon(u64 vmxon_pointer)
2805{
2806 u64 msr;
2807
2808 cr4_set_bits(X86_CR4_VMXE);
2809
2810 asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
2811 _ASM_EXTABLE(1b, %l[fault])
2812 : : [vmxon_pointer] "m"(vmxon_pointer)
2813 : : fault);
2814 return 0;
2815
2816fault:
2817 WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
2818 rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
2819 cr4_clear_bits(X86_CR4_VMXE);
2820
2821 return -EFAULT;
2822}
2823
2824static int vmx_hardware_enable(void)
2825{
2826 int cpu = raw_smp_processor_id();
2827 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
2828 int r;
2829
2830 if (cr4_read_shadow() & X86_CR4_VMXE)
2831 return -EBUSY;
2832
2833 /*
2834 * This can happen if we hot-added a CPU but failed to allocate
 2835	 * the VP assist page for it.
2836 */
19f10315 2837 if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu))
8504ef21
SC
2838 return -EFAULT;
2839
2840 intel_pt_handle_vmx(1);
2841
2842 r = kvm_cpu_vmxon(phys_addr);
2843 if (r) {
2844 intel_pt_handle_vmx(0);
2845 return r;
2846 }
2847
2848 if (enable_ept)
2849 ept_sync_global();
2850
2851 return 0;
2852}
2853
2854static void vmclear_local_loaded_vmcss(void)
2855{
2856 int cpu = raw_smp_processor_id();
2857 struct loaded_vmcs *v, *n;
2858
2859 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
2860 loaded_vmcss_on_cpu_link)
2861 __loaded_vmcs_clear(v);
2862}
2863
2864static void vmx_hardware_disable(void)
2865{
2866 vmclear_local_loaded_vmcss();
2867
2868 if (cpu_vmxoff())
2869 kvm_spurious_fault();
2870
2871 hv_reset_evmcs();
2872
2873 intel_pt_handle_vmx(0);
2874}
2875
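/*
 * Allocate a page for a VMCS on @cpu's NUMA node and stamp it with the
 * revision ID the CPU expects (or KVM_EVMCS_VERSION when enlightened VMCS
 * is in use).
 */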
41836839 2876struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
6aa8b732
AK
2877{
2878 int node = cpu_to_node(cpu);
2879 struct page *pages;
2880 struct vmcs *vmcs;
2881
519669cc 2882 pages = __alloc_pages_node(node, flags, 0);
6aa8b732
AK
2883 if (!pages)
2884 return NULL;
2885 vmcs = page_address(pages);
1c3d14fe 2886 memset(vmcs, 0, vmcs_config.size);
2307af1c
LA
2887
2888 /* KVM supports Enlightened VMCS v1 only */
19f10315 2889 if (kvm_is_using_evmcs())
392b2f25 2890 vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
2307af1c 2891 else
392b2f25 2892 vmcs->hdr.revision_id = vmcs_config.revision_id;
2307af1c 2893
491a6038
LA
2894 if (shadow)
2895 vmcs->hdr.shadow_vmcs = 1;
6aa8b732
AK
2896 return vmcs;
2897}
2898
89b0c9f5 2899void free_vmcs(struct vmcs *vmcs)
6aa8b732 2900{
519669cc 2901 free_page((unsigned long)vmcs);
6aa8b732
AK
2902}
2903
d462b819
NHE
2904/*
2905 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
2906 */
89b0c9f5 2907void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
d462b819
NHE
2908{
2909 if (!loaded_vmcs->vmcs)
2910 return;
2911 loaded_vmcs_clear(loaded_vmcs);
2912 free_vmcs(loaded_vmcs->vmcs);
2913 loaded_vmcs->vmcs = NULL;
904e14fb
PB
2914 if (loaded_vmcs->msr_bitmap)
2915 free_page((unsigned long)loaded_vmcs->msr_bitmap);
355f4fb1 2916 WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
d462b819
NHE
2917}
2918
89b0c9f5 2919int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
f21f165e 2920{
491a6038 2921 loaded_vmcs->vmcs = alloc_vmcs(false);
f21f165e
PB
2922 if (!loaded_vmcs->vmcs)
2923 return -ENOMEM;
2924
d260f9ef
SC
2925 vmcs_clear(loaded_vmcs->vmcs);
2926
f21f165e 2927 loaded_vmcs->shadow_vmcs = NULL;
804939ea 2928 loaded_vmcs->hv_timer_soft_disabled = false;
d260f9ef
SC
2929 loaded_vmcs->cpu = -1;
2930 loaded_vmcs->launched = 0;
904e14fb
PB
2931
2932 if (cpu_has_vmx_msr_bitmap()) {
41836839
BG
2933 loaded_vmcs->msr_bitmap = (unsigned long *)
2934 __get_free_page(GFP_KERNEL_ACCOUNT);
904e14fb
PB
2935 if (!loaded_vmcs->msr_bitmap)
2936 goto out_vmcs;
2937 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
2938 }
d7ee039e
SC
2939
2940 memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
3af80fec
SC
2941 memset(&loaded_vmcs->controls_shadow, 0,
2942 sizeof(struct vmcs_controls_shadow));
d7ee039e 2943
f21f165e 2944 return 0;
904e14fb
PB
2945
2946out_vmcs:
2947 free_loaded_vmcs(loaded_vmcs);
2948 return -ENOMEM;
f21f165e
PB
2949}
2950
39959588 2951static void free_kvm_area(void)
6aa8b732
AK
2952{
2953 int cpu;
2954
3230bb47 2955 for_each_possible_cpu(cpu) {
6aa8b732 2956 free_vmcs(per_cpu(vmxarea, cpu));
3230bb47
ZA
2957 per_cpu(vmxarea, cpu) = NULL;
2958 }
6aa8b732
AK
2959}
2960
6aa8b732
AK
2961static __init int alloc_kvm_area(void)
2962{
2963 int cpu;
2964
3230bb47 2965 for_each_possible_cpu(cpu) {
6aa8b732
AK
2966 struct vmcs *vmcs;
2967
41836839 2968 vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
6aa8b732
AK
2969 if (!vmcs) {
2970 free_kvm_area();
2971 return -ENOMEM;
2972 }
2973
2307af1c
LA
2974 /*
2975 * When eVMCS is enabled, alloc_vmcs_cpu() sets
2976 * vmcs->revision_id to KVM_EVMCS_VERSION instead of
2977 * revision_id reported by MSR_IA32_VMX_BASIC.
2978 *
312a4661 2979	 * However, even though this is not explicitly documented in the
2307af1c
LA
 2980	 * TLFS, the VMXON region passed as the VMXON argument should
 2981	 * still be marked with the revision_id reported by the
 2982	 * physical CPU.
2983 */
19f10315 2984 if (kvm_is_using_evmcs())
392b2f25 2985 vmcs->hdr.revision_id = vmcs_config.revision_id;
2307af1c 2986
6aa8b732
AK
2987 per_cpu(vmxarea, cpu) = vmcs;
2988 }
2989 return 0;
2990}
2991
91b0aa2c 2992static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
d99e4152 2993 struct kvm_segment *save)
6aa8b732 2994{
d99e4152
GN
2995 if (!emulate_invalid_guest_state) {
2996 /*
2997 * CS and SS RPL should be equal during guest entry according
2998 * to VMX spec, but in reality it is not always so. Since vcpu
2999 * is in the middle of the transition from real mode to
3000 * protected mode it is safe to assume that RPL 0 is a good
3001 * default value.
3002 */
3003 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
b32a9918
NA
3004 save->selector &= ~SEGMENT_RPL_MASK;
3005 save->dpl = save->selector & SEGMENT_RPL_MASK;
d99e4152 3006 save->s = 1;
6aa8b732 3007 }
1dd7a4f1 3008 __vmx_set_segment(vcpu, save, seg);
6aa8b732
AK
3009}
3010
3011static void enter_pmode(struct kvm_vcpu *vcpu)
3012{
3013 unsigned long flags;
a89a8fb9 3014 struct vcpu_vmx *vmx = to_vmx(vcpu);
6aa8b732 3015
d99e4152 3016 /*
d9f6e12f 3017	 * Update the real mode segment cache. It may not be up to date if a
d99e4152
GN
 3018	 * segment register was written while the vcpu was in guest mode.
3019 */
3020 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3021 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3022 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3023 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3024 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3025 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3026
7ffd92c5 3027 vmx->rmode.vm86_active = 0;
6aa8b732 3028
1dd7a4f1 3029 __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
6aa8b732
AK
3030
3031 flags = vmcs_readl(GUEST_RFLAGS);
78ac8b47
AK
3032 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
3033 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
6aa8b732
AK
3034 vmcs_writel(GUEST_RFLAGS, flags);
3035
66aee91a
RR
3036 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
3037 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
6aa8b732 3038
b6a7cc35 3039 vmx_update_exception_bitmap(vcpu);
6aa8b732 3040
91b0aa2c
GN
3041 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3042 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3043 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3044 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3045 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3046 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
6aa8b732
AK
3047}
3048
f5f7b2fe 3049static void fix_rmode_seg(int seg, struct kvm_segment *save)
6aa8b732 3050{
772e0318 3051 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
d99e4152
GN
3052 struct kvm_segment var = *save;
3053
3054 var.dpl = 0x3;
3055 if (seg == VCPU_SREG_CS)
3056 var.type = 0x3;
3057
3058 if (!emulate_invalid_guest_state) {
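		/*
		 * Synthesize vm86-style segment state: in real mode the base
		 * is always selector << 4, so derive the selector from the
		 * (paragraph-aligned) base and force a 64KiB expand-up limit.
		 */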
3059 var.selector = var.base >> 4;
3060 var.base = var.base & 0xffff0;
3061 var.limit = 0xffff;
3062 var.g = 0;
3063 var.db = 0;
3064 var.present = 1;
3065 var.s = 1;
3066 var.l = 0;
3067 var.unusable = 0;
3068 var.type = 0x3;
3069 var.avl = 0;
3070 if (save->base & 0xf)
8d20bd63
SC
3071 pr_warn_once("segment base is not paragraph aligned "
3072 "when entering protected mode (seg=%d)", seg);
d99e4152 3073 }
6aa8b732 3074
d99e4152 3075 vmcs_write16(sf->selector, var.selector);
96794e4e 3076 vmcs_writel(sf->base, var.base);
d99e4152
GN
3077 vmcs_write32(sf->limit, var.limit);
3078 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
6aa8b732
AK
3079}
3080
3081static void enter_rmode(struct kvm_vcpu *vcpu)
3082{
3083 unsigned long flags;
a89a8fb9 3084 struct vcpu_vmx *vmx = to_vmx(vcpu);
40bbb9d0 3085 struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
6aa8b732 3086
f5f7b2fe
AK
3087 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3088 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3089 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3090 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3091 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
c6ad1153
GN
3092 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3093 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
f5f7b2fe 3094
7ffd92c5 3095 vmx->rmode.vm86_active = 1;
6aa8b732 3096
776e58ea
GN
3097 /*
3098 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
4918c6ca 3099 * vcpu. Warn the user that an update is overdue.
776e58ea 3100 */
40bbb9d0 3101 if (!kvm_vmx->tss_addr)
8d20bd63 3102 pr_warn_once("KVM_SET_TSS_ADDR needs to be called before running vCPU\n");
776e58ea 3103
2fb92db1
AK
3104 vmx_segment_cache_clear(vmx);
3105
40bbb9d0 3106 vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
6aa8b732 3107 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
6aa8b732
AK
3108 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
3109
3110 flags = vmcs_readl(GUEST_RFLAGS);
78ac8b47 3111 vmx->rmode.save_rflags = flags;
6aa8b732 3112
053de044 3113 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
6aa8b732
AK
3114
3115 vmcs_writel(GUEST_RFLAGS, flags);
66aee91a 3116 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
b6a7cc35 3117 vmx_update_exception_bitmap(vcpu);
6aa8b732 3118
d99e4152
GN
3119 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3120 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3121 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3122 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3123 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3124 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
6aa8b732
AK
3125}
3126
72f211ec 3127int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
401d10de
AS
3128{
3129 struct vcpu_vmx *vmx = to_vmx(vcpu);
26bb0981 3130
72f211ec 3131 /* Nothing to do if hardware doesn't support EFER. */
b76edfe9 3132 if (!vmx_find_uret_msr(vmx, MSR_EFER))
72f211ec 3133 return 0;
401d10de 3134
f6801dff 3135 vcpu->arch.efer = efer;
ebb3c8d4 3136#ifdef CONFIG_X86_64
b76edfe9
ZD
3137 if (efer & EFER_LMA)
3138 vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE);
3139 else
3140 vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE);
ebb3c8d4
SC
3141#else
3142 if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm))
3143 return 1;
3144#endif
401d10de 3145
400dd54b 3146 vmx_setup_uret_msrs(vmx);
72f211ec 3147 return 0;
401d10de
AS
3148}
3149
05b3e0c2 3150#ifdef CONFIG_X86_64
6aa8b732
AK
3151
3152static void enter_lmode(struct kvm_vcpu *vcpu)
3153{
3154 u32 guest_tr_ar;
3155
2fb92db1
AK
3156 vmx_segment_cache_clear(to_vmx(vcpu));
3157
6aa8b732 3158 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
4d283ec9 3159 if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
bd80158a
JK
3160 pr_debug_ratelimited("%s: tss fixup for long mode. \n",
3161 __func__);
6aa8b732 3162 vmcs_write32(GUEST_TR_AR_BYTES,
4d283ec9
AL
3163 (guest_tr_ar & ~VMX_AR_TYPE_MASK)
3164 | VMX_AR_TYPE_BUSY_64_TSS);
6aa8b732 3165 }
da38f438 3166 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
6aa8b732
AK
3167}
3168
3169static void exit_lmode(struct kvm_vcpu *vcpu)
3170{
da38f438 3171 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
6aa8b732
AK
3172}
3173
3174#endif
3175
7780938c 3176static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
5058b692
SC
3177{
3178 struct vcpu_vmx *vmx = to_vmx(vcpu);
3179
3180 /*
7780938c
SC
3181 * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
3182 * the CPU is not required to invalidate guest-physical mappings on
3183 * VM-Entry, even if VPID is disabled. Guest-physical mappings are
3184 * associated with the root EPT structure and not any particular VPID
3185 * (INVVPID also isn't required to invalidate guest-physical mappings).
5058b692
SC
3186 */
3187 if (enable_ept) {
3188 ept_sync_global();
3189 } else if (enable_vpid) {
3190 if (cpu_has_vmx_invvpid_global()) {
3191 vpid_sync_vcpu_global();
3192 } else {
3193 vpid_sync_vcpu_single(vmx->vpid);
3194 vpid_sync_vcpu_single(vmx->nested.vpid02);
3195 }
3196 }
3197}
3198
2b4a5a5d
SC
3199static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
3200{
3201 if (is_guest_mode(vcpu))
3202 return nested_get_vpid02(vcpu);
3203 return to_vmx(vcpu)->vpid;
3204}
3205
33d19ec9
SC
3206static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
3207{
2a40b900 3208 struct kvm_mmu *mmu = vcpu->arch.mmu;
b9e5603c 3209 u64 root_hpa = mmu->root.hpa;
33d19ec9
SC
3210
3211 /* No flush required if the current context is invalid. */
3212 if (!VALID_PAGE(root_hpa))
3213 return;
3214
3215 if (enable_ept)
2a40b900 3216 ept_sync_context(construct_eptp(vcpu, root_hpa,
a972e29c 3217 mmu->root_role.level));
33d19ec9 3218 else
2b4a5a5d 3219 vpid_sync_context(vmx_get_current_vpid(vcpu));
33d19ec9
SC
3220}
3221
faff8758
JS
3222static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
3223{
faff8758 3224 /*
2b4a5a5d 3225 * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
ad104b5e 3226 * vmx_flush_tlb_guest() for an explanation of why this is ok.
faff8758 3227 */
2b4a5a5d 3228 vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
faff8758
JS
3229}
3230
e64419d9
SC
3231static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
3232{
3233 /*
2b4a5a5d
SC
3234 * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
3235 * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are
3236 * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
e64419d9
SC
3237 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
3238 * i.e. no explicit INVVPID is necessary.
3239 */
2b4a5a5d 3240 vpid_sync_context(vmx_get_current_vpid(vcpu));
e64419d9
SC
3241}
3242
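/*
 * Illustrative summary (not part of vmx.c): assuming the ept_sync_*() and
 * vpid_sync_*() wrappers issue the named INVEPT/INVVPID variants, the four
 * flush helpers above map onto the hardware primitives roughly as follows:
 *
 *   vmx_flush_tlb_all()     -> INVEPT global when EPT is on, else INVVPID
 *                              all-context (or per-VPID, incl. vpid02)
 *   vmx_flush_tlb_current() -> INVEPT single-context on the current EPTP
 *                              when EPT is on, else INVVPID single-context
 *   vmx_flush_tlb_gva()     -> INVVPID individual-address for one GVA
 *   vmx_flush_tlb_guest()   -> INVVPID single-context (nop when vpid == 0)
 */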
43fea4e4 3243void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
1439442c 3244{
d0d538b9
GN
3245 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3246
cb3c1e2f 3247 if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
6de4f3ad
AK
3248 return;
3249
bf03d4f9 3250 if (is_pae_paging(vcpu)) {
d0d538b9
GN
3251 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
3252 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
3253 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
3254 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
1439442c
SY
3255 }
3256}
3257
97b7ead3 3258void ept_save_pdptrs(struct kvm_vcpu *vcpu)
8f5d549f 3259{
d0d538b9
GN
3260 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3261
9932b49e
SC
3262 if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
3263 return;
3264
3265 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
3266 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
3267 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
3268 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
6de4f3ad 3269
c0d6956e 3270 kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR);
8f5d549f
AK
3271}
3272
470750b3
SC
3273#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
3274 CPU_BASED_CR3_STORE_EXITING)
3275
97b7ead3 3276void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
6aa8b732 3277{
7ffd92c5 3278 struct vcpu_vmx *vmx = to_vmx(vcpu);
32437c2a 3279 unsigned long hw_cr0, old_cr0_pg;
470750b3 3280 u32 tmp;
3a624e29 3281
32437c2a
SC
3282 old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);
3283
3de6347b 3284 hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
bddd82d1 3285 if (is_unrestricted_guest(vcpu))
5037878e 3286 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
218e763f 3287 else {
5037878e 3288 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
ee5a5584
SC
3289 if (!enable_ept)
3290 hw_cr0 |= X86_CR0_WP;
1439442c 3291
218e763f
GN
3292 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
3293 enter_pmode(vcpu);
6aa8b732 3294
218e763f
GN
3295 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
3296 enter_rmode(vcpu);
3297 }
6aa8b732 3298
32437c2a
SC
3299 vmcs_writel(CR0_READ_SHADOW, cr0);
3300 vmcs_writel(GUEST_CR0, hw_cr0);
3301 vcpu->arch.cr0 = cr0;
3302 kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
3303
05b3e0c2 3304#ifdef CONFIG_X86_64
f6801dff 3305 if (vcpu->arch.efer & EFER_LME) {
32437c2a 3306 if (!old_cr0_pg && (cr0 & X86_CR0_PG))
6aa8b732 3307 enter_lmode(vcpu);
32437c2a 3308 else if (old_cr0_pg && !(cr0 & X86_CR0_PG))
6aa8b732
AK
3309 exit_lmode(vcpu);
3310 }
3311#endif
3312
c834fd7f 3313 if (enable_ept && !is_unrestricted_guest(vcpu)) {
470750b3
SC
3314 /*
3315 * Ensure KVM has an up-to-date snapshot of the guest's CR3. If
3316 * the below code _enables_ CR3 exiting, vmx_cache_reg() will
3317 * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
3318 * KVM's CR3 is installed.
3319 */
c834fd7f
SC
3320 if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
3321 vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
470750b3
SC
3322
3323 /*
3324 * When running with EPT but not unrestricted guest, KVM must
3325 * intercept CR3 accesses when paging is _disabled_. This is
3326 * necessary because restricted guests can't actually run with
3327 * paging disabled, and so KVM stuffs its own CR3 in order to
3328 * run the guest with identity mapped page tables.
3329 *
3330 * Do _NOT_ check the old CR0.PG, e.g. to optimize away the
3331 * update, it may be stale with respect to CR3 interception,
3332 * e.g. after nested VM-Enter.
3333 *
3334 * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
3335 * stores to forward them to L1, even if KVM does not need to
3336 * intercept them to preserve its identity mapped page tables.
3337 */
c834fd7f 3338 if (!(cr0 & X86_CR0_PG)) {
470750b3
SC
3339 exec_controls_setbit(vmx, CR3_EXITING_BITS);
3340 } else if (!is_guest_mode(vcpu)) {
3341 exec_controls_clearbit(vmx, CR3_EXITING_BITS);
3342 } else {
3343 tmp = exec_controls_get(vmx);
3344 tmp &= ~CR3_EXITING_BITS;
3345 tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
3346 exec_controls_set(vmx, tmp);
3347 }
3348
32437c2a
SC
3349 /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
3350 if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
c834fd7f 3351 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
5b61178c
LJ
3352
3353 /*
3354 * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but
3355 * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
3356 */
3357 if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
3358 kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
c834fd7f 3359 }
1439442c 3360
14168786 3361 /* depends on vcpu->arch.cr0 to be set to a new value */
dbab610a 3362 vmx->emulation_required = vmx_emulation_required(vcpu);
6aa8b732
AK
3363}
3364
d468d94b 3365static int vmx_get_max_tdp_level(void)
855feb67 3366{
d468d94b 3367 if (cpu_has_vmx_ept_5levels())
855feb67
YZ
3368 return 5;
3369 return 4;
3370}
3371
e83bc09c 3372u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
1439442c 3373{
855feb67
YZ
3374 u64 eptp = VMX_EPTP_MT_WB;
3375
2a40b900 3376 eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
1439442c 3377
995f00a6
PF
3378 if (enable_ept_ad_bits &&
3379 (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
bb97a016 3380 eptp |= VMX_EPTP_AD_ENABLE_BIT;
e83bc09c 3381 eptp |= root_hpa;
1439442c
SY
3382
3383 return eptp;
3384}
3385
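/*
 * Worked example (illustrative, not part of vmx.c): for a 4-level EPT root
 * at, say, root_hpa = 0x12345000 with A/D bits enabled, construct_eptp()
 * returns
 *
 *   VMX_EPTP_MT_WB (0x6) | VMX_EPTP_PWL_4 (0x18) |
 *   VMX_EPTP_AD_ENABLE_BIT (0x40) | 0x12345000 = 0x1234505e
 *
 * The address is arbitrary; the low 12 bits of the EPTP carry the control
 * fields and the remainder is the page-aligned physical address of the root
 * EPT table.
 */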
e83bc09c
SC
3386static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
3387 int root_level)
6aa8b732 3388{
877ad952 3389 struct kvm *kvm = vcpu->kvm;
04f11ef4 3390 bool update_guest_cr3 = true;
1439442c
SY
3391 unsigned long guest_cr3;
3392 u64 eptp;
3393
089d034e 3394 if (enable_ept) {
e83bc09c 3395 eptp = construct_eptp(vcpu, root_hpa, root_level);
1439442c 3396 vmcs_write64(EPT_POINTER, eptp);
877ad952 3397
3c86c0d3 3398 hv_track_root_tdp(vcpu, root_hpa);
877ad952 3399
df7e0681 3400 if (!enable_unrestricted_guest && !is_paging(vcpu))
877ad952 3401 guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
c62c7bd4 3402 else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3))
b17b7436 3403 guest_cr3 = vcpu->arch.cr3;
c62c7bd4 3404 else /* vmcs.GUEST_CR3 is already up-to-date. */
b17b7436 3405 update_guest_cr3 = false;
43fea4e4 3406 vmx_ept_load_pdptrs(vcpu);
be100ef1 3407 } else {
e83bc09c 3408 guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu);
1439442c
SY
3409 }
3410
04f11ef4
SC
3411 if (update_guest_cr3)
3412 vmcs_writel(GUEST_CR3, guest_cr3);
6aa8b732
AK
3413}
3414
405329fc 3415
c2fe3cd4
SC
3416static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3417{
3418 /*
3419 * We operate under the default treatment of SMM, so VMX cannot be
c33f6f22
SC
3420 * enabled under SMM. Note, whether or not VMXE is allowed at all,
3421 * i.e. is a reserved bit, is handled by common x86 code.
c2fe3cd4
SC
3422 */
3423 if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
3424 return false;
3425
3426 if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
3427 return false;
3428
3429 return true;
3430}
3431
3432void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
6aa8b732 3433{
2259c17f 3434 unsigned long old_cr4 = vcpu->arch.cr4;
fe7f895d 3435 struct vcpu_vmx *vmx = to_vmx(vcpu);
085e68ee
BS
3436 /*
3437 * Pass through host's Machine Check Enable value to hw_cr4, which
3438 * is in force while we are in guest mode. Do not let guests control
3439 * this bit, even if host CR4.MCE == 0.
3440 */
5dc1f044
SC
3441 unsigned long hw_cr4;
3442
3443 hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
bddd82d1 3444 if (is_unrestricted_guest(vcpu))
5dc1f044 3445 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
fe7f895d 3446 else if (vmx->rmode.vm86_active)
5dc1f044
SC
3447 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
3448 else
3449 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
1439442c 3450
64f7a115
SC
3451 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
3452 if (cr4 & X86_CR4_UMIP) {
fe7f895d 3453 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
64f7a115
SC
3454 hw_cr4 &= ~X86_CR4_UMIP;
3455 } else if (!is_guest_mode(vcpu) ||
fe7f895d
SC
3456 !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
3457 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
3458 }
64f7a115 3459 }
0367f205 3460
ad312c7c 3461 vcpu->arch.cr4 = cr4;
f98c1e77 3462 kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
5dc1f044 3463
bddd82d1 3464 if (!is_unrestricted_guest(vcpu)) {
5dc1f044
SC
3465 if (enable_ept) {
3466 if (!is_paging(vcpu)) {
3467 hw_cr4 &= ~X86_CR4_PAE;
3468 hw_cr4 |= X86_CR4_PSE;
3469 } else if (!(cr4 & X86_CR4_PAE)) {
3470 hw_cr4 &= ~X86_CR4_PAE;
3471 }
bc23008b 3472 }
1439442c 3473
656ec4a4 3474 /*
ddba2628
HH
3475 * SMEP/SMAP/PKU is disabled if the CPU is in non-paging mode in
3476 * hardware. To emulate this behavior, SMEP/SMAP/PKU needs
3477 * to be manually disabled when the guest switches to non-paging
3478 * mode.
3479 *
3480 * If !enable_unrestricted_guest, the CPU is always running
3481 * with CR0.PG=1 and CR4 needs to be modified.
3482 * If enable_unrestricted_guest, the CPU automatically
3483 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
656ec4a4 3484 */
5dc1f044
SC
3485 if (!is_paging(vcpu))
3486 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
3487 }
656ec4a4 3488
1439442c
SY
3489 vmcs_writel(CR4_READ_SHADOW, cr4);
3490 vmcs_writel(GUEST_CR4, hw_cr4);
2259c17f
JM
3491
3492 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
3493 kvm_update_cpuid_runtime(vcpu);
6aa8b732
AK
3494}
3495
97b7ead3 3496void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
6aa8b732 3497{
a9179499 3498 struct vcpu_vmx *vmx = to_vmx(vcpu);
6aa8b732
AK
3499 u32 ar;
3500
c6ad1153 3501 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
f5f7b2fe 3502 *var = vmx->rmode.segs[seg];
a9179499 3503 if (seg == VCPU_SREG_TR
2fb92db1 3504 || var->selector == vmx_read_guest_seg_selector(vmx, seg))
f5f7b2fe 3505 return;
1390a28b
AK
3506 var->base = vmx_read_guest_seg_base(vmx, seg);
3507 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3508 return;
a9179499 3509 }
2fb92db1
AK
3510 var->base = vmx_read_guest_seg_base(vmx, seg);
3511 var->limit = vmx_read_guest_seg_limit(vmx, seg);
3512 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3513 ar = vmx_read_guest_seg_ar(vmx, seg);
03617c18 3514 var->unusable = (ar >> 16) & 1;
6aa8b732
AK
3515 var->type = ar & 15;
3516 var->s = (ar >> 4) & 1;
3517 var->dpl = (ar >> 5) & 3;
03617c18
GN
3518 /*
3519 * Some userspaces do not preserve unusable property. Since usable
3520 * segment has to be present according to VMX spec we can use present
3521 * property to amend userspace bug by making unusable segment always
3522 * nonpresent. vmx_segment_access_rights() already marks nonpresent
3523 * segment as unusable.
3524 */
3525 var->present = !var->unusable;
6aa8b732
AK
3526 var->avl = (ar >> 12) & 1;
3527 var->l = (ar >> 13) & 1;
3528 var->db = (ar >> 14) & 1;
3529 var->g = (ar >> 15) & 1;
6aa8b732
AK
3530}
3531
a9179499
AK
3532static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
3533{
a9179499
AK
3534 struct kvm_segment s;
3535
3536 if (to_vmx(vcpu)->rmode.vm86_active) {
3537 vmx_get_segment(vcpu, &s, seg);
3538 return s.base;
3539 }
2fb92db1 3540 return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
a9179499
AK
3541}
3542
97b7ead3 3543int vmx_get_cpl(struct kvm_vcpu *vcpu)
2e4d2653 3544{
b09408d0
MT
3545 struct vcpu_vmx *vmx = to_vmx(vcpu);
3546
ae9fedc7 3547 if (unlikely(vmx->rmode.vm86_active))
2e4d2653 3548 return 0;
ae9fedc7
PB
3549 else {
3550 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
4d283ec9 3551 return VMX_AR_DPL(ar);
69c73028 3552 }
69c73028
AK
3553}
3554
653e3108 3555static u32 vmx_segment_access_rights(struct kvm_segment *var)
6aa8b732 3556{
6aa8b732
AK
3557 u32 ar;
3558
a44b3316
HB
3559 ar = var->type & 15;
3560 ar |= (var->s & 1) << 4;
3561 ar |= (var->dpl & 3) << 5;
3562 ar |= (var->present & 1) << 7;
3563 ar |= (var->avl & 1) << 12;
3564 ar |= (var->l & 1) << 13;
3565 ar |= (var->db & 1) << 14;
3566 ar |= (var->g & 1) << 15;
3567 ar |= (var->unusable || !var->present) << 16;
653e3108
AK
3568
3569 return ar;
3570}
3571
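/*
 * Worked example (illustrative, not part of vmx.c): a flat 32-bit code
 * segment (type = 0xb, s = 1, dpl = 0, present = 1, avl = 0, l = 0, db = 1,
 * g = 1, unusable = 0) is packed by vmx_segment_access_rights() as
 *
 *   0xb | (1 << 4) | (1 << 7) | (1 << 14) | (1 << 15) = 0xc09b
 *
 * which is the access-rights value a VMCS field such as GUEST_CS_AR_BYTES
 * would hold for that segment; bit 16 (unusable) stays clear because the
 * segment is present.
 */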
816be9e9 3572void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
653e3108 3573{
7ffd92c5 3574 struct vcpu_vmx *vmx = to_vmx(vcpu);
772e0318 3575 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
653e3108 3576
2fb92db1
AK
3577 vmx_segment_cache_clear(vmx);
3578
1ecd50a9
GN
3579 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3580 vmx->rmode.segs[seg] = *var;
3581 if (seg == VCPU_SREG_TR)
3582 vmcs_write16(sf->selector, var->selector);
3583 else if (var->s)
3584 fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
1dd7a4f1 3585 return;
653e3108 3586 }
1ecd50a9 3587
653e3108
AK
3588 vmcs_writel(sf->base, var->base);
3589 vmcs_write32(sf->limit, var->limit);
3590 vmcs_write16(sf->selector, var->selector);
3a624e29
NK
3591
3592 /*
3593 * Fix the "Accessed" bit in AR field of segment registers for older
3594 * qemu binaries.
3595 * IA32 arch specifies that at the time of processor reset the
3596 * "Accessed" bit in the AR field of segment registers is 1. And qemu
0fa06071 3597 * is setting it to 0 in the userland code. This causes invalid guest
3a624e29
NK
3598 * state vmexit when "unrestricted guest" mode is turned on.
3599 * Fix for this setup issue in cpu_reset is being pushed in the qemu
3600 * tree. Newer qemu binaries with that qemu fix would not need this
3601 * kvm hack.
3602 */
bddd82d1 3603 if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
f924d66d 3604 var->type |= 0x1; /* Accessed */
3a624e29 3605
f924d66d 3606 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
1dd7a4f1 3607}
d99e4152 3608
816be9e9 3609static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
1dd7a4f1
SC
3610{
3611 __vmx_set_segment(vcpu, var, seg);
3612
dbab610a 3613 to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
6aa8b732
AK
3614}
3615
6aa8b732
AK
3616static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3617{
2fb92db1 3618 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
6aa8b732
AK
3619
3620 *db = (ar >> 14) & 1;
3621 *l = (ar >> 13) & 1;
3622}
3623
89a27f4d 3624static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 3625{
89a27f4d
GN
3626 dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
3627 dt->address = vmcs_readl(GUEST_IDTR_BASE);
6aa8b732
AK
3628}
3629
89a27f4d 3630static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 3631{
89a27f4d
GN
3632 vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
3633 vmcs_writel(GUEST_IDTR_BASE, dt->address);
6aa8b732
AK
3634}
3635
89a27f4d 3636static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 3637{
89a27f4d
GN
3638 dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
3639 dt->address = vmcs_readl(GUEST_GDTR_BASE);
6aa8b732
AK
3640}
3641
89a27f4d 3642static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 3643{
89a27f4d
GN
3644 vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
3645 vmcs_writel(GUEST_GDTR_BASE, dt->address);
6aa8b732
AK
3646}
3647
648dfaa7
MG
3648static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3649{
3650 struct kvm_segment var;
3651 u32 ar;
3652
3653 vmx_get_segment(vcpu, &var, seg);
07f42f5f 3654 var.dpl = 0x3;
0647f4aa
GN
3655 if (seg == VCPU_SREG_CS)
3656 var.type = 0x3;
648dfaa7
MG
3657 ar = vmx_segment_access_rights(&var);
3658
3659 if (var.base != (var.selector << 4))
3660 return false;
89efbed0 3661 if (var.limit != 0xffff)
648dfaa7 3662 return false;
07f42f5f 3663 if (ar != 0xf3)
648dfaa7
MG
3664 return false;
3665
3666 return true;
3667}
3668
3669static bool code_segment_valid(struct kvm_vcpu *vcpu)
3670{
3671 struct kvm_segment cs;
3672 unsigned int cs_rpl;
3673
3674 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
b32a9918 3675 cs_rpl = cs.selector & SEGMENT_RPL_MASK;
648dfaa7 3676
1872a3f4
AK
3677 if (cs.unusable)
3678 return false;
4d283ec9 3679 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
648dfaa7
MG
3680 return false;
3681 if (!cs.s)
3682 return false;
4d283ec9 3683 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
648dfaa7
MG
3684 if (cs.dpl > cs_rpl)
3685 return false;
1872a3f4 3686 } else {
648dfaa7
MG
3687 if (cs.dpl != cs_rpl)
3688 return false;
3689 }
3690 if (!cs.present)
3691 return false;
3692
3693 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
3694 return true;
3695}
3696
3697static bool stack_segment_valid(struct kvm_vcpu *vcpu)
3698{
3699 struct kvm_segment ss;
3700 unsigned int ss_rpl;
3701
3702 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
b32a9918 3703 ss_rpl = ss.selector & SEGMENT_RPL_MASK;
648dfaa7 3704
1872a3f4
AK
3705 if (ss.unusable)
3706 return true;
3707 if (ss.type != 3 && ss.type != 7)
648dfaa7
MG
3708 return false;
3709 if (!ss.s)
3710 return false;
3711 if (ss.dpl != ss_rpl) /* DPL != RPL */
3712 return false;
3713 if (!ss.present)
3714 return false;
3715
3716 return true;
3717}
3718
3719static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
3720{
3721 struct kvm_segment var;
3722 unsigned int rpl;
3723
3724 vmx_get_segment(vcpu, &var, seg);
b32a9918 3725 rpl = var.selector & SEGMENT_RPL_MASK;
648dfaa7 3726
1872a3f4
AK
3727 if (var.unusable)
3728 return true;
648dfaa7
MG
3729 if (!var.s)
3730 return false;
3731 if (!var.present)
3732 return false;
4d283ec9 3733 if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
648dfaa7
MG
3734 if (var.dpl < rpl) /* DPL < RPL */
3735 return false;
3736 }
3737
3738 /* TODO: Add other members to kvm_segment_field to allow checking for other access
3739 * rights flags
3740 */
3741 return true;
3742}
3743
3744static bool tr_valid(struct kvm_vcpu *vcpu)
3745{
3746 struct kvm_segment tr;
3747
3748 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
3749
1872a3f4
AK
3750 if (tr.unusable)
3751 return false;
b32a9918 3752 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */
648dfaa7 3753 return false;
1872a3f4 3754 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
648dfaa7
MG
3755 return false;
3756 if (!tr.present)
3757 return false;
3758
3759 return true;
3760}
3761
3762static bool ldtr_valid(struct kvm_vcpu *vcpu)
3763{
3764 struct kvm_segment ldtr;
3765
3766 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
3767
1872a3f4
AK
3768 if (ldtr.unusable)
3769 return true;
b32a9918 3770 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */
648dfaa7
MG
3771 return false;
3772 if (ldtr.type != 2)
3773 return false;
3774 if (!ldtr.present)
3775 return false;
3776
3777 return true;
3778}
3779
3780static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
3781{
3782 struct kvm_segment cs, ss;
3783
3784 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3785 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3786
b32a9918
NA
3787 return ((cs.selector & SEGMENT_RPL_MASK) ==
3788 (ss.selector & SEGMENT_RPL_MASK));
648dfaa7
MG
3789}
3790
3791/*
3792 * Check if guest state is valid. Returns true if valid, false if
3793 * not.
3794 * We assume that registers are always usable
3795 */
2ba4493a 3796bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
648dfaa7
MG
3797{
3798 /* real mode guest state checks */
f13882d8 3799 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
648dfaa7
MG
3800 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
3801 return false;
3802 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
3803 return false;
3804 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
3805 return false;
3806 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
3807 return false;
3808 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
3809 return false;
3810 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
3811 return false;
3812 } else {
3813 /* protected mode guest state checks */
3814 if (!cs_ss_rpl_check(vcpu))
3815 return false;
3816 if (!code_segment_valid(vcpu))
3817 return false;
3818 if (!stack_segment_valid(vcpu))
3819 return false;
3820 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
3821 return false;
3822 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
3823 return false;
3824 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
3825 return false;
3826 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
3827 return false;
3828 if (!tr_valid(vcpu))
3829 return false;
3830 if (!ldtr_valid(vcpu))
3831 return false;
3832 }
3833 /* TODO:
3834 * - Add checks on RIP
3835 * - Add checks on RFLAGS
3836 */
3837
3838 return true;
3839}
3840
ff5a983c 3841static int init_rmode_tss(struct kvm *kvm, void __user *ua)
6aa8b732 3842{
ff5a983c
PX
3843 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3844 u16 data;
3845 int i;
3846
3847 for (i = 0; i < 3; i++) {
3848 if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE))
3849 return -EFAULT;
3850 }
6aa8b732 3851
195aefde 3852 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
ff5a983c
PX
3853 if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16)))
3854 return -EFAULT;
3855
195aefde 3856 data = ~0;
ff5a983c
PX
3857 if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8)))
3858 return -EFAULT;
3859
3860 return 0;
6aa8b732
AK
3861}
3862
b7ebfb05
SY
3863static int init_rmode_identity_map(struct kvm *kvm)
3864{
40bbb9d0 3865 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
2a5755bb 3866 int i, r = 0;
ff5a983c 3867 void __user *uaddr;
b7ebfb05
SY
3868 u32 tmp;
3869
40bbb9d0 3870 /* Protect kvm_vmx->ept_identity_pagetable_done. */
a255d479
TC
3871 mutex_lock(&kvm->slots_lock);
3872
40bbb9d0 3873 if (likely(kvm_vmx->ept_identity_pagetable_done))
2a5755bb 3874 goto out;
a255d479 3875
40bbb9d0
SC
3876 if (!kvm_vmx->ept_identity_map_addr)
3877 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
a255d479 3878
ff5a983c
PX
3879 uaddr = __x86_set_memory_region(kvm,
3880 IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
3881 kvm_vmx->ept_identity_map_addr,
3882 PAGE_SIZE);
3883 if (IS_ERR(uaddr)) {
3884 r = PTR_ERR(uaddr);
2a5755bb 3885 goto out;
ff5a983c 3886 }
a255d479 3887
b7ebfb05 3888 /* Set up identity-mapping pagetable for EPT in real mode */
1ae20e0b 3889 for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) {
b7ebfb05
SY
3890 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
3891 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
ff5a983c
PX
3892 if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) {
3893 r = -EFAULT;
b7ebfb05 3894 goto out;
ff5a983c 3895 }
b7ebfb05 3896 }
40bbb9d0 3897 kvm_vmx->ept_identity_pagetable_done = true;
f51770ed 3898
b7ebfb05 3899out:
a255d479 3900 mutex_unlock(&kvm->slots_lock);
f51770ed 3901 return r;
b7ebfb05
SY
3902}
3903
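/*
 * Worked example (illustrative, not part of vmx.c): each entry written by
 * the loop above is a 4 MiB PSE page-directory entry identity-mapping the
 * guest-physical range starting at (i << 22).  For i = 1:
 *
 *   tmp = 0x00400000 | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
 *         _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE
 *       = 0x00400000 | 0xe7 = 0x004000e7
 *
 * The 1024 entries thus cover the low 4 GiB with 1:1 large-page mappings,
 * used when an EPT guest without unrestricted-guest support runs unpaged.
 */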
6aa8b732
AK
3904static void seg_setup(int seg)
3905{
772e0318 3906 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3a624e29 3907 unsigned int ar;
6aa8b732
AK
3908
3909 vmcs_write16(sf->selector, 0);
3910 vmcs_writel(sf->base, 0);
3911 vmcs_write32(sf->limit, 0xffff);
d54d07b2
GN
3912 ar = 0x93;
3913 if (seg == VCPU_SREG_CS)
3914 ar |= 0x08; /* code segment */
3a624e29
NK
3915
3916 vmcs_write32(sf->ar_bytes, ar);
6aa8b732
AK
3917}
3918
97b7ead3 3919int allocate_vpid(void)
2384d2b3
SY
3920{
3921 int vpid;
3922
919818ab 3923 if (!enable_vpid)
991e7a0e 3924 return 0;
2384d2b3
SY
3925 spin_lock(&vmx_vpid_lock);
3926 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
991e7a0e 3927 if (vpid < VMX_NR_VPIDS)
2384d2b3 3928 __set_bit(vpid, vmx_vpid_bitmap);
991e7a0e
WL
3929 else
3930 vpid = 0;
2384d2b3 3931 spin_unlock(&vmx_vpid_lock);
991e7a0e 3932 return vpid;
2384d2b3
SY
3933}
3934
97b7ead3 3935void free_vpid(int vpid)
cdbecfc3 3936{
991e7a0e 3937 if (!enable_vpid || vpid == 0)
cdbecfc3
LJ
3938 return;
3939 spin_lock(&vmx_vpid_lock);
991e7a0e 3940 __clear_bit(vpid, vmx_vpid_bitmap);
cdbecfc3
LJ
3941 spin_unlock(&vmx_vpid_lock);
3942}
3943
b84155c3
VK
3944static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
3945{
3946 /*
3947 * When KVM is a nested hypervisor on top of Hyper-V and uses
3948 * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
3949 * bitmap has changed.
3950 */
19f10315 3951 if (kvm_is_using_evmcs()) {
93827a0a
AM
3952 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
3953
3954 if (evmcs->hv_enlightenments_control.msr_bitmap)
3955 evmcs->hv_clean_fields &=
3956 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
3957 }
ed2a4800
VK
3958
3959 vmx->nested.force_msr_bitmap_recalc = true;
b84155c3
VK
3960}
3961
e23f6d49 3962void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
25c5f225 3963{
476c9bd8
AL
3964 struct vcpu_vmx *vmx = to_vmx(vcpu);
3965 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
25c5f225
SY
3966
3967 if (!cpu_has_vmx_msr_bitmap())
3968 return;
3969
b84155c3 3970 vmx_msr_bitmap_l01_changed(vmx);
ceef7d10 3971
25c5f225 3972 /*
3eb90017
AG
3973 * Mark the desired intercept state in the shadow bitmap; this is needed
3974 * for resync when the MSR filters change.
3975 */
3976 if (is_valid_passthrough_msr(msr)) {
3977 int idx = possible_passthrough_msr_slot(msr);
3978
3979 if (idx != -ENOENT) {
3980 if (type & MSR_TYPE_R)
3981 clear_bit(idx, vmx->shadow_msr_intercept.read);
3982 if (type & MSR_TYPE_W)
3983 clear_bit(idx, vmx->shadow_msr_intercept.write);
3984 }
3985 }
8d14695f 3986
3eb90017
AG
3987 if ((type & MSR_TYPE_R) &&
3988 !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
3989 vmx_set_msr_bitmap_read(msr_bitmap, msr);
3990 type &= ~MSR_TYPE_R;
3991 }
8d14695f 3992
3eb90017
AG
3993 if ((type & MSR_TYPE_W) &&
3994 !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) {
3995 vmx_set_msr_bitmap_write(msr_bitmap, msr);
3996 type &= ~MSR_TYPE_W;
3997 }
8d14695f 3998
3eb90017
AG
3999 if (type & MSR_TYPE_R)
4000 vmx_clear_msr_bitmap_read(msr_bitmap, msr);
8d14695f 4001
3eb90017
AG
4002 if (type & MSR_TYPE_W)
4003 vmx_clear_msr_bitmap_write(msr_bitmap, msr);
8d14695f
YZ
4004}
4005
e23f6d49 4006void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
904e14fb 4007{
476c9bd8
AL
4008 struct vcpu_vmx *vmx = to_vmx(vcpu);
4009 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
904e14fb
PB
4010
4011 if (!cpu_has_vmx_msr_bitmap())
4012 return;
4013
b84155c3 4014 vmx_msr_bitmap_l01_changed(vmx);
ceef7d10 4015
904e14fb 4016 /*
3eb90017
AG
4017 * Mark the desired intercept state in the shadow bitmap; this is needed
4018 * for resync when the MSR filter changes.
4019 */
4020 if (is_valid_passthrough_msr(msr)) {
4021 int idx = possible_passthrough_msr_slot(msr);
4022
4023 if (idx != -ENOENT) {
4024 if (type & MSR_TYPE_R)
4025 set_bit(idx, vmx->shadow_msr_intercept.read);
4026 if (type & MSR_TYPE_W)
4027 set_bit(idx, vmx->shadow_msr_intercept.write);
4028 }
4029 }
904e14fb 4030
3eb90017
AG
4031 if (type & MSR_TYPE_R)
4032 vmx_set_msr_bitmap_read(msr_bitmap, msr);
904e14fb 4033
3eb90017
AG
4034 if (type & MSR_TYPE_W)
4035 vmx_set_msr_bitmap_write(msr_bitmap, msr);
904e14fb
PB
4036}
4037
84ec8d2d 4038static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
9389b9d5 4039{
c39857ce
SC
4040 /*
4041 * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves
4042 * of the MSR bitmap. KVM emulates APIC registers up through 0x3f0,
4043 * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits.
4044 */
4045 const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG;
4046 const int write_idx = read_idx + (0x800 / sizeof(u64));
84ec8d2d 4047 struct vcpu_vmx *vmx = to_vmx(vcpu);
c39857ce 4048 u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap;
84ec8d2d
SC
4049 u8 mode;
4050
02efd818 4051 if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu)))
9389b9d5
SC
4052 return;
4053
84ec8d2d
SC
4054 if (cpu_has_secondary_exec_ctrls() &&
4055 (secondary_exec_controls_get(vmx) &
4056 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
4057 mode = MSR_BITMAP_MODE_X2APIC;
4058 if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
4059 mode |= MSR_BITMAP_MODE_X2APIC_APICV;
4060 } else {
4061 mode = 0;
4062 }
4063
4064 if (mode == vmx->x2apic_msr_bitmap_mode)
4065 return;
4066
4067 vmx->x2apic_msr_bitmap_mode = mode;
4068
c39857ce
SC
4069 /*
4070 * Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended
4071 * registers (0x840 and above) intercepted, KVM doesn't support them.
4072 * Intercept all writes by default and poke holes as needed. Pass
02efd818
SC
4073 * through reads for all valid registers by default in x2APIC+APICv
4074 * mode, only the current timer count needs on-demand emulation by KVM.
c39857ce
SC
4075 */
4076 if (mode & MSR_BITMAP_MODE_X2APIC_APICV)
02efd818 4077 msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic);
c39857ce
SC
4078 else
4079 msr_bitmap[read_idx] = ~0ull;
4080 msr_bitmap[write_idx] = ~0ull;
9389b9d5
SC
4081
4082 /*
4083 * TPR reads and writes can be virtualized even if virtual interrupt
4084 * delivery is not in use.
4085 */
4086 vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW,
4087 !(mode & MSR_BITMAP_MODE_X2APIC));
4088
4089 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
4090 vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
4091 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
4092 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
d588bb9b
CG
4093 if (enable_ipiv)
4094 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW);
f6e90f9e 4095 }
5897297b
AK
4096}
4097
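/*
 * Worked arithmetic (illustrative, not part of vmx.c): with APIC_BASE_MSR =
 * 0x800 and BITS_PER_LONG_LONG = 64, the indices computed above are
 *
 *   read_idx  = 0x800 / 64       = 32   (u64 slot in the low-MSR read
 *                                        bitmap; bit 0 = MSR 0x800)
 *   write_idx = 32 + 0x800 / 8   = 288  (same MSR range in the low-MSR
 *                                        write bitmap, 2 KiB further in)
 *
 * so msr_bitmap[32] and msr_bitmap[288] each hold the 64 read/write
 * intercept bits covering the x2APIC MSRs 0x800 - 0x83f.
 */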
476c9bd8 4098void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
b08c2896 4099{
476c9bd8 4100 struct vcpu_vmx *vmx = to_vmx(vcpu);
b08c2896
CP
4101 bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
4102 u32 i;
4103
476c9bd8
AL
4104 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag);
4105 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
4106 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
4107 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
f4d3a902 4108 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) {
476c9bd8
AL
4109 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
4110 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
b08c2896
CP
4111 }
4112}
4113
e6c67d8c
LA
4114static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
4115{
4116 struct vcpu_vmx *vmx = to_vmx(vcpu);
4117 void *vapic_page;
4118 u32 vppr;
4119 int rvi;
4120
4121 if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
4122 !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
96c66e87 4123 WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
e6c67d8c
LA
4124 return false;
4125
7e712684 4126 rvi = vmx_get_rvi();
e6c67d8c 4127
96c66e87 4128 vapic_page = vmx->nested.virtual_apic_map.hva;
e6c67d8c 4129 vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
e6c67d8c
LA
4130
4131 return ((rvi & 0xf0) > (vppr & 0xf0));
4132}
4133
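/*
 * Worked example (illustrative, not part of vmx.c): the comparison above
 * operates on priority classes (bits 7:4).  With RVI = 0x51 (highest pending
 * vector 0x51, class 5) and VPPR = 0x40 (class 4), 0x50 > 0x40 is true, so
 * the virtual APIC has a deliverable interrupt; with VPPR = 0x60 the pending
 * vector would be masked by the processor priority and the check fails.
 */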
3eb90017
AG
4134static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
4135{
4136 struct vcpu_vmx *vmx = to_vmx(vcpu);
4137 u32 i;
4138
4139 /*
d895f28e
SC
4140 * Redo intercept permissions for MSRs that KVM is passing through to
4141 * the guest. Disabling interception will check the new MSR filter and
4142 * ensure that KVM enables interception if userspace wants to filter
4143 * the MSR. MSRs that KVM is already intercepting don't need to be
4144 * refreshed since KVM is going to intercept them regardless of what
4145 * userspace wants.
3eb90017
AG
4146 */
4147 for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
4148 u32 msr = vmx_possible_passthrough_msrs[i];
3eb90017 4149
d895f28e
SC
4150 if (!test_bit(i, vmx->shadow_msr_intercept.read))
4151 vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R);
4152
4153 if (!test_bit(i, vmx->shadow_msr_intercept.write))
4154 vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W);
3eb90017
AG
4155 }
4156
b184b35d
SC
4157 /* PT MSRs can be passed through iff PT is exposed to the guest. */
4158 if (vmx_pt_mode_is_host_guest())
4159 pt_update_intercept_for_msr(vcpu);
3eb90017
AG
4160}
4161
ccf8d687 4162static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
296aa266 4163 int pi_vec)
21bc8dc5
RK
4164{
4165#ifdef CONFIG_SMP
4166 if (vcpu->mode == IN_GUEST_MODE) {
28b835d6 4167 /*
9b44423b
WL
4168 * The vector of the virtual interrupt has already been set in the PIR.
4169 * Send a notification event to deliver the virtual interrupt
4170 * unless the vCPU is the currently running vCPU, i.e. the
4171 * event is being sent from a fastpath VM-Exit handler, in
4172 * which case the PIR will be synced to the vIRR before
4173 * re-entering the guest.
5753743f 4174 *
9b44423b
WL
4175 * When the target is not the running vCPU, the following
4176 * possibilities emerge:
5753743f 4177 *
9b44423b
WL
4178 * Case 1: vCPU stays in non-root mode. Sending a notification
4179 * event posts the interrupt to the vCPU.
5753743f 4180 *
9b44423b
WL
4181 * Case 2: vCPU exits to root mode and is still runnable. The
4182 * PIR will be synced to the vIRR before re-entering the guest.
4183 * Sending a notification event is ok as the host IRQ handler
4184 * will ignore the spurious event.
28b835d6 4185 *
9b44423b
WL
4186 * Case 3: vCPU exits to root mode and is blocked. vcpu_block()
4187 * has already synced PIR to vIRR and never blocks the vCPU if
4188 * the vIRR is not empty. Therefore, a blocked vCPU here does
4189 * not wait for any requested interrupts in PIR, and sending a
4190 * notification event also results in a benign, spurious event.
28b835d6 4191 */
28b835d6 4192
9b44423b
WL
4193 if (vcpu != kvm_get_running_vcpu())
4194 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
ccf8d687 4195 return;
21bc8dc5
RK
4196 }
4197#endif
ccf8d687
SC
4198 /*
4199 * The vCPU isn't in the guest; wake the vCPU in case it is blocking,
4200 * otherwise do nothing as KVM will grab the highest priority pending
4201 * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
4202 */
4203 kvm_vcpu_wake_up(vcpu);
21bc8dc5
RK
4204}
4205
705699a1
WV
4206static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
4207 int vector)
4208{
4209 struct vcpu_vmx *vmx = to_vmx(vcpu);
4210
4211 if (is_guest_mode(vcpu) &&
4212 vector == vmx->nested.posted_intr_nv) {
705699a1
WV
4213 /*
4214 * If a posted intr is not recognized by hardware,
4215 * we will accomplish it in the next vmentry.
4216 */
4217 vmx->nested.pi_pending = true;
4218 kvm_make_request(KVM_REQ_EVENT, vcpu);
83c98007
SC
4219
4220 /*
4221 * This pairs with the smp_mb_*() after setting vcpu->mode in
4222 * vcpu_enter_guest() to guarantee the vCPU sees the event
4223 * request if triggering a posted interrupt "fails" because
4224 * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as
4225 * the smp_wmb() in kvm_make_request() only ensures everything
4226 * done before making the request is visible when the request
4227 * is visible, it doesn't ensure ordering between the store to
4228 * vcpu->requests and the load from vcpu->mode.
4229 */
4230 smp_mb__after_atomic();
4231
6b697711 4232 /* the PIR and ON have been set by L1. */
ccf8d687 4233 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
705699a1
WV
4234 return 0;
4235 }
4236 return -1;
4237}
a20ed54d
YZ
4238 /*
4239 * Send an interrupt to a vCPU via a posted interrupt.
4240 * 1. If the target vCPU is running (non-root mode), send a posted interrupt
4241 * notification and hardware will sync the PIR to the vIRR atomically.
4242 * 2. If the target vCPU isn't running (root mode), kick it to pick up the
4243 * interrupt from the PIR on the next VM-Entry.
4244 */
91a5f413 4245static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
a20ed54d
YZ
4246{
4247 struct vcpu_vmx *vmx = to_vmx(vcpu);
4248 int r;
4249
705699a1
WV
4250 r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
4251 if (!r)
91a5f413
VK
4252 return 0;
4253
ce0a58f4
SC
4254 /* Note, this is called iff the local APIC is in-kernel. */
4255 if (!vcpu->arch.apic->apicv_active)
91a5f413 4256 return -1;
705699a1 4257
a20ed54d 4258 if (pi_test_and_set_pir(vector, &vmx->pi_desc))
91a5f413 4259 return 0;
a20ed54d 4260
b95234c8
PB
4261 /* If a previous notification has sent the IPI, nothing to do. */
4262 if (pi_test_and_set_on(&vmx->pi_desc))
91a5f413 4263 return 0;
b95234c8 4264
83c98007
SC
4265 /*
4266 * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*()
4267 * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is
4268 * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
4269 * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
4270 */
ccf8d687 4271 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
91a5f413 4272 return 0;
a20ed54d
YZ
4273}
4274
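/*
 * Illustrative ordering sketch (not part of vmx.c) for the non-nested path
 * above, sender CPU vs. target vCPU in vcpu_enter_guest():
 *
 *   sender                              target
 *   ------                              ------
 *   set vector bit in PIR               vcpu->mode = IN_GUEST_MODE
 *   set ON (full barrier)               smp_mb_*()
 *   load vcpu->mode                     sync PIR to vIRR, enter guest
 *
 * Either the sender observes IN_GUEST_MODE and the notification IPI makes
 * the running CPU process the posted interrupt, or the target's PIR-to-vIRR
 * sync before VM-Entry observes ON/PIR already set.  The nested path relies
 * on KVM_REQ_EVENT plus smp_mb__after_atomic() instead.
 */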
57dfd7b5
SC
4275static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
4276 int trig_mode, int vector)
4277{
4278 struct kvm_vcpu *vcpu = apic->vcpu;
4279
4280 if (vmx_deliver_posted_interrupt(vcpu, vector)) {
4281 kvm_lapic_set_irr(vector, apic);
4282 kvm_make_request(KVM_REQ_EVENT, vcpu);
4283 kvm_vcpu_kick(vcpu);
4284 } else {
4285 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
4286 trig_mode, vector);
4287 }
4288}
4289
a3a8ff8e
NHE
4290/*
4291 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
4292 * will not change in the lifetime of the guest.
4293 * Note that host-state that does change is set elsewhere. E.g., host-state
4294 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
4295 */
97b7ead3 4296void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
a3a8ff8e
NHE
4297{
4298 u32 low32, high32;
4299 unsigned long tmpl;
d6e41f11 4300 unsigned long cr0, cr3, cr4;
a3a8ff8e 4301
04ac88ab
AL
4302 cr0 = read_cr0();
4303 WARN_ON(cr0 & X86_CR0_TS);
4304 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
d6e41f11
AL
4305
4306 /*
4307 * Save the most likely value for this task's CR3 in the VMCS.
4308 * We can't use __get_current_cr3_fast() because we're not atomic.
4309 */
6c690ee1 4310 cr3 = __read_cr3();
d6e41f11 4311 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
d7ee039e 4312 vmx->loaded_vmcs->host_state.cr3 = cr3;
a3a8ff8e 4313
d974baa3 4314 /* Save the most likely value for this task's CR4 in the VMCS. */
1e02ce4c 4315 cr4 = cr4_read_shadow();
d974baa3 4316 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
d7ee039e 4317 vmx->loaded_vmcs->host_state.cr4 = cr4;
d974baa3 4318
a3a8ff8e 4319 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
b2da15ac
AK
4320#ifdef CONFIG_X86_64
4321 /*
4322 * Load null selectors, so we can avoid reloading them in
6d6095bd
SC
4323 * vmx_prepare_switch_to_host(), in case userspace uses
4324 * the null selectors too (the expected case).
b2da15ac
AK
4325 */
4326 vmcs_write16(HOST_DS_SELECTOR, 0);
4327 vmcs_write16(HOST_ES_SELECTOR, 0);
4328#else
a3a8ff8e
NHE
4329 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
4330 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
b2da15ac 4331#endif
a3a8ff8e
NHE
4332 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
4333 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
4334
2342080c 4335 vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */
a3a8ff8e 4336
453eafbe 4337 vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */
a3a8ff8e
NHE
4338
4339 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
4340 vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
6ab8a405
LJ
4341
4342 /*
94fea1d8
SC
4343 * SYSENTER is used for 32-bit system calls on either 32-bit or
4344 * 64-bit kernels. It is always zero if neither is allowed, otherwise
4345 * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may
4346 * have already done so!).
6ab8a405 4347 */
94fea1d8
SC
4348 if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
4349 vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
4350
a3a8ff8e
NHE
4351 rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
4352 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
4353
4354 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
4355 rdmsr(MSR_IA32_CR_PAT, low32, high32);
4356 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
4357 }
5a5e8a15 4358
c73da3fc 4359 if (cpu_has_load_ia32_efer())
5a5e8a15 4360 vmcs_write64(HOST_IA32_EFER, host_efer);
a3a8ff8e
NHE
4361}
4362
97b7ead3 4363void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
bf8179a0 4364{
2ed41aa6
SC
4365 struct kvm_vcpu *vcpu = &vmx->vcpu;
4366
4367 vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &
4368 ~vcpu->arch.cr4_guest_rsvd_bits;
a37ebdce 4369 if (!enable_ept) {
5ec60aad 4370 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS;
a37ebdce
LJ
4371 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS;
4372 }
fe3ef05c 4373 if (is_guest_mode(&vmx->vcpu))
2ed41aa6
SC
4374 vcpu->arch.cr4_guest_owned_bits &=
4375 ~get_vmcs12(vcpu)->cr4_guest_host_mask;
4376 vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);
bf8179a0
NHE
4377}
4378
2fba4fc1 4379static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
01e439be
YZ
4380{
4381 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
4382
d62caabb 4383 if (!kvm_vcpu_apicv_active(&vmx->vcpu))
01e439be 4384 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
d02fcf50
PB
4385
4386 if (!enable_vnmi)
4387 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
4388
804939ea
SC
4389 if (!enable_preemption_timer)
4390 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
4391
01e439be
YZ
4392 return pin_based_exec_ctrl;
4393}
4394
2fba4fc1
SC
4395static u32 vmx_vmentry_ctrl(void)
4396{
4397 u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
4398
4399 if (vmx_pt_mode_is_system())
4400 vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
4401 VM_ENTRY_LOAD_IA32_RTIT_CTL);
ffaaf591
VK
4402 /*
4403 * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically.
4404 */
4405 vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
4406 VM_ENTRY_LOAD_IA32_EFER |
4407 VM_ENTRY_IA32E_MODE);
4408
9d78d6fb
VK
4409 if (cpu_has_perf_global_ctrl_bug())
4410 vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4411
ffaaf591 4412 return vmentry_ctrl;
2fba4fc1
SC
4413}
4414
4415static u32 vmx_vmexit_ctrl(void)
4416{
4417 u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
4418
f16e4742
VK
4419 /*
4420 * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for
4421 * nested virtualization and thus allowed to be set in vmcs12.
4422 */
4423 vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER |
4424 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
4425
2fba4fc1
SC
4426 if (vmx_pt_mode_is_system())
4427 vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
4428 VM_EXIT_CLEAR_IA32_RTIT_CTL);
9d78d6fb
VK
4429
4430 if (cpu_has_perf_global_ctrl_bug())
4431 vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
4432
2fba4fc1
SC
4433 /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
4434 return vmexit_ctrl &
4435 ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
4436}
4437
d62caabb
AS
4438static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
4439{
4440 struct vcpu_vmx *vmx = to_vmx(vcpu);
4441
7c69661e
SC
4442 if (is_guest_mode(vcpu)) {
4443 vmx->nested.update_vmcs01_apicv_status = true;
4444 return;
4445 }
4446
c5f2c766 4447 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
f08a06c9 4448
d588bb9b 4449 if (kvm_vcpu_apicv_active(vcpu)) {
f08a06c9
ZG
4450 secondary_exec_controls_setbit(vmx,
4451 SECONDARY_EXEC_APIC_REGISTER_VIRT |
4452 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
d588bb9b
CG
4453 if (enable_ipiv)
4454 tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT);
4455 } else {
f08a06c9
ZG
4456 secondary_exec_controls_clearbit(vmx,
4457 SECONDARY_EXEC_APIC_REGISTER_VIRT |
4458 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
d588bb9b
CG
4459 if (enable_ipiv)
4460 tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT);
3ce424e4
RK
4461 }
4462
84ec8d2d 4463 vmx_update_msr_bitmap_x2apic(vcpu);
d62caabb
AS
4464}
4465
2fba4fc1 4466static u32 vmx_exec_control(struct vcpu_vmx *vmx)
89b0c9f5
SC
4467{
4468 u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
4469
a83bea73
VK
4470 /*
4471 * Not used by KVM, but fully supported for nesting, i.e. are allowed in
4472 * vmcs12 and propagated to vmcs02 when set in vmcs12.
4473 */
4474 exec_control &= ~(CPU_BASED_RDTSC_EXITING |
4475 CPU_BASED_USE_IO_BITMAPS |
4476 CPU_BASED_MONITOR_TRAP_FLAG |
4477 CPU_BASED_PAUSE_EXITING);
4478
378c4c18
VK
4479 /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */
4480 exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING |
4481 CPU_BASED_NMI_WINDOW_EXITING);
4482
89b0c9f5
SC
4483 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
4484 exec_control &= ~CPU_BASED_MOV_DR_EXITING;
4485
e89e1e23 4486 if (!cpu_need_tpr_shadow(&vmx->vcpu))
89b0c9f5 4487 exec_control &= ~CPU_BASED_TPR_SHADOW;
e89e1e23 4488
89b0c9f5 4489#ifdef CONFIG_X86_64
e89e1e23
VK
4490 if (exec_control & CPU_BASED_TPR_SHADOW)
4491 exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING |
4492 CPU_BASED_CR8_STORE_EXITING);
4493 else
89b0c9f5
SC
4494 exec_control |= CPU_BASED_CR8_STORE_EXITING |
4495 CPU_BASED_CR8_LOAD_EXITING;
4496#endif
64f80ea7
SC
4497 /* No need to intercept CR3 access or INVLPG when using EPT. */
4498 if (enable_ept)
4499 exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
4500 CPU_BASED_CR3_STORE_EXITING |
4501 CPU_BASED_INVLPG_EXITING);
89b0c9f5
SC
4502 if (kvm_mwait_in_guest(vmx->vcpu.kvm))
4503 exec_control &= ~(CPU_BASED_MWAIT_EXITING |
4504 CPU_BASED_MONITOR_EXITING);
4505 if (kvm_hlt_in_guest(vmx->vcpu.kvm))
4506 exec_control &= ~CPU_BASED_HLT_EXITING;
4507 return exec_control;
4508}
4509
1ad4e543
RH
4510static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx)
4511{
d588bb9b
CG
4512 u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl;
4513
4514 /*
4515 * IPI virtualization relies on APICv. Disable IPI virtualization if
4516 * APICv is inhibited.
4517 */
4518 if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu))
4519 exec_control &= ~TERTIARY_EXEC_IPI_VIRT;
4520
4521 return exec_control;
1ad4e543
RH
4522}
4523
8b50b92f
SC
4524/*
4525 * Adjust a single secondary execution control bit to intercept/allow an
4526 * instruction in the guest. This is usually done based on whether or not a
4527 * feature has been exposed to the guest in order to correctly emulate faults.
4528 */
4529static inline void
4530vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
4531 u32 control, bool enabled, bool exiting)
4532{
4533 /*
4534 * If the control is for an opt-in feature, clear the control if the
4535 * feature is not exposed to the guest, i.e. not enabled. If the
4536 * control is opt-out, i.e. an exiting control, clear the control if
4537 * the feature _is_ exposed to the guest, i.e. exiting/interception is
4538 * disabled for the associated instruction. Note, the caller is
4539 * responsible for presetting exec_control to set all supported bits.
4540 */
4541 if (enabled == exiting)
4542 *exec_control &= ~control;
4543
4544 /*
4545 * Update the nested MSR settings so that a nested VMM can/can't set
4546 * controls for features that are/aren't exposed to the guest.
4547 */
4548 if (nested) {
a0860d68
SC
4549 /*
4550 * All features that can be added or removed to VMX MSRs must
4551 * be supported in the first place for nested virtualization.
4552 */
4553 if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control)))
4554 enabled = false;
4555
8b50b92f
SC
4556 if (enabled)
4557 vmx->nested.msrs.secondary_ctls_high |= control;
4558 else
4559 vmx->nested.msrs.secondary_ctls_high &= ~control;
4560 }
4561}
4562
4563/*
4564 * Wrapper macro for the common case of adjusting a secondary execution control
4565 * based on a single guest CPUID bit, with a dedicated feature bit. This also
4566 * verifies that the control is actually supported by KVM and hardware.
4567 */
4568#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
4569({ \
4570 bool __enabled; \
4571 \
4572 if (cpu_has_vmx_##name()) { \
4573 __enabled = guest_cpuid_has(&(vmx)->vcpu, \
4574 X86_FEATURE_##feat_name); \
4575 vmx_adjust_secondary_exec_control(vmx, exec_control, \
4576 SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \
4577 } \
4578})
4579
4580/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
4581#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
4582 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)
4583
4584#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
4585 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
89b0c9f5 4586
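/*
 * Illustrative expansion (not from the kernel sources): a call such as
 *
 *   vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
 *
 * expands, per the two macros above, to roughly:
 *
 *   if (cpu_has_vmx_invpcid()) {
 *       bool __enabled = guest_cpuid_has(&vmx->vcpu, X86_FEATURE_INVPCID);
 *       vmx_adjust_secondary_exec_control(vmx, &exec_control,
 *                                         SECONDARY_EXEC_ENABLE_INVPCID,
 *                                         __enabled, false);
 *   }
 *
 * i.e. the opt-in ENABLE_INVPCID control is cleared unless INVPCID is
 * exposed to the guest in CPUID (and the adjustment is skipped entirely
 * when hardware lacks the control).
 */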
2fba4fc1 4587static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
bf8179a0 4588{
80154d77
PB
4589 struct kvm_vcpu *vcpu = &vmx->vcpu;
4590
bf8179a0 4591 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
0367f205 4592
2ef7619d 4593 if (vmx_pt_mode_is_system())
f99e3daf 4594 exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
80154d77 4595 if (!cpu_need_virtualize_apic_accesses(vcpu))
bf8179a0
NHE
4596 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
4597 if (vmx->vpid == 0)
4598 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
4599 if (!enable_ept) {
4600 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
4601 enable_unrestricted_guest = 0;
4602 }
4603 if (!enable_unrestricted_guest)
4604 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
b31c114b 4605 if (kvm_pause_in_guest(vmx->vcpu.kvm))
bf8179a0 4606 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
80154d77 4607 if (!kvm_vcpu_apicv_active(vcpu))
c7c9c56c
YZ
4608 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
4609 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
8d14695f 4610 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
0367f205 4611
41acdd41
YZ
4612 /*
4613 * KVM doesn't support VMFUNC for L1, but the control is set in KVM's
4614 * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2.
4615 */
4616 exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
4617
0367f205
PB
4618 /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
4619 * in vmx_set_cr4. */
4620 exec_control &= ~SECONDARY_EXEC_DESC;
4621
abc4fc58
AG
4622 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
4623 (handle_vmptrld).
4624 We can NOT enable shadow_vmcs here because we don't yet have
4625 a current VMCS12.
4626 */
4627 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
a3eaa864 4628
a85863c2
MS
4629 /*
4630 * PML is enabled/disabled when dirty logging of memslots changes, but
4631 * it needs to be set here when dirty logging is already active, e.g.
4632 * if this vCPU was created after dirty logging was enabled.
4633 */
ee661d8e 4634 if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
a3eaa864 4635 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
843e4330 4636
becdad85 4637 if (cpu_has_vmx_xsaves()) {
3db13480
PB
4638 /* Exposing XSAVES only when XSAVE is exposed */
4639 bool xsaves_enabled =
96be4e06 4640 boot_cpu_has(X86_FEATURE_XSAVE) &&
3db13480
PB
4641 guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
4642 guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
4643
7204160e
AL
4644 vcpu->arch.xsaves_enabled = xsaves_enabled;
4645
8b50b92f
SC
4646 vmx_adjust_secondary_exec_control(vmx, &exec_control,
4647 SECONDARY_EXEC_XSAVES,
4648 xsaves_enabled, false);
45ec368c
JM
4649 }
4650
36fa06f9
SC
4651 /*
4652 * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
4653 * feature is exposed to the guest. This creates a virtualization hole
4654 * if both are supported in hardware but only one is exposed to the
4655 * guest, but letting the guest execute RDTSCP or RDPID when either one
4656 * is advertised is preferable to emulating the advertised instruction
4657 * in KVM on #UD, and obviously better than incorrectly injecting #UD.
4658 */
4659 if (cpu_has_vmx_rdtscp()) {
4660 bool rdpid_or_rdtscp_enabled =
4661 guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
4662 guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
4663
4664 vmx_adjust_secondary_exec_control(vmx, &exec_control,
4665 SECONDARY_EXEC_ENABLE_RDTSCP,
4666 rdpid_or_rdtscp_enabled, false);
4667 }
8b50b92f 4668 vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
75f4fc8d 4669
8b50b92f
SC
4670 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
4671 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);
e69e72fa 4672
8b50b92f
SC
4673 vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
4674 ENABLE_USR_WAIT_PAUSE, false);
e69e72fa 4675
fe6b6bc8
CQ
4676 if (!vcpu->kvm->arch.bus_lock_detection_enabled)
4677 exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
4678
2f4073e0
TX
4679 if (!kvm_notify_vmexit_enabled(vcpu->kvm))
4680 exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;
4681
b6247686 4682 return exec_control;
bf8179a0
NHE
4683}
4684
d588bb9b
CG
4685static inline int vmx_get_pid_table_order(struct kvm *kvm)
4686{
4687 return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table));
4688}
4689
4690static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
4691{
4692 struct page *pages;
4693 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
4694
4695 if (!irqchip_in_kernel(kvm) || !enable_ipiv)
4696 return 0;
4697
4698 if (kvm_vmx->pid_table)
4699 return 0;
4700
4701 pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm));
4702 if (!pages)
4703 return -ENOMEM;
4704
4705 kvm_vmx->pid_table = (void *)page_address(pages);
4706 return 0;
4707}
4708
4709static int vmx_vcpu_precreate(struct kvm *kvm)
4710{
4711 return vmx_alloc_ipiv_pid_table(kvm);
4712}
4713
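/*
 * Worked arithmetic (illustrative, not part of vmx.c): the IPI-virtualization
 * PID pointer table holds one 8-byte entry (sizeof(*pid_table)) per possible
 * vCPU ID.  Assuming kvm->arch.max_vcpu_ids = 1024:
 *
 *   table size = 1024 * 8 = 8192 bytes  ->  get_order(8192) = 1
 *
 * so vmx_alloc_ipiv_pid_table() allocates two zeroed, physically contiguous
 * pages; init_vmcs() later programs their physical address into
 * PID_POINTER_TABLE and max_vcpu_ids - 1 into LAST_PID_POINTER_INDEX.
 */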
f53cd63c 4714#define VMX_XSS_EXIT_BITMAP 0
6aa8b732 4715
1b84292b 4716static void init_vmcs(struct vcpu_vmx *vmx)
944c3464 4717{
d588bb9b
CG
4718 struct kvm *kvm = vmx->vcpu.kvm;
4719 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
4720
944c3464 4721 if (nested)
1b84292b 4722 nested_vmx_set_vmcs_shadowing_bitmap();
944c3464 4723
25c5f225 4724 if (cpu_has_vmx_msr_bitmap())
904e14fb 4725 vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
25c5f225 4726
64c78508 4727 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */
6aa8b732 4728
6aa8b732 4729 /* Control */
3af80fec 4730 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
6e5d865c 4731
3af80fec 4732 exec_controls_set(vmx, vmx_exec_control(vmx));
6aa8b732 4733
b6247686
SC
4734 if (cpu_has_secondary_exec_ctrls())
4735 secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
f78e0e2e 4736
1ad4e543
RH
4737 if (cpu_has_tertiary_exec_ctrls())
4738 tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
4739
1421211a 4740 if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
c7c9c56c
YZ
4741 vmcs_write64(EOI_EXIT_BITMAP0, 0);
4742 vmcs_write64(EOI_EXIT_BITMAP1, 0);
4743 vmcs_write64(EOI_EXIT_BITMAP2, 0);
4744 vmcs_write64(EOI_EXIT_BITMAP3, 0);
4745
4746 vmcs_write16(GUEST_INTR_STATUS, 0);
01e439be 4747
0bcf261c 4748 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
01e439be 4749 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
c7c9c56c
YZ
4750 }
4751
d588bb9b
CG
4752 if (vmx_can_use_ipiv(&vmx->vcpu)) {
4753 vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table));
4754 vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1);
4755 }
4756
4757 if (!kvm_pause_in_guest(kvm)) {
4b8d54f9 4758 vmcs_write32(PLE_GAP, ple_gap);
a7653ecd
RK
4759 vmx->ple_window = ple_window;
4760 vmx->ple_window_dirty = true;
4b8d54f9
ZE
4761 }
4762
2f4073e0
TX
4763 if (kvm_notify_vmexit_enabled(kvm))
4764 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
4765
c3707958
XG
4766 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
4767 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
6aa8b732
AK
4768 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
4769
9581d442
AK
4770 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
4771 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
a547c6db 4772 vmx_set_constant_host_state(vmx);
6aa8b732
AK
4773 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
4774 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
6aa8b732 4775
2a499e49
BD
4776 if (cpu_has_vmx_vmfunc())
4777 vmcs_write64(VM_FUNCTION_CONTROL, 0);
4778
2cc51560
ED
4779 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
4780 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
33966dd6 4781 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
2cc51560 4782 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
33966dd6 4783 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
6aa8b732 4784
74545705
RK
4785 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
4786 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
468d472f 4787
3af80fec 4788 vm_exit_controls_set(vmx, vmx_vmexit_ctrl());
6aa8b732
AK
4789
4790 /* 22.2.1, 20.8.1 */
3af80fec 4791 vm_entry_controls_set(vmx, vmx_vmentry_ctrl());
1c3d14fe 4792
fa71e952
SC
4793 vmx->vcpu.arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
4794 vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);
bd7e5b08 4795
bf8179a0 4796 set_cr4_guest_host_mask(vmx);
e00c8cf2 4797
35fbe0d4
XL
4798 if (vmx->vpid != 0)
4799 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
4800
becdad85 4801 if (cpu_has_vmx_xsaves())
f53cd63c
WL
4802 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
4803
4e59516a 4804 if (enable_pml) {
4e59516a
PF
4805 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
4806 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
4807 }
0b665d30 4808
72add915 4809 vmx_write_encls_bitmap(&vmx->vcpu, NULL);
2ef444f1 4810
2ef7619d 4811 if (vmx_pt_mode_is_host_guest()) {
2ef444f1
CP
4812 memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
4813 /* Bits 6:0 are forced to 1; writes to them are ignored. */
4814 vmx->pt_desc.guest.output_mask = 0x7F;
4815 vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
4816 }
c5c9f920 4817
e5494940
SC
4818 vmcs_write32(GUEST_SYSENTER_CS, 0);
4819 vmcs_writel(GUEST_SYSENTER_ESP, 0);
4820 vmcs_writel(GUEST_SYSENTER_EIP, 0);
4821 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4822
4823 if (cpu_has_vmx_tpr_shadow()) {
4824 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
4825 if (cpu_need_tpr_shadow(&vmx->vcpu))
4826 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
4827 __pa(vmx->vcpu.arch.apic->regs));
4828 vmcs_write32(TPR_THRESHOLD, 0);
4829 }
4830
c5c9f920 4831 vmx_setup_uret_msrs(vmx);
e00c8cf2
AK
4832}
4833
06692e4b
SC
4834static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4835{
4836 struct vcpu_vmx *vmx = to_vmx(vcpu);
4837
4838 init_vmcs(vmx);
4839
4840 if (nested)
4841 memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
4842
4843 vcpu_setup_sgx_lepubkeyhash(vcpu);
4844
4845 vmx->nested.posted_intr_nv = -1;
4846 vmx->nested.vmxon_ptr = INVALID_GPA;
4847 vmx->nested.current_vmptr = INVALID_GPA;
4848 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
4849
4850 vcpu->arch.microcode_version = 0x100000000ULL;
4851 vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
4852
4853 /*
4854 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
4855 * or POSTED_INTR_WAKEUP_VECTOR.
4856 */
4857 vmx->pi_desc.nv = POSTED_INTR_VECTOR;
4858 vmx->pi_desc.sn = 1;
4859}
4860
d28bc9dd 4861static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
e00c8cf2
AK
4862{
4863 struct vcpu_vmx *vmx = to_vmx(vcpu);
e00c8cf2 4864
06692e4b
SC
4865 if (!init_event)
4866 __vmx_vcpu_reset(vcpu);
4867
7ffd92c5 4868 vmx->rmode.vm86_active = 0;
d28b387f 4869 vmx->spec_ctrl = 0;
e00c8cf2 4870
6e3ba4ab
TX
4871 vmx->msr_ia32_umwait_control = 0;
4872
95c06540 4873 vmx->hv_deadline_tsc = -1;
d28bc9dd
NA
4874 kvm_set_cr8(vcpu, 0);
4875
2fb92db1 4876 vmx_segment_cache_clear(vmx);
ff8828c8 4877 kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);
2fb92db1 4878
5706be0d 4879 seg_setup(VCPU_SREG_CS);
66450a21 4880 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
f3531054 4881 vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
e00c8cf2
AK
4882
4883 seg_setup(VCPU_SREG_DS);
4884 seg_setup(VCPU_SREG_ES);
4885 seg_setup(VCPU_SREG_FS);
4886 seg_setup(VCPU_SREG_GS);
4887 seg_setup(VCPU_SREG_SS);
4888
4889 vmcs_write16(GUEST_TR_SELECTOR, 0);
4890 vmcs_writel(GUEST_TR_BASE, 0);
4891 vmcs_write32(GUEST_TR_LIMIT, 0xffff);
4892 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
4893
4894 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
4895 vmcs_writel(GUEST_LDTR_BASE, 0);
4896 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
4897 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
4898
e00c8cf2
AK
4899 vmcs_writel(GUEST_GDTR_BASE, 0);
4900 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
4901
4902 vmcs_writel(GUEST_IDTR_BASE, 0);
4903 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
4904
443381a8 4905 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
e00c8cf2 4906 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
f3531054 4907 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
a554d207
WL
4908 if (kvm_mpx_supported())
4909 vmcs_write64(GUEST_BNDCFGS, 0);
e00c8cf2 4910
6aa8b732
AK
4911 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
4912
a73896cb 4913 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
6aa8b732 4914
dd5f5341 4915 vpid_sync_context(vmx->vpid);
027bbb88
PG
4916
4917 vmx_update_fb_clear_dis(vcpu, vmx);
6aa8b732
AK
4918}
4919
b6a7cc35 4920static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
3b86cd99 4921{
9dadc2f9 4922 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
3b86cd99
JK
4923}
4924
b6a7cc35 4925static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
3b86cd99 4926{
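	/*
	 * When virtual NMIs are disabled or the guest is in the STI shadow,
	 * fall back to requesting an IRQ window and re-evaluate NMI
	 * injection once that window opens.
	 */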
d02fcf50 4927 if (!enable_vnmi ||
8a1b4392 4928 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
b6a7cc35 4929 vmx_enable_irq_window(vcpu);
c9a7953f
JK
4930 return;
4931 }
3b86cd99 4932
4e2a0bc5 4933 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
3b86cd99
JK
4934}
4935
2d613912 4936static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
85f455f7 4937{
9c8cba37 4938 struct vcpu_vmx *vmx = to_vmx(vcpu);
66fd3f7f
GN
4939 uint32_t intr;
4940 int irq = vcpu->arch.interrupt.nr;
9c8cba37 4941
2d613912 4942 trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected);
2714d1d3 4943
fa89a817 4944 ++vcpu->stat.irq_injections;
7ffd92c5 4945 if (vmx->rmode.vm86_active) {
71f9833b
SH
4946 int inc_eip = 0;
4947 if (vcpu->arch.interrupt.soft)
4948 inc_eip = vcpu->arch.event_exit_inst_len;
9497e1f2 4949 kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
85f455f7
ED
4950 return;
4951 }
66fd3f7f
GN
4952 intr = irq | INTR_INFO_VALID_MASK;
4953 if (vcpu->arch.interrupt.soft) {
4954 intr |= INTR_TYPE_SOFT_INTR;
4955 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
4956 vmx->vcpu.arch.event_exit_inst_len);
4957 } else
4958 intr |= INTR_TYPE_EXT_INTR;
4959 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
caa057a2
WL
4960
4961 vmx_clear_hlt(vcpu);
85f455f7
ED
4962}
4963
f08864b4
SY
4964static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
4965{
66a5a347
JK
4966 struct vcpu_vmx *vmx = to_vmx(vcpu);
4967
d02fcf50 4968 if (!enable_vnmi) {
8a1b4392
PB
4969 /*
4970 * Tracking the NMI-blocked state in software is built upon
4971 * finding the next open IRQ window. This, in turn, depends on
4972 * well-behaving guests: They have to keep IRQs disabled at
4973 * least as long as the NMI handler runs. Otherwise we may
4974 * cause NMI nesting, maybe breaking the guest. But as this is
4975 * highly unlikely, we can live with the residual risk.
4976 */
4977 vmx->loaded_vmcs->soft_vnmi_blocked = 1;
4978 vmx->loaded_vmcs->vnmi_blocked_time = 0;
4979 }
4980
4c4a6f79
PB
4981 ++vcpu->stat.nmi_injections;
4982 vmx->loaded_vmcs->nmi_known_unmasked = false;
3b86cd99 4983
7ffd92c5 4984 if (vmx->rmode.vm86_active) {
9497e1f2 4985 kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
66a5a347
JK
4986 return;
4987 }
c5a6d5f7 4988
f08864b4
SY
4989 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
4990 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
caa057a2
WL
4991
4992 vmx_clear_hlt(vcpu);
f08864b4
SY
4993}
4994
97b7ead3 4995bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
3cfc3092 4996{
4c4a6f79
PB
4997 struct vcpu_vmx *vmx = to_vmx(vcpu);
4998 bool masked;
4999
d02fcf50 5000 if (!enable_vnmi)
8a1b4392 5001 return vmx->loaded_vmcs->soft_vnmi_blocked;
4c4a6f79 5002 if (vmx->loaded_vmcs->nmi_known_unmasked)
9d58b931 5003 return false;
4c4a6f79
PB
5004 masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
5005 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
5006 return masked;
3cfc3092
JK
5007}
5008
97b7ead3 5009void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3cfc3092
JK
5010{
5011 struct vcpu_vmx *vmx = to_vmx(vcpu);
5012
d02fcf50 5013 if (!enable_vnmi) {
8a1b4392
PB
5014 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
5015 vmx->loaded_vmcs->soft_vnmi_blocked = masked;
5016 vmx->loaded_vmcs->vnmi_blocked_time = 0;
5017 }
5018 } else {
5019 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
5020 if (masked)
5021 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
5022 GUEST_INTR_STATE_NMI);
5023 else
5024 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
5025 GUEST_INTR_STATE_NMI);
5026 }
3cfc3092
JK
5027}
5028
1b660b6b
SC
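/*
 * An NMI that is destined to trigger a VM-Exit from L2 to L1 is never
 * considered blocked.  Otherwise NMIs are blocked either by the soft-vNMI
 * tracking (when virtual NMIs are unavailable) or by the MOV-SS/STI/NMI bits
 * of the guest interruptibility state.
 */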
5029bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
5030{
5031 if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
5032 return false;
5033
5034 if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
5035 return true;
5036
5037 return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5038 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
5039 GUEST_INTR_STATE_NMI));
5040}
5041
c9d40913 5042static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
2505dc9f 5043{
b6b8a145 5044 if (to_vmx(vcpu)->nested.nested_run_pending)
c9d40913 5045 return -EBUSY;
ea8ceb83 5046
c300ab9f
PB
5047 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
5048 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
c9d40913 5049 return -EBUSY;
c300ab9f 5050
1b660b6b
SC
5051 return !vmx_nmi_blocked(vcpu);
5052}
429ab576 5053
1b660b6b
SC
5054bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
5055{
5056 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
88c604b6 5057 return false;
8a1b4392 5058
7ab0abdb 5059 return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
1b660b6b
SC
5060 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5061 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
2505dc9f
JK
5062}
5063
c9d40913 5064static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
78646121 5065{
a1c77abb 5066 if (to_vmx(vcpu)->nested.nested_run_pending)
c9d40913 5067 return -EBUSY;
a1c77abb 5068
06e18547
RT
5069 /*
5070 * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
5071 * e.g. if the IRQ arrived asynchronously after checking nested events.
5072 */
c300ab9f 5073 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
c9d40913 5074 return -EBUSY;
c300ab9f 5075
1b660b6b 5076 return !vmx_interrupt_blocked(vcpu);
78646121
GN
5077}
5078
cbc94022
IE
5079static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
5080{
ff5a983c 5081 void __user *ret;
cbc94022 5082
f7eaeb0a
SC
5083 if (enable_unrestricted_guest)
5084 return 0;
5085
6a3c623b
PX
5086 mutex_lock(&kvm->slots_lock);
5087 ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
5088 PAGE_SIZE * 3);
5089 mutex_unlock(&kvm->slots_lock);
5090
ff5a983c
PX
5091 if (IS_ERR(ret))
5092 return PTR_ERR(ret);
5093
40bbb9d0 5094 to_kvm_vmx(kvm)->tss_addr = addr;
ff5a983c
PX
5095
5096 return init_rmode_tss(kvm, ret);
cbc94022
IE
5097}
5098
2ac52ab8
SC
5099static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
5100{
40bbb9d0 5101 to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
2ac52ab8
SC
5102 return 0;
5103}
5104
0ca1b4f4 5105static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
6aa8b732 5106{
77ab6db0 5107 switch (vec) {
77ab6db0 5108 case BP_VECTOR:
c573cd22
JK
5109 /*
5110 * Update instruction length as we may reinject the exception
5111 * from user space while in guest debugging mode.
5112 */
5113 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
5114 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
d0bfb940 5115 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
0ca1b4f4 5116 return false;
df561f66 5117 fallthrough;
0ca1b4f4 5118 case DB_VECTOR:
a8cfbae5
ML
5119 return !(vcpu->guest_debug &
5120 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP));
d0bfb940 5121 case DE_VECTOR:
77ab6db0
JK
5122 case OF_VECTOR:
5123 case BR_VECTOR:
5124 case UD_VECTOR:
5125 case DF_VECTOR:
5126 case SS_VECTOR:
5127 case GP_VECTOR:
5128 case MF_VECTOR:
0ca1b4f4 5129 return true;
77ab6db0 5130 }
0ca1b4f4
GN
5131 return false;
5132}
5133
5134static int handle_rmode_exception(struct kvm_vcpu *vcpu,
5135 int vec, u32 err_code)
5136{
5137 /*
 5138 * An instruction with the address-size override prefix (opcode 0x67)
 5139 * causes a #SS fault with error code 0 in VM86 mode.
5140 */
5141 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
60fc3d02 5142 if (kvm_emulate_instruction(vcpu, 0)) {
0ca1b4f4
GN
5143 if (vcpu->arch.halt_request) {
5144 vcpu->arch.halt_request = 0;
1460179d 5145 return kvm_emulate_halt_noskip(vcpu);
0ca1b4f4
GN
5146 }
5147 return 1;
5148 }
5149 return 0;
5150 }
5151
5152 /*
5153 * Forward all other exceptions that are valid in real mode.
5154 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
5155 * the required debugging infrastructure rework.
5156 */
5157 kvm_queue_exception(vcpu, vec);
5158 return 1;
6aa8b732
AK
5159}
5160
851ba692 5161static int handle_machine_check(struct kvm_vcpu *vcpu)
a0861c02 5162{
95b5a48c 5163 /* handled by vmx_vcpu_run() */
a0861c02
AK
5164 return 1;
5165}
5166
e6f8b6c1
XL
5167/*
5168 * If the host has split lock detection disabled, then #AC is
5169 * unconditionally injected into the guest, which is the pre split lock
5170 * detection behaviour.
5171 *
5172 * If the host has split lock detection enabled then #AC is
5173 * only injected into the guest when:
5174 * - Guest CPL == 3 (user mode)
5175 * - Guest has #AC detection enabled in CR0
5176 * - Guest EFLAGS has AC bit set
5177 */
b33bb78a 5178bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
e6f8b6c1
XL
5179{
5180 if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
5181 return true;
5182
5183 return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) &&
5184 (kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
5185}
5186
95b5a48c 5187static int handle_exception_nmi(struct kvm_vcpu *vcpu)
6aa8b732 5188{
1155f76a 5189 struct vcpu_vmx *vmx = to_vmx(vcpu);
851ba692 5190 struct kvm_run *kvm_run = vcpu->run;
d0bfb940 5191 u32 intr_info, ex_no, error_code;
e87e46d5 5192 unsigned long cr2, dr6;
6aa8b732 5193 u32 vect_info;
6aa8b732 5194
1155f76a 5195 vect_info = vmx->idt_vectoring_info;
f27ad73a 5196 intr_info = vmx_get_intr_info(vcpu);
6aa8b732 5197
11df586d
SC
5198 /*
5199 * Machine checks are handled by handle_exception_irqoff(), or by
5200 * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by
5201 * vmx_vcpu_enter_exit().
5202 */
2ea72039 5203 if (is_machine_check(intr_info) || is_nmi(intr_info))
11df586d 5204 return 1;
2ab455cc 5205
ec5be88a
JL
5206 /*
5207 * Queue the exception here instead of in handle_nm_fault_irqoff().
5208 * This ensures the nested_vmx check is not skipped so vmexit can
5209 * be reflected to L1 (when it intercepts #NM) before reaching this
5210 * point.
5211 */
5212 if (is_nm_fault(intr_info)) {
5213 kvm_queue_exception(vcpu, NM_VECTOR);
5214 return 1;
5215 }
5216
082d06ed
WL
5217 if (is_invalid_opcode(intr_info))
5218 return handle_ud(vcpu);
7aa81cc0 5219
6aa8b732 5220 error_code = 0;
2e11384c 5221 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
6aa8b732 5222 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
bf4ca23e 5223
9e869480
LA
5224 if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
5225 WARN_ON_ONCE(!enable_vmware_backdoor);
a6c6ed1e
SC
5226
5227 /*
5228 * VMware backdoor emulation on #GP interception only handles
5229 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
5230 * error code on #GP.
5231 */
5232 if (error_code) {
5233 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
5234 return 1;
5235 }
60fc3d02 5236 return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
9e869480
LA
5237 }
5238
bf4ca23e
XG
5239 /*
 5240 * A #PF with PFEC.RSVD = 1 indicates the guest is accessing MMIO;
 5241 * in that case it is better to report an internal error.
5242 * See the comments in vmx_handle_exit.
5243 */
5244 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
5245 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
5246 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5247 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
1aa561b1 5248 vcpu->run->internal.ndata = 4;
bf4ca23e
XG
5249 vcpu->run->internal.data[0] = vect_info;
5250 vcpu->run->internal.data[1] = intr_info;
80f0e95d 5251 vcpu->run->internal.data[2] = error_code;
8a14fe4f 5252 vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
bf4ca23e
XG
5253 return 0;
5254 }
5255
6aa8b732 5256 if (is_page_fault(intr_info)) {
5addc235 5257 cr2 = vmx_get_exit_qual(vcpu);
1dbf5d68
MG
5258 if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
5259 /*
5260 * EPT will cause page fault only if we need to
5261 * detect illegal GPAs.
5262 */
b96e6506 5263 WARN_ON_ONCE(!allow_smaller_maxphyaddr);
1dbf5d68
MG
5264 kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
5265 return 1;
5266 } else
5267 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
6aa8b732
AK
5268 }
5269
d0bfb940 5270 ex_no = intr_info & INTR_INFO_VECTOR_MASK;
0ca1b4f4
GN
5271
5272 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
5273 return handle_rmode_exception(vcpu, ex_no, error_code);
5274
42dbaa5a
JK
5275 switch (ex_no) {
5276 case DB_VECTOR:
5addc235 5277 dr6 = vmx_get_exit_qual(vcpu);
42dbaa5a
JK
5278 if (!(vcpu->guest_debug &
5279 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
b9bed78e
SC
5280 /*
5281 * If the #DB was due to ICEBP, a.k.a. INT1, skip the
5282 * instruction. ICEBP generates a trap-like #DB, but
5283 * despite its interception control being tied to #DB,
5284 * is an instruction intercept, i.e. the VM-Exit occurs
65ec8f01
SC
5285 * on the ICEBP itself. Use the inner "skip" helper to
5286 * avoid single-step #DB and MTF updates, as ICEBP is
5287 * higher priority. Note, skipping ICEBP still clears
5288 * STI and MOVSS blocking.
b9bed78e
SC
5289 *
5290 * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
5291 * if single-step is enabled in RFLAGS and STI or MOVSS
5292 * blocking is active, as the CPU doesn't set the bit
5293 * on VM-Exit due to #DB interception. VM-Entry has a
5294 * consistency check that a single-step #DB is pending
5295 * in this scenario as the previous instruction cannot
5296 * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
5297 * don't modify RFLAGS), therefore the one instruction
5298 * delay when activating single-step breakpoints must
5299 * have already expired. Note, the CPU sets/clears BS
5300 * as appropriate for all other VM-Exits types.
5301 */
32d43cd3 5302 if (is_icebp(intr_info))
1957aa63 5303 WARN_ON(!skip_emulated_instruction(vcpu));
b9bed78e
SC
5304 else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
5305 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5306 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
5307 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
5308 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
fd2a445a 5309
4d5523cf 5310 kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
42dbaa5a
JK
5311 return 1;
5312 }
9a3ecd5e 5313 kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
42dbaa5a 5314 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
df561f66 5315 fallthrough;
42dbaa5a 5316 case BP_VECTOR:
c573cd22
JK
5317 /*
5318 * Update instruction length as we may reinject #BP from
5319 * user space while in guest debugging mode. Reading it for
5320 * #DB as well causes no harm, it is not used in that case.
5321 */
5322 vmx->vcpu.arch.event_exit_inst_len =
5323 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
6aa8b732 5324 kvm_run->exit_reason = KVM_EXIT_DEBUG;
e87e46d5 5325 kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
d0bfb940 5326 kvm_run->debug.arch.exception = ex_no;
42dbaa5a 5327 break;
e6f8b6c1 5328 case AC_VECTOR:
b33bb78a 5329 if (vmx_guest_inject_ac(vcpu)) {
e6f8b6c1
XL
5330 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
5331 return 1;
5332 }
5333
5334 /*
5335 * Handle split lock. Depending on detection mode this will
5336 * either warn and disable split lock detection for this
5337 * task or force SIGBUS on it.
5338 */
5339 if (handle_guest_split_lock(kvm_rip_read(vcpu)))
5340 return 1;
5341 fallthrough;
42dbaa5a 5342 default:
d0bfb940
JK
5343 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
5344 kvm_run->ex.exception = ex_no;
5345 kvm_run->ex.error_code = error_code;
42dbaa5a 5346 break;
6aa8b732 5347 }
6aa8b732
AK
5348 return 0;
5349}
5350
f399e60c 5351static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
6aa8b732 5352{
1165f5fe 5353 ++vcpu->stat.irq_exits;
6aa8b732
AK
5354 return 1;
5355}
5356
851ba692 5357static int handle_triple_fault(struct kvm_vcpu *vcpu)
988ad74f 5358{
851ba692 5359 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
bbeac283 5360 vcpu->mmio_needed = 0;
988ad74f
AK
5361 return 0;
5362}
6aa8b732 5363
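/*
 * For I/O instruction exits the exit qualification encodes the access:
 * bits 2:0 hold the access size minus one, bit 3 the direction (set for IN),
 * bit 4 whether this is a string instruction, and bits 31:16 the port.
 */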
851ba692 5364static int handle_io(struct kvm_vcpu *vcpu)
6aa8b732 5365{
bfdaab09 5366 unsigned long exit_qualification;
dca7f128 5367 int size, in, string;
039576c0 5368 unsigned port;
6aa8b732 5369
5addc235 5370 exit_qualification = vmx_get_exit_qual(vcpu);
039576c0 5371 string = (exit_qualification & 16) != 0;
e70669ab 5372
cf8f70bf 5373 ++vcpu->stat.io_exits;
e70669ab 5374
432baf60 5375 if (string)
60fc3d02 5376 return kvm_emulate_instruction(vcpu, 0);
e70669ab 5377
cf8f70bf
GN
5378 port = exit_qualification >> 16;
5379 size = (exit_qualification & 7) + 1;
432baf60 5380 in = (exit_qualification & 8) != 0;
cf8f70bf 5381
dca7f128 5382 return kvm_fast_pio(vcpu, size, port, in);
6aa8b732
AK
5383}
5384
102d8325
IM
5385static void
5386vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
5387{
5388 /*
5389 * Patch in the VMCALL instruction:
5390 */
5391 hypercall[0] = 0x0f;
5392 hypercall[1] = 0x01;
5393 hypercall[2] = 0xc1;
102d8325
IM
5394}
5395
0fa06071 5396/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
eeadf9e7
NHE
5397static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
5398{
eeadf9e7 5399 if (is_guest_mode(vcpu)) {
1a0d74e6
JK
5400 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5401 unsigned long orig_val = val;
5402
eeadf9e7
NHE
5403 /*
5404 * We get here when L2 changed cr0 in a way that did not change
5405 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
1a0d74e6
JK
5406 * but did change L0 shadowed bits. So we first calculate the
5407 * effective cr0 value that L1 would like to write into the
5408 * hardware. It consists of the L2-owned bits from the new
5409 * value combined with the L1-owned bits from L1's guest_cr0.
eeadf9e7 5410 */
1a0d74e6
JK
5411 val = (val & ~vmcs12->cr0_guest_host_mask) |
5412 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
5413
3899152c 5414 if (!nested_guest_cr0_valid(vcpu, val))
eeadf9e7 5415 return 1;
1a0d74e6
JK
5416
5417 if (kvm_set_cr0(vcpu, val))
5418 return 1;
5419 vmcs_writel(CR0_READ_SHADOW, orig_val);
eeadf9e7 5420 return 0;
1a0d74e6
JK
5421 } else {
5422 if (to_vmx(vcpu)->nested.vmxon &&
3899152c 5423 !nested_host_cr0_valid(vcpu, val))
1a0d74e6 5424 return 1;
3899152c 5425
eeadf9e7 5426 return kvm_set_cr0(vcpu, val);
1a0d74e6 5427 }
eeadf9e7
NHE
5428}
5429
5430static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
5431{
5432 if (is_guest_mode(vcpu)) {
1a0d74e6
JK
5433 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5434 unsigned long orig_val = val;
5435
5436 /* analogously to handle_set_cr0 */
5437 val = (val & ~vmcs12->cr4_guest_host_mask) |
5438 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
5439 if (kvm_set_cr4(vcpu, val))
eeadf9e7 5440 return 1;
1a0d74e6 5441 vmcs_writel(CR4_READ_SHADOW, orig_val);
eeadf9e7
NHE
5442 return 0;
5443 } else
5444 return kvm_set_cr4(vcpu, val);
5445}
5446
0367f205
PB
5447static int handle_desc(struct kvm_vcpu *vcpu)
5448{
5449 WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
60fc3d02 5450 return kvm_emulate_instruction(vcpu, 0);
0367f205
PB
5451}
5452
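/*
 * For control-register exits the exit qualification encodes the access:
 * bits 3:0 hold the CR number, bits 5:4 the access type (0 = MOV to CR,
 * 1 = MOV from CR, 2 = CLTS, 3 = LMSW) and bits 11:8 the GPR involved.
 */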
851ba692 5453static int handle_cr(struct kvm_vcpu *vcpu)
6aa8b732 5454{
229456fc 5455 unsigned long exit_qualification, val;
6aa8b732
AK
5456 int cr;
5457 int reg;
49a9b07e 5458 int err;
6affcbed 5459 int ret;
6aa8b732 5460
5addc235 5461 exit_qualification = vmx_get_exit_qual(vcpu);
6aa8b732
AK
5462 cr = exit_qualification & 15;
5463 reg = (exit_qualification >> 8) & 15;
5464 switch ((exit_qualification >> 4) & 3) {
5465 case 0: /* mov to cr */
27b4a9c4 5466 val = kvm_register_read(vcpu, reg);
229456fc 5467 trace_kvm_cr_write(cr, val);
6aa8b732
AK
5468 switch (cr) {
5469 case 0:
eeadf9e7 5470 err = handle_set_cr0(vcpu, val);
6affcbed 5471 return kvm_complete_insn_gp(vcpu, err);
6aa8b732 5472 case 3:
e1de91cc 5473 WARN_ON_ONCE(enable_unrestricted_guest);
67369273 5474
2390218b 5475 err = kvm_set_cr3(vcpu, val);
6affcbed 5476 return kvm_complete_insn_gp(vcpu, err);
6aa8b732 5477 case 4:
eeadf9e7 5478 err = handle_set_cr4(vcpu, val);
6affcbed 5479 return kvm_complete_insn_gp(vcpu, err);
0a5fff19
GN
5480 case 8: {
5481 u8 cr8_prev = kvm_get_cr8(vcpu);
1e32c079 5482 u8 cr8 = (u8)val;
eea1cff9 5483 err = kvm_set_cr8(vcpu, cr8);
6affcbed 5484 ret = kvm_complete_insn_gp(vcpu, err);
35754c98 5485 if (lapic_in_kernel(vcpu))
6affcbed 5486 return ret;
0a5fff19 5487 if (cr8_prev <= cr8)
6affcbed
KH
5488 return ret;
5489 /*
5490 * TODO: we might be squashing a
5491 * KVM_GUESTDBG_SINGLESTEP-triggered
5492 * KVM_EXIT_DEBUG here.
5493 */
851ba692 5494 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
0a5fff19
GN
5495 return 0;
5496 }
4b8073e4 5497 }
6aa8b732 5498 break;
25c4c276 5499 case 2: /* clts */
67369273
SC
5500 KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS");
5501 return -EIO;
6aa8b732
AK
5502 case 1: /*mov from cr*/
5503 switch (cr) {
5504 case 3:
e1de91cc 5505 WARN_ON_ONCE(enable_unrestricted_guest);
67369273 5506
9f8fe504
AK
5507 val = kvm_read_cr3(vcpu);
5508 kvm_register_write(vcpu, reg, val);
5509 trace_kvm_cr_read(cr, val);
6affcbed 5510 return kvm_skip_emulated_instruction(vcpu);
6aa8b732 5511 case 8:
229456fc
MT
5512 val = kvm_get_cr8(vcpu);
5513 kvm_register_write(vcpu, reg, val);
5514 trace_kvm_cr_read(cr, val);
6affcbed 5515 return kvm_skip_emulated_instruction(vcpu);
6aa8b732
AK
5516 }
5517 break;
5518 case 3: /* lmsw */
a1f83a74 5519 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
4d4ec087 5520 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
a1f83a74 5521 kvm_lmsw(vcpu, val);
6aa8b732 5522
6affcbed 5523 return kvm_skip_emulated_instruction(vcpu);
6aa8b732
AK
5524 default:
5525 break;
5526 }
851ba692 5527 vcpu->run->exit_reason = 0;
a737f256 5528 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
6aa8b732
AK
5529 (int)(exit_qualification >> 4) & 3, cr);
5530 return 0;
5531}
5532
851ba692 5533static int handle_dr(struct kvm_vcpu *vcpu)
6aa8b732 5534{
bfdaab09 5535 unsigned long exit_qualification;
16f8a6f9 5536 int dr, dr7, reg;
996ff542 5537 int err = 1;
16f8a6f9 5538
5addc235 5539 exit_qualification = vmx_get_exit_qual(vcpu);
16f8a6f9
NA
5540 dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
5541
5542 /* First, if DR does not exist, trigger UD */
5543 if (!kvm_require_dr(vcpu, dr))
5544 return 1;
6aa8b732 5545
ef2d488c 5546 if (vmx_get_cpl(vcpu) > 0)
996ff542
PB
5547 goto out;
5548
16f8a6f9
NA
5549 dr7 = vmcs_readl(GUEST_DR7);
5550 if (dr7 & DR7_GD) {
42dbaa5a
JK
5551 /*
5552 * As the vm-exit takes precedence over the debug trap, we
5553 * need to emulate the latter, either for the host or the
5554 * guest debugging itself.
5555 */
5556 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
9a3ecd5e 5557 vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW;
16f8a6f9 5558 vcpu->run->debug.arch.dr7 = dr7;
82b32774 5559 vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
851ba692
AK
5560 vcpu->run->debug.arch.exception = DB_VECTOR;
5561 vcpu->run->exit_reason = KVM_EXIT_DEBUG;
42dbaa5a
JK
5562 return 0;
5563 } else {
4d5523cf 5564 kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD);
42dbaa5a
JK
5565 return 1;
5566 }
5567 }
5568
81908bf4 5569 if (vcpu->guest_debug == 0) {
2183f564 5570 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
81908bf4
PB
5571
5572 /*
5573 * No more DR vmexits; force a reload of the debug registers
5574 * and reenter on this instruction. The next vmexit will
5575 * retrieve the full state of the debug registers.
5576 */
5577 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
5578 return 1;
5579 }
5580
42dbaa5a
JK
5581 reg = DEBUG_REG_ACCESS_REG(exit_qualification);
5582 if (exit_qualification & TYPE_MOV_FROM_DR) {
020df079 5583 unsigned long val;
4c4d563b 5584
29d6ca41 5585 kvm_get_dr(vcpu, dr, &val);
4c4d563b 5586 kvm_register_write(vcpu, reg, val);
996ff542
PB
5587 err = 0;
5588 } else {
27b4a9c4 5589 err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
996ff542 5590 }
4c4d563b 5591
996ff542
PB
5592out:
5593 return kvm_complete_insn_gp(vcpu, err);
6aa8b732
AK
5594}
5595
81908bf4
PB
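/*
 * Called when the guest has been running with direct access to the debug
 * registers (KVM_DEBUGREG_WONT_EXIT): snapshot DR0-DR3, DR6 and DR7 into the
 * vcpu and re-enable MOV-DR exiting so that subsequent accesses trap again.
 */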
5596static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
5597{
81908bf4
PB
5598 get_debugreg(vcpu->arch.db[0], 0);
5599 get_debugreg(vcpu->arch.db[1], 1);
5600 get_debugreg(vcpu->arch.db[2], 2);
5601 get_debugreg(vcpu->arch.db[3], 3);
5602 get_debugreg(vcpu->arch.dr6, 6);
5603 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
5604
5605 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
2183f564 5606 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
1ccb6f98
PB
5607
5608 /*
 5609 * exc_debug expects dr6 to be cleared after it runs; clear it here so it
 5610 * does not see a stale dr6 from the guest.
5611 */
5612 set_debugreg(DR6_RESERVED, 6);
81908bf4
PB
5613}
5614
020df079
GN
5615static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
5616{
5617 vmcs_writel(GUEST_DR7, val);
5618}
5619
851ba692 5620static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
6e5d865c 5621{
eb90f341 5622 kvm_apic_update_ppr(vcpu);
6e5d865c
YS
5623 return 1;
5624}
5625
851ba692 5626static int handle_interrupt_window(struct kvm_vcpu *vcpu)
6aa8b732 5627{
9dadc2f9 5628 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
2714d1d3 5629
3842d135
AK
5630 kvm_make_request(KVM_REQ_EVENT, vcpu);
5631
a26bf12a 5632 ++vcpu->stat.irq_window_exits;
6aa8b732
AK
5633 return 1;
5634}
5635
851ba692 5636static int handle_invlpg(struct kvm_vcpu *vcpu)
a7052897 5637{
5addc235 5638 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
a7052897
MT
5639
5640 kvm_mmu_invlpg(vcpu, exit_qualification);
6affcbed 5641 return kvm_skip_emulated_instruction(vcpu);
a7052897
MT
5642}
5643
851ba692 5644static int handle_apic_access(struct kvm_vcpu *vcpu)
f78e0e2e 5645{
58fbbf26 5646 if (likely(fasteoi)) {
5addc235 5647 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
58fbbf26
KT
5648 int access_type, offset;
5649
5650 access_type = exit_qualification & APIC_ACCESS_TYPE;
5651 offset = exit_qualification & APIC_ACCESS_OFFSET;
5652 /*
5653 * Sane guest uses MOV to write EOI, with written value
5654 * not cared. So make a short-circuit here by avoiding
5655 * heavy instruction emulation.
5656 */
5657 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
5658 (offset == APIC_EOI)) {
5659 kvm_lapic_set_eoi(vcpu);
6affcbed 5660 return kvm_skip_emulated_instruction(vcpu);
58fbbf26
KT
5661 }
5662 }
60fc3d02 5663 return kvm_emulate_instruction(vcpu, 0);
f78e0e2e
SY
5664}
5665
c7c9c56c
YZ
5666static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
5667{
5addc235 5668 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
c7c9c56c
YZ
5669 int vector = exit_qualification & 0xff;
5670
5671 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
5672 kvm_apic_set_eoi_accelerated(vcpu, vector);
5673 return 1;
5674}
5675
83d4c286
YZ
5676static int handle_apic_write(struct kvm_vcpu *vcpu)
5677{
5addc235 5678 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
83d4c286 5679
b5ede3df
SC
5680 /*
5681 * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and
5682 * hardware has done any necessary aliasing, offset adjustments, etc...
5683 * for the access. I.e. the correct value has already been written to
5684 * the vAPIC page for the correct 16-byte chunk. KVM needs only to
5685 * retrieve the register value and emulate the access.
5686 */
5687 u32 offset = exit_qualification & 0xff0;
5688
83d4c286
YZ
5689 kvm_apic_write_nodecode(vcpu, offset);
5690 return 1;
5691}
5692
851ba692 5693static int handle_task_switch(struct kvm_vcpu *vcpu)
37817f29 5694{
60637aac 5695 struct vcpu_vmx *vmx = to_vmx(vcpu);
37817f29 5696 unsigned long exit_qualification;
e269fb21
JK
5697 bool has_error_code = false;
5698 u32 error_code = 0;
37817f29 5699 u16 tss_selector;
7f3d35fd 5700 int reason, type, idt_v, idt_index;
64a7ec06
GN
5701
5702 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
7f3d35fd 5703 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
64a7ec06 5704 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
37817f29 5705
5addc235 5706 exit_qualification = vmx_get_exit_qual(vcpu);
37817f29
IE
5707
5708 reason = (u32)exit_qualification >> 30;
64a7ec06
GN
5709 if (reason == TASK_SWITCH_GATE && idt_v) {
5710 switch (type) {
5711 case INTR_TYPE_NMI_INTR:
5712 vcpu->arch.nmi_injected = false;
654f06fc 5713 vmx_set_nmi_mask(vcpu, true);
64a7ec06
GN
5714 break;
5715 case INTR_TYPE_EXT_INTR:
66fd3f7f 5716 case INTR_TYPE_SOFT_INTR:
64a7ec06
GN
5717 kvm_clear_interrupt_queue(vcpu);
5718 break;
5719 case INTR_TYPE_HARD_EXCEPTION:
e269fb21
JK
5720 if (vmx->idt_vectoring_info &
5721 VECTORING_INFO_DELIVER_CODE_MASK) {
5722 has_error_code = true;
5723 error_code =
5724 vmcs_read32(IDT_VECTORING_ERROR_CODE);
5725 }
df561f66 5726 fallthrough;
64a7ec06
GN
5727 case INTR_TYPE_SOFT_EXCEPTION:
5728 kvm_clear_exception_queue(vcpu);
5729 break;
5730 default:
5731 break;
5732 }
60637aac 5733 }
37817f29
IE
5734 tss_selector = exit_qualification;
5735
64a7ec06
GN
5736 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
5737 type != INTR_TYPE_EXT_INTR &&
5738 type != INTR_TYPE_NMI_INTR))
1957aa63 5739 WARN_ON(!skip_emulated_instruction(vcpu));
64a7ec06 5740
42dbaa5a
JK
5741 /*
5742 * TODO: What about debug traps on tss switch?
5743 * Are we supposed to inject them and update dr6?
5744 */
1051778f
SC
5745 return kvm_task_switch(vcpu, tss_selector,
5746 type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
60fc3d02 5747 reason, has_error_code, error_code);
37817f29
IE
5748}
5749
851ba692 5750static int handle_ept_violation(struct kvm_vcpu *vcpu)
1439442c 5751{
f9c617f6 5752 unsigned long exit_qualification;
1439442c 5753 gpa_t gpa;
eebed243 5754 u64 error_code;
1439442c 5755
5addc235 5756 exit_qualification = vmx_get_exit_qual(vcpu);
1439442c 5757
0be9c7a8
GN
5758 /*
5759 * EPT violation happened while executing iret from NMI,
5760 * "blocked by NMI" bit has to be set before next VM entry.
5761 * There are errata that may cause this bit to not be set:
5762 * AAK134, BY25.
5763 */
bcd1c294 5764 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
d02fcf50 5765 enable_vnmi &&
bcd1c294 5766 (exit_qualification & INTR_INFO_UNBLOCK_NMI))
0be9c7a8
GN
5767 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
5768
1439442c 5769 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
faa03b39 5770 trace_kvm_page_fault(vcpu, gpa, exit_qualification);
4f5982a5 5771
27959a44 5772 /* Is it a read fault? */
ab22a473 5773 error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
27959a44
JS
5774 ? PFERR_USER_MASK : 0;
5775 /* Is it a write fault? */
ab22a473 5776 error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
27959a44
JS
5777 ? PFERR_WRITE_MASK : 0;
5778 /* Is it a fetch fault? */
ab22a473 5779 error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
27959a44
JS
5780 ? PFERR_FETCH_MASK : 0;
5781 /* ept page table entry is present? */
ca2a7c22 5782 error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
27959a44 5783 ? PFERR_PRESENT_MASK : 0;
4f5982a5 5784
10835602 5785 error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
eebed243 5786 PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
25d92081 5787
25d92081 5788 vcpu->arch.exit_qualification = exit_qualification;
1dbf5d68
MG
5789
5790 /*
5791 * Check that the GPA doesn't exceed physical memory limits, as that is
5792 * a guest page fault. We have to emulate the instruction here, because
 5793 * if the illegal address is that of a paging structure, then the
 5794 * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we
5795 * would also use advanced VM-exit information for EPT violations to
5796 * reconstruct the page fault error code.
5797 */
c0623f5e 5798 if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa)))
1dbf5d68
MG
5799 return kvm_emulate_instruction(vcpu, 0);
5800
4f5982a5 5801 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
1439442c
SY
5802}
5803
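/*
 * EPT misconfiguration exits are how KVM intercepts emulated MMIO accesses:
 * first probe the fast MMIO bus with a zero-length write; if no device
 * claims the GPA, fall back to kvm_mmu_page_fault() with PFERR_RSVD_MASK to
 * emulate the access.
 */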
851ba692 5804static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
68f89400 5805{
68f89400
MT
5806 gpa_t gpa;
5807
4d31d9ef 5808 if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
3c0c2ad1
SC
5809 return 1;
5810
9034e6e8
PB
5811 /*
5812 * A nested guest cannot optimize MMIO vmexits, because we have an
5813 * nGPA here instead of the required GPA.
5814 */
68f89400 5815 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
9034e6e8
PB
5816 if (!is_guest_mode(vcpu) &&
5817 !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
931c33b1 5818 trace_kvm_fast_mmio(gpa);
1957aa63 5819 return kvm_skip_emulated_instruction(vcpu);
68c3b4d1 5820 }
68f89400 5821
c75d0edc 5822 return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
68f89400
MT
5823}
5824
851ba692 5825static int handle_nmi_window(struct kvm_vcpu *vcpu)
f08864b4 5826{
67369273
SC
5827 if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))
5828 return -EIO;
5829
4e2a0bc5 5830 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
f08864b4 5831 ++vcpu->stat.nmi_window_exits;
3842d135 5832 kvm_make_request(KVM_REQ_EVENT, vcpu);
f08864b4
SY
5833
5834 return 1;
5835}
5836
fc4fad79
SC
5837static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
5838{
5839 struct vcpu_vmx *vmx = to_vmx(vcpu);
5840
5841 return vmx->emulation_required && !vmx->rmode.vm86_active &&
7709aba8 5842 (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected);
fc4fad79
SC
5843}
5844
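/*
 * Emulate the guest one instruction at a time until its state becomes
 * VMX-friendly again (emulation_required clears), an event or pending work
 * forces a return to the main run loop, or the instruction budget runs out.
 */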
80ced186 5845static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
ea953ef0 5846{
8b3079a5 5847 struct vcpu_vmx *vmx = to_vmx(vcpu);
49e9d557 5848 bool intr_window_requested;
b8405c18 5849 unsigned count = 130;
49e9d557 5850
2183f564 5851 intr_window_requested = exec_controls_get(vmx) &
9dadc2f9 5852 CPU_BASED_INTR_WINDOW_EXITING;
ea953ef0 5853
98eb2f8b 5854 while (vmx->emulation_required && count-- != 0) {
db438592 5855 if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
49e9d557
AK
5856 return handle_interrupt_window(&vmx->vcpu);
5857
72875d8a 5858 if (kvm_test_request(KVM_REQ_EVENT, vcpu))
de87dcdd
AK
5859 return 1;
5860
60fc3d02 5861 if (!kvm_emulate_instruction(vcpu, 0))
8fff2710 5862 return 0;
1d5a4d9b 5863
fc4fad79 5864 if (vmx_emulation_required_with_pending_exception(vcpu)) {
e615e355 5865 kvm_prepare_emulation_failure_exit(vcpu);
8fff2710
SC
5866 return 0;
5867 }
ea953ef0 5868
8d76c49e
GN
5869 if (vcpu->arch.halt_request) {
5870 vcpu->arch.halt_request = 0;
1460179d 5871 return kvm_emulate_halt_noskip(vcpu);
8d76c49e
GN
5872 }
5873
8fff2710 5874 /*
72c3c0fe
TG
5875 * Note, return 1 and not 0, vcpu_run() will invoke
5876 * xfer_to_guest_mode() which will create a proper return
5877 * code.
8fff2710 5878 */
72c3c0fe 5879 if (__xfer_to_guest_mode_work_pending())
8fff2710 5880 return 1;
ea953ef0
MG
5881 }
5882
8fff2710 5883 return 1;
b4a2d31d
RK
5884}
5885
fc4fad79
SC
5886static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
5887{
5888 if (vmx_emulation_required_with_pending_exception(vcpu)) {
5889 kvm_prepare_emulation_failure_exit(vcpu);
5890 return 0;
5891 }
5892
5893 return 1;
5894}
5895
b4a2d31d
RK
5896static void grow_ple_window(struct kvm_vcpu *vcpu)
5897{
5898 struct vcpu_vmx *vmx = to_vmx(vcpu);
c5c5d6fa 5899 unsigned int old = vmx->ple_window;
b4a2d31d 5900
c8e88717
BM
5901 vmx->ple_window = __grow_ple_window(old, ple_window,
5902 ple_window_grow,
5903 ple_window_max);
b4a2d31d 5904
4f75bcc3 5905 if (vmx->ple_window != old) {
b4a2d31d 5906 vmx->ple_window_dirty = true;
4f75bcc3
PX
5907 trace_kvm_ple_window_update(vcpu->vcpu_id,
5908 vmx->ple_window, old);
5909 }
b4a2d31d
RK
5910}
5911
5912static void shrink_ple_window(struct kvm_vcpu *vcpu)
5913{
5914 struct vcpu_vmx *vmx = to_vmx(vcpu);
c5c5d6fa 5915 unsigned int old = vmx->ple_window;
b4a2d31d 5916
c8e88717
BM
5917 vmx->ple_window = __shrink_ple_window(old, ple_window,
5918 ple_window_shrink,
5919 ple_window);
b4a2d31d 5920
4f75bcc3 5921 if (vmx->ple_window != old) {
b4a2d31d 5922 vmx->ple_window_dirty = true;
4f75bcc3
PX
5923 trace_kvm_ple_window_update(vcpu->vcpu_id,
5924 vmx->ple_window, old);
5925 }
b4a2d31d
RK
5926}
5927
4b8d54f9
ZE
5928/*
 5929 * Indicates a vcpu busy-waiting on a spinlock. KVM does not enable plain
 5930 * PAUSE exiting, so we only get here on CPUs with PAUSE-Loop-Exiting.
5931 */
9fb41ba8 5932static int handle_pause(struct kvm_vcpu *vcpu)
4b8d54f9 5933{
b31c114b 5934 if (!kvm_pause_in_guest(vcpu->kvm))
b4a2d31d
RK
5935 grow_ple_window(vcpu);
5936
de63ad4c
LM
5937 /*
5938 * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting"
5939 * VM-execution control is ignored if CPL > 0. OTOH, KVM
 5940 * never sets PAUSE_EXITING and only sets PLE if supported,
 5941 * so the vcpu must be at CPL 0 if it gets a PAUSE exit.
5942 */
5943 kvm_vcpu_on_spin(vcpu, true);
6affcbed 5944 return kvm_skip_emulated_instruction(vcpu);
4b8d54f9
ZE
5945}
5946
5f3d45e7
MD
5947static int handle_monitor_trap(struct kvm_vcpu *vcpu)
5948{
5949 return 1;
5950}
5951
55d2375e 5952static int handle_invpcid(struct kvm_vcpu *vcpu)
19677e32 5953{
55d2375e
SC
5954 u32 vmx_instruction_info;
5955 unsigned long type;
55d2375e 5956 gva_t gva;
55d2375e
SC
5957 struct {
5958 u64 pcid;
5959 u64 gla;
5960 } operand;
329bd56c 5961 int gpr_index;
f9eb4af6 5962
55d2375e 5963 if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
19677e32
BD
5964 kvm_queue_exception(vcpu, UD_VECTOR);
5965 return 1;
5966 }
5967
55d2375e 5968 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
329bd56c
VS
5969 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
5970 type = kvm_register_read(vcpu, gpr_index);
f9eb4af6 5971
55d2375e
SC
5972 /* According to the Intel instruction reference, the memory operand
5973 * is read even if it isn't needed (e.g., for type==all)
5974 */
5addc235 5975 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
fdb28619
EK
5976 vmx_instruction_info, false,
5977 sizeof(operand), &gva))
3573e22c
BD
5978 return 1;
5979
9715092f 5980 return kvm_handle_invpcid(vcpu, type, gva);
e29acc55
JM
5981}
5982
55d2375e 5983static int handle_pml_full(struct kvm_vcpu *vcpu)
ec378aee 5984{
55d2375e 5985 unsigned long exit_qualification;
b3897a49 5986
55d2375e 5987 trace_kvm_pml_full(vcpu->vcpu_id);
b3897a49 5988
5addc235 5989 exit_qualification = vmx_get_exit_qual(vcpu);
cbf71279
RK
5990
5991 /*
55d2375e
SC
5992 * PML buffer FULL happened while executing iret from NMI,
5993 * "blocked by NMI" bit has to be set before next VM entry.
cbf71279 5994 */
55d2375e
SC
5995 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5996 enable_vnmi &&
5997 (exit_qualification & INTR_INFO_UNBLOCK_NMI))
5998 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
5999 GUEST_INTR_STATE_NMI);
e49fcb8b 6000
55d2375e
SC
6001 /*
 6002 * The PML buffer was already flushed at the beginning of the VM-Exit.
 6003 * Nothing to do here, and no userspace involvement is needed for PML.
6004 */
ec378aee
NHE
6005 return 1;
6006}
6007
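/*
 * A VMX preemption-timer exit that merely signals an expired hv_timer can be
 * handled on the fast path: expire the LAPIC timer and re-enter the guest
 * immediately.  Exits requested via req_immediate_exit, or taken while the
 * timer is soft-disabled, fall back to the normal exit path.
 */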
26efe2fd 6008static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
8ca44e88 6009{
804939ea
SC
6010 struct vcpu_vmx *vmx = to_vmx(vcpu);
6011
6012 if (!vmx->req_immediate_exit &&
26efe2fd 6013 !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
55d2375e 6014 kvm_lapic_expired_hv_timer(vcpu);
26efe2fd
WL
6015 return EXIT_FASTPATH_REENTER_GUEST;
6016 }
6017
6018 return EXIT_FASTPATH_NONE;
6019}
804939ea 6020
26efe2fd
WL
6021static int handle_preemption_timer(struct kvm_vcpu *vcpu)
6022{
6023 handle_fastpath_preemption_timer(vcpu);
55d2375e 6024 return 1;
8ca44e88
DM
6025}
6026
55d2375e
SC
6027/*
 6028 * When nested=0, all VMX instruction VM-Exits land here. The handlers
6029 * are overwritten by nested_vmx_setup() when nested=1.
6030 */
6031static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
b8bbab92 6032{
55d2375e
SC
6033 kvm_queue_exception(vcpu, UD_VECTOR);
6034 return 1;
b8bbab92
VK
6035}
6036
9798adbc 6037#ifndef CONFIG_X86_SGX_KVM
55d2375e 6038static int handle_encls(struct kvm_vcpu *vcpu)
e7953d7f 6039{
55d2375e 6040 /*
9798adbc
SC
6041 * SGX virtualization is disabled. There is no software enable bit for
6042 * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
6043 * the guest from executing ENCLS (when SGX is supported by hardware).
55d2375e
SC
6044 */
6045 kvm_queue_exception(vcpu, UD_VECTOR);
6046 return 1;
e7953d7f 6047}
9798adbc 6048#endif /* CONFIG_X86_SGX_KVM */
e7953d7f 6049
fe6b6bc8
CQ
6050static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
6051{
d61863c6
HX
6052 /*
6053 * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK
6054 * VM-Exits. Unconditionally set the flag here and leave the handling to
6055 * vmx_handle_exit().
6056 */
6057 to_vmx(vcpu)->exit_reason.bus_lock_detected = true;
6058 return 1;
fe6b6bc8
CQ
6059}
6060
2f4073e0
TX
6061static int handle_notify(struct kvm_vcpu *vcpu)
6062{
6063 unsigned long exit_qual = vmx_get_exit_qual(vcpu);
6064 bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID;
6065
6066 ++vcpu->stat.notify_window_exits;
6067
6068 /*
6069 * Notify VM exit happened while executing iret from NMI,
6070 * "blocked by NMI" bit has to be set before next VM entry.
6071 */
6072 if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI))
6073 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6074 GUEST_INTR_STATE_NMI);
6075
6076 if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER ||
6077 context_invalid) {
6078 vcpu->run->exit_reason = KVM_EXIT_NOTIFY;
6079 vcpu->run->notify.flags = context_invalid ?
6080 KVM_NOTIFY_CONTEXT_INVALID : 0;
6081 return 0;
6082 }
6083
6084 return 1;
6085}
6086
ec378aee 6087/*
55d2375e
SC
6088 * The exit handlers return 1 if the exit was handled fully and guest execution
6089 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
6090 * to be done to userspace and return 0.
ec378aee 6091 */
55d2375e 6092static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
95b5a48c 6093 [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi,
55d2375e
SC
6094 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
6095 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
6096 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
6097 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
6098 [EXIT_REASON_CR_ACCESS] = handle_cr,
6099 [EXIT_REASON_DR_ACCESS] = handle_dr,
f399e60c
AA
6100 [EXIT_REASON_CPUID] = kvm_emulate_cpuid,
6101 [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr,
6102 [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr,
9dadc2f9 6103 [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window,
f399e60c 6104 [EXIT_REASON_HLT] = kvm_emulate_halt,
5ff3a351 6105 [EXIT_REASON_INVD] = kvm_emulate_invd,
55d2375e 6106 [EXIT_REASON_INVLPG] = handle_invlpg,
c483c454 6107 [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc,
5ff3a351 6108 [EXIT_REASON_VMCALL] = kvm_emulate_hypercall,
55d2375e
SC
6109 [EXIT_REASON_VMCLEAR] = handle_vmx_instruction,
6110 [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction,
6111 [EXIT_REASON_VMPTRLD] = handle_vmx_instruction,
6112 [EXIT_REASON_VMPTRST] = handle_vmx_instruction,
6113 [EXIT_REASON_VMREAD] = handle_vmx_instruction,
6114 [EXIT_REASON_VMRESUME] = handle_vmx_instruction,
6115 [EXIT_REASON_VMWRITE] = handle_vmx_instruction,
6116 [EXIT_REASON_VMOFF] = handle_vmx_instruction,
6117 [EXIT_REASON_VMON] = handle_vmx_instruction,
6118 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
6119 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
6120 [EXIT_REASON_APIC_WRITE] = handle_apic_write,
6121 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
5ff3a351 6122 [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd,
92f9895c 6123 [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv,
55d2375e
SC
6124 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
6125 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
6126 [EXIT_REASON_GDTR_IDTR] = handle_desc,
6127 [EXIT_REASON_LDTR_TR] = handle_desc,
6128 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
6129 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
6130 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
5ff3a351 6131 [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait,
55d2375e 6132 [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,
5ff3a351 6133 [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor,
55d2375e
SC
6134 [EXIT_REASON_INVEPT] = handle_vmx_instruction,
6135 [EXIT_REASON_INVVPID] = handle_vmx_instruction,
5ff3a351
SC
6136 [EXIT_REASON_RDRAND] = kvm_handle_invalid_op,
6137 [EXIT_REASON_RDSEED] = kvm_handle_invalid_op,
55d2375e
SC
6138 [EXIT_REASON_PML_FULL] = handle_pml_full,
6139 [EXIT_REASON_INVPCID] = handle_invpcid,
6140 [EXIT_REASON_VMFUNC] = handle_vmx_instruction,
6141 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
6142 [EXIT_REASON_ENCLS] = handle_encls,
fe6b6bc8 6143 [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit,
2f4073e0 6144 [EXIT_REASON_NOTIFY] = handle_notify,
55d2375e 6145};
b8bbab92 6146
55d2375e
SC
6147static const int kvm_vmx_max_exit_handlers =
6148 ARRAY_SIZE(kvm_vmx_exit_handlers);
ec378aee 6149
0a62a031
DE
6150static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
6151 u64 *info1, u64 *info2,
235ba74f 6152 u32 *intr_info, u32 *error_code)
ec378aee 6153{
235ba74f
SC
6154 struct vcpu_vmx *vmx = to_vmx(vcpu);
6155
0a62a031 6156 *reason = vmx->exit_reason.full;
5addc235 6157 *info1 = vmx_get_exit_qual(vcpu);
8e533240 6158 if (!(vmx->exit_reason.failed_vmentry)) {
235ba74f
SC
6159 *info2 = vmx->idt_vectoring_info;
6160 *intr_info = vmx_get_intr_info(vcpu);
6161 if (is_exception_with_error_code(*intr_info))
6162 *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
6163 else
6164 *error_code = 0;
6165 } else {
6166 *info2 = 0;
6167 *intr_info = 0;
6168 *error_code = 0;
6169 }
ec378aee
NHE
6170}
6171
55d2375e 6172static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
27d6c865 6173{
55d2375e
SC
6174 if (vmx->pml_pg) {
6175 __free_page(vmx->pml_pg);
6176 vmx->pml_pg = NULL;
b8bbab92 6177 }
27d6c865
NHE
6178}
6179
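/*
 * Drain the Page-Modification Log: hardware records the GPA of each page the
 * guest dirties in the PML buffer and decrements GUEST_PML_INDEX, so the
 * index always points at the next free slot and valid entries sit above it.
 * Mark every logged GPA dirty and reset the index to "buffer empty"
 * (PML_ENTITY_NUM - 1).
 */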
55d2375e 6180static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
cd232ad0 6181{
55d2375e
SC
6182 struct vcpu_vmx *vmx = to_vmx(vcpu);
6183 u64 *pml_buf;
6184 u16 pml_idx;
cd232ad0 6185
55d2375e 6186 pml_idx = vmcs_read16(GUEST_PML_INDEX);
cd232ad0 6187
55d2375e
SC
6188 /* Do nothing if PML buffer is empty */
6189 if (pml_idx == (PML_ENTITY_NUM - 1))
6190 return;
cd232ad0 6191
55d2375e
SC
6192 /* PML index always points to next available PML buffer entity */
6193 if (pml_idx >= PML_ENTITY_NUM)
6194 pml_idx = 0;
6195 else
6196 pml_idx++;
945679e3 6197
55d2375e
SC
6198 pml_buf = page_address(vmx->pml_pg);
6199 for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
6200 u64 gpa;
945679e3 6201
55d2375e
SC
6202 gpa = pml_buf[pml_idx];
6203 WARN_ON(gpa & (PAGE_SIZE - 1));
6204 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
945679e3
VK
6205 }
6206
55d2375e
SC
6207 /* reset PML index */
6208 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
945679e3
VK
6209}
6210
55d2375e 6211static void vmx_dump_sel(char *name, uint32_t sel)
49f705c5 6212{
55d2375e
SC
6213 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
6214 name, vmcs_read16(sel),
6215 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
6216 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
6217 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
49f705c5
NHE
6218}
6219
55d2375e 6220static void vmx_dump_dtsel(char *name, uint32_t limit)
a8bc284e 6221{
55d2375e
SC
6222 pr_err("%s limit=0x%08x, base=0x%016lx\n",
6223 name, vmcs_read32(limit),
6224 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
a8bc284e
JM
6225}
6226
8486039a
DE
6227static void vmx_dump_msrs(char *name, struct vmx_msrs *m)
6228{
6229 unsigned int i;
6230 struct vmx_msr_entry *e;
6231
6232 pr_err("MSR %s:\n", name);
6233 for (i = 0, e = m->val; i < m->nr; ++i, ++e)
6234 pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);
6235}
6236
0702a3cb 6237void dump_vmcs(struct kvm_vcpu *vcpu)
63846663 6238{
0702a3cb 6239 struct vcpu_vmx *vmx = to_vmx(vcpu);
6f2f8453
PB
6240 u32 vmentry_ctl, vmexit_ctl;
6241 u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
0b85baa5 6242 u64 tertiary_exec_control;
6f2f8453 6243 unsigned long cr4;
0702a3cb 6244 int efer_slot;
63846663 6245
6f2f8453
PB
6246 if (!dump_invalid_vmcs) {
6247 pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
6248 return;
6249 }
6250
6251 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
6252 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
6253 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
6254 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
6255 cr4 = vmcs_readl(GUEST_CR4);
0b85baa5 6256
55d2375e
SC
6257 if (cpu_has_secondary_exec_ctrls())
6258 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
0b85baa5
RH
6259 else
6260 secondary_exec_control = 0;
6261
6262 if (cpu_has_tertiary_exec_ctrls())
6263 tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL);
6264 else
6265 tertiary_exec_control = 0;
14c07ad8 6266
18f63b15
JM
6267 pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
6268 vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
55d2375e
SC
6269 pr_err("*** Guest State ***\n");
6270 pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
6271 vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
6272 vmcs_readl(CR0_GUEST_HOST_MASK));
6273 pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
6274 cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
6275 pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
d9e46d34 6276 if (cpu_has_vmx_ept()) {
55d2375e
SC
6277 pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
6278 vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
6279 pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
6280 vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
e9ac033e 6281 }
55d2375e
SC
6282 pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
6283 vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
6284 pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
6285 vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
6286 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
6287 vmcs_readl(GUEST_SYSENTER_ESP),
6288 vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
6289 vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);
6290 vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);
6291 vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);
6292 vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);
6293 vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);
6294 vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);
6295 vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
6296 vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
6297 vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
6298 vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
0702a3cb 6299 efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
5518da62 6300 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
699e1b2e 6301 pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
0702a3cb
DE
6302 else if (efer_slot >= 0)
6303 pr_err("EFER= 0x%016llx (autoload)\n",
6304 vmx->msr_autoload.guest.val[efer_slot].value);
6305 else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
6306 pr_err("EFER= 0x%016llx (effective)\n",
6307 vcpu->arch.efer | (EFER_LMA | EFER_LME));
6308 else
6309 pr_err("EFER= 0x%016llx (effective)\n",
6310 vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
5518da62 6311 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
699e1b2e 6312 pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
55d2375e
SC
6313 pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
6314 vmcs_read64(GUEST_IA32_DEBUGCTL),
6315 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
6316 if (cpu_has_load_perf_global_ctrl() &&
6317 vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
6318 pr_err("PerfGlobCtl = 0x%016llx\n",
6319 vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
6320 if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
6321 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
6322 pr_err("Interruptibility = %08x ActivityState = %08x\n",
6323 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
6324 vmcs_read32(GUEST_ACTIVITY_STATE));
6325 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
6326 pr_err("InterruptStatus = %04x\n",
6327 vmcs_read16(GUEST_INTR_STATUS));
8486039a
DE
6328 if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
6329 vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
6330 if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
6331 vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);
ff651cb6 6332
55d2375e
SC
6333 pr_err("*** Host State ***\n");
6334 pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
6335 vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
6336 pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
6337 vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
6338 vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
6339 vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
6340 vmcs_read16(HOST_TR_SELECTOR));
6341 pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
6342 vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
6343 vmcs_readl(HOST_TR_BASE));
6344 pr_err("GDTBase=%016lx IDTBase=%016lx\n",
6345 vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
6346 pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
6347 vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
6348 vmcs_readl(HOST_CR4));
6349 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
6350 vmcs_readl(HOST_IA32_SYSENTER_ESP),
6351 vmcs_read32(HOST_IA32_SYSENTER_CS),
6352 vmcs_readl(HOST_IA32_SYSENTER_EIP));
699e1b2e
DE
6353 if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
6354 pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
6355 if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
6356 pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
55d2375e
SC
6357 if (cpu_has_load_perf_global_ctrl() &&
6358 vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
6359 pr_err("PerfGlobCtl = 0x%016llx\n",
6360 vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
8486039a
DE
6361 if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
6362 vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
ff651cb6 6363
55d2375e 6364 pr_err("*** Control State ***\n");
0b85baa5
RH
6365 pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
6366 cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control);
6367 pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n",
6368 pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl);
55d2375e
SC
6369 pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
6370 vmcs_read32(EXCEPTION_BITMAP),
6371 vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
6372 vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
6373 pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
6374 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
6375 vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
6376 vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
6377 pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
6378 vmcs_read32(VM_EXIT_INTR_INFO),
6379 vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
6380 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
6381 pr_err(" reason=%08x qualification=%016lx\n",
6382 vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
6383 pr_err("IDTVectoring: info=%08x errcode=%08x\n",
6384 vmcs_read32(IDT_VECTORING_INFO_FIELD),
6385 vmcs_read32(IDT_VECTORING_ERROR_CODE));
6386 pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
6387 if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
6388 pr_err("TSC Multiplier = 0x%016llx\n",
6389 vmcs_read64(TSC_MULTIPLIER));
9d609649
PB
6390 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
6391 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
6392 u16 status = vmcs_read16(GUEST_INTR_STATUS);
6393 pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
6394 }
d6a85c32 6395 pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
9d609649
PB
6396 if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
6397 pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
d6a85c32 6398 pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
9d609649 6399 }
55d2375e
SC
6400 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
6401 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
6402 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
6403 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
55d2375e
SC
6404 if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
6405 pr_err("PLE Gap=%08x Window=%08x\n",
6406 vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
6407 if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
6408 pr_err("Virtual processor ID = 0x%04x\n",
6409 vmcs_read16(VIRTUAL_PROCESSOR_ID));
ff651cb6
WV
6410}
6411
55d2375e
SC
6412/*
6413 * The guest has exited. See if we can fix it or if we need userspace
6414 * assistance.
6415 */
fe6b6bc8 6416static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
ff651cb6 6417{
55d2375e 6418 struct vcpu_vmx *vmx = to_vmx(vcpu);
8e533240 6419 union vmx_exit_reason exit_reason = vmx->exit_reason;
55d2375e 6420 u32 vectoring_info = vmx->idt_vectoring_info;
8e533240 6421 u16 exit_handler_index;
ff651cb6 6422
55d2375e
SC
6423 /*
 6424	 * Flush the logged GPAs from the PML buffer so that dirty_bitmap is
 6425	 * kept up to date. As a bonus, kvm_vm_ioctl_get_dirty_log only needs
 6426	 * to kick all vCPUs out of guest mode before querying dirty_bitmap,
 6427	 * because a vCPU that is back in root mode must already have flushed
c3bb9a20
SC
 6428	 * its PML buffer. Note, PML is never enabled in hardware while
 6429	 * running L2.
55d2375e 6430 */
c3bb9a20 6431 if (enable_pml && !is_guest_mode(vcpu))
55d2375e 6432 vmx_flush_pml_buffer(vcpu);
1dc35dac 6433
db438592 6434 /*
cd0e615c
SC
6435 * KVM should never reach this point with a pending nested VM-Enter.
6436 * More specifically, short-circuiting VM-Entry to emulate L2 due to
6437 * invalid guest state should never happen as that means KVM knowingly
6438 * allowed a nested VM-Enter with an invalid vmcs12. More below.
db438592 6439 */
67369273
SC
6440 if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
6441 return -EIO;
db438592 6442
96b100cd 6443 if (is_guest_mode(vcpu)) {
c3bb9a20
SC
6444 /*
6445 * PML is never enabled when running L2, bail immediately if a
6446 * PML full exit occurs as something is horribly wrong.
6447 */
6448 if (exit_reason.basic == EXIT_REASON_PML_FULL)
6449 goto unexpected_vmexit;
6450
96b100cd
PB
6451 /*
6452 * The host physical addresses of some pages of guest memory
6453 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
6454 * Page). The CPU may write to these pages via their host
6455 * physical address while L2 is running, bypassing any
6456 * address-translation-based dirty tracking (e.g. EPT write
6457 * protection).
6458 *
6459 * Mark them dirty on every exit from L2 to prevent them from
6460 * getting out of sync with dirty tracking.
6461 */
6462 nested_mark_vmcs12_pages_dirty(vcpu);
6463
cd0e615c
SC
6464 /*
6465 * Synthesize a triple fault if L2 state is invalid. In normal
6466 * operation, nested VM-Enter rejects any attempt to enter L2
6467 * with invalid state. However, those checks are skipped if
6468 * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If
6469 * L2 state is invalid, it means either L1 modified SMRAM state
6470 * or userspace provided bad state. Synthesize TRIPLE_FAULT as
6471 * doing so is architecturally allowed in the RSM case, and is
6472 * the least awful solution for the userspace case without
6473 * risking false positives.
6474 */
6475 if (vmx->emulation_required) {
6476 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
6477 return 1;
6478 }
6479
f47baaed 6480 if (nested_vmx_reflect_vmexit(vcpu))
789afc5c 6481 return 1;
96b100cd 6482 }
9ed38ffa 6483
cd0e615c
SC
6484 /* If guest state is invalid, start emulating. L2 is handled above. */
6485 if (vmx->emulation_required)
6486 return handle_invalid_guest_state(vcpu);
6487
8e533240 6488 if (exit_reason.failed_vmentry) {
0702a3cb 6489 dump_vmcs(vcpu);
55d2375e
SC
6490 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6491 vcpu->run->fail_entry.hardware_entry_failure_reason
8e533240 6492 = exit_reason.full;
8a14fe4f 6493 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
55d2375e 6494 return 0;
9ed38ffa
LP
6495 }
6496
55d2375e 6497 if (unlikely(vmx->fail)) {
0702a3cb 6498 dump_vmcs(vcpu);
55d2375e
SC
6499 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6500 vcpu->run->fail_entry.hardware_entry_failure_reason
6501 = vmcs_read32(VM_INSTRUCTION_ERROR);
8a14fe4f 6502 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
55d2375e
SC
6503 return 0;
6504 }
50c28f21 6505
55d2375e
SC
6506 /*
6507 * Note:
 6508	 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by
 6509	 * event delivery, since that indicates the guest is accessing MMIO.
 6510	 * The VM-exit would simply be triggered again after returning to the
 6511	 * guest, causing an infinite loop.
6512 */
6513 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
8e533240
SC
6514 (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
6515 exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
6516 exit_reason.basic != EXIT_REASON_PML_FULL &&
6517 exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
2f4073e0
TX
6518 exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
6519 exit_reason.basic != EXIT_REASON_NOTIFY)) {
04c4f2ee
RW
6520 int ndata = 3;
6521
55d2375e
SC
6522 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6523 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
55d2375e 6524 vcpu->run->internal.data[0] = vectoring_info;
8e533240 6525 vcpu->run->internal.data[1] = exit_reason.full;
55d2375e 6526 vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
8e533240 6527 if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
04c4f2ee 6528 vcpu->run->internal.data[ndata++] =
55d2375e
SC
6529 vmcs_read64(GUEST_PHYSICAL_ADDRESS);
6530 }
04c4f2ee
RW
6531 vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
6532 vcpu->run->internal.ndata = ndata;
55d2375e
SC
6533 return 0;
6534 }
50c28f21 6535
55d2375e
SC
6536 if (unlikely(!enable_vnmi &&
6537 vmx->loaded_vmcs->soft_vnmi_blocked)) {
db438592 6538 if (!vmx_interrupt_blocked(vcpu)) {
55d2375e
SC
6539 vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6540 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
6541 vcpu->arch.nmi_pending) {
6542 /*
 6543	 * This CPU doesn't help us find the end of an
 6544	 * NMI-blocked window if the guest runs with IRQs
 6545	 * disabled. So we pull the trigger after 1 s of
 6546	 * futile waiting, but inform the user about it.
6547 */
6548 printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
6549 "state on VCPU %d after 1 s timeout\n",
6550 __func__, vcpu->vcpu_id);
6551 vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6552 }
6553 }
50c28f21 6554
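	/*
	 * A non-none exit_fastpath means the exit was already handled by
	 * vmx_exit_handlers_fastpath(); simply resume the guest.
	 */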
404d5d7b 6555 if (exit_fastpath != EXIT_FASTPATH_NONE)
1e9e2622 6556 return 1;
c926f2f7 6557
8e533240 6558 if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
c926f2f7 6559 goto unexpected_vmexit;
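	/*
	 * With retpolines enabled, the indirect call through
	 * kvm_vmx_exit_handlers is relatively expensive, so the most common
	 * exit reasons are dispatched directly.
	 */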
4289d272 6560#ifdef CONFIG_RETPOLINE
8e533240 6561 if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
c926f2f7 6562 return kvm_emulate_wrmsr(vcpu);
8e533240 6563 else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
c926f2f7 6564 return handle_preemption_timer(vcpu);
8e533240 6565 else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
c926f2f7 6566 return handle_interrupt_window(vcpu);
8e533240 6567 else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
c926f2f7 6568 return handle_external_interrupt(vcpu);
8e533240 6569 else if (exit_reason.basic == EXIT_REASON_HLT)
c926f2f7 6570 return kvm_emulate_halt(vcpu);
8e533240 6571 else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
c926f2f7 6572 return handle_ept_misconfig(vcpu);
4289d272 6573#endif
c926f2f7 6574
8e533240
SC
6575 exit_handler_index = array_index_nospec((u16)exit_reason.basic,
6576 kvm_vmx_max_exit_handlers);
6577 if (!kvm_vmx_exit_handlers[exit_handler_index])
c926f2f7
MP
6578 goto unexpected_vmexit;
6579
8e533240 6580 return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
c926f2f7
MP
6581
6582unexpected_vmexit:
8e533240
SC
6583 vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
6584 exit_reason.full);
0702a3cb 6585 dump_vmcs(vcpu);
c926f2f7
MP
6586 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6587 vcpu->run->internal.suberror =
7396d337 6588 KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
1aa561b1 6589 vcpu->run->internal.ndata = 2;
8e533240 6590 vcpu->run->internal.data[0] = exit_reason.full;
8a14fe4f 6591 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
c926f2f7 6592 return 0;
9ed38ffa
LP
6593}
6594
fe6b6bc8
CQ
6595static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
6596{
6597 int ret = __vmx_handle_exit(vcpu, exit_fastpath);
6598
6599 /*
d61863c6
HX
 6600	 * Exit to user space when a bus lock is detected, to inform userspace
 6601	 * of the bus lock in the guest.
fe6b6bc8
CQ
6602 */
6603 if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
6604 if (ret > 0)
6605 vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
6606
6607 vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
6608 return 0;
6609 }
6610 return ret;
6611}
6612
efebf0aa 6613/*
55d2375e
SC
 6614 * Software-based L1D cache flush, used when microcode providing the
 6615 * cache control MSR is not loaded.
efebf0aa 6616 *
55d2375e
SC
 6617 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but
 6618 * flushing it requires reading 64 KiB because the replacement algorithm
 6619 * is not exactly LRU. This could be sized at runtime via topology
 6620 * information, but as all relevant affected CPUs have a 32 KiB L1D cache
 6621 * there is no point in doing so.
efebf0aa 6622 */
3ebccdf3 6623static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
fe3ef05c 6624{
55d2375e 6625 int size = PAGE_SIZE << L1D_CACHE_ORDER;
25a2e4fe
PB
6626
6627 /*
f7081834 6628 * This code is only executed when the flush mode is 'cond' or
55d2375e 6629 * 'always'
25a2e4fe 6630 */
55d2375e
SC
6631 if (static_branch_likely(&vmx_l1d_flush_cond)) {
6632 bool flush_l1d;
25a2e4fe 6633
55d2375e
SC
6634 /*
6635 * Clear the per-vcpu flush bit, it gets set again
6636 * either from vcpu_run() or from one of the unsafe
6637 * VMEXIT handlers.
6638 */
6639 flush_l1d = vcpu->arch.l1tf_flush_l1d;
6640 vcpu->arch.l1tf_flush_l1d = false;
25a2e4fe 6641
55d2375e
SC
6642 /*
6643 * Clear the per-cpu flush bit, it gets set again from
6644 * the interrupt handlers.
6645 */
6646 flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
6647 kvm_clear_cpu_l1tf_flush_l1d();
25a2e4fe 6648
55d2375e
SC
6649 if (!flush_l1d)
6650 return;
6651 }
09abe320 6652
55d2375e 6653 vcpu->stat.l1d_flush++;
25a2e4fe 6654
55d2375e 6655 if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
3ebccdf3 6656 native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
55d2375e
SC
6657 return;
6658 }
25a2e4fe 6659
55d2375e
SC
6660 asm volatile(
6661 /* First ensure the pages are in the TLB */
6662 "xorl %%eax, %%eax\n"
6663 ".Lpopulate_tlb:\n\t"
6664 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6665 "addl $4096, %%eax\n\t"
6666 "cmpl %%eax, %[size]\n\t"
6667 "jne .Lpopulate_tlb\n\t"
6668 "xorl %%eax, %%eax\n\t"
6669 "cpuid\n\t"
6670 /* Now fill the cache */
6671 "xorl %%eax, %%eax\n"
6672 ".Lfill_cache:\n"
6673 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6674 "addl $64, %%eax\n\t"
6675 "cmpl %%eax, %[size]\n\t"
6676 "jne .Lfill_cache\n\t"
6677 "lfence\n"
6678 :: [flush_pages] "r" (vmx_l1d_flush_pages),
6679 [size] "r" (size)
6680 : "eax", "ebx", "ecx", "edx");
09abe320 6681}
25a2e4fe 6682
b6a7cc35 6683static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
09abe320 6684{
55d2375e 6685 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
132f4f7e 6686 int tpr_threshold;
09abe320 6687
55d2375e
SC
6688 if (is_guest_mode(vcpu) &&
6689 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
6690 return;
25a2e4fe 6691
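	/*
	 * A threshold of zero disables TPR-below-threshold exits, either
	 * because no interrupt is pending (irr == -1) or because the highest
	 * pending interrupt already outranks the TPR.  Otherwise, arm the
	 * threshold at the pending priority so that lowering the TPR below it
	 * triggers an exit and the interrupt can be delivered.
	 */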
132f4f7e 6692 tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
02d496cf
LA
6693 if (is_guest_mode(vcpu))
6694 to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
6695 else
6696 vmcs_write32(TPR_THRESHOLD, tpr_threshold);
8665c3f9
PB
6697}
6698
55d2375e 6699void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
8665c3f9 6700{
fe7f895d 6701 struct vcpu_vmx *vmx = to_vmx(vcpu);
55d2375e 6702 u32 sec_exec_control;
8665c3f9 6703
55d2375e
SC
6704 if (!lapic_in_kernel(vcpu))
6705 return;
9314006d 6706
55d2375e
SC
6707 if (!flexpriority_enabled &&
6708 !cpu_has_vmx_virtualize_x2apic_mode())
6709 return;
705699a1 6710
55d2375e
SC
6711 /* Postpone execution until vmcs01 is the current VMCS. */
6712 if (is_guest_mode(vcpu)) {
fe7f895d 6713 vmx->nested.change_vmcs01_virtual_apic_mode = true;
55d2375e 6714 return;
6beb7bd5 6715 }
fe3ef05c 6716
fe7f895d 6717 sec_exec_control = secondary_exec_controls_get(vmx);
55d2375e
SC
6718 sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
6719 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
09abe320 6720
55d2375e
SC
6721 switch (kvm_get_apic_mode(vcpu)) {
6722 case LAPIC_MODE_INVALID:
6723 WARN_ONCE(true, "Invalid local APIC state");
551912d2 6724 break;
55d2375e
SC
6725 case LAPIC_MODE_DISABLED:
6726 break;
6727 case LAPIC_MODE_XAPIC:
6728 if (flexpriority_enabled) {
6729 sec_exec_control |=
6730 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
4de1f9d4
SC
6731 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
6732
6733 /*
6734 * Flush the TLB, reloading the APIC access page will
6735 * only do so if its physical address has changed, but
6736 * the guest may have inserted a non-APIC mapping into
6737 * the TLB while the APIC access page was disabled.
6738 */
6739 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
55d2375e
SC
6740 }
6741 break;
6742 case LAPIC_MODE_X2APIC:
6743 if (cpu_has_vmx_virtualize_x2apic_mode())
6744 sec_exec_control |=
6745 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
6746 break;
09abe320 6747 }
fe7f895d 6748 secondary_exec_controls_set(vmx, sec_exec_control);
09abe320 6749
84ec8d2d 6750 vmx_update_msr_bitmap_x2apic(vcpu);
55d2375e 6751}
0238ea91 6752
a4148b7c 6753static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
55d2375e 6754{
a4148b7c
SC
6755 struct page *page;
6756
1196cb97
SC
6757 /* Defer reload until vmcs01 is the current VMCS. */
6758 if (is_guest_mode(vcpu)) {
6759 to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
6760 return;
55d2375e 6761 }
1196cb97 6762
4de1f9d4
SC
6763 if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
6764 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
6765 return;
6766
a4148b7c
SC
6767 page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
6768 if (is_error_page(page))
6769 return;
6770
6771 vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
1196cb97 6772 vmx_flush_tlb_current(vcpu);
a4148b7c
SC
6773
6774 /*
 6775	 * Do not pin the APIC access page in memory; the MMU notifier
 6776	 * will call us again if it is migrated or swapped out.
6777 */
6778 put_page(page);
55d2375e 6779}
fe3ef05c 6780
d39850f5 6781static void vmx_hwapic_isr_update(int max_isr)
55d2375e
SC
6782{
6783 u16 status;
6784 u8 old;
32c7acf0 6785
55d2375e
SC
6786 if (max_isr == -1)
6787 max_isr = 0;
608406e2 6788
55d2375e
SC
6789 status = vmcs_read16(GUEST_INTR_STATUS);
6790 old = status >> 8;
6791 if (max_isr != old) {
6792 status &= 0xff;
6793 status |= max_isr << 8;
6794 vmcs_write16(GUEST_INTR_STATUS, status);
6795 }
6796}
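/*
 * The 16-bit guest interrupt status field holds RVI (requesting virtual
 * interrupt) in bits 7:0 and SVI (servicing virtual interrupt) in bits 15:8;
 * vmx_hwapic_isr_update() above updates SVI, vmx_set_rvi() below updates RVI.
 */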
6beb7bd5 6797
55d2375e
SC
6798static void vmx_set_rvi(int vector)
6799{
6800 u16 status;
6801 u8 old;
0b665d30 6802
55d2375e
SC
6803 if (vector == -1)
6804 vector = 0;
fe3ef05c 6805
55d2375e
SC
6806 status = vmcs_read16(GUEST_INTR_STATUS);
6807 old = (u8)status & 0xff;
6808 if ((u8)vector != old) {
6809 status &= ~0xff;
6810 status |= (u8)vector;
6811 vmcs_write16(GUEST_INTR_STATUS, status);
09abe320 6812 }
55d2375e 6813}
09abe320 6814
55d2375e
SC
6815static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
6816{
09abe320 6817 /*
55d2375e
SC
 6818	 * When running L2, updating RVI is only relevant if
 6819	 * vmcs12 has virtual-interrupt-delivery enabled.
 6820	 * However, that can only be enabled when L1 also
 6821	 * intercepts external interrupts, in which case
 6822	 * we should not update vmcs02's RVI but instead intercept
 6823	 * the interrupt. Therefore, do nothing when running L2.
fe3ef05c 6824 */
55d2375e
SC
6825 if (!is_guest_mode(vcpu))
6826 vmx_set_rvi(max_irr);
6827}
fe3ef05c 6828
55d2375e
SC
6829static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
6830{
6831 struct vcpu_vmx *vmx = to_vmx(vcpu);
6832 int max_irr;
7e1901f6 6833 bool got_posted_interrupt;
a7c0b07d 6834
7e1901f6 6835 if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
67369273
SC
6836 return -EIO;
6837
55d2375e
SC
6838 if (pi_test_on(&vmx->pi_desc)) {
6839 pi_clear_on(&vmx->pi_desc);
6840 /*
d9ff2744 6841 * IOMMU can write to PID.ON, so the barrier matters even on UP.
55d2375e
SC
6842 * But on x86 this is just a compiler barrier anyway.
6843 */
6844 smp_mb__after_atomic();
7e1901f6 6845 got_posted_interrupt =
55d2375e 6846 kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
55d2375e
SC
6847 } else {
6848 max_irr = kvm_lapic_find_highest_irr(vcpu);
7e1901f6 6849 got_posted_interrupt = false;
a7c0b07d 6850 }
7e1901f6
PB
6851
6852 /*
6853 * Newly recognized interrupts are injected via either virtual interrupt
6854 * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is
6855 * disabled in two cases:
6856 *
6857 * 1) If L2 is running and the vCPU has a new pending interrupt. If L1
6858 * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
6859 * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected
6860 * into L2, but KVM doesn't use virtual interrupt delivery to inject
6861 * interrupts into L2, and so KVM_REQ_EVENT is again needed.
6862 *
6863 * 2) If APICv is disabled for this vCPU, assigned devices may still
6864 * attempt to post interrupts. The posted interrupt vector will cause
6865 * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
6866 */
6867 if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
6868 vmx_set_rvi(max_irr);
6869 else if (got_posted_interrupt)
6870 kvm_make_request(KVM_REQ_EVENT, vcpu);
6871
55d2375e
SC
6872 return max_irr;
6873}
a7c0b07d 6874
55d2375e
SC
6875static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
6876{
6877 if (!kvm_vcpu_apicv_active(vcpu))
6878 return;
25a2e4fe 6879
55d2375e
SC
6880 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
6881 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
6882 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
6883 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
8665c3f9
PB
6884}
6885
55d2375e 6886static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
8665c3f9
PB
6887{
6888 struct vcpu_vmx *vmx = to_vmx(vcpu);
9d1887ef 6889
55d2375e
SC
6890 pi_clear_on(&vmx->pi_desc);
6891 memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
6892}
8665c3f9 6893
4f76e86f
SC
6894void vmx_do_interrupt_irqoff(unsigned long entry);
6895void vmx_do_nmi_irqoff(void);
1a5488ef 6896
ec5be88a
JL
6897static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
6898{
6899 /*
 6900	 * Save xfd_err to guest_fpu before interrupts are enabled, so the
 6901	 * MSR value is not clobbered by host activity before the guest
 6902	 * has a chance to consume it.
6903 *
6904 * Do not blindly read xfd_err here, since this exception might
6905 * be caused by L1 interception on a platform which doesn't
6906 * support xfd at all.
6907 *
6908 * Do it conditionally upon guest_fpu::xfd. xfd_err matters
6909 * only when xfd contains a non-zero value.
6910 *
6911 * Queuing exception is done in vmx_handle_exit. See comment there.
6912 */
6913 if (vcpu->arch.guest_fpu.fpstate->xfd)
6914 rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
6915}
6916
11df586d 6917static void handle_exception_irqoff(struct vcpu_vmx *vmx)
55d2375e 6918{
87915858 6919 u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
fe3ef05c 6920
55d2375e 6921 /* if exit due to PF check for async PF */
1a5488ef 6922 if (is_page_fault(intr_info))
68fd66f1 6923 vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
ec5be88a
JL
6924 /* if exit due to NM, handle before interrupts are enabled */
6925 else if (is_nm_fault(intr_info))
6926 handle_nm_fault_irqoff(&vmx->vcpu);
55d2375e 6927 /* Handle machine checks before interrupts are enabled */
1a5488ef 6928 else if (is_machine_check(intr_info))
55d2375e 6929 kvm_machine_check();
55d2375e 6930}
fe3ef05c 6931
95b5a48c 6932static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
55d2375e 6933{
87915858 6934 u32 intr_info = vmx_get_intr_info(vcpu);
a217a659
LJ
6935 unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
6936 gate_desc *desc = (gate_desc *)host_idt_base + vector;
fe3ef05c 6937
67369273 6938 if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
8d20bd63 6939 "unexpected VM-Exit interrupt info: 0x%x", intr_info))
49def500
SC
6940 return;
6941
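	/*
	 * The interrupt's vector was provided by the VM-exit (the CPU
	 * acknowledged the interrupt on exit instead of delivering it through
	 * the host IDT), so invoke the host's handler for that vector
	 * manually via its IDT gate.
	 */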
4f76e86f
SC
6942 kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
6943 vmx_do_interrupt_irqoff(gate_offset(desc));
6944 kvm_after_interrupt(vcpu);
6945
6cd88243 6946 vcpu->arch.at_instruction_boundary = true;
55d2375e 6947}
95b5a48c 6948
a9ab13ff 6949static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
95b5a48c
SC
6950{
6951 struct vcpu_vmx *vmx = to_vmx(vcpu);
6952
81b4b56d
ML
6953 if (vmx->emulation_required)
6954 return;
6955
8e533240 6956 if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
95b5a48c 6957 handle_external_interrupt_irqoff(vcpu);
8e533240 6958 else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
11df586d 6959 handle_exception_irqoff(vmx);
95b5a48c 6960}
5a6a9748 6961
5719455f
TL
6962/*
6963 * The kvm parameter can be NULL (module initialization, or invocation before
6964 * VM creation). Be sure to check the kvm parameter before using it.
6965 */
6966static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
55d2375e
SC
6967{
6968 switch (index) {
6969 case MSR_IA32_SMBASE:
4b8e1b32
PB
6970 if (!IS_ENABLED(CONFIG_KVM_SMM))
6971 return false;
55d2375e
SC
6972 /*
6973 * We cannot do SMM unless we can run the guest in big
6974 * real mode.
6975 */
6976 return enable_unrestricted_guest || emulate_invalid_guest_state;
95c5c7c7
PB
6977 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
6978 return nested;
55d2375e 6979 case MSR_AMD64_VIRT_SPEC_CTRL:
5228eb96 6980 case MSR_AMD64_TSC_RATIO:
55d2375e
SC
6981 /* This is AMD only. */
6982 return false;
6983 default:
6984 return true;
3184a995 6985 }
55d2375e 6986}
2bb8cafe 6987
55d2375e
SC
6988static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
6989{
6990 u32 exit_intr_info;
6991 bool unblock_nmi;
6992 u8 vector;
6993 bool idtv_info_valid;
7ca29de2 6994
55d2375e 6995 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
feaf0c7d 6996
55d2375e
SC
6997 if (enable_vnmi) {
6998 if (vmx->loaded_vmcs->nmi_known_unmasked)
6999 return;
87915858
SC
7000
7001 exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
55d2375e
SC
7002 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
7003 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
7004 /*
7005 * SDM 3: 27.7.1.2 (September 2008)
7006 * Re-set bit "block by NMI" before VM entry if vmexit caused by
7007 * a guest IRET fault.
7008 * SDM 3: 23.2.2 (September 2008)
7009 * Bit 12 is undefined in any of the following cases:
7010 * If the VM exit sets the valid bit in the IDT-vectoring
7011 * information field.
7012 * If the VM exit is due to a double fault.
7013 */
7014 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
7015 vector != DF_VECTOR && !idtv_info_valid)
7016 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
7017 GUEST_INTR_STATE_NMI);
7018 else
7019 vmx->loaded_vmcs->nmi_known_unmasked =
7020 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
7021 & GUEST_INTR_STATE_NMI);
7022 } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
7023 vmx->loaded_vmcs->vnmi_blocked_time +=
7024 ktime_to_ns(ktime_sub(ktime_get(),
7025 vmx->loaded_vmcs->entry_time));
fe3ef05c
NHE
7026}
7027
55d2375e
SC
7028static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
7029 u32 idt_vectoring_info,
7030 int instr_len_field,
7031 int error_code_field)
0c7f650e 7032{
55d2375e
SC
7033 u8 vector;
7034 int type;
7035 bool idtv_info_valid;
0c7f650e 7036
55d2375e 7037 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
0c7f650e 7038
55d2375e
SC
7039 vcpu->arch.nmi_injected = false;
7040 kvm_clear_exception_queue(vcpu);
7041 kvm_clear_interrupt_queue(vcpu);
27c42a1b 7042
55d2375e
SC
7043 if (!idtv_info_valid)
7044 return;
c7c2c709 7045
55d2375e 7046 kvm_make_request(KVM_REQ_EVENT, vcpu);
ca0bde28 7047
55d2375e
SC
7048 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
7049 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
64a919f7 7050
55d2375e
SC
7051 switch (type) {
7052 case INTR_TYPE_NMI_INTR:
7053 vcpu->arch.nmi_injected = true;
7054 /*
7055 * SDM 3: 27.7.1.2 (September 2008)
7056 * Clear bit "block by NMI" before VM entry if a NMI
7057 * delivery faulted.
7058 */
7059 vmx_set_nmi_mask(vcpu, false);
7060 break;
7061 case INTR_TYPE_SOFT_EXCEPTION:
7062 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
df561f66 7063 fallthrough;
55d2375e
SC
7064 case INTR_TYPE_HARD_EXCEPTION:
7065 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
7066 u32 err = vmcs_read32(error_code_field);
7067 kvm_requeue_exception_e(vcpu, vector, err);
7068 } else
7069 kvm_requeue_exception(vcpu, vector);
7070 break;
7071 case INTR_TYPE_SOFT_INTR:
7072 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
df561f66 7073 fallthrough;
55d2375e
SC
7074 case INTR_TYPE_EXT_INTR:
7075 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
7076 break;
7077 default:
7078 break;
0447378a 7079 }
ca0bde28
JM
7080}
7081
55d2375e 7082static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
f145d90d 7083{
55d2375e
SC
7084 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
7085 VM_EXIT_INSTRUCTION_LEN,
7086 IDT_VECTORING_ERROR_CODE);
f145d90d
LA
7087}
7088
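/*
 * Cancelling injection reads back the VM-entry fields (what KVM was about to
 * inject) rather than the IDT-vectoring fields, then clears the VM-entry
 * interruption-information field so nothing is injected on the next VM-entry.
 */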
55d2375e 7089static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
ca0bde28 7090{
55d2375e
SC
7091 __vmx_complete_interrupts(vcpu,
7092 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
7093 VM_ENTRY_INSTRUCTION_LEN,
7094 VM_ENTRY_EXCEPTION_ERROR_CODE);
f1b026a3 7095
55d2375e 7096 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
ca0bde28
JM
7097}
7098
55d2375e 7099static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
52017608 7100{
55d2375e
SC
7101 int i, nr_msrs;
7102 struct perf_guest_switch_msr *msrs;
39a4d779 7103 struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu);
7c177938 7104
85425032
LX
7105 pmu->host_cross_mapped_mask = 0;
7106 if (pmu->pebs_enable & pmu->global_ctrl)
7107 intel_pmu_cross_mapped_check(pmu);
7c177938 7108
c8e2fe13 7109 /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */
39a4d779 7110 msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu);
55d2375e
SC
7111 if (!msrs)
7112 return;
f1b026a3 7113
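	/*
	 * MSRs whose guest and host values match need no switching and are
	 * dropped from the atomic switch lists; the rest are (re)added with
	 * their current guest/host values.
	 */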
55d2375e
SC
7114 for (i = 0; i < nr_msrs; i++)
7115 if (msrs[i].host == msrs[i].guest)
7116 clear_atomic_switch_msr(vmx, msrs[i].msr);
7117 else
7118 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
7119 msrs[i].host, false);
ca0bde28 7120}
52017608 7121
55d2375e 7122static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
858e25c0
JM
7123{
7124 struct vcpu_vmx *vmx = to_vmx(vcpu);
55d2375e
SC
7125 u64 tscl;
7126 u32 delta_tsc;
52017608 7127
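	/*
	 * A VMX_PREEMPTION_TIMER_VALUE of zero forces an exit immediately
	 * after VM-entry.  When no deadline is armed, the timer is left
	 * counting down from the maximum value and flagged as soft disabled
	 * so it isn't rewritten on every entry.
	 */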
55d2375e 7128 if (vmx->req_immediate_exit) {
804939ea
SC
7129 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
7130 vmx->loaded_vmcs->hv_timer_soft_disabled = false;
7131 } else if (vmx->hv_deadline_tsc != -1) {
55d2375e
SC
7132 tscl = rdtsc();
7133 if (vmx->hv_deadline_tsc > tscl)
7134 /* set_hv_timer ensures the delta fits in 32-bits */
7135 delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
7136 cpu_preemption_timer_multi);
7137 else
7138 delta_tsc = 0;
858e25c0 7139
804939ea
SC
7140 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
7141 vmx->loaded_vmcs->hv_timer_soft_disabled = false;
7142 } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
7143 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
7144 vmx->loaded_vmcs->hv_timer_soft_disabled = true;
7f7f1ba3 7145 }
858e25c0
JM
7146}
7147
3ebccdf3 7148void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
ca0bde28 7149{
c09b03eb
SC
7150 if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
7151 vmx->loaded_vmcs->host_state.rsp = host_rsp;
7152 vmcs_writel(HOST_RSP, host_rsp);
7153 }
5ad6ece8 7154}
5f3d5799 7155
fc02735b
JP
7156void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
7157 unsigned int flags)
7158{
7159 u64 hostval = this_cpu_read(x86_spec_ctrl_current);
7160
7161 if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
7162 return;
7163
7164 if (flags & VMX_RUN_SAVE_SPEC_CTRL)
7165 vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL);
7166
7167 /*
7168 * If the guest/host SPEC_CTRL values differ, restore the host value.
bea7e31a
JP
7169 *
7170 * For legacy IBRS, the IBRS bit always needs to be written after
7171 * transitioning from a less privileged predictor mode, regardless of
7172 * whether the guest/host values differ.
fc02735b 7173 */
bea7e31a
JP
7174 if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
7175 vmx->spec_ctrl != hostval)
fc02735b
JP
7176 native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval);
7177
7178 barrier_nospec();
7179}
7180
404d5d7b 7181static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
dcf068da 7182{
8e533240 7183 switch (to_vmx(vcpu)->exit_reason.basic) {
dcf068da
WL
7184 case EXIT_REASON_MSR_WRITE:
7185 return handle_fastpath_set_msr_irqoff(vcpu);
26efe2fd
WL
7186 case EXIT_REASON_PREEMPTION_TIMER:
7187 return handle_fastpath_preemption_timer(vcpu);
dcf068da
WL
7188 default:
7189 return EXIT_FASTPATH_NONE;
7190 }
7191}
7192
3ebccdf3 7193static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
e8733482 7194 unsigned int flags)
3ebccdf3 7195{
432727f1
SC
7196 struct vcpu_vmx *vmx = to_vmx(vcpu);
7197
b2d2af7e 7198 guest_state_enter_irqoff();
3ebccdf3
TG
7199
7200 /* L1D Flush includes CPU buffer clear to mitigate MDS */
7201 if (static_branch_unlikely(&vmx_l1d_should_flush))
7202 vmx_l1d_flush(vcpu);
7203 else if (static_branch_unlikely(&mds_user_clear))
7204 mds_clear_cpu_buffers();
8cb861e9
PG
7205 else if (static_branch_unlikely(&mmio_stale_data_clear) &&
7206 kvm_arch_has_assigned_device(vcpu->kvm))
7207 mds_clear_cpu_buffers();
3ebccdf3 7208
027bbb88 7209 vmx_disable_fb_clear(vmx);
3ebccdf3 7210
2245d398
TG
7211 if (vcpu->arch.cr2 != native_read_cr2())
7212 native_write_cr2(vcpu->arch.cr2);
3ebccdf3
TG
7213
7214 vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
bb066506 7215 flags);
3ebccdf3 7216
2245d398 7217 vcpu->arch.cr2 = native_read_cr2();
3ebccdf3 7218
027bbb88
PG
7219 vmx_enable_fb_clear(vmx);
7220
11df586d
SC
7221 if (unlikely(vmx->fail))
7222 vmx->exit_reason.full = 0xdead;
7223 else
7224 vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
7225
7226 if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
7227 is_nmi(vmx_get_intr_info(vcpu))) {
7228 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
7229 vmx_do_nmi_irqoff();
7230 kvm_after_interrupt(vcpu);
7231 }
7232
b2d2af7e 7233 guest_state_exit_irqoff();
3ebccdf3
TG
7234}
7235
404d5d7b 7236static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
5ad6ece8
SC
7237{
7238 struct vcpu_vmx *vmx = to_vmx(vcpu);
1a715810 7239 unsigned long cr3, cr4;
5ad6ece8
SC
7240
7241 /* Record the guest's net vcpu time for enforced NMI injections. */
7242 if (unlikely(!enable_vnmi &&
7243 vmx->loaded_vmcs->soft_vnmi_blocked))
7244 vmx->loaded_vmcs->entry_time = ktime_get();
7245
c42dec14
ML
7246 /*
7247 * Don't enter VMX if guest state is invalid, let the exit handler
7248 * start emulation until we arrive back to a valid state. Synthesize a
7249 * consistency check VM-Exit due to invalid guest state and bail.
7250 */
7251 if (unlikely(vmx->emulation_required)) {
a80dfc02 7252 vmx->fail = 0;
c8607e4a 7253
c42dec14
ML
7254 vmx->exit_reason.full = EXIT_REASON_INVALID_STATE;
7255 vmx->exit_reason.failed_vmentry = 1;
7256 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
7257 vmx->exit_qualification = ENTRY_FAIL_DEFAULT;
7258 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
7259 vmx->exit_intr_info = 0;
a9ab13ff 7260 return EXIT_FASTPATH_NONE;
c42dec14 7261 }
5ad6ece8 7262
d95df951
LB
7263 trace_kvm_entry(vcpu);
7264
5ad6ece8
SC
7265 if (vmx->ple_window_dirty) {
7266 vmx->ple_window_dirty = false;
7267 vmcs_write32(PLE_WINDOW, vmx->ple_window);
7268 }
7269
c9dfd3fb 7270 /*
7271 * We did this in prepare_switch_to_guest, because it needs to
7272 * be within srcu_read_lock.
7273 */
7274 WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);
5ad6ece8 7275
cb3c1e2f 7276 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
5ad6ece8 7277 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
cb3c1e2f 7278 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
5ad6ece8 7279 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
41e68b69 7280 vcpu->arch.regs_dirty = 0;
5ad6ece8 7281
1a715810
SC
7282 /*
7283 * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately
7284 * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
7285 * it switches back to the current->mm, which can occur in KVM context
7286 * when switching to a temporary mm to patch kernel code, e.g. if KVM
7287 * toggles a static key while handling a VM-Exit.
7288 */
7289 cr3 = __get_current_cr3_fast();
7290 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
7291 vmcs_writel(HOST_CR3, cr3);
7292 vmx->loaded_vmcs->host_state.cr3 = cr3;
7293 }
7294
5ad6ece8
SC
7295 cr4 = cr4_read_shadow();
7296 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
7297 vmcs_writel(HOST_CR4, cr4);
7298 vmx->loaded_vmcs->host_state.cr4 = cr4;
7299 }
7300
375e28ff
PB
7301 /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
7302 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
7303 set_debugreg(vcpu->arch.dr6, 6);
7304
5ad6ece8
SC
7305 /* When single-stepping over STI and MOV SS, we must clear the
7306 * corresponding interruptibility bits in the guest state. Otherwise
7307 * vmentry fails as it then expects bit 14 (BS) in pending debug
7308 * exceptions being set, but that's not correct for the guest debugging
7309 * case. */
7310 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
7311 vmx_set_interrupt_shadow(vcpu, 0);
7312
139a12cf 7313 kvm_load_guest_xsave_state(vcpu);
1811d979 7314
5ad6ece8
SC
7315 pt_guest_enter(vmx);
7316
49097762 7317 atomic_switch_perf_msrs(vmx);
1b5ac322
LX
7318 if (intel_pmu_lbr_is_enabled(vcpu))
7319 vmx_passthrough_lbr_msrs(vcpu);
5ad6ece8 7320
804939ea
SC
7321 if (enable_preemption_timer)
7322 vmx_update_hv_timer(vcpu);
5ad6ece8 7323
010fd37f 7324 kvm_wait_lapic_expire(vcpu);
b6c4bc65 7325
3ebccdf3 7326 /* The actual VMENTER/EXIT is in the .noinstr.text section. */
432727f1 7327 vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx));
b6b8a145 7328
55d2375e 7329 /* All fields are clean at this point */
19f10315 7330 if (kvm_is_using_evmcs()) {
55d2375e
SC
7331 current_evmcs->hv_clean_fields |=
7332 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
f4124500 7333
f2bc14b6 7334 current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
9ff5e030 7335 }
6f6a657c 7336
55d2375e
SC
7337 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
7338 if (vmx->host_debugctlmsr)
7339 update_debugctlmsr(vmx->host_debugctlmsr);
f4124500 7340
55d2375e
SC
7341#ifndef CONFIG_X86_64
7342 /*
7343 * The sysexit path does not restore ds/es, so we must set them to
7344 * a reasonable value ourselves.
7345 *
7346 * We can't defer this to vmx_prepare_switch_to_host() since that
7347 * function may be executed in interrupt context, which saves and
 7348	 * restores segments around it, nullifying its effect.
7349 */
7350 loadsegment(ds, __USER_DS);
7351 loadsegment(es, __USER_DS);
7352#endif
4704d0be 7353
41e68b69 7354 vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
7854cbca 7355
2ef444f1
CP
7356 pt_guest_exit(vmx);
7357
139a12cf 7358 kvm_load_host_xsave_state(vcpu);
1811d979 7359
b93af02c
KS
7360 if (is_guest_mode(vcpu)) {
7361 /*
7362 * Track VMLAUNCH/VMRESUME that have made past guest state
7363 * checking.
7364 */
7365 if (vmx->nested.nested_run_pending &&
7366 !vmx->exit_reason.failed_vmentry)
7367 ++vcpu->stat.nested_run;
7368
7369 vmx->nested.nested_run_pending = 0;
7370 }
7371
55d2375e 7372 vmx->idt_vectoring_info = 0;
119a9c01 7373
11df586d 7374 if (unlikely(vmx->fail))
a9ab13ff 7375 return EXIT_FASTPATH_NONE;
873e1da1 7376
8e533240 7377 if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
beb8d93b
SC
7378 kvm_machine_check();
7379
f5c59b57
ML
7380 if (likely(!vmx->exit_reason.failed_vmentry))
7381 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
7382
0a62a031 7383 trace_kvm_exit(vcpu, KVM_ISA_VMX);
dcf068da 7384
8e533240 7385 if (unlikely(vmx->exit_reason.failed_vmentry))
a9ab13ff
WL
7386 return EXIT_FASTPATH_NONE;
7387
55d2375e 7388 vmx->loaded_vmcs->launched = 1;
c18911a2 7389
55d2375e
SC
7390 vmx_recover_nmi_blocking(vmx);
7391 vmx_complete_interrupts(vmx);
a9ab13ff 7392
dcf068da
WL
7393 if (is_guest_mode(vcpu))
7394 return EXIT_FASTPATH_NONE;
7395
d89d04ab 7396 return vmx_exit_handlers_fastpath(vcpu);
55d2375e 7397}
2996fca0 7398
58fccda4 7399static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
cf8b84f4 7400{
55d2375e 7401 struct vcpu_vmx *vmx = to_vmx(vcpu);
4704d0be 7402
55d2375e
SC
7403 if (enable_pml)
7404 vmx_destroy_pml_buffer(vmx);
7405 free_vpid(vmx->vpid);
55d2375e
SC
7406 nested_vmx_free_vcpu(vcpu);
7407 free_loaded_vmcs(vmx->loaded_vmcs);
55d2375e 7408}
4704d0be 7409
58fccda4 7410static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
55d2375e 7411{
8ea8b8d6 7412 struct vmx_uret_msr *tsx_ctrl;
41836839 7413 struct vcpu_vmx *vmx;
06692e4b 7414 int i, err;
4704d0be 7415
a9dd6f09
SC
7416 BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
7417 vmx = to_vmx(vcpu);
d9a710e5 7418
12a8eee5
SC
7419 INIT_LIST_HEAD(&vmx->pi_wakeup_list);
7420
55d2375e 7421 err = -ENOMEM;
b666a4b6 7422
55d2375e 7423 vmx->vpid = allocate_vpid();
7cdc2d62 7424
5f3d5799 7425 /*
55d2375e
SC
 7426	 * If PML is turned on, failure to enable PML simply fails vCPU
 7427	 * creation, which lets us keep the PML logic simple by avoiding
 7428	 * corner cases such as PML being enabled on only some of the
67b0ae43 7429	 * guest's vCPUs.
5f3d5799 7430 */
55d2375e 7431 if (enable_pml) {
41836839 7432 vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
55d2375e 7433 if (!vmx->pml_pg)
987b2594 7434 goto free_vpid;
55d2375e 7435 }
4704d0be 7436
d0656735 7437 for (i = 0; i < kvm_nr_uret_msrs; ++i)
8ea8b8d6 7438 vmx->guest_uret_msrs[i].mask = -1ull;
5e17c624 7439 if (boot_cpu_has(X86_FEATURE_RTM)) {
8ea8b8d6
SC
7440 /*
7441 * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
7442 * Keep the host value unchanged to avoid changing CPUID bits
7443 * under the host kernel's feet.
8ea8b8d6 7444 */
5e17c624
SC
7445 tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
7446 if (tsx_ctrl)
5c49d185 7447 tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
4be53410
XL
7448 }
7449
55d2375e
SC
7450 err = alloc_loaded_vmcs(&vmx->vmcs01);
7451 if (err < 0)
7d73710d 7452 goto free_pml;
cb61de2f 7453
250552b9
VK
7454 /*
7455 * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
7456 * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
7457 * feature only for vmcs01, KVM currently isn't equipped to realize any
7458 * performance benefits from enabling it for vmcs02.
7459 */
19f10315 7460 if (kvm_is_using_evmcs() &&
250552b9
VK
7461 (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
7462 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
7463
7464 evmcs->hv_enlightenments_control.msr_bitmap = 1;
7465 }
7466
3eb90017
AG
7467 /* The MSR bitmap starts with all ones */
7468 bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
7469 bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
7470
476c9bd8 7471 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
dbdd096a 7472#ifdef CONFIG_X86_64
476c9bd8
AL
7473 vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
7474 vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
7475 vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
dbdd096a 7476#endif
476c9bd8
AL
7477 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
7478 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
7479 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
987b2594 7480 if (kvm_cstate_in_guest(vcpu->kvm)) {
476c9bd8
AL
7481 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
7482 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
7483 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
7484 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
b5170063 7485 }
4704d0be 7486
55d2375e 7487 vmx->loaded_vmcs = &vmx->vmcs01;
06692e4b 7488
34109c04 7489 if (cpu_need_virtualize_apic_accesses(vcpu)) {
c482f2ce 7490 err = kvm_alloc_apic_access_page(vcpu->kvm);
55d2375e
SC
7491 if (err)
7492 goto free_vmcs;
7493 }
7494
7495 if (enable_ept && !enable_unrestricted_guest) {
987b2594 7496 err = init_rmode_identity_map(vcpu->kvm);
55d2375e
SC
7497 if (err)
7498 goto free_vmcs;
7499 }
4704d0be 7500
d588bb9b
CG
7501 if (vmx_can_use_ipiv(vcpu))
7502 WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
7503 __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID);
7504
a9dd6f09 7505 return 0;
4704d0be 7506
55d2375e
SC
7507free_vmcs:
7508 free_loaded_vmcs(vmx->loaded_vmcs);
55d2375e
SC
7509free_pml:
7510 vmx_destroy_pml_buffer(vmx);
987b2594 7511free_vpid:
55d2375e 7512 free_vpid(vmx->vpid);
a9dd6f09 7513 return err;
55d2375e 7514}
36be0b9d 7515
65fd4cb6
TG
7516#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7517#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
21feb4eb 7518
55d2375e
SC
7519static int vmx_vm_init(struct kvm *kvm)
7520{
55d2375e
SC
7521 if (!ple_gap)
7522 kvm->arch.pause_in_guest = true;
3af18d9c 7523
55d2375e
SC
7524 if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
7525 switch (l1tf_mitigation) {
7526 case L1TF_MITIGATION_OFF:
7527 case L1TF_MITIGATION_FLUSH_NOWARN:
7528 /* 'I explicitly don't care' is set */
7529 break;
7530 case L1TF_MITIGATION_FLUSH:
7531 case L1TF_MITIGATION_FLUSH_NOSMT:
7532 case L1TF_MITIGATION_FULL:
7533 /*
7534 * Warn upon starting the first VM in a potentially
7535 * insecure environment.
7536 */
b284909a 7537 if (sched_smt_active())
55d2375e
SC
7538 pr_warn_once(L1TF_MSG_SMT);
7539 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
7540 pr_warn_once(L1TF_MSG_L1D);
7541 break;
7542 case L1TF_MITIGATION_FULL_FORCE:
7543 /* Flush is enforced */
7544 break;
7545 }
7546 }
7547 return 0;
4704d0be
NHE
7548}
7549
ba28401b 7550static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
bd18bffc 7551{
55d2375e 7552 u8 cache;
bd18bffc 7553
222f06e7
CW
7554 /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
7555 * memory aliases with conflicting memory types and sometimes MCEs.
 7556	 * We have to be careful as to which are honored and when.
7557 *
7558 * For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to
7559 * UC. The effective memory type is UC or WC depending on guest PAT.
7560 * This was historically the source of MCEs and we want to be
7561 * conservative.
7562 *
7563 * When there is no need to deal with noncoherent DMA (e.g., no VT-d
7564 * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The
7565 * EPT memory type is set to WB. The effective memory type is forced
7566 * WB.
7567 *
 7568	 * Otherwise, we trust the guest. Guest CD/MTRR/PAT are all honored. The
7569 * EPT memory type is used to emulate guest CD/MTRR.
bd18bffc 7570 */
222f06e7 7571
fb43496c
BG
7572 if (is_mmio)
7573 return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
bd18bffc 7574
fb43496c
BG
7575 if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
7576 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
bd18bffc 7577
55d2375e 7578 if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
55d2375e
SC
7579 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
7580 cache = MTRR_TYPE_WRBACK;
7581 else
7582 cache = MTRR_TYPE_UNCACHABLE;
bd18bffc 7583
fb43496c
BG
7584 return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
7585 }
bd18bffc 7586
fb43496c 7587 return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
55d2375e 7588}
bd18bffc 7589
b6247686 7590static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
55d2375e 7591{
bd18bffc 7592 /*
55d2375e
SC
7593 * These bits in the secondary execution controls field
7594 * are dynamic, the others are mostly based on the hypervisor
7595 * architecture and the guest's CPUID. Do not touch the
7596 * dynamic bits.
bd18bffc 7597 */
55d2375e
SC
7598 u32 mask =
7599 SECONDARY_EXEC_SHADOW_VMCS |
7600 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
7601 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
7602 SECONDARY_EXEC_DESC;
bd18bffc 7603
fe7f895d 7604 u32 cur_ctl = secondary_exec_controls_get(vmx);
bd18bffc 7605
fe7f895d 7606 secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
bd18bffc
SC
7607}
7608
4704d0be 7609/*
55d2375e
SC
7610 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
7611 * (indicating "allowed-1") if they are supported in the guest's CPUID.
4704d0be 7612 */
55d2375e 7613static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
4704d0be
NHE
7614{
7615 struct vcpu_vmx *vmx = to_vmx(vcpu);
55d2375e 7616 struct kvm_cpuid_entry2 *entry;
4704d0be 7617
55d2375e
SC
7618 vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
7619 vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
e79f245d 7620
55d2375e
SC
7621#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \
7622 if (entry && (entry->_reg & (_cpuid_mask))) \
7623 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \
7624} while (0)
ff651cb6 7625
277ad7d5 7626 entry = kvm_find_cpuid_entry(vcpu, 0x1);
87382003
SC
7627 cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME));
7628 cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME));
7629 cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC));
7630 cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE));
7631 cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE));
7632 cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE));
7633 cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE));
7634 cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE));
7635 cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR));
7636 cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
7637 cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX));
7638 cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX));
7639 cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID));
7640 cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE));
61ada748 7641
277ad7d5 7642 entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0);
87382003
SC
7643 cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE));
7644 cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP));
7645 cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP));
7646 cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU));
7647 cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP));
7648 cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57));
cf3215d9 7649
55d2375e
SC
7650#undef cr4_fixed1_update
7651}
36c3cc42 7652
6c0f0bba
LK
7653static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
7654{
7655 struct vcpu_vmx *vmx = to_vmx(vcpu);
7656 struct kvm_cpuid_entry2 *best = NULL;
7657 int i;
7658
7659 for (i = 0; i < PT_CPUID_LEAVES; i++) {
277ad7d5 7660 best = kvm_find_cpuid_entry_index(vcpu, 0x14, i);
6c0f0bba
LK
7661 if (!best)
7662 return;
7663 vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
7664 vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
7665 vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
7666 vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
7667 }
7668
7669 /* Get the number of configurable Address Ranges for filtering */
f4d3a902 7670 vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
6c0f0bba
LK
7671 PT_CAP_num_address_ranges);
7672
7673 /* Initialize and clear the no dependency bits */
7674 vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
e099f3eb
XL
7675 RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
7676 RTIT_CTL_BRANCH_EN);
6c0f0bba
LK
7677
7678 /*
 7679	 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1, CR3Filter can be set; otherwise
 7680	 * setting it will inject a #GP.
7681 */
7682 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
7683 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
7684
7685 /*
7686 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
7687 * PSBFreq can be set
7688 */
7689 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
7690 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
7691 RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
7692
7693 /*
e099f3eb 7694 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
6c0f0bba
LK
7695 */
7696 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
7697 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
e099f3eb 7698 RTIT_CTL_MTC_RANGE);
6c0f0bba
LK
7699
7700 /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
7701 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
7702 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
7703 RTIT_CTL_PTW_EN);
7704
7705 /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
7706 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
7707 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
7708
7709 /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
7710 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
7711 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
7712
d9f6e12f 7713 /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
6c0f0bba
LK
7714 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
7715 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
7716
7717 /* unmask address range configure area */
f4d3a902 7718 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
d14eff1b 7719 vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
6c0f0bba
LK
7720}
7721
7c1b761b 7722static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
55d2375e
SC
7723{
7724 struct vcpu_vmx *vmx = to_vmx(vcpu);
4704d0be 7725
7204160e
AL
7726 /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
7727 vcpu->arch.xsaves_enabled = false;
7728
432979b5
SC
7729 vmx_setup_uret_msrs(vmx);
7730
b6247686
SC
7731 if (cpu_has_secondary_exec_ctrls())
7732 vmcs_set_secondary_exec_control(vmx,
7733 vmx_secondary_exec_control(vmx));
4704d0be 7734
55d2375e 7735 if (nested_vmx_allowed(vcpu))
48ebd0cf 7736 vmx->msr_ia32_feature_control_valid_bits |=
32ad73db
SC
7737 FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7738 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
55d2375e 7739 else
48ebd0cf 7740 vmx->msr_ia32_feature_control_valid_bits &=
32ad73db
SC
7741 ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7742 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
4f350c6d 7743
8805875a 7744 if (nested_vmx_allowed(vcpu))
55d2375e 7745 nested_vmx_cr_fixed1_bits_update(vcpu);
6c0f0bba
LK
7746
7747 if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
7748 guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
7749 update_intel_pt_cfg(vcpu);
b07a5c53
PB
7750
7751 if (boot_cpu_has(X86_FEATURE_RTM)) {
eb3db1b1 7752 struct vmx_uret_msr *msr;
d85a8034 7753 msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
b07a5c53
PB
7754 if (msr) {
7755 bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
7bf662bb 7756 vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
b07a5c53
PB
7757 }
7758 }
a6337a35 7759
61f20813
JL
7760 if (kvm_cpu_cap_has(X86_FEATURE_XFD))
7761 vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
7762 !guest_cpuid_has(vcpu, X86_FEATURE_XFD));
7763
7764
2ed41aa6
SC
7765 set_cr4_guest_host_mask(vmx);
7766
72add915
SC
7767 vmx_write_encls_bitmap(vcpu, NULL);
7768 if (guest_cpuid_has(vcpu, X86_FEATURE_SGX))
7769 vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
7770 else
7771 vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
7772
7773 if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
7774 vmx->msr_ia32_feature_control_valid_bits |=
7775 FEAT_CTL_SGX_LC_ENABLED;
7776 else
7777 vmx->msr_ia32_feature_control_valid_bits &=
7778 ~FEAT_CTL_SGX_LC_ENABLED;
7779
a6337a35 7780 /* Refresh #PF interception to account for MAXPHYADDR changes. */
b6a7cc35 7781 vmx_update_exception_bitmap(vcpu);
55d2375e 7782}
09abb5e3 7783
bec46859
SC
7784static u64 vmx_get_perf_capabilities(void)
7785{
7786 u64 perf_cap = PMU_CAP_FW_WRITES;
7787 struct x86_pmu_lbr lbr;
7788 u64 host_perf_cap = 0;
7789
7790 if (!enable_pmu)
7791 return 0;
7792
7793 if (boot_cpu_has(X86_FEATURE_PDCM))
7794 rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
7795
098f4c06
SC
7796 if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) {
7797 x86_perf_get_lbr(&lbr);
7798 if (lbr.nr)
7799 perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
7800 }
bec46859
SC
7801
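	/*
	 * Expose only the PEBS capabilities present on the host, and hide
	 * PEBS Baseline unless the host reports PEBS record format 4 or
	 * later (i.e. adaptive PEBS records).
	 */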
7802 if (vmx_pebs_supported()) {
7803 perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
7804 if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4)
7805 perf_cap &= ~PERF_CAP_PEBS_BASELINE;
7806 }
7807
7808 return perf_cap;
7809}
7810
3ec6fd8c 7811static __init void vmx_set_cpu_caps(void)
55d2375e 7812{
3ec6fd8c
SC
7813 kvm_set_cpu_caps();
7814
7815 /* CPUID 0x1 */
7816 if (nested)
7817 kvm_cpu_cap_set(X86_FEATURE_VMX);
7818
7819 /* CPUID 0x7 */
8721f5b0
SC
7820 if (kvm_mpx_supported())
7821 kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
e4203334
SC
7822 if (!cpu_has_vmx_invpcid())
7823 kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
8721f5b0
SC
7824 if (vmx_pt_mode_is_host_guest())
7825 kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
cf8e55fe
LX
7826 if (vmx_pebs_supported()) {
7827 kvm_cpu_cap_check_and_set(X86_FEATURE_DS);
7828 kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64);
7829 }
3ec6fd8c 7830
6ef25aa0
LX
7831 if (!enable_pmu)
7832 kvm_cpu_cap_clear(X86_FEATURE_PDCM);
bec46859 7833 kvm_caps.supported_perf_cap = vmx_get_perf_capabilities();
3ec6fd8c 7834
72add915
SC
7835 if (!enable_sgx) {
7836 kvm_cpu_cap_clear(X86_FEATURE_SGX);
7837 kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
7838 kvm_cpu_cap_clear(X86_FEATURE_SGX1);
7839 kvm_cpu_cap_clear(X86_FEATURE_SGX2);
7840 }
7841
90d2f60f
SC
7842 if (vmx_umip_emulated())
7843 kvm_cpu_cap_set(X86_FEATURE_UMIP);
7844
b3d895d5 7845 /* CPUID 0xD.1 */
938c8745 7846 kvm_caps.supported_xss = 0;
becdad85 7847 if (!cpu_has_vmx_xsaves())
b3d895d5
SC
7848 kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
7849
8aec21c0
SC
7850 /* CPUID 0x80000001 and 0x7 (RDPID) */
7851 if (!cpu_has_vmx_rdtscp()) {
3ec6fd8c 7852 kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
8aec21c0
SC
7853 kvm_cpu_cap_clear(X86_FEATURE_RDPID);
7854 }
0abcc8f6 7855
becdad85 7856 if (cpu_has_vmx_waitpkg())
0abcc8f6 7857 kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
4704d0be
NHE
7858}
7859
55d2375e 7860static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
42124925 7861{
55d2375e 7862 to_vmx(vcpu)->req_immediate_exit = true;
7c177938
NHE
7863}
7864
35a57134
OU
7865static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
7866 struct x86_instruction_info *info)
7867{
7868 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7869 unsigned short port;
7870 bool intercept;
7871 int size;
7872
7873 if (info->intercept == x86_intercept_in ||
7874 info->intercept == x86_intercept_ins) {
7875 port = info->src_val;
7876 size = info->dst_bytes;
7877 } else {
7878 port = info->dst_val;
7879 size = info->src_bytes;
7880 }
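	/*
	 * E.g. "inb $0x71, %al" arrives here with src_val = 0x71 and
	 * dst_bytes = 1, i.e. port 0x71, size 1.
	 */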
7881
7882 /*
7883 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
7884 * VM-exits depend on the 'unconditional IO exiting' VM-execution
7885 * control.
7886 *
7887 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
7888 */
7889 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
7890 intercept = nested_cpu_has(vmcs12,
7891 CPU_BASED_UNCOND_IO_EXITING);
7892 else
7893 intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
7894
86f7e90c 7895 /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
35a57134
OU
7896 return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
7897}
7898
8a76d7f2
JR
7899static int vmx_check_intercept(struct kvm_vcpu *vcpu,
7900 struct x86_instruction_info *info,
21f1b8f2
SC
7901 enum x86_intercept_stage stage,
7902 struct x86_exception *exception)
8a76d7f2 7903{
fb6d4d34 7904 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
fb6d4d34 7905
35a57134 7906 switch (info->intercept) {
fb6d4d34
PB
7907 /*
7908 * RDPID causes #UD if disabled through secondary execution controls.
7909 * Because it is marked as EmulateOnUD, we need to intercept it here.
2183de41 7910 * Note, RDPID is hidden behind ENABLE_RDTSCP.
fb6d4d34 7911 */
2183de41 7912 case x86_intercept_rdpid:
7f3603b6 7913 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
21f1b8f2
SC
7914 exception->vector = UD_VECTOR;
7915 exception->error_code_valid = false;
35a57134
OU
7916 return X86EMUL_PROPAGATE_FAULT;
7917 }
7918 break;
7919
7920 case x86_intercept_in:
7921 case x86_intercept_ins:
7922 case x86_intercept_out:
7923 case x86_intercept_outs:
7924 return vmx_check_intercept_io(vcpu, info);
fb6d4d34 7925
86f7e90c
OU
7926 case x86_intercept_lgdt:
7927 case x86_intercept_lidt:
7928 case x86_intercept_lldt:
7929 case x86_intercept_ltr:
7930 case x86_intercept_sgdt:
7931 case x86_intercept_sidt:
7932 case x86_intercept_sldt:
7933 case x86_intercept_str:
7934 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
7935 return X86EMUL_CONTINUE;
7936
7937 /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
7938 break;
7939
fb6d4d34 7940 /* TODO: check more intercepts... */
35a57134
OU
7941 default:
7942 break;
7943 }
7944
07721fee 7945 return X86EMUL_UNHANDLEABLE;
8a76d7f2
JR
7946}
7947
64672c95
YJ
7948#ifdef CONFIG_X86_64
 7949/* Compute (a << shift) / divisor; returns 1 on overflow, 0 otherwise */
7950static inline int u64_shl_div_u64(u64 a, unsigned int shift,
7951 u64 divisor, u64 *result)
7952{
7953 u64 low = a << shift, high = a >> (64 - shift);
7954
 7955	/* Bail to avoid a #DE from divq: the quotient wouldn't fit in 64 bits */
7956 if (high >= divisor)
7957 return 1;
7958
 7959	/* low holds the result, high holds the remainder, which is discarded */
7960 asm("divq %2\n\t" : "=a" (low), "=d" (high) :
7961 "rm" (divisor), "0" (low), "1" (high));
7962 *result = low;
7963
7964 return 0;
7965}
7966
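/*
 * The dividend {high:low} = a << shift is a 128-bit value placed in RDX:RAX;
 * divq faults (#DE) unless high < divisor, hence the overflow check above.
 * vmx_set_hv_timer() below uses this to convert a guest TSC delta to host
 * units: host_delta = (guest_delta << tsc_scaling_ratio_frac_bits) / ratio.
 */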
f9927982
SC
7967static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
7968 bool *expired)
64672c95 7969{
386c6ddb 7970 struct vcpu_vmx *vmx;
c5ce8235 7971 u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
39497d76 7972 struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
386c6ddb 7973
386c6ddb
KA
7974 vmx = to_vmx(vcpu);
7975 tscl = rdtsc();
7976 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
7977 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
39497d76
SC
7978 lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
7979 ktimer->timer_advance_ns);
c5ce8235
WL
7980
7981 if (delta_tsc > lapic_timer_advance_cycles)
7982 delta_tsc -= lapic_timer_advance_cycles;
7983 else
7984 delta_tsc = 0;
64672c95
YJ
7985
7986 /* Convert to host delta tsc if tsc scaling is enabled */
938c8745 7987 if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio &&
0967fa1c 7988 delta_tsc && u64_shl_div_u64(delta_tsc,
938c8745 7989 kvm_caps.tsc_scaling_ratio_frac_bits,
805d705f 7990 vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
64672c95
YJ
7991 return -ERANGE;
7992
 7993	/*
 7994	 * If the delta tsc doesn't fit in 32 bits after the preemption timer
 7995	 * multiplier shift, the preemption timer can't be used.  It might fit
 7996	 * on later vmentries, but checking on every vmentry is costly, so just
 7997	 * fall back to an hrtimer.
 7998	 */
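	/*
	 * E.g. with cpu_preemption_timer_multi == 5, the 32-bit timer field
	 * covers deltas up to 2^37 TSC cycles, roughly 45 seconds on a
	 * 3 GHz TSC.
	 */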
7999 if (delta_tsc >> (cpu_preemption_timer_multi + 32))
8000 return -ERANGE;
8001
8002 vmx->hv_deadline_tsc = tscl + delta_tsc;
f9927982
SC
8003 *expired = !delta_tsc;
8004 return 0;
64672c95
YJ
8005}
8006
8007static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
8008{
f459a707 8009 to_vmx(vcpu)->hv_deadline_tsc = -1;
64672c95
YJ
8010}
8011#endif
8012
48d89b92 8013static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
ae97a3b8 8014{
b31c114b 8015 if (!kvm_pause_in_guest(vcpu->kvm))
b4a2d31d 8016 shrink_ple_window(vcpu);
ae97a3b8
RK
8017}
8018
a85863c2
MS
8019void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
8020{
8021 struct vcpu_vmx *vmx = to_vmx(vcpu);
8022
ee661d8e
DM
8023 if (WARN_ON_ONCE(!enable_pml))
8024 return;
8025
a85863c2
MS
8026 if (is_guest_mode(vcpu)) {
8027 vmx->nested.update_vmcs01_cpu_dirty_logging = true;
8028 return;
8029 }
8030
8031 /*
ee661d8e 8032	 * Note, nr_memslots_dirty_logging can be changed concurrently with this
a85863c2
MS
8033 * code, but in that case another update request will be made and so
8034 * the guest will never run with a stale PML value.
8035 */
ee661d8e 8036 if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
a85863c2
MS
8037 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
8038 else
8039 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
8040}
8041
c45dcc71
AR
8042static void vmx_setup_mce(struct kvm_vcpu *vcpu)
8043{
8044 if (vcpu->arch.mcg_cap & MCG_LMCE_P)
8045 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
32ad73db 8046 FEAT_CTL_LMCE_ENABLED;
c45dcc71
AR
8047 else
8048 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
32ad73db 8049 ~FEAT_CTL_LMCE_ENABLED;
c45dcc71
AR
8050}
8051
31e83e21 8052#ifdef CONFIG_KVM_SMM
c9d40913 8053static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
72d7b374 8054{
72e9cbdb
LP
8055 /* we need a nested vmexit to enter SMM, postpone if run is pending */
8056 if (to_vmx(vcpu)->nested.nested_run_pending)
c9d40913 8057 return -EBUSY;
a9fa7cb6 8058 return !is_smm(vcpu);
72d7b374
LP
8059}
8060
58c1d206 8061static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
0234bf88 8062{
72e9cbdb
LP
8063 struct vcpu_vmx *vmx = to_vmx(vcpu);
8064
5d76b1f8
SC
8065 /*
8066 * TODO: Implement custom flows for forcing the vCPU out/in of L2 on
 8067	 * SMI and RSM.  Using the common VM-Exit + VM-Enter routines is wrong;
 8068	 * SMI and RSM only modify state that is saved and restored via SMRAM.
8069 * E.g. most MSRs are left untouched, but many are modified by VM-Exit
8070 * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM.
8071 */
72e9cbdb
LP
8072 vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
8073 if (vmx->nested.smm.guest_mode)
8074 nested_vmx_vmexit(vcpu, -1, 0, 0);
8075
8076 vmx->nested.smm.vmxon = vmx->nested.vmxon;
8077 vmx->nested.vmxon = false;
caa057a2 8078 vmx_clear_hlt(vcpu);
0234bf88
LP
8079 return 0;
8080}
8081
58c1d206 8082static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
0234bf88 8083{
72e9cbdb
LP
8084 struct vcpu_vmx *vmx = to_vmx(vcpu);
8085 int ret;
8086
8087 if (vmx->nested.smm.vmxon) {
8088 vmx->nested.vmxon = true;
8089 vmx->nested.smm.vmxon = false;
8090 }
8091
8092 if (vmx->nested.smm.guest_mode) {
a633e41e 8093 ret = nested_vmx_enter_non_root_mode(vcpu, false);
72e9cbdb
LP
8094 if (ret)
8095 return ret;
8096
759cbd59 8097 vmx->nested.nested_run_pending = 1;
72e9cbdb
LP
8098 vmx->nested.smm.guest_mode = false;
8099 }
0234bf88
LP
8100 return 0;
8101}
8102
b6a7cc35 8103static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
cc3d967f 8104{
c9d40913 8105 /* RSM will cause a vmexit anyway. */
cc3d967f 8106}
31e83e21 8107#endif
cc3d967f 8108
4b9852f4
LA
8109static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
8110{
1c96dcce 8111 return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
4b9852f4
LA
8112}
8113
93dff2fe
JM
8114static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
8115{
8116 if (is_guest_mode(vcpu)) {
8117 struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
8118
8119 if (hrtimer_try_to_cancel(timer) == 1)
8120 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
8121 }
8122}
8123
58fccda4 8124static void vmx_hardware_unsetup(void)
484014fa 8125{
ec5a4919
SC
8126 kvm_set_posted_intr_wakeup_handler(NULL);
8127
484014fa
SC
8128 if (nested)
8129 nested_vmx_hardware_unsetup();
8130
8131 free_kvm_area();
8132}
8133
b3f257a8
SC
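/* APICv inhibit reasons that are relevant to (and honored by) VMX. */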
8134#define VMX_REQUIRED_APICV_INHIBITS \
8135( \
8136 BIT(APICV_INHIBIT_REASON_DISABLE)| \
8137 BIT(APICV_INHIBIT_REASON_ABSENT) | \
8138 BIT(APICV_INHIBIT_REASON_HYPERV) | \
8139 BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
8140 BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
8141 BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
8142 BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) \
8143)
484014fa 8144
d588bb9b
CG
8145static void vmx_vm_destroy(struct kvm *kvm)
8146{
8147 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
8148
8149 free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
8150}
8151
e286ac0e 8152static struct kvm_x86_ops vmx_x86_ops __initdata = {
08a9d59c 8153 .name = KBUILD_MODNAME,
9dadfc4a 8154
d83420c2
SC
8155 .check_processor_compatibility = vmx_check_processor_compat,
8156
58fccda4 8157 .hardware_unsetup = vmx_hardware_unsetup,
484014fa 8158
58fccda4
SC
8159 .hardware_enable = vmx_hardware_enable,
8160 .hardware_disable = vmx_hardware_disable,
484014fa
SC
8161 .has_emulated_msr = vmx_has_emulated_msr,
8162
8163 .vm_size = sizeof(struct kvm_vmx),
8164 .vm_init = vmx_vm_init,
d588bb9b 8165 .vm_destroy = vmx_vm_destroy,
484014fa 8166
d588bb9b 8167 .vcpu_precreate = vmx_vcpu_precreate,
58fccda4
SC
8168 .vcpu_create = vmx_vcpu_create,
8169 .vcpu_free = vmx_vcpu_free,
484014fa
SC
8170 .vcpu_reset = vmx_vcpu_reset,
8171
e27bc044 8172 .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
484014fa
SC
8173 .vcpu_load = vmx_vcpu_load,
8174 .vcpu_put = vmx_vcpu_put,
8175
b6a7cc35 8176 .update_exception_bitmap = vmx_update_exception_bitmap,
484014fa
SC
8177 .get_msr_feature = vmx_get_msr_feature,
8178 .get_msr = vmx_get_msr,
8179 .set_msr = vmx_set_msr,
8180 .get_segment_base = vmx_get_segment_base,
8181 .get_segment = vmx_get_segment,
8182 .set_segment = vmx_set_segment,
8183 .get_cpl = vmx_get_cpl,
8184 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
484014fa 8185 .set_cr0 = vmx_set_cr0,
c2fe3cd4 8186 .is_valid_cr4 = vmx_is_valid_cr4,
484014fa
SC
8187 .set_cr4 = vmx_set_cr4,
8188 .set_efer = vmx_set_efer,
8189 .get_idt = vmx_get_idt,
8190 .set_idt = vmx_set_idt,
8191 .get_gdt = vmx_get_gdt,
8192 .set_gdt = vmx_set_gdt,
484014fa
SC
8193 .set_dr7 = vmx_set_dr7,
8194 .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
8195 .cache_reg = vmx_cache_reg,
8196 .get_rflags = vmx_get_rflags,
8197 .set_rflags = vmx_set_rflags,
c5063551 8198 .get_if_flag = vmx_get_if_flag,
484014fa 8199
e27bc044
SC
8200 .flush_tlb_all = vmx_flush_tlb_all,
8201 .flush_tlb_current = vmx_flush_tlb_current,
8202 .flush_tlb_gva = vmx_flush_tlb_gva,
8203 .flush_tlb_guest = vmx_flush_tlb_guest,
484014fa 8204
fc4fad79 8205 .vcpu_pre_run = vmx_vcpu_pre_run,
e27bc044 8206 .vcpu_run = vmx_vcpu_run,
484014fa
SC
8207 .handle_exit = vmx_handle_exit,
8208 .skip_emulated_instruction = vmx_skip_emulated_instruction,
8209 .update_emulated_instruction = vmx_update_emulated_instruction,
8210 .set_interrupt_shadow = vmx_set_interrupt_shadow,
8211 .get_interrupt_shadow = vmx_get_interrupt_shadow,
8212 .patch_hypercall = vmx_patch_hypercall,
e27bc044
SC
8213 .inject_irq = vmx_inject_irq,
8214 .inject_nmi = vmx_inject_nmi,
6ad75c5c 8215 .inject_exception = vmx_inject_exception,
484014fa
SC
8216 .cancel_injection = vmx_cancel_injection,
8217 .interrupt_allowed = vmx_interrupt_allowed,
8218 .nmi_allowed = vmx_nmi_allowed,
8219 .get_nmi_mask = vmx_get_nmi_mask,
8220 .set_nmi_mask = vmx_set_nmi_mask,
b6a7cc35
JB
8221 .enable_nmi_window = vmx_enable_nmi_window,
8222 .enable_irq_window = vmx_enable_irq_window,
8223 .update_cr8_intercept = vmx_update_cr8_intercept,
484014fa
SC
8224 .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
8225 .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
8226 .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
8227 .load_eoi_exitmap = vmx_load_eoi_exitmap,
8228 .apicv_post_state_restore = vmx_apicv_post_state_restore,
b3f257a8 8229 .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
484014fa
SC
8230 .hwapic_irr_update = vmx_hwapic_irr_update,
8231 .hwapic_isr_update = vmx_hwapic_isr_update,
8232 .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
8233 .sync_pir_to_irr = vmx_sync_pir_to_irr,
57dfd7b5 8234 .deliver_interrupt = vmx_deliver_interrupt,
8888cdd0 8235 .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
484014fa
SC
8236
8237 .set_tss_addr = vmx_set_tss_addr,
8238 .set_identity_map_addr = vmx_set_identity_map_addr,
484014fa
SC
8239 .get_mt_mask = vmx_get_mt_mask,
8240
8241 .get_exit_info = vmx_get_exit_info,
8242
7c1b761b 8243 .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
484014fa
SC
8244
8245 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
8246
307a94c7
IS
8247 .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
8248 .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
edcfe540 8249 .write_tsc_offset = vmx_write_tsc_offset,
1ab9287a 8250 .write_tsc_multiplier = vmx_write_tsc_multiplier,
484014fa
SC
8251
8252 .load_mmu_pgd = vmx_load_mmu_pgd,
8253
8254 .check_intercept = vmx_check_intercept,
8255 .handle_exit_irqoff = vmx_handle_exit_irqoff,
8256
8257 .request_immediate_exit = vmx_request_immediate_exit,
8258
8259 .sched_in = vmx_sched_in,
8260
6dd03800 8261 .cpu_dirty_log_size = PML_ENTITY_NUM,
a85863c2 8262 .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
484014fa 8263
33b22172 8264 .nested_ops = &vmx_nested_ops,
484014fa 8265
58fccda4 8266 .pi_update_irte = vmx_pi_update_irte,
e27bc044 8267 .pi_start_assignment = vmx_pi_start_assignment,
484014fa
SC
8268
8269#ifdef CONFIG_X86_64
8270 .set_hv_timer = vmx_set_hv_timer,
8271 .cancel_hv_timer = vmx_cancel_hv_timer,
8272#endif
8273
8274 .setup_mce = vmx_setup_mce,
8275
31e83e21 8276#ifdef CONFIG_KVM_SMM
484014fa 8277 .smi_allowed = vmx_smi_allowed,
ecc513e5
SC
8278 .enter_smm = vmx_enter_smm,
8279 .leave_smm = vmx_leave_smm,
b6a7cc35 8280 .enable_smi_window = vmx_enable_smi_window,
31e83e21 8281#endif
484014fa 8282
09e3e2a1 8283 .can_emulate_instruction = vmx_can_emulate_instruction,
484014fa 8284 .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
93dff2fe 8285 .migrate_timers = vmx_migrate_timers,
3eb90017
AG
8286
8287 .msr_filter_changed = vmx_msr_filter_changed,
f9a4d621 8288 .complete_emulated_msr = kvm_complete_insn_gp,
647daca2
TL
8289
8290 .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
484014fa
SC
8291};
8292
33271a9e
SC
8293static unsigned int vmx_handle_intel_pt_intr(void)
8294{
8295 struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
8296
8297 /* '0' on failure so that the !PT case can use a RET0 static call. */
ffd1925a 8298 if (!vcpu || !kvm_handling_nmi_from_guest(vcpu))
33271a9e
SC
8299 return 0;
8300
8301 kvm_make_request(KVM_REQ_PMI, vcpu);
8302 __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
8303 (unsigned long *)&vcpu->arch.pmu.global_status);
8304 return 1;
8305}
8306
b6194b94
SC
8307static __init void vmx_setup_user_return_msrs(void)
8308{
8ea8b8d6
SC
8309
8310 /*
8311 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
8312 * will emulate SYSCALL in legacy mode if the vendor string in guest
 8313	 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!".  To
8314 * support this emulation, MSR_STAR is included in the list for i386,
8315 * but is never loaded into hardware. MSR_CSTAR is also never loaded
8316 * into hardware and is here purely for emulation purposes.
8317 */
8318 const u32 vmx_uret_msrs_list[] = {
8319 #ifdef CONFIG_X86_64
8320 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
8321 #endif
8322 MSR_EFER, MSR_TSC_AUX, MSR_STAR,
8323 MSR_IA32_TSX_CTRL,
8324 };
b6194b94
SC
8325 int i;
8326
8327 BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
8328
e5fda4bb
SC
8329 for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
8330 kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
b6194b94
SC
8331}
8332
3c5c3245
KH
8333static void __init vmx_setup_me_spte_mask(void)
8334{
8335 u64 me_mask = 0;
8336
8337 /*
8338 * kvm_get_shadow_phys_bits() returns shadow_phys_bits. Use
8339 * the former to avoid exposing shadow_phys_bits.
8340 *
 8341	 * On pre-MKTME systems, boot_cpu_data.x86_phys_bits equals
 8342	 * shadow_phys_bits.  On MKTME and/or TDX capable systems,
 8343	 * boot_cpu_data.x86_phys_bits holds the usable physical address
 8344	 * width without the KeyID bits, and shadow_phys_bits equals the
 8345	 * MAXPHYADDR reported by CPUID.  The bits in between are the KeyID bits.
8346 */
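	/*
	 * E.g. with CPUID MAXPHYADDR = 46 and 6 MKTME KeyID bits,
	 * x86_phys_bits is 40 and me_mask covers bits 45:40.
	 */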
8347 if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits())
8348 me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
8349 kvm_get_shadow_phys_bits() - 1);
8350 /*
 8351	 * Unlike SME, the host kernel doesn't support setting up any
 8352	 * MKTME KeyID on Intel platforms, so no memory encryption
 8353	 * bits should be included in the SPTE.
8354 */
8355 kvm_mmu_set_me_spte_mask(0, me_mask);
8356}
8357
33271a9e
SC
8358static struct kvm_x86_init_ops vmx_init_ops __initdata;
8359
a3203381
SC
8360static __init int hardware_setup(void)
8361{
8362 unsigned long host_bndcfgs;
2342080c 8363 struct desc_ptr dt;
f8cd457f 8364 int r;
a3203381 8365
2342080c
SC
8366 store_idt(&dt);
8367 host_idt_base = dt.address;
8368
b6194b94 8369 vmx_setup_user_return_msrs();
a3203381
SC
8370
8371 if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
8372 return -EIO;
8373
9d78d6fb 8374 if (cpu_has_perf_global_ctrl_bug())
8d20bd63 8375 pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
9d78d6fb
VK
8376 "does not work properly. Using workaround\n");
8377
a3203381
SC
8378 if (boot_cpu_has(X86_FEATURE_NX))
8379 kvm_enable_efer_bits(EFER_NX);
8380
8381 if (boot_cpu_has(X86_FEATURE_MPX)) {
8382 rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
8d20bd63 8383 WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
a3203381
SC
8384 }
8385
7f5581f5 8386 if (!cpu_has_vmx_mpx())
938c8745
SC
8387 kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
8388 XFEATURE_MASK_BNDCSR);
cfc48181 8389
a3203381
SC
8390 if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
8391 !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
8392 enable_vpid = 0;
8393
8394 if (!cpu_has_vmx_ept() ||
8395 !cpu_has_vmx_ept_4levels() ||
8396 !cpu_has_vmx_ept_mt_wb() ||
8397 !cpu_has_vmx_invept_global())
8398 enable_ept = 0;
8399
23f079c2
SC
8400 /* NX support is required for shadow paging. */
8401 if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
8d20bd63 8402 pr_err_ratelimited("NX (Execute Disable) not supported\n");
23f079c2
SC
8403 return -EOPNOTSUPP;
8404 }
8405
a3203381
SC
8406 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
8407 enable_ept_ad_bits = 0;
8408
8409 if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
8410 enable_unrestricted_guest = 0;
8411
8412 if (!cpu_has_vmx_flexpriority())
8413 flexpriority_enabled = 0;
8414
8415 if (!cpu_has_virtual_nmis())
8416 enable_vnmi = 0;
8417
1c1a4149
EGE
8418#ifdef CONFIG_X86_SGX_KVM
8419 if (!cpu_has_vmx_encls_vmexit())
8420 enable_sgx = false;
8421#endif
8422
a3203381
SC
8423 /*
 8424	 * set_apic_access_page_addr() is used to reload the APIC access
 8425	 * page upon invalidation.  No need to do anything if not
8426 * using the APIC_ACCESS_ADDR VMCS field.
8427 */
8428 if (!flexpriority_enabled)
72b0eaa9 8429 vmx_x86_ops.set_apic_access_page_addr = NULL;
a3203381
SC
8430
8431 if (!cpu_has_vmx_tpr_shadow())
72b0eaa9 8432 vmx_x86_ops.update_cr8_intercept = NULL;
a3203381
SC
8433
8434#if IS_ENABLED(CONFIG_HYPERV)
8435 if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
1f3a3e46 8436 && enable_ept) {
72b0eaa9
SC
8437 vmx_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
8438 vmx_x86_ops.tlb_remote_flush_with_range =
1f3a3e46
LT
8439 hv_remote_flush_tlb_with_range;
8440 }
a3203381
SC
8441#endif
8442
8443 if (!cpu_has_vmx_ple()) {
8444 ple_gap = 0;
8445 ple_window = 0;
8446 ple_window_grow = 0;
8447 ple_window_max = 0;
8448 ple_window_shrink = 0;
8449 }
8450
e90e51d5 8451 if (!cpu_has_vmx_apicv())
a3203381 8452 enable_apicv = 0;
e90e51d5 8453 if (!enable_apicv)
72b0eaa9 8454 vmx_x86_ops.sync_pir_to_irr = NULL;
a3203381 8455
d588bb9b
CG
8456 if (!enable_apicv || !cpu_has_vmx_ipiv())
8457 enable_ipiv = false;
8458
88099313 8459 if (cpu_has_vmx_tsc_scaling())
938c8745 8460 kvm_caps.has_tsc_control = true;
a3203381 8461
938c8745
SC
8462 kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
8463 kvm_caps.tsc_scaling_ratio_frac_bits = 48;
8464 kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
2f4073e0 8465 kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit();
fe6b6bc8 8466
a3203381
SC
8467 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
8468
8469 if (enable_ept)
e7b7bdea
SC
8470 kvm_mmu_set_ept_masks(enable_ept_ad_bits,
8471 cpu_has_vmx_ept_execute_only());
703c335d 8472
3c5c3245
KH
8473 /*
8474 * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID
8475 * bits to shadow_zero_check.
8476 */
8477 vmx_setup_me_spte_mask();
8478
746700d2 8479 kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
f8cd457f 8480 ept_caps_to_lpage_level(vmx_capability.ept));
a3203381 8481
a3203381
SC
8482 /*
8483 * Only enable PML when hardware supports PML feature, and both EPT
8484 * and EPT A/D bit features are enabled -- PML depends on them to work.
8485 */
8486 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
8487 enable_pml = 0;
8488
a018eba5 8489 if (!enable_pml)
6dd03800 8490 vmx_x86_ops.cpu_dirty_log_size = 0;
a3203381
SC
8491
8492 if (!cpu_has_vmx_preemption_timer())
804939ea 8493 enable_preemption_timer = false;
a3203381 8494
804939ea
SC
8495 if (enable_preemption_timer) {
8496 u64 use_timer_freq = 5000ULL * 1000 * 1000;
a3203381 8497
a3203381 8498 cpu_preemption_timer_multi =
0809d9b0 8499 vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
804939ea
SC
8500
8501 if (tsc_khz)
8502 use_timer_freq = (u64)tsc_khz * 1000;
8503 use_timer_freq >>= cpu_preemption_timer_multi;
8504
8505 /*
8506 * KVM "disables" the preemption timer by setting it to its max
8507 * value. Don't use the timer if it might cause spurious exits
8508 * at a rate faster than 0.1 Hz (of uninterrupted guest time).
8509 */
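		/*
		 * E.g. tsc_khz = 3000000 (3 GHz) with a rate multiplier of 5
		 * gives use_timer_freq = 3e9 >> 5 = ~93.75 MHz, below the
		 * ~429 MHz (0xffffffff / 10) cutoff, so the timer stays
		 * enabled.
		 */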
8510 if (use_timer_freq > 0xffffffffu / 10)
8511 enable_preemption_timer = false;
8512 }
8513
8514 if (!enable_preemption_timer) {
72b0eaa9
SC
8515 vmx_x86_ops.set_hv_timer = NULL;
8516 vmx_x86_ops.cancel_hv_timer = NULL;
8517 vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
a3203381
SC
8518 }
8519
938c8745 8520 kvm_caps.supported_mce_cap |= MCG_LMCE_P;
aebc3ca1 8521 kvm_caps.supported_mce_cap |= MCG_CMCI_P;
a3203381 8522
f99e3daf
CP
8523 if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
8524 return -EINVAL;
6ef25aa0 8525 if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt())
f99e3daf 8526 pt_mode = PT_MODE_SYSTEM;
33271a9e
SC
8527 if (pt_mode == PT_MODE_HOST_GUEST)
8528 vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
8529 else
8530 vmx_init_ops.handle_intel_pt_intr = NULL;
f99e3daf 8531
8f102445
SC
8532 setup_default_sgx_lepubkeyhash();
8533
a3203381 8534 if (nested) {
bcdf201f 8535 nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept);
3e8eaccc 8536
6c1c6e58 8537 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
a3203381
SC
8538 if (r)
8539 return r;
8540 }
8541
3ec6fd8c 8542 vmx_set_cpu_caps();
66a6950f 8543
a3203381 8544 r = alloc_kvm_area();
fbc2dfe5 8545 if (r && nested)
a3203381 8546 nested_vmx_hardware_unsetup();
ec5a4919
SC
8547
8548 kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
8549
a3203381
SC
8550 return r;
8551}
8552
d008dfdb 8553static struct kvm_x86_init_ops vmx_init_ops __initdata = {
d008dfdb 8554 .hardware_setup = hardware_setup,
33271a9e 8555 .handle_intel_pt_intr = NULL,
57b119da 8556
d008dfdb 8557 .runtime_ops = &vmx_x86_ops,
34886e79 8558 .pmu_ops = &intel_pmu_ops,
6aa8b732
AK
8559};
8560
72c6d2db 8561static void vmx_cleanup_l1d_flush(void)
a47dd5f0
PB
8562{
8563 if (vmx_l1d_flush_pages) {
8564 free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
8565 vmx_l1d_flush_pages = NULL;
8566 }
72c6d2db
TG
8567 /* Restore state so sysfs ignores VMX */
8568 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
a399477e
KRW
8569}
8570
e32b1200 8571static void __vmx_exit(void)
a7b9020b 8572{
e32b1200
SC
8573 allow_smaller_maxphyaddr = false;
8574
a7b9020b
TG
8575#ifdef CONFIG_KEXEC_CORE
8576 RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
8577 synchronize_rcu();
8578#endif
e32b1200
SC
8579 vmx_cleanup_l1d_flush();
8580}
a7b9020b 8581
e32b1200
SC
8582static void vmx_exit(void)
8583{
a7b9020b 8584 kvm_exit();
4f8396b9 8585 kvm_x86_vendor_exit();
a7b9020b 8586
e32b1200 8587 __vmx_exit();
a7b9020b
TG
8588}
8589module_exit(vmx_exit);
8590
6aa8b732
AK
8591static int __init vmx_init(void)
8592{
dbef2808 8593 int r, cpu;
773e8a04 8594
d4193132
SC
8595 if (!kvm_is_vmx_supported())
8596 return -EOPNOTSUPP;
8597
773e8a04 8598 /*
451d39e8
SC
8599 * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing
8600 * to unwind if a later step fails.
773e8a04 8601 */
451d39e8 8602 hv_init_evmcs();
773e8a04 8603
4f8396b9
SC
8604 r = kvm_x86_vendor_init(&vmx_init_ops);
8605 if (r)
8606 return r;
8607
a7b9020b 8608 /*
4f8396b9 8609 * Must be called after common x86 init so enable_ept is properly set
7db92e16
TG
 8610	 * up.  Hand in the mitigation parameter value that was stored by
 8611	 * the pre-module-init parser.  If no parameter was given, it will
 8612	 * contain 'auto', which is turned into the default 'cond'
 8613	 * mitigation mode.
8614 */
19a36d32 8615 r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
4f8396b9
SC
8616 if (r)
8617 goto err_l1d_flush;
25c5f225 8618
027bbb88
PG
8619 vmx_setup_fb_clear_ctrl();
8620
dbef2808
VK
8621 for_each_possible_cpu(cpu) {
8622 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
8888cdd0 8623
a3ff25fc 8624 pi_init_cpu(cpu);
dbef2808
VK
8625 }
8626
2965faa5 8627#ifdef CONFIG_KEXEC_CORE
8f536b76
ZY
8628 rcu_assign_pointer(crash_vmclear_loaded_vmcss,
8629 crash_vmclear_local_loaded_vmcss);
8630#endif
21ebf53b 8631 vmx_check_vmcs12_offsets();
8f536b76 8632
3edd6839 8633 /*
b96e6506
MG
8634 * Shadow paging doesn't have a (further) performance penalty
 8635	 * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR, so enable it
 8636	 * by default.
3edd6839 8637 */
b96e6506
MG
8638 if (!enable_ept)
8639 allow_smaller_maxphyaddr = true;
3edd6839 8640
e32b1200
SC
8641 /*
 8642	 * Common KVM initialization _must_ come last; after this point, /dev/kvm
 8643	 * is exposed to userspace!
8644 */
81a1cf9f
SC
8645 r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx),
8646 THIS_MODULE);
e32b1200
SC
8647 if (r)
8648 goto err_kvm_init;
8649
fdef3ad1 8650 return 0;
4f8396b9 8651
4f8396b9 8652err_kvm_init:
e32b1200
SC
8653 __vmx_exit();
8654err_l1d_flush:
4f8396b9
SC
8655 kvm_x86_vendor_exit();
8656 return r;
6aa8b732 8657}
a7b9020b 8658module_init(vmx_init);