/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "irq.h"
#include "mmu.h"
#include "cpuid.h"
#include "lapic.h"
#include "hyperv.h"

#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/moduleparam.h>
#include <linux/mod_devicetable.h>
#include <linux/trace_events.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/hrtimer.h>
#include <linux/frame.h>
#include <linux/nospec.h>
#include "kvm_cache_regs.h"
#include "x86.h"

#include <asm/asm.h>
#include <asm/cpu.h>
#include <asm/io.h>
#include <asm/desc.h>
#include <asm/vmx.h>
#include <asm/virtext.h>
#include <asm/mce.h>
#include <asm/fpu/internal.h>
#include <asm/perf_event.h>
#include <asm/debugreg.h>
#include <asm/kexec.h>
#include <asm/apic.h>
#include <asm/irq_remapping.h>
#include <asm/mmu_context.h>
#include <asm/spec-ctrl.h>
#include <asm/mshyperv.h>

#include "trace.h"
#include "pmu.h"
#include "vmx_evmcs.h"

#define __ex(x) __kvm_handle_fault_on_reboot(x)
#define __ex_clear(x, reg) \
	____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg)

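/*
 * Note (editorial summary; the underlying ____kvm_handle_fault_on_reboot()
 * fixup is defined elsewhere, not in this file): __ex()/__ex_clear() wrap a
 * VMX instruction emitted from inline asm with KVM's fault-on-reboot
 * handling, so a fault taken because VMX has already been turned off (e.g.
 * during an emergency reboot or kexec) is tolerated rather than crashing the
 * host; the "xor" cleanup in __ex_clear() additionally zeroes the named
 * output register on that faulting path.  Typical use elsewhere in this file
 * looks roughly like:
 *
 *	asm volatile(__ex(ASM_VMX_VMCLEAR_RAX)
 *		     : : "a"(&phys_addr), "m"(phys_addr));
 */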
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

static const struct x86_cpu_id vmx_cpu_id[] = {
	X86_FEATURE_MATCH(X86_FEATURE_VMX),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);

static bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);

static bool __read_mostly enable_vnmi = 1;
module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);

static bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);

static bool __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, S_IRUGO);

static bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
			enable_unrestricted_guest, bool, S_IRUGO);

static bool __read_mostly enable_ept_ad_bits = 1;
module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);

static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);

static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);

static bool __read_mostly enable_apicv = 1;
module_param(enable_apicv, bool, S_IRUGO);

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
/*
 * If nested=1, nested virtualization is supported, i.e., guests may use
 * VMX and be hypervisors for their own guests. If nested=0, guests may
 * not use VMX instructions.
 */
static bool __read_mostly nested = 1;
module_param(nested, bool, S_IRUGO);

static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);

static u64 __read_mostly host_xss;

static bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);

#define MSR_TYPE_R	1
#define MSR_TYPE_W	2
#define MSR_TYPE_RW	3

#define MSR_BITMAP_MODE_X2APIC		1
#define MSR_BITMAP_MODE_X2APIC_APICV	2

#define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL

/* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
static int __read_mostly cpu_preemption_timer_multi;
static bool __read_mostly enable_preemption_timer = 1;
#ifdef CONFIG_X86_64
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif

#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON				\
	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST |	\
	 X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
#define KVM_CR4_GUEST_OWNED_BITS				      \
	(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \
	 | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)

#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

/*
 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
 * ple_gap:    upper bound on the amount of time between two successive
 *             executions of PAUSE in a loop. Also indicates if PLE is
 *             enabled. In testing, this time is usually smaller than
 *             128 cycles.
 * ple_window: upper bound on the amount of time a guest is allowed to execute
 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
 *             less than 2^12 cycles.
 * Time is measured based on a counter that runs at the same rate as the TSC,
 * refer to SDM volume 3b section 21.6.13 & 22.1.3.
 */
static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
module_param(ple_gap, uint, 0444);

static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, uint, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(ple_window_grow, uint, 0444);

/* Default resets per-vcpu window every exit to ple_window. */
static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(ple_window_shrink, uint, 0444);

/* Default is to compute the maximum so we can never overflow. */
static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);
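/*
 * Illustrative note (approximation; the actual grow/shrink helpers appear
 * further down in this file): the per-vCPU dynamic PLE window is adjusted
 * roughly as
 *
 *	grow:   new = min(old * ple_window_grow, ple_window_max);
 *	shrink: new = max(old / ple_window_shrink, ple_window);
 *
 * so it stays between ple_window and ple_window_max and cannot overflow the
 * 32-bit PLE_WINDOW VMCS field.
 */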

extern const ulong vmx_return;
extern const ulong vmx_early_consistency_check_return;

static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
static DEFINE_MUTEX(vmx_l1d_flush_mutex);

/* Storage for pre module init parameter parsing */
static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;

static const struct {
	const char *option;
	bool for_parse;
} vmentry_l1d_param[] = {
	[VMENTER_L1D_FLUSH_AUTO]	 = {"auto", true},
	[VMENTER_L1D_FLUSH_NEVER]	 = {"never", true},
	[VMENTER_L1D_FLUSH_COND]	 = {"cond", true},
	[VMENTER_L1D_FLUSH_ALWAYS]	 = {"always", true},
	[VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
	[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
};

#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;

static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
	struct page *page;
	unsigned int i;

	if (!enable_ept) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
		return 0;
	}

	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
		u64 msr;

		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
		if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
			l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
			return 0;
		}
	}

	/* If set to auto use the default l1tf mitigation method */
	if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
		switch (l1tf_mitigation) {
		case L1TF_MITIGATION_OFF:
			l1tf = VMENTER_L1D_FLUSH_NEVER;
			break;
		case L1TF_MITIGATION_FLUSH_NOWARN:
		case L1TF_MITIGATION_FLUSH:
		case L1TF_MITIGATION_FLUSH_NOSMT:
			l1tf = VMENTER_L1D_FLUSH_COND;
			break;
		case L1TF_MITIGATION_FULL:
		case L1TF_MITIGATION_FULL_FORCE:
			l1tf = VMENTER_L1D_FLUSH_ALWAYS;
			break;
		}
	} else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
		l1tf = VMENTER_L1D_FLUSH_ALWAYS;
	}

	if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
	    !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
		page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
		if (!page)
			return -ENOMEM;
		vmx_l1d_flush_pages = page_address(page);

		/*
		 * Initialize each page with a different pattern in
		 * order to protect against KSM in the nested
		 * virtualization case.
		 */
		for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
			memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
			       PAGE_SIZE);
		}
	}

	l1tf_vmx_mitigation = l1tf;

	if (l1tf != VMENTER_L1D_FLUSH_NEVER)
		static_branch_enable(&vmx_l1d_should_flush);
	else
		static_branch_disable(&vmx_l1d_should_flush);

	if (l1tf == VMENTER_L1D_FLUSH_COND)
		static_branch_enable(&vmx_l1d_flush_cond);
	else
		static_branch_disable(&vmx_l1d_flush_cond);
	return 0;
}

static int vmentry_l1d_flush_parse(const char *s)
{
	unsigned int i;

	if (s) {
		for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
			if (vmentry_l1d_param[i].for_parse &&
			    sysfs_streq(s, vmentry_l1d_param[i].option))
				return i;
		}
	}
	return -EINVAL;
}

static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
{
	int l1tf, ret;

	l1tf = vmentry_l1d_flush_parse(s);
	if (l1tf < 0)
		return l1tf;

	if (!boot_cpu_has(X86_BUG_L1TF))
		return 0;

	/*
	 * Has vmx_init() run already? If not then this is the pre init
	 * parameter parsing. In that case just store the value and let
	 * vmx_init() do the proper setup after enable_ept has been
	 * established.
	 */
	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
		vmentry_l1d_flush_param = l1tf;
		return 0;
	}

	mutex_lock(&vmx_l1d_flush_mutex);
	ret = vmx_setup_l1d_flush(l1tf);
	mutex_unlock(&vmx_l1d_flush_mutex);
	return ret;
}

static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
{
	if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
		return sprintf(s, "???\n");

	return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}

static const struct kernel_param_ops vmentry_l1d_flush_ops = {
	.set = vmentry_l1d_flush_set,
	.get = vmentry_l1d_flush_get,
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
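/*
 * Note (assumes the conventional module-parameter sysfs layout rather than
 * anything defined in this file): registering the parameter through
 * module_param_cb() with mode 0644 means the active choice can be read back
 * and, subject to the checks in vmentry_l1d_flush_set(), changed at runtime,
 * e.g.:
 *
 *	cat /sys/module/kvm_intel/parameters/vmentry_l1d_flush
 *	echo cond > /sys/module/kvm_intel/parameters/vmentry_l1d_flush
 */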

enum ept_pointers_status {
	EPT_POINTERS_CHECK = 0,
	EPT_POINTERS_MATCH = 1,
	EPT_POINTERS_MISMATCH = 2
};

struct kvm_vmx {
	struct kvm kvm;

	unsigned int tss_addr;
	bool ept_identity_pagetable_done;
	gpa_t ept_identity_map_addr;

	enum ept_pointers_status ept_pointers_match;
	spinlock_t ept_pointer_lock;
};

#define NR_AUTOLOAD_MSRS 8

struct vmcs_hdr {
	u32 revision_id:31;
	u32 shadow_vmcs:1;
};

struct vmcs {
	struct vmcs_hdr hdr;
	u32 abort;
	char data[0];
};

/*
 * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT
 * and whose values change infrequently, but are not constant.  I.e. this is
 * used as a write-through cache of the corresponding VMCS fields.
 */
struct vmcs_host_state {
	unsigned long cr3;	/* May not match real cr3 */
	unsigned long cr4;	/* May not match real cr4 */
	unsigned long gs_base;
	unsigned long fs_base;

	u16           fs_sel, gs_sel, ldt_sel;
#ifdef CONFIG_X86_64
	u16           ds_sel, es_sel;
#endif
};

/*
 * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
 * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
 * loaded on this CPU (so we can clear them if the CPU goes down).
 */
struct loaded_vmcs {
	struct vmcs *vmcs;
	struct vmcs *shadow_vmcs;
	int cpu;
	bool launched;
	bool nmi_known_unmasked;
	bool hv_timer_armed;
	/* Support for vnmi-less CPUs */
	int soft_vnmi_blocked;
	ktime_t entry_time;
	s64 vnmi_blocked_time;
	unsigned long *msr_bitmap;
	struct list_head loaded_vmcss_on_cpu_link;
	struct vmcs_host_state host_state;
};

struct shared_msr_entry {
	unsigned index;
	u64 data;
	u64 mask;
};

/*
 * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
 * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
 * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
 * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
 * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
 * More than one of these structures may exist, if L1 runs multiple L2 guests.
 * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
 * underlying hardware which will be used to run L2.
 * This structure is packed to ensure that its layout is identical across
 * machines (necessary for live migration).
 *
 * IMPORTANT: Changing the layout of existing fields in this structure
 * will break save/restore compatibility with older kvm releases. When
 * adding new fields, either use space in the reserved padding* arrays
 * or add the new fields to the end of the structure.
 */
typedef u64 natural_width;
struct __packed vmcs12 {
	/* According to the Intel spec, a VMCS region must start with the
	 * following two fields. Then follow implementation-specific data.
	 */
	struct vmcs_hdr hdr;
	u32 abort;

	u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
	u32 padding[7]; /* room for future expansion */

	u64 io_bitmap_a;
	u64 io_bitmap_b;
	u64 msr_bitmap;
	u64 vm_exit_msr_store_addr;
	u64 vm_exit_msr_load_addr;
	u64 vm_entry_msr_load_addr;
	u64 tsc_offset;
	u64 virtual_apic_page_addr;
	u64 apic_access_addr;
	u64 posted_intr_desc_addr;
	u64 ept_pointer;
	u64 eoi_exit_bitmap0;
	u64 eoi_exit_bitmap1;
	u64 eoi_exit_bitmap2;
	u64 eoi_exit_bitmap3;
	u64 xss_exit_bitmap;
	u64 guest_physical_address;
	u64 vmcs_link_pointer;
	u64 guest_ia32_debugctl;
	u64 guest_ia32_pat;
	u64 guest_ia32_efer;
	u64 guest_ia32_perf_global_ctrl;
	u64 guest_pdptr0;
	u64 guest_pdptr1;
	u64 guest_pdptr2;
	u64 guest_pdptr3;
	u64 guest_bndcfgs;
	u64 host_ia32_pat;
	u64 host_ia32_efer;
	u64 host_ia32_perf_global_ctrl;
	u64 vmread_bitmap;
	u64 vmwrite_bitmap;
	u64 vm_function_control;
	u64 eptp_list_address;
	u64 pml_address;
	u64 padding64[3]; /* room for future expansion */
	/*
	 * To allow migration of L1 (complete with its L2 guests) between
	 * machines of different natural widths (32 or 64 bit), we cannot have
	 * unsigned long fields with no explicit size. We use u64 (aliased
	 * natural_width) instead. Luckily, x86 is little-endian.
	 */
	natural_width cr0_guest_host_mask;
	natural_width cr4_guest_host_mask;
	natural_width cr0_read_shadow;
	natural_width cr4_read_shadow;
	natural_width cr3_target_value0;
	natural_width cr3_target_value1;
	natural_width cr3_target_value2;
	natural_width cr3_target_value3;
	natural_width exit_qualification;
	natural_width guest_linear_address;
	natural_width guest_cr0;
	natural_width guest_cr3;
	natural_width guest_cr4;
	natural_width guest_es_base;
	natural_width guest_cs_base;
	natural_width guest_ss_base;
	natural_width guest_ds_base;
	natural_width guest_fs_base;
	natural_width guest_gs_base;
	natural_width guest_ldtr_base;
	natural_width guest_tr_base;
	natural_width guest_gdtr_base;
	natural_width guest_idtr_base;
	natural_width guest_dr7;
	natural_width guest_rsp;
	natural_width guest_rip;
	natural_width guest_rflags;
	natural_width guest_pending_dbg_exceptions;
	natural_width guest_sysenter_esp;
	natural_width guest_sysenter_eip;
	natural_width host_cr0;
	natural_width host_cr3;
	natural_width host_cr4;
	natural_width host_fs_base;
	natural_width host_gs_base;
	natural_width host_tr_base;
	natural_width host_gdtr_base;
	natural_width host_idtr_base;
	natural_width host_ia32_sysenter_esp;
	natural_width host_ia32_sysenter_eip;
	natural_width host_rsp;
	natural_width host_rip;
	natural_width paddingl[8]; /* room for future expansion */
	u32 pin_based_vm_exec_control;
	u32 cpu_based_vm_exec_control;
	u32 exception_bitmap;
	u32 page_fault_error_code_mask;
	u32 page_fault_error_code_match;
	u32 cr3_target_count;
	u32 vm_exit_controls;
	u32 vm_exit_msr_store_count;
	u32 vm_exit_msr_load_count;
	u32 vm_entry_controls;
	u32 vm_entry_msr_load_count;
	u32 vm_entry_intr_info_field;
	u32 vm_entry_exception_error_code;
	u32 vm_entry_instruction_len;
	u32 tpr_threshold;
	u32 secondary_vm_exec_control;
	u32 vm_instruction_error;
	u32 vm_exit_reason;
	u32 vm_exit_intr_info;
	u32 vm_exit_intr_error_code;
	u32 idt_vectoring_info_field;
	u32 idt_vectoring_error_code;
	u32 vm_exit_instruction_len;
	u32 vmx_instruction_info;
	u32 guest_es_limit;
	u32 guest_cs_limit;
	u32 guest_ss_limit;
	u32 guest_ds_limit;
	u32 guest_fs_limit;
	u32 guest_gs_limit;
	u32 guest_ldtr_limit;
	u32 guest_tr_limit;
	u32 guest_gdtr_limit;
	u32 guest_idtr_limit;
	u32 guest_es_ar_bytes;
	u32 guest_cs_ar_bytes;
	u32 guest_ss_ar_bytes;
	u32 guest_ds_ar_bytes;
	u32 guest_fs_ar_bytes;
	u32 guest_gs_ar_bytes;
	u32 guest_ldtr_ar_bytes;
	u32 guest_tr_ar_bytes;
	u32 guest_interruptibility_info;
	u32 guest_activity_state;
	u32 guest_sysenter_cs;
	u32 host_ia32_sysenter_cs;
	u32 vmx_preemption_timer_value;
	u32 padding32[7]; /* room for future expansion */
	u16 virtual_processor_id;
	u16 posted_intr_nv;
	u16 guest_es_selector;
	u16 guest_cs_selector;
	u16 guest_ss_selector;
	u16 guest_ds_selector;
	u16 guest_fs_selector;
	u16 guest_gs_selector;
	u16 guest_ldtr_selector;
	u16 guest_tr_selector;
	u16 guest_intr_status;
	u16 host_es_selector;
	u16 host_cs_selector;
	u16 host_ss_selector;
	u16 host_ds_selector;
	u16 host_fs_selector;
	u16 host_gs_selector;
	u16 host_tr_selector;
	u16 guest_pml_index;
};

/*
 * For save/restore compatibility, the vmcs12 field offsets must not change.
 */
#define CHECK_OFFSET(field, loc)				\
	BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc),	\
		"Offset of " #field " in struct vmcs12 has changed.")

static inline void vmx_check_vmcs12_offsets(void) {
	CHECK_OFFSET(hdr, 0);
	CHECK_OFFSET(abort, 4);
	CHECK_OFFSET(launch_state, 8);
	CHECK_OFFSET(io_bitmap_a, 40);
	CHECK_OFFSET(io_bitmap_b, 48);
	CHECK_OFFSET(msr_bitmap, 56);
	CHECK_OFFSET(vm_exit_msr_store_addr, 64);
	CHECK_OFFSET(vm_exit_msr_load_addr, 72);
	CHECK_OFFSET(vm_entry_msr_load_addr, 80);
	CHECK_OFFSET(tsc_offset, 88);
	CHECK_OFFSET(virtual_apic_page_addr, 96);
	CHECK_OFFSET(apic_access_addr, 104);
	CHECK_OFFSET(posted_intr_desc_addr, 112);
	CHECK_OFFSET(ept_pointer, 120);
	CHECK_OFFSET(eoi_exit_bitmap0, 128);
	CHECK_OFFSET(eoi_exit_bitmap1, 136);
	CHECK_OFFSET(eoi_exit_bitmap2, 144);
	CHECK_OFFSET(eoi_exit_bitmap3, 152);
	CHECK_OFFSET(xss_exit_bitmap, 160);
	CHECK_OFFSET(guest_physical_address, 168);
	CHECK_OFFSET(vmcs_link_pointer, 176);
	CHECK_OFFSET(guest_ia32_debugctl, 184);
	CHECK_OFFSET(guest_ia32_pat, 192);
	CHECK_OFFSET(guest_ia32_efer, 200);
	CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208);
	CHECK_OFFSET(guest_pdptr0, 216);
	CHECK_OFFSET(guest_pdptr1, 224);
	CHECK_OFFSET(guest_pdptr2, 232);
	CHECK_OFFSET(guest_pdptr3, 240);
	CHECK_OFFSET(guest_bndcfgs, 248);
	CHECK_OFFSET(host_ia32_pat, 256);
	CHECK_OFFSET(host_ia32_efer, 264);
	CHECK_OFFSET(host_ia32_perf_global_ctrl, 272);
	CHECK_OFFSET(vmread_bitmap, 280);
	CHECK_OFFSET(vmwrite_bitmap, 288);
	CHECK_OFFSET(vm_function_control, 296);
	CHECK_OFFSET(eptp_list_address, 304);
	CHECK_OFFSET(pml_address, 312);
	CHECK_OFFSET(cr0_guest_host_mask, 344);
	CHECK_OFFSET(cr4_guest_host_mask, 352);
	CHECK_OFFSET(cr0_read_shadow, 360);
	CHECK_OFFSET(cr4_read_shadow, 368);
	CHECK_OFFSET(cr3_target_value0, 376);
	CHECK_OFFSET(cr3_target_value1, 384);
	CHECK_OFFSET(cr3_target_value2, 392);
	CHECK_OFFSET(cr3_target_value3, 400);
	CHECK_OFFSET(exit_qualification, 408);
	CHECK_OFFSET(guest_linear_address, 416);
	CHECK_OFFSET(guest_cr0, 424);
	CHECK_OFFSET(guest_cr3, 432);
	CHECK_OFFSET(guest_cr4, 440);
	CHECK_OFFSET(guest_es_base, 448);
	CHECK_OFFSET(guest_cs_base, 456);
	CHECK_OFFSET(guest_ss_base, 464);
	CHECK_OFFSET(guest_ds_base, 472);
	CHECK_OFFSET(guest_fs_base, 480);
	CHECK_OFFSET(guest_gs_base, 488);
	CHECK_OFFSET(guest_ldtr_base, 496);
	CHECK_OFFSET(guest_tr_base, 504);
	CHECK_OFFSET(guest_gdtr_base, 512);
	CHECK_OFFSET(guest_idtr_base, 520);
	CHECK_OFFSET(guest_dr7, 528);
	CHECK_OFFSET(guest_rsp, 536);
	CHECK_OFFSET(guest_rip, 544);
	CHECK_OFFSET(guest_rflags, 552);
	CHECK_OFFSET(guest_pending_dbg_exceptions, 560);
	CHECK_OFFSET(guest_sysenter_esp, 568);
	CHECK_OFFSET(guest_sysenter_eip, 576);
	CHECK_OFFSET(host_cr0, 584);
	CHECK_OFFSET(host_cr3, 592);
	CHECK_OFFSET(host_cr4, 600);
	CHECK_OFFSET(host_fs_base, 608);
	CHECK_OFFSET(host_gs_base, 616);
	CHECK_OFFSET(host_tr_base, 624);
	CHECK_OFFSET(host_gdtr_base, 632);
	CHECK_OFFSET(host_idtr_base, 640);
	CHECK_OFFSET(host_ia32_sysenter_esp, 648);
	CHECK_OFFSET(host_ia32_sysenter_eip, 656);
	CHECK_OFFSET(host_rsp, 664);
	CHECK_OFFSET(host_rip, 672);
	CHECK_OFFSET(pin_based_vm_exec_control, 744);
	CHECK_OFFSET(cpu_based_vm_exec_control, 748);
	CHECK_OFFSET(exception_bitmap, 752);
	CHECK_OFFSET(page_fault_error_code_mask, 756);
	CHECK_OFFSET(page_fault_error_code_match, 760);
	CHECK_OFFSET(cr3_target_count, 764);
	CHECK_OFFSET(vm_exit_controls, 768);
	CHECK_OFFSET(vm_exit_msr_store_count, 772);
	CHECK_OFFSET(vm_exit_msr_load_count, 776);
	CHECK_OFFSET(vm_entry_controls, 780);
	CHECK_OFFSET(vm_entry_msr_load_count, 784);
	CHECK_OFFSET(vm_entry_intr_info_field, 788);
	CHECK_OFFSET(vm_entry_exception_error_code, 792);
	CHECK_OFFSET(vm_entry_instruction_len, 796);
	CHECK_OFFSET(tpr_threshold, 800);
	CHECK_OFFSET(secondary_vm_exec_control, 804);
	CHECK_OFFSET(vm_instruction_error, 808);
	CHECK_OFFSET(vm_exit_reason, 812);
	CHECK_OFFSET(vm_exit_intr_info, 816);
	CHECK_OFFSET(vm_exit_intr_error_code, 820);
	CHECK_OFFSET(idt_vectoring_info_field, 824);
	CHECK_OFFSET(idt_vectoring_error_code, 828);
	CHECK_OFFSET(vm_exit_instruction_len, 832);
	CHECK_OFFSET(vmx_instruction_info, 836);
	CHECK_OFFSET(guest_es_limit, 840);
	CHECK_OFFSET(guest_cs_limit, 844);
	CHECK_OFFSET(guest_ss_limit, 848);
	CHECK_OFFSET(guest_ds_limit, 852);
	CHECK_OFFSET(guest_fs_limit, 856);
	CHECK_OFFSET(guest_gs_limit, 860);
	CHECK_OFFSET(guest_ldtr_limit, 864);
	CHECK_OFFSET(guest_tr_limit, 868);
	CHECK_OFFSET(guest_gdtr_limit, 872);
	CHECK_OFFSET(guest_idtr_limit, 876);
	CHECK_OFFSET(guest_es_ar_bytes, 880);
	CHECK_OFFSET(guest_cs_ar_bytes, 884);
	CHECK_OFFSET(guest_ss_ar_bytes, 888);
	CHECK_OFFSET(guest_ds_ar_bytes, 892);
	CHECK_OFFSET(guest_fs_ar_bytes, 896);
	CHECK_OFFSET(guest_gs_ar_bytes, 900);
	CHECK_OFFSET(guest_ldtr_ar_bytes, 904);
	CHECK_OFFSET(guest_tr_ar_bytes, 908);
	CHECK_OFFSET(guest_interruptibility_info, 912);
	CHECK_OFFSET(guest_activity_state, 916);
	CHECK_OFFSET(guest_sysenter_cs, 920);
	CHECK_OFFSET(host_ia32_sysenter_cs, 924);
	CHECK_OFFSET(vmx_preemption_timer_value, 928);
	CHECK_OFFSET(virtual_processor_id, 960);
	CHECK_OFFSET(posted_intr_nv, 962);
	CHECK_OFFSET(guest_es_selector, 964);
	CHECK_OFFSET(guest_cs_selector, 966);
	CHECK_OFFSET(guest_ss_selector, 968);
	CHECK_OFFSET(guest_ds_selector, 970);
	CHECK_OFFSET(guest_fs_selector, 972);
	CHECK_OFFSET(guest_gs_selector, 974);
	CHECK_OFFSET(guest_ldtr_selector, 976);
	CHECK_OFFSET(guest_tr_selector, 978);
	CHECK_OFFSET(guest_intr_status, 980);
	CHECK_OFFSET(host_es_selector, 982);
	CHECK_OFFSET(host_cs_selector, 984);
	CHECK_OFFSET(host_ss_selector, 986);
	CHECK_OFFSET(host_ds_selector, 988);
	CHECK_OFFSET(host_fs_selector, 990);
	CHECK_OFFSET(host_gs_selector, 992);
	CHECK_OFFSET(host_tr_selector, 994);
	CHECK_OFFSET(guest_pml_index, 996);
}

/*
 * VMCS12_REVISION is an arbitrary id that should be changed if the content or
 * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
 * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
 *
 * IMPORTANT: Changing this value will break save/restore compatibility with
 * older kvm releases.
 */
#define VMCS12_REVISION 0x11e57ed0

/*
 * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
 * and any VMCS region. Although only sizeof(struct vmcs12) is used by the
 * current implementation, 4K are reserved to avoid future complications.
 */
#define VMCS12_SIZE 0x1000

/*
 * VMCS12_MAX_FIELD_INDEX is the highest index value used in any
 * supported VMCS12 field encoding.
 */
#define VMCS12_MAX_FIELD_INDEX 0x17

struct nested_vmx_msrs {
	/*
	 * We only store the "true" versions of the VMX capability MSRs. We
	 * generate the "non-true" versions by setting the must-be-1 bits
	 * according to the SDM.
	 */
	u32 procbased_ctls_low;
	u32 procbased_ctls_high;
	u32 secondary_ctls_low;
	u32 secondary_ctls_high;
	u32 pinbased_ctls_low;
	u32 pinbased_ctls_high;
	u32 exit_ctls_low;
	u32 exit_ctls_high;
	u32 entry_ctls_low;
	u32 entry_ctls_high;
	u32 misc_low;
	u32 misc_high;
	u32 ept_caps;
	u32 vpid_caps;
	u64 basic;
	u64 cr0_fixed0;
	u64 cr0_fixed1;
	u64 cr4_fixed0;
	u64 cr4_fixed1;
	u64 vmcs_enum;
	u64 vmfunc_controls;
};

/*
 * The nested_vmx structure is part of vcpu_vmx, and holds information we need
 * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
 */
struct nested_vmx {
	/* Has the level1 guest done vmxon? */
	bool vmxon;
	gpa_t vmxon_ptr;
	bool pml_full;

	/* The guest-physical address of the current VMCS L1 keeps for L2 */
	gpa_t current_vmptr;
	/*
	 * Cache of the guest's VMCS, existing outside of guest memory.
	 * Loaded from guest memory during VMPTRLD. Flushed to guest
	 * memory during VMCLEAR and VMPTRLD.
	 */
	struct vmcs12 *cached_vmcs12;
	/*
	 * Cache of the guest's shadow VMCS, existing outside of guest
	 * memory. Loaded from guest memory during VM entry. Flushed
	 * to guest memory during VM exit.
	 */
	struct vmcs12 *cached_shadow_vmcs12;
	/*
	 * Indicates if the shadow vmcs or enlightened vmcs must be updated
	 * with the data held by struct vmcs12.
	 */
	bool need_vmcs12_sync;
	bool dirty_vmcs12;

	/*
	 * vmcs02 has been initialized, i.e. state that is constant for
	 * vmcs02 has been written to the backing VMCS.  Initialization
	 * is delayed until L1 actually attempts to run a nested VM.
	 */
	bool vmcs02_initialized;

	bool change_vmcs01_virtual_apic_mode;

	/*
	 * Enlightened VMCS has been enabled. It does not mean that L1 has to
	 * use it. However, VMX features available to L1 will be limited based
	 * on what the enlightened VMCS supports.
	 */
	bool enlightened_vmcs_enabled;

	/* L2 must run next, and mustn't decide to exit to L1. */
	bool nested_run_pending;

	struct loaded_vmcs vmcs02;

	/*
	 * Guest pages referred to in the vmcs02 with host-physical
	 * pointers, so we must keep them pinned while L2 runs.
	 */
	struct page *apic_access_page;
	struct page *virtual_apic_page;
	struct page *pi_desc_page;
	struct pi_desc *pi_desc;
	bool pi_pending;
	u16 posted_intr_nv;

	struct hrtimer preemption_timer;
	bool preemption_timer_expired;

	/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
	u64 vmcs01_debugctl;
	u64 vmcs01_guest_bndcfgs;

	u16 vpid02;
	u16 last_vpid;

	struct nested_vmx_msrs msrs;

	/* SMM related state */
	struct {
		/* in VMX operation on SMM entry? */
		bool vmxon;
		/* in guest mode on SMM entry? */
		bool guest_mode;
	} smm;

	gpa_t hv_evmcs_vmptr;
	struct page *hv_evmcs_page;
	struct hv_enlightened_vmcs *hv_evmcs;
};

#define POSTED_INTR_ON  0
#define POSTED_INTR_SN  1

/* Posted-Interrupt Descriptor */
struct pi_desc {
	u32 pir[8];     /* Posted interrupt requested */
	union {
		struct {
				/* bit 256 - Outstanding Notification */
			u16	on	: 1,
				/* bit 257 - Suppress Notification */
				sn	: 1,
				/* bit 271:258 - Reserved */
				rsvd_1	: 14;
				/* bit 279:272 - Notification Vector */
			u8	nv;
				/* bit 287:280 - Reserved */
			u8	rsvd_2;
				/* bit 319:288 - Notification Destination */
			u32	ndst;
		};
		u64 control;
	};
	u32 rsvd[6];
} __aligned(64);

static bool pi_test_and_set_on(struct pi_desc *pi_desc)
{
	return test_and_set_bit(POSTED_INTR_ON,
			(unsigned long *)&pi_desc->control);
}

static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
{
	return test_and_clear_bit(POSTED_INTR_ON,
			(unsigned long *)&pi_desc->control);
}

static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
{
	return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}

static inline void pi_clear_sn(struct pi_desc *pi_desc)
{
	return clear_bit(POSTED_INTR_SN,
			(unsigned long *)&pi_desc->control);
}

static inline void pi_set_sn(struct pi_desc *pi_desc)
{
	return set_bit(POSTED_INTR_SN,
			(unsigned long *)&pi_desc->control);
}

static inline void pi_clear_on(struct pi_desc *pi_desc)
{
	clear_bit(POSTED_INTR_ON,
		  (unsigned long *)&pi_desc->control);
}

static inline int pi_test_on(struct pi_desc *pi_desc)
{
	return test_bit(POSTED_INTR_ON,
			(unsigned long *)&pi_desc->control);
}

static inline int pi_test_sn(struct pi_desc *pi_desc)
{
	return test_bit(POSTED_INTR_SN,
			(unsigned long *)&pi_desc->control);
}

struct vmx_msrs {
	unsigned int		nr;
	struct vmx_msr_entry	val[NR_AUTOLOAD_MSRS];
};

struct vcpu_vmx {
	struct kvm_vcpu       vcpu;
	unsigned long         host_rsp;
	u8                    fail;
	u8		      msr_bitmap_mode;
	u32                   exit_intr_info;
	u32                   idt_vectoring_info;
	ulong                 rflags;
	struct shared_msr_entry *guest_msrs;
	int                   nmsrs;
	int                   save_nmsrs;
	bool                  guest_msrs_dirty;
	unsigned long	      host_idt_base;
#ifdef CONFIG_X86_64
	u64		      msr_host_kernel_gs_base;
	u64		      msr_guest_kernel_gs_base;
#endif

	u64		      arch_capabilities;
	u64		      spec_ctrl;

	u32 vm_entry_controls_shadow;
	u32 vm_exit_controls_shadow;
	u32 secondary_exec_control;

	/*
	 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
	 * non-nested (L1) guest, it always points to vmcs01. For a nested
	 * guest (L2), it points to a different VMCS.  loaded_cpu_state points
	 * to the VMCS whose state is loaded into the CPU registers that only
	 * need to be switched when transitioning to/from the kernel; a NULL
	 * value indicates that host state is loaded.
	 */
	struct loaded_vmcs    vmcs01;
	struct loaded_vmcs   *loaded_vmcs;
	struct loaded_vmcs   *loaded_cpu_state;
	bool                  __launched; /* temporary, used in vmx_vcpu_run */
	struct msr_autoload {
		struct vmx_msrs guest;
		struct vmx_msrs host;
	} msr_autoload;

	struct {
		int vm86_active;
		ulong save_rflags;
		struct kvm_segment segs[8];
	} rmode;
	struct {
		u32 bitmask; /* 4 bits per segment (1 bit per field) */
		struct kvm_save_segment {
			u16 selector;
			unsigned long base;
			u32 limit;
			u32 ar;
		} seg[8];
	} segment_cache;
	int vpid;
	bool emulation_required;

	u32 exit_reason;

	/* Posted interrupt descriptor */
	struct pi_desc pi_desc;

	/* Support for a guest hypervisor (nested VMX) */
	struct nested_vmx nested;

	/* Dynamic PLE window. */
	int ple_window;
	bool ple_window_dirty;

	bool req_immediate_exit;

	/* Support for PML */
#define PML_ENTITY_NUM		512
	struct page *pml_pg;

	/* apic deadline value in host tsc */
	u64 hv_deadline_tsc;

	u64 current_tsc_ratio;

	u32 host_pkru;

	unsigned long host_debugctlmsr;

	/*
	 * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
	 * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
	 * in msr_ia32_feature_control_valid_bits.
	 */
	u64 msr_ia32_feature_control;
	u64 msr_ia32_feature_control_valid_bits;
	u64 ept_pointer;
};

enum segment_cache_field {
	SEG_FIELD_SEL = 0,
	SEG_FIELD_BASE = 1,
	SEG_FIELD_LIMIT = 2,
	SEG_FIELD_AR = 3,

	SEG_FIELD_NR = 4
};

static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
{
	return container_of(kvm, struct kvm_vmx, kvm);
}

static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
{
	return container_of(vcpu, struct vcpu_vmx, vcpu);
}

static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
{
	return &(to_vmx(vcpu)->pi_desc);
}

#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
#define FIELD(number, name)	[ROL16(number, 6)] = VMCS12_OFFSET(name)
#define FIELD64(number, name)						\
	FIELD(number, name),						\
	[ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32)
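/*
 * Illustrative note: a VMCS field encoding keeps its type/width bits in bits
 * 10-14 and the access-type bit in bit 0, with the field index in bits 9:1;
 * ROL16(encoding, 6) rotates those high bits down so the encoding can be used
 * directly as a designated-initializer index into vmcs_field_to_offset_table
 * below.  For example ROL16(GUEST_ES_SELECTOR, 6) == ROL16(0x0800, 6) ==
 * 0x0002, and a FIELD64() entry's _HIGH encoding (bit 0 set) lands
 * 1 << 6 = 64 slots after its low half.
 */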


static u16 shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x) x,
#include "vmx_shadow_fields.h"
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static u16 shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x) x,
#include "vmx_shadow_fields.h"
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

static const unsigned short vmcs_field_to_offset_table[] = {
	FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
	FIELD(POSTED_INTR_NV, posted_intr_nv),
	FIELD(GUEST_ES_SELECTOR, guest_es_selector),
	FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
	FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
	FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
	FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
	FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
	FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
	FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
	FIELD(GUEST_INTR_STATUS, guest_intr_status),
	FIELD(GUEST_PML_INDEX, guest_pml_index),
	FIELD(HOST_ES_SELECTOR, host_es_selector),
	FIELD(HOST_CS_SELECTOR, host_cs_selector),
	FIELD(HOST_SS_SELECTOR, host_ss_selector),
	FIELD(HOST_DS_SELECTOR, host_ds_selector),
	FIELD(HOST_FS_SELECTOR, host_fs_selector),
	FIELD(HOST_GS_SELECTOR, host_gs_selector),
	FIELD(HOST_TR_SELECTOR, host_tr_selector),
	FIELD64(IO_BITMAP_A, io_bitmap_a),
	FIELD64(IO_BITMAP_B, io_bitmap_b),
	FIELD64(MSR_BITMAP, msr_bitmap),
	FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
	FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
	FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
	FIELD64(PML_ADDRESS, pml_address),
	FIELD64(TSC_OFFSET, tsc_offset),
	FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
	FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
	FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
	FIELD64(VM_FUNCTION_CONTROL, vm_function_control),
	FIELD64(EPT_POINTER, ept_pointer),
	FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
	FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
	FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
	FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
	FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
	FIELD64(VMREAD_BITMAP, vmread_bitmap),
	FIELD64(VMWRITE_BITMAP, vmwrite_bitmap),
	FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
	FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
	FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
	FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
	FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
	FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
	FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
	FIELD64(GUEST_PDPTR0, guest_pdptr0),
	FIELD64(GUEST_PDPTR1, guest_pdptr1),
	FIELD64(GUEST_PDPTR2, guest_pdptr2),
	FIELD64(GUEST_PDPTR3, guest_pdptr3),
	FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
	FIELD64(HOST_IA32_PAT, host_ia32_pat),
	FIELD64(HOST_IA32_EFER, host_ia32_efer),
	FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
	FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
	FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
	FIELD(EXCEPTION_BITMAP, exception_bitmap),
	FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
	FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
	FIELD(CR3_TARGET_COUNT, cr3_target_count),
	FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
	FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
	FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
	FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
	FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
	FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
	FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
	FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
	FIELD(TPR_THRESHOLD, tpr_threshold),
	FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
	FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
	FIELD(VM_EXIT_REASON, vm_exit_reason),
	FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
	FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
	FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
	FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
	FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
	FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
	FIELD(GUEST_ES_LIMIT, guest_es_limit),
	FIELD(GUEST_CS_LIMIT, guest_cs_limit),
	FIELD(GUEST_SS_LIMIT, guest_ss_limit),
	FIELD(GUEST_DS_LIMIT, guest_ds_limit),
	FIELD(GUEST_FS_LIMIT, guest_fs_limit),
	FIELD(GUEST_GS_LIMIT, guest_gs_limit),
	FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
	FIELD(GUEST_TR_LIMIT, guest_tr_limit),
	FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
	FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
	FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
	FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
	FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
	FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
	FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
	FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
	FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
	FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
	FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
	FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
	FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
	FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
	FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
	FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
	FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
	FIELD(CR0_READ_SHADOW, cr0_read_shadow),
	FIELD(CR4_READ_SHADOW, cr4_read_shadow),
	FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
	FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
	FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
	FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
	FIELD(EXIT_QUALIFICATION, exit_qualification),
	FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
	FIELD(GUEST_CR0, guest_cr0),
	FIELD(GUEST_CR3, guest_cr3),
	FIELD(GUEST_CR4, guest_cr4),
	FIELD(GUEST_ES_BASE, guest_es_base),
	FIELD(GUEST_CS_BASE, guest_cs_base),
	FIELD(GUEST_SS_BASE, guest_ss_base),
	FIELD(GUEST_DS_BASE, guest_ds_base),
	FIELD(GUEST_FS_BASE, guest_fs_base),
	FIELD(GUEST_GS_BASE, guest_gs_base),
	FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
	FIELD(GUEST_TR_BASE, guest_tr_base),
	FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
	FIELD(GUEST_IDTR_BASE, guest_idtr_base),
	FIELD(GUEST_DR7, guest_dr7),
	FIELD(GUEST_RSP, guest_rsp),
	FIELD(GUEST_RIP, guest_rip),
	FIELD(GUEST_RFLAGS, guest_rflags),
	FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
	FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
	FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
	FIELD(HOST_CR0, host_cr0),
	FIELD(HOST_CR3, host_cr3),
	FIELD(HOST_CR4, host_cr4),
	FIELD(HOST_FS_BASE, host_fs_base),
	FIELD(HOST_GS_BASE, host_gs_base),
	FIELD(HOST_TR_BASE, host_tr_base),
	FIELD(HOST_GDTR_BASE, host_gdtr_base),
	FIELD(HOST_IDTR_BASE, host_idtr_base),
	FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
	FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
	FIELD(HOST_RSP, host_rsp),
	FIELD(HOST_RIP, host_rip),
};

static inline short vmcs_field_to_offset(unsigned long field)
{
	const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table);
	unsigned short offset;
	unsigned index;

	if (field >> 15)
		return -ENOENT;

	index = ROL16(field, 6);
	if (index >= size)
		return -ENOENT;

	index = array_index_nospec(index, size);
	offset = vmcs_field_to_offset_table[index];
	if (offset == 0)
		return -ENOENT;
	return offset;
}

static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
{
	return to_vmx(vcpu)->nested.cached_vmcs12;
}

static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu)
{
	return to_vmx(vcpu)->nested.cached_shadow_vmcs12;
}

static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
static bool vmx_xsaves_supported(void);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg);
static void vmx_get_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg);
static bool guest_state_valid(struct kvm_vcpu *vcpu);
static u32 vmx_segment_access_rights(struct kvm_segment *var);
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code);
static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
							  u32 msr, int type);

static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
 * We maintain a per-CPU linked list of VMCSs loaded on that CPU. This is
 * needed when a CPU is brought down, and we need to VMCLEAR all VMCSs
 * loaded on it.
 */
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);

/*
 * We maintain a per-CPU linked list of vCPUs, so in wakeup_handler() we
 * can find which vCPU should be woken up.
 */
static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);

enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};

static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])

static bool cpu_has_load_ia32_efer;
static bool cpu_has_load_perf_global_ctrl;

static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);

static struct vmcs_config {
	int size;
	int order;
	u32 basic_cap;
	u32 revision_id;
	u32 pin_based_exec_ctrl;
	u32 cpu_based_exec_ctrl;
	u32 cpu_based_2nd_exec_ctrl;
	u32 vmexit_ctrl;
	u32 vmentry_ctrl;
	struct nested_vmx_msrs nested;
} vmcs_config;

static struct vmx_capability {
	u32 ept;
	u32 vpid;
} vmx_capability;

#define VMX_SEGMENT_FIELD(seg)					\
	[VCPU_SREG_##seg] = {                                   \
		.selector = GUEST_##seg##_SELECTOR,		\
		.base = GUEST_##seg##_BASE,			\
		.limit = GUEST_##seg##_LIMIT,			\
		.ar_bytes = GUEST_##seg##_AR_BYTES,		\
	}

static const struct kvm_vmx_segment_field {
	unsigned selector;
	unsigned base;
	unsigned limit;
	unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
	VMX_SEGMENT_FIELD(CS),
	VMX_SEGMENT_FIELD(DS),
	VMX_SEGMENT_FIELD(ES),
	VMX_SEGMENT_FIELD(FS),
	VMX_SEGMENT_FIELD(GS),
	VMX_SEGMENT_FIELD(SS),
	VMX_SEGMENT_FIELD(TR),
	VMX_SEGMENT_FIELD(LDTR),
};

static u64 host_efer;

static void ept_save_pdptrs(struct kvm_vcpu *vcpu);

/*
 * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
 * away by decrementing the array size.
 */
static const u32 vmx_msr_index[] = {
#ifdef CONFIG_X86_64
	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
	MSR_EFER, MSR_TSC_AUX, MSR_STAR,
};

DEFINE_STATIC_KEY_FALSE(enable_evmcs);

#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs))

#define KVM_EVMCS_VERSION 1

/*
 * Enlightened VMCSv1 doesn't support these:
 *
 *	POSTED_INTR_NV                  = 0x00000002,
 *	GUEST_INTR_STATUS               = 0x00000810,
 *	APIC_ACCESS_ADDR		= 0x00002014,
 *	POSTED_INTR_DESC_ADDR           = 0x00002016,
 *	EOI_EXIT_BITMAP0                = 0x0000201c,
 *	EOI_EXIT_BITMAP1                = 0x0000201e,
 *	EOI_EXIT_BITMAP2                = 0x00002020,
 *	EOI_EXIT_BITMAP3                = 0x00002022,
 *	GUEST_PML_INDEX			= 0x00000812,
 *	PML_ADDRESS			= 0x0000200e,
 *	VM_FUNCTION_CONTROL             = 0x00002018,
 *	EPTP_LIST_ADDRESS               = 0x00002024,
 *	VMREAD_BITMAP                   = 0x00002026,
 *	VMWRITE_BITMAP                  = 0x00002028,
 *
 *	TSC_MULTIPLIER                  = 0x00002032,
 *	PLE_GAP                         = 0x00004020,
 *	PLE_WINDOW                      = 0x00004022,
 *	VMX_PREEMPTION_TIMER_VALUE      = 0x0000482E,
 *	GUEST_IA32_PERF_GLOBAL_CTRL     = 0x00002808,
 *	HOST_IA32_PERF_GLOBAL_CTRL      = 0x00002c04,
 *
 * Currently unsupported in KVM:
 *	GUEST_IA32_RTIT_CTL		= 0x00002814,
 */
#define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \
				    PIN_BASED_VMX_PREEMPTION_TIMER)
#define EVMCS1_UNSUPPORTED_2NDEXEC					\
	(SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |				\
	 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |			\
	 SECONDARY_EXEC_APIC_REGISTER_VIRT |				\
	 SECONDARY_EXEC_ENABLE_PML |					\
	 SECONDARY_EXEC_ENABLE_VMFUNC |					\
	 SECONDARY_EXEC_SHADOW_VMCS |					\
	 SECONDARY_EXEC_TSC_SCALING |					\
	 SECONDARY_EXEC_PAUSE_LOOP_EXITING)
#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING)

#if IS_ENABLED(CONFIG_HYPERV)
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);

static inline void evmcs_write64(unsigned long field, u64 value)
{
	u16 clean_field;
	int offset = get_evmcs_offset(field, &clean_field);

	if (offset < 0)
		return;

	*(u64 *)((char *)current_evmcs + offset) = value;

	current_evmcs->hv_clean_fields &= ~clean_field;
}

static inline void evmcs_write32(unsigned long field, u32 value)
{
	u16 clean_field;
	int offset = get_evmcs_offset(field, &clean_field);

	if (offset < 0)
		return;

	*(u32 *)((char *)current_evmcs + offset) = value;
	current_evmcs->hv_clean_fields &= ~clean_field;
}

static inline void evmcs_write16(unsigned long field, u16 value)
{
	u16 clean_field;
	int offset = get_evmcs_offset(field, &clean_field);

	if (offset < 0)
		return;

	*(u16 *)((char *)current_evmcs + offset) = value;
	current_evmcs->hv_clean_fields &= ~clean_field;
}

static inline u64 evmcs_read64(unsigned long field)
{
	int offset = get_evmcs_offset(field, NULL);

	if (offset < 0)
		return 0;

	return *(u64 *)((char *)current_evmcs + offset);
}

static inline u32 evmcs_read32(unsigned long field)
{
	int offset = get_evmcs_offset(field, NULL);

	if (offset < 0)
		return 0;

	return *(u32 *)((char *)current_evmcs + offset);
}

static inline u16 evmcs_read16(unsigned long field)
{
	int offset = get_evmcs_offset(field, NULL);

	if (offset < 0)
		return 0;

	return *(u16 *)((char *)current_evmcs + offset);
}
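/*
 * Note (summary of the helpers above, not new behaviour): these accessors
 * read and write the enlightened VMCS in ordinary memory instead of issuing
 * VMREAD/VMWRITE, and every write also clears the corresponding "clean field"
 * bit so Hyper-V knows it must reload that group of fields on the next
 * enlightened VM entry.
 */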
1522
ceef7d10
VK
1523static inline void evmcs_touch_msr_bitmap(void)
1524{
1525 if (unlikely(!current_evmcs))
1526 return;
1527
1528 if (current_evmcs->hv_enlightenments_control.msr_bitmap)
1529 current_evmcs->hv_clean_fields &=
1530 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
1531}
1532
773e8a04
VK
1533static void evmcs_load(u64 phys_addr)
1534{
1535 struct hv_vp_assist_page *vp_ap =
1536 hv_get_vp_assist_page(smp_processor_id());
1537
1538 vp_ap->current_nested_vmcs = phys_addr;
1539 vp_ap->enlighten_vmentry = 1;
1540}
1541
1542static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
1543{
5d7a6443
VK
1544 vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL;
1545 vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
773e8a04 1546
5d7a6443
VK
1547 vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
1548 vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
773e8a04 1549
773e8a04 1550}
877ad952
TL
1551
1552/* check_ept_pointer() should be under protection of ept_pointer_lock. */
1553static void check_ept_pointer_match(struct kvm *kvm)
1554{
1555 struct kvm_vcpu *vcpu;
1556 u64 tmp_eptp = INVALID_PAGE;
1557 int i;
1558
1559 kvm_for_each_vcpu(i, vcpu, kvm) {
1560 if (!VALID_PAGE(tmp_eptp)) {
1561 tmp_eptp = to_vmx(vcpu)->ept_pointer;
1562 } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
1563 to_kvm_vmx(kvm)->ept_pointers_match
1564 = EPT_POINTERS_MISMATCH;
1565 return;
1566 }
1567 }
1568
1569 to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
1570}
1571
1572static int vmx_hv_remote_flush_tlb(struct kvm *kvm)
1573{
a5c214da
LT
1574 struct kvm_vcpu *vcpu;
1575 int ret = -ENOTSUPP, i;
877ad952
TL
1576
1577 spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
1578
1579 if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
1580 check_ept_pointer_match(kvm);
1581
5f8bb004
VK
1582 /*
 1583	 * The FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the address
 1584	 * of the EPT PML4 table base, so strip off the EPT configuration bits.
1585 */
 1586	if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
 1587		kvm_for_each_vcpu(i, vcpu, kvm)
 1588			ret |= hyperv_flush_guest_mapping(
 1589				to_vmx(kvm_get_vcpu(kvm, i))->ept_pointer & PAGE_MASK);
 1590	} else {
 1591		ret = hyperv_flush_guest_mapping(
 1592			to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer & PAGE_MASK);
 1593	}
877ad952 1594
877ad952
TL
1595 spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
1596 return ret;
1597}
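
/*
 * Exposition-only worked example (added by the editor, not driver code):
 * an EPT pointer carries configuration in its low 12 bits -- memory type
 * in bits 2:0, (page-walk length - 1) in bits 5:3 and the A/D-enable bit
 * 6.  A write-back, 4-level, A/D-enabled EPTP such as 0x13fffe05e is
 * therefore reduced to the bare PML4 base 0x13fffe000 by the PAGE_MASK
 * above, which is the form the FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE
 * hypercall expects.
 */
#if 0	/* example only, not compiled */
static u64 example_eptp_to_flush_arg(u64 eptp)
{
	/* e.g. 0x13fffe05e & PAGE_MASK == 0x13fffe000 */
	return eptp & PAGE_MASK;
}
#endif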
773e8a04
VK
1598#else /* !IS_ENABLED(CONFIG_HYPERV) */
1599static inline void evmcs_write64(unsigned long field, u64 value) {}
1600static inline void evmcs_write32(unsigned long field, u32 value) {}
1601static inline void evmcs_write16(unsigned long field, u16 value) {}
1602static inline u64 evmcs_read64(unsigned long field) { return 0; }
1603static inline u32 evmcs_read32(unsigned long field) { return 0; }
1604static inline u16 evmcs_read16(unsigned long field) { return 0; }
1605static inline void evmcs_load(u64 phys_addr) {}
1606static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
ceef7d10 1607static inline void evmcs_touch_msr_bitmap(void) {}
773e8a04
VK
1608#endif /* IS_ENABLED(CONFIG_HYPERV) */
1609
57b119da
VK
1610static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
1611 uint16_t *vmcs_version)
1612{
1613 struct vcpu_vmx *vmx = to_vmx(vcpu);
1614
57b119da
VK
1615 /*
1616 * vmcs_version represents the range of supported Enlightened VMCS
1617 * versions: lower 8 bits is the minimal version, higher 8 bits is the
1618 * maximum supported version. KVM supports versions from 1 to
1619 * KVM_EVMCS_VERSION.
1620 */
8cab6507
VK
1621 if (vmcs_version)
1622 *vmcs_version = (KVM_EVMCS_VERSION << 8) | 1;
57b119da 1623
7f9ad1df
LA
1624 /* We don't support disabling the feature for simplicity. */
1625 if (vmx->nested.enlightened_vmcs_enabled)
1626 return 0;
1627
1628 vmx->nested.enlightened_vmcs_enabled = true;
1629
57b119da
VK
1630 vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
1631 vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
1632 vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
1633 vmx->nested.msrs.secondary_ctls_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
1634 vmx->nested.msrs.vmfunc_controls &= ~EVMCS1_UNSUPPORTED_VMFUNC;
1635
1636 return 0;
1637}
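
/*
 * Exposition-only sketch: the vmcs_version word filled in above packs the
 * supported Enlightened VMCS version range as (max << 8) | min, so with a
 * maximum of KVM_EVMCS_VERSION and a minimum of 1 userspace would decode
 * it roughly as below (hypothetical helper, not part of KVM).
 */
#if 0	/* example only, not compiled */
static void example_decode_evmcs_version(u16 vmcs_version)
{
	u8 min_version = vmcs_version & 0xff;	/* == 1 */
	u8 max_version = vmcs_version >> 8;	/* == KVM_EVMCS_VERSION */
}
#endif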
1638
5bb16016 1639static inline bool is_exception_n(u32 intr_info, u8 vector)
6aa8b732
AK
1640{
1641 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
1642 INTR_INFO_VALID_MASK)) ==
5bb16016
JK
1643 (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
1644}
1645
6f05485d
JK
1646static inline bool is_debug(u32 intr_info)
1647{
1648 return is_exception_n(intr_info, DB_VECTOR);
1649}
1650
1651static inline bool is_breakpoint(u32 intr_info)
1652{
1653 return is_exception_n(intr_info, BP_VECTOR);
1654}
1655
5bb16016
JK
1656static inline bool is_page_fault(u32 intr_info)
1657{
1658 return is_exception_n(intr_info, PF_VECTOR);
6aa8b732
AK
1659}
1660
31299944 1661static inline bool is_invalid_opcode(u32 intr_info)
7aa81cc0 1662{
5bb16016 1663 return is_exception_n(intr_info, UD_VECTOR);
7aa81cc0
AL
1664}
1665
9e869480
LA
1666static inline bool is_gp_fault(u32 intr_info)
1667{
1668 return is_exception_n(intr_info, GP_VECTOR);
1669}
1670
31299944 1671static inline bool is_machine_check(u32 intr_info)
a0861c02
AK
1672{
1673 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
1674 INTR_INFO_VALID_MASK)) ==
1675 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
1676}
1677
32d43cd3
LT
1678/* Undocumented: icebp/int1 */
1679static inline bool is_icebp(u32 intr_info)
1680{
1681 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
1682 == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK);
1683}
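
/*
 * Exposition-only example: VM-exit interruption information packs the
 * vector in bits 7:0, the event type in bits 10:8 and a valid bit in bit
 * 31.  A hardware #PF is therefore matched by is_page_fault() above and
 * rejected by the other predicates, as the made-up check below shows.
 */
#if 0	/* example only, not compiled */
static void example_intr_info_decode(void)
{
	u32 intr_info = INTR_INFO_VALID_MASK | INTR_TYPE_HARD_EXCEPTION |
			PF_VECTOR;

	WARN_ON(!is_page_fault(intr_info));
	WARN_ON(is_invalid_opcode(intr_info));	/* wrong vector */
	WARN_ON(is_machine_check(intr_info));	/* wrong vector */
}
#endif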
1684
31299944 1685static inline bool cpu_has_vmx_msr_bitmap(void)
25c5f225 1686{
04547156 1687 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
25c5f225
SY
1688}
1689
31299944 1690static inline bool cpu_has_vmx_tpr_shadow(void)
6e5d865c 1691{
04547156 1692 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
6e5d865c
YS
1693}
1694
35754c98 1695static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu)
6e5d865c 1696{
35754c98 1697 return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu);
6e5d865c
YS
1698}
1699
31299944 1700static inline bool cpu_has_secondary_exec_ctrls(void)
f78e0e2e 1701{
04547156
SY
1702 return vmcs_config.cpu_based_exec_ctrl &
1703 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
f78e0e2e
SY
1704}
1705
774ead3a 1706static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
f78e0e2e 1707{
04547156
SY
1708 return vmcs_config.cpu_based_2nd_exec_ctrl &
1709 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1710}
1711
8d14695f
YZ
1712static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
1713{
1714 return vmcs_config.cpu_based_2nd_exec_ctrl &
1715 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
1716}
1717
83d4c286
YZ
1718static inline bool cpu_has_vmx_apic_register_virt(void)
1719{
1720 return vmcs_config.cpu_based_2nd_exec_ctrl &
1721 SECONDARY_EXEC_APIC_REGISTER_VIRT;
1722}
1723
c7c9c56c
YZ
1724static inline bool cpu_has_vmx_virtual_intr_delivery(void)
1725{
1726 return vmcs_config.cpu_based_2nd_exec_ctrl &
1727 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
1728}
1729
0b665d30
SC
1730static inline bool cpu_has_vmx_encls_vmexit(void)
1731{
1732 return vmcs_config.cpu_based_2nd_exec_ctrl &
1733 SECONDARY_EXEC_ENCLS_EXITING;
1734}
1735
64672c95
YJ
1736/*
 1737 * Comment format: document - errata name - stepping - processor name.
 1738 * Taken from
1739 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
1740 */
1741static u32 vmx_preemption_cpu_tfms[] = {
1742/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */
1743 0x000206E6,
1744/* 323056.pdf - AAX65 - C2 - Xeon L3406 */
1745/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
1746/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
1747 0x00020652,
1748/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
1749 0x00020655,
1750/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */
1751/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */
1752/*
1753 * 320767.pdf - AAP86 - B1 -
1754 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
1755 */
1756 0x000106E5,
1757/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
1758 0x000106A0,
1759/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
1760 0x000106A1,
1761/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
1762 0x000106A4,
 1763 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
 1764 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
 1765 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
1766 0x000106A5,
1767};
1768
1769static inline bool cpu_has_broken_vmx_preemption_timer(void)
1770{
1771 u32 eax = cpuid_eax(0x00000001), i;
1772
1773 /* Clear the reserved bits */
1774 eax &= ~(0x3U << 14 | 0xfU << 28);
03f6a22a 1775 for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
64672c95
YJ
1776 if (eax == vmx_preemption_cpu_tfms[i])
1777 return true;
1778
1779 return false;
1780}
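
/*
 * Exposition-only note: cpuid_eax(1) returns the processor signature with
 * stepping in bits 3:0, model in 7:4, family in 11:8, extended model in
 * 19:16 and extended family in 27:20; bits 15:14 and 31:28 are reserved
 * and are the ones masked off above.  For instance 0x000106E5 decodes to
 * family 6, model 0x1E (extended model 1, model 0xE), stepping 5 -- one
 * of the parts listed in the errata table.
 */
#if 0	/* example only, not compiled */
static void example_decode_signature(void)
{
	u32 eax = 0x000106E5;
	u8 stepping  =  eax        & 0xf;	/* 5 */
	u8 model     = (eax >> 4)  & 0xf;	/* 0xE */
	u8 family    = (eax >> 8)  & 0xf;	/* 6 */
	u8 ext_model = (eax >> 16) & 0xf;	/* 1 -> full model 0x1E */
}
#endif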
1781
1782static inline bool cpu_has_vmx_preemption_timer(void)
1783{
64672c95
YJ
1784 return vmcs_config.pin_based_exec_ctrl &
1785 PIN_BASED_VMX_PREEMPTION_TIMER;
1786}
1787
01e439be
YZ
1788static inline bool cpu_has_vmx_posted_intr(void)
1789{
d6a858d1
PB
1790 return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
1791 vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
01e439be
YZ
1792}
1793
1794static inline bool cpu_has_vmx_apicv(void)
1795{
1796 return cpu_has_vmx_apic_register_virt() &&
1797 cpu_has_vmx_virtual_intr_delivery() &&
1798 cpu_has_vmx_posted_intr();
1799}
1800
04547156
SY
1801static inline bool cpu_has_vmx_flexpriority(void)
1802{
1803 return cpu_has_vmx_tpr_shadow() &&
1804 cpu_has_vmx_virtualize_apic_accesses();
f78e0e2e
SY
1805}
1806
e799794e
MT
1807static inline bool cpu_has_vmx_ept_execute_only(void)
1808{
31299944 1809 return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
e799794e
MT
1810}
1811
e799794e
MT
1812static inline bool cpu_has_vmx_ept_2m_page(void)
1813{
31299944 1814 return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
e799794e
MT
1815}
1816
878403b7
SY
1817static inline bool cpu_has_vmx_ept_1g_page(void)
1818{
31299944 1819 return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
878403b7
SY
1820}
1821
4bc9b982
SY
1822static inline bool cpu_has_vmx_ept_4levels(void)
1823{
1824 return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
1825}
1826
42aa53b4
DH
1827static inline bool cpu_has_vmx_ept_mt_wb(void)
1828{
1829 return vmx_capability.ept & VMX_EPTP_WB_BIT;
1830}
1831
855feb67
YZ
1832static inline bool cpu_has_vmx_ept_5levels(void)
1833{
1834 return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT;
1835}
1836
83c3a331
XH
1837static inline bool cpu_has_vmx_ept_ad_bits(void)
1838{
1839 return vmx_capability.ept & VMX_EPT_AD_BIT;
1840}
1841
31299944 1842static inline bool cpu_has_vmx_invept_context(void)
d56f546d 1843{
31299944 1844 return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
d56f546d
SY
1845}
1846
31299944 1847static inline bool cpu_has_vmx_invept_global(void)
d56f546d 1848{
31299944 1849 return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
d56f546d
SY
1850}
1851
cd9a491f
LA
1852static inline bool cpu_has_vmx_invvpid_individual_addr(void)
1853{
1854 return vmx_capability.vpid & VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT;
1855}
1856
518c8aee
GJ
1857static inline bool cpu_has_vmx_invvpid_single(void)
1858{
1859 return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
1860}
1861
b9d762fa
GJ
1862static inline bool cpu_has_vmx_invvpid_global(void)
1863{
1864 return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
1865}
1866
08d839c4
WL
1867static inline bool cpu_has_vmx_invvpid(void)
1868{
1869 return vmx_capability.vpid & VMX_VPID_INVVPID_BIT;
1870}
1871
31299944 1872static inline bool cpu_has_vmx_ept(void)
d56f546d 1873{
04547156
SY
1874 return vmcs_config.cpu_based_2nd_exec_ctrl &
1875 SECONDARY_EXEC_ENABLE_EPT;
d56f546d
SY
1876}
1877
31299944 1878static inline bool cpu_has_vmx_unrestricted_guest(void)
3a624e29
NK
1879{
1880 return vmcs_config.cpu_based_2nd_exec_ctrl &
1881 SECONDARY_EXEC_UNRESTRICTED_GUEST;
1882}
1883
31299944 1884static inline bool cpu_has_vmx_ple(void)
4b8d54f9
ZE
1885{
1886 return vmcs_config.cpu_based_2nd_exec_ctrl &
1887 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
1888}
1889
9ac7e3e8
JD
1890static inline bool cpu_has_vmx_basic_inout(void)
1891{
1892 return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
1893}
1894
35754c98 1895static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
f78e0e2e 1896{
35754c98 1897 return flexpriority_enabled && lapic_in_kernel(vcpu);
f78e0e2e
SY
1898}
1899
31299944 1900static inline bool cpu_has_vmx_vpid(void)
2384d2b3 1901{
04547156
SY
1902 return vmcs_config.cpu_based_2nd_exec_ctrl &
1903 SECONDARY_EXEC_ENABLE_VPID;
2384d2b3
SY
1904}
1905
31299944 1906static inline bool cpu_has_vmx_rdtscp(void)
4e47c7a6
SY
1907{
1908 return vmcs_config.cpu_based_2nd_exec_ctrl &
1909 SECONDARY_EXEC_RDTSCP;
1910}
1911
ad756a16
MJ
1912static inline bool cpu_has_vmx_invpcid(void)
1913{
1914 return vmcs_config.cpu_based_2nd_exec_ctrl &
1915 SECONDARY_EXEC_ENABLE_INVPCID;
1916}
1917
8a1b4392
PB
1918static inline bool cpu_has_virtual_nmis(void)
1919{
1920 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
1921}
1922
f5f48ee1
SY
1923static inline bool cpu_has_vmx_wbinvd_exit(void)
1924{
1925 return vmcs_config.cpu_based_2nd_exec_ctrl &
1926 SECONDARY_EXEC_WBINVD_EXITING;
1927}
1928
abc4fc58
AG
1929static inline bool cpu_has_vmx_shadow_vmcs(void)
1930{
1931 u64 vmx_msr;
1932 rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
1933 /* check if the cpu supports writing r/o exit information fields */
1934 if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
1935 return false;
1936
1937 return vmcs_config.cpu_based_2nd_exec_ctrl &
1938 SECONDARY_EXEC_SHADOW_VMCS;
1939}
1940
843e4330
KH
1941static inline bool cpu_has_vmx_pml(void)
1942{
1943 return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
1944}
1945
64903d61
HZ
1946static inline bool cpu_has_vmx_tsc_scaling(void)
1947{
1948 return vmcs_config.cpu_based_2nd_exec_ctrl &
1949 SECONDARY_EXEC_TSC_SCALING;
1950}
1951
2a499e49
BD
1952static inline bool cpu_has_vmx_vmfunc(void)
1953{
1954 return vmcs_config.cpu_based_2nd_exec_ctrl &
1955 SECONDARY_EXEC_ENABLE_VMFUNC;
1956}
1957
64f7a115
SC
1958static bool vmx_umip_emulated(void)
1959{
1960 return vmcs_config.cpu_based_2nd_exec_ctrl &
1961 SECONDARY_EXEC_DESC;
1962}
1963
04547156
SY
1964static inline bool report_flexpriority(void)
1965{
1966 return flexpriority_enabled;
1967}
1968
c7c2c709
JM
1969static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
1970{
6677f3da 1971 return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low);
c7c2c709
JM
1972}
1973
f4160e45
JM
1974/*
1975 * Do the virtual VMX capability MSRs specify that L1 can use VMWRITE
1976 * to modify any valid field of the VMCS, or are the VM-exit
1977 * information fields read-only?
1978 */
1979static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu)
1980{
1981 return to_vmx(vcpu)->nested.msrs.misc_low &
1982 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
1983}
1984
0447378a
MO
1985static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu)
1986{
1987 return to_vmx(vcpu)->nested.msrs.misc_low & VMX_MISC_ZERO_LEN_INS;
1988}
1989
1990static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu)
1991{
1992 return to_vmx(vcpu)->nested.msrs.procbased_ctls_high &
1993 CPU_BASED_MONITOR_TRAP_FLAG;
1994}
1995
fa97d7db
LA
1996static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu)
1997{
1998 return to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
1999 SECONDARY_EXEC_SHADOW_VMCS;
2000}
2001
fe3ef05c
NHE
2002static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
2003{
2004 return vmcs12->cpu_based_vm_exec_control & bit;
2005}
2006
2007static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
2008{
2009 return (vmcs12->cpu_based_vm_exec_control &
2010 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
2011 (vmcs12->secondary_vm_exec_control & bit);
2012}
2013
f4124500
JK
2014static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
2015{
2016 return vmcs12->pin_based_vm_exec_control &
2017 PIN_BASED_VMX_PREEMPTION_TIMER;
2018}
2019
0c7f650e
KS
2020static inline bool nested_cpu_has_nmi_exiting(struct vmcs12 *vmcs12)
2021{
2022 return vmcs12->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING;
2023}
2024
2025static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
2026{
2027 return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
2028}
2029
155a97a3
NHE
2030static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
2031{
2032 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
2033}
2034
81dc01f7
WL
2035static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
2036{
3db13480 2037 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
81dc01f7
WL
2038}
2039
c5f983f6
BD
2040static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12)
2041{
2042 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML);
2043}
2044
f2b93280
WV
2045static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
2046{
2047 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
2048}
2049
5c614b35
WL
2050static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12)
2051{
2052 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID);
2053}
2054
82f0dd4b
WV
2055static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
2056{
2057 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
2058}
2059
608406e2
WV
2060static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
2061{
2062 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
2063}
2064
705699a1
WV
2065static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
2066{
2067 return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
2068}
2069
27c42a1b
BD
2070static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
2071{
2072 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
2073}
2074
41ab9372
BD
2075static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
2076{
2077 return nested_cpu_has_vmfunc(vmcs12) &&
2078 (vmcs12->vm_function_control &
2079 VMX_VMFUNC_EPTP_SWITCHING);
2080}
2081
f792d274
LA
2082static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12)
2083{
2084 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS);
2085}
2086
ef85b673 2087static inline bool is_nmi(u32 intr_info)
644d711a
NHE
2088{
2089 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
ef85b673 2090 == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
644d711a
NHE
2091}
2092
533558bc
JK
2093static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
2094 u32 exit_intr_info,
2095 unsigned long exit_qualification);
7c177938 2096
8b9cf98c 2097static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
7725f0ba
AK
2098{
2099 int i;
2100
a2fa3e9f 2101 for (i = 0; i < vmx->nmsrs; ++i)
26bb0981 2102 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
a75beee6
ED
2103 return i;
2104 return -1;
2105}
2106
5ebb272b 2107static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
2384d2b3
SY
2108{
2109 struct {
2110 u64 vpid : 16;
2111 u64 rsvd : 48;
2112 u64 gva;
2113 } operand = { vpid, 0, gva };
fd8ca6da 2114 bool error;
2384d2b3 2115
4b1e5478
UB
2116 asm volatile (__ex("invvpid %2, %1") CC_SET(na)
2117 : CC_OUT(na) (error) : "r"(ext), "m"(operand));
fd8ca6da 2118 BUG_ON(error);
2384d2b3
SY
2119}
2120
5ebb272b 2121static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
1439442c
SY
2122{
2123 struct {
2124 u64 eptp, gpa;
2125 } operand = {eptp, gpa};
fd8ca6da 2126 bool error;
1439442c 2127
4b1e5478
UB
2128 asm volatile (__ex("invept %2, %1") CC_SET(na)
2129 : CC_OUT(na) (error) : "r"(ext), "m"(operand));
fd8ca6da 2130 BUG_ON(error);
1439442c
SY
2131}
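
/*
 * Exposition-only note: INVVPID and INVEPT both take a 128-bit in-memory
 * descriptor plus an extent type in a register.  The on-stack structs
 * above lay the descriptors out as the SDM requires: VPID in bits 15:0
 * and the linear address in bits 127:64 for INVVPID, and the EPT pointer
 * followed by a quadword the SDM reserves (the callers below pass 0) for
 * INVEPT.  A size sanity check would look like this:
 */
#if 0	/* example only, not compiled */
static void example_invvpid_descriptor_size(void)
{
	struct {
		u64 vpid : 16;
		u64 rsvd : 48;
		u64 gva;
	} operand;

	BUILD_BUG_ON(sizeof(operand) != 16);
}
#endif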
2132
26bb0981 2133static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
a75beee6
ED
2134{
2135 int i;
2136
8b9cf98c 2137 i = __find_msr_index(vmx, msr);
a75beee6 2138 if (i >= 0)
a2fa3e9f 2139 return &vmx->guest_msrs[i];
8b6d44c7 2140 return NULL;
7725f0ba
AK
2141}
2142
6aa8b732
AK
2143static void vmcs_clear(struct vmcs *vmcs)
2144{
2145 u64 phys_addr = __pa(vmcs);
fd8ca6da 2146 bool error;
6aa8b732 2147
4b1e5478
UB
2148 asm volatile (__ex("vmclear %1") CC_SET(na)
2149 : CC_OUT(na) (error) : "m"(phys_addr));
fd8ca6da 2150 if (unlikely(error))
6aa8b732
AK
2151 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
2152 vmcs, phys_addr);
2153}
2154
d462b819
NHE
2155static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
2156{
2157 vmcs_clear(loaded_vmcs->vmcs);
355f4fb1
JM
2158 if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
2159 vmcs_clear(loaded_vmcs->shadow_vmcs);
d462b819
NHE
2160 loaded_vmcs->cpu = -1;
2161 loaded_vmcs->launched = 0;
2162}
2163
7725b894
DX
2164static void vmcs_load(struct vmcs *vmcs)
2165{
2166 u64 phys_addr = __pa(vmcs);
fd8ca6da 2167 bool error;
7725b894 2168
773e8a04
VK
2169 if (static_branch_unlikely(&enable_evmcs))
2170 return evmcs_load(phys_addr);
2171
4b1e5478
UB
2172 asm volatile (__ex("vmptrld %1") CC_SET(na)
2173 : CC_OUT(na) (error) : "m"(phys_addr));
fd8ca6da 2174 if (unlikely(error))
2844d849 2175 printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
7725b894
DX
2176 vmcs, phys_addr);
2177}
2178
2965faa5 2179#ifdef CONFIG_KEXEC_CORE
8f536b76
ZY
2180/*
2181 * This bitmap is used to indicate whether the vmclear
2182 * operation is enabled on all cpus. All disabled by
2183 * default.
2184 */
2185static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
2186
2187static inline void crash_enable_local_vmclear(int cpu)
2188{
2189 cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
2190}
2191
2192static inline void crash_disable_local_vmclear(int cpu)
2193{
2194 cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
2195}
2196
2197static inline int crash_local_vmclear_enabled(int cpu)
2198{
2199 return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
2200}
2201
2202static void crash_vmclear_local_loaded_vmcss(void)
2203{
2204 int cpu = raw_smp_processor_id();
2205 struct loaded_vmcs *v;
2206
2207 if (!crash_local_vmclear_enabled(cpu))
2208 return;
2209
2210 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
2211 loaded_vmcss_on_cpu_link)
2212 vmcs_clear(v->vmcs);
2213}
2214#else
2215static inline void crash_enable_local_vmclear(int cpu) { }
2216static inline void crash_disable_local_vmclear(int cpu) { }
2965faa5 2217#endif /* CONFIG_KEXEC_CORE */
8f536b76 2218
d462b819 2219static void __loaded_vmcs_clear(void *arg)
6aa8b732 2220{
d462b819 2221 struct loaded_vmcs *loaded_vmcs = arg;
d3b2c338 2222 int cpu = raw_smp_processor_id();
6aa8b732 2223
d462b819
NHE
2224 if (loaded_vmcs->cpu != cpu)
2225 return; /* vcpu migration can race with cpu offline */
2226 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
6aa8b732 2227 per_cpu(current_vmcs, cpu) = NULL;
8f536b76 2228 crash_disable_local_vmclear(cpu);
d462b819 2229 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
5a560f8b
XG
2230
2231 /*
 2232 * We should ensure that updating loaded_vmcs->loaded_vmcss_on_cpu_link
 2233 * happens before setting loaded_vmcs->cpu to -1, which is done in
 2234 * loaded_vmcs_init. Otherwise another CPU could see cpu == -1 first and
 2235 * then add the VMCS to its per-cpu list before it is deleted here.
2236 */
2237 smp_wmb();
2238
d462b819 2239 loaded_vmcs_init(loaded_vmcs);
8f536b76 2240 crash_enable_local_vmclear(cpu);
6aa8b732
AK
2241}
2242
d462b819 2243static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
8d0be2b3 2244{
e6c7d321
XG
2245 int cpu = loaded_vmcs->cpu;
2246
2247 if (cpu != -1)
2248 smp_call_function_single(cpu,
2249 __loaded_vmcs_clear, loaded_vmcs, 1);
8d0be2b3
AK
2250}
2251
faff8758
JS
2252static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr)
2253{
2254 if (vpid == 0)
2255 return true;
2256
2257 if (cpu_has_vmx_invvpid_individual_addr()) {
2258 __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
2259 return true;
2260 }
2261
2262 return false;
2263}
2264
dd5f5341 2265static inline void vpid_sync_vcpu_single(int vpid)
2384d2b3 2266{
dd5f5341 2267 if (vpid == 0)
2384d2b3
SY
2268 return;
2269
518c8aee 2270 if (cpu_has_vmx_invvpid_single())
dd5f5341 2271 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
2384d2b3
SY
2272}
2273
b9d762fa
GJ
2274static inline void vpid_sync_vcpu_global(void)
2275{
2276 if (cpu_has_vmx_invvpid_global())
2277 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
2278}
2279
dd5f5341 2280static inline void vpid_sync_context(int vpid)
b9d762fa
GJ
2281{
2282 if (cpu_has_vmx_invvpid_single())
dd5f5341 2283 vpid_sync_vcpu_single(vpid);
b9d762fa
GJ
2284 else
2285 vpid_sync_vcpu_global();
2286}
2287
1439442c
SY
2288static inline void ept_sync_global(void)
2289{
f5f51586 2290 __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
1439442c
SY
2291}
2292
2293static inline void ept_sync_context(u64 eptp)
2294{
0e1252dc
DH
2295 if (cpu_has_vmx_invept_context())
2296 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
2297 else
2298 ept_sync_global();
1439442c
SY
2299}
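
/*
 * Exposition-only sketch: the sync helpers above always use the finest
 * invalidation the CPU supports and fall back to a coarser one otherwise.
 * A single-address VPID flush, for example, degrades to flushing the
 * whole VPID context when the individual-address INVVPID extent is not
 * available (hypothetical helper, not used by the driver):
 */
#if 0	/* example only, not compiled */
static void example_flush_one_gva(int vpid, gva_t addr)
{
	if (!vpid_sync_vcpu_addr(vpid, addr))
		vpid_sync_context(vpid);	/* coarser fallback */
}
#endif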
2300
8a86aea9
PB
2301static __always_inline void vmcs_check16(unsigned long field)
2302{
2303 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
2304 "16-bit accessor invalid for 64-bit field");
2305 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
2306 "16-bit accessor invalid for 64-bit high field");
2307 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
2308 "16-bit accessor invalid for 32-bit high field");
2309 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
2310 "16-bit accessor invalid for natural width field");
2311}
2312
2313static __always_inline void vmcs_check32(unsigned long field)
2314{
2315 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
2316 "32-bit accessor invalid for 16-bit field");
2317 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
2318 "32-bit accessor invalid for natural width field");
2319}
2320
2321static __always_inline void vmcs_check64(unsigned long field)
2322{
2323 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
2324 "64-bit accessor invalid for 16-bit field");
2325 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
2326 "64-bit accessor invalid for 64-bit high field");
2327 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
2328 "64-bit accessor invalid for 32-bit field");
2329 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
2330 "64-bit accessor invalid for natural width field");
2331}
2332
2333static __always_inline void vmcs_checkl(unsigned long field)
2334{
2335 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
2336 "Natural width accessor invalid for 16-bit field");
2337 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
2338 "Natural width accessor invalid for 64-bit field");
2339 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
2340 "Natural width accessor invalid for 64-bit high field");
2341 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
2342 "Natural width accessor invalid for 32-bit field");
2343}
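
/*
 * Exposition-only sketch: VMCS field encodings carry their own width in
 * bits 14:13 -- 0 for 16-bit, 1 for 64-bit, 2 for 32-bit and 3 for
 * natural-width fields -- while bit 0 selects the high half of a 64-bit
 * field.  The compile-time checks above test exactly those bits; an
 * equivalent run-time classifier (illustration only) would be:
 */
#if 0	/* example only, not compiled */
static int example_vmcs_field_width(unsigned long field)
{
	switch (field & 0x6000) {
	case 0x0000: return 16;
	case 0x2000: return 64;	/* bit 0 set selects the high 32 bits */
	case 0x4000: return 32;
	default:     return sizeof(unsigned long) * 8;	/* natural width */
	}
}
#endif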
2344
2345static __always_inline unsigned long __vmcs_readl(unsigned long field)
6aa8b732 2346{
5e520e62 2347 unsigned long value;
6aa8b732 2348
44c2d667 2349 asm volatile (__ex_clear("vmread %1, %0", "%k0")
4b1e5478 2350 : "=r"(value) : "r"(field));
6aa8b732
AK
2351 return value;
2352}
2353
96304217 2354static __always_inline u16 vmcs_read16(unsigned long field)
6aa8b732 2355{
8a86aea9 2356 vmcs_check16(field);
773e8a04
VK
2357 if (static_branch_unlikely(&enable_evmcs))
2358 return evmcs_read16(field);
8a86aea9 2359 return __vmcs_readl(field);
6aa8b732
AK
2360}
2361
96304217 2362static __always_inline u32 vmcs_read32(unsigned long field)
6aa8b732 2363{
8a86aea9 2364 vmcs_check32(field);
773e8a04
VK
2365 if (static_branch_unlikely(&enable_evmcs))
2366 return evmcs_read32(field);
8a86aea9 2367 return __vmcs_readl(field);
6aa8b732
AK
2368}
2369
96304217 2370static __always_inline u64 vmcs_read64(unsigned long field)
6aa8b732 2371{
8a86aea9 2372 vmcs_check64(field);
773e8a04
VK
2373 if (static_branch_unlikely(&enable_evmcs))
2374 return evmcs_read64(field);
05b3e0c2 2375#ifdef CONFIG_X86_64
8a86aea9 2376 return __vmcs_readl(field);
6aa8b732 2377#else
8a86aea9 2378 return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32);
6aa8b732
AK
2379#endif
2380}
2381
8a86aea9
PB
2382static __always_inline unsigned long vmcs_readl(unsigned long field)
2383{
2384 vmcs_checkl(field);
773e8a04
VK
2385 if (static_branch_unlikely(&enable_evmcs))
2386 return evmcs_read64(field);
8a86aea9
PB
2387 return __vmcs_readl(field);
2388}
2389
e52de1b8
AK
2390static noinline void vmwrite_error(unsigned long field, unsigned long value)
2391{
2392 printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
2393 field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
2394 dump_stack();
2395}
2396
8a86aea9 2397static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
6aa8b732 2398{
fd8ca6da 2399 bool error;
6aa8b732 2400
4b1e5478
UB
2401 asm volatile (__ex("vmwrite %2, %1") CC_SET(na)
2402 : CC_OUT(na) (error) : "r"(field), "rm"(value));
e52de1b8
AK
2403 if (unlikely(error))
2404 vmwrite_error(field, value);
6aa8b732
AK
2405}
2406
8a86aea9 2407static __always_inline void vmcs_write16(unsigned long field, u16 value)
6aa8b732 2408{
8a86aea9 2409 vmcs_check16(field);
773e8a04
VK
2410 if (static_branch_unlikely(&enable_evmcs))
2411 return evmcs_write16(field, value);
2412
8a86aea9 2413 __vmcs_writel(field, value);
6aa8b732
AK
2414}
2415
8a86aea9 2416static __always_inline void vmcs_write32(unsigned long field, u32 value)
6aa8b732 2417{
8a86aea9 2418 vmcs_check32(field);
773e8a04
VK
2419 if (static_branch_unlikely(&enable_evmcs))
2420 return evmcs_write32(field, value);
2421
8a86aea9 2422 __vmcs_writel(field, value);
6aa8b732
AK
2423}
2424
8a86aea9 2425static __always_inline void vmcs_write64(unsigned long field, u64 value)
6aa8b732 2426{
8a86aea9 2427 vmcs_check64(field);
773e8a04
VK
2428 if (static_branch_unlikely(&enable_evmcs))
2429 return evmcs_write64(field, value);
2430
8a86aea9 2431 __vmcs_writel(field, value);
7682f2d0 2432#ifndef CONFIG_X86_64
6aa8b732 2433 asm volatile ("");
8a86aea9 2434 __vmcs_writel(field+1, value >> 32);
6aa8b732
AK
2435#endif
2436}
2437
8a86aea9 2438static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
2ab455cc 2439{
8a86aea9 2440 vmcs_checkl(field);
773e8a04
VK
2441 if (static_branch_unlikely(&enable_evmcs))
2442 return evmcs_write64(field, value);
2443
8a86aea9 2444 __vmcs_writel(field, value);
2ab455cc
AL
2445}
2446
8a86aea9 2447static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
2ab455cc 2448{
8a86aea9
PB
2449 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
2450 "vmcs_clear_bits does not support 64-bit fields");
773e8a04
VK
2451 if (static_branch_unlikely(&enable_evmcs))
2452 return evmcs_write32(field, evmcs_read32(field) & ~mask);
2453
8a86aea9 2454 __vmcs_writel(field, __vmcs_readl(field) & ~mask);
2ab455cc
AL
2455}
2456
8a86aea9 2457static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
2ab455cc 2458{
8a86aea9
PB
2459 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
2460 "vmcs_set_bits does not support 64-bit fields");
773e8a04
VK
2461 if (static_branch_unlikely(&enable_evmcs))
2462 return evmcs_write32(field, evmcs_read32(field) | mask);
2463
8a86aea9 2464 __vmcs_writel(field, __vmcs_readl(field) | mask);
2ab455cc
AL
2465}
2466
8391ce44
PB
2467static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx)
2468{
2469 vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS);
2470}
2471
2961e876
GN
2472static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
2473{
2474 vmcs_write32(VM_ENTRY_CONTROLS, val);
2475 vmx->vm_entry_controls_shadow = val;
2476}
2477
2478static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
2479{
2480 if (vmx->vm_entry_controls_shadow != val)
2481 vm_entry_controls_init(vmx, val);
2482}
2483
2484static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
2485{
2486 return vmx->vm_entry_controls_shadow;
2487}
2488
2489
2490static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
2491{
2492 vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
2493}
2494
2495static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
2496{
2497 vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
2498}
2499
8391ce44
PB
2500static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx)
2501{
2502 vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS);
2503}
2504
2961e876
GN
2505static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
2506{
2507 vmcs_write32(VM_EXIT_CONTROLS, val);
2508 vmx->vm_exit_controls_shadow = val;
2509}
2510
2511static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
2512{
2513 if (vmx->vm_exit_controls_shadow != val)
2514 vm_exit_controls_init(vmx, val);
2515}
2516
2517static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
2518{
2519 return vmx->vm_exit_controls_shadow;
2520}
2521
2522
2523static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
2524{
2525 vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
2526}
2527
2528static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
2529{
2530 vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
2531}
2532
2fb92db1
AK
2533static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
2534{
2535 vmx->segment_cache.bitmask = 0;
2536}
2537
2538static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
2539 unsigned field)
2540{
2541 bool ret;
2542 u32 mask = 1 << (seg * SEG_FIELD_NR + field);
2543
2544 if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
2545 vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
2546 vmx->segment_cache.bitmask = 0;
2547 }
2548 ret = vmx->segment_cache.bitmask & mask;
2549 vmx->segment_cache.bitmask |= mask;
2550 return ret;
2551}
2552
2553static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
2554{
2555 u16 *p = &vmx->segment_cache.seg[seg].selector;
2556
2557 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
2558 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
2559 return *p;
2560}
2561
2562static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
2563{
2564 ulong *p = &vmx->segment_cache.seg[seg].base;
2565
2566 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
2567 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
2568 return *p;
2569}
2570
2571static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
2572{
2573 u32 *p = &vmx->segment_cache.seg[seg].limit;
2574
2575 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
2576 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
2577 return *p;
2578}
2579
2580static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
2581{
2582 u32 *p = &vmx->segment_cache.seg[seg].ar;
2583
2584 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
2585 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
2586 return *p;
2587}
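
/*
 * Exposition-only note: the segment cache keeps one dirty bit per
 * (segment, field) pair at position seg * SEG_FIELD_NR + field, so one
 * 32-bit bitmask covers every cached selector, base, limit and AR byte
 * and vmx_segment_cache_clear() can invalidate all of them with a single
 * store (hypothetical helper, for illustration):
 */
#if 0	/* example only, not compiled */
static u32 example_segment_cache_bit(unsigned int seg, unsigned int field)
{
	return 1u << (seg * SEG_FIELD_NR + field);
}
#endif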
2588
abd3f2d6
AK
2589static void update_exception_bitmap(struct kvm_vcpu *vcpu)
2590{
2591 u32 eb;
2592
fd7373cc 2593 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
bd7e5b08 2594 (1u << DB_VECTOR) | (1u << AC_VECTOR);
9e869480
LA
2595 /*
2596 * Guest access to VMware backdoor ports could legitimately
 2597 * trigger #GP because of the TSS I/O permission bitmap.
 2598 * We intercept those #GPs and allow the access anyway,
 2599 * as VMware does.
2600 */
2601 if (enable_vmware_backdoor)
2602 eb |= (1u << GP_VECTOR);
fd7373cc
JK
2603 if ((vcpu->guest_debug &
2604 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
2605 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
2606 eb |= 1u << BP_VECTOR;
7ffd92c5 2607 if (to_vmx(vcpu)->rmode.vm86_active)
abd3f2d6 2608 eb = ~0;
089d034e 2609 if (enable_ept)
1439442c 2610 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
36cf24e0
NHE
2611
2612 /* When we are running a nested L2 guest and L1 specified for it a
2613 * certain exception bitmap, we must trap the same exceptions and pass
2614 * them to L1. When running L2, we will only handle the exceptions
2615 * specified above if L1 did not want them.
2616 */
2617 if (is_guest_mode(vcpu))
2618 eb |= get_vmcs12(vcpu)->exception_bitmap;
2619
abd3f2d6
AK
2620 vmcs_write32(EXCEPTION_BITMAP, eb);
2621}
2622
d28b387f
KA
2623/*
2624 * Check if MSR is intercepted for currently loaded MSR bitmap.
2625 */
2626static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
2627{
2628 unsigned long *msr_bitmap;
2629 int f = sizeof(unsigned long);
2630
2631 if (!cpu_has_vmx_msr_bitmap())
2632 return true;
2633
2634 msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
2635
2636 if (msr <= 0x1fff) {
2637 return !!test_bit(msr, msr_bitmap + 0x800 / f);
2638 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2639 msr &= 0x1fff;
2640 return !!test_bit(msr, msr_bitmap + 0xc00 / f);
2641 }
2642
2643 return true;
2644}
2645
15d45071
AR
2646/*
2647 * Check if MSR is intercepted for L01 MSR bitmap.
2648 */
2649static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
2650{
2651 unsigned long *msr_bitmap;
2652 int f = sizeof(unsigned long);
2653
2654 if (!cpu_has_vmx_msr_bitmap())
2655 return true;
2656
2657 msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
2658
2659 if (msr <= 0x1fff) {
2660 return !!test_bit(msr, msr_bitmap + 0x800 / f);
2661 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2662 msr &= 0x1fff;
2663 return !!test_bit(msr, msr_bitmap + 0xc00 / f);
2664 }
2665
2666 return true;
2667}
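
/*
 * Exposition-only note on the layout tested above: the 4K MSR bitmap is
 * split into four 1K regions -- read intercepts for MSRs 0x0-0x1fff at
 * offset 0x000 and for 0xc0000000-0xc0001fff at 0x400, with the matching
 * write-intercept regions at 0x800 and 0xc00.  A read-side query would
 * mirror the helpers above with the 0x000/0x400 offsets (hypothetical
 * helper, not used by the driver):
 */
#if 0	/* example only, not compiled */
static bool example_msr_read_intercepted(struct kvm_vcpu *vcpu, u32 msr)
{
	unsigned long *msr_bitmap;
	int f = sizeof(unsigned long);

	if (!cpu_has_vmx_msr_bitmap())
		return true;

	msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;

	if (msr <= 0x1fff)
		return !!test_bit(msr, msr_bitmap + 0x000 / f);
	if (msr >= 0xc0000000 && msr <= 0xc0001fff)
		return !!test_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
	return true;
}
#endif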
2668
2961e876
GN
2669static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
2670 unsigned long entry, unsigned long exit)
8bf00a52 2671{
2961e876
GN
2672 vm_entry_controls_clearbit(vmx, entry);
2673 vm_exit_controls_clearbit(vmx, exit);
8bf00a52
GN
2674}
2675
ca83b4a7
KRW
2676static int find_msr(struct vmx_msrs *m, unsigned int msr)
2677{
2678 unsigned int i;
2679
2680 for (i = 0; i < m->nr; ++i) {
2681 if (m->val[i].index == msr)
2682 return i;
2683 }
2684 return -ENOENT;
2685}
2686
61d2ef2c
AK
2687static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
2688{
ca83b4a7 2689 int i;
61d2ef2c
AK
2690 struct msr_autoload *m = &vmx->msr_autoload;
2691
8bf00a52
GN
2692 switch (msr) {
2693 case MSR_EFER:
2694 if (cpu_has_load_ia32_efer) {
2961e876
GN
2695 clear_atomic_switch_msr_special(vmx,
2696 VM_ENTRY_LOAD_IA32_EFER,
8bf00a52
GN
2697 VM_EXIT_LOAD_IA32_EFER);
2698 return;
2699 }
2700 break;
2701 case MSR_CORE_PERF_GLOBAL_CTRL:
2702 if (cpu_has_load_perf_global_ctrl) {
2961e876 2703 clear_atomic_switch_msr_special(vmx,
8bf00a52
GN
2704 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
2705 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
2706 return;
2707 }
2708 break;
110312c8 2709 }
ca83b4a7
KRW
2710 i = find_msr(&m->guest, msr);
2711 if (i < 0)
31907093 2712 goto skip_guest;
33966dd6 2713 --m->guest.nr;
33966dd6 2714 m->guest.val[i] = m->guest.val[m->guest.nr];
33966dd6 2715 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
110312c8 2716
31907093
KRW
2717skip_guest:
2718 i = find_msr(&m->host, msr);
2719 if (i < 0)
61d2ef2c 2720 return;
31907093
KRW
2721
2722 --m->host.nr;
2723 m->host.val[i] = m->host.val[m->host.nr];
33966dd6 2724 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
61d2ef2c
AK
2725}
2726
2961e876
GN
2727static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
2728 unsigned long entry, unsigned long exit,
2729 unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
2730 u64 guest_val, u64 host_val)
8bf00a52
GN
2731{
2732 vmcs_write64(guest_val_vmcs, guest_val);
5a5e8a15
SC
2733 if (host_val_vmcs != HOST_IA32_EFER)
2734 vmcs_write64(host_val_vmcs, host_val);
2961e876
GN
2735 vm_entry_controls_setbit(vmx, entry);
2736 vm_exit_controls_setbit(vmx, exit);
8bf00a52
GN
2737}
2738
61d2ef2c 2739static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
989e3992 2740 u64 guest_val, u64 host_val, bool entry_only)
61d2ef2c 2741{
989e3992 2742 int i, j = 0;
61d2ef2c
AK
2743 struct msr_autoload *m = &vmx->msr_autoload;
2744
8bf00a52
GN
2745 switch (msr) {
2746 case MSR_EFER:
2747 if (cpu_has_load_ia32_efer) {
2961e876
GN
2748 add_atomic_switch_msr_special(vmx,
2749 VM_ENTRY_LOAD_IA32_EFER,
8bf00a52
GN
2750 VM_EXIT_LOAD_IA32_EFER,
2751 GUEST_IA32_EFER,
2752 HOST_IA32_EFER,
2753 guest_val, host_val);
2754 return;
2755 }
2756 break;
2757 case MSR_CORE_PERF_GLOBAL_CTRL:
2758 if (cpu_has_load_perf_global_ctrl) {
2961e876 2759 add_atomic_switch_msr_special(vmx,
8bf00a52
GN
2760 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
2761 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
2762 GUEST_IA32_PERF_GLOBAL_CTRL,
2763 HOST_IA32_PERF_GLOBAL_CTRL,
2764 guest_val, host_val);
2765 return;
2766 }
2767 break;
7099e2e1
RK
2768 case MSR_IA32_PEBS_ENABLE:
2769 /* PEBS needs a quiescent period after being disabled (to write
2770 * a record). Disabling PEBS through VMX MSR swapping doesn't
2771 * provide that period, so a CPU could write host's record into
2772 * guest's memory.
2773 */
2774 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
110312c8
AK
2775 }
2776
ca83b4a7 2777 i = find_msr(&m->guest, msr);
989e3992
KRW
2778 if (!entry_only)
2779 j = find_msr(&m->host, msr);
61d2ef2c 2780
31907093 2781 if (i == NR_AUTOLOAD_MSRS || j == NR_AUTOLOAD_MSRS) {
60266204 2782 printk_once(KERN_WARNING "Not enough msr switch entries. "
e7fc6f93
GN
2783 "Can't add msr %x\n", msr);
2784 return;
61d2ef2c 2785 }
31907093 2786 if (i < 0) {
ca83b4a7 2787 i = m->guest.nr++;
33966dd6 2788 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
31907093 2789 }
989e3992
KRW
2790 m->guest.val[i].index = msr;
2791 m->guest.val[i].value = guest_val;
2792
2793 if (entry_only)
2794 return;
61d2ef2c 2795
31907093
KRW
2796 if (j < 0) {
2797 j = m->host.nr++;
33966dd6 2798 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
61d2ef2c 2799 }
31907093
KRW
2800 m->host.val[j].index = msr;
2801 m->host.val[j].value = host_val;
61d2ef2c
AK
2802}
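
/*
 * Exposition-only sketch of the contract above: the guest list is loaded
 * by the CPU on VM-entry (VM_ENTRY_MSR_LOAD_COUNT entries) and the host
 * list on VM-exit (VM_EXIT_MSR_LOAD_COUNT entries).  entry_only=true
 * installs a guest value without arranging for a host value to be
 * restored, while entry_only=false keeps the two lists paired.  The
 * parameters below are placeholders.
 */
#if 0	/* example only, not compiled */
static void example_autoload_usage(struct vcpu_vmx *vmx, unsigned int msr,
				   u64 guest_val, u64 host_val)
{
	/* Swap the MSR in both directions around VM-entry/VM-exit. */
	add_atomic_switch_msr(vmx, msr, guest_val, host_val, false);
	/* Or: load a guest value on entry only, no host restore entry. */
	add_atomic_switch_msr(vmx, msr, guest_val, 0, true);
}
#endif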
2803
92c0d900 2804static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
2cc51560 2805{
844a5fe2
PB
2806 u64 guest_efer = vmx->vcpu.arch.efer;
2807 u64 ignore_bits = 0;
2808
2809 if (!enable_ept) {
2810 /*
2811 * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing
2812 * host CPUID is more efficient than testing guest CPUID
2813 * or CR4. Host SMEP is anyway a requirement for guest SMEP.
2814 */
2815 if (boot_cpu_has(X86_FEATURE_SMEP))
2816 guest_efer |= EFER_NX;
2817 else if (!(guest_efer & EFER_NX))
2818 ignore_bits |= EFER_NX;
2819 }
3a34a881 2820
51c6cf66 2821 /*
844a5fe2 2822 * LMA and LME handled by hardware; SCE meaningless outside long mode.
51c6cf66 2823 */
844a5fe2 2824 ignore_bits |= EFER_SCE;
51c6cf66
AK
2825#ifdef CONFIG_X86_64
2826 ignore_bits |= EFER_LMA | EFER_LME;
2827 /* SCE is meaningful only in long mode on Intel */
2828 if (guest_efer & EFER_LMA)
2829 ignore_bits &= ~(u64)EFER_SCE;
2830#endif
84ad33ef 2831
f6577a5f
AL
2832 /*
2833 * On EPT, we can't emulate NX, so we must switch EFER atomically.
2834 * On CPUs that support "load IA32_EFER", always switch EFER
2835 * atomically, since it's faster than switching it manually.
2836 */
2837 if (cpu_has_load_ia32_efer ||
2838 (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
84ad33ef
AK
2839 if (!(guest_efer & EFER_LMA))
2840 guest_efer &= ~EFER_LME;
54b98bff
AL
2841 if (guest_efer != host_efer)
2842 add_atomic_switch_msr(vmx, MSR_EFER,
989e3992 2843 guest_efer, host_efer, false);
02343cf2
SC
2844 else
2845 clear_atomic_switch_msr(vmx, MSR_EFER);
84ad33ef 2846 return false;
844a5fe2 2847 } else {
02343cf2
SC
2848 clear_atomic_switch_msr(vmx, MSR_EFER);
2849
844a5fe2
PB
2850 guest_efer &= ~ignore_bits;
2851 guest_efer |= host_efer & ignore_bits;
2852
2853 vmx->guest_msrs[efer_offset].data = guest_efer;
2854 vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
84ad33ef 2855
844a5fe2
PB
2856 return true;
2857 }
51c6cf66
AK
2858}
2859
e28baead
AL
2860#ifdef CONFIG_X86_32
2861/*
2862 * On 32-bit kernels, VM exits still load the FS and GS bases from the
2863 * VMCS rather than the segment table. KVM uses this helper to figure
2864 * out the current bases to poke them into the VMCS before entry.
2865 */
2d49ec72
GN
2866static unsigned long segment_base(u16 selector)
2867{
8c2e41f7 2868 struct desc_struct *table;
2d49ec72
GN
2869 unsigned long v;
2870
8c2e41f7 2871 if (!(selector & ~SEGMENT_RPL_MASK))
2d49ec72
GN
2872 return 0;
2873
45fc8757 2874 table = get_current_gdt_ro();
2d49ec72 2875
8c2e41f7 2876 if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
2d49ec72
GN
2877 u16 ldt_selector = kvm_read_ldt();
2878
8c2e41f7 2879 if (!(ldt_selector & ~SEGMENT_RPL_MASK))
2d49ec72
GN
2880 return 0;
2881
8c2e41f7 2882 table = (struct desc_struct *)segment_base(ldt_selector);
2d49ec72 2883 }
8c2e41f7 2884 v = get_desc_base(&table[selector >> 3]);
2d49ec72
GN
2885 return v;
2886}
e28baead 2887#endif
2d49ec72 2888
6d6095bd 2889static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
33ed6329 2890{
04d2cc77 2891 struct vcpu_vmx *vmx = to_vmx(vcpu);
d7ee039e 2892 struct vmcs_host_state *host_state;
51e8a8cc 2893#ifdef CONFIG_X86_64
35060ed6 2894 int cpu = raw_smp_processor_id();
51e8a8cc 2895#endif
e368b875
SC
2896 unsigned long fs_base, gs_base;
2897 u16 fs_sel, gs_sel;
26bb0981 2898 int i;
04d2cc77 2899
d264ee0c
SC
2900 vmx->req_immediate_exit = false;
2901
f48b4711
LA
2902 /*
2903 * Note that guest MSRs to be saved/restored can also be changed
2904 * when guest state is loaded. This happens when guest transitions
2905 * to/from long-mode by setting MSR_EFER.LMA.
2906 */
2907 if (!vmx->loaded_cpu_state || vmx->guest_msrs_dirty) {
2908 vmx->guest_msrs_dirty = false;
2909 for (i = 0; i < vmx->save_nmsrs; ++i)
2910 kvm_set_shared_msr(vmx->guest_msrs[i].index,
2911 vmx->guest_msrs[i].data,
2912 vmx->guest_msrs[i].mask);
2913
2914 }
2915
bd9966de 2916 if (vmx->loaded_cpu_state)
33ed6329
AK
2917 return;
2918
bd9966de 2919 vmx->loaded_cpu_state = vmx->loaded_vmcs;
d7ee039e 2920 host_state = &vmx->loaded_cpu_state->host_state;
bd9966de 2921
33ed6329
AK
2922 /*
2923 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
2924 * allow segment selectors with cpl > 0 or ti == 1.
2925 */
d7ee039e 2926 host_state->ldt_sel = kvm_read_ldt();
42b933b5
VK
2927
2928#ifdef CONFIG_X86_64
d7ee039e
SC
2929 savesegment(ds, host_state->ds_sel);
2930 savesegment(es, host_state->es_sel);
e368b875
SC
2931
2932 gs_base = cpu_kernelmode_gs_base(cpu);
b062b794
VK
2933 if (likely(is_64bit_mm(current->mm))) {
2934 save_fsgs_for_kvm();
e368b875
SC
2935 fs_sel = current->thread.fsindex;
2936 gs_sel = current->thread.gsindex;
b062b794 2937 fs_base = current->thread.fsbase;
e368b875 2938 vmx->msr_host_kernel_gs_base = current->thread.gsbase;
b062b794 2939 } else {
e368b875
SC
2940 savesegment(fs, fs_sel);
2941 savesegment(gs, gs_sel);
b062b794 2942 fs_base = read_msr(MSR_FS_BASE);
e368b875 2943 vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
33ed6329 2944 }
b2da15ac 2945
4679b61f 2946 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
4fde8d57 2947#else
e368b875
SC
2948 savesegment(fs, fs_sel);
2949 savesegment(gs, gs_sel);
2950 fs_base = segment_base(fs_sel);
2951 gs_base = segment_base(gs_sel);
707c0874 2952#endif
e368b875 2953
8f21a0bb
SC
2954 if (unlikely(fs_sel != host_state->fs_sel)) {
2955 if (!(fs_sel & 7))
2956 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
2957 else
2958 vmcs_write16(HOST_FS_SELECTOR, 0);
2959 host_state->fs_sel = fs_sel;
2960 }
2961 if (unlikely(gs_sel != host_state->gs_sel)) {
2962 if (!(gs_sel & 7))
2963 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
2964 else
2965 vmcs_write16(HOST_GS_SELECTOR, 0);
2966 host_state->gs_sel = gs_sel;
2967 }
5e079c7e
SC
2968 if (unlikely(fs_base != host_state->fs_base)) {
2969 vmcs_writel(HOST_FS_BASE, fs_base);
2970 host_state->fs_base = fs_base;
2971 }
2972 if (unlikely(gs_base != host_state->gs_base)) {
2973 vmcs_writel(HOST_GS_BASE, gs_base);
2974 host_state->gs_base = gs_base;
2975 }
33ed6329
AK
2976}
2977
6d6095bd 2978static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
33ed6329 2979{
d7ee039e
SC
2980 struct vmcs_host_state *host_state;
2981
bd9966de 2982 if (!vmx->loaded_cpu_state)
33ed6329
AK
2983 return;
2984
bd9966de 2985 WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs);
d7ee039e 2986 host_state = &vmx->loaded_cpu_state->host_state;
bd9966de 2987
e1beb1d3 2988 ++vmx->vcpu.stat.host_state_reload;
bd9966de
SC
2989 vmx->loaded_cpu_state = NULL;
2990
c8770e7b 2991#ifdef CONFIG_X86_64
4679b61f 2992 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
c8770e7b 2993#endif
d7ee039e
SC
2994 if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
2995 kvm_load_ldt(host_state->ldt_sel);
33ed6329 2996#ifdef CONFIG_X86_64
d7ee039e 2997 load_gs_index(host_state->gs_sel);
9581d442 2998#else
d7ee039e 2999 loadsegment(gs, host_state->gs_sel);
33ed6329 3000#endif
33ed6329 3001 }
d7ee039e
SC
3002 if (host_state->fs_sel & 7)
3003 loadsegment(fs, host_state->fs_sel);
b2da15ac 3004#ifdef CONFIG_X86_64
d7ee039e
SC
3005 if (unlikely(host_state->ds_sel | host_state->es_sel)) {
3006 loadsegment(ds, host_state->ds_sel);
3007 loadsegment(es, host_state->es_sel);
b2da15ac 3008 }
b2da15ac 3009#endif
b7ffc44d 3010 invalidate_tss_limit();
44ea2b17 3011#ifdef CONFIG_X86_64
c8770e7b 3012 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
44ea2b17 3013#endif
45fc8757 3014 load_fixmap_gdt(raw_smp_processor_id());
33ed6329
AK
3015}
3016
678e315e
SC
3017#ifdef CONFIG_X86_64
3018static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
a9b21b62 3019{
4679b61f
PB
3020 preempt_disable();
3021 if (vmx->loaded_cpu_state)
3022 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
3023 preempt_enable();
678e315e 3024 return vmx->msr_guest_kernel_gs_base;
a9b21b62
AK
3025}
3026
678e315e
SC
3027static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
3028{
4679b61f
PB
3029 preempt_disable();
3030 if (vmx->loaded_cpu_state)
3031 wrmsrl(MSR_KERNEL_GS_BASE, data);
3032 preempt_enable();
678e315e
SC
3033 vmx->msr_guest_kernel_gs_base = data;
3034}
3035#endif
3036
28b835d6
FW
3037static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
3038{
3039 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
3040 struct pi_desc old, new;
3041 unsigned int dest;
3042
31afb2ea
PB
3043 /*
3044 * In case of hot-plug or hot-unplug, we may have to undo
3045 * vmx_vcpu_pi_put even if there is no assigned device. And we
3046 * always keep PI.NDST up to date for simplicity: it makes the
3047 * code easier, and CPU migration is not a fast path.
3048 */
3049 if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
28b835d6
FW
3050 return;
3051
31afb2ea
PB
3052 /*
3053 * First handle the simple case where no cmpxchg is necessary; just
3054 * allow posting non-urgent interrupts.
3055 *
3056 * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
3057 * PI.NDST: pi_post_block will do it for us and the wakeup_handler
3058 * expects the VCPU to be on the blocked_vcpu_list that matches
3059 * PI.NDST.
3060 */
3061 if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
3062 vcpu->cpu == cpu) {
3063 pi_clear_sn(pi_desc);
28b835d6 3064 return;
31afb2ea 3065 }
28b835d6 3066
31afb2ea 3067 /* The full case. */
28b835d6
FW
3068 do {
3069 old.control = new.control = pi_desc->control;
3070
31afb2ea 3071 dest = cpu_physical_id(cpu);
28b835d6 3072
31afb2ea
PB
3073 if (x2apic_enabled())
3074 new.ndst = dest;
3075 else
3076 new.ndst = (dest << 8) & 0xFF00;
28b835d6 3077
28b835d6 3078 new.sn = 0;
c0a1666b
PB
3079 } while (cmpxchg64(&pi_desc->control, old.control,
3080 new.control) != old.control);
28b835d6 3081}
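
/*
 * Exposition-only note: the posted-interrupt notification destination
 * (PI.NDST) is encoded as the bare APIC ID in x2APIC mode but shifted
 * into bits 15:8 in xAPIC mode, which is what the cmpxchg loop above
 * writes.  A helper doing just that encoding might look like this
 * (hypothetical, for illustration):
 */
#if 0	/* example only, not compiled */
static u32 example_pi_ndst(unsigned int dest_apic_id)
{
	if (x2apic_enabled())
		return dest_apic_id;
	return (dest_apic_id << 8) & 0xFF00;
}
#endif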
1be0e61c 3082
c95ba92a
PF
3083static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
3084{
3085 vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
3086 vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
3087}
3088
6aa8b732
AK
3089/*
3090 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
3091 * vcpu mutex is already taken.
3092 */
15ad7146 3093static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
6aa8b732 3094{
a2fa3e9f 3095 struct vcpu_vmx *vmx = to_vmx(vcpu);
b80c76ec 3096 bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
6aa8b732 3097
b80c76ec 3098 if (!already_loaded) {
fe0e80be 3099 loaded_vmcs_clear(vmx->loaded_vmcs);
92fe13be 3100 local_irq_disable();
8f536b76 3101 crash_disable_local_vmclear(cpu);
5a560f8b
XG
3102
3103 /*
 3104 * The read of loaded_vmcs->cpu must happen before fetching
 3105 * loaded_vmcs->loaded_vmcss_on_cpu_link.
3106 * See the comments in __loaded_vmcs_clear().
3107 */
3108 smp_rmb();
3109
d462b819
NHE
3110 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
3111 &per_cpu(loaded_vmcss_on_cpu, cpu));
8f536b76 3112 crash_enable_local_vmclear(cpu);
92fe13be 3113 local_irq_enable();
b80c76ec
JM
3114 }
3115
3116 if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
3117 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
3118 vmcs_load(vmx->loaded_vmcs->vmcs);
15d45071 3119 indirect_branch_prediction_barrier();
b80c76ec
JM
3120 }
3121
3122 if (!already_loaded) {
59c58ceb 3123 void *gdt = get_current_gdt_ro();
b80c76ec
JM
3124 unsigned long sysenter_esp;
3125
3126 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
92fe13be 3127
6aa8b732
AK
3128 /*
3129 * Linux uses per-cpu TSS and GDT, so set these when switching
e0c23063 3130 * processors. See 22.2.4.
6aa8b732 3131 */
e0c23063 3132 vmcs_writel(HOST_TR_BASE,
72f5e08d 3133 (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
59c58ceb 3134 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
6aa8b732 3135
b7ffc44d
AL
3136 /*
3137 * VM exits change the host TR limit to 0x67 after a VM
3138 * exit. This is okay, since 0x67 covers everything except
 3139 * the IO bitmap, and we have code to handle the IO bitmap
3140 * being lost after a VM exit.
3141 */
3142 BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
3143
6aa8b732
AK
3144 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
3145 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
ff2c3a18 3146
d462b819 3147 vmx->loaded_vmcs->cpu = cpu;
6aa8b732 3148 }
28b835d6 3149
2680d6da
OH
3150 /* Setup TSC multiplier */
3151 if (kvm_has_tsc_control &&
c95ba92a
PF
3152 vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
3153 decache_tsc_multiplier(vmx);
2680d6da 3154
28b835d6 3155 vmx_vcpu_pi_load(vcpu, cpu);
1be0e61c 3156 vmx->host_pkru = read_pkru();
74c55931 3157 vmx->host_debugctlmsr = get_debugctlmsr();
28b835d6
FW
3158}
3159
3160static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
3161{
3162 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
3163
3164 if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
a0052191
YZ
3165 !irq_remapping_cap(IRQ_POSTING_CAP) ||
3166 !kvm_vcpu_apicv_active(vcpu))
28b835d6
FW
3167 return;
3168
3169 /* Set SN when the vCPU is preempted */
3170 if (vcpu->preempted)
3171 pi_set_sn(pi_desc);
6aa8b732
AK
3172}
3173
3174static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
3175{
28b835d6
FW
3176 vmx_vcpu_pi_put(vcpu);
3177
6d6095bd 3178 vmx_prepare_switch_to_host(to_vmx(vcpu));
6aa8b732
AK
3179}
3180
f244deed
WL
3181static bool emulation_required(struct kvm_vcpu *vcpu)
3182{
3183 return emulate_invalid_guest_state && !guest_state_valid(vcpu);
3184}
3185
edcafe3c
AK
3186static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
3187
fe3ef05c
NHE
3188/*
3189 * Return the cr0 value that a nested guest would read. This is a combination
3190 * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
3191 * its hypervisor (cr0_read_shadow).
3192 */
3193static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
3194{
3195 return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
3196 (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
3197}
3198static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
3199{
3200 return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
3201 (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
3202}
3203
6aa8b732
AK
3204static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
3205{
78ac8b47 3206 unsigned long rflags, save_rflags;
345dcaa8 3207
6de12732
AK
3208 if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
3209 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
3210 rflags = vmcs_readl(GUEST_RFLAGS);
3211 if (to_vmx(vcpu)->rmode.vm86_active) {
3212 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
3213 save_rflags = to_vmx(vcpu)->rmode.save_rflags;
3214 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
3215 }
3216 to_vmx(vcpu)->rflags = rflags;
78ac8b47 3217 }
6de12732 3218 return to_vmx(vcpu)->rflags;
6aa8b732
AK
3219}
3220
3221static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
3222{
f244deed
WL
3223 unsigned long old_rflags = vmx_get_rflags(vcpu);
3224
6de12732
AK
3225 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
3226 to_vmx(vcpu)->rflags = rflags;
78ac8b47
AK
3227 if (to_vmx(vcpu)->rmode.vm86_active) {
3228 to_vmx(vcpu)->rmode.save_rflags = rflags;
053de044 3229 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
78ac8b47 3230 }
6aa8b732 3231 vmcs_writel(GUEST_RFLAGS, rflags);
f244deed
WL
3232
3233 if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM)
3234 to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
6aa8b732
AK
3235}
3236
37ccdcbe 3237static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
2809f5d2
GC
3238{
3239 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3240 int ret = 0;
3241
3242 if (interruptibility & GUEST_INTR_STATE_STI)
48005f64 3243 ret |= KVM_X86_SHADOW_INT_STI;
2809f5d2 3244 if (interruptibility & GUEST_INTR_STATE_MOV_SS)
48005f64 3245 ret |= KVM_X86_SHADOW_INT_MOV_SS;
2809f5d2 3246
37ccdcbe 3247 return ret;
2809f5d2
GC
3248}
3249
3250static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
3251{
3252 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3253 u32 interruptibility = interruptibility_old;
3254
3255 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
3256
48005f64 3257 if (mask & KVM_X86_SHADOW_INT_MOV_SS)
2809f5d2 3258 interruptibility |= GUEST_INTR_STATE_MOV_SS;
48005f64 3259 else if (mask & KVM_X86_SHADOW_INT_STI)
2809f5d2
GC
3260 interruptibility |= GUEST_INTR_STATE_STI;
3261
3262 if ((interruptibility != interruptibility_old))
3263 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
3264}
3265
6aa8b732
AK
3266static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
3267{
3268 unsigned long rip;
6aa8b732 3269
5fdbf976 3270 rip = kvm_rip_read(vcpu);
6aa8b732 3271 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
5fdbf976 3272 kvm_rip_write(vcpu, rip);
6aa8b732 3273
2809f5d2
GC
3274 /* skipping an emulated instruction also counts */
3275 vmx_set_interrupt_shadow(vcpu, 0);
6aa8b732
AK
3276}
3277
b96fb439
PB
3278static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
3279 unsigned long exit_qual)
3280{
3281 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3282 unsigned int nr = vcpu->arch.exception.nr;
3283 u32 intr_info = nr | INTR_INFO_VALID_MASK;
3284
3285 if (vcpu->arch.exception.has_error_code) {
3286 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
3287 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3288 }
3289
3290 if (kvm_exception_is_soft(nr))
3291 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3292 else
3293 intr_info |= INTR_TYPE_HARD_EXCEPTION;
3294
3295 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
3296 vmx_get_nmi_mask(vcpu))
3297 intr_info |= INTR_INFO_UNBLOCK_NMI;
3298
3299 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
3300}
3301
0b6ac343
NHE
3302/*
3303 * KVM wants to inject page-faults which it received, into the guest. This function
3304 * checks whether, for a nested guest, they need to be injected into L1 or L2.
0b6ac343 3305 */
bfcf83b1 3306static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
0b6ac343
NHE
3307{
3308 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
adfe20fb 3309 unsigned int nr = vcpu->arch.exception.nr;
da998b46
JM
3310 bool has_payload = vcpu->arch.exception.has_payload;
3311 unsigned long payload = vcpu->arch.exception.payload;
0b6ac343 3312
b96fb439
PB
3313 if (nr == PF_VECTOR) {
3314 if (vcpu->arch.exception.nested_apf) {
bfcf83b1 3315 *exit_qual = vcpu->arch.apf.nested_apf_token;
b96fb439
PB
3316 return 1;
3317 }
b96fb439
PB
3318 if (nested_vmx_is_page_fault_vmexit(vmcs12,
3319 vcpu->arch.exception.error_code)) {
da998b46 3320 *exit_qual = has_payload ? payload : vcpu->arch.cr2;
b96fb439
PB
3321 return 1;
3322 }
f10c729f
JM
3323 } else if (vmcs12->exception_bitmap & (1u << nr)) {
3324 if (nr == DB_VECTOR) {
3325 if (!has_payload) {
3326 payload = vcpu->arch.dr6;
3327 payload &= ~(DR6_FIXED_1 | DR6_BT);
3328 payload ^= DR6_RTM;
cfb634fe 3329 }
f10c729f
JM
3330 *exit_qual = payload;
3331 } else
3332 *exit_qual = 0;
3333 return 1;
adfe20fb
WL
3334 }
3335
b96fb439 3336 return 0;
0b6ac343
NHE
3337}
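/*
 * Illustrative sketch (not part of vmx.c): the core of the L1-vs-L2 decision
 * above is a bit test against vmcs12's exception bitmap - if L1 asked to
 * intercept vector 'nr', the exception becomes a VM-exit to L1, otherwise it
 * is delivered to L2 (page faults additionally honor the PFEC match fields).
 * Names and values here are made up for the demonstration.
 */
#include <stdint.h>
#include <stdio.h>

static int reflects_to_l1(uint32_t exception_bitmap, unsigned int nr)
{
	return (exception_bitmap >> nr) & 1;
}

int main(void)
{
	uint32_t bitmap = (1u << 6) | (1u << 14);	/* L1 intercepts #UD and #PF */

	printf("#UD (6)  -> %s\n", reflects_to_l1(bitmap, 6)  ? "L1" : "L2");
	printf("#DB (1)  -> %s\n", reflects_to_l1(bitmap, 1)  ? "L1" : "L2");
	printf("#PF (14) -> %s\n", reflects_to_l1(bitmap, 14) ? "L1" : "L2");
	return 0;
}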
3338
caa057a2
WL
3339static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
3340{
3341 /*
3342 * Ensure that we clear the HLT state in the VMCS. We don't need to
3343 * explicitly skip the instruction because if the HLT state is set,
3344 * then the instruction is already executing and RIP has already been
3345 * advanced.
3346 */
3347 if (kvm_hlt_in_guest(vcpu->kvm) &&
3348 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
3349 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
3350}
3351
cfcd20e5 3352static void vmx_queue_exception(struct kvm_vcpu *vcpu)
298101da 3353{
77ab6db0 3354 struct vcpu_vmx *vmx = to_vmx(vcpu);
cfcd20e5
WL
3355 unsigned nr = vcpu->arch.exception.nr;
3356 bool has_error_code = vcpu->arch.exception.has_error_code;
cfcd20e5 3357 u32 error_code = vcpu->arch.exception.error_code;
8ab2d2e2 3358 u32 intr_info = nr | INTR_INFO_VALID_MASK;
77ab6db0 3359
da998b46
JM
3360 kvm_deliver_exception_payload(vcpu);
3361
8ab2d2e2 3362 if (has_error_code) {
77ab6db0 3363 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
8ab2d2e2
JK
3364 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3365 }
77ab6db0 3366
7ffd92c5 3367 if (vmx->rmode.vm86_active) {
71f9833b
SH
3368 int inc_eip = 0;
3369 if (kvm_exception_is_soft(nr))
3370 inc_eip = vcpu->arch.event_exit_inst_len;
3371 if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
a92601bb 3372 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
77ab6db0
JK
3373 return;
3374 }
3375
add5ff7a
SC
3376 WARN_ON_ONCE(vmx->emulation_required);
3377
66fd3f7f
GN
3378 if (kvm_exception_is_soft(nr)) {
3379 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
3380 vmx->vcpu.arch.event_exit_inst_len);
8ab2d2e2
JK
3381 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3382 } else
3383 intr_info |= INTR_TYPE_HARD_EXCEPTION;
3384
3385 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
caa057a2
WL
3386
3387 vmx_clear_hlt(vcpu);
298101da
AK
3388}
3389
4e47c7a6
SY
3390static bool vmx_rdtscp_supported(void)
3391{
3392 return cpu_has_vmx_rdtscp();
3393}
3394
ad756a16
MJ
3395static bool vmx_invpcid_supported(void)
3396{
eb4b248e 3397 return cpu_has_vmx_invpcid();
ad756a16
MJ
3398}
3399
a75beee6
ED
3400/*
3401 * Swap MSR entry in host/guest MSR entry array.
3402 */
8b9cf98c 3403static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
a75beee6 3404{
26bb0981 3405 struct shared_msr_entry tmp;
a2fa3e9f
GH
3406
3407 tmp = vmx->guest_msrs[to];
3408 vmx->guest_msrs[to] = vmx->guest_msrs[from];
3409 vmx->guest_msrs[from] = tmp;
a75beee6
ED
3410}
3411
e38aea3e
AK
3412/*
3413 * Set up the vmcs to automatically save and restore system
3414 * msrs. Don't touch the 64-bit msrs if the guest is in legacy
3415 * mode, as fiddling with msrs is very expensive.
3416 */
8b9cf98c 3417static void setup_msrs(struct vcpu_vmx *vmx)
e38aea3e 3418{
26bb0981 3419 int save_nmsrs, index;
e38aea3e 3420
a75beee6
ED
3421 save_nmsrs = 0;
3422#ifdef CONFIG_X86_64
8b9cf98c 3423 if (is_long_mode(&vmx->vcpu)) {
8b9cf98c 3424 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
a75beee6 3425 if (index >= 0)
8b9cf98c
RR
3426 move_msr_up(vmx, index, save_nmsrs++);
3427 index = __find_msr_index(vmx, MSR_LSTAR);
a75beee6 3428 if (index >= 0)
8b9cf98c
RR
3429 move_msr_up(vmx, index, save_nmsrs++);
3430 index = __find_msr_index(vmx, MSR_CSTAR);
a75beee6 3431 if (index >= 0)
8b9cf98c 3432 move_msr_up(vmx, index, save_nmsrs++);
4e47c7a6 3433 index = __find_msr_index(vmx, MSR_TSC_AUX);
d6321d49 3434 if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
4e47c7a6 3435 move_msr_up(vmx, index, save_nmsrs++);
a75beee6 3436 /*
8c06585d 3437 * MSR_STAR is only needed on long mode guests, and only
a75beee6
ED
3438 * if efer.sce is enabled.
3439 */
8c06585d 3440 index = __find_msr_index(vmx, MSR_STAR);
f6801dff 3441 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
8b9cf98c 3442 move_msr_up(vmx, index, save_nmsrs++);
a75beee6
ED
3443 }
3444#endif
92c0d900
AK
3445 index = __find_msr_index(vmx, MSR_EFER);
3446 if (index >= 0 && update_transition_efer(vmx, index))
26bb0981 3447 move_msr_up(vmx, index, save_nmsrs++);
e38aea3e 3448
26bb0981 3449 vmx->save_nmsrs = save_nmsrs;
f48b4711 3450 vmx->guest_msrs_dirty = true;
5897297b 3451
8d14695f 3452 if (cpu_has_vmx_msr_bitmap())
904e14fb 3453 vmx_update_msr_bitmap(&vmx->vcpu);
e38aea3e
AK
3454}
3455
e79f245d 3456static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
6aa8b732 3457{
e79f245d 3458 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6aa8b732 3459
e79f245d
KA
3460 if (is_guest_mode(vcpu) &&
3461 (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
3462 return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
3463
3464 return vcpu->arch.tsc_offset;
6aa8b732
AK
3465}
3466
3467/*
99e3e30a 3468 * writes 'offset' into guest's timestamp counter offset register
6aa8b732 3469 */
99e3e30a 3470static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
6aa8b732 3471{
27fc51b2 3472 if (is_guest_mode(vcpu)) {
7991825b 3473 /*
27fc51b2
NHE
3474 * We're here if L1 chose not to trap WRMSR to TSC. According
3475 * to the spec, this should set L1's TSC; the offset that L1
3476 * set for L2 remains unchanged, and still needs to be added
3477 * to the newly set TSC to get L2's TSC.
7991825b 3478 */
27fc51b2 3479 struct vmcs12 *vmcs12;
27fc51b2
NHE
3480 /* recalculate vmcs02.TSC_OFFSET: */
3481 vmcs12 = get_vmcs12(vcpu);
3482 vmcs_write64(TSC_OFFSET, offset +
3483 (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
3484 vmcs12->tsc_offset : 0));
3485 } else {
489223ed
YY
3486 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
3487 vmcs_read64(TSC_OFFSET), offset);
27fc51b2
NHE
3488 vmcs_write64(TSC_OFFSET, offset);
3489 }
6aa8b732
AK
3490}
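/*
 * Illustrative sketch (not part of vmx.c): the arithmetic in the comment
 * above. When L1 writes the TSC while L2 is running, KVM installs the new
 * offset for L1, and L2's view must still include the extra offset L1
 * programmed in vmcs12 (TSC scaling is ignored here; all numbers are
 * arbitrary).
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t host_tsc      = 500000;
	uint64_t l1_offset     = 1000;	/* offset computed for L1        */
	uint64_t vmcs12_offset = 250;	/* extra offset L1 set up for L2 */

	uint64_t vmcs02_offset = l1_offset + vmcs12_offset;

	printf("L1 sees TSC = %llu\n", (unsigned long long)(host_tsc + l1_offset));
	printf("L2 sees TSC = %llu\n", (unsigned long long)(host_tsc + vmcs02_offset));
	return 0;
}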
3491
801d3424
NHE
3492/*
3493 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
3494 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
3495 * all guests if the "nested" module option is off, and can also be disabled
3496 * for a single guest by disabling its VMX cpuid bit.
3497 */
3498static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
3499{
d6321d49 3500 return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
801d3424
NHE
3501}
3502
b87a51ae
NHE
3503/*
3504 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
3505 * returned for the various VMX controls MSRs when nested VMX is enabled.
3506 * The same values should also be used to verify that vmcs12 control fields are
3507 * valid during nested entry from L1 to L2.
3508 * Each of these control msrs has a low and high 32-bit half: A low bit is on
3509 * if the corresponding bit in the (32-bit) control field *must* be on, and a
3510 * bit in the high half is on if the corresponding bit in the control field
3511 * may be on. See also vmx_control_verify().
b87a51ae 3512 */
6677f3da 3513static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
b87a51ae 3514{
1389309c
PB
3515 if (!nested) {
3516 memset(msrs, 0, sizeof(*msrs));
3517 return;
3518 }
3519
b87a51ae
NHE
3520 /*
3521 * Note that as a general rule, the high half of the MSRs (bits in
3522 * the control fields which may be 1) should be initialized by the
3523 * intersection of the underlying hardware's MSR (i.e., features which
3524 * can be supported) and the list of features we want to expose -
3525 * because they are known to be properly supported in our code.
3526 * Also, usually, the low half of the MSRs (bits which must be 1) can
3527 * be set to 0, meaning that L1 may turn off any of these bits. The
3528 * reason is that if one of these bits is necessary, it will appear
3529 * in vmcs01, and prepare_vmcs02, which bitwise-ORs the control
3530 * fields of vmcs01 and vmcs12, will keep it set in vmcs02 - and
7313c698 3531 * nested_vmx_exit_reflected() will not pass related exits to L1.
b87a51ae
NHE
3532 * These rules have exceptions below.
3533 */
3534
3535 /* pin-based controls */
eabeaacc 3536 rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
6677f3da
PB
3537 msrs->pinbased_ctls_low,
3538 msrs->pinbased_ctls_high);
3539 msrs->pinbased_ctls_low |=
b9c237bb 3540 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
6677f3da 3541 msrs->pinbased_ctls_high &=
b9c237bb
WV
3542 PIN_BASED_EXT_INTR_MASK |
3543 PIN_BASED_NMI_EXITING |
1389309c
PB
3544 PIN_BASED_VIRTUAL_NMIS |
3545 (apicv ? PIN_BASED_POSTED_INTR : 0);
6677f3da 3546 msrs->pinbased_ctls_high |=
b9c237bb 3547 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
0238ea91 3548 PIN_BASED_VMX_PREEMPTION_TIMER;
b87a51ae 3549
3dbcd8da 3550 /* exit controls */
c0dfee58 3551 rdmsr(MSR_IA32_VMX_EXIT_CTLS,
6677f3da
PB
3552 msrs->exit_ctls_low,
3553 msrs->exit_ctls_high);
3554 msrs->exit_ctls_low =
b9c237bb 3555 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
e0ba1a6f 3556
6677f3da 3557 msrs->exit_ctls_high &=
b87a51ae 3558#ifdef CONFIG_X86_64
c0dfee58 3559 VM_EXIT_HOST_ADDR_SPACE_SIZE |
b87a51ae 3560#endif
f4124500 3561 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
6677f3da 3562 msrs->exit_ctls_high |=
b9c237bb 3563 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
f4124500 3564 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
e0ba1a6f
BD
3565 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
3566
2996fca0 3567 /* We support free control of debug control saving. */
6677f3da 3568 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
2996fca0 3569
b87a51ae
NHE
3570 /* entry controls */
3571 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
6677f3da
PB
3572 msrs->entry_ctls_low,
3573 msrs->entry_ctls_high);
3574 msrs->entry_ctls_low =
b9c237bb 3575 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
6677f3da 3576 msrs->entry_ctls_high &=
57435349
JK
3577#ifdef CONFIG_X86_64
3578 VM_ENTRY_IA32E_MODE |
3579#endif
3580 VM_ENTRY_LOAD_IA32_PAT;
6677f3da 3581 msrs->entry_ctls_high |=
b9c237bb 3582 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
57435349 3583
2996fca0 3584 /* We support free control of debug control loading. */
6677f3da 3585 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
2996fca0 3586
b87a51ae
NHE
3587 /* cpu-based controls */
3588 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
6677f3da
PB
3589 msrs->procbased_ctls_low,
3590 msrs->procbased_ctls_high);
3591 msrs->procbased_ctls_low =
b9c237bb 3592 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
6677f3da 3593 msrs->procbased_ctls_high &=
a294c9bb
JK
3594 CPU_BASED_VIRTUAL_INTR_PENDING |
3595 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
b87a51ae
NHE
3596 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
3597 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
3598 CPU_BASED_CR3_STORE_EXITING |
3599#ifdef CONFIG_X86_64
3600 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
3601#endif
3602 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
5f3d45e7
MD
3603 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
3604 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
3605 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
3606 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
b87a51ae
NHE
3607 /*
3608 * We can allow some features even when not supported by the
3609 * hardware. For example, L1 can specify an MSR bitmap - and we
3610 * can use it to avoid exits to L1 - even when L0 runs L2
3611 * without MSR bitmaps.
3612 */
6677f3da 3613 msrs->procbased_ctls_high |=
b9c237bb 3614 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
560b7ee1 3615 CPU_BASED_USE_MSR_BITMAPS;
b87a51ae 3616
3dcdf3ec 3617 /* We support free control of CR3 access interception. */
6677f3da 3618 msrs->procbased_ctls_low &=
3dcdf3ec
JK
3619 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
3620
80154d77
PB
3621 /*
3622 * secondary cpu-based controls. Do not include those that
3623 * depend on CPUID bits, they are added later by vmx_cpuid_update.
3624 */
b87a51ae 3625 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
6677f3da
PB
3626 msrs->secondary_ctls_low,
3627 msrs->secondary_ctls_high);
3628 msrs->secondary_ctls_low = 0;
3629 msrs->secondary_ctls_high &=
1b07304c 3630 SECONDARY_EXEC_DESC |
f2b93280 3631 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
82f0dd4b 3632 SECONDARY_EXEC_APIC_REGISTER_VIRT |
608406e2 3633 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
3db13480 3634 SECONDARY_EXEC_WBINVD_EXITING;
2cf7ea9f 3635
32c7acf0
LA
3636 /*
3637 * We can emulate "VMCS shadowing," even if the hardware
3638 * doesn't support it.
3639 */
3640 msrs->secondary_ctls_high |=
3641 SECONDARY_EXEC_SHADOW_VMCS;
c18911a2 3642
afa61f75
NHE
3643 if (enable_ept) {
3644 /* nested EPT: emulate EPT also to L1 */
6677f3da 3645 msrs->secondary_ctls_high |=
0790ec17 3646 SECONDARY_EXEC_ENABLE_EPT;
6677f3da 3647 msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
7db74265 3648 VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
02120c45 3649 if (cpu_has_vmx_ept_execute_only())
6677f3da 3650 msrs->ept_caps |=
02120c45 3651 VMX_EPT_EXECUTE_ONLY_BIT;
6677f3da
PB
3652 msrs->ept_caps &= vmx_capability.ept;
3653 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
7db74265
PB
3654 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
3655 VMX_EPT_1GB_PAGE_BIT;
03efce6f 3656 if (enable_ept_ad_bits) {
6677f3da 3657 msrs->secondary_ctls_high |=
03efce6f 3658 SECONDARY_EXEC_ENABLE_PML;
6677f3da 3659 msrs->ept_caps |= VMX_EPT_AD_BIT;
03efce6f 3660 }
1c13bffd 3661 }
afa61f75 3662
27c42a1b 3663 if (cpu_has_vmx_vmfunc()) {
6677f3da 3664 msrs->secondary_ctls_high |=
27c42a1b 3665 SECONDARY_EXEC_ENABLE_VMFUNC;
41ab9372
BD
3666 /*
3667 * Advertise EPTP switching unconditionally
3668 * since we emulate it
3669 */
575b3a2c 3670 if (enable_ept)
6677f3da 3671 msrs->vmfunc_controls =
575b3a2c 3672 VMX_VMFUNC_EPTP_SWITCHING;
27c42a1b
BD
3673 }
3674
ef697a71
PB
3675 /*
3676 * Old versions of KVM use the single-context version without
3677 * checking for support, so declare that it is supported even
3678 * though it is treated as global context. The alternative is
3679 * not failing the single-context invvpid, and it is worse.
3680 */
63cb6d5f 3681 if (enable_vpid) {
6677f3da 3682 msrs->secondary_ctls_high |=
63cb6d5f 3683 SECONDARY_EXEC_ENABLE_VPID;
6677f3da 3684 msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
bcdde302 3685 VMX_VPID_EXTENT_SUPPORTED_MASK;
1c13bffd 3686 }
99b83ac8 3687
0790ec17 3688 if (enable_unrestricted_guest)
6677f3da 3689 msrs->secondary_ctls_high |=
0790ec17
RK
3690 SECONDARY_EXEC_UNRESTRICTED_GUEST;
3691
2cf7ea9f
PB
3692 if (flexpriority_enabled)
3693 msrs->secondary_ctls_high |=
3694 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
3695
c18911a2 3696 /* miscellaneous data */
b9c237bb 3697 rdmsr(MSR_IA32_VMX_MISC,
6677f3da
PB
3698 msrs->misc_low,
3699 msrs->misc_high);
3700 msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
3701 msrs->misc_low |=
f4160e45 3702 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
b9c237bb 3703 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
f4124500 3704 VMX_MISC_ACTIVITY_HLT;
6677f3da 3705 msrs->misc_high = 0;
62cc6b9d
DM
3706
3707 /*
3708 * This MSR reports some information about VMX support. We
3709 * should return information about the VMX we emulate for the
3710 * guest, and the VMCS structure we give it - not about the
3711 * VMX support of the underlying hardware.
3712 */
6677f3da 3713 msrs->basic =
62cc6b9d
DM
3714 VMCS12_REVISION |
3715 VMX_BASIC_TRUE_CTLS |
3716 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
3717 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
3718
3719 if (cpu_has_vmx_basic_inout())
6677f3da 3720 msrs->basic |= VMX_BASIC_INOUT;
62cc6b9d
DM
3721
3722 /*
8322ebbb 3723 * These MSRs specify bits which the guest must keep fixed on
62cc6b9d
DM
3724 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
3725 * We picked the standard core2 setting.
3726 */
3727#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
3728#define VMXON_CR4_ALWAYSON X86_CR4_VMXE
6677f3da
PB
3729 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
3730 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
8322ebbb
DM
3731
3732 /* These MSRs specify bits which the guest must keep fixed off. */
6677f3da
PB
3733 rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
3734 rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
62cc6b9d
DM
3735
3736 /* highest index: VMX_PREEMPTION_TIMER_VALUE */
6677f3da 3737 msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
b87a51ae
NHE
3738}
3739
3899152c
DM
3740/*
3741 * if fixed0[i] == 1: val[i] must be 1
3742 * if fixed1[i] == 0: val[i] must be 0
3743 */
3744static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1)
3745{
3746 return ((val & fixed1) | fixed0) == val;
b87a51ae
NHE
3747}
3748
3749static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
3750{
3899152c 3751 return fixed_bits_valid(control, low, high);
b87a51ae
NHE
3752}
3753
3754static inline u64 vmx_control_msr(u32 low, u32 high)
3755{
3756 return low | ((u64)high << 32);
3757}
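/*
 * Illustrative sketch (not part of vmx.c): the low/high encoding described
 * in the comment above nested_vmx_setup_ctls_msrs(), and the check that
 * vmx_control_verify() performs on it. The low 32 bits are the bits a
 * control field must have set, the high 32 bits the bits it may have set.
 * All constants are invented.
 */
#include <stdint.h>
#include <stdio.h>

/* same formula as fixed_bits_valid(): low = must-be-1, high = may-be-1 */
static int control_valid(uint32_t control, uint32_t low, uint32_t high)
{
	return ((control & high) | low) == control;
}

int main(void)
{
	uint32_t low  = 0x00000016;	/* must-be-1 bits */
	uint32_t high = 0x000000ff;	/* may-be-1 bits  */
	uint64_t msr  = ((uint64_t)high << 32) | low;	/* vmx_control_msr() packing */

	printf("packed MSR   : %#llx\n", (unsigned long long)msr);
	printf("0x17 valid?  : %d\n", control_valid(0x017, low, high));	/* 1 */
	printf("0x01 valid?  : %d\n", control_valid(0x001, low, high));	/* 0: must-be-1 bit clear */
	printf("0x117 valid? : %d\n", control_valid(0x117, low, high));	/* 0: disallowed bit set  */
	return 0;
}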
3758
62cc6b9d
DM
3759static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
3760{
3761 superset &= mask;
3762 subset &= mask;
3763
3764 return (superset | subset) == superset;
3765}
3766
3767static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
3768{
3769 const u64 feature_and_reserved =
3770 /* feature (except bit 48; see below) */
3771 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
3772 /* reserved */
3773 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
6677f3da 3774 u64 vmx_basic = vmx->nested.msrs.basic;
62cc6b9d
DM
3775
3776 if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
3777 return -EINVAL;
3778
3779 /*
3780 * KVM does not emulate a version of VMX that constrains physical
3781 * addresses of VMX structures (e.g. VMCS) to 32-bits.
3782 */
3783 if (data & BIT_ULL(48))
3784 return -EINVAL;
3785
3786 if (vmx_basic_vmcs_revision_id(vmx_basic) !=
3787 vmx_basic_vmcs_revision_id(data))
3788 return -EINVAL;
3789
3790 if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
3791 return -EINVAL;
3792
6677f3da 3793 vmx->nested.msrs.basic = data;
62cc6b9d
DM
3794 return 0;
3795}
3796
3797static int
3798vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
3799{
3800 u64 supported;
3801 u32 *lowp, *highp;
3802
3803 switch (msr_index) {
3804 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
6677f3da
PB
3805 lowp = &vmx->nested.msrs.pinbased_ctls_low;
3806 highp = &vmx->nested.msrs.pinbased_ctls_high;
62cc6b9d
DM
3807 break;
3808 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
6677f3da
PB
3809 lowp = &vmx->nested.msrs.procbased_ctls_low;
3810 highp = &vmx->nested.msrs.procbased_ctls_high;
62cc6b9d
DM
3811 break;
3812 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
6677f3da
PB
3813 lowp = &vmx->nested.msrs.exit_ctls_low;
3814 highp = &vmx->nested.msrs.exit_ctls_high;
62cc6b9d
DM
3815 break;
3816 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
6677f3da
PB
3817 lowp = &vmx->nested.msrs.entry_ctls_low;
3818 highp = &vmx->nested.msrs.entry_ctls_high;
62cc6b9d
DM
3819 break;
3820 case MSR_IA32_VMX_PROCBASED_CTLS2:
6677f3da
PB
3821 lowp = &vmx->nested.msrs.secondary_ctls_low;
3822 highp = &vmx->nested.msrs.secondary_ctls_high;
62cc6b9d
DM
3823 break;
3824 default:
3825 BUG();
3826 }
3827
3828 supported = vmx_control_msr(*lowp, *highp);
3829
3830 /* Check must-be-1 bits are still 1. */
3831 if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
3832 return -EINVAL;
3833
3834 /* Check must-be-0 bits are still 0. */
3835 if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
3836 return -EINVAL;
3837
3838 *lowp = data;
3839 *highp = data >> 32;
3840 return 0;
3841}
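/*
 * Illustrative sketch (not part of vmx.c): what the two is_bitwise_subset()
 * checks in vmx_restore_control_msr() enforce. Userspace may only make a
 * "true" control MSR more restrictive: it must keep every must-be-1 bit
 * (low half) and may not advertise an allowed-1 bit (high half) that the
 * supported value lacks. Values are invented.
 */
#include <stdint.h>
#include <stdio.h>

static int is_bitwise_subset(uint64_t superset, uint64_t subset, uint64_t mask)
{
	superset &= mask;
	subset &= mask;
	return (superset | subset) == superset;
}

static int restore_ok(uint64_t supported, uint64_t data)
{
	/* must-be-1 bits (low half) are all still 1 ... */
	if (!is_bitwise_subset(data, supported, 0xffffffffull))
		return 0;
	/* ... and no new may-be-1 bits (high half) appeared */
	if (!is_bitwise_subset(supported, data, 0xffffffff00000000ull))
		return 0;
	return 1;
}

int main(void)
{
	uint64_t supported = 0x000000ff00000016ull;

	printf("%d\n", restore_ok(supported, 0x0000001f00000016ull)); /* 1: fewer allowed-1 bits */
	printf("%d\n", restore_ok(supported, 0x000000ff00000006ull)); /* 0: dropped a must-be-1 bit */
	printf("%d\n", restore_ok(supported, 0x000001ff00000016ull)); /* 0: unsupported bit advertised */
	return 0;
}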
3842
3843static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
3844{
3845 const u64 feature_and_reserved_bits =
3846 /* feature */
3847 BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
3848 BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
3849 /* reserved */
3850 GENMASK_ULL(13, 9) | BIT_ULL(31);
3851 u64 vmx_misc;
3852
6677f3da
PB
3853 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
3854 vmx->nested.msrs.misc_high);
62cc6b9d
DM
3855
3856 if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
3857 return -EINVAL;
3858
6677f3da 3859 if ((vmx->nested.msrs.pinbased_ctls_high &
62cc6b9d
DM
3860 PIN_BASED_VMX_PREEMPTION_TIMER) &&
3861 vmx_misc_preemption_timer_rate(data) !=
3862 vmx_misc_preemption_timer_rate(vmx_misc))
3863 return -EINVAL;
3864
3865 if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
3866 return -EINVAL;
3867
3868 if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
3869 return -EINVAL;
3870
3871 if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
3872 return -EINVAL;
3873
6677f3da
PB
3874 vmx->nested.msrs.misc_low = data;
3875 vmx->nested.msrs.misc_high = data >> 32;
f4160e45
JM
3876
3877 /*
3878 * If L1 has read-only VM-exit information fields, use the
3879 * less permissive vmx_vmwrite_bitmap to specify write
3880 * permissions for the shadow VMCS.
3881 */
3882 if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
3883 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
3884
62cc6b9d
DM
3885 return 0;
3886}
3887
3888static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
3889{
3890 u64 vmx_ept_vpid_cap;
3891
6677f3da
PB
3892 vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
3893 vmx->nested.msrs.vpid_caps);
62cc6b9d
DM
3894
3895 /* Every bit is either reserved or a feature bit. */
3896 if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
3897 return -EINVAL;
3898
6677f3da
PB
3899 vmx->nested.msrs.ept_caps = data;
3900 vmx->nested.msrs.vpid_caps = data >> 32;
62cc6b9d
DM
3901 return 0;
3902}
3903
3904static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
3905{
3906 u64 *msr;
3907
3908 switch (msr_index) {
3909 case MSR_IA32_VMX_CR0_FIXED0:
6677f3da 3910 msr = &vmx->nested.msrs.cr0_fixed0;
62cc6b9d
DM
3911 break;
3912 case MSR_IA32_VMX_CR4_FIXED0:
6677f3da 3913 msr = &vmx->nested.msrs.cr4_fixed0;
62cc6b9d
DM
3914 break;
3915 default:
3916 BUG();
3917 }
3918
3919 /*
3920 * 1-bits (which indicate bits that "must be 1" during VMX operation)
3921 * must also be 1 in the restored value.
3922 */
3923 if (!is_bitwise_subset(data, *msr, -1ULL))
3924 return -EINVAL;
3925
3926 *msr = data;
3927 return 0;
3928}
3929
3930/*
3931 * Called when userspace is restoring VMX MSRs.
3932 *
3933 * Returns 0 on success, non-0 otherwise.
3934 */
3935static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
b87a51ae 3936{
b9c237bb
WV
3937 struct vcpu_vmx *vmx = to_vmx(vcpu);
3938
a943ac50
JM
3939 /*
3940 * Don't allow changes to the VMX capability MSRs while the vCPU
3941 * is in VMX operation.
3942 */
3943 if (vmx->nested.vmxon)
3944 return -EBUSY;
3945
b87a51ae 3946 switch (msr_index) {
b87a51ae 3947 case MSR_IA32_VMX_BASIC:
62cc6b9d
DM
3948 return vmx_restore_vmx_basic(vmx, data);
3949 case MSR_IA32_VMX_PINBASED_CTLS:
3950 case MSR_IA32_VMX_PROCBASED_CTLS:
3951 case MSR_IA32_VMX_EXIT_CTLS:
3952 case MSR_IA32_VMX_ENTRY_CTLS:
b87a51ae 3953 /*
62cc6b9d
DM
3954 * The "non-true" VMX capability MSRs are generated from the
3955 * "true" MSRs, so we do not support restoring them directly.
3956 *
3957 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
3958 * should restore the "true" MSRs with the must-be-1 bits
3959 * set according to the SDM Vol. 3, Appendix A.2 "RESERVED CONTROLS AND
3960 * DEFAULT SETTINGS".
b87a51ae 3961 */
62cc6b9d
DM
3962 return -EINVAL;
3963 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
3964 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
3965 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
3966 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
3967 case MSR_IA32_VMX_PROCBASED_CTLS2:
3968 return vmx_restore_control_msr(vmx, msr_index, data);
3969 case MSR_IA32_VMX_MISC:
3970 return vmx_restore_vmx_misc(vmx, data);
3971 case MSR_IA32_VMX_CR0_FIXED0:
3972 case MSR_IA32_VMX_CR4_FIXED0:
3973 return vmx_restore_fixed0_msr(vmx, msr_index, data);
3974 case MSR_IA32_VMX_CR0_FIXED1:
3975 case MSR_IA32_VMX_CR4_FIXED1:
3976 /*
3977 * These MSRs are generated based on the vCPU's CPUID, so we
3978 * do not support restoring them directly.
3979 */
3980 return -EINVAL;
3981 case MSR_IA32_VMX_EPT_VPID_CAP:
3982 return vmx_restore_vmx_ept_vpid_cap(vmx, data);
3983 case MSR_IA32_VMX_VMCS_ENUM:
6677f3da 3984 vmx->nested.msrs.vmcs_enum = data;
62cc6b9d
DM
3985 return 0;
3986 default:
b87a51ae 3987 /*
62cc6b9d 3988 * The rest of the VMX capability MSRs do not support restore.
b87a51ae 3989 */
62cc6b9d
DM
3990 return -EINVAL;
3991 }
3992}
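/*
 * Illustrative sketch (not part of vmx.c): the "true"/"non-true" MSR
 * relationship the comment above relies on. The non-true pin-based control
 * MSR is the true one with the default-1 bits forced back on, which is why
 * only the true variants can be restored. The 0x16 constant (bits 1, 2 and
 * 4) stands in for PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR, the SDM's default-1
 * pin-based bits.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t default1 = 0x16;	/* default-1 pin-based control bits */
	uint32_t true_low = 0x00;	/* "true" MSR: no bit must be 1     */

	uint32_t non_true_low = true_low | default1;

	printf("true low     = %#x\n", true_low);
	printf("non-true low = %#x\n", non_true_low);
	return 0;
}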
3993
3994/* Returns 0 on success, non-0 otherwise. */
6677f3da 3995static int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
62cc6b9d 3996{
62cc6b9d
DM
3997 switch (msr_index) {
3998 case MSR_IA32_VMX_BASIC:
6677f3da 3999 *pdata = msrs->basic;
b87a51ae
NHE
4000 break;
4001 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
4002 case MSR_IA32_VMX_PINBASED_CTLS:
b9c237bb 4003 *pdata = vmx_control_msr(
6677f3da
PB
4004 msrs->pinbased_ctls_low,
4005 msrs->pinbased_ctls_high);
0115f9cb
DM
4006 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
4007 *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
b87a51ae
NHE
4008 break;
4009 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
4010 case MSR_IA32_VMX_PROCBASED_CTLS:
b9c237bb 4011 *pdata = vmx_control_msr(
6677f3da
PB
4012 msrs->procbased_ctls_low,
4013 msrs->procbased_ctls_high);
0115f9cb
DM
4014 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
4015 *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
b87a51ae
NHE
4016 break;
4017 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
4018 case MSR_IA32_VMX_EXIT_CTLS:
b9c237bb 4019 *pdata = vmx_control_msr(
6677f3da
PB
4020 msrs->exit_ctls_low,
4021 msrs->exit_ctls_high);
0115f9cb
DM
4022 if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
4023 *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
b87a51ae
NHE
4024 break;
4025 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
4026 case MSR_IA32_VMX_ENTRY_CTLS:
b9c237bb 4027 *pdata = vmx_control_msr(
6677f3da
PB
4028 msrs->entry_ctls_low,
4029 msrs->entry_ctls_high);
0115f9cb
DM
4030 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
4031 *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
b87a51ae
NHE
4032 break;
4033 case MSR_IA32_VMX_MISC:
b9c237bb 4034 *pdata = vmx_control_msr(
6677f3da
PB
4035 msrs->misc_low,
4036 msrs->misc_high);
b87a51ae 4037 break;
b87a51ae 4038 case MSR_IA32_VMX_CR0_FIXED0:
6677f3da 4039 *pdata = msrs->cr0_fixed0;
b87a51ae
NHE
4040 break;
4041 case MSR_IA32_VMX_CR0_FIXED1:
6677f3da 4042 *pdata = msrs->cr0_fixed1;
b87a51ae
NHE
4043 break;
4044 case MSR_IA32_VMX_CR4_FIXED0:
6677f3da 4045 *pdata = msrs->cr4_fixed0;
b87a51ae
NHE
4046 break;
4047 case MSR_IA32_VMX_CR4_FIXED1:
6677f3da 4048 *pdata = msrs->cr4_fixed1;
b87a51ae
NHE
4049 break;
4050 case MSR_IA32_VMX_VMCS_ENUM:
6677f3da 4051 *pdata = msrs->vmcs_enum;
b87a51ae
NHE
4052 break;
4053 case MSR_IA32_VMX_PROCBASED_CTLS2:
b9c237bb 4054 *pdata = vmx_control_msr(
6677f3da
PB
4055 msrs->secondary_ctls_low,
4056 msrs->secondary_ctls_high);
b87a51ae
NHE
4057 break;
4058 case MSR_IA32_VMX_EPT_VPID_CAP:
6677f3da
PB
4059 *pdata = msrs->ept_caps |
4060 ((u64)msrs->vpid_caps << 32);
b87a51ae 4061 break;
27c42a1b 4062 case MSR_IA32_VMX_VMFUNC:
6677f3da 4063 *pdata = msrs->vmfunc_controls;
27c42a1b 4064 break;
b87a51ae 4065 default:
b87a51ae 4066 return 1;
b3897a49
NHE
4067 }
4068
b87a51ae
NHE
4069 return 0;
4070}
4071
37e4c997
HZ
4072static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
4073 uint64_t val)
4074{
4075 uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
4076
4077 return !(val & ~valid_bits);
4078}
4079
801e459a
TL
4080static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
4081{
1389309c
PB
4082 switch (msr->index) {
4083 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
4084 if (!nested)
4085 return 1;
4086 return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
4087 default:
4088 return 1;
4089 }
4090
4091 return 0;
801e459a
TL
4092}
4093
6aa8b732
AK
4094/*
4095 * Reads an msr value (of 'msr_index') into 'pdata'.
4096 * Returns 0 on success, non-0 otherwise.
4097 * Assumes vcpu_load() was already called.
4098 */
609e36d3 4099static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
6aa8b732 4100{
a6cb099a 4101 struct vcpu_vmx *vmx = to_vmx(vcpu);
26bb0981 4102 struct shared_msr_entry *msr;
6aa8b732 4103
609e36d3 4104 switch (msr_info->index) {
05b3e0c2 4105#ifdef CONFIG_X86_64
6aa8b732 4106 case MSR_FS_BASE:
609e36d3 4107 msr_info->data = vmcs_readl(GUEST_FS_BASE);
6aa8b732
AK
4108 break;
4109 case MSR_GS_BASE:
609e36d3 4110 msr_info->data = vmcs_readl(GUEST_GS_BASE);
6aa8b732 4111 break;
44ea2b17 4112 case MSR_KERNEL_GS_BASE:
678e315e 4113 msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
44ea2b17 4114 break;
26bb0981 4115#endif
6aa8b732 4116 case MSR_EFER:
609e36d3 4117 return kvm_get_msr_common(vcpu, msr_info);
d28b387f
KA
4118 case MSR_IA32_SPEC_CTRL:
4119 if (!msr_info->host_initiated &&
d28b387f
KA
4120 !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
4121 return 1;
4122
4123 msr_info->data = to_vmx(vcpu)->spec_ctrl;
4124 break;
28c1c9fa
KA
4125 case MSR_IA32_ARCH_CAPABILITIES:
4126 if (!msr_info->host_initiated &&
4127 !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
4128 return 1;
4129 msr_info->data = to_vmx(vcpu)->arch_capabilities;
4130 break;
6aa8b732 4131 case MSR_IA32_SYSENTER_CS:
609e36d3 4132 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
6aa8b732
AK
4133 break;
4134 case MSR_IA32_SYSENTER_EIP:
609e36d3 4135 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
6aa8b732
AK
4136 break;
4137 case MSR_IA32_SYSENTER_ESP:
609e36d3 4138 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
6aa8b732 4139 break;
0dd376e7 4140 case MSR_IA32_BNDCFGS:
691bd434 4141 if (!kvm_mpx_supported() ||
d6321d49
RK
4142 (!msr_info->host_initiated &&
4143 !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
93c4adc7 4144 return 1;
609e36d3 4145 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
0dd376e7 4146 break;
c45dcc71
AR
4147 case MSR_IA32_MCG_EXT_CTL:
4148 if (!msr_info->host_initiated &&
a6cb099a 4149 !(vmx->msr_ia32_feature_control &
c45dcc71 4150 FEATURE_CONTROL_LMCE))
cae50139 4151 return 1;
c45dcc71
AR
4152 msr_info->data = vcpu->arch.mcg_ext_ctl;
4153 break;
cae50139 4154 case MSR_IA32_FEATURE_CONTROL:
a6cb099a 4155 msr_info->data = vmx->msr_ia32_feature_control;
cae50139
JK
4156 break;
4157 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
4158 if (!nested_vmx_allowed(vcpu))
4159 return 1;
6677f3da
PB
4160 return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
4161 &msr_info->data);
20300099
WL
4162 case MSR_IA32_XSS:
4163 if (!vmx_xsaves_supported())
4164 return 1;
609e36d3 4165 msr_info->data = vcpu->arch.ia32_xss;
20300099 4166 break;
4e47c7a6 4167 case MSR_TSC_AUX:
d6321d49
RK
4168 if (!msr_info->host_initiated &&
4169 !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
4e47c7a6
SY
4170 return 1;
4171 /* Otherwise falls through */
6aa8b732 4172 default:
a6cb099a 4173 msr = find_msr_entry(vmx, msr_info->index);
3bab1f5d 4174 if (msr) {
609e36d3 4175 msr_info->data = msr->data;
3bab1f5d 4176 break;
6aa8b732 4177 }
609e36d3 4178 return kvm_get_msr_common(vcpu, msr_info);
6aa8b732
AK
4179 }
4180
6aa8b732
AK
4181 return 0;
4182}
4183
cae50139
JK
4184static void vmx_leave_nested(struct kvm_vcpu *vcpu);
4185
6aa8b732
AK
4186/*
4187 * Writes the msr value into the appropriate "register".
4188 * Returns 0 on success, non-0 otherwise.
4189 * Assumes vcpu_load() was already called.
4190 */
8fe8ab46 4191static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
6aa8b732 4192{
a2fa3e9f 4193 struct vcpu_vmx *vmx = to_vmx(vcpu);
26bb0981 4194 struct shared_msr_entry *msr;
2cc51560 4195 int ret = 0;
8fe8ab46
WA
4196 u32 msr_index = msr_info->index;
4197 u64 data = msr_info->data;
2cc51560 4198
6aa8b732 4199 switch (msr_index) {
3bab1f5d 4200 case MSR_EFER:
8fe8ab46 4201 ret = kvm_set_msr_common(vcpu, msr_info);
2cc51560 4202 break;
16175a79 4203#ifdef CONFIG_X86_64
6aa8b732 4204 case MSR_FS_BASE:
2fb92db1 4205 vmx_segment_cache_clear(vmx);
6aa8b732
AK
4206 vmcs_writel(GUEST_FS_BASE, data);
4207 break;
4208 case MSR_GS_BASE:
2fb92db1 4209 vmx_segment_cache_clear(vmx);
6aa8b732
AK
4210 vmcs_writel(GUEST_GS_BASE, data);
4211 break;
44ea2b17 4212 case MSR_KERNEL_GS_BASE:
678e315e 4213 vmx_write_guest_kernel_gs_base(vmx, data);
44ea2b17 4214 break;
6aa8b732
AK
4215#endif
4216 case MSR_IA32_SYSENTER_CS:
4217 vmcs_write32(GUEST_SYSENTER_CS, data);
4218 break;
4219 case MSR_IA32_SYSENTER_EIP:
f5b42c33 4220 vmcs_writel(GUEST_SYSENTER_EIP, data);
6aa8b732
AK
4221 break;
4222 case MSR_IA32_SYSENTER_ESP:
f5b42c33 4223 vmcs_writel(GUEST_SYSENTER_ESP, data);
6aa8b732 4224 break;
0dd376e7 4225 case MSR_IA32_BNDCFGS:
691bd434 4226 if (!kvm_mpx_supported() ||
d6321d49
RK
4227 (!msr_info->host_initiated &&
4228 !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
93c4adc7 4229 return 1;
fd8cb433 4230 if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
4531662d 4231 (data & MSR_IA32_BNDCFGS_RSVD))
93c4adc7 4232 return 1;
0dd376e7
LJ
4233 vmcs_write64(GUEST_BNDCFGS, data);
4234 break;
d28b387f
KA
4235 case MSR_IA32_SPEC_CTRL:
4236 if (!msr_info->host_initiated &&
d28b387f
KA
4237 !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
4238 return 1;
4239
4240 /* The STIBP bit doesn't fault even if it's not advertised */
9f65fb29 4241 if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
d28b387f
KA
4242 return 1;
4243
4244 vmx->spec_ctrl = data;
4245
4246 if (!data)
4247 break;
4248
4249 /*
4250 * For non-nested:
4251 * When it's written (to non-zero) for the first time, pass
4252 * it through.
4253 *
4254 * For nested:
4255 * The handling of the MSR bitmap for L2 guests is done in
4256 * nested_vmx_merge_msr_bitmap. We should not touch the
4257 * vmcs02.msr_bitmap here since it gets completely overwritten
4258 * in the merging. We update the vmcs01 here for L1 as well
4259 * since it will end up touching the MSR anyway now.
4260 */
4261 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
4262 MSR_IA32_SPEC_CTRL,
4263 MSR_TYPE_RW);
4264 break;
15d45071
AR
4265 case MSR_IA32_PRED_CMD:
4266 if (!msr_info->host_initiated &&
15d45071
AR
4267 !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
4268 return 1;
4269
4270 if (data & ~PRED_CMD_IBPB)
4271 return 1;
4272
4273 if (!data)
4274 break;
4275
4276 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
4277
4278 /*
4279 * For non-nested:
4280 * When it's written (to non-zero) for the first time, pass
4281 * it through.
4282 *
4283 * For nested:
4284 * The handling of the MSR bitmap for L2 guests is done in
4285 * nested_vmx_merge_msr_bitmap. We should not touch the
4286 * vmcs02.msr_bitmap here since it gets completely overwritten
4287 * in the merging.
4288 */
4289 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
4290 MSR_TYPE_W);
4291 break;
28c1c9fa
KA
4292 case MSR_IA32_ARCH_CAPABILITIES:
4293 if (!msr_info->host_initiated)
4294 return 1;
4295 vmx->arch_capabilities = data;
4296 break;
468d472f
SY
4297 case MSR_IA32_CR_PAT:
4298 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
4566654b
NA
4299 if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
4300 return 1;
468d472f
SY
4301 vmcs_write64(GUEST_IA32_PAT, data);
4302 vcpu->arch.pat = data;
4303 break;
4304 }
8fe8ab46 4305 ret = kvm_set_msr_common(vcpu, msr_info);
4e47c7a6 4306 break;
ba904635
WA
4307 case MSR_IA32_TSC_ADJUST:
4308 ret = kvm_set_msr_common(vcpu, msr_info);
4e47c7a6 4309 break;
c45dcc71
AR
4310 case MSR_IA32_MCG_EXT_CTL:
4311 if ((!msr_info->host_initiated &&
4312 !(to_vmx(vcpu)->msr_ia32_feature_control &
4313 FEATURE_CONTROL_LMCE)) ||
4314 (data & ~MCG_EXT_CTL_LMCE_EN))
4315 return 1;
4316 vcpu->arch.mcg_ext_ctl = data;
4317 break;
cae50139 4318 case MSR_IA32_FEATURE_CONTROL:
37e4c997 4319 if (!vmx_feature_control_msr_valid(vcpu, data) ||
3b84080b 4320 (to_vmx(vcpu)->msr_ia32_feature_control &
cae50139
JK
4321 FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
4322 return 1;
3b84080b 4323 vmx->msr_ia32_feature_control = data;
cae50139
JK
4324 if (msr_info->host_initiated && data == 0)
4325 vmx_leave_nested(vcpu);
4326 break;
4327 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
62cc6b9d
DM
4328 if (!msr_info->host_initiated)
4329 return 1; /* they are read-only */
4330 if (!nested_vmx_allowed(vcpu))
4331 return 1;
4332 return vmx_set_vmx_msr(vcpu, msr_index, data);
20300099
WL
4333 case MSR_IA32_XSS:
4334 if (!vmx_xsaves_supported())
4335 return 1;
4336 /*
4337 * The only supported bit as of Skylake is bit 8, but
4338 * it is not supported in KVM.
4339 */
4340 if (data != 0)
4341 return 1;
4342 vcpu->arch.ia32_xss = data;
4343 if (vcpu->arch.ia32_xss != host_xss)
4344 add_atomic_switch_msr(vmx, MSR_IA32_XSS,
989e3992 4345 vcpu->arch.ia32_xss, host_xss, false);
20300099
WL
4346 else
4347 clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
4348 break;
4e47c7a6 4349 case MSR_TSC_AUX:
d6321d49
RK
4350 if (!msr_info->host_initiated &&
4351 !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
4e47c7a6
SY
4352 return 1;
4353 /* Check reserved bit, higher 32 bits should be zero */
4354 if ((data >> 32) != 0)
4355 return 1;
4356 /* Otherwise falls through */
6aa8b732 4357 default:
8b9cf98c 4358 msr = find_msr_entry(vmx, msr_index);
3bab1f5d 4359 if (msr) {
8b3c3104 4360 u64 old_msr_data = msr->data;
3bab1f5d 4361 msr->data = data;
2225fd56
AK
4362 if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
4363 preempt_disable();
8b3c3104
AH
4364 ret = kvm_set_shared_msr(msr->index, msr->data,
4365 msr->mask);
2225fd56 4366 preempt_enable();
8b3c3104
AH
4367 if (ret)
4368 msr->data = old_msr_data;
2225fd56 4369 }
3bab1f5d 4370 break;
6aa8b732 4371 }
8fe8ab46 4372 ret = kvm_set_msr_common(vcpu, msr_info);
6aa8b732
AK
4373 }
4374
2cc51560 4375 return ret;
6aa8b732
AK
4376}
4377
5fdbf976 4378static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
6aa8b732 4379{
5fdbf976
MT
4380 __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
4381 switch (reg) {
4382 case VCPU_REGS_RSP:
4383 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
4384 break;
4385 case VCPU_REGS_RIP:
4386 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
4387 break;
6de4f3ad
AK
4388 case VCPU_EXREG_PDPTR:
4389 if (enable_ept)
4390 ept_save_pdptrs(vcpu);
4391 break;
5fdbf976
MT
4392 default:
4393 break;
4394 }
6aa8b732
AK
4395}
4396
6aa8b732
AK
4397static __init int cpu_has_kvm_support(void)
4398{
6210e37b 4399 return cpu_has_vmx();
6aa8b732
AK
4400}
4401
4402static __init int vmx_disabled_by_bios(void)
4403{
4404 u64 msr;
4405
4406 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
cafd6659 4407 if (msr & FEATURE_CONTROL_LOCKED) {
23f3e991 4408 /* launched w/ TXT and VMX disabled */
cafd6659
SW
4409 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
4410 && tboot_enabled())
4411 return 1;
23f3e991 4412 /* launched w/o TXT and VMX only enabled w/ TXT */
cafd6659 4413 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
23f3e991 4414 && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
f9335afe
SW
4415 && !tboot_enabled()) {
4416 printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
23f3e991 4417 "activate TXT before enabling KVM\n");
cafd6659 4418 return 1;
f9335afe 4419 }
23f3e991
JC
4420 /* launched w/o TXT and VMX disabled */
4421 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
4422 && !tboot_enabled())
4423 return 1;
cafd6659
SW
4424 }
4425
4426 return 0;
6aa8b732
AK
4427}
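/*
 * Illustrative sketch (not part of vmx.c): a simplified form of the
 * decision table implemented above (the warning printed by the real code
 * is omitted). Once the BIOS locks IA32_FEATURE_CONTROL, VMX is only
 * usable if the matching "enable outside SMX" (or, with tboot, "inside
 * SMX") bit was set before locking. Bit positions follow the SDM: bit 0
 * lock, bit 1 enable-inside-SMX, bit 2 enable-outside-SMX; the macro
 * names are invented for this sketch.
 */
#include <stdint.h>
#include <stdio.h>

#define FC_LOCKED	(1u << 0)
#define FC_IN_SMX	(1u << 1)
#define FC_OUT_SMX	(1u << 2)

static int vmx_disabled(uint64_t msr, int tboot)
{
	if (!(msr & FC_LOCKED))
		return 0;		/* unlocked: KVM can enable and lock it itself */
	if (tboot)
		return !(msr & FC_IN_SMX);
	return !(msr & FC_OUT_SMX);
}

int main(void)
{
	printf("%d\n", vmx_disabled(FC_LOCKED, 0));			/* 1: locked with VMX off     */
	printf("%d\n", vmx_disabled(FC_LOCKED | FC_OUT_SMX, 0));	/* 0: locked with VMX enabled */
	printf("%d\n", vmx_disabled(0, 0));				/* 0: not locked yet          */
	return 0;
}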
4428
7725b894
DX
4429static void kvm_cpu_vmxon(u64 addr)
4430{
fe0e80be 4431 cr4_set_bits(X86_CR4_VMXE);
1c5ac21a
AS
4432 intel_pt_handle_vmx(1);
4433
4b1e5478 4434 asm volatile ("vmxon %0" : : "m"(addr));
7725b894
DX
4435}
4436
13a34e06 4437static int hardware_enable(void)
6aa8b732
AK
4438{
4439 int cpu = raw_smp_processor_id();
4440 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
cafd6659 4441 u64 old, test_bits;
6aa8b732 4442
1e02ce4c 4443 if (cr4_read_shadow() & X86_CR4_VMXE)
10474ae8
AG
4444 return -EBUSY;
4445
773e8a04
VK
4446 /*
4447 * This can happen if we hot-added a CPU but failed to allocate
4448 * VP assist page for it.
4449 */
4450 if (static_branch_unlikely(&enable_evmcs) &&
4451 !hv_get_vp_assist_page(cpu))
4452 return -EFAULT;
4453
d462b819 4454 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
bf9f6ac8
FW
4455 INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
4456 spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
8f536b76
ZY
4457
4458 /*
4459 * Now we can enable the vmclear operation in kdump
4460 * since the loaded_vmcss_on_cpu list on this cpu
4461 * has been initialized.
4462 *
4463 * Though the cpu is not in VMX operation now, there
4464 * is no problem enabling the vmclear operation,
4465 * because the loaded_vmcss_on_cpu list is empty.
4466 */
4467 crash_enable_local_vmclear(cpu);
4468
6aa8b732 4469 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
cafd6659
SW
4470
4471 test_bits = FEATURE_CONTROL_LOCKED;
4472 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
4473 if (tboot_enabled())
4474 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
4475
4476 if ((old & test_bits) != test_bits) {
6aa8b732 4477 /* enable and lock */
cafd6659
SW
4478 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
4479 }
fe0e80be 4480 kvm_cpu_vmxon(phys_addr);
fdf288bf
DH
4481 if (enable_ept)
4482 ept_sync_global();
10474ae8
AG
4483
4484 return 0;
6aa8b732
AK
4485}
4486
d462b819 4487static void vmclear_local_loaded_vmcss(void)
543e4243
AK
4488{
4489 int cpu = raw_smp_processor_id();
d462b819 4490 struct loaded_vmcs *v, *n;
543e4243 4491
d462b819
NHE
4492 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
4493 loaded_vmcss_on_cpu_link)
4494 __loaded_vmcs_clear(v);
543e4243
AK
4495}
4496
710ff4a8
EH
4497
4498/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
4499 * tricks.
4500 */
4501static void kvm_cpu_vmxoff(void)
6aa8b732 4502{
4b1e5478 4503 asm volatile (__ex("vmxoff"));
1c5ac21a
AS
4504
4505 intel_pt_handle_vmx(0);
fe0e80be 4506 cr4_clear_bits(X86_CR4_VMXE);
6aa8b732
AK
4507}
4508
13a34e06 4509static void hardware_disable(void)
710ff4a8 4510{
fe0e80be
DH
4511 vmclear_local_loaded_vmcss();
4512 kvm_cpu_vmxoff();
710ff4a8
EH
4513}
4514
1c3d14fe 4515static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
d77c26fc 4516 u32 msr, u32 *result)
1c3d14fe
YS
4517{
4518 u32 vmx_msr_low, vmx_msr_high;
4519 u32 ctl = ctl_min | ctl_opt;
4520
4521 rdmsr(msr, vmx_msr_low, vmx_msr_high);
4522
4523 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
4524 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
4525
4526 /* Ensure minimum (required) set of control bits are supported. */
4527 if (ctl_min & ~ctl)
002c7f7c 4528 return -EIO;
1c3d14fe
YS
4529
4530 *result = ctl;
4531 return 0;
4532}
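/*
 * Illustrative sketch (not part of vmx.c): what adjust_vmx_controls()
 * computes. The capability MSR's high word clears optional bits the CPU
 * cannot do, the low word forces bits the CPU requires, and the result is
 * rejected if a *required* (min) bit got cleared. The MSR value here is
 * invented.
 */
#include <stdint.h>
#include <stdio.h>

static int adjust(uint32_t min, uint32_t opt,
		  uint32_t msr_low, uint32_t msr_high, uint32_t *result)
{
	uint32_t ctl = min | opt;

	ctl &= msr_high;	/* bit == 0 in high word ==> must be zero */
	ctl |= msr_low;		/* bit == 1 in low word  ==> must be one  */

	if (min & ~ctl)		/* a required bit is unsupported */
		return -1;

	*result = ctl;
	return 0;
}

int main(void)
{
	uint32_t result;
	/* CPU: bits 0-7 may be 1, bits 0-1 must be 1 */
	int err = adjust(0x04 /* min */, 0x30 /* opt */,
			 0x03 /* msr low */, 0xff /* msr high */, &result);

	printf("err=%d result=%#x\n", err, result);	/* err=0 result=0x37 */
	return 0;
}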
4533
110312c8
AK
4534static __init bool allow_1_setting(u32 msr, u32 ctl)
4535{
4536 u32 vmx_msr_low, vmx_msr_high;
4537
4538 rdmsr(msr, vmx_msr_low, vmx_msr_high);
4539 return vmx_msr_high & ctl;
4540}
4541
002c7f7c 4542static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
6aa8b732
AK
4543{
4544 u32 vmx_msr_low, vmx_msr_high;
d56f546d 4545 u32 min, opt, min2, opt2;
1c3d14fe
YS
4546 u32 _pin_based_exec_control = 0;
4547 u32 _cpu_based_exec_control = 0;
f78e0e2e 4548 u32 _cpu_based_2nd_exec_control = 0;
1c3d14fe
YS
4549 u32 _vmexit_control = 0;
4550 u32 _vmentry_control = 0;
4551
1389309c 4552 memset(vmcs_conf, 0, sizeof(*vmcs_conf));
10166744 4553 min = CPU_BASED_HLT_EXITING |
1c3d14fe
YS
4554#ifdef CONFIG_X86_64
4555 CPU_BASED_CR8_LOAD_EXITING |
4556 CPU_BASED_CR8_STORE_EXITING |
4557#endif
d56f546d
SY
4558 CPU_BASED_CR3_LOAD_EXITING |
4559 CPU_BASED_CR3_STORE_EXITING |
8eb73e2d 4560 CPU_BASED_UNCOND_IO_EXITING |
1c3d14fe 4561 CPU_BASED_MOV_DR_EXITING |
a7052897 4562 CPU_BASED_USE_TSC_OFFSETING |
4d5422ce
WL
4563 CPU_BASED_MWAIT_EXITING |
4564 CPU_BASED_MONITOR_EXITING |
fee84b07
AK
4565 CPU_BASED_INVLPG_EXITING |
4566 CPU_BASED_RDPMC_EXITING;
443381a8 4567
f78e0e2e 4568 opt = CPU_BASED_TPR_SHADOW |
25c5f225 4569 CPU_BASED_USE_MSR_BITMAPS |
f78e0e2e 4570 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
1c3d14fe
YS
4571 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
4572 &_cpu_based_exec_control) < 0)
002c7f7c 4573 return -EIO;
6e5d865c
YS
4574#ifdef CONFIG_X86_64
4575 if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
4576 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
4577 ~CPU_BASED_CR8_STORE_EXITING;
4578#endif
f78e0e2e 4579 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
d56f546d
SY
4580 min2 = 0;
4581 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
8d14695f 4582 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2384d2b3 4583 SECONDARY_EXEC_WBINVD_EXITING |
d56f546d 4584 SECONDARY_EXEC_ENABLE_VPID |
3a624e29 4585 SECONDARY_EXEC_ENABLE_EPT |
4b8d54f9 4586 SECONDARY_EXEC_UNRESTRICTED_GUEST |
4e47c7a6 4587 SECONDARY_EXEC_PAUSE_LOOP_EXITING |
0367f205 4588 SECONDARY_EXEC_DESC |
ad756a16 4589 SECONDARY_EXEC_RDTSCP |
83d4c286 4590 SECONDARY_EXEC_ENABLE_INVPCID |
c7c9c56c 4591 SECONDARY_EXEC_APIC_REGISTER_VIRT |
abc4fc58 4592 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
20300099 4593 SECONDARY_EXEC_SHADOW_VMCS |
843e4330 4594 SECONDARY_EXEC_XSAVES |
736fdf72
DH
4595 SECONDARY_EXEC_RDSEED_EXITING |
4596 SECONDARY_EXEC_RDRAND_EXITING |
8b3e34e4 4597 SECONDARY_EXEC_ENABLE_PML |
2a499e49 4598 SECONDARY_EXEC_TSC_SCALING |
0b665d30
SC
4599 SECONDARY_EXEC_ENABLE_VMFUNC |
4600 SECONDARY_EXEC_ENCLS_EXITING;
d56f546d
SY
4601 if (adjust_vmx_controls(min2, opt2,
4602 MSR_IA32_VMX_PROCBASED_CTLS2,
f78e0e2e
SY
4603 &_cpu_based_2nd_exec_control) < 0)
4604 return -EIO;
4605 }
4606#ifndef CONFIG_X86_64
4607 if (!(_cpu_based_2nd_exec_control &
4608 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
4609 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
4610#endif
83d4c286
YZ
4611
4612 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
4613 _cpu_based_2nd_exec_control &= ~(
8d14695f 4614 SECONDARY_EXEC_APIC_REGISTER_VIRT |
c7c9c56c
YZ
4615 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
4616 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
83d4c286 4617
61f1dd90
WL
4618 rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
4619 &vmx_capability.ept, &vmx_capability.vpid);
4620
d56f546d 4621 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
a7052897
MT
4622 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
4623 enabled */
5fff7d27
GN
4624 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
4625 CPU_BASED_CR3_STORE_EXITING |
4626 CPU_BASED_INVLPG_EXITING);
61f1dd90
WL
4627 } else if (vmx_capability.ept) {
4628 vmx_capability.ept = 0;
4629 pr_warn_once("EPT CAP should not exist if not support "
4630 "1-setting enable EPT VM-execution control\n");
4631 }
4632 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
4633 vmx_capability.vpid) {
4634 vmx_capability.vpid = 0;
4635 pr_warn_once("VPID CAP should not exist if not support "
4636 "1-setting enable VPID VM-execution control\n");
d56f546d 4637 }
1c3d14fe 4638
91fa0f8e 4639 min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
1c3d14fe
YS
4640#ifdef CONFIG_X86_64
4641 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
4642#endif
a547c6db 4643 opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
91fa0f8e 4644 VM_EXIT_CLEAR_BNDCFGS;
1c3d14fe
YS
4645 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
4646 &_vmexit_control) < 0)
002c7f7c 4647 return -EIO;
1c3d14fe 4648
8a1b4392
PB
4649 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
4650 opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
4651 PIN_BASED_VMX_PREEMPTION_TIMER;
01e439be
YZ
4652 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
4653 &_pin_based_exec_control) < 0)
4654 return -EIO;
4655
1c17c3e6
PB
4656 if (cpu_has_broken_vmx_preemption_timer())
4657 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
01e439be 4658 if (!(_cpu_based_2nd_exec_control &
91fa0f8e 4659 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
01e439be
YZ
4660 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
4661
c845f9c6 4662 min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
da8999d3 4663 opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
1c3d14fe
YS
4664 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
4665 &_vmentry_control) < 0)
002c7f7c 4666 return -EIO;
6aa8b732 4667
c68876fd 4668 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
1c3d14fe
YS
4669
4670 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
4671 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
002c7f7c 4672 return -EIO;
1c3d14fe
YS
4673
4674#ifdef CONFIG_X86_64
4675 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
4676 if (vmx_msr_high & (1u<<16))
002c7f7c 4677 return -EIO;
1c3d14fe
YS
4678#endif
4679
4680 /* Require Write-Back (WB) memory type for VMCS accesses. */
4681 if (((vmx_msr_high >> 18) & 15) != 6)
002c7f7c 4682 return -EIO;
1c3d14fe 4683
002c7f7c 4684 vmcs_conf->size = vmx_msr_high & 0x1fff;
16cb0255 4685 vmcs_conf->order = get_order(vmcs_conf->size);
9ac7e3e8 4686 vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
773e8a04 4687
2307af1c 4688 vmcs_conf->revision_id = vmx_msr_low;
1c3d14fe 4689
002c7f7c
YS
4690 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
4691 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
f78e0e2e 4692 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
002c7f7c
YS
4693 vmcs_conf->vmexit_ctrl = _vmexit_control;
4694 vmcs_conf->vmentry_ctrl = _vmentry_control;
1c3d14fe 4695
773e8a04
VK
4696 if (static_branch_unlikely(&enable_evmcs))
4697 evmcs_sanitize_exec_ctrls(vmcs_conf);
4698
110312c8
AK
4699 cpu_has_load_ia32_efer =
4700 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
4701 VM_ENTRY_LOAD_IA32_EFER)
4702 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
4703 VM_EXIT_LOAD_IA32_EFER);
4704
8bf00a52
GN
4705 cpu_has_load_perf_global_ctrl =
4706 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
4707 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
4708 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
4709 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
4710
4711 /*
4712 * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL
bb3541f1 4713 * but due to the errata below it can't be used. The workaround is to use the
8bf00a52
GN
4714 * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL.
4715 *
4716 * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
4717 *
4718 * AAK155 (model 26)
4719 * AAP115 (model 30)
4720 * AAT100 (model 37)
4721 * BC86,AAY89,BD102 (model 44)
4722 * BA97 (model 46)
4723 *
4724 */
4725 if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) {
4726 switch (boot_cpu_data.x86_model) {
4727 case 26:
4728 case 30:
4729 case 37:
4730 case 44:
4731 case 46:
4732 cpu_has_load_perf_global_ctrl = false;
4733 printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
4734 "does not work properly. Using workaround\n");
4735 break;
4736 default:
4737 break;
4738 }
4739 }
4740
782511b0 4741 if (boot_cpu_has(X86_FEATURE_XSAVES))
20300099
WL
4742 rdmsrl(MSR_IA32_XSS, host_xss);
4743
1c3d14fe 4744 return 0;
c68876fd 4745}
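/*
 * Illustrative sketch (not part of vmx.c): the MSR_IA32_VMX_BASIC fields
 * the function above checks. With the MSR split into vmx_msr_low/high, the
 * VMCS size lives in bits 44:32 (the low 13 bits of the high word), bit 48
 * flags 32-bit-only VMCS addresses, and bits 53:50 give the required VMCS
 * memory type (6 == write-back). The MSR value below is invented.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t vmx_basic = 0x00da040000000004ull;	/* made-up example value */
	uint32_t low  = (uint32_t)vmx_basic;
	uint32_t high = (uint32_t)(vmx_basic >> 32);

	printf("revision id : %#x\n", low);
	printf("vmcs size   : %u bytes\n", high & 0x1fff);	/* 1024 */
	printf("32-bit only : %u\n", (high >> 16) & 1);		/* 0    */
	printf("memory type : %u (6 == WB)\n", (high >> 18) & 15);	/* 6 */
	return 0;
}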
6aa8b732 4746
491a6038 4747static struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu)
6aa8b732
AK
4748{
4749 int node = cpu_to_node(cpu);
4750 struct page *pages;
4751 struct vmcs *vmcs;
4752
96db800f 4753 pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
6aa8b732
AK
4754 if (!pages)
4755 return NULL;
4756 vmcs = page_address(pages);
1c3d14fe 4757 memset(vmcs, 0, vmcs_config.size);
2307af1c
LA
4758
4759 /* KVM supports Enlightened VMCS v1 only */
4760 if (static_branch_unlikely(&enable_evmcs))
392b2f25 4761 vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
2307af1c 4762 else
392b2f25 4763 vmcs->hdr.revision_id = vmcs_config.revision_id;
2307af1c 4764
491a6038
LA
4765 if (shadow)
4766 vmcs->hdr.shadow_vmcs = 1;
6aa8b732
AK
4767 return vmcs;
4768}
4769
6aa8b732
AK
4770static void free_vmcs(struct vmcs *vmcs)
4771{
1c3d14fe 4772 free_pages((unsigned long)vmcs, vmcs_config.order);
6aa8b732
AK
4773}
4774
d462b819
NHE
4775/*
4776 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
4777 */
4778static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
4779{
4780 if (!loaded_vmcs->vmcs)
4781 return;
4782 loaded_vmcs_clear(loaded_vmcs);
4783 free_vmcs(loaded_vmcs->vmcs);
4784 loaded_vmcs->vmcs = NULL;
904e14fb
PB
4785 if (loaded_vmcs->msr_bitmap)
4786 free_page((unsigned long)loaded_vmcs->msr_bitmap);
355f4fb1 4787 WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
d462b819
NHE
4788}
4789
491a6038 4790static struct vmcs *alloc_vmcs(bool shadow)
f21f165e 4791{
491a6038 4792 return alloc_vmcs_cpu(shadow, raw_smp_processor_id());
f21f165e
PB
4793}
4794
4795static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
4796{
491a6038 4797 loaded_vmcs->vmcs = alloc_vmcs(false);
f21f165e
PB
4798 if (!loaded_vmcs->vmcs)
4799 return -ENOMEM;
4800
4801 loaded_vmcs->shadow_vmcs = NULL;
4802 loaded_vmcs_init(loaded_vmcs);
904e14fb
PB
4803
4804 if (cpu_has_vmx_msr_bitmap()) {
4805 loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
4806 if (!loaded_vmcs->msr_bitmap)
4807 goto out_vmcs;
4808 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
ceef7d10 4809
1f008e11
AB
4810 if (IS_ENABLED(CONFIG_HYPERV) &&
4811 static_branch_unlikely(&enable_evmcs) &&
ceef7d10
VK
4812 (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
4813 struct hv_enlightened_vmcs *evmcs =
4814 (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
4815
4816 evmcs->hv_enlightenments_control.msr_bitmap = 1;
4817 }
904e14fb 4818 }
d7ee039e
SC
4819
4820 memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
4821
f21f165e 4822 return 0;
904e14fb
PB
4823
4824out_vmcs:
4825 free_loaded_vmcs(loaded_vmcs);
4826 return -ENOMEM;
f21f165e
PB
4827}
4828
39959588 4829static void free_kvm_area(void)
6aa8b732
AK
4830{
4831 int cpu;
4832
3230bb47 4833 for_each_possible_cpu(cpu) {
6aa8b732 4834 free_vmcs(per_cpu(vmxarea, cpu));
3230bb47
ZA
4835 per_cpu(vmxarea, cpu) = NULL;
4836 }
6aa8b732
AK
4837}
4838
d37f4267
JM
4839enum vmcs_field_width {
4840 VMCS_FIELD_WIDTH_U16 = 0,
4841 VMCS_FIELD_WIDTH_U64 = 1,
4842 VMCS_FIELD_WIDTH_U32 = 2,
4843 VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3
85fd514e
JM
4844};
4845
d37f4267 4846static inline int vmcs_field_width(unsigned long field)
85fd514e
JM
4847{
4848 if (0x1 & field) /* the *_HIGH fields are all 32 bit */
d37f4267 4849 return VMCS_FIELD_WIDTH_U32;
85fd514e
JM
4850 return (field >> 13) & 0x3 ;
4851}
4852
4853static inline int vmcs_field_readonly(unsigned long field)
4854{
4855 return (((field >> 10) & 0x3) == 1);
4856}
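
/*
 * An illustrative sketch, not part of the original file: the two helpers
 * above decode the architectural VMCS field encoding, where bits 14:13
 * hold the width and bits 11:10 the field type (1 == read-only VM-exit
 * information).  The helper name below is hypothetical and only shows the
 * decode on two well-known fields.
 */
static inline bool vmcs_field_encoding_example(void)
{
	/* GUEST_RIP (0x681e) is a natural-width guest-state field. */
	/* VM_EXIT_REASON (0x4402) is a read-only 32-bit field. */
	return vmcs_field_width(GUEST_RIP) == VMCS_FIELD_WIDTH_NATURAL_WIDTH &&
	       vmcs_field_width(VM_EXIT_REASON) == VMCS_FIELD_WIDTH_U32 &&
	       vmcs_field_readonly(VM_EXIT_REASON);
}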
4857
fe2b201b
BD
4858static void init_vmcs_shadow_fields(void)
4859{
4860 int i, j;
4861
44900ba6
PB
4862 for (i = j = 0; i < max_shadow_read_only_fields; i++) {
4863 u16 field = shadow_read_only_fields[i];
d37f4267 4864 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
44900ba6
PB
4865 (i + 1 == max_shadow_read_only_fields ||
4866 shadow_read_only_fields[i + 1] != field + 1))
4867 pr_err("Missing field from shadow_read_only_field %x\n",
4868 field + 1);
4869
4870 clear_bit(field, vmx_vmread_bitmap);
4871#ifdef CONFIG_X86_64
4872 if (field & 1)
4873 continue;
4874#endif
4875 if (j < i)
4876 shadow_read_only_fields[j] = field;
4877 j++;
4878 }
4879 max_shadow_read_only_fields = j;
fe2b201b
BD
4880
4881 for (i = j = 0; i < max_shadow_read_write_fields; i++) {
44900ba6 4882 u16 field = shadow_read_write_fields[i];
d37f4267 4883 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
44900ba6
PB
4884 (i + 1 == max_shadow_read_write_fields ||
4885 shadow_read_write_fields[i + 1] != field + 1))
4886 pr_err("Missing field from shadow_read_write_field %x\n",
4887 field + 1);
4888
c5d167b2
PB
4889 /*
4890 * PML and the preemption timer can be emulated, but the
4891 * processor cannot vmwrite to fields that don't exist
4892 * on bare metal.
4893 */
44900ba6 4894 switch (field) {
c5d167b2
PB
4895 case GUEST_PML_INDEX:
4896 if (!cpu_has_vmx_pml())
4897 continue;
4898 break;
4899 case VMX_PREEMPTION_TIMER_VALUE:
4900 if (!cpu_has_vmx_preemption_timer())
4901 continue;
4902 break;
4903 case GUEST_INTR_STATUS:
4904 if (!cpu_has_vmx_apicv())
fe2b201b
BD
4905 continue;
4906 break;
4907 default:
4908 break;
4909 }
4910
44900ba6
PB
4911 clear_bit(field, vmx_vmwrite_bitmap);
4912 clear_bit(field, vmx_vmread_bitmap);
4913#ifdef CONFIG_X86_64
4914 if (field & 1)
4915 continue;
4916#endif
fe2b201b 4917 if (j < i)
44900ba6 4918 shadow_read_write_fields[j] = field;
fe2b201b
BD
4919 j++;
4920 }
4921 max_shadow_read_write_fields = j;
fe2b201b
BD
4922}
4923
6aa8b732
AK
4924static __init int alloc_kvm_area(void)
4925{
4926 int cpu;
4927
3230bb47 4928 for_each_possible_cpu(cpu) {
6aa8b732
AK
4929 struct vmcs *vmcs;
4930
491a6038 4931 vmcs = alloc_vmcs_cpu(false, cpu);
6aa8b732
AK
4932 if (!vmcs) {
4933 free_kvm_area();
4934 return -ENOMEM;
4935 }
4936
2307af1c
LA
4937 /*
4938 * When eVMCS is enabled, alloc_vmcs_cpu() sets
4939 * vmcs->revision_id to KVM_EVMCS_VERSION instead of
4940 * revision_id reported by MSR_IA32_VMX_BASIC.
4941 *
4942 * However, even though it is not explicitly documented by the
4943 * TLFS, the VMXArea passed as the VMXON argument should
4944 * still be marked with the revision_id reported by the
4945 * physical CPU.
4946 */
4947 if (static_branch_unlikely(&enable_evmcs))
392b2f25 4948 vmcs->hdr.revision_id = vmcs_config.revision_id;
2307af1c 4949
6aa8b732
AK
4950 per_cpu(vmxarea, cpu) = vmcs;
4951 }
4952 return 0;
4953}
4954
91b0aa2c 4955static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
d99e4152 4956 struct kvm_segment *save)
6aa8b732 4957{
d99e4152
GN
4958 if (!emulate_invalid_guest_state) {
4959 /*
4960 * CS and SS RPL should be equal during guest entry according
4961 * to VMX spec, but in reality it is not always so. Since vcpu
4962 * is in the middle of the transition from real mode to
4963 * protected mode it is safe to assume that RPL 0 is a good
4964 * default value.
4965 */
4966 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
b32a9918
NA
4967 save->selector &= ~SEGMENT_RPL_MASK;
4968 save->dpl = save->selector & SEGMENT_RPL_MASK;
d99e4152 4969 save->s = 1;
6aa8b732 4970 }
d99e4152 4971 vmx_set_segment(vcpu, save, seg);
6aa8b732
AK
4972}
4973
4974static void enter_pmode(struct kvm_vcpu *vcpu)
4975{
4976 unsigned long flags;
a89a8fb9 4977 struct vcpu_vmx *vmx = to_vmx(vcpu);
6aa8b732 4978
d99e4152
GN
4979 /*
4980 * Update the real mode segment cache. It may not be up-to-date if a segment
4981 * register was written while the vcpu was in guest mode.
4982 */
4983 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
4984 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
4985 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
4986 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
4987 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
4988 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
4989
7ffd92c5 4990 vmx->rmode.vm86_active = 0;
6aa8b732 4991
2fb92db1
AK
4992 vmx_segment_cache_clear(vmx);
4993
f5f7b2fe 4994 vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
6aa8b732
AK
4995
4996 flags = vmcs_readl(GUEST_RFLAGS);
78ac8b47
AK
4997 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
4998 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
6aa8b732
AK
4999 vmcs_writel(GUEST_RFLAGS, flags);
5000
66aee91a
RR
5001 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
5002 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
6aa8b732
AK
5003
5004 update_exception_bitmap(vcpu);
5005
91b0aa2c
GN
5006 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
5007 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
5008 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
5009 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
5010 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
5011 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
6aa8b732
AK
5012}
5013
f5f7b2fe 5014static void fix_rmode_seg(int seg, struct kvm_segment *save)
6aa8b732 5015{
772e0318 5016 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
d99e4152
GN
5017 struct kvm_segment var = *save;
5018
5019 var.dpl = 0x3;
5020 if (seg == VCPU_SREG_CS)
5021 var.type = 0x3;
5022
5023 if (!emulate_invalid_guest_state) {
5024 var.selector = var.base >> 4;
5025 var.base = var.base & 0xffff0;
5026 var.limit = 0xffff;
5027 var.g = 0;
5028 var.db = 0;
5029 var.present = 1;
5030 var.s = 1;
5031 var.l = 0;
5032 var.unusable = 0;
5033 var.type = 0x3;
5034 var.avl = 0;
5035 if (save->base & 0xf)
5036 printk_once(KERN_WARNING "kvm: segment base is not "
5037 "paragraph aligned when entering "
5038 "protected mode (seg=%d)", seg);
5039 }
6aa8b732 5040
d99e4152 5041 vmcs_write16(sf->selector, var.selector);
96794e4e 5042 vmcs_writel(sf->base, var.base);
d99e4152
GN
5043 vmcs_write32(sf->limit, var.limit);
5044 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
6aa8b732
AK
5045}
5046
5047static void enter_rmode(struct kvm_vcpu *vcpu)
5048{
5049 unsigned long flags;
a89a8fb9 5050 struct vcpu_vmx *vmx = to_vmx(vcpu);
40bbb9d0 5051 struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
6aa8b732 5052
f5f7b2fe
AK
5053 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
5054 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
5055 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
5056 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
5057 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
c6ad1153
GN
5058 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
5059 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
f5f7b2fe 5060
7ffd92c5 5061 vmx->rmode.vm86_active = 1;
6aa8b732 5062
776e58ea
GN
5063 /*
5064 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
4918c6ca 5065 * vcpu. Warn the user that an update is overdue.
776e58ea 5066 */
40bbb9d0 5067 if (!kvm_vmx->tss_addr)
776e58ea
GN
5068 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR needs to be "
5069 "called before entering vcpu\n");
776e58ea 5070
2fb92db1
AK
5071 vmx_segment_cache_clear(vmx);
5072
40bbb9d0 5073 vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
6aa8b732 5074 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
6aa8b732
AK
5075 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
5076
5077 flags = vmcs_readl(GUEST_RFLAGS);
78ac8b47 5078 vmx->rmode.save_rflags = flags;
6aa8b732 5079
053de044 5080 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
6aa8b732
AK
5081
5082 vmcs_writel(GUEST_RFLAGS, flags);
66aee91a 5083 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
6aa8b732
AK
5084 update_exception_bitmap(vcpu);
5085
d99e4152
GN
5086 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
5087 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
5088 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
5089 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
5090 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
5091 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
b246dd5d 5092
8668a3c4 5093 kvm_mmu_reset_context(vcpu);
6aa8b732
AK
5094}
5095
401d10de
AS
5096static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
5097{
5098 struct vcpu_vmx *vmx = to_vmx(vcpu);
26bb0981
AK
5099 struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
5100
5101 if (!msr)
5102 return;
401d10de 5103
f6801dff 5104 vcpu->arch.efer = efer;
401d10de 5105 if (efer & EFER_LMA) {
2961e876 5106 vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
401d10de
AS
5107 msr->data = efer;
5108 } else {
2961e876 5109 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
401d10de
AS
5110
5111 msr->data = efer & ~EFER_LME;
5112 }
5113 setup_msrs(vmx);
5114}
5115
05b3e0c2 5116#ifdef CONFIG_X86_64
6aa8b732
AK
5117
5118static void enter_lmode(struct kvm_vcpu *vcpu)
5119{
5120 u32 guest_tr_ar;
5121
2fb92db1
AK
5122 vmx_segment_cache_clear(to_vmx(vcpu));
5123
6aa8b732 5124 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
4d283ec9 5125 if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
bd80158a
JK
5126 pr_debug_ratelimited("%s: tss fixup for long mode. \n",
5127 __func__);
6aa8b732 5128 vmcs_write32(GUEST_TR_AR_BYTES,
4d283ec9
AL
5129 (guest_tr_ar & ~VMX_AR_TYPE_MASK)
5130 | VMX_AR_TYPE_BUSY_64_TSS);
6aa8b732 5131 }
da38f438 5132 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
6aa8b732
AK
5133}
5134
5135static void exit_lmode(struct kvm_vcpu *vcpu)
5136{
2961e876 5137 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
da38f438 5138 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
6aa8b732
AK
5139}
5140
5141#endif
5142
c2ba05cc
WL
5143static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid,
5144 bool invalidate_gpa)
2384d2b3 5145{
c2ba05cc 5146 if (enable_ept && (invalidate_gpa || !enable_vpid)) {
44dd3ffa 5147 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
dd180b3e 5148 return;
44dd3ffa
VK
5149 ept_sync_context(construct_eptp(vcpu,
5150 vcpu->arch.mmu->root_hpa));
f0b98c02
JM
5151 } else {
5152 vpid_sync_context(vpid);
dd180b3e 5153 }
2384d2b3
SY
5154}
5155
c2ba05cc 5156static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
dd5f5341 5157{
c2ba05cc 5158 __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa);
dd5f5341
WL
5159}
5160
faff8758
JS
5161static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
5162{
5163 int vpid = to_vmx(vcpu)->vpid;
5164
5165 if (!vpid_sync_vcpu_addr(vpid, addr))
5166 vpid_sync_context(vpid);
5167
5168 /*
5169 * If VPIDs are not supported or enabled, then the above is a no-op.
5170 * But we don't really need a TLB flush in that case anyway, because
5171 * each VM entry/exit includes an implicit flush when VPID is 0.
5172 */
5173}
5174
e8467fda
AK
5175static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
5176{
5177 ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
5178
5179 vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
5180 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
5181}
5182
aff48baa
AK
5183static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
5184{
b4d18517 5185 if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
aff48baa
AK
5186 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
5187 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
5188}
5189
25c4c276 5190static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
399badf3 5191{
fc78f519
AK
5192 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
5193
5194 vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
5195 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
399badf3
AK
5196}
5197
1439442c
SY
5198static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
5199{
d0d538b9
GN
5200 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
5201
6de4f3ad
AK
5202 if (!test_bit(VCPU_EXREG_PDPTR,
5203 (unsigned long *)&vcpu->arch.regs_dirty))
5204 return;
5205
1439442c 5206 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
d0d538b9
GN
5207 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
5208 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
5209 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
5210 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
1439442c
SY
5211 }
5212}
5213
8f5d549f
AK
5214static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
5215{
d0d538b9
GN
5216 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
5217
8f5d549f 5218 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
d0d538b9
GN
5219 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
5220 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
5221 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
5222 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
8f5d549f 5223 }
6de4f3ad
AK
5224
5225 __set_bit(VCPU_EXREG_PDPTR,
5226 (unsigned long *)&vcpu->arch.regs_avail);
5227 __set_bit(VCPU_EXREG_PDPTR,
5228 (unsigned long *)&vcpu->arch.regs_dirty);
8f5d549f
AK
5229}
5230
3899152c
DM
5231static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
5232{
6677f3da
PB
5233 u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
5234 u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
3899152c
DM
5235 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5236
6677f3da 5237 if (to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
3899152c
DM
5238 SECONDARY_EXEC_UNRESTRICTED_GUEST &&
5239 nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
5240 fixed0 &= ~(X86_CR0_PE | X86_CR0_PG);
5241
5242 return fixed_bits_valid(val, fixed0, fixed1);
5243}
5244
5245static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
5246{
6677f3da
PB
5247 u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
5248 u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
3899152c
DM
5249
5250 return fixed_bits_valid(val, fixed0, fixed1);
5251}
5252
5253static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
5254{
6677f3da
PB
5255 u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0;
5256 u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1;
3899152c
DM
5257
5258 return fixed_bits_valid(val, fixed0, fixed1);
5259}
5260
5261/* No difference in the restrictions on guest and host CR4 in VMX operation. */
5262#define nested_guest_cr4_valid nested_cr4_valid
5263#define nested_host_cr4_valid nested_cr4_valid
5264
5e1746d6 5265static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
1439442c
SY
5266
5267static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
5268 unsigned long cr0,
5269 struct kvm_vcpu *vcpu)
5270{
5233dd51
MT
5271 if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
5272 vmx_decache_cr3(vcpu);
1439442c
SY
5273 if (!(cr0 & X86_CR0_PG)) {
5274 /* From paging/starting to nonpaging */
5275 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
65267ea1 5276 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
1439442c
SY
5277 (CPU_BASED_CR3_LOAD_EXITING |
5278 CPU_BASED_CR3_STORE_EXITING));
5279 vcpu->arch.cr0 = cr0;
fc78f519 5280 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
1439442c
SY
5281 } else if (!is_paging(vcpu)) {
5282 /* From nonpaging to paging */
5283 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
65267ea1 5284 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
1439442c
SY
5285 ~(CPU_BASED_CR3_LOAD_EXITING |
5286 CPU_BASED_CR3_STORE_EXITING));
5287 vcpu->arch.cr0 = cr0;
fc78f519 5288 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
1439442c 5289 }
95eb84a7
SY
5290
5291 if (!(cr0 & X86_CR0_WP))
5292 *hw_cr0 &= ~X86_CR0_WP;
1439442c
SY
5293}
5294
6aa8b732
AK
5295static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
5296{
7ffd92c5 5297 struct vcpu_vmx *vmx = to_vmx(vcpu);
3a624e29
NK
5298 unsigned long hw_cr0;
5299
3de6347b 5300 hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
3a624e29 5301 if (enable_unrestricted_guest)
5037878e 5302 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
218e763f 5303 else {
5037878e 5304 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
1439442c 5305
218e763f
GN
5306 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
5307 enter_pmode(vcpu);
6aa8b732 5308
218e763f
GN
5309 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
5310 enter_rmode(vcpu);
5311 }
6aa8b732 5312
05b3e0c2 5313#ifdef CONFIG_X86_64
f6801dff 5314 if (vcpu->arch.efer & EFER_LME) {
707d92fa 5315 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
6aa8b732 5316 enter_lmode(vcpu);
707d92fa 5317 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
6aa8b732
AK
5318 exit_lmode(vcpu);
5319 }
5320#endif
5321
b4d18517 5322 if (enable_ept && !enable_unrestricted_guest)
1439442c
SY
5323 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
5324
6aa8b732 5325 vmcs_writel(CR0_READ_SHADOW, cr0);
1439442c 5326 vmcs_writel(GUEST_CR0, hw_cr0);
ad312c7c 5327 vcpu->arch.cr0 = cr0;
14168786
GN
5328
5329 /* depends on vcpu->arch.cr0 to be set to a new value */
5330 vmx->emulation_required = emulation_required(vcpu);
6aa8b732
AK
5331}
5332
855feb67
YZ
5333static int get_ept_level(struct kvm_vcpu *vcpu)
5334{
5335 if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
5336 return 5;
5337 return 4;
5338}
5339
995f00a6 5340static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
1439442c 5341{
855feb67
YZ
5342 u64 eptp = VMX_EPTP_MT_WB;
5343
5344 eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
1439442c 5345
995f00a6
PF
5346 if (enable_ept_ad_bits &&
5347 (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
bb97a016 5348 eptp |= VMX_EPTP_AD_ENABLE_BIT;
1439442c
SY
5349 eptp |= (root_hpa & PAGE_MASK);
5350
5351 return eptp;
5352}
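
/*
 * An illustrative sketch, not part of the original file: construct_eptp()
 * above packs the EPT pointer as the architecture lays it out -- memory
 * type in bits 2:0, page-walk length minus one in bits 5:3, the A/D enable
 * in bit 6, and the root table address above that.  The helper name below
 * is hypothetical.
 */
static inline u64 eptp_layout_example(void)
{
	/* WB memory type, 4-level walk, A/D bits, root table at 0x12345000. */
	return VMX_EPTP_MT_WB | VMX_EPTP_PWL_4 | VMX_EPTP_AD_ENABLE_BIT |
	       0x12345000ULL;	/* == 0x1234505e */
}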
5353
6aa8b732
AK
5354static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
5355{
877ad952 5356 struct kvm *kvm = vcpu->kvm;
1439442c
SY
5357 unsigned long guest_cr3;
5358 u64 eptp;
5359
5360 guest_cr3 = cr3;
089d034e 5361 if (enable_ept) {
995f00a6 5362 eptp = construct_eptp(vcpu, cr3);
1439442c 5363 vmcs_write64(EPT_POINTER, eptp);
877ad952
TL
5364
5365 if (kvm_x86_ops->tlb_remote_flush) {
5366 spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
5367 to_vmx(vcpu)->ept_pointer = eptp;
5368 to_kvm_vmx(kvm)->ept_pointers_match
5369 = EPT_POINTERS_CHECK;
5370 spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
5371 }
5372
e90008df
SC
5373 if (enable_unrestricted_guest || is_paging(vcpu) ||
5374 is_guest_mode(vcpu))
59ab5a8f
JK
5375 guest_cr3 = kvm_read_cr3(vcpu);
5376 else
877ad952 5377 guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
7c93be44 5378 ept_load_pdptrs(vcpu);
1439442c
SY
5379 }
5380
1439442c 5381 vmcs_writel(GUEST_CR3, guest_cr3);
6aa8b732
AK
5382}
5383
5e1746d6 5384static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
6aa8b732 5385{
085e68ee
BS
5386 /*
5387 * Pass through host's Machine Check Enable value to hw_cr4, which
5388 * is in force while we are in guest mode. Do not let guests control
5389 * this bit, even if host CR4.MCE == 0.
5390 */
5dc1f044
SC
5391 unsigned long hw_cr4;
5392
5393 hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
5394 if (enable_unrestricted_guest)
5395 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
5396 else if (to_vmx(vcpu)->rmode.vm86_active)
5397 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
5398 else
5399 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
1439442c 5400
64f7a115
SC
5401 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
5402 if (cr4 & X86_CR4_UMIP) {
5403 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
0367f205 5404 SECONDARY_EXEC_DESC);
64f7a115
SC
5405 hw_cr4 &= ~X86_CR4_UMIP;
5406 } else if (!is_guest_mode(vcpu) ||
5407 !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC))
5408 vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
5409 SECONDARY_EXEC_DESC);
5410 }
0367f205 5411
5e1746d6
NHE
5412 if (cr4 & X86_CR4_VMXE) {
5413 /*
5414 * To use VMXON (and later other VMX instructions), a guest
5415 * must first be able to turn on cr4.VMXE (see handle_vmon()).
5416 * So basically the check on whether to allow nested VMX
5bea5123
PB
5417 * is here. We operate under the default treatment of SMM,
5418 * so VMX cannot be enabled under SMM.
5e1746d6 5419 */
5bea5123 5420 if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
5e1746d6 5421 return 1;
1a0d74e6 5422 }
3899152c
DM
5423
5424 if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
5e1746d6
NHE
5425 return 1;
5426
ad312c7c 5427 vcpu->arch.cr4 = cr4;
5dc1f044
SC
5428
5429 if (!enable_unrestricted_guest) {
5430 if (enable_ept) {
5431 if (!is_paging(vcpu)) {
5432 hw_cr4 &= ~X86_CR4_PAE;
5433 hw_cr4 |= X86_CR4_PSE;
5434 } else if (!(cr4 & X86_CR4_PAE)) {
5435 hw_cr4 &= ~X86_CR4_PAE;
5436 }
bc23008b 5437 }
1439442c 5438
656ec4a4 5439 /*
ddba2628
HH
5440 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
5441 * hardware. To emulate this behavior, SMEP/SMAP/PKU needs
5442 * to be manually disabled when guest switches to non-paging
5443 * mode.
5444 *
5445 * If !enable_unrestricted_guest, the CPU is always running
5446 * with CR0.PG=1 and CR4 needs to be modified.
5447 * If enable_unrestricted_guest, the CPU automatically
5448 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
656ec4a4 5449 */
5dc1f044
SC
5450 if (!is_paging(vcpu))
5451 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
5452 }
656ec4a4 5453
1439442c
SY
5454 vmcs_writel(CR4_READ_SHADOW, cr4);
5455 vmcs_writel(GUEST_CR4, hw_cr4);
5e1746d6 5456 return 0;
6aa8b732
AK
5457}
5458
6aa8b732
AK
5459static void vmx_get_segment(struct kvm_vcpu *vcpu,
5460 struct kvm_segment *var, int seg)
5461{
a9179499 5462 struct vcpu_vmx *vmx = to_vmx(vcpu);
6aa8b732
AK
5463 u32 ar;
5464
c6ad1153 5465 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
f5f7b2fe 5466 *var = vmx->rmode.segs[seg];
a9179499 5467 if (seg == VCPU_SREG_TR
2fb92db1 5468 || var->selector == vmx_read_guest_seg_selector(vmx, seg))
f5f7b2fe 5469 return;
1390a28b
AK
5470 var->base = vmx_read_guest_seg_base(vmx, seg);
5471 var->selector = vmx_read_guest_seg_selector(vmx, seg);
5472 return;
a9179499 5473 }
2fb92db1
AK
5474 var->base = vmx_read_guest_seg_base(vmx, seg);
5475 var->limit = vmx_read_guest_seg_limit(vmx, seg);
5476 var->selector = vmx_read_guest_seg_selector(vmx, seg);
5477 ar = vmx_read_guest_seg_ar(vmx, seg);
03617c18 5478 var->unusable = (ar >> 16) & 1;
6aa8b732
AK
5479 var->type = ar & 15;
5480 var->s = (ar >> 4) & 1;
5481 var->dpl = (ar >> 5) & 3;
03617c18
GN
5482 /*
5483 * Some userspaces do not preserve the unusable property. Since a usable
5484 * segment has to be present according to the VMX spec, we can use the
5485 * present property to work around this userspace bug by making an
5486 * unusable segment always nonpresent. vmx_segment_access_rights()
5487 * already marks a nonpresent segment as unusable.
5488 */
5489 var->present = !var->unusable;
6aa8b732
AK
5490 var->avl = (ar >> 12) & 1;
5491 var->l = (ar >> 13) & 1;
5492 var->db = (ar >> 14) & 1;
5493 var->g = (ar >> 15) & 1;
6aa8b732
AK
5494}
5495
a9179499
AK
5496static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
5497{
a9179499
AK
5498 struct kvm_segment s;
5499
5500 if (to_vmx(vcpu)->rmode.vm86_active) {
5501 vmx_get_segment(vcpu, &s, seg);
5502 return s.base;
5503 }
2fb92db1 5504 return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
a9179499
AK
5505}
5506
b09408d0 5507static int vmx_get_cpl(struct kvm_vcpu *vcpu)
2e4d2653 5508{
b09408d0
MT
5509 struct vcpu_vmx *vmx = to_vmx(vcpu);
5510
ae9fedc7 5511 if (unlikely(vmx->rmode.vm86_active))
2e4d2653 5512 return 0;
ae9fedc7
PB
5513 else {
5514 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
4d283ec9 5515 return VMX_AR_DPL(ar);
69c73028 5516 }
69c73028
AK
5517}
5518
653e3108 5519static u32 vmx_segment_access_rights(struct kvm_segment *var)
6aa8b732 5520{
6aa8b732
AK
5521 u32 ar;
5522
f0495f9b 5523 if (var->unusable || !var->present)
6aa8b732
AK
5524 ar = 1 << 16;
5525 else {
5526 ar = var->type & 15;
5527 ar |= (var->s & 1) << 4;
5528 ar |= (var->dpl & 3) << 5;
5529 ar |= (var->present & 1) << 7;
5530 ar |= (var->avl & 1) << 12;
5531 ar |= (var->l & 1) << 13;
5532 ar |= (var->db & 1) << 14;
5533 ar |= (var->g & 1) << 15;
5534 }
653e3108
AK
5535
5536 return ar;
5537}
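
/*
 * An illustrative sketch, not part of the original file: the packing above
 * mirrors the VMCS access-rights byte, so a flat ring-0 32-bit code
 * segment (type 0xb, S=1, DPL=0, P=1, D/B=1, G=1) encodes to the familiar
 * value 0xc09b.  The helper name below is hypothetical.
 */
static inline u32 flat_code_segment_ar_example(void)
{
	struct kvm_segment cs = {
		.type = 0xb, .s = 1, .dpl = 0, .present = 1,
		.avl = 0, .l = 0, .db = 1, .g = 1, .unusable = 0,
	};

	return vmx_segment_access_rights(&cs);	/* == 0xc09b */
}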
5538
5539static void vmx_set_segment(struct kvm_vcpu *vcpu,
5540 struct kvm_segment *var, int seg)
5541{
7ffd92c5 5542 struct vcpu_vmx *vmx = to_vmx(vcpu);
772e0318 5543 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
653e3108 5544
2fb92db1
AK
5545 vmx_segment_cache_clear(vmx);
5546
1ecd50a9
GN
5547 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
5548 vmx->rmode.segs[seg] = *var;
5549 if (seg == VCPU_SREG_TR)
5550 vmcs_write16(sf->selector, var->selector);
5551 else if (var->s)
5552 fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
d99e4152 5553 goto out;
653e3108 5554 }
1ecd50a9 5555
653e3108
AK
5556 vmcs_writel(sf->base, var->base);
5557 vmcs_write32(sf->limit, var->limit);
5558 vmcs_write16(sf->selector, var->selector);
3a624e29
NK
5559
5560 /*
5561 * Fix the "Accessed" bit in AR field of segment registers for older
5562 * qemu binaries.
5563 * IA32 arch specifies that at the time of processor reset the
5564 * "Accessed" bit in the AR field of segment registers is 1. And qemu
0fa06071 5565 * is setting it to 0 in the userland code. This causes invalid guest
3a624e29
NK
5566 * state vmexit when "unrestricted guest" mode is turned on.
5567 * Fix for this setup issue in cpu_reset is being pushed in the qemu
5568 * tree. Newer qemu binaries with that qemu fix would not need this
5569 * kvm hack.
5570 */
5571 if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
f924d66d 5572 var->type |= 0x1; /* Accessed */
3a624e29 5573
f924d66d 5574 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
d99e4152
GN
5575
5576out:
98eb2f8b 5577 vmx->emulation_required = emulation_required(vcpu);
6aa8b732
AK
5578}
5579
6aa8b732
AK
5580static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
5581{
2fb92db1 5582 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
6aa8b732
AK
5583
5584 *db = (ar >> 14) & 1;
5585 *l = (ar >> 13) & 1;
5586}
5587
89a27f4d 5588static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 5589{
89a27f4d
GN
5590 dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
5591 dt->address = vmcs_readl(GUEST_IDTR_BASE);
6aa8b732
AK
5592}
5593
89a27f4d 5594static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 5595{
89a27f4d
GN
5596 vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
5597 vmcs_writel(GUEST_IDTR_BASE, dt->address);
6aa8b732
AK
5598}
5599
89a27f4d 5600static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 5601{
89a27f4d
GN
5602 dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
5603 dt->address = vmcs_readl(GUEST_GDTR_BASE);
6aa8b732
AK
5604}
5605
89a27f4d 5606static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 5607{
89a27f4d
GN
5608 vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
5609 vmcs_writel(GUEST_GDTR_BASE, dt->address);
6aa8b732
AK
5610}
5611
648dfaa7
MG
5612static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
5613{
5614 struct kvm_segment var;
5615 u32 ar;
5616
5617 vmx_get_segment(vcpu, &var, seg);
07f42f5f 5618 var.dpl = 0x3;
0647f4aa
GN
5619 if (seg == VCPU_SREG_CS)
5620 var.type = 0x3;
648dfaa7
MG
5621 ar = vmx_segment_access_rights(&var);
5622
5623 if (var.base != (var.selector << 4))
5624 return false;
89efbed0 5625 if (var.limit != 0xffff)
648dfaa7 5626 return false;
07f42f5f 5627 if (ar != 0xf3)
648dfaa7
MG
5628 return false;
5629
5630 return true;
5631}
5632
5633static bool code_segment_valid(struct kvm_vcpu *vcpu)
5634{
5635 struct kvm_segment cs;
5636 unsigned int cs_rpl;
5637
5638 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
b32a9918 5639 cs_rpl = cs.selector & SEGMENT_RPL_MASK;
648dfaa7 5640
1872a3f4
AK
5641 if (cs.unusable)
5642 return false;
4d283ec9 5643 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
648dfaa7
MG
5644 return false;
5645 if (!cs.s)
5646 return false;
4d283ec9 5647 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
648dfaa7
MG
5648 if (cs.dpl > cs_rpl)
5649 return false;
1872a3f4 5650 } else {
648dfaa7
MG
5651 if (cs.dpl != cs_rpl)
5652 return false;
5653 }
5654 if (!cs.present)
5655 return false;
5656
5657 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
5658 return true;
5659}
5660
5661static bool stack_segment_valid(struct kvm_vcpu *vcpu)
5662{
5663 struct kvm_segment ss;
5664 unsigned int ss_rpl;
5665
5666 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
b32a9918 5667 ss_rpl = ss.selector & SEGMENT_RPL_MASK;
648dfaa7 5668
1872a3f4
AK
5669 if (ss.unusable)
5670 return true;
5671 if (ss.type != 3 && ss.type != 7)
648dfaa7
MG
5672 return false;
5673 if (!ss.s)
5674 return false;
5675 if (ss.dpl != ss_rpl) /* DPL != RPL */
5676 return false;
5677 if (!ss.present)
5678 return false;
5679
5680 return true;
5681}
5682
5683static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
5684{
5685 struct kvm_segment var;
5686 unsigned int rpl;
5687
5688 vmx_get_segment(vcpu, &var, seg);
b32a9918 5689 rpl = var.selector & SEGMENT_RPL_MASK;
648dfaa7 5690
1872a3f4
AK
5691 if (var.unusable)
5692 return true;
648dfaa7
MG
5693 if (!var.s)
5694 return false;
5695 if (!var.present)
5696 return false;
4d283ec9 5697 if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
648dfaa7
MG
5698 if (var.dpl < rpl) /* DPL < RPL */
5699 return false;
5700 }
5701
5702 /* TODO: Add other members to kvm_segment_field to allow checking for other access
5703 * rights flags
5704 */
5705 return true;
5706}
5707
5708static bool tr_valid(struct kvm_vcpu *vcpu)
5709{
5710 struct kvm_segment tr;
5711
5712 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
5713
1872a3f4
AK
5714 if (tr.unusable)
5715 return false;
b32a9918 5716 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */
648dfaa7 5717 return false;
1872a3f4 5718 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
648dfaa7
MG
5719 return false;
5720 if (!tr.present)
5721 return false;
5722
5723 return true;
5724}
5725
5726static bool ldtr_valid(struct kvm_vcpu *vcpu)
5727{
5728 struct kvm_segment ldtr;
5729
5730 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
5731
1872a3f4
AK
5732 if (ldtr.unusable)
5733 return true;
b32a9918 5734 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */
648dfaa7
MG
5735 return false;
5736 if (ldtr.type != 2)
5737 return false;
5738 if (!ldtr.present)
5739 return false;
5740
5741 return true;
5742}
5743
5744static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
5745{
5746 struct kvm_segment cs, ss;
5747
5748 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
5749 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
5750
b32a9918
NA
5751 return ((cs.selector & SEGMENT_RPL_MASK) ==
5752 (ss.selector & SEGMENT_RPL_MASK));
648dfaa7
MG
5753}
5754
5755/*
5756 * Check if guest state is valid. Returns true if valid, false if
5757 * not.
5758 * We assume that registers are always usable
5759 */
5760static bool guest_state_valid(struct kvm_vcpu *vcpu)
5761{
c5e97c80
GN
5762 if (enable_unrestricted_guest)
5763 return true;
5764
648dfaa7 5765 /* real mode guest state checks */
f13882d8 5766 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
648dfaa7
MG
5767 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
5768 return false;
5769 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
5770 return false;
5771 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
5772 return false;
5773 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
5774 return false;
5775 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
5776 return false;
5777 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
5778 return false;
5779 } else {
5780 /* protected mode guest state checks */
5781 if (!cs_ss_rpl_check(vcpu))
5782 return false;
5783 if (!code_segment_valid(vcpu))
5784 return false;
5785 if (!stack_segment_valid(vcpu))
5786 return false;
5787 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
5788 return false;
5789 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
5790 return false;
5791 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
5792 return false;
5793 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
5794 return false;
5795 if (!tr_valid(vcpu))
5796 return false;
5797 if (!ldtr_valid(vcpu))
5798 return false;
5799 }
5800 /* TODO:
5801 * - Add checks on RIP
5802 * - Add checks on RFLAGS
5803 */
5804
5805 return true;
5806}
5807
5fa99cbe
JM
5808static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
5809{
5810 return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
5811}
5812
d77c26fc 5813static int init_rmode_tss(struct kvm *kvm)
6aa8b732 5814{
40dcaa9f 5815 gfn_t fn;
195aefde 5816 u16 data = 0;
1f755a82 5817 int idx, r;
6aa8b732 5818
40dcaa9f 5819 idx = srcu_read_lock(&kvm->srcu);
40bbb9d0 5820 fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
195aefde
IE
5821 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
5822 if (r < 0)
10589a46 5823 goto out;
195aefde 5824 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
464d17c8
SY
5825 r = kvm_write_guest_page(kvm, fn++, &data,
5826 TSS_IOPB_BASE_OFFSET, sizeof(u16));
195aefde 5827 if (r < 0)
10589a46 5828 goto out;
195aefde
IE
5829 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
5830 if (r < 0)
10589a46 5831 goto out;
195aefde
IE
5832 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
5833 if (r < 0)
10589a46 5834 goto out;
195aefde 5835 data = ~0;
10589a46
MT
5836 r = kvm_write_guest_page(kvm, fn, &data,
5837 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
5838 sizeof(u8));
10589a46 5839out:
40dcaa9f 5840 srcu_read_unlock(&kvm->srcu, idx);
1f755a82 5841 return r;
6aa8b732
AK
5842}
5843
b7ebfb05
SY
5844static int init_rmode_identity_map(struct kvm *kvm)
5845{
40bbb9d0 5846 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
f51770ed 5847 int i, idx, r = 0;
ba049e93 5848 kvm_pfn_t identity_map_pfn;
b7ebfb05
SY
5849 u32 tmp;
5850
40bbb9d0 5851 /* Protect kvm_vmx->ept_identity_pagetable_done. */
a255d479
TC
5852 mutex_lock(&kvm->slots_lock);
5853
40bbb9d0 5854 if (likely(kvm_vmx->ept_identity_pagetable_done))
a255d479 5855 goto out2;
a255d479 5856
40bbb9d0
SC
5857 if (!kvm_vmx->ept_identity_map_addr)
5858 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
5859 identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;
a255d479 5860
d8a6e365 5861 r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
40bbb9d0 5862 kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
f51770ed 5863 if (r < 0)
a255d479
TC
5864 goto out2;
5865
40dcaa9f 5866 idx = srcu_read_lock(&kvm->srcu);
b7ebfb05
SY
5867 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
5868 if (r < 0)
5869 goto out;
5870 /* Set up identity-mapping pagetable for EPT in real mode */
5871 for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
5872 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
5873 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
5874 r = kvm_write_guest_page(kvm, identity_map_pfn,
5875 &tmp, i * sizeof(tmp), sizeof(tmp));
5876 if (r < 0)
5877 goto out;
5878 }
40bbb9d0 5879 kvm_vmx->ept_identity_pagetable_done = true;
f51770ed 5880
b7ebfb05 5881out:
40dcaa9f 5882 srcu_read_unlock(&kvm->srcu, idx);
a255d479
TC
5883
5884out2:
5885 mutex_unlock(&kvm->slots_lock);
f51770ed 5886 return r;
b7ebfb05
SY
5887}
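
/*
 * An illustrative sketch, not part of the original file: each entry that
 * init_rmode_identity_map() writes above is a 4 MB PSE page-directory
 * entry mapping guest physical address (i << 22) onto itself; the flag
 * bits sum to 0xe7, so entry 1, for example, is 0x004000e7.  The helper
 * name below is hypothetical.
 */
static inline u32 identity_map_pde_example(int i)
{
	return (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
			    _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
}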
5888
6aa8b732
AK
5889static void seg_setup(int seg)
5890{
772e0318 5891 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3a624e29 5892 unsigned int ar;
6aa8b732
AK
5893
5894 vmcs_write16(sf->selector, 0);
5895 vmcs_writel(sf->base, 0);
5896 vmcs_write32(sf->limit, 0xffff);
d54d07b2
GN
5897 ar = 0x93;
5898 if (seg == VCPU_SREG_CS)
5899 ar |= 0x08; /* code segment */
3a624e29
NK
5900
5901 vmcs_write32(sf->ar_bytes, ar);
6aa8b732
AK
5902}
5903
f78e0e2e
SY
5904static int alloc_apic_access_page(struct kvm *kvm)
5905{
4484141a 5906 struct page *page;
f78e0e2e
SY
5907 int r = 0;
5908
79fac95e 5909 mutex_lock(&kvm->slots_lock);
c24ae0dc 5910 if (kvm->arch.apic_access_page_done)
f78e0e2e 5911 goto out;
1d8007bd
PB
5912 r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
5913 APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
f78e0e2e
SY
5914 if (r)
5915 goto out;
72dc67a6 5916
73a6d941 5917 page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
4484141a
XG
5918 if (is_error_page(page)) {
5919 r = -EFAULT;
5920 goto out;
5921 }
5922
c24ae0dc
TC
5923 /*
5924 * Do not pin the page in memory, so that memory hot-unplug
5925 * is able to migrate it.
5926 */
5927 put_page(page);
5928 kvm->arch.apic_access_page_done = true;
f78e0e2e 5929out:
79fac95e 5930 mutex_unlock(&kvm->slots_lock);
f78e0e2e
SY
5931 return r;
5932}
5933
991e7a0e 5934static int allocate_vpid(void)
2384d2b3
SY
5935{
5936 int vpid;
5937
919818ab 5938 if (!enable_vpid)
991e7a0e 5939 return 0;
2384d2b3
SY
5940 spin_lock(&vmx_vpid_lock);
5941 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
991e7a0e 5942 if (vpid < VMX_NR_VPIDS)
2384d2b3 5943 __set_bit(vpid, vmx_vpid_bitmap);
991e7a0e
WL
5944 else
5945 vpid = 0;
2384d2b3 5946 spin_unlock(&vmx_vpid_lock);
991e7a0e 5947 return vpid;
2384d2b3
SY
5948}
5949
991e7a0e 5950static void free_vpid(int vpid)
cdbecfc3 5951{
991e7a0e 5952 if (!enable_vpid || vpid == 0)
cdbecfc3
LJ
5953 return;
5954 spin_lock(&vmx_vpid_lock);
991e7a0e 5955 __clear_bit(vpid, vmx_vpid_bitmap);
cdbecfc3
LJ
5956 spin_unlock(&vmx_vpid_lock);
5957}
5958
1e4329ee 5959static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
904e14fb 5960 u32 msr, int type)
25c5f225 5961{
3e7c73e9 5962 int f = sizeof(unsigned long);
25c5f225
SY
5963
5964 if (!cpu_has_vmx_msr_bitmap())
5965 return;
5966
ceef7d10
VK
5967 if (static_branch_unlikely(&enable_evmcs))
5968 evmcs_touch_msr_bitmap();
5969
25c5f225
SY
5970 /*
5971 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
5972 * have the write-low and read-high bitmap offsets the wrong way round.
5973 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
5974 */
25c5f225 5975 if (msr <= 0x1fff) {
8d14695f
YZ
5976 if (type & MSR_TYPE_R)
5977 /* read-low */
5978 __clear_bit(msr, msr_bitmap + 0x000 / f);
5979
5980 if (type & MSR_TYPE_W)
5981 /* write-low */
5982 __clear_bit(msr, msr_bitmap + 0x800 / f);
5983
25c5f225
SY
5984 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
5985 msr &= 0x1fff;
8d14695f
YZ
5986 if (type & MSR_TYPE_R)
5987 /* read-high */
5988 __clear_bit(msr, msr_bitmap + 0x400 / f);
5989
5990 if (type & MSR_TYPE_W)
5991 /* write-high */
5992 __clear_bit(msr, msr_bitmap + 0xc00 / f);
5993
5994 }
5995}
5996
1e4329ee 5997static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
904e14fb
PB
5998 u32 msr, int type)
5999{
6000 int f = sizeof(unsigned long);
6001
6002 if (!cpu_has_vmx_msr_bitmap())
6003 return;
6004
ceef7d10
VK
6005 if (static_branch_unlikely(&enable_evmcs))
6006 evmcs_touch_msr_bitmap();
6007
904e14fb
PB
6008 /*
6009 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
6010 * have the write-low and read-high bitmap offsets the wrong way round.
6011 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
6012 */
6013 if (msr <= 0x1fff) {
6014 if (type & MSR_TYPE_R)
6015 /* read-low */
6016 __set_bit(msr, msr_bitmap + 0x000 / f);
6017
6018 if (type & MSR_TYPE_W)
6019 /* write-low */
6020 __set_bit(msr, msr_bitmap + 0x800 / f);
6021
6022 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
6023 msr &= 0x1fff;
6024 if (type & MSR_TYPE_R)
6025 /* read-high */
6026 __set_bit(msr, msr_bitmap + 0x400 / f);
6027
6028 if (type & MSR_TYPE_W)
6029 /* write-high */
6030 __set_bit(msr, msr_bitmap + 0xc00 / f);
6031
6032 }
6033}
6034
1e4329ee 6035static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
904e14fb
PB
6036 u32 msr, int type, bool value)
6037{
6038 if (value)
6039 vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
6040 else
6041 vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
6042}
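
/*
 * An illustrative sketch, not part of the original file: the 4K MSR bitmap
 * is made of four 1K regions -- read-low (offset 0x000), read-high (0x400),
 * write-low (0x800) and write-high (0xc00).  "High" MSRs are masked with
 * 0x1fff first, so MSR_STAR (0xc0000081) maps to bit 0x81 of the read-high
 * region, i.e. byte 0x410, bit 1.  The helper name below is hypothetical.
 */
static inline bool msr_star_read_intercepted(unsigned long *msr_bitmap)
{
	return test_bit(0xc0000081 & 0x1fff,
			msr_bitmap + 0x400 / sizeof(unsigned long));
}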
6043
f2b93280
WV
6044/*
6045 * If a msr is allowed by L0, we should check whether it is allowed by L1.
6046 * The corresponding bit will be cleared unless both of L0 and L1 allow it.
6047 */
6048static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
6049 unsigned long *msr_bitmap_nested,
6050 u32 msr, int type)
6051{
6052 int f = sizeof(unsigned long);
6053
f2b93280
WV
6054 /*
6055 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
6056 * have the write-low and read-high bitmap offsets the wrong way round.
6057 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
6058 */
6059 if (msr <= 0x1fff) {
6060 if (type & MSR_TYPE_R &&
6061 !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
6062 /* read-low */
6063 __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
6064
6065 if (type & MSR_TYPE_W &&
6066 !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
6067 /* write-low */
6068 __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
6069
6070 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
6071 msr &= 0x1fff;
6072 if (type & MSR_TYPE_R &&
6073 !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
6074 /* read-high */
6075 __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
6076
6077 if (type & MSR_TYPE_W &&
6078 !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
6079 /* write-high */
6080 __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
6081
6082 }
6083}
6084
904e14fb 6085static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
5897297b 6086{
904e14fb
PB
6087 u8 mode = 0;
6088
6089 if (cpu_has_secondary_exec_ctrls() &&
6090 (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
6091 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
6092 mode |= MSR_BITMAP_MODE_X2APIC;
6093 if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
6094 mode |= MSR_BITMAP_MODE_X2APIC_APICV;
6095 }
6096
904e14fb 6097 return mode;
8d14695f
YZ
6098}
6099
904e14fb
PB
6100#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
6101
6102static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
6103 u8 mode)
8d14695f 6104{
904e14fb
PB
6105 int msr;
6106
6107 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
6108 unsigned word = msr / BITS_PER_LONG;
6109 msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
6110 msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
6111 }
6112
6113 if (mode & MSR_BITMAP_MODE_X2APIC) {
6114 /*
6115 * TPR reads and writes can be virtualized even if virtual interrupt
6116 * delivery is not in use.
6117 */
6118 vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
6119 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
6120 vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
6121 vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
6122 vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
6123 }
f6e90f9e 6124 }
5897297b
AK
6125}
6126
904e14fb
PB
6127static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
6128{
6129 struct vcpu_vmx *vmx = to_vmx(vcpu);
6130 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
6131 u8 mode = vmx_msr_bitmap_mode(vcpu);
6132 u8 changed = mode ^ vmx->msr_bitmap_mode;
6133
6134 if (!changed)
6135 return;
6136
904e14fb
PB
6137 if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
6138 vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
6139
6140 vmx->msr_bitmap_mode = mode;
6141}
6142
b2a05fef 6143static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
d50ab6c1 6144{
d62caabb 6145 return enable_apicv;
d50ab6c1
PB
6146}
6147
c9f04407
DM
6148static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
6149{
6150 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6151 gfn_t gfn;
6152
6153 /*
6154 * Don't need to mark the APIC access page dirty; it is never
6155 * written to by the CPU during APIC virtualization.
6156 */
6157
6158 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
6159 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
6160 kvm_vcpu_mark_page_dirty(vcpu, gfn);
6161 }
6162
6163 if (nested_cpu_has_posted_intr(vmcs12)) {
6164 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
6165 kvm_vcpu_mark_page_dirty(vcpu, gfn);
6166 }
6167}
6168
6169
6342c50a 6170static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
705699a1
WV
6171{
6172 struct vcpu_vmx *vmx = to_vmx(vcpu);
6173 int max_irr;
6174 void *vapic_page;
6175 u16 status;
6176
c9f04407
DM
6177 if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
6178 return;
705699a1 6179
c9f04407
DM
6180 vmx->nested.pi_pending = false;
6181 if (!pi_test_and_clear_on(vmx->nested.pi_desc))
6182 return;
705699a1 6183
c9f04407
DM
6184 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
6185 if (max_irr != 256) {
705699a1 6186 vapic_page = kmap(vmx->nested.virtual_apic_page);
e7387b0e
LA
6187 __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
6188 vapic_page, &max_irr);
705699a1
WV
6189 kunmap(vmx->nested.virtual_apic_page);
6190
6191 status = vmcs_read16(GUEST_INTR_STATUS);
6192 if ((u8)max_irr > ((u8)status & 0xff)) {
6193 status &= ~0xff;
6194 status |= (u8)max_irr;
6195 vmcs_write16(GUEST_INTR_STATUS, status);
6196 }
6197 }
c9f04407
DM
6198
6199 nested_mark_vmcs12_pages_dirty(vcpu);
705699a1
WV
6200}
6201
7e712684
PB
6202static u8 vmx_get_rvi(void)
6203{
6204 return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
6205}
6206
e6c67d8c
LA
6207static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
6208{
6209 struct vcpu_vmx *vmx = to_vmx(vcpu);
6210 void *vapic_page;
6211 u32 vppr;
6212 int rvi;
6213
6214 if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
6215 !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
6216 WARN_ON_ONCE(!vmx->nested.virtual_apic_page))
6217 return false;
6218
7e712684 6219 rvi = vmx_get_rvi();
e6c67d8c
LA
6220
6221 vapic_page = kmap(vmx->nested.virtual_apic_page);
6222 vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
6223 kunmap(vmx->nested.virtual_apic_page);
6224
6225 return ((rvi & 0xf0) > (vppr & 0xf0));
6226}
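
/*
 * An illustrative sketch, not part of the original file: only the upper
 * nibble (the interrupt priority class) matters in the comparison above.
 * A pending vector of 0x61 (class 6) is deliverable while VPPR is 0x50
 * (class 5), but not while VPPR is 0x68 (also class 6).  The helper name
 * below is hypothetical.
 */
static inline bool rvi_beats_vppr(u8 rvi, u32 vppr)
{
	return (rvi & 0xf0) > (vppr & 0xf0);
}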
6227
06a5524f
WV
6228static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
6229 bool nested)
21bc8dc5
RK
6230{
6231#ifdef CONFIG_SMP
06a5524f
WV
6232 int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
6233
21bc8dc5 6234 if (vcpu->mode == IN_GUEST_MODE) {
28b835d6 6235 /*
5753743f
HZ
6236 * The vector of the interrupt to be delivered to the vcpu has
6237 * already been set in PIR before this function is called.
6238 *
6239 * Following cases will be reached in this block, and
6240 * we always send a notification event in all cases as
6241 * explained below.
6242 *
6243 * Case 1: vcpu keeps in non-root mode. Sending a
6244 * notification event posts the interrupt to vcpu.
6245 *
6246 * Case 2: vcpu exits to root mode and is still
6247 * runnable. PIR will be synced to vIRR before the
6248 * next vcpu entry. Sending a notification event in
6249 * this case has no effect, as vcpu is not in root
6250 * mode.
28b835d6 6251 *
5753743f
HZ
6252 * Case 3: vcpu exits to root mode and is blocked.
6253 * vcpu_block() has already synced PIR to vIRR and
6254 * never blocks vcpu if vIRR is not cleared. Therefore,
6255 * a blocked vcpu here does not wait for any requested
6256 * interrupts in PIR, and sending a notification event
6257 * which has no effect is safe here.
28b835d6 6258 */
28b835d6 6259
06a5524f 6260 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
21bc8dc5
RK
6261 return true;
6262 }
6263#endif
6264 return false;
6265}
6266
705699a1
WV
6267static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
6268 int vector)
6269{
6270 struct vcpu_vmx *vmx = to_vmx(vcpu);
6271
6272 if (is_guest_mode(vcpu) &&
6273 vector == vmx->nested.posted_intr_nv) {
705699a1
WV
6274 /*
6275 * If a posted intr is not recognized by hardware,
6276 * we will accomplish it in the next vmentry.
6277 */
6278 vmx->nested.pi_pending = true;
6279 kvm_make_request(KVM_REQ_EVENT, vcpu);
6b697711
LA
6280 /* the PIR and ON have been set by L1. */
6281 if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
6282 kvm_vcpu_kick(vcpu);
705699a1
WV
6283 return 0;
6284 }
6285 return -1;
6286}
a20ed54d
YZ
6287/*
6288 * Send an interrupt to the vcpu via the posted-interrupt mechanism.
6289 * 1. If the target vcpu is running (non-root mode), send a posted-interrupt
6290 * notification and the hardware will sync PIR to vIRR atomically.
6291 * 2. If the target vcpu isn't running (root mode), kick it so it picks up
6292 * the interrupt from PIR on the next vmentry.
6293 */
6294static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
6295{
6296 struct vcpu_vmx *vmx = to_vmx(vcpu);
6297 int r;
6298
705699a1
WV
6299 r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
6300 if (!r)
6301 return;
6302
a20ed54d
YZ
6303 if (pi_test_and_set_pir(vector, &vmx->pi_desc))
6304 return;
6305
b95234c8
PB
6306 /* If a previous notification has sent the IPI, nothing to do. */
6307 if (pi_test_and_set_on(&vmx->pi_desc))
6308 return;
6309
06a5524f 6310 if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
a20ed54d
YZ
6311 kvm_vcpu_kick(vcpu);
6312}
6313
a3a8ff8e
NHE
6314/*
6315 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
6316 * will not change in the lifetime of the guest.
6317 * Note that host-state that does change is set elsewhere. E.g., host-state
6318 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
6319 */
a547c6db 6320static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
a3a8ff8e
NHE
6321{
6322 u32 low32, high32;
6323 unsigned long tmpl;
6324 struct desc_ptr dt;
d6e41f11 6325 unsigned long cr0, cr3, cr4;
a3a8ff8e 6326
04ac88ab
AL
6327 cr0 = read_cr0();
6328 WARN_ON(cr0 & X86_CR0_TS);
6329 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
d6e41f11
AL
6330
6331 /*
6332 * Save the most likely value for this task's CR3 in the VMCS.
6333 * We can't use __get_current_cr3_fast() because we're not atomic.
6334 */
6c690ee1 6335 cr3 = __read_cr3();
d6e41f11 6336 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
d7ee039e 6337 vmx->loaded_vmcs->host_state.cr3 = cr3;
a3a8ff8e 6338
d974baa3 6339 /* Save the most likely value for this task's CR4 in the VMCS. */
1e02ce4c 6340 cr4 = cr4_read_shadow();
d974baa3 6341 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
d7ee039e 6342 vmx->loaded_vmcs->host_state.cr4 = cr4;
d974baa3 6343
a3a8ff8e 6344 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
b2da15ac
AK
6345#ifdef CONFIG_X86_64
6346 /*
6347 * Load null selectors, so we can avoid reloading them in
6d6095bd
SC
6348 * vmx_prepare_switch_to_host(), in case userspace uses
6349 * the null selectors too (the expected case).
b2da15ac
AK
6350 */
6351 vmcs_write16(HOST_DS_SELECTOR, 0);
6352 vmcs_write16(HOST_ES_SELECTOR, 0);
6353#else
a3a8ff8e
NHE
6354 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
6355 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
b2da15ac 6356#endif
a3a8ff8e
NHE
6357 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
6358 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
6359
87930019 6360 store_idt(&dt);
a3a8ff8e 6361 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
a547c6db 6362 vmx->host_idt_base = dt.address;
a3a8ff8e 6363
83287ea4 6364 vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */
a3a8ff8e
NHE
6365
6366 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
6367 vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
6368 rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
6369 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
6370
6371 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
6372 rdmsr(MSR_IA32_CR_PAT, low32, high32);
6373 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
6374 }
5a5e8a15
SC
6375
6376 if (cpu_has_load_ia32_efer)
6377 vmcs_write64(HOST_IA32_EFER, host_efer);
a3a8ff8e
NHE
6378}
6379
bf8179a0
NHE
6380static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
6381{
6382 vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
6383 if (enable_ept)
6384 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
fe3ef05c
NHE
6385 if (is_guest_mode(&vmx->vcpu))
6386 vmx->vcpu.arch.cr4_guest_owned_bits &=
6387 ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
bf8179a0
NHE
6388 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
6389}
6390
01e439be
YZ
6391static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
6392{
6393 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
6394
d62caabb 6395 if (!kvm_vcpu_apicv_active(&vmx->vcpu))
01e439be 6396 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
d02fcf50
PB
6397
6398 if (!enable_vnmi)
6399 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
6400
64672c95
YJ
6401 /* Enable the preemption timer dynamically */
6402 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
01e439be
YZ
6403 return pin_based_exec_ctrl;
6404}
6405
d62caabb
AS
6406static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
6407{
6408 struct vcpu_vmx *vmx = to_vmx(vcpu);
6409
6410 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
3ce424e4
RK
6411 if (cpu_has_secondary_exec_ctrls()) {
6412 if (kvm_vcpu_apicv_active(vcpu))
6413 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
6414 SECONDARY_EXEC_APIC_REGISTER_VIRT |
6415 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
6416 else
6417 vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
6418 SECONDARY_EXEC_APIC_REGISTER_VIRT |
6419 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
6420 }
6421
6422 if (cpu_has_vmx_msr_bitmap())
904e14fb 6423 vmx_update_msr_bitmap(vcpu);
d62caabb
AS
6424}
6425
bf8179a0
NHE
6426static u32 vmx_exec_control(struct vcpu_vmx *vmx)
6427{
6428 u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
d16c293e
PB
6429
6430 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
6431 exec_control &= ~CPU_BASED_MOV_DR_EXITING;
6432
35754c98 6433 if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
bf8179a0
NHE
6434 exec_control &= ~CPU_BASED_TPR_SHADOW;
6435#ifdef CONFIG_X86_64
6436 exec_control |= CPU_BASED_CR8_STORE_EXITING |
6437 CPU_BASED_CR8_LOAD_EXITING;
6438#endif
6439 }
6440 if (!enable_ept)
6441 exec_control |= CPU_BASED_CR3_STORE_EXITING |
6442 CPU_BASED_CR3_LOAD_EXITING |
6443 CPU_BASED_INVLPG_EXITING;
4d5422ce
WL
6444 if (kvm_mwait_in_guest(vmx->vcpu.kvm))
6445 exec_control &= ~(CPU_BASED_MWAIT_EXITING |
6446 CPU_BASED_MONITOR_EXITING);
caa057a2
WL
6447 if (kvm_hlt_in_guest(vmx->vcpu.kvm))
6448 exec_control &= ~CPU_BASED_HLT_EXITING;
bf8179a0
NHE
6449 return exec_control;
6450}
6451
45ec368c 6452static bool vmx_rdrand_supported(void)
bf8179a0 6453{
45ec368c 6454 return vmcs_config.cpu_based_2nd_exec_ctrl &
736fdf72 6455 SECONDARY_EXEC_RDRAND_EXITING;
45ec368c
JM
6456}
6457
75f4fc8d
JM
6458static bool vmx_rdseed_supported(void)
6459{
6460 return vmcs_config.cpu_based_2nd_exec_ctrl &
736fdf72 6461 SECONDARY_EXEC_RDSEED_EXITING;
75f4fc8d
JM
6462}
6463
80154d77 6464static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
bf8179a0 6465{
80154d77
PB
6466 struct kvm_vcpu *vcpu = &vmx->vcpu;
6467
bf8179a0 6468 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
0367f205 6469
80154d77 6470 if (!cpu_need_virtualize_apic_accesses(vcpu))
bf8179a0
NHE
6471 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6472 if (vmx->vpid == 0)
6473 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
6474 if (!enable_ept) {
6475 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
6476 enable_unrestricted_guest = 0;
6477 }
6478 if (!enable_unrestricted_guest)
6479 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
b31c114b 6480 if (kvm_pause_in_guest(vmx->vcpu.kvm))
bf8179a0 6481 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
80154d77 6482 if (!kvm_vcpu_apicv_active(vcpu))
c7c9c56c
YZ
6483 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
6484 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
8d14695f 6485 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
0367f205
PB
6486
6487 /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
6488 * in vmx_set_cr4. */
6489 exec_control &= ~SECONDARY_EXEC_DESC;
6490
abc4fc58
AG
6491 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
6492 (handle_vmptrld).
6493 We can NOT enable shadow_vmcs here because we don't have yet
6494 a current VMCS12
6495 */
6496 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
a3eaa864
KH
6497
6498 if (!enable_pml)
6499 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
843e4330 6500
3db13480
PB
6501 if (vmx_xsaves_supported()) {
6502 /* Exposing XSAVES only when XSAVE is exposed */
6503 bool xsaves_enabled =
6504 guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
6505 guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
6506
6507 if (!xsaves_enabled)
6508 exec_control &= ~SECONDARY_EXEC_XSAVES;
6509
6510 if (nested) {
6511 if (xsaves_enabled)
6677f3da 6512 vmx->nested.msrs.secondary_ctls_high |=
3db13480
PB
6513 SECONDARY_EXEC_XSAVES;
6514 else
6677f3da 6515 vmx->nested.msrs.secondary_ctls_high &=
3db13480
PB
6516 ~SECONDARY_EXEC_XSAVES;
6517 }
6518 }
6519
80154d77
PB
6520 if (vmx_rdtscp_supported()) {
6521 bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
6522 if (!rdtscp_enabled)
6523 exec_control &= ~SECONDARY_EXEC_RDTSCP;
6524
6525 if (nested) {
6526 if (rdtscp_enabled)
6677f3da 6527 vmx->nested.msrs.secondary_ctls_high |=
80154d77
PB
6528 SECONDARY_EXEC_RDTSCP;
6529 else
6677f3da 6530 vmx->nested.msrs.secondary_ctls_high &=
80154d77
PB
6531 ~SECONDARY_EXEC_RDTSCP;
6532 }
6533 }
6534
6535 if (vmx_invpcid_supported()) {
6536 /* Exposing INVPCID only when PCID is exposed */
6537 bool invpcid_enabled =
6538 guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
6539 guest_cpuid_has(vcpu, X86_FEATURE_PCID);
6540
6541 if (!invpcid_enabled) {
6542 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
6543 guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
6544 }
6545
6546 if (nested) {
6547 if (invpcid_enabled)
6677f3da 6548 vmx->nested.msrs.secondary_ctls_high |=
80154d77
PB
6549 SECONDARY_EXEC_ENABLE_INVPCID;
6550 else
6677f3da 6551 vmx->nested.msrs.secondary_ctls_high &=
80154d77
PB
6552 ~SECONDARY_EXEC_ENABLE_INVPCID;
6553 }
6554 }
6555
45ec368c
JM
6556 if (vmx_rdrand_supported()) {
6557 bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
6558 if (rdrand_enabled)
736fdf72 6559 exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
45ec368c
JM
6560
6561 if (nested) {
6562 if (rdrand_enabled)
6677f3da 6563 vmx->nested.msrs.secondary_ctls_high |=
736fdf72 6564 SECONDARY_EXEC_RDRAND_EXITING;
45ec368c 6565 else
6677f3da 6566 vmx->nested.msrs.secondary_ctls_high &=
736fdf72 6567 ~SECONDARY_EXEC_RDRAND_EXITING;
45ec368c
JM
6568 }
6569 }
6570
75f4fc8d
JM
6571 if (vmx_rdseed_supported()) {
6572 bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
6573 if (rdseed_enabled)
736fdf72 6574 exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
75f4fc8d
JM
6575
6576 if (nested) {
6577 if (rdseed_enabled)
6677f3da 6578 vmx->nested.msrs.secondary_ctls_high |=
736fdf72 6579 SECONDARY_EXEC_RDSEED_EXITING;
75f4fc8d 6580 else
6677f3da 6581 vmx->nested.msrs.secondary_ctls_high &=
736fdf72 6582 ~SECONDARY_EXEC_RDSEED_EXITING;
75f4fc8d
JM
6583 }
6584 }
6585
80154d77 6586 vmx->secondary_exec_control = exec_control;
bf8179a0
NHE
6587}
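/*
 * Illustrative sketch, not part of the original file: the XSAVES,
 * RDTSCP, INVPCID, RDRAND and RDSEED blocks above all repeat the same
 * "mirror the guest's capability into the nested secondary controls"
 * pattern.  A hypothetical helper (the name vmx_adjust_nested_ctrl is
 * an assumption, used only to make the pattern explicit) could look
 * like this:
 */
static void vmx_adjust_nested_ctrl(struct vcpu_vmx *vmx, u32 control,
				   bool enabled)
{
	/* Expose the control to L1 only if the guest CPUID allows it. */
	if (enabled)
		vmx->nested.msrs.secondary_ctls_high |= control;
	else
		vmx->nested.msrs.secondary_ctls_high &= ~control;
}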
6588
ce88decf
XG
6589static void ept_set_mmio_spte_mask(void)
6590{
6591 /*
6592 * EPT Misconfigurations can be generated if the value of bits 2:0
6593 * of an EPT paging-structure entry is 110b (write/execute).
ce88decf 6594 */
dcdca5fe
PF
6595 kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
6596 VMX_EPT_MISCONFIG_WX_VALUE);
ce88decf
XG
6597}
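/*
 * Illustrative note, not from the original file: VMX_EPT_RWX_MASK
 * covers bits 2:0 of an EPT entry and VMX_EPT_MISCONFIG_WX_VALUE is
 * the write+execute combination (110b).  An MMIO SPTE built with this
 * mask, roughly
 *
 *	spte = <generation/gfn bits> | VMX_EPT_MISCONFIG_WX_VALUE;
 *
 * is writable and executable but not readable, which the hardware
 * reports as an EPT misconfiguration exit instead of a normal EPT
 * violation, letting KVM recognize MMIO accesses cheaply.
 */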
6598
f53cd63c 6599#define VMX_XSS_EXIT_BITMAP 0
6aa8b732
AK
6600/*
6601 * Sets up the vmcs for emulated real mode.
6602 */
12d79917 6603static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
6aa8b732 6604{
6aa8b732 6605 int i;
6aa8b732 6606
4607c2d7 6607 if (enable_shadow_vmcs) {
f4160e45
JM
6608 /*
6609 * At vCPU creation, "VMWRITE to any supported field
6610 * in the VMCS" is supported, so use the more
6611 * permissive vmx_vmread_bitmap to specify both read
6612 * and write permissions for the shadow VMCS.
6613 */
4607c2d7 6614 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
f4160e45 6615 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap));
4607c2d7 6616 }
25c5f225 6617 if (cpu_has_vmx_msr_bitmap())
904e14fb 6618 vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
25c5f225 6619
6aa8b732
AK
6620 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
6621
6aa8b732 6622 /* Control */
01e439be 6623 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
64672c95 6624 vmx->hv_deadline_tsc = -1;
6e5d865c 6625
bf8179a0 6626 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
6aa8b732 6627
dfa169bb 6628 if (cpu_has_secondary_exec_ctrls()) {
80154d77 6629 vmx_compute_secondary_exec_control(vmx);
bf8179a0 6630 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
80154d77 6631 vmx->secondary_exec_control);
dfa169bb 6632 }
f78e0e2e 6633
d62caabb 6634 if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
c7c9c56c
YZ
6635 vmcs_write64(EOI_EXIT_BITMAP0, 0);
6636 vmcs_write64(EOI_EXIT_BITMAP1, 0);
6637 vmcs_write64(EOI_EXIT_BITMAP2, 0);
6638 vmcs_write64(EOI_EXIT_BITMAP3, 0);
6639
6640 vmcs_write16(GUEST_INTR_STATUS, 0);
01e439be 6641
0bcf261c 6642 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
01e439be 6643 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
c7c9c56c
YZ
6644 }
6645
b31c114b 6646 if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
4b8d54f9 6647 vmcs_write32(PLE_GAP, ple_gap);
a7653ecd
RK
6648 vmx->ple_window = ple_window;
6649 vmx->ple_window_dirty = true;
4b8d54f9
ZE
6650 }
6651
c3707958
XG
6652 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
6653 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
6aa8b732
AK
6654 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
6655
9581d442
AK
6656 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
6657 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
a547c6db 6658 vmx_set_constant_host_state(vmx);
6aa8b732
AK
6659 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
6660 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
6aa8b732 6661
2a499e49
BD
6662 if (cpu_has_vmx_vmfunc())
6663 vmcs_write64(VM_FUNCTION_CONTROL, 0);
6664
2cc51560
ED
6665 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
6666 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
33966dd6 6667 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
2cc51560 6668 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
33966dd6 6669 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
6aa8b732 6670
74545705
RK
6671 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
6672 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
468d472f 6673
03916db9 6674 for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
6aa8b732
AK
6675 u32 index = vmx_msr_index[i];
6676 u32 data_low, data_high;
a2fa3e9f 6677 int j = vmx->nmsrs;
6aa8b732
AK
6678
6679 if (rdmsr_safe(index, &data_low, &data_high) < 0)
6680 continue;
432bd6cb
AK
6681 if (wrmsr_safe(index, data_low, data_high) < 0)
6682 continue;
26bb0981
AK
6683 vmx->guest_msrs[j].index = i;
6684 vmx->guest_msrs[j].data = 0;
d5696725 6685 vmx->guest_msrs[j].mask = -1ull;
a2fa3e9f 6686 ++vmx->nmsrs;
6aa8b732 6687 }
6aa8b732 6688
5b76a3cf 6689 vmx->arch_capabilities = kvm_get_arch_capabilities();
2961e876
GN
6690
6691 vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
6aa8b732
AK
6692
6693 /* 22.2.1, 20.8.1 */
2961e876 6694 vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);
1c3d14fe 6695
bd7e5b08
PB
6696 vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
6697 vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
6698
bf8179a0 6699 set_cr4_guest_host_mask(vmx);
e00c8cf2 6700
f53cd63c
WL
6701 if (vmx_xsaves_supported())
6702 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
6703
4e59516a 6704 if (enable_pml) {
4e59516a
PF
6705 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
6706 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
6707 }
0b665d30
SC
6708
6709 if (cpu_has_vmx_encls_vmexit())
6710 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
e00c8cf2
AK
6711}
6712
d28bc9dd 6713static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
e00c8cf2
AK
6714{
6715 struct vcpu_vmx *vmx = to_vmx(vcpu);
58cb628d 6716 struct msr_data apic_base_msr;
d28bc9dd 6717 u64 cr0;
e00c8cf2 6718
7ffd92c5 6719 vmx->rmode.vm86_active = 0;
d28b387f 6720 vmx->spec_ctrl = 0;
e00c8cf2 6721
518e7b94 6722 vcpu->arch.microcode_version = 0x100000000ULL;
ad312c7c 6723 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
d28bc9dd
NA
6724 kvm_set_cr8(vcpu, 0);
6725
6726 if (!init_event) {
6727 apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
6728 MSR_IA32_APICBASE_ENABLE;
6729 if (kvm_vcpu_is_reset_bsp(vcpu))
6730 apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
6731 apic_base_msr.host_initiated = true;
6732 kvm_set_apic_base(vcpu, &apic_base_msr);
6733 }
e00c8cf2 6734
2fb92db1
AK
6735 vmx_segment_cache_clear(vmx);
6736
5706be0d 6737 seg_setup(VCPU_SREG_CS);
66450a21 6738 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
f3531054 6739 vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
e00c8cf2
AK
6740
6741 seg_setup(VCPU_SREG_DS);
6742 seg_setup(VCPU_SREG_ES);
6743 seg_setup(VCPU_SREG_FS);
6744 seg_setup(VCPU_SREG_GS);
6745 seg_setup(VCPU_SREG_SS);
6746
6747 vmcs_write16(GUEST_TR_SELECTOR, 0);
6748 vmcs_writel(GUEST_TR_BASE, 0);
6749 vmcs_write32(GUEST_TR_LIMIT, 0xffff);
6750 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
6751
6752 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
6753 vmcs_writel(GUEST_LDTR_BASE, 0);
6754 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
6755 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
6756
d28bc9dd
NA
6757 if (!init_event) {
6758 vmcs_write32(GUEST_SYSENTER_CS, 0);
6759 vmcs_writel(GUEST_SYSENTER_ESP, 0);
6760 vmcs_writel(GUEST_SYSENTER_EIP, 0);
6761 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
6762 }
e00c8cf2 6763
c37c2873 6764 kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
66450a21 6765 kvm_rip_write(vcpu, 0xfff0);
e00c8cf2 6766
e00c8cf2
AK
6767 vmcs_writel(GUEST_GDTR_BASE, 0);
6768 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
6769
6770 vmcs_writel(GUEST_IDTR_BASE, 0);
6771 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
6772
443381a8 6773 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
e00c8cf2 6774 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
f3531054 6775 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
a554d207
WL
6776 if (kvm_mpx_supported())
6777 vmcs_write64(GUEST_BNDCFGS, 0);
e00c8cf2 6778
e00c8cf2
AK
6779 setup_msrs(vmx);
6780
6aa8b732
AK
6781 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
6782
d28bc9dd 6783 if (cpu_has_vmx_tpr_shadow() && !init_event) {
f78e0e2e 6784 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
35754c98 6785 if (cpu_need_tpr_shadow(vcpu))
f78e0e2e 6786 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
d28bc9dd 6787 __pa(vcpu->arch.apic->regs));
f78e0e2e
SY
6788 vmcs_write32(TPR_THRESHOLD, 0);
6789 }
6790
a73896cb 6791 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
6aa8b732 6792
2384d2b3
SY
6793 if (vmx->vpid != 0)
6794 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
6795
d28bc9dd 6796 cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
d28bc9dd 6797 vmx->vcpu.arch.cr0 = cr0;
f2463247 6798 vmx_set_cr0(vcpu, cr0); /* enter rmode */
d28bc9dd 6799 vmx_set_cr4(vcpu, 0);
5690891b 6800 vmx_set_efer(vcpu, 0);
bd7e5b08 6801
d28bc9dd 6802 update_exception_bitmap(vcpu);
6aa8b732 6803
dd5f5341 6804 vpid_sync_context(vmx->vpid);
caa057a2
WL
6805 if (init_event)
6806 vmx_clear_hlt(vcpu);
6aa8b732
AK
6807}
6808
b6f1250e
NHE
6809/*
6810 * In nested virtualization, check if L1 asked to exit on external interrupts.
6811 * For most existing hypervisors, this will always return true.
6812 */
6813static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
6814{
6815 return get_vmcs12(vcpu)->pin_based_vm_exec_control &
6816 PIN_BASED_EXT_INTR_MASK;
6817}
6818
77b0f5d6
BD
6819/*
6820 * In nested virtualization, check if L1 has set
6821 * VM_EXIT_ACK_INTR_ON_EXIT
6822 */
6823static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
6824{
6825 return get_vmcs12(vcpu)->vm_exit_controls &
6826 VM_EXIT_ACK_INTR_ON_EXIT;
6827}
6828
ea8ceb83
JK
6829static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
6830{
0c7f650e 6831 return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
ea8ceb83
JK
6832}
6833
c9a7953f 6834static void enable_irq_window(struct kvm_vcpu *vcpu)
3b86cd99 6835{
47c0152e
PB
6836 vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
6837 CPU_BASED_VIRTUAL_INTR_PENDING);
3b86cd99
JK
6838}
6839
c9a7953f 6840static void enable_nmi_window(struct kvm_vcpu *vcpu)
3b86cd99 6841{
d02fcf50 6842 if (!enable_vnmi ||
8a1b4392 6843 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
c9a7953f
JK
6844 enable_irq_window(vcpu);
6845 return;
6846 }
3b86cd99 6847
47c0152e
PB
6848 vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
6849 CPU_BASED_VIRTUAL_NMI_PENDING);
3b86cd99
JK
6850}
6851
66fd3f7f 6852static void vmx_inject_irq(struct kvm_vcpu *vcpu)
85f455f7 6853{
9c8cba37 6854 struct vcpu_vmx *vmx = to_vmx(vcpu);
66fd3f7f
GN
6855 uint32_t intr;
6856 int irq = vcpu->arch.interrupt.nr;
9c8cba37 6857
229456fc 6858 trace_kvm_inj_virq(irq);
2714d1d3 6859
fa89a817 6860 ++vcpu->stat.irq_injections;
7ffd92c5 6861 if (vmx->rmode.vm86_active) {
71f9833b
SH
6862 int inc_eip = 0;
6863 if (vcpu->arch.interrupt.soft)
6864 inc_eip = vcpu->arch.event_exit_inst_len;
6865 if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
a92601bb 6866 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
85f455f7
ED
6867 return;
6868 }
66fd3f7f
GN
6869 intr = irq | INTR_INFO_VALID_MASK;
6870 if (vcpu->arch.interrupt.soft) {
6871 intr |= INTR_TYPE_SOFT_INTR;
6872 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
6873 vmx->vcpu.arch.event_exit_inst_len);
6874 } else
6875 intr |= INTR_TYPE_EXT_INTR;
6876 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
caa057a2
WL
6877
6878 vmx_clear_hlt(vcpu);
85f455f7
ED
6879}
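/*
 * Worked example (illustrative, not from the original file): injecting
 * external interrupt vector 0x20 writes 0x80000020 to
 * VM_ENTRY_INTR_INFO_FIELD -- bit 31 is the valid bit, bits 10:8 = 0
 * select "external interrupt" and bits 7:0 carry the vector.  A soft
 * interrupt instead sets the INTR_TYPE_SOFT_INTR type bits and also
 * programs VM_ENTRY_INSTRUCTION_LEN, as done above.
 */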
6880
f08864b4
SY
6881static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
6882{
66a5a347
JK
6883 struct vcpu_vmx *vmx = to_vmx(vcpu);
6884
d02fcf50 6885 if (!enable_vnmi) {
8a1b4392
PB
6886 /*
6887 * Tracking the NMI-blocked state in software is built upon
6888 * finding the next open IRQ window. This, in turn, depends on
6889 * well-behaving guests: They have to keep IRQs disabled at
6890 * least as long as the NMI handler runs. Otherwise we may
6891 * cause NMI nesting, maybe breaking the guest. But as this is
6892 * highly unlikely, we can live with the residual risk.
6893 */
6894 vmx->loaded_vmcs->soft_vnmi_blocked = 1;
6895 vmx->loaded_vmcs->vnmi_blocked_time = 0;
6896 }
6897
4c4a6f79
PB
6898 ++vcpu->stat.nmi_injections;
6899 vmx->loaded_vmcs->nmi_known_unmasked = false;
3b86cd99 6900
7ffd92c5 6901 if (vmx->rmode.vm86_active) {
71f9833b 6902 if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
a92601bb 6903 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
66a5a347
JK
6904 return;
6905 }
c5a6d5f7 6906
f08864b4
SY
6907 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
6908 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
caa057a2
WL
6909
6910 vmx_clear_hlt(vcpu);
f08864b4
SY
6911}
6912
3cfc3092
JK
6913static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
6914{
4c4a6f79
PB
6915 struct vcpu_vmx *vmx = to_vmx(vcpu);
6916 bool masked;
6917
d02fcf50 6918 if (!enable_vnmi)
8a1b4392 6919 return vmx->loaded_vmcs->soft_vnmi_blocked;
4c4a6f79 6920 if (vmx->loaded_vmcs->nmi_known_unmasked)
9d58b931 6921 return false;
4c4a6f79
PB
6922 masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
6923 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
6924 return masked;
3cfc3092
JK
6925}
6926
6927static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
6928{
6929 struct vcpu_vmx *vmx = to_vmx(vcpu);
6930
d02fcf50 6931 if (!enable_vnmi) {
8a1b4392
PB
6932 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
6933 vmx->loaded_vmcs->soft_vnmi_blocked = masked;
6934 vmx->loaded_vmcs->vnmi_blocked_time = 0;
6935 }
6936 } else {
6937 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
6938 if (masked)
6939 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6940 GUEST_INTR_STATE_NMI);
6941 else
6942 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
6943 GUEST_INTR_STATE_NMI);
6944 }
3cfc3092
JK
6945}
6946
2505dc9f
JK
6947static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
6948{
b6b8a145
JK
6949 if (to_vmx(vcpu)->nested.nested_run_pending)
6950 return 0;
ea8ceb83 6951
d02fcf50 6952 if (!enable_vnmi &&
8a1b4392
PB
6953 to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
6954 return 0;
6955
2505dc9f
JK
6956 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
6957 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
6958 | GUEST_INTR_STATE_NMI));
6959}
6960
78646121
GN
6961static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
6962{
b6b8a145
JK
6963 return (!to_vmx(vcpu)->nested.nested_run_pending &&
6964 vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
c4282df9
GN
6965 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
6966 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
78646121
GN
6967}
6968
cbc94022
IE
6969static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
6970{
6971 int ret;
cbc94022 6972
f7eaeb0a
SC
6973 if (enable_unrestricted_guest)
6974 return 0;
6975
1d8007bd
PB
6976 ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
6977 PAGE_SIZE * 3);
cbc94022
IE
6978 if (ret)
6979 return ret;
40bbb9d0 6980 to_kvm_vmx(kvm)->tss_addr = addr;
1f755a82 6981 return init_rmode_tss(kvm);
cbc94022
IE
6982}
6983
2ac52ab8
SC
6984static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
6985{
40bbb9d0 6986 to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
2ac52ab8
SC
6987 return 0;
6988}
6989
0ca1b4f4 6990static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
6aa8b732 6991{
77ab6db0 6992 switch (vec) {
77ab6db0 6993 case BP_VECTOR:
c573cd22
JK
6994 /*
6995 * Update instruction length as we may reinject the exception
6996 * from user space while in guest debugging mode.
6997 */
6998 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
6999 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
d0bfb940 7000 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
0ca1b4f4
GN
7001 return false;
7002 /* fall through */
7003 case DB_VECTOR:
7004 if (vcpu->guest_debug &
7005 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
7006 return false;
d0bfb940
JK
7007 /* fall through */
7008 case DE_VECTOR:
77ab6db0
JK
7009 case OF_VECTOR:
7010 case BR_VECTOR:
7011 case UD_VECTOR:
7012 case DF_VECTOR:
7013 case SS_VECTOR:
7014 case GP_VECTOR:
7015 case MF_VECTOR:
0ca1b4f4
GN
7016 return true;
7017 break;
77ab6db0 7018 }
0ca1b4f4
GN
7019 return false;
7020}
7021
7022static int handle_rmode_exception(struct kvm_vcpu *vcpu,
7023 int vec, u32 err_code)
7024{
7025 /*
7026 * An instruction with the address-size override prefix (opcode 0x67)
7027 * causes a #SS fault with error code 0 in VM86 mode.
7028 */
7029 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
0ce97a2b 7030 if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) {
0ca1b4f4
GN
7031 if (vcpu->arch.halt_request) {
7032 vcpu->arch.halt_request = 0;
5cb56059 7033 return kvm_vcpu_halt(vcpu);
0ca1b4f4
GN
7034 }
7035 return 1;
7036 }
7037 return 0;
7038 }
7039
7040 /*
7041 * Forward all other exceptions that are valid in real mode.
7042 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
7043 * the required debugging infrastructure rework.
7044 */
7045 kvm_queue_exception(vcpu, vec);
7046 return 1;
6aa8b732
AK
7047}
7048
a0861c02
AK
7049/*
7050 * Trigger machine check on the host. We assume all the MSRs are already set up
7051 * by the CPU and that we are still running on the same CPU the MCE
7052 * occurred on. We pass a fake environment to the machine check handler
7053 * because we want the guest to always be treated like user space, no
7054 * matter what context it used internally.
7055 */
7056static void kvm_machine_check(void)
7057{
7058#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
7059 struct pt_regs regs = {
7060 .cs = 3, /* Fake ring 3 no matter what the guest ran on */
7061 .flags = X86_EFLAGS_IF,
7062 };
7063
7064 do_machine_check(&regs, 0);
7065#endif
7066}
7067
851ba692 7068static int handle_machine_check(struct kvm_vcpu *vcpu)
a0861c02
AK
7069{
7070 /* already handled by vcpu_run */
7071 return 1;
7072}
7073
851ba692 7074static int handle_exception(struct kvm_vcpu *vcpu)
6aa8b732 7075{
1155f76a 7076 struct vcpu_vmx *vmx = to_vmx(vcpu);
851ba692 7077 struct kvm_run *kvm_run = vcpu->run;
d0bfb940 7078 u32 intr_info, ex_no, error_code;
42dbaa5a 7079 unsigned long cr2, rip, dr6;
6aa8b732
AK
7080 u32 vect_info;
7081 enum emulation_result er;
7082
1155f76a 7083 vect_info = vmx->idt_vectoring_info;
88786475 7084 intr_info = vmx->exit_intr_info;
6aa8b732 7085
a0861c02 7086 if (is_machine_check(intr_info))
851ba692 7087 return handle_machine_check(vcpu);
a0861c02 7088
ef85b673 7089 if (is_nmi(intr_info))
1b6269db 7090 return 1; /* already handled by vmx_vcpu_run() */
2ab455cc 7091
082d06ed
WL
7092 if (is_invalid_opcode(intr_info))
7093 return handle_ud(vcpu);
7aa81cc0 7094
6aa8b732 7095 error_code = 0;
2e11384c 7096 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
6aa8b732 7097 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
bf4ca23e 7098
9e869480
LA
7099 if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
7100 WARN_ON_ONCE(!enable_vmware_backdoor);
0ce97a2b 7101 er = kvm_emulate_instruction(vcpu,
9e869480
LA
7102 EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
7103 if (er == EMULATE_USER_EXIT)
7104 return 0;
7105 else if (er != EMULATE_DONE)
7106 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
7107 return 1;
7108 }
7109
bf4ca23e
XG
7110 /*
7111 * A #PF with PFEC.RSVD = 1 indicates that the guest is accessing
7112 * MMIO; it is better to report an internal error.
7113 * See the comments in vmx_handle_exit.
7114 */
7115 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
7116 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
7117 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
7118 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
80f0e95d 7119 vcpu->run->internal.ndata = 3;
bf4ca23e
XG
7120 vcpu->run->internal.data[0] = vect_info;
7121 vcpu->run->internal.data[1] = intr_info;
80f0e95d 7122 vcpu->run->internal.data[2] = error_code;
bf4ca23e
XG
7123 return 0;
7124 }
7125
6aa8b732
AK
7126 if (is_page_fault(intr_info)) {
7127 cr2 = vmcs_readl(EXIT_QUALIFICATION);
1261bfa3
WL
7128 /* EPT won't cause page fault directly */
7129 WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
d0006530 7130 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
6aa8b732
AK
7131 }
7132
d0bfb940 7133 ex_no = intr_info & INTR_INFO_VECTOR_MASK;
0ca1b4f4
GN
7134
7135 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
7136 return handle_rmode_exception(vcpu, ex_no, error_code);
7137
42dbaa5a 7138 switch (ex_no) {
54a20552
EN
7139 case AC_VECTOR:
7140 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
7141 return 1;
42dbaa5a
JK
7142 case DB_VECTOR:
7143 dr6 = vmcs_readl(EXIT_QUALIFICATION);
7144 if (!(vcpu->guest_debug &
7145 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
8246bf52 7146 vcpu->arch.dr6 &= ~15;
6f43ed01 7147 vcpu->arch.dr6 |= dr6 | DR6_RTM;
32d43cd3 7148 if (is_icebp(intr_info))
fd2a445a
HD
7149 skip_emulated_instruction(vcpu);
7150
42dbaa5a
JK
7151 kvm_queue_exception(vcpu, DB_VECTOR);
7152 return 1;
7153 }
7154 kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
7155 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
7156 /* fall through */
7157 case BP_VECTOR:
c573cd22
JK
7158 /*
7159 * Update instruction length as we may reinject #BP from
7160 * user space while in guest debugging mode. Reading it for
7161 * #DB as well causes no harm, it is not used in that case.
7162 */
7163 vmx->vcpu.arch.event_exit_inst_len =
7164 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
6aa8b732 7165 kvm_run->exit_reason = KVM_EXIT_DEBUG;
0a434bb2 7166 rip = kvm_rip_read(vcpu);
d0bfb940
JK
7167 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
7168 kvm_run->debug.arch.exception = ex_no;
42dbaa5a
JK
7169 break;
7170 default:
d0bfb940
JK
7171 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
7172 kvm_run->ex.exception = ex_no;
7173 kvm_run->ex.error_code = error_code;
42dbaa5a 7174 break;
6aa8b732 7175 }
6aa8b732
AK
7176 return 0;
7177}
7178
851ba692 7179static int handle_external_interrupt(struct kvm_vcpu *vcpu)
6aa8b732 7180{
1165f5fe 7181 ++vcpu->stat.irq_exits;
6aa8b732
AK
7182 return 1;
7183}
7184
851ba692 7185static int handle_triple_fault(struct kvm_vcpu *vcpu)
988ad74f 7186{
851ba692 7187 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
bbeac283 7188 vcpu->mmio_needed = 0;
988ad74f
AK
7189 return 0;
7190}
6aa8b732 7191
851ba692 7192static int handle_io(struct kvm_vcpu *vcpu)
6aa8b732 7193{
bfdaab09 7194 unsigned long exit_qualification;
dca7f128 7195 int size, in, string;
039576c0 7196 unsigned port;
6aa8b732 7197
bfdaab09 7198 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
039576c0 7199 string = (exit_qualification & 16) != 0;
e70669ab 7200
cf8f70bf 7201 ++vcpu->stat.io_exits;
e70669ab 7202
432baf60 7203 if (string)
0ce97a2b 7204 return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
e70669ab 7205
cf8f70bf
GN
7206 port = exit_qualification >> 16;
7207 size = (exit_qualification & 7) + 1;
432baf60 7208 in = (exit_qualification & 8) != 0;
cf8f70bf 7209
dca7f128 7210 return kvm_fast_pio(vcpu, size, port, in);
6aa8b732
AK
7211}
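/*
 * Worked example (illustrative, not from the original file): a 2-byte
 * IN from port 0x3f8 yields exit_qualification = 0x03f80009; bits 2:0
 * = 1 (access size minus one), bit 3 = 1 (IN), bit 4 = 0 (not a string
 * instruction) and bits 31:16 = 0x03f8 (port).  The decoding above
 * therefore produces size = 2, in = 1, string = 0, port = 0x3f8 before
 * handing the access to kvm_fast_pio().
 */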
7212
102d8325
IM
7213static void
7214vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
7215{
7216 /*
7217 * Patch in the VMCALL instruction:
7218 */
7219 hypercall[0] = 0x0f;
7220 hypercall[1] = 0x01;
7221 hypercall[2] = 0xc1;
102d8325
IM
7222}
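/*
 * Illustrative note, not from the original file: 0f 01 c1 is the
 * encoding of the VMCALL instruction, so the three bytes written above
 * simply replace whatever hypercall opcode the guest originally used
 * with the Intel-native one.
 */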
7223
0fa06071 7224/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
eeadf9e7
NHE
7225static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
7226{
eeadf9e7 7227 if (is_guest_mode(vcpu)) {
1a0d74e6
JK
7228 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7229 unsigned long orig_val = val;
7230
eeadf9e7
NHE
7231 /*
7232 * We get here when L2 changed cr0 in a way that did not change
7233 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
1a0d74e6
JK
7234 * but did change L0 shadowed bits. So we first calculate the
7235 * effective cr0 value that L1 would like to write into the
7236 * hardware. It consists of the L2-owned bits from the new
7237 * value combined with the L1-owned bits from L1's guest_cr0.
eeadf9e7 7238 */
1a0d74e6
JK
7239 val = (val & ~vmcs12->cr0_guest_host_mask) |
7240 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
7241
3899152c 7242 if (!nested_guest_cr0_valid(vcpu, val))
eeadf9e7 7243 return 1;
1a0d74e6
JK
7244
7245 if (kvm_set_cr0(vcpu, val))
7246 return 1;
7247 vmcs_writel(CR0_READ_SHADOW, orig_val);
eeadf9e7 7248 return 0;
1a0d74e6
JK
7249 } else {
7250 if (to_vmx(vcpu)->nested.vmxon &&
3899152c 7251 !nested_host_cr0_valid(vcpu, val))
1a0d74e6 7252 return 1;
3899152c 7253
eeadf9e7 7254 return kvm_set_cr0(vcpu, val);
1a0d74e6 7255 }
eeadf9e7
NHE
7256}
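/*
 * Illustrative example with made-up values, not from the original
 * file: suppose vmcs12->cr0_guest_host_mask = 0x80000021 (L1 shadows
 * PG, NE and PE) and L2 writes val = 0x80050033.  The combination
 * above,
 *
 *	val = (0x80050033 & ~0x80000021) |
 *	      (vmcs12->guest_cr0 & 0x80000021),
 *
 * keeps L2's new values for the unshadowed bits and L1's guest_cr0 for
 * the shadowed ones, so only the CR0 that L1 itself would have written
 * reaches kvm_set_cr0(), while CR0_READ_SHADOW keeps what L2 asked for.
 */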
7257
7258static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
7259{
7260 if (is_guest_mode(vcpu)) {
1a0d74e6
JK
7261 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7262 unsigned long orig_val = val;
7263
7264 /* analogously to handle_set_cr0 */
7265 val = (val & ~vmcs12->cr4_guest_host_mask) |
7266 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
7267 if (kvm_set_cr4(vcpu, val))
eeadf9e7 7268 return 1;
1a0d74e6 7269 vmcs_writel(CR4_READ_SHADOW, orig_val);
eeadf9e7
NHE
7270 return 0;
7271 } else
7272 return kvm_set_cr4(vcpu, val);
7273}
7274
0367f205
PB
7275static int handle_desc(struct kvm_vcpu *vcpu)
7276{
7277 WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
0ce97a2b 7278 return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
0367f205
PB
7279}
7280
851ba692 7281static int handle_cr(struct kvm_vcpu *vcpu)
6aa8b732 7282{
229456fc 7283 unsigned long exit_qualification, val;
6aa8b732
AK
7284 int cr;
7285 int reg;
49a9b07e 7286 int err;
6affcbed 7287 int ret;
6aa8b732 7288
bfdaab09 7289 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6aa8b732
AK
7290 cr = exit_qualification & 15;
7291 reg = (exit_qualification >> 8) & 15;
7292 switch ((exit_qualification >> 4) & 3) {
7293 case 0: /* mov to cr */
1e32c079 7294 val = kvm_register_readl(vcpu, reg);
229456fc 7295 trace_kvm_cr_write(cr, val);
6aa8b732
AK
7296 switch (cr) {
7297 case 0:
eeadf9e7 7298 err = handle_set_cr0(vcpu, val);
6affcbed 7299 return kvm_complete_insn_gp(vcpu, err);
6aa8b732 7300 case 3:
e1de91cc 7301 WARN_ON_ONCE(enable_unrestricted_guest);
2390218b 7302 err = kvm_set_cr3(vcpu, val);
6affcbed 7303 return kvm_complete_insn_gp(vcpu, err);
6aa8b732 7304 case 4:
eeadf9e7 7305 err = handle_set_cr4(vcpu, val);
6affcbed 7306 return kvm_complete_insn_gp(vcpu, err);
0a5fff19
GN
7307 case 8: {
7308 u8 cr8_prev = kvm_get_cr8(vcpu);
1e32c079 7309 u8 cr8 = (u8)val;
eea1cff9 7310 err = kvm_set_cr8(vcpu, cr8);
6affcbed 7311 ret = kvm_complete_insn_gp(vcpu, err);
35754c98 7312 if (lapic_in_kernel(vcpu))
6affcbed 7313 return ret;
0a5fff19 7314 if (cr8_prev <= cr8)
6affcbed
KH
7315 return ret;
7316 /*
7317 * TODO: we might be squashing a
7318 * KVM_GUESTDBG_SINGLESTEP-triggered
7319 * KVM_EXIT_DEBUG here.
7320 */
851ba692 7321 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
0a5fff19
GN
7322 return 0;
7323 }
4b8073e4 7324 }
6aa8b732 7325 break;
25c4c276 7326 case 2: /* clts */
bd7e5b08
PB
7327 WARN_ONCE(1, "Guest should always own CR0.TS");
7328 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
4d4ec087 7329 trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
6affcbed 7330 return kvm_skip_emulated_instruction(vcpu);
6aa8b732
AK
7331 case 1: /*mov from cr*/
7332 switch (cr) {
7333 case 3:
e1de91cc 7334 WARN_ON_ONCE(enable_unrestricted_guest);
9f8fe504
AK
7335 val = kvm_read_cr3(vcpu);
7336 kvm_register_write(vcpu, reg, val);
7337 trace_kvm_cr_read(cr, val);
6affcbed 7338 return kvm_skip_emulated_instruction(vcpu);
6aa8b732 7339 case 8:
229456fc
MT
7340 val = kvm_get_cr8(vcpu);
7341 kvm_register_write(vcpu, reg, val);
7342 trace_kvm_cr_read(cr, val);
6affcbed 7343 return kvm_skip_emulated_instruction(vcpu);
6aa8b732
AK
7344 }
7345 break;
7346 case 3: /* lmsw */
a1f83a74 7347 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
4d4ec087 7348 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
a1f83a74 7349 kvm_lmsw(vcpu, val);
6aa8b732 7350
6affcbed 7351 return kvm_skip_emulated_instruction(vcpu);
6aa8b732
AK
7352 default:
7353 break;
7354 }
851ba692 7355 vcpu->run->exit_reason = 0;
a737f256 7356 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
6aa8b732
AK
7357 (int)(exit_qualification >> 4) & 3, cr);
7358 return 0;
7359}
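/*
 * Worked example (illustrative, not from the original file): a
 * "mov %rax, %cr4" exit has exit_qualification = 0x00000004 -- bits
 * 3:0 = 4 (control register number), bits 5:4 = 0 (MOV to CR) and
 * bits 11:8 = 0 (source register RAX).  The decoding above therefore
 * takes the "case 0" access path with cr = 4 and reg = RAX and ends
 * up in handle_set_cr4().
 */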
7360
851ba692 7361static int handle_dr(struct kvm_vcpu *vcpu)
6aa8b732 7362{
bfdaab09 7363 unsigned long exit_qualification;
16f8a6f9
NA
7364 int dr, dr7, reg;
7365
7366 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7367 dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
7368
7369 /* First, if DR does not exist, trigger UD */
7370 if (!kvm_require_dr(vcpu, dr))
7371 return 1;
6aa8b732 7372
f2483415 7373 /* Do not handle if CPL > 0; a #GP will be triggered on re-entry */
0a79b009
AK
7374 if (!kvm_require_cpl(vcpu, 0))
7375 return 1;
16f8a6f9
NA
7376 dr7 = vmcs_readl(GUEST_DR7);
7377 if (dr7 & DR7_GD) {
42dbaa5a
JK
7378 /*
7379 * As the vm-exit takes precedence over the debug trap, we
7380 * need to emulate the latter, either for the host or the
7381 * guest debugging itself.
7382 */
7383 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
851ba692 7384 vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
16f8a6f9 7385 vcpu->run->debug.arch.dr7 = dr7;
82b32774 7386 vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
851ba692
AK
7387 vcpu->run->debug.arch.exception = DB_VECTOR;
7388 vcpu->run->exit_reason = KVM_EXIT_DEBUG;
42dbaa5a
JK
7389 return 0;
7390 } else {
7305eb5d 7391 vcpu->arch.dr6 &= ~15;
6f43ed01 7392 vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
42dbaa5a
JK
7393 kvm_queue_exception(vcpu, DB_VECTOR);
7394 return 1;
7395 }
7396 }
7397
81908bf4 7398 if (vcpu->guest_debug == 0) {
8f22372f
PB
7399 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
7400 CPU_BASED_MOV_DR_EXITING);
81908bf4
PB
7401
7402 /*
7403 * No more DR vmexits; force a reload of the debug registers
7404 * and reenter on this instruction. The next vmexit will
7405 * retrieve the full state of the debug registers.
7406 */
7407 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
7408 return 1;
7409 }
7410
42dbaa5a
JK
7411 reg = DEBUG_REG_ACCESS_REG(exit_qualification);
7412 if (exit_qualification & TYPE_MOV_FROM_DR) {
020df079 7413 unsigned long val;
4c4d563b
JK
7414
7415 if (kvm_get_dr(vcpu, dr, &val))
7416 return 1;
7417 kvm_register_write(vcpu, reg, val);
020df079 7418 } else
5777392e 7419 if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
4c4d563b
JK
7420 return 1;
7421
6affcbed 7422 return kvm_skip_emulated_instruction(vcpu);
6aa8b732
AK
7423}
7424
73aaf249
JK
7425static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
7426{
7427 return vcpu->arch.dr6;
7428}
7429
7430static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
7431{
7432}
7433
81908bf4
PB
7434static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
7435{
81908bf4
PB
7436 get_debugreg(vcpu->arch.db[0], 0);
7437 get_debugreg(vcpu->arch.db[1], 1);
7438 get_debugreg(vcpu->arch.db[2], 2);
7439 get_debugreg(vcpu->arch.db[3], 3);
7440 get_debugreg(vcpu->arch.dr6, 6);
7441 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
7442
7443 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
8f22372f 7444 vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);
81908bf4
PB
7445}
7446
020df079
GN
7447static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
7448{
7449 vmcs_writel(GUEST_DR7, val);
7450}
7451
851ba692 7452static int handle_cpuid(struct kvm_vcpu *vcpu)
6aa8b732 7453{
6a908b62 7454 return kvm_emulate_cpuid(vcpu);
6aa8b732
AK
7455}
7456
851ba692 7457static int handle_rdmsr(struct kvm_vcpu *vcpu)
6aa8b732 7458{
ad312c7c 7459 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
609e36d3 7460 struct msr_data msr_info;
6aa8b732 7461
609e36d3
PB
7462 msr_info.index = ecx;
7463 msr_info.host_initiated = false;
7464 if (vmx_get_msr(vcpu, &msr_info)) {
59200273 7465 trace_kvm_msr_read_ex(ecx);
c1a5d4f9 7466 kvm_inject_gp(vcpu, 0);
6aa8b732
AK
7467 return 1;
7468 }
7469
609e36d3 7470 trace_kvm_msr_read(ecx, msr_info.data);
2714d1d3 7471
6aa8b732 7472 /* FIXME: handling of bits 32:63 of rax, rdx */
609e36d3
PB
7473 vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u;
7474 vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u;
6affcbed 7475 return kvm_skip_emulated_instruction(vcpu);
6aa8b732
AK
7476}
7477
851ba692 7478static int handle_wrmsr(struct kvm_vcpu *vcpu)
6aa8b732 7479{
8fe8ab46 7480 struct msr_data msr;
ad312c7c
ZX
7481 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
7482 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
7483 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
6aa8b732 7484
8fe8ab46
WA
7485 msr.data = data;
7486 msr.index = ecx;
7487 msr.host_initiated = false;
854e8bb1 7488 if (kvm_set_msr(vcpu, &msr) != 0) {
59200273 7489 trace_kvm_msr_write_ex(ecx, data);
c1a5d4f9 7490 kvm_inject_gp(vcpu, 0);
6aa8b732
AK
7491 return 1;
7492 }
7493
59200273 7494 trace_kvm_msr_write(ecx, data);
6affcbed 7495 return kvm_skip_emulated_instruction(vcpu);
6aa8b732
AK
7496}
7497
851ba692 7498static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
6e5d865c 7499{
eb90f341 7500 kvm_apic_update_ppr(vcpu);
6e5d865c
YS
7501 return 1;
7502}
7503
851ba692 7504static int handle_interrupt_window(struct kvm_vcpu *vcpu)
6aa8b732 7505{
47c0152e
PB
7506 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
7507 CPU_BASED_VIRTUAL_INTR_PENDING);
2714d1d3 7508
3842d135
AK
7509 kvm_make_request(KVM_REQ_EVENT, vcpu);
7510
a26bf12a 7511 ++vcpu->stat.irq_window_exits;
6aa8b732
AK
7512 return 1;
7513}
7514
851ba692 7515static int handle_halt(struct kvm_vcpu *vcpu)
6aa8b732 7516{
d3bef15f 7517 return kvm_emulate_halt(vcpu);
6aa8b732
AK
7518}
7519
851ba692 7520static int handle_vmcall(struct kvm_vcpu *vcpu)
c21415e8 7521{
0d9c055e 7522 return kvm_emulate_hypercall(vcpu);
c21415e8
IM
7523}
7524
ec25d5e6
GN
7525static int handle_invd(struct kvm_vcpu *vcpu)
7526{
0ce97a2b 7527 return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
ec25d5e6
GN
7528}
7529
851ba692 7530static int handle_invlpg(struct kvm_vcpu *vcpu)
a7052897 7531{
f9c617f6 7532 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
a7052897
MT
7533
7534 kvm_mmu_invlpg(vcpu, exit_qualification);
6affcbed 7535 return kvm_skip_emulated_instruction(vcpu);
a7052897
MT
7536}
7537
fee84b07
AK
7538static int handle_rdpmc(struct kvm_vcpu *vcpu)
7539{
7540 int err;
7541
7542 err = kvm_rdpmc(vcpu);
6affcbed 7543 return kvm_complete_insn_gp(vcpu, err);
fee84b07
AK
7544}
7545
851ba692 7546static int handle_wbinvd(struct kvm_vcpu *vcpu)
e5edaa01 7547{
6affcbed 7548 return kvm_emulate_wbinvd(vcpu);
e5edaa01
ED
7549}
7550
2acf923e
DC
7551static int handle_xsetbv(struct kvm_vcpu *vcpu)
7552{
7553 u64 new_bv = kvm_read_edx_eax(vcpu);
7554 u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
7555
7556 if (kvm_set_xcr(vcpu, index, new_bv) == 0)
6affcbed 7557 return kvm_skip_emulated_instruction(vcpu);
2acf923e
DC
7558 return 1;
7559}
7560
f53cd63c
WL
7561static int handle_xsaves(struct kvm_vcpu *vcpu)
7562{
6affcbed 7563 kvm_skip_emulated_instruction(vcpu);
f53cd63c
WL
7564 WARN(1, "this should never happen\n");
7565 return 1;
7566}
7567
7568static int handle_xrstors(struct kvm_vcpu *vcpu)
7569{
6affcbed 7570 kvm_skip_emulated_instruction(vcpu);
f53cd63c
WL
7571 WARN(1, "this should never happen\n");
7572 return 1;
7573}
7574
851ba692 7575static int handle_apic_access(struct kvm_vcpu *vcpu)
f78e0e2e 7576{
58fbbf26
KT
7577 if (likely(fasteoi)) {
7578 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7579 int access_type, offset;
7580
7581 access_type = exit_qualification & APIC_ACCESS_TYPE;
7582 offset = exit_qualification & APIC_ACCESS_OFFSET;
7583 /*
7584 * Sane guest uses MOV to write EOI, with written value
7585 * not cared. So make a short-circuit here by avoiding
7586 * heavy instruction emulation.
7587 */
7588 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
7589 (offset == APIC_EOI)) {
7590 kvm_lapic_set_eoi(vcpu);
6affcbed 7591 return kvm_skip_emulated_instruction(vcpu);
58fbbf26
KT
7592 }
7593 }
0ce97a2b 7594 return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
f78e0e2e
SY
7595}
7596
c7c9c56c
YZ
7597static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
7598{
7599 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7600 int vector = exit_qualification & 0xff;
7601
7602 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
7603 kvm_apic_set_eoi_accelerated(vcpu, vector);
7604 return 1;
7605}
7606
83d4c286
YZ
7607static int handle_apic_write(struct kvm_vcpu *vcpu)
7608{
7609 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7610 u32 offset = exit_qualification & 0xfff;
7611
7612 /* APIC-write VM exit is trap-like and thus no need to adjust IP */
7613 kvm_apic_write_nodecode(vcpu, offset);
7614 return 1;
7615}
7616
851ba692 7617static int handle_task_switch(struct kvm_vcpu *vcpu)
37817f29 7618{
60637aac 7619 struct vcpu_vmx *vmx = to_vmx(vcpu);
37817f29 7620 unsigned long exit_qualification;
e269fb21
JK
7621 bool has_error_code = false;
7622 u32 error_code = 0;
37817f29 7623 u16 tss_selector;
7f3d35fd 7624 int reason, type, idt_v, idt_index;
64a7ec06
GN
7625
7626 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
7f3d35fd 7627 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
64a7ec06 7628 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
37817f29
IE
7629
7630 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7631
7632 reason = (u32)exit_qualification >> 30;
64a7ec06
GN
7633 if (reason == TASK_SWITCH_GATE && idt_v) {
7634 switch (type) {
7635 case INTR_TYPE_NMI_INTR:
7636 vcpu->arch.nmi_injected = false;
654f06fc 7637 vmx_set_nmi_mask(vcpu, true);
64a7ec06
GN
7638 break;
7639 case INTR_TYPE_EXT_INTR:
66fd3f7f 7640 case INTR_TYPE_SOFT_INTR:
64a7ec06
GN
7641 kvm_clear_interrupt_queue(vcpu);
7642 break;
7643 case INTR_TYPE_HARD_EXCEPTION:
e269fb21
JK
7644 if (vmx->idt_vectoring_info &
7645 VECTORING_INFO_DELIVER_CODE_MASK) {
7646 has_error_code = true;
7647 error_code =
7648 vmcs_read32(IDT_VECTORING_ERROR_CODE);
7649 }
7650 /* fall through */
64a7ec06
GN
7651 case INTR_TYPE_SOFT_EXCEPTION:
7652 kvm_clear_exception_queue(vcpu);
7653 break;
7654 default:
7655 break;
7656 }
60637aac 7657 }
37817f29
IE
7658 tss_selector = exit_qualification;
7659
64a7ec06
GN
7660 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
7661 type != INTR_TYPE_EXT_INTR &&
7662 type != INTR_TYPE_NMI_INTR))
7663 skip_emulated_instruction(vcpu);
7664
7f3d35fd
KW
7665 if (kvm_task_switch(vcpu, tss_selector,
7666 type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
7667 has_error_code, error_code) == EMULATE_FAIL) {
acb54517
GN
7668 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
7669 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
7670 vcpu->run->internal.ndata = 0;
42dbaa5a 7671 return 0;
acb54517 7672 }
42dbaa5a 7673
42dbaa5a
JK
7674 /*
7675 * TODO: What about debug traps on tss switch?
7676 * Are we supposed to inject them and update dr6?
7677 */
7678
7679 return 1;
37817f29
IE
7680}
7681
851ba692 7682static int handle_ept_violation(struct kvm_vcpu *vcpu)
1439442c 7683{
f9c617f6 7684 unsigned long exit_qualification;
1439442c 7685 gpa_t gpa;
eebed243 7686 u64 error_code;
1439442c 7687
f9c617f6 7688 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
1439442c 7689
0be9c7a8
GN
7690 /*
7691 * If the EPT violation happened while executing IRET from an NMI,
7692 * the "blocked by NMI" bit has to be set before the next VM entry.
7693 * There are errata that may cause this bit to not be set:
7694 * AAK134, BY25.
7695 */
bcd1c294 7696 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
d02fcf50 7697 enable_vnmi &&
bcd1c294 7698 (exit_qualification & INTR_INFO_UNBLOCK_NMI))
0be9c7a8
GN
7699 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
7700
1439442c 7701 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
229456fc 7702 trace_kvm_page_fault(gpa, exit_qualification);
4f5982a5 7703
27959a44 7704 /* Is it a read fault? */
ab22a473 7705 error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
27959a44
JS
7706 ? PFERR_USER_MASK : 0;
7707 /* Is it a write fault? */
ab22a473 7708 error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
27959a44
JS
7709 ? PFERR_WRITE_MASK : 0;
7710 /* Is it a fetch fault? */
ab22a473 7711 error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
27959a44
JS
7712 ? PFERR_FETCH_MASK : 0;
7713 /* ept page table entry is present? */
7714 error_code |= (exit_qualification &
7715 (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
7716 EPT_VIOLATION_EXECUTABLE))
7717 ? PFERR_PRESENT_MASK : 0;
4f5982a5 7718
eebed243
PB
7719 error_code |= (exit_qualification & 0x100) != 0 ?
7720 PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
25d92081 7721
25d92081 7722 vcpu->arch.exit_qualification = exit_qualification;
4f5982a5 7723 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
1439442c
SY
7724}
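/*
 * Worked example (illustrative, not from the original file): an exit
 * qualification of 0x182 has bit 1 (write access), bit 7 (guest linear
 * address valid) and bit 8 (final translation) set, while the
 * EPT_VIOLATION_*ABLE bits are all clear.  The decoding above turns
 * that into error_code = PFERR_WRITE_MASK | PFERR_GUEST_FINAL_MASK: a
 * write to a not-present translation, faulting on the final
 * guest-physical access rather than on a guest paging-structure walk.
 */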
7725
851ba692 7726static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
68f89400 7727{
68f89400
MT
7728 gpa_t gpa;
7729
9034e6e8
PB
7730 /*
7731 * A nested guest cannot optimize MMIO vmexits, because we have an
7732 * nGPA here instead of the required GPA.
7733 */
68f89400 7734 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
9034e6e8
PB
7735 if (!is_guest_mode(vcpu) &&
7736 !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
931c33b1 7737 trace_kvm_fast_mmio(gpa);
d391f120
VK
7738 /*
7739 * Doing kvm_skip_emulated_instruction() depends on undefined
7740 * behavior: Intel's manual doesn't mandate that
7741 * VM_EXIT_INSTRUCTION_LEN be set in the VMCS when an EPT MISCONFIG
7742 * occurs. While on real hardware it was observed to be set, other
7743 * hypervisors (namely Hyper-V) don't set it, so we would end up
7744 * advancing the IP by some random value. Disable fast mmio when
7745 * running nested and keep it for real hardware, in the hope that
7746 * VM_EXIT_INSTRUCTION_LEN will always be set correctly.
7747 */
7748 if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
7749 return kvm_skip_emulated_instruction(vcpu);
7750 else
0ce97a2b 7751 return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) ==
c4409905 7752 EMULATE_DONE;
68c3b4d1 7753 }
68f89400 7754
c75d0edc 7755 return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
68f89400
MT
7756}
7757
851ba692 7758static int handle_nmi_window(struct kvm_vcpu *vcpu)
f08864b4 7759{
d02fcf50 7760 WARN_ON_ONCE(!enable_vnmi);
47c0152e
PB
7761 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
7762 CPU_BASED_VIRTUAL_NMI_PENDING);
f08864b4 7763 ++vcpu->stat.nmi_window_exits;
3842d135 7764 kvm_make_request(KVM_REQ_EVENT, vcpu);
f08864b4
SY
7765
7766 return 1;
7767}
7768
80ced186 7769static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
ea953ef0 7770{
8b3079a5
AK
7771 struct vcpu_vmx *vmx = to_vmx(vcpu);
7772 enum emulation_result err = EMULATE_DONE;
80ced186 7773 int ret = 1;
49e9d557
AK
7774 u32 cpu_exec_ctrl;
7775 bool intr_window_requested;
b8405c18 7776 unsigned count = 130;
49e9d557 7777
2bb8cafe
SC
7778 /*
7779 * We should never reach the point where we are emulating L2
7780 * due to invalid guest state, as that would mean we incorrectly
7781 * allowed a nested VMEntry with an invalid vmcs12.
7782 */
7783 WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);
7784
49e9d557
AK
7785 cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
7786 intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
ea953ef0 7787
98eb2f8b 7788 while (vmx->emulation_required && count-- != 0) {
bdea48e3 7789 if (intr_window_requested && vmx_interrupt_allowed(vcpu))
49e9d557
AK
7790 return handle_interrupt_window(&vmx->vcpu);
7791
72875d8a 7792 if (kvm_test_request(KVM_REQ_EVENT, vcpu))
de87dcdd
AK
7793 return 1;
7794
0ce97a2b 7795 err = kvm_emulate_instruction(vcpu, 0);
ea953ef0 7796
ac0a48c3 7797 if (err == EMULATE_USER_EXIT) {
94452b9e 7798 ++vcpu->stat.mmio_exits;
80ced186
MG
7799 ret = 0;
7800 goto out;
7801 }
1d5a4d9b 7802
add5ff7a
SC
7803 if (err != EMULATE_DONE)
7804 goto emulation_error;
7805
7806 if (vmx->emulation_required && !vmx->rmode.vm86_active &&
7807 vcpu->arch.exception.pending)
7808 goto emulation_error;
ea953ef0 7809
8d76c49e
GN
7810 if (vcpu->arch.halt_request) {
7811 vcpu->arch.halt_request = 0;
5cb56059 7812 ret = kvm_vcpu_halt(vcpu);
8d76c49e
GN
7813 goto out;
7814 }
7815
ea953ef0 7816 if (signal_pending(current))
80ced186 7817 goto out;
ea953ef0
MG
7818 if (need_resched())
7819 schedule();
7820 }
7821
80ced186
MG
7822out:
7823 return ret;
b4a2d31d 7824
add5ff7a
SC
7825emulation_error:
7826 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
7827 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
7828 vcpu->run->internal.ndata = 0;
7829 return 0;
b4a2d31d
RK
7830}
7831
7832static void grow_ple_window(struct kvm_vcpu *vcpu)
7833{
7834 struct vcpu_vmx *vmx = to_vmx(vcpu);
7835 int old = vmx->ple_window;
7836
c8e88717
BM
7837 vmx->ple_window = __grow_ple_window(old, ple_window,
7838 ple_window_grow,
7839 ple_window_max);
b4a2d31d
RK
7840
7841 if (vmx->ple_window != old)
7842 vmx->ple_window_dirty = true;
7b46268d
RK
7843
7844 trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old);
b4a2d31d
RK
7845}
7846
7847static void shrink_ple_window(struct kvm_vcpu *vcpu)
7848{
7849 struct vcpu_vmx *vmx = to_vmx(vcpu);
7850 int old = vmx->ple_window;
7851
c8e88717
BM
7852 vmx->ple_window = __shrink_ple_window(old, ple_window,
7853 ple_window_shrink,
7854 ple_window);
b4a2d31d
RK
7855
7856 if (vmx->ple_window != old)
7857 vmx->ple_window_dirty = true;
7b46268d
RK
7858
7859 trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old);
b4a2d31d
RK
7860}
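/*
 * Illustrative note (hedged; the exact step function lives in the
 * shared __grow_ple_window()/__shrink_ple_window() helpers): the two
 * functions above implement a simple adaptive scheme.  Each PAUSE-loop
 * exit grows the window, bounded by ple_window_max, so a vCPU that
 * spins a lot exits less often; the window is later shrunk back toward
 * the ple_window default so short spins keep triggering exits.  The
 * step sizes are controlled by the ple_window_grow and
 * ple_window_shrink module parameters.
 */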
7861
bf9f6ac8
FW
7862/*
7863 * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
7864 */
7865static void wakeup_handler(void)
7866{
7867 struct kvm_vcpu *vcpu;
7868 int cpu = smp_processor_id();
7869
7870 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
7871 list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
7872 blocked_vcpu_list) {
7873 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
7874
7875 if (pi_test_on(pi_desc) == 1)
7876 kvm_vcpu_kick(vcpu);
7877 }
7878 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
7879}
7880
e01bca2f 7881static void vmx_enable_tdp(void)
f160c7b7
JS
7882{
7883 kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
7884 enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
7885 enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
7886 0ull, VMX_EPT_EXECUTABLE_MASK,
7887 cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
d0ec49d4 7888 VMX_EPT_RWX_MASK, 0ull);
f160c7b7
JS
7889
7890 ept_set_mmio_spte_mask();
7891 kvm_enable_tdp();
7892}
7893
f2c7648d
TC
7894static __init int hardware_setup(void)
7895{
cf81a7e5 7896 unsigned long host_bndcfgs;
904e14fb 7897 int r = -ENOMEM, i;
34a1cd60
TC
7898
7899 rdmsrl_safe(MSR_EFER, &host_efer);
7900
7901 for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
7902 kvm_define_shared_msr(i, vmx_msr_index[i]);
7903
23611332
RK
7904 for (i = 0; i < VMX_BITMAP_NR; i++) {
7905 vmx_bitmap[i] = (unsigned long *)__get_free_page(GFP_KERNEL);
7906 if (!vmx_bitmap[i])
7907 goto out;
7908 }
34a1cd60 7909
34a1cd60
TC
7910 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
7911 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
7912
34a1cd60
TC
7913 if (setup_vmcs_config(&vmcs_config) < 0) {
7914 r = -EIO;
23611332 7915 goto out;
baa03522 7916 }
f2c7648d
TC
7917
7918 if (boot_cpu_has(X86_FEATURE_NX))
7919 kvm_enable_efer_bits(EFER_NX);
7920
cf81a7e5
SC
7921 if (boot_cpu_has(X86_FEATURE_MPX)) {
7922 rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
7923 WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
7924 }
7925
08d839c4
WL
7926 if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
7927 !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
f2c7648d 7928 enable_vpid = 0;
08d839c4 7929
f2c7648d 7930 if (!cpu_has_vmx_ept() ||
42aa53b4 7931 !cpu_has_vmx_ept_4levels() ||
f5f51586 7932 !cpu_has_vmx_ept_mt_wb() ||
8ad8182e 7933 !cpu_has_vmx_invept_global())
f2c7648d 7934 enable_ept = 0;
f2c7648d 7935
fce6ac4c 7936 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
f2c7648d
TC
7937 enable_ept_ad_bits = 0;
7938
8ad8182e 7939 if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
f2c7648d
TC
7940 enable_unrestricted_guest = 0;
7941
ad15a296 7942 if (!cpu_has_vmx_flexpriority())
f2c7648d
TC
7943 flexpriority_enabled = 0;
7944
d02fcf50
PB
7945 if (!cpu_has_virtual_nmis())
7946 enable_vnmi = 0;
7947
ad15a296
PB
7948 /*
7949 * set_apic_access_page_addr() is used to reload apic access
7950 * page upon invalidation. No need to do anything if not
7951 * using the APIC_ACCESS_ADDR VMCS field.
7952 */
7953 if (!flexpriority_enabled)
f2c7648d 7954 kvm_x86_ops->set_apic_access_page_addr = NULL;
f2c7648d
TC
7955
7956 if (!cpu_has_vmx_tpr_shadow())
7957 kvm_x86_ops->update_cr8_intercept = NULL;
7958
7959 if (enable_ept && !cpu_has_vmx_ept_2m_page())
7960 kvm_disable_largepages();
7961
877ad952
TL
7962#if IS_ENABLED(CONFIG_HYPERV)
7963 if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
7964 && enable_ept)
7965 kvm_x86_ops->tlb_remote_flush = vmx_hv_remote_flush_tlb;
7966#endif
7967
0f107682 7968 if (!cpu_has_vmx_ple()) {
f2c7648d 7969 ple_gap = 0;
0f107682
WL
7970 ple_window = 0;
7971 ple_window_grow = 0;
7972 ple_window_max = 0;
7973 ple_window_shrink = 0;
7974 }
f2c7648d 7975
76dfafd5 7976 if (!cpu_has_vmx_apicv()) {
f2c7648d 7977 enable_apicv = 0;
76dfafd5
PB
7978 kvm_x86_ops->sync_pir_to_irr = NULL;
7979 }
f2c7648d 7980
64903d61
HZ
7981 if (cpu_has_vmx_tsc_scaling()) {
7982 kvm_has_tsc_control = true;
7983 kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
7984 kvm_tsc_scaling_ratio_frac_bits = 48;
7985 }
7986
04bb92e4
WL
7987 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
7988
f160c7b7
JS
7989 if (enable_ept)
7990 vmx_enable_tdp();
7991 else
baa03522
TC
7992 kvm_disable_tdp();
7993
8fcc4b59
JM
7994 if (!nested) {
7995 kvm_x86_ops->get_nested_state = NULL;
7996 kvm_x86_ops->set_nested_state = NULL;
7997 }
7998
843e4330
KH
7999 /*
8000 * Only enable PML when hardware supports PML feature, and both EPT
8001 * and EPT A/D bit features are enabled -- PML depends on them to work.
8002 */
8003 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
8004 enable_pml = 0;
8005
8006 if (!enable_pml) {
8007 kvm_x86_ops->slot_enable_log_dirty = NULL;
8008 kvm_x86_ops->slot_disable_log_dirty = NULL;
8009 kvm_x86_ops->flush_log_dirty = NULL;
8010 kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
8011 }
8012
d264ee0c
SC
8013 if (!cpu_has_vmx_preemption_timer())
8014 kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
8015
64672c95
YJ
8016 if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
8017 u64 vmx_msr;
8018
8019 rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
8020 cpu_preemption_timer_multi =
8021 vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
8022 } else {
8023 kvm_x86_ops->set_hv_timer = NULL;
8024 kvm_x86_ops->cancel_hv_timer = NULL;
8025 }
8026
c5d167b2
PB
8027 if (!cpu_has_vmx_shadow_vmcs())
8028 enable_shadow_vmcs = 0;
8029 if (enable_shadow_vmcs)
8030 init_vmcs_shadow_fields();
8031
bf9f6ac8 8032 kvm_set_posted_intr_wakeup_handler(wakeup_handler);
1389309c 8033 nested_vmx_setup_ctls_msrs(&vmcs_config.nested, enable_apicv);
bf9f6ac8 8034
c45dcc71
AR
8035 kvm_mce_cap_supported |= MCG_LMCE_P;
8036
f2c7648d 8037 return alloc_kvm_area();
34a1cd60 8038
34a1cd60 8039out:
23611332
RK
8040 for (i = 0; i < VMX_BITMAP_NR; i++)
8041 free_page((unsigned long)vmx_bitmap[i]);
34a1cd60
TC
8042
8043 return r;
f2c7648d
TC
8044}
8045
8046static __exit void hardware_unsetup(void)
8047{
23611332
RK
8048 int i;
8049
8050 for (i = 0; i < VMX_BITMAP_NR; i++)
8051 free_page((unsigned long)vmx_bitmap[i]);
34a1cd60 8052
f2c7648d
TC
8053 free_kvm_area();
8054}
8055
4b8d54f9
ZE
8056/*
8057 * Indicates a vCPU busy-waiting on a spinlock. We do not enable plain
8058 * PAUSE exiting, so we only get here on CPUs with PAUSE-loop exiting.
8059 */
9fb41ba8 8060static int handle_pause(struct kvm_vcpu *vcpu)
4b8d54f9 8061{
b31c114b 8062 if (!kvm_pause_in_guest(vcpu->kvm))
b4a2d31d
RK
8063 grow_ple_window(vcpu);
8064
de63ad4c
LM
8065 /*
8066 * Intel SDM vol. 3, ch. 25.1.3 says the "PAUSE-loop exiting"
8067 * VM-execution control is ignored if CPL > 0. KVM, however,
8068 * never sets PAUSE_EXITING and only sets PLE if supported,
8069 * so the vCPU must be at CPL 0 if it gets a PAUSE exit.
8070 */
8071 kvm_vcpu_on_spin(vcpu, true);
6affcbed 8072 return kvm_skip_emulated_instruction(vcpu);
4b8d54f9
ZE
8073}
8074
87c00572 8075static int handle_nop(struct kvm_vcpu *vcpu)
59708670 8076{
6affcbed 8077 return kvm_skip_emulated_instruction(vcpu);
59708670
SY
8078}
8079
87c00572
GS
8080static int handle_mwait(struct kvm_vcpu *vcpu)
8081{
8082 printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
8083 return handle_nop(vcpu);
8084}
8085
45ec368c
JM
8086static int handle_invalid_op(struct kvm_vcpu *vcpu)
8087{
8088 kvm_queue_exception(vcpu, UD_VECTOR);
8089 return 1;
8090}
8091
5f3d45e7
MD
8092static int handle_monitor_trap(struct kvm_vcpu *vcpu)
8093{
8094 return 1;
8095}
8096
87c00572
GS
8097static int handle_monitor(struct kvm_vcpu *vcpu)
8098{
8099 printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
8100 return handle_nop(vcpu);
8101}
8102
0658fbaa
ACL
8103/*
8104 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
09abb5e3
SC
8105 * set the success or error code of an emulated VMX instruction (as specified
8106 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
8107 * instruction.
0658fbaa 8108 */
09abb5e3 8109static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
0658fbaa
ACL
8110{
8111 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
8112 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
8113 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
09abb5e3 8114 return kvm_skip_emulated_instruction(vcpu);
0658fbaa
ACL
8115}
8116
09abb5e3 8117static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
0658fbaa
ACL
8118{
8119 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
8120 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
8121 X86_EFLAGS_SF | X86_EFLAGS_OF))
8122 | X86_EFLAGS_CF);
09abb5e3 8123 return kvm_skip_emulated_instruction(vcpu);
0658fbaa
ACL
8124}
8125
09abb5e3
SC
8126static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
8127 u32 vm_instruction_error)
0658fbaa 8128{
b8bbab92
VK
8129 struct vcpu_vmx *vmx = to_vmx(vcpu);
8130
09abb5e3
SC
8131 /*
8132 * failValid writes the error number to the current VMCS, which
8133 * can't be done if there isn't a current VMCS.
8134 */
b8bbab92 8135 if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
09abb5e3
SC
8136 return nested_vmx_failInvalid(vcpu);
8137
0658fbaa
ACL
8138 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
8139 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
8140 X86_EFLAGS_SF | X86_EFLAGS_OF))
8141 | X86_EFLAGS_ZF);
8142 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
8143 /*
8144 * We don't need to force a shadow sync because
8145 * VM_INSTRUCTION_ERROR is not shadowed
8146 */
09abb5e3 8147 return kvm_skip_emulated_instruction(vcpu);
0658fbaa 8148}
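/*
 * Illustrative summary, not from the original file, of the SDM
 * "Conventions" flag encoding implemented by the three helpers above:
 *
 *	VMsucceed:     CF = PF = AF = ZF = SF = OF = 0
 *	VMfailInvalid: CF = 1, all other arithmetic flags cleared
 *	VMfailValid:   ZF = 1, all other arithmetic flags cleared, and
 *	               the error number stored in the current VMCS's
 *	               VM_INSTRUCTION_ERROR field
 */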
145c28dd 8149
ff651cb6
WV
8150static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
8151{
8152 /* TODO: not to reset guest simply here. */
8153 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
bbe41b95 8154 pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
ff651cb6
WV
8155}
8156
f4124500
JK
8157static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
8158{
8159 struct vcpu_vmx *vmx =
8160 container_of(timer, struct vcpu_vmx, nested.preemption_timer);
8161
8162 vmx->nested.preemption_timer_expired = true;
8163 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
8164 kvm_vcpu_kick(&vmx->vcpu);
8165
8166 return HRTIMER_NORESTART;
8167}
8168
19677e32
BD
8169/*
8170 * Decode the memory-address operand of a vmx instruction, as recorded on an
8171 * exit caused by such an instruction (run by a guest hypervisor).
8172	 * On success, returns 0. When the operand is invalid, returns 1 and injects
8173 * #UD or #GP.
8174 */
8175static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
8176 unsigned long exit_qualification,
f9eb4af6 8177 u32 vmx_instruction_info, bool wr, gva_t *ret)
19677e32 8178{
f9eb4af6
EK
8179 gva_t off;
8180 bool exn;
8181 struct kvm_segment s;
8182
19677e32
BD
8183 /*
8184 * According to Vol. 3B, "Information for VM Exits Due to Instruction
8185 * Execution", on an exit, vmx_instruction_info holds most of the
8186 * addressing components of the operand. Only the displacement part
8187 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
8188 * For how an actual address is calculated from all these components,
8189 * refer to Vol. 1, "Operand Addressing".
8190 */
8191 int scaling = vmx_instruction_info & 3;
8192 int addr_size = (vmx_instruction_info >> 7) & 7;
8193 bool is_reg = vmx_instruction_info & (1u << 10);
8194 int seg_reg = (vmx_instruction_info >> 15) & 7;
8195 int index_reg = (vmx_instruction_info >> 18) & 0xf;
8196 bool index_is_valid = !(vmx_instruction_info & (1u << 22));
8197 int base_reg = (vmx_instruction_info >> 23) & 0xf;
8198 bool base_is_valid = !(vmx_instruction_info & (1u << 27));
8199
8200 if (is_reg) {
8201 kvm_queue_exception(vcpu, UD_VECTOR);
8202 return 1;
8203 }
8204
8205 /* Addr = segment_base + offset */
8206 /* offset = base + [index * scale] + displacement */
f9eb4af6 8207 off = exit_qualification; /* holds the displacement */
19677e32 8208 if (base_is_valid)
f9eb4af6 8209 off += kvm_register_read(vcpu, base_reg);
19677e32 8210 if (index_is_valid)
f9eb4af6
EK
8211 off += kvm_register_read(vcpu, index_reg)<<scaling;
8212 vmx_get_segment(vcpu, &s, seg_reg);
8213 *ret = s.base + off;
19677e32
BD
8214
8215 if (addr_size == 1) /* 32 bit */
8216 *ret &= 0xffffffff;
8217
f9eb4af6
EK
8218 /* Checks for #GP/#SS exceptions. */
8219 exn = false;
ff30ef40
QC
8220 if (is_long_mode(vcpu)) {
8221 /* Long mode: #GP(0)/#SS(0) if the memory address is in a
8222 * non-canonical form. This is the only check on the memory
8223 * destination for long mode!
8224 */
fd8cb433 8225 exn = is_noncanonical_address(*ret, vcpu);
ff30ef40 8226 } else if (is_protmode(vcpu)) {
f9eb4af6
EK
8227 /* Protected mode: apply checks for segment validity in the
8228 * following order:
8229 * - segment type check (#GP(0) may be thrown)
8230 * - usability check (#GP(0)/#SS(0))
8231 * - limit check (#GP(0)/#SS(0))
8232 */
8233 if (wr)
8234 /* #GP(0) if the destination operand is located in a
8235 * read-only data segment or any code segment.
8236 */
8237 exn = ((s.type & 0xa) == 0 || (s.type & 8));
8238 else
8239 /* #GP(0) if the source operand is located in an
8240 * execute-only code segment
8241 */
8242 exn = ((s.type & 0xa) == 8);
ff30ef40
QC
8243 if (exn) {
8244 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
8245 return 1;
8246 }
f9eb4af6
EK
8247 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
8248 */
8249 exn = (s.unusable != 0);
8250 /* Protected mode: #GP(0)/#SS(0) if the memory
8251 * operand is outside the segment limit.
8252 */
8253 exn = exn || (off + sizeof(u64) > s.limit);
8254 }
8255 if (exn) {
8256 kvm_queue_exception_e(vcpu,
8257 seg_reg == VCPU_SREG_SS ?
8258 SS_VECTOR : GP_VECTOR,
8259 0);
8260 return 1;
8261 }
8262
19677e32
BD
8263 return 0;
8264}
8265
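/*
 * Illustrative sketch (not from the kernel): the bit layout of
 * vmx_instruction_info that get_vmx_mem_address() decodes above, pulled
 * out into a stand-alone decoder. The struct and function names are
 * hypothetical; the bit positions mirror the code above.
 */
#include <stdbool.h>
#include <stdint.h>

struct demo_vmx_insn_info {
	int scaling;		/* bits 1:0   - scale is 1 << scaling */
	int addr_size;		/* bits 9:7   - 1 means 32-bit (see above) */
	bool is_reg;		/* bit  10    - register (not memory) operand */
	int seg_reg;		/* bits 17:15 - segment register */
	int index_reg;		/* bits 21:18 - index register */
	bool index_is_valid;	/* !(bit 22)  - bit 22 means "index invalid" */
	int base_reg;		/* bits 26:23 - base register */
	bool base_is_valid;	/* !(bit 27)  - bit 27 means "base invalid" */
};

static struct demo_vmx_insn_info demo_decode_insn_info(uint32_t info)
{
	struct demo_vmx_insn_info d = {
		.scaling	= info & 3,
		.addr_size	= (info >> 7) & 7,
		.is_reg		= info & (1u << 10),
		.seg_reg	= (info >> 15) & 7,
		.index_reg	= (info >> 18) & 0xf,
		.index_is_valid	= !(info & (1u << 22)),
		.base_reg	= (info >> 23) & 0xf,
		.base_is_valid	= !(info & (1u << 27)),
	};
	return d;
}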
cbf71279 8266static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
3573e22c
BD
8267{
8268 gva_t gva;
3573e22c 8269 struct x86_exception e;
3573e22c
BD
8270
8271 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
f9eb4af6 8272 vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
3573e22c
BD
8273 return 1;
8274
ce14e868 8275 if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
3573e22c
BD
8276 kvm_inject_page_fault(vcpu, &e);
8277 return 1;
8278 }
8279
3573e22c
BD
8280 return 0;
8281}
8282
abfc52c6
LA
8283/*
8284 * Allocate a shadow VMCS and associate it with the currently loaded
8285 * VMCS, unless such a shadow VMCS already exists. The newly allocated
8286 * VMCS is also VMCLEARed, so that it is ready for use.
8287 */
8288static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
8289{
8290 struct vcpu_vmx *vmx = to_vmx(vcpu);
8291 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
8292
8293 /*
8294 * We should allocate a shadow vmcs for vmcs01 only when L1
8295 * executes VMXON and free it when L1 executes VMXOFF.
8296 * As it is invalid to execute VMXON twice, we shouldn't reach
8297	 * here when vmcs01 already has an allocated shadow vmcs.
8298 */
8299 WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
8300
8301 if (!loaded_vmcs->shadow_vmcs) {
8302 loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
8303 if (loaded_vmcs->shadow_vmcs)
8304 vmcs_clear(loaded_vmcs->shadow_vmcs);
8305 }
8306 return loaded_vmcs->shadow_vmcs;
8307}
8308
e29acc55
JM
8309static int enter_vmx_operation(struct kvm_vcpu *vcpu)
8310{
8311 struct vcpu_vmx *vmx = to_vmx(vcpu);
f21f165e 8312 int r;
e29acc55 8313
f21f165e
PB
8314 r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
8315 if (r < 0)
de3a0021 8316 goto out_vmcs02;
e29acc55
JM
8317
8318 vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
8319 if (!vmx->nested.cached_vmcs12)
8320 goto out_cached_vmcs12;
8321
61ada748
LA
8322 vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
8323 if (!vmx->nested.cached_shadow_vmcs12)
8324 goto out_cached_shadow_vmcs12;
8325
abfc52c6
LA
8326 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
8327 goto out_shadow_vmcs;
e29acc55 8328
e29acc55
JM
8329 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
8330 HRTIMER_MODE_REL_PINNED);
8331 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
8332
63aff655
RK
8333 vmx->nested.vpid02 = allocate_vpid();
8334
9d6105b2 8335 vmx->nested.vmcs02_initialized = false;
e29acc55
JM
8336 vmx->nested.vmxon = true;
8337 return 0;
8338
8339out_shadow_vmcs:
61ada748
LA
8340 kfree(vmx->nested.cached_shadow_vmcs12);
8341
8342out_cached_shadow_vmcs12:
e29acc55
JM
8343 kfree(vmx->nested.cached_vmcs12);
8344
8345out_cached_vmcs12:
de3a0021 8346 free_loaded_vmcs(&vmx->nested.vmcs02);
e29acc55 8347
de3a0021 8348out_vmcs02:
e29acc55
JM
8349 return -ENOMEM;
8350}
8351
ec378aee
NHE
8352/*
8353 * Emulate the VMXON instruction.
8354 * Currently, we just remember that VMX is active, and do not save or even
8355 * inspect the argument to VMXON (the so-called "VMXON pointer") because we
8356 * do not currently need to store anything in that guest-allocated memory
8357	 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their
8358 * argument is different from the VMXON pointer (which the spec says they do).
8359 */
8360static int handle_vmon(struct kvm_vcpu *vcpu)
8361{
e29acc55 8362 int ret;
cbf71279
RK
8363 gpa_t vmptr;
8364 struct page *page;
ec378aee 8365 struct vcpu_vmx *vmx = to_vmx(vcpu);
b3897a49
NHE
8366 const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
8367 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
ec378aee 8368
70f3aac9
JM
8369 /*
8370 * The Intel VMX Instruction Reference lists a bunch of bits that are
8371 * prerequisite to running VMXON, most notably cr4.VMXE must be set to
8372 * 1 (see vmx_set_cr4() for when we allow the guest to set this).
8373 * Otherwise, we should fail with #UD. But most faulting conditions
8374 * have already been checked by hardware, prior to the VM-exit for
8375 * VMXON. We do test guest cr4.VMXE because processor CR4 always has
8376 * that bit set to 1 in non-root mode.
ec378aee 8377 */
70f3aac9 8378 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
ec378aee
NHE
8379 kvm_queue_exception(vcpu, UD_VECTOR);
8380 return 1;
8381 }
8382
727ba748
FW
8383 /* CPL=0 must be checked manually. */
8384 if (vmx_get_cpl(vcpu)) {
36090bf4 8385 kvm_inject_gp(vcpu, 0);
727ba748
FW
8386 return 1;
8387 }
8388
09abb5e3
SC
8389 if (vmx->nested.vmxon)
8390 return nested_vmx_failValid(vcpu,
8391 VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
b3897a49 8392
3b84080b 8393 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
b3897a49
NHE
8394 != VMXON_NEEDED_FEATURES) {
8395 kvm_inject_gp(vcpu, 0);
8396 return 1;
8397 }
8398
cbf71279 8399 if (nested_vmx_get_vmptr(vcpu, &vmptr))
21e7fbe7 8400 return 1;
cbf71279
RK
8401
8402 /*
8403 * SDM 3: 24.11.5
8404	 * The first 4 bytes of the VMXON region contain the supported
8405	 * VMCS revision identifier.
8406	 *
8407	 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case,
8408	 * which would replace the physical address width limit with 32.
8409 */
09abb5e3
SC
8410 if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
8411 return nested_vmx_failInvalid(vcpu);
cbf71279 8412
5e2f30b7 8413 page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
09abb5e3
SC
8414 if (is_error_page(page))
8415 return nested_vmx_failInvalid(vcpu);
8416
cbf71279
RK
8417 if (*(u32 *)kmap(page) != VMCS12_REVISION) {
8418 kunmap(page);
53a70daf 8419 kvm_release_page_clean(page);
09abb5e3 8420 return nested_vmx_failInvalid(vcpu);
cbf71279
RK
8421 }
8422 kunmap(page);
53a70daf 8423 kvm_release_page_clean(page);
cbf71279
RK
8424
8425 vmx->nested.vmxon_ptr = vmptr;
e29acc55
JM
8426 ret = enter_vmx_operation(vcpu);
8427 if (ret)
8428 return ret;
ec378aee 8429
09abb5e3 8430 return nested_vmx_succeed(vcpu);
ec378aee
NHE
8431}
8432
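/*
 * Illustrative sketch (not from the kernel): VMXON above - and VMCLEAR
 * and VMPTRLD further down - all validate the guest-supplied physical
 * pointer the same way: it must be 4 KiB aligned and must not have bits
 * set above the guest's physical address width. A stand-alone model
 * (the function name is made up):
 */
#include <stdbool.h>
#include <stdint.h>

static bool demo_vmptr_is_plausible(uint64_t vmptr, int guest_maxphyaddr)
{
	return (vmptr & 0xfffull) == 0 &&		/* page aligned */
	       (vmptr >> guest_maxphyaddr) == 0;	/* within MAXPHYADDR */
}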
8433/*
8434 * Intel's VMX Instruction Reference specifies a common set of prerequisites
8435 * for running VMX instructions (except VMXON, whose prerequisites are
8436 * slightly different). It also specifies what exception to inject otherwise.
70f3aac9
JM
8437 * Note that many of these exceptions have priority over VM exits, so they
8438 * don't have to be checked again here.
ec378aee
NHE
8439 */
8440static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
8441{
e49fcb8b 8442 if (!to_vmx(vcpu)->nested.vmxon) {
727ba748
FW
8443 kvm_queue_exception(vcpu, UD_VECTOR);
8444 return 0;
8445 }
8446
e49fcb8b
JM
8447 if (vmx_get_cpl(vcpu)) {
8448 kvm_inject_gp(vcpu, 0);
ec378aee
NHE
8449 return 0;
8450 }
e49fcb8b 8451
ec378aee
NHE
8452 return 1;
8453}
8454
8ca44e88
DM
8455static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
8456{
8457 vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
8458 vmcs_write64(VMCS_LINK_POINTER, -1ull);
8459}
8460
b8bbab92
VK
8461static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
8462{
8463 struct vcpu_vmx *vmx = to_vmx(vcpu);
8464
8465 if (!vmx->nested.hv_evmcs)
8466 return;
8467
8468 kunmap(vmx->nested.hv_evmcs_page);
8469 kvm_release_page_dirty(vmx->nested.hv_evmcs_page);
8470 vmx->nested.hv_evmcs_vmptr = -1ull;
8471 vmx->nested.hv_evmcs_page = NULL;
8472 vmx->nested.hv_evmcs = NULL;
8473}
8474
14c07ad8 8475static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
e7953d7f 8476{
14c07ad8
VK
8477 struct vcpu_vmx *vmx = to_vmx(vcpu);
8478
9a2a05b9
PB
8479 if (vmx->nested.current_vmptr == -1ull)
8480 return;
8481
012f83cb 8482 if (enable_shadow_vmcs) {
9a2a05b9
PB
8483 /* copy to memory all shadowed fields in case
8484 they were modified */
8485 copy_shadow_to_vmcs12(vmx);
945679e3 8486 vmx->nested.need_vmcs12_sync = false;
8ca44e88 8487 vmx_disable_shadow_vmcs(vmx);
012f83cb 8488 }
705699a1 8489 vmx->nested.posted_intr_nv = -1;
4f2777bc
DM
8490
8491 /* Flush VMCS12 to guest memory */
14c07ad8 8492 kvm_vcpu_write_guest_page(vcpu,
9f744c59
PB
8493 vmx->nested.current_vmptr >> PAGE_SHIFT,
8494 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
4f2777bc 8495
14c07ad8
VK
8496 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
8497
9a2a05b9 8498 vmx->nested.current_vmptr = -1ull;
e7953d7f
AG
8499}
8500
ec378aee
NHE
8501/*
8502 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
8503 * just stops using VMX.
8504 */
14c07ad8 8505static void free_nested(struct kvm_vcpu *vcpu)
ec378aee 8506{
14c07ad8
VK
8507 struct vcpu_vmx *vmx = to_vmx(vcpu);
8508
b7455825 8509 if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
ec378aee 8510 return;
9a2a05b9 8511
ec378aee 8512 vmx->nested.vmxon = false;
b7455825 8513 vmx->nested.smm.vmxon = false;
5c614b35 8514 free_vpid(vmx->nested.vpid02);
8ca44e88
DM
8515 vmx->nested.posted_intr_nv = -1;
8516 vmx->nested.current_vmptr = -1ull;
355f4fb1 8517 if (enable_shadow_vmcs) {
8ca44e88 8518 vmx_disable_shadow_vmcs(vmx);
355f4fb1
JM
8519 vmcs_clear(vmx->vmcs01.shadow_vmcs);
8520 free_vmcs(vmx->vmcs01.shadow_vmcs);
8521 vmx->vmcs01.shadow_vmcs = NULL;
8522 }
4f2777bc 8523 kfree(vmx->nested.cached_vmcs12);
61ada748 8524 kfree(vmx->nested.cached_shadow_vmcs12);
de3a0021 8525 /* Unpin physical memory we referred to in the vmcs02 */
fe3ef05c 8526 if (vmx->nested.apic_access_page) {
53a70daf 8527 kvm_release_page_dirty(vmx->nested.apic_access_page);
48d89b92 8528 vmx->nested.apic_access_page = NULL;
fe3ef05c 8529 }
a7c0b07d 8530 if (vmx->nested.virtual_apic_page) {
53a70daf 8531 kvm_release_page_dirty(vmx->nested.virtual_apic_page);
48d89b92 8532 vmx->nested.virtual_apic_page = NULL;
a7c0b07d 8533 }
705699a1
WV
8534 if (vmx->nested.pi_desc_page) {
8535 kunmap(vmx->nested.pi_desc_page);
53a70daf 8536 kvm_release_page_dirty(vmx->nested.pi_desc_page);
705699a1
WV
8537 vmx->nested.pi_desc_page = NULL;
8538 vmx->nested.pi_desc = NULL;
8539 }
ff2f6fe9 8540
14c07ad8
VK
8541 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
8542
b8bbab92
VK
8543 nested_release_evmcs(vcpu);
8544
de3a0021 8545 free_loaded_vmcs(&vmx->nested.vmcs02);
ec378aee
NHE
8546}
8547
8548/* Emulate the VMXOFF instruction */
8549static int handle_vmoff(struct kvm_vcpu *vcpu)
8550{
8551 if (!nested_vmx_check_permission(vcpu))
8552 return 1;
14c07ad8 8553 free_nested(vcpu);
09abb5e3 8554 return nested_vmx_succeed(vcpu);
ec378aee
NHE
8555}
8556
27d6c865
NHE
8557/* Emulate the VMCLEAR instruction */
8558static int handle_vmclear(struct kvm_vcpu *vcpu)
8559{
8560 struct vcpu_vmx *vmx = to_vmx(vcpu);
587d7e72 8561 u32 zero = 0;
27d6c865 8562 gpa_t vmptr;
27d6c865
NHE
8563
8564 if (!nested_vmx_check_permission(vcpu))
8565 return 1;
8566
cbf71279 8567 if (nested_vmx_get_vmptr(vcpu, &vmptr))
27d6c865 8568 return 1;
27d6c865 8569
09abb5e3
SC
8570 if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
8571 return nested_vmx_failValid(vcpu,
8572 VMXERR_VMCLEAR_INVALID_ADDRESS);
cbf71279 8573
09abb5e3
SC
8574 if (vmptr == vmx->nested.vmxon_ptr)
8575 return nested_vmx_failValid(vcpu,
8576 VMXERR_VMCLEAR_VMXON_POINTER);
cbf71279 8577
b8bbab92
VK
8578 if (vmx->nested.hv_evmcs_page) {
8579 if (vmptr == vmx->nested.hv_evmcs_vmptr)
8580 nested_release_evmcs(vcpu);
8581 } else {
8582 if (vmptr == vmx->nested.current_vmptr)
8583 nested_release_vmcs12(vcpu);
27d6c865 8584
b8bbab92
VK
8585 kvm_vcpu_write_guest(vcpu,
8586 vmptr + offsetof(struct vmcs12,
8587 launch_state),
8588 &zero, sizeof(zero));
8589 }
27d6c865 8590
09abb5e3 8591 return nested_vmx_succeed(vcpu);
27d6c865
NHE
8592}
8593
cd232ad0
NHE
8594static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
8595
8596/* Emulate the VMLAUNCH instruction */
8597static int handle_vmlaunch(struct kvm_vcpu *vcpu)
8598{
8599 return nested_vmx_run(vcpu, true);
8600}
8601
8602/* Emulate the VMRESUME instruction */
8603static int handle_vmresume(struct kvm_vcpu *vcpu)
8604{
8605
8606 return nested_vmx_run(vcpu, false);
8607}
8608
49f705c5
NHE
8609/*
8610 * Read a vmcs12 field. Since these can have varying lengths and we return
8611 * one type, we chose the biggest type (u64) and zero-extend the return value
8612 * to that size. Note that the caller, handle_vmread, might need to use only
8613 * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
8614 * 64-bit fields are to be returned).
8615 */
e2536742 8616static inline int vmcs12_read_any(struct vmcs12 *vmcs12,
a2ae9df7 8617 unsigned long field, u64 *ret)
49f705c5
NHE
8618{
8619 short offset = vmcs_field_to_offset(field);
8620 char *p;
8621
8622 if (offset < 0)
a2ae9df7 8623 return offset;
49f705c5 8624
e2536742 8625 p = (char *)vmcs12 + offset;
49f705c5 8626
d37f4267
JM
8627 switch (vmcs_field_width(field)) {
8628 case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
49f705c5 8629 *ret = *((natural_width *)p);
a2ae9df7 8630 return 0;
d37f4267 8631 case VMCS_FIELD_WIDTH_U16:
49f705c5 8632 *ret = *((u16 *)p);
a2ae9df7 8633 return 0;
d37f4267 8634 case VMCS_FIELD_WIDTH_U32:
49f705c5 8635 *ret = *((u32 *)p);
a2ae9df7 8636 return 0;
d37f4267 8637 case VMCS_FIELD_WIDTH_U64:
49f705c5 8638 *ret = *((u64 *)p);
a2ae9df7 8639 return 0;
49f705c5 8640 default:
a2ae9df7
PB
8641 WARN_ON(1);
8642 return -ENOENT;
49f705c5
NHE
8643 }
8644}
8645
20b97fea 8646
e2536742 8647static inline int vmcs12_write_any(struct vmcs12 *vmcs12,
a2ae9df7 8648 unsigned long field, u64 field_value)
{
20b97fea 8649 short offset = vmcs_field_to_offset(field);
e2536742 8650 char *p = (char *)vmcs12 + offset;
20b97fea 8651 if (offset < 0)
a2ae9df7 8652 return offset;
20b97fea 8653
d37f4267
JM
8654 switch (vmcs_field_width(field)) {
8655 case VMCS_FIELD_WIDTH_U16:
20b97fea 8656 *(u16 *)p = field_value;
a2ae9df7 8657 return 0;
d37f4267 8658 case VMCS_FIELD_WIDTH_U32:
20b97fea 8659 *(u32 *)p = field_value;
a2ae9df7 8660 return 0;
d37f4267 8661 case VMCS_FIELD_WIDTH_U64:
20b97fea 8662 *(u64 *)p = field_value;
a2ae9df7 8663 return 0;
d37f4267 8664 case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
20b97fea 8665 *(natural_width *)p = field_value;
a2ae9df7 8666 return 0;
20b97fea 8667 default:
a2ae9df7
PB
8668 WARN_ON(1);
8669 return -ENOENT;
20b97fea
AG
8670 }
8671
8672}
8673
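/*
 * Illustrative sketch (not from the kernel): the width dispatch in
 * vmcs12_read_any()/vmcs12_write_any() works because a VMCS field
 * encoding carries its width in bits 14:13 (0 = 16-bit, 1 = 64-bit,
 * 2 = 32-bit, 3 = natural width), which is what vmcs_field_width() is
 * assumed to extract. A stand-alone model with made-up names:
 */
enum demo_field_width {
	DEMO_WIDTH_U16		= 0,
	DEMO_WIDTH_U64		= 1,
	DEMO_WIDTH_U32		= 2,
	DEMO_WIDTH_NATURAL	= 3,
};

static enum demo_field_width demo_vmcs_field_width(unsigned long field)
{
	return (enum demo_field_width)((field >> 13) & 3);
}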
945679e3
VK
8674static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
8675{
8676 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
8677 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
8678
b8bbab92
VK
8679 vmcs12->hdr.revision_id = evmcs->revision_id;
8680
945679e3
VK
8681 /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
8682 vmcs12->tpr_threshold = evmcs->tpr_threshold;
8683 vmcs12->guest_rip = evmcs->guest_rip;
8684
8685 if (unlikely(!(evmcs->hv_clean_fields &
8686 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
8687 vmcs12->guest_rsp = evmcs->guest_rsp;
8688 vmcs12->guest_rflags = evmcs->guest_rflags;
8689 vmcs12->guest_interruptibility_info =
8690 evmcs->guest_interruptibility_info;
8691 }
8692
8693 if (unlikely(!(evmcs->hv_clean_fields &
8694 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
8695 vmcs12->cpu_based_vm_exec_control =
8696 evmcs->cpu_based_vm_exec_control;
8697 }
8698
8699 if (unlikely(!(evmcs->hv_clean_fields &
8700 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
8701 vmcs12->exception_bitmap = evmcs->exception_bitmap;
8702 }
8703
8704 if (unlikely(!(evmcs->hv_clean_fields &
8705 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
8706 vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
8707 }
8708
8709 if (unlikely(!(evmcs->hv_clean_fields &
8710 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
8711 vmcs12->vm_entry_intr_info_field =
8712 evmcs->vm_entry_intr_info_field;
8713 vmcs12->vm_entry_exception_error_code =
8714 evmcs->vm_entry_exception_error_code;
8715 vmcs12->vm_entry_instruction_len =
8716 evmcs->vm_entry_instruction_len;
8717 }
8718
8719 if (unlikely(!(evmcs->hv_clean_fields &
8720 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
8721 vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
8722 vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
8723 vmcs12->host_cr0 = evmcs->host_cr0;
8724 vmcs12->host_cr3 = evmcs->host_cr3;
8725 vmcs12->host_cr4 = evmcs->host_cr4;
8726 vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
8727 vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
8728 vmcs12->host_rip = evmcs->host_rip;
8729 vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
8730 vmcs12->host_es_selector = evmcs->host_es_selector;
8731 vmcs12->host_cs_selector = evmcs->host_cs_selector;
8732 vmcs12->host_ss_selector = evmcs->host_ss_selector;
8733 vmcs12->host_ds_selector = evmcs->host_ds_selector;
8734 vmcs12->host_fs_selector = evmcs->host_fs_selector;
8735 vmcs12->host_gs_selector = evmcs->host_gs_selector;
8736 vmcs12->host_tr_selector = evmcs->host_tr_selector;
8737 }
8738
8739 if (unlikely(!(evmcs->hv_clean_fields &
8740 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
8741 vmcs12->pin_based_vm_exec_control =
8742 evmcs->pin_based_vm_exec_control;
8743 vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
8744 vmcs12->secondary_vm_exec_control =
8745 evmcs->secondary_vm_exec_control;
8746 }
8747
8748 if (unlikely(!(evmcs->hv_clean_fields &
8749 HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
8750 vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
8751 vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
8752 }
8753
8754 if (unlikely(!(evmcs->hv_clean_fields &
8755 HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
8756 vmcs12->msr_bitmap = evmcs->msr_bitmap;
8757 }
8758
8759 if (unlikely(!(evmcs->hv_clean_fields &
8760 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
8761 vmcs12->guest_es_base = evmcs->guest_es_base;
8762 vmcs12->guest_cs_base = evmcs->guest_cs_base;
8763 vmcs12->guest_ss_base = evmcs->guest_ss_base;
8764 vmcs12->guest_ds_base = evmcs->guest_ds_base;
8765 vmcs12->guest_fs_base = evmcs->guest_fs_base;
8766 vmcs12->guest_gs_base = evmcs->guest_gs_base;
8767 vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
8768 vmcs12->guest_tr_base = evmcs->guest_tr_base;
8769 vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
8770 vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
8771 vmcs12->guest_es_limit = evmcs->guest_es_limit;
8772 vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
8773 vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
8774 vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
8775 vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
8776 vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
8777 vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
8778 vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
8779 vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
8780 vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
8781 vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
8782 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
8783 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
8784 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
8785 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
8786 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
8787 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
8788 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
8789 vmcs12->guest_es_selector = evmcs->guest_es_selector;
8790 vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
8791 vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
8792 vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
8793 vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
8794 vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
8795 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
8796 vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
8797 }
8798
8799 if (unlikely(!(evmcs->hv_clean_fields &
8800 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
8801 vmcs12->tsc_offset = evmcs->tsc_offset;
8802 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
8803 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
8804 }
8805
8806 if (unlikely(!(evmcs->hv_clean_fields &
8807 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
8808 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
8809 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
8810 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
8811 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
8812 vmcs12->guest_cr0 = evmcs->guest_cr0;
8813 vmcs12->guest_cr3 = evmcs->guest_cr3;
8814 vmcs12->guest_cr4 = evmcs->guest_cr4;
8815 vmcs12->guest_dr7 = evmcs->guest_dr7;
8816 }
8817
8818 if (unlikely(!(evmcs->hv_clean_fields &
8819 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
8820 vmcs12->host_fs_base = evmcs->host_fs_base;
8821 vmcs12->host_gs_base = evmcs->host_gs_base;
8822 vmcs12->host_tr_base = evmcs->host_tr_base;
8823 vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
8824 vmcs12->host_idtr_base = evmcs->host_idtr_base;
8825 vmcs12->host_rsp = evmcs->host_rsp;
8826 }
8827
8828 if (unlikely(!(evmcs->hv_clean_fields &
8829 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
8830 vmcs12->ept_pointer = evmcs->ept_pointer;
8831 vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
8832 }
8833
8834 if (unlikely(!(evmcs->hv_clean_fields &
8835 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
8836 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
8837 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
8838 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
8839 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
8840 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
8841 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
8842 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
8843 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
8844 vmcs12->guest_pending_dbg_exceptions =
8845 evmcs->guest_pending_dbg_exceptions;
8846 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
8847 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
8848 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
8849 vmcs12->guest_activity_state = evmcs->guest_activity_state;
8850 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
8851 }
8852
8853 /*
8854 * Not used?
8855 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
8856 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
8857 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
8858 * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0;
8859 * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1;
8860 * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2;
8861 * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3;
8862 * vmcs12->page_fault_error_code_mask =
8863 * evmcs->page_fault_error_code_mask;
8864 * vmcs12->page_fault_error_code_match =
8865 * evmcs->page_fault_error_code_match;
8866 * vmcs12->cr3_target_count = evmcs->cr3_target_count;
8867 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
8868 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
8869 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
8870 */
8871
8872 /*
8873 * Read only fields:
8874 * vmcs12->guest_physical_address = evmcs->guest_physical_address;
8875 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
8876 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
8877 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
8878 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
8879 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
8880 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
8881 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
8882 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
8883 * vmcs12->exit_qualification = evmcs->exit_qualification;
8884 * vmcs12->guest_linear_address = evmcs->guest_linear_address;
8885 *
8886 * Not present in struct vmcs12:
8887 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
8888 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
8889 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
8890 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
8891 */
8892
8893 return 0;
8894}
8895
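/*
 * Illustrative sketch (not from the kernel): the copy above is gated on
 * Hyper-V "clean fields" bits - a field group is copied only when its
 * clean bit is NOT set, i.e. the L1 hypervisor has marked the group
 * dirty since the last sync. A toy model with made-up group bits:
 */
#include <stdint.h>

#define DEMO_CLEAN_GROUP_A	(1u << 0)
#define DEMO_CLEAN_GROUP_B	(1u << 1)

struct demo_dst { uint64_t a, b; };
struct demo_src { uint64_t a, b; uint32_t clean_fields; };

static void demo_sync_dirty_groups(struct demo_dst *dst, const struct demo_src *src)
{
	if (!(src->clean_fields & DEMO_CLEAN_GROUP_A))
		dst->a = src->a;	/* group A was touched by L1 */
	if (!(src->clean_fields & DEMO_CLEAN_GROUP_B))
		dst->b = src->b;	/* group B was touched by L1 */
}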
8896static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
8897{
8898 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
8899 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
8900
8901 /*
8902 * Should not be changed by KVM:
8903 *
8904 * evmcs->host_es_selector = vmcs12->host_es_selector;
8905 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
8906 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
8907 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
8908 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
8909 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
8910 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
8911 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
8912 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
8913 * evmcs->host_cr0 = vmcs12->host_cr0;
8914 * evmcs->host_cr3 = vmcs12->host_cr3;
8915 * evmcs->host_cr4 = vmcs12->host_cr4;
8916 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
8917 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
8918 * evmcs->host_rip = vmcs12->host_rip;
8919 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
8920 * evmcs->host_fs_base = vmcs12->host_fs_base;
8921 * evmcs->host_gs_base = vmcs12->host_gs_base;
8922 * evmcs->host_tr_base = vmcs12->host_tr_base;
8923 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
8924 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
8925 * evmcs->host_rsp = vmcs12->host_rsp;
8926 * sync_vmcs12() doesn't read these:
8927 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
8928 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
8929 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
8930 * evmcs->ept_pointer = vmcs12->ept_pointer;
8931 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
8932 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
8933 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
8934 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
8935 * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0;
8936 * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1;
8937 * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2;
8938 * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3;
8939 * evmcs->tpr_threshold = vmcs12->tpr_threshold;
8940 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
8941 * evmcs->exception_bitmap = vmcs12->exception_bitmap;
8942 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
8943 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
8944 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
8945 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
8946 * evmcs->page_fault_error_code_mask =
8947 * vmcs12->page_fault_error_code_mask;
8948 * evmcs->page_fault_error_code_match =
8949 * vmcs12->page_fault_error_code_match;
8950 * evmcs->cr3_target_count = vmcs12->cr3_target_count;
8951 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
8952 * evmcs->tsc_offset = vmcs12->tsc_offset;
8953 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
8954 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
8955 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
8956 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
8957 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
8958 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
8959 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
8960 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
8961 *
8962 * Not present in struct vmcs12:
8963 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
8964 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
8965 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
8966 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
8967 */
8968
8969 evmcs->guest_es_selector = vmcs12->guest_es_selector;
8970 evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
8971 evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
8972 evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
8973 evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
8974 evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
8975 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
8976 evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
8977
8978 evmcs->guest_es_limit = vmcs12->guest_es_limit;
8979 evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
8980 evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
8981 evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
8982 evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
8983 evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
8984 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
8985 evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
8986 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
8987 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
8988
8989 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
8990 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
8991 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
8992 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
8993 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
8994 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
8995 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
8996 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
8997
8998 evmcs->guest_es_base = vmcs12->guest_es_base;
8999 evmcs->guest_cs_base = vmcs12->guest_cs_base;
9000 evmcs->guest_ss_base = vmcs12->guest_ss_base;
9001 evmcs->guest_ds_base = vmcs12->guest_ds_base;
9002 evmcs->guest_fs_base = vmcs12->guest_fs_base;
9003 evmcs->guest_gs_base = vmcs12->guest_gs_base;
9004 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
9005 evmcs->guest_tr_base = vmcs12->guest_tr_base;
9006 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
9007 evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
9008
9009 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
9010 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
9011
9012 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
9013 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
9014 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
9015 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
9016
9017 evmcs->guest_pending_dbg_exceptions =
9018 vmcs12->guest_pending_dbg_exceptions;
9019 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
9020 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
9021
9022 evmcs->guest_activity_state = vmcs12->guest_activity_state;
9023 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
9024
9025 evmcs->guest_cr0 = vmcs12->guest_cr0;
9026 evmcs->guest_cr3 = vmcs12->guest_cr3;
9027 evmcs->guest_cr4 = vmcs12->guest_cr4;
9028 evmcs->guest_dr7 = vmcs12->guest_dr7;
9029
9030 evmcs->guest_physical_address = vmcs12->guest_physical_address;
9031
9032 evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
9033 evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
9034 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
9035 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
9036 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
9037 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
9038 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
9039 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
9040
9041 evmcs->exit_qualification = vmcs12->exit_qualification;
9042
9043 evmcs->guest_linear_address = vmcs12->guest_linear_address;
9044 evmcs->guest_rsp = vmcs12->guest_rsp;
9045 evmcs->guest_rflags = vmcs12->guest_rflags;
9046
9047 evmcs->guest_interruptibility_info =
9048 vmcs12->guest_interruptibility_info;
9049 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
9050 evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
9051 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
9052 evmcs->vm_entry_exception_error_code =
9053 vmcs12->vm_entry_exception_error_code;
9054 evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
9055
9056 evmcs->guest_rip = vmcs12->guest_rip;
9057
9058 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
9059
9060 return 0;
9061}
9062
f4160e45
JM
9063/*
9064 * Copy the writable VMCS shadow fields back to the VMCS12, in case
9065 * they have been modified by the L1 guest. Note that the "read-only"
9066 * VM-exit information fields are actually writable if the vCPU is
9067 * configured to support "VMWRITE to any supported field in the VMCS."
9068 */
16f5b903
AG
9069static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
9070{
f4160e45
JM
9071 const u16 *fields[] = {
9072 shadow_read_write_fields,
9073 shadow_read_only_fields
9074 };
9075 const int max_fields[] = {
9076 max_shadow_read_write_fields,
9077 max_shadow_read_only_fields
9078 };
9079 int i, q;
16f5b903
AG
9080 unsigned long field;
9081 u64 field_value;
355f4fb1 9082 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
16f5b903 9083
282da870
JK
9084 preempt_disable();
9085
16f5b903
AG
9086 vmcs_load(shadow_vmcs);
9087
f4160e45
JM
9088 for (q = 0; q < ARRAY_SIZE(fields); q++) {
9089 for (i = 0; i < max_fields[q]; i++) {
9090 field = fields[q][i];
9091 field_value = __vmcs_readl(field);
e2536742 9092 vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value);
f4160e45
JM
9093 }
9094 /*
9095 * Skip the VM-exit information fields if they are read-only.
9096 */
9097 if (!nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
9098 break;
16f5b903
AG
9099 }
9100
9101 vmcs_clear(shadow_vmcs);
9102 vmcs_load(vmx->loaded_vmcs->vmcs);
282da870
JK
9103
9104 preempt_enable();
16f5b903
AG
9105}
9106
c3114420
AG
9107static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
9108{
44900ba6 9109 const u16 *fields[] = {
c2bae893
MK
9110 shadow_read_write_fields,
9111 shadow_read_only_fields
c3114420 9112 };
c2bae893 9113 const int max_fields[] = {
c3114420
AG
9114 max_shadow_read_write_fields,
9115 max_shadow_read_only_fields
9116 };
9117 int i, q;
9118 unsigned long field;
9119 u64 field_value = 0;
355f4fb1 9120 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
c3114420
AG
9121
9122 vmcs_load(shadow_vmcs);
9123
c2bae893 9124 for (q = 0; q < ARRAY_SIZE(fields); q++) {
c3114420
AG
9125 for (i = 0; i < max_fields[q]; i++) {
9126 field = fields[q][i];
e2536742 9127 vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value);
44900ba6 9128 __vmcs_writel(field, field_value);
c3114420
AG
9129 }
9130 }
9131
9132 vmcs_clear(shadow_vmcs);
9133 vmcs_load(vmx->loaded_vmcs->vmcs);
9134}
9135
49f705c5
NHE
9136static int handle_vmread(struct kvm_vcpu *vcpu)
9137{
9138 unsigned long field;
9139 u64 field_value;
9140 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
9141 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
9142 gva_t gva = 0;
6d894f49 9143 struct vmcs12 *vmcs12;
49f705c5 9144
eb277562 9145 if (!nested_vmx_check_permission(vcpu))
49f705c5
NHE
9146 return 1;
9147
09abb5e3
SC
9148 if (to_vmx(vcpu)->nested.current_vmptr == -1ull)
9149 return nested_vmx_failInvalid(vcpu);
49f705c5 9150
6d894f49
LA
9151 if (!is_guest_mode(vcpu))
9152 vmcs12 = get_vmcs12(vcpu);
9153 else {
9154 /*
9155 * When vmcs->vmcs_link_pointer is -1ull, any VMREAD
9156	 * to a shadowed field sets the ALU flags for VMfailInvalid.
9157 */
09abb5e3
SC
9158 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
9159 return nested_vmx_failInvalid(vcpu);
6d894f49
LA
9160 vmcs12 = get_shadow_vmcs12(vcpu);
9161 }
9162
49f705c5 9163 /* Decode instruction info and find the field to read */
27e6fb5d 9164 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
49f705c5 9165 /* Read the field, zero-extended to a u64 field_value */
09abb5e3
SC
9166 if (vmcs12_read_any(vmcs12, field, &field_value) < 0)
9167 return nested_vmx_failValid(vcpu,
9168 VMXERR_UNSUPPORTED_VMCS_COMPONENT);
9169
49f705c5
NHE
9170 /*
9171 * Now copy part of this value to register or memory, as requested.
9172 * Note that the number of bits actually copied is 32 or 64 depending
9173 * on the guest's mode (32 or 64 bit), not on the given field's length.
9174 */
9175 if (vmx_instruction_info & (1u << 10)) {
27e6fb5d 9176 kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
49f705c5
NHE
9177 field_value);
9178 } else {
9179 if (get_vmx_mem_address(vcpu, exit_qualification,
f9eb4af6 9180 vmx_instruction_info, true, &gva))
49f705c5 9181 return 1;
727ba748 9182 /* _system ok, nested_vmx_check_permission has verified cpl=0 */
ce14e868
PB
9183 kvm_write_guest_virt_system(vcpu, gva, &field_value,
9184 (is_long_mode(vcpu) ? 8 : 4), NULL);
49f705c5
NHE
9185 }
9186
09abb5e3 9187 return nested_vmx_succeed(vcpu);
49f705c5
NHE
9188}
9189
9190
9191static int handle_vmwrite(struct kvm_vcpu *vcpu)
9192{
9193 unsigned long field;
9194 gva_t gva;
74a497fa 9195 struct vcpu_vmx *vmx = to_vmx(vcpu);
49f705c5
NHE
9196 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
9197 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
74a497fa 9198
49f705c5
NHE
9199 /* The value to write might be 32 or 64 bits, depending on L1's long
9200 * mode, and eventually we need to write that into a field of several
9201 * possible lengths. The code below first zero-extends the value to 64
6a6256f9 9202 * bits (field_value), and then copies only the appropriate number of
49f705c5
NHE
9203 * bits into the vmcs12 field.
9204 */
9205 u64 field_value = 0;
9206 struct x86_exception e;
6d894f49 9207 struct vmcs12 *vmcs12;
49f705c5 9208
eb277562 9209 if (!nested_vmx_check_permission(vcpu))
49f705c5
NHE
9210 return 1;
9211
09abb5e3
SC
9212 if (vmx->nested.current_vmptr == -1ull)
9213 return nested_vmx_failInvalid(vcpu);
eb277562 9214
49f705c5 9215 if (vmx_instruction_info & (1u << 10))
27e6fb5d 9216 field_value = kvm_register_readl(vcpu,
49f705c5
NHE
9217 (((vmx_instruction_info) >> 3) & 0xf));
9218 else {
9219 if (get_vmx_mem_address(vcpu, exit_qualification,
f9eb4af6 9220 vmx_instruction_info, false, &gva))
49f705c5 9221 return 1;
ce14e868
PB
9222 if (kvm_read_guest_virt(vcpu, gva, &field_value,
9223 (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
49f705c5
NHE
9224 kvm_inject_page_fault(vcpu, &e);
9225 return 1;
9226 }
9227 }
9228
9229
27e6fb5d 9230 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
f4160e45
JM
9231 /*
9232 * If the vCPU supports "VMWRITE to any supported field in the
9233 * VMCS," then the "read-only" fields are actually read/write.
9234 */
9235 if (vmcs_field_readonly(field) &&
09abb5e3
SC
9236 !nested_cpu_has_vmwrite_any_field(vcpu))
9237 return nested_vmx_failValid(vcpu,
49f705c5 9238 VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
49f705c5 9239
6d894f49
LA
9240 if (!is_guest_mode(vcpu))
9241 vmcs12 = get_vmcs12(vcpu);
9242 else {
9243 /*
9244 * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
9245	 * to a shadowed field sets the ALU flags for VMfailInvalid.
9246 */
09abb5e3
SC
9247 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
9248 return nested_vmx_failInvalid(vcpu);
6d894f49 9249 vmcs12 = get_shadow_vmcs12(vcpu);
6d894f49
LA
9250 }
9251
09abb5e3
SC
9252 if (vmcs12_write_any(vmcs12, field, field_value) < 0)
9253 return nested_vmx_failValid(vcpu,
9254 VMXERR_UNSUPPORTED_VMCS_COMPONENT);
49f705c5 9255
6d894f49
LA
9256 /*
9257 * Do not track vmcs12 dirty-state if in guest-mode
9258 * as we actually dirty shadow vmcs12 instead of vmcs12.
9259 */
9260 if (!is_guest_mode(vcpu)) {
9261 switch (field) {
74a497fa
PB
9262#define SHADOW_FIELD_RW(x) case x:
9263#include "vmx_shadow_fields.h"
6d894f49
LA
9264 /*
9265 * The fields that can be updated by L1 without a vmexit are
9266	 * always updated in the vmcs02; the others go down the slow
9267 * path of prepare_vmcs02.
9268 */
9269 break;
9270 default:
9271 vmx->nested.dirty_vmcs12 = true;
9272 break;
9273 }
74a497fa
PB
9274 }
9275
09abb5e3 9276 return nested_vmx_succeed(vcpu);
49f705c5
NHE
9277}
9278
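/*
 * Illustrative sketch (not from the kernel): the dirty-tracking switch
 * above relies on an "X-macro" - vmx_shadow_fields.h expands to one
 * "case x:" label per shadowed read/write field, so the field list
 * lives in a single header shared with the shadow-copy routines. A toy,
 * self-contained version of the same trick (the names and encodings
 * below are made up):
 */
#define DEMO_SHADOW_FIELDS(X)	\
	X(0x10)			\
	X(0x12)

static int demo_field_is_shadowed(unsigned long field)
{
	switch (field) {
#define DEMO_AS_CASE(f) case f:
	DEMO_SHADOW_FIELDS(DEMO_AS_CASE)
#undef DEMO_AS_CASE
		return 1;	/* shadowed: hardware keeps it in sync */
	default:
		return 0;	/* not shadowed: mark vmcs12 dirty */
	}
}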
a8bc284e
JM
9279static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
9280{
9281 vmx->nested.current_vmptr = vmptr;
9282 if (enable_shadow_vmcs) {
9283 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
9284 SECONDARY_EXEC_SHADOW_VMCS);
9285 vmcs_write64(VMCS_LINK_POINTER,
9286 __pa(vmx->vmcs01.shadow_vmcs));
945679e3 9287 vmx->nested.need_vmcs12_sync = true;
a8bc284e 9288 }
74a497fa 9289 vmx->nested.dirty_vmcs12 = true;
a8bc284e
JM
9290}
9291
63846663
NHE
9292/* Emulate the VMPTRLD instruction */
9293static int handle_vmptrld(struct kvm_vcpu *vcpu)
9294{
9295 struct vcpu_vmx *vmx = to_vmx(vcpu);
63846663 9296 gpa_t vmptr;
63846663
NHE
9297
9298 if (!nested_vmx_check_permission(vcpu))
9299 return 1;
9300
cbf71279 9301 if (nested_vmx_get_vmptr(vcpu, &vmptr))
63846663 9302 return 1;
63846663 9303
09abb5e3
SC
9304 if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
9305 return nested_vmx_failValid(vcpu,
9306 VMXERR_VMPTRLD_INVALID_ADDRESS);
cbf71279 9307
09abb5e3
SC
9308 if (vmptr == vmx->nested.vmxon_ptr)
9309 return nested_vmx_failValid(vcpu,
9310 VMXERR_VMPTRLD_VMXON_POINTER);
cbf71279 9311
b8bbab92
VK
9312 /* Forbid normal VMPTRLD if Enlightened version was used */
9313 if (vmx->nested.hv_evmcs)
9314 return 1;
cbf71279 9315
63846663
NHE
9316 if (vmx->nested.current_vmptr != vmptr) {
9317 struct vmcs12 *new_vmcs12;
9318 struct page *page;
5e2f30b7 9319 page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
09abb5e3
SC
9320 if (is_error_page(page))
9321 return nested_vmx_failInvalid(vcpu);
9322
63846663 9323 new_vmcs12 = kmap(page);
392b2f25 9324 if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
fa97d7db
LA
9325 (new_vmcs12->hdr.shadow_vmcs &&
9326 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
63846663 9327 kunmap(page);
53a70daf 9328 kvm_release_page_clean(page);
09abb5e3 9329 return nested_vmx_failValid(vcpu,
63846663 9330 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
63846663 9331 }
63846663 9332
14c07ad8
VK
9333 nested_release_vmcs12(vcpu);
9334
4f2777bc
DM
9335 /*
9336 * Load VMCS12 from guest memory since it is not already
9337 * cached.
9338 */
9f744c59
PB
9339 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
9340 kunmap(page);
53a70daf 9341 kvm_release_page_clean(page);
9f744c59 9342
a8bc284e 9343 set_current_vmptr(vmx, vmptr);
63846663
NHE
9344 }
9345
09abb5e3 9346 return nested_vmx_succeed(vcpu);
63846663
NHE
9347}
9348
b8bbab92
VK
9349/*
9350	 * This is the equivalent of the nested hypervisor executing the vmptrld
9351 * instruction.
9352 */
8cab6507
VK
9353static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
9354 bool from_launch)
b8bbab92
VK
9355{
9356 struct vcpu_vmx *vmx = to_vmx(vcpu);
9357 struct hv_vp_assist_page assist_page;
9358
9359 if (likely(!vmx->nested.enlightened_vmcs_enabled))
9360 return 1;
9361
9362 if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page)))
9363 return 1;
9364
9365 if (unlikely(!assist_page.enlighten_vmentry))
9366 return 1;
9367
9368 if (unlikely(assist_page.current_nested_vmcs !=
9369 vmx->nested.hv_evmcs_vmptr)) {
9370
9371 if (!vmx->nested.hv_evmcs)
9372 vmx->nested.current_vmptr = -1ull;
9373
9374 nested_release_evmcs(vcpu);
9375
9376 vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page(
9377 vcpu, assist_page.current_nested_vmcs);
9378
9379 if (unlikely(is_error_page(vmx->nested.hv_evmcs_page)))
9380 return 0;
9381
9382 vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page);
9383
9384 if (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION) {
9385 nested_release_evmcs(vcpu);
9386 return 0;
9387 }
9388
9389 vmx->nested.dirty_vmcs12 = true;
9390 /*
9391	 * As we keep L2 state for only one guest, the 'hv_clean_fields' mask
9392 * can't be used when we switch between them. Reset it here for
9393 * simplicity.
9394 */
9395 vmx->nested.hv_evmcs->hv_clean_fields &=
9396 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
9397 vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs;
9398
9399 /*
9400 * Unlike normal vmcs12, enlightened vmcs12 is not fully
9401	 * reloaded from the guest's memory (read-only fields, fields not
9402 * present in struct hv_enlightened_vmcs, ...). Make sure there
9403 * are no leftovers.
9404 */
8cab6507
VK
9405 if (from_launch)
9406 memset(vmx->nested.cached_vmcs12, 0,
9407 sizeof(*vmx->nested.cached_vmcs12));
b8bbab92
VK
9408
9409 }
9410 return 1;
63846663
NHE
9411}
9412
6a4d7550
NHE
9413/* Emulate the VMPTRST instruction */
9414static int handle_vmptrst(struct kvm_vcpu *vcpu)
9415{
0a06d425
SC
9416 unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION);
9417 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
9418 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
6a4d7550 9419 struct x86_exception e;
0a06d425 9420 gva_t gva;
6a4d7550
NHE
9421
9422 if (!nested_vmx_check_permission(vcpu))
9423 return 1;
9424
b8bbab92
VK
9425 if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
9426 return 1;
9427
0a06d425 9428 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva))
6a4d7550 9429 return 1;
727ba748 9430 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
0a06d425
SC
9431 if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
9432 sizeof(gpa_t), &e)) {
6a4d7550
NHE
9433 kvm_inject_page_fault(vcpu, &e);
9434 return 1;
9435 }
09abb5e3 9436 return nested_vmx_succeed(vcpu);
6a4d7550
NHE
9437}
9438
bfd0a56b
NHE
9439/* Emulate the INVEPT instruction */
9440static int handle_invept(struct kvm_vcpu *vcpu)
9441{
b9c237bb 9442 struct vcpu_vmx *vmx = to_vmx(vcpu);
bfd0a56b
NHE
9443 u32 vmx_instruction_info, types;
9444 unsigned long type;
9445 gva_t gva;
9446 struct x86_exception e;
9447 struct {
9448 u64 eptp, gpa;
9449 } operand;
bfd0a56b 9450
6677f3da 9451 if (!(vmx->nested.msrs.secondary_ctls_high &
b9c237bb 9452 SECONDARY_EXEC_ENABLE_EPT) ||
6677f3da 9453 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
bfd0a56b
NHE
9454 kvm_queue_exception(vcpu, UD_VECTOR);
9455 return 1;
9456 }
9457
9458 if (!nested_vmx_check_permission(vcpu))
9459 return 1;
9460
bfd0a56b 9461 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
27e6fb5d 9462 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
bfd0a56b 9463
6677f3da 9464 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
bfd0a56b 9465
09abb5e3
SC
9466 if (type >= 32 || !(types & (1 << type)))
9467 return nested_vmx_failValid(vcpu,
bfd0a56b 9468 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
bfd0a56b
NHE
9469
9470 /* According to the Intel VMX instruction reference, the memory
9471 * operand is read even if it isn't needed (e.g., for type==global)
9472 */
9473 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
f9eb4af6 9474 vmx_instruction_info, false, &gva))
bfd0a56b 9475 return 1;
ce14e868 9476 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
bfd0a56b
NHE
9477 kvm_inject_page_fault(vcpu, &e);
9478 return 1;
9479 }
9480
9481 switch (type) {
bfd0a56b 9482 case VMX_EPT_EXTENT_GLOBAL:
45e11817
BD
9483 /*
9484 * TODO: track mappings and invalidate
9485 * single context requests appropriately
9486 */
9487 case VMX_EPT_EXTENT_CONTEXT:
bfd0a56b 9488 kvm_mmu_sync_roots(vcpu);
77c3913b 9489 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
bfd0a56b
NHE
9490 break;
9491 default:
9492 BUG_ON(1);
9493 break;
9494 }
9495
09abb5e3 9496 return nested_vmx_succeed(vcpu);
bfd0a56b
NHE
9497}
9498
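/*
 * Illustrative sketch (not from the kernel): the type check in
 * handle_invept() above turns the EPT capability MSR into a bitmask of
 * supported INVEPT types. Assuming VMX_EPT_EXTENT_SHIFT is 24, the
 * single-context and all-context capability bits (MSR bits 25 and 26)
 * land on bit positions 1 and 2 - exactly the INVEPT type encodings -
 * which is why the code masks with 6. A stand-alone model:
 */
#include <stdbool.h>
#include <stdint.h>

static bool demo_invept_type_supported(uint64_t ept_vpid_cap, unsigned long type)
{
	uint32_t types = (uint32_t)(ept_vpid_cap >> 24) & 6;

	return type < 32 && (types & (1u << type));
}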
3d5bdae8
LA
9499static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
9500{
9501 struct vcpu_vmx *vmx = to_vmx(vcpu);
9502
9503 return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
bfd0a56b
NHE
9504}
9505
a642fc30
PM
9506static int handle_invvpid(struct kvm_vcpu *vcpu)
9507{
99b83ac8
WL
9508 struct vcpu_vmx *vmx = to_vmx(vcpu);
9509 u32 vmx_instruction_info;
9510 unsigned long type, types;
9511 gva_t gva;
9512 struct x86_exception e;
40352605
JM
9513 struct {
9514 u64 vpid;
9515 u64 gla;
9516 } operand;
3d5bdae8 9517 u16 vpid02;
99b83ac8 9518
6677f3da 9519 if (!(vmx->nested.msrs.secondary_ctls_high &
99b83ac8 9520 SECONDARY_EXEC_ENABLE_VPID) ||
6677f3da 9521 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
99b83ac8
WL
9522 kvm_queue_exception(vcpu, UD_VECTOR);
9523 return 1;
9524 }
9525
9526 if (!nested_vmx_check_permission(vcpu))
9527 return 1;
9528
9529 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
9530 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
9531
6677f3da 9532 types = (vmx->nested.msrs.vpid_caps &
bcdde302 9533 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
99b83ac8 9534
09abb5e3
SC
9535 if (type >= 32 || !(types & (1 << type)))
9536 return nested_vmx_failValid(vcpu,
99b83ac8 9537 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
99b83ac8
WL
9538
9539	 /* According to the Intel VMX instruction reference, the memory
9540 * operand is read even if it isn't needed (e.g., for type==global)
9541 */
9542 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
9543 vmx_instruction_info, false, &gva))
9544 return 1;
ce14e868 9545 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
99b83ac8
WL
9546 kvm_inject_page_fault(vcpu, &e);
9547 return 1;
9548 }
09abb5e3
SC
9549 if (operand.vpid >> 16)
9550 return nested_vmx_failValid(vcpu,
40352605 9551 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
99b83ac8 9552
3d5bdae8 9553 vpid02 = nested_get_vpid02(vcpu);
99b83ac8 9554 switch (type) {
bcdde302 9555 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
cd9a491f 9556 if (!operand.vpid ||
09abb5e3
SC
9557 is_noncanonical_address(operand.gla, vcpu))
9558 return nested_vmx_failValid(vcpu,
40352605 9559 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
3d5bdae8 9560 if (cpu_has_vmx_invvpid_individual_addr()) {
cd9a491f 9561 __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
3d5bdae8 9562 vpid02, operand.gla);
cd9a491f 9563 } else
327c0721 9564 __vmx_flush_tlb(vcpu, vpid02, false);
cd9a491f 9565 break;
ef697a71 9566 case VMX_VPID_EXTENT_SINGLE_CONTEXT:
bcdde302 9567 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
09abb5e3
SC
9568 if (!operand.vpid)
9569 return nested_vmx_failValid(vcpu,
bcdde302 9570 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
327c0721 9571 __vmx_flush_tlb(vcpu, vpid02, false);
bcdde302 9572 break;
99b83ac8 9573 case VMX_VPID_EXTENT_ALL_CONTEXT:
327c0721 9574 __vmx_flush_tlb(vcpu, vpid02, false);
99b83ac8
WL
9575 break;
9576 default:
bcdde302 9577 WARN_ON_ONCE(1);
6affcbed 9578 return kvm_skip_emulated_instruction(vcpu);
99b83ac8
WL
9579 }
9580
09abb5e3 9581 return nested_vmx_succeed(vcpu);
a642fc30
PM
9582}
9583
eb4b248e
JS
9584static int handle_invpcid(struct kvm_vcpu *vcpu)
9585{
9586 u32 vmx_instruction_info;
9587 unsigned long type;
9588 bool pcid_enabled;
9589 gva_t gva;
9590 struct x86_exception e;
b94742c9
JS
9591 unsigned i;
9592 unsigned long roots_to_free = 0;
eb4b248e
JS
9593 struct {
9594 u64 pcid;
9595 u64 gla;
9596 } operand;
9597
9598 if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
9599 kvm_queue_exception(vcpu, UD_VECTOR);
9600 return 1;
9601 }
9602
9603 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
9604 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
9605
9606 if (type > 3) {
9607 kvm_inject_gp(vcpu, 0);
9608 return 1;
9609 }
9610
9611 /* According to the Intel instruction reference, the memory operand
9612 * is read even if it isn't needed (e.g., for type==all)
9613 */
9614 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
9615 vmx_instruction_info, false, &gva))
9616 return 1;
9617
9618 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
9619 kvm_inject_page_fault(vcpu, &e);
9620 return 1;
9621 }
9622
9623 if (operand.pcid >> 12 != 0) {
9624 kvm_inject_gp(vcpu, 0);
9625 return 1;
9626 }
9627
9628 pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
9629
9630 switch (type) {
9631 case INVPCID_TYPE_INDIV_ADDR:
9632 if ((!pcid_enabled && (operand.pcid != 0)) ||
9633 is_noncanonical_address(operand.gla, vcpu)) {
9634 kvm_inject_gp(vcpu, 0);
9635 return 1;
9636 }
9637 kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
9638 return kvm_skip_emulated_instruction(vcpu);
9639
9640 case INVPCID_TYPE_SINGLE_CTXT:
9641 if (!pcid_enabled && (operand.pcid != 0)) {
9642 kvm_inject_gp(vcpu, 0);
9643 return 1;
9644 }
9645
9646 if (kvm_get_active_pcid(vcpu) == operand.pcid) {
9647 kvm_mmu_sync_roots(vcpu);
9648 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
9649 }
9650
b94742c9 9651 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
44dd3ffa 9652 if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3)
b94742c9
JS
9653 == operand.pcid)
9654 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
ade61e28 9655
6a82cd1c 9656 kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
eb4b248e 9657 /*
b94742c9 9658 * If neither the current cr3 nor any of the prev_roots use the
ade61e28
JS
9659 * given PCID, then nothing needs to be done here because a
9660 * resync will happen anyway before switching to any other CR3.
eb4b248e
JS
9661 */
9662
9663 return kvm_skip_emulated_instruction(vcpu);
9664
9665 case INVPCID_TYPE_ALL_NON_GLOBAL:
9666 /*
9667 * Currently, KVM doesn't mark global entries in the shadow
9668 * page tables, so a non-global flush just degenerates to a
9669 * global flush. If needed, we could optimize this later by
9670 * keeping track of global entries in shadow page tables.
9671 */
9672
9673 /* fall-through */
9674 case INVPCID_TYPE_ALL_INCL_GLOBAL:
9675 kvm_mmu_unload(vcpu);
9676 return kvm_skip_emulated_instruction(vcpu);
9677
9678 default:
9679 BUG(); /* We have already checked above that type <= 3 */
9680 }
9681}
9682
843e4330
KH
9683static int handle_pml_full(struct kvm_vcpu *vcpu)
9684{
9685 unsigned long exit_qualification;
9686
9687 trace_kvm_pml_full(vcpu->vcpu_id);
9688
9689 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
9690
9691 /*
 9692 * The PML-buffer-full exit happened while executing iret from an NMI,
 9693 * so the "blocked by NMI" bit has to be set before the next VM entry.
9694 */
9695 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
d02fcf50 9696 enable_vnmi &&
843e4330
KH
9697 (exit_qualification & INTR_INFO_UNBLOCK_NMI))
9698 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
9699 GUEST_INTR_STATE_NMI);
9700
9701 /*
9702 * PML buffer already flushed at beginning of VMEXIT. Nothing to do
 9703 * here, and there's no userspace involvement needed for PML.
9704 */
9705 return 1;
9706}
9707
64672c95
YJ
9708static int handle_preemption_timer(struct kvm_vcpu *vcpu)
9709{
d264ee0c
SC
9710 if (!to_vmx(vcpu)->req_immediate_exit)
9711 kvm_lapic_expired_hv_timer(vcpu);
64672c95
YJ
9712 return 1;
9713}
9714
41ab9372
BD
9715static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
9716{
9717 struct vcpu_vmx *vmx = to_vmx(vcpu);
41ab9372
BD
9718 int maxphyaddr = cpuid_maxphyaddr(vcpu);
9719
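	/*
	 * EPTP encoding: bits 2:0 memory type, bits 5:3 page-walk length
	 * minus one, bit 6 enables accessed/dirty flags, bits 11:7 are
	 * reserved, and the remaining bits hold the PML4 physical address.
	 */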
9720 /* Check for memory type validity */
bb97a016
DH
9721 switch (address & VMX_EPTP_MT_MASK) {
9722 case VMX_EPTP_MT_UC:
6677f3da 9723 if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))
41ab9372
BD
9724 return false;
9725 break;
bb97a016 9726 case VMX_EPTP_MT_WB:
6677f3da 9727 if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))
41ab9372
BD
9728 return false;
9729 break;
9730 default:
9731 return false;
9732 }
9733
bb97a016
DH
 9734 /* Only a page-walk length of 4 is valid */
9735 if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)
41ab9372
BD
9736 return false;
9737
9738 /* Reserved bits should not be set */
9739 if (address >> maxphyaddr || ((address >> 7) & 0x1f))
9740 return false;
9741
9742 /* AD, if set, should be supported */
bb97a016 9743 if (address & VMX_EPTP_AD_ENABLE_BIT) {
6677f3da 9744 if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))
41ab9372
BD
9745 return false;
9746 }
9747
9748 return true;
9749}
9750
9751static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
9752 struct vmcs12 *vmcs12)
9753{
9754 u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
9755 u64 address;
9756 bool accessed_dirty;
9757 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
9758
9759 if (!nested_cpu_has_eptp_switching(vmcs12) ||
9760 !nested_cpu_has_ept(vmcs12))
9761 return 1;
9762
9763 if (index >= VMFUNC_EPTP_ENTRIES)
9764 return 1;
9765
9766
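	/*
	 * The EPTP list is a 4 KiB guest page holding up to 512 64-bit
	 * EPT pointers; ECX selects the slot, so fetch the candidate
	 * EPTP from entry 'index' of that page.
	 */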
9767 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
9768 &address, index * 8, 8))
9769 return 1;
9770
bb97a016 9771 accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);
41ab9372
BD
9772
9773 /*
9774 * If the (L2) guest does a vmfunc to the currently
9775 * active ept pointer, we don't have to do anything else
9776 */
9777 if (vmcs12->ept_pointer != address) {
9778 if (!valid_ept_address(vcpu, address))
9779 return 1;
9780
9781 kvm_mmu_unload(vcpu);
9782 mmu->ept_ad = accessed_dirty;
36d9594d 9783 mmu->mmu_role.base.ad_disabled = !accessed_dirty;
41ab9372
BD
9784 vmcs12->ept_pointer = address;
9785 /*
 9786 * TODO: Decide what the correct approach is if the
 9787 * mmu reload fails. Currently, we just let the next
 9788 * reload potentially fail.
9789 */
9790 kvm_mmu_reload(vcpu);
9791 }
9792
9793 return 0;
9794}
9795
2a499e49
BD
9796static int handle_vmfunc(struct kvm_vcpu *vcpu)
9797{
27c42a1b
BD
9798 struct vcpu_vmx *vmx = to_vmx(vcpu);
9799 struct vmcs12 *vmcs12;
9800 u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
9801
9802 /*
9803 * VMFUNC is only supported for nested guests, but we always enable the
9804 * secondary control for simplicity; for non-nested mode, fake that we
9805 * didn't by injecting #UD.
9806 */
9807 if (!is_guest_mode(vcpu)) {
9808 kvm_queue_exception(vcpu, UD_VECTOR);
9809 return 1;
9810 }
9811
9812 vmcs12 = get_vmcs12(vcpu);
9813 if ((vmcs12->vm_function_control & (1 << function)) == 0)
9814 goto fail;
41ab9372
BD
9815
9816 switch (function) {
9817 case 0:
9818 if (nested_vmx_eptp_switching(vcpu, vmcs12))
9819 goto fail;
9820 break;
9821 default:
9822 goto fail;
9823 }
9824 return kvm_skip_emulated_instruction(vcpu);
27c42a1b
BD
9825
9826fail:
9827 nested_vmx_vmexit(vcpu, vmx->exit_reason,
9828 vmcs_read32(VM_EXIT_INTR_INFO),
9829 vmcs_readl(EXIT_QUALIFICATION));
2a499e49
BD
9830 return 1;
9831}
9832
0b665d30
SC
9833static int handle_encls(struct kvm_vcpu *vcpu)
9834{
9835 /*
9836 * SGX virtualization is not yet supported. There is no software
9837 * enable bit for SGX, so we have to trap ENCLS and inject a #UD
9838 * to prevent the guest from executing ENCLS.
9839 */
9840 kvm_queue_exception(vcpu, UD_VECTOR);
9841 return 1;
9842}
9843
6aa8b732
AK
9844/*
9845 * The exit handlers return 1 if the exit was handled fully and guest execution
9846 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
9847 * to be done to userspace and return 0.
9848 */
772e0318 9849static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
6aa8b732
AK
9850 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
9851 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
988ad74f 9852 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
f08864b4 9853 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
6aa8b732 9854 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
6aa8b732
AK
9855 [EXIT_REASON_CR_ACCESS] = handle_cr,
9856 [EXIT_REASON_DR_ACCESS] = handle_dr,
9857 [EXIT_REASON_CPUID] = handle_cpuid,
9858 [EXIT_REASON_MSR_READ] = handle_rdmsr,
9859 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
9860 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
9861 [EXIT_REASON_HLT] = handle_halt,
ec25d5e6 9862 [EXIT_REASON_INVD] = handle_invd,
a7052897 9863 [EXIT_REASON_INVLPG] = handle_invlpg,
fee84b07 9864 [EXIT_REASON_RDPMC] = handle_rdpmc,
c21415e8 9865 [EXIT_REASON_VMCALL] = handle_vmcall,
27d6c865 9866 [EXIT_REASON_VMCLEAR] = handle_vmclear,
cd232ad0 9867 [EXIT_REASON_VMLAUNCH] = handle_vmlaunch,
63846663 9868 [EXIT_REASON_VMPTRLD] = handle_vmptrld,
6a4d7550 9869 [EXIT_REASON_VMPTRST] = handle_vmptrst,
49f705c5 9870 [EXIT_REASON_VMREAD] = handle_vmread,
cd232ad0 9871 [EXIT_REASON_VMRESUME] = handle_vmresume,
49f705c5 9872 [EXIT_REASON_VMWRITE] = handle_vmwrite,
ec378aee
NHE
9873 [EXIT_REASON_VMOFF] = handle_vmoff,
9874 [EXIT_REASON_VMON] = handle_vmon,
f78e0e2e
SY
9875 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
9876 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
83d4c286 9877 [EXIT_REASON_APIC_WRITE] = handle_apic_write,
c7c9c56c 9878 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
e5edaa01 9879 [EXIT_REASON_WBINVD] = handle_wbinvd,
2acf923e 9880 [EXIT_REASON_XSETBV] = handle_xsetbv,
37817f29 9881 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
a0861c02 9882 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
0367f205
PB
9883 [EXIT_REASON_GDTR_IDTR] = handle_desc,
9884 [EXIT_REASON_LDTR_TR] = handle_desc,
68f89400
MT
9885 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
9886 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
4b8d54f9 9887 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
87c00572 9888 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait,
5f3d45e7 9889 [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,
87c00572 9890 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor,
bfd0a56b 9891 [EXIT_REASON_INVEPT] = handle_invept,
a642fc30 9892 [EXIT_REASON_INVVPID] = handle_invvpid,
45ec368c 9893 [EXIT_REASON_RDRAND] = handle_invalid_op,
75f4fc8d 9894 [EXIT_REASON_RDSEED] = handle_invalid_op,
f53cd63c
WL
9895 [EXIT_REASON_XSAVES] = handle_xsaves,
9896 [EXIT_REASON_XRSTORS] = handle_xrstors,
843e4330 9897 [EXIT_REASON_PML_FULL] = handle_pml_full,
eb4b248e 9898 [EXIT_REASON_INVPCID] = handle_invpcid,
2a499e49 9899 [EXIT_REASON_VMFUNC] = handle_vmfunc,
64672c95 9900 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
0b665d30 9901 [EXIT_REASON_ENCLS] = handle_encls,
6aa8b732
AK
9902};
9903
9904static const int kvm_vmx_max_exit_handlers =
50a3485c 9905 ARRAY_SIZE(kvm_vmx_exit_handlers);
6aa8b732 9906
908a7bdd
JK
9907static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
9908 struct vmcs12 *vmcs12)
9909{
9910 unsigned long exit_qualification;
9911 gpa_t bitmap, last_bitmap;
9912 unsigned int port;
9913 int size;
9914 u8 b;
9915
908a7bdd 9916 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
2f0a6397 9917 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
908a7bdd
JK
9918
9919 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
9920
9921 port = exit_qualification >> 16;
9922 size = (exit_qualification & 7) + 1;
9923
9924 last_bitmap = (gpa_t)-1;
9925 b = -1;
9926
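	/*
	 * vmcs12 has two 4 KiB I/O bitmaps: bitmap A covers ports
	 * 0x0000-0x7fff and bitmap B covers 0x8000-0xffff, one bit per
	 * port. A multi-byte access is reflected to L1 if any accessed
	 * port has its bit set, so walk the ports one at a time and
	 * re-read the bitmap byte only when it changes.
	 */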
9927 while (size > 0) {
9928 if (port < 0x8000)
9929 bitmap = vmcs12->io_bitmap_a;
9930 else if (port < 0x10000)
9931 bitmap = vmcs12->io_bitmap_b;
9932 else
1d804d07 9933 return true;
908a7bdd
JK
9934 bitmap += (port & 0x7fff) / 8;
9935
9936 if (last_bitmap != bitmap)
54bf36aa 9937 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
1d804d07 9938 return true;
908a7bdd 9939 if (b & (1 << (port & 7)))
1d804d07 9940 return true;
908a7bdd
JK
9941
9942 port++;
9943 size--;
9944 last_bitmap = bitmap;
9945 }
9946
1d804d07 9947 return false;
908a7bdd
JK
9948}
9949
644d711a
NHE
9950/*
 9951 * Return 1 if we should exit from L2 to L1 to handle an MSR access,
9952 * rather than handle it ourselves in L0. I.e., check whether L1 expressed
9953 * disinterest in the current event (read or write a specific MSR) by using an
9954 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
9955 */
9956static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
9957 struct vmcs12 *vmcs12, u32 exit_reason)
9958{
9959 u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
9960 gpa_t bitmap;
9961
cbd29cb6 9962 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
1d804d07 9963 return true;
644d711a
NHE
9964
9965 /*
9966 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
9967 * for the four combinations of read/write and low/high MSR numbers.
9968 * First we need to figure out which of the four to use:
9969 */
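	/*
	 * Layout of the 4 KiB MSR-bitmap page:
	 *   0x000: read bitmap for MSRs 0x00000000-0x00001fff
	 *   0x400: read bitmap for MSRs 0xc0000000-0xc0001fff
	 *   0x800: write bitmap for MSRs 0x00000000-0x00001fff
	 *   0xc00: write bitmap for MSRs 0xc0000000-0xc0001fff
	 */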
9970 bitmap = vmcs12->msr_bitmap;
9971 if (exit_reason == EXIT_REASON_MSR_WRITE)
9972 bitmap += 2048;
9973 if (msr_index >= 0xc0000000) {
9974 msr_index -= 0xc0000000;
9975 bitmap += 1024;
9976 }
9977
9978 /* Then read the msr_index'th bit from this bitmap: */
9979 if (msr_index < 1024*8) {
9980 unsigned char b;
54bf36aa 9981 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
1d804d07 9982 return true;
644d711a
NHE
9983 return 1 & (b >> (msr_index & 7));
9984 } else
1d804d07 9985 return true; /* let L1 handle the wrong parameter */
644d711a
NHE
9986}
9987
9988/*
9989 * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
9990 * rather than handle it ourselves in L0. I.e., check if L1 wanted to
9991 * intercept (via guest_host_mask etc.) the current event.
9992 */
9993static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
9994 struct vmcs12 *vmcs12)
9995{
9996 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
9997 int cr = exit_qualification & 15;
e1d39b17
JS
9998 int reg;
9999 unsigned long val;
644d711a
NHE
10000
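	/*
	 * CR-access exit qualification: bits 3:0 hold the control
	 * register number, bits 5:4 the access type (0 = MOV to CR,
	 * 1 = MOV from CR, 2 = CLTS, 3 = LMSW), bits 11:8 the GPR used
	 * by MOV, and bits 31:16 the LMSW source data.
	 */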
10001 switch ((exit_qualification >> 4) & 3) {
10002 case 0: /* mov to cr */
e1d39b17
JS
10003 reg = (exit_qualification >> 8) & 15;
10004 val = kvm_register_readl(vcpu, reg);
644d711a
NHE
10005 switch (cr) {
10006 case 0:
10007 if (vmcs12->cr0_guest_host_mask &
10008 (val ^ vmcs12->cr0_read_shadow))
1d804d07 10009 return true;
644d711a
NHE
10010 break;
10011 case 3:
10012 if ((vmcs12->cr3_target_count >= 1 &&
10013 vmcs12->cr3_target_value0 == val) ||
10014 (vmcs12->cr3_target_count >= 2 &&
10015 vmcs12->cr3_target_value1 == val) ||
10016 (vmcs12->cr3_target_count >= 3 &&
10017 vmcs12->cr3_target_value2 == val) ||
10018 (vmcs12->cr3_target_count >= 4 &&
10019 vmcs12->cr3_target_value3 == val))
1d804d07 10020 return false;
644d711a 10021 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
1d804d07 10022 return true;
644d711a
NHE
10023 break;
10024 case 4:
10025 if (vmcs12->cr4_guest_host_mask &
10026 (vmcs12->cr4_read_shadow ^ val))
1d804d07 10027 return true;
644d711a
NHE
10028 break;
10029 case 8:
10030 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
1d804d07 10031 return true;
644d711a
NHE
10032 break;
10033 }
10034 break;
10035 case 2: /* clts */
10036 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
10037 (vmcs12->cr0_read_shadow & X86_CR0_TS))
1d804d07 10038 return true;
644d711a
NHE
10039 break;
10040 case 1: /* mov from cr */
10041 switch (cr) {
10042 case 3:
10043 if (vmcs12->cpu_based_vm_exec_control &
10044 CPU_BASED_CR3_STORE_EXITING)
1d804d07 10045 return true;
644d711a
NHE
10046 break;
10047 case 8:
10048 if (vmcs12->cpu_based_vm_exec_control &
10049 CPU_BASED_CR8_STORE_EXITING)
1d804d07 10050 return true;
644d711a
NHE
10051 break;
10052 }
10053 break;
10054 case 3: /* lmsw */
10055 /*
10056 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
10057 * cr0. Other attempted changes are ignored, with no exit.
10058 */
e1d39b17 10059 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
644d711a
NHE
10060 if (vmcs12->cr0_guest_host_mask & 0xe &
10061 (val ^ vmcs12->cr0_read_shadow))
1d804d07 10062 return true;
644d711a
NHE
10063 if ((vmcs12->cr0_guest_host_mask & 0x1) &&
10064 !(vmcs12->cr0_read_shadow & 0x1) &&
10065 (val & 0x1))
1d804d07 10066 return true;
644d711a
NHE
10067 break;
10068 }
1d804d07 10069 return false;
644d711a
NHE
10070}
10071
a7cde481
LA
10072static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
10073 struct vmcs12 *vmcs12, gpa_t bitmap)
10074{
10075 u32 vmx_instruction_info;
10076 unsigned long field;
10077 u8 b;
10078
10079 if (!nested_cpu_has_shadow_vmcs(vmcs12))
10080 return true;
10081
10082 /* Decode instruction info and find the field to access */
10083 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
10084 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
10085
10086 /* Out-of-range fields always cause a VM exit from L2 to L1 */
10087 if (field >> 15)
10088 return true;
10089
10090 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
10091 return true;
10092
10093 return 1 & (b >> (field & 7));
10094}
10095
644d711a
NHE
10096/*
10097 * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
10098 * should handle it ourselves in L0 (and then continue L2). Only call this
10099 * when in is_guest_mode (L2).
10100 */
7313c698 10101static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
644d711a 10102{
644d711a
NHE
10103 u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
10104 struct vcpu_vmx *vmx = to_vmx(vcpu);
10105 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
10106
4f350c6d
JM
10107 if (vmx->nested.nested_run_pending)
10108 return false;
10109
10110 if (unlikely(vmx->fail)) {
10111 pr_info_ratelimited("%s failed vm entry %x\n", __func__,
10112 vmcs_read32(VM_INSTRUCTION_ERROR));
10113 return true;
10114 }
542060ea 10115
c9f04407
DM
10116 /*
10117 * The host physical addresses of some pages of guest memory
de3a0021
JM
10118 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
10119 * Page). The CPU may write to these pages via their host
10120 * physical address while L2 is running, bypassing any
10121 * address-translation-based dirty tracking (e.g. EPT write
10122 * protection).
c9f04407
DM
10123 *
10124 * Mark them dirty on every exit from L2 to prevent them from
10125 * getting out of sync with dirty tracking.
10126 */
10127 nested_mark_vmcs12_pages_dirty(vcpu);
10128
4f350c6d
JM
10129 trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
10130 vmcs_readl(EXIT_QUALIFICATION),
10131 vmx->idt_vectoring_info,
10132 intr_info,
10133 vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
10134 KVM_ISA_VMX);
644d711a
NHE
10135
10136 switch (exit_reason) {
10137 case EXIT_REASON_EXCEPTION_NMI:
ef85b673 10138 if (is_nmi(intr_info))
1d804d07 10139 return false;
644d711a 10140 else if (is_page_fault(intr_info))
52a5c155 10141 return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
6f05485d
JK
10142 else if (is_debug(intr_info) &&
10143 vcpu->guest_debug &
10144 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
10145 return false;
10146 else if (is_breakpoint(intr_info) &&
10147 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
10148 return false;
644d711a
NHE
10149 return vmcs12->exception_bitmap &
10150 (1u << (intr_info & INTR_INFO_VECTOR_MASK));
10151 case EXIT_REASON_EXTERNAL_INTERRUPT:
1d804d07 10152 return false;
644d711a 10153 case EXIT_REASON_TRIPLE_FAULT:
1d804d07 10154 return true;
644d711a 10155 case EXIT_REASON_PENDING_INTERRUPT:
3b656cf7 10156 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
644d711a 10157 case EXIT_REASON_NMI_WINDOW:
3b656cf7 10158 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
644d711a 10159 case EXIT_REASON_TASK_SWITCH:
1d804d07 10160 return true;
644d711a 10161 case EXIT_REASON_CPUID:
1d804d07 10162 return true;
644d711a
NHE
10163 case EXIT_REASON_HLT:
10164 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
10165 case EXIT_REASON_INVD:
1d804d07 10166 return true;
644d711a
NHE
10167 case EXIT_REASON_INVLPG:
10168 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
10169 case EXIT_REASON_RDPMC:
10170 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
a5f46457 10171 case EXIT_REASON_RDRAND:
736fdf72 10172 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
a5f46457 10173 case EXIT_REASON_RDSEED:
736fdf72 10174 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
b3a2a907 10175 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
644d711a 10176 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
a7cde481
LA
10177 case EXIT_REASON_VMREAD:
10178 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
10179 vmcs12->vmread_bitmap);
10180 case EXIT_REASON_VMWRITE:
10181 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
10182 vmcs12->vmwrite_bitmap);
644d711a
NHE
10183 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
10184 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
a7cde481 10185 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
644d711a 10186 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
a642fc30 10187 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
644d711a
NHE
10188 /*
10189 * VMX instructions trap unconditionally. This allows L1 to
10190 * emulate them for its L2 guest, i.e., allows 3-level nesting!
10191 */
1d804d07 10192 return true;
644d711a
NHE
10193 case EXIT_REASON_CR_ACCESS:
10194 return nested_vmx_exit_handled_cr(vcpu, vmcs12);
10195 case EXIT_REASON_DR_ACCESS:
10196 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
10197 case EXIT_REASON_IO_INSTRUCTION:
908a7bdd 10198 return nested_vmx_exit_handled_io(vcpu, vmcs12);
1b07304c
PB
10199 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
10200 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
644d711a
NHE
10201 case EXIT_REASON_MSR_READ:
10202 case EXIT_REASON_MSR_WRITE:
10203 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
10204 case EXIT_REASON_INVALID_STATE:
1d804d07 10205 return true;
644d711a
NHE
10206 case EXIT_REASON_MWAIT_INSTRUCTION:
10207 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
5f3d45e7
MD
10208 case EXIT_REASON_MONITOR_TRAP_FLAG:
10209 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
644d711a
NHE
10210 case EXIT_REASON_MONITOR_INSTRUCTION:
10211 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
10212 case EXIT_REASON_PAUSE_INSTRUCTION:
10213 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
10214 nested_cpu_has2(vmcs12,
10215 SECONDARY_EXEC_PAUSE_LOOP_EXITING);
10216 case EXIT_REASON_MCE_DURING_VMENTRY:
1d804d07 10217 return false;
644d711a 10218 case EXIT_REASON_TPR_BELOW_THRESHOLD:
a7c0b07d 10219 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
644d711a 10220 case EXIT_REASON_APIC_ACCESS:
82f0dd4b 10221 case EXIT_REASON_APIC_WRITE:
608406e2 10222 case EXIT_REASON_EOI_INDUCED:
ab5df31c
JM
10223 /*
10224 * The controls for "virtualize APIC accesses," "APIC-
10225 * register virtualization," and "virtual-interrupt
10226 * delivery" only come from vmcs12.
10227 */
1d804d07 10228 return true;
644d711a 10229 case EXIT_REASON_EPT_VIOLATION:
2b1be677
NHE
10230 /*
10231 * L0 always deals with the EPT violation. If nested EPT is
10232 * used, and the nested mmu code discovers that the address is
10233 * missing in the guest EPT table (EPT12), the EPT violation
10234 * will be injected with nested_ept_inject_page_fault()
10235 */
1d804d07 10236 return false;
644d711a 10237 case EXIT_REASON_EPT_MISCONFIG:
2b1be677
NHE
10238 /*
10239 * L2 never uses directly L1's EPT, but rather L0's own EPT
10240 * table (shadow on EPT) or a merged EPT table that L0 built
10241 * (EPT on EPT). So any problems with the structure of the
10242 * table is L0's fault.
10243 */
1d804d07 10244 return false;
90a2db6d
PB
10245 case EXIT_REASON_INVPCID:
10246 return
10247 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
10248 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
644d711a
NHE
10249 case EXIT_REASON_WBINVD:
10250 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
10251 case EXIT_REASON_XSETBV:
1d804d07 10252 return true;
81dc01f7
WL
10253 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
10254 /*
10255 * This should never happen, since it is not possible to
10256 * set XSS to a non-zero value---neither in L1 nor in L2.
10257 * If it were, XSS would have to be checked against
10258 * the XSS exit bitmap in vmcs12.
10259 */
10260 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
55123e3c
WL
10261 case EXIT_REASON_PREEMPTION_TIMER:
10262 return false;
ab007cc9 10263 case EXIT_REASON_PML_FULL:
03efce6f 10264 /* We emulate PML support to L1. */
ab007cc9 10265 return false;
2a499e49
BD
10266 case EXIT_REASON_VMFUNC:
10267 /* VM functions are emulated through L2->L0 vmexits. */
10268 return false;
0b665d30
SC
10269 case EXIT_REASON_ENCLS:
10270 /* SGX is never exposed to L1 */
10271 return false;
644d711a 10272 default:
1d804d07 10273 return true;
644d711a
NHE
10274 }
10275}
10276
7313c698
PB
10277static int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
10278{
10279 u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
10280
10281 /*
10282 * At this point, the exit interruption info in exit_intr_info
10283 * is only valid for EXCEPTION_NMI exits. For EXTERNAL_INTERRUPT
10284 * we need to query the in-kernel LAPIC.
10285 */
10286 WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
10287 if ((exit_intr_info &
10288 (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
10289 (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
10290 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
10291 vmcs12->vm_exit_intr_error_code =
10292 vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
10293 }
10294
10295 nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info,
10296 vmcs_readl(EXIT_QUALIFICATION));
10297 return 1;
10298}
10299
586f9607
AK
10300static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
10301{
10302 *info1 = vmcs_readl(EXIT_QUALIFICATION);
10303 *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
10304}
10305
a3eaa864 10306static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
843e4330 10307{
a3eaa864
KH
10308 if (vmx->pml_pg) {
10309 __free_page(vmx->pml_pg);
10310 vmx->pml_pg = NULL;
10311 }
843e4330
KH
10312}
10313
54bf36aa 10314static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
843e4330 10315{
54bf36aa 10316 struct vcpu_vmx *vmx = to_vmx(vcpu);
843e4330
KH
10317 u64 *pml_buf;
10318 u16 pml_idx;
10319
10320 pml_idx = vmcs_read16(GUEST_PML_INDEX);
10321
10322 /* Do nothing if PML buffer is empty */
10323 if (pml_idx == (PML_ENTITY_NUM - 1))
10324 return;
10325
10326 /* PML index always points to next available PML buffer entity */
10327 if (pml_idx >= PML_ENTITY_NUM)
10328 pml_idx = 0;
10329 else
10330 pml_idx++;
10331
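	/*
	 * Hardware fills the 512-entry buffer from the last slot
	 * downward, decrementing GUEST_PML_INDEX as it goes, so after
	 * the adjustment above the logged GPAs live in entries
	 * pml_idx..PML_ENTITY_NUM-1 and are replayed into the dirty
	 * bitmap below.
	 */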
10332 pml_buf = page_address(vmx->pml_pg);
10333 for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
10334 u64 gpa;
10335
10336 gpa = pml_buf[pml_idx];
10337 WARN_ON(gpa & (PAGE_SIZE - 1));
54bf36aa 10338 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
843e4330
KH
10339 }
10340
10341 /* reset PML index */
10342 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
10343}
10344
10345/*
10346 * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
10347 * Called before reporting dirty_bitmap to userspace.
10348 */
10349static void kvm_flush_pml_buffers(struct kvm *kvm)
10350{
10351 int i;
10352 struct kvm_vcpu *vcpu;
10353 /*
10354 * We only need to kick each vcpu out of guest mode here: the PML
10355 * buffer is flushed at the beginning of every VMEXIT, so only vcpus
10356 * currently running in guest mode can have unflushed GPAs in their
10357 * PML buffer.
10358 */
10359 kvm_for_each_vcpu(i, vcpu, kvm)
10360 kvm_vcpu_kick(vcpu);
10361}
10362
4eb64dce
PB
10363static void vmx_dump_sel(char *name, uint32_t sel)
10364{
10365 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
96794e4e 10366 name, vmcs_read16(sel),
4eb64dce
PB
10367 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
10368 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
10369 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
10370}
10371
10372static void vmx_dump_dtsel(char *name, uint32_t limit)
10373{
10374 pr_err("%s limit=0x%08x, base=0x%016lx\n",
10375 name, vmcs_read32(limit),
10376 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
10377}
10378
10379static void dump_vmcs(void)
10380{
10381 u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
10382 u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
10383 u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
10384 u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
10385 u32 secondary_exec_control = 0;
10386 unsigned long cr4 = vmcs_readl(GUEST_CR4);
f3531054 10387 u64 efer = vmcs_read64(GUEST_IA32_EFER);
4eb64dce
PB
10388 int i, n;
10389
10390 if (cpu_has_secondary_exec_ctrls())
10391 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
10392
10393 pr_err("*** Guest State ***\n");
10394 pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
10395 vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
10396 vmcs_readl(CR0_GUEST_HOST_MASK));
10397 pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
10398 cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
10399 pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
10400 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
10401 (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
10402 {
845c5b40
PB
10403 pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
10404 vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
10405 pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
10406 vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
4eb64dce
PB
10407 }
10408 pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
10409 vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
10410 pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
10411 vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
10412 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
10413 vmcs_readl(GUEST_SYSENTER_ESP),
10414 vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
10415 vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);
10416 vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);
10417 vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);
10418 vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);
10419 vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);
10420 vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);
10421 vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
10422 vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
10423 vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
10424 vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
10425 if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
10426 (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
845c5b40
PB
10427 pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
10428 efer, vmcs_read64(GUEST_IA32_PAT));
10429 pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
10430 vmcs_read64(GUEST_IA32_DEBUGCTL),
4eb64dce 10431 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
773e8a04
VK
10432 if (cpu_has_load_perf_global_ctrl &&
10433 vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
845c5b40
PB
10434 pr_err("PerfGlobCtl = 0x%016llx\n",
10435 vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
4eb64dce 10436 if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
845c5b40 10437 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
4eb64dce
PB
10438 pr_err("Interruptibility = %08x ActivityState = %08x\n",
10439 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
10440 vmcs_read32(GUEST_ACTIVITY_STATE));
10441 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
10442 pr_err("InterruptStatus = %04x\n",
10443 vmcs_read16(GUEST_INTR_STATUS));
10444
10445 pr_err("*** Host State ***\n");
10446 pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
10447 vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
10448 pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
10449 vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
10450 vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
10451 vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
10452 vmcs_read16(HOST_TR_SELECTOR));
10453 pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
10454 vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
10455 vmcs_readl(HOST_TR_BASE));
10456 pr_err("GDTBase=%016lx IDTBase=%016lx\n",
10457 vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
10458 pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
10459 vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
10460 vmcs_readl(HOST_CR4));
10461 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
10462 vmcs_readl(HOST_IA32_SYSENTER_ESP),
10463 vmcs_read32(HOST_IA32_SYSENTER_CS),
10464 vmcs_readl(HOST_IA32_SYSENTER_EIP));
10465 if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
845c5b40
PB
10466 pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
10467 vmcs_read64(HOST_IA32_EFER),
10468 vmcs_read64(HOST_IA32_PAT));
773e8a04
VK
10469 if (cpu_has_load_perf_global_ctrl &&
10470 vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
845c5b40
PB
10471 pr_err("PerfGlobCtl = 0x%016llx\n",
10472 vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
4eb64dce
PB
10473
10474 pr_err("*** Control State ***\n");
10475 pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
10476 pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
10477 pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
10478 pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
10479 vmcs_read32(EXCEPTION_BITMAP),
10480 vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
10481 vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
10482 pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
10483 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
10484 vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
10485 vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
10486 pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
10487 vmcs_read32(VM_EXIT_INTR_INFO),
10488 vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
10489 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
10490 pr_err(" reason=%08x qualification=%016lx\n",
10491 vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
10492 pr_err("IDTVectoring: info=%08x errcode=%08x\n",
10493 vmcs_read32(IDT_VECTORING_INFO_FIELD),
10494 vmcs_read32(IDT_VECTORING_ERROR_CODE));
845c5b40 10495 pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
8cfe9866 10496 if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
845c5b40
PB
10497 pr_err("TSC Multiplier = 0x%016llx\n",
10498 vmcs_read64(TSC_MULTIPLIER));
4eb64dce
PB
10499 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
10500 pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
10501 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
10502 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
10503 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
845c5b40 10504 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
4eb64dce
PB
10505 n = vmcs_read32(CR3_TARGET_COUNT);
10506 for (i = 0; i + 1 < n; i += 4)
10507 pr_err("CR3 target%u=%016lx target%u=%016lx\n",
10508 i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
10509 i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
10510 if (i < n)
10511 pr_err("CR3 target%u=%016lx\n",
10512 i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
10513 if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
10514 pr_err("PLE Gap=%08x Window=%08x\n",
10515 vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
10516 if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
10517 pr_err("Virtual processor ID = 0x%04x\n",
10518 vmcs_read16(VIRTUAL_PROCESSOR_ID));
10519}
10520
6aa8b732
AK
10521/*
10522 * The guest has exited. See if we can fix it or if we need userspace
10523 * assistance.
10524 */
851ba692 10525static int vmx_handle_exit(struct kvm_vcpu *vcpu)
6aa8b732 10526{
29bd8a78 10527 struct vcpu_vmx *vmx = to_vmx(vcpu);
a0861c02 10528 u32 exit_reason = vmx->exit_reason;
1155f76a 10529 u32 vectoring_info = vmx->idt_vectoring_info;
29bd8a78 10530
8b89fe1f
PB
10531 trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
10532
843e4330
KH
10533 /*
10534 * Flush the logged GPAs out of the PML buffer so that dirty_bitmap
10535 * is up to date. Another benefit: in kvm_vm_ioctl_get_dirty_log,
10536 * before querying dirty_bitmap, we only need to kick all vcpus out
10537 * of guest mode, because once a vcpu is back in root mode its PML
10538 * buffer has already been flushed.
10539 */
10540 if (enable_pml)
54bf36aa 10541 vmx_flush_pml_buffer(vcpu);
843e4330 10542
80ced186 10543 /* If guest state is invalid, start emulating */
14168786 10544 if (vmx->emulation_required)
80ced186 10545 return handle_invalid_guest_state(vcpu);
1d5a4d9b 10546
7313c698
PB
10547 if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
10548 return nested_vmx_reflect_vmexit(vcpu, exit_reason);
644d711a 10549
5120702e 10550 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
4eb64dce 10551 dump_vmcs();
5120702e
MG
10552 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
10553 vcpu->run->fail_entry.hardware_entry_failure_reason
10554 = exit_reason;
10555 return 0;
10556 }
10557
29bd8a78 10558 if (unlikely(vmx->fail)) {
851ba692
AK
10559 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
10560 vcpu->run->fail_entry.hardware_entry_failure_reason
29bd8a78
AK
10561 = vmcs_read32(VM_INSTRUCTION_ERROR);
10562 return 0;
10563 }
6aa8b732 10564
b9bf6882
XG
10565 /*
10566 * Note:
10567 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by
10568 * a delivery event, since that indicates the guest is accessing MMIO.
10569 * The vm-exit would be triggered again after returning to the guest,
10570 * causing an infinite loop.
10571 */
d77c26fc 10572 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
1439442c 10573 (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
60637aac 10574 exit_reason != EXIT_REASON_EPT_VIOLATION &&
b244c9fc 10575 exit_reason != EXIT_REASON_PML_FULL &&
b9bf6882
XG
10576 exit_reason != EXIT_REASON_TASK_SWITCH)) {
10577 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
10578 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
70bcd708 10579 vcpu->run->internal.ndata = 3;
b9bf6882
XG
10580 vcpu->run->internal.data[0] = vectoring_info;
10581 vcpu->run->internal.data[1] = exit_reason;
70bcd708
PB
10582 vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
10583 if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
10584 vcpu->run->internal.ndata++;
10585 vcpu->run->internal.data[3] =
10586 vmcs_read64(GUEST_PHYSICAL_ADDRESS);
10587 }
b9bf6882
XG
10588 return 0;
10589 }
3b86cd99 10590
d02fcf50 10591 if (unlikely(!enable_vnmi &&
8a1b4392
PB
10592 vmx->loaded_vmcs->soft_vnmi_blocked)) {
10593 if (vmx_interrupt_allowed(vcpu)) {
10594 vmx->loaded_vmcs->soft_vnmi_blocked = 0;
10595 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
10596 vcpu->arch.nmi_pending) {
10597 /*
10598 * This CPU doesn't help us find the end of an
10599 * NMI-blocked window if the guest runs with IRQs
10600 * disabled. So we pull the trigger after 1 s of
10601 * futile waiting, but inform the user about it.
10602 */
10603 printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
10604 "state on VCPU %d after 1 s timeout\n",
10605 __func__, vcpu->vcpu_id);
10606 vmx->loaded_vmcs->soft_vnmi_blocked = 0;
10607 }
10608 }
10609
6aa8b732
AK
10610 if (exit_reason < kvm_vmx_max_exit_handlers
10611 && kvm_vmx_exit_handlers[exit_reason])
851ba692 10612 return kvm_vmx_exit_handlers[exit_reason](vcpu);
6aa8b732 10613 else {
6c6c5e03
RK
10614 vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
10615 exit_reason);
2bc19dc3
MT
10616 kvm_queue_exception(vcpu, UD_VECTOR);
10617 return 1;
6aa8b732 10618 }
6aa8b732
AK
10619}
10620
a47dd5f0
PB
10621/*
10622 * Software based L1D cache flush which is used when microcode providing
10623 * the cache control MSR is not loaded.
10624 *
10625 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but
10626 * flushing it requires reading 64 KiB because the replacement algorithm
10627 * is not exactly LRU. This could be sized at runtime via topology
10628 * information, but as all relevant affected CPUs have a 32 KiB L1D cache
10629 * there is no point in doing so.
10630 */
c595ceee 10631static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
a47dd5f0
PB
10632{
10633 int size = PAGE_SIZE << L1D_CACHE_ORDER;
c595ceee
PB
10634
10635 /*
2f055947
TG
10636 * This code is only executed when the flush mode is 'cond' or
10637 * 'always'.
c595ceee 10638 */
427362a1 10639 if (static_branch_likely(&vmx_l1d_flush_cond)) {
45b575c0 10640 bool flush_l1d;
5b6ccc6c 10641
379fd0c7 10642 /*
45b575c0
NS
10643 * Clear the per-vcpu flush bit, it gets set again
10644 * either from vcpu_run() or from one of the unsafe
10645 * VMEXIT handlers.
379fd0c7 10646 */
45b575c0 10647 flush_l1d = vcpu->arch.l1tf_flush_l1d;
4c6523ec 10648 vcpu->arch.l1tf_flush_l1d = false;
45b575c0
NS
10649
10650 /*
10651 * Clear the per-cpu flush bit, it gets set again from
10652 * the interrupt handlers.
10653 */
10654 flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
10655 kvm_clear_cpu_l1tf_flush_l1d();
10656
5b6ccc6c
NS
10657 if (!flush_l1d)
10658 return;
379fd0c7 10659 }
c595ceee
PB
10660
10661 vcpu->stat.l1d_flush++;
a47dd5f0 10662
3fa045be
PB
10663 if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
10664 wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
10665 return;
10666 }
10667
a47dd5f0
PB
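	/*
	 * Software fallback: touch one byte in every 4 KiB page of the
	 * flush buffer so its translations are in the TLB, then stream
	 * through it again in 64-byte (cache line) steps so the loads
	 * displace the existing L1D contents.
	 */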
10668 asm volatile(
10669 /* First ensure the pages are in the TLB */
10670 "xorl %%eax, %%eax\n"
10671 ".Lpopulate_tlb:\n\t"
288d152c 10672 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
a47dd5f0
PB
10673 "addl $4096, %%eax\n\t"
10674 "cmpl %%eax, %[size]\n\t"
10675 "jne .Lpopulate_tlb\n\t"
10676 "xorl %%eax, %%eax\n\t"
10677 "cpuid\n\t"
10678 /* Now fill the cache */
10679 "xorl %%eax, %%eax\n"
10680 ".Lfill_cache:\n"
288d152c 10681 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
a47dd5f0
PB
10682 "addl $64, %%eax\n\t"
10683 "cmpl %%eax, %[size]\n\t"
10684 "jne .Lfill_cache\n\t"
10685 "lfence\n"
288d152c 10686 :: [flush_pages] "r" (vmx_l1d_flush_pages),
a47dd5f0
PB
10687 [size] "r" (size)
10688 : "eax", "ebx", "ecx", "edx");
10689}
10690
95ba8273 10691static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
6e5d865c 10692{
a7c0b07d
WL
10693 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
10694
10695 if (is_guest_mode(vcpu) &&
10696 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
10697 return;
10698
95ba8273 10699 if (irr == -1 || tpr < irr) {
6e5d865c
YS
10700 vmcs_write32(TPR_THRESHOLD, 0);
10701 return;
10702 }
10703
95ba8273 10704 vmcs_write32(TPR_THRESHOLD, irr);
6e5d865c
YS
10705}
10706
8d860bbe 10707static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
8d14695f
YZ
10708{
10709 u32 sec_exec_control;
10710
8d860bbe
JM
10711 if (!lapic_in_kernel(vcpu))
10712 return;
10713
fd6b6d9b
SC
10714 if (!flexpriority_enabled &&
10715 !cpu_has_vmx_virtualize_x2apic_mode())
10716 return;
10717
dccbfcf5
RK
10718 /* Postpone execution until vmcs01 is the current VMCS. */
10719 if (is_guest_mode(vcpu)) {
8d860bbe 10720 to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true;
dccbfcf5
RK
10721 return;
10722 }
10723
8d14695f 10724 sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
8d860bbe
JM
10725 sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
10726 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
8d14695f 10727
8d860bbe
JM
10728 switch (kvm_get_apic_mode(vcpu)) {
10729 case LAPIC_MODE_INVALID:
10730 WARN_ONCE(true, "Invalid local APIC state");
10731 case LAPIC_MODE_DISABLED:
10732 break;
10733 case LAPIC_MODE_XAPIC:
10734 if (flexpriority_enabled) {
10735 sec_exec_control |=
10736 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
10737 vmx_flush_tlb(vcpu, true);
10738 }
10739 break;
10740 case LAPIC_MODE_X2APIC:
10741 if (cpu_has_vmx_virtualize_x2apic_mode())
10742 sec_exec_control |=
10743 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
10744 break;
8d14695f
YZ
10745 }
10746 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
10747
904e14fb 10748 vmx_update_msr_bitmap(vcpu);
8d14695f
YZ
10749}
10750
38b99173
TC
10751static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
10752{
ab5df31c 10753 if (!is_guest_mode(vcpu)) {
38b99173 10754 vmcs_write64(APIC_ACCESS_ADDR, hpa);
a468f2db 10755 vmx_flush_tlb(vcpu, true);
fb6c8198 10756 }
38b99173
TC
10757}
10758
67c9dddc 10759static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
c7c9c56c
YZ
10760{
10761 u16 status;
10762 u8 old;
10763
67c9dddc
PB
10764 if (max_isr == -1)
10765 max_isr = 0;
c7c9c56c
YZ
10766
10767 status = vmcs_read16(GUEST_INTR_STATUS);
10768 old = status >> 8;
67c9dddc 10769 if (max_isr != old) {
c7c9c56c 10770 status &= 0xff;
67c9dddc 10771 status |= max_isr << 8;
c7c9c56c
YZ
10772 vmcs_write16(GUEST_INTR_STATUS, status);
10773 }
10774}
10775
10776static void vmx_set_rvi(int vector)
10777{
10778 u16 status;
10779 u8 old;
10780
4114c27d
WW
10781 if (vector == -1)
10782 vector = 0;
10783
c7c9c56c
YZ
10784 status = vmcs_read16(GUEST_INTR_STATUS);
10785 old = (u8)status & 0xff;
10786 if ((u8)vector != old) {
10787 status &= ~0xff;
10788 status |= (u8)vector;
10789 vmcs_write16(GUEST_INTR_STATUS, status);
10790 }
10791}
10792
10793static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
10794{
963fee16 10795 /*
851c1a18
LA
10796 * When running L2, updating RVI is only relevant if
10797 * vmcs12 virtual-interrupt-delivery is enabled.
10798 * However, it can be enabled only when L1 also
10799 * intercepts external interrupts, and in that case
10800 * we should not update the vmcs02 RVI but instead
10801 * intercept the interrupt. So do nothing when running L2.
963fee16 10802 */
851c1a18
LA
10803 if (!is_guest_mode(vcpu))
10804 vmx_set_rvi(max_irr);
c7c9c56c
YZ
10805}
10806
76dfafd5 10807static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
810e6def
PB
10808{
10809 struct vcpu_vmx *vmx = to_vmx(vcpu);
76dfafd5 10810 int max_irr;
f27a85c4 10811 bool max_irr_updated;
810e6def 10812
76dfafd5
PB
10813 WARN_ON(!vcpu->arch.apicv_active);
10814 if (pi_test_on(&vmx->pi_desc)) {
10815 pi_clear_on(&vmx->pi_desc);
10816 /*
10817 * IOMMU can write to PIR.ON, so the barrier matters even on UP.
10818 * But on x86 this is just a compiler barrier anyway.
10819 */
10820 smp_mb__after_atomic();
f27a85c4
LA
10821 max_irr_updated =
10822 kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
10823
10824 /*
10825 * If we are running L2 and L1 has a new pending interrupt
10826 * which can be injected, we should re-evaluate
10827 * what should be done with this new L1 interrupt.
851c1a18
LA
10828 * If L1 intercepts external interrupts, we should
10829 * exit from L2 to L1. Otherwise, the interrupt should
10830 * be delivered directly to L2.
f27a85c4 10831 */
851c1a18
LA
10832 if (is_guest_mode(vcpu) && max_irr_updated) {
10833 if (nested_exit_on_intr(vcpu))
10834 kvm_vcpu_exiting_guest_mode(vcpu);
10835 else
10836 kvm_make_request(KVM_REQ_EVENT, vcpu);
10837 }
76dfafd5
PB
10838 } else {
10839 max_irr = kvm_lapic_find_highest_irr(vcpu);
10840 }
10841 vmx_hwapic_irr_update(vcpu, max_irr);
10842 return max_irr;
810e6def
PB
10843}
10844
7e712684
PB
10845static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
10846{
10847 u8 rvi = vmx_get_rvi();
10848 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
10849
10850 return ((rvi & 0xf0) > (vppr & 0xf0));
10851}
10852
6308630b 10853static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
c7c9c56c 10854{
d62caabb 10855 if (!kvm_vcpu_apicv_active(vcpu))
3d81bc7e
YZ
10856 return;
10857
c7c9c56c
YZ
10858 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
10859 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
10860 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
10861 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
10862}
10863
967235d3
PB
10864static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
10865{
10866 struct vcpu_vmx *vmx = to_vmx(vcpu);
10867
10868 pi_clear_on(&vmx->pi_desc);
10869 memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
10870}
10871
51aa01d1 10872static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
cf393f75 10873{
48ae0fb4
JM
10874 u32 exit_intr_info = 0;
10875 u16 basic_exit_reason = (u16)vmx->exit_reason;
00eba012 10876
48ae0fb4
JM
10877 if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
10878 || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI))
00eba012
AK
10879 return;
10880
48ae0fb4
JM
10881 if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
10882 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
10883 vmx->exit_intr_info = exit_intr_info;
a0861c02 10884
1261bfa3
WL
10885 /* If the exit is due to a page fault, check for an async page fault. */
10886 if (is_page_fault(exit_intr_info))
10887 vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
10888
a0861c02 10889 /* Handle machine checks before interrupts are enabled */
48ae0fb4
JM
10890 if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY ||
10891 is_machine_check(exit_intr_info))
a0861c02
AK
10892 kvm_machine_check();
10893
20f65983 10894 /* We need to handle NMIs before interrupts are enabled */
ef85b673 10895 if (is_nmi(exit_intr_info)) {
dd60d217 10896 kvm_before_interrupt(&vmx->vcpu);
20f65983 10897 asm("int $2");
dd60d217 10898 kvm_after_interrupt(&vmx->vcpu);
ff9d07a0 10899 }
51aa01d1 10900}
20f65983 10901
a547c6db
YZ
10902static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
10903{
10904 u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
10905
a547c6db
YZ
10906 if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
10907 == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
10908 unsigned int vector;
10909 unsigned long entry;
10910 gate_desc *desc;
10911 struct vcpu_vmx *vmx = to_vmx(vcpu);
10912#ifdef CONFIG_X86_64
10913 unsigned long tmp;
10914#endif
10915
10916 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
10917 desc = (gate_desc *)vmx->host_idt_base + vector;
64b163fa 10918 entry = gate_offset(desc);
a547c6db
YZ
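		/*
		 * Hand the interrupt to the host: emulate hardware
		 * interrupt delivery by pushing SS:RSP (64-bit only),
		 * RFLAGS and CS, then CALL the host IDT entry for this
		 * vector so the pushed return address serves as the
		 * saved RIP for the handler's iret.
		 */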
10919 asm volatile(
10920#ifdef CONFIG_X86_64
10921 "mov %%" _ASM_SP ", %[sp]\n\t"
10922 "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
10923 "push $%c[ss]\n\t"
10924 "push %[sp]\n\t"
10925#endif
10926 "pushf\n\t"
a547c6db 10927 __ASM_SIZE(push) " $%c[cs]\n\t"
c940a3fb 10928 CALL_NOSPEC
a547c6db
YZ
10929 :
10930#ifdef CONFIG_X86_64
3f62de5f 10931 [sp]"=&r"(tmp),
a547c6db 10932#endif
f5caf621 10933 ASM_CALL_CONSTRAINT
a547c6db 10934 :
c940a3fb 10935 THUNK_TARGET(entry),
a547c6db
YZ
10936 [ss]"i"(__KERNEL_DS),
10937 [cs]"i"(__KERNEL_CS)
10938 );
f2485b3e 10939 }
a547c6db 10940}
c207aee4 10941STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
a547c6db 10942
bc226f07 10943static bool vmx_has_emulated_msr(int index)
6d396b55 10944{
bc226f07
TL
10945 switch (index) {
10946 case MSR_IA32_SMBASE:
10947 /*
10948 * We cannot do SMM unless we can run the guest in big
10949 * real mode.
10950 */
10951 return enable_unrestricted_guest || emulate_invalid_guest_state;
10952 case MSR_AMD64_VIRT_SPEC_CTRL:
10953 /* This is AMD only. */
10954 return false;
10955 default:
10956 return true;
10957 }
6d396b55
PB
10958}
10959
da8999d3
LJ
10960static bool vmx_mpx_supported(void)
10961{
10962 return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
10963 (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
10964}
10965
55412b2e
WL
10966static bool vmx_xsaves_supported(void)
10967{
10968 return vmcs_config.cpu_based_2nd_exec_ctrl &
10969 SECONDARY_EXEC_XSAVES;
10970}
10971
51aa01d1
AK
10972static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
10973{
c5ca8e57 10974 u32 exit_intr_info;
51aa01d1
AK
10975 bool unblock_nmi;
10976 u8 vector;
10977 bool idtv_info_valid;
10978
10979 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
20f65983 10980
d02fcf50 10981 if (enable_vnmi) {
8a1b4392
PB
10982 if (vmx->loaded_vmcs->nmi_known_unmasked)
10983 return;
10984 /*
10985 * Can't use vmx->exit_intr_info since we're not sure what
10986 * the exit reason is.
10987 */
10988 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
10989 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
10990 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
10991 /*
10992 * SDM 3: 27.7.1.2 (September 2008)
10993 * Re-set bit "block by NMI" before VM entry if vmexit caused by
10994 * a guest IRET fault.
10995 * SDM 3: 23.2.2 (September 2008)
10996 * Bit 12 is undefined in any of the following cases:
10997 * If the VM exit sets the valid bit in the IDT-vectoring
10998 * information field.
10999 * If the VM exit is due to a double fault.
11000 */
11001 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
11002 vector != DF_VECTOR && !idtv_info_valid)
11003 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
11004 GUEST_INTR_STATE_NMI);
11005 else
11006 vmx->loaded_vmcs->nmi_known_unmasked =
11007 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
11008 & GUEST_INTR_STATE_NMI);
11009 } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
11010 vmx->loaded_vmcs->vnmi_blocked_time +=
11011 ktime_to_ns(ktime_sub(ktime_get(),
11012 vmx->loaded_vmcs->entry_time));
51aa01d1
AK
11013}
11014
3ab66e8a 11015static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
83422e17
AK
11016 u32 idt_vectoring_info,
11017 int instr_len_field,
11018 int error_code_field)
51aa01d1 11019{
51aa01d1
AK
11020 u8 vector;
11021 int type;
11022 bool idtv_info_valid;
11023
11024 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
668f612f 11025
3ab66e8a
JK
11026 vcpu->arch.nmi_injected = false;
11027 kvm_clear_exception_queue(vcpu);
11028 kvm_clear_interrupt_queue(vcpu);
37b96e98
GN
11029
11030 if (!idtv_info_valid)
11031 return;
11032
3ab66e8a 11033 kvm_make_request(KVM_REQ_EVENT, vcpu);
3842d135 11034
668f612f
AK
11035 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
11036 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
37b96e98 11037
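	/*
	 * The event described by the IDT-vectoring info was cut short by
	 * this VM exit before it could be delivered; re-queue it (as an
	 * NMI, exception or interrupt) so it is re-injected on the next
	 * VM entry.
	 */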
64a7ec06 11038 switch (type) {
37b96e98 11039 case INTR_TYPE_NMI_INTR:
3ab66e8a 11040 vcpu->arch.nmi_injected = true;
668f612f 11041 /*
7b4a25cb 11042 * SDM 3: 27.7.1.2 (September 2008)
37b96e98
GN
11043 * Clear bit "block by NMI" before VM entry if a NMI
11044 * delivery faulted.
668f612f 11045 */
3ab66e8a 11046 vmx_set_nmi_mask(vcpu, false);
37b96e98 11047 break;
37b96e98 11048 case INTR_TYPE_SOFT_EXCEPTION:
3ab66e8a 11049 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
66fd3f7f
GN
11050 /* fall through */
11051 case INTR_TYPE_HARD_EXCEPTION:
35920a35 11052 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
83422e17 11053 u32 err = vmcs_read32(error_code_field);
851eb667 11054 kvm_requeue_exception_e(vcpu, vector, err);
35920a35 11055 } else
851eb667 11056 kvm_requeue_exception(vcpu, vector);
37b96e98 11057 break;
66fd3f7f 11058 case INTR_TYPE_SOFT_INTR:
3ab66e8a 11059 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
66fd3f7f 11060 /* fall through */
37b96e98 11061 case INTR_TYPE_EXT_INTR:
3ab66e8a 11062 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
37b96e98
GN
11063 break;
11064 default:
11065 break;
f7d9238f 11066 }
cf393f75
AK
11067}
11068
83422e17
AK
11069static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
11070{
3ab66e8a 11071 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
83422e17
AK
11072 VM_EXIT_INSTRUCTION_LEN,
11073 IDT_VECTORING_ERROR_CODE);
11074}
11075
b463a6f7
AK
11076static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
11077{
3ab66e8a 11078 __vmx_complete_interrupts(vcpu,
b463a6f7
AK
11079 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
11080 VM_ENTRY_INSTRUCTION_LEN,
11081 VM_ENTRY_EXCEPTION_ERROR_CODE);
11082
11083 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
11084}
11085
d7cd9796
GN
11086static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
11087{
11088 int i, nr_msrs;
11089 struct perf_guest_switch_msr *msrs;
11090
11091 msrs = perf_guest_get_msrs(&nr_msrs);
11092
11093 if (!msrs)
11094 return;
11095
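	/*
	 * Use the VM-entry/VM-exit MSR-switch lists only for perf MSRs
	 * whose guest and host values differ; identical values need no
	 * atomic switch at all.
	 */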
11096 for (i = 0; i < nr_msrs; i++)
11097 if (msrs[i].host == msrs[i].guest)
11098 clear_atomic_switch_msr(vmx, msrs[i].msr);
11099 else
11100 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
989e3992 11101 msrs[i].host, false);
d7cd9796
GN
11102}
11103
f459a707
SC
11104static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val)
11105{
11106 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val);
11107 if (!vmx->loaded_vmcs->hv_timer_armed)
11108 vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
11109 PIN_BASED_VMX_PREEMPTION_TIMER);
11110 vmx->loaded_vmcs->hv_timer_armed = true;
11111}
11112
11113static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
64672c95
YJ
11114{
11115 struct vcpu_vmx *vmx = to_vmx(vcpu);
11116 u64 tscl;
11117 u32 delta_tsc;
11118
d264ee0c
SC
11119 if (vmx->req_immediate_exit) {
11120 vmx_arm_hv_timer(vmx, 0);
11121 return;
11122 }
11123
f459a707
SC
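	/*
	 * The preemption timer counts down at the TSC rate shifted
	 * right by cpu_preemption_timer_multi, so convert the remaining
	 * TSC ticks until the deadline with the same shift.
	 */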
11124 if (vmx->hv_deadline_tsc != -1) {
11125 tscl = rdtsc();
11126 if (vmx->hv_deadline_tsc > tscl)
11127 /* set_hv_timer ensures the delta fits in 32-bits */
11128 delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
11129 cpu_preemption_timer_multi);
11130 else
11131 delta_tsc = 0;
64672c95 11132
f459a707
SC
11133 vmx_arm_hv_timer(vmx, delta_tsc);
11134 return;
11135 }
64672c95 11136
f459a707
SC
11137 if (vmx->loaded_vmcs->hv_timer_armed)
11138 vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
11139 PIN_BASED_VMX_PREEMPTION_TIMER);
11140 vmx->loaded_vmcs->hv_timer_armed = false;
64672c95
YJ
11141}
11142
a3b5ba49 11143static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6aa8b732 11144{
a2fa3e9f 11145 struct vcpu_vmx *vmx = to_vmx(vcpu);
773e8a04 11146 unsigned long cr3, cr4, evmcs_rsp;
104f226b 11147
8a1b4392 11148 /* Record the guest's net vcpu time for enforced NMI injections. */
d02fcf50 11149 if (unlikely(!enable_vnmi &&
8a1b4392
PB
11150 vmx->loaded_vmcs->soft_vnmi_blocked))
11151 vmx->loaded_vmcs->entry_time = ktime_get();
11152
104f226b
AK
11153 /* Don't enter VMX if guest state is invalid; let the exit handler
11154 start emulation until we arrive back at a valid state */
14168786 11155 if (vmx->emulation_required)
104f226b
AK
11156 return;
11157
a7653ecd
RK
11158 if (vmx->ple_window_dirty) {
11159 vmx->ple_window_dirty = false;
11160 vmcs_write32(PLE_WINDOW, vmx->ple_window);
11161 }
11162
945679e3 11163 if (vmx->nested.need_vmcs12_sync) {
8cab6507
VK
11164 /*
11165 * hv_evmcs may end up not being mapped after migration (when
11166 * L2 was running), map it here to make sure vmcs12 changes are
11167 * properly reflected.
11168 */
11169 if (vmx->nested.enlightened_vmcs_enabled &&
11170 !vmx->nested.hv_evmcs)
11171 nested_vmx_handle_enlightened_vmptrld(vcpu, false);
11172
945679e3
VK
11173 if (vmx->nested.hv_evmcs) {
11174 copy_vmcs12_to_enlightened(vmx);
11175 /* All fields are clean */
11176 vmx->nested.hv_evmcs->hv_clean_fields |=
11177 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
11178 } else {
11179 copy_vmcs12_to_shadow(vmx);
11180 }
11181 vmx->nested.need_vmcs12_sync = false;
012f83cb
AG
11182 }
11183
104f226b
AK
11184 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
11185 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
11186 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
11187 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
11188
d6e41f11 11189 cr3 = __get_current_cr3_fast();
d7ee039e 11190 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
d6e41f11 11191 vmcs_writel(HOST_CR3, cr3);
d7ee039e 11192 vmx->loaded_vmcs->host_state.cr3 = cr3;
d6e41f11
AL
11193 }
11194
1e02ce4c 11195 cr4 = cr4_read_shadow();
d7ee039e 11196 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
d974baa3 11197 vmcs_writel(HOST_CR4, cr4);
d7ee039e 11198 vmx->loaded_vmcs->host_state.cr4 = cr4;
d974baa3
AL
11199 }
11200
104f226b
AK
11201 /* When single-stepping over STI and MOV SS, we must clear the
11202 * corresponding interruptibility bits in the guest state. Otherwise
 11203 * vmentry fails as it then expects bit 14 (BS) of the pending debug
 11204 * exceptions field to be set, but that's not correct for the guest
 11205 * debugging case. */
11206 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
11207 vmx_set_interrupt_shadow(vcpu, 0);
11208
b9dd21e1
PB
11209 if (static_cpu_has(X86_FEATURE_PKU) &&
11210 kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
11211 vcpu->arch.pkru != vmx->host_pkru)
11212 __write_pkru(vcpu->arch.pkru);
1be0e61c 11213
d7cd9796
GN
11214 atomic_switch_perf_msrs(vmx);
11215
f459a707 11216 vmx_update_hv_timer(vcpu);
64672c95 11217
d28b387f
KA
11218 /*
11219 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
11220 * it's non-zero. Since vmentry is serialising on affected CPUs, there
11221 * is no need to worry about the conditional branch over the wrmsr
11222 * being speculatively taken.
11223 */
ccbcd267 11224 x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
d28b387f 11225
d462b819 11226 vmx->__launched = vmx->loaded_vmcs->launched;
773e8a04
VK
11227
11228 evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
11229 (unsigned long)&current_evmcs->host_rsp : 0;
11230
5b6ccc6c
NS
11231 if (static_branch_unlikely(&vmx_l1d_should_flush))
11232 vmx_l1d_flush(vcpu);
c595ceee 11233
104f226b 11234 asm(
6aa8b732 11235 /* Store host registers */
b188c81f
AK
11236 "push %%" _ASM_DX "; push %%" _ASM_BP ";"
11237 "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
11238 "push %%" _ASM_CX " \n\t"
11239 "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
313dbd49 11240 "je 1f \n\t"
b188c81f 11241 "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
773e8a04
VK
11242 /* Avoid VMWRITE when Enlightened VMCS is in use */
11243 "test %%" _ASM_SI ", %%" _ASM_SI " \n\t"
11244 "jz 2f \n\t"
11245 "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t"
11246 "jmp 1f \n\t"
11247 "2: \n\t"
4b1e5478 11248 __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
313dbd49 11249 "1: \n\t"
d3edefc0 11250 /* Reload cr2 if changed */
b188c81f
AK
11251 "mov %c[cr2](%0), %%" _ASM_AX " \n\t"
11252 "mov %%cr2, %%" _ASM_DX " \n\t"
11253 "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
773e8a04 11254 "je 3f \n\t"
b188c81f 11255 "mov %%" _ASM_AX", %%cr2 \n\t"
773e8a04 11256 "3: \n\t"
6aa8b732 11257 /* Check if vmlaunch or vmresume is needed */
e08aa78a 11258 "cmpl $0, %c[launched](%0) \n\t"
6aa8b732 11259 /* Load guest registers. Don't clobber flags. */
b188c81f
AK
11260 "mov %c[rax](%0), %%" _ASM_AX " \n\t"
11261 "mov %c[rbx](%0), %%" _ASM_BX " \n\t"
11262 "mov %c[rdx](%0), %%" _ASM_DX " \n\t"
11263 "mov %c[rsi](%0), %%" _ASM_SI " \n\t"
11264 "mov %c[rdi](%0), %%" _ASM_DI " \n\t"
11265 "mov %c[rbp](%0), %%" _ASM_BP " \n\t"
05b3e0c2 11266#ifdef CONFIG_X86_64
e08aa78a
AK
11267 "mov %c[r8](%0), %%r8 \n\t"
11268 "mov %c[r9](%0), %%r9 \n\t"
11269 "mov %c[r10](%0), %%r10 \n\t"
11270 "mov %c[r11](%0), %%r11 \n\t"
11271 "mov %c[r12](%0), %%r12 \n\t"
11272 "mov %c[r13](%0), %%r13 \n\t"
11273 "mov %c[r14](%0), %%r14 \n\t"
11274 "mov %c[r15](%0), %%r15 \n\t"
6aa8b732 11275#endif
b188c81f 11276 "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */
c801949d 11277
6aa8b732 11278 /* Enter guest mode */
83287ea4 11279 "jne 1f \n\t"
4b1e5478 11280 __ex("vmlaunch") "\n\t"
83287ea4 11281 "jmp 2f \n\t"
4b1e5478 11282 "1: " __ex("vmresume") "\n\t"
83287ea4 11283 "2: "
6aa8b732 11284 /* Save guest registers, load host registers, keep flags */
b188c81f 11285 "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
40712fae 11286 "pop %0 \n\t"
0cb5b306 11287 "setbe %c[fail](%0)\n\t"
b188c81f
AK
11288 "mov %%" _ASM_AX ", %c[rax](%0) \n\t"
11289 "mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
11290 __ASM_SIZE(pop) " %c[rcx](%0) \n\t"
11291 "mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
11292 "mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
11293 "mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
11294 "mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
05b3e0c2 11295#ifdef CONFIG_X86_64
e08aa78a
AK
11296 "mov %%r8, %c[r8](%0) \n\t"
11297 "mov %%r9, %c[r9](%0) \n\t"
11298 "mov %%r10, %c[r10](%0) \n\t"
11299 "mov %%r11, %c[r11](%0) \n\t"
11300 "mov %%r12, %c[r12](%0) \n\t"
11301 "mov %%r13, %c[r13](%0) \n\t"
11302 "mov %%r14, %c[r14](%0) \n\t"
11303 "mov %%r15, %c[r15](%0) \n\t"
43ce76ce
UB
11304 /*
11305 * Clear host registers marked as clobbered to prevent
11306 * speculative use.
11307 */
0cb5b306
JM
11308 "xor %%r8d, %%r8d \n\t"
11309 "xor %%r9d, %%r9d \n\t"
11310 "xor %%r10d, %%r10d \n\t"
11311 "xor %%r11d, %%r11d \n\t"
11312 "xor %%r12d, %%r12d \n\t"
11313 "xor %%r13d, %%r13d \n\t"
11314 "xor %%r14d, %%r14d \n\t"
11315 "xor %%r15d, %%r15d \n\t"
6aa8b732 11316#endif
b188c81f
AK
11317 "mov %%cr2, %%" _ASM_AX " \n\t"
11318 "mov %%" _ASM_AX ", %c[cr2](%0) \n\t"
c801949d 11319
0cb5b306
JM
11320 "xor %%eax, %%eax \n\t"
11321 "xor %%ebx, %%ebx \n\t"
11322 "xor %%esi, %%esi \n\t"
11323 "xor %%edi, %%edi \n\t"
b188c81f 11324 "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
83287ea4
AK
11325 ".pushsection .rodata \n\t"
11326 ".global vmx_return \n\t"
11327 "vmx_return: " _ASM_PTR " 2b \n\t"
11328 ".popsection"
773e8a04 11329 : : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp),
d462b819 11330 [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
e08aa78a 11331 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
313dbd49 11332 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
ad312c7c
ZX
11333 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
11334 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
11335 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
11336 [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
11337 [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
11338 [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
11339 [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
05b3e0c2 11340#ifdef CONFIG_X86_64
ad312c7c
ZX
11341 [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
11342 [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
11343 [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
11344 [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
11345 [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
11346 [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
11347 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
11348 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
6aa8b732 11349#endif
40712fae
AK
11350 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
11351 [wordsize]"i"(sizeof(ulong))
c2036300
LV
11352 : "cc", "memory"
11353#ifdef CONFIG_X86_64
773e8a04 11354 , "rax", "rbx", "rdi"
c2036300 11355 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
b188c81f 11356#else
773e8a04 11357 , "eax", "ebx", "edi"
c2036300
LV
11358#endif
11359 );
6aa8b732 11360
d28b387f
KA
11361 /*
11362 * We do not use IBRS in the kernel. If this vCPU has used the
11363 * SPEC_CTRL MSR it may have left it on; save the value and
11364 * turn it off. This is much more efficient than blindly adding
11365 * it to the atomic save/restore list. Especially as the former
11366 * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
11367 *
11368 * For non-nested case:
11369 * If the L01 MSR bitmap does not intercept the MSR, then we need to
11370 * save it.
11371 *
11372 * For nested case:
11373 * If the L02 MSR bitmap does not intercept the MSR, then we need to
11374 * save it.
11375 */
946fbbc1 11376 if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
ecb586bd 11377 vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
d28b387f 11378
ccbcd267 11379 x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
d28b387f 11380
117cc7a9
DW
11381 /* Eliminate branch target predictions from guest mode */
11382 vmexit_fill_RSB();
11383
773e8a04
VK
11384 /* All fields are clean at this point */
11385 if (static_branch_unlikely(&enable_evmcs))
11386 current_evmcs->hv_clean_fields |=
11387 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
11388
2a7921b7 11389 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
74c55931
WL
11390 if (vmx->host_debugctlmsr)
11391 update_debugctlmsr(vmx->host_debugctlmsr);
2a7921b7 11392
aa67f609
AK
11393#ifndef CONFIG_X86_64
11394 /*
11395 * The sysexit path does not restore ds/es, so we must set them to
11396 * a reasonable value ourselves.
11397 *
6d6095bd
SC
11398 * We can't defer this to vmx_prepare_switch_to_host() since that
11399 * function may be executed in interrupt context, which saves and
 11400 * restores segments around it, nullifying its effect.
aa67f609
AK
11401 */
11402 loadsegment(ds, __USER_DS);
11403 loadsegment(es, __USER_DS);
11404#endif
11405
6de4f3ad 11406 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
6de12732 11407 | (1 << VCPU_EXREG_RFLAGS)
aff48baa 11408 | (1 << VCPU_EXREG_PDPTR)
2fb92db1 11409 | (1 << VCPU_EXREG_SEGMENTS)
aff48baa 11410 | (1 << VCPU_EXREG_CR3));
5fdbf976
MT
11411 vcpu->arch.regs_dirty = 0;
11412
1be0e61c
XG
11413 /*
 11414 * eager FPU is enabled if PKU is supported and CR4 is switched
 11415 * back on the host, so it is safe to read the guest PKRU from the
 11416 * current XSAVE area.
11417 */
b9dd21e1
PB
11418 if (static_cpu_has(X86_FEATURE_PKU) &&
11419 kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
11420 vcpu->arch.pkru = __read_pkru();
11421 if (vcpu->arch.pkru != vmx->host_pkru)
1be0e61c 11422 __write_pkru(vmx->host_pkru);
1be0e61c
XG
11423 }
11424
e0b890d3 11425 vmx->nested.nested_run_pending = 0;
b060ca3b
JM
11426 vmx->idt_vectoring_info = 0;
11427
11428 vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
11429 if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
11430 return;
11431
11432 vmx->loaded_vmcs->launched = 1;
11433 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
e0b890d3 11434
51aa01d1
AK
11435 vmx_complete_atomic_exit(vmx);
11436 vmx_recover_nmi_blocking(vmx);
cf393f75 11437 vmx_complete_interrupts(vmx);
6aa8b732 11438}
c207aee4 11439STACK_FRAME_NON_STANDARD(vmx_vcpu_run);
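
/*
 * Editor's note -- illustrative sketch, not part of vmx.c.  It models the
 * SPEC_CTRL handling around the VM entry above: the guest value is written
 * before entry only when it is non-zero, and it is read back after the exit
 * only when the MSR is passed through to the guest (a trapped write would
 * already have updated the cached value), keeping the expensive RDMSR off
 * the common path.  fake_spec_ctrl_msr stands in for the real MSR.
 */
#include <stdbool.h>
#include <stdint.h>

static uint64_t fake_spec_ctrl_msr;	/* stand-in for MSR_IA32_SPEC_CTRL */

struct vcpu_state {
	uint64_t spec_ctrl;		/* last known guest SPEC_CTRL value */
	bool	 msr_passthrough;	/* guest writes are not intercepted */
};

static void spec_ctrl_enter_guest(const struct vcpu_state *v)
{
	if (v->spec_ctrl)		/* skip the WRMSR when the value is zero */
		fake_spec_ctrl_msr = v->spec_ctrl;
}

static void spec_ctrl_exit_guest(struct vcpu_state *v, uint64_t host_val)
{
	/* Only re-read the MSR when the guest could have written it freely. */
	if (v->msr_passthrough)
		v->spec_ctrl = fake_spec_ctrl_msr;
	if (v->spec_ctrl != host_val)
		fake_spec_ctrl_msr = host_val;
}

int main(void)
{
	struct vcpu_state v = { .spec_ctrl = 0, .msr_passthrough = true };

	spec_ctrl_enter_guest(&v);
	fake_spec_ctrl_msr = 1;		/* pretend the guest enabled IBRS */
	spec_ctrl_exit_guest(&v, 0);
	return (v.spec_ctrl == 1 && fake_spec_ctrl_msr == 0) ? 0 : 1;
}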
6aa8b732 11440
434a1e94
SC
11441static struct kvm *vmx_vm_alloc(void)
11442{
d1e5b0e9 11443 struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx));
40bbb9d0 11444 return &kvm_vmx->kvm;
434a1e94
SC
11445}
11446
11447static void vmx_vm_free(struct kvm *kvm)
11448{
d1e5b0e9 11449 vfree(to_kvm_vmx(kvm));
434a1e94
SC
11450}
11451
1279a6b1 11452static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
4fa7734c
PB
11453{
11454 struct vcpu_vmx *vmx = to_vmx(vcpu);
11455 int cpu;
11456
1279a6b1 11457 if (vmx->loaded_vmcs == vmcs)
4fa7734c
PB
11458 return;
11459
11460 cpu = get_cpu();
4fa7734c 11461 vmx_vcpu_put(vcpu);
bd9966de 11462 vmx->loaded_vmcs = vmcs;
4fa7734c 11463 vmx_vcpu_load(vcpu, cpu);
4fa7734c 11464 put_cpu();
b7031fd4
SC
11465
11466 vm_entry_controls_reset_shadow(vmx);
11467 vm_exit_controls_reset_shadow(vmx);
11468 vmx_segment_cache_clear(vmx);
4fa7734c
PB
11469}
11470
2f1fe811
JM
11471/*
11472 * Ensure that the current vmcs of the logical processor is the
11473 * vmcs01 of the vcpu before calling free_nested().
11474 */
11475static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
11476{
14c07ad8
VK
11477 vcpu_load(vcpu);
11478 vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
11479 free_nested(vcpu);
11480 vcpu_put(vcpu);
2f1fe811
JM
11481}
11482
6aa8b732
AK
11483static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
11484{
fb3f0f51
RR
11485 struct vcpu_vmx *vmx = to_vmx(vcpu);
11486
843e4330 11487 if (enable_pml)
a3eaa864 11488 vmx_destroy_pml_buffer(vmx);
991e7a0e 11489 free_vpid(vmx->vpid);
4fa7734c 11490 leave_guest_mode(vcpu);
2f1fe811 11491 vmx_free_vcpu_nested(vcpu);
4fa7734c 11492 free_loaded_vmcs(vmx->loaded_vmcs);
fb3f0f51
RR
11493 kfree(vmx->guest_msrs);
11494 kvm_vcpu_uninit(vcpu);
a4770347 11495 kmem_cache_free(kvm_vcpu_cache, vmx);
6aa8b732
AK
11496}
11497
fb3f0f51 11498static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
6aa8b732 11499{
fb3f0f51 11500 int err;
c16f862d 11501 struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
904e14fb 11502 unsigned long *msr_bitmap;
15ad7146 11503 int cpu;
6aa8b732 11504
a2fa3e9f 11505 if (!vmx)
fb3f0f51
RR
11506 return ERR_PTR(-ENOMEM);
11507
991e7a0e 11508 vmx->vpid = allocate_vpid();
2384d2b3 11509
fb3f0f51
RR
11510 err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
11511 if (err)
11512 goto free_vcpu;
965b58a5 11513
4e59516a
PF
11514 err = -ENOMEM;
11515
11516 /*
11517 * If PML is turned on, failure on enabling PML just results in failure
11518 * of creating the vcpu, therefore we can simplify PML logic (by
11519 * avoiding dealing with cases, such as enabling PML partially on vcpus
 11520 * for the guest, etc.).
11521 */
11522 if (enable_pml) {
11523 vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
11524 if (!vmx->pml_pg)
11525 goto uninit_vcpu;
11526 }
11527
a2fa3e9f 11528 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
03916db9
PB
11529 BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
11530 > PAGE_SIZE);
0123be42 11531
4e59516a
PF
11532 if (!vmx->guest_msrs)
11533 goto free_pml;
965b58a5 11534
f21f165e
PB
11535 err = alloc_loaded_vmcs(&vmx->vmcs01);
11536 if (err < 0)
fb3f0f51 11537 goto free_msrs;
a2fa3e9f 11538
904e14fb
PB
11539 msr_bitmap = vmx->vmcs01.msr_bitmap;
11540 vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
11541 vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
11542 vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
11543 vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
11544 vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
11545 vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
11546 vmx->msr_bitmap_mode = 0;
11547
f21f165e 11548 vmx->loaded_vmcs = &vmx->vmcs01;
15ad7146
AK
11549 cpu = get_cpu();
11550 vmx_vcpu_load(&vmx->vcpu, cpu);
e48672fa 11551 vmx->vcpu.cpu = cpu;
12d79917 11552 vmx_vcpu_setup(vmx);
fb3f0f51 11553 vmx_vcpu_put(&vmx->vcpu);
15ad7146 11554 put_cpu();
35754c98 11555 if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
be6d05cf
JK
11556 err = alloc_apic_access_page(kvm);
11557 if (err)
5e4a0b3c 11558 goto free_vmcs;
a63cb560 11559 }
fb3f0f51 11560
e90008df 11561 if (enable_ept && !enable_unrestricted_guest) {
f51770ed
TC
11562 err = init_rmode_identity_map(kvm);
11563 if (err)
93ea5388 11564 goto free_vmcs;
b927a3ce 11565 }
b7ebfb05 11566
63aff655 11567 if (nested)
6677f3da
PB
11568 nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
11569 kvm_vcpu_apicv_active(&vmx->vcpu));
b9c237bb 11570
705699a1 11571 vmx->nested.posted_intr_nv = -1;
a9d30f33 11572 vmx->nested.current_vmptr = -1ull;
a9d30f33 11573
37e4c997
HZ
11574 vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
11575
31afb2ea
PB
11576 /*
11577 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
11578 * or POSTED_INTR_WAKEUP_VECTOR.
11579 */
11580 vmx->pi_desc.nv = POSTED_INTR_VECTOR;
11581 vmx->pi_desc.sn = 1;
11582
fb3f0f51
RR
11583 return &vmx->vcpu;
11584
11585free_vmcs:
5f3fbc34 11586 free_loaded_vmcs(vmx->loaded_vmcs);
fb3f0f51 11587free_msrs:
fb3f0f51 11588 kfree(vmx->guest_msrs);
4e59516a
PF
11589free_pml:
11590 vmx_destroy_pml_buffer(vmx);
fb3f0f51
RR
11591uninit_vcpu:
11592 kvm_vcpu_uninit(&vmx->vcpu);
11593free_vcpu:
991e7a0e 11594 free_vpid(vmx->vpid);
a4770347 11595 kmem_cache_free(kvm_vcpu_cache, vmx);
fb3f0f51 11596 return ERR_PTR(err);
6aa8b732
AK
11597}
11598
d90a7a0e
JK
11599#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
11600#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
26acfb66 11601
b31c114b
WL
11602static int vmx_vm_init(struct kvm *kvm)
11603{
877ad952
TL
11604 spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
11605
b31c114b
WL
11606 if (!ple_gap)
11607 kvm->arch.pause_in_guest = true;
26acfb66 11608
d90a7a0e
JK
11609 if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
11610 switch (l1tf_mitigation) {
11611 case L1TF_MITIGATION_OFF:
11612 case L1TF_MITIGATION_FLUSH_NOWARN:
11613 /* 'I explicitly don't care' is set */
11614 break;
11615 case L1TF_MITIGATION_FLUSH:
11616 case L1TF_MITIGATION_FLUSH_NOSMT:
11617 case L1TF_MITIGATION_FULL:
11618 /*
11619 * Warn upon starting the first VM in a potentially
11620 * insecure environment.
11621 */
11622 if (cpu_smt_control == CPU_SMT_ENABLED)
11623 pr_warn_once(L1TF_MSG_SMT);
11624 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
11625 pr_warn_once(L1TF_MSG_L1D);
11626 break;
11627 case L1TF_MITIGATION_FULL_FORCE:
11628 /* Flush is enforced */
11629 break;
26acfb66 11630 }
26acfb66 11631 }
b31c114b
WL
11632 return 0;
11633}
11634
002c7f7c
YS
11635static void __init vmx_check_processor_compat(void *rtn)
11636{
11637 struct vmcs_config vmcs_conf;
11638
11639 *(int *)rtn = 0;
11640 if (setup_vmcs_config(&vmcs_conf) < 0)
11641 *(int *)rtn = -EIO;
1389309c 11642 nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, enable_apicv);
002c7f7c
YS
11643 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
11644 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
11645 smp_processor_id());
11646 *(int *)rtn = -EIO;
11647 }
11648}
11649
4b12f0de 11650static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
64d4d521 11651{
b18d5431
XG
11652 u8 cache;
11653 u64 ipat = 0;
4b12f0de 11654
522c68c4 11655 /* For VT-d and EPT combination
606decd6 11656 * 1. MMIO: always map as UC
522c68c4
SY
11657 * 2. EPT with VT-d:
11658 * a. VT-d without snooping control feature: can't guarantee the
606decd6 11659 * result, try to trust guest.
522c68c4
SY
11660 * b. VT-d with snooping control feature: snooping control feature of
11661 * VT-d engine can guarantee the cache correctness. Just set it
11662 * to WB to keep consistent with host. So the same as item 3.
a19a6d11 11663 * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
522c68c4
SY
11664 * consistent with host MTRR
11665 */
606decd6
PB
11666 if (is_mmio) {
11667 cache = MTRR_TYPE_UNCACHABLE;
11668 goto exit;
11669 }
11670
11671 if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
b18d5431
XG
11672 ipat = VMX_EPT_IPAT_BIT;
11673 cache = MTRR_TYPE_WRBACK;
11674 goto exit;
11675 }
11676
11677 if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
11678 ipat = VMX_EPT_IPAT_BIT;
0da029ed 11679 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
fb279950
XG
11680 cache = MTRR_TYPE_WRBACK;
11681 else
11682 cache = MTRR_TYPE_UNCACHABLE;
b18d5431
XG
11683 goto exit;
11684 }
11685
ff53604b 11686 cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
b18d5431
XG
11687
11688exit:
11689 return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
64d4d521
SY
11690}
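
/*
 * Editor's note -- illustrative sketch, not part of vmx.c.  It reproduces
 * the EPT memory-type encoding returned by vmx_get_mt_mask() above: the
 * memory type occupies EPTE bits 5:3 and the "ignore PAT" flag is bit 6.
 * The numeric constants below mirror the kernel's MTRR_TYPE_* and
 * VMX_EPT_* definitions and are restated here as assumptions of the sketch.
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_MTRR_UC		0ull		/* MTRR_TYPE_UNCACHABLE */
#define SKETCH_MTRR_WB		6ull		/* MTRR_TYPE_WRBACK */
#define SKETCH_EPT_MT_SHIFT	3		/* VMX_EPT_MT_EPTE_SHIFT */
#define SKETCH_EPT_IPAT_BIT	(1ull << 6)	/* VMX_EPT_IPAT_BIT */

static uint64_t ept_memtype(uint64_t cache, int ignore_guest_pat)
{
	return (cache << SKETCH_EPT_MT_SHIFT) |
	       (ignore_guest_pat ? SKETCH_EPT_IPAT_BIT : 0);
}

int main(void)
{
	/* MMIO: uncacheable, guest PAT still honoured -> 0x00. */
	printf("mmio    = 0x%llx\n",
	       (unsigned long long)ept_memtype(SKETCH_MTRR_UC, 0));
	/* No non-coherent DMA: write-back, ignore guest PAT -> 0x70. */
	printf("wb+ipat = 0x%llx\n",
	       (unsigned long long)ept_memtype(SKETCH_MTRR_WB, 1));
	return 0;
}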
11691
17cc3935 11692static int vmx_get_lpage_level(void)
344f414f 11693{
878403b7
SY
11694 if (enable_ept && !cpu_has_vmx_ept_1g_page())
11695 return PT_DIRECTORY_LEVEL;
11696 else
 11697 /* Both shadow paging and EPT support 1GB pages */
11698 return PT_PDPE_LEVEL;
344f414f
JR
11699}
11700
feda805f
XG
11701static void vmcs_set_secondary_exec_control(u32 new_ctl)
11702{
11703 /*
11704 * These bits in the secondary execution controls field
11705 * are dynamic, the others are mostly based on the hypervisor
11706 * architecture and the guest's CPUID. Do not touch the
11707 * dynamic bits.
11708 */
11709 u32 mask =
11710 SECONDARY_EXEC_SHADOW_VMCS |
11711 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
0367f205
PB
11712 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
11713 SECONDARY_EXEC_DESC;
feda805f
XG
11714
11715 u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
11716
11717 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
11718 (new_ctl & ~mask) | (cur_ctl & mask));
11719}
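
/*
 * Editor's note -- illustrative sketch, not part of vmx.c.  It demonstrates
 * the read-modify-write pattern used by vmcs_set_secondary_exec_control()
 * above: bits covered by `mask` keep their current value, all other bits are
 * taken from the new value.  The bit positions in main() are made up for the
 * example.
 */
#include <assert.h>
#include <stdint.h>

static uint32_t merge_ctrl(uint32_t cur_ctl, uint32_t new_ctl, uint32_t mask)
{
	return (new_ctl & ~mask) | (cur_ctl & mask);
}

int main(void)
{
	uint32_t dynamic_mask = 0x0000000f;	/* pretend bits 3:0 are dynamic */
	uint32_t cur  = 0x000000a5;
	uint32_t want = 0x0000005a;

	/* Bits 3:0 stay 0x5 (from cur), the rest comes from want (0x50). */
	assert(merge_ctrl(cur, want, dynamic_mask) == 0x00000055);
	return 0;
}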
11720
8322ebbb
DM
11721/*
11722 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
11723 * (indicating "allowed-1") if they are supported in the guest's CPUID.
11724 */
11725static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
11726{
11727 struct vcpu_vmx *vmx = to_vmx(vcpu);
11728 struct kvm_cpuid_entry2 *entry;
11729
6677f3da
PB
11730 vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
11731 vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
8322ebbb
DM
11732
11733#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \
11734 if (entry && (entry->_reg & (_cpuid_mask))) \
6677f3da 11735 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \
8322ebbb
DM
11736} while (0)
11737
11738 entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
11739 cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME));
11740 cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME));
11741 cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC));
11742 cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE));
11743 cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE));
11744 cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE));
11745 cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE));
11746 cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE));
11747 cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR));
11748 cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM));
11749 cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX));
11750 cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX));
11751 cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID));
11752 cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE));
11753
11754 entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
11755 cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE));
11756 cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP));
11757 cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP));
11758 cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU));
c4ad77e0 11759 cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP));
8322ebbb
DM
11760
11761#undef cr4_fixed1_update
11762}
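
/*
 * Editor's note -- illustrative sketch, not part of vmx.c.  It mirrors the
 * idea behind nested_vmx_cr_fixed1_bits_update() above: a CR4 bit is
 * reported to L1 as "allowed-1" only if the matching feature bit is present
 * in the guest's CPUID.  The sketch is simplified to a single CPUID register
 * word; the two table entries use the CR4/CPUID.1:ECX bit positions for
 * PCIDE/PCID and OSXSAVE/XSAVE.
 */
#include <stdint.h>

struct cr4_fixed1_rule {
	uint64_t cr4_bit;	/* CR4 bit to mark as allowed-1 */
	uint64_t cpuid_bit;	/* required guest CPUID feature bit */
};

static uint64_t compute_cr4_fixed1(uint64_t guest_cpuid_ecx,
				   const struct cr4_fixed1_rule *rules,
				   unsigned int n)
{
	uint64_t fixed1 = 0;
	unsigned int i;

	for (i = 0; i < n; i++)
		if (guest_cpuid_ecx & rules[i].cpuid_bit)
			fixed1 |= rules[i].cr4_bit;
	return fixed1;
}

int main(void)
{
	const struct cr4_fixed1_rule rules[] = {
		{ 1ull << 17, 1ull << 17 },	/* CR4.PCIDE   <- CPUID.1:ECX.PCID  */
		{ 1ull << 18, 1ull << 26 },	/* CR4.OSXSAVE <- CPUID.1:ECX.XSAVE */
	};
	/* Guest advertises only XSAVE, so only OSXSAVE becomes allowed-1. */
	return compute_cr4_fixed1(1ull << 26, rules, 2) == (1ull << 18) ? 0 : 1;
}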
11763
5f76f6f5
LA
11764static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
11765{
11766 struct vcpu_vmx *vmx = to_vmx(vcpu);
11767
11768 if (kvm_mpx_supported()) {
11769 bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);
11770
11771 if (mpx_enabled) {
11772 vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
11773 vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
11774 } else {
11775 vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
11776 vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
11777 }
11778 }
11779}
11780
0e851880
SY
11781static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
11782{
4e47c7a6 11783 struct vcpu_vmx *vmx = to_vmx(vcpu);
4e47c7a6 11784
80154d77
PB
11785 if (cpu_has_secondary_exec_ctrls()) {
11786 vmx_compute_secondary_exec_control(vmx);
11787 vmcs_set_secondary_exec_control(vmx->secondary_exec_control);
ad756a16 11788 }
8b3e34e4 11789
37e4c997
HZ
11790 if (nested_vmx_allowed(vcpu))
11791 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
11792 FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
11793 else
11794 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
11795 ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
8322ebbb 11796
5f76f6f5 11797 if (nested_vmx_allowed(vcpu)) {
8322ebbb 11798 nested_vmx_cr_fixed1_bits_update(vcpu);
5f76f6f5
LA
11799 nested_vmx_entry_exit_ctls_update(vcpu);
11800 }
0e851880
SY
11801}
11802
d4330ef2
JR
11803static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
11804{
7b8050f5
NHE
11805 if (func == 1 && nested)
11806 entry->ecx |= bit(X86_FEATURE_VMX);
d4330ef2
JR
11807}
11808
25d92081
YZ
11809static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
11810 struct x86_exception *fault)
11811{
533558bc 11812 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
c5f983f6 11813 struct vcpu_vmx *vmx = to_vmx(vcpu);
533558bc 11814 u32 exit_reason;
c5f983f6 11815 unsigned long exit_qualification = vcpu->arch.exit_qualification;
25d92081 11816
c5f983f6
BD
11817 if (vmx->nested.pml_full) {
11818 exit_reason = EXIT_REASON_PML_FULL;
11819 vmx->nested.pml_full = false;
11820 exit_qualification &= INTR_INFO_UNBLOCK_NMI;
11821 } else if (fault->error_code & PFERR_RSVD_MASK)
533558bc 11822 exit_reason = EXIT_REASON_EPT_MISCONFIG;
25d92081 11823 else
533558bc 11824 exit_reason = EXIT_REASON_EPT_VIOLATION;
c5f983f6
BD
11825
11826 nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
25d92081
YZ
11827 vmcs12->guest_physical_address = fault->address;
11828}
11829
995f00a6
PF
11830static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
11831{
bb97a016 11832 return nested_ept_get_cr3(vcpu) & VMX_EPTP_AD_ENABLE_BIT;
995f00a6
PF
11833}
11834
155a97a3
NHE
11835/* Callbacks for nested_ept_init_mmu_context: */
11836
11837static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
11838{
11839 /* return the page table to be shadowed - in our case, EPT12 */
11840 return get_vmcs12(vcpu)->ept_pointer;
11841}
11842
5b8ba41d 11843static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
155a97a3 11844{
ad896af0 11845 WARN_ON(mmu_is_nested(vcpu));
ae1e2d10 11846
14c07ad8 11847 vcpu->arch.mmu = &vcpu->arch.guest_mmu;
ad896af0 11848 kvm_init_shadow_ept_mmu(vcpu,
6677f3da 11849 to_vmx(vcpu)->nested.msrs.ept_caps &
ae1e2d10 11850 VMX_EPT_EXECUTE_ONLY_BIT,
50c28f21
JS
11851 nested_ept_ad_enabled(vcpu),
11852 nested_ept_get_cr3(vcpu));
44dd3ffa
VK
11853 vcpu->arch.mmu->set_cr3 = vmx_set_cr3;
11854 vcpu->arch.mmu->get_cr3 = nested_ept_get_cr3;
11855 vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
3dc773e7 11856 vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;
155a97a3
NHE
11857
11858 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
155a97a3
NHE
11859}
11860
11861static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
11862{
14c07ad8 11863 vcpu->arch.mmu = &vcpu->arch.root_mmu;
44dd3ffa 11864 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
155a97a3
NHE
11865}
11866
19d5f10b
EK
11867static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
11868 u16 error_code)
11869{
11870 bool inequality, bit;
11871
11872 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
11873 inequality =
11874 (error_code & vmcs12->page_fault_error_code_mask) !=
11875 vmcs12->page_fault_error_code_match;
11876 return inequality ^ bit;
11877}
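
/*
 * Editor's note -- illustrative sketch, not part of vmx.c.  It restates the
 * PFEC_MASK/PFEC_MATCH rule evaluated by nested_vmx_is_page_fault_vmexit()
 * above: when L1 intercepts #PF, the exit is taken only if the masked error
 * code equals the match value; when L1 does not intercept #PF, the exit is
 * taken only if it does not.  The values in main() are arbitrary examples.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool pf_causes_vmexit(bool l1_intercepts_pf, uint32_t pfec_mask,
			     uint32_t pfec_match, uint32_t error_code)
{
	bool mismatch = (error_code & pfec_mask) != pfec_match;

	return mismatch ^ l1_intercepts_pf;
}

int main(void)
{
	/* L1 intercepts #PF but only cares about write faults (bit 1). */
	assert(pf_causes_vmexit(true,  0x2, 0x2, 0x3) == true);	/* write */
	assert(pf_causes_vmexit(true,  0x2, 0x2, 0x1) == false);	/* read  */
	/* The filter inverts when #PF is not in the exception bitmap. */
	assert(pf_causes_vmexit(false, 0x2, 0x2, 0x1) == true);
	return 0;
}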
11878
feaf0c7d
GN
11879static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
11880 struct x86_exception *fault)
11881{
11882 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
11883
11884 WARN_ON(!is_guest_mode(vcpu));
11885
305d0ab4
WL
11886 if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
11887 !to_vmx(vcpu)->nested.nested_run_pending) {
b96fb439
PB
11888 vmcs12->vm_exit_intr_error_code = fault->error_code;
11889 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
11890 PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
11891 INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
11892 fault->address);
7313c698 11893 } else {
feaf0c7d 11894 kvm_inject_page_fault(vcpu, fault);
7313c698 11895 }
feaf0c7d
GN
11896}
11897
c992384b
PB
11898static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
11899 struct vmcs12 *vmcs12);
6beb7bd5 11900
7f7f1ba3 11901static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
a2bcba50 11902{
7f7f1ba3 11903 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
a2bcba50 11904 struct vcpu_vmx *vmx = to_vmx(vcpu);
5e2f30b7 11905 struct page *page;
6beb7bd5 11906 u64 hpa;
a2bcba50
WL
11907
11908 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
a2bcba50
WL
11909 /*
11910 * Translate L1 physical address to host physical
11911 * address for vmcs02. Keep the page pinned, so this
11912 * physical address remains valid. We keep a reference
11913 * to it so we can release it later.
11914 */
5e2f30b7 11915 if (vmx->nested.apic_access_page) { /* shouldn't happen */
53a70daf 11916 kvm_release_page_dirty(vmx->nested.apic_access_page);
5e2f30b7
DH
11917 vmx->nested.apic_access_page = NULL;
11918 }
11919 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
6beb7bd5
JM
11920 /*
11921 * If translation failed, no matter: This feature asks
11922 * to exit when accessing the given address, and if it
11923 * can never be accessed, this feature won't do
11924 * anything anyway.
11925 */
5e2f30b7
DH
11926 if (!is_error_page(page)) {
11927 vmx->nested.apic_access_page = page;
6beb7bd5
JM
11928 hpa = page_to_phys(vmx->nested.apic_access_page);
11929 vmcs_write64(APIC_ACCESS_ADDR, hpa);
11930 } else {
11931 vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
11932 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
11933 }
a2bcba50 11934 }
a7c0b07d
WL
11935
11936 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
5e2f30b7 11937 if (vmx->nested.virtual_apic_page) { /* shouldn't happen */
53a70daf 11938 kvm_release_page_dirty(vmx->nested.virtual_apic_page);
5e2f30b7
DH
11939 vmx->nested.virtual_apic_page = NULL;
11940 }
11941 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr);
a7c0b07d
WL
11942
11943 /*
6beb7bd5
JM
11944 * If translation failed, VM entry will fail because
11945 * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull.
11946 * Failing the vm entry is _not_ what the processor
11947 * does but it's basically the only possibility we
11948 * have. We could still enter the guest if CR8 load
11949 * exits are enabled, CR8 store exits are enabled, and
11950 * virtualize APIC access is disabled; in this case
11951 * the processor would never use the TPR shadow and we
11952 * could simply clear the bit from the execution
11953 * control. But such a configuration is useless, so
11954 * let's keep the code simple.
a7c0b07d 11955 */
5e2f30b7
DH
11956 if (!is_error_page(page)) {
11957 vmx->nested.virtual_apic_page = page;
6beb7bd5
JM
11958 hpa = page_to_phys(vmx->nested.virtual_apic_page);
11959 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
11960 }
a7c0b07d
WL
11961 }
11962
705699a1 11963 if (nested_cpu_has_posted_intr(vmcs12)) {
705699a1
WV
11964 if (vmx->nested.pi_desc_page) { /* shouldn't happen */
11965 kunmap(vmx->nested.pi_desc_page);
53a70daf 11966 kvm_release_page_dirty(vmx->nested.pi_desc_page);
5e2f30b7 11967 vmx->nested.pi_desc_page = NULL;
705699a1 11968 }
5e2f30b7
DH
11969 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr);
11970 if (is_error_page(page))
6beb7bd5 11971 return;
5e2f30b7
DH
11972 vmx->nested.pi_desc_page = page;
11973 vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page);
705699a1
WV
11974 vmx->nested.pi_desc =
11975 (struct pi_desc *)((void *)vmx->nested.pi_desc +
11976 (unsigned long)(vmcs12->posted_intr_desc_addr &
11977 (PAGE_SIZE - 1)));
6beb7bd5
JM
11978 vmcs_write64(POSTED_INTR_DESC_ADDR,
11979 page_to_phys(vmx->nested.pi_desc_page) +
11980 (unsigned long)(vmcs12->posted_intr_desc_addr &
11981 (PAGE_SIZE - 1)));
705699a1 11982 }
d4667ca1 11983 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
3712caeb
KA
11984 vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
11985 CPU_BASED_USE_MSR_BITMAPS);
6beb7bd5
JM
11986 else
11987 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
11988 CPU_BASED_USE_MSR_BITMAPS);
a2bcba50
WL
11989}
11990
f4124500
JK
11991static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
11992{
11993 u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
11994 struct vcpu_vmx *vmx = to_vmx(vcpu);
11995
4c008127
SC
11996 /*
11997 * A timer value of zero is architecturally guaranteed to cause
11998 * a VMExit prior to executing any instructions in the guest.
11999 */
12000 if (preemption_timeout == 0) {
f4124500
JK
12001 vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
12002 return;
12003 }
12004
4c008127
SC
12005 if (vcpu->arch.virtual_tsc_khz == 0)
12006 return;
12007
f4124500
JK
12008 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
12009 preemption_timeout *= 1000000;
12010 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
12011 hrtimer_start(&vmx->nested.preemption_timer,
12012 ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
12013}
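
/*
 * Editor's note -- illustrative sketch, not part of vmx.c.  It performs the
 * same unit conversion as vmx_start_preemption_timer() above: the vmcs12
 * timer value is expressed in units of 2^5 TSC cycles (KVM's emulated rate),
 * and the hrtimer expects nanoseconds, so ns = cycles * 1e6 / tsc_khz.
 */
#include <stdint.h>
#include <stdio.h>

#define EMULATED_TIMER_RATE	5	/* mirrors VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE */

static uint64_t preemption_timer_to_ns(uint64_t timer_value, uint64_t tsc_khz)
{
	uint64_t cycles = timer_value << EMULATED_TIMER_RATE;

	/* cycles / (tsc_khz * 1000) seconds, expressed in nanoseconds. */
	return cycles * 1000000ull / tsc_khz;
}

int main(void)
{
	/* A timer value of 1000 on a 2 GHz TSC: 32000 cycles = 16000 ns. */
	printf("%llu ns\n",
	       (unsigned long long)preemption_timer_to_ns(1000, 2000000));
	return 0;
}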
12014
56a20510
JM
12015static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
12016 struct vmcs12 *vmcs12)
12017{
12018 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
12019 return 0;
12020
12021 if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) ||
12022 !page_address_valid(vcpu, vmcs12->io_bitmap_b))
12023 return -EINVAL;
12024
12025 return 0;
12026}
12027
3af18d9c
WV
12028static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
12029 struct vmcs12 *vmcs12)
12030{
3af18d9c
WV
12031 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
12032 return 0;
12033
5fa99cbe 12034 if (!page_address_valid(vcpu, vmcs12->msr_bitmap))
3af18d9c
WV
12035 return -EINVAL;
12036
12037 return 0;
12038}
12039
712b12d7
JM
12040static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
12041 struct vmcs12 *vmcs12)
12042{
12043 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
12044 return 0;
12045
12046 if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))
12047 return -EINVAL;
12048
12049 return 0;
12050}
12051
3af18d9c
WV
12052/*
 12053 * Merge L0's and L1's MSR bitmaps; return false to indicate that
 12054 * we do not use the hardware bitmap.
12055 */
c992384b
PB
12056static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
12057 struct vmcs12 *vmcs12)
3af18d9c 12058{
82f0dd4b 12059 int msr;
f2b93280 12060 struct page *page;
d048c098 12061 unsigned long *msr_bitmap_l1;
904e14fb 12062 unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
15d45071 12063 /*
d28b387f 12064 * pred_cmd & spec_ctrl are trying to verify two things:
15d45071
AR
12065 *
12066 * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
12067 * ensures that we do not accidentally generate an L02 MSR bitmap
12068 * from the L12 MSR bitmap that is too permissive.
12069 * 2. That L1 or L2s have actually used the MSR. This avoids
 12070 * unnecessary merging of the bitmap if the MSR is unused. This
12071 * works properly because we only update the L01 MSR bitmap lazily.
12072 * So even if L0 should pass L1 these MSRs, the L01 bitmap is only
12073 * updated to reflect this when L1 (or its L2s) actually write to
12074 * the MSR.
12075 */
206587a9
KA
12076 bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
12077 bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL);
f2b93280 12078
c992384b
PB
12079 /* Nothing to do if the MSR bitmap is not in use. */
12080 if (!cpu_has_vmx_msr_bitmap() ||
12081 !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
12082 return false;
12083
15d45071 12084 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
d28b387f 12085 !pred_cmd && !spec_ctrl)
f2b93280
WV
12086 return false;
12087
5e2f30b7
DH
12088 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
12089 if (is_error_page(page))
f2b93280 12090 return false;
f2b93280 12091
c992384b
PB
12092 msr_bitmap_l1 = (unsigned long *)kmap(page);
12093 if (nested_cpu_has_apic_reg_virt(vmcs12)) {
12094 /*
 12095 * L0 need not intercept reads for MSRs between 0x800 and 0x8ff; it
12096 * just lets the processor take the value from the virtual-APIC page;
12097 * take those 256 bits directly from the L1 bitmap.
12098 */
12099 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
12100 unsigned word = msr / BITS_PER_LONG;
12101 msr_bitmap_l0[word] = msr_bitmap_l1[word];
12102 msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0;
12103 }
12104 } else {
12105 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
12106 unsigned word = msr / BITS_PER_LONG;
12107 msr_bitmap_l0[word] = ~0;
12108 msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0;
12109 }
12110 }
d048c098 12111
c992384b
PB
12112 nested_vmx_disable_intercept_for_msr(
12113 msr_bitmap_l1, msr_bitmap_l0,
d7231e75 12114 X2APIC_MSR(APIC_TASKPRI),
c992384b 12115 MSR_TYPE_W);
d048c098 12116
c992384b 12117 if (nested_cpu_has_vid(vmcs12)) {
d048c098 12118 nested_vmx_disable_intercept_for_msr(
c992384b 12119 msr_bitmap_l1, msr_bitmap_l0,
d7231e75 12120 X2APIC_MSR(APIC_EOI),
c992384b
PB
12121 MSR_TYPE_W);
12122 nested_vmx_disable_intercept_for_msr(
12123 msr_bitmap_l1, msr_bitmap_l0,
d7231e75 12124 X2APIC_MSR(APIC_SELF_IPI),
c992384b 12125 MSR_TYPE_W);
82f0dd4b 12126 }
15d45071 12127
d28b387f
KA
12128 if (spec_ctrl)
12129 nested_vmx_disable_intercept_for_msr(
12130 msr_bitmap_l1, msr_bitmap_l0,
12131 MSR_IA32_SPEC_CTRL,
12132 MSR_TYPE_R | MSR_TYPE_W);
12133
15d45071
AR
12134 if (pred_cmd)
12135 nested_vmx_disable_intercept_for_msr(
12136 msr_bitmap_l1, msr_bitmap_l0,
12137 MSR_IA32_PRED_CMD,
12138 MSR_TYPE_W);
12139
f2b93280 12140 kunmap(page);
53a70daf 12141 kvm_release_page_clean(page);
f2b93280
WV
12142
12143 return true;
12144}
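
/*
 * Editor's note -- illustrative sketch, not part of vmx.c.  It spells out
 * the MSR-bitmap layout the merge loop above relies on: for MSRs 0x0-0x1fff
 * the read-intercept bit lives in the first 1 KiB of the 4 KiB bitmap page
 * and the matching write-intercept bit lives at the same bit position
 * 0x800 bytes further in, one bit per MSR.
 */
#include <stdint.h>
#include <stdio.h>

struct msr_bit_pos {
	unsigned int byte;	/* byte offset into the 4 KiB bitmap page */
	unsigned int bit;	/* bit within that byte */
};

static struct msr_bit_pos low_msr_read_bit(uint32_t msr)
{
	struct msr_bit_pos p = { msr / 8, msr % 8 };
	return p;
}

static struct msr_bit_pos low_msr_write_bit(uint32_t msr)
{
	struct msr_bit_pos p = { 0x800 + msr / 8, msr % 8 };
	return p;
}

int main(void)
{
	/* x2APIC TPR register, MSR 0x808. */
	struct msr_bit_pos r = low_msr_read_bit(0x808);
	struct msr_bit_pos w = low_msr_write_bit(0x808);

	printf("read:  byte 0x%x, bit %u\n", r.byte, r.bit);	/* 0x101, 0 */
	printf("write: byte 0x%x, bit %u\n", w.byte, w.bit);	/* 0x901, 0 */
	return 0;
}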
12145
61ada748
LA
12146static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
12147 struct vmcs12 *vmcs12)
12148{
12149 struct vmcs12 *shadow;
12150 struct page *page;
12151
12152 if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
12153 vmcs12->vmcs_link_pointer == -1ull)
12154 return;
12155
12156 shadow = get_shadow_vmcs12(vcpu);
12157 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
12158
12159 memcpy(shadow, kmap(page), VMCS12_SIZE);
12160
12161 kunmap(page);
12162 kvm_release_page_clean(page);
12163}
12164
12165static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
12166 struct vmcs12 *vmcs12)
12167{
12168 struct vcpu_vmx *vmx = to_vmx(vcpu);
12169
12170 if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
12171 vmcs12->vmcs_link_pointer == -1ull)
12172 return;
12173
12174 kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
12175 get_shadow_vmcs12(vcpu), VMCS12_SIZE);
12176}
12177
f0f4cf5b
KS
12178static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
12179 struct vmcs12 *vmcs12)
12180{
12181 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
12182 !page_address_valid(vcpu, vmcs12->apic_access_addr))
12183 return -EINVAL;
12184 else
12185 return 0;
12186}
12187
f2b93280
WV
12188static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
12189 struct vmcs12 *vmcs12)
12190{
82f0dd4b 12191 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
608406e2 12192 !nested_cpu_has_apic_reg_virt(vmcs12) &&
705699a1
WV
12193 !nested_cpu_has_vid(vmcs12) &&
12194 !nested_cpu_has_posted_intr(vmcs12))
f2b93280
WV
12195 return 0;
12196
12197 /*
12198 * If virtualize x2apic mode is enabled,
12199 * virtualize apic access must be disabled.
12200 */
82f0dd4b
WV
12201 if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
12202 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
f2b93280
WV
12203 return -EINVAL;
12204
608406e2
WV
12205 /*
12206 * If virtual interrupt delivery is enabled,
12207 * we must exit on external interrupts.
12208 */
12209 if (nested_cpu_has_vid(vmcs12) &&
12210 !nested_exit_on_intr(vcpu))
12211 return -EINVAL;
12212
705699a1
WV
12213 /*
12214 * bits 15:8 should be zero in posted_intr_nv,
 12215 * the descriptor address has already been checked
12216 * in nested_get_vmcs12_pages.
6de84e58
KS
12217 *
12218 * bits 5:0 of posted_intr_desc_addr should be zero.
705699a1
WV
12219 */
12220 if (nested_cpu_has_posted_intr(vmcs12) &&
12221 (!nested_cpu_has_vid(vmcs12) ||
12222 !nested_exit_intr_ack_set(vcpu) ||
6de84e58
KS
12223 (vmcs12->posted_intr_nv & 0xff00) ||
12224 (vmcs12->posted_intr_desc_addr & 0x3f) ||
22a7cdca 12225 (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu))))
705699a1
WV
12226 return -EINVAL;
12227
f2b93280
WV
12228 /* tpr shadow is needed by all apicv features. */
12229 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
12230 return -EINVAL;
12231
12232 return 0;
3af18d9c
WV
12233}
12234
e9ac033e
EK
12235static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
12236 unsigned long count_field,
92d71bc6 12237 unsigned long addr_field)
ff651cb6 12238{
e2536742 12239 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
92d71bc6 12240 int maxphyaddr;
e9ac033e
EK
12241 u64 count, addr;
12242
e2536742
LA
12243 if (vmcs12_read_any(vmcs12, count_field, &count) ||
12244 vmcs12_read_any(vmcs12, addr_field, &addr)) {
e9ac033e
EK
12245 WARN_ON(1);
12246 return -EINVAL;
12247 }
12248 if (count == 0)
12249 return 0;
92d71bc6 12250 maxphyaddr = cpuid_maxphyaddr(vcpu);
e9ac033e
EK
12251 if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
12252 (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
bbe41b95 12253 pr_debug_ratelimited(
e9ac033e
EK
12254 "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
12255 addr_field, maxphyaddr, count, addr);
12256 return -EINVAL;
12257 }
12258 return 0;
12259}
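
/*
 * Editor's note -- illustrative sketch, not part of vmx.c.  It repeats the
 * validity check performed by nested_vmx_check_msr_switch() above: an MSR
 * load/store area must be 16-byte aligned and must fit entirely below the
 * guest's physical-address width.  The 16-byte entry size matches
 * struct vmx_msr_entry.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define MSR_ENTRY_SIZE	16u

static bool msr_switch_area_valid(uint64_t addr, uint64_t count,
				  unsigned int maxphyaddr)
{
	uint64_t last;

	if (count == 0)
		return true;			/* nothing to validate */
	if (addr & (MSR_ENTRY_SIZE - 1))
		return false;			/* must be 16-byte aligned */
	if (addr >> maxphyaddr)
		return false;			/* first byte out of range */
	last = addr + count * MSR_ENTRY_SIZE - 1;
	return (last >> maxphyaddr) == 0;	/* last byte in range too */
}

int main(void)
{
	assert(msr_switch_area_valid(0x1000, 4, 36));
	assert(!msr_switch_area_valid(0x1008, 4, 36));		/* misaligned */
	assert(!msr_switch_area_valid(0xffffffff0ull, 4, 36));	/* crosses the limit */
	return 0;
}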
12260
12261static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
12262 struct vmcs12 *vmcs12)
12263{
e9ac033e
EK
12264 if (vmcs12->vm_exit_msr_load_count == 0 &&
12265 vmcs12->vm_exit_msr_store_count == 0 &&
12266 vmcs12->vm_entry_msr_load_count == 0)
12267 return 0; /* Fast path */
e9ac033e 12268 if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
92d71bc6 12269 VM_EXIT_MSR_LOAD_ADDR) ||
e9ac033e 12270 nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
92d71bc6 12271 VM_EXIT_MSR_STORE_ADDR) ||
e9ac033e 12272 nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
92d71bc6 12273 VM_ENTRY_MSR_LOAD_ADDR))
e9ac033e
EK
12274 return -EINVAL;
12275 return 0;
12276}
12277
c5f983f6
BD
12278static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
12279 struct vmcs12 *vmcs12)
12280{
55c1dcd8
KS
12281 if (!nested_cpu_has_pml(vmcs12))
12282 return 0;
c5f983f6 12283
55c1dcd8
KS
12284 if (!nested_cpu_has_ept(vmcs12) ||
12285 !page_address_valid(vcpu, vmcs12->pml_address))
12286 return -EINVAL;
c5f983f6
BD
12287
12288 return 0;
12289}
12290
a8a7c02b
LA
12291static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
12292 struct vmcs12 *vmcs12)
12293{
12294 if (!nested_cpu_has_shadow_vmcs(vmcs12))
12295 return 0;
12296
12297 if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) ||
12298 !page_address_valid(vcpu, vmcs12->vmwrite_bitmap))
12299 return -EINVAL;
12300
12301 return 0;
12302}
12303
e9ac033e
EK
12304static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
12305 struct vmx_msr_entry *e)
12306{
12307 /* x2APIC MSR accesses are not allowed */
8a9781f7 12308 if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)
e9ac033e
EK
12309 return -EINVAL;
12310 if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
12311 e->index == MSR_IA32_UCODE_REV)
12312 return -EINVAL;
12313 if (e->reserved != 0)
ff651cb6
WV
12314 return -EINVAL;
12315 return 0;
12316}
12317
e9ac033e
EK
12318static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
12319 struct vmx_msr_entry *e)
ff651cb6
WV
12320{
12321 if (e->index == MSR_FS_BASE ||
12322 e->index == MSR_GS_BASE ||
e9ac033e
EK
12323 e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
12324 nested_vmx_msr_check_common(vcpu, e))
12325 return -EINVAL;
12326 return 0;
12327}
12328
12329static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
12330 struct vmx_msr_entry *e)
12331{
12332 if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
12333 nested_vmx_msr_check_common(vcpu, e))
ff651cb6
WV
12334 return -EINVAL;
12335 return 0;
12336}
12337
12338/*
 12339 * Load the guest's/host's MSRs at nested entry/exit.
 12340 * Return 0 on success, or the 1-based index of the failing entry on failure.
12341 */
12342static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
12343{
12344 u32 i;
12345 struct vmx_msr_entry e;
12346 struct msr_data msr;
12347
12348 msr.host_initiated = false;
12349 for (i = 0; i < count; i++) {
54bf36aa
PB
12350 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
12351 &e, sizeof(e))) {
bbe41b95 12352 pr_debug_ratelimited(
e9ac033e
EK
12353 "%s cannot read MSR entry (%u, 0x%08llx)\n",
12354 __func__, i, gpa + i * sizeof(e));
ff651cb6 12355 goto fail;
e9ac033e
EK
12356 }
12357 if (nested_vmx_load_msr_check(vcpu, &e)) {
bbe41b95 12358 pr_debug_ratelimited(
e9ac033e
EK
12359 "%s check failed (%u, 0x%x, 0x%x)\n",
12360 __func__, i, e.index, e.reserved);
12361 goto fail;
12362 }
ff651cb6
WV
12363 msr.index = e.index;
12364 msr.data = e.value;
e9ac033e 12365 if (kvm_set_msr(vcpu, &msr)) {
bbe41b95 12366 pr_debug_ratelimited(
e9ac033e
EK
12367 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
12368 __func__, i, e.index, e.value);
ff651cb6 12369 goto fail;
e9ac033e 12370 }
ff651cb6
WV
12371 }
12372 return 0;
12373fail:
12374 return i + 1;
12375}
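
/*
 * Editor's note -- illustrative sketch, not part of vmx.c.  It spells out
 * the guest-memory layout walked by nested_vmx_load_msr() and
 * nested_vmx_store_msr() above: the MSR area is a packed array of 16-byte
 * entries, so entry i lives at gpa + i * 16 and its value field sits at
 * offset 8 inside the entry.
 */
#include <stddef.h>
#include <stdint.h>

struct sketch_vmx_msr_entry {	/* same layout as struct vmx_msr_entry */
	uint32_t index;
	uint32_t reserved;	/* must be zero, see the checks above */
	uint64_t value;
};

_Static_assert(sizeof(struct sketch_vmx_msr_entry) == 16, "entry is 16 bytes");
_Static_assert(offsetof(struct sketch_vmx_msr_entry, value) == 8, "value at +8");

static uint64_t msr_entry_gpa(uint64_t area_gpa, uint32_t i)
{
	return area_gpa + (uint64_t)i * sizeof(struct sketch_vmx_msr_entry);
}

int main(void)
{
	/* The third entry of an area at 0x1000 starts at 0x1020. */
	return msr_entry_gpa(0x1000, 2) == 0x1020 ? 0 : 1;
}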
12376
12377static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
12378{
12379 u32 i;
12380 struct vmx_msr_entry e;
12381
12382 for (i = 0; i < count; i++) {
609e36d3 12383 struct msr_data msr_info;
54bf36aa
PB
12384 if (kvm_vcpu_read_guest(vcpu,
12385 gpa + i * sizeof(e),
12386 &e, 2 * sizeof(u32))) {
bbe41b95 12387 pr_debug_ratelimited(
e9ac033e
EK
12388 "%s cannot read MSR entry (%u, 0x%08llx)\n",
12389 __func__, i, gpa + i * sizeof(e));
ff651cb6 12390 return -EINVAL;
e9ac033e
EK
12391 }
12392 if (nested_vmx_store_msr_check(vcpu, &e)) {
bbe41b95 12393 pr_debug_ratelimited(
e9ac033e
EK
12394 "%s check failed (%u, 0x%x, 0x%x)\n",
12395 __func__, i, e.index, e.reserved);
ff651cb6 12396 return -EINVAL;
e9ac033e 12397 }
609e36d3
PB
12398 msr_info.host_initiated = false;
12399 msr_info.index = e.index;
12400 if (kvm_get_msr(vcpu, &msr_info)) {
bbe41b95 12401 pr_debug_ratelimited(
e9ac033e
EK
12402 "%s cannot read MSR (%u, 0x%x)\n",
12403 __func__, i, e.index);
12404 return -EINVAL;
12405 }
54bf36aa
PB
12406 if (kvm_vcpu_write_guest(vcpu,
12407 gpa + i * sizeof(e) +
12408 offsetof(struct vmx_msr_entry, value),
12409 &msr_info.data, sizeof(msr_info.data))) {
bbe41b95 12410 pr_debug_ratelimited(
e9ac033e 12411 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
609e36d3 12412 __func__, i, e.index, msr_info.data);
e9ac033e
EK
12413 return -EINVAL;
12414 }
ff651cb6
WV
12415 }
12416 return 0;
12417}
12418
1dc35dac
LP
12419static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
12420{
12421 unsigned long invalid_mask;
12422
12423 invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
12424 return (val & invalid_mask) == 0;
12425}
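
/*
 * Editor's note -- illustrative sketch, not part of vmx.c.  It demonstrates
 * the reserved-bit test used by nested_cr3_valid() above: any address bit at
 * or above the guest's MAXPHYADDR makes the CR3 value invalid.
 */
#include <assert.h>
#include <stdint.h>

static int cr3_valid(uint64_t cr3, unsigned int maxphyaddr)
{
	uint64_t invalid_mask = ~0ull << maxphyaddr;	/* bits [63:maxphyaddr] */

	return (cr3 & invalid_mask) == 0;
}

int main(void)
{
	/* With a 36-bit physical address space, bit 36 and above are reserved. */
	assert(cr3_valid(0x0000000ffffff000ull, 36));
	assert(!cr3_valid(0x0000001000000000ull, 36));
	return 0;
}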
12426
9ed38ffa
LP
12427/*
12428 * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are
12429 * emulating VM entry into a guest with EPT enabled.
12430 * Returns 0 on success, 1 on failure. Invalid state exit qualification code
12431 * is assigned to entry_failure_code on failure.
12432 */
12433static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
ca0bde28 12434 u32 *entry_failure_code)
9ed38ffa 12435{
9ed38ffa 12436 if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
1dc35dac 12437 if (!nested_cr3_valid(vcpu, cr3)) {
9ed38ffa
LP
12438 *entry_failure_code = ENTRY_FAIL_DEFAULT;
12439 return 1;
12440 }
12441
12442 /*
12443 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
12444 * must not be dereferenced.
12445 */
12446 if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) &&
12447 !nested_ept) {
12448 if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) {
12449 *entry_failure_code = ENTRY_FAIL_PDPTE;
12450 return 1;
12451 }
12452 }
9ed38ffa
LP
12453 }
12454
50c28f21 12455 if (!nested_ept)
ade61e28 12456 kvm_mmu_new_cr3(vcpu, cr3, false);
50c28f21
JS
12457
12458 vcpu->arch.cr3 = cr3;
12459 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
12460
12461 kvm_init_mmu(vcpu, false);
12462
9ed38ffa
LP
12463 return 0;
12464}
12465
efebf0aa
LA
12466/*
 12467 * Returns true if KVM is able to configure the CPU to tag TLB entries
 12468 * populated by L2 differently from TLB entries populated
 12469 * by L1.
12470 *
12471 * If L1 uses EPT, then TLB entries are tagged with different EPTP.
12472 *
12473 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
12474 * with different VPID (L1 entries are tagged with vmx->vpid
12475 * while L2 entries are tagged with vmx->nested.vpid02).
12476 */
12477static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
12478{
12479 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
fe3ef05c 12480
efebf0aa
LA
12481 return nested_cpu_has_ept(vmcs12) ||
12482 (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
12483}
25a2e4fe 12484
3df5c37e
SC
12485static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
12486{
12487 if (vmx->nested.nested_run_pending &&
12488 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
12489 return vmcs12->guest_ia32_efer;
12490 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
12491 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
12492 else
12493 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
12494}
25a2e4fe 12495
09abe320 12496static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
fe3ef05c 12497{
09abe320 12498 /*
9d6105b2 12499 * If vmcs02 hasn't been initialized, set the constant vmcs02 state
09abe320
SC
12500 * according to L0's settings (vmcs12 is irrelevant here). Host
12501 * fields that come from L0 and are not constant, e.g. HOST_CR3,
12502 * will be set as needed prior to VMLAUNCH/VMRESUME.
12503 */
9d6105b2 12504 if (vmx->nested.vmcs02_initialized)
09abe320 12505 return;
9d6105b2 12506 vmx->nested.vmcs02_initialized = true;
25a2e4fe
PB
12507
12508 /*
52017608
SC
 12509 * We don't care what the EPTP value is; we just need to guarantee
12510 * it's valid so we don't get a false positive when doing early
12511 * consistency checks.
25a2e4fe 12512 */
52017608
SC
12513 if (enable_ept && nested_early_check)
12514 vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0));
25a2e4fe
PB
12515
12516 /* All VMFUNCs are currently emulated through L0 vmexits. */
12517 if (cpu_has_vmx_vmfunc())
12518 vmcs_write64(VM_FUNCTION_CONTROL, 0);
12519
09abe320
SC
12520 if (cpu_has_vmx_posted_intr())
12521 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
25a2e4fe 12522
09abe320
SC
12523 if (cpu_has_vmx_msr_bitmap())
12524 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
12525
12526 if (enable_pml)
12527 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
25a2e4fe
PB
12528
12529 /*
09abe320
SC
12530 * Set the MSR load/store lists to match L0's settings. Only the
 12531 * addresses are constant (for vmcs02); the counts can change based
12532 * on L2's behavior, e.g. switching to/from long mode.
25a2e4fe
PB
12533 */
12534 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
33966dd6 12535 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
33966dd6 12536 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
25a2e4fe 12537
09abe320
SC
12538 vmx_set_constant_host_state(vmx);
12539}
25a2e4fe 12540
09abe320
SC
12541static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx,
12542 struct vmcs12 *vmcs12)
12543{
12544 prepare_vmcs02_constant_state(vmx);
12545
12546 vmcs_write64(VMCS_LINK_POINTER, -1ull);
25a2e4fe
PB
12547
12548 if (enable_vpid) {
12549 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
12550 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
12551 else
12552 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
12553 }
8665c3f9
PB
12554}
12555
09abe320 12556static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
8665c3f9 12557{
8665c3f9 12558 u32 exec_control, vmcs12_exec_ctrl;
09abe320 12559 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
8665c3f9 12560
945679e3 12561 if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
09abe320 12562 prepare_vmcs02_early_full(vmx, vmcs12);
9d1887ef 12563
8665c3f9 12564 /*
09abe320
SC
12565 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
12566 * entry, but only if the current (host) sp changed from the value
12567 * we wrote last (vmx->host_rsp). This cache is no longer relevant
12568 * if we switch vmcs, and rather than hold a separate cache per vmcs,
52017608
SC
12569 * here we just force the write to happen on entry. host_rsp will
12570 * also be written unconditionally by nested_vmx_check_vmentry_hw()
12571 * if we are doing early consistency checks via hardware.
8665c3f9 12572 */
09abe320 12573 vmx->host_rsp = 0;
8665c3f9 12574
09abe320
SC
12575 /*
12576 * PIN CONTROLS
12577 */
f4124500 12578 exec_control = vmcs12->pin_based_vm_exec_control;
9314006d 12579
f459a707 12580 /* Preemption timer setting is computed directly in vmx_vcpu_run. */
9314006d 12581 exec_control |= vmcs_config.pin_based_exec_ctrl;
f459a707
SC
12582 exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
12583 vmx->loaded_vmcs->hv_timer_armed = false;
705699a1 12584
9314006d 12585 /* Posted interrupts setting is only taken from vmcs12. */
705699a1 12586 if (nested_cpu_has_posted_intr(vmcs12)) {
705699a1
WV
12587 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
12588 vmx->nested.pi_pending = false;
6beb7bd5 12589 } else {
705699a1 12590 exec_control &= ~PIN_BASED_POSTED_INTR;
6beb7bd5 12591 }
f4124500 12592 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
fe3ef05c 12593
09abe320
SC
12594 /*
12595 * EXEC CONTROLS
12596 */
12597 exec_control = vmx_exec_control(vmx); /* L0's desires */
12598 exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
12599 exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
12600 exec_control &= ~CPU_BASED_TPR_SHADOW;
12601 exec_control |= vmcs12->cpu_based_vm_exec_control;
12602
12603 /*
12604 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
12605 * nested_get_vmcs12_pages can't fix it up, the illegal value
12606 * will result in a VM entry failure.
12607 */
12608 if (exec_control & CPU_BASED_TPR_SHADOW) {
12609 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
12610 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
12611 } else {
12612#ifdef CONFIG_X86_64
12613 exec_control |= CPU_BASED_CR8_LOAD_EXITING |
12614 CPU_BASED_CR8_STORE_EXITING;
12615#endif
12616 }
12617
12618 /*
12619 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
12620 * for I/O port accesses.
12621 */
12622 exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
12623 exec_control |= CPU_BASED_UNCOND_IO_EXITING;
12624 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
0238ea91 12625
09abe320
SC
12626 /*
12627 * SECONDARY EXEC CONTROLS
12628 */
fe3ef05c 12629 if (cpu_has_secondary_exec_ctrls()) {
80154d77 12630 exec_control = vmx->secondary_exec_control;
e2821620 12631
fe3ef05c 12632 /* Take the following fields only from vmcs12 */
696dfd95 12633 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
90a2db6d 12634 SECONDARY_EXEC_ENABLE_INVPCID |
b3a2a907 12635 SECONDARY_EXEC_RDTSCP |
3db13480 12636 SECONDARY_EXEC_XSAVES |
696dfd95 12637 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
27c42a1b
BD
12638 SECONDARY_EXEC_APIC_REGISTER_VIRT |
12639 SECONDARY_EXEC_ENABLE_VMFUNC);
fe3ef05c 12640 if (nested_cpu_has(vmcs12,
03efce6f
BD
12641 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
12642 vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
12643 ~SECONDARY_EXEC_ENABLE_PML;
12644 exec_control |= vmcs12_exec_ctrl;
12645 }
fe3ef05c 12646
32c7acf0
LA
12647 /* VMCS shadowing for L2 is emulated for now */
12648 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
12649
25a2e4fe 12650 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
608406e2
WV
12651 vmcs_write16(GUEST_INTR_STATUS,
12652 vmcs12->guest_intr_status);
608406e2 12653
6beb7bd5
JM
12654 /*
12655 * Write an illegal value to APIC_ACCESS_ADDR. Later,
12656 * nested_get_vmcs12_pages will either fix it up or
12657 * remove the VM execution control.
12658 */
12659 if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
12660 vmcs_write64(APIC_ACCESS_ADDR, -1ull);
12661
0b665d30
SC
12662 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
12663 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
12664
fe3ef05c
NHE
12665 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
12666 }
12667
fe3ef05c 12668 /*
09abe320
SC
12669 * ENTRY CONTROLS
12670 *
12671 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
12672 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
12673 * on the related bits (if supported by the CPU) in the hope that
12674 * we can avoid VMWrites during vmx_set_efer().
12675 */
12676 exec_control = (vmcs12->vm_entry_controls | vmcs_config.vmentry_ctrl) &
12677 ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
12678 if (cpu_has_load_ia32_efer) {
12679 if (guest_efer & EFER_LMA)
12680 exec_control |= VM_ENTRY_IA32E_MODE;
12681 if (guest_efer != host_efer)
12682 exec_control |= VM_ENTRY_LOAD_IA32_EFER;
12683 }
12684 vm_entry_controls_init(vmx, exec_control);
12685
12686 /*
12687 * EXIT CONTROLS
12688 *
12689 * L2->L1 exit controls are emulated - the hardware exit is to L0 so
12690 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
12691 * bits may be modified by vmx_set_efer() in prepare_vmcs02().
fe3ef05c 12692 */
09abe320
SC
12693 exec_control = vmcs_config.vmexit_ctrl;
12694 if (cpu_has_load_ia32_efer && guest_efer != host_efer)
12695 exec_control |= VM_EXIT_LOAD_IA32_EFER;
12696 vm_exit_controls_init(vmx, exec_control);
fe3ef05c 12697
09abe320
SC
12698 /*
12699 * Conceptually we want to copy the PML address and index from
12700 * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
12701 * since we always flush the log on each vmexit and never change
12702 * the PML address (once set), this happens to be equivalent to
12703 * simply resetting the index in vmcs02.
12704 */
12705 if (enable_pml)
12706 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
a7c0b07d 12707
6beb7bd5 12708 /*
09abe320 12709 * Interrupt/Exception Fields
6beb7bd5 12710 */
09abe320
SC
12711 if (vmx->nested.nested_run_pending) {
12712 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
12713 vmcs12->vm_entry_intr_info_field);
12714 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
12715 vmcs12->vm_entry_exception_error_code);
12716 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
12717 vmcs12->vm_entry_instruction_len);
12718 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
12719 vmcs12->guest_interruptibility_info);
12720 vmx->loaded_vmcs->nmi_known_unmasked =
12721 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
51aa68e7 12722 } else {
09abe320
SC
12723 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
12724 }
12725}
fe3ef05c 12726
09abe320
SC
12727static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
12728{
c4ebd629
VK
12729 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
12730
12731 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
12732 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
12733 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
cbe3f898 12734 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
c4ebd629
VK
12735 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
12736 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
12737 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
12738 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
12739 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
12740 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
12741 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
cbe3f898 12742 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
c4ebd629
VK
12743 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
12744 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
12745 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
12746 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
12747 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
12748 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
12749 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
12750 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
12751 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
c4ebd629
VK
12752 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
12753 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
12754 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
12755 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
12756 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
cbe3f898
VK
12757 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
12758 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
c4ebd629
VK
12759 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
12760 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
12761 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
12762 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
12763 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
12764 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
12765 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
12766 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
12767 }
12768
12769 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
12770 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
12771 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
12772 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
12773 vmcs12->guest_pending_dbg_exceptions);
12774 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
12775 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
12776
12777 /*
12778		 * L1 may access L2's PDPTRs, so save them to construct
12779		 * vmcs12.
12780 */
12781 if (enable_ept) {
12782 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
12783 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
12784 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
12785 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
12786 }
a7c0b07d
WL
12787 }
12788
25a2e4fe
PB
12789 if (nested_cpu_has_xsaves(vmcs12))
12790 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
25a2e4fe 12791
fe3ef05c 12792 /*
25a2e4fe
PB
12793 * Whether page-faults are trapped is determined by a combination of
12794 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
12795 * If enable_ept, L0 doesn't care about page faults and we should
12796 * set all of these to L1's desires. However, if !enable_ept, L0 does
12797 * care about (at least some) page faults, and because it is not easy
12798 * (if at all possible?) to merge L0 and L1's desires, we simply ask
12799 * to exit on each and every L2 page fault. This is done by setting
12800 * MASK=MATCH=0 and (see below) EB.PF=1.
12801 * Note that below we don't need special code to set EB.PF beyond the
12802 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
12803 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
12804 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
fe3ef05c 12805 */
25a2e4fe
PB
12806 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
12807 enable_ept ? vmcs12->page_fault_error_code_mask : 0);
12808 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
12809 enable_ept ? vmcs12->page_fault_error_code_match : 0);
fe3ef05c 12810
25a2e4fe
PB
12811 if (cpu_has_vmx_apicv()) {
12812 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
12813 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
12814 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
12815 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
12816 }
12817
33966dd6 12818 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
33966dd6 12819 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
25a2e4fe
PB
12820
12821 set_cr4_guest_host_mask(vmx);
12822
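	/*
	 * GUEST_BNDCFGS comes from vmcs12 only when this entry explicitly
	 * asks to load BNDCFGS; otherwise restore the L1 value that was
	 * saved into vmcs01_guest_bndcfgs at vmentry.
	 */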
62cf9bd8
LA
12823 if (kvm_mpx_supported()) {
12824 if (vmx->nested.nested_run_pending &&
12825 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
12826 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
12827 else
12828 vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
12829 }
8665c3f9
PB
12830}
12831
12832/*
12833 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
12834 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
12835 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
12836 * guest in a way that will both be appropriate to L1's requests, and our
12837 * needs. In addition to modifying the active vmcs (which is vmcs02), this
12838 * function also has additional necessary side-effects, like setting various
12839 * vcpu->arch fields.
12840 * Returns 0 on success, 1 on failure. Invalid state exit qualification code
12841 * is assigned to entry_failure_code on failure.
12842 */
12843static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
6514dc38 12844 u32 *entry_failure_code)
8665c3f9
PB
12845{
12846 struct vcpu_vmx *vmx = to_vmx(vcpu);
c4ebd629 12847 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
8665c3f9 12848
945679e3 12849 if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) {
09abe320 12850 prepare_vmcs02_full(vmx, vmcs12);
9d1887ef
SC
12851 vmx->nested.dirty_vmcs12 = false;
12852 }
12853
8665c3f9
PB
12854 /*
12855 * First, the fields that are shadowed. This must be kept in sync
12856 * with vmx_shadow_fields.h.
12857 */
c4ebd629
VK
12858 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
12859 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
c4ebd629 12860 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
cbe3f898 12861 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
c4ebd629 12862 }
8665c3f9 12863
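	/*
	 * DR7 and DEBUGCTL are taken from vmcs12 only on a real
	 * VMLAUNCH/VMRESUME that loads debug controls; otherwise keep the
	 * values L1 was running with (vcpu->arch.dr7 and vmcs01_debugctl).
	 */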
6514dc38 12864 if (vmx->nested.nested_run_pending &&
cf8b84f4 12865 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
2996fca0
JK
12866 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
12867 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
12868 } else {
12869 kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
12870 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
12871 }
63fbf59f 12872 vmx_set_rflags(vcpu, vmcs12->guest_rflags);
fe3ef05c 12873
f4124500
JK
12874 vmx->nested.preemption_timer_expired = false;
12875 if (nested_cpu_has_preemption_timer(vmcs12))
12876 vmx_start_preemption_timer(vcpu);
fe3ef05c
NHE
12877
12878 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
12879 * bitwise-or of what L1 wants to trap for L2, and what we want to
12880 * trap. Note that CR0.TS also needs updating - we do this later.
12881 */
12882 update_exception_bitmap(vcpu);
12883 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
12884 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
12885
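	/*
	 * Similarly, GUEST_IA32_PAT is taken from vmcs12 only when the entry
	 * loads it; otherwise, if vmcs01 loads PAT on entry, keep L1's PAT.
	 */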
6514dc38 12886 if (vmx->nested.nested_run_pending &&
cf8b84f4 12887 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
fe3ef05c 12888 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
44811c02 12889 vcpu->arch.pat = vmcs12->guest_ia32_pat;
cf8b84f4 12890 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
fe3ef05c 12891 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
cf8b84f4 12892 }
fe3ef05c 12893
e79f245d
KA
12894 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
12895
c95ba92a
PF
12896 if (kvm_has_tsc_control)
12897 decache_tsc_multiplier(vmx);
fe3ef05c
NHE
12898
12899 if (enable_vpid) {
12900 /*
5c614b35
WL
12901		 * There is no direct mapping between vpid02 and vpid12; the
12902		 * vpid02 is per-vCPU for L0 and is reused, while the value of
12903		 * vpid12 is changed with one INVVPID during nested vmentry.
12904		 * The vpid12 is allocated by L1 for L2, so it will not
12905		 * influence the global bitmap (for vpid01 and vpid02 allocation)
12906		 * even if L1 spawns a lot of nested vCPUs.
fe3ef05c 12907 */
efebf0aa 12908 if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
5c614b35
WL
12909 if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
12910 vmx->nested.last_vpid = vmcs12->virtual_processor_id;
efebf0aa 12911 __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false);
5c614b35
WL
12912 }
12913 } else {
1438921c
LA
12914 /*
12915				 * If L1 uses EPT, then L0 needs to execute INVEPT on
12916 * EPTP02 instead of EPTP01. Therefore, delay TLB
12917 * flush until vmcs02->eptp is fully updated by
12918 * KVM_REQ_LOAD_CR3. Note that this assumes
12919 * KVM_REQ_TLB_FLUSH is evaluated after
12920 * KVM_REQ_LOAD_CR3 in vcpu_enter_guest().
12921 */
12922 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
5c614b35 12923 }
fe3ef05c
NHE
12924 }
12925
5b8ba41d
SC
12926 if (nested_cpu_has_ept(vmcs12))
12927 nested_ept_init_mmu_context(vcpu);
12928 else if (nested_cpu_has2(vmcs12,
12929 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
a468f2db 12930 vmx_flush_tlb(vcpu, true);
155a97a3 12931
fe3ef05c 12932 /*
bd7e5b08
PB
12933 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
12934 * bits which we consider mandatory enabled.
fe3ef05c
NHE
12935 * The CR0_READ_SHADOW is what L2 should have expected to read given
12936	 * the specifications by L1; it's not enough to take
12937	 * vmcs12->cr0_read_shadow because in our cr0_guest_host_mask we
12938 * have more bits than L1 expected.
12939 */
12940 vmx_set_cr0(vcpu, vmcs12->guest_cr0);
12941 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
12942
12943 vmx_set_cr4(vcpu, vmcs12->guest_cr4);
12944 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
12945
09abe320 12946 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
3df5c37e 12947 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
5a6a9748
DM
12948 vmx_set_efer(vcpu, vcpu->arch.efer);
12949
2bb8cafe
SC
12950 /*
12951 * Guest state is invalid and unrestricted guest is disabled,
12952 * which means L1 attempted VMEntry to L2 with invalid state.
12953 * Fail the VMEntry.
12954 */
3184a995
PB
12955 if (vmx->emulation_required) {
12956 *entry_failure_code = ENTRY_FAIL_DEFAULT;
2bb8cafe 12957 return 1;
3184a995 12958 }
2bb8cafe 12959
9ed38ffa 12960	/* Load the guest CR3, whether we are using EPT or shadow page tables. */
7ad658b6 12961 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
9ed38ffa
LP
12962 entry_failure_code))
12963 return 1;
7ca29de2 12964
feaf0c7d
GN
12965 if (!enable_ept)
12966 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
12967
fe3ef05c
NHE
12968 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
12969 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
ee146c1c 12970 return 0;
fe3ef05c
NHE
12971}
12972
0c7f650e
KS
12973static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
12974{
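	/*
	 * Per the SDM, "virtual NMIs" requires "NMI exiting", and
	 * "NMI-window exiting" (VIRTUAL_NMI_PENDING) requires "virtual NMIs".
	 */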
12975 if (!nested_cpu_has_nmi_exiting(vmcs12) &&
12976 nested_cpu_has_virtual_nmis(vmcs12))
12977 return -EINVAL;
12978
12979 if (!nested_cpu_has_virtual_nmis(vmcs12) &&
12980 nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING))
12981 return -EINVAL;
12982
12983 return 0;
12984}
12985
ca0bde28 12986static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
cd232ad0 12987{
cd232ad0 12988 struct vcpu_vmx *vmx = to_vmx(vcpu);
64a919f7 12989 bool ia32e;
7c177938 12990
6dfacadd 12991 if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
ca0bde28
JM
12992 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
12993 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
26539bd0 12994
ba8e23db
KS
12995 if (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)
12996 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12997
56a20510
JM
12998 if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12))
12999 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13000
ca0bde28
JM
13001 if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
13002 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
7c177938 13003
f0f4cf5b
KS
13004 if (nested_vmx_check_apic_access_controls(vcpu, vmcs12))
13005 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13006
712b12d7
JM
13007 if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12))
13008 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13009
ca0bde28
JM
13010 if (nested_vmx_check_apicv_controls(vcpu, vmcs12))
13011 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
f2b93280 13012
ca0bde28
JM
13013 if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12))
13014 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
e9ac033e 13015
c5f983f6
BD
13016 if (nested_vmx_check_pml_controls(vcpu, vmcs12))
13017 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13018
a8a7c02b
LA
13019 if (nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12))
13020 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13021
7c177938 13022 if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
6677f3da
PB
13023 vmx->nested.msrs.procbased_ctls_low,
13024 vmx->nested.msrs.procbased_ctls_high) ||
2e5b0bd9
JM
13025 (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
13026 !vmx_control_verify(vmcs12->secondary_vm_exec_control,
6677f3da
PB
13027 vmx->nested.msrs.secondary_ctls_low,
13028 vmx->nested.msrs.secondary_ctls_high)) ||
7c177938 13029 !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
6677f3da
PB
13030 vmx->nested.msrs.pinbased_ctls_low,
13031 vmx->nested.msrs.pinbased_ctls_high) ||
7c177938 13032 !vmx_control_verify(vmcs12->vm_exit_controls,
6677f3da
PB
13033 vmx->nested.msrs.exit_ctls_low,
13034 vmx->nested.msrs.exit_ctls_high) ||
7c177938 13035 !vmx_control_verify(vmcs12->vm_entry_controls,
6677f3da
PB
13036 vmx->nested.msrs.entry_ctls_low,
13037 vmx->nested.msrs.entry_ctls_high))
ca0bde28 13038 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
7c177938 13039
0c7f650e 13040 if (nested_vmx_check_nmi_controls(vmcs12))
ca0bde28 13041 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
7c177938 13042
41ab9372
BD
13043 if (nested_cpu_has_vmfunc(vmcs12)) {
13044 if (vmcs12->vm_function_control &
6677f3da 13045 ~vmx->nested.msrs.vmfunc_controls)
41ab9372
BD
13046 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13047
13048 if (nested_cpu_has_eptp_switching(vmcs12)) {
13049 if (!nested_cpu_has_ept(vmcs12) ||
13050 !page_address_valid(vcpu, vmcs12->eptp_list_address))
13051 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13052 }
13053 }
27c42a1b 13054
c7c2c709
JM
13055 if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu))
13056 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13057
3899152c 13058 if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) ||
1dc35dac 13059 !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
ca0bde28
JM
13060 !nested_cr3_valid(vcpu, vmcs12->host_cr3))
13061 return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
13062
64a919f7
SC
13063 /*
13064 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
13065 * IA32_EFER MSR must be 0 in the field for that register. In addition,
13066 * the values of the LMA and LME bits in the field must each be that of
13067 * the host address-space size VM-exit control.
13068 */
13069 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
13070 ia32e = (vmcs12->vm_exit_controls &
13071 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
13072 if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
13073 ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
13074 ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
13075 return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
13076 }
13077
0447378a
MO
13078 /*
13079 * From the Intel SDM, volume 3:
13080 * Fields relevant to VM-entry event injection must be set properly.
13081 * These fields are the VM-entry interruption-information field, the
13082 * VM-entry exception error code, and the VM-entry instruction length.
13083 */
13084 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
13085 u32 intr_info = vmcs12->vm_entry_intr_info_field;
13086 u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
13087 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
13088 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
13089 bool should_have_error_code;
13090 bool urg = nested_cpu_has2(vmcs12,
13091 SECONDARY_EXEC_UNRESTRICTED_GUEST);
13092 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
13093
13094 /* VM-entry interruption-info field: interruption type */
13095 if (intr_type == INTR_TYPE_RESERVED ||
13096 (intr_type == INTR_TYPE_OTHER_EVENT &&
13097 !nested_cpu_supports_monitor_trap_flag(vcpu)))
13098 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13099
13100 /* VM-entry interruption-info field: vector */
13101 if ((intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
13102 (intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
13103 (intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
13104 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13105
13106 /* VM-entry interruption-info field: deliver error code */
13107 should_have_error_code =
13108 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
13109 x86_exception_has_error_code(vector);
13110 if (has_error_code != should_have_error_code)
13111 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13112
13113 /* VM-entry exception error code */
13114 if (has_error_code &&
13115 vmcs12->vm_entry_exception_error_code & GENMASK(31, 15))
13116 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13117
13118 /* VM-entry interruption-info field: reserved bits */
13119 if (intr_info & INTR_INFO_RESVD_BITS_MASK)
13120 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13121
13122 /* VM-entry instruction length */
13123 switch (intr_type) {
13124 case INTR_TYPE_SOFT_EXCEPTION:
13125 case INTR_TYPE_SOFT_INTR:
13126 case INTR_TYPE_PRIV_SW_EXCEPTION:
13127 if ((vmcs12->vm_entry_instruction_len > 15) ||
13128 (vmcs12->vm_entry_instruction_len == 0 &&
13129 !nested_cpu_has_zero_length_injection(vcpu)))
13130 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13131 }
13132 }
13133
5b8ba41d
SC
13134 if (nested_cpu_has_ept(vmcs12) &&
13135 !valid_ept_address(vcpu, vmcs12->ept_pointer))
13136 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13137
ca0bde28
JM
13138 return 0;
13139}
13140
f145d90d
LA
13141static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
13142 struct vmcs12 *vmcs12)
13143{
13144 int r;
13145 struct page *page;
13146 struct vmcs12 *shadow;
13147
13148 if (vmcs12->vmcs_link_pointer == -1ull)
13149 return 0;
13150
13151 if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))
13152 return -EINVAL;
13153
13154 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
13155 if (is_error_page(page))
13156 return -EINVAL;
13157
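	/*
	 * The linked VMCS must carry the vmcs12 revision id, and it must be
	 * a shadow VMCS if and only if vmcs12 enables VMCS shadowing.
	 */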
13158 r = 0;
13159 shadow = kmap(page);
13160 if (shadow->hdr.revision_id != VMCS12_REVISION ||
13161 shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
13162 r = -EINVAL;
13163 kunmap(page);
13164 kvm_release_page_clean(page);
13165 return r;
13166}
13167
ca0bde28
JM
13168static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
13169 u32 *exit_qual)
13170{
13171 bool ia32e;
13172
13173 *exit_qual = ENTRY_FAIL_DEFAULT;
7c177938 13174
3899152c 13175 if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
ca0bde28 13176 !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
b428018a 13177 return 1;
ca0bde28 13178
f145d90d 13179 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
ca0bde28 13180 *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
b428018a 13181 return 1;
7c177938
NHE
13182 }
13183
384bb783 13184 /*
cb0c8cda 13185 * If the load IA32_EFER VM-entry control is 1, the following checks
384bb783
JK
13186 * are performed on the field for the IA32_EFER MSR:
13187 * - Bits reserved in the IA32_EFER MSR must be 0.
13188 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
13189 * the IA-32e mode guest VM-exit control. It must also be identical
13190 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
13191 * CR0.PG) is 1.
13192 */
ca0bde28
JM
13193 if (to_vmx(vcpu)->nested.nested_run_pending &&
13194 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
384bb783
JK
13195 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
13196 if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
13197 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
13198 ((vmcs12->guest_cr0 & X86_CR0_PG) &&
ca0bde28 13199 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
b428018a 13200 return 1;
384bb783
JK
13201 }
13202
f1b026a3
WL
13203 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
13204 (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
13205 (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
13206 return 1;
13207
ca0bde28
JM
13208 return 0;
13209}
13210
52017608
SC
13211static int __noclone nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
13212{
13213 struct vcpu_vmx *vmx = to_vmx(vcpu);
13214 unsigned long cr3, cr4;
13215
13216 if (!nested_early_check)
13217 return 0;
13218
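	/*
	 * Temporarily zero the MSR autoload counts so the early VMEnter
	 * below doesn't load or save any MSRs; the counts are restored
	 * once the consistency check completes.
	 */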
13219 if (vmx->msr_autoload.host.nr)
13220 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
13221 if (vmx->msr_autoload.guest.nr)
13222 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
13223
13224 preempt_disable();
13225
13226 vmx_prepare_switch_to_guest(vcpu);
13227
13228 /*
13229 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
13230 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to
13231	 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
13232 * there is no need to preserve other bits or save/restore the field.
13233 */
13234 vmcs_writel(GUEST_RFLAGS, 0);
13235
13236 vmcs_writel(HOST_RIP, vmx_early_consistency_check_return);
13237
13238 cr3 = __get_current_cr3_fast();
13239 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
13240 vmcs_writel(HOST_CR3, cr3);
13241 vmx->loaded_vmcs->host_state.cr3 = cr3;
13242 }
13243
13244 cr4 = cr4_read_shadow();
13245 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
13246 vmcs_writel(HOST_CR4, cr4);
13247 vmx->loaded_vmcs->host_state.cr4 = cr4;
13248 }
13249
13250 vmx->__launched = vmx->loaded_vmcs->launched;
13251
13252 asm(
13253 /* Set HOST_RSP */
4b1e5478 13254 __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
52017608
SC
13255 "mov %%" _ASM_SP ", %c[host_rsp](%0)\n\t"
13256
13257		/* Check if vmlaunch or vmresume is needed */
13258 "cmpl $0, %c[launched](%0)\n\t"
13259 "je 1f\n\t"
4b1e5478 13260 __ex("vmresume") "\n\t"
52017608 13261 "jmp 2f\n\t"
4b1e5478 13262 "1: " __ex("vmlaunch") "\n\t"
52017608
SC
13263 "jmp 2f\n\t"
13264 "2: "
13265
13266 /* Set vmx->fail accordingly */
13267 "setbe %c[fail](%0)\n\t"
f145d90d 13268
52017608
SC
13269 ".pushsection .rodata\n\t"
13270 ".global vmx_early_consistency_check_return\n\t"
13271 "vmx_early_consistency_check_return: " _ASM_PTR " 2b\n\t"
13272 ".popsection"
13273 :
13274 : "c"(vmx), "d"((unsigned long)HOST_RSP),
13275 [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
13276 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
13277 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp))
13278 : "rax", "cc", "memory"
13279 );
ca0bde28 13280
52017608 13281 vmcs_writel(HOST_RIP, vmx_return);
7c177938 13282
52017608 13283 preempt_enable();
ca0bde28 13284
52017608
SC
13285 if (vmx->msr_autoload.host.nr)
13286 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
13287 if (vmx->msr_autoload.guest.nr)
13288 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
13289
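	/*
	 * vmx->fail means VMLAUNCH/VMRESUME of vmcs02 itself failed
	 * (VMFail); the only expected cause at this point is an invalid
	 * control field, which the WARN below asserts.
	 */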
13290 if (vmx->fail) {
13291 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
13292 VMXERR_ENTRY_INVALID_CONTROL_FIELD);
13293 vmx->fail = 0;
b428018a 13294 return 1;
7c177938
NHE
13295 }
13296
384bb783 13297 /*
52017608 13298 * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
384bb783 13299 */
52017608
SC
13300 local_irq_enable();
13301 if (hw_breakpoint_active())
13302 set_debugreg(__this_cpu_read(cpu_dr7), 7);
384bb783
JK
13303
13304 /*
52017608
SC
13305 * A non-failing VMEntry means we somehow entered guest mode with
13306 * an illegal RIP, and that's just the tip of the iceberg. There
13307 * is no telling what memory has been modified or what state has
13308 * been exposed to unknown code. Hitting this all but guarantees
13309 * a (very critical) hardware issue.
384bb783 13310 */
52017608
SC
13311 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
13312 VMX_EXIT_REASONS_FAILED_VMENTRY));
f1b026a3 13313
ca0bde28
JM
13314 return 0;
13315}
52017608
SC
13316STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw);
13317
a633e41e
SC
13318static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
13319 struct vmcs12 *vmcs12);
ca0bde28 13320
7f7f1ba3 13321/*
a633e41e 13322 * If from_vmentry is false, this is being called from state restore (either RSM
8fcc4b59 13323 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
52017608
SC
13324 *
13325 *	Returns:
13326 *	  0 - success, i.e. proceed with actual VMEnter
13327 *	  1 - consistency check VMExit
13328 *	 -1 - consistency check VMFail
7f7f1ba3 13329 */
a633e41e
SC
13330static int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
13331 bool from_vmentry)
858e25c0
JM
13332{
13333 struct vcpu_vmx *vmx = to_vmx(vcpu);
13334 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7e712684 13335 bool evaluate_pending_interrupts;
a633e41e
SC
13336 u32 exit_reason = EXIT_REASON_INVALID_STATE;
13337 u32 exit_qual;
858e25c0 13338
7e712684
PB
13339 evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
13340 (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
13341 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
13342 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
b5861e5c 13343
858e25c0
JM
13344 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
13345 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
62cf9bd8
LA
13346 if (kvm_mpx_supported() &&
13347 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
13348 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
858e25c0 13349
de3a0021 13350 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
858e25c0 13351
16fb9a46
SC
13352 prepare_vmcs02_early(vmx, vmcs12);
13353
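	/*
	 * Mapping the vmcs12 pages and the early hardware consistency check
	 * only make sense on a real VMLAUNCH/VMRESUME; on state restore the
	 * pages are requested later via KVM_REQ_GET_VMCS12_PAGES (see below).
	 */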
13354 if (from_vmentry) {
13355 nested_get_vmcs12_pages(vcpu);
13356
52017608
SC
13357 if (nested_vmx_check_vmentry_hw(vcpu)) {
13358 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
13359 return -1;
13360 }
13361
16fb9a46
SC
13362 if (check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
13363 goto vmentry_fail_vmexit;
13364 }
13365
13366 enter_guest_mode(vcpu);
e79f245d
KA
13367 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
13368 vcpu->arch.tsc_offset += vmcs12->tsc_offset;
13369
a633e41e 13370 if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
39f9c388 13371 goto vmentry_fail_vmexit_guest_mode;
858e25c0 13372
7f7f1ba3 13373 if (from_vmentry) {
a633e41e
SC
13374 exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
13375 exit_qual = nested_vmx_load_msr(vcpu,
13376 vmcs12->vm_entry_msr_load_addr,
13377 vmcs12->vm_entry_msr_load_count);
13378 if (exit_qual)
39f9c388 13379 goto vmentry_fail_vmexit_guest_mode;
7f7f1ba3
PB
13380 } else {
13381 /*
13382 * The MMU is not initialized to point at the right entities yet and
13383 * "get pages" would need to read data from the guest (i.e. we will
13384 * need to perform gpa to hpa translation). Request a call
13385 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs
13386 * have already been set at vmentry time and should not be reset.
13387 */
13388 kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
13389 }
858e25c0 13390
b5861e5c
LA
13391 /*
13392 * If L1 had a pending IRQ/NMI until it executed
13393 * VMLAUNCH/VMRESUME which wasn't delivered because it was
13394 * disallowed (e.g. interrupts disabled), L0 needs to
13395 * evaluate if this pending event should cause an exit from L2
13396	 * to L1 or be delivered directly to L2 (e.g. in case L1 doesn't
13397 * intercept EXTERNAL_INTERRUPT).
13398 *
7e712684
PB
13399 * Usually this would be handled by the processor noticing an
13400 * IRQ/NMI window request, or checking RVI during evaluation of
13401 * pending virtual interrupts. However, this setting was done
13402 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
13403 * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
b5861e5c 13404 */
7e712684 13405 if (unlikely(evaluate_pending_interrupts))
b5861e5c 13406 kvm_make_request(KVM_REQ_EVENT, vcpu);
b5861e5c 13407
858e25c0
JM
13408 /*
13409 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
13410 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
13411 * returned as far as L1 is concerned. It will only return (and set
13412 * the success flag) when L2 exits (see nested_vmx_vmexit()).
13413 */
13414 return 0;
e79f245d 13415
a633e41e
SC
13416 /*
13417 * A failed consistency check that leads to a VMExit during L1's
13418 * VMEnter to L2 is a variation of a normal VMexit, as explained in
13419 * 26.7 "VM-entry failures during or after loading guest state".
13420 */
39f9c388 13421vmentry_fail_vmexit_guest_mode:
e79f245d
KA
13422 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
13423 vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
13424 leave_guest_mode(vcpu);
16fb9a46
SC
13425
13426vmentry_fail_vmexit:
e79f245d 13427 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
a633e41e
SC
13428
13429 if (!from_vmentry)
13430 return 1;
13431
a633e41e
SC
13432 load_vmcs12_host_state(vcpu, vmcs12);
13433 vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
13434 vmcs12->exit_qualification = exit_qual;
945679e3
VK
13435 if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
13436 vmx->nested.need_vmcs12_sync = true;
a633e41e 13437 return 1;
858e25c0
JM
13438}
13439
ca0bde28
JM
13440/*
13441 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
13442 * for running an L2 nested guest.
13443 */
13444static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
13445{
13446 struct vmcs12 *vmcs12;
13447 struct vcpu_vmx *vmx = to_vmx(vcpu);
b3f1dfb6 13448 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
ca0bde28
JM
13449 int ret;
13450
13451 if (!nested_vmx_check_permission(vcpu))
13452 return 1;
13453
8cab6507 13454 if (!nested_vmx_handle_enlightened_vmptrld(vcpu, true))
b8bbab92
VK
13455 return 1;
13456
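	/*
	 * VMLAUNCH/VMRESUME with no current VMCS (and no enlightened VMCS)
	 * fails with VMfailInvalid.
	 */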
13457 if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
09abb5e3 13458 return nested_vmx_failInvalid(vcpu);
ca0bde28
JM
13459
13460 vmcs12 = get_vmcs12(vcpu);
13461
a6192d40
LA
13462 /*
13463 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
13464 * that there *is* a valid VMCS pointer, RFLAGS.CF is set
13465 * rather than RFLAGS.ZF, and no error number is stored to the
13466 * VM-instruction error field.
13467 */
09abb5e3
SC
13468 if (vmcs12->hdr.shadow_vmcs)
13469 return nested_vmx_failInvalid(vcpu);
a6192d40 13470
945679e3
VK
13471 if (vmx->nested.hv_evmcs) {
13472 copy_enlightened_to_vmcs12(vmx);
13473 /* Enlightened VMCS doesn't have launch state */
13474 vmcs12->launch_state = !launch;
13475 } else if (enable_shadow_vmcs) {
ca0bde28 13476 copy_shadow_to_vmcs12(vmx);
945679e3 13477 }
ca0bde28
JM
13478
13479 /*
13480 * The nested entry process starts with enforcing various prerequisites
13481	 * on vmcs12 as required by the Intel SDM, acting appropriately when
13482 * they fail: As the SDM explains, some conditions should cause the
13483 * instruction to fail, while others will cause the instruction to seem
13484 * to succeed, but return an EXIT_REASON_INVALID_STATE.
13485 * To speed up the normal (success) code path, we should avoid checking
13486 * for misconfigurations which will anyway be caught by the processor
13487 * when using the merged vmcs02.
13488 */
09abb5e3
SC
13489 if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
13490 return nested_vmx_failValid(vcpu,
13491 VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
b3f1dfb6 13492
09abb5e3
SC
13493 if (vmcs12->launch_state == launch)
13494 return nested_vmx_failValid(vcpu,
ca0bde28
JM
13495 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
13496 : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
ca0bde28
JM
13497
13498 ret = check_vmentry_prereqs(vcpu, vmcs12);
09abb5e3
SC
13499 if (ret)
13500 return nested_vmx_failValid(vcpu, ret);
384bb783 13501
7c177938
NHE
13502 /*
13503 * We're finally done with prerequisite checking, and can start with
13504 * the nested entry.
13505 */
6514dc38 13506 vmx->nested.nested_run_pending = 1;
a633e41e 13507 ret = nested_vmx_enter_non_root_mode(vcpu, true);
52017608
SC
13508 vmx->nested.nested_run_pending = !ret;
13509 if (ret > 0)
7f7f1ba3 13510 return 1;
52017608
SC
13511 else if (ret)
13512 return nested_vmx_failValid(vcpu,
13513 VMXERR_ENTRY_INVALID_CONTROL_FIELD);
ff651cb6 13514
c595ceee
PB
13515 /* Hide L1D cache contents from the nested guest. */
13516 vmx->vcpu.arch.l1tf_flush_l1d = true;
13517
61ada748 13518 /*
d63907dc 13519 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
61ada748
LA
13520 * also be used as part of restoring nVMX state for
13521 * snapshot restore (migration).
13522 *
13523 * In this flow, it is assumed that vmcs12 cache was
13524	 * transferred as part of the captured nVMX state and should
13525 * therefore not be read from guest memory (which may not
13526 * exist on destination host yet).
13527 */
13528 nested_cache_shadow_vmcs12(vcpu, vmcs12);
13529
135a06c3
CG
13530 /*
13531 * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
13532 * by event injection, halt vcpu.
13533 */
13534 if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
6514dc38
JM
13535 !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK)) {
13536 vmx->nested.nested_run_pending = 0;
5cb56059 13537 return kvm_vcpu_halt(vcpu);
6514dc38 13538 }
cd232ad0
NHE
13539 return 1;
13540}
13541
4704d0be
NHE
13542/*
13543 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
13544 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
13545 * This function returns the new value we should put in vmcs12.guest_cr0.
13546 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
13547 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
13548 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
13549 * didn't trap the bit, because if L1 did, so would L0).
13550 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
13551 * been modified by L2, and L1 knows it. So just leave the old value of
13552 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
13553 * isn't relevant, because if L0 traps this bit it can set it to anything.
13554 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
13555 * changed these bits, and therefore they need to be updated, but L0
13556 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather
13557 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
13558 */
13559static inline unsigned long
13560vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
13561{
13562 return
13563 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
13564 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
13565 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
13566 vcpu->arch.cr0_guest_owned_bits));
13567}
13568
13569static inline unsigned long
13570vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
13571{
13572 return
13573 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
13574 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
13575 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
13576 vcpu->arch.cr4_guest_owned_bits));
13577}
13578
5f3d5799
JK
13579static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
13580 struct vmcs12 *vmcs12)
13581{
13582 u32 idt_vectoring;
13583 unsigned int nr;
13584
664f8e26 13585 if (vcpu->arch.exception.injected) {
5f3d5799
JK
13586 nr = vcpu->arch.exception.nr;
13587 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
13588
13589 if (kvm_exception_is_soft(nr)) {
13590 vmcs12->vm_exit_instruction_len =
13591 vcpu->arch.event_exit_inst_len;
13592 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
13593 } else
13594 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
13595
13596 if (vcpu->arch.exception.has_error_code) {
13597 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
13598 vmcs12->idt_vectoring_error_code =
13599 vcpu->arch.exception.error_code;
13600 }
13601
13602 vmcs12->idt_vectoring_info_field = idt_vectoring;
cd2633c5 13603 } else if (vcpu->arch.nmi_injected) {
5f3d5799
JK
13604 vmcs12->idt_vectoring_info_field =
13605 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
04140b41 13606 } else if (vcpu->arch.interrupt.injected) {
5f3d5799
JK
13607 nr = vcpu->arch.interrupt.nr;
13608 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
13609
13610 if (vcpu->arch.interrupt.soft) {
13611 idt_vectoring |= INTR_TYPE_SOFT_INTR;
13612 vmcs12->vm_entry_instruction_len =
13613 vcpu->arch.event_exit_inst_len;
13614 } else
13615 idt_vectoring |= INTR_TYPE_EXT_INTR;
13616
13617 vmcs12->idt_vectoring_info_field = idt_vectoring;
13618 }
13619}
13620
b6b8a145
JK
13621static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
13622{
13623 struct vcpu_vmx *vmx = to_vmx(vcpu);
bfcf83b1 13624 unsigned long exit_qual;
917dc606
LA
13625 bool block_nested_events =
13626 vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
acc9ab60 13627
bfcf83b1
WL
13628 if (vcpu->arch.exception.pending &&
13629 nested_vmx_check_exception(vcpu, &exit_qual)) {
917dc606 13630 if (block_nested_events)
bfcf83b1
WL
13631 return -EBUSY;
13632 nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
bfcf83b1
WL
13633 return 0;
13634 }
13635
f4124500
JK
13636 if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
13637 vmx->nested.preemption_timer_expired) {
917dc606 13638 if (block_nested_events)
f4124500
JK
13639 return -EBUSY;
13640 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
13641 return 0;
13642 }
13643
b6b8a145 13644 if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
917dc606 13645 if (block_nested_events)
b6b8a145
JK
13646 return -EBUSY;
13647 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
13648 NMI_VECTOR | INTR_TYPE_NMI_INTR |
13649 INTR_INFO_VALID_MASK, 0);
13650 /*
13651 * The NMI-triggered VM exit counts as injection:
13652 * clear this one and block further NMIs.
13653 */
13654 vcpu->arch.nmi_pending = 0;
13655 vmx_set_nmi_mask(vcpu, true);
13656 return 0;
13657 }
13658
13659 if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
13660 nested_exit_on_intr(vcpu)) {
917dc606 13661 if (block_nested_events)
b6b8a145
JK
13662 return -EBUSY;
13663 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
705699a1 13664 return 0;
b6b8a145
JK
13665 }
13666
6342c50a
DH
13667 vmx_complete_nested_posted_interrupt(vcpu);
13668 return 0;
b6b8a145
JK
13669}
13670
d264ee0c
SC
13671static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
13672{
13673 to_vmx(vcpu)->req_immediate_exit = true;
13674}
13675
f4124500
JK
13676static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
13677{
13678 ktime_t remaining =
13679 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
13680 u64 value;
13681
13682 if (ktime_to_ns(remaining) <= 0)
13683 return 0;
13684
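	/*
	 * Convert the remaining wall-clock time to guest TSC ticks
	 * (ns * kHz / 10^6), then scale down by the emulated preemption
	 * timer rate (the timer counts in units of 2^rate TSC ticks).
	 */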
13685 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
13686 do_div(value, 1000000);
13687 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
13688}
13689
4704d0be 13690/*
cf8b84f4
JM
13691 * Update the guest state fields of vmcs12 to reflect changes that
13692 * occurred while L2 was running. (The "IA-32e mode guest" bit of the
13693 * VM-entry controls is also updated, since this is really a guest
13694 * state bit.)
4704d0be 13695 */
cf8b84f4 13696static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
4704d0be 13697{
4704d0be
NHE
13698 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
13699 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
13700
4704d0be
NHE
13701 vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
13702 vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
13703 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
13704
13705 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
13706 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
13707 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
13708 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
13709 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
13710 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
13711 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
13712 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
13713 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
13714 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
13715 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
13716 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
13717 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
13718 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
13719 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
13720 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
13721 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
13722 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
13723 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
13724 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
13725 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
13726 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
13727 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
13728 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
13729 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
13730 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
13731 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
13732 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
13733 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
13734 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
13735 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
13736 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
13737 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
13738 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
13739 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
13740 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
13741
4704d0be
NHE
13742 vmcs12->guest_interruptibility_info =
13743 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
13744 vmcs12->guest_pending_dbg_exceptions =
13745 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
3edf1e69
JK
13746 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
13747 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
13748 else
13749 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
4704d0be 13750
f4124500
JK
13751 if (nested_cpu_has_preemption_timer(vmcs12)) {
13752 if (vmcs12->vm_exit_controls &
13753 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
13754 vmcs12->vmx_preemption_timer_value =
13755 vmx_get_preemption_timer_value(vcpu);
13756 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
13757 }
7854cbca 13758
3633cfc3
NHE
13759 /*
13760 * In some cases (usually, nested EPT), L2 is allowed to change its
13761 * own CR3 without exiting. If it has changed it, we must keep it.
13762 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
13763 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
13764 *
13765 * Additionally, restore L2's PDPTR to vmcs12.
13766 */
13767 if (enable_ept) {
f3531054 13768 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
3633cfc3
NHE
13769 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
13770 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
13771 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
13772 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
13773 }
13774
d281e13b 13775 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
119a9c01 13776
608406e2
WV
13777 if (nested_cpu_has_vid(vmcs12))
13778 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
13779
c18911a2
JK
13780 vmcs12->vm_entry_controls =
13781 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
2961e876 13782 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
c18911a2 13783
2996fca0
JK
13784 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) {
13785 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
13786 vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
13787 }
13788
4704d0be
NHE
13789 /* TODO: These cannot have changed unless we have MSR bitmaps and
13790 * the relevant bit asks not to trap the change */
b8c07d55 13791 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
4704d0be 13792 vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
10ba54a5
JK
13793 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
13794 vmcs12->guest_ia32_efer = vcpu->arch.efer;
4704d0be
NHE
13795 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
13796 vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
13797 vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
a87036ad 13798 if (kvm_mpx_supported())
36be0b9d 13799 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
cf8b84f4
JM
13800}
13801
13802/*
13803 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
13804 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
13805 * and this function updates it to reflect the changes to the guest state while
13806 * L2 was running (and perhaps made some exits which were handled directly by L0
13807 * without going back to L1), and to reflect the exit reason.
13808 * Note that we do not have to copy here all VMCS fields, just those that
13809 * could have changed by the L2 guest or the exit - i.e., the guest-state and
13810 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
13811 * which already writes to vmcs12 directly.
13812 */
13813static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
13814 u32 exit_reason, u32 exit_intr_info,
13815 unsigned long exit_qualification)
13816{
13817 /* update guest state fields: */
13818 sync_vmcs12(vcpu, vmcs12);
4704d0be
NHE
13819
13820 /* update exit information fields: */
13821
533558bc
JK
13822 vmcs12->vm_exit_reason = exit_reason;
13823 vmcs12->exit_qualification = exit_qualification;
533558bc 13824 vmcs12->vm_exit_intr_info = exit_intr_info;
7313c698 13825
5f3d5799 13826 vmcs12->idt_vectoring_info_field = 0;
4704d0be
NHE
13827 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
13828 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
13829
5f3d5799 13830 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
7cdc2d62
JM
13831 vmcs12->launch_state = 1;
13832
5f3d5799
JK
13833 /* vm_entry_intr_info_field is cleared on exit. Emulate this
13834 * instead of reading the real value. */
4704d0be 13835 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
5f3d5799
JK
13836
13837 /*
13838		 * Transfer the event that L0 or L1 may have wanted to inject into
13839 * L2 to IDT_VECTORING_INFO_FIELD.
13840 */
13841 vmcs12_save_pending_event(vcpu, vmcs12);
13842 }
13843
13844 /*
13845 * Drop what we picked up for L2 via vmx_complete_interrupts. It is
13846 * preserved above and would only end up incorrectly in L1.
13847 */
13848 vcpu->arch.nmi_injected = false;
13849 kvm_clear_exception_queue(vcpu);
13850 kvm_clear_interrupt_queue(vcpu);
4704d0be
NHE
13851}
13852
13853/*
13854 * A part of what we need to do when the nested L2 guest exits and we want to
13855 * run its L1 parent, is to reset L1's guest state to the host state specified
13856 * in vmcs12.
13857 * This function is to be called not only on normal nested exit, but also on
13858 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
13859 * Failures During or After Loading Guest State").
13860 * This function should be called when the active VMCS is L1's (vmcs01).
13861 */
733568f9
JK
13862static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
13863 struct vmcs12 *vmcs12)
4704d0be 13864{
21feb4eb 13865 struct kvm_segment seg;
bd18bffc 13866 u32 entry_failure_code;
21feb4eb 13867
4704d0be
NHE
13868 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
13869 vcpu->arch.efer = vmcs12->host_ia32_efer;
d1fa0352 13870 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4704d0be
NHE
13871 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
13872 else
13873 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
13874 vmx_set_efer(vcpu, vcpu->arch.efer);
13875
13876 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
13877 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
1adfa76a 13878 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
cb61de2f
SC
13879 vmx_set_interrupt_shadow(vcpu, 0);
13880
4704d0be
NHE
13881 /*
13882 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
bd7e5b08
PB
13883 * actually changed, because vmx_set_cr0 refers to efer set above.
13884 *
13885 * CR0_GUEST_HOST_MASK is already set in the original vmcs01
13886 * (KVM doesn't change it);
4704d0be 13887 */
bd7e5b08 13888 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
9e3e4dbf 13889 vmx_set_cr0(vcpu, vmcs12->host_cr0);
4704d0be 13890
bd7e5b08 13891 /* Same as above - no reason to call set_cr4_guest_host_mask(). */
4704d0be 13892 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
8eb3f87d 13893 vmx_set_cr4(vcpu, vmcs12->host_cr4);
4704d0be 13894
bd18bffc
SC
13895 nested_ept_uninit_mmu_context(vcpu);
13896
13897 /*
13898 * Only PDPTE load can fail as the value of cr3 was checked on entry and
13899 * couldn't have changed.
13900 */
13901 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
13902 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
13903
13904 if (!enable_ept)
13905 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
feaf0c7d 13906
6f1e03bc 13907 /*
efebf0aa 13908 * If vmcs01 doesn't use VPID, CPU flushes TLB on every
6f1e03bc
LA
13909 * VMEntry/VMExit. Thus, no need to flush TLB.
13910 *
efebf0aa
LA
13911 * If vmcs12 doesn't use VPID, L1 expects TLB to be
13912 * flushed on every VMEntry/VMExit.
6f1e03bc 13913 *
efebf0aa
LA
13914 * Otherwise, we can preserve TLB entries as long as we are
13915 * able to tag L1 TLB entries differently than L2 TLB entries.
1438921c
LA
13916 *
13917 * If vmcs12 uses EPT, we need to execute this flush on EPTP01
13918 * and therefore we request the TLB flush to happen only after VMCS EPTP
13919 * has been set by KVM_REQ_LOAD_CR3.
6f1e03bc
LA
13920 */
13921 if (enable_vpid &&
efebf0aa 13922 (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
1438921c 13923 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
4704d0be 13924 }
4704d0be
NHE
13925
13926 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
13927 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
13928 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
13929 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
13930 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
21f2d551
LP
13931 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
13932 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
4704d0be 13933
36be0b9d
PB
13934 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
13935 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
13936 vmcs_write64(GUEST_BNDCFGS, 0);
13937
44811c02 13938 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
4704d0be 13939 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
44811c02
JK
13940 vcpu->arch.pat = vmcs12->host_ia32_pat;
13941 }
4704d0be
NHE
13942 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
13943 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
13944 vmcs12->host_ia32_perf_global_ctrl);
503cd0c5 13945
21feb4eb
ACL
13946 /* Set L1 segment info according to Intel SDM
13947 27.5.2 Loading Host Segment and Descriptor-Table Registers */
13948 seg = (struct kvm_segment) {
13949 .base = 0,
13950 .limit = 0xFFFFFFFF,
13951 .selector = vmcs12->host_cs_selector,
13952 .type = 11,
13953 .present = 1,
13954 .s = 1,
13955 .g = 1
13956 };
13957 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
13958 seg.l = 1;
13959 else
13960 seg.db = 1;
13961 vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
13962 seg = (struct kvm_segment) {
13963 .base = 0,
13964 .limit = 0xFFFFFFFF,
13965 .type = 3,
13966 .present = 1,
13967 .s = 1,
13968 .db = 1,
13969 .g = 1
13970 };
13971 seg.selector = vmcs12->host_ds_selector;
13972 vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
13973 seg.selector = vmcs12->host_es_selector;
13974 vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
13975 seg.selector = vmcs12->host_ss_selector;
13976 vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
13977 seg.selector = vmcs12->host_fs_selector;
13978 seg.base = vmcs12->host_fs_base;
13979 vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
13980 seg.selector = vmcs12->host_gs_selector;
13981 seg.base = vmcs12->host_gs_base;
13982 vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
13983 seg = (struct kvm_segment) {
205befd9 13984 .base = vmcs12->host_tr_base,
21feb4eb
ACL
13985 .limit = 0x67,
13986 .selector = vmcs12->host_tr_selector,
13987 .type = 11,
13988 .present = 1
13989 };
13990 vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
13991
503cd0c5
JK
13992 kvm_set_dr(vcpu, 7, 0x400);
13993 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
ff651cb6 13994
3af18d9c 13995 if (cpu_has_vmx_msr_bitmap())
904e14fb 13996 vmx_update_msr_bitmap(vcpu);
3af18d9c 13997
ff651cb6
WV
13998 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
13999 vmcs12->vm_exit_msr_load_count))
14000 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4704d0be
NHE
14001}
14002
bd18bffc
SC
14003static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
14004{
14005 struct shared_msr_entry *efer_msr;
14006 unsigned int i;
14007
14008 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
14009 return vmcs_read64(GUEST_IA32_EFER);
14010
14011 if (cpu_has_load_ia32_efer)
14012 return host_efer;
14013
14014 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
14015 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
14016 return vmx->msr_autoload.guest.val[i].value;
14017 }
14018
14019 efer_msr = find_msr_entry(vmx, MSR_EFER);
14020 if (efer_msr)
14021 return efer_msr->data;
14022
14023 return host_efer;
14024}
14025
14026static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
14027{
14028 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
14029 struct vcpu_vmx *vmx = to_vmx(vcpu);
14030 struct vmx_msr_entry g, h;
14031 struct msr_data msr;
14032 gpa_t gpa;
14033 u32 i, j;
14034
14035 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
14036
14037 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
14038 /*
14039 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
14040 * as vmcs01.GUEST_DR7 contains a userspace defined value
14041 * and vcpu->arch.dr7 is not squirreled away before the
14042 * nested VMENTER (not worth adding a variable in nested_vmx).
14043 */
14044 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
14045 kvm_set_dr(vcpu, 7, DR7_FIXED_1);
14046 else
14047 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
14048 }
14049
14050 /*
14051 * Note that calling vmx_set_{efer,cr0,cr4} is important as they
14052 * handle a variety of side effects to KVM's software model.
14053 */
14054 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
14055
14056 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
14057 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
14058
14059 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
14060 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
14061
14062 nested_ept_uninit_mmu_context(vcpu);
14063 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
14064 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
14065
14066 /*
14067 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
14068 * from vmcs01 (if necessary). The PDPTRs are not loaded on
14069 * VMFail; as with everything else here, we just need to ensure our
14070 * software model is up-to-date.
14071 */
14072 ept_save_pdptrs(vcpu);
14073
14074 kvm_mmu_reset_context(vcpu);
14075
14076 if (cpu_has_vmx_msr_bitmap())
14077 vmx_update_msr_bitmap(vcpu);
14078
14079 /*
14080 * This nasty bit of open coding is a compromise between blindly
14081 * loading L1's MSRs using the exit load lists (incorrect emulation
14082 * of VMFail), leaving the nested VM's MSRs in the software model
14083 * (incorrect behavior) and snapshotting the modified MSRs (too
14084 * expensive since the lists are not bounded by hardware). For each
14085 * MSR that was (prematurely) loaded from the nested VMEntry load
14086 * list, reload it from the exit load list if it exists and differs
14087 * from the guest value. The intent is to stuff host state as
14088 * silently as possible, not to fully process the exit load list.
14089 */
14090 msr.host_initiated = false;
14091 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
14092 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
14093 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
14094 pr_debug_ratelimited(
14095 "%s read MSR index failed (%u, 0x%08llx)\n",
14096 __func__, i, gpa);
14097 goto vmabort;
14098 }
14099
14100 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
14101 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
14102 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
14103 pr_debug_ratelimited(
14104 "%s read MSR failed (%u, 0x%08llx)\n",
14105 __func__, j, gpa);
14106 goto vmabort;
14107 }
14108 if (h.index != g.index)
14109 continue;
14110 if (h.value == g.value)
14111 break;
14112
14113 if (nested_vmx_load_msr_check(vcpu, &h)) {
14114 pr_debug_ratelimited(
14115 "%s check failed (%u, 0x%x, 0x%x)\n",
14116 __func__, j, h.index, h.reserved);
14117 goto vmabort;
14118 }
14119
14120 msr.index = h.index;
14121 msr.data = h.value;
14122 if (kvm_set_msr(vcpu, &msr)) {
14123 pr_debug_ratelimited(
14124 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
14125 __func__, j, h.index, h.value);
14126 goto vmabort;
14127 }
14128 }
14129 }
14130
14131 return;
14132
14133vmabort:
14134 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
14135}
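
The reconciliation loop above walks the VM-entry MSR-load list and, for every MSR whose value was (prematurely) changed, looks for a matching entry on the VM-exit MSR-load list to put the host value back. The following is a hedged userspace sketch of just that matching logic; the struct is a simplified stand-in for struct vmx_msr_entry, plain arrays replace the guest-memory lists, printf replaces kvm_set_msr(), and the MSR values are hypothetical.

/*
 * Illustrative sketch only: the list-reconciliation logic used by
 * nested_vmx_restore_host_state(), in simplified form.
 */
#include <stdint.h>
#include <stdio.h>

struct msr_entry { uint32_t index; uint64_t value; };  /* simplified */

static void restore_host_msrs(const struct msr_entry *entry_load, int n_entry,
			      const struct msr_entry *exit_load, int n_exit)
{
	for (int i = 0; i < n_entry; i++) {          /* MSRs loaded for L2 */
		for (int j = 0; j < n_exit; j++) {   /* host values L1 wants */
			if (exit_load[j].index != entry_load[i].index)
				continue;
			if (exit_load[j].value == entry_load[i].value)
				break;               /* already the host value */
			/* in KVM this is a kvm_set_msr() call */
			printf("WRMSR 0x%x <- 0x%llx\n",
			       (unsigned)exit_load[j].index,
			       (unsigned long long)exit_load[j].value);
			break;
		}
	}
}

int main(void)
{
	/* hypothetical lists: EFER (0xC0000080) with L2 vs. host values */
	struct msr_entry entry_load[] = { { 0xC0000080, 0x500 } };
	struct msr_entry exit_load[]  = { { 0xC0000080, 0xD01 } };

	restore_host_msrs(entry_load, 1, exit_load, 1);
	return 0;
}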
14136
4704d0be
NHE
14137/*
14138 * Emulate an exit from the nested guest (L2) to L1, i.e., prepare to run L1
14139 * and modify vmcs12 to make it see what it would expect to see there if
14140 * L2 were its real guest. Must only be called while in L2 (is_guest_mode()).
14141 */
533558bc
JK
14142static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
14143 u32 exit_intr_info,
14144 unsigned long exit_qualification)
4704d0be
NHE
14145{
14146 struct vcpu_vmx *vmx = to_vmx(vcpu);
4704d0be
NHE
14147 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
14148
5f3d5799
JK
14149 /* trying to cancel vmlaunch/vmresume is a bug */
14150 WARN_ON_ONCE(vmx->nested.nested_run_pending);
14151
4704d0be 14152 leave_guest_mode(vcpu);
4704d0be 14153
e79f245d
KA
14154 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
14155 vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
14156
4f350c6d 14157 if (likely(!vmx->fail)) {
72e9cbdb
LP
14158 if (exit_reason == -1)
14159 sync_vmcs12(vcpu, vmcs12);
14160 else
14161 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
14162 exit_qualification);
ff651cb6 14163
61ada748
LA
14164 /*
14165 * Must happen outside of sync_vmcs12() as it will
14166 * also be used to capture vmcs12 cache as part of
14167 * capturing nVMX state for snapshot (migration).
14168 *
14169 * Otherwise, this flush will dirty guest memory at a
14170 * point it is already assumed by user-space to be
14171 * immutable.
14172 */
14173 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
14174
4f350c6d
JM
14175 if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
14176 vmcs12->vm_exit_msr_store_count))
14177 nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
2768c0cc
SC
14178 } else {
14179 /*
14180 * The only expected VM-instruction error is "VM entry with
14181 * invalid control field(s)." Anything else indicates a
14182 * problem with L0. And we should never get here with a
14183 * VMFail of any type if early consistency checks are enabled.
14184 */
14185 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
14186 VMXERR_ENTRY_INVALID_CONTROL_FIELD);
14187 WARN_ON_ONCE(nested_early_check);
4f350c6d 14188 }
cf3215d9 14189
1279a6b1 14190 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
36c3cc42 14191
9314006d 14192 /* Update any VMCS fields that might have changed while L2 ran */
33966dd6
KRW
14193 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
14194 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
ea26e4ec 14195 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
f459a707 14196
c95ba92a
PF
14197 if (kvm_has_tsc_control)
14198 decache_tsc_multiplier(vmx);
4704d0be 14199
8d860bbe
JM
14200 if (vmx->nested.change_vmcs01_virtual_apic_mode) {
14201 vmx->nested.change_vmcs01_virtual_apic_mode = false;
14202 vmx_set_virtual_apic_mode(vcpu);
fb6c8198
JM
14203 } else if (!nested_cpu_has_ept(vmcs12) &&
14204 nested_cpu_has2(vmcs12,
14205 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
a468f2db 14206 vmx_flush_tlb(vcpu, true);
dccbfcf5 14207 }
4704d0be
NHE
14208
14209 /* This is needed for same reason as it was needed in prepare_vmcs02 */
14210 vmx->host_rsp = 0;
14211
14212 /* Unpin physical memory we referred to in vmcs02 */
14213 if (vmx->nested.apic_access_page) {
53a70daf 14214 kvm_release_page_dirty(vmx->nested.apic_access_page);
48d89b92 14215 vmx->nested.apic_access_page = NULL;
4704d0be 14216 }
a7c0b07d 14217 if (vmx->nested.virtual_apic_page) {
53a70daf 14218 kvm_release_page_dirty(vmx->nested.virtual_apic_page);
48d89b92 14219 vmx->nested.virtual_apic_page = NULL;
a7c0b07d 14220 }
705699a1
WV
14221 if (vmx->nested.pi_desc_page) {
14222 kunmap(vmx->nested.pi_desc_page);
53a70daf 14223 kvm_release_page_dirty(vmx->nested.pi_desc_page);
705699a1
WV
14224 vmx->nested.pi_desc_page = NULL;
14225 vmx->nested.pi_desc = NULL;
14226 }
4704d0be 14227
38b99173
TC
14228 /*
14229 * While L2 was running, the mmu_notifier may have forced a reload of the
14230 * page's hpa for the L2 vmcs. It needs to be reloaded for L1 before entering L1.
14231 */
c83b6d15 14232 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
38b99173 14233
945679e3
VK
14234 if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs))
14235 vmx->nested.need_vmcs12_sync = true;
b6b8a145
JK
14236
14237 /* in case we halted in L2 */
14238 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4f350c6d
JM
14239
14240 if (likely(!vmx->fail)) {
14241 /*
14242 * TODO: SDM says that with acknowledge interrupt on
14243 * exit, bit 31 of the VM-exit interrupt information
14244 * (valid interrupt) is always set to 1 on
14245 * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't
14246 * need kvm_cpu_has_interrupt(). See the commit
14247 * message for details.
14248 */
14249 if (nested_exit_intr_ack_set(vcpu) &&
14250 exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
14251 kvm_cpu_has_interrupt(vcpu)) {
14252 int irq = kvm_cpu_get_interrupt(vcpu);
14253 WARN_ON(irq < 0);
14254 vmcs12->vm_exit_intr_info = irq |
14255 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
14256 }
14257
72e9cbdb
LP
14258 if (exit_reason != -1)
14259 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
14260 vmcs12->exit_qualification,
14261 vmcs12->idt_vectoring_info_field,
14262 vmcs12->vm_exit_intr_info,
14263 vmcs12->vm_exit_intr_error_code,
14264 KVM_ISA_VMX);
4f350c6d
JM
14265
14266 load_vmcs12_host_state(vcpu, vmcs12);
14267
14268 return;
14269 }
09abb5e3 14270
4f350c6d
JM
14271 /*
14272 * After an early L2 VM-entry failure, we're now back
14273 * in L1 which thinks it just finished a VMLAUNCH or
14274 * VMRESUME instruction, so we need to set the failure
14275 * flag and the VM-instruction error field of the VMCS
cb61de2f 14276 * accordingly, and skip the emulated instruction.
4f350c6d 14277 */
09abb5e3 14278 (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
5af41573 14279
4f350c6d 14280 /*
bd18bffc
SC
14281 * Restore L1's host state to KVM's software model. We're here
14282 * because a consistency-check failure was caught by hardware, which
14283 * means some amount of guest state has been propagated to KVM's
14284 * model and needs to be unwound to the host's state.
4f350c6d 14285 */
bd18bffc 14286 nested_vmx_restore_host_state(vcpu);
5af41573 14287
4f350c6d 14288 vmx->fail = 0;
4704d0be
NHE
14289}
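
When an external interrupt is acknowledged on exit, the vector obtained above is merged with the valid bit and the interruption type before being stored in vmcs12->vm_exit_intr_info. A small illustrative sketch of that encoding follows, using the SDM's VM-exit interruption-information layout (vector in bits 7:0, type in bits 10:8, valid in bit 31); the vector value is hypothetical and the macros are redefined locally for the example.

/*
 * Illustrative sketch only: encoding of the VM-exit
 * interruption-information field written into vmcs12 above.
 */
#include <stdint.h>
#include <stdio.h>

#define INTR_TYPE_EXT_INTR	(0u << 8)	/* external interrupt */
#define INTR_INFO_VALID_MASK	(1u << 31)

int main(void)
{
	int irq = 0x30;					/* hypothetical vector */
	uint32_t info = irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK;

	printf("vm_exit_intr_info = 0x%08x (vector %u, valid %u)\n",
	       info, info & 0xff, info >> 31);
	return 0;
}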
14290
42124925
JK
14291/*
14292 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
14293 */
14294static void vmx_leave_nested(struct kvm_vcpu *vcpu)
14295{
2f707d97
WL
14296 if (is_guest_mode(vcpu)) {
14297 to_vmx(vcpu)->nested.nested_run_pending = 0;
533558bc 14298 nested_vmx_vmexit(vcpu, -1, 0, 0);
2f707d97 14299 }
14c07ad8 14300 free_nested(vcpu);
7c177938
NHE
14301}
14302
8a76d7f2
JR
14303static int vmx_check_intercept(struct kvm_vcpu *vcpu,
14304 struct x86_instruction_info *info,
14305 enum x86_intercept_stage stage)
14306{
fb6d4d34
PB
14307 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
14308 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
14309
14310 /*
14311 * RDPID causes #UD if disabled through secondary execution controls.
14312 * Because it is marked as EmulateOnUD, we need to intercept it here.
14313 */
14314 if (info->intercept == x86_intercept_rdtscp &&
14315 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
14316 ctxt->exception.vector = UD_VECTOR;
14317 ctxt->exception.error_code_valid = false;
14318 return X86EMUL_PROPAGATE_FAULT;
14319 }
14320
14321 /* TODO: check more intercepts... */
8a76d7f2
JR
14322 return X86EMUL_CONTINUE;
14323}
14324
64672c95
YJ
14325#ifdef CONFIG_X86_64
14326/* (a << shift) / divisor, return 1 if overflow otherwise 0 */
14327static inline int u64_shl_div_u64(u64 a, unsigned int shift,
14328 u64 divisor, u64 *result)
14329{
14330 u64 low = a << shift, high = a >> (64 - shift);
14331
14332 /* Avoid overflowing the divq below */
14333 if (high >= divisor)
14334 return 1;
14335
14336 /* Low holds the result; high holds the remainder, which is discarded */
14337 asm("divq %2\n\t" : "=a" (low), "=d" (high) :
14338 "rm" (divisor), "0" (low), "1" (high));
14339 *result = low;
14340
14341 return 0;
14342}
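
The inline asm above computes (a << shift) / divisor with a 128-bit intermediate and reports overflow when the high half is not smaller than the divisor (the condition under which divq would fault). On compilers that provide unsigned __int128 (a GCC/Clang extension) the same computation can be cross-checked portably; the helper name below is a hypothetical reference implementation, not kernel code, and the numbers in main() are illustrative.

/*
 * Illustrative sketch only: a portable cross-check of u64_shl_div_u64()
 * using unsigned __int128 instead of inline divq.
 */
#include <stdint.h>
#include <stdio.h>

static int u64_shl_div_u64_ref(uint64_t a, unsigned int shift,
			       uint64_t divisor, uint64_t *result)
{
	unsigned __int128 n = (unsigned __int128)a << shift;

	/* same overflow condition as "high >= divisor" before divq */
	if ((uint64_t)(n >> 64) >= divisor)
		return 1;

	*result = (uint64_t)(n / divisor);
	return 0;
}

int main(void)
{
	uint64_t res;

	/* hypothetical: a 48-bit-fraction scaling ratio of exactly 1.0 */
	if (!u64_shl_div_u64_ref(1000000, 48, 1ull << 48, &res))
		printf("result = %llu\n", (unsigned long long)res);
	return 0;
}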
14343
14344static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
14345{
386c6ddb 14346 struct vcpu_vmx *vmx;
c5ce8235 14347 u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
386c6ddb
KA
14348
14349 if (kvm_mwait_in_guest(vcpu->kvm))
14350 return -EOPNOTSUPP;
14351
14352 vmx = to_vmx(vcpu);
14353 tscl = rdtsc();
14354 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
14355 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
c5ce8235
WL
14356 lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns);
14357
14358 if (delta_tsc > lapic_timer_advance_cycles)
14359 delta_tsc -= lapic_timer_advance_cycles;
14360 else
14361 delta_tsc = 0;
64672c95
YJ
14362
14363 /* Convert to host delta tsc if tsc scaling is enabled */
14364 if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
14365 u64_shl_div_u64(delta_tsc,
14366 kvm_tsc_scaling_ratio_frac_bits,
14367 vcpu->arch.tsc_scaling_ratio,
14368 &delta_tsc))
14369 return -ERANGE;
14370
14371 /*
14372 * If the delta tsc can't fit in 32 bits after the preemption-timer
14373 * multiplier shift, we can't use the preemption timer.
14374 * It's possible that it fits on later vmentries, but checking
14375 * on every vmentry is costly, so we just fall back to an hrtimer.
14376 */
14377 if (delta_tsc >> (cpu_preemption_timer_multi + 32))
14378 return -ERANGE;
14379
14380 vmx->hv_deadline_tsc = tscl + delta_tsc;
c8533544 14381 return delta_tsc == 0;
64672c95
YJ
14382}
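
The range check above (`delta_tsc >> (cpu_preemption_timer_multi + 32)`) asks whether anything remains once the deadline is expressed in preemption-timer units: the timer counts down roughly once every 2^multi TSC ticks and its counter is 32 bits wide. A small illustrative sketch of that arithmetic follows; the multiplier value is hypothetical (the real one comes from the IA32_VMX_MISC MSR) and the TSC numbers are made up.

/*
 * Illustrative sketch only: the fit check used by vmx_set_hv_timer().
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int multi = 5;	/* hypothetical: one timer tick per 32 TSC ticks */
	uint64_t delta_tsc = 3ull * 1000 * 1000 * 1000;	/* ~1 s at 3 GHz */

	if (delta_tsc >> (multi + 32))
		printf("deadline too far out: fall back to an hrtimer\n");
	else
		printf("deadline fits: timer units = %llu\n",
		       (unsigned long long)(delta_tsc >> multi));
	return 0;
}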
14383
14384static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
14385{
f459a707 14386 to_vmx(vcpu)->hv_deadline_tsc = -1;
64672c95
YJ
14387}
14388#endif
14389
48d89b92 14390static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
ae97a3b8 14391{
b31c114b 14392 if (!kvm_pause_in_guest(vcpu->kvm))
b4a2d31d 14393 shrink_ple_window(vcpu);
ae97a3b8
RK
14394}
14395
843e4330
KH
14396static void vmx_slot_enable_log_dirty(struct kvm *kvm,
14397 struct kvm_memory_slot *slot)
14398{
14399 kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
14400 kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
14401}
14402
14403static void vmx_slot_disable_log_dirty(struct kvm *kvm,
14404 struct kvm_memory_slot *slot)
14405{
14406 kvm_mmu_slot_set_dirty(kvm, slot);
14407}
14408
14409static void vmx_flush_log_dirty(struct kvm *kvm)
14410{
14411 kvm_flush_pml_buffers(kvm);
14412}
14413
c5f983f6
BD
14414static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
14415{
14416 struct vmcs12 *vmcs12;
14417 struct vcpu_vmx *vmx = to_vmx(vcpu);
14418 gpa_t gpa;
14419 struct page *page = NULL;
14420 u64 *pml_address;
14421
14422 if (is_guest_mode(vcpu)) {
14423 WARN_ON_ONCE(vmx->nested.pml_full);
14424
14425 /*
14426 * Check if PML is enabled for the nested guest.
14427 * Whether eptp bit 6 is set is already checked
14428 * as part of A/D emulation.
14429 */
14430 vmcs12 = get_vmcs12(vcpu);
14431 if (!nested_cpu_has_pml(vmcs12))
14432 return 0;
14433
4769886b 14434 if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
c5f983f6
BD
14435 vmx->nested.pml_full = true;
14436 return 1;
14437 }
14438
14439 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
14440
5e2f30b7
DH
14441 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address);
14442 if (is_error_page(page))
c5f983f6
BD
14443 return 0;
14444
14445 pml_address = kmap(page);
14446 pml_address[vmcs12->guest_pml_index--] = gpa;
14447 kunmap(page);
53a70daf 14448 kvm_release_page_clean(page);
c5f983f6
BD
14449 }
14450
14451 return 0;
14452}
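
The PML buffer is a 4-KiB guest page holding 512 eight-byte GPA entries; the index starts at 511 and is decremented after each write, so an index of 512 or more (including a wrapped u16) means the buffer is full and a "PML full" exit is due, which is exactly what the check above emulates. Below is an illustrative sketch of that bookkeeping with an ordinary array standing in for the guest page; the GPA in main() is hypothetical.

/*
 * Illustrative sketch only: the PML-index bookkeeping emulated by
 * vmx_write_pml_buffer() for a nested guest.
 */
#include <stdint.h>
#include <stdio.h>

#define PML_ENTITY_NUM 512		/* 4 KiB / 8 bytes per GPA entry */

struct pml {
	uint64_t buffer[PML_ENTITY_NUM];
	uint16_t index;			/* counts down from 511 */
};

/* Returns 0 on success, 1 if the buffer is full (a "PML full" event). */
static int pml_log(struct pml *p, uint64_t gpa)
{
	if (p->index >= PML_ENTITY_NUM)
		return 1;		/* index wrapped: buffer is full */
	p->buffer[p->index--] = gpa & ~0xFFFull;	/* page-aligned GPA */
	return 0;
}

int main(void)
{
	struct pml p = { .index = PML_ENTITY_NUM - 1 };

	pml_log(&p, 0x1234f00);		/* hypothetical dirty GPA */
	printf("next free slot: %u\n", (unsigned)p.index);
	return 0;
}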
14453
843e4330
KH
14454static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
14455 struct kvm_memory_slot *memslot,
14456 gfn_t offset, unsigned long mask)
14457{
14458 kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
14459}
14460
cd39e117
PB
14461static void __pi_post_block(struct kvm_vcpu *vcpu)
14462{
14463 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
14464 struct pi_desc old, new;
14465 unsigned int dest;
cd39e117
PB
14466
14467 do {
14468 old.control = new.control = pi_desc->control;
8b306e2f
PB
14469 WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
14470 "Wakeup handler not enabled while the VCPU is blocked\n");
cd39e117
PB
14471
14472 dest = cpu_physical_id(vcpu->cpu);
14473
14474 if (x2apic_enabled())
14475 new.ndst = dest;
14476 else
14477 new.ndst = (dest << 8) & 0xFF00;
14478
cd39e117
PB
14479 /* set 'NV' to 'notification vector' */
14480 new.nv = POSTED_INTR_VECTOR;
c0a1666b
PB
14481 } while (cmpxchg64(&pi_desc->control, old.control,
14482 new.control) != old.control);
cd39e117 14483
8b306e2f
PB
14484 if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
14485 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
cd39e117 14486 list_del(&vcpu->blocked_vcpu_list);
8b306e2f 14487 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
cd39e117
PB
14488 vcpu->pre_pcpu = -1;
14489 }
14490}
14491
bf9f6ac8
FW
14492/*
14493 * This routine does the following for a vCPU that is about to block,
14494 * when VT-d PI is enabled:
14495 * - Store the vCPU on the wakeup list, so that when an interrupt
14496 * arrives we can find the right vCPU to wake up.
14497 * - Change the posted-interrupt descriptor as follows:
14498 * 'NDST' <-- vcpu->pre_pcpu
14499 * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
14500 * - If 'ON' is set during this process, meaning at least one
14501 * interrupt has been posted for this vCPU, we cannot block it;
14502 * in that case return 1, otherwise return 0.
14503 *
14504 */
bc22512b 14505static int pi_pre_block(struct kvm_vcpu *vcpu)
bf9f6ac8 14506{
bf9f6ac8
FW
14507 unsigned int dest;
14508 struct pi_desc old, new;
14509 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
14510
14511 if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
a0052191
YZ
14512 !irq_remapping_cap(IRQ_POSTING_CAP) ||
14513 !kvm_vcpu_apicv_active(vcpu))
bf9f6ac8
FW
14514 return 0;
14515
8b306e2f
PB
14516 WARN_ON(irqs_disabled());
14517 local_irq_disable();
14518 if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
14519 vcpu->pre_pcpu = vcpu->cpu;
14520 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
14521 list_add_tail(&vcpu->blocked_vcpu_list,
14522 &per_cpu(blocked_vcpu_on_cpu,
14523 vcpu->pre_pcpu));
14524 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
14525 }
bf9f6ac8
FW
14526
14527 do {
14528 old.control = new.control = pi_desc->control;
14529
bf9f6ac8
FW
14530 WARN((pi_desc->sn == 1),
14531 "Warning: SN field of posted-interrupts "
14532 "is set before blocking\n");
14533
14534 /*
14535 * Since the vCPU can be preempted during this process,
14536 * vcpu->cpu could differ from pre_pcpu. We therefore
14537 * set pre_pcpu as the destination of the wakeup
14538 * notification event, so that the wakeup handler can
14539 * find the right vCPU to wake up if an interrupt
14540 * arrives while the vCPU is blocked.
14541 */
14542 dest = cpu_physical_id(vcpu->pre_pcpu);
14543
14544 if (x2apic_enabled())
14545 new.ndst = dest;
14546 else
14547 new.ndst = (dest << 8) & 0xFF00;
14548
14549 /* set 'NV' to 'wakeup vector' */
14550 new.nv = POSTED_INTR_WAKEUP_VECTOR;
c0a1666b
PB
14551 } while (cmpxchg64(&pi_desc->control, old.control,
14552 new.control) != old.control);
bf9f6ac8 14553
8b306e2f
PB
14554 /* We should not block the vCPU if an interrupt is posted for it. */
14555 if (pi_test_on(pi_desc) == 1)
14556 __pi_post_block(vcpu);
14557
14558 local_irq_enable();
14559 return (vcpu->pre_pcpu == -1);
bf9f6ac8
FW
14560}
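
Both __pi_post_block() and pi_pre_block() switch the notification vector and destination with a cmpxchg64 retry loop, so the update is atomic against an agent concurrently posting into the descriptor. The sketch below shows only that compare-and-swap pattern in userspace C11; the control-word layout (vector in the low byte, destination in the high 32 bits) is simplified and hypothetical, not the real pi_desc layout.

/*
 * Illustrative sketch only: the cmpxchg retry pattern used to switch a
 * posted-interrupt descriptor's notification vector and destination.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NV_MASK		0xffull			/* simplified: vector in bits 7:0 */
#define NDST_SHIFT	32			/* simplified: dest in bits 63:32 */

static _Atomic uint64_t pi_control;

static void pi_switch_vector(uint8_t new_nv, uint32_t new_ndst)
{
	uint64_t old, new;

	old = atomic_load(&pi_control);
	do {
		new = (old & ~(NV_MASK | (0xffffffffull << NDST_SHIFT))) |
		      new_nv | ((uint64_t)new_ndst << NDST_SHIFT);
		/* retry if another CPU changed the control word meanwhile */
	} while (!atomic_compare_exchange_weak(&pi_control, &old, new));
}

int main(void)
{
	pi_switch_vector(0xf2, 3);	/* hypothetical wakeup vector, dest CPU 3 */
	printf("control = 0x%016llx\n",
	       (unsigned long long)atomic_load(&pi_control));
	return 0;
}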
14561
bc22512b
YJ
14562static int vmx_pre_block(struct kvm_vcpu *vcpu)
14563{
14564 if (pi_pre_block(vcpu))
14565 return 1;
14566
64672c95
YJ
14567 if (kvm_lapic_hv_timer_in_use(vcpu))
14568 kvm_lapic_switch_to_sw_timer(vcpu);
14569
bc22512b
YJ
14570 return 0;
14571}
14572
14573static void pi_post_block(struct kvm_vcpu *vcpu)
bf9f6ac8 14574{
8b306e2f 14575 if (vcpu->pre_pcpu == -1)
bf9f6ac8
FW
14576 return;
14577
8b306e2f
PB
14578 WARN_ON(irqs_disabled());
14579 local_irq_disable();
cd39e117 14580 __pi_post_block(vcpu);
8b306e2f 14581 local_irq_enable();
bf9f6ac8
FW
14582}
14583
bc22512b
YJ
14584static void vmx_post_block(struct kvm_vcpu *vcpu)
14585{
64672c95
YJ
14586 if (kvm_x86_ops->set_hv_timer)
14587 kvm_lapic_switch_to_hv_timer(vcpu);
14588
bc22512b
YJ
14589 pi_post_block(vcpu);
14590}
14591
efc64404
FW
14592/*
14593 * vmx_update_pi_irte - set IRTE for Posted-Interrupts
14594 *
14595 * @kvm: kvm
14596 * @host_irq: host irq of the interrupt
14597 * @guest_irq: gsi of the interrupt
14598 * @set: set or unset PI
14599 * returns 0 on success, < 0 on failure
14600 */
14601static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
14602 uint32_t guest_irq, bool set)
14603{
14604 struct kvm_kernel_irq_routing_entry *e;
14605 struct kvm_irq_routing_table *irq_rt;
14606 struct kvm_lapic_irq irq;
14607 struct kvm_vcpu *vcpu;
14608 struct vcpu_data vcpu_info;
3a8b0677 14609 int idx, ret = 0;
efc64404
FW
14610
14611 if (!kvm_arch_has_assigned_device(kvm) ||
a0052191
YZ
14612 !irq_remapping_cap(IRQ_POSTING_CAP) ||
14613 !kvm_vcpu_apicv_active(kvm->vcpus[0]))
efc64404
FW
14614 return 0;
14615
14616 idx = srcu_read_lock(&kvm->irq_srcu);
14617 irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
3a8b0677
JS
14618 if (guest_irq >= irq_rt->nr_rt_entries ||
14619 hlist_empty(&irq_rt->map[guest_irq])) {
14620 pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
14621 guest_irq, irq_rt->nr_rt_entries);
14622 goto out;
14623 }
efc64404
FW
14624
14625 hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
14626 if (e->type != KVM_IRQ_ROUTING_MSI)
14627 continue;
14628 /*
14629 * VT-d PI cannot support posting multicast/broadcast
14630 * interrupts to a vCPU, so we still use interrupt remapping
14631 * for that kind of interrupt.
14632 *
14633 * For lowest-priority interrupts, we only support
14634 * those with a single CPU as the destination, e.g. the user
14635 * configures the interrupt via /proc/irq or uses
14636 * irqbalance to make the interrupt single-CPU.
14637 *
14638 * Full lowest-priority interrupt support may be added later.
14639 */
14640
37131313 14641 kvm_set_msi_irq(kvm, e, &irq);
23a1c257
FW
14642 if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
14643 /*
14644 * Make sure the IRTE is in remapped mode if
14645 * we don't handle it in posted mode.
14646 */
14647 ret = irq_set_vcpu_affinity(host_irq, NULL);
14648 if (ret < 0) {
14649 printk(KERN_INFO
14650 "failed to back to remapped mode, irq: %u\n",
14651 host_irq);
14652 goto out;
14653 }
14654
efc64404 14655 continue;
23a1c257 14656 }
efc64404
FW
14657
14658 vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
14659 vcpu_info.vector = irq.vector;
14660
2698d82e 14661 trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
efc64404
FW
14662 vcpu_info.vector, vcpu_info.pi_desc_addr, set);
14663
14664 if (set)
14665 ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
dc91f2eb 14666 else
efc64404 14667 ret = irq_set_vcpu_affinity(host_irq, NULL);
efc64404
FW
14668
14669 if (ret < 0) {
14670 printk(KERN_INFO "%s: failed to update PI IRTE\n",
14671 __func__);
14672 goto out;
14673 }
14674 }
14675
14676 ret = 0;
14677out:
14678 srcu_read_unlock(&kvm->irq_srcu, idx);
14679 return ret;
14680}
14681
c45dcc71
AR
14682static void vmx_setup_mce(struct kvm_vcpu *vcpu)
14683{
14684 if (vcpu->arch.mcg_cap & MCG_LMCE_P)
14685 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
14686 FEATURE_CONTROL_LMCE;
14687 else
14688 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
14689 ~FEATURE_CONTROL_LMCE;
14690}
14691
72d7b374
LP
14692static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
14693{
72e9cbdb
LP
14694 /* we need a nested vmexit to enter SMM, postpone if run is pending */
14695 if (to_vmx(vcpu)->nested.nested_run_pending)
14696 return 0;
72d7b374
LP
14697 return 1;
14698}
14699
0234bf88
LP
14700static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
14701{
72e9cbdb
LP
14702 struct vcpu_vmx *vmx = to_vmx(vcpu);
14703
14704 vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
14705 if (vmx->nested.smm.guest_mode)
14706 nested_vmx_vmexit(vcpu, -1, 0, 0);
14707
14708 vmx->nested.smm.vmxon = vmx->nested.vmxon;
14709 vmx->nested.vmxon = false;
caa057a2 14710 vmx_clear_hlt(vcpu);
0234bf88
LP
14711 return 0;
14712}
14713
14714static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
14715{
72e9cbdb
LP
14716 struct vcpu_vmx *vmx = to_vmx(vcpu);
14717 int ret;
14718
14719 if (vmx->nested.smm.vmxon) {
14720 vmx->nested.vmxon = true;
14721 vmx->nested.smm.vmxon = false;
14722 }
14723
14724 if (vmx->nested.smm.guest_mode) {
14725 vcpu->arch.hflags &= ~HF_SMM_MASK;
a633e41e 14726 ret = nested_vmx_enter_non_root_mode(vcpu, false);
72e9cbdb
LP
14727 vcpu->arch.hflags |= HF_SMM_MASK;
14728 if (ret)
14729 return ret;
14730
14731 vmx->nested.smm.guest_mode = false;
14732 }
0234bf88
LP
14733 return 0;
14734}
14735
cc3d967f
LP
14736static int enable_smi_window(struct kvm_vcpu *vcpu)
14737{
14738 return 0;
14739}
14740
8cab6507
VK
14741static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu)
14742{
14743 struct vcpu_vmx *vmx = to_vmx(vcpu);
14744
14745 /*
14746 * If we do two consecutive get/set_nested_state()s while L2 is
14747 * running, hv_evmcs may end up not being mapped (we map it from
14748 * nested_vmx_run()/vmx_vcpu_run()). Check is_guest_mode(), since we
14749 * always have a vmcs12 when it is true.
14750 */
14751 return is_guest_mode(vcpu) || vmx->nested.current_vmptr != -1ull ||
14752 vmx->nested.hv_evmcs;
14753}
14754
8fcc4b59
JM
14755static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
14756 struct kvm_nested_state __user *user_kvm_nested_state,
14757 u32 user_data_size)
14758{
14759 struct vcpu_vmx *vmx;
14760 struct vmcs12 *vmcs12;
14761 struct kvm_nested_state kvm_state = {
14762 .flags = 0,
14763 .format = 0,
14764 .size = sizeof(kvm_state),
14765 .vmx.vmxon_pa = -1ull,
14766 .vmx.vmcs_pa = -1ull,
14767 };
14768
14769 if (!vcpu)
14770 return kvm_state.size + 2 * VMCS12_SIZE;
14771
14772 vmx = to_vmx(vcpu);
14773 vmcs12 = get_vmcs12(vcpu);
945679e3 14774
8cab6507
VK
14775 if (nested_vmx_allowed(vcpu) && vmx->nested.enlightened_vmcs_enabled)
14776 kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
945679e3 14777
8fcc4b59
JM
14778 if (nested_vmx_allowed(vcpu) &&
14779 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
14780 kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
14781 kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr;
14782
8cab6507 14783 if (vmx_has_valid_vmcs12(vcpu)) {
8fcc4b59
JM
14784 kvm_state.size += VMCS12_SIZE;
14785
fa58a9fa
PB
14786 if (is_guest_mode(vcpu) &&
14787 nested_cpu_has_shadow_vmcs(vmcs12) &&
14788 vmcs12->vmcs_link_pointer != -1ull)
14789 kvm_state.size += VMCS12_SIZE;
14790 }
14791
8fcc4b59
JM
14792 if (vmx->nested.smm.vmxon)
14793 kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
14794
14795 if (vmx->nested.smm.guest_mode)
14796 kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
14797
14798 if (is_guest_mode(vcpu)) {
14799 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
14800
14801 if (vmx->nested.nested_run_pending)
14802 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
14803 }
14804 }
14805
14806 if (user_data_size < kvm_state.size)
14807 goto out;
14808
14809 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
14810 return -EFAULT;
14811
8cab6507 14812 if (!vmx_has_valid_vmcs12(vcpu))
8fcc4b59
JM
14813 goto out;
14814
14815 /*
14816 * When running L2, the authoritative vmcs12 state is in the
14817 * vmcs02. When running L1, the authoritative vmcs12 state is
8cab6507 14818 * in the shadow or enlightened vmcs linked to vmcs01, unless
945679e3 14819 * need_vmcs12_sync is set, in which case the authoritative
8fcc4b59
JM
14820 * vmcs12 state is in the vmcs12 already.
14821 */
8cab6507 14822 if (is_guest_mode(vcpu)) {
8fcc4b59 14823 sync_vmcs12(vcpu, vmcs12);
8cab6507
VK
14824 } else if (!vmx->nested.need_vmcs12_sync) {
14825 if (vmx->nested.hv_evmcs)
14826 copy_enlightened_to_vmcs12(vmx);
14827 else if (enable_shadow_vmcs)
14828 copy_shadow_to_vmcs12(vmx);
14829 }
8fcc4b59
JM
14830
14831 if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12)))
14832 return -EFAULT;
14833
fa58a9fa
PB
14834 if (nested_cpu_has_shadow_vmcs(vmcs12) &&
14835 vmcs12->vmcs_link_pointer != -1ull) {
14836 if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE,
14837 get_shadow_vmcs12(vcpu), sizeof(*vmcs12)))
14838 return -EFAULT;
14839 }
14840
8fcc4b59
JM
14841out:
14842 return kvm_state.size;
14843}
14844
14845static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
14846 struct kvm_nested_state __user *user_kvm_nested_state,
14847 struct kvm_nested_state *kvm_state)
14848{
14849 struct vcpu_vmx *vmx = to_vmx(vcpu);
14850 struct vmcs12 *vmcs12;
14851 u32 exit_qual;
14852 int ret;
14853
14854 if (kvm_state->format != 0)
14855 return -EINVAL;
14856
8cab6507
VK
14857 if (kvm_state->flags & KVM_STATE_NESTED_EVMCS)
14858 nested_enable_evmcs(vcpu, NULL);
14859
8fcc4b59
JM
14860 if (!nested_vmx_allowed(vcpu))
14861 return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL;
14862
14863 if (kvm_state->vmx.vmxon_pa == -1ull) {
14864 if (kvm_state->vmx.smm.flags)
14865 return -EINVAL;
14866
14867 if (kvm_state->vmx.vmcs_pa != -1ull)
14868 return -EINVAL;
14869
14870 vmx_leave_nested(vcpu);
14871 return 0;
14872 }
14873
14874 if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa))
14875 return -EINVAL;
14876
8fcc4b59
JM
14877 if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
14878 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
14879 return -EINVAL;
14880
14881 if (kvm_state->vmx.smm.flags &
14882 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
14883 return -EINVAL;
14884
5bea5123
PB
14885 /*
14886 * SMM temporarily disables VMX, so we cannot be in guest mode,
14887 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
14888 * must be zero.
14889 */
14890 if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags)
14891 return -EINVAL;
14892
8fcc4b59
JM
14893 if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
14894 !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
14895 return -EINVAL;
14896
14897 vmx_leave_nested(vcpu);
14898 if (kvm_state->vmx.vmxon_pa == -1ull)
14899 return 0;
14900
14901 vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa;
14902 ret = enter_vmx_operation(vcpu);
14903 if (ret)
14904 return ret;
14905
a1b0c1c6
VK
14906 /* Empty 'VMXON' state is permitted */
14907 if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12))
14908 return 0;
14909
8cab6507
VK
14910 if (kvm_state->vmx.vmcs_pa != -1ull) {
14911 if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||
14912 !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))
14913 return -EINVAL;
a1b0c1c6 14914
8cab6507
VK
14915 set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);
14916 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
14917 /*
14918 * Sync eVMCS upon entry as we may not have
14919 * HV_X64_MSR_VP_ASSIST_PAGE set up yet.
14920 */
14921 vmx->nested.need_vmcs12_sync = true;
14922 } else {
14923 return -EINVAL;
14924 }
8fcc4b59
JM
14925
14926 if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
14927 vmx->nested.smm.vmxon = true;
14928 vmx->nested.vmxon = false;
14929
14930 if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
14931 vmx->nested.smm.guest_mode = true;
14932 }
14933
14934 vmcs12 = get_vmcs12(vcpu);
14935 if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12)))
14936 return -EFAULT;
14937
392b2f25 14938 if (vmcs12->hdr.revision_id != VMCS12_REVISION)
8fcc4b59
JM
14939 return -EINVAL;
14940
14941 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
14942 return 0;
14943
14944 vmx->nested.nested_run_pending =
14945 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
14946
fa58a9fa
PB
14947 if (nested_cpu_has_shadow_vmcs(vmcs12) &&
14948 vmcs12->vmcs_link_pointer != -1ull) {
14949 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
14950 if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12))
14951 return -EINVAL;
14952
14953 if (copy_from_user(shadow_vmcs12,
14954 user_kvm_nested_state->data + VMCS12_SIZE,
14955 sizeof(*vmcs12)))
14956 return -EFAULT;
14957
14958 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
14959 !shadow_vmcs12->hdr.shadow_vmcs)
14960 return -EINVAL;
14961 }
14962
8fcc4b59
JM
14963 if (check_vmentry_prereqs(vcpu, vmcs12) ||
14964 check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
14965 return -EINVAL;
14966
8fcc4b59 14967 vmx->nested.dirty_vmcs12 = true;
a633e41e 14968 ret = nested_vmx_enter_non_root_mode(vcpu, false);
8fcc4b59
JM
14969 if (ret)
14970 return -EINVAL;
14971
14972 return 0;
14973}
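
These two callbacks back the KVM_GET_NESTED_STATE and KVM_SET_NESTED_STATE vCPU ioctls used to migrate nested guests. The sketch below is a hedged illustration of how userspace (e.g. a VMM) might drive the GET side, assuming the usual contract that KVM_CHECK_EXTENSION(KVM_CAP_NESTED_STATE) reports an upper bound on the total state size; the function name and the kvm_fd/vcpu_fd parameters are hypothetical, and error handling is trimmed.

/*
 * Illustrative userspace sketch only (not part of vmx.c): saving nested
 * state via KVM_GET_NESTED_STATE. Assumes the fds were opened elsewhere.
 */
#include <linux/kvm.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static struct kvm_nested_state *save_nested_state(int kvm_fd, int vcpu_fd)
{
	int max_size = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_NESTED_STATE);
	struct kvm_nested_state *state;

	if (max_size < (int)sizeof(*state))
		return NULL;			/* nested state not supported */

	state = calloc(1, max_size);
	state->size = max_size;			/* tell KVM how big our buffer is */

	if (ioctl(vcpu_fd, KVM_GET_NESTED_STATE, state) < 0) {
		free(state);
		return NULL;
	}
	/* on the destination, the same buffer feeds KVM_SET_NESTED_STATE */
	return state;
}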
14974
404f6aac 14975static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
6aa8b732
AK
14976 .cpu_has_kvm_support = cpu_has_kvm_support,
14977 .disabled_by_bios = vmx_disabled_by_bios,
14978 .hardware_setup = hardware_setup,
14979 .hardware_unsetup = hardware_unsetup,
002c7f7c 14980 .check_processor_compatibility = vmx_check_processor_compat,
6aa8b732
AK
14981 .hardware_enable = hardware_enable,
14982 .hardware_disable = hardware_disable,
04547156 14983 .cpu_has_accelerated_tpr = report_flexpriority,
bc226f07 14984 .has_emulated_msr = vmx_has_emulated_msr,
6aa8b732 14985
b31c114b 14986 .vm_init = vmx_vm_init,
434a1e94
SC
14987 .vm_alloc = vmx_vm_alloc,
14988 .vm_free = vmx_vm_free,
b31c114b 14989
6aa8b732
AK
14990 .vcpu_create = vmx_create_vcpu,
14991 .vcpu_free = vmx_free_vcpu,
04d2cc77 14992 .vcpu_reset = vmx_vcpu_reset,
6aa8b732 14993
6d6095bd 14994 .prepare_guest_switch = vmx_prepare_switch_to_guest,
6aa8b732
AK
14995 .vcpu_load = vmx_vcpu_load,
14996 .vcpu_put = vmx_vcpu_put,
14997
a96036b8 14998 .update_bp_intercept = update_exception_bitmap,
801e459a 14999 .get_msr_feature = vmx_get_msr_feature,
6aa8b732
AK
15000 .get_msr = vmx_get_msr,
15001 .set_msr = vmx_set_msr,
15002 .get_segment_base = vmx_get_segment_base,
15003 .get_segment = vmx_get_segment,
15004 .set_segment = vmx_set_segment,
2e4d2653 15005 .get_cpl = vmx_get_cpl,
6aa8b732 15006 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
e8467fda 15007 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
aff48baa 15008 .decache_cr3 = vmx_decache_cr3,
25c4c276 15009 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
6aa8b732 15010 .set_cr0 = vmx_set_cr0,
6aa8b732
AK
15011 .set_cr3 = vmx_set_cr3,
15012 .set_cr4 = vmx_set_cr4,
6aa8b732 15013 .set_efer = vmx_set_efer,
6aa8b732
AK
15014 .get_idt = vmx_get_idt,
15015 .set_idt = vmx_set_idt,
15016 .get_gdt = vmx_get_gdt,
15017 .set_gdt = vmx_set_gdt,
73aaf249
JK
15018 .get_dr6 = vmx_get_dr6,
15019 .set_dr6 = vmx_set_dr6,
020df079 15020 .set_dr7 = vmx_set_dr7,
81908bf4 15021 .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
5fdbf976 15022 .cache_reg = vmx_cache_reg,
6aa8b732
AK
15023 .get_rflags = vmx_get_rflags,
15024 .set_rflags = vmx_set_rflags,
be94f6b7 15025
6aa8b732 15026 .tlb_flush = vmx_flush_tlb,
faff8758 15027 .tlb_flush_gva = vmx_flush_tlb_gva,
6aa8b732 15028
6aa8b732 15029 .run = vmx_vcpu_run,
6062d012 15030 .handle_exit = vmx_handle_exit,
6aa8b732 15031 .skip_emulated_instruction = skip_emulated_instruction,
2809f5d2
GC
15032 .set_interrupt_shadow = vmx_set_interrupt_shadow,
15033 .get_interrupt_shadow = vmx_get_interrupt_shadow,
102d8325 15034 .patch_hypercall = vmx_patch_hypercall,
2a8067f1 15035 .set_irq = vmx_inject_irq,
95ba8273 15036 .set_nmi = vmx_inject_nmi,
298101da 15037 .queue_exception = vmx_queue_exception,
b463a6f7 15038 .cancel_injection = vmx_cancel_injection,
78646121 15039 .interrupt_allowed = vmx_interrupt_allowed,
95ba8273 15040 .nmi_allowed = vmx_nmi_allowed,
3cfc3092
JK
15041 .get_nmi_mask = vmx_get_nmi_mask,
15042 .set_nmi_mask = vmx_set_nmi_mask,
95ba8273
GN
15043 .enable_nmi_window = enable_nmi_window,
15044 .enable_irq_window = enable_irq_window,
15045 .update_cr8_intercept = update_cr8_intercept,
8d860bbe 15046 .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
38b99173 15047 .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
d62caabb
AS
15048 .get_enable_apicv = vmx_get_enable_apicv,
15049 .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
c7c9c56c 15050 .load_eoi_exitmap = vmx_load_eoi_exitmap,
967235d3 15051 .apicv_post_state_restore = vmx_apicv_post_state_restore,
c7c9c56c
YZ
15052 .hwapic_irr_update = vmx_hwapic_irr_update,
15053 .hwapic_isr_update = vmx_hwapic_isr_update,
e6c67d8c 15054 .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
a20ed54d
YZ
15055 .sync_pir_to_irr = vmx_sync_pir_to_irr,
15056 .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
95ba8273 15057
cbc94022 15058 .set_tss_addr = vmx_set_tss_addr,
2ac52ab8 15059 .set_identity_map_addr = vmx_set_identity_map_addr,
67253af5 15060 .get_tdp_level = get_ept_level,
4b12f0de 15061 .get_mt_mask = vmx_get_mt_mask,
229456fc 15062
586f9607 15063 .get_exit_info = vmx_get_exit_info,
586f9607 15064
17cc3935 15065 .get_lpage_level = vmx_get_lpage_level,
0e851880
SY
15066
15067 .cpuid_update = vmx_cpuid_update,
4e47c7a6
SY
15068
15069 .rdtscp_supported = vmx_rdtscp_supported,
ad756a16 15070 .invpcid_supported = vmx_invpcid_supported,
d4330ef2
JR
15071
15072 .set_supported_cpuid = vmx_set_supported_cpuid,
f5f48ee1
SY
15073
15074 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
99e3e30a 15075
e79f245d 15076 .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
99e3e30a 15077 .write_tsc_offset = vmx_write_tsc_offset,
1c97f0a0
JR
15078
15079 .set_tdp_cr3 = vmx_set_cr3,
8a76d7f2
JR
15080
15081 .check_intercept = vmx_check_intercept,
a547c6db 15082 .handle_external_intr = vmx_handle_external_intr,
da8999d3 15083 .mpx_supported = vmx_mpx_supported,
55412b2e 15084 .xsaves_supported = vmx_xsaves_supported,
66336cab 15085 .umip_emulated = vmx_umip_emulated,
b6b8a145
JK
15086
15087 .check_nested_events = vmx_check_nested_events,
d264ee0c 15088 .request_immediate_exit = vmx_request_immediate_exit,
ae97a3b8
RK
15089
15090 .sched_in = vmx_sched_in,
843e4330
KH
15091
15092 .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
15093 .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
15094 .flush_log_dirty = vmx_flush_log_dirty,
15095 .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
c5f983f6 15096 .write_log_dirty = vmx_write_pml_buffer,
25462f7f 15097
bf9f6ac8
FW
15098 .pre_block = vmx_pre_block,
15099 .post_block = vmx_post_block,
15100
25462f7f 15101 .pmu_ops = &intel_pmu_ops,
efc64404
FW
15102
15103 .update_pi_irte = vmx_update_pi_irte,
64672c95
YJ
15104
15105#ifdef CONFIG_X86_64
15106 .set_hv_timer = vmx_set_hv_timer,
15107 .cancel_hv_timer = vmx_cancel_hv_timer,
15108#endif
c45dcc71
AR
15109
15110 .setup_mce = vmx_setup_mce,
0234bf88 15111
8fcc4b59
JM
15112 .get_nested_state = vmx_get_nested_state,
15113 .set_nested_state = vmx_set_nested_state,
7f7f1ba3
PB
15114 .get_vmcs12_pages = nested_get_vmcs12_pages,
15115
72d7b374 15116 .smi_allowed = vmx_smi_allowed,
0234bf88
LP
15117 .pre_enter_smm = vmx_pre_enter_smm,
15118 .pre_leave_smm = vmx_pre_leave_smm,
cc3d967f 15119 .enable_smi_window = enable_smi_window,
57b119da
VK
15120
15121 .nested_enable_evmcs = nested_enable_evmcs,
6aa8b732
AK
15122};
15123
72c6d2db 15124static void vmx_cleanup_l1d_flush(void)
a47dd5f0
PB
15125{
15126 if (vmx_l1d_flush_pages) {
15127 free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
15128 vmx_l1d_flush_pages = NULL;
15129 }
72c6d2db
TG
15130 /* Restore state so sysfs ignores VMX */
15131 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
a399477e
KRW
15132}
15133
a7b9020b
TG
15134static void vmx_exit(void)
15135{
15136#ifdef CONFIG_KEXEC_CORE
15137 RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
15138 synchronize_rcu();
15139#endif
15140
15141 kvm_exit();
15142
15143#if IS_ENABLED(CONFIG_HYPERV)
15144 if (static_branch_unlikely(&enable_evmcs)) {
15145 int cpu;
15146 struct hv_vp_assist_page *vp_ap;
15147 /*
15148 * Reset everything to support using non-enlightened VMCS
15149 * access later (e.g. when we reload the module with
15150 * enlightened_vmcs=0)
15151 */
15152 for_each_online_cpu(cpu) {
15153 vp_ap = hv_get_vp_assist_page(cpu);
15154
15155 if (!vp_ap)
15156 continue;
15157
15158 vp_ap->current_nested_vmcs = 0;
15159 vp_ap->enlighten_vmentry = 0;
15160 }
15161
15162 static_branch_disable(&enable_evmcs);
15163 }
15164#endif
15165 vmx_cleanup_l1d_flush();
15166}
15167module_exit(vmx_exit);
15168
6aa8b732
AK
15169static int __init vmx_init(void)
15170{
773e8a04
VK
15171 int r;
15172
15173#if IS_ENABLED(CONFIG_HYPERV)
15174 /*
15175 * Enlightened VMCS usage must be recommended by the hypervisor and the
15176 * host needs to support eVMCS v1 or above. eVMCS support can also be
15177 * disabled with the module parameter.
15178 */
15179 if (enlightened_vmcs &&
15180 ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
15181 (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
15182 KVM_EVMCS_VERSION) {
15183 int cpu;
15184
15185 /* Check that we have assist pages on all online CPUs */
15186 for_each_online_cpu(cpu) {
15187 if (!hv_get_vp_assist_page(cpu)) {
15188 enlightened_vmcs = false;
15189 break;
15190 }
15191 }
15192
15193 if (enlightened_vmcs) {
15194 pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
15195 static_branch_enable(&enable_evmcs);
15196 }
15197 } else {
15198 enlightened_vmcs = false;
15199 }
15200#endif
15201
15202 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
a7b9020b 15203 __alignof__(struct vcpu_vmx), THIS_MODULE);
fdef3ad1 15204 if (r)
34a1cd60 15205 return r;
25c5f225 15206
a7b9020b 15207 /*
7db92e16
TG
15208 * Must be called after kvm_init() so enable_ept is properly set
15209 * up. Hand in the mitigation parameter value that was stored by
15210 * the pre-module-init parser. If no parameter was given, it will
15211 * contain 'auto', which will be turned into the default 'cond'
15212 * mitigation mode.
15213 */
15214 if (boot_cpu_has(X86_BUG_L1TF)) {
15215 r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
15216 if (r) {
15217 vmx_exit();
15218 return r;
15219 }
a47dd5f0 15220 }
25c5f225 15221
2965faa5 15222#ifdef CONFIG_KEXEC_CORE
8f536b76
ZY
15223 rcu_assign_pointer(crash_vmclear_loaded_vmcss,
15224 crash_vmclear_local_loaded_vmcss);
15225#endif
21ebf53b 15226 vmx_check_vmcs12_offsets();
8f536b76 15227
fdef3ad1 15228 return 0;
6aa8b732 15229}
a7b9020b 15230module_init(vmx_init);