KVM: VMX: enable nested virtualization by default
[linux-2.6-block.git] / arch / x86 / kvm / vmx.c
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "irq.h"
#include "mmu.h"
#include "cpuid.h"
#include "lapic.h"
#include "hyperv.h"

#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/moduleparam.h>
#include <linux/mod_devicetable.h>
#include <linux/trace_events.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/hrtimer.h>
#include <linux/frame.h>
#include <linux/nospec.h>
#include "kvm_cache_regs.h"
#include "x86.h"

#include <asm/asm.h>
#include <asm/cpu.h>
#include <asm/io.h>
#include <asm/desc.h>
#include <asm/vmx.h>
#include <asm/virtext.h>
#include <asm/mce.h>
#include <asm/fpu/internal.h>
#include <asm/perf_event.h>
#include <asm/debugreg.h>
#include <asm/kexec.h>
#include <asm/apic.h>
#include <asm/irq_remapping.h>
#include <asm/mmu_context.h>
#include <asm/spec-ctrl.h>
#include <asm/mshyperv.h>

#include "trace.h"
#include "pmu.h"
#include "vmx_evmcs.h"

#define __ex(x) __kvm_handle_fault_on_reboot(x)
#define __ex_clear(x, reg) \
	____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg)

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

static const struct x86_cpu_id vmx_cpu_id[] = {
	X86_FEATURE_MATCH(X86_FEATURE_VMX),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);

static bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);

static bool __read_mostly enable_vnmi = 1;
module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);

static bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);

static bool __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, S_IRUGO);

static bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
			enable_unrestricted_guest, bool, S_IRUGO);

static bool __read_mostly enable_ept_ad_bits = 1;
module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);

static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);

static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);

static bool __read_mostly enable_apicv = 1;
module_param(enable_apicv, bool, S_IRUGO);

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
/*
 * If nested=1, nested virtualization is supported, i.e., guests may use
 * VMX and be a hypervisor for their own guests. If nested=0, guests may not
 * use VMX instructions.
 */
static bool __read_mostly nested = 1;
module_param(nested, bool, S_IRUGO);
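/*
 * Editorial note (hedged, not upstream text): with this change nested VMX
 * support is on by default, so KVM advertises VMX capability to userspace
 * unless the administrator turns it off explicitly, e.g.
 * "modprobe kvm_intel nested=0" or "kvm-intel.nested=0" on the host kernel
 * command line (assuming the usual kvm-intel module naming).
 */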

static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);

static u64 __read_mostly host_xss;

static bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);

#define MSR_TYPE_R	1
#define MSR_TYPE_W	2
#define MSR_TYPE_RW	3

#define MSR_BITMAP_MODE_X2APIC		1
#define MSR_BITMAP_MODE_X2APIC_APICV	2

#define KVM_VMX_TSC_MULTIPLIER_MAX	0xffffffffffffffffULL

/* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
static int __read_mostly cpu_preemption_timer_multi;
static bool __read_mostly enable_preemption_timer = 1;
#ifdef CONFIG_X86_64
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif

#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON				\
	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST |	\
	 X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
#define KVM_CR4_GUEST_OWNED_BITS				\
	(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR	\
	 | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)

#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

/*
 * These two parameters are used to configure the controls for Pause-Loop
 * Exiting:
 * ple_gap:    upper bound on the amount of time between two successive
 *             executions of PAUSE in a loop. Also indicates whether PLE is
 *             enabled. According to tests, this time is usually smaller than
 *             128 cycles.
 * ple_window: upper bound on the amount of time a guest is allowed to execute
 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
 *             less than 2^12 cycles.
 * Time is measured based on a counter that runs at the same rate as the TSC,
 * refer to SDM volume 3b sections 21.6.13 & 22.1.3.
 */
static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;

static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, uint, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(ple_window_grow, uint, 0444);

/* Default resets per-vcpu window every exit to ple_window. */
static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(ple_window_shrink, uint, 0444);

/* Default is to compute the maximum so we can never overflow. */
static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);
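/*
 * Editorial sketch (hedged, not upstream code): with the defaults above the
 * per-vcpu window is adjusted roughly as
 *
 *	grow:   new = min(old * ple_window_grow, ple_window_max)
 *	        (default grow factor is 2, so the window doubles per exit)
 *	shrink: with the default ple_window_shrink of 0, the window is simply
 *	        reset to ple_window; otherwise it is divided by the factor.
 *
 * The exact clamping helpers live elsewhere in KVM, so treat this only as
 * an approximation of the policy described in the comments above.
 */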

extern const ulong vmx_return;
extern const ulong vmx_early_consistency_check_return;

static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
static DEFINE_MUTEX(vmx_l1d_flush_mutex);

/* Storage for pre module init parameter parsing */
static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;

static const struct {
	const char *option;
	bool for_parse;
} vmentry_l1d_param[] = {
	[VMENTER_L1D_FLUSH_AUTO]	 = {"auto", true},
	[VMENTER_L1D_FLUSH_NEVER]	 = {"never", true},
	[VMENTER_L1D_FLUSH_COND]	 = {"cond", true},
	[VMENTER_L1D_FLUSH_ALWAYS]	 = {"always", true},
	[VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
	[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
};

#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;

static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
	struct page *page;
	unsigned int i;

	if (!enable_ept) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
		return 0;
	}

	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
		u64 msr;

		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
		if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
			l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
			return 0;
		}
	}

	/* If set to auto use the default l1tf mitigation method */
	if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
		switch (l1tf_mitigation) {
		case L1TF_MITIGATION_OFF:
			l1tf = VMENTER_L1D_FLUSH_NEVER;
			break;
		case L1TF_MITIGATION_FLUSH_NOWARN:
		case L1TF_MITIGATION_FLUSH:
		case L1TF_MITIGATION_FLUSH_NOSMT:
			l1tf = VMENTER_L1D_FLUSH_COND;
			break;
		case L1TF_MITIGATION_FULL:
		case L1TF_MITIGATION_FULL_FORCE:
			l1tf = VMENTER_L1D_FLUSH_ALWAYS;
			break;
		}
	} else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
		l1tf = VMENTER_L1D_FLUSH_ALWAYS;
	}

	if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
	    !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
		page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
		if (!page)
			return -ENOMEM;
		vmx_l1d_flush_pages = page_address(page);

		/*
		 * Initialize each page with a different pattern in
		 * order to protect against KSM in the nested
		 * virtualization case.
		 */
		for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
			memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
			       PAGE_SIZE);
		}
	}

	l1tf_vmx_mitigation = l1tf;

	if (l1tf != VMENTER_L1D_FLUSH_NEVER)
		static_branch_enable(&vmx_l1d_should_flush);
	else
		static_branch_disable(&vmx_l1d_should_flush);

	if (l1tf == VMENTER_L1D_FLUSH_COND)
		static_branch_enable(&vmx_l1d_flush_cond);
	else
		static_branch_disable(&vmx_l1d_flush_cond);
	return 0;
}

static int vmentry_l1d_flush_parse(const char *s)
{
	unsigned int i;

	if (s) {
		for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
			if (vmentry_l1d_param[i].for_parse &&
			    sysfs_streq(s, vmentry_l1d_param[i].option))
				return i;
		}
	}
	return -EINVAL;
}

static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
{
	int l1tf, ret;

	l1tf = vmentry_l1d_flush_parse(s);
	if (l1tf < 0)
		return l1tf;

	if (!boot_cpu_has(X86_BUG_L1TF))
		return 0;

	/*
	 * Has vmx_init() run already? If not then this is the pre init
	 * parameter parsing. In that case just store the value and let
	 * vmx_init() do the proper setup after enable_ept has been
	 * established.
	 */
	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
		vmentry_l1d_flush_param = l1tf;
		return 0;
	}

	mutex_lock(&vmx_l1d_flush_mutex);
	ret = vmx_setup_l1d_flush(l1tf);
	mutex_unlock(&vmx_l1d_flush_mutex);
	return ret;
}

static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
{
	if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
		return sprintf(s, "???\n");

	return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}

static const struct kernel_param_ops vmentry_l1d_flush_ops = {
	.set = vmentry_l1d_flush_set,
	.get = vmentry_l1d_flush_get,
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
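/*
 * Editorial note (hedged): this is the knob documented for the L1TF
 * mitigation.  The flush mode can be chosen at boot, e.g.
 *
 *	kvm-intel.vmentry_l1d_flush=never
 *
 * on the host kernel command line, and, since the 0644 permission above
 * makes it writable, changed at runtime through
 * /sys/module/kvm_intel/parameters/vmentry_l1d_flush.
 */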

enum ept_pointers_status {
	EPT_POINTERS_CHECK = 0,
	EPT_POINTERS_MATCH = 1,
	EPT_POINTERS_MISMATCH = 2
};

struct kvm_vmx {
	struct kvm kvm;

	unsigned int tss_addr;
	bool ept_identity_pagetable_done;
	gpa_t ept_identity_map_addr;

	enum ept_pointers_status ept_pointers_match;
	spinlock_t ept_pointer_lock;
};

#define NR_AUTOLOAD_MSRS 8

struct vmcs_hdr {
	u32 revision_id:31;
	u32 shadow_vmcs:1;
};

struct vmcs {
	struct vmcs_hdr hdr;
	u32 abort;
	char data[0];
};

/*
 * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT
 * and whose values change infrequently, but are not constant.  I.e. this is
 * used as a write-through cache of the corresponding VMCS fields.
 */
struct vmcs_host_state {
	unsigned long cr3;	/* May not match real cr3 */
	unsigned long cr4;	/* May not match real cr4 */
	unsigned long gs_base;
	unsigned long fs_base;

	u16           fs_sel, gs_sel, ldt_sel;
#ifdef CONFIG_X86_64
	u16           ds_sel, es_sel;
#endif
};

/*
 * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
 * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
 * loaded on this CPU (so we can clear them if the CPU goes down).
 */
struct loaded_vmcs {
	struct vmcs *vmcs;
	struct vmcs *shadow_vmcs;
	int cpu;
	bool launched;
	bool nmi_known_unmasked;
	bool hv_timer_armed;
	/* Support for vnmi-less CPUs */
	int soft_vnmi_blocked;
	ktime_t entry_time;
	s64 vnmi_blocked_time;
	unsigned long *msr_bitmap;
	struct list_head loaded_vmcss_on_cpu_link;
	struct vmcs_host_state host_state;
};

struct shared_msr_entry {
	unsigned index;
	u64 data;
	u64 mask;
};

/*
 * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
 * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
 * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
 * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
 * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
 * More than one of these structures may exist, if L1 runs multiple L2 guests.
 * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
 * underlying hardware which will be used to run L2.
 * This structure is packed to ensure that its layout is identical across
 * machines (necessary for live migration).
 *
 * IMPORTANT: Changing the layout of existing fields in this structure
 * will break save/restore compatibility with older kvm releases. When
 * adding new fields, either use space in the reserved padding* arrays
 * or add the new fields to the end of the structure.
 */
typedef u64 natural_width;
struct __packed vmcs12 {
	/* According to the Intel spec, a VMCS region must start with the
	 * following two fields. Then follow implementation-specific data.
	 */
	struct vmcs_hdr hdr;
	u32 abort;

	u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
	u32 padding[7]; /* room for future expansion */

	u64 io_bitmap_a;
	u64 io_bitmap_b;
	u64 msr_bitmap;
	u64 vm_exit_msr_store_addr;
	u64 vm_exit_msr_load_addr;
	u64 vm_entry_msr_load_addr;
	u64 tsc_offset;
	u64 virtual_apic_page_addr;
	u64 apic_access_addr;
	u64 posted_intr_desc_addr;
	u64 ept_pointer;
	u64 eoi_exit_bitmap0;
	u64 eoi_exit_bitmap1;
	u64 eoi_exit_bitmap2;
	u64 eoi_exit_bitmap3;
	u64 xss_exit_bitmap;
	u64 guest_physical_address;
	u64 vmcs_link_pointer;
	u64 guest_ia32_debugctl;
	u64 guest_ia32_pat;
	u64 guest_ia32_efer;
	u64 guest_ia32_perf_global_ctrl;
	u64 guest_pdptr0;
	u64 guest_pdptr1;
	u64 guest_pdptr2;
	u64 guest_pdptr3;
	u64 guest_bndcfgs;
	u64 host_ia32_pat;
	u64 host_ia32_efer;
	u64 host_ia32_perf_global_ctrl;
	u64 vmread_bitmap;
	u64 vmwrite_bitmap;
	u64 vm_function_control;
	u64 eptp_list_address;
	u64 pml_address;
	u64 padding64[3]; /* room for future expansion */
	/*
	 * To allow migration of L1 (complete with its L2 guests) between
	 * machines of different natural widths (32 or 64 bit), we cannot have
	 * unsigned long fields with no explicit size. We use u64 (aliased
	 * natural_width) instead. Luckily, x86 is little-endian.
	 */
	natural_width cr0_guest_host_mask;
	natural_width cr4_guest_host_mask;
	natural_width cr0_read_shadow;
	natural_width cr4_read_shadow;
	natural_width cr3_target_value0;
	natural_width cr3_target_value1;
	natural_width cr3_target_value2;
	natural_width cr3_target_value3;
	natural_width exit_qualification;
	natural_width guest_linear_address;
	natural_width guest_cr0;
	natural_width guest_cr3;
	natural_width guest_cr4;
	natural_width guest_es_base;
	natural_width guest_cs_base;
	natural_width guest_ss_base;
	natural_width guest_ds_base;
	natural_width guest_fs_base;
	natural_width guest_gs_base;
	natural_width guest_ldtr_base;
	natural_width guest_tr_base;
	natural_width guest_gdtr_base;
	natural_width guest_idtr_base;
	natural_width guest_dr7;
	natural_width guest_rsp;
	natural_width guest_rip;
	natural_width guest_rflags;
	natural_width guest_pending_dbg_exceptions;
	natural_width guest_sysenter_esp;
	natural_width guest_sysenter_eip;
	natural_width host_cr0;
	natural_width host_cr3;
	natural_width host_cr4;
	natural_width host_fs_base;
	natural_width host_gs_base;
	natural_width host_tr_base;
	natural_width host_gdtr_base;
	natural_width host_idtr_base;
	natural_width host_ia32_sysenter_esp;
	natural_width host_ia32_sysenter_eip;
	natural_width host_rsp;
	natural_width host_rip;
	natural_width paddingl[8]; /* room for future expansion */
	u32 pin_based_vm_exec_control;
	u32 cpu_based_vm_exec_control;
	u32 exception_bitmap;
	u32 page_fault_error_code_mask;
	u32 page_fault_error_code_match;
	u32 cr3_target_count;
	u32 vm_exit_controls;
	u32 vm_exit_msr_store_count;
	u32 vm_exit_msr_load_count;
	u32 vm_entry_controls;
	u32 vm_entry_msr_load_count;
	u32 vm_entry_intr_info_field;
	u32 vm_entry_exception_error_code;
	u32 vm_entry_instruction_len;
	u32 tpr_threshold;
	u32 secondary_vm_exec_control;
	u32 vm_instruction_error;
	u32 vm_exit_reason;
	u32 vm_exit_intr_info;
	u32 vm_exit_intr_error_code;
	u32 idt_vectoring_info_field;
	u32 idt_vectoring_error_code;
	u32 vm_exit_instruction_len;
	u32 vmx_instruction_info;
	u32 guest_es_limit;
	u32 guest_cs_limit;
	u32 guest_ss_limit;
	u32 guest_ds_limit;
	u32 guest_fs_limit;
	u32 guest_gs_limit;
	u32 guest_ldtr_limit;
	u32 guest_tr_limit;
	u32 guest_gdtr_limit;
	u32 guest_idtr_limit;
	u32 guest_es_ar_bytes;
	u32 guest_cs_ar_bytes;
	u32 guest_ss_ar_bytes;
	u32 guest_ds_ar_bytes;
	u32 guest_fs_ar_bytes;
	u32 guest_gs_ar_bytes;
	u32 guest_ldtr_ar_bytes;
	u32 guest_tr_ar_bytes;
	u32 guest_interruptibility_info;
	u32 guest_activity_state;
	u32 guest_sysenter_cs;
	u32 host_ia32_sysenter_cs;
	u32 vmx_preemption_timer_value;
	u32 padding32[7]; /* room for future expansion */
	u16 virtual_processor_id;
	u16 posted_intr_nv;
	u16 guest_es_selector;
	u16 guest_cs_selector;
	u16 guest_ss_selector;
	u16 guest_ds_selector;
	u16 guest_fs_selector;
	u16 guest_gs_selector;
	u16 guest_ldtr_selector;
	u16 guest_tr_selector;
	u16 guest_intr_status;
	u16 host_es_selector;
	u16 host_cs_selector;
	u16 host_ss_selector;
	u16 host_ds_selector;
	u16 host_fs_selector;
	u16 host_gs_selector;
	u16 host_tr_selector;
	u16 guest_pml_index;
};

/*
 * For save/restore compatibility, the vmcs12 field offsets must not change.
 */
#define CHECK_OFFSET(field, loc) \
	BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc), \
			 "Offset of " #field " in struct vmcs12 has changed.")

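/*
 * Editorial note (hedged): CHECK_OFFSET() turns a layout regression into a
 * build failure rather than a subtle save/restore break.  For example,
 * inserting a new field before launch_state would shift its offset away
 * from 8 and trip CHECK_OFFSET(launch_state, 8) below at compile time.
 */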
static inline void vmx_check_vmcs12_offsets(void) {
	CHECK_OFFSET(hdr, 0);
	CHECK_OFFSET(abort, 4);
	CHECK_OFFSET(launch_state, 8);
	CHECK_OFFSET(io_bitmap_a, 40);
	CHECK_OFFSET(io_bitmap_b, 48);
	CHECK_OFFSET(msr_bitmap, 56);
	CHECK_OFFSET(vm_exit_msr_store_addr, 64);
	CHECK_OFFSET(vm_exit_msr_load_addr, 72);
	CHECK_OFFSET(vm_entry_msr_load_addr, 80);
	CHECK_OFFSET(tsc_offset, 88);
	CHECK_OFFSET(virtual_apic_page_addr, 96);
	CHECK_OFFSET(apic_access_addr, 104);
	CHECK_OFFSET(posted_intr_desc_addr, 112);
	CHECK_OFFSET(ept_pointer, 120);
	CHECK_OFFSET(eoi_exit_bitmap0, 128);
	CHECK_OFFSET(eoi_exit_bitmap1, 136);
	CHECK_OFFSET(eoi_exit_bitmap2, 144);
	CHECK_OFFSET(eoi_exit_bitmap3, 152);
	CHECK_OFFSET(xss_exit_bitmap, 160);
	CHECK_OFFSET(guest_physical_address, 168);
	CHECK_OFFSET(vmcs_link_pointer, 176);
	CHECK_OFFSET(guest_ia32_debugctl, 184);
	CHECK_OFFSET(guest_ia32_pat, 192);
	CHECK_OFFSET(guest_ia32_efer, 200);
	CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208);
	CHECK_OFFSET(guest_pdptr0, 216);
	CHECK_OFFSET(guest_pdptr1, 224);
	CHECK_OFFSET(guest_pdptr2, 232);
	CHECK_OFFSET(guest_pdptr3, 240);
	CHECK_OFFSET(guest_bndcfgs, 248);
	CHECK_OFFSET(host_ia32_pat, 256);
	CHECK_OFFSET(host_ia32_efer, 264);
	CHECK_OFFSET(host_ia32_perf_global_ctrl, 272);
	CHECK_OFFSET(vmread_bitmap, 280);
	CHECK_OFFSET(vmwrite_bitmap, 288);
	CHECK_OFFSET(vm_function_control, 296);
	CHECK_OFFSET(eptp_list_address, 304);
	CHECK_OFFSET(pml_address, 312);
	CHECK_OFFSET(cr0_guest_host_mask, 344);
	CHECK_OFFSET(cr4_guest_host_mask, 352);
	CHECK_OFFSET(cr0_read_shadow, 360);
	CHECK_OFFSET(cr4_read_shadow, 368);
	CHECK_OFFSET(cr3_target_value0, 376);
	CHECK_OFFSET(cr3_target_value1, 384);
	CHECK_OFFSET(cr3_target_value2, 392);
	CHECK_OFFSET(cr3_target_value3, 400);
	CHECK_OFFSET(exit_qualification, 408);
	CHECK_OFFSET(guest_linear_address, 416);
	CHECK_OFFSET(guest_cr0, 424);
	CHECK_OFFSET(guest_cr3, 432);
	CHECK_OFFSET(guest_cr4, 440);
	CHECK_OFFSET(guest_es_base, 448);
	CHECK_OFFSET(guest_cs_base, 456);
	CHECK_OFFSET(guest_ss_base, 464);
	CHECK_OFFSET(guest_ds_base, 472);
	CHECK_OFFSET(guest_fs_base, 480);
	CHECK_OFFSET(guest_gs_base, 488);
	CHECK_OFFSET(guest_ldtr_base, 496);
	CHECK_OFFSET(guest_tr_base, 504);
	CHECK_OFFSET(guest_gdtr_base, 512);
	CHECK_OFFSET(guest_idtr_base, 520);
	CHECK_OFFSET(guest_dr7, 528);
	CHECK_OFFSET(guest_rsp, 536);
	CHECK_OFFSET(guest_rip, 544);
	CHECK_OFFSET(guest_rflags, 552);
	CHECK_OFFSET(guest_pending_dbg_exceptions, 560);
	CHECK_OFFSET(guest_sysenter_esp, 568);
	CHECK_OFFSET(guest_sysenter_eip, 576);
	CHECK_OFFSET(host_cr0, 584);
	CHECK_OFFSET(host_cr3, 592);
	CHECK_OFFSET(host_cr4, 600);
	CHECK_OFFSET(host_fs_base, 608);
	CHECK_OFFSET(host_gs_base, 616);
	CHECK_OFFSET(host_tr_base, 624);
	CHECK_OFFSET(host_gdtr_base, 632);
	CHECK_OFFSET(host_idtr_base, 640);
	CHECK_OFFSET(host_ia32_sysenter_esp, 648);
	CHECK_OFFSET(host_ia32_sysenter_eip, 656);
	CHECK_OFFSET(host_rsp, 664);
	CHECK_OFFSET(host_rip, 672);
	CHECK_OFFSET(pin_based_vm_exec_control, 744);
	CHECK_OFFSET(cpu_based_vm_exec_control, 748);
	CHECK_OFFSET(exception_bitmap, 752);
	CHECK_OFFSET(page_fault_error_code_mask, 756);
	CHECK_OFFSET(page_fault_error_code_match, 760);
	CHECK_OFFSET(cr3_target_count, 764);
	CHECK_OFFSET(vm_exit_controls, 768);
	CHECK_OFFSET(vm_exit_msr_store_count, 772);
	CHECK_OFFSET(vm_exit_msr_load_count, 776);
	CHECK_OFFSET(vm_entry_controls, 780);
	CHECK_OFFSET(vm_entry_msr_load_count, 784);
	CHECK_OFFSET(vm_entry_intr_info_field, 788);
	CHECK_OFFSET(vm_entry_exception_error_code, 792);
	CHECK_OFFSET(vm_entry_instruction_len, 796);
	CHECK_OFFSET(tpr_threshold, 800);
	CHECK_OFFSET(secondary_vm_exec_control, 804);
	CHECK_OFFSET(vm_instruction_error, 808);
	CHECK_OFFSET(vm_exit_reason, 812);
	CHECK_OFFSET(vm_exit_intr_info, 816);
	CHECK_OFFSET(vm_exit_intr_error_code, 820);
	CHECK_OFFSET(idt_vectoring_info_field, 824);
	CHECK_OFFSET(idt_vectoring_error_code, 828);
	CHECK_OFFSET(vm_exit_instruction_len, 832);
	CHECK_OFFSET(vmx_instruction_info, 836);
	CHECK_OFFSET(guest_es_limit, 840);
	CHECK_OFFSET(guest_cs_limit, 844);
	CHECK_OFFSET(guest_ss_limit, 848);
	CHECK_OFFSET(guest_ds_limit, 852);
	CHECK_OFFSET(guest_fs_limit, 856);
	CHECK_OFFSET(guest_gs_limit, 860);
	CHECK_OFFSET(guest_ldtr_limit, 864);
	CHECK_OFFSET(guest_tr_limit, 868);
	CHECK_OFFSET(guest_gdtr_limit, 872);
	CHECK_OFFSET(guest_idtr_limit, 876);
	CHECK_OFFSET(guest_es_ar_bytes, 880);
	CHECK_OFFSET(guest_cs_ar_bytes, 884);
	CHECK_OFFSET(guest_ss_ar_bytes, 888);
	CHECK_OFFSET(guest_ds_ar_bytes, 892);
	CHECK_OFFSET(guest_fs_ar_bytes, 896);
	CHECK_OFFSET(guest_gs_ar_bytes, 900);
	CHECK_OFFSET(guest_ldtr_ar_bytes, 904);
	CHECK_OFFSET(guest_tr_ar_bytes, 908);
	CHECK_OFFSET(guest_interruptibility_info, 912);
	CHECK_OFFSET(guest_activity_state, 916);
	CHECK_OFFSET(guest_sysenter_cs, 920);
	CHECK_OFFSET(host_ia32_sysenter_cs, 924);
	CHECK_OFFSET(vmx_preemption_timer_value, 928);
	CHECK_OFFSET(virtual_processor_id, 960);
	CHECK_OFFSET(posted_intr_nv, 962);
	CHECK_OFFSET(guest_es_selector, 964);
	CHECK_OFFSET(guest_cs_selector, 966);
	CHECK_OFFSET(guest_ss_selector, 968);
	CHECK_OFFSET(guest_ds_selector, 970);
	CHECK_OFFSET(guest_fs_selector, 972);
	CHECK_OFFSET(guest_gs_selector, 974);
	CHECK_OFFSET(guest_ldtr_selector, 976);
	CHECK_OFFSET(guest_tr_selector, 978);
	CHECK_OFFSET(guest_intr_status, 980);
	CHECK_OFFSET(host_es_selector, 982);
	CHECK_OFFSET(host_cs_selector, 984);
	CHECK_OFFSET(host_ss_selector, 986);
	CHECK_OFFSET(host_ds_selector, 988);
	CHECK_OFFSET(host_fs_selector, 990);
	CHECK_OFFSET(host_gs_selector, 992);
	CHECK_OFFSET(host_tr_selector, 994);
	CHECK_OFFSET(guest_pml_index, 996);
}

/*
 * VMCS12_REVISION is an arbitrary id that should be changed if the content or
 * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
 * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
 *
 * IMPORTANT: Changing this value will break save/restore compatibility with
 * older kvm releases.
 */
#define VMCS12_REVISION 0x11e57ed0

/*
 * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
 * and any VMCS region. Although only sizeof(struct vmcs12) are used by the
 * current implementation, 4K are reserved to avoid future complications.
 */
#define VMCS12_SIZE 0x1000

/*
 * VMCS12_MAX_FIELD_INDEX is the highest index value used in any
 * supported VMCS12 field encoding.
 */
#define VMCS12_MAX_FIELD_INDEX 0x17

struct nested_vmx_msrs {
	/*
	 * We only store the "true" versions of the VMX capability MSRs. We
	 * generate the "non-true" versions by setting the must-be-1 bits
	 * according to the SDM.
	 */
	u32 procbased_ctls_low;
	u32 procbased_ctls_high;
	u32 secondary_ctls_low;
	u32 secondary_ctls_high;
	u32 pinbased_ctls_low;
	u32 pinbased_ctls_high;
	u32 exit_ctls_low;
	u32 exit_ctls_high;
	u32 entry_ctls_low;
	u32 entry_ctls_high;
	u32 misc_low;
	u32 misc_high;
	u32 ept_caps;
	u32 vpid_caps;
	u64 basic;
	u64 cr0_fixed0;
	u64 cr0_fixed1;
	u64 cr4_fixed0;
	u64 cr4_fixed1;
	u64 vmcs_enum;
	u64 vmfunc_controls;
};

/*
 * The nested_vmx structure is part of vcpu_vmx, and holds information we need
 * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
 */
struct nested_vmx {
	/* Has the level1 guest done vmxon? */
	bool vmxon;
	gpa_t vmxon_ptr;
	bool pml_full;

	/* The guest-physical address of the current VMCS L1 keeps for L2 */
	gpa_t current_vmptr;
	/*
	 * Cache of the guest's VMCS, existing outside of guest memory.
	 * Loaded from guest memory during VMPTRLD. Flushed to guest
	 * memory during VMCLEAR and VMPTRLD.
	 */
	struct vmcs12 *cached_vmcs12;
	/*
	 * Cache of the guest's shadow VMCS, existing outside of guest
	 * memory. Loaded from guest memory during VM entry. Flushed
	 * to guest memory during VM exit.
	 */
	struct vmcs12 *cached_shadow_vmcs12;
	/*
	 * Indicates if the shadow vmcs or enlightened vmcs must be updated
	 * with the data held by struct vmcs12.
	 */
	bool need_vmcs12_sync;
	bool dirty_vmcs12;

	/*
	 * vmcs02 has been initialized, i.e. state that is constant for
	 * vmcs02 has been written to the backing VMCS.  Initialization
	 * is delayed until L1 actually attempts to run a nested VM.
	 */
	bool vmcs02_initialized;

	bool change_vmcs01_virtual_apic_mode;

	/*
	 * Enlightened VMCS has been enabled. It does not mean that L1 has to
	 * use it. However, VMX features available to L1 will be limited based
	 * on what the enlightened VMCS supports.
	 */
	bool enlightened_vmcs_enabled;

	/* L2 must run next, and mustn't decide to exit to L1. */
	bool nested_run_pending;

	struct loaded_vmcs vmcs02;

	/*
	 * Guest pages referred to in the vmcs02 with host-physical
	 * pointers, so we must keep them pinned while L2 runs.
	 */
	struct page *apic_access_page;
	struct page *virtual_apic_page;
	struct page *pi_desc_page;
	struct pi_desc *pi_desc;
	bool pi_pending;
	u16 posted_intr_nv;

	struct hrtimer preemption_timer;
	bool preemption_timer_expired;

	/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
	u64 vmcs01_debugctl;
	u64 vmcs01_guest_bndcfgs;

	u16 vpid02;
	u16 last_vpid;

	struct nested_vmx_msrs msrs;

	/* SMM related state */
	struct {
		/* in VMX operation on SMM entry? */
		bool vmxon;
		/* in guest mode on SMM entry? */
		bool guest_mode;
	} smm;

	gpa_t hv_evmcs_vmptr;
	struct page *hv_evmcs_page;
	struct hv_enlightened_vmcs *hv_evmcs;
};

#define POSTED_INTR_ON  0
#define POSTED_INTR_SN  1

/* Posted-Interrupt Descriptor */
struct pi_desc {
	u32 pir[8];     /* Posted interrupt requested */
	union {
		struct {
				/* bit 256 - Outstanding Notification */
			u16	on	: 1,
				/* bit 257 - Suppress Notification */
				sn	: 1,
				/* bit 271:258 - Reserved */
				rsvd_1	: 14;
				/* bit 279:272 - Notification Vector */
			u8	nv;
				/* bit 287:280 - Reserved */
			u8	rsvd_2;
				/* bit 319:288 - Notification Destination */
			u32	ndst;
		};
		u64 control;
	};
	u32 rsvd[6];
} __aligned(64);

static bool pi_test_and_set_on(struct pi_desc *pi_desc)
{
	return test_and_set_bit(POSTED_INTR_ON,
			(unsigned long *)&pi_desc->control);
}

static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
{
	return test_and_clear_bit(POSTED_INTR_ON,
			(unsigned long *)&pi_desc->control);
}

static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
{
	return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}

static inline void pi_clear_sn(struct pi_desc *pi_desc)
{
	return clear_bit(POSTED_INTR_SN,
			(unsigned long *)&pi_desc->control);
}

static inline void pi_set_sn(struct pi_desc *pi_desc)
{
	return set_bit(POSTED_INTR_SN,
			(unsigned long *)&pi_desc->control);
}

static inline void pi_clear_on(struct pi_desc *pi_desc)
{
	clear_bit(POSTED_INTR_ON,
		  (unsigned long *)&pi_desc->control);
}

static inline int pi_test_on(struct pi_desc *pi_desc)
{
	return test_bit(POSTED_INTR_ON,
			(unsigned long *)&pi_desc->control);
}

static inline int pi_test_sn(struct pi_desc *pi_desc)
{
	return test_bit(POSTED_INTR_SN,
			(unsigned long *)&pi_desc->control);
}
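/*
 * Editorial sketch (hedged, not upstream code): the helpers above are
 * typically used in a two-step posting sequence, roughly
 *
 *	if (pi_test_and_set_pir(vector, pi_desc))
 *		return;				// already pending
 *	if (!pi_test_and_set_on(pi_desc))
 *		send notification IPI;		// kick the target CPU
 *
 * mirroring what the posted-interrupt delivery path later in this file
 * does; treat the exact control flow here as illustrative only.
 */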

struct vmx_msrs {
	unsigned int		nr;
	struct vmx_msr_entry	val[NR_AUTOLOAD_MSRS];
};

struct vcpu_vmx {
	struct kvm_vcpu       vcpu;
	unsigned long         host_rsp;
	u8                    fail;
	u8		      msr_bitmap_mode;
	u32                   exit_intr_info;
	u32                   idt_vectoring_info;
	ulong                 rflags;
	struct shared_msr_entry *guest_msrs;
	int                   nmsrs;
	int                   save_nmsrs;
	unsigned long	      host_idt_base;
#ifdef CONFIG_X86_64
	u64		      msr_host_kernel_gs_base;
	u64		      msr_guest_kernel_gs_base;
#endif

	u64		      arch_capabilities;
	u64		      spec_ctrl;

	u32 vm_entry_controls_shadow;
	u32 vm_exit_controls_shadow;
	u32 secondary_exec_control;

	/*
	 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
	 * non-nested (L1) guest, it always points to vmcs01. For a nested
	 * guest (L2), it points to a different VMCS.  loaded_cpu_state points
	 * to the VMCS whose state is loaded into the CPU registers that only
	 * need to be switched when transitioning to/from the kernel; a NULL
	 * value indicates that host state is loaded.
	 */
	struct loaded_vmcs    vmcs01;
	struct loaded_vmcs   *loaded_vmcs;
	struct loaded_vmcs   *loaded_cpu_state;
	bool                  __launched; /* temporary, used in vmx_vcpu_run */
	struct msr_autoload {
		struct vmx_msrs guest;
		struct vmx_msrs host;
	} msr_autoload;

	struct {
		int vm86_active;
		ulong save_rflags;
		struct kvm_segment segs[8];
	} rmode;
	struct {
		u32 bitmask; /* 4 bits per segment (1 bit per field) */
		struct kvm_save_segment {
			u16 selector;
			unsigned long base;
			u32 limit;
			u32 ar;
		} seg[8];
	} segment_cache;
	int vpid;
	bool emulation_required;

	u32 exit_reason;

	/* Posted interrupt descriptor */
	struct pi_desc pi_desc;

	/* Support for a guest hypervisor (nested VMX) */
	struct nested_vmx nested;

	/* Dynamic PLE window. */
	int ple_window;
	bool ple_window_dirty;

	bool req_immediate_exit;

	/* Support for PML */
#define PML_ENTITY_NUM 512
	struct page *pml_pg;

	/* apic deadline value in host tsc */
	u64 hv_deadline_tsc;

	u64 current_tsc_ratio;

	u32 host_pkru;

	unsigned long host_debugctlmsr;

	/*
	 * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
	 * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
	 * in msr_ia32_feature_control_valid_bits.
	 */
	u64 msr_ia32_feature_control;
	u64 msr_ia32_feature_control_valid_bits;
	u64 ept_pointer;
};

enum segment_cache_field {
	SEG_FIELD_SEL = 0,
	SEG_FIELD_BASE = 1,
	SEG_FIELD_LIMIT = 2,
	SEG_FIELD_AR = 3,

	SEG_FIELD_NR = 4
};

static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
{
	return container_of(kvm, struct kvm_vmx, kvm);
}

static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
{
	return container_of(vcpu, struct vcpu_vmx, vcpu);
}

static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
{
	return &(to_vmx(vcpu)->pi_desc);
}

#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
#define FIELD(number, name)	[ROL16(number, 6)] = VMCS12_OFFSET(name)
#define FIELD64(number, name)						\
	FIELD(number, name),						\
	[ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32)
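/*
 * Editorial note (hedged): architecturally, each 64-bit VMCS field also has
 * a "_HIGH" encoding (the base encoding with bit 0, the access-type bit,
 * set) that accesses the upper 32 bits on 32-bit hosts.  FIELD64() therefore
 * emits two table entries for one vmcs12 member: the base encoding maps to
 * the member's offset and the _HIGH encoding to offset + sizeof(u32).
 */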


static u16 shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x) x,
#include "vmx_shadow_fields.h"
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static u16 shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x) x,
#include "vmx_shadow_fields.h"
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

static const unsigned short vmcs_field_to_offset_table[] = {
	FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
	FIELD(POSTED_INTR_NV, posted_intr_nv),
	FIELD(GUEST_ES_SELECTOR, guest_es_selector),
	FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
	FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
	FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
	FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
	FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
	FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
	FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
	FIELD(GUEST_INTR_STATUS, guest_intr_status),
	FIELD(GUEST_PML_INDEX, guest_pml_index),
	FIELD(HOST_ES_SELECTOR, host_es_selector),
	FIELD(HOST_CS_SELECTOR, host_cs_selector),
	FIELD(HOST_SS_SELECTOR, host_ss_selector),
	FIELD(HOST_DS_SELECTOR, host_ds_selector),
	FIELD(HOST_FS_SELECTOR, host_fs_selector),
	FIELD(HOST_GS_SELECTOR, host_gs_selector),
	FIELD(HOST_TR_SELECTOR, host_tr_selector),
	FIELD64(IO_BITMAP_A, io_bitmap_a),
	FIELD64(IO_BITMAP_B, io_bitmap_b),
	FIELD64(MSR_BITMAP, msr_bitmap),
	FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
	FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
	FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
	FIELD64(PML_ADDRESS, pml_address),
	FIELD64(TSC_OFFSET, tsc_offset),
	FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
	FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
	FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
	FIELD64(VM_FUNCTION_CONTROL, vm_function_control),
	FIELD64(EPT_POINTER, ept_pointer),
	FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
	FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
	FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
	FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
	FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
	FIELD64(VMREAD_BITMAP, vmread_bitmap),
	FIELD64(VMWRITE_BITMAP, vmwrite_bitmap),
	FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
	FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
	FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
	FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
	FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
	FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
	FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
	FIELD64(GUEST_PDPTR0, guest_pdptr0),
	FIELD64(GUEST_PDPTR1, guest_pdptr1),
	FIELD64(GUEST_PDPTR2, guest_pdptr2),
	FIELD64(GUEST_PDPTR3, guest_pdptr3),
	FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
	FIELD64(HOST_IA32_PAT, host_ia32_pat),
	FIELD64(HOST_IA32_EFER, host_ia32_efer),
	FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
	FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
	FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
	FIELD(EXCEPTION_BITMAP, exception_bitmap),
	FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
	FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
	FIELD(CR3_TARGET_COUNT, cr3_target_count),
	FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
	FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
	FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
	FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
	FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
	FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
	FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
	FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
	FIELD(TPR_THRESHOLD, tpr_threshold),
	FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
	FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
	FIELD(VM_EXIT_REASON, vm_exit_reason),
	FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
	FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
	FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
	FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
	FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
	FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
	FIELD(GUEST_ES_LIMIT, guest_es_limit),
	FIELD(GUEST_CS_LIMIT, guest_cs_limit),
	FIELD(GUEST_SS_LIMIT, guest_ss_limit),
	FIELD(GUEST_DS_LIMIT, guest_ds_limit),
	FIELD(GUEST_FS_LIMIT, guest_fs_limit),
	FIELD(GUEST_GS_LIMIT, guest_gs_limit),
	FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
	FIELD(GUEST_TR_LIMIT, guest_tr_limit),
	FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
	FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
	FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
	FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
	FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
	FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
	FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
	FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
	FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
	FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
	FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
	FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
	FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
	FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
	FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
	FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
	FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
	FIELD(CR0_READ_SHADOW, cr0_read_shadow),
	FIELD(CR4_READ_SHADOW, cr4_read_shadow),
	FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
	FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
	FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
	FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
	FIELD(EXIT_QUALIFICATION, exit_qualification),
	FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
	FIELD(GUEST_CR0, guest_cr0),
	FIELD(GUEST_CR3, guest_cr3),
	FIELD(GUEST_CR4, guest_cr4),
	FIELD(GUEST_ES_BASE, guest_es_base),
	FIELD(GUEST_CS_BASE, guest_cs_base),
	FIELD(GUEST_SS_BASE, guest_ss_base),
	FIELD(GUEST_DS_BASE, guest_ds_base),
	FIELD(GUEST_FS_BASE, guest_fs_base),
	FIELD(GUEST_GS_BASE, guest_gs_base),
	FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
	FIELD(GUEST_TR_BASE, guest_tr_base),
	FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
	FIELD(GUEST_IDTR_BASE, guest_idtr_base),
	FIELD(GUEST_DR7, guest_dr7),
	FIELD(GUEST_RSP, guest_rsp),
	FIELD(GUEST_RIP, guest_rip),
	FIELD(GUEST_RFLAGS, guest_rflags),
	FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
	FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
	FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
	FIELD(HOST_CR0, host_cr0),
	FIELD(HOST_CR3, host_cr3),
	FIELD(HOST_CR4, host_cr4),
	FIELD(HOST_FS_BASE, host_fs_base),
	FIELD(HOST_GS_BASE, host_gs_base),
	FIELD(HOST_TR_BASE, host_tr_base),
	FIELD(HOST_GDTR_BASE, host_gdtr_base),
	FIELD(HOST_IDTR_BASE, host_idtr_base),
	FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
	FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
	FIELD(HOST_RSP, host_rsp),
	FIELD(HOST_RIP, host_rip),
};

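/*
 * Editorial note (hedged): the field number handed to vmcs_field_to_offset()
 * ultimately comes from L1 (it is the encoding used when emulating
 * VMREAD/VMWRITE), so the lookup below bounds the index with
 * array_index_nospec() to keep a mispredicted bounds check from being
 * usable as a Spectre-v1 gadget.
 */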
static inline short vmcs_field_to_offset(unsigned long field)
{
	const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table);
	unsigned short offset;
	unsigned index;

	if (field >> 15)
		return -ENOENT;

	index = ROL16(field, 6);
	if (index >= size)
		return -ENOENT;

	index = array_index_nospec(index, size);
	offset = vmcs_field_to_offset_table[index];
	if (offset == 0)
		return -ENOENT;
	return offset;
}

static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
{
	return to_vmx(vcpu)->nested.cached_vmcs12;
}

static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu)
{
	return to_vmx(vcpu)->nested.cached_shadow_vmcs12;
}

static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
static bool vmx_xsaves_supported(void);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg);
static void vmx_get_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg);
static bool guest_state_valid(struct kvm_vcpu *vcpu);
static u32 vmx_segment_access_rights(struct kvm_segment *var);
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code);
static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
							  u32 msr, int type);

static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
 */
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);

/*
 * We maintain a per-CPU linked list of vCPUs, so in wakeup_handler() we
 * can find which vCPU should be woken up.
 */
static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);

enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};

static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])

static bool cpu_has_load_ia32_efer;
static bool cpu_has_load_perf_global_ctrl;

static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);

static struct vmcs_config {
	int size;
	int order;
	u32 basic_cap;
	u32 revision_id;
	u32 pin_based_exec_ctrl;
	u32 cpu_based_exec_ctrl;
	u32 cpu_based_2nd_exec_ctrl;
	u32 vmexit_ctrl;
	u32 vmentry_ctrl;
	struct nested_vmx_msrs nested;
} vmcs_config;

static struct vmx_capability {
	u32 ept;
	u32 vpid;
} vmx_capability;

#define VMX_SEGMENT_FIELD(seg)					\
	[VCPU_SREG_##seg] = {					\
		.selector = GUEST_##seg##_SELECTOR,		\
		.base = GUEST_##seg##_BASE,			\
		.limit = GUEST_##seg##_LIMIT,			\
		.ar_bytes = GUEST_##seg##_AR_BYTES,		\
	}

static const struct kvm_vmx_segment_field {
	unsigned selector;
	unsigned base;
	unsigned limit;
	unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
	VMX_SEGMENT_FIELD(CS),
	VMX_SEGMENT_FIELD(DS),
	VMX_SEGMENT_FIELD(ES),
	VMX_SEGMENT_FIELD(FS),
	VMX_SEGMENT_FIELD(GS),
	VMX_SEGMENT_FIELD(SS),
	VMX_SEGMENT_FIELD(TR),
	VMX_SEGMENT_FIELD(LDTR),
};

static u64 host_efer;

static void ept_save_pdptrs(struct kvm_vcpu *vcpu);

/*
 * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
 * away by decrementing the array size.
 */
static const u32 vmx_msr_index[] = {
#ifdef CONFIG_X86_64
	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
	MSR_EFER, MSR_TSC_AUX, MSR_STAR,
};

DEFINE_STATIC_KEY_FALSE(enable_evmcs);

#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs))

#define KVM_EVMCS_VERSION 1

/*
 * Enlightened VMCSv1 doesn't support these:
 *
 *	POSTED_INTR_NV                  = 0x00000002,
 *	GUEST_INTR_STATUS               = 0x00000810,
 *	APIC_ACCESS_ADDR                = 0x00002014,
 *	POSTED_INTR_DESC_ADDR           = 0x00002016,
 *	EOI_EXIT_BITMAP0                = 0x0000201c,
 *	EOI_EXIT_BITMAP1                = 0x0000201e,
 *	EOI_EXIT_BITMAP2                = 0x00002020,
 *	EOI_EXIT_BITMAP3                = 0x00002022,
 *	GUEST_PML_INDEX                 = 0x00000812,
 *	PML_ADDRESS                     = 0x0000200e,
 *	VM_FUNCTION_CONTROL             = 0x00002018,
 *	EPTP_LIST_ADDRESS               = 0x00002024,
 *	VMREAD_BITMAP                   = 0x00002026,
 *	VMWRITE_BITMAP                  = 0x00002028,
 *
 *	TSC_MULTIPLIER                  = 0x00002032,
 *	PLE_GAP                         = 0x00004020,
 *	PLE_WINDOW                      = 0x00004022,
 *	VMX_PREEMPTION_TIMER_VALUE      = 0x0000482E,
 *	GUEST_IA32_PERF_GLOBAL_CTRL     = 0x00002808,
 *	HOST_IA32_PERF_GLOBAL_CTRL      = 0x00002c04,
 *
 * Currently unsupported in KVM:
 *	GUEST_IA32_RTIT_CTL             = 0x00002814,
 */
#define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \
				    PIN_BASED_VMX_PREEMPTION_TIMER)
#define EVMCS1_UNSUPPORTED_2NDEXEC					\
	(SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |				\
	 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |			\
	 SECONDARY_EXEC_APIC_REGISTER_VIRT |				\
	 SECONDARY_EXEC_ENABLE_PML |					\
	 SECONDARY_EXEC_ENABLE_VMFUNC |					\
	 SECONDARY_EXEC_SHADOW_VMCS |					\
	 SECONDARY_EXEC_TSC_SCALING |					\
	 SECONDARY_EXEC_PAUSE_LOOP_EXITING)
#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING)

#if IS_ENABLED(CONFIG_HYPERV)
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);

static inline void evmcs_write64(unsigned long field, u64 value)
{
	u16 clean_field;
	int offset = get_evmcs_offset(field, &clean_field);

	if (offset < 0)
		return;

	*(u64 *)((char *)current_evmcs + offset) = value;

	current_evmcs->hv_clean_fields &= ~clean_field;
}

static inline void evmcs_write32(unsigned long field, u32 value)
{
	u16 clean_field;
	int offset = get_evmcs_offset(field, &clean_field);

	if (offset < 0)
		return;

	*(u32 *)((char *)current_evmcs + offset) = value;
	current_evmcs->hv_clean_fields &= ~clean_field;
}

static inline void evmcs_write16(unsigned long field, u16 value)
{
	u16 clean_field;
	int offset = get_evmcs_offset(field, &clean_field);

	if (offset < 0)
		return;

	*(u16 *)((char *)current_evmcs + offset) = value;
	current_evmcs->hv_clean_fields &= ~clean_field;
}

static inline u64 evmcs_read64(unsigned long field)
{
	int offset = get_evmcs_offset(field, NULL);

	if (offset < 0)
		return 0;

	return *(u64 *)((char *)current_evmcs + offset);
}

static inline u32 evmcs_read32(unsigned long field)
{
	int offset = get_evmcs_offset(field, NULL);

	if (offset < 0)
		return 0;

	return *(u32 *)((char *)current_evmcs + offset);
}

static inline u16 evmcs_read16(unsigned long field)
{
	int offset = get_evmcs_offset(field, NULL);

	if (offset < 0)
		return 0;

	return *(u16 *)((char *)current_evmcs + offset);
}

static inline void evmcs_touch_msr_bitmap(void)
{
	if (unlikely(!current_evmcs))
		return;

	if (current_evmcs->hv_enlightenments_control.msr_bitmap)
		current_evmcs->hv_clean_fields &=
			~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
}

static void evmcs_load(u64 phys_addr)
{
	struct hv_vp_assist_page *vp_ap =
		hv_get_vp_assist_page(smp_processor_id());

	vp_ap->current_nested_vmcs = phys_addr;
	vp_ap->enlighten_vmentry = 1;
}

static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
{
	vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL;
	vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC;

	vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
	vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;

}

/* check_ept_pointer_match() should be called under protection of ept_pointer_lock. */
1551static void check_ept_pointer_match(struct kvm *kvm)
1552{
1553 struct kvm_vcpu *vcpu;
1554 u64 tmp_eptp = INVALID_PAGE;
1555 int i;
1556
1557 kvm_for_each_vcpu(i, vcpu, kvm) {
1558 if (!VALID_PAGE(tmp_eptp)) {
1559 tmp_eptp = to_vmx(vcpu)->ept_pointer;
1560 } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
1561 to_kvm_vmx(kvm)->ept_pointers_match
1562 = EPT_POINTERS_MISMATCH;
1563 return;
1564 }
1565 }
1566
1567 to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
1568}
1569
1570static int vmx_hv_remote_flush_tlb(struct kvm *kvm)
1571{
a5c214da
LT
1572 struct kvm_vcpu *vcpu;
1573 int ret = -ENOTSUPP, i;
877ad952
TL
1574
1575 spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
1576
1577 if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
1578 check_ept_pointer_match(kvm);
1579
1580 if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
a5c214da
LT
1581 kvm_for_each_vcpu(i, vcpu, kvm)
1582 ret |= hyperv_flush_guest_mapping(
1583 to_vmx(kvm_get_vcpu(kvm, i))->ept_pointer);
1584 } else {
1585 ret = hyperv_flush_guest_mapping(
1586 to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer);
877ad952
TL
1587 }
1588
877ad952
TL
1589 spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
1590 return ret;
1591}
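
/*
 * Illustrative sketch of the caller side (an assumption; the real update
 * lives in the EPT-pointer load path, which is not part of this excerpt):
 * whenever a vCPU switches to a new EPT pointer, the cached match state
 * must be reset so that the next remote flush re-runs
 * check_ept_pointer_match().  The helper name is hypothetical.
 */
static inline void vmx_update_ept_pointer(struct kvm_vcpu *vcpu, u64 new_eptp)
{
	struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);

	spin_lock(&kvm_vmx->ept_pointer_lock);
	to_vmx(vcpu)->ept_pointer = new_eptp;
	kvm_vmx->ept_pointers_match = EPT_POINTERS_CHECK;
	spin_unlock(&kvm_vmx->ept_pointer_lock);
}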
773e8a04
VK
1592#else /* !IS_ENABLED(CONFIG_HYPERV) */
1593static inline void evmcs_write64(unsigned long field, u64 value) {}
1594static inline void evmcs_write32(unsigned long field, u32 value) {}
1595static inline void evmcs_write16(unsigned long field, u16 value) {}
1596static inline u64 evmcs_read64(unsigned long field) { return 0; }
1597static inline u32 evmcs_read32(unsigned long field) { return 0; }
1598static inline u16 evmcs_read16(unsigned long field) { return 0; }
1599static inline void evmcs_load(u64 phys_addr) {}
1600static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
ceef7d10 1601static inline void evmcs_touch_msr_bitmap(void) {}
773e8a04
VK
1602#endif /* IS_ENABLED(CONFIG_HYPERV) */
1603
57b119da
VK
1604static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
1605 uint16_t *vmcs_version)
1606{
1607 struct vcpu_vmx *vmx = to_vmx(vcpu);
1608
1609 /* We don't support disabling the feature for simplicity. */
1610 if (vmx->nested.enlightened_vmcs_enabled)
1611 return 0;
1612
1613 vmx->nested.enlightened_vmcs_enabled = true;
1614
1615 /*
1616 * vmcs_version represents the range of supported Enlightened VMCS
1617	 * versions: the low 8 bits hold the minimal version, the high 8 bits the
1618 * maximum supported version. KVM supports versions from 1 to
1619 * KVM_EVMCS_VERSION.
1620 */
8cab6507
VK
1621 if (vmcs_version)
1622 *vmcs_version = (KVM_EVMCS_VERSION << 8) | 1;
57b119da
VK
1623
1624 vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
1625 vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
1626 vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
1627 vmx->nested.msrs.secondary_ctls_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
1628 vmx->nested.msrs.vmfunc_controls &= ~EVMCS1_UNSUPPORTED_VMFUNC;
1629
1630 return 0;
1631}
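
/*
 * Worked example (illustrative): assuming KVM_EVMCS_VERSION is 1, the
 * encoding above reports
 *
 *	*vmcs_version = (1 << 8) | 1 = 0x0101
 *
 * i.e. both the minimum (bits 7:0) and the maximum (bits 15:8) supported
 * Enlightened VMCS version are 1.
 */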
1632
5bb16016 1633static inline bool is_exception_n(u32 intr_info, u8 vector)
6aa8b732
AK
1634{
1635 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
1636 INTR_INFO_VALID_MASK)) ==
5bb16016
JK
1637 (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
1638}
1639
6f05485d
JK
1640static inline bool is_debug(u32 intr_info)
1641{
1642 return is_exception_n(intr_info, DB_VECTOR);
1643}
1644
1645static inline bool is_breakpoint(u32 intr_info)
1646{
1647 return is_exception_n(intr_info, BP_VECTOR);
1648}
1649
5bb16016
JK
1650static inline bool is_page_fault(u32 intr_info)
1651{
1652 return is_exception_n(intr_info, PF_VECTOR);
6aa8b732
AK
1653}
1654
31299944 1655static inline bool is_invalid_opcode(u32 intr_info)
7aa81cc0 1656{
5bb16016 1657 return is_exception_n(intr_info, UD_VECTOR);
7aa81cc0
AL
1658}
1659
9e869480
LA
1660static inline bool is_gp_fault(u32 intr_info)
1661{
1662 return is_exception_n(intr_info, GP_VECTOR);
1663}
1664
31299944 1665static inline bool is_machine_check(u32 intr_info)
a0861c02
AK
1666{
1667 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
1668 INTR_INFO_VALID_MASK)) ==
1669 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
1670}
1671
32d43cd3
LT
1672/* Undocumented: icebp/int1 */
1673static inline bool is_icebp(u32 intr_info)
1674{
1675 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
1676 == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK);
1677}
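
/*
 * Worked example (illustrative): for a guest #PF the VM-exit interruption
 * information field reads
 *
 *	0x8000030e = INTR_INFO_VALID_MASK (bit 31)
 *		   | INTR_TYPE_HARD_EXCEPTION (3 << 8)
 *		   | PF_VECTOR (14)
 *
 * so is_page_fault() matches it, while is_icebp() does not because the
 * type field there is INTR_TYPE_PRIV_SW_EXCEPTION (5 << 8).
 */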
1678
31299944 1679static inline bool cpu_has_vmx_msr_bitmap(void)
25c5f225 1680{
04547156 1681 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
25c5f225
SY
1682}
1683
31299944 1684static inline bool cpu_has_vmx_tpr_shadow(void)
6e5d865c 1685{
04547156 1686 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
6e5d865c
YS
1687}
1688
35754c98 1689static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu)
6e5d865c 1690{
35754c98 1691 return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu);
6e5d865c
YS
1692}
1693
31299944 1694static inline bool cpu_has_secondary_exec_ctrls(void)
f78e0e2e 1695{
04547156
SY
1696 return vmcs_config.cpu_based_exec_ctrl &
1697 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
f78e0e2e
SY
1698}
1699
774ead3a 1700static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
f78e0e2e 1701{
04547156
SY
1702 return vmcs_config.cpu_based_2nd_exec_ctrl &
1703 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1704}
1705
8d14695f
YZ
1706static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
1707{
1708 return vmcs_config.cpu_based_2nd_exec_ctrl &
1709 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
1710}
1711
83d4c286
YZ
1712static inline bool cpu_has_vmx_apic_register_virt(void)
1713{
1714 return vmcs_config.cpu_based_2nd_exec_ctrl &
1715 SECONDARY_EXEC_APIC_REGISTER_VIRT;
1716}
1717
c7c9c56c
YZ
1718static inline bool cpu_has_vmx_virtual_intr_delivery(void)
1719{
1720 return vmcs_config.cpu_based_2nd_exec_ctrl &
1721 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
1722}
1723
0b665d30
SC
1724static inline bool cpu_has_vmx_encls_vmexit(void)
1725{
1726 return vmcs_config.cpu_based_2nd_exec_ctrl &
1727 SECONDARY_EXEC_ENCLS_EXITING;
1728}
1729
64672c95
YJ
1730/*
1731 * Comment format: document - errata name - stepping - processor name.
1732 * Taken from
1733 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
1734 */
1735static u32 vmx_preemption_cpu_tfms[] = {
1736/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */
17370x000206E6,
1738/* 323056.pdf - AAX65 - C2 - Xeon L3406 */
1739/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
1740/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
17410x00020652,
1742/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
17430x00020655,
1744/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */
1745/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */
1746/*
1747 * 320767.pdf - AAP86 - B1 -
1748 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
1749 */
17500x000106E5,
1751/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
17520x000106A0,
1753/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
17540x000106A1,
1755/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
17560x000106A4,
1757 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
1758 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
1759 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
17600x000106A5,
1761};
1762
1763static inline bool cpu_has_broken_vmx_preemption_timer(void)
1764{
1765 u32 eax = cpuid_eax(0x00000001), i;
1766
1767 /* Clear the reserved bits */
1768 eax &= ~(0x3U << 14 | 0xfU << 28);
03f6a22a 1769 for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
64672c95
YJ
1770 if (eax == vmx_preemption_cpu_tfms[i])
1771 return true;
1772
1773 return false;
1774}
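
/*
 * Worked example (illustrative): CPUID.1:EAX packs stepping in bits 3:0,
 * model in 7:4, family in 11:8, extended model in 19:16 and extended
 * family in 27:20; bits 15:14 and 31:28 are reserved, which is exactly
 * what the mask above clears before the table lookup.  0x000106E5 from the
 * table thus decodes to family 6, model 0x1E (extended model 1, model 0xE),
 * stepping 5.
 */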
1775
1776static inline bool cpu_has_vmx_preemption_timer(void)
1777{
64672c95
YJ
1778 return vmcs_config.pin_based_exec_ctrl &
1779 PIN_BASED_VMX_PREEMPTION_TIMER;
1780}
1781
01e439be
YZ
1782static inline bool cpu_has_vmx_posted_intr(void)
1783{
d6a858d1
PB
1784 return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
1785 vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
01e439be
YZ
1786}
1787
1788static inline bool cpu_has_vmx_apicv(void)
1789{
1790 return cpu_has_vmx_apic_register_virt() &&
1791 cpu_has_vmx_virtual_intr_delivery() &&
1792 cpu_has_vmx_posted_intr();
1793}
1794
04547156
SY
1795static inline bool cpu_has_vmx_flexpriority(void)
1796{
1797 return cpu_has_vmx_tpr_shadow() &&
1798 cpu_has_vmx_virtualize_apic_accesses();
f78e0e2e
SY
1799}
1800
e799794e
MT
1801static inline bool cpu_has_vmx_ept_execute_only(void)
1802{
31299944 1803 return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
e799794e
MT
1804}
1805
e799794e
MT
1806static inline bool cpu_has_vmx_ept_2m_page(void)
1807{
31299944 1808 return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
e799794e
MT
1809}
1810
878403b7
SY
1811static inline bool cpu_has_vmx_ept_1g_page(void)
1812{
31299944 1813 return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
878403b7
SY
1814}
1815
4bc9b982
SY
1816static inline bool cpu_has_vmx_ept_4levels(void)
1817{
1818 return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
1819}
1820
42aa53b4
DH
1821static inline bool cpu_has_vmx_ept_mt_wb(void)
1822{
1823 return vmx_capability.ept & VMX_EPTP_WB_BIT;
1824}
1825
855feb67
YZ
1826static inline bool cpu_has_vmx_ept_5levels(void)
1827{
1828 return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT;
1829}
1830
83c3a331
XH
1831static inline bool cpu_has_vmx_ept_ad_bits(void)
1832{
1833 return vmx_capability.ept & VMX_EPT_AD_BIT;
1834}
1835
31299944 1836static inline bool cpu_has_vmx_invept_context(void)
d56f546d 1837{
31299944 1838 return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
d56f546d
SY
1839}
1840
31299944 1841static inline bool cpu_has_vmx_invept_global(void)
d56f546d 1842{
31299944 1843 return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
d56f546d
SY
1844}
1845
cd9a491f
LA
1846static inline bool cpu_has_vmx_invvpid_individual_addr(void)
1847{
1848 return vmx_capability.vpid & VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT;
1849}
1850
518c8aee
GJ
1851static inline bool cpu_has_vmx_invvpid_single(void)
1852{
1853 return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
1854}
1855
b9d762fa
GJ
1856static inline bool cpu_has_vmx_invvpid_global(void)
1857{
1858 return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
1859}
1860
08d839c4
WL
1861static inline bool cpu_has_vmx_invvpid(void)
1862{
1863 return vmx_capability.vpid & VMX_VPID_INVVPID_BIT;
1864}
1865
31299944 1866static inline bool cpu_has_vmx_ept(void)
d56f546d 1867{
04547156
SY
1868 return vmcs_config.cpu_based_2nd_exec_ctrl &
1869 SECONDARY_EXEC_ENABLE_EPT;
d56f546d
SY
1870}
1871
31299944 1872static inline bool cpu_has_vmx_unrestricted_guest(void)
3a624e29
NK
1873{
1874 return vmcs_config.cpu_based_2nd_exec_ctrl &
1875 SECONDARY_EXEC_UNRESTRICTED_GUEST;
1876}
1877
31299944 1878static inline bool cpu_has_vmx_ple(void)
4b8d54f9
ZE
1879{
1880 return vmcs_config.cpu_based_2nd_exec_ctrl &
1881 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
1882}
1883
9ac7e3e8
JD
1884static inline bool cpu_has_vmx_basic_inout(void)
1885{
1886 return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
1887}
1888
35754c98 1889static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
f78e0e2e 1890{
35754c98 1891 return flexpriority_enabled && lapic_in_kernel(vcpu);
f78e0e2e
SY
1892}
1893
31299944 1894static inline bool cpu_has_vmx_vpid(void)
2384d2b3 1895{
04547156
SY
1896 return vmcs_config.cpu_based_2nd_exec_ctrl &
1897 SECONDARY_EXEC_ENABLE_VPID;
2384d2b3
SY
1898}
1899
31299944 1900static inline bool cpu_has_vmx_rdtscp(void)
4e47c7a6
SY
1901{
1902 return vmcs_config.cpu_based_2nd_exec_ctrl &
1903 SECONDARY_EXEC_RDTSCP;
1904}
1905
ad756a16
MJ
1906static inline bool cpu_has_vmx_invpcid(void)
1907{
1908 return vmcs_config.cpu_based_2nd_exec_ctrl &
1909 SECONDARY_EXEC_ENABLE_INVPCID;
1910}
1911
8a1b4392
PB
1912static inline bool cpu_has_virtual_nmis(void)
1913{
1914 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
1915}
1916
f5f48ee1
SY
1917static inline bool cpu_has_vmx_wbinvd_exit(void)
1918{
1919 return vmcs_config.cpu_based_2nd_exec_ctrl &
1920 SECONDARY_EXEC_WBINVD_EXITING;
1921}
1922
abc4fc58
AG
1923static inline bool cpu_has_vmx_shadow_vmcs(void)
1924{
1925 u64 vmx_msr;
1926 rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
1927 /* check if the cpu supports writing r/o exit information fields */
1928 if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
1929 return false;
1930
1931 return vmcs_config.cpu_based_2nd_exec_ctrl &
1932 SECONDARY_EXEC_SHADOW_VMCS;
1933}
1934
843e4330
KH
1935static inline bool cpu_has_vmx_pml(void)
1936{
1937 return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
1938}
1939
64903d61
HZ
1940static inline bool cpu_has_vmx_tsc_scaling(void)
1941{
1942 return vmcs_config.cpu_based_2nd_exec_ctrl &
1943 SECONDARY_EXEC_TSC_SCALING;
1944}
1945
2a499e49
BD
1946static inline bool cpu_has_vmx_vmfunc(void)
1947{
1948 return vmcs_config.cpu_based_2nd_exec_ctrl &
1949 SECONDARY_EXEC_ENABLE_VMFUNC;
1950}
1951
64f7a115
SC
1952static bool vmx_umip_emulated(void)
1953{
1954 return vmcs_config.cpu_based_2nd_exec_ctrl &
1955 SECONDARY_EXEC_DESC;
1956}
1957
04547156
SY
1958static inline bool report_flexpriority(void)
1959{
1960 return flexpriority_enabled;
1961}
1962
c7c2c709
JM
1963static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
1964{
6677f3da 1965 return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low);
c7c2c709
JM
1966}
1967
f4160e45
JM
1968/*
1969 * Do the virtual VMX capability MSRs specify that L1 can use VMWRITE
1970 * to modify any valid field of the VMCS, or are the VM-exit
1971 * information fields read-only?
1972 */
1973static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu)
1974{
1975 return to_vmx(vcpu)->nested.msrs.misc_low &
1976 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
1977}
1978
0447378a
MO
1979static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu)
1980{
1981 return to_vmx(vcpu)->nested.msrs.misc_low & VMX_MISC_ZERO_LEN_INS;
1982}
1983
1984static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu)
1985{
1986 return to_vmx(vcpu)->nested.msrs.procbased_ctls_high &
1987 CPU_BASED_MONITOR_TRAP_FLAG;
1988}
1989
fa97d7db
LA
1990static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu)
1991{
1992 return to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
1993 SECONDARY_EXEC_SHADOW_VMCS;
1994}
1995
fe3ef05c
NHE
1996static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
1997{
1998 return vmcs12->cpu_based_vm_exec_control & bit;
1999}
2000
2001static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
2002{
2003 return (vmcs12->cpu_based_vm_exec_control &
2004 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
2005 (vmcs12->secondary_vm_exec_control & bit);
2006}
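
/*
 * Illustrative note, not part of the original file: checking only the bit
 * in the secondary controls would be wrong, because hardware ignores the
 * whole secondary control word unless CPU_BASED_ACTIVATE_SECONDARY_CONTROLS
 * is also set in the primary controls.  The helpers below therefore all
 * reduce to, e.g.:
 *
 *	nested_cpu_has_ept(vmcs12) ==
 *		nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
 *		(vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT)
 */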
2007
f4124500
JK
2008static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
2009{
2010 return vmcs12->pin_based_vm_exec_control &
2011 PIN_BASED_VMX_PREEMPTION_TIMER;
2012}
2013
0c7f650e
KS
2014static inline bool nested_cpu_has_nmi_exiting(struct vmcs12 *vmcs12)
2015{
2016 return vmcs12->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING;
2017}
2018
2019static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
2020{
2021 return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
2022}
2023
155a97a3
NHE
2024static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
2025{
2026 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
2027}
2028
81dc01f7
WL
2029static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
2030{
3db13480 2031 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
81dc01f7
WL
2032}
2033
c5f983f6
BD
2034static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12)
2035{
2036 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML);
2037}
2038
f2b93280
WV
2039static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
2040{
2041 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
2042}
2043
5c614b35
WL
2044static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12)
2045{
2046 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID);
2047}
2048
82f0dd4b
WV
2049static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
2050{
2051 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
2052}
2053
608406e2
WV
2054static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
2055{
2056 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
2057}
2058
705699a1
WV
2059static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
2060{
2061 return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
2062}
2063
27c42a1b
BD
2064static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
2065{
2066 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
2067}
2068
41ab9372
BD
2069static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
2070{
2071 return nested_cpu_has_vmfunc(vmcs12) &&
2072 (vmcs12->vm_function_control &
2073 VMX_VMFUNC_EPTP_SWITCHING);
2074}
2075
f792d274
LA
2076static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12)
2077{
2078 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS);
2079}
2080
ef85b673 2081static inline bool is_nmi(u32 intr_info)
644d711a
NHE
2082{
2083 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
ef85b673 2084 == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
644d711a
NHE
2085}
2086
533558bc
JK
2087static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
2088 u32 exit_intr_info,
2089 unsigned long exit_qualification);
7c177938 2090
8b9cf98c 2091static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
7725f0ba
AK
2092{
2093 int i;
2094
a2fa3e9f 2095 for (i = 0; i < vmx->nmsrs; ++i)
26bb0981 2096 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
a75beee6
ED
2097 return i;
2098 return -1;
2099}
2100
5ebb272b 2101static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
2384d2b3
SY
2102{
2103 struct {
2104 u64 vpid : 16;
2105 u64 rsvd : 48;
2106 u64 gva;
2107 } operand = { vpid, 0, gva };
fd8ca6da 2108 bool error;
2384d2b3 2109
4b1e5478
UB
2110 asm volatile (__ex("invvpid %2, %1") CC_SET(na)
2111 : CC_OUT(na) (error) : "r"(ext), "m"(operand));
fd8ca6da 2112 BUG_ON(error);
2384d2b3
SY
2113}
2114
5ebb272b 2115static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
1439442c
SY
2116{
2117 struct {
2118 u64 eptp, gpa;
2119 } operand = {eptp, gpa};
fd8ca6da 2120 bool error;
1439442c 2121
4b1e5478
UB
2122 asm volatile (__ex("invept %2, %1") CC_SET(na)
2123 : CC_OUT(na) (error) : "r"(ext), "m"(operand));
fd8ca6da 2124 BUG_ON(error);
1439442c
SY
2125}
2126
26bb0981 2127static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
a75beee6
ED
2128{
2129 int i;
2130
8b9cf98c 2131 i = __find_msr_index(vmx, msr);
a75beee6 2132 if (i >= 0)
a2fa3e9f 2133 return &vmx->guest_msrs[i];
8b6d44c7 2134 return NULL;
7725f0ba
AK
2135}
2136
6aa8b732
AK
2137static void vmcs_clear(struct vmcs *vmcs)
2138{
2139 u64 phys_addr = __pa(vmcs);
fd8ca6da 2140 bool error;
6aa8b732 2141
4b1e5478
UB
2142 asm volatile (__ex("vmclear %1") CC_SET(na)
2143 : CC_OUT(na) (error) : "m"(phys_addr));
fd8ca6da 2144 if (unlikely(error))
6aa8b732
AK
2145 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
2146 vmcs, phys_addr);
2147}
2148
d462b819
NHE
2149static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
2150{
2151 vmcs_clear(loaded_vmcs->vmcs);
355f4fb1
JM
2152 if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
2153 vmcs_clear(loaded_vmcs->shadow_vmcs);
d462b819
NHE
2154 loaded_vmcs->cpu = -1;
2155 loaded_vmcs->launched = 0;
2156}
2157
7725b894
DX
2158static void vmcs_load(struct vmcs *vmcs)
2159{
2160 u64 phys_addr = __pa(vmcs);
fd8ca6da 2161 bool error;
7725b894 2162
773e8a04
VK
2163 if (static_branch_unlikely(&enable_evmcs))
2164 return evmcs_load(phys_addr);
2165
4b1e5478
UB
2166 asm volatile (__ex("vmptrld %1") CC_SET(na)
2167 : CC_OUT(na) (error) : "m"(phys_addr));
fd8ca6da 2168 if (unlikely(error))
2844d849 2169 printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
7725b894
DX
2170 vmcs, phys_addr);
2171}
2172
2965faa5 2173#ifdef CONFIG_KEXEC_CORE
8f536b76
ZY
2174/*
2175 * This bitmap indicates, per CPU, whether the crash-time vmclear
2176 * operation is enabled.  It is disabled on all CPUs by
2177 * default.
2178 */
2179static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
2180
2181static inline void crash_enable_local_vmclear(int cpu)
2182{
2183 cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
2184}
2185
2186static inline void crash_disable_local_vmclear(int cpu)
2187{
2188 cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
2189}
2190
2191static inline int crash_local_vmclear_enabled(int cpu)
2192{
2193 return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
2194}
2195
2196static void crash_vmclear_local_loaded_vmcss(void)
2197{
2198 int cpu = raw_smp_processor_id();
2199 struct loaded_vmcs *v;
2200
2201 if (!crash_local_vmclear_enabled(cpu))
2202 return;
2203
2204 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
2205 loaded_vmcss_on_cpu_link)
2206 vmcs_clear(v->vmcs);
2207}
2208#else
2209static inline void crash_enable_local_vmclear(int cpu) { }
2210static inline void crash_disable_local_vmclear(int cpu) { }
2965faa5 2211#endif /* CONFIG_KEXEC_CORE */
8f536b76 2212
d462b819 2213static void __loaded_vmcs_clear(void *arg)
6aa8b732 2214{
d462b819 2215 struct loaded_vmcs *loaded_vmcs = arg;
d3b2c338 2216 int cpu = raw_smp_processor_id();
6aa8b732 2217
d462b819
NHE
2218 if (loaded_vmcs->cpu != cpu)
2219 return; /* vcpu migration can race with cpu offline */
2220 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
6aa8b732 2221 per_cpu(current_vmcs, cpu) = NULL;
8f536b76 2222 crash_disable_local_vmclear(cpu);
d462b819 2223 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
5a560f8b
XG
2224
2225	/*
2226	 * We should ensure that updating loaded_vmcs->loaded_vmcss_on_cpu_link
2227	 * happens before setting loaded_vmcs->cpu to -1, which is done in
2228	 * loaded_vmcs_init.  Otherwise another CPU could see cpu == -1 first
2229	 * and then add the vmcs to its percpu list before it is deleted here.
2230	 */
2231 smp_wmb();
2232
d462b819 2233 loaded_vmcs_init(loaded_vmcs);
8f536b76 2234 crash_enable_local_vmclear(cpu);
6aa8b732
AK
2235}
2236
d462b819 2237static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
8d0be2b3 2238{
e6c7d321
XG
2239 int cpu = loaded_vmcs->cpu;
2240
2241 if (cpu != -1)
2242 smp_call_function_single(cpu,
2243 __loaded_vmcs_clear, loaded_vmcs, 1);
8d0be2b3
AK
2244}
2245
faff8758
JS
2246static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr)
2247{
2248 if (vpid == 0)
2249 return true;
2250
2251 if (cpu_has_vmx_invvpid_individual_addr()) {
2252 __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
2253 return true;
2254 }
2255
2256 return false;
2257}
2258
dd5f5341 2259static inline void vpid_sync_vcpu_single(int vpid)
2384d2b3 2260{
dd5f5341 2261 if (vpid == 0)
2384d2b3
SY
2262 return;
2263
518c8aee 2264 if (cpu_has_vmx_invvpid_single())
dd5f5341 2265 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
2384d2b3
SY
2266}
2267
b9d762fa
GJ
2268static inline void vpid_sync_vcpu_global(void)
2269{
2270 if (cpu_has_vmx_invvpid_global())
2271 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
2272}
2273
dd5f5341 2274static inline void vpid_sync_context(int vpid)
b9d762fa
GJ
2275{
2276 if (cpu_has_vmx_invvpid_single())
dd5f5341 2277 vpid_sync_vcpu_single(vpid);
b9d762fa
GJ
2278 else
2279 vpid_sync_vcpu_global();
2280}
2281
1439442c
SY
2282static inline void ept_sync_global(void)
2283{
f5f51586 2284 __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
1439442c
SY
2285}
2286
2287static inline void ept_sync_context(u64 eptp)
2288{
0e1252dc
DH
2289 if (cpu_has_vmx_invept_context())
2290 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
2291 else
2292 ept_sync_global();
1439442c
SY
2293}
2294
8a86aea9
PB
2295static __always_inline void vmcs_check16(unsigned long field)
2296{
2297 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
2298 "16-bit accessor invalid for 64-bit field");
2299 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
2300 "16-bit accessor invalid for 64-bit high field");
2301 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
2302 "16-bit accessor invalid for 32-bit high field");
2303 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
2304 "16-bit accessor invalid for natural width field");
2305}
2306
2307static __always_inline void vmcs_check32(unsigned long field)
2308{
2309 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
2310 "32-bit accessor invalid for 16-bit field");
2311 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
2312 "32-bit accessor invalid for natural width field");
2313}
2314
2315static __always_inline void vmcs_check64(unsigned long field)
2316{
2317 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
2318 "64-bit accessor invalid for 16-bit field");
2319 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
2320 "64-bit accessor invalid for 64-bit high field");
2321 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
2322 "64-bit accessor invalid for 32-bit field");
2323 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
2324 "64-bit accessor invalid for natural width field");
2325}
2326
2327static __always_inline void vmcs_checkl(unsigned long field)
2328{
2329 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
2330 "Natural width accessor invalid for 16-bit field");
2331 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
2332 "Natural width accessor invalid for 64-bit field");
2333 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
2334 "Natural width accessor invalid for 64-bit high field");
2335 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
2336 "Natural width accessor invalid for 32-bit field");
2337}
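
/*
 * Worked example (illustrative): in a VMCS field encoding, bits 14:13 give
 * the field width (0 = 16-bit, 1 = 64-bit, 2 = 32-bit, 3 = natural width)
 * and bit 0 selects the "high" half of a 64-bit field, which is what the
 * (field & 0x6000) / (field & 0x6001) tests above decode:
 *
 *	GUEST_ES_SELECTOR  0x0800  ->  0x0800 & 0x6000 == 0x0000  (16-bit)
 *	IO_BITMAP_A        0x2000  ->  0x2000 & 0x6001 == 0x2000  (64-bit)
 *	IO_BITMAP_A_HIGH   0x2001  ->  0x2001 & 0x6001 == 0x2001  (64-bit, high)
 *	VM_EXIT_REASON     0x4402  ->  0x4402 & 0x6000 == 0x4000  (32-bit)
 *	GUEST_RIP          0x681e  ->  0x681e & 0x6000 == 0x6000  (natural width)
 */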
2338
2339static __always_inline unsigned long __vmcs_readl(unsigned long field)
6aa8b732 2340{
5e520e62 2341 unsigned long value;
6aa8b732 2342
44c2d667 2343 asm volatile (__ex_clear("vmread %1, %0", "%k0")
4b1e5478 2344 : "=r"(value) : "r"(field));
6aa8b732
AK
2345 return value;
2346}
2347
96304217 2348static __always_inline u16 vmcs_read16(unsigned long field)
6aa8b732 2349{
8a86aea9 2350 vmcs_check16(field);
773e8a04
VK
2351 if (static_branch_unlikely(&enable_evmcs))
2352 return evmcs_read16(field);
8a86aea9 2353 return __vmcs_readl(field);
6aa8b732
AK
2354}
2355
96304217 2356static __always_inline u32 vmcs_read32(unsigned long field)
6aa8b732 2357{
8a86aea9 2358 vmcs_check32(field);
773e8a04
VK
2359 if (static_branch_unlikely(&enable_evmcs))
2360 return evmcs_read32(field);
8a86aea9 2361 return __vmcs_readl(field);
6aa8b732
AK
2362}
2363
96304217 2364static __always_inline u64 vmcs_read64(unsigned long field)
6aa8b732 2365{
8a86aea9 2366 vmcs_check64(field);
773e8a04
VK
2367 if (static_branch_unlikely(&enable_evmcs))
2368 return evmcs_read64(field);
05b3e0c2 2369#ifdef CONFIG_X86_64
8a86aea9 2370 return __vmcs_readl(field);
6aa8b732 2371#else
8a86aea9 2372 return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32);
6aa8b732
AK
2373#endif
2374}
2375
8a86aea9
PB
2376static __always_inline unsigned long vmcs_readl(unsigned long field)
2377{
2378 vmcs_checkl(field);
773e8a04
VK
2379 if (static_branch_unlikely(&enable_evmcs))
2380 return evmcs_read64(field);
8a86aea9
PB
2381 return __vmcs_readl(field);
2382}
2383
e52de1b8
AK
2384static noinline void vmwrite_error(unsigned long field, unsigned long value)
2385{
2386 printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
2387 field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
2388 dump_stack();
2389}
2390
8a86aea9 2391static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
6aa8b732 2392{
fd8ca6da 2393 bool error;
6aa8b732 2394
4b1e5478
UB
2395 asm volatile (__ex("vmwrite %2, %1") CC_SET(na)
2396 : CC_OUT(na) (error) : "r"(field), "rm"(value));
e52de1b8
AK
2397 if (unlikely(error))
2398 vmwrite_error(field, value);
6aa8b732
AK
2399}
2400
8a86aea9 2401static __always_inline void vmcs_write16(unsigned long field, u16 value)
6aa8b732 2402{
8a86aea9 2403 vmcs_check16(field);
773e8a04
VK
2404 if (static_branch_unlikely(&enable_evmcs))
2405 return evmcs_write16(field, value);
2406
8a86aea9 2407 __vmcs_writel(field, value);
6aa8b732
AK
2408}
2409
8a86aea9 2410static __always_inline void vmcs_write32(unsigned long field, u32 value)
6aa8b732 2411{
8a86aea9 2412 vmcs_check32(field);
773e8a04
VK
2413 if (static_branch_unlikely(&enable_evmcs))
2414 return evmcs_write32(field, value);
2415
8a86aea9 2416 __vmcs_writel(field, value);
6aa8b732
AK
2417}
2418
8a86aea9 2419static __always_inline void vmcs_write64(unsigned long field, u64 value)
6aa8b732 2420{
8a86aea9 2421 vmcs_check64(field);
773e8a04
VK
2422 if (static_branch_unlikely(&enable_evmcs))
2423 return evmcs_write64(field, value);
2424
8a86aea9 2425 __vmcs_writel(field, value);
7682f2d0 2426#ifndef CONFIG_X86_64
6aa8b732 2427 asm volatile ("");
8a86aea9 2428 __vmcs_writel(field+1, value >> 32);
6aa8b732
AK
2429#endif
2430}
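
/*
 * Worked example (illustrative): on a 32-bit kernel a 64-bit field is
 * written as two 32-bit VMWRITEs, the field itself and its "high" companion
 * at encoding field + 1.  For TSC_OFFSET (0x2010) the sequence above is
 * equivalent to:
 *
 *	__vmcs_writel(0x2010, value & 0xffffffff);
 *	__vmcs_writel(0x2011, value >> 32);
 */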
2431
8a86aea9 2432static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
2ab455cc 2433{
8a86aea9 2434 vmcs_checkl(field);
773e8a04
VK
2435 if (static_branch_unlikely(&enable_evmcs))
2436 return evmcs_write64(field, value);
2437
8a86aea9 2438 __vmcs_writel(field, value);
2ab455cc
AL
2439}
2440
8a86aea9 2441static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
2ab455cc 2442{
8a86aea9
PB
2443 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
2444 "vmcs_clear_bits does not support 64-bit fields");
773e8a04
VK
2445 if (static_branch_unlikely(&enable_evmcs))
2446 return evmcs_write32(field, evmcs_read32(field) & ~mask);
2447
8a86aea9 2448 __vmcs_writel(field, __vmcs_readl(field) & ~mask);
2ab455cc
AL
2449}
2450
8a86aea9 2451static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
2ab455cc 2452{
8a86aea9
PB
2453 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
2454 "vmcs_set_bits does not support 64-bit fields");
773e8a04
VK
2455 if (static_branch_unlikely(&enable_evmcs))
2456 return evmcs_write32(field, evmcs_read32(field) | mask);
2457
8a86aea9 2458 __vmcs_writel(field, __vmcs_readl(field) | mask);
2ab455cc
AL
2459}
2460
8391ce44
PB
2461static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx)
2462{
2463 vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS);
2464}
2465
2961e876
GN
2466static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
2467{
2468 vmcs_write32(VM_ENTRY_CONTROLS, val);
2469 vmx->vm_entry_controls_shadow = val;
2470}
2471
2472static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
2473{
2474 if (vmx->vm_entry_controls_shadow != val)
2475 vm_entry_controls_init(vmx, val);
2476}
2477
2478static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
2479{
2480 return vmx->vm_entry_controls_shadow;
2481}
2482
2483
2484static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
2485{
2486 vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
2487}
2488
2489static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
2490{
2491 vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
2492}
2493
8391ce44
PB
2494static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx)
2495{
2496 vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS);
2497}
2498
2961e876
GN
2499static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
2500{
2501 vmcs_write32(VM_EXIT_CONTROLS, val);
2502 vmx->vm_exit_controls_shadow = val;
2503}
2504
2505static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
2506{
2507 if (vmx->vm_exit_controls_shadow != val)
2508 vm_exit_controls_init(vmx, val);
2509}
2510
2511static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
2512{
2513 return vmx->vm_exit_controls_shadow;
2514}
2515
2516
2517static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
2518{
2519 vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
2520}
2521
2522static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
2523{
2524 vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
2525}
2526
2fb92db1
AK
2527static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
2528{
2529 vmx->segment_cache.bitmask = 0;
2530}
2531
2532static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
2533 unsigned field)
2534{
2535 bool ret;
2536 u32 mask = 1 << (seg * SEG_FIELD_NR + field);
2537
2538 if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
2539 vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
2540 vmx->segment_cache.bitmask = 0;
2541 }
2542 ret = vmx->segment_cache.bitmask & mask;
2543 vmx->segment_cache.bitmask |= mask;
2544 return ret;
2545}
2546
2547static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
2548{
2549 u16 *p = &vmx->segment_cache.seg[seg].selector;
2550
2551 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
2552 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
2553 return *p;
2554}
2555
2556static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
2557{
2558 ulong *p = &vmx->segment_cache.seg[seg].base;
2559
2560 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
2561 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
2562 return *p;
2563}
2564
2565static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
2566{
2567 u32 *p = &vmx->segment_cache.seg[seg].limit;
2568
2569 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
2570 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
2571 return *p;
2572}
2573
2574static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
2575{
2576 u32 *p = &vmx->segment_cache.seg[seg].ar;
2577
2578 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
2579 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
2580 return *p;
2581}
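
/*
 * Illustrative note, not part of the original file: the segment cache turns
 * repeated segment queries during a single exit into a single VMREAD.  For
 * example, two back-to-back calls like
 *
 *	sel = vmx_read_guest_seg_selector(vmx, VCPU_SREG_CS);
 *	sel = vmx_read_guest_seg_selector(vmx, VCPU_SREG_CS);
 *
 * issue only one vmcs_read16(); the second call finds SEG_FIELD_SEL already
 * set in segment_cache.bitmask and returns the cached value.  Code that can
 * change guest segment state invalidates the cache with
 * vmx_segment_cache_clear().
 */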
2582
abd3f2d6
AK
2583static void update_exception_bitmap(struct kvm_vcpu *vcpu)
2584{
2585 u32 eb;
2586
fd7373cc 2587 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
bd7e5b08 2588 (1u << DB_VECTOR) | (1u << AC_VECTOR);
9e869480
LA
2589 /*
2590 * Guest access to VMware backdoor ports could legitimately
2591 * trigger #GP because of TSS I/O permission bitmap.
2592 * We intercept those #GP and allow access to them anyway
2593 * as VMware does.
2594 */
2595 if (enable_vmware_backdoor)
2596 eb |= (1u << GP_VECTOR);
fd7373cc
JK
2597 if ((vcpu->guest_debug &
2598 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
2599 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
2600 eb |= 1u << BP_VECTOR;
7ffd92c5 2601 if (to_vmx(vcpu)->rmode.vm86_active)
abd3f2d6 2602 eb = ~0;
089d034e 2603 if (enable_ept)
1439442c 2604 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
36cf24e0
NHE
2605
2606 /* When we are running a nested L2 guest and L1 specified for it a
2607 * certain exception bitmap, we must trap the same exceptions and pass
2608 * them to L1. When running L2, we will only handle the exceptions
2609 * specified above if L1 did not want them.
2610 */
2611 if (is_guest_mode(vcpu))
2612 eb |= get_vmcs12(vcpu)->exception_bitmap;
2613
abd3f2d6
AK
2614 vmcs_write32(EXCEPTION_BITMAP, eb);
2615}
2616
d28b387f
KA
2617/*
2618 * Check if MSR is intercepted for currently loaded MSR bitmap.
2619 */
2620static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
2621{
2622 unsigned long *msr_bitmap;
2623 int f = sizeof(unsigned long);
2624
2625 if (!cpu_has_vmx_msr_bitmap())
2626 return true;
2627
2628 msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
2629
2630 if (msr <= 0x1fff) {
2631 return !!test_bit(msr, msr_bitmap + 0x800 / f);
2632 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2633 msr &= 0x1fff;
2634 return !!test_bit(msr, msr_bitmap + 0xc00 / f);
2635 }
2636
2637 return true;
2638}
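
/*
 * Illustrative sketch, not part of the original file: the MSR bitmap is a
 * single 4K page split into four 1K regions -- reads of MSRs 0x0-0x1fff,
 * reads of 0xc0000000-0xc0001fff, writes of 0x0-0x1fff and writes of
 * 0xc0000000-0xc0001fff.  msr_write_intercepted() above indexes the two
 * write regions at byte offsets 0x800 and 0xc00; a hypothetical read-side
 * twin would differ only in using offsets 0x0 and 0x400:
 */
static bool msr_read_intercepted(struct kvm_vcpu *vcpu, u32 msr)
{
	unsigned long *msr_bitmap;
	int f = sizeof(unsigned long);

	if (!cpu_has_vmx_msr_bitmap())
		return true;

	msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;

	if (msr <= 0x1fff)
		return !!test_bit(msr, msr_bitmap + 0x000 / f);
	else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
		return !!test_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);

	return true;
}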
2639
15d45071
AR
2640/*
2641 * Check if MSR is intercepted for L01 MSR bitmap.
2642 */
2643static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
2644{
2645 unsigned long *msr_bitmap;
2646 int f = sizeof(unsigned long);
2647
2648 if (!cpu_has_vmx_msr_bitmap())
2649 return true;
2650
2651 msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
2652
2653 if (msr <= 0x1fff) {
2654 return !!test_bit(msr, msr_bitmap + 0x800 / f);
2655 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2656 msr &= 0x1fff;
2657 return !!test_bit(msr, msr_bitmap + 0xc00 / f);
2658 }
2659
2660 return true;
2661}
2662
2961e876
GN
2663static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
2664 unsigned long entry, unsigned long exit)
8bf00a52 2665{
2961e876
GN
2666 vm_entry_controls_clearbit(vmx, entry);
2667 vm_exit_controls_clearbit(vmx, exit);
8bf00a52
GN
2668}
2669
ca83b4a7
KRW
2670static int find_msr(struct vmx_msrs *m, unsigned int msr)
2671{
2672 unsigned int i;
2673
2674 for (i = 0; i < m->nr; ++i) {
2675 if (m->val[i].index == msr)
2676 return i;
2677 }
2678 return -ENOENT;
2679}
2680
61d2ef2c
AK
2681static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
2682{
ca83b4a7 2683 int i;
61d2ef2c
AK
2684 struct msr_autoload *m = &vmx->msr_autoload;
2685
8bf00a52
GN
2686 switch (msr) {
2687 case MSR_EFER:
2688 if (cpu_has_load_ia32_efer) {
2961e876
GN
2689 clear_atomic_switch_msr_special(vmx,
2690 VM_ENTRY_LOAD_IA32_EFER,
8bf00a52
GN
2691 VM_EXIT_LOAD_IA32_EFER);
2692 return;
2693 }
2694 break;
2695 case MSR_CORE_PERF_GLOBAL_CTRL:
2696 if (cpu_has_load_perf_global_ctrl) {
2961e876 2697 clear_atomic_switch_msr_special(vmx,
8bf00a52
GN
2698 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
2699 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
2700 return;
2701 }
2702 break;
110312c8 2703 }
ca83b4a7
KRW
2704 i = find_msr(&m->guest, msr);
2705 if (i < 0)
31907093 2706 goto skip_guest;
33966dd6 2707 --m->guest.nr;
33966dd6 2708 m->guest.val[i] = m->guest.val[m->guest.nr];
33966dd6 2709 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
110312c8 2710
31907093
KRW
2711skip_guest:
2712 i = find_msr(&m->host, msr);
2713 if (i < 0)
61d2ef2c 2714 return;
31907093
KRW
2715
2716 --m->host.nr;
2717 m->host.val[i] = m->host.val[m->host.nr];
33966dd6 2718 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
61d2ef2c
AK
2719}
2720
2961e876
GN
2721static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
2722 unsigned long entry, unsigned long exit,
2723 unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
2724 u64 guest_val, u64 host_val)
8bf00a52
GN
2725{
2726 vmcs_write64(guest_val_vmcs, guest_val);
5a5e8a15
SC
2727 if (host_val_vmcs != HOST_IA32_EFER)
2728 vmcs_write64(host_val_vmcs, host_val);
2961e876
GN
2729 vm_entry_controls_setbit(vmx, entry);
2730 vm_exit_controls_setbit(vmx, exit);
8bf00a52
GN
2731}
2732
61d2ef2c 2733static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
989e3992 2734 u64 guest_val, u64 host_val, bool entry_only)
61d2ef2c 2735{
989e3992 2736 int i, j = 0;
61d2ef2c
AK
2737 struct msr_autoload *m = &vmx->msr_autoload;
2738
8bf00a52
GN
2739 switch (msr) {
2740 case MSR_EFER:
2741 if (cpu_has_load_ia32_efer) {
2961e876
GN
2742 add_atomic_switch_msr_special(vmx,
2743 VM_ENTRY_LOAD_IA32_EFER,
8bf00a52
GN
2744 VM_EXIT_LOAD_IA32_EFER,
2745 GUEST_IA32_EFER,
2746 HOST_IA32_EFER,
2747 guest_val, host_val);
2748 return;
2749 }
2750 break;
2751 case MSR_CORE_PERF_GLOBAL_CTRL:
2752 if (cpu_has_load_perf_global_ctrl) {
2961e876 2753 add_atomic_switch_msr_special(vmx,
8bf00a52
GN
2754 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
2755 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
2756 GUEST_IA32_PERF_GLOBAL_CTRL,
2757 HOST_IA32_PERF_GLOBAL_CTRL,
2758 guest_val, host_val);
2759 return;
2760 }
2761 break;
7099e2e1
RK
2762 case MSR_IA32_PEBS_ENABLE:
2763 /* PEBS needs a quiescent period after being disabled (to write
2764 * a record). Disabling PEBS through VMX MSR swapping doesn't
2765 * provide that period, so a CPU could write host's record into
2766 * guest's memory.
2767 */
2768 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
110312c8
AK
2769 }
2770
ca83b4a7 2771 i = find_msr(&m->guest, msr);
989e3992
KRW
2772 if (!entry_only)
2773 j = find_msr(&m->host, msr);
61d2ef2c 2774
31907093 2775 if (i == NR_AUTOLOAD_MSRS || j == NR_AUTOLOAD_MSRS) {
60266204 2776 printk_once(KERN_WARNING "Not enough msr switch entries. "
e7fc6f93
GN
2777 "Can't add msr %x\n", msr);
2778 return;
61d2ef2c 2779 }
31907093 2780 if (i < 0) {
ca83b4a7 2781 i = m->guest.nr++;
33966dd6 2782 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
31907093 2783 }
989e3992
KRW
2784 m->guest.val[i].index = msr;
2785 m->guest.val[i].value = guest_val;
2786
2787 if (entry_only)
2788 return;
61d2ef2c 2789
31907093
KRW
2790 if (j < 0) {
2791 j = m->host.nr++;
33966dd6 2792 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
61d2ef2c 2793 }
31907093
KRW
2794 m->host.val[j].index = msr;
2795 m->host.val[j].value = host_val;
61d2ef2c
AK
2796}
2797
92c0d900 2798static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
2cc51560 2799{
844a5fe2
PB
2800 u64 guest_efer = vmx->vcpu.arch.efer;
2801 u64 ignore_bits = 0;
2802
2803 if (!enable_ept) {
2804 /*
2805 * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing
2806 * host CPUID is more efficient than testing guest CPUID
2807 * or CR4. Host SMEP is anyway a requirement for guest SMEP.
2808 */
2809 if (boot_cpu_has(X86_FEATURE_SMEP))
2810 guest_efer |= EFER_NX;
2811 else if (!(guest_efer & EFER_NX))
2812 ignore_bits |= EFER_NX;
2813 }
3a34a881 2814
51c6cf66 2815 /*
844a5fe2 2816 * LMA and LME handled by hardware; SCE meaningless outside long mode.
51c6cf66 2817 */
844a5fe2 2818 ignore_bits |= EFER_SCE;
51c6cf66
AK
2819#ifdef CONFIG_X86_64
2820 ignore_bits |= EFER_LMA | EFER_LME;
2821 /* SCE is meaningful only in long mode on Intel */
2822 if (guest_efer & EFER_LMA)
2823 ignore_bits &= ~(u64)EFER_SCE;
2824#endif
84ad33ef 2825
f6577a5f
AL
2826 /*
2827 * On EPT, we can't emulate NX, so we must switch EFER atomically.
2828 * On CPUs that support "load IA32_EFER", always switch EFER
2829 * atomically, since it's faster than switching it manually.
2830 */
2831 if (cpu_has_load_ia32_efer ||
2832 (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
84ad33ef
AK
2833 if (!(guest_efer & EFER_LMA))
2834 guest_efer &= ~EFER_LME;
54b98bff
AL
2835 if (guest_efer != host_efer)
2836 add_atomic_switch_msr(vmx, MSR_EFER,
989e3992 2837 guest_efer, host_efer, false);
02343cf2
SC
2838 else
2839 clear_atomic_switch_msr(vmx, MSR_EFER);
84ad33ef 2840 return false;
844a5fe2 2841 } else {
02343cf2
SC
2842 clear_atomic_switch_msr(vmx, MSR_EFER);
2843
844a5fe2
PB
2844 guest_efer &= ~ignore_bits;
2845 guest_efer |= host_efer & ignore_bits;
2846
2847 vmx->guest_msrs[efer_offset].data = guest_efer;
2848 vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
84ad33ef 2849
844a5fe2
PB
2850 return true;
2851 }
51c6cf66
AK
2852}
2853
e28baead
AL
2854#ifdef CONFIG_X86_32
2855/*
2856 * On 32-bit kernels, VM exits still load the FS and GS bases from the
2857 * VMCS rather than the segment table. KVM uses this helper to figure
2858 * out the current bases to poke them into the VMCS before entry.
2859 */
2d49ec72
GN
2860static unsigned long segment_base(u16 selector)
2861{
8c2e41f7 2862 struct desc_struct *table;
2d49ec72
GN
2863 unsigned long v;
2864
8c2e41f7 2865 if (!(selector & ~SEGMENT_RPL_MASK))
2d49ec72
GN
2866 return 0;
2867
45fc8757 2868 table = get_current_gdt_ro();
2d49ec72 2869
8c2e41f7 2870 if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
2d49ec72
GN
2871 u16 ldt_selector = kvm_read_ldt();
2872
8c2e41f7 2873 if (!(ldt_selector & ~SEGMENT_RPL_MASK))
2d49ec72
GN
2874 return 0;
2875
8c2e41f7 2876 table = (struct desc_struct *)segment_base(ldt_selector);
2d49ec72 2877 }
8c2e41f7 2878 v = get_desc_base(&table[selector >> 3]);
2d49ec72
GN
2879 return v;
2880}
e28baead 2881#endif
2d49ec72 2882
6d6095bd 2883static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
33ed6329 2884{
04d2cc77 2885 struct vcpu_vmx *vmx = to_vmx(vcpu);
d7ee039e 2886 struct vmcs_host_state *host_state;
51e8a8cc 2887#ifdef CONFIG_X86_64
35060ed6 2888 int cpu = raw_smp_processor_id();
51e8a8cc 2889#endif
e368b875
SC
2890 unsigned long fs_base, gs_base;
2891 u16 fs_sel, gs_sel;
26bb0981 2892 int i;
04d2cc77 2893
d264ee0c
SC
2894 vmx->req_immediate_exit = false;
2895
bd9966de 2896 if (vmx->loaded_cpu_state)
33ed6329
AK
2897 return;
2898
bd9966de 2899 vmx->loaded_cpu_state = vmx->loaded_vmcs;
d7ee039e 2900 host_state = &vmx->loaded_cpu_state->host_state;
bd9966de 2901
33ed6329
AK
2902 /*
2903 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
2904 * allow segment selectors with cpl > 0 or ti == 1.
2905 */
d7ee039e 2906 host_state->ldt_sel = kvm_read_ldt();
42b933b5
VK
2907
2908#ifdef CONFIG_X86_64
d7ee039e
SC
2909 savesegment(ds, host_state->ds_sel);
2910 savesegment(es, host_state->es_sel);
e368b875
SC
2911
2912 gs_base = cpu_kernelmode_gs_base(cpu);
b062b794
VK
2913 if (likely(is_64bit_mm(current->mm))) {
2914 save_fsgs_for_kvm();
e368b875
SC
2915 fs_sel = current->thread.fsindex;
2916 gs_sel = current->thread.gsindex;
b062b794 2917 fs_base = current->thread.fsbase;
e368b875 2918 vmx->msr_host_kernel_gs_base = current->thread.gsbase;
b062b794 2919 } else {
e368b875
SC
2920 savesegment(fs, fs_sel);
2921 savesegment(gs, gs_sel);
b062b794 2922 fs_base = read_msr(MSR_FS_BASE);
e368b875 2923 vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
33ed6329 2924 }
b2da15ac 2925
4679b61f 2926 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
4fde8d57 2927#else
e368b875
SC
2928 savesegment(fs, fs_sel);
2929 savesegment(gs, gs_sel);
2930 fs_base = segment_base(fs_sel);
2931 gs_base = segment_base(gs_sel);
707c0874 2932#endif
e368b875 2933
8f21a0bb
SC
2934 if (unlikely(fs_sel != host_state->fs_sel)) {
2935 if (!(fs_sel & 7))
2936 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
2937 else
2938 vmcs_write16(HOST_FS_SELECTOR, 0);
2939 host_state->fs_sel = fs_sel;
2940 }
2941 if (unlikely(gs_sel != host_state->gs_sel)) {
2942 if (!(gs_sel & 7))
2943 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
2944 else
2945 vmcs_write16(HOST_GS_SELECTOR, 0);
2946 host_state->gs_sel = gs_sel;
2947 }
5e079c7e
SC
2948 if (unlikely(fs_base != host_state->fs_base)) {
2949 vmcs_writel(HOST_FS_BASE, fs_base);
2950 host_state->fs_base = fs_base;
2951 }
2952 if (unlikely(gs_base != host_state->gs_base)) {
2953 vmcs_writel(HOST_GS_BASE, gs_base);
2954 host_state->gs_base = gs_base;
2955 }
707c0874 2956
26bb0981
AK
2957 for (i = 0; i < vmx->save_nmsrs; ++i)
2958 kvm_set_shared_msr(vmx->guest_msrs[i].index,
d5696725
AK
2959 vmx->guest_msrs[i].data,
2960 vmx->guest_msrs[i].mask);
33ed6329
AK
2961}
2962
6d6095bd 2963static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
33ed6329 2964{
d7ee039e
SC
2965 struct vmcs_host_state *host_state;
2966
bd9966de 2967 if (!vmx->loaded_cpu_state)
33ed6329
AK
2968 return;
2969
bd9966de 2970 WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs);
d7ee039e 2971 host_state = &vmx->loaded_cpu_state->host_state;
bd9966de 2972
e1beb1d3 2973 ++vmx->vcpu.stat.host_state_reload;
bd9966de
SC
2974 vmx->loaded_cpu_state = NULL;
2975
c8770e7b 2976#ifdef CONFIG_X86_64
4679b61f 2977 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
c8770e7b 2978#endif
d7ee039e
SC
2979 if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
2980 kvm_load_ldt(host_state->ldt_sel);
33ed6329 2981#ifdef CONFIG_X86_64
d7ee039e 2982 load_gs_index(host_state->gs_sel);
9581d442 2983#else
d7ee039e 2984 loadsegment(gs, host_state->gs_sel);
33ed6329 2985#endif
33ed6329 2986 }
d7ee039e
SC
2987 if (host_state->fs_sel & 7)
2988 loadsegment(fs, host_state->fs_sel);
b2da15ac 2989#ifdef CONFIG_X86_64
d7ee039e
SC
2990 if (unlikely(host_state->ds_sel | host_state->es_sel)) {
2991 loadsegment(ds, host_state->ds_sel);
2992 loadsegment(es, host_state->es_sel);
b2da15ac 2993 }
b2da15ac 2994#endif
b7ffc44d 2995 invalidate_tss_limit();
44ea2b17 2996#ifdef CONFIG_X86_64
c8770e7b 2997 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
44ea2b17 2998#endif
45fc8757 2999 load_fixmap_gdt(raw_smp_processor_id());
33ed6329
AK
3000}
3001
678e315e
SC
3002#ifdef CONFIG_X86_64
3003static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
a9b21b62 3004{
4679b61f
PB
3005 preempt_disable();
3006 if (vmx->loaded_cpu_state)
3007 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
3008 preempt_enable();
678e315e 3009 return vmx->msr_guest_kernel_gs_base;
a9b21b62
AK
3010}
3011
678e315e
SC
3012static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
3013{
4679b61f
PB
3014 preempt_disable();
3015 if (vmx->loaded_cpu_state)
3016 wrmsrl(MSR_KERNEL_GS_BASE, data);
3017 preempt_enable();
678e315e
SC
3018 vmx->msr_guest_kernel_gs_base = data;
3019}
3020#endif
3021
28b835d6
FW
3022static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
3023{
3024 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
3025 struct pi_desc old, new;
3026 unsigned int dest;
3027
31afb2ea
PB
3028 /*
3029 * In case of hot-plug or hot-unplug, we may have to undo
3030 * vmx_vcpu_pi_put even if there is no assigned device. And we
3031 * always keep PI.NDST up to date for simplicity: it makes the
3032 * code easier, and CPU migration is not a fast path.
3033 */
3034 if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
28b835d6
FW
3035 return;
3036
31afb2ea
PB
3037 /*
3038 * First handle the simple case where no cmpxchg is necessary; just
3039 * allow posting non-urgent interrupts.
3040 *
3041 * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
3042 * PI.NDST: pi_post_block will do it for us and the wakeup_handler
3043 * expects the VCPU to be on the blocked_vcpu_list that matches
3044 * PI.NDST.
3045 */
3046 if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
3047 vcpu->cpu == cpu) {
3048 pi_clear_sn(pi_desc);
28b835d6 3049 return;
31afb2ea 3050 }
28b835d6 3051
31afb2ea 3052 /* The full case. */
28b835d6
FW
3053 do {
3054 old.control = new.control = pi_desc->control;
3055
31afb2ea 3056 dest = cpu_physical_id(cpu);
28b835d6 3057
31afb2ea
PB
3058 if (x2apic_enabled())
3059 new.ndst = dest;
3060 else
3061 new.ndst = (dest << 8) & 0xFF00;
28b835d6 3062
28b835d6 3063 new.sn = 0;
c0a1666b
PB
3064 } while (cmpxchg64(&pi_desc->control, old.control,
3065 new.control) != old.control);
28b835d6 3066}
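
/*
 * Worked example (illustrative): the posted-interrupt NDST field holds the
 * notification destination in the format the IOMMU will use.  For a vCPU
 * migrated to a CPU whose physical APIC ID is 5, the loop above sets
 *
 *	new.ndst = 5          in x2APIC mode, or
 *	new.ndst = 5 << 8     (= 0x0500, destination in bits 15:8) in xAPIC mode,
 *
 * and clears SN so that non-urgent interrupts can be posted again.
 */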
1be0e61c 3067
c95ba92a
PF
3068static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
3069{
3070 vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
3071 vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
3072}
3073
6aa8b732
AK
3074/*
3075 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
3076 * vcpu mutex is already taken.
3077 */
15ad7146 3078static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
6aa8b732 3079{
a2fa3e9f 3080 struct vcpu_vmx *vmx = to_vmx(vcpu);
b80c76ec 3081 bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
6aa8b732 3082
b80c76ec 3083 if (!already_loaded) {
fe0e80be 3084 loaded_vmcs_clear(vmx->loaded_vmcs);
92fe13be 3085 local_irq_disable();
8f536b76 3086 crash_disable_local_vmclear(cpu);
5a560f8b
XG
3087
3088 /*
3089 * Read loaded_vmcs->cpu should be before fetching
3090 * loaded_vmcs->loaded_vmcss_on_cpu_link.
3091 * See the comments in __loaded_vmcs_clear().
3092 */
3093 smp_rmb();
3094
d462b819
NHE
3095 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
3096 &per_cpu(loaded_vmcss_on_cpu, cpu));
8f536b76 3097 crash_enable_local_vmclear(cpu);
92fe13be 3098 local_irq_enable();
b80c76ec
JM
3099 }
3100
3101 if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
3102 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
3103 vmcs_load(vmx->loaded_vmcs->vmcs);
15d45071 3104 indirect_branch_prediction_barrier();
b80c76ec
JM
3105 }
3106
3107 if (!already_loaded) {
59c58ceb 3108 void *gdt = get_current_gdt_ro();
b80c76ec
JM
3109 unsigned long sysenter_esp;
3110
3111 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
92fe13be 3112
6aa8b732
AK
3113 /*
3114 * Linux uses per-cpu TSS and GDT, so set these when switching
e0c23063 3115 * processors. See 22.2.4.
6aa8b732 3116 */
e0c23063 3117 vmcs_writel(HOST_TR_BASE,
72f5e08d 3118 (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
59c58ceb 3119 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
6aa8b732 3120
b7ffc44d
AL
3121 /*
3122 * VM exits change the host TR limit to 0x67 after a VM
3123 * exit. This is okay, since 0x67 covers everything except
3124		 * the IO bitmap and we have code to handle the IO bitmap
3125 * being lost after a VM exit.
3126 */
3127 BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
3128
6aa8b732
AK
3129 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
3130 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
ff2c3a18 3131
d462b819 3132 vmx->loaded_vmcs->cpu = cpu;
6aa8b732 3133 }
28b835d6 3134
2680d6da
OH
3135 /* Setup TSC multiplier */
3136 if (kvm_has_tsc_control &&
c95ba92a
PF
3137 vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
3138 decache_tsc_multiplier(vmx);
2680d6da 3139
28b835d6 3140 vmx_vcpu_pi_load(vcpu, cpu);
1be0e61c 3141 vmx->host_pkru = read_pkru();
74c55931 3142 vmx->host_debugctlmsr = get_debugctlmsr();
28b835d6
FW
3143}
3144
3145static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
3146{
3147 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
3148
3149 if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
a0052191
YZ
3150 !irq_remapping_cap(IRQ_POSTING_CAP) ||
3151 !kvm_vcpu_apicv_active(vcpu))
28b835d6
FW
3152 return;
3153
3154 /* Set SN when the vCPU is preempted */
3155 if (vcpu->preempted)
3156 pi_set_sn(pi_desc);
6aa8b732
AK
3157}
3158
3159static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
3160{
28b835d6
FW
3161 vmx_vcpu_pi_put(vcpu);
3162
6d6095bd 3163 vmx_prepare_switch_to_host(to_vmx(vcpu));
6aa8b732
AK
3164}
3165
f244deed
WL
3166static bool emulation_required(struct kvm_vcpu *vcpu)
3167{
3168 return emulate_invalid_guest_state && !guest_state_valid(vcpu);
3169}
3170
edcafe3c
AK
3171static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
3172
fe3ef05c
NHE
3173/*
3174 * Return the cr0 value that a nested guest would read. This is a combination
3175 * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
3176 * its hypervisor (cr0_read_shadow).
3177 */
3178static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
3179{
3180 return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
3181 (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
3182}
3183static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
3184{
3185 return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
3186 (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
3187}
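
/*
 * Worked example (illustrative): suppose L1 owns CR0.TS, i.e.
 * cr0_guest_host_mask has X86_CR0_TS set, the real guest_cr0 used to run L2
 * has TS=1 (e.g. because KVM is lazily managing the FPU), and L1 last wrote
 * TS=0 into cr0_read_shadow.  Then nested_read_cr0() returns TS=0: for bits
 * in the mask the shadow value wins, for all other bits the real guest_cr0
 * value is returned.
 */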
3188
6aa8b732
AK
3189static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
3190{
78ac8b47 3191 unsigned long rflags, save_rflags;
345dcaa8 3192
6de12732
AK
3193 if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
3194 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
3195 rflags = vmcs_readl(GUEST_RFLAGS);
3196 if (to_vmx(vcpu)->rmode.vm86_active) {
3197 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
3198 save_rflags = to_vmx(vcpu)->rmode.save_rflags;
3199 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
3200 }
3201 to_vmx(vcpu)->rflags = rflags;
78ac8b47 3202 }
6de12732 3203 return to_vmx(vcpu)->rflags;
6aa8b732
AK
3204}
3205
3206static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
3207{
f244deed
WL
3208 unsigned long old_rflags = vmx_get_rflags(vcpu);
3209
6de12732
AK
3210 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
3211 to_vmx(vcpu)->rflags = rflags;
78ac8b47
AK
3212 if (to_vmx(vcpu)->rmode.vm86_active) {
3213 to_vmx(vcpu)->rmode.save_rflags = rflags;
053de044 3214 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
78ac8b47 3215 }
6aa8b732 3216 vmcs_writel(GUEST_RFLAGS, rflags);
f244deed
WL
3217
3218 if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM)
3219 to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
6aa8b732
AK
3220}
3221
37ccdcbe 3222static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
2809f5d2
GC
3223{
3224 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3225 int ret = 0;
3226
3227 if (interruptibility & GUEST_INTR_STATE_STI)
48005f64 3228 ret |= KVM_X86_SHADOW_INT_STI;
2809f5d2 3229 if (interruptibility & GUEST_INTR_STATE_MOV_SS)
48005f64 3230 ret |= KVM_X86_SHADOW_INT_MOV_SS;
2809f5d2 3231
37ccdcbe 3232 return ret;
2809f5d2
GC
3233}
3234
3235static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
3236{
3237 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3238 u32 interruptibility = interruptibility_old;
3239
3240 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
3241
48005f64 3242 if (mask & KVM_X86_SHADOW_INT_MOV_SS)
2809f5d2 3243 interruptibility |= GUEST_INTR_STATE_MOV_SS;
48005f64 3244 else if (mask & KVM_X86_SHADOW_INT_STI)
2809f5d2
GC
3245 interruptibility |= GUEST_INTR_STATE_STI;
3246
3247 if ((interruptibility != interruptibility_old))
3248 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
3249}
3250
6aa8b732
AK
3251static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
3252{
3253 unsigned long rip;
6aa8b732 3254
5fdbf976 3255 rip = kvm_rip_read(vcpu);
6aa8b732 3256 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
5fdbf976 3257 kvm_rip_write(vcpu, rip);
6aa8b732 3258
2809f5d2
GC
3259 /* skipping an emulated instruction also counts */
3260 vmx_set_interrupt_shadow(vcpu, 0);
6aa8b732
AK
3261}
3262
b96fb439
PB
3263static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
3264 unsigned long exit_qual)
3265{
3266 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3267 unsigned int nr = vcpu->arch.exception.nr;
3268 u32 intr_info = nr | INTR_INFO_VALID_MASK;
3269
3270 if (vcpu->arch.exception.has_error_code) {
3271 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
3272 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3273 }
3274
3275 if (kvm_exception_is_soft(nr))
3276 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3277 else
3278 intr_info |= INTR_TYPE_HARD_EXCEPTION;
3279
3280 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
3281 vmx_get_nmi_mask(vcpu))
3282 intr_info |= INTR_INFO_UNBLOCK_NMI;
3283
3284 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
3285}
3286
0b6ac343
NHE
3287/*
 3288 * KVM wants to re-inject page faults it received into the guest. This function
 3289 * checks whether, for a nested guest, they need to be injected into L1 or L2.
0b6ac343 3290 */
bfcf83b1 3291static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
0b6ac343
NHE
3292{
3293 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
adfe20fb 3294 unsigned int nr = vcpu->arch.exception.nr;
da998b46
JM
3295 bool has_payload = vcpu->arch.exception.has_payload;
3296 unsigned long payload = vcpu->arch.exception.payload;
0b6ac343 3297
b96fb439
PB
3298 if (nr == PF_VECTOR) {
3299 if (vcpu->arch.exception.nested_apf) {
bfcf83b1 3300 *exit_qual = vcpu->arch.apf.nested_apf_token;
b96fb439
PB
3301 return 1;
3302 }
b96fb439
PB
3303 if (nested_vmx_is_page_fault_vmexit(vmcs12,
3304 vcpu->arch.exception.error_code)) {
da998b46 3305 *exit_qual = has_payload ? payload : vcpu->arch.cr2;
b96fb439
PB
3306 return 1;
3307 }
f10c729f
JM
3308 } else if (vmcs12->exception_bitmap & (1u << nr)) {
3309 if (nr == DB_VECTOR) {
3310 if (!has_payload) {
3311 payload = vcpu->arch.dr6;
3312 payload &= ~(DR6_FIXED_1 | DR6_BT);
3313 payload ^= DR6_RTM;
cfb634fe 3314 }
f10c729f
JM
3315 *exit_qual = payload;
3316 } else
3317 *exit_qual = 0;
3318 return 1;
adfe20fb
WL
3319 }
3320
b96fb439 3321 return 0;
0b6ac343
NHE
3322}
3323
caa057a2
WL
3324static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
3325{
3326 /*
3327 * Ensure that we clear the HLT state in the VMCS. We don't need to
3328 * explicitly skip the instruction because if the HLT state is set,
3329 * then the instruction is already executing and RIP has already been
3330 * advanced.
3331 */
3332 if (kvm_hlt_in_guest(vcpu->kvm) &&
3333 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
3334 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
3335}
3336
cfcd20e5 3337static void vmx_queue_exception(struct kvm_vcpu *vcpu)
298101da 3338{
77ab6db0 3339 struct vcpu_vmx *vmx = to_vmx(vcpu);
cfcd20e5
WL
3340 unsigned nr = vcpu->arch.exception.nr;
3341 bool has_error_code = vcpu->arch.exception.has_error_code;
cfcd20e5 3342 u32 error_code = vcpu->arch.exception.error_code;
8ab2d2e2 3343 u32 intr_info = nr | INTR_INFO_VALID_MASK;
77ab6db0 3344
da998b46
JM
3345 kvm_deliver_exception_payload(vcpu);
3346
8ab2d2e2 3347 if (has_error_code) {
77ab6db0 3348 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
8ab2d2e2
JK
3349 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3350 }
77ab6db0 3351
7ffd92c5 3352 if (vmx->rmode.vm86_active) {
71f9833b
SH
3353 int inc_eip = 0;
3354 if (kvm_exception_is_soft(nr))
3355 inc_eip = vcpu->arch.event_exit_inst_len;
3356 if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
a92601bb 3357 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
77ab6db0
JK
3358 return;
3359 }
3360
add5ff7a
SC
3361 WARN_ON_ONCE(vmx->emulation_required);
3362
66fd3f7f
GN
3363 if (kvm_exception_is_soft(nr)) {
3364 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
3365 vmx->vcpu.arch.event_exit_inst_len);
8ab2d2e2
JK
3366 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3367 } else
3368 intr_info |= INTR_TYPE_HARD_EXCEPTION;
3369
3370 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
caa057a2
WL
3371
3372 vmx_clear_hlt(vcpu);
298101da
AK
3373}
3374
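/*
 * Illustrative sketch (not part of vmx.c): the VM-entry interruption
 * information word that vmx_queue_exception() assembles - vector in bits
 * 7:0, type in bits 10:8, bit 11 = error code valid, bit 31 = valid
 * (per the SDM).  The macro values are reproduced here only for the
 * demonstration and should be treated as assumptions.
 */
#include <stdio.h>
#include <stdint.h>

#define INTR_INFO_VALID_MASK		0x80000000u
#define INTR_INFO_DELIVER_CODE_MASK	0x00000800u
#define INTR_TYPE_HARD_EXCEPTION	(3u << 8)
#define INTR_TYPE_SOFT_EXCEPTION	(6u << 8)

static uint32_t build_intr_info(uint8_t vector, int has_error_code, int soft)
{
	uint32_t info = vector | INTR_INFO_VALID_MASK;

	if (has_error_code)
		info |= INTR_INFO_DELIVER_CODE_MASK;
	info |= soft ? INTR_TYPE_SOFT_EXCEPTION : INTR_TYPE_HARD_EXCEPTION;
	return info;
}

int main(void)
{
	/* a #GP (vector 13) with an error code: expect 0x80000b0d */
	printf("intr_info = %#x\n", build_intr_info(13, 1, 0));
	return 0;
}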
4e47c7a6
SY
3375static bool vmx_rdtscp_supported(void)
3376{
3377 return cpu_has_vmx_rdtscp();
3378}
3379
ad756a16
MJ
3380static bool vmx_invpcid_supported(void)
3381{
eb4b248e 3382 return cpu_has_vmx_invpcid();
ad756a16
MJ
3383}
3384
a75beee6
ED
3385/*
3386 * Swap MSR entry in host/guest MSR entry array.
3387 */
8b9cf98c 3388static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
a75beee6 3389{
26bb0981 3390 struct shared_msr_entry tmp;
a2fa3e9f
GH
3391
3392 tmp = vmx->guest_msrs[to];
3393 vmx->guest_msrs[to] = vmx->guest_msrs[from];
3394 vmx->guest_msrs[from] = tmp;
a75beee6
ED
3395}
3396
e38aea3e
AK
3397/*
3398 * Set up the vmcs to automatically save and restore system
3399 * msrs. Don't touch the 64-bit msrs if the guest is in legacy
3400 * mode, as fiddling with msrs is very expensive.
3401 */
8b9cf98c 3402static void setup_msrs(struct vcpu_vmx *vmx)
e38aea3e 3403{
26bb0981 3404 int save_nmsrs, index;
e38aea3e 3405
a75beee6
ED
3406 save_nmsrs = 0;
3407#ifdef CONFIG_X86_64
8b9cf98c 3408 if (is_long_mode(&vmx->vcpu)) {
8b9cf98c 3409 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
a75beee6 3410 if (index >= 0)
8b9cf98c
RR
3411 move_msr_up(vmx, index, save_nmsrs++);
3412 index = __find_msr_index(vmx, MSR_LSTAR);
a75beee6 3413 if (index >= 0)
8b9cf98c
RR
3414 move_msr_up(vmx, index, save_nmsrs++);
3415 index = __find_msr_index(vmx, MSR_CSTAR);
a75beee6 3416 if (index >= 0)
8b9cf98c 3417 move_msr_up(vmx, index, save_nmsrs++);
4e47c7a6 3418 index = __find_msr_index(vmx, MSR_TSC_AUX);
d6321d49 3419 if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
4e47c7a6 3420 move_msr_up(vmx, index, save_nmsrs++);
a75beee6 3421 /*
8c06585d 3422 * MSR_STAR is only needed on long mode guests, and only
a75beee6
ED
3423 * if efer.sce is enabled.
3424 */
8c06585d 3425 index = __find_msr_index(vmx, MSR_STAR);
f6801dff 3426 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
8b9cf98c 3427 move_msr_up(vmx, index, save_nmsrs++);
a75beee6
ED
3428 }
3429#endif
92c0d900
AK
3430 index = __find_msr_index(vmx, MSR_EFER);
3431 if (index >= 0 && update_transition_efer(vmx, index))
26bb0981 3432 move_msr_up(vmx, index, save_nmsrs++);
e38aea3e 3433
26bb0981 3434 vmx->save_nmsrs = save_nmsrs;
5897297b 3435
8d14695f 3436 if (cpu_has_vmx_msr_bitmap())
904e14fb 3437 vmx_update_msr_bitmap(&vmx->vcpu);
e38aea3e
AK
3438}
3439
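/*
 * Illustrative sketch (not part of vmx.c): setup_msrs() compacts the MSRs
 * that really need to be switched for this guest to the front of the
 * guest_msrs[] array by swapping entries forward, so only the first
 * save_nmsrs entries are touched on the hot path.  The structure and the
 * chosen MSR indices below are simplified stand-ins for the example.
 */
#include <stdio.h>
#include <stdint.h>

struct msr_entry { uint32_t index; uint64_t data; };

static void move_up(struct msr_entry *arr, int from, int to)
{
	struct msr_entry tmp = arr[to];

	arr[to] = arr[from];
	arr[from] = tmp;
}

int main(void)
{
	struct msr_entry msrs[] = {
		{ 0xc0000081, 0 },	/* STAR   */
		{ 0xc0000084, 0 },	/* SFMASK */
		{ 0xc0000082, 0 },	/* LSTAR  */
	};
	int save_nmsrs = 0;

	/* pretend only SFMASK and LSTAR are needed for this guest */
	move_up(msrs, 1, save_nmsrs++);
	move_up(msrs, 2, save_nmsrs++);

	for (int i = 0; i < save_nmsrs; i++)
		printf("active[%d] = %#x\n", i, msrs[i].index);
	return 0;
}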
e79f245d 3440static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
6aa8b732 3441{
e79f245d 3442 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6aa8b732 3443
e79f245d
KA
3444 if (is_guest_mode(vcpu) &&
3445 (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
3446 return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
3447
3448 return vcpu->arch.tsc_offset;
6aa8b732
AK
3449}
3450
3451/*
99e3e30a 3452 * writes 'offset' into guest's timestamp counter offset register
6aa8b732 3453 */
99e3e30a 3454static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
6aa8b732 3455{
27fc51b2 3456 if (is_guest_mode(vcpu)) {
7991825b 3457 /*
27fc51b2
NHE
3458 * We're here if L1 chose not to trap WRMSR to TSC. According
 3459 * to the spec, this should set L1's TSC. The offset that L1
3460 * set for L2 remains unchanged, and still needs to be added
3461 * to the newly set TSC to get L2's TSC.
7991825b 3462 */
27fc51b2 3463 struct vmcs12 *vmcs12;
27fc51b2
NHE
3464 /* recalculate vmcs02.TSC_OFFSET: */
3465 vmcs12 = get_vmcs12(vcpu);
3466 vmcs_write64(TSC_OFFSET, offset +
3467 (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
3468 vmcs12->tsc_offset : 0));
3469 } else {
489223ed
YY
3470 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
3471 vmcs_read64(TSC_OFFSET), offset);
27fc51b2
NHE
3472 vmcs_write64(TSC_OFFSET, offset);
3473 }
6aa8b732
AK
3474}
3475
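/*
 * Illustrative sketch (not part of vmx.c): the arithmetic behind
 * vmx_read_l1_tsc_offset() and vmx_write_tsc_offset() while L2 runs with
 * CPU_BASED_USE_TSC_OFFSETING.  The numbers are made-up example values.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t l1_offset  = 1000;	/* offset L0 applies on behalf of L1 */
	uint64_t l12_offset = 250;	/* vmcs12->tsc_offset chosen by L1   */

	/* while L2 runs, the effective offset is the sum of both */
	uint64_t vcpu_offset = l1_offset + l12_offset;

	/* vmx_read_l1_tsc_offset() recovers L1's own offset */
	printf("L1 offset = %llu\n",
	       (unsigned long long)(vcpu_offset - l12_offset));

	/* an L1 WRMSR that changes its offset leaves vmcs12->tsc_offset alone */
	uint64_t new_l1_offset = 1500;
	printf("new vmcs02.TSC_OFFSET = %llu\n",
	       (unsigned long long)(new_l1_offset + l12_offset));
	return 0;
}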
801d3424
NHE
3476/*
3477 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
3478 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
3479 * all guests if the "nested" module option is off, and can also be disabled
3480 * for a single guest by disabling its VMX cpuid bit.
3481 */
3482static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
3483{
d6321d49 3484 return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
801d3424
NHE
3485}
3486
b87a51ae
NHE
3487/*
3488 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
3489 * returned for the various VMX controls MSRs when nested VMX is enabled.
3490 * The same values should also be used to verify that vmcs12 control fields are
3491 * valid during nested entry from L1 to L2.
3492 * Each of these control msrs has a low and high 32-bit half: A low bit is on
3493 * if the corresponding bit in the (32-bit) control field *must* be on, and a
3494 * bit in the high half is on if the corresponding bit in the control field
3495 * may be on. See also vmx_control_verify().
b87a51ae 3496 */
6677f3da 3497static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
b87a51ae 3498{
1389309c
PB
3499 if (!nested) {
3500 memset(msrs, 0, sizeof(*msrs));
3501 return;
3502 }
3503
b87a51ae
NHE
3504 /*
3505 * Note that as a general rule, the high half of the MSRs (bits in
3506 * the control fields which may be 1) should be initialized by the
3507 * intersection of the underlying hardware's MSR (i.e., features which
3508 * can be supported) and the list of features we want to expose -
3509 * because they are known to be properly supported in our code.
3510 * Also, usually, the low half of the MSRs (bits which must be 1) can
3511 * be set to 0, meaning that L1 may turn off any of these bits. The
3512 * reason is that if one of these bits is necessary, it will appear
3513 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
3514 * fields of vmcs01 and vmcs02, will turn these bits off - and
7313c698 3515 * nested_vmx_exit_reflected() will not pass related exits to L1.
b87a51ae
NHE
3516 * These rules have exceptions below.
3517 */
3518
3519 /* pin-based controls */
eabeaacc 3520 rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
6677f3da
PB
3521 msrs->pinbased_ctls_low,
3522 msrs->pinbased_ctls_high);
3523 msrs->pinbased_ctls_low |=
b9c237bb 3524 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
6677f3da 3525 msrs->pinbased_ctls_high &=
b9c237bb
WV
3526 PIN_BASED_EXT_INTR_MASK |
3527 PIN_BASED_NMI_EXITING |
1389309c
PB
3528 PIN_BASED_VIRTUAL_NMIS |
3529 (apicv ? PIN_BASED_POSTED_INTR : 0);
6677f3da 3530 msrs->pinbased_ctls_high |=
b9c237bb 3531 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
0238ea91 3532 PIN_BASED_VMX_PREEMPTION_TIMER;
b87a51ae 3533
3dbcd8da 3534 /* exit controls */
c0dfee58 3535 rdmsr(MSR_IA32_VMX_EXIT_CTLS,
6677f3da
PB
3536 msrs->exit_ctls_low,
3537 msrs->exit_ctls_high);
3538 msrs->exit_ctls_low =
b9c237bb 3539 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
e0ba1a6f 3540
6677f3da 3541 msrs->exit_ctls_high &=
b87a51ae 3542#ifdef CONFIG_X86_64
c0dfee58 3543 VM_EXIT_HOST_ADDR_SPACE_SIZE |
b87a51ae 3544#endif
f4124500 3545 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
6677f3da 3546 msrs->exit_ctls_high |=
b9c237bb 3547 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
f4124500 3548 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
e0ba1a6f
BD
3549 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
3550
2996fca0 3551 /* We support free control of debug control saving. */
6677f3da 3552 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
2996fca0 3553
b87a51ae
NHE
3554 /* entry controls */
3555 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
6677f3da
PB
3556 msrs->entry_ctls_low,
3557 msrs->entry_ctls_high);
3558 msrs->entry_ctls_low =
b9c237bb 3559 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
6677f3da 3560 msrs->entry_ctls_high &=
57435349
JK
3561#ifdef CONFIG_X86_64
3562 VM_ENTRY_IA32E_MODE |
3563#endif
3564 VM_ENTRY_LOAD_IA32_PAT;
6677f3da 3565 msrs->entry_ctls_high |=
b9c237bb 3566 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
57435349 3567
2996fca0 3568 /* We support free control of debug control loading. */
6677f3da 3569 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
2996fca0 3570
b87a51ae
NHE
3571 /* cpu-based controls */
3572 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
6677f3da
PB
3573 msrs->procbased_ctls_low,
3574 msrs->procbased_ctls_high);
3575 msrs->procbased_ctls_low =
b9c237bb 3576 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
6677f3da 3577 msrs->procbased_ctls_high &=
a294c9bb
JK
3578 CPU_BASED_VIRTUAL_INTR_PENDING |
3579 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
b87a51ae
NHE
3580 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
3581 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
3582 CPU_BASED_CR3_STORE_EXITING |
3583#ifdef CONFIG_X86_64
3584 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
3585#endif
3586 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
5f3d45e7
MD
3587 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
3588 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
3589 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
3590 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
b87a51ae
NHE
3591 /*
3592 * We can allow some features even when not supported by the
3593 * hardware. For example, L1 can specify an MSR bitmap - and we
3594 * can use it to avoid exits to L1 - even when L0 runs L2
3595 * without MSR bitmaps.
3596 */
6677f3da 3597 msrs->procbased_ctls_high |=
b9c237bb 3598 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
560b7ee1 3599 CPU_BASED_USE_MSR_BITMAPS;
b87a51ae 3600
3dcdf3ec 3601 /* We support free control of CR3 access interception. */
6677f3da 3602 msrs->procbased_ctls_low &=
3dcdf3ec
JK
3603 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
3604
80154d77
PB
3605 /*
3606 * secondary cpu-based controls. Do not include those that
3607 * depend on CPUID bits, they are added later by vmx_cpuid_update.
3608 */
b87a51ae 3609 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
6677f3da
PB
3610 msrs->secondary_ctls_low,
3611 msrs->secondary_ctls_high);
3612 msrs->secondary_ctls_low = 0;
3613 msrs->secondary_ctls_high &=
1b07304c 3614 SECONDARY_EXEC_DESC |
f2b93280 3615 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
82f0dd4b 3616 SECONDARY_EXEC_APIC_REGISTER_VIRT |
608406e2 3617 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
3db13480 3618 SECONDARY_EXEC_WBINVD_EXITING;
2cf7ea9f 3619
32c7acf0
LA
3620 /*
3621 * We can emulate "VMCS shadowing," even if the hardware
3622 * doesn't support it.
3623 */
3624 msrs->secondary_ctls_high |=
3625 SECONDARY_EXEC_SHADOW_VMCS;
c18911a2 3626
afa61f75
NHE
3627 if (enable_ept) {
3628 /* nested EPT: emulate EPT also to L1 */
6677f3da 3629 msrs->secondary_ctls_high |=
0790ec17 3630 SECONDARY_EXEC_ENABLE_EPT;
6677f3da 3631 msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
7db74265 3632 VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
02120c45 3633 if (cpu_has_vmx_ept_execute_only())
6677f3da 3634 msrs->ept_caps |=
02120c45 3635 VMX_EPT_EXECUTE_ONLY_BIT;
6677f3da
PB
3636 msrs->ept_caps &= vmx_capability.ept;
3637 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
7db74265
PB
3638 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
3639 VMX_EPT_1GB_PAGE_BIT;
03efce6f 3640 if (enable_ept_ad_bits) {
6677f3da 3641 msrs->secondary_ctls_high |=
03efce6f 3642 SECONDARY_EXEC_ENABLE_PML;
6677f3da 3643 msrs->ept_caps |= VMX_EPT_AD_BIT;
03efce6f 3644 }
1c13bffd 3645 }
afa61f75 3646
27c42a1b 3647 if (cpu_has_vmx_vmfunc()) {
6677f3da 3648 msrs->secondary_ctls_high |=
27c42a1b 3649 SECONDARY_EXEC_ENABLE_VMFUNC;
41ab9372
BD
3650 /*
3651 * Advertise EPTP switching unconditionally
3652 * since we emulate it
3653 */
575b3a2c 3654 if (enable_ept)
6677f3da 3655 msrs->vmfunc_controls =
575b3a2c 3656 VMX_VMFUNC_EPTP_SWITCHING;
27c42a1b
BD
3657 }
3658
ef697a71
PB
3659 /*
3660 * Old versions of KVM use the single-context version without
3661 * checking for support, so declare that it is supported even
3662 * though it is treated as global context. The alternative is
3663 * not failing the single-context invvpid, and it is worse.
3664 */
63cb6d5f 3665 if (enable_vpid) {
6677f3da 3666 msrs->secondary_ctls_high |=
63cb6d5f 3667 SECONDARY_EXEC_ENABLE_VPID;
6677f3da 3668 msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
bcdde302 3669 VMX_VPID_EXTENT_SUPPORTED_MASK;
1c13bffd 3670 }
99b83ac8 3671
0790ec17 3672 if (enable_unrestricted_guest)
6677f3da 3673 msrs->secondary_ctls_high |=
0790ec17
RK
3674 SECONDARY_EXEC_UNRESTRICTED_GUEST;
3675
2cf7ea9f
PB
3676 if (flexpriority_enabled)
3677 msrs->secondary_ctls_high |=
3678 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
3679
c18911a2 3680 /* miscellaneous data */
b9c237bb 3681 rdmsr(MSR_IA32_VMX_MISC,
6677f3da
PB
3682 msrs->misc_low,
3683 msrs->misc_high);
3684 msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
3685 msrs->misc_low |=
f4160e45 3686 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
b9c237bb 3687 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
f4124500 3688 VMX_MISC_ACTIVITY_HLT;
6677f3da 3689 msrs->misc_high = 0;
62cc6b9d
DM
3690
3691 /*
3692 * This MSR reports some information about VMX support. We
3693 * should return information about the VMX we emulate for the
3694 * guest, and the VMCS structure we give it - not about the
3695 * VMX support of the underlying hardware.
3696 */
6677f3da 3697 msrs->basic =
62cc6b9d
DM
3698 VMCS12_REVISION |
3699 VMX_BASIC_TRUE_CTLS |
3700 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
3701 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
3702
3703 if (cpu_has_vmx_basic_inout())
6677f3da 3704 msrs->basic |= VMX_BASIC_INOUT;
62cc6b9d
DM
3705
3706 /*
8322ebbb 3707 * These MSRs specify bits which the guest must keep fixed on
62cc6b9d
DM
3708 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
3709 * We picked the standard core2 setting.
3710 */
3711#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
3712#define VMXON_CR4_ALWAYSON X86_CR4_VMXE
6677f3da
PB
3713 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
3714 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
8322ebbb
DM
3715
3716 /* These MSRs specify bits which the guest must keep fixed off. */
6677f3da
PB
3717 rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
3718 rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
62cc6b9d
DM
3719
3720 /* highest index: VMX_PREEMPTION_TIMER_VALUE */
6677f3da 3721 msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
b87a51ae
NHE
3722}
3723
3899152c
DM
3724/*
3725 * if fixed0[i] == 1: val[i] must be 1
3726 * if fixed1[i] == 0: val[i] must be 0
3727 */
3728static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1)
3729{
3730 return ((val & fixed1) | fixed0) == val;
b87a51ae
NHE
3731}
3732
3733static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
3734{
3899152c 3735 return fixed_bits_valid(control, low, high);
b87a51ae
NHE
3736}
3737
3738static inline u64 vmx_control_msr(u32 low, u32 high)
3739{
3740 return low | ((u64)high << 32);
3741}
3742
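/*
 * Illustrative sketch (not part of vmx.c): a VMX control MSR packs the
 * "must be 1" bits in its low 32 bits and the "may be 1" bits in its high
 * 32 bits; vmx_control_verify() accepts a control word only if it keeps
 * every low bit set and stays within the high mask.  The helper and the
 * values below are made-up examples.
 */
#include <stdio.h>
#include <stdint.h>

static int control_ok(uint32_t control, uint32_t low, uint32_t high)
{
	/* same test as fixed_bits_valid(): ((val & high) | low) == val */
	return ((control & high) | low) == control;
}

int main(void)
{
	uint32_t low  = 0x00000016;	/* bits 1, 2, 4 must be 1 */
	uint32_t high = 0x0000f7ff;	/* bits allowed to be 1   */

	printf("%d\n", control_ok(0x00000416, low, high));	/* 1: valid              */
	printf("%d\n", control_ok(0x00000412, low, high));	/* 0: must-be-1 bit 2 off */
	printf("%d\n", control_ok(0x00010416, low, high));	/* 0: bit 16 not allowed  */
	return 0;
}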
62cc6b9d
DM
3743static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
3744{
3745 superset &= mask;
3746 subset &= mask;
3747
3748 return (superset | subset) == superset;
3749}
3750
3751static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
3752{
3753 const u64 feature_and_reserved =
3754 /* feature (except bit 48; see below) */
3755 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
3756 /* reserved */
3757 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
6677f3da 3758 u64 vmx_basic = vmx->nested.msrs.basic;
62cc6b9d
DM
3759
3760 if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
3761 return -EINVAL;
3762
3763 /*
3764 * KVM does not emulate a version of VMX that constrains physical
3765 * addresses of VMX structures (e.g. VMCS) to 32-bits.
3766 */
3767 if (data & BIT_ULL(48))
3768 return -EINVAL;
3769
3770 if (vmx_basic_vmcs_revision_id(vmx_basic) !=
3771 vmx_basic_vmcs_revision_id(data))
3772 return -EINVAL;
3773
3774 if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
3775 return -EINVAL;
3776
6677f3da 3777 vmx->nested.msrs.basic = data;
62cc6b9d
DM
3778 return 0;
3779}
3780
3781static int
3782vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
3783{
3784 u64 supported;
3785 u32 *lowp, *highp;
3786
3787 switch (msr_index) {
3788 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
6677f3da
PB
3789 lowp = &vmx->nested.msrs.pinbased_ctls_low;
3790 highp = &vmx->nested.msrs.pinbased_ctls_high;
62cc6b9d
DM
3791 break;
3792 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
6677f3da
PB
3793 lowp = &vmx->nested.msrs.procbased_ctls_low;
3794 highp = &vmx->nested.msrs.procbased_ctls_high;
62cc6b9d
DM
3795 break;
3796 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
6677f3da
PB
3797 lowp = &vmx->nested.msrs.exit_ctls_low;
3798 highp = &vmx->nested.msrs.exit_ctls_high;
62cc6b9d
DM
3799 break;
3800 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
6677f3da
PB
3801 lowp = &vmx->nested.msrs.entry_ctls_low;
3802 highp = &vmx->nested.msrs.entry_ctls_high;
62cc6b9d
DM
3803 break;
3804 case MSR_IA32_VMX_PROCBASED_CTLS2:
6677f3da
PB
3805 lowp = &vmx->nested.msrs.secondary_ctls_low;
3806 highp = &vmx->nested.msrs.secondary_ctls_high;
62cc6b9d
DM
3807 break;
3808 default:
3809 BUG();
3810 }
3811
3812 supported = vmx_control_msr(*lowp, *highp);
3813
3814 /* Check must-be-1 bits are still 1. */
3815 if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
3816 return -EINVAL;
3817
3818 /* Check must-be-0 bits are still 0. */
3819 if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
3820 return -EINVAL;
3821
3822 *lowp = data;
3823 *highp = data >> 32;
3824 return 0;
3825}
3826
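/*
 * Illustrative sketch (not part of vmx.c): when userspace restores a
 * "true" control MSR, vmx_restore_control_msr() only accepts values that
 * stay within what KVM advertises - the low half may not drop any
 * must-be-1 bit and the high half may not gain new allowed-1 bits.  The
 * helper and the values below are simplified examples.
 */
#include <stdio.h>
#include <stdint.h>

static int is_subset(uint64_t superset, uint64_t subset, uint64_t mask)
{
	superset &= mask;
	subset &= mask;
	return (superset | subset) == superset;
}

int main(void)
{
	uint64_t supported = 0x0000f7ff00000016ULL;	/* high = allowed-1, low = must-be-1 */
	uint64_t data      = 0x000077ff00000016ULL;	/* claims less: fine                 */
	uint64_t bad       = 0x0001f7ff00000016ULL;	/* claims an extra allowed-1 bit     */

	/* must-be-1 bits still 1: the restored low half must cover KVM's */
	printf("low ok:   %d\n", is_subset(data, supported, 0xffffffffULL));
	/* allowed-1 bits not grown: KVM's high half must cover the restored one */
	printf("high ok:  %d\n", is_subset(supported, data, ~0xffffffffULL));
	printf("bad high: %d\n", is_subset(supported, bad, ~0xffffffffULL));
	return 0;
}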
3827static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
3828{
3829 const u64 feature_and_reserved_bits =
3830 /* feature */
3831 BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
3832 BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
3833 /* reserved */
3834 GENMASK_ULL(13, 9) | BIT_ULL(31);
3835 u64 vmx_misc;
3836
6677f3da
PB
3837 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
3838 vmx->nested.msrs.misc_high);
62cc6b9d
DM
3839
3840 if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
3841 return -EINVAL;
3842
6677f3da 3843 if ((vmx->nested.msrs.pinbased_ctls_high &
62cc6b9d
DM
3844 PIN_BASED_VMX_PREEMPTION_TIMER) &&
3845 vmx_misc_preemption_timer_rate(data) !=
3846 vmx_misc_preemption_timer_rate(vmx_misc))
3847 return -EINVAL;
3848
3849 if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
3850 return -EINVAL;
3851
3852 if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
3853 return -EINVAL;
3854
3855 if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
3856 return -EINVAL;
3857
6677f3da
PB
3858 vmx->nested.msrs.misc_low = data;
3859 vmx->nested.msrs.misc_high = data >> 32;
f4160e45
JM
3860
3861 /*
3862 * If L1 has read-only VM-exit information fields, use the
3863 * less permissive vmx_vmwrite_bitmap to specify write
3864 * permissions for the shadow VMCS.
3865 */
3866 if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
3867 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
3868
62cc6b9d
DM
3869 return 0;
3870}
3871
3872static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
3873{
3874 u64 vmx_ept_vpid_cap;
3875
6677f3da
PB
3876 vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
3877 vmx->nested.msrs.vpid_caps);
62cc6b9d
DM
3878
3879 /* Every bit is either reserved or a feature bit. */
3880 if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
3881 return -EINVAL;
3882
6677f3da
PB
3883 vmx->nested.msrs.ept_caps = data;
3884 vmx->nested.msrs.vpid_caps = data >> 32;
62cc6b9d
DM
3885 return 0;
3886}
3887
3888static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
3889{
3890 u64 *msr;
3891
3892 switch (msr_index) {
3893 case MSR_IA32_VMX_CR0_FIXED0:
6677f3da 3894 msr = &vmx->nested.msrs.cr0_fixed0;
62cc6b9d
DM
3895 break;
3896 case MSR_IA32_VMX_CR4_FIXED0:
6677f3da 3897 msr = &vmx->nested.msrs.cr4_fixed0;
62cc6b9d
DM
3898 break;
3899 default:
3900 BUG();
3901 }
3902
3903 /*
 3904 * 1 bits (which indicate bits which "must-be-1" during VMX operation)
3905 * must be 1 in the restored value.
3906 */
3907 if (!is_bitwise_subset(data, *msr, -1ULL))
3908 return -EINVAL;
3909
3910 *msr = data;
3911 return 0;
3912}
3913
3914/*
3915 * Called when userspace is restoring VMX MSRs.
3916 *
3917 * Returns 0 on success, non-0 otherwise.
3918 */
3919static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
b87a51ae 3920{
b9c237bb
WV
3921 struct vcpu_vmx *vmx = to_vmx(vcpu);
3922
a943ac50
JM
3923 /*
3924 * Don't allow changes to the VMX capability MSRs while the vCPU
3925 * is in VMX operation.
3926 */
3927 if (vmx->nested.vmxon)
3928 return -EBUSY;
3929
b87a51ae 3930 switch (msr_index) {
b87a51ae 3931 case MSR_IA32_VMX_BASIC:
62cc6b9d
DM
3932 return vmx_restore_vmx_basic(vmx, data);
3933 case MSR_IA32_VMX_PINBASED_CTLS:
3934 case MSR_IA32_VMX_PROCBASED_CTLS:
3935 case MSR_IA32_VMX_EXIT_CTLS:
3936 case MSR_IA32_VMX_ENTRY_CTLS:
b87a51ae 3937 /*
62cc6b9d
DM
3938 * The "non-true" VMX capability MSRs are generated from the
3939 * "true" MSRs, so we do not support restoring them directly.
3940 *
3941 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
3942 * should restore the "true" MSRs with the must-be-1 bits
3943 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
3944 * DEFAULT SETTINGS".
b87a51ae 3945 */
62cc6b9d
DM
3946 return -EINVAL;
3947 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
3948 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
3949 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
3950 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
3951 case MSR_IA32_VMX_PROCBASED_CTLS2:
3952 return vmx_restore_control_msr(vmx, msr_index, data);
3953 case MSR_IA32_VMX_MISC:
3954 return vmx_restore_vmx_misc(vmx, data);
3955 case MSR_IA32_VMX_CR0_FIXED0:
3956 case MSR_IA32_VMX_CR4_FIXED0:
3957 return vmx_restore_fixed0_msr(vmx, msr_index, data);
3958 case MSR_IA32_VMX_CR0_FIXED1:
3959 case MSR_IA32_VMX_CR4_FIXED1:
3960 /*
3961 * These MSRs are generated based on the vCPU's CPUID, so we
3962 * do not support restoring them directly.
3963 */
3964 return -EINVAL;
3965 case MSR_IA32_VMX_EPT_VPID_CAP:
3966 return vmx_restore_vmx_ept_vpid_cap(vmx, data);
3967 case MSR_IA32_VMX_VMCS_ENUM:
6677f3da 3968 vmx->nested.msrs.vmcs_enum = data;
62cc6b9d
DM
3969 return 0;
3970 default:
b87a51ae 3971 /*
62cc6b9d 3972 * The rest of the VMX capability MSRs do not support restore.
b87a51ae 3973 */
62cc6b9d
DM
3974 return -EINVAL;
3975 }
3976}
3977
3978/* Returns 0 on success, non-0 otherwise. */
6677f3da 3979static int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
62cc6b9d 3980{
62cc6b9d
DM
3981 switch (msr_index) {
3982 case MSR_IA32_VMX_BASIC:
6677f3da 3983 *pdata = msrs->basic;
b87a51ae
NHE
3984 break;
3985 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
3986 case MSR_IA32_VMX_PINBASED_CTLS:
b9c237bb 3987 *pdata = vmx_control_msr(
6677f3da
PB
3988 msrs->pinbased_ctls_low,
3989 msrs->pinbased_ctls_high);
0115f9cb
DM
3990 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
3991 *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
b87a51ae
NHE
3992 break;
3993 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
3994 case MSR_IA32_VMX_PROCBASED_CTLS:
b9c237bb 3995 *pdata = vmx_control_msr(
6677f3da
PB
3996 msrs->procbased_ctls_low,
3997 msrs->procbased_ctls_high);
0115f9cb
DM
3998 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
3999 *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
b87a51ae
NHE
4000 break;
4001 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
4002 case MSR_IA32_VMX_EXIT_CTLS:
b9c237bb 4003 *pdata = vmx_control_msr(
6677f3da
PB
4004 msrs->exit_ctls_low,
4005 msrs->exit_ctls_high);
0115f9cb
DM
4006 if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
4007 *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
b87a51ae
NHE
4008 break;
4009 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
4010 case MSR_IA32_VMX_ENTRY_CTLS:
b9c237bb 4011 *pdata = vmx_control_msr(
6677f3da
PB
4012 msrs->entry_ctls_low,
4013 msrs->entry_ctls_high);
0115f9cb
DM
4014 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
4015 *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
b87a51ae
NHE
4016 break;
4017 case MSR_IA32_VMX_MISC:
b9c237bb 4018 *pdata = vmx_control_msr(
6677f3da
PB
4019 msrs->misc_low,
4020 msrs->misc_high);
b87a51ae 4021 break;
b87a51ae 4022 case MSR_IA32_VMX_CR0_FIXED0:
6677f3da 4023 *pdata = msrs->cr0_fixed0;
b87a51ae
NHE
4024 break;
4025 case MSR_IA32_VMX_CR0_FIXED1:
6677f3da 4026 *pdata = msrs->cr0_fixed1;
b87a51ae
NHE
4027 break;
4028 case MSR_IA32_VMX_CR4_FIXED0:
6677f3da 4029 *pdata = msrs->cr4_fixed0;
b87a51ae
NHE
4030 break;
4031 case MSR_IA32_VMX_CR4_FIXED1:
6677f3da 4032 *pdata = msrs->cr4_fixed1;
b87a51ae
NHE
4033 break;
4034 case MSR_IA32_VMX_VMCS_ENUM:
6677f3da 4035 *pdata = msrs->vmcs_enum;
b87a51ae
NHE
4036 break;
4037 case MSR_IA32_VMX_PROCBASED_CTLS2:
b9c237bb 4038 *pdata = vmx_control_msr(
6677f3da
PB
4039 msrs->secondary_ctls_low,
4040 msrs->secondary_ctls_high);
b87a51ae
NHE
4041 break;
4042 case MSR_IA32_VMX_EPT_VPID_CAP:
6677f3da
PB
4043 *pdata = msrs->ept_caps |
4044 ((u64)msrs->vpid_caps << 32);
b87a51ae 4045 break;
27c42a1b 4046 case MSR_IA32_VMX_VMFUNC:
6677f3da 4047 *pdata = msrs->vmfunc_controls;
27c42a1b 4048 break;
b87a51ae 4049 default:
b87a51ae 4050 return 1;
b3897a49
NHE
4051 }
4052
b87a51ae
NHE
4053 return 0;
4054}
4055
37e4c997
HZ
4056static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
4057 uint64_t val)
4058{
4059 uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
4060
4061 return !(val & ~valid_bits);
4062}
4063
801e459a
TL
4064static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
4065{
1389309c
PB
4066 switch (msr->index) {
4067 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
4068 if (!nested)
4069 return 1;
4070 return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
4071 default:
4072 return 1;
4073 }
4074
4075 return 0;
801e459a
TL
4076}
4077
6aa8b732
AK
4078/*
4079 * Reads an msr value (of 'msr_index') into 'pdata'.
4080 * Returns 0 on success, non-0 otherwise.
4081 * Assumes vcpu_load() was already called.
4082 */
609e36d3 4083static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
6aa8b732 4084{
a6cb099a 4085 struct vcpu_vmx *vmx = to_vmx(vcpu);
26bb0981 4086 struct shared_msr_entry *msr;
6aa8b732 4087
609e36d3 4088 switch (msr_info->index) {
05b3e0c2 4089#ifdef CONFIG_X86_64
6aa8b732 4090 case MSR_FS_BASE:
609e36d3 4091 msr_info->data = vmcs_readl(GUEST_FS_BASE);
6aa8b732
AK
4092 break;
4093 case MSR_GS_BASE:
609e36d3 4094 msr_info->data = vmcs_readl(GUEST_GS_BASE);
6aa8b732 4095 break;
44ea2b17 4096 case MSR_KERNEL_GS_BASE:
678e315e 4097 msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
44ea2b17 4098 break;
26bb0981 4099#endif
6aa8b732 4100 case MSR_EFER:
609e36d3 4101 return kvm_get_msr_common(vcpu, msr_info);
d28b387f
KA
4102 case MSR_IA32_SPEC_CTRL:
4103 if (!msr_info->host_initiated &&
d28b387f
KA
4104 !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
4105 return 1;
4106
4107 msr_info->data = to_vmx(vcpu)->spec_ctrl;
4108 break;
28c1c9fa
KA
4109 case MSR_IA32_ARCH_CAPABILITIES:
4110 if (!msr_info->host_initiated &&
4111 !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
4112 return 1;
4113 msr_info->data = to_vmx(vcpu)->arch_capabilities;
4114 break;
6aa8b732 4115 case MSR_IA32_SYSENTER_CS:
609e36d3 4116 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
6aa8b732
AK
4117 break;
4118 case MSR_IA32_SYSENTER_EIP:
609e36d3 4119 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
6aa8b732
AK
4120 break;
4121 case MSR_IA32_SYSENTER_ESP:
609e36d3 4122 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
6aa8b732 4123 break;
0dd376e7 4124 case MSR_IA32_BNDCFGS:
691bd434 4125 if (!kvm_mpx_supported() ||
d6321d49
RK
4126 (!msr_info->host_initiated &&
4127 !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
93c4adc7 4128 return 1;
609e36d3 4129 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
0dd376e7 4130 break;
c45dcc71
AR
4131 case MSR_IA32_MCG_EXT_CTL:
4132 if (!msr_info->host_initiated &&
a6cb099a 4133 !(vmx->msr_ia32_feature_control &
c45dcc71 4134 FEATURE_CONTROL_LMCE))
cae50139 4135 return 1;
c45dcc71
AR
4136 msr_info->data = vcpu->arch.mcg_ext_ctl;
4137 break;
cae50139 4138 case MSR_IA32_FEATURE_CONTROL:
a6cb099a 4139 msr_info->data = vmx->msr_ia32_feature_control;
cae50139
JK
4140 break;
4141 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
4142 if (!nested_vmx_allowed(vcpu))
4143 return 1;
6677f3da
PB
4144 return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
4145 &msr_info->data);
20300099
WL
4146 case MSR_IA32_XSS:
4147 if (!vmx_xsaves_supported())
4148 return 1;
609e36d3 4149 msr_info->data = vcpu->arch.ia32_xss;
20300099 4150 break;
4e47c7a6 4151 case MSR_TSC_AUX:
d6321d49
RK
4152 if (!msr_info->host_initiated &&
4153 !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
4e47c7a6
SY
4154 return 1;
4155 /* Otherwise falls through */
6aa8b732 4156 default:
a6cb099a 4157 msr = find_msr_entry(vmx, msr_info->index);
3bab1f5d 4158 if (msr) {
609e36d3 4159 msr_info->data = msr->data;
3bab1f5d 4160 break;
6aa8b732 4161 }
609e36d3 4162 return kvm_get_msr_common(vcpu, msr_info);
6aa8b732
AK
4163 }
4164
6aa8b732
AK
4165 return 0;
4166}
4167
cae50139
JK
4168static void vmx_leave_nested(struct kvm_vcpu *vcpu);
4169
6aa8b732
AK
4170/*
 4171 * Writes msr value into the appropriate "register".
4172 * Returns 0 on success, non-0 otherwise.
4173 * Assumes vcpu_load() was already called.
4174 */
8fe8ab46 4175static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
6aa8b732 4176{
a2fa3e9f 4177 struct vcpu_vmx *vmx = to_vmx(vcpu);
26bb0981 4178 struct shared_msr_entry *msr;
2cc51560 4179 int ret = 0;
8fe8ab46
WA
4180 u32 msr_index = msr_info->index;
4181 u64 data = msr_info->data;
2cc51560 4182
6aa8b732 4183 switch (msr_index) {
3bab1f5d 4184 case MSR_EFER:
8fe8ab46 4185 ret = kvm_set_msr_common(vcpu, msr_info);
2cc51560 4186 break;
16175a79 4187#ifdef CONFIG_X86_64
6aa8b732 4188 case MSR_FS_BASE:
2fb92db1 4189 vmx_segment_cache_clear(vmx);
6aa8b732
AK
4190 vmcs_writel(GUEST_FS_BASE, data);
4191 break;
4192 case MSR_GS_BASE:
2fb92db1 4193 vmx_segment_cache_clear(vmx);
6aa8b732
AK
4194 vmcs_writel(GUEST_GS_BASE, data);
4195 break;
44ea2b17 4196 case MSR_KERNEL_GS_BASE:
678e315e 4197 vmx_write_guest_kernel_gs_base(vmx, data);
44ea2b17 4198 break;
6aa8b732
AK
4199#endif
4200 case MSR_IA32_SYSENTER_CS:
4201 vmcs_write32(GUEST_SYSENTER_CS, data);
4202 break;
4203 case MSR_IA32_SYSENTER_EIP:
f5b42c33 4204 vmcs_writel(GUEST_SYSENTER_EIP, data);
6aa8b732
AK
4205 break;
4206 case MSR_IA32_SYSENTER_ESP:
f5b42c33 4207 vmcs_writel(GUEST_SYSENTER_ESP, data);
6aa8b732 4208 break;
0dd376e7 4209 case MSR_IA32_BNDCFGS:
691bd434 4210 if (!kvm_mpx_supported() ||
d6321d49
RK
4211 (!msr_info->host_initiated &&
4212 !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
93c4adc7 4213 return 1;
fd8cb433 4214 if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
4531662d 4215 (data & MSR_IA32_BNDCFGS_RSVD))
93c4adc7 4216 return 1;
0dd376e7
LJ
4217 vmcs_write64(GUEST_BNDCFGS, data);
4218 break;
d28b387f
KA
4219 case MSR_IA32_SPEC_CTRL:
4220 if (!msr_info->host_initiated &&
d28b387f
KA
4221 !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
4222 return 1;
4223
4224 /* The STIBP bit doesn't fault even if it's not advertised */
9f65fb29 4225 if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
d28b387f
KA
4226 return 1;
4227
4228 vmx->spec_ctrl = data;
4229
4230 if (!data)
4231 break;
4232
4233 /*
4234 * For non-nested:
4235 * When it's written (to non-zero) for the first time, pass
4236 * it through.
4237 *
4238 * For nested:
4239 * The handling of the MSR bitmap for L2 guests is done in
4240 * nested_vmx_merge_msr_bitmap. We should not touch the
4241 * vmcs02.msr_bitmap here since it gets completely overwritten
4242 * in the merging. We update the vmcs01 here for L1 as well
4243 * since it will end up touching the MSR anyway now.
4244 */
4245 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
4246 MSR_IA32_SPEC_CTRL,
4247 MSR_TYPE_RW);
4248 break;
15d45071
AR
4249 case MSR_IA32_PRED_CMD:
4250 if (!msr_info->host_initiated &&
15d45071
AR
4251 !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
4252 return 1;
4253
4254 if (data & ~PRED_CMD_IBPB)
4255 return 1;
4256
4257 if (!data)
4258 break;
4259
4260 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
4261
4262 /*
4263 * For non-nested:
4264 * When it's written (to non-zero) for the first time, pass
4265 * it through.
4266 *
4267 * For nested:
4268 * The handling of the MSR bitmap for L2 guests is done in
4269 * nested_vmx_merge_msr_bitmap. We should not touch the
4270 * vmcs02.msr_bitmap here since it gets completely overwritten
4271 * in the merging.
4272 */
4273 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
4274 MSR_TYPE_W);
4275 break;
28c1c9fa
KA
4276 case MSR_IA32_ARCH_CAPABILITIES:
4277 if (!msr_info->host_initiated)
4278 return 1;
4279 vmx->arch_capabilities = data;
4280 break;
468d472f
SY
4281 case MSR_IA32_CR_PAT:
4282 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
4566654b
NA
4283 if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
4284 return 1;
468d472f
SY
4285 vmcs_write64(GUEST_IA32_PAT, data);
4286 vcpu->arch.pat = data;
4287 break;
4288 }
8fe8ab46 4289 ret = kvm_set_msr_common(vcpu, msr_info);
4e47c7a6 4290 break;
ba904635
WA
4291 case MSR_IA32_TSC_ADJUST:
4292 ret = kvm_set_msr_common(vcpu, msr_info);
4e47c7a6 4293 break;
c45dcc71
AR
4294 case MSR_IA32_MCG_EXT_CTL:
4295 if ((!msr_info->host_initiated &&
4296 !(to_vmx(vcpu)->msr_ia32_feature_control &
4297 FEATURE_CONTROL_LMCE)) ||
4298 (data & ~MCG_EXT_CTL_LMCE_EN))
4299 return 1;
4300 vcpu->arch.mcg_ext_ctl = data;
4301 break;
cae50139 4302 case MSR_IA32_FEATURE_CONTROL:
37e4c997 4303 if (!vmx_feature_control_msr_valid(vcpu, data) ||
3b84080b 4304 (to_vmx(vcpu)->msr_ia32_feature_control &
cae50139
JK
4305 FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
4306 return 1;
3b84080b 4307 vmx->msr_ia32_feature_control = data;
cae50139
JK
4308 if (msr_info->host_initiated && data == 0)
4309 vmx_leave_nested(vcpu);
4310 break;
4311 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
62cc6b9d
DM
4312 if (!msr_info->host_initiated)
4313 return 1; /* they are read-only */
4314 if (!nested_vmx_allowed(vcpu))
4315 return 1;
4316 return vmx_set_vmx_msr(vcpu, msr_index, data);
20300099
WL
4317 case MSR_IA32_XSS:
4318 if (!vmx_xsaves_supported())
4319 return 1;
4320 /*
4321 * The only supported bit as of Skylake is bit 8, but
 4322 * it is not supported in KVM.
4323 */
4324 if (data != 0)
4325 return 1;
4326 vcpu->arch.ia32_xss = data;
4327 if (vcpu->arch.ia32_xss != host_xss)
4328 add_atomic_switch_msr(vmx, MSR_IA32_XSS,
989e3992 4329 vcpu->arch.ia32_xss, host_xss, false);
20300099
WL
4330 else
4331 clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
4332 break;
4e47c7a6 4333 case MSR_TSC_AUX:
d6321d49
RK
4334 if (!msr_info->host_initiated &&
4335 !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
4e47c7a6
SY
4336 return 1;
4337 /* Check reserved bit, higher 32 bits should be zero */
4338 if ((data >> 32) != 0)
4339 return 1;
4340 /* Otherwise falls through */
6aa8b732 4341 default:
8b9cf98c 4342 msr = find_msr_entry(vmx, msr_index);
3bab1f5d 4343 if (msr) {
8b3c3104 4344 u64 old_msr_data = msr->data;
3bab1f5d 4345 msr->data = data;
2225fd56
AK
4346 if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
4347 preempt_disable();
8b3c3104
AH
4348 ret = kvm_set_shared_msr(msr->index, msr->data,
4349 msr->mask);
2225fd56 4350 preempt_enable();
8b3c3104
AH
4351 if (ret)
4352 msr->data = old_msr_data;
2225fd56 4353 }
3bab1f5d 4354 break;
6aa8b732 4355 }
8fe8ab46 4356 ret = kvm_set_msr_common(vcpu, msr_info);
6aa8b732
AK
4357 }
4358
2cc51560 4359 return ret;
6aa8b732
AK
4360}
4361
5fdbf976 4362static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
6aa8b732 4363{
5fdbf976
MT
4364 __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
4365 switch (reg) {
4366 case VCPU_REGS_RSP:
4367 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
4368 break;
4369 case VCPU_REGS_RIP:
4370 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
4371 break;
6de4f3ad
AK
4372 case VCPU_EXREG_PDPTR:
4373 if (enable_ept)
4374 ept_save_pdptrs(vcpu);
4375 break;
5fdbf976
MT
4376 default:
4377 break;
4378 }
6aa8b732
AK
4379}
4380
6aa8b732
AK
4381static __init int cpu_has_kvm_support(void)
4382{
6210e37b 4383 return cpu_has_vmx();
6aa8b732
AK
4384}
4385
4386static __init int vmx_disabled_by_bios(void)
4387{
4388 u64 msr;
4389
4390 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
cafd6659 4391 if (msr & FEATURE_CONTROL_LOCKED) {
23f3e991 4392 /* launched w/ TXT and VMX disabled */
cafd6659
SW
4393 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
4394 && tboot_enabled())
4395 return 1;
23f3e991 4396 /* launched w/o TXT and VMX only enabled w/ TXT */
cafd6659 4397 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
23f3e991 4398 && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
f9335afe
SW
4399 && !tboot_enabled()) {
4400 printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
23f3e991 4401 "activate TXT before enabling KVM\n");
cafd6659 4402 return 1;
f9335afe 4403 }
23f3e991
JC
4404 /* launched w/o TXT and VMX disabled */
4405 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
4406 && !tboot_enabled())
4407 return 1;
cafd6659
SW
4408 }
4409
4410 return 0;
6aa8b732
AK
4411}
4412
7725b894
DX
4413static void kvm_cpu_vmxon(u64 addr)
4414{
fe0e80be 4415 cr4_set_bits(X86_CR4_VMXE);
1c5ac21a
AS
4416 intel_pt_handle_vmx(1);
4417
4b1e5478 4418 asm volatile ("vmxon %0" : : "m"(addr));
7725b894
DX
4419}
4420
13a34e06 4421static int hardware_enable(void)
6aa8b732
AK
4422{
4423 int cpu = raw_smp_processor_id();
4424 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
cafd6659 4425 u64 old, test_bits;
6aa8b732 4426
1e02ce4c 4427 if (cr4_read_shadow() & X86_CR4_VMXE)
10474ae8
AG
4428 return -EBUSY;
4429
773e8a04
VK
4430 /*
4431 * This can happen if we hot-added a CPU but failed to allocate
4432 * VP assist page for it.
4433 */
4434 if (static_branch_unlikely(&enable_evmcs) &&
4435 !hv_get_vp_assist_page(cpu))
4436 return -EFAULT;
4437
d462b819 4438 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
bf9f6ac8
FW
4439 INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
4440 spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
8f536b76
ZY
4441
4442 /*
4443 * Now we can enable the vmclear operation in kdump
4444 * since the loaded_vmcss_on_cpu list on this cpu
4445 * has been initialized.
4446 *
 4447 * Though the cpu is not in VMX operation yet, it is
 4448 * safe to enable the vmclear operation because the
 4449 * loaded_vmcss_on_cpu list is still empty.
4450 */
4451 crash_enable_local_vmclear(cpu);
4452
6aa8b732 4453 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
cafd6659
SW
4454
4455 test_bits = FEATURE_CONTROL_LOCKED;
4456 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
4457 if (tboot_enabled())
4458 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
4459
4460 if ((old & test_bits) != test_bits) {
6aa8b732 4461 /* enable and lock */
cafd6659
SW
4462 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
4463 }
fe0e80be 4464 kvm_cpu_vmxon(phys_addr);
fdf288bf
DH
4465 if (enable_ept)
4466 ept_sync_global();
10474ae8
AG
4467
4468 return 0;
6aa8b732
AK
4469}
4470
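/*
 * Illustrative sketch (not part of vmx.c): a condensed view of the
 * IA32_FEATURE_CONTROL handshake spread across vmx_disabled_by_bios()
 * and hardware_enable(), ignoring the TXT/SMX variants.  Bit 0 is the
 * lock bit and bit 2 enables VMXON outside SMX (per the SDM); the MSR
 * value used below is a made-up example.
 */
#include <stdio.h>
#include <stdint.h>

#define FC_LOCKED		(1ULL << 0)
#define FC_VMXON_OUTSIDE_SMX	(1ULL << 2)

int main(void)
{
	uint64_t msr = 0;			/* BIOS left the MSR untouched */
	uint64_t test_bits = FC_LOCKED | FC_VMXON_OUTSIDE_SMX;

	if ((msr & test_bits) != test_bits) {
		if (msr & FC_LOCKED) {
			printf("VMX disabled by BIOS\n");
			return 1;
		}
		msr |= test_bits;		/* enable and lock */
	}
	printf("feature control = %#llx\n", (unsigned long long)msr);
	return 0;
}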
d462b819 4471static void vmclear_local_loaded_vmcss(void)
543e4243
AK
4472{
4473 int cpu = raw_smp_processor_id();
d462b819 4474 struct loaded_vmcs *v, *n;
543e4243 4475
d462b819
NHE
4476 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
4477 loaded_vmcss_on_cpu_link)
4478 __loaded_vmcs_clear(v);
543e4243
AK
4479}
4480
710ff4a8
EH
4481
4482/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
4483 * tricks.
4484 */
4485static void kvm_cpu_vmxoff(void)
6aa8b732 4486{
4b1e5478 4487 asm volatile (__ex("vmxoff"));
1c5ac21a
AS
4488
4489 intel_pt_handle_vmx(0);
fe0e80be 4490 cr4_clear_bits(X86_CR4_VMXE);
6aa8b732
AK
4491}
4492
13a34e06 4493static void hardware_disable(void)
710ff4a8 4494{
fe0e80be
DH
4495 vmclear_local_loaded_vmcss();
4496 kvm_cpu_vmxoff();
710ff4a8
EH
4497}
4498
1c3d14fe 4499static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
d77c26fc 4500 u32 msr, u32 *result)
1c3d14fe
YS
4501{
4502 u32 vmx_msr_low, vmx_msr_high;
4503 u32 ctl = ctl_min | ctl_opt;
4504
4505 rdmsr(msr, vmx_msr_low, vmx_msr_high);
4506
4507 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
4508 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
4509
4510 /* Ensure minimum (required) set of control bits are supported. */
4511 if (ctl_min & ~ctl)
002c7f7c 4512 return -EIO;
1c3d14fe
YS
4513
4514 *result = ctl;
4515 return 0;
4516}
4517
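/*
 * Illustrative sketch (not part of vmx.c): how adjust_vmx_controls()
 * reconciles the controls KVM requires (min) and would like (opt) with
 * what the CPU reports in the capability MSR.  The MSR halves below are
 * made-up example values.
 */
#include <stdio.h>
#include <stdint.h>

static int adjust(uint32_t min, uint32_t opt,
		  uint32_t msr_low, uint32_t msr_high, uint32_t *result)
{
	uint32_t ctl = min | opt;

	ctl &= msr_high;	/* bit clear in high word => must be zero */
	ctl |= msr_low;		/* bit set in low word    => must be one  */

	if (min & ~ctl)		/* a required control is unsupported      */
		return -1;

	*result = ctl;
	return 0;
}

int main(void)
{
	uint32_t ctl;
	/* CPU: bits 1 and 3 are always-on, bits 0-7 may be enabled */
	uint32_t msr_low = 0x0a, msr_high = 0xff;

	if (!adjust(0x04 /* min */, 0x30 /* opt */, msr_low, msr_high, &ctl))
		printf("controls = %#x\n", ctl);	/* 0x3e */
	return 0;
}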
110312c8
AK
4518static __init bool allow_1_setting(u32 msr, u32 ctl)
4519{
4520 u32 vmx_msr_low, vmx_msr_high;
4521
4522 rdmsr(msr, vmx_msr_low, vmx_msr_high);
4523 return vmx_msr_high & ctl;
4524}
4525
002c7f7c 4526static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
6aa8b732
AK
4527{
4528 u32 vmx_msr_low, vmx_msr_high;
d56f546d 4529 u32 min, opt, min2, opt2;
1c3d14fe
YS
4530 u32 _pin_based_exec_control = 0;
4531 u32 _cpu_based_exec_control = 0;
f78e0e2e 4532 u32 _cpu_based_2nd_exec_control = 0;
1c3d14fe
YS
4533 u32 _vmexit_control = 0;
4534 u32 _vmentry_control = 0;
4535
1389309c 4536 memset(vmcs_conf, 0, sizeof(*vmcs_conf));
10166744 4537 min = CPU_BASED_HLT_EXITING |
1c3d14fe
YS
4538#ifdef CONFIG_X86_64
4539 CPU_BASED_CR8_LOAD_EXITING |
4540 CPU_BASED_CR8_STORE_EXITING |
4541#endif
d56f546d
SY
4542 CPU_BASED_CR3_LOAD_EXITING |
4543 CPU_BASED_CR3_STORE_EXITING |
8eb73e2d 4544 CPU_BASED_UNCOND_IO_EXITING |
1c3d14fe 4545 CPU_BASED_MOV_DR_EXITING |
a7052897 4546 CPU_BASED_USE_TSC_OFFSETING |
4d5422ce
WL
4547 CPU_BASED_MWAIT_EXITING |
4548 CPU_BASED_MONITOR_EXITING |
fee84b07
AK
4549 CPU_BASED_INVLPG_EXITING |
4550 CPU_BASED_RDPMC_EXITING;
443381a8 4551
f78e0e2e 4552 opt = CPU_BASED_TPR_SHADOW |
25c5f225 4553 CPU_BASED_USE_MSR_BITMAPS |
f78e0e2e 4554 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
1c3d14fe
YS
4555 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
4556 &_cpu_based_exec_control) < 0)
002c7f7c 4557 return -EIO;
6e5d865c
YS
4558#ifdef CONFIG_X86_64
4559 if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
4560 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
4561 ~CPU_BASED_CR8_STORE_EXITING;
4562#endif
f78e0e2e 4563 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
d56f546d
SY
4564 min2 = 0;
4565 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
8d14695f 4566 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2384d2b3 4567 SECONDARY_EXEC_WBINVD_EXITING |
d56f546d 4568 SECONDARY_EXEC_ENABLE_VPID |
3a624e29 4569 SECONDARY_EXEC_ENABLE_EPT |
4b8d54f9 4570 SECONDARY_EXEC_UNRESTRICTED_GUEST |
4e47c7a6 4571 SECONDARY_EXEC_PAUSE_LOOP_EXITING |
0367f205 4572 SECONDARY_EXEC_DESC |
ad756a16 4573 SECONDARY_EXEC_RDTSCP |
83d4c286 4574 SECONDARY_EXEC_ENABLE_INVPCID |
c7c9c56c 4575 SECONDARY_EXEC_APIC_REGISTER_VIRT |
abc4fc58 4576 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
20300099 4577 SECONDARY_EXEC_SHADOW_VMCS |
843e4330 4578 SECONDARY_EXEC_XSAVES |
736fdf72
DH
4579 SECONDARY_EXEC_RDSEED_EXITING |
4580 SECONDARY_EXEC_RDRAND_EXITING |
8b3e34e4 4581 SECONDARY_EXEC_ENABLE_PML |
2a499e49 4582 SECONDARY_EXEC_TSC_SCALING |
0b665d30
SC
4583 SECONDARY_EXEC_ENABLE_VMFUNC |
4584 SECONDARY_EXEC_ENCLS_EXITING;
d56f546d
SY
4585 if (adjust_vmx_controls(min2, opt2,
4586 MSR_IA32_VMX_PROCBASED_CTLS2,
f78e0e2e
SY
4587 &_cpu_based_2nd_exec_control) < 0)
4588 return -EIO;
4589 }
4590#ifndef CONFIG_X86_64
4591 if (!(_cpu_based_2nd_exec_control &
4592 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
4593 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
4594#endif
83d4c286
YZ
4595
4596 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
4597 _cpu_based_2nd_exec_control &= ~(
8d14695f 4598 SECONDARY_EXEC_APIC_REGISTER_VIRT |
c7c9c56c
YZ
4599 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
4600 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
83d4c286 4601
61f1dd90
WL
4602 rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
4603 &vmx_capability.ept, &vmx_capability.vpid);
4604
d56f546d 4605 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
a7052897
MT
4606 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
4607 enabled */
5fff7d27
GN
4608 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
4609 CPU_BASED_CR3_STORE_EXITING |
4610 CPU_BASED_INVLPG_EXITING);
61f1dd90
WL
4611 } else if (vmx_capability.ept) {
4612 vmx_capability.ept = 0;
4613 pr_warn_once("EPT CAP should not exist if not support "
4614 "1-setting enable EPT VM-execution control\n");
4615 }
4616 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
4617 vmx_capability.vpid) {
4618 vmx_capability.vpid = 0;
4619 pr_warn_once("VPID CAP should not exist if not support "
4620 "1-setting enable VPID VM-execution control\n");
d56f546d 4621 }
1c3d14fe 4622
91fa0f8e 4623 min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
1c3d14fe
YS
4624#ifdef CONFIG_X86_64
4625 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
4626#endif
a547c6db 4627 opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
91fa0f8e 4628 VM_EXIT_CLEAR_BNDCFGS;
1c3d14fe
YS
4629 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
4630 &_vmexit_control) < 0)
002c7f7c 4631 return -EIO;
1c3d14fe 4632
8a1b4392
PB
4633 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
4634 opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
4635 PIN_BASED_VMX_PREEMPTION_TIMER;
01e439be
YZ
4636 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
4637 &_pin_based_exec_control) < 0)
4638 return -EIO;
4639
1c17c3e6
PB
4640 if (cpu_has_broken_vmx_preemption_timer())
4641 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
01e439be 4642 if (!(_cpu_based_2nd_exec_control &
91fa0f8e 4643 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
01e439be
YZ
4644 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
4645
c845f9c6 4646 min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
da8999d3 4647 opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
1c3d14fe
YS
4648 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
4649 &_vmentry_control) < 0)
002c7f7c 4650 return -EIO;
6aa8b732 4651
c68876fd 4652 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
1c3d14fe
YS
4653
4654 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
4655 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
002c7f7c 4656 return -EIO;
1c3d14fe
YS
4657
4658#ifdef CONFIG_X86_64
4659 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
4660 if (vmx_msr_high & (1u<<16))
002c7f7c 4661 return -EIO;
1c3d14fe
YS
4662#endif
4663
4664 /* Require Write-Back (WB) memory type for VMCS accesses. */
4665 if (((vmx_msr_high >> 18) & 15) != 6)
002c7f7c 4666 return -EIO;
1c3d14fe 4667
002c7f7c 4668 vmcs_conf->size = vmx_msr_high & 0x1fff;
16cb0255 4669 vmcs_conf->order = get_order(vmcs_conf->size);
9ac7e3e8 4670 vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
773e8a04 4671
2307af1c 4672 vmcs_conf->revision_id = vmx_msr_low;
1c3d14fe 4673
002c7f7c
YS
4674 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
4675 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
f78e0e2e 4676 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
002c7f7c
YS
4677 vmcs_conf->vmexit_ctrl = _vmexit_control;
4678 vmcs_conf->vmentry_ctrl = _vmentry_control;
1c3d14fe 4679
773e8a04
VK
4680 if (static_branch_unlikely(&enable_evmcs))
4681 evmcs_sanitize_exec_ctrls(vmcs_conf);
4682
110312c8
AK
4683 cpu_has_load_ia32_efer =
4684 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
4685 VM_ENTRY_LOAD_IA32_EFER)
4686 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
4687 VM_EXIT_LOAD_IA32_EFER);
4688
8bf00a52
GN
4689 cpu_has_load_perf_global_ctrl =
4690 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
4691 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
4692 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
4693 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
4694
4695 /*
4696 * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL
bb3541f1 4697 * but due to the errata below it can't be used. The workaround is to use
8bf00a52
GN
 4698 * the MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
4699 *
4700 * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
4701 *
4702 * AAK155 (model 26)
4703 * AAP115 (model 30)
4704 * AAT100 (model 37)
4705 * BC86,AAY89,BD102 (model 44)
4706 * BA97 (model 46)
4707 *
4708 */
4709 if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) {
4710 switch (boot_cpu_data.x86_model) {
4711 case 26:
4712 case 30:
4713 case 37:
4714 case 44:
4715 case 46:
4716 cpu_has_load_perf_global_ctrl = false;
4717 printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
4718 "does not work properly. Using workaround\n");
4719 break;
4720 default:
4721 break;
4722 }
4723 }
4724
782511b0 4725 if (boot_cpu_has(X86_FEATURE_XSAVES))
20300099
WL
4726 rdmsrl(MSR_IA32_XSS, host_xss);
4727
1c3d14fe 4728 return 0;
c68876fd 4729}
6aa8b732 4730
491a6038 4731static struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu)
6aa8b732
AK
4732{
4733 int node = cpu_to_node(cpu);
4734 struct page *pages;
4735 struct vmcs *vmcs;
4736
96db800f 4737 pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
6aa8b732
AK
4738 if (!pages)
4739 return NULL;
4740 vmcs = page_address(pages);
1c3d14fe 4741 memset(vmcs, 0, vmcs_config.size);
2307af1c
LA
4742
4743 /* KVM supports Enlightened VMCS v1 only */
4744 if (static_branch_unlikely(&enable_evmcs))
392b2f25 4745 vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
2307af1c 4746 else
392b2f25 4747 vmcs->hdr.revision_id = vmcs_config.revision_id;
2307af1c 4748
491a6038
LA
4749 if (shadow)
4750 vmcs->hdr.shadow_vmcs = 1;
6aa8b732
AK
4751 return vmcs;
4752}
4753
6aa8b732
AK
4754static void free_vmcs(struct vmcs *vmcs)
4755{
1c3d14fe 4756 free_pages((unsigned long)vmcs, vmcs_config.order);
6aa8b732
AK
4757}
4758
d462b819
NHE
4759/*
4760 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
4761 */
4762static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
4763{
4764 if (!loaded_vmcs->vmcs)
4765 return;
4766 loaded_vmcs_clear(loaded_vmcs);
4767 free_vmcs(loaded_vmcs->vmcs);
4768 loaded_vmcs->vmcs = NULL;
904e14fb
PB
4769 if (loaded_vmcs->msr_bitmap)
4770 free_page((unsigned long)loaded_vmcs->msr_bitmap);
355f4fb1 4771 WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
d462b819
NHE
4772}
4773
491a6038 4774static struct vmcs *alloc_vmcs(bool shadow)
f21f165e 4775{
491a6038 4776 return alloc_vmcs_cpu(shadow, raw_smp_processor_id());
f21f165e
PB
4777}
4778
4779static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
4780{
491a6038 4781 loaded_vmcs->vmcs = alloc_vmcs(false);
f21f165e
PB
4782 if (!loaded_vmcs->vmcs)
4783 return -ENOMEM;
4784
4785 loaded_vmcs->shadow_vmcs = NULL;
4786 loaded_vmcs_init(loaded_vmcs);
904e14fb
PB
4787
4788 if (cpu_has_vmx_msr_bitmap()) {
4789 loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
4790 if (!loaded_vmcs->msr_bitmap)
4791 goto out_vmcs;
4792 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
ceef7d10 4793
1f008e11
AB
4794 if (IS_ENABLED(CONFIG_HYPERV) &&
4795 static_branch_unlikely(&enable_evmcs) &&
ceef7d10
VK
4796 (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
4797 struct hv_enlightened_vmcs *evmcs =
4798 (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
4799
4800 evmcs->hv_enlightenments_control.msr_bitmap = 1;
4801 }
904e14fb 4802 }
d7ee039e
SC
4803
4804 memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
4805
f21f165e 4806 return 0;
904e14fb
PB
4807
4808out_vmcs:
4809 free_loaded_vmcs(loaded_vmcs);
4810 return -ENOMEM;
f21f165e
PB
4811}
4812
39959588 4813static void free_kvm_area(void)
6aa8b732
AK
4814{
4815 int cpu;
4816
3230bb47 4817 for_each_possible_cpu(cpu) {
6aa8b732 4818 free_vmcs(per_cpu(vmxarea, cpu));
3230bb47
ZA
4819 per_cpu(vmxarea, cpu) = NULL;
4820 }
6aa8b732
AK
4821}
4822
d37f4267
JM
4823enum vmcs_field_width {
4824 VMCS_FIELD_WIDTH_U16 = 0,
4825 VMCS_FIELD_WIDTH_U64 = 1,
4826 VMCS_FIELD_WIDTH_U32 = 2,
4827 VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3
85fd514e
JM
4828};
4829
d37f4267 4830static inline int vmcs_field_width(unsigned long field)
85fd514e
JM
4831{
4832 if (0x1 & field) /* the *_HIGH fields are all 32 bit */
d37f4267 4833 return VMCS_FIELD_WIDTH_U32;
85fd514e
JM
4834 return (field >> 13) & 0x3 ;
4835}
4836
4837static inline int vmcs_field_readonly(unsigned long field)
4838{
4839 return (((field >> 10) & 0x3) == 1);
4840}
4841
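/*
 * Illustrative sketch (not part of vmx.c): decoding a VMCS field encoding
 * the way vmcs_field_width() and vmcs_field_readonly() do - bits 14:13
 * give the width, bits 11:10 the field type (1 == read-only data field).
 * GUEST_RIP (0x681e) and VM_EXIT_REASON (0x4402) are the SDM encodings,
 * quoted here only for the demonstration.
 */
#include <stdio.h>

static int field_width(unsigned long field)
{
	if (field & 0x1)		/* *_HIGH alias of a 64-bit field */
		return 2;		/* accessed as a 32-bit value     */
	return (field >> 13) & 0x3;
}

static int field_readonly(unsigned long field)
{
	return ((field >> 10) & 0x3) == 1;
}

int main(void)
{
	printf("GUEST_RIP:      width %d, readonly %d\n",
	       field_width(0x681e), field_readonly(0x681e));	/* 3 (natural), 0 */
	printf("VM_EXIT_REASON: width %d, readonly %d\n",
	       field_width(0x4402), field_readonly(0x4402));	/* 2 (u32), 1     */
	return 0;
}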
fe2b201b
BD
4842static void init_vmcs_shadow_fields(void)
4843{
4844 int i, j;
4845
44900ba6
PB
4846 for (i = j = 0; i < max_shadow_read_only_fields; i++) {
4847 u16 field = shadow_read_only_fields[i];
d37f4267 4848 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
44900ba6
PB
4849 (i + 1 == max_shadow_read_only_fields ||
4850 shadow_read_only_fields[i + 1] != field + 1))
4851 pr_err("Missing field from shadow_read_only_field %x\n",
4852 field + 1);
4853
4854 clear_bit(field, vmx_vmread_bitmap);
4855#ifdef CONFIG_X86_64
4856 if (field & 1)
4857 continue;
4858#endif
4859 if (j < i)
4860 shadow_read_only_fields[j] = field;
4861 j++;
4862 }
4863 max_shadow_read_only_fields = j;
fe2b201b
BD
4864
4865 for (i = j = 0; i < max_shadow_read_write_fields; i++) {
44900ba6 4866 u16 field = shadow_read_write_fields[i];
d37f4267 4867 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
44900ba6
PB
4868 (i + 1 == max_shadow_read_write_fields ||
4869 shadow_read_write_fields[i + 1] != field + 1))
4870 pr_err("Missing field from shadow_read_write_field %x\n",
4871 field + 1);
4872
c5d167b2
PB
4873 /*
4874 * PML and the preemption timer can be emulated, but the
4875 * processor cannot vmwrite to fields that don't exist
4876 * on bare metal.
4877 */
44900ba6 4878 switch (field) {
c5d167b2
PB
4879 case GUEST_PML_INDEX:
4880 if (!cpu_has_vmx_pml())
4881 continue;
4882 break;
4883 case VMX_PREEMPTION_TIMER_VALUE:
4884 if (!cpu_has_vmx_preemption_timer())
4885 continue;
4886 break;
4887 case GUEST_INTR_STATUS:
4888 if (!cpu_has_vmx_apicv())
fe2b201b
BD
4889 continue;
4890 break;
4891 default:
4892 break;
4893 }
4894
44900ba6
PB
4895 clear_bit(field, vmx_vmwrite_bitmap);
4896 clear_bit(field, vmx_vmread_bitmap);
4897#ifdef CONFIG_X86_64
4898 if (field & 1)
4899 continue;
4900#endif
fe2b201b 4901 if (j < i)
44900ba6 4902 shadow_read_write_fields[j] = field;
fe2b201b
BD
4903 j++;
4904 }
4905 max_shadow_read_write_fields = j;
fe2b201b
BD
4906}
4907
6aa8b732
AK
4908static __init int alloc_kvm_area(void)
4909{
4910 int cpu;
4911
3230bb47 4912 for_each_possible_cpu(cpu) {
6aa8b732
AK
4913 struct vmcs *vmcs;
4914
491a6038 4915 vmcs = alloc_vmcs_cpu(false, cpu);
6aa8b732
AK
4916 if (!vmcs) {
4917 free_kvm_area();
4918 return -ENOMEM;
4919 }
4920
2307af1c
LA
4921 /*
4922 * When eVMCS is enabled, alloc_vmcs_cpu() sets
4923 * vmcs->revision_id to KVM_EVMCS_VERSION instead of
4924 * revision_id reported by MSR_IA32_VMX_BASIC.
4925 *
4926	 * However, even though not explicitly documented by
4927	 * the TLFS, the vmxarea passed as the VMXON argument
4928	 * should still be marked with the revision_id reported
4929	 * by the physical CPU.
4930 */
4931 if (static_branch_unlikely(&enable_evmcs))
392b2f25 4932 vmcs->hdr.revision_id = vmcs_config.revision_id;
2307af1c 4933
6aa8b732
AK
4934 per_cpu(vmxarea, cpu) = vmcs;
4935 }
4936 return 0;
4937}
4938
91b0aa2c 4939static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
d99e4152 4940 struct kvm_segment *save)
6aa8b732 4941{
d99e4152
GN
4942 if (!emulate_invalid_guest_state) {
4943 /*
4944 * CS and SS RPL should be equal during guest entry according
4945 * to VMX spec, but in reality it is not always so. Since vcpu
4946 * is in the middle of the transition from real mode to
4947 * protected mode it is safe to assume that RPL 0 is a good
4948 * default value.
4949 */
4950 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
b32a9918
NA
4951 save->selector &= ~SEGMENT_RPL_MASK;
4952 save->dpl = save->selector & SEGMENT_RPL_MASK;
d99e4152 4953 save->s = 1;
6aa8b732 4954 }
d99e4152 4955 vmx_set_segment(vcpu, save, seg);
6aa8b732
AK
4956}
4957
4958static void enter_pmode(struct kvm_vcpu *vcpu)
4959{
4960 unsigned long flags;
a89a8fb9 4961 struct vcpu_vmx *vmx = to_vmx(vcpu);
6aa8b732 4962
d99e4152
GN
4963 /*
4964	 * Update the real mode segment cache. It may be out of date if a segment
4965	 * register was written while the vcpu was in guest mode.
4966 */
4967 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
4968 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
4969 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
4970 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
4971 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
4972 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
4973
7ffd92c5 4974 vmx->rmode.vm86_active = 0;
6aa8b732 4975
2fb92db1
AK
4976 vmx_segment_cache_clear(vmx);
4977
f5f7b2fe 4978 vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
6aa8b732
AK
4979
4980 flags = vmcs_readl(GUEST_RFLAGS);
78ac8b47
AK
4981 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
4982 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
6aa8b732
AK
4983 vmcs_writel(GUEST_RFLAGS, flags);
4984
66aee91a
RR
4985 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
4986 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
6aa8b732
AK
4987
4988 update_exception_bitmap(vcpu);
4989
91b0aa2c
GN
4990 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
4991 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
4992 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
4993 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
4994 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
4995 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
6aa8b732
AK
4996}
4997
f5f7b2fe 4998static void fix_rmode_seg(int seg, struct kvm_segment *save)
6aa8b732 4999{
772e0318 5000 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
d99e4152
GN
5001 struct kvm_segment var = *save;
5002
5003 var.dpl = 0x3;
5004 if (seg == VCPU_SREG_CS)
5005 var.type = 0x3;
5006
5007 if (!emulate_invalid_guest_state) {
5008 var.selector = var.base >> 4;
5009 var.base = var.base & 0xffff0;
5010 var.limit = 0xffff;
5011 var.g = 0;
5012 var.db = 0;
5013 var.present = 1;
5014 var.s = 1;
5015 var.l = 0;
5016 var.unusable = 0;
5017 var.type = 0x3;
5018 var.avl = 0;
5019 if (save->base & 0xf)
5020 printk_once(KERN_WARNING "kvm: segment base is not "
5021 "paragraph aligned when entering "
5022 "protected mode (seg=%d)", seg);
5023 }
6aa8b732 5024
d99e4152 5025 vmcs_write16(sf->selector, var.selector);
96794e4e 5026 vmcs_writel(sf->base, var.base);
d99e4152
GN
5027 vmcs_write32(sf->limit, var.limit);
5028 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
6aa8b732
AK
5029}
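
/*
 * Illustrative sketch, not part of vmx.c: the real-mode layout that
 * fix_rmode_seg() above derives when invalid guest state is not emulated.
 * In real mode a segment's linear base is simply selector << 4, so the
 * selector is reconstructed from the protected-mode base and the base is
 * truncated to a paragraph (16-byte) boundary with a 64 KiB limit.
 */
#include <assert.h>
#include <stdint.h>

struct rmode_seg {
	uint16_t selector;
	uint32_t base;
	uint32_t limit;
};

static struct rmode_seg make_rmode_seg(uint32_t pmode_base)
{
	struct rmode_seg s = {
		.selector = (uint16_t)(pmode_base >> 4),
		.base     = pmode_base & 0xffff0,
		.limit    = 0xffff,		/* 64 KiB real-mode limit */
	};
	return s;
}

int main(void)
{
	struct rmode_seg s = make_rmode_seg(0x000b8000);

	assert(s.selector == 0xb800);
	assert(s.base == 0x000b8000);
	/* A base with a non-zero low nibble cannot be represented exactly,
	 * which is what the printk_once() above warns about. */
	assert(make_rmode_seg(0x000b8005).base == 0x000b8000);
	return 0;
}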
5030
5031static void enter_rmode(struct kvm_vcpu *vcpu)
5032{
5033 unsigned long flags;
a89a8fb9 5034 struct vcpu_vmx *vmx = to_vmx(vcpu);
40bbb9d0 5035 struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
6aa8b732 5036
f5f7b2fe
AK
5037 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
5038 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
5039 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
5040 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
5041 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
c6ad1153
GN
5042 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
5043 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
f5f7b2fe 5044
7ffd92c5 5045 vmx->rmode.vm86_active = 1;
6aa8b732 5046
776e58ea
GN
5047 /*
5048 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
4918c6ca 5049 * vcpu. Warn the user that an update is overdue.
776e58ea 5050 */
40bbb9d0 5051 if (!kvm_vmx->tss_addr)
776e58ea
GN
5052 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR needs to be "
5053 "called before entering vcpu\n");
776e58ea 5054
2fb92db1
AK
5055 vmx_segment_cache_clear(vmx);
5056
40bbb9d0 5057 vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
6aa8b732 5058 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
6aa8b732
AK
5059 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
5060
5061 flags = vmcs_readl(GUEST_RFLAGS);
78ac8b47 5062 vmx->rmode.save_rflags = flags;
6aa8b732 5063
053de044 5064 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
6aa8b732
AK
5065
5066 vmcs_writel(GUEST_RFLAGS, flags);
66aee91a 5067 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
6aa8b732
AK
5068 update_exception_bitmap(vcpu);
5069
d99e4152
GN
5070 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
5071 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
5072 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
5073 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
5074 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
5075 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
b246dd5d 5076
8668a3c4 5077 kvm_mmu_reset_context(vcpu);
6aa8b732
AK
5078}
5079
401d10de
AS
5080static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
5081{
5082 struct vcpu_vmx *vmx = to_vmx(vcpu);
26bb0981
AK
5083 struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
5084
5085 if (!msr)
5086 return;
401d10de 5087
f6801dff 5088 vcpu->arch.efer = efer;
401d10de 5089 if (efer & EFER_LMA) {
2961e876 5090 vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
401d10de
AS
5091 msr->data = efer;
5092 } else {
2961e876 5093 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
401d10de
AS
5094
5095 msr->data = efer & ~EFER_LME;
5096 }
5097 setup_msrs(vmx);
5098}
5099
05b3e0c2 5100#ifdef CONFIG_X86_64
6aa8b732
AK
5101
5102static void enter_lmode(struct kvm_vcpu *vcpu)
5103{
5104 u32 guest_tr_ar;
5105
2fb92db1
AK
5106 vmx_segment_cache_clear(to_vmx(vcpu));
5107
6aa8b732 5108 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
4d283ec9 5109 if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
bd80158a
JK
5110 pr_debug_ratelimited("%s: tss fixup for long mode. \n",
5111 __func__);
6aa8b732 5112 vmcs_write32(GUEST_TR_AR_BYTES,
4d283ec9
AL
5113 (guest_tr_ar & ~VMX_AR_TYPE_MASK)
5114 | VMX_AR_TYPE_BUSY_64_TSS);
6aa8b732 5115 }
da38f438 5116 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
6aa8b732
AK
5117}
5118
5119static void exit_lmode(struct kvm_vcpu *vcpu)
5120{
2961e876 5121 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
da38f438 5122 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
6aa8b732
AK
5123}
5124
5125#endif
5126
c2ba05cc
WL
5127static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid,
5128 bool invalidate_gpa)
2384d2b3 5129{
c2ba05cc 5130 if (enable_ept && (invalidate_gpa || !enable_vpid)) {
44dd3ffa 5131 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
dd180b3e 5132 return;
44dd3ffa
VK
5133 ept_sync_context(construct_eptp(vcpu,
5134 vcpu->arch.mmu->root_hpa));
f0b98c02
JM
5135 } else {
5136 vpid_sync_context(vpid);
dd180b3e 5137 }
2384d2b3
SY
5138}
5139
c2ba05cc 5140static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
dd5f5341 5141{
c2ba05cc 5142 __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa);
dd5f5341
WL
5143}
5144
faff8758
JS
5145static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
5146{
5147 int vpid = to_vmx(vcpu)->vpid;
5148
5149 if (!vpid_sync_vcpu_addr(vpid, addr))
5150 vpid_sync_context(vpid);
5151
5152 /*
5153 * If VPIDs are not supported or enabled, then the above is a no-op.
5154 * But we don't really need a TLB flush in that case anyway, because
5155 * each VM entry/exit includes an implicit flush when VPID is 0.
5156 */
5157}
5158
e8467fda
AK
5159static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
5160{
5161 ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
5162
5163 vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
5164 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
5165}
5166
aff48baa
AK
5167static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
5168{
b4d18517 5169 if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
aff48baa
AK
5170 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
5171 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
5172}
5173
25c4c276 5174static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
399badf3 5175{
fc78f519
AK
5176 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
5177
5178 vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
5179 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
399badf3
AK
5180}
5181
1439442c
SY
5182static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
5183{
d0d538b9
GN
5184 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
5185
6de4f3ad
AK
5186 if (!test_bit(VCPU_EXREG_PDPTR,
5187 (unsigned long *)&vcpu->arch.regs_dirty))
5188 return;
5189
1439442c 5190 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
d0d538b9
GN
5191 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
5192 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
5193 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
5194 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
1439442c
SY
5195 }
5196}
5197
8f5d549f
AK
5198static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
5199{
d0d538b9
GN
5200 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
5201
8f5d549f 5202 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
d0d538b9
GN
5203 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
5204 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
5205 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
5206 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
8f5d549f 5207 }
6de4f3ad
AK
5208
5209 __set_bit(VCPU_EXREG_PDPTR,
5210 (unsigned long *)&vcpu->arch.regs_avail);
5211 __set_bit(VCPU_EXREG_PDPTR,
5212 (unsigned long *)&vcpu->arch.regs_dirty);
8f5d549f
AK
5213}
5214
3899152c
DM
5215static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
5216{
6677f3da
PB
5217 u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
5218 u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
3899152c
DM
5219 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5220
6677f3da 5221 if (to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
3899152c
DM
5222 SECONDARY_EXEC_UNRESTRICTED_GUEST &&
5223 nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
5224 fixed0 &= ~(X86_CR0_PE | X86_CR0_PG);
5225
5226 return fixed_bits_valid(val, fixed0, fixed1);
5227}
5228
5229static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
5230{
6677f3da
PB
5231 u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
5232 u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
3899152c
DM
5233
5234 return fixed_bits_valid(val, fixed0, fixed1);
5235}
5236
5237static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
5238{
6677f3da
PB
5239 u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0;
5240 u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1;
3899152c
DM
5241
5242 return fixed_bits_valid(val, fixed0, fixed1);
5243}
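
/*
 * Illustrative sketch, not part of vmx.c: the semantics behind the
 * fixed_bits_valid() helper used above (defined elsewhere in KVM; the body
 * below is an assumption about its behaviour).  A value is acceptable if
 * every bit set in fixed0 is also set in it and every bit clear in fixed1 is
 * clear in it.  The MSR values used here (0x80000021 / 0xffffffff) are only
 * typical readings of MSR_IA32_VMX_CR0_FIXED0/FIXED1, shown as an example.
 */
#include <assert.h>
#include <stdint.h>

static int fixed_bits_ok(uint64_t val, uint64_t fixed0, uint64_t fixed1)
{
	return ((val & fixed1) | fixed0) == val;
}

int main(void)
{
	uint64_t fixed0 = 0x80000021;	/* PG | NE | PE must be 1 */
	uint64_t fixed1 = 0xffffffff;	/* no bit is forced to 0 */

	assert(fixed_bits_ok(0x80000031, fixed0, fixed1));	/* PG|NE|ET|PE */
	assert(!fixed_bits_ok(0x00000031, fixed0, fixed1));	/* PG missing */

	/* With "unrestricted guest", nested_guest_cr0_valid() above clears
	 * PE and PG from fixed0, so a CR0 with PE=PG=0 (here just CR0.NE)
	 * becomes legal for L2. */
	assert(fixed_bits_ok(0x20, fixed0 & ~0x80000001ull, fixed1));
	return 0;
}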
5244
5245/* No difference in the restrictions on guest and host CR4 in VMX operation. */
5246#define nested_guest_cr4_valid nested_cr4_valid
5247#define nested_host_cr4_valid nested_cr4_valid
5248
5e1746d6 5249static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
1439442c
SY
5250
5251static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
5252 unsigned long cr0,
5253 struct kvm_vcpu *vcpu)
5254{
5233dd51
MT
5255 if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
5256 vmx_decache_cr3(vcpu);
1439442c
SY
5257 if (!(cr0 & X86_CR0_PG)) {
5258 /* From paging/starting to nonpaging */
5259 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
65267ea1 5260 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
1439442c
SY
5261 (CPU_BASED_CR3_LOAD_EXITING |
5262 CPU_BASED_CR3_STORE_EXITING));
5263 vcpu->arch.cr0 = cr0;
fc78f519 5264 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
1439442c
SY
5265 } else if (!is_paging(vcpu)) {
5266 /* From nonpaging to paging */
5267 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
65267ea1 5268 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
1439442c
SY
5269 ~(CPU_BASED_CR3_LOAD_EXITING |
5270 CPU_BASED_CR3_STORE_EXITING));
5271 vcpu->arch.cr0 = cr0;
fc78f519 5272 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
1439442c 5273 }
95eb84a7
SY
5274
5275 if (!(cr0 & X86_CR0_WP))
5276 *hw_cr0 &= ~X86_CR0_WP;
1439442c
SY
5277}
5278
6aa8b732
AK
5279static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
5280{
7ffd92c5 5281 struct vcpu_vmx *vmx = to_vmx(vcpu);
3a624e29
NK
5282 unsigned long hw_cr0;
5283
3de6347b 5284 hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
3a624e29 5285 if (enable_unrestricted_guest)
5037878e 5286 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
218e763f 5287 else {
5037878e 5288 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
1439442c 5289
218e763f
GN
5290 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
5291 enter_pmode(vcpu);
6aa8b732 5292
218e763f
GN
5293 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
5294 enter_rmode(vcpu);
5295 }
6aa8b732 5296
05b3e0c2 5297#ifdef CONFIG_X86_64
f6801dff 5298 if (vcpu->arch.efer & EFER_LME) {
707d92fa 5299 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
6aa8b732 5300 enter_lmode(vcpu);
707d92fa 5301 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
6aa8b732
AK
5302 exit_lmode(vcpu);
5303 }
5304#endif
5305
b4d18517 5306 if (enable_ept && !enable_unrestricted_guest)
1439442c
SY
5307 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
5308
6aa8b732 5309 vmcs_writel(CR0_READ_SHADOW, cr0);
1439442c 5310 vmcs_writel(GUEST_CR0, hw_cr0);
ad312c7c 5311 vcpu->arch.cr0 = cr0;
14168786
GN
5312
5313 /* depends on vcpu->arch.cr0 to be set to a new value */
5314 vmx->emulation_required = emulation_required(vcpu);
6aa8b732
AK
5315}
5316
855feb67
YZ
5317static int get_ept_level(struct kvm_vcpu *vcpu)
5318{
5319 if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
5320 return 5;
5321 return 4;
5322}
5323
995f00a6 5324static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
1439442c 5325{
855feb67
YZ
5326 u64 eptp = VMX_EPTP_MT_WB;
5327
5328 eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
1439442c 5329
995f00a6
PF
5330 if (enable_ept_ad_bits &&
5331 (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
bb97a016 5332 eptp |= VMX_EPTP_AD_ENABLE_BIT;
1439442c
SY
5333 eptp |= (root_hpa & PAGE_MASK);
5334
5335 return eptp;
5336}
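
/*
 * Illustrative sketch, not part of vmx.c: the EPTP layout produced by
 * construct_eptp() above.  Bits 2:0 hold the memory type (6 == write-back),
 * bits 5:3 hold "page-walk length - 1", bit 6 enables accessed/dirty flags,
 * and the upper bits carry the page-aligned root HPA.  The constants mirror
 * the VMX_EPTP_* definitions in asm/vmx.h.
 */
#include <assert.h>
#include <stdint.h>

#define EPTP_MT_WB	0x6ull		/* write-back memory type */
#define EPTP_PWL_4	(3ull << 3)	/* 4-level page walk */
#define EPTP_PWL_5	(4ull << 3)	/* 5-level page walk */
#define EPTP_AD_ENABLE	(1ull << 6)	/* accessed/dirty bits enabled */

static uint64_t build_eptp(uint64_t root_hpa, int level, int ad_bits)
{
	uint64_t eptp = EPTP_MT_WB;

	eptp |= (level == 5) ? EPTP_PWL_5 : EPTP_PWL_4;
	if (ad_bits)
		eptp |= EPTP_AD_ENABLE;
	return eptp | (root_hpa & ~0xfffull);
}

int main(void)
{
	/* 4-level EPT, A/D enabled, root table at 0x12345000 */
	assert(build_eptp(0x12345000, 4, 1) == 0x1234505e);
	return 0;
}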
5337
6aa8b732
AK
5338static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
5339{
877ad952 5340 struct kvm *kvm = vcpu->kvm;
1439442c
SY
5341 unsigned long guest_cr3;
5342 u64 eptp;
5343
5344 guest_cr3 = cr3;
089d034e 5345 if (enable_ept) {
995f00a6 5346 eptp = construct_eptp(vcpu, cr3);
1439442c 5347 vmcs_write64(EPT_POINTER, eptp);
877ad952
TL
5348
5349 if (kvm_x86_ops->tlb_remote_flush) {
5350 spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
5351 to_vmx(vcpu)->ept_pointer = eptp;
5352 to_kvm_vmx(kvm)->ept_pointers_match
5353 = EPT_POINTERS_CHECK;
5354 spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
5355 }
5356
e90008df
SC
5357 if (enable_unrestricted_guest || is_paging(vcpu) ||
5358 is_guest_mode(vcpu))
59ab5a8f
JK
5359 guest_cr3 = kvm_read_cr3(vcpu);
5360 else
877ad952 5361 guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
7c93be44 5362 ept_load_pdptrs(vcpu);
1439442c
SY
5363 }
5364
1439442c 5365 vmcs_writel(GUEST_CR3, guest_cr3);
6aa8b732
AK
5366}
5367
5e1746d6 5368static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
6aa8b732 5369{
085e68ee
BS
5370 /*
5371 * Pass through host's Machine Check Enable value to hw_cr4, which
5372 * is in force while we are in guest mode. Do not let guests control
5373 * this bit, even if host CR4.MCE == 0.
5374 */
5dc1f044
SC
5375 unsigned long hw_cr4;
5376
5377 hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
5378 if (enable_unrestricted_guest)
5379 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
5380 else if (to_vmx(vcpu)->rmode.vm86_active)
5381 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
5382 else
5383 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
1439442c 5384
64f7a115
SC
5385 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
5386 if (cr4 & X86_CR4_UMIP) {
5387 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
0367f205 5388 SECONDARY_EXEC_DESC);
64f7a115
SC
5389 hw_cr4 &= ~X86_CR4_UMIP;
5390 } else if (!is_guest_mode(vcpu) ||
5391 !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC))
5392 vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
5393 SECONDARY_EXEC_DESC);
5394 }
0367f205 5395
5e1746d6
NHE
5396 if (cr4 & X86_CR4_VMXE) {
5397 /*
5398 * To use VMXON (and later other VMX instructions), a guest
5399 * must first be able to turn on cr4.VMXE (see handle_vmon()).
5400 * So basically the check on whether to allow nested VMX
5bea5123
PB
5401 * is here. We operate under the default treatment of SMM,
5402 * so VMX cannot be enabled under SMM.
5e1746d6 5403 */
5bea5123 5404 if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
5e1746d6 5405 return 1;
1a0d74e6 5406 }
3899152c
DM
5407
5408 if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
5e1746d6
NHE
5409 return 1;
5410
ad312c7c 5411 vcpu->arch.cr4 = cr4;
5dc1f044
SC
5412
5413 if (!enable_unrestricted_guest) {
5414 if (enable_ept) {
5415 if (!is_paging(vcpu)) {
5416 hw_cr4 &= ~X86_CR4_PAE;
5417 hw_cr4 |= X86_CR4_PSE;
5418 } else if (!(cr4 & X86_CR4_PAE)) {
5419 hw_cr4 &= ~X86_CR4_PAE;
5420 }
bc23008b 5421 }
1439442c 5422
656ec4a4 5423 /*
ddba2628
HH
5424 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
5425 * hardware. To emulate this behavior, SMEP/SMAP/PKU needs
5426 * to be manually disabled when guest switches to non-paging
5427 * mode.
5428 *
5429 * If !enable_unrestricted_guest, the CPU is always running
5430 * with CR0.PG=1 and CR4 needs to be modified.
5431 * If enable_unrestricted_guest, the CPU automatically
5432 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
656ec4a4 5433 */
5dc1f044
SC
5434 if (!is_paging(vcpu))
5435 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
5436 }
656ec4a4 5437
1439442c
SY
5438 vmcs_writel(CR4_READ_SHADOW, cr4);
5439 vmcs_writel(GUEST_CR4, hw_cr4);
5e1746d6 5440 return 0;
6aa8b732
AK
5441}
5442
6aa8b732
AK
5443static void vmx_get_segment(struct kvm_vcpu *vcpu,
5444 struct kvm_segment *var, int seg)
5445{
a9179499 5446 struct vcpu_vmx *vmx = to_vmx(vcpu);
6aa8b732
AK
5447 u32 ar;
5448
c6ad1153 5449 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
f5f7b2fe 5450 *var = vmx->rmode.segs[seg];
a9179499 5451 if (seg == VCPU_SREG_TR
2fb92db1 5452 || var->selector == vmx_read_guest_seg_selector(vmx, seg))
f5f7b2fe 5453 return;
1390a28b
AK
5454 var->base = vmx_read_guest_seg_base(vmx, seg);
5455 var->selector = vmx_read_guest_seg_selector(vmx, seg);
5456 return;
a9179499 5457 }
2fb92db1
AK
5458 var->base = vmx_read_guest_seg_base(vmx, seg);
5459 var->limit = vmx_read_guest_seg_limit(vmx, seg);
5460 var->selector = vmx_read_guest_seg_selector(vmx, seg);
5461 ar = vmx_read_guest_seg_ar(vmx, seg);
03617c18 5462 var->unusable = (ar >> 16) & 1;
6aa8b732
AK
5463 var->type = ar & 15;
5464 var->s = (ar >> 4) & 1;
5465 var->dpl = (ar >> 5) & 3;
03617c18
GN
5466 /*
5467 * Some userspaces do not preserve unusable property. Since usable
5468 * segment has to be present according to VMX spec we can use present
5469 * property to amend userspace bug by making unusable segment always
5470 * nonpresent. vmx_segment_access_rights() already marks nonpresent
5471 * segment as unusable.
5472 */
5473 var->present = !var->unusable;
6aa8b732
AK
5474 var->avl = (ar >> 12) & 1;
5475 var->l = (ar >> 13) & 1;
5476 var->db = (ar >> 14) & 1;
5477 var->g = (ar >> 15) & 1;
6aa8b732
AK
5478}
5479
a9179499
AK
5480static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
5481{
a9179499
AK
5482 struct kvm_segment s;
5483
5484 if (to_vmx(vcpu)->rmode.vm86_active) {
5485 vmx_get_segment(vcpu, &s, seg);
5486 return s.base;
5487 }
2fb92db1 5488 return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
a9179499
AK
5489}
5490
b09408d0 5491static int vmx_get_cpl(struct kvm_vcpu *vcpu)
2e4d2653 5492{
b09408d0
MT
5493 struct vcpu_vmx *vmx = to_vmx(vcpu);
5494
ae9fedc7 5495 if (unlikely(vmx->rmode.vm86_active))
2e4d2653 5496 return 0;
ae9fedc7
PB
5497 else {
5498 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
4d283ec9 5499 return VMX_AR_DPL(ar);
69c73028 5500 }
69c73028
AK
5501}
5502
653e3108 5503static u32 vmx_segment_access_rights(struct kvm_segment *var)
6aa8b732 5504{
6aa8b732
AK
5505 u32 ar;
5506
f0495f9b 5507 if (var->unusable || !var->present)
6aa8b732
AK
5508 ar = 1 << 16;
5509 else {
5510 ar = var->type & 15;
5511 ar |= (var->s & 1) << 4;
5512 ar |= (var->dpl & 3) << 5;
5513 ar |= (var->present & 1) << 7;
5514 ar |= (var->avl & 1) << 12;
5515 ar |= (var->l & 1) << 13;
5516 ar |= (var->db & 1) << 14;
5517 ar |= (var->g & 1) << 15;
5518 }
653e3108
AK
5519
5520 return ar;
5521}
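
/*
 * Illustrative sketch, not part of vmx.c: the access-rights encoding that
 * vmx_segment_access_rights() above produces.  A flat 32-bit code segment
 * (type 0xb, S=1, DPL=0, P=1, D/B=1, G=1) packs to 0xc09b, and an unusable
 * or non-present segment is represented by bit 16 alone (0x10000).
 */
#include <assert.h>
#include <stdint.h>

struct seg {
	uint8_t type, s, dpl, present, avl, l, db, g, unusable;
};

static uint32_t seg_access_rights(const struct seg *v)
{
	if (v->unusable || !v->present)
		return 1u << 16;
	return (v->type & 15) | (v->s & 1) << 4 | (v->dpl & 3) << 5 |
	       (v->present & 1) << 7 | (v->avl & 1) << 12 |
	       (v->l & 1) << 13 | (v->db & 1) << 14 | (v->g & 1) << 15;
}

int main(void)
{
	struct seg flat_code = {
		.type = 0xb, .s = 1, .dpl = 0, .present = 1,
		.avl = 0, .l = 0, .db = 1, .g = 1, .unusable = 0,
	};

	assert(seg_access_rights(&flat_code) == 0xc09b);
	return 0;
}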
5522
5523static void vmx_set_segment(struct kvm_vcpu *vcpu,
5524 struct kvm_segment *var, int seg)
5525{
7ffd92c5 5526 struct vcpu_vmx *vmx = to_vmx(vcpu);
772e0318 5527 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
653e3108 5528
2fb92db1
AK
5529 vmx_segment_cache_clear(vmx);
5530
1ecd50a9
GN
5531 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
5532 vmx->rmode.segs[seg] = *var;
5533 if (seg == VCPU_SREG_TR)
5534 vmcs_write16(sf->selector, var->selector);
5535 else if (var->s)
5536 fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
d99e4152 5537 goto out;
653e3108 5538 }
1ecd50a9 5539
653e3108
AK
5540 vmcs_writel(sf->base, var->base);
5541 vmcs_write32(sf->limit, var->limit);
5542 vmcs_write16(sf->selector, var->selector);
3a624e29
NK
5543
5544 /*
5545 * Fix the "Accessed" bit in AR field of segment registers for older
5546 * qemu binaries.
5547 * IA32 arch specifies that at the time of processor reset the
5548 * "Accessed" bit in the AR field of segment registers is 1. And qemu
0fa06071 5549 * is setting it to 0 in the userland code. This causes invalid guest
3a624e29
NK
5550 * state vmexit when "unrestricted guest" mode is turned on.
5551 * Fix for this setup issue in cpu_reset is being pushed in the qemu
5552 * tree. Newer qemu binaries with that qemu fix would not need this
5553 * kvm hack.
5554 */
5555 if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
f924d66d 5556 var->type |= 0x1; /* Accessed */
3a624e29 5557
f924d66d 5558 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
d99e4152
GN
5559
5560out:
98eb2f8b 5561 vmx->emulation_required = emulation_required(vcpu);
6aa8b732
AK
5562}
5563
6aa8b732
AK
5564static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
5565{
2fb92db1 5566 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
6aa8b732
AK
5567
5568 *db = (ar >> 14) & 1;
5569 *l = (ar >> 13) & 1;
5570}
5571
89a27f4d 5572static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 5573{
89a27f4d
GN
5574 dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
5575 dt->address = vmcs_readl(GUEST_IDTR_BASE);
6aa8b732
AK
5576}
5577
89a27f4d 5578static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 5579{
89a27f4d
GN
5580 vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
5581 vmcs_writel(GUEST_IDTR_BASE, dt->address);
6aa8b732
AK
5582}
5583
89a27f4d 5584static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 5585{
89a27f4d
GN
5586 dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
5587 dt->address = vmcs_readl(GUEST_GDTR_BASE);
6aa8b732
AK
5588}
5589
89a27f4d 5590static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 5591{
89a27f4d
GN
5592 vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
5593 vmcs_writel(GUEST_GDTR_BASE, dt->address);
6aa8b732
AK
5594}
5595
648dfaa7
MG
5596static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
5597{
5598 struct kvm_segment var;
5599 u32 ar;
5600
5601 vmx_get_segment(vcpu, &var, seg);
07f42f5f 5602 var.dpl = 0x3;
0647f4aa
GN
5603 if (seg == VCPU_SREG_CS)
5604 var.type = 0x3;
648dfaa7
MG
5605 ar = vmx_segment_access_rights(&var);
5606
5607 if (var.base != (var.selector << 4))
5608 return false;
89efbed0 5609 if (var.limit != 0xffff)
648dfaa7 5610 return false;
07f42f5f 5611 if (ar != 0xf3)
648dfaa7
MG
5612 return false;
5613
5614 return true;
5615}
5616
5617static bool code_segment_valid(struct kvm_vcpu *vcpu)
5618{
5619 struct kvm_segment cs;
5620 unsigned int cs_rpl;
5621
5622 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
b32a9918 5623 cs_rpl = cs.selector & SEGMENT_RPL_MASK;
648dfaa7 5624
1872a3f4
AK
5625 if (cs.unusable)
5626 return false;
4d283ec9 5627 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
648dfaa7
MG
5628 return false;
5629 if (!cs.s)
5630 return false;
4d283ec9 5631 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
648dfaa7
MG
5632 if (cs.dpl > cs_rpl)
5633 return false;
1872a3f4 5634 } else {
648dfaa7
MG
5635 if (cs.dpl != cs_rpl)
5636 return false;
5637 }
5638 if (!cs.present)
5639 return false;
5640
5641 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
5642 return true;
5643}
5644
5645static bool stack_segment_valid(struct kvm_vcpu *vcpu)
5646{
5647 struct kvm_segment ss;
5648 unsigned int ss_rpl;
5649
5650 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
b32a9918 5651 ss_rpl = ss.selector & SEGMENT_RPL_MASK;
648dfaa7 5652
1872a3f4
AK
5653 if (ss.unusable)
5654 return true;
5655 if (ss.type != 3 && ss.type != 7)
648dfaa7
MG
5656 return false;
5657 if (!ss.s)
5658 return false;
5659 if (ss.dpl != ss_rpl) /* DPL != RPL */
5660 return false;
5661 if (!ss.present)
5662 return false;
5663
5664 return true;
5665}
5666
5667static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
5668{
5669 struct kvm_segment var;
5670 unsigned int rpl;
5671
5672 vmx_get_segment(vcpu, &var, seg);
b32a9918 5673 rpl = var.selector & SEGMENT_RPL_MASK;
648dfaa7 5674
1872a3f4
AK
5675 if (var.unusable)
5676 return true;
648dfaa7
MG
5677 if (!var.s)
5678 return false;
5679 if (!var.present)
5680 return false;
4d283ec9 5681 if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
648dfaa7
MG
5682 if (var.dpl < rpl) /* DPL < RPL */
5683 return false;
5684 }
5685
5686 /* TODO: Add other members to kvm_segment_field to allow checking for other access
5687 * rights flags
5688 */
5689 return true;
5690}
5691
5692static bool tr_valid(struct kvm_vcpu *vcpu)
5693{
5694 struct kvm_segment tr;
5695
5696 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
5697
1872a3f4
AK
5698 if (tr.unusable)
5699 return false;
b32a9918 5700 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */
648dfaa7 5701 return false;
1872a3f4 5702 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
648dfaa7
MG
5703 return false;
5704 if (!tr.present)
5705 return false;
5706
5707 return true;
5708}
5709
5710static bool ldtr_valid(struct kvm_vcpu *vcpu)
5711{
5712 struct kvm_segment ldtr;
5713
5714 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
5715
1872a3f4
AK
5716 if (ldtr.unusable)
5717 return true;
b32a9918 5718 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */
648dfaa7
MG
5719 return false;
5720 if (ldtr.type != 2)
5721 return false;
5722 if (!ldtr.present)
5723 return false;
5724
5725 return true;
5726}
5727
5728static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
5729{
5730 struct kvm_segment cs, ss;
5731
5732 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
5733 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
5734
b32a9918
NA
5735 return ((cs.selector & SEGMENT_RPL_MASK) ==
5736 (ss.selector & SEGMENT_RPL_MASK));
648dfaa7
MG
5737}
5738
5739/*
5740 * Check if guest state is valid. Returns true if valid, false if
5741 * not.
5742 * We assume that registers are always usable
5743 */
5744static bool guest_state_valid(struct kvm_vcpu *vcpu)
5745{
c5e97c80
GN
5746 if (enable_unrestricted_guest)
5747 return true;
5748
648dfaa7 5749 /* real mode guest state checks */
f13882d8 5750 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
648dfaa7
MG
5751 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
5752 return false;
5753 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
5754 return false;
5755 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
5756 return false;
5757 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
5758 return false;
5759 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
5760 return false;
5761 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
5762 return false;
5763 } else {
5764 /* protected mode guest state checks */
5765 if (!cs_ss_rpl_check(vcpu))
5766 return false;
5767 if (!code_segment_valid(vcpu))
5768 return false;
5769 if (!stack_segment_valid(vcpu))
5770 return false;
5771 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
5772 return false;
5773 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
5774 return false;
5775 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
5776 return false;
5777 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
5778 return false;
5779 if (!tr_valid(vcpu))
5780 return false;
5781 if (!ldtr_valid(vcpu))
5782 return false;
5783 }
5784 /* TODO:
5785 * - Add checks on RIP
5786 * - Add checks on RFLAGS
5787 */
5788
5789 return true;
5790}
5791
5fa99cbe
JM
5792static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
5793{
5794 return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
5795}
5796
d77c26fc 5797static int init_rmode_tss(struct kvm *kvm)
6aa8b732 5798{
40dcaa9f 5799 gfn_t fn;
195aefde 5800 u16 data = 0;
1f755a82 5801 int idx, r;
6aa8b732 5802
40dcaa9f 5803 idx = srcu_read_lock(&kvm->srcu);
40bbb9d0 5804 fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
195aefde
IE
5805 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
5806 if (r < 0)
10589a46 5807 goto out;
195aefde 5808 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
464d17c8
SY
5809 r = kvm_write_guest_page(kvm, fn++, &data,
5810 TSS_IOPB_BASE_OFFSET, sizeof(u16));
195aefde 5811 if (r < 0)
10589a46 5812 goto out;
195aefde
IE
5813 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
5814 if (r < 0)
10589a46 5815 goto out;
195aefde
IE
5816 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
5817 if (r < 0)
10589a46 5818 goto out;
195aefde 5819 data = ~0;
10589a46
MT
5820 r = kvm_write_guest_page(kvm, fn, &data,
5821 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
5822 sizeof(u8));
10589a46 5823out:
40dcaa9f 5824 srcu_read_unlock(&kvm->srcu, idx);
1f755a82 5825 return r;
6aa8b732
AK
5826}
5827
b7ebfb05
SY
5828static int init_rmode_identity_map(struct kvm *kvm)
5829{
40bbb9d0 5830 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
f51770ed 5831 int i, idx, r = 0;
ba049e93 5832 kvm_pfn_t identity_map_pfn;
b7ebfb05
SY
5833 u32 tmp;
5834
40bbb9d0 5835 /* Protect kvm_vmx->ept_identity_pagetable_done. */
a255d479
TC
5836 mutex_lock(&kvm->slots_lock);
5837
40bbb9d0 5838 if (likely(kvm_vmx->ept_identity_pagetable_done))
a255d479 5839 goto out2;
a255d479 5840
40bbb9d0
SC
5841 if (!kvm_vmx->ept_identity_map_addr)
5842 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
5843 identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;
a255d479 5844
d8a6e365 5845 r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
40bbb9d0 5846 kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
f51770ed 5847 if (r < 0)
a255d479
TC
5848 goto out2;
5849
40dcaa9f 5850 idx = srcu_read_lock(&kvm->srcu);
b7ebfb05
SY
5851 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
5852 if (r < 0)
5853 goto out;
5854 /* Set up identity-mapping pagetable for EPT in real mode */
5855 for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
5856 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
5857 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
5858 r = kvm_write_guest_page(kvm, identity_map_pfn,
5859 &tmp, i * sizeof(tmp), sizeof(tmp));
5860 if (r < 0)
5861 goto out;
5862 }
40bbb9d0 5863 kvm_vmx->ept_identity_pagetable_done = true;
f51770ed 5864
b7ebfb05 5865out:
40dcaa9f 5866 srcu_read_unlock(&kvm->srcu, idx);
a255d479
TC
5867
5868out2:
5869 mutex_unlock(&kvm->slots_lock);
f51770ed 5870 return r;
b7ebfb05
SY
5871}
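
/*
 * Illustrative sketch, not part of vmx.c: the 4 MiB page-directory entries
 * written by init_rmode_identity_map() above.  Each of the 1024 entries maps
 * guest physical address (i << 22) onto itself with a large (PSE) page; the
 * low flag bits are PRESENT|RW|USER|ACCESSED|DIRTY|PSE == 0xe7.
 */
#include <assert.h>
#include <stdint.h>

#define PDE_FLAGS	(0x01 | 0x02 | 0x04 | 0x20 | 0x40 | 0x80)	/* 0xe7 */

static uint32_t identity_pde(unsigned int i)
{
	return (i << 22) | PDE_FLAGS;
}

int main(void)
{
	assert(identity_pde(0) == 0x000000e7);	/* maps GPA 0x00000000 */
	assert(identity_pde(1) == 0x004000e7);	/* maps GPA 0x00400000 */
	return 0;
}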
5872
6aa8b732
AK
5873static void seg_setup(int seg)
5874{
772e0318 5875 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3a624e29 5876 unsigned int ar;
6aa8b732
AK
5877
5878 vmcs_write16(sf->selector, 0);
5879 vmcs_writel(sf->base, 0);
5880 vmcs_write32(sf->limit, 0xffff);
d54d07b2
GN
5881 ar = 0x93;
5882 if (seg == VCPU_SREG_CS)
5883 ar |= 0x08; /* code segment */
3a624e29
NK
5884
5885 vmcs_write32(sf->ar_bytes, ar);
6aa8b732
AK
5886}
5887
f78e0e2e
SY
5888static int alloc_apic_access_page(struct kvm *kvm)
5889{
4484141a 5890 struct page *page;
f78e0e2e
SY
5891 int r = 0;
5892
79fac95e 5893 mutex_lock(&kvm->slots_lock);
c24ae0dc 5894 if (kvm->arch.apic_access_page_done)
f78e0e2e 5895 goto out;
1d8007bd
PB
5896 r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
5897 APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
f78e0e2e
SY
5898 if (r)
5899 goto out;
72dc67a6 5900
73a6d941 5901 page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
4484141a
XG
5902 if (is_error_page(page)) {
5903 r = -EFAULT;
5904 goto out;
5905 }
5906
c24ae0dc
TC
5907 /*
5908 * Do not pin the page in memory, so that memory hot-unplug
5909 * is able to migrate it.
5910 */
5911 put_page(page);
5912 kvm->arch.apic_access_page_done = true;
f78e0e2e 5913out:
79fac95e 5914 mutex_unlock(&kvm->slots_lock);
f78e0e2e
SY
5915 return r;
5916}
5917
991e7a0e 5918static int allocate_vpid(void)
2384d2b3
SY
5919{
5920 int vpid;
5921
919818ab 5922 if (!enable_vpid)
991e7a0e 5923 return 0;
2384d2b3
SY
5924 spin_lock(&vmx_vpid_lock);
5925 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
991e7a0e 5926 if (vpid < VMX_NR_VPIDS)
2384d2b3 5927 __set_bit(vpid, vmx_vpid_bitmap);
991e7a0e
WL
5928 else
5929 vpid = 0;
2384d2b3 5930 spin_unlock(&vmx_vpid_lock);
991e7a0e 5931 return vpid;
2384d2b3
SY
5932}
5933
991e7a0e 5934static void free_vpid(int vpid)
cdbecfc3 5935{
991e7a0e 5936 if (!enable_vpid || vpid == 0)
cdbecfc3
LJ
5937 return;
5938 spin_lock(&vmx_vpid_lock);
991e7a0e 5939 __clear_bit(vpid, vmx_vpid_bitmap);
cdbecfc3
LJ
5940 spin_unlock(&vmx_vpid_lock);
5941}
5942
904e14fb
PB
5943static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
5944 u32 msr, int type)
25c5f225 5945{
3e7c73e9 5946 int f = sizeof(unsigned long);
25c5f225
SY
5947
5948 if (!cpu_has_vmx_msr_bitmap())
5949 return;
5950
ceef7d10
VK
5951 if (static_branch_unlikely(&enable_evmcs))
5952 evmcs_touch_msr_bitmap();
5953
25c5f225
SY
5954 /*
5955 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
5956 * have the write-low and read-high bitmap offsets the wrong way round.
5957 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
5958 */
25c5f225 5959 if (msr <= 0x1fff) {
8d14695f
YZ
5960 if (type & MSR_TYPE_R)
5961 /* read-low */
5962 __clear_bit(msr, msr_bitmap + 0x000 / f);
5963
5964 if (type & MSR_TYPE_W)
5965 /* write-low */
5966 __clear_bit(msr, msr_bitmap + 0x800 / f);
5967
25c5f225
SY
5968 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
5969 msr &= 0x1fff;
8d14695f
YZ
5970 if (type & MSR_TYPE_R)
5971 /* read-high */
5972 __clear_bit(msr, msr_bitmap + 0x400 / f);
5973
5974 if (type & MSR_TYPE_W)
5975 /* write-high */
5976 __clear_bit(msr, msr_bitmap + 0xc00 / f);
5977
5978 }
5979}
5980
904e14fb
PB
5981static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
5982 u32 msr, int type)
5983{
5984 int f = sizeof(unsigned long);
5985
5986 if (!cpu_has_vmx_msr_bitmap())
5987 return;
5988
ceef7d10
VK
5989 if (static_branch_unlikely(&enable_evmcs))
5990 evmcs_touch_msr_bitmap();
5991
904e14fb
PB
5992 /*
5993 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
5994 * have the write-low and read-high bitmap offsets the wrong way round.
5995 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
5996 */
5997 if (msr <= 0x1fff) {
5998 if (type & MSR_TYPE_R)
5999 /* read-low */
6000 __set_bit(msr, msr_bitmap + 0x000 / f);
6001
6002 if (type & MSR_TYPE_W)
6003 /* write-low */
6004 __set_bit(msr, msr_bitmap + 0x800 / f);
6005
6006 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
6007 msr &= 0x1fff;
6008 if (type & MSR_TYPE_R)
6009 /* read-high */
6010 __set_bit(msr, msr_bitmap + 0x400 / f);
6011
6012 if (type & MSR_TYPE_W)
6013 /* write-high */
6014 __set_bit(msr, msr_bitmap + 0xc00 / f);
6015
6016 }
6017}
6018
6019static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
6020 u32 msr, int type, bool value)
6021{
6022 if (value)
6023 vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
6024 else
6025 vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
6026}
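
/*
 * Illustrative sketch, not part of vmx.c: where a given MSR lands in the
 * 4 KiB MSR bitmap manipulated by the helpers above.  Low MSRs
 * (0x0 - 0x1fff) use the read bitmap at page offset 0x000 and the write
 * bitmap at 0x800; "high" MSRs (0xc0000000 - 0xc0001fff) use 0x400 and
 * 0xc00, indexed by (msr & 0x1fff).
 */
#include <assert.h>
#include <stdint.h>

/* Returns the bit index within the 4 KiB (32768-bit) bitmap, or -1. */
static int msr_bitmap_bit(uint32_t msr, int write)
{
	if (msr <= 0x1fff)
		return (write ? 0x800 : 0x000) * 8 + msr;
	if (msr >= 0xc0000000 && msr <= 0xc0001fff)
		return (write ? 0xc00 : 0x400) * 8 + (msr & 0x1fff);
	return -1;	/* not covered by the bitmap: always intercepted */
}

int main(void)
{
	/* MSR_IA32_TSC (0x10): read bit 0x10, write bit 0x4010 */
	assert(msr_bitmap_bit(0x10, 0) == 0x10);
	assert(msr_bitmap_bit(0x10, 1) == 0x800 * 8 + 0x10);
	/* MSR_LSTAR (0xc0000082): uses the "high" halves of the bitmap */
	assert(msr_bitmap_bit(0xc0000082, 0) == 0x400 * 8 + 0x82);
	return 0;
}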
6027
f2b93280
WV
6028/*
6029 * If a msr is allowed by L0, we should check whether it is allowed by L1.
6030 * The corresponding bit will be cleared unless both of L0 and L1 allow it.
6031 */
6032static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
6033 unsigned long *msr_bitmap_nested,
6034 u32 msr, int type)
6035{
6036 int f = sizeof(unsigned long);
6037
f2b93280
WV
6038 /*
6039 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
6040 * have the write-low and read-high bitmap offsets the wrong way round.
6041 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
6042 */
6043 if (msr <= 0x1fff) {
6044 if (type & MSR_TYPE_R &&
6045 !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
6046 /* read-low */
6047 __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
6048
6049 if (type & MSR_TYPE_W &&
6050 !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
6051 /* write-low */
6052 __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
6053
6054 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
6055 msr &= 0x1fff;
6056 if (type & MSR_TYPE_R &&
6057 !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
6058 /* read-high */
6059 __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
6060
6061 if (type & MSR_TYPE_W &&
6062 !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
6063 /* write-high */
6064 __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
6065
6066 }
6067}
6068
904e14fb 6069static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
5897297b 6070{
904e14fb
PB
6071 u8 mode = 0;
6072
6073 if (cpu_has_secondary_exec_ctrls() &&
6074 (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
6075 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
6076 mode |= MSR_BITMAP_MODE_X2APIC;
6077 if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
6078 mode |= MSR_BITMAP_MODE_X2APIC_APICV;
6079 }
6080
904e14fb 6081 return mode;
8d14695f
YZ
6082}
6083
904e14fb
PB
6084#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
6085
6086static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
6087 u8 mode)
8d14695f 6088{
904e14fb
PB
6089 int msr;
6090
6091 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
6092 unsigned word = msr / BITS_PER_LONG;
6093 msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
6094 msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
6095 }
6096
6097 if (mode & MSR_BITMAP_MODE_X2APIC) {
6098 /*
6099 * TPR reads and writes can be virtualized even if virtual interrupt
6100 * delivery is not in use.
6101 */
6102 vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
6103 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
6104 vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
6105 vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
6106 vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
6107 }
f6e90f9e 6108 }
5897297b
AK
6109}
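
/*
 * Illustrative sketch, not part of vmx.c: the X2APIC_MSR() mapping used
 * above.  Each x2APIC register at MMIO offset "r" becomes MSR
 * 0x800 + (r >> 4), so the 0x800-0x8ff loop above covers the whole x2APIC
 * register space.  The APIC_* register offsets follow asm/apicdef.h.
 */
#include <assert.h>

#define APIC_BASE_MSR	0x800
#define X2APIC_MSR(r)	(APIC_BASE_MSR + ((r) >> 4))

int main(void)
{
	assert(X2APIC_MSR(0x080) == 0x808);	/* APIC_TASKPRI (TPR) */
	assert(X2APIC_MSR(0x0b0) == 0x80b);	/* APIC_EOI */
	assert(X2APIC_MSR(0x3f0) == 0x83f);	/* APIC_SELF_IPI */
	return 0;
}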
6110
904e14fb
PB
6111static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
6112{
6113 struct vcpu_vmx *vmx = to_vmx(vcpu);
6114 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
6115 u8 mode = vmx_msr_bitmap_mode(vcpu);
6116 u8 changed = mode ^ vmx->msr_bitmap_mode;
6117
6118 if (!changed)
6119 return;
6120
904e14fb
PB
6121 if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
6122 vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
6123
6124 vmx->msr_bitmap_mode = mode;
6125}
6126
b2a05fef 6127static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
d50ab6c1 6128{
d62caabb 6129 return enable_apicv;
d50ab6c1
PB
6130}
6131
c9f04407
DM
6132static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
6133{
6134 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6135 gfn_t gfn;
6136
6137 /*
6138 * Don't need to mark the APIC access page dirty; it is never
6139 * written to by the CPU during APIC virtualization.
6140 */
6141
6142 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
6143 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
6144 kvm_vcpu_mark_page_dirty(vcpu, gfn);
6145 }
6146
6147 if (nested_cpu_has_posted_intr(vmcs12)) {
6148 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
6149 kvm_vcpu_mark_page_dirty(vcpu, gfn);
6150 }
6151}
6152
6153
6342c50a 6154static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
705699a1
WV
6155{
6156 struct vcpu_vmx *vmx = to_vmx(vcpu);
6157 int max_irr;
6158 void *vapic_page;
6159 u16 status;
6160
c9f04407
DM
6161 if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
6162 return;
705699a1 6163
c9f04407
DM
6164 vmx->nested.pi_pending = false;
6165 if (!pi_test_and_clear_on(vmx->nested.pi_desc))
6166 return;
705699a1 6167
c9f04407
DM
6168 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
6169 if (max_irr != 256) {
705699a1 6170 vapic_page = kmap(vmx->nested.virtual_apic_page);
e7387b0e
LA
6171 __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
6172 vapic_page, &max_irr);
705699a1
WV
6173 kunmap(vmx->nested.virtual_apic_page);
6174
6175 status = vmcs_read16(GUEST_INTR_STATUS);
6176 if ((u8)max_irr > ((u8)status & 0xff)) {
6177 status &= ~0xff;
6178 status |= (u8)max_irr;
6179 vmcs_write16(GUEST_INTR_STATUS, status);
6180 }
6181 }
c9f04407
DM
6182
6183 nested_mark_vmcs12_pages_dirty(vcpu);
705699a1
WV
6184}
6185
7e712684
PB
6186static u8 vmx_get_rvi(void)
6187{
6188 return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
6189}
6190
e6c67d8c
LA
6191static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
6192{
6193 struct vcpu_vmx *vmx = to_vmx(vcpu);
6194 void *vapic_page;
6195 u32 vppr;
6196 int rvi;
6197
6198 if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
6199 !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
6200 WARN_ON_ONCE(!vmx->nested.virtual_apic_page))
6201 return false;
6202
7e712684 6203 rvi = vmx_get_rvi();
e6c67d8c
LA
6204
6205 vapic_page = kmap(vmx->nested.virtual_apic_page);
6206 vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
6207 kunmap(vmx->nested.virtual_apic_page);
6208
6209 return ((rvi & 0xf0) > (vppr & 0xf0));
6210}
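
/*
 * Illustrative sketch, not part of vmx.c: the priority-class comparison in
 * vmx_guest_apic_has_interrupt() above.  Only the upper nibble of a vector
 * (its priority class) matters: a requested vector is deliverable only if
 * its class is strictly higher than the class held in the virtual PPR.
 */
#include <assert.h>
#include <stdint.h>

static int vector_beats_ppr(uint8_t rvi, uint8_t vppr)
{
	return (rvi & 0xf0) > (vppr & 0xf0);
}

int main(void)
{
	assert(vector_beats_ppr(0x51, 0x40));	/* class 5 > class 4 */
	assert(!vector_beats_ppr(0x4f, 0x40));	/* same class: blocked */
	return 0;
}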
6211
06a5524f
WV
6212static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
6213 bool nested)
21bc8dc5
RK
6214{
6215#ifdef CONFIG_SMP
06a5524f
WV
6216 int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
6217
21bc8dc5 6218 if (vcpu->mode == IN_GUEST_MODE) {
28b835d6 6219 /*
5753743f
HZ
6220 * The vector of interrupt to be delivered to vcpu had
6221 * been set in PIR before this function.
6222 *
6223 * Following cases will be reached in this block, and
6224 * we always send a notification event in all cases as
6225 * explained below.
6226 *
6227 * Case 1: vcpu keeps in non-root mode. Sending a
6228 * notification event posts the interrupt to vcpu.
6229 *
6230 * Case 2: vcpu exits to root mode and is still
6231 * runnable. PIR will be synced to vIRR before the
6232 * next vcpu entry. Sending a notification event in
6233 * this case has no effect, as vcpu is not in root
6234 * mode.
28b835d6 6235 *
5753743f
HZ
6236 * Case 3: vcpu exits to root mode and is blocked.
6237 * vcpu_block() has already synced PIR to vIRR and
6238 * never blocks vcpu if vIRR is not cleared. Therefore,
6239 * a blocked vcpu here does not wait for any requested
6240 * interrupts in PIR, and sending a notification event
6241 * which has no effect is safe here.
28b835d6 6242 */
28b835d6 6243
06a5524f 6244 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
21bc8dc5
RK
6245 return true;
6246 }
6247#endif
6248 return false;
6249}
6250
705699a1
WV
6251static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
6252 int vector)
6253{
6254 struct vcpu_vmx *vmx = to_vmx(vcpu);
6255
6256 if (is_guest_mode(vcpu) &&
6257 vector == vmx->nested.posted_intr_nv) {
705699a1
WV
6258 /*
6259 * If a posted intr is not recognized by hardware,
6260 * we will accomplish it in the next vmentry.
6261 */
6262 vmx->nested.pi_pending = true;
6263 kvm_make_request(KVM_REQ_EVENT, vcpu);
6b697711
LA
6264 /* the PIR and ON have been set by L1. */
6265 if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
6266 kvm_vcpu_kick(vcpu);
705699a1
WV
6267 return 0;
6268 }
6269 return -1;
6270}
a20ed54d
YZ
6271/*
6272 * Send an interrupt to a vcpu via the posted-interrupt mechanism.
6273 * 1. If the target vcpu is running (non-root mode), send a posted interrupt
6274 * notification to the vcpu and the hardware will sync PIR to vIRR atomically.
6275 * 2. If the target vcpu isn't running (root mode), kick it so it picks up
6276 * the interrupt from PIR on the next vmentry.
6277 */
6278static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
6279{
6280 struct vcpu_vmx *vmx = to_vmx(vcpu);
6281 int r;
6282
705699a1
WV
6283 r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
6284 if (!r)
6285 return;
6286
a20ed54d
YZ
6287 if (pi_test_and_set_pir(vector, &vmx->pi_desc))
6288 return;
6289
b95234c8
PB
6290 /* If a previous notification has sent the IPI, nothing to do. */
6291 if (pi_test_and_set_on(&vmx->pi_desc))
6292 return;
6293
06a5524f 6294 if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
a20ed54d
YZ
6295 kvm_vcpu_kick(vcpu);
6296}
6297
a3a8ff8e
NHE
6298/*
6299 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
6300 * will not change in the lifetime of the guest.
6301 * Note that host-state that does change is set elsewhere. E.g., host-state
6302 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
6303 */
a547c6db 6304static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
a3a8ff8e
NHE
6305{
6306 u32 low32, high32;
6307 unsigned long tmpl;
6308 struct desc_ptr dt;
d6e41f11 6309 unsigned long cr0, cr3, cr4;
a3a8ff8e 6310
04ac88ab
AL
6311 cr0 = read_cr0();
6312 WARN_ON(cr0 & X86_CR0_TS);
6313 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
d6e41f11
AL
6314
6315 /*
6316 * Save the most likely value for this task's CR3 in the VMCS.
6317 * We can't use __get_current_cr3_fast() because we're not atomic.
6318 */
6c690ee1 6319 cr3 = __read_cr3();
d6e41f11 6320 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
d7ee039e 6321 vmx->loaded_vmcs->host_state.cr3 = cr3;
a3a8ff8e 6322
d974baa3 6323 /* Save the most likely value for this task's CR4 in the VMCS. */
1e02ce4c 6324 cr4 = cr4_read_shadow();
d974baa3 6325 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
d7ee039e 6326 vmx->loaded_vmcs->host_state.cr4 = cr4;
d974baa3 6327
a3a8ff8e 6328 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
b2da15ac
AK
6329#ifdef CONFIG_X86_64
6330 /*
6331 * Load null selectors, so we can avoid reloading them in
6d6095bd
SC
6332 * vmx_prepare_switch_to_host(), in case userspace uses
6333 * the null selectors too (the expected case).
b2da15ac
AK
6334 */
6335 vmcs_write16(HOST_DS_SELECTOR, 0);
6336 vmcs_write16(HOST_ES_SELECTOR, 0);
6337#else
a3a8ff8e
NHE
6338 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
6339 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
b2da15ac 6340#endif
a3a8ff8e
NHE
6341 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
6342 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
6343
87930019 6344 store_idt(&dt);
a3a8ff8e 6345 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
a547c6db 6346 vmx->host_idt_base = dt.address;
a3a8ff8e 6347
83287ea4 6348 vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */
a3a8ff8e
NHE
6349
6350 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
6351 vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
6352 rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
6353 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
6354
6355 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
6356 rdmsr(MSR_IA32_CR_PAT, low32, high32);
6357 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
6358 }
5a5e8a15
SC
6359
6360 if (cpu_has_load_ia32_efer)
6361 vmcs_write64(HOST_IA32_EFER, host_efer);
a3a8ff8e
NHE
6362}
6363
bf8179a0
NHE
6364static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
6365{
6366 vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
6367 if (enable_ept)
6368 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
fe3ef05c
NHE
6369 if (is_guest_mode(&vmx->vcpu))
6370 vmx->vcpu.arch.cr4_guest_owned_bits &=
6371 ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
bf8179a0
NHE
6372 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
6373}
6374
01e439be
YZ
6375static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
6376{
6377 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
6378
d62caabb 6379 if (!kvm_vcpu_apicv_active(&vmx->vcpu))
01e439be 6380 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
d02fcf50
PB
6381
6382 if (!enable_vnmi)
6383 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
6384
64672c95
YJ
6385 /* Enable the preemption timer dynamically */
6386 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
01e439be
YZ
6387 return pin_based_exec_ctrl;
6388}
6389
d62caabb
AS
6390static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
6391{
6392 struct vcpu_vmx *vmx = to_vmx(vcpu);
6393
6394 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
3ce424e4
RK
6395 if (cpu_has_secondary_exec_ctrls()) {
6396 if (kvm_vcpu_apicv_active(vcpu))
6397 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
6398 SECONDARY_EXEC_APIC_REGISTER_VIRT |
6399 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
6400 else
6401 vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
6402 SECONDARY_EXEC_APIC_REGISTER_VIRT |
6403 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
6404 }
6405
6406 if (cpu_has_vmx_msr_bitmap())
904e14fb 6407 vmx_update_msr_bitmap(vcpu);
d62caabb
AS
6408}
6409
bf8179a0
NHE
6410static u32 vmx_exec_control(struct vcpu_vmx *vmx)
6411{
6412 u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
d16c293e
PB
6413
6414 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
6415 exec_control &= ~CPU_BASED_MOV_DR_EXITING;
6416
35754c98 6417 if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
bf8179a0
NHE
6418 exec_control &= ~CPU_BASED_TPR_SHADOW;
6419#ifdef CONFIG_X86_64
6420 exec_control |= CPU_BASED_CR8_STORE_EXITING |
6421 CPU_BASED_CR8_LOAD_EXITING;
6422#endif
6423 }
6424 if (!enable_ept)
6425 exec_control |= CPU_BASED_CR3_STORE_EXITING |
6426 CPU_BASED_CR3_LOAD_EXITING |
6427 CPU_BASED_INVLPG_EXITING;
4d5422ce
WL
6428 if (kvm_mwait_in_guest(vmx->vcpu.kvm))
6429 exec_control &= ~(CPU_BASED_MWAIT_EXITING |
6430 CPU_BASED_MONITOR_EXITING);
caa057a2
WL
6431 if (kvm_hlt_in_guest(vmx->vcpu.kvm))
6432 exec_control &= ~CPU_BASED_HLT_EXITING;
bf8179a0
NHE
6433 return exec_control;
6434}
6435
45ec368c 6436static bool vmx_rdrand_supported(void)
bf8179a0 6437{
45ec368c 6438 return vmcs_config.cpu_based_2nd_exec_ctrl &
736fdf72 6439 SECONDARY_EXEC_RDRAND_EXITING;
45ec368c
JM
6440}
6441
75f4fc8d
JM
6442static bool vmx_rdseed_supported(void)
6443{
6444 return vmcs_config.cpu_based_2nd_exec_ctrl &
736fdf72 6445 SECONDARY_EXEC_RDSEED_EXITING;
75f4fc8d
JM
6446}
6447
80154d77 6448static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
bf8179a0 6449{
80154d77
PB
6450 struct kvm_vcpu *vcpu = &vmx->vcpu;
6451
bf8179a0 6452 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
0367f205 6453
80154d77 6454 if (!cpu_need_virtualize_apic_accesses(vcpu))
bf8179a0
NHE
6455 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6456 if (vmx->vpid == 0)
6457 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
6458 if (!enable_ept) {
6459 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
6460 enable_unrestricted_guest = 0;
6461 }
6462 if (!enable_unrestricted_guest)
6463 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
b31c114b 6464 if (kvm_pause_in_guest(vmx->vcpu.kvm))
bf8179a0 6465 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
80154d77 6466 if (!kvm_vcpu_apicv_active(vcpu))
c7c9c56c
YZ
6467 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
6468 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
8d14695f 6469 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
0367f205
PB
6470
6471 /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
6472 * in vmx_set_cr4. */
6473 exec_control &= ~SECONDARY_EXEC_DESC;
6474
abc4fc58
AG
6475 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
6476 (handle_vmptrld).
6477 We can NOT enable shadow_vmcs here because we don't yet have
6478 a current VMCS12.
6479 */
6480 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
a3eaa864
KH
6481
6482 if (!enable_pml)
6483 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
843e4330 6484
3db13480
PB
6485 if (vmx_xsaves_supported()) {
6486 /* Exposing XSAVES only when XSAVE is exposed */
6487 bool xsaves_enabled =
6488 guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
6489 guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
6490
6491 if (!xsaves_enabled)
6492 exec_control &= ~SECONDARY_EXEC_XSAVES;
6493
6494 if (nested) {
6495 if (xsaves_enabled)
6677f3da 6496 vmx->nested.msrs.secondary_ctls_high |=
3db13480
PB
6497 SECONDARY_EXEC_XSAVES;
6498 else
6677f3da 6499 vmx->nested.msrs.secondary_ctls_high &=
3db13480
PB
6500 ~SECONDARY_EXEC_XSAVES;
6501 }
6502 }
6503
80154d77
PB
6504 if (vmx_rdtscp_supported()) {
6505 bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
6506 if (!rdtscp_enabled)
6507 exec_control &= ~SECONDARY_EXEC_RDTSCP;
6508
6509 if (nested) {
6510 if (rdtscp_enabled)
6677f3da 6511 vmx->nested.msrs.secondary_ctls_high |=
80154d77
PB
6512 SECONDARY_EXEC_RDTSCP;
6513 else
6677f3da 6514 vmx->nested.msrs.secondary_ctls_high &=
80154d77
PB
6515 ~SECONDARY_EXEC_RDTSCP;
6516 }
6517 }
6518
6519 if (vmx_invpcid_supported()) {
6520 /* Exposing INVPCID only when PCID is exposed */
6521 bool invpcid_enabled =
6522 guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
6523 guest_cpuid_has(vcpu, X86_FEATURE_PCID);
6524
6525 if (!invpcid_enabled) {
6526 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
6527 guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
6528 }
6529
6530 if (nested) {
6531 if (invpcid_enabled)
6677f3da 6532 vmx->nested.msrs.secondary_ctls_high |=
80154d77
PB
6533 SECONDARY_EXEC_ENABLE_INVPCID;
6534 else
6677f3da 6535 vmx->nested.msrs.secondary_ctls_high &=
80154d77
PB
6536 ~SECONDARY_EXEC_ENABLE_INVPCID;
6537 }
6538 }
6539
45ec368c
JM
6540 if (vmx_rdrand_supported()) {
6541 bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
6542 if (rdrand_enabled)
736fdf72 6543 exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
45ec368c
JM
6544
6545 if (nested) {
6546 if (rdrand_enabled)
6677f3da 6547 vmx->nested.msrs.secondary_ctls_high |=
736fdf72 6548 SECONDARY_EXEC_RDRAND_EXITING;
45ec368c 6549 else
6677f3da 6550 vmx->nested.msrs.secondary_ctls_high &=
736fdf72 6551 ~SECONDARY_EXEC_RDRAND_EXITING;
45ec368c
JM
6552 }
6553 }
6554
75f4fc8d
JM
6555 if (vmx_rdseed_supported()) {
6556 bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
6557 if (rdseed_enabled)
736fdf72 6558 exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
75f4fc8d
JM
6559
6560 if (nested) {
6561 if (rdseed_enabled)
6677f3da 6562 vmx->nested.msrs.secondary_ctls_high |=
736fdf72 6563 SECONDARY_EXEC_RDSEED_EXITING;
75f4fc8d 6564 else
6677f3da 6565 vmx->nested.msrs.secondary_ctls_high &=
736fdf72 6566 ~SECONDARY_EXEC_RDSEED_EXITING;
75f4fc8d
JM
6567 }
6568 }
6569
80154d77 6570 vmx->secondary_exec_control = exec_control;
bf8179a0
NHE
6571}
6572
ce88decf
XG
6573static void ept_set_mmio_spte_mask(void)
6574{
6575 /*
6576 * EPT Misconfigurations can be generated if the value of bits 2:0
6577 * of an EPT paging-structure entry is 110b (write/execute).
ce88decf 6578 */
dcdca5fe
PF
6579 kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
6580 VMX_EPT_MISCONFIG_WX_VALUE);
ce88decf
XG
6581}
6582
f53cd63c 6583#define VMX_XSS_EXIT_BITMAP 0
6aa8b732
AK
6584/*
6585 * Sets up the vmcs for emulated real mode.
6586 */
12d79917 6587static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
6aa8b732 6588{
6aa8b732 6589 int i;
6aa8b732 6590
4607c2d7 6591 if (enable_shadow_vmcs) {
f4160e45
JM
6592 /*
6593 * At vCPU creation, "VMWRITE to any supported field
6594 * in the VMCS" is supported, so use the more
6595 * permissive vmx_vmread_bitmap to specify both read
6596 * and write permissions for the shadow VMCS.
6597 */
4607c2d7 6598 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
f4160e45 6599 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap));
4607c2d7 6600 }
25c5f225 6601 if (cpu_has_vmx_msr_bitmap())
904e14fb 6602 vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
25c5f225 6603
6aa8b732
AK
6604 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
6605
6aa8b732 6606 /* Control */
01e439be 6607 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
64672c95 6608 vmx->hv_deadline_tsc = -1;
6e5d865c 6609
bf8179a0 6610 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
6aa8b732 6611
dfa169bb 6612 if (cpu_has_secondary_exec_ctrls()) {
80154d77 6613 vmx_compute_secondary_exec_control(vmx);
bf8179a0 6614 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
80154d77 6615 vmx->secondary_exec_control);
dfa169bb 6616 }
f78e0e2e 6617
d62caabb 6618 if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
c7c9c56c
YZ
6619 vmcs_write64(EOI_EXIT_BITMAP0, 0);
6620 vmcs_write64(EOI_EXIT_BITMAP1, 0);
6621 vmcs_write64(EOI_EXIT_BITMAP2, 0);
6622 vmcs_write64(EOI_EXIT_BITMAP3, 0);
6623
6624 vmcs_write16(GUEST_INTR_STATUS, 0);
01e439be 6625
0bcf261c 6626 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
01e439be 6627 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
c7c9c56c
YZ
6628 }
6629
b31c114b 6630 if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
4b8d54f9 6631 vmcs_write32(PLE_GAP, ple_gap);
a7653ecd
RK
6632 vmx->ple_window = ple_window;
6633 vmx->ple_window_dirty = true;
4b8d54f9
ZE
6634 }
6635
c3707958
XG
6636 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
6637 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
6aa8b732
AK
6638 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
6639
9581d442
AK
6640 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
6641 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
a547c6db 6642 vmx_set_constant_host_state(vmx);
6aa8b732
AK
6643 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
6644 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
6aa8b732 6645
2a499e49
BD
6646 if (cpu_has_vmx_vmfunc())
6647 vmcs_write64(VM_FUNCTION_CONTROL, 0);
6648
2cc51560
ED
6649 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
6650 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
33966dd6 6651 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
2cc51560 6652 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
33966dd6 6653 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
6aa8b732 6654
74545705
RK
6655 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
6656 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
468d472f 6657
03916db9 6658 for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
6aa8b732
AK
6659 u32 index = vmx_msr_index[i];
6660 u32 data_low, data_high;
a2fa3e9f 6661 int j = vmx->nmsrs;
6aa8b732
AK
6662
6663 if (rdmsr_safe(index, &data_low, &data_high) < 0)
6664 continue;
432bd6cb
AK
6665 if (wrmsr_safe(index, data_low, data_high) < 0)
6666 continue;
26bb0981
AK
6667 vmx->guest_msrs[j].index = i;
6668 vmx->guest_msrs[j].data = 0;
d5696725 6669 vmx->guest_msrs[j].mask = -1ull;
a2fa3e9f 6670 ++vmx->nmsrs;
6aa8b732 6671 }
6aa8b732 6672
5b76a3cf 6673 vmx->arch_capabilities = kvm_get_arch_capabilities();
2961e876
GN
6674
6675 vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
6aa8b732
AK
6676
6677 /* 22.2.1, 20.8.1 */
2961e876 6678 vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);
1c3d14fe 6679
bd7e5b08
PB
6680 vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
6681 vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
6682
bf8179a0 6683 set_cr4_guest_host_mask(vmx);
e00c8cf2 6684
f53cd63c
WL
6685 if (vmx_xsaves_supported())
6686 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
6687
4e59516a 6688 if (enable_pml) {
4e59516a
PF
6689 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
6690 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
6691 }
0b665d30
SC
6692
6693 if (cpu_has_vmx_encls_vmexit())
6694 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
e00c8cf2
AK
6695}
6696
d28bc9dd 6697static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
e00c8cf2
AK
6698{
6699 struct vcpu_vmx *vmx = to_vmx(vcpu);
58cb628d 6700 struct msr_data apic_base_msr;
d28bc9dd 6701 u64 cr0;
e00c8cf2 6702
7ffd92c5 6703 vmx->rmode.vm86_active = 0;
d28b387f 6704 vmx->spec_ctrl = 0;
e00c8cf2 6705
518e7b94 6706 vcpu->arch.microcode_version = 0x100000000ULL;
ad312c7c 6707 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
d28bc9dd
NA
6708 kvm_set_cr8(vcpu, 0);
6709
6710 if (!init_event) {
6711 apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
6712 MSR_IA32_APICBASE_ENABLE;
6713 if (kvm_vcpu_is_reset_bsp(vcpu))
6714 apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
6715 apic_base_msr.host_initiated = true;
6716 kvm_set_apic_base(vcpu, &apic_base_msr);
6717 }
e00c8cf2 6718
2fb92db1
AK
6719 vmx_segment_cache_clear(vmx);
6720
5706be0d 6721 seg_setup(VCPU_SREG_CS);
66450a21 6722 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
f3531054 6723 vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
e00c8cf2
AK
6724
6725 seg_setup(VCPU_SREG_DS);
6726 seg_setup(VCPU_SREG_ES);
6727 seg_setup(VCPU_SREG_FS);
6728 seg_setup(VCPU_SREG_GS);
6729 seg_setup(VCPU_SREG_SS);
6730
6731 vmcs_write16(GUEST_TR_SELECTOR, 0);
6732 vmcs_writel(GUEST_TR_BASE, 0);
6733 vmcs_write32(GUEST_TR_LIMIT, 0xffff);
6734 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
6735
6736 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
6737 vmcs_writel(GUEST_LDTR_BASE, 0);
6738 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
6739 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
6740
d28bc9dd
NA
6741 if (!init_event) {
6742 vmcs_write32(GUEST_SYSENTER_CS, 0);
6743 vmcs_writel(GUEST_SYSENTER_ESP, 0);
6744 vmcs_writel(GUEST_SYSENTER_EIP, 0);
6745 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
6746 }
e00c8cf2 6747
c37c2873 6748 kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
66450a21 6749 kvm_rip_write(vcpu, 0xfff0);
e00c8cf2 6750
e00c8cf2
AK
6751 vmcs_writel(GUEST_GDTR_BASE, 0);
6752 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
6753
6754 vmcs_writel(GUEST_IDTR_BASE, 0);
6755 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
6756
443381a8 6757 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
e00c8cf2 6758 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
f3531054 6759 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
a554d207
WL
6760 if (kvm_mpx_supported())
6761 vmcs_write64(GUEST_BNDCFGS, 0);
e00c8cf2 6762
e00c8cf2
AK
6763 setup_msrs(vmx);
6764
6aa8b732
AK
6765 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
6766
d28bc9dd 6767 if (cpu_has_vmx_tpr_shadow() && !init_event) {
f78e0e2e 6768 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
35754c98 6769 if (cpu_need_tpr_shadow(vcpu))
f78e0e2e 6770 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
d28bc9dd 6771 __pa(vcpu->arch.apic->regs));
f78e0e2e
SY
6772 vmcs_write32(TPR_THRESHOLD, 0);
6773 }
6774
a73896cb 6775 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
6aa8b732 6776
2384d2b3
SY
6777 if (vmx->vpid != 0)
6778 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
6779
d28bc9dd 6780 cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
d28bc9dd 6781 vmx->vcpu.arch.cr0 = cr0;
f2463247 6782 vmx_set_cr0(vcpu, cr0); /* enter rmode */
d28bc9dd 6783 vmx_set_cr4(vcpu, 0);
5690891b 6784 vmx_set_efer(vcpu, 0);
bd7e5b08 6785
d28bc9dd 6786 update_exception_bitmap(vcpu);
6aa8b732 6787
dd5f5341 6788 vpid_sync_context(vmx->vpid);
caa057a2
WL
6789 if (init_event)
6790 vmx_clear_hlt(vcpu);
6aa8b732
AK
6791}
6792
b6f1250e
NHE
6793/*
6794 * In nested virtualization, check if L1 asked to exit on external interrupts.
6795 * For most existing hypervisors, this will always return true.
6796 */
6797static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
6798{
6799 return get_vmcs12(vcpu)->pin_based_vm_exec_control &
6800 PIN_BASED_EXT_INTR_MASK;
6801}
6802
77b0f5d6
BD
6803/*
6804 * In nested virtualization, check if L1 has set
6805 * VM_EXIT_ACK_INTR_ON_EXIT
6806 */
6807static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
6808{
6809 return get_vmcs12(vcpu)->vm_exit_controls &
6810 VM_EXIT_ACK_INTR_ON_EXIT;
6811}
6812
ea8ceb83
JK
6813static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
6814{
0c7f650e 6815 return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
ea8ceb83
JK
6816}
6817
c9a7953f 6818static void enable_irq_window(struct kvm_vcpu *vcpu)
3b86cd99 6819{
47c0152e
PB
6820 vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
6821 CPU_BASED_VIRTUAL_INTR_PENDING);
3b86cd99
JK
6822}
6823
c9a7953f 6824static void enable_nmi_window(struct kvm_vcpu *vcpu)
3b86cd99 6825{
d02fcf50 6826 if (!enable_vnmi ||
8a1b4392 6827 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
c9a7953f
JK
6828 enable_irq_window(vcpu);
6829 return;
6830 }
3b86cd99 6831
47c0152e
PB
6832 vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
6833 CPU_BASED_VIRTUAL_NMI_PENDING);
3b86cd99
JK
6834}
6835
66fd3f7f 6836static void vmx_inject_irq(struct kvm_vcpu *vcpu)
85f455f7 6837{
9c8cba37 6838 struct vcpu_vmx *vmx = to_vmx(vcpu);
66fd3f7f
GN
6839 uint32_t intr;
6840 int irq = vcpu->arch.interrupt.nr;
9c8cba37 6841
229456fc 6842 trace_kvm_inj_virq(irq);
2714d1d3 6843
fa89a817 6844 ++vcpu->stat.irq_injections;
7ffd92c5 6845 if (vmx->rmode.vm86_active) {
71f9833b
SH
6846 int inc_eip = 0;
6847 if (vcpu->arch.interrupt.soft)
6848 inc_eip = vcpu->arch.event_exit_inst_len;
6849 if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
a92601bb 6850 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
85f455f7
ED
6851 return;
6852 }
66fd3f7f
GN
6853 intr = irq | INTR_INFO_VALID_MASK;
6854 if (vcpu->arch.interrupt.soft) {
6855 intr |= INTR_TYPE_SOFT_INTR;
6856 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
6857 vmx->vcpu.arch.event_exit_inst_len);
6858 } else
6859 intr |= INTR_TYPE_EXT_INTR;
6860 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
caa057a2
WL
6861
6862 vmx_clear_hlt(vcpu);
85f455f7
ED
6863}
6864
f08864b4
SY
6865static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
6866{
66a5a347
JK
6867 struct vcpu_vmx *vmx = to_vmx(vcpu);
6868
d02fcf50 6869 if (!enable_vnmi) {
8a1b4392
PB
6870 /*
6871 * Tracking the NMI-blocked state in software is built upon
6872 * finding the next open IRQ window. This, in turn, depends on
6873 * well-behaving guests: They have to keep IRQs disabled at
6874 * least as long as the NMI handler runs. Otherwise we may
6875 * cause NMI nesting, maybe breaking the guest. But as this is
6876 * highly unlikely, we can live with the residual risk.
6877 */
6878 vmx->loaded_vmcs->soft_vnmi_blocked = 1;
6879 vmx->loaded_vmcs->vnmi_blocked_time = 0;
6880 }
6881
4c4a6f79
PB
6882 ++vcpu->stat.nmi_injections;
6883 vmx->loaded_vmcs->nmi_known_unmasked = false;
3b86cd99 6884
7ffd92c5 6885 if (vmx->rmode.vm86_active) {
71f9833b 6886 if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
a92601bb 6887 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
66a5a347
JK
6888 return;
6889 }
c5a6d5f7 6890
f08864b4
SY
6891 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
6892 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
caa057a2
WL
6893
6894 vmx_clear_hlt(vcpu);
f08864b4
SY
6895}
6896
3cfc3092
JK
6897static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
6898{
4c4a6f79
PB
6899 struct vcpu_vmx *vmx = to_vmx(vcpu);
6900 bool masked;
6901
d02fcf50 6902 if (!enable_vnmi)
8a1b4392 6903 return vmx->loaded_vmcs->soft_vnmi_blocked;
4c4a6f79 6904 if (vmx->loaded_vmcs->nmi_known_unmasked)
9d58b931 6905 return false;
4c4a6f79
PB
6906 masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
6907 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
6908 return masked;
3cfc3092
JK
6909}
6910
6911static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
6912{
6913 struct vcpu_vmx *vmx = to_vmx(vcpu);
6914
d02fcf50 6915 if (!enable_vnmi) {
8a1b4392
PB
6916 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
6917 vmx->loaded_vmcs->soft_vnmi_blocked = masked;
6918 vmx->loaded_vmcs->vnmi_blocked_time = 0;
6919 }
6920 } else {
6921 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
6922 if (masked)
6923 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6924 GUEST_INTR_STATE_NMI);
6925 else
6926 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
6927 GUEST_INTR_STATE_NMI);
6928 }
3cfc3092
JK
6929}
6930
2505dc9f
JK
6931static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
6932{
b6b8a145
JK
6933 if (to_vmx(vcpu)->nested.nested_run_pending)
6934 return 0;
ea8ceb83 6935
d02fcf50 6936 if (!enable_vnmi &&
8a1b4392
PB
6937 to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
6938 return 0;
6939
2505dc9f
JK
6940 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
6941 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
6942 | GUEST_INTR_STATE_NMI));
6943}
6944
78646121
GN
6945static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
6946{
b6b8a145
JK
6947 return (!to_vmx(vcpu)->nested.nested_run_pending &&
6948 vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
c4282df9
GN
6949 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
6950 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
78646121
GN
6951}
6952
cbc94022
IE
6953static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
6954{
6955 int ret;
cbc94022 6956
f7eaeb0a
SC
6957 if (enable_unrestricted_guest)
6958 return 0;
6959
1d8007bd
PB
6960 ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
6961 PAGE_SIZE * 3);
cbc94022
IE
6962 if (ret)
6963 return ret;
40bbb9d0 6964 to_kvm_vmx(kvm)->tss_addr = addr;
1f755a82 6965 return init_rmode_tss(kvm);
cbc94022
IE
6966}
6967
2ac52ab8
SC
6968static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
6969{
40bbb9d0 6970 to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
2ac52ab8
SC
6971 return 0;
6972}
6973
0ca1b4f4 6974static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
6aa8b732 6975{
77ab6db0 6976 switch (vec) {
77ab6db0 6977 case BP_VECTOR:
c573cd22
JK
6978 /*
6979 * Update instruction length as we may reinject the exception
6980 * from user space while in guest debugging mode.
6981 */
6982 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
6983 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
d0bfb940 6984 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
0ca1b4f4
GN
6985 return false;
6986 /* fall through */
6987 case DB_VECTOR:
6988 if (vcpu->guest_debug &
6989 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
6990 return false;
d0bfb940
JK
6991 /* fall through */
6992 case DE_VECTOR:
77ab6db0
JK
6993 case OF_VECTOR:
6994 case BR_VECTOR:
6995 case UD_VECTOR:
6996 case DF_VECTOR:
6997 case SS_VECTOR:
6998 case GP_VECTOR:
6999 case MF_VECTOR:
0ca1b4f4
GN
7000 return true;
7001 break;
77ab6db0 7002 }
0ca1b4f4
GN
7003 return false;
7004}
7005
7006static int handle_rmode_exception(struct kvm_vcpu *vcpu,
7007 int vec, u32 err_code)
7008{
7009 /*
7010 * Instruction with address size override prefix opcode 0x67
7011 * Cause the #SS fault with 0 error code in VM86 mode.
7012 */
7013 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
0ce97a2b 7014 if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) {
0ca1b4f4
GN
7015 if (vcpu->arch.halt_request) {
7016 vcpu->arch.halt_request = 0;
5cb56059 7017 return kvm_vcpu_halt(vcpu);
0ca1b4f4
GN
7018 }
7019 return 1;
7020 }
7021 return 0;
7022 }
7023
7024 /*
7025 * Forward all other exceptions that are valid in real mode.
7026 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
7027 * the required debugging infrastructure rework.
7028 */
7029 kvm_queue_exception(vcpu, vec);
7030 return 1;
6aa8b732
AK
7031}
7032
a0861c02
AK
7033/*
7034 * Trigger machine check on the host. We assume all the MSRs are already set up
7035 * by the CPU and that we still run on the same CPU as the MCE occurred on.
7036 * We pass a fake environment to the machine check handler because we want
7037 * the guest to be always treated like user space, no matter what context
7038 * it used internally.
7039 */
7040static void kvm_machine_check(void)
7041{
7042#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
7043 struct pt_regs regs = {
7044 .cs = 3, /* Fake ring 3 no matter what the guest ran on */
7045 .flags = X86_EFLAGS_IF,
7046 };
7047
7048 do_machine_check(&regs, 0);
7049#endif
7050}
7051
851ba692 7052static int handle_machine_check(struct kvm_vcpu *vcpu)
a0861c02
AK
7053{
7054 /* already handled by vcpu_run */
7055 return 1;
7056}
7057
851ba692 7058static int handle_exception(struct kvm_vcpu *vcpu)
6aa8b732 7059{
1155f76a 7060 struct vcpu_vmx *vmx = to_vmx(vcpu);
851ba692 7061 struct kvm_run *kvm_run = vcpu->run;
d0bfb940 7062 u32 intr_info, ex_no, error_code;
42dbaa5a 7063 unsigned long cr2, rip, dr6;
6aa8b732
AK
7064 u32 vect_info;
7065 enum emulation_result er;
7066
1155f76a 7067 vect_info = vmx->idt_vectoring_info;
88786475 7068 intr_info = vmx->exit_intr_info;
6aa8b732 7069
a0861c02 7070 if (is_machine_check(intr_info))
851ba692 7071 return handle_machine_check(vcpu);
a0861c02 7072
ef85b673 7073 if (is_nmi(intr_info))
1b6269db 7074 return 1; /* already handled by vmx_vcpu_run() */
2ab455cc 7075
082d06ed
WL
7076 if (is_invalid_opcode(intr_info))
7077 return handle_ud(vcpu);
7aa81cc0 7078
6aa8b732 7079 error_code = 0;
2e11384c 7080 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
6aa8b732 7081 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
bf4ca23e 7082
9e869480
LA
7083 if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
7084 WARN_ON_ONCE(!enable_vmware_backdoor);
0ce97a2b 7085 er = kvm_emulate_instruction(vcpu,
9e869480
LA
7086 EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
7087 if (er == EMULATE_USER_EXIT)
7088 return 0;
7089 else if (er != EMULATE_DONE)
7090 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
7091 return 1;
7092 }
7093
bf4ca23e
XG
7094 /*
7095 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing
7096 * MMIO, it is better to report an internal error.
7097 * See the comments in vmx_handle_exit.
7098 */
7099 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
7100 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
7101 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
7102 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
80f0e95d 7103 vcpu->run->internal.ndata = 3;
bf4ca23e
XG
7104 vcpu->run->internal.data[0] = vect_info;
7105 vcpu->run->internal.data[1] = intr_info;
80f0e95d 7106 vcpu->run->internal.data[2] = error_code;
bf4ca23e
XG
7107 return 0;
7108 }
7109
6aa8b732
AK
7110 if (is_page_fault(intr_info)) {
7111 cr2 = vmcs_readl(EXIT_QUALIFICATION);
1261bfa3
WL
7112 /* EPT won't cause page fault directly */
7113 WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
d0006530 7114 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
6aa8b732
AK
7115 }
7116
d0bfb940 7117 ex_no = intr_info & INTR_INFO_VECTOR_MASK;
0ca1b4f4
GN
7118
7119 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
7120 return handle_rmode_exception(vcpu, ex_no, error_code);
7121
42dbaa5a 7122 switch (ex_no) {
54a20552
EN
7123 case AC_VECTOR:
7124 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
7125 return 1;
42dbaa5a
JK
7126 case DB_VECTOR:
7127 dr6 = vmcs_readl(EXIT_QUALIFICATION);
7128 if (!(vcpu->guest_debug &
7129 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
8246bf52 7130 vcpu->arch.dr6 &= ~15;
6f43ed01 7131 vcpu->arch.dr6 |= dr6 | DR6_RTM;
32d43cd3 7132 if (is_icebp(intr_info))
fd2a445a
HD
7133 skip_emulated_instruction(vcpu);
7134
42dbaa5a
JK
7135 kvm_queue_exception(vcpu, DB_VECTOR);
7136 return 1;
7137 }
7138 kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
7139 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
7140 /* fall through */
7141 case BP_VECTOR:
c573cd22
JK
7142 /*
7143 * Update instruction length as we may reinject #BP from
7144 * user space while in guest debugging mode. Reading it for
7145 * #DB as well causes no harm, it is not used in that case.
7146 */
7147 vmx->vcpu.arch.event_exit_inst_len =
7148 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
6aa8b732 7149 kvm_run->exit_reason = KVM_EXIT_DEBUG;
0a434bb2 7150 rip = kvm_rip_read(vcpu);
d0bfb940
JK
7151 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
7152 kvm_run->debug.arch.exception = ex_no;
42dbaa5a
JK
7153 break;
7154 default:
d0bfb940
JK
7155 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
7156 kvm_run->ex.exception = ex_no;
7157 kvm_run->ex.error_code = error_code;
42dbaa5a 7158 break;
6aa8b732 7159 }
6aa8b732
AK
7160 return 0;
7161}
7162
851ba692 7163static int handle_external_interrupt(struct kvm_vcpu *vcpu)
6aa8b732 7164{
1165f5fe 7165 ++vcpu->stat.irq_exits;
6aa8b732
AK
7166 return 1;
7167}
7168
851ba692 7169static int handle_triple_fault(struct kvm_vcpu *vcpu)
988ad74f 7170{
851ba692 7171 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
bbeac283 7172 vcpu->mmio_needed = 0;
988ad74f
AK
7173 return 0;
7174}
6aa8b732 7175
851ba692 7176static int handle_io(struct kvm_vcpu *vcpu)
6aa8b732 7177{
bfdaab09 7178 unsigned long exit_qualification;
dca7f128 7179 int size, in, string;
039576c0 7180 unsigned port;
6aa8b732 7181
bfdaab09 7182 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
039576c0 7183 string = (exit_qualification & 16) != 0;
e70669ab 7184
cf8f70bf 7185 ++vcpu->stat.io_exits;
e70669ab 7186
432baf60 7187 if (string)
0ce97a2b 7188 return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
e70669ab 7189
cf8f70bf
GN
7190 port = exit_qualification >> 16;
7191 size = (exit_qualification & 7) + 1;
432baf60 7192 in = (exit_qualification & 8) != 0;
cf8f70bf 7193
dca7f128 7194 return kvm_fast_pio(vcpu, size, port, in);
6aa8b732
AK
7195}
7196
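/*
 * [Editor's sketch] Minimal standalone illustration of the I/O exit
 * qualification decoding performed by handle_io() above (size in bits 2:0,
 * direction in bit 3, string flag in bit 4, port in bits 31:16).  The sample
 * value is hypothetical and the bit layout is the SDM layout assumed here;
 * this is not part of vmx.c.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t exit_qualification = 0x00710048; /* hypothetical sample */
	int size = (exit_qualification & 7) + 1;     /* bits 2:0 hold size-1 */
	int in = (exit_qualification & 8) != 0;      /* bit 3: 1 = IN, 0 = OUT */
	int string = (exit_qualification & 16) != 0; /* bit 4: INS/OUTS */
	unsigned port = exit_qualification >> 16;    /* bits 31:16 */

	printf("port=0x%x size=%d in=%d string=%d\n", port, size, in, string);
	return 0;
}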
102d8325
IM
7197static void
7198vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
7199{
7200 /*
7201 * Patch in the VMCALL instruction:
7202 */
7203 hypercall[0] = 0x0f;
7204 hypercall[1] = 0x01;
7205 hypercall[2] = 0xc1;
102d8325
IM
7206}
7207
0fa06071 7208/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
eeadf9e7
NHE
7209static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
7210{
eeadf9e7 7211 if (is_guest_mode(vcpu)) {
1a0d74e6
JK
7212 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7213 unsigned long orig_val = val;
7214
eeadf9e7
NHE
7215 /*
7216 * We get here when L2 changed cr0 in a way that did not change
7217 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
1a0d74e6
JK
7218 * but did change L0 shadowed bits. So we first calculate the
7219 * effective cr0 value that L1 would like to write into the
7220 * hardware. It consists of the L2-owned bits from the new
7221 * value combined with the L1-owned bits from L1's guest_cr0.
eeadf9e7 7222 */
1a0d74e6
JK
7223 val = (val & ~vmcs12->cr0_guest_host_mask) |
7224 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
7225
3899152c 7226 if (!nested_guest_cr0_valid(vcpu, val))
eeadf9e7 7227 return 1;
1a0d74e6
JK
7228
7229 if (kvm_set_cr0(vcpu, val))
7230 return 1;
7231 vmcs_writel(CR0_READ_SHADOW, orig_val);
eeadf9e7 7232 return 0;
1a0d74e6
JK
7233 } else {
7234 if (to_vmx(vcpu)->nested.vmxon &&
3899152c 7235 !nested_host_cr0_valid(vcpu, val))
1a0d74e6 7236 return 1;
3899152c 7237
eeadf9e7 7238 return kvm_set_cr0(vcpu, val);
1a0d74e6 7239 }
eeadf9e7
NHE
7240}
7241
7242static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
7243{
7244 if (is_guest_mode(vcpu)) {
1a0d74e6
JK
7245 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7246 unsigned long orig_val = val;
7247
7248 /* analogously to handle_set_cr0 */
7249 val = (val & ~vmcs12->cr4_guest_host_mask) |
7250 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
7251 if (kvm_set_cr4(vcpu, val))
eeadf9e7 7252 return 1;
1a0d74e6 7253 vmcs_writel(CR4_READ_SHADOW, orig_val);
eeadf9e7
NHE
7254 return 0;
7255 } else
7256 return kvm_set_cr4(vcpu, val);
7257}
7258
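/*
 * [Editor's sketch] Worked example of the guest/host-mask merge used by
 * handle_set_cr0()/handle_set_cr4() above: bits L1 owns (set in the mask)
 * come from L1's shadowed guest_cr value, bits L2 owns come from the value
 * L2 just wrote.  The values below are hypothetical; not part of vmx.c.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t merge_cr(uint64_t l2_val, uint64_t l1_guest_cr, uint64_t mask)
{
	return (l2_val & ~mask) | (l1_guest_cr & mask);
}

int main(void)
{
	/* L1 shadows bit 0, L2 writes bit 1: the effective value keeps both. */
	printf("effective cr = %#llx\n",
	       (unsigned long long)merge_cr(0x2, 0x1, 0x1));
	return 0;
}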
0367f205
PB
7259static int handle_desc(struct kvm_vcpu *vcpu)
7260{
7261 WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
0ce97a2b 7262 return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
0367f205
PB
7263}
7264
851ba692 7265static int handle_cr(struct kvm_vcpu *vcpu)
6aa8b732 7266{
229456fc 7267 unsigned long exit_qualification, val;
6aa8b732
AK
7268 int cr;
7269 int reg;
49a9b07e 7270 int err;
6affcbed 7271 int ret;
6aa8b732 7272
bfdaab09 7273 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6aa8b732
AK
7274 cr = exit_qualification & 15;
7275 reg = (exit_qualification >> 8) & 15;
7276 switch ((exit_qualification >> 4) & 3) {
7277 case 0: /* mov to cr */
1e32c079 7278 val = kvm_register_readl(vcpu, reg);
229456fc 7279 trace_kvm_cr_write(cr, val);
6aa8b732
AK
7280 switch (cr) {
7281 case 0:
eeadf9e7 7282 err = handle_set_cr0(vcpu, val);
6affcbed 7283 return kvm_complete_insn_gp(vcpu, err);
6aa8b732 7284 case 3:
e1de91cc 7285 WARN_ON_ONCE(enable_unrestricted_guest);
2390218b 7286 err = kvm_set_cr3(vcpu, val);
6affcbed 7287 return kvm_complete_insn_gp(vcpu, err);
6aa8b732 7288 case 4:
eeadf9e7 7289 err = handle_set_cr4(vcpu, val);
6affcbed 7290 return kvm_complete_insn_gp(vcpu, err);
0a5fff19
GN
7291 case 8: {
7292 u8 cr8_prev = kvm_get_cr8(vcpu);
1e32c079 7293 u8 cr8 = (u8)val;
eea1cff9 7294 err = kvm_set_cr8(vcpu, cr8);
6affcbed 7295 ret = kvm_complete_insn_gp(vcpu, err);
35754c98 7296 if (lapic_in_kernel(vcpu))
6affcbed 7297 return ret;
0a5fff19 7298 if (cr8_prev <= cr8)
6affcbed
KH
7299 return ret;
7300 /*
7301 * TODO: we might be squashing a
7302 * KVM_GUESTDBG_SINGLESTEP-triggered
7303 * KVM_EXIT_DEBUG here.
7304 */
851ba692 7305 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
0a5fff19
GN
7306 return 0;
7307 }
4b8073e4 7308 }
6aa8b732 7309 break;
25c4c276 7310 case 2: /* clts */
bd7e5b08
PB
7311 WARN_ONCE(1, "Guest should always own CR0.TS");
7312 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
4d4ec087 7313 trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
6affcbed 7314 return kvm_skip_emulated_instruction(vcpu);
6aa8b732
AK
7315 case 1: /*mov from cr*/
7316 switch (cr) {
7317 case 3:
e1de91cc 7318 WARN_ON_ONCE(enable_unrestricted_guest);
9f8fe504
AK
7319 val = kvm_read_cr3(vcpu);
7320 kvm_register_write(vcpu, reg, val);
7321 trace_kvm_cr_read(cr, val);
6affcbed 7322 return kvm_skip_emulated_instruction(vcpu);
6aa8b732 7323 case 8:
229456fc
MT
7324 val = kvm_get_cr8(vcpu);
7325 kvm_register_write(vcpu, reg, val);
7326 trace_kvm_cr_read(cr, val);
6affcbed 7327 return kvm_skip_emulated_instruction(vcpu);
6aa8b732
AK
7328 }
7329 break;
7330 case 3: /* lmsw */
a1f83a74 7331 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
4d4ec087 7332 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
a1f83a74 7333 kvm_lmsw(vcpu, val);
6aa8b732 7334
6affcbed 7335 return kvm_skip_emulated_instruction(vcpu);
6aa8b732
AK
7336 default:
7337 break;
7338 }
851ba692 7339 vcpu->run->exit_reason = 0;
a737f256 7340 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
6aa8b732
AK
7341 (int)(exit_qualification >> 4) & 3, cr);
7342 return 0;
7343}
7344
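/*
 * [Editor's sketch] Standalone decoding of a CR-access exit qualification,
 * mirroring handle_cr() above: bits 3:0 = CR number, bits 5:4 = access type
 * (0 mov-to-cr, 1 mov-from-cr, 2 clts, 3 lmsw), bits 11:8 = GPR number.
 * The sample value is hypothetical; not part of vmx.c.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t exit_qualification = 0x314; /* hypothetical: mov from CR4 into GPR 3 */
	int cr = exit_qualification & 15;
	int type = (exit_qualification >> 4) & 3;
	int reg = (exit_qualification >> 8) & 15;

	printf("cr%d, access type %d, register %d\n", cr, type, reg);
	return 0;
}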
851ba692 7345static int handle_dr(struct kvm_vcpu *vcpu)
6aa8b732 7346{
bfdaab09 7347 unsigned long exit_qualification;
16f8a6f9
NA
7348 int dr, dr7, reg;
7349
7350 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7351 dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
7352
7353 /* First, if DR does not exist, trigger UD */
7354 if (!kvm_require_dr(vcpu, dr))
7355 return 1;
6aa8b732 7356
f2483415 7357 /* Do not handle if the CPL > 0, will trigger GP on re-entry */
0a79b009
AK
7358 if (!kvm_require_cpl(vcpu, 0))
7359 return 1;
16f8a6f9
NA
7360 dr7 = vmcs_readl(GUEST_DR7);
7361 if (dr7 & DR7_GD) {
42dbaa5a
JK
7362 /*
7363 * As the vm-exit takes precedence over the debug trap, we
7364 * need to emulate the latter, either for the host or the
7365 * guest debugging itself.
7366 */
7367 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
851ba692 7368 vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
16f8a6f9 7369 vcpu->run->debug.arch.dr7 = dr7;
82b32774 7370 vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
851ba692
AK
7371 vcpu->run->debug.arch.exception = DB_VECTOR;
7372 vcpu->run->exit_reason = KVM_EXIT_DEBUG;
42dbaa5a
JK
7373 return 0;
7374 } else {
7305eb5d 7375 vcpu->arch.dr6 &= ~15;
6f43ed01 7376 vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
42dbaa5a
JK
7377 kvm_queue_exception(vcpu, DB_VECTOR);
7378 return 1;
7379 }
7380 }
7381
81908bf4 7382 if (vcpu->guest_debug == 0) {
8f22372f
PB
7383 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
7384 CPU_BASED_MOV_DR_EXITING);
81908bf4
PB
7385
7386 /*
7387 * No more DR vmexits; force a reload of the debug registers
7388 * and reenter on this instruction. The next vmexit will
7389 * retrieve the full state of the debug registers.
7390 */
7391 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
7392 return 1;
7393 }
7394
42dbaa5a
JK
7395 reg = DEBUG_REG_ACCESS_REG(exit_qualification);
7396 if (exit_qualification & TYPE_MOV_FROM_DR) {
020df079 7397 unsigned long val;
4c4d563b
JK
7398
7399 if (kvm_get_dr(vcpu, dr, &val))
7400 return 1;
7401 kvm_register_write(vcpu, reg, val);
020df079 7402 } else
5777392e 7403 if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
4c4d563b
JK
7404 return 1;
7405
6affcbed 7406 return kvm_skip_emulated_instruction(vcpu);
6aa8b732
AK
7407}
7408
73aaf249
JK
7409static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
7410{
7411 return vcpu->arch.dr6;
7412}
7413
7414static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
7415{
7416}
7417
81908bf4
PB
7418static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
7419{
81908bf4
PB
7420 get_debugreg(vcpu->arch.db[0], 0);
7421 get_debugreg(vcpu->arch.db[1], 1);
7422 get_debugreg(vcpu->arch.db[2], 2);
7423 get_debugreg(vcpu->arch.db[3], 3);
7424 get_debugreg(vcpu->arch.dr6, 6);
7425 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
7426
7427 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
8f22372f 7428 vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);
81908bf4
PB
7429}
7430
020df079
GN
7431static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
7432{
7433 vmcs_writel(GUEST_DR7, val);
7434}
7435
851ba692 7436static int handle_cpuid(struct kvm_vcpu *vcpu)
6aa8b732 7437{
6a908b62 7438 return kvm_emulate_cpuid(vcpu);
6aa8b732
AK
7439}
7440
851ba692 7441static int handle_rdmsr(struct kvm_vcpu *vcpu)
6aa8b732 7442{
ad312c7c 7443 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
609e36d3 7444 struct msr_data msr_info;
6aa8b732 7445
609e36d3
PB
7446 msr_info.index = ecx;
7447 msr_info.host_initiated = false;
7448 if (vmx_get_msr(vcpu, &msr_info)) {
59200273 7449 trace_kvm_msr_read_ex(ecx);
c1a5d4f9 7450 kvm_inject_gp(vcpu, 0);
6aa8b732
AK
7451 return 1;
7452 }
7453
609e36d3 7454 trace_kvm_msr_read(ecx, msr_info.data);
2714d1d3 7455
6aa8b732 7456 /* FIXME: handling of bits 32:63 of rax, rdx */
609e36d3
PB
7457 vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u;
7458 vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u;
6affcbed 7459 return kvm_skip_emulated_instruction(vcpu);
6aa8b732
AK
7460}
7461
851ba692 7462static int handle_wrmsr(struct kvm_vcpu *vcpu)
6aa8b732 7463{
8fe8ab46 7464 struct msr_data msr;
ad312c7c
ZX
7465 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
7466 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
7467 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
6aa8b732 7468
8fe8ab46
WA
7469 msr.data = data;
7470 msr.index = ecx;
7471 msr.host_initiated = false;
854e8bb1 7472 if (kvm_set_msr(vcpu, &msr) != 0) {
59200273 7473 trace_kvm_msr_write_ex(ecx, data);
c1a5d4f9 7474 kvm_inject_gp(vcpu, 0);
6aa8b732
AK
7475 return 1;
7476 }
7477
59200273 7478 trace_kvm_msr_write(ecx, data);
6affcbed 7479 return kvm_skip_emulated_instruction(vcpu);
6aa8b732
AK
7480}
7481
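/*
 * [Editor's sketch] How handle_rdmsr()/handle_wrmsr() above split and
 * recombine a 64-bit MSR value across EAX (low half) and EDX (high half).
 * The value is hypothetical; not part of vmx.c.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t msr_val = 0x123456789abcdef0ULL;              /* hypothetical */
	uint32_t eax = msr_val & -1u;                          /* low 32 bits */
	uint32_t edx = (msr_val >> 32) & -1u;                  /* high 32 bits */
	uint64_t back = (uint64_t)eax | ((uint64_t)edx << 32); /* WRMSR path */

	printf("eax=%#x edx=%#x recombined=%#llx\n",
	       eax, edx, (unsigned long long)back);
	return 0;
}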
851ba692 7482static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
6e5d865c 7483{
eb90f341 7484 kvm_apic_update_ppr(vcpu);
6e5d865c
YS
7485 return 1;
7486}
7487
851ba692 7488static int handle_interrupt_window(struct kvm_vcpu *vcpu)
6aa8b732 7489{
47c0152e
PB
7490 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
7491 CPU_BASED_VIRTUAL_INTR_PENDING);
2714d1d3 7492
3842d135
AK
7493 kvm_make_request(KVM_REQ_EVENT, vcpu);
7494
a26bf12a 7495 ++vcpu->stat.irq_window_exits;
6aa8b732
AK
7496 return 1;
7497}
7498
851ba692 7499static int handle_halt(struct kvm_vcpu *vcpu)
6aa8b732 7500{
d3bef15f 7501 return kvm_emulate_halt(vcpu);
6aa8b732
AK
7502}
7503
851ba692 7504static int handle_vmcall(struct kvm_vcpu *vcpu)
c21415e8 7505{
0d9c055e 7506 return kvm_emulate_hypercall(vcpu);
c21415e8
IM
7507}
7508
ec25d5e6
GN
7509static int handle_invd(struct kvm_vcpu *vcpu)
7510{
0ce97a2b 7511 return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
ec25d5e6
GN
7512}
7513
851ba692 7514static int handle_invlpg(struct kvm_vcpu *vcpu)
a7052897 7515{
f9c617f6 7516 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
a7052897
MT
7517
7518 kvm_mmu_invlpg(vcpu, exit_qualification);
6affcbed 7519 return kvm_skip_emulated_instruction(vcpu);
a7052897
MT
7520}
7521
fee84b07
AK
7522static int handle_rdpmc(struct kvm_vcpu *vcpu)
7523{
7524 int err;
7525
7526 err = kvm_rdpmc(vcpu);
6affcbed 7527 return kvm_complete_insn_gp(vcpu, err);
fee84b07
AK
7528}
7529
851ba692 7530static int handle_wbinvd(struct kvm_vcpu *vcpu)
e5edaa01 7531{
6affcbed 7532 return kvm_emulate_wbinvd(vcpu);
e5edaa01
ED
7533}
7534
2acf923e
DC
7535static int handle_xsetbv(struct kvm_vcpu *vcpu)
7536{
7537 u64 new_bv = kvm_read_edx_eax(vcpu);
7538 u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
7539
7540 if (kvm_set_xcr(vcpu, index, new_bv) == 0)
6affcbed 7541 return kvm_skip_emulated_instruction(vcpu);
2acf923e
DC
7542 return 1;
7543}
7544
f53cd63c
WL
7545static int handle_xsaves(struct kvm_vcpu *vcpu)
7546{
6affcbed 7547 kvm_skip_emulated_instruction(vcpu);
f53cd63c
WL
7548 WARN(1, "this should never happen\n");
7549 return 1;
7550}
7551
7552static int handle_xrstors(struct kvm_vcpu *vcpu)
7553{
6affcbed 7554 kvm_skip_emulated_instruction(vcpu);
f53cd63c
WL
7555 WARN(1, "this should never happen\n");
7556 return 1;
7557}
7558
851ba692 7559static int handle_apic_access(struct kvm_vcpu *vcpu)
f78e0e2e 7560{
58fbbf26
KT
7561 if (likely(fasteoi)) {
7562 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7563 int access_type, offset;
7564
7565 access_type = exit_qualification & APIC_ACCESS_TYPE;
7566 offset = exit_qualification & APIC_ACCESS_OFFSET;
7567 /*
 7568 * A sane guest uses MOV to write EOI, and the written value
 7569 * is ignored. Short-circuit here to avoid heavy
 7570 * instruction emulation.
7571 */
7572 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
7573 (offset == APIC_EOI)) {
7574 kvm_lapic_set_eoi(vcpu);
6affcbed 7575 return kvm_skip_emulated_instruction(vcpu);
58fbbf26
KT
7576 }
7577 }
0ce97a2b 7578 return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
f78e0e2e
SY
7579}
7580
c7c9c56c
YZ
7581static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
7582{
7583 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7584 int vector = exit_qualification & 0xff;
7585
7586 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
7587 kvm_apic_set_eoi_accelerated(vcpu, vector);
7588 return 1;
7589}
7590
83d4c286
YZ
7591static int handle_apic_write(struct kvm_vcpu *vcpu)
7592{
7593 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7594 u32 offset = exit_qualification & 0xfff;
7595
7596 /* APIC-write VM exit is trap-like and thus no need to adjust IP */
7597 kvm_apic_write_nodecode(vcpu, offset);
7598 return 1;
7599}
7600
851ba692 7601static int handle_task_switch(struct kvm_vcpu *vcpu)
37817f29 7602{
60637aac 7603 struct vcpu_vmx *vmx = to_vmx(vcpu);
37817f29 7604 unsigned long exit_qualification;
e269fb21
JK
7605 bool has_error_code = false;
7606 u32 error_code = 0;
37817f29 7607 u16 tss_selector;
7f3d35fd 7608 int reason, type, idt_v, idt_index;
64a7ec06
GN
7609
7610 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
7f3d35fd 7611 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
64a7ec06 7612 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
37817f29
IE
7613
7614 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7615
7616 reason = (u32)exit_qualification >> 30;
64a7ec06
GN
7617 if (reason == TASK_SWITCH_GATE && idt_v) {
7618 switch (type) {
7619 case INTR_TYPE_NMI_INTR:
7620 vcpu->arch.nmi_injected = false;
654f06fc 7621 vmx_set_nmi_mask(vcpu, true);
64a7ec06
GN
7622 break;
7623 case INTR_TYPE_EXT_INTR:
66fd3f7f 7624 case INTR_TYPE_SOFT_INTR:
64a7ec06
GN
7625 kvm_clear_interrupt_queue(vcpu);
7626 break;
7627 case INTR_TYPE_HARD_EXCEPTION:
e269fb21
JK
7628 if (vmx->idt_vectoring_info &
7629 VECTORING_INFO_DELIVER_CODE_MASK) {
7630 has_error_code = true;
7631 error_code =
7632 vmcs_read32(IDT_VECTORING_ERROR_CODE);
7633 }
7634 /* fall through */
64a7ec06
GN
7635 case INTR_TYPE_SOFT_EXCEPTION:
7636 kvm_clear_exception_queue(vcpu);
7637 break;
7638 default:
7639 break;
7640 }
60637aac 7641 }
37817f29
IE
7642 tss_selector = exit_qualification;
7643
64a7ec06
GN
7644 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
7645 type != INTR_TYPE_EXT_INTR &&
7646 type != INTR_TYPE_NMI_INTR))
7647 skip_emulated_instruction(vcpu);
7648
7f3d35fd
KW
7649 if (kvm_task_switch(vcpu, tss_selector,
7650 type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
7651 has_error_code, error_code) == EMULATE_FAIL) {
acb54517
GN
7652 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
7653 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
7654 vcpu->run->internal.ndata = 0;
42dbaa5a 7655 return 0;
acb54517 7656 }
42dbaa5a 7657
42dbaa5a
JK
7658 /*
7659 * TODO: What about debug traps on tss switch?
7660 * Are we supposed to inject them and update dr6?
7661 */
7662
7663 return 1;
37817f29
IE
7664}
7665
851ba692 7666static int handle_ept_violation(struct kvm_vcpu *vcpu)
1439442c 7667{
f9c617f6 7668 unsigned long exit_qualification;
1439442c 7669 gpa_t gpa;
eebed243 7670 u64 error_code;
1439442c 7671
f9c617f6 7672 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
1439442c 7673
0be9c7a8
GN
7674 /*
 7675 * If the EPT violation happened while executing IRET from an NMI,
 7676 * the "blocked by NMI" bit has to be set before the next VM entry.
7677 * There are errata that may cause this bit to not be set:
7678 * AAK134, BY25.
7679 */
bcd1c294 7680 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
d02fcf50 7681 enable_vnmi &&
bcd1c294 7682 (exit_qualification & INTR_INFO_UNBLOCK_NMI))
0be9c7a8
GN
7683 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
7684
1439442c 7685 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
229456fc 7686 trace_kvm_page_fault(gpa, exit_qualification);
4f5982a5 7687
27959a44 7688 /* Is it a read fault? */
ab22a473 7689 error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
27959a44
JS
7690 ? PFERR_USER_MASK : 0;
7691 /* Is it a write fault? */
ab22a473 7692 error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
27959a44
JS
7693 ? PFERR_WRITE_MASK : 0;
7694 /* Is it a fetch fault? */
ab22a473 7695 error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
27959a44
JS
7696 ? PFERR_FETCH_MASK : 0;
7697 /* ept page table entry is present? */
7698 error_code |= (exit_qualification &
7699 (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
7700 EPT_VIOLATION_EXECUTABLE))
7701 ? PFERR_PRESENT_MASK : 0;
4f5982a5 7702
eebed243
PB
7703 error_code |= (exit_qualification & 0x100) != 0 ?
7704 PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
25d92081 7705
25d92081 7706 vcpu->arch.exit_qualification = exit_qualification;
4f5982a5 7707 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
1439442c
SY
7708}
7709
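/*
 * [Editor's sketch] How handle_ept_violation() above turns exit-qualification
 * access bits into page-fault-style flags.  Bit layout assumed from the SDM:
 * bit 0 = read access, bit 1 = write access, bit 2 = instruction fetch,
 * bits 5:3 = whether the GPA was readable/writable/executable.  The sample
 * value is hypothetical; not part of vmx.c.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t eq = 0x0a; /* hypothetical: write access to a readable, non-writable GPA */

	printf("read=%d write=%d fetch=%d present=%d\n",
	       (int)(eq & 1),            /* maps to PFERR_USER_MASK above */
	       (int)((eq >> 1) & 1),     /* maps to PFERR_WRITE_MASK */
	       (int)((eq >> 2) & 1),     /* maps to PFERR_FETCH_MASK */
	       (int)((eq & 0x38) != 0)); /* any of R/W/X -> PFERR_PRESENT_MASK */
	return 0;
}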
851ba692 7710static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
68f89400 7711{
68f89400
MT
7712 gpa_t gpa;
7713
9034e6e8
PB
7714 /*
7715 * A nested guest cannot optimize MMIO vmexits, because we have an
7716 * nGPA here instead of the required GPA.
7717 */
68f89400 7718 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
9034e6e8
PB
7719 if (!is_guest_mode(vcpu) &&
7720 !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
931c33b1 7721 trace_kvm_fast_mmio(gpa);
d391f120
VK
7722 /*
 7723 * Doing kvm_skip_emulated_instruction() depends on undefined
 7724 * behavior: Intel's manual doesn't mandate that
 7725 * VM_EXIT_INSTRUCTION_LEN be set in the VMCS when an EPT misconfig
 7726 * occurs. While on real hardware it was observed to be set,
 7727 * other hypervisors (namely Hyper-V) don't set it, and we would
 7728 * end up advancing RIP by some random value. Disable fast MMIO when
 7729 * running nested and keep it for real hardware in the hope that
 7730 * VM_EXIT_INSTRUCTION_LEN will always be set correctly.
7731 */
7732 if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
7733 return kvm_skip_emulated_instruction(vcpu);
7734 else
0ce97a2b 7735 return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) ==
c4409905 7736 EMULATE_DONE;
68c3b4d1 7737 }
68f89400 7738
c75d0edc 7739 return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
68f89400
MT
7740}
7741
851ba692 7742static int handle_nmi_window(struct kvm_vcpu *vcpu)
f08864b4 7743{
d02fcf50 7744 WARN_ON_ONCE(!enable_vnmi);
47c0152e
PB
7745 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
7746 CPU_BASED_VIRTUAL_NMI_PENDING);
f08864b4 7747 ++vcpu->stat.nmi_window_exits;
3842d135 7748 kvm_make_request(KVM_REQ_EVENT, vcpu);
f08864b4
SY
7749
7750 return 1;
7751}
7752
80ced186 7753static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
ea953ef0 7754{
8b3079a5
AK
7755 struct vcpu_vmx *vmx = to_vmx(vcpu);
7756 enum emulation_result err = EMULATE_DONE;
80ced186 7757 int ret = 1;
49e9d557
AK
7758 u32 cpu_exec_ctrl;
7759 bool intr_window_requested;
b8405c18 7760 unsigned count = 130;
49e9d557 7761
2bb8cafe
SC
7762 /*
7763 * We should never reach the point where we are emulating L2
7764 * due to invalid guest state as that means we incorrectly
7765 * allowed a nested VMEntry with an invalid vmcs12.
7766 */
7767 WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);
7768
49e9d557
AK
7769 cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
7770 intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
ea953ef0 7771
98eb2f8b 7772 while (vmx->emulation_required && count-- != 0) {
bdea48e3 7773 if (intr_window_requested && vmx_interrupt_allowed(vcpu))
49e9d557
AK
7774 return handle_interrupt_window(&vmx->vcpu);
7775
72875d8a 7776 if (kvm_test_request(KVM_REQ_EVENT, vcpu))
de87dcdd
AK
7777 return 1;
7778
0ce97a2b 7779 err = kvm_emulate_instruction(vcpu, 0);
ea953ef0 7780
ac0a48c3 7781 if (err == EMULATE_USER_EXIT) {
94452b9e 7782 ++vcpu->stat.mmio_exits;
80ced186
MG
7783 ret = 0;
7784 goto out;
7785 }
1d5a4d9b 7786
add5ff7a
SC
7787 if (err != EMULATE_DONE)
7788 goto emulation_error;
7789
7790 if (vmx->emulation_required && !vmx->rmode.vm86_active &&
7791 vcpu->arch.exception.pending)
7792 goto emulation_error;
ea953ef0 7793
8d76c49e
GN
7794 if (vcpu->arch.halt_request) {
7795 vcpu->arch.halt_request = 0;
5cb56059 7796 ret = kvm_vcpu_halt(vcpu);
8d76c49e
GN
7797 goto out;
7798 }
7799
ea953ef0 7800 if (signal_pending(current))
80ced186 7801 goto out;
ea953ef0
MG
7802 if (need_resched())
7803 schedule();
7804 }
7805
80ced186
MG
7806out:
7807 return ret;
b4a2d31d 7808
add5ff7a
SC
7809emulation_error:
7810 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
7811 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
7812 vcpu->run->internal.ndata = 0;
7813 return 0;
b4a2d31d
RK
7814}
7815
7816static void grow_ple_window(struct kvm_vcpu *vcpu)
7817{
7818 struct vcpu_vmx *vmx = to_vmx(vcpu);
7819 int old = vmx->ple_window;
7820
c8e88717
BM
7821 vmx->ple_window = __grow_ple_window(old, ple_window,
7822 ple_window_grow,
7823 ple_window_max);
b4a2d31d
RK
7824
7825 if (vmx->ple_window != old)
7826 vmx->ple_window_dirty = true;
7b46268d
RK
7827
7828 trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old);
b4a2d31d
RK
7829}
7830
7831static void shrink_ple_window(struct kvm_vcpu *vcpu)
7832{
7833 struct vcpu_vmx *vmx = to_vmx(vcpu);
7834 int old = vmx->ple_window;
7835
c8e88717
BM
7836 vmx->ple_window = __shrink_ple_window(old, ple_window,
7837 ple_window_shrink,
7838 ple_window);
b4a2d31d
RK
7839
7840 if (vmx->ple_window != old)
7841 vmx->ple_window_dirty = true;
7b46268d
RK
7842
7843 trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old);
b4a2d31d
RK
7844}
7845
bf9f6ac8
FW
7846/*
7847 * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
7848 */
7849static void wakeup_handler(void)
7850{
7851 struct kvm_vcpu *vcpu;
7852 int cpu = smp_processor_id();
7853
7854 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
7855 list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
7856 blocked_vcpu_list) {
7857 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
7858
7859 if (pi_test_on(pi_desc) == 1)
7860 kvm_vcpu_kick(vcpu);
7861 }
7862 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
7863}
7864
e01bca2f 7865static void vmx_enable_tdp(void)
f160c7b7
JS
7866{
7867 kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
7868 enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
7869 enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
7870 0ull, VMX_EPT_EXECUTABLE_MASK,
7871 cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
d0ec49d4 7872 VMX_EPT_RWX_MASK, 0ull);
f160c7b7
JS
7873
7874 ept_set_mmio_spte_mask();
7875 kvm_enable_tdp();
7876}
7877
f2c7648d
TC
7878static __init int hardware_setup(void)
7879{
cf81a7e5 7880 unsigned long host_bndcfgs;
904e14fb 7881 int r = -ENOMEM, i;
34a1cd60
TC
7882
7883 rdmsrl_safe(MSR_EFER, &host_efer);
7884
7885 for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
7886 kvm_define_shared_msr(i, vmx_msr_index[i]);
7887
23611332
RK
7888 for (i = 0; i < VMX_BITMAP_NR; i++) {
7889 vmx_bitmap[i] = (unsigned long *)__get_free_page(GFP_KERNEL);
7890 if (!vmx_bitmap[i])
7891 goto out;
7892 }
34a1cd60 7893
34a1cd60
TC
7894 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
7895 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
7896
34a1cd60
TC
7897 if (setup_vmcs_config(&vmcs_config) < 0) {
7898 r = -EIO;
23611332 7899 goto out;
baa03522 7900 }
f2c7648d
TC
7901
7902 if (boot_cpu_has(X86_FEATURE_NX))
7903 kvm_enable_efer_bits(EFER_NX);
7904
cf81a7e5
SC
7905 if (boot_cpu_has(X86_FEATURE_MPX)) {
7906 rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
7907 WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
7908 }
7909
08d839c4
WL
7910 if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
7911 !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
f2c7648d 7912 enable_vpid = 0;
08d839c4 7913
f2c7648d 7914 if (!cpu_has_vmx_ept() ||
42aa53b4 7915 !cpu_has_vmx_ept_4levels() ||
f5f51586 7916 !cpu_has_vmx_ept_mt_wb() ||
8ad8182e 7917 !cpu_has_vmx_invept_global())
f2c7648d 7918 enable_ept = 0;
f2c7648d 7919
fce6ac4c 7920 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
f2c7648d
TC
7921 enable_ept_ad_bits = 0;
7922
8ad8182e 7923 if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
f2c7648d
TC
7924 enable_unrestricted_guest = 0;
7925
ad15a296 7926 if (!cpu_has_vmx_flexpriority())
f2c7648d
TC
7927 flexpriority_enabled = 0;
7928
d02fcf50
PB
7929 if (!cpu_has_virtual_nmis())
7930 enable_vnmi = 0;
7931
ad15a296
PB
7932 /*
7933 * set_apic_access_page_addr() is used to reload apic access
7934 * page upon invalidation. No need to do anything if not
7935 * using the APIC_ACCESS_ADDR VMCS field.
7936 */
7937 if (!flexpriority_enabled)
f2c7648d 7938 kvm_x86_ops->set_apic_access_page_addr = NULL;
f2c7648d
TC
7939
7940 if (!cpu_has_vmx_tpr_shadow())
7941 kvm_x86_ops->update_cr8_intercept = NULL;
7942
7943 if (enable_ept && !cpu_has_vmx_ept_2m_page())
7944 kvm_disable_largepages();
7945
877ad952
TL
7946#if IS_ENABLED(CONFIG_HYPERV)
7947 if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
7948 && enable_ept)
7949 kvm_x86_ops->tlb_remote_flush = vmx_hv_remote_flush_tlb;
7950#endif
7951
0f107682 7952 if (!cpu_has_vmx_ple()) {
f2c7648d 7953 ple_gap = 0;
0f107682
WL
7954 ple_window = 0;
7955 ple_window_grow = 0;
7956 ple_window_max = 0;
7957 ple_window_shrink = 0;
7958 }
f2c7648d 7959
76dfafd5 7960 if (!cpu_has_vmx_apicv()) {
f2c7648d 7961 enable_apicv = 0;
76dfafd5
PB
7962 kvm_x86_ops->sync_pir_to_irr = NULL;
7963 }
f2c7648d 7964
64903d61
HZ
7965 if (cpu_has_vmx_tsc_scaling()) {
7966 kvm_has_tsc_control = true;
7967 kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
7968 kvm_tsc_scaling_ratio_frac_bits = 48;
7969 }
7970
04bb92e4
WL
7971 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
7972
f160c7b7
JS
7973 if (enable_ept)
7974 vmx_enable_tdp();
7975 else
baa03522
TC
7976 kvm_disable_tdp();
7977
8fcc4b59
JM
7978 if (!nested) {
7979 kvm_x86_ops->get_nested_state = NULL;
7980 kvm_x86_ops->set_nested_state = NULL;
7981 }
7982
843e4330
KH
7983 /*
7984 * Only enable PML when hardware supports PML feature, and both EPT
7985 * and EPT A/D bit features are enabled -- PML depends on them to work.
7986 */
7987 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
7988 enable_pml = 0;
7989
7990 if (!enable_pml) {
7991 kvm_x86_ops->slot_enable_log_dirty = NULL;
7992 kvm_x86_ops->slot_disable_log_dirty = NULL;
7993 kvm_x86_ops->flush_log_dirty = NULL;
7994 kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
7995 }
7996
d264ee0c
SC
7997 if (!cpu_has_vmx_preemption_timer())
7998 kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
7999
64672c95
YJ
8000 if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
8001 u64 vmx_msr;
8002
8003 rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
8004 cpu_preemption_timer_multi =
8005 vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
8006 } else {
8007 kvm_x86_ops->set_hv_timer = NULL;
8008 kvm_x86_ops->cancel_hv_timer = NULL;
8009 }
8010
c5d167b2
PB
8011 if (!cpu_has_vmx_shadow_vmcs())
8012 enable_shadow_vmcs = 0;
8013 if (enable_shadow_vmcs)
8014 init_vmcs_shadow_fields();
8015
bf9f6ac8 8016 kvm_set_posted_intr_wakeup_handler(wakeup_handler);
1389309c 8017 nested_vmx_setup_ctls_msrs(&vmcs_config.nested, enable_apicv);
bf9f6ac8 8018
c45dcc71
AR
8019 kvm_mce_cap_supported |= MCG_LMCE_P;
8020
f2c7648d 8021 return alloc_kvm_area();
34a1cd60 8022
34a1cd60 8023out:
23611332
RK
8024 for (i = 0; i < VMX_BITMAP_NR; i++)
8025 free_page((unsigned long)vmx_bitmap[i]);
34a1cd60
TC
8026
8027 return r;
f2c7648d
TC
8028}
8029
8030static __exit void hardware_unsetup(void)
8031{
23611332
RK
8032 int i;
8033
8034 for (i = 0; i < VMX_BITMAP_NR; i++)
8035 free_page((unsigned long)vmx_bitmap[i]);
34a1cd60 8036
f2c7648d
TC
8037 free_kvm_area();
8038}
8039
4b8d54f9
ZE
8040/*
 8041 * Indicates a vCPU busy-waiting on a spinlock. We do not enable plain
 8042 * PAUSE exiting, so we only get here on CPUs with PAUSE-loop exiting.
8043 */
9fb41ba8 8044static int handle_pause(struct kvm_vcpu *vcpu)
4b8d54f9 8045{
b31c114b 8046 if (!kvm_pause_in_guest(vcpu->kvm))
b4a2d31d
RK
8047 grow_ple_window(vcpu);
8048
de63ad4c
LM
8049 /*
 8050 * Intel SDM vol. 3, ch. 25.1.3 says: the "PAUSE-loop exiting"
 8051 * VM-execution control is ignored if CPL > 0. OTOH, KVM
 8052 * never sets PAUSE_EXITING and only sets PLE if supported,
 8053 * so the vcpu must be at CPL 0 if it gets a PAUSE exit.
8054 */
8055 kvm_vcpu_on_spin(vcpu, true);
6affcbed 8056 return kvm_skip_emulated_instruction(vcpu);
4b8d54f9
ZE
8057}
8058
87c00572 8059static int handle_nop(struct kvm_vcpu *vcpu)
59708670 8060{
6affcbed 8061 return kvm_skip_emulated_instruction(vcpu);
59708670
SY
8062}
8063
87c00572
GS
8064static int handle_mwait(struct kvm_vcpu *vcpu)
8065{
8066 printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
8067 return handle_nop(vcpu);
8068}
8069
45ec368c
JM
8070static int handle_invalid_op(struct kvm_vcpu *vcpu)
8071{
8072 kvm_queue_exception(vcpu, UD_VECTOR);
8073 return 1;
8074}
8075
5f3d45e7
MD
8076static int handle_monitor_trap(struct kvm_vcpu *vcpu)
8077{
8078 return 1;
8079}
8080
87c00572
GS
8081static int handle_monitor(struct kvm_vcpu *vcpu)
8082{
8083 printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
8084 return handle_nop(vcpu);
8085}
8086
0658fbaa
ACL
8087/*
8088 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
09abb5e3
SC
8089 * set the success or error code of an emulated VMX instruction (as specified
8090 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
8091 * instruction.
0658fbaa 8092 */
09abb5e3 8093static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
0658fbaa
ACL
8094{
8095 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
8096 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
8097 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
09abb5e3 8098 return kvm_skip_emulated_instruction(vcpu);
0658fbaa
ACL
8099}
8100
09abb5e3 8101static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
0658fbaa
ACL
8102{
8103 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
8104 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
8105 X86_EFLAGS_SF | X86_EFLAGS_OF))
8106 | X86_EFLAGS_CF);
09abb5e3 8107 return kvm_skip_emulated_instruction(vcpu);
0658fbaa
ACL
8108}
8109
09abb5e3
SC
8110static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
8111 u32 vm_instruction_error)
0658fbaa 8112{
b8bbab92
VK
8113 struct vcpu_vmx *vmx = to_vmx(vcpu);
8114
09abb5e3
SC
8115 /*
8116 * failValid writes the error number to the current VMCS, which
8117 * can't be done if there isn't a current VMCS.
8118 */
b8bbab92 8119 if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
09abb5e3
SC
8120 return nested_vmx_failInvalid(vcpu);
8121
0658fbaa
ACL
8122 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
8123 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
8124 X86_EFLAGS_SF | X86_EFLAGS_OF))
8125 | X86_EFLAGS_ZF);
8126 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
8127 /*
8128 * We don't need to force a shadow sync because
8129 * VM_INSTRUCTION_ERROR is not shadowed
8130 */
09abb5e3 8131 return kvm_skip_emulated_instruction(vcpu);
0658fbaa 8132}
145c28dd 8133
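/*
 * [Editor's sketch] The RFLAGS convention implemented by nested_vmx_succeed(),
 * nested_vmx_failInvalid() and nested_vmx_failValid() above (SDM VMX
 * instruction reference, "Conventions"): VMsucceed clears CF/PF/AF/ZF/SF/OF,
 * VMfailInvalid sets only CF, VMfailValid sets only ZF.  The starting value
 * is hypothetical; standalone and illustrative only, not part of vmx.c.
 */
#include <stdio.h>

#define FL_CF 0x001UL
#define FL_PF 0x004UL
#define FL_AF 0x010UL
#define FL_ZF 0x040UL
#define FL_SF 0x080UL
#define FL_OF 0x800UL
#define FL_ARITH (FL_CF | FL_PF | FL_AF | FL_ZF | FL_SF | FL_OF)

int main(void)
{
	unsigned long rflags = 0x246; /* hypothetical starting value */

	printf("succeed:     %#lx\n", rflags & ~FL_ARITH);
	printf("failInvalid: %#lx\n", (rflags & ~FL_ARITH) | FL_CF);
	printf("failValid:   %#lx\n", (rflags & ~FL_ARITH) | FL_ZF);
	return 0;
}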
ff651cb6
WV
8134static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
8135{
8136 /* TODO: not to reset guest simply here. */
8137 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
bbe41b95 8138 pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
ff651cb6
WV
8139}
8140
f4124500
JK
8141static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
8142{
8143 struct vcpu_vmx *vmx =
8144 container_of(timer, struct vcpu_vmx, nested.preemption_timer);
8145
8146 vmx->nested.preemption_timer_expired = true;
8147 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
8148 kvm_vcpu_kick(&vmx->vcpu);
8149
8150 return HRTIMER_NORESTART;
8151}
8152
19677e32
BD
8153/*
8154 * Decode the memory-address operand of a vmx instruction, as recorded on an
8155 * exit caused by such an instruction (run by a guest hypervisor).
8156 * On success, returns 0. When the operand is invalid, returns 1 and throws
8157 * #UD or #GP.
8158 */
8159static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
8160 unsigned long exit_qualification,
f9eb4af6 8161 u32 vmx_instruction_info, bool wr, gva_t *ret)
19677e32 8162{
f9eb4af6
EK
8163 gva_t off;
8164 bool exn;
8165 struct kvm_segment s;
8166
19677e32
BD
8167 /*
8168 * According to Vol. 3B, "Information for VM Exits Due to Instruction
8169 * Execution", on an exit, vmx_instruction_info holds most of the
8170 * addressing components of the operand. Only the displacement part
8171 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
8172 * For how an actual address is calculated from all these components,
8173 * refer to Vol. 1, "Operand Addressing".
8174 */
8175 int scaling = vmx_instruction_info & 3;
8176 int addr_size = (vmx_instruction_info >> 7) & 7;
8177 bool is_reg = vmx_instruction_info & (1u << 10);
8178 int seg_reg = (vmx_instruction_info >> 15) & 7;
8179 int index_reg = (vmx_instruction_info >> 18) & 0xf;
8180 bool index_is_valid = !(vmx_instruction_info & (1u << 22));
8181 int base_reg = (vmx_instruction_info >> 23) & 0xf;
8182 bool base_is_valid = !(vmx_instruction_info & (1u << 27));
8183
8184 if (is_reg) {
8185 kvm_queue_exception(vcpu, UD_VECTOR);
8186 return 1;
8187 }
8188
8189 /* Addr = segment_base + offset */
8190 /* offset = base + [index * scale] + displacement */
f9eb4af6 8191 off = exit_qualification; /* holds the displacement */
19677e32 8192 if (base_is_valid)
f9eb4af6 8193 off += kvm_register_read(vcpu, base_reg);
19677e32 8194 if (index_is_valid)
f9eb4af6
EK
8195		off += kvm_register_read(vcpu, index_reg) << scaling;
8196 vmx_get_segment(vcpu, &s, seg_reg);
8197 *ret = s.base + off;
19677e32
BD
8198
8199 if (addr_size == 1) /* 32 bit */
8200 *ret &= 0xffffffff;
8201
f9eb4af6
EK
8202 /* Checks for #GP/#SS exceptions. */
8203 exn = false;
ff30ef40
QC
8204 if (is_long_mode(vcpu)) {
8205 /* Long mode: #GP(0)/#SS(0) if the memory address is in a
8206 * non-canonical form. This is the only check on the memory
8207 * destination for long mode!
8208 */
fd8cb433 8209 exn = is_noncanonical_address(*ret, vcpu);
ff30ef40 8210 } else if (is_protmode(vcpu)) {
f9eb4af6
EK
8211 /* Protected mode: apply checks for segment validity in the
8212 * following order:
8213 * - segment type check (#GP(0) may be thrown)
8214 * - usability check (#GP(0)/#SS(0))
8215 * - limit check (#GP(0)/#SS(0))
8216 */
8217 if (wr)
8218 /* #GP(0) if the destination operand is located in a
8219 * read-only data segment or any code segment.
8220 */
8221 exn = ((s.type & 0xa) == 0 || (s.type & 8));
8222 else
8223 /* #GP(0) if the source operand is located in an
8224 * execute-only code segment
8225 */
8226 exn = ((s.type & 0xa) == 8);
ff30ef40
QC
8227 if (exn) {
8228 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
8229 return 1;
8230 }
f9eb4af6
EK
8231 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
8232 */
8233 exn = (s.unusable != 0);
8234 /* Protected mode: #GP(0)/#SS(0) if the memory
8235 * operand is outside the segment limit.
8236 */
8237 exn = exn || (off + sizeof(u64) > s.limit);
8238 }
8239 if (exn) {
8240 kvm_queue_exception_e(vcpu,
8241 seg_reg == VCPU_SREG_SS ?
8242 SS_VECTOR : GP_VECTOR,
8243 0);
8244 return 1;
8245 }
8246
19677e32
BD
8247 return 0;
8248}
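/*
 * Illustrative note (added, not part of the original comments): for
 * memory-operand VMX instructions the instruction-information field decoded
 * above is laid out roughly as follows (SDM Vol. 3, "VM-Exit
 * Instruction-Information Field"):
 *
 *   bits  1:0   scaling (index register is multiplied by 2^scaling)
 *   bits  9:7   address size (0 = 16-bit, 1 = 32-bit, 2 = 64-bit)
 *   bit   10    operand is a register rather than memory
 *   bits 17:15  segment register
 *   bits 21:18  index register,  bit 22 set = no index register
 *   bits 26:23  base register,   bit 27 set = no base register
 *   bits 31:28  second register operand (used by VMREAD/VMWRITE)
 *
 * The displacement itself arrives separately in the exit qualification.
 */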
8249
cbf71279 8250static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
3573e22c
BD
8251{
8252 gva_t gva;
3573e22c 8253 struct x86_exception e;
3573e22c
BD
8254
8255 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
f9eb4af6 8256 vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
3573e22c
BD
8257 return 1;
8258
ce14e868 8259 if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
3573e22c
BD
8260 kvm_inject_page_fault(vcpu, &e);
8261 return 1;
8262 }
8263
3573e22c
BD
8264 return 0;
8265}
8266
abfc52c6
LA
8267/*
8268 * Allocate a shadow VMCS and associate it with the currently loaded
8269 * VMCS, unless such a shadow VMCS already exists. The newly allocated
8270 * VMCS is also VMCLEARed, so that it is ready for use.
8271 */
8272static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
8273{
8274 struct vcpu_vmx *vmx = to_vmx(vcpu);
8275 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
8276
8277 /*
8278 * We should allocate a shadow vmcs for vmcs01 only when L1
8279 * executes VMXON and free it when L1 executes VMXOFF.
8280 * As it is invalid to execute VMXON twice, we shouldn't reach
8281	 * here when vmcs01 already has an allocated shadow vmcs.
8282 */
8283 WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
8284
8285 if (!loaded_vmcs->shadow_vmcs) {
8286 loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
8287 if (loaded_vmcs->shadow_vmcs)
8288 vmcs_clear(loaded_vmcs->shadow_vmcs);
8289 }
8290 return loaded_vmcs->shadow_vmcs;
8291}
8292
e29acc55
JM
8293static int enter_vmx_operation(struct kvm_vcpu *vcpu)
8294{
8295 struct vcpu_vmx *vmx = to_vmx(vcpu);
f21f165e 8296 int r;
e29acc55 8297
f21f165e
PB
8298 r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
8299 if (r < 0)
de3a0021 8300 goto out_vmcs02;
e29acc55
JM
8301
8302 vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
8303 if (!vmx->nested.cached_vmcs12)
8304 goto out_cached_vmcs12;
8305
61ada748
LA
8306 vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
8307 if (!vmx->nested.cached_shadow_vmcs12)
8308 goto out_cached_shadow_vmcs12;
8309
abfc52c6
LA
8310 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
8311 goto out_shadow_vmcs;
e29acc55 8312
e29acc55
JM
8313 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
8314 HRTIMER_MODE_REL_PINNED);
8315 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
8316
63aff655
RK
8317 vmx->nested.vpid02 = allocate_vpid();
8318
9d6105b2 8319 vmx->nested.vmcs02_initialized = false;
e29acc55
JM
8320 vmx->nested.vmxon = true;
8321 return 0;
8322
8323out_shadow_vmcs:
61ada748
LA
8324 kfree(vmx->nested.cached_shadow_vmcs12);
8325
8326out_cached_shadow_vmcs12:
e29acc55
JM
8327 kfree(vmx->nested.cached_vmcs12);
8328
8329out_cached_vmcs12:
de3a0021 8330 free_loaded_vmcs(&vmx->nested.vmcs02);
e29acc55 8331
de3a0021 8332out_vmcs02:
e29acc55
JM
8333 return -ENOMEM;
8334}
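/*
 * Added summary note: a successful enter_vmx_operation() leaves the vCPU
 * with everything later nested operation needs: a loaded vmcs02, cached
 * copies of vmcs12 and the shadow vmcs12, an optional shadow VMCS for
 * vmcs01, the preemption-timer hrtimer, and a second VPID (vpid02) for L2.
 * The error paths unwind these allocations in reverse order.
 */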
8335
ec378aee
NHE
8336/*
8337 * Emulate the VMXON instruction.
8338 * Currently, we only validate the argument to VMXON (the so-called
8339 * "VMXON pointer"): it must be page-aligned, within the physical-address
8340 * width, and reference a region whose revision id matches. We remember the
8341 * pointer solely so that VMCLEAR and VMPTRLD can fail when given the VMXON
8342 * pointer (as the spec requires); nothing is stored in that guest memory.
8343 */
8344static int handle_vmon(struct kvm_vcpu *vcpu)
8345{
e29acc55 8346 int ret;
cbf71279
RK
8347 gpa_t vmptr;
8348 struct page *page;
ec378aee 8349 struct vcpu_vmx *vmx = to_vmx(vcpu);
b3897a49
NHE
8350 const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
8351 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
ec378aee 8352
70f3aac9
JM
8353 /*
8354 * The Intel VMX Instruction Reference lists a bunch of bits that are
8355 * prerequisite to running VMXON, most notably cr4.VMXE must be set to
8356 * 1 (see vmx_set_cr4() for when we allow the guest to set this).
8357 * Otherwise, we should fail with #UD. But most faulting conditions
8358 * have already been checked by hardware, prior to the VM-exit for
8359 * VMXON. We do test guest cr4.VMXE because processor CR4 always has
8360 * that bit set to 1 in non-root mode.
ec378aee 8361 */
70f3aac9 8362 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
ec378aee
NHE
8363 kvm_queue_exception(vcpu, UD_VECTOR);
8364 return 1;
8365 }
8366
727ba748
FW
8367 /* CPL=0 must be checked manually. */
8368 if (vmx_get_cpl(vcpu)) {
36090bf4 8369 kvm_inject_gp(vcpu, 0);
727ba748
FW
8370 return 1;
8371 }
8372
09abb5e3
SC
8373 if (vmx->nested.vmxon)
8374 return nested_vmx_failValid(vcpu,
8375 VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
b3897a49 8376
3b84080b 8377 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
b3897a49
NHE
8378 != VMXON_NEEDED_FEATURES) {
8379 kvm_inject_gp(vcpu, 0);
8380 return 1;
8381 }
8382
cbf71279 8383 if (nested_vmx_get_vmptr(vcpu, &vmptr))
21e7fbe7 8384 return 1;
cbf71279
RK
8385
8386 /*
8387 * SDM 3: 24.11.5
8388	 * The first 4 bytes of the VMXON region contain the supported
8389	 * VMCS revision identifier.
8390	 *
8391	 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case,
8392	 * i.e. the physical address width is never limited to 32 bits.
8393 */
09abb5e3
SC
8394 if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
8395 return nested_vmx_failInvalid(vcpu);
cbf71279 8396
5e2f30b7 8397 page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
09abb5e3
SC
8398 if (is_error_page(page))
8399 return nested_vmx_failInvalid(vcpu);
8400
cbf71279
RK
8401 if (*(u32 *)kmap(page) != VMCS12_REVISION) {
8402 kunmap(page);
53a70daf 8403 kvm_release_page_clean(page);
09abb5e3 8404 return nested_vmx_failInvalid(vcpu);
cbf71279
RK
8405 }
8406 kunmap(page);
53a70daf 8407 kvm_release_page_clean(page);
cbf71279
RK
8408
8409 vmx->nested.vmxon_ptr = vmptr;
e29acc55
JM
8410 ret = enter_vmx_operation(vcpu);
8411 if (ret)
8412 return ret;
ec378aee 8413
09abb5e3 8414 return nested_vmx_succeed(vcpu);
ec378aee
NHE
8415}
8416
8417/*
8418 * Intel's VMX Instruction Reference specifies a common set of prerequisites
8419 * for running VMX instructions (except VMXON, whose prerequisites are
8420 * slightly different). It also specifies what exception to inject otherwise.
70f3aac9
JM
8421 * Note that many of these exceptions have priority over VM exits, so they
8422 * don't have to be checked again here.
ec378aee
NHE
8423 */
8424static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
8425{
e49fcb8b 8426 if (!to_vmx(vcpu)->nested.vmxon) {
727ba748
FW
8427 kvm_queue_exception(vcpu, UD_VECTOR);
8428 return 0;
8429 }
8430
e49fcb8b
JM
8431 if (vmx_get_cpl(vcpu)) {
8432 kvm_inject_gp(vcpu, 0);
ec378aee
NHE
8433 return 0;
8434 }
e49fcb8b 8435
ec378aee
NHE
8436 return 1;
8437}
8438
8ca44e88
DM
8439static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
8440{
8441 vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
8442 vmcs_write64(VMCS_LINK_POINTER, -1ull);
8443}
8444
b8bbab92
VK
8445static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
8446{
8447 struct vcpu_vmx *vmx = to_vmx(vcpu);
8448
8449 if (!vmx->nested.hv_evmcs)
8450 return;
8451
8452 kunmap(vmx->nested.hv_evmcs_page);
8453 kvm_release_page_dirty(vmx->nested.hv_evmcs_page);
8454 vmx->nested.hv_evmcs_vmptr = -1ull;
8455 vmx->nested.hv_evmcs_page = NULL;
8456 vmx->nested.hv_evmcs = NULL;
8457}
8458
14c07ad8 8459static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
e7953d7f 8460{
14c07ad8
VK
8461 struct vcpu_vmx *vmx = to_vmx(vcpu);
8462
9a2a05b9
PB
8463 if (vmx->nested.current_vmptr == -1ull)
8464 return;
8465
012f83cb 8466 if (enable_shadow_vmcs) {
9a2a05b9
PB
8467 /* copy to memory all shadowed fields in case
8468 they were modified */
8469 copy_shadow_to_vmcs12(vmx);
945679e3 8470 vmx->nested.need_vmcs12_sync = false;
8ca44e88 8471 vmx_disable_shadow_vmcs(vmx);
012f83cb 8472 }
705699a1 8473 vmx->nested.posted_intr_nv = -1;
4f2777bc
DM
8474
8475 /* Flush VMCS12 to guest memory */
14c07ad8 8476 kvm_vcpu_write_guest_page(vcpu,
9f744c59
PB
8477 vmx->nested.current_vmptr >> PAGE_SHIFT,
8478 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
4f2777bc 8479
14c07ad8
VK
8480 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
8481
9a2a05b9 8482 vmx->nested.current_vmptr = -1ull;
e7953d7f
AG
8483}
8484
ec378aee
NHE
8485/*
8486 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
8487 * just stops using VMX.
8488 */
14c07ad8 8489static void free_nested(struct kvm_vcpu *vcpu)
ec378aee 8490{
14c07ad8
VK
8491 struct vcpu_vmx *vmx = to_vmx(vcpu);
8492
b7455825 8493 if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
ec378aee 8494 return;
9a2a05b9 8495
ec378aee 8496 vmx->nested.vmxon = false;
b7455825 8497 vmx->nested.smm.vmxon = false;
5c614b35 8498 free_vpid(vmx->nested.vpid02);
8ca44e88
DM
8499 vmx->nested.posted_intr_nv = -1;
8500 vmx->nested.current_vmptr = -1ull;
355f4fb1 8501 if (enable_shadow_vmcs) {
8ca44e88 8502 vmx_disable_shadow_vmcs(vmx);
355f4fb1
JM
8503 vmcs_clear(vmx->vmcs01.shadow_vmcs);
8504 free_vmcs(vmx->vmcs01.shadow_vmcs);
8505 vmx->vmcs01.shadow_vmcs = NULL;
8506 }
4f2777bc 8507 kfree(vmx->nested.cached_vmcs12);
61ada748 8508 kfree(vmx->nested.cached_shadow_vmcs12);
de3a0021 8509 /* Unpin physical memory we referred to in the vmcs02 */
fe3ef05c 8510 if (vmx->nested.apic_access_page) {
53a70daf 8511 kvm_release_page_dirty(vmx->nested.apic_access_page);
48d89b92 8512 vmx->nested.apic_access_page = NULL;
fe3ef05c 8513 }
a7c0b07d 8514 if (vmx->nested.virtual_apic_page) {
53a70daf 8515 kvm_release_page_dirty(vmx->nested.virtual_apic_page);
48d89b92 8516 vmx->nested.virtual_apic_page = NULL;
a7c0b07d 8517 }
705699a1
WV
8518 if (vmx->nested.pi_desc_page) {
8519 kunmap(vmx->nested.pi_desc_page);
53a70daf 8520 kvm_release_page_dirty(vmx->nested.pi_desc_page);
705699a1
WV
8521 vmx->nested.pi_desc_page = NULL;
8522 vmx->nested.pi_desc = NULL;
8523 }
ff2f6fe9 8524
14c07ad8
VK
8525 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
8526
b8bbab92
VK
8527 nested_release_evmcs(vcpu);
8528
de3a0021 8529 free_loaded_vmcs(&vmx->nested.vmcs02);
ec378aee
NHE
8530}
8531
8532/* Emulate the VMXOFF instruction */
8533static int handle_vmoff(struct kvm_vcpu *vcpu)
8534{
8535 if (!nested_vmx_check_permission(vcpu))
8536 return 1;
14c07ad8 8537 free_nested(vcpu);
09abb5e3 8538 return nested_vmx_succeed(vcpu);
ec378aee
NHE
8539}
8540
27d6c865
NHE
8541/* Emulate the VMCLEAR instruction */
8542static int handle_vmclear(struct kvm_vcpu *vcpu)
8543{
8544 struct vcpu_vmx *vmx = to_vmx(vcpu);
587d7e72 8545 u32 zero = 0;
27d6c865 8546 gpa_t vmptr;
27d6c865
NHE
8547
8548 if (!nested_vmx_check_permission(vcpu))
8549 return 1;
8550
cbf71279 8551 if (nested_vmx_get_vmptr(vcpu, &vmptr))
27d6c865 8552 return 1;
27d6c865 8553
09abb5e3
SC
8554 if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
8555 return nested_vmx_failValid(vcpu,
8556 VMXERR_VMCLEAR_INVALID_ADDRESS);
cbf71279 8557
09abb5e3
SC
8558 if (vmptr == vmx->nested.vmxon_ptr)
8559 return nested_vmx_failValid(vcpu,
8560 VMXERR_VMCLEAR_VMXON_POINTER);
cbf71279 8561
b8bbab92
VK
8562 if (vmx->nested.hv_evmcs_page) {
8563 if (vmptr == vmx->nested.hv_evmcs_vmptr)
8564 nested_release_evmcs(vcpu);
8565 } else {
8566 if (vmptr == vmx->nested.current_vmptr)
8567 nested_release_vmcs12(vcpu);
27d6c865 8568
b8bbab92
VK
8569 kvm_vcpu_write_guest(vcpu,
8570 vmptr + offsetof(struct vmcs12,
8571 launch_state),
8572 &zero, sizeof(zero));
8573 }
27d6c865 8574
09abb5e3 8575 return nested_vmx_succeed(vcpu);
27d6c865
NHE
8576}
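/*
 * Added note on the emulation above: rather than caching a "launch state"
 * per guest VMCS, handle_vmclear() drops any cached/active state for that
 * pointer and then zeroes the launch_state field directly in the guest's
 * vmcs12 memory, so a subsequent VMPTRLD re-reads the cleared state.
 */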
8577
cd232ad0
NHE
8578static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
8579
8580/* Emulate the VMLAUNCH instruction */
8581static int handle_vmlaunch(struct kvm_vcpu *vcpu)
8582{
8583 return nested_vmx_run(vcpu, true);
8584}
8585
8586/* Emulate the VMRESUME instruction */
8587static int handle_vmresume(struct kvm_vcpu *vcpu)
8588{
8589
8590 return nested_vmx_run(vcpu, false);
8591}
8592
49f705c5
NHE
8593/*
8594 * Read a vmcs12 field. Since these can have varying lengths and we return
8595 * one type, we chose the biggest type (u64) and zero-extend the return value
8596 * to that size. Note that the caller, handle_vmread, might need to use only
8597 * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
8598 * 64-bit fields are to be returned).
8599 */
e2536742 8600static inline int vmcs12_read_any(struct vmcs12 *vmcs12,
a2ae9df7 8601 unsigned long field, u64 *ret)
49f705c5
NHE
8602{
8603 short offset = vmcs_field_to_offset(field);
8604 char *p;
8605
8606 if (offset < 0)
a2ae9df7 8607 return offset;
49f705c5 8608
e2536742 8609 p = (char *)vmcs12 + offset;
49f705c5 8610
d37f4267
JM
8611 switch (vmcs_field_width(field)) {
8612 case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
49f705c5 8613 *ret = *((natural_width *)p);
a2ae9df7 8614 return 0;
d37f4267 8615 case VMCS_FIELD_WIDTH_U16:
49f705c5 8616 *ret = *((u16 *)p);
a2ae9df7 8617 return 0;
d37f4267 8618 case VMCS_FIELD_WIDTH_U32:
49f705c5 8619 *ret = *((u32 *)p);
a2ae9df7 8620 return 0;
d37f4267 8621 case VMCS_FIELD_WIDTH_U64:
49f705c5 8622 *ret = *((u64 *)p);
a2ae9df7 8623 return 0;
49f705c5 8624 default:
a2ae9df7
PB
8625 WARN_ON(1);
8626 return -ENOENT;
49f705c5
NHE
8627 }
8628}
8629
20b97fea 8630
e2536742 8631static inline int vmcs12_write_any(struct vmcs12 *vmcs12,
a2ae9df7 8632 unsigned long field, u64 field_value){
20b97fea 8633 short offset = vmcs_field_to_offset(field);
e2536742 8634 char *p = (char *)vmcs12 + offset;
20b97fea 8635 if (offset < 0)
a2ae9df7 8636 return offset;
20b97fea 8637
d37f4267
JM
8638 switch (vmcs_field_width(field)) {
8639 case VMCS_FIELD_WIDTH_U16:
20b97fea 8640 *(u16 *)p = field_value;
a2ae9df7 8641 return 0;
d37f4267 8642 case VMCS_FIELD_WIDTH_U32:
20b97fea 8643 *(u32 *)p = field_value;
a2ae9df7 8644 return 0;
d37f4267 8645 case VMCS_FIELD_WIDTH_U64:
20b97fea 8646 *(u64 *)p = field_value;
a2ae9df7 8647 return 0;
d37f4267 8648 case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
20b97fea 8649 *(natural_width *)p = field_value;
a2ae9df7 8650 return 0;
20b97fea 8651 default:
a2ae9df7
PB
8652 WARN_ON(1);
8653 return -ENOENT;
20b97fea
AG
8654 }
8655
8656}
8657
945679e3
VK
8658static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
8659{
8660 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
8661 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
8662
b8bbab92
VK
8663 vmcs12->hdr.revision_id = evmcs->revision_id;
8664
945679e3
VK
8665 /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
8666 vmcs12->tpr_threshold = evmcs->tpr_threshold;
8667 vmcs12->guest_rip = evmcs->guest_rip;
8668
8669 if (unlikely(!(evmcs->hv_clean_fields &
8670 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
8671 vmcs12->guest_rsp = evmcs->guest_rsp;
8672 vmcs12->guest_rflags = evmcs->guest_rflags;
8673 vmcs12->guest_interruptibility_info =
8674 evmcs->guest_interruptibility_info;
8675 }
8676
8677 if (unlikely(!(evmcs->hv_clean_fields &
8678 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
8679 vmcs12->cpu_based_vm_exec_control =
8680 evmcs->cpu_based_vm_exec_control;
8681 }
8682
8683 if (unlikely(!(evmcs->hv_clean_fields &
8684 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
8685 vmcs12->exception_bitmap = evmcs->exception_bitmap;
8686 }
8687
8688 if (unlikely(!(evmcs->hv_clean_fields &
8689 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
8690 vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
8691 }
8692
8693 if (unlikely(!(evmcs->hv_clean_fields &
8694 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
8695 vmcs12->vm_entry_intr_info_field =
8696 evmcs->vm_entry_intr_info_field;
8697 vmcs12->vm_entry_exception_error_code =
8698 evmcs->vm_entry_exception_error_code;
8699 vmcs12->vm_entry_instruction_len =
8700 evmcs->vm_entry_instruction_len;
8701 }
8702
8703 if (unlikely(!(evmcs->hv_clean_fields &
8704 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
8705 vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
8706 vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
8707 vmcs12->host_cr0 = evmcs->host_cr0;
8708 vmcs12->host_cr3 = evmcs->host_cr3;
8709 vmcs12->host_cr4 = evmcs->host_cr4;
8710 vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
8711 vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
8712 vmcs12->host_rip = evmcs->host_rip;
8713 vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
8714 vmcs12->host_es_selector = evmcs->host_es_selector;
8715 vmcs12->host_cs_selector = evmcs->host_cs_selector;
8716 vmcs12->host_ss_selector = evmcs->host_ss_selector;
8717 vmcs12->host_ds_selector = evmcs->host_ds_selector;
8718 vmcs12->host_fs_selector = evmcs->host_fs_selector;
8719 vmcs12->host_gs_selector = evmcs->host_gs_selector;
8720 vmcs12->host_tr_selector = evmcs->host_tr_selector;
8721 }
8722
8723 if (unlikely(!(evmcs->hv_clean_fields &
8724 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
8725 vmcs12->pin_based_vm_exec_control =
8726 evmcs->pin_based_vm_exec_control;
8727 vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
8728 vmcs12->secondary_vm_exec_control =
8729 evmcs->secondary_vm_exec_control;
8730 }
8731
8732 if (unlikely(!(evmcs->hv_clean_fields &
8733 HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
8734 vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
8735 vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
8736 }
8737
8738 if (unlikely(!(evmcs->hv_clean_fields &
8739 HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
8740 vmcs12->msr_bitmap = evmcs->msr_bitmap;
8741 }
8742
8743 if (unlikely(!(evmcs->hv_clean_fields &
8744 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
8745 vmcs12->guest_es_base = evmcs->guest_es_base;
8746 vmcs12->guest_cs_base = evmcs->guest_cs_base;
8747 vmcs12->guest_ss_base = evmcs->guest_ss_base;
8748 vmcs12->guest_ds_base = evmcs->guest_ds_base;
8749 vmcs12->guest_fs_base = evmcs->guest_fs_base;
8750 vmcs12->guest_gs_base = evmcs->guest_gs_base;
8751 vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
8752 vmcs12->guest_tr_base = evmcs->guest_tr_base;
8753 vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
8754 vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
8755 vmcs12->guest_es_limit = evmcs->guest_es_limit;
8756 vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
8757 vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
8758 vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
8759 vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
8760 vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
8761 vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
8762 vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
8763 vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
8764 vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
8765 vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
8766 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
8767 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
8768 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
8769 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
8770 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
8771 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
8772 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
8773 vmcs12->guest_es_selector = evmcs->guest_es_selector;
8774 vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
8775 vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
8776 vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
8777 vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
8778 vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
8779 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
8780 vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
8781 }
8782
8783 if (unlikely(!(evmcs->hv_clean_fields &
8784 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
8785 vmcs12->tsc_offset = evmcs->tsc_offset;
8786 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
8787 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
8788 }
8789
8790 if (unlikely(!(evmcs->hv_clean_fields &
8791 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
8792 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
8793 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
8794 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
8795 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
8796 vmcs12->guest_cr0 = evmcs->guest_cr0;
8797 vmcs12->guest_cr3 = evmcs->guest_cr3;
8798 vmcs12->guest_cr4 = evmcs->guest_cr4;
8799 vmcs12->guest_dr7 = evmcs->guest_dr7;
8800 }
8801
8802 if (unlikely(!(evmcs->hv_clean_fields &
8803 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
8804 vmcs12->host_fs_base = evmcs->host_fs_base;
8805 vmcs12->host_gs_base = evmcs->host_gs_base;
8806 vmcs12->host_tr_base = evmcs->host_tr_base;
8807 vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
8808 vmcs12->host_idtr_base = evmcs->host_idtr_base;
8809 vmcs12->host_rsp = evmcs->host_rsp;
8810 }
8811
8812 if (unlikely(!(evmcs->hv_clean_fields &
8813 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
8814 vmcs12->ept_pointer = evmcs->ept_pointer;
8815 vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
8816 }
8817
8818 if (unlikely(!(evmcs->hv_clean_fields &
8819 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
8820 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
8821 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
8822 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
8823 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
8824 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
8825 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
8826 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
8827 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
8828 vmcs12->guest_pending_dbg_exceptions =
8829 evmcs->guest_pending_dbg_exceptions;
8830 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
8831 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
8832 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
8833 vmcs12->guest_activity_state = evmcs->guest_activity_state;
8834 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
8835 }
8836
8837 /*
8838 * Not used?
8839 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
8840 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
8841 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
8842 * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0;
8843 * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1;
8844 * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2;
8845 * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3;
8846 * vmcs12->page_fault_error_code_mask =
8847 * evmcs->page_fault_error_code_mask;
8848 * vmcs12->page_fault_error_code_match =
8849 * evmcs->page_fault_error_code_match;
8850 * vmcs12->cr3_target_count = evmcs->cr3_target_count;
8851 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
8852 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
8853 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
8854 */
8855
8856 /*
8857 * Read only fields:
8858 * vmcs12->guest_physical_address = evmcs->guest_physical_address;
8859 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
8860 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
8861 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
8862 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
8863 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
8864 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
8865 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
8866 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
8867 * vmcs12->exit_qualification = evmcs->exit_qualification;
8868 * vmcs12->guest_linear_address = evmcs->guest_linear_address;
8869 *
8870 * Not present in struct vmcs12:
8871 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
8872 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
8873 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
8874 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
8875 */
8876
8877 return 0;
8878}
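/*
 * Added note: hv_clean_fields is the enlightened-VMCS "dirty tracking"
 * protocol. L1 sets a bit when the corresponding group of fields has not
 * changed since the last VM entry, allowing the copy above to be skipped
 * for that group; clearing the whole mask (as done on an eVMCS switch,
 * see nested_vmx_handle_enlightened_vmptrld()) forces a full resync.
 */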
8879
8880static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
8881{
8882 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
8883 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
8884
8885 /*
8886 * Should not be changed by KVM:
8887 *
8888 * evmcs->host_es_selector = vmcs12->host_es_selector;
8889 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
8890 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
8891 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
8892 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
8893 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
8894 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
8895 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
8896 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
8897 * evmcs->host_cr0 = vmcs12->host_cr0;
8898 * evmcs->host_cr3 = vmcs12->host_cr3;
8899 * evmcs->host_cr4 = vmcs12->host_cr4;
8900 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
8901 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
8902 * evmcs->host_rip = vmcs12->host_rip;
8903 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
8904 * evmcs->host_fs_base = vmcs12->host_fs_base;
8905 * evmcs->host_gs_base = vmcs12->host_gs_base;
8906 * evmcs->host_tr_base = vmcs12->host_tr_base;
8907 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
8908 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
8909 * evmcs->host_rsp = vmcs12->host_rsp;
8910 * sync_vmcs12() doesn't read these:
8911 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
8912 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
8913 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
8914 * evmcs->ept_pointer = vmcs12->ept_pointer;
8915 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
8916 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
8917 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
8918 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
8919 * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0;
8920 * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1;
8921 * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2;
8922 * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3;
8923 * evmcs->tpr_threshold = vmcs12->tpr_threshold;
8924 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
8925 * evmcs->exception_bitmap = vmcs12->exception_bitmap;
8926 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
8927 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
8928 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
8929 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
8930 * evmcs->page_fault_error_code_mask =
8931 * vmcs12->page_fault_error_code_mask;
8932 * evmcs->page_fault_error_code_match =
8933 * vmcs12->page_fault_error_code_match;
8934 * evmcs->cr3_target_count = vmcs12->cr3_target_count;
8935 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
8936 * evmcs->tsc_offset = vmcs12->tsc_offset;
8937 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
8938 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
8939 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
8940 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
8941 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
8942 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
8943 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
8944 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
8945 *
8946 * Not present in struct vmcs12:
8947 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
8948 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
8949 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
8950 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
8951 */
8952
8953 evmcs->guest_es_selector = vmcs12->guest_es_selector;
8954 evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
8955 evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
8956 evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
8957 evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
8958 evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
8959 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
8960 evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
8961
8962 evmcs->guest_es_limit = vmcs12->guest_es_limit;
8963 evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
8964 evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
8965 evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
8966 evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
8967 evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
8968 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
8969 evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
8970 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
8971 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
8972
8973 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
8974 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
8975 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
8976 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
8977 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
8978 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
8979 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
8980 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
8981
8982 evmcs->guest_es_base = vmcs12->guest_es_base;
8983 evmcs->guest_cs_base = vmcs12->guest_cs_base;
8984 evmcs->guest_ss_base = vmcs12->guest_ss_base;
8985 evmcs->guest_ds_base = vmcs12->guest_ds_base;
8986 evmcs->guest_fs_base = vmcs12->guest_fs_base;
8987 evmcs->guest_gs_base = vmcs12->guest_gs_base;
8988 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
8989 evmcs->guest_tr_base = vmcs12->guest_tr_base;
8990 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
8991 evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
8992
8993 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
8994 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
8995
8996 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
8997 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
8998 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
8999 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
9000
9001 evmcs->guest_pending_dbg_exceptions =
9002 vmcs12->guest_pending_dbg_exceptions;
9003 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
9004 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
9005
9006 evmcs->guest_activity_state = vmcs12->guest_activity_state;
9007 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
9008
9009 evmcs->guest_cr0 = vmcs12->guest_cr0;
9010 evmcs->guest_cr3 = vmcs12->guest_cr3;
9011 evmcs->guest_cr4 = vmcs12->guest_cr4;
9012 evmcs->guest_dr7 = vmcs12->guest_dr7;
9013
9014 evmcs->guest_physical_address = vmcs12->guest_physical_address;
9015
9016 evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
9017 evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
9018 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
9019 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
9020 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
9021 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
9022 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
9023 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
9024
9025 evmcs->exit_qualification = vmcs12->exit_qualification;
9026
9027 evmcs->guest_linear_address = vmcs12->guest_linear_address;
9028 evmcs->guest_rsp = vmcs12->guest_rsp;
9029 evmcs->guest_rflags = vmcs12->guest_rflags;
9030
9031 evmcs->guest_interruptibility_info =
9032 vmcs12->guest_interruptibility_info;
9033 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
9034 evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
9035 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
9036 evmcs->vm_entry_exception_error_code =
9037 vmcs12->vm_entry_exception_error_code;
9038 evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
9039
9040 evmcs->guest_rip = vmcs12->guest_rip;
9041
9042 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
9043
9044 return 0;
9045}
9046
f4160e45
JM
9047/*
9048 * Copy the writable VMCS shadow fields back to the VMCS12, in case
9049 * they have been modified by the L1 guest. Note that the "read-only"
9050 * VM-exit information fields are actually writable if the vCPU is
9051 * configured to support "VMWRITE to any supported field in the VMCS."
9052 */
16f5b903
AG
9053static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
9054{
f4160e45
JM
9055 const u16 *fields[] = {
9056 shadow_read_write_fields,
9057 shadow_read_only_fields
9058 };
9059 const int max_fields[] = {
9060 max_shadow_read_write_fields,
9061 max_shadow_read_only_fields
9062 };
9063 int i, q;
16f5b903
AG
9064 unsigned long field;
9065 u64 field_value;
355f4fb1 9066 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
16f5b903 9067
282da870
JK
9068 preempt_disable();
9069
16f5b903
AG
9070 vmcs_load(shadow_vmcs);
9071
f4160e45
JM
9072 for (q = 0; q < ARRAY_SIZE(fields); q++) {
9073 for (i = 0; i < max_fields[q]; i++) {
9074 field = fields[q][i];
9075 field_value = __vmcs_readl(field);
e2536742 9076 vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value);
f4160e45
JM
9077 }
9078 /*
9079 * Skip the VM-exit information fields if they are read-only.
9080 */
9081 if (!nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
9082 break;
16f5b903
AG
9083 }
9084
9085 vmcs_clear(shadow_vmcs);
9086 vmcs_load(vmx->loaded_vmcs->vmcs);
282da870
JK
9087
9088 preempt_enable();
16f5b903
AG
9089}
9090
c3114420
AG
9091static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
9092{
44900ba6 9093 const u16 *fields[] = {
c2bae893
MK
9094 shadow_read_write_fields,
9095 shadow_read_only_fields
c3114420 9096 };
c2bae893 9097 const int max_fields[] = {
c3114420
AG
9098 max_shadow_read_write_fields,
9099 max_shadow_read_only_fields
9100 };
9101 int i, q;
9102 unsigned long field;
9103 u64 field_value = 0;
355f4fb1 9104 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
c3114420
AG
9105
9106 vmcs_load(shadow_vmcs);
9107
c2bae893 9108 for (q = 0; q < ARRAY_SIZE(fields); q++) {
c3114420
AG
9109 for (i = 0; i < max_fields[q]; i++) {
9110 field = fields[q][i];
e2536742 9111 vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value);
44900ba6 9112 __vmcs_writel(field, field_value);
c3114420
AG
9113 }
9114 }
9115
9116 vmcs_clear(shadow_vmcs);
9117 vmcs_load(vmx->loaded_vmcs->vmcs);
9118}
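/*
 * Added note: with a shadow VMCS in place, L1's VMREAD/VMWRITE to shadowed
 * fields are satisfied by hardware against vmcs01's shadow VMCS and never
 * exit. KVM therefore has to copy shadow -> vmcs12 before consuming vmcs12
 * (copy_shadow_to_vmcs12) and vmcs12 -> shadow before resuming L1
 * (copy_vmcs12_to_shadow); accesses to non-shadowed fields still exit and
 * are handled by handle_vmread()/handle_vmwrite() below.
 */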
9119
49f705c5
NHE
9120static int handle_vmread(struct kvm_vcpu *vcpu)
9121{
9122 unsigned long field;
9123 u64 field_value;
9124 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
9125 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
9126 gva_t gva = 0;
6d894f49 9127 struct vmcs12 *vmcs12;
49f705c5 9128
eb277562 9129 if (!nested_vmx_check_permission(vcpu))
49f705c5
NHE
9130 return 1;
9131
09abb5e3
SC
9132 if (to_vmx(vcpu)->nested.current_vmptr == -1ull)
9133 return nested_vmx_failInvalid(vcpu);
49f705c5 9134
6d894f49
LA
9135 if (!is_guest_mode(vcpu))
9136 vmcs12 = get_vmcs12(vcpu);
9137 else {
9138 /*
9139 * When vmcs->vmcs_link_pointer is -1ull, any VMREAD
9140		 * to a shadowed field sets the ALU flags for VMfailInvalid.
9141 */
09abb5e3
SC
9142 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
9143 return nested_vmx_failInvalid(vcpu);
6d894f49
LA
9144 vmcs12 = get_shadow_vmcs12(vcpu);
9145 }
9146
49f705c5 9147 /* Decode instruction info and find the field to read */
27e6fb5d 9148 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
49f705c5 9149 /* Read the field, zero-extended to a u64 field_value */
09abb5e3
SC
9150 if (vmcs12_read_any(vmcs12, field, &field_value) < 0)
9151 return nested_vmx_failValid(vcpu,
9152 VMXERR_UNSUPPORTED_VMCS_COMPONENT);
9153
49f705c5
NHE
9154 /*
9155 * Now copy part of this value to register or memory, as requested.
9156 * Note that the number of bits actually copied is 32 or 64 depending
9157 * on the guest's mode (32 or 64 bit), not on the given field's length.
9158 */
9159 if (vmx_instruction_info & (1u << 10)) {
27e6fb5d 9160 kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
49f705c5
NHE
9161 field_value);
9162 } else {
9163 if (get_vmx_mem_address(vcpu, exit_qualification,
f9eb4af6 9164 vmx_instruction_info, true, &gva))
49f705c5 9165 return 1;
727ba748 9166 /* _system ok, nested_vmx_check_permission has verified cpl=0 */
ce14e868
PB
9167 kvm_write_guest_virt_system(vcpu, gva, &field_value,
9168 (is_long_mode(vcpu) ? 8 : 4), NULL);
49f705c5
NHE
9169 }
9170
09abb5e3 9171 return nested_vmx_succeed(vcpu);
49f705c5
NHE
9172}
9173
9174
9175static int handle_vmwrite(struct kvm_vcpu *vcpu)
9176{
9177 unsigned long field;
9178 gva_t gva;
74a497fa 9179 struct vcpu_vmx *vmx = to_vmx(vcpu);
49f705c5
NHE
9180 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
9181 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
74a497fa 9182
49f705c5
NHE
9183 /* The value to write might be 32 or 64 bits, depending on L1's long
9184 * mode, and eventually we need to write that into a field of several
9185 * possible lengths. The code below first zero-extends the value to 64
6a6256f9 9186	 * bits (field_value), and then copies only the appropriate number of
49f705c5
NHE
9187 * bits into the vmcs12 field.
9188 */
9189 u64 field_value = 0;
9190 struct x86_exception e;
6d894f49 9191 struct vmcs12 *vmcs12;
49f705c5 9192
eb277562 9193 if (!nested_vmx_check_permission(vcpu))
49f705c5
NHE
9194 return 1;
9195
09abb5e3
SC
9196 if (vmx->nested.current_vmptr == -1ull)
9197 return nested_vmx_failInvalid(vcpu);
eb277562 9198
49f705c5 9199 if (vmx_instruction_info & (1u << 10))
27e6fb5d 9200 field_value = kvm_register_readl(vcpu,
49f705c5
NHE
9201 (((vmx_instruction_info) >> 3) & 0xf));
9202 else {
9203 if (get_vmx_mem_address(vcpu, exit_qualification,
f9eb4af6 9204 vmx_instruction_info, false, &gva))
49f705c5 9205 return 1;
ce14e868
PB
9206 if (kvm_read_guest_virt(vcpu, gva, &field_value,
9207 (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
49f705c5
NHE
9208 kvm_inject_page_fault(vcpu, &e);
9209 return 1;
9210 }
9211 }
9212
9213
27e6fb5d 9214 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
f4160e45
JM
9215 /*
9216 * If the vCPU supports "VMWRITE to any supported field in the
9217 * VMCS," then the "read-only" fields are actually read/write.
9218 */
9219 if (vmcs_field_readonly(field) &&
09abb5e3
SC
9220 !nested_cpu_has_vmwrite_any_field(vcpu))
9221 return nested_vmx_failValid(vcpu,
49f705c5 9222 VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
49f705c5 9223
6d894f49
LA
9224 if (!is_guest_mode(vcpu))
9225 vmcs12 = get_vmcs12(vcpu);
9226 else {
9227 /*
9228 * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
9229		 * to a shadowed field sets the ALU flags for VMfailInvalid.
9230 */
09abb5e3
SC
9231 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
9232 return nested_vmx_failInvalid(vcpu);
6d894f49 9233 vmcs12 = get_shadow_vmcs12(vcpu);
6d894f49
LA
9234 }
9235
09abb5e3
SC
9236 if (vmcs12_write_any(vmcs12, field, field_value) < 0)
9237 return nested_vmx_failValid(vcpu,
9238 VMXERR_UNSUPPORTED_VMCS_COMPONENT);
49f705c5 9239
6d894f49
LA
9240 /*
9241	 * Do not track vmcs12 dirty state when in guest mode,
9242	 * as we actually dirty the shadow vmcs12 instead of vmcs12.
9243 */
9244 if (!is_guest_mode(vcpu)) {
9245 switch (field) {
74a497fa
PB
9246#define SHADOW_FIELD_RW(x) case x:
9247#include "vmx_shadow_fields.h"
6d894f49
LA
9248 /*
9249 * The fields that can be updated by L1 without a vmexit are
9250		 * always updated in the vmcs02; the others go down the slow
9251 * path of prepare_vmcs02.
9252 */
9253 break;
9254 default:
9255 vmx->nested.dirty_vmcs12 = true;
9256 break;
9257 }
74a497fa
PB
9258 }
9259
09abb5e3 9260 return nested_vmx_succeed(vcpu);
49f705c5
NHE
9261}
9262
a8bc284e
JM
9263static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
9264{
9265 vmx->nested.current_vmptr = vmptr;
9266 if (enable_shadow_vmcs) {
9267 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
9268 SECONDARY_EXEC_SHADOW_VMCS);
9269 vmcs_write64(VMCS_LINK_POINTER,
9270 __pa(vmx->vmcs01.shadow_vmcs));
945679e3 9271 vmx->nested.need_vmcs12_sync = true;
a8bc284e 9272 }
74a497fa 9273 vmx->nested.dirty_vmcs12 = true;
a8bc284e
JM
9274}
9275
63846663
NHE
9276/* Emulate the VMPTRLD instruction */
9277static int handle_vmptrld(struct kvm_vcpu *vcpu)
9278{
9279 struct vcpu_vmx *vmx = to_vmx(vcpu);
63846663 9280 gpa_t vmptr;
63846663
NHE
9281
9282 if (!nested_vmx_check_permission(vcpu))
9283 return 1;
9284
cbf71279 9285 if (nested_vmx_get_vmptr(vcpu, &vmptr))
63846663 9286 return 1;
63846663 9287
09abb5e3
SC
9288 if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
9289 return nested_vmx_failValid(vcpu,
9290 VMXERR_VMPTRLD_INVALID_ADDRESS);
cbf71279 9291
09abb5e3
SC
9292 if (vmptr == vmx->nested.vmxon_ptr)
9293 return nested_vmx_failValid(vcpu,
9294 VMXERR_VMPTRLD_VMXON_POINTER);
cbf71279 9295
b8bbab92
VK
9296 /* Forbid normal VMPTRLD if Enlightened version was used */
9297 if (vmx->nested.hv_evmcs)
9298 return 1;
9299
63846663
NHE
9300 if (vmx->nested.current_vmptr != vmptr) {
9301 struct vmcs12 *new_vmcs12;
9302 struct page *page;
5e2f30b7 9303 page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
09abb5e3
SC
9304 if (is_error_page(page))
9305 return nested_vmx_failInvalid(vcpu);
9306
63846663 9307 new_vmcs12 = kmap(page);
392b2f25 9308 if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
fa97d7db
LA
9309 (new_vmcs12->hdr.shadow_vmcs &&
9310 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
63846663 9311 kunmap(page);
53a70daf 9312 kvm_release_page_clean(page);
09abb5e3 9313 return nested_vmx_failValid(vcpu,
63846663 9314 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
63846663 9315 }
63846663 9316
14c07ad8
VK
9317 nested_release_vmcs12(vcpu);
9318
4f2777bc
DM
9319 /*
9320 * Load VMCS12 from guest memory since it is not already
9321 * cached.
9322 */
9f744c59
PB
9323 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
9324 kunmap(page);
53a70daf 9325 kvm_release_page_clean(page);
9f744c59 9326
a8bc284e 9327 set_current_vmptr(vmx, vmptr);
63846663
NHE
9328 }
9329
09abb5e3 9330 return nested_vmx_succeed(vcpu);
63846663
NHE
9331}
9332
b8bbab92
VK
9333/*
9334 * This is the equivalent of the nested hypervisor executing the VMPTRLD
9335 * instruction.
9336 */
8cab6507
VK
9337static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
9338 bool from_launch)
b8bbab92
VK
9339{
9340 struct vcpu_vmx *vmx = to_vmx(vcpu);
9341 struct hv_vp_assist_page assist_page;
9342
9343 if (likely(!vmx->nested.enlightened_vmcs_enabled))
9344 return 1;
9345
9346 if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page)))
9347 return 1;
9348
9349 if (unlikely(!assist_page.enlighten_vmentry))
9350 return 1;
9351
9352 if (unlikely(assist_page.current_nested_vmcs !=
9353 vmx->nested.hv_evmcs_vmptr)) {
9354
9355 if (!vmx->nested.hv_evmcs)
9356 vmx->nested.current_vmptr = -1ull;
9357
9358 nested_release_evmcs(vcpu);
9359
9360 vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page(
9361 vcpu, assist_page.current_nested_vmcs);
9362
9363 if (unlikely(is_error_page(vmx->nested.hv_evmcs_page)))
9364 return 0;
9365
9366 vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page);
9367
9368 if (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION) {
9369 nested_release_evmcs(vcpu);
9370 return 0;
9371 }
9372
9373 vmx->nested.dirty_vmcs12 = true;
9374 /*
9375		 * As we keep L2 state for one guest only, the 'hv_clean_fields' mask
9376		 * can't be used when we switch between guests. Reset it here for
9377 * simplicity.
9378 */
9379 vmx->nested.hv_evmcs->hv_clean_fields &=
9380 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
9381 vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs;
9382
9383 /*
9384 * Unlike normal vmcs12, enlightened vmcs12 is not fully
9385		 * reloaded from the guest's memory (read-only fields, fields not
9386 * present in struct hv_enlightened_vmcs, ...). Make sure there
9387 * are no leftovers.
9388 */
8cab6507
VK
9389 if (from_launch)
9390 memset(vmx->nested.cached_vmcs12, 0,
9391 sizeof(*vmx->nested.cached_vmcs12));
b8bbab92
VK
9392
9393 }
9394 return 1;
9395}
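/*
 * Added note: with enlightened VMCS the guest never executes VMPTRLD;
 * instead the current eVMCS GPA is published in the VP assist page, and the
 * function above maps it (and invalidates any previously mapped eVMCS)
 * whenever that GPA changes, mirroring what handle_vmptrld() does for a
 * regular vmcs12.
 */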
9396
6a4d7550
NHE
9397/* Emulate the VMPTRST instruction */
9398static int handle_vmptrst(struct kvm_vcpu *vcpu)
9399{
0a06d425
SC
9400 unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION);
9401 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
9402 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
6a4d7550 9403 struct x86_exception e;
0a06d425 9404 gva_t gva;
6a4d7550
NHE
9405
9406 if (!nested_vmx_check_permission(vcpu))
9407 return 1;
9408
b8bbab92
VK
9409 if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
9410 return 1;
9411
0a06d425 9412 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva))
6a4d7550 9413 return 1;
727ba748 9414 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
0a06d425
SC
9415 if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
9416 sizeof(gpa_t), &e)) {
6a4d7550
NHE
9417 kvm_inject_page_fault(vcpu, &e);
9418 return 1;
9419 }
09abb5e3 9420 return nested_vmx_succeed(vcpu);
6a4d7550
NHE
9421}
9422
bfd0a56b
NHE
9423/* Emulate the INVEPT instruction */
9424static int handle_invept(struct kvm_vcpu *vcpu)
9425{
b9c237bb 9426 struct vcpu_vmx *vmx = to_vmx(vcpu);
bfd0a56b
NHE
9427 u32 vmx_instruction_info, types;
9428 unsigned long type;
9429 gva_t gva;
9430 struct x86_exception e;
9431 struct {
9432 u64 eptp, gpa;
9433 } operand;
bfd0a56b 9434
6677f3da 9435 if (!(vmx->nested.msrs.secondary_ctls_high &
b9c237bb 9436 SECONDARY_EXEC_ENABLE_EPT) ||
6677f3da 9437 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
bfd0a56b
NHE
9438 kvm_queue_exception(vcpu, UD_VECTOR);
9439 return 1;
9440 }
9441
9442 if (!nested_vmx_check_permission(vcpu))
9443 return 1;
9444
bfd0a56b 9445 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
27e6fb5d 9446 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
bfd0a56b 9447
6677f3da 9448 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
bfd0a56b 9449
09abb5e3
SC
9450 if (type >= 32 || !(types & (1 << type)))
9451 return nested_vmx_failValid(vcpu,
bfd0a56b 9452 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
bfd0a56b
NHE
9453
9454 /* According to the Intel VMX instruction reference, the memory
9455 * operand is read even if it isn't needed (e.g., for type==global)
9456 */
9457 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
f9eb4af6 9458 vmx_instruction_info, false, &gva))
bfd0a56b 9459 return 1;
ce14e868 9460 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
bfd0a56b
NHE
9461 kvm_inject_page_fault(vcpu, &e);
9462 return 1;
9463 }
9464
9465 switch (type) {
bfd0a56b 9466 case VMX_EPT_EXTENT_GLOBAL:
45e11817
BD
9467 /*
9468 * TODO: track mappings and invalidate
9469 * single context requests appropriately
9470 */
9471 case VMX_EPT_EXTENT_CONTEXT:
bfd0a56b 9472 kvm_mmu_sync_roots(vcpu);
77c3913b 9473 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
bfd0a56b
NHE
9474 break;
9475 default:
9476 BUG_ON(1);
9477 break;
9478 }
9479
09abb5e3 9480 return nested_vmx_succeed(vcpu);
bfd0a56b
NHE
9481}
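/*
 * Added note: KVM does not track which shadow EPT roots belong to which
 * guest EPTP, so the single-context INVEPT type above is handled
 * conservatively by falling through to the same root sync + TLB flush used
 * for a global invalidation.
 */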
9482
3d5bdae8
LA
9483static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
9484{
9485 struct vcpu_vmx *vmx = to_vmx(vcpu);
9486
9487 return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
9488}
9489
a642fc30
PM
9490static int handle_invvpid(struct kvm_vcpu *vcpu)
9491{
99b83ac8
WL
9492 struct vcpu_vmx *vmx = to_vmx(vcpu);
9493 u32 vmx_instruction_info;
9494 unsigned long type, types;
9495 gva_t gva;
9496 struct x86_exception e;
40352605
JM
9497 struct {
9498 u64 vpid;
9499 u64 gla;
9500 } operand;
3d5bdae8 9501 u16 vpid02;
99b83ac8 9502
6677f3da 9503 if (!(vmx->nested.msrs.secondary_ctls_high &
99b83ac8 9504 SECONDARY_EXEC_ENABLE_VPID) ||
6677f3da 9505 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
99b83ac8
WL
9506 kvm_queue_exception(vcpu, UD_VECTOR);
9507 return 1;
9508 }
9509
9510 if (!nested_vmx_check_permission(vcpu))
9511 return 1;
9512
9513 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
9514 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
9515
6677f3da 9516 types = (vmx->nested.msrs.vpid_caps &
bcdde302 9517 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
99b83ac8 9518
09abb5e3
SC
9519 if (type >= 32 || !(types & (1 << type)))
9520 return nested_vmx_failValid(vcpu,
99b83ac8 9521 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
99b83ac8
WL
9522
9523	/* According to the Intel VMX instruction reference, the memory
9524 * operand is read even if it isn't needed (e.g., for type==global)
9525 */
9526 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
9527 vmx_instruction_info, false, &gva))
9528 return 1;
ce14e868 9529 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
99b83ac8
WL
9530 kvm_inject_page_fault(vcpu, &e);
9531 return 1;
9532 }
09abb5e3
SC
9533 if (operand.vpid >> 16)
9534 return nested_vmx_failValid(vcpu,
40352605 9535 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
99b83ac8 9536
3d5bdae8 9537 vpid02 = nested_get_vpid02(vcpu);
99b83ac8 9538 switch (type) {
bcdde302 9539 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
cd9a491f 9540 if (!operand.vpid ||
09abb5e3
SC
9541 is_noncanonical_address(operand.gla, vcpu))
9542 return nested_vmx_failValid(vcpu,
40352605 9543 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
3d5bdae8 9544 if (cpu_has_vmx_invvpid_individual_addr()) {
cd9a491f 9545 __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
3d5bdae8 9546 vpid02, operand.gla);
cd9a491f 9547 } else
327c0721 9548 __vmx_flush_tlb(vcpu, vpid02, false);
cd9a491f 9549 break;
ef697a71 9550 case VMX_VPID_EXTENT_SINGLE_CONTEXT:
bcdde302 9551 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
09abb5e3
SC
9552 if (!operand.vpid)
9553 return nested_vmx_failValid(vcpu,
bcdde302 9554 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
327c0721 9555 __vmx_flush_tlb(vcpu, vpid02, false);
bcdde302 9556 break;
99b83ac8 9557 case VMX_VPID_EXTENT_ALL_CONTEXT:
327c0721 9558 __vmx_flush_tlb(vcpu, vpid02, false);
99b83ac8
WL
9559 break;
9560 default:
bcdde302 9561 WARN_ON_ONCE(1);
6affcbed 9562 return kvm_skip_emulated_instruction(vcpu);
99b83ac8
WL
9563 }
9564
09abb5e3 9565 return nested_vmx_succeed(vcpu);
a642fc30
PM
9566}
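/*
 * Added note: L2 actually runs with vpid02 (see nested_get_vpid02() above),
 * so every INVVPID type emulated here is translated into an invalidation of
 * vpid02: either a single-address INVVPID when the CPU supports it, or a
 * full per-VPID flush via __vmx_flush_tlb().
 */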
9567
eb4b248e
JS
9568static int handle_invpcid(struct kvm_vcpu *vcpu)
9569{
9570 u32 vmx_instruction_info;
9571 unsigned long type;
9572 bool pcid_enabled;
9573 gva_t gva;
9574 struct x86_exception e;
b94742c9
JS
9575 unsigned i;
9576 unsigned long roots_to_free = 0;
eb4b248e
JS
9577 struct {
9578 u64 pcid;
9579 u64 gla;
9580 } operand;
9581
9582 if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
9583 kvm_queue_exception(vcpu, UD_VECTOR);
9584 return 1;
9585 }
9586
9587 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
9588 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
9589
9590 if (type > 3) {
9591 kvm_inject_gp(vcpu, 0);
9592 return 1;
9593 }
9594
9595 /* According to the Intel instruction reference, the memory operand
9596 * is read even if it isn't needed (e.g., for type==all)
9597 */
9598 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
9599 vmx_instruction_info, false, &gva))
9600 return 1;
9601
9602 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
9603 kvm_inject_page_fault(vcpu, &e);
9604 return 1;
9605 }
9606
9607 if (operand.pcid >> 12 != 0) {
9608 kvm_inject_gp(vcpu, 0);
9609 return 1;
9610 }
9611
9612 pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
9613
9614 switch (type) {
9615 case INVPCID_TYPE_INDIV_ADDR:
9616 if ((!pcid_enabled && (operand.pcid != 0)) ||
9617 is_noncanonical_address(operand.gla, vcpu)) {
9618 kvm_inject_gp(vcpu, 0);
9619 return 1;
9620 }
9621 kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
9622 return kvm_skip_emulated_instruction(vcpu);
9623
9624 case INVPCID_TYPE_SINGLE_CTXT:
9625 if (!pcid_enabled && (operand.pcid != 0)) {
9626 kvm_inject_gp(vcpu, 0);
9627 return 1;
9628 }
9629
9630 if (kvm_get_active_pcid(vcpu) == operand.pcid) {
9631 kvm_mmu_sync_roots(vcpu);
9632 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
9633 }
9634
b94742c9 9635 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
44dd3ffa 9636 if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3)
b94742c9
JS
9637 == operand.pcid)
9638 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
ade61e28 9639
6a82cd1c 9640 kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
eb4b248e 9641 /*
b94742c9 9642 * If neither the current cr3 nor any of the prev_roots use the
ade61e28
JS
9643 * given PCID, then nothing needs to be done here because a
9644 * resync will happen anyway before switching to any other CR3.
eb4b248e
JS
9645 */
9646
9647 return kvm_skip_emulated_instruction(vcpu);
9648
9649 case INVPCID_TYPE_ALL_NON_GLOBAL:
9650 /*
9651 * Currently, KVM doesn't mark global entries in the shadow
9652 * page tables, so a non-global flush just degenerates to a
9653 * global flush. If needed, we could optimize this later by
9654 * keeping track of global entries in shadow page tables.
9655 */
9656
9657 /* fall-through */
9658 case INVPCID_TYPE_ALL_INCL_GLOBAL:
9659 kvm_mmu_unload(vcpu);
9660 return kvm_skip_emulated_instruction(vcpu);
9661
9662 default:
9663 BUG(); /* We have already checked above that type <= 3 */
9664 }
9665}
9666
843e4330
KH
9667static int handle_pml_full(struct kvm_vcpu *vcpu)
9668{
9669 unsigned long exit_qualification;
9670
9671 trace_kvm_pml_full(vcpu->vcpu_id);
9672
9673 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
9674
9675 /*
9676 * PML buffer FULL happened while executing iret from NMI,
9677 * "blocked by NMI" bit has to be set before next VM entry.
9678 */
9679 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
d02fcf50 9680 enable_vnmi &&
843e4330
KH
9681 (exit_qualification & INTR_INFO_UNBLOCK_NMI))
9682 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
9683 GUEST_INTR_STATE_NMI);
9684
9685 /*
9686 * PML buffer already flushed at beginning of VMEXIT. Nothing to do
9687 * here.., and there's no userspace involvement needed for PML.
9688 */
9689 return 1;
9690}
9691
64672c95
YJ
9692static int handle_preemption_timer(struct kvm_vcpu *vcpu)
9693{
d264ee0c
SC
9694 if (!to_vmx(vcpu)->req_immediate_exit)
9695 kvm_lapic_expired_hv_timer(vcpu);
64672c95
YJ
9696 return 1;
9697}
9698
41ab9372
BD
9699static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
9700{
9701 struct vcpu_vmx *vmx = to_vmx(vcpu);
41ab9372
BD
9702 int maxphyaddr = cpuid_maxphyaddr(vcpu);
9703
9704 /* Check for memory type validity */
bb97a016
DH
9705 switch (address & VMX_EPTP_MT_MASK) {
9706 case VMX_EPTP_MT_UC:
6677f3da 9707 if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))
41ab9372
BD
9708 return false;
9709 break;
bb97a016 9710 case VMX_EPTP_MT_WB:
6677f3da 9711 if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))
41ab9372
BD
9712 return false;
9713 break;
9714 default:
9715 return false;
9716 }
9717
bb97a016
DH
9718 /* only 4 levels page-walk length are valid */
9719 if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)
41ab9372
BD
9720 return false;
9721
9722 /* Reserved bits should not be set */
9723 if (address >> maxphyaddr || ((address >> 7) & 0x1f))
9724 return false;
9725
9726 /* AD, if set, should be supported */
bb97a016 9727 if (address & VMX_EPTP_AD_ENABLE_BIT) {
6677f3da 9728 if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))
41ab9372
BD
9729 return false;
9730 }
9731
9732 return true;
9733}
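/*
 * For reference, the EPTP layout validated above: bits 2:0 hold the
 * memory type (0 = UC, 6 = WB), bits 5:3 the page-walk length minus
 * one (3 for the required 4-level walk), bit 6 the accessed/dirty
 * enable, bits 11:7 are reserved, and the upper bits up to MAXPHYADDR
 * hold the page-aligned physical address of the EPT PML4 table.  A
 * valid EPTP is therefore built roughly as
 *
 *	eptp = pml4_pa | VMX_EPTP_MT_WB | VMX_EPTP_PWL_4;
 *	eptp |= VMX_EPTP_AD_ENABLE_BIT;	// only if AD bits are supported
 */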
9734
9735static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
9736 struct vmcs12 *vmcs12)
9737{
9738 u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
9739 u64 address;
9740 bool accessed_dirty;
9741 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
9742
9743 if (!nested_cpu_has_eptp_switching(vmcs12) ||
9744 !nested_cpu_has_ept(vmcs12))
9745 return 1;
9746
9747 if (index >= VMFUNC_EPTP_ENTRIES)
9748 return 1;
9749
9750
9751 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
9752 &address, index * 8, 8))
9753 return 1;
9754
bb97a016 9755 accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);
41ab9372
BD
9756
9757 /*
9758 * If the (L2) guest does a vmfunc to the currently
9759 * active ept pointer, we don't have to do anything else
9760 */
9761 if (vmcs12->ept_pointer != address) {
9762 if (!valid_ept_address(vcpu, address))
9763 return 1;
9764
9765 kvm_mmu_unload(vcpu);
9766 mmu->ept_ad = accessed_dirty;
36d9594d 9767 mmu->mmu_role.base.ad_disabled = !accessed_dirty;
41ab9372
BD
9768 vmcs12->ept_pointer = address;
9769 /*
 9770 * TODO: Work out the correct approach if the mmu
 9771 * reload fails. Currently, we just let the next
 9772 * reload potentially fail.
 9773 */
9774 kvm_mmu_reload(vcpu);
9775 }
9776
9777 return 0;
9778}
9779
2a499e49
BD
9780static int handle_vmfunc(struct kvm_vcpu *vcpu)
9781{
27c42a1b
BD
9782 struct vcpu_vmx *vmx = to_vmx(vcpu);
9783 struct vmcs12 *vmcs12;
9784 u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
9785
9786 /*
9787 * VMFUNC is only supported for nested guests, but we always enable the
9788 * secondary control for simplicity; for non-nested mode, fake that we
9789 * didn't by injecting #UD.
9790 */
9791 if (!is_guest_mode(vcpu)) {
9792 kvm_queue_exception(vcpu, UD_VECTOR);
9793 return 1;
9794 }
9795
9796 vmcs12 = get_vmcs12(vcpu);
9797 if ((vmcs12->vm_function_control & (1 << function)) == 0)
9798 goto fail;
41ab9372
BD
9799
9800 switch (function) {
9801 case 0:
9802 if (nested_vmx_eptp_switching(vcpu, vmcs12))
9803 goto fail;
9804 break;
9805 default:
9806 goto fail;
9807 }
9808 return kvm_skip_emulated_instruction(vcpu);
27c42a1b
BD
9809
9810fail:
9811 nested_vmx_vmexit(vcpu, vmx->exit_reason,
9812 vmcs_read32(VM_EXIT_INTR_INFO),
9813 vmcs_readl(EXIT_QUALIFICATION));
2a499e49
BD
9814 return 1;
9815}
9816
0b665d30
SC
9817static int handle_encls(struct kvm_vcpu *vcpu)
9818{
9819 /*
9820 * SGX virtualization is not yet supported. There is no software
9821 * enable bit for SGX, so we have to trap ENCLS and inject a #UD
9822 * to prevent the guest from executing ENCLS.
9823 */
9824 kvm_queue_exception(vcpu, UD_VECTOR);
9825 return 1;
9826}
9827
6aa8b732
AK
9828/*
9829 * The exit handlers return 1 if the exit was handled fully and guest execution
9830 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
9831 * to be done to userspace and return 0.
9832 */
772e0318 9833static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
6aa8b732
AK
9834 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
9835 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
988ad74f 9836 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
f08864b4 9837 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
6aa8b732 9838 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
6aa8b732
AK
9839 [EXIT_REASON_CR_ACCESS] = handle_cr,
9840 [EXIT_REASON_DR_ACCESS] = handle_dr,
9841 [EXIT_REASON_CPUID] = handle_cpuid,
9842 [EXIT_REASON_MSR_READ] = handle_rdmsr,
9843 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
9844 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
9845 [EXIT_REASON_HLT] = handle_halt,
ec25d5e6 9846 [EXIT_REASON_INVD] = handle_invd,
a7052897 9847 [EXIT_REASON_INVLPG] = handle_invlpg,
fee84b07 9848 [EXIT_REASON_RDPMC] = handle_rdpmc,
c21415e8 9849 [EXIT_REASON_VMCALL] = handle_vmcall,
27d6c865 9850 [EXIT_REASON_VMCLEAR] = handle_vmclear,
cd232ad0 9851 [EXIT_REASON_VMLAUNCH] = handle_vmlaunch,
63846663 9852 [EXIT_REASON_VMPTRLD] = handle_vmptrld,
6a4d7550 9853 [EXIT_REASON_VMPTRST] = handle_vmptrst,
49f705c5 9854 [EXIT_REASON_VMREAD] = handle_vmread,
cd232ad0 9855 [EXIT_REASON_VMRESUME] = handle_vmresume,
49f705c5 9856 [EXIT_REASON_VMWRITE] = handle_vmwrite,
ec378aee
NHE
9857 [EXIT_REASON_VMOFF] = handle_vmoff,
9858 [EXIT_REASON_VMON] = handle_vmon,
f78e0e2e
SY
9859 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
9860 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
83d4c286 9861 [EXIT_REASON_APIC_WRITE] = handle_apic_write,
c7c9c56c 9862 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
e5edaa01 9863 [EXIT_REASON_WBINVD] = handle_wbinvd,
2acf923e 9864 [EXIT_REASON_XSETBV] = handle_xsetbv,
37817f29 9865 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
a0861c02 9866 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
0367f205
PB
9867 [EXIT_REASON_GDTR_IDTR] = handle_desc,
9868 [EXIT_REASON_LDTR_TR] = handle_desc,
68f89400
MT
9869 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
9870 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
4b8d54f9 9871 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
87c00572 9872 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait,
5f3d45e7 9873 [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,
87c00572 9874 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor,
bfd0a56b 9875 [EXIT_REASON_INVEPT] = handle_invept,
a642fc30 9876 [EXIT_REASON_INVVPID] = handle_invvpid,
45ec368c 9877 [EXIT_REASON_RDRAND] = handle_invalid_op,
75f4fc8d 9878 [EXIT_REASON_RDSEED] = handle_invalid_op,
f53cd63c
WL
9879 [EXIT_REASON_XSAVES] = handle_xsaves,
9880 [EXIT_REASON_XRSTORS] = handle_xrstors,
843e4330 9881 [EXIT_REASON_PML_FULL] = handle_pml_full,
eb4b248e 9882 [EXIT_REASON_INVPCID] = handle_invpcid,
2a499e49 9883 [EXIT_REASON_VMFUNC] = handle_vmfunc,
64672c95 9884 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
0b665d30 9885 [EXIT_REASON_ENCLS] = handle_encls,
6aa8b732
AK
9886};
9887
9888static const int kvm_vmx_max_exit_handlers =
50a3485c 9889 ARRAY_SIZE(kvm_vmx_exit_handlers);
6aa8b732 9890
908a7bdd
JK
9891static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
9892 struct vmcs12 *vmcs12)
9893{
9894 unsigned long exit_qualification;
9895 gpa_t bitmap, last_bitmap;
9896 unsigned int port;
9897 int size;
9898 u8 b;
9899
908a7bdd 9900 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
2f0a6397 9901 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
908a7bdd
JK
9902
9903 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
9904
9905 port = exit_qualification >> 16;
9906 size = (exit_qualification & 7) + 1;
9907
9908 last_bitmap = (gpa_t)-1;
9909 b = -1;
9910
9911 while (size > 0) {
9912 if (port < 0x8000)
9913 bitmap = vmcs12->io_bitmap_a;
9914 else if (port < 0x10000)
9915 bitmap = vmcs12->io_bitmap_b;
9916 else
1d804d07 9917 return true;
908a7bdd
JK
9918 bitmap += (port & 0x7fff) / 8;
9919
9920 if (last_bitmap != bitmap)
54bf36aa 9921 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
1d804d07 9922 return true;
908a7bdd 9923 if (b & (1 << (port & 7)))
1d804d07 9924 return true;
908a7bdd
JK
9925
9926 port++;
9927 size--;
9928 last_bitmap = bitmap;
9929 }
9930
1d804d07 9931 return false;
908a7bdd
JK
9932}
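/*
 * Worked example of the bitmap walk above: a one-byte access to port
 * 0x60 has port < 0x8000, so io_bitmap_a is consulted; the byte offset
 * is (0x60 & 0x7fff) / 8 = 12 and the bit tested is 0x60 & 7 = 0.  If
 * that bit is set in L1's bitmap, the I/O exit is reflected to L1.
 */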
9933
644d711a
NHE
9934/*
 9935 * Return true if we should exit from L2 to L1 to handle an MSR access,
9936 * rather than handle it ourselves in L0. I.e., check whether L1 expressed
9937 * disinterest in the current event (read or write a specific MSR) by using an
9938 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
9939 */
9940static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
9941 struct vmcs12 *vmcs12, u32 exit_reason)
9942{
9943 u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
9944 gpa_t bitmap;
9945
cbd29cb6 9946 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
1d804d07 9947 return true;
644d711a
NHE
9948
9949 /*
9950 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
9951 * for the four combinations of read/write and low/high MSR numbers.
9952 * First we need to figure out which of the four to use:
9953 */
9954 bitmap = vmcs12->msr_bitmap;
9955 if (exit_reason == EXIT_REASON_MSR_WRITE)
9956 bitmap += 2048;
9957 if (msr_index >= 0xc0000000) {
9958 msr_index -= 0xc0000000;
9959 bitmap += 1024;
9960 }
9961
9962 /* Then read the msr_index'th bit from this bitmap: */
9963 if (msr_index < 1024*8) {
9964 unsigned char b;
54bf36aa 9965 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
1d804d07 9966 return true;
644d711a
NHE
9967 return 1 & (b >> (msr_index & 7));
9968 } else
1d804d07 9969 return true; /* let L1 handle the wrong parameter */
644d711a
NHE
9970}
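/*
 * Worked example of the lookup above: a WRMSR to IA32_EFER (0xc0000080)
 * selects the write half (+2048) and, after subtracting 0xc0000000,
 * the high-MSR quarter (+1024), leaving msr_index = 0x80.  The byte
 * read is therefore at offset 2048 + 1024 + 0x80 / 8 = 3088 from
 * vmcs12->msr_bitmap and the bit tested is 0x80 & 7 = 0.
 */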
9971
9972/*
 9973 * Return true if we should exit from L2 to L1 to handle a CR access exit,
9974 * rather than handle it ourselves in L0. I.e., check if L1 wanted to
9975 * intercept (via guest_host_mask etc.) the current event.
9976 */
9977static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
9978 struct vmcs12 *vmcs12)
9979{
9980 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
9981 int cr = exit_qualification & 15;
e1d39b17
JS
9982 int reg;
9983 unsigned long val;
644d711a
NHE
9984
9985 switch ((exit_qualification >> 4) & 3) {
9986 case 0: /* mov to cr */
e1d39b17
JS
9987 reg = (exit_qualification >> 8) & 15;
9988 val = kvm_register_readl(vcpu, reg);
644d711a
NHE
9989 switch (cr) {
9990 case 0:
9991 if (vmcs12->cr0_guest_host_mask &
9992 (val ^ vmcs12->cr0_read_shadow))
1d804d07 9993 return true;
644d711a
NHE
9994 break;
9995 case 3:
9996 if ((vmcs12->cr3_target_count >= 1 &&
9997 vmcs12->cr3_target_value0 == val) ||
9998 (vmcs12->cr3_target_count >= 2 &&
9999 vmcs12->cr3_target_value1 == val) ||
10000 (vmcs12->cr3_target_count >= 3 &&
10001 vmcs12->cr3_target_value2 == val) ||
10002 (vmcs12->cr3_target_count >= 4 &&
10003 vmcs12->cr3_target_value3 == val))
1d804d07 10004 return false;
644d711a 10005 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
1d804d07 10006 return true;
644d711a
NHE
10007 break;
10008 case 4:
10009 if (vmcs12->cr4_guest_host_mask &
10010 (vmcs12->cr4_read_shadow ^ val))
1d804d07 10011 return true;
644d711a
NHE
10012 break;
10013 case 8:
10014 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
1d804d07 10015 return true;
644d711a
NHE
10016 break;
10017 }
10018 break;
10019 case 2: /* clts */
10020 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
10021 (vmcs12->cr0_read_shadow & X86_CR0_TS))
1d804d07 10022 return true;
644d711a
NHE
10023 break;
10024 case 1: /* mov from cr */
10025 switch (cr) {
10026 case 3:
10027 if (vmcs12->cpu_based_vm_exec_control &
10028 CPU_BASED_CR3_STORE_EXITING)
1d804d07 10029 return true;
644d711a
NHE
10030 break;
10031 case 8:
10032 if (vmcs12->cpu_based_vm_exec_control &
10033 CPU_BASED_CR8_STORE_EXITING)
1d804d07 10034 return true;
644d711a
NHE
10035 break;
10036 }
10037 break;
10038 case 3: /* lmsw */
10039 /*
10040 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
10041 * cr0. Other attempted changes are ignored, with no exit.
10042 */
e1d39b17 10043 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
644d711a
NHE
10044 if (vmcs12->cr0_guest_host_mask & 0xe &
10045 (val ^ vmcs12->cr0_read_shadow))
1d804d07 10046 return true;
644d711a
NHE
10047 if ((vmcs12->cr0_guest_host_mask & 0x1) &&
10048 !(vmcs12->cr0_read_shadow & 0x1) &&
10049 (val & 0x1))
1d804d07 10050 return true;
644d711a
NHE
10051 break;
10052 }
1d804d07 10053 return false;
644d711a
NHE
10054}
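/*
 * Illustration of the guest/host mask test used above for CR0/CR4: the
 * exit is reflected to L1 only if the guest changes a bit that L1
 * owns, i.e. if (mask & (new_value ^ read_shadow)) is non-zero.  For
 * example, with cr0_guest_host_mask == X86_CR0_TS and TS clear in the
 * read shadow, a "mov to cr0" that sets TS goes to L1, while writes
 * that leave TS clear stay in L0.
 */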
10055
a7cde481
LA
10056static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
10057 struct vmcs12 *vmcs12, gpa_t bitmap)
10058{
10059 u32 vmx_instruction_info;
10060 unsigned long field;
10061 u8 b;
10062
10063 if (!nested_cpu_has_shadow_vmcs(vmcs12))
10064 return true;
10065
10066 /* Decode instruction info and find the field to access */
10067 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
10068 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
10069
10070 /* Out-of-range fields always cause a VM exit from L2 to L1 */
10071 if (field >> 15)
10072 return true;
10073
10074 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
10075 return true;
10076
10077 return 1 & (b >> (field & 7));
10078}
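/*
 * Worked example of the shadow-VMCS bitmap check above: for a VMREAD
 * of GUEST_RIP (encoding 0x681e), field >> 15 is zero, so the vmread
 * bitmap is consulted at byte 0x681e / 8 = 0xd03, bit 0x681e & 7 = 6.
 * A set bit means the access is reflected to L1 rather than being
 * satisfied from the shadow VMCS.
 */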
10079
644d711a
NHE
10080/*
 10081 * Return true if we should exit from L2 to L1 to handle an exit, or false
 10082 * if we should handle it ourselves in L0 (and then continue L2). Only call
 10083 * this when in is_guest_mode (L2).
10084 */
7313c698 10085static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
644d711a 10086{
644d711a
NHE
10087 u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
10088 struct vcpu_vmx *vmx = to_vmx(vcpu);
10089 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
10090
4f350c6d
JM
10091 if (vmx->nested.nested_run_pending)
10092 return false;
10093
10094 if (unlikely(vmx->fail)) {
10095 pr_info_ratelimited("%s failed vm entry %x\n", __func__,
10096 vmcs_read32(VM_INSTRUCTION_ERROR));
10097 return true;
10098 }
542060ea 10099
c9f04407
DM
10100 /*
10101 * The host physical addresses of some pages of guest memory
de3a0021
JM
10102 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
10103 * Page). The CPU may write to these pages via their host
10104 * physical address while L2 is running, bypassing any
10105 * address-translation-based dirty tracking (e.g. EPT write
10106 * protection).
c9f04407
DM
10107 *
10108 * Mark them dirty on every exit from L2 to prevent them from
10109 * getting out of sync with dirty tracking.
10110 */
10111 nested_mark_vmcs12_pages_dirty(vcpu);
10112
4f350c6d
JM
10113 trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
10114 vmcs_readl(EXIT_QUALIFICATION),
10115 vmx->idt_vectoring_info,
10116 intr_info,
10117 vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
10118 KVM_ISA_VMX);
644d711a
NHE
10119
10120 switch (exit_reason) {
10121 case EXIT_REASON_EXCEPTION_NMI:
ef85b673 10122 if (is_nmi(intr_info))
1d804d07 10123 return false;
644d711a 10124 else if (is_page_fault(intr_info))
52a5c155 10125 return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
6f05485d
JK
10126 else if (is_debug(intr_info) &&
10127 vcpu->guest_debug &
10128 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
10129 return false;
10130 else if (is_breakpoint(intr_info) &&
10131 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
10132 return false;
644d711a
NHE
10133 return vmcs12->exception_bitmap &
10134 (1u << (intr_info & INTR_INFO_VECTOR_MASK));
10135 case EXIT_REASON_EXTERNAL_INTERRUPT:
1d804d07 10136 return false;
644d711a 10137 case EXIT_REASON_TRIPLE_FAULT:
1d804d07 10138 return true;
644d711a 10139 case EXIT_REASON_PENDING_INTERRUPT:
3b656cf7 10140 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
644d711a 10141 case EXIT_REASON_NMI_WINDOW:
3b656cf7 10142 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
644d711a 10143 case EXIT_REASON_TASK_SWITCH:
1d804d07 10144 return true;
644d711a 10145 case EXIT_REASON_CPUID:
1d804d07 10146 return true;
644d711a
NHE
10147 case EXIT_REASON_HLT:
10148 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
10149 case EXIT_REASON_INVD:
1d804d07 10150 return true;
644d711a
NHE
10151 case EXIT_REASON_INVLPG:
10152 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
10153 case EXIT_REASON_RDPMC:
10154 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
a5f46457 10155 case EXIT_REASON_RDRAND:
736fdf72 10156 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
a5f46457 10157 case EXIT_REASON_RDSEED:
736fdf72 10158 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
b3a2a907 10159 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
644d711a 10160 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
a7cde481
LA
10161 case EXIT_REASON_VMREAD:
10162 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
10163 vmcs12->vmread_bitmap);
10164 case EXIT_REASON_VMWRITE:
10165 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
10166 vmcs12->vmwrite_bitmap);
644d711a
NHE
10167 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
10168 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
a7cde481 10169 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
644d711a 10170 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
a642fc30 10171 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
644d711a
NHE
10172 /*
10173 * VMX instructions trap unconditionally. This allows L1 to
10174 * emulate them for its L2 guest, i.e., allows 3-level nesting!
10175 */
1d804d07 10176 return true;
644d711a
NHE
10177 case EXIT_REASON_CR_ACCESS:
10178 return nested_vmx_exit_handled_cr(vcpu, vmcs12);
10179 case EXIT_REASON_DR_ACCESS:
10180 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
10181 case EXIT_REASON_IO_INSTRUCTION:
908a7bdd 10182 return nested_vmx_exit_handled_io(vcpu, vmcs12);
1b07304c
PB
10183 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
10184 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
644d711a
NHE
10185 case EXIT_REASON_MSR_READ:
10186 case EXIT_REASON_MSR_WRITE:
10187 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
10188 case EXIT_REASON_INVALID_STATE:
1d804d07 10189 return true;
644d711a
NHE
10190 case EXIT_REASON_MWAIT_INSTRUCTION:
10191 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
5f3d45e7
MD
10192 case EXIT_REASON_MONITOR_TRAP_FLAG:
10193 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
644d711a
NHE
10194 case EXIT_REASON_MONITOR_INSTRUCTION:
10195 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
10196 case EXIT_REASON_PAUSE_INSTRUCTION:
10197 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
10198 nested_cpu_has2(vmcs12,
10199 SECONDARY_EXEC_PAUSE_LOOP_EXITING);
10200 case EXIT_REASON_MCE_DURING_VMENTRY:
1d804d07 10201 return false;
644d711a 10202 case EXIT_REASON_TPR_BELOW_THRESHOLD:
a7c0b07d 10203 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
644d711a 10204 case EXIT_REASON_APIC_ACCESS:
82f0dd4b 10205 case EXIT_REASON_APIC_WRITE:
608406e2 10206 case EXIT_REASON_EOI_INDUCED:
ab5df31c
JM
10207 /*
10208 * The controls for "virtualize APIC accesses," "APIC-
10209 * register virtualization," and "virtual-interrupt
10210 * delivery" only come from vmcs12.
10211 */
1d804d07 10212 return true;
644d711a 10213 case EXIT_REASON_EPT_VIOLATION:
2b1be677
NHE
10214 /*
10215 * L0 always deals with the EPT violation. If nested EPT is
10216 * used, and the nested mmu code discovers that the address is
10217 * missing in the guest EPT table (EPT12), the EPT violation
10218 * will be injected with nested_ept_inject_page_fault()
10219 */
1d804d07 10220 return false;
644d711a 10221 case EXIT_REASON_EPT_MISCONFIG:
2b1be677
NHE
10222 /*
 10223 * L2 never directly uses L1's EPT, but rather L0's own EPT
 10224 * table (shadow on EPT) or a merged EPT table that L0 built
 10225 * (EPT on EPT). So any problems with the structure of the
 10226 * table are L0's fault.
10227 */
1d804d07 10228 return false;
90a2db6d
PB
10229 case EXIT_REASON_INVPCID:
10230 return
10231 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
10232 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
644d711a
NHE
10233 case EXIT_REASON_WBINVD:
10234 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
10235 case EXIT_REASON_XSETBV:
1d804d07 10236 return true;
81dc01f7
WL
10237 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
10238 /*
10239 * This should never happen, since it is not possible to
10240 * set XSS to a non-zero value---neither in L1 nor in L2.
 10241 * If it were, XSS would have to be checked against
10242 * the XSS exit bitmap in vmcs12.
10243 */
10244 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
55123e3c
WL
10245 case EXIT_REASON_PREEMPTION_TIMER:
10246 return false;
ab007cc9 10247 case EXIT_REASON_PML_FULL:
03efce6f 10248 /* We emulate PML support to L1. */
ab007cc9 10249 return false;
2a499e49
BD
10250 case EXIT_REASON_VMFUNC:
10251 /* VM functions are emulated through L2->L0 vmexits. */
10252 return false;
0b665d30
SC
10253 case EXIT_REASON_ENCLS:
10254 /* SGX is never exposed to L1 */
10255 return false;
644d711a 10256 default:
1d804d07 10257 return true;
644d711a
NHE
10258 }
10259}
10260
7313c698
PB
10261static int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
10262{
10263 u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
10264
10265 /*
10266 * At this point, the exit interruption info in exit_intr_info
10267 * is only valid for EXCEPTION_NMI exits. For EXTERNAL_INTERRUPT
10268 * we need to query the in-kernel LAPIC.
10269 */
10270 WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
10271 if ((exit_intr_info &
10272 (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
10273 (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
10274 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
10275 vmcs12->vm_exit_intr_error_code =
10276 vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
10277 }
10278
10279 nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info,
10280 vmcs_readl(EXIT_QUALIFICATION));
10281 return 1;
10282}
10283
586f9607
AK
10284static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
10285{
10286 *info1 = vmcs_readl(EXIT_QUALIFICATION);
10287 *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
10288}
10289
a3eaa864 10290static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
843e4330 10291{
a3eaa864
KH
10292 if (vmx->pml_pg) {
10293 __free_page(vmx->pml_pg);
10294 vmx->pml_pg = NULL;
10295 }
843e4330
KH
10296}
10297
54bf36aa 10298static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
843e4330 10299{
54bf36aa 10300 struct vcpu_vmx *vmx = to_vmx(vcpu);
843e4330
KH
10301 u64 *pml_buf;
10302 u16 pml_idx;
10303
10304 pml_idx = vmcs_read16(GUEST_PML_INDEX);
10305
10306 /* Do nothing if PML buffer is empty */
10307 if (pml_idx == (PML_ENTITY_NUM - 1))
10308 return;
10309
10310 /* PML index always points to next available PML buffer entity */
10311 if (pml_idx >= PML_ENTITY_NUM)
10312 pml_idx = 0;
10313 else
10314 pml_idx++;
10315
10316 pml_buf = page_address(vmx->pml_pg);
10317 for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
10318 u64 gpa;
10319
10320 gpa = pml_buf[pml_idx];
10321 WARN_ON(gpa & (PAGE_SIZE - 1));
54bf36aa 10322 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
843e4330
KH
10323 }
10324
10325 /* reset PML index */
10326 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
10327}
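/*
 * Worked example of the index handling above: hardware starts
 * GUEST_PML_INDEX at PML_ENTITY_NUM - 1 (511) and decrements it after
 * logging each GPA.  After three writes the index reads back as 508,
 * so pml_buf[509..511] hold valid GPAs and the loop above walks
 * exactly that range before resetting the index to 511.
 */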
10328
10329/*
10330 * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
10331 * Called before reporting dirty_bitmap to userspace.
10332 */
10333static void kvm_flush_pml_buffers(struct kvm *kvm)
10334{
10335 int i;
10336 struct kvm_vcpu *vcpu;
10337 /*
 10338 * We only need to kick each vcpu out of guest mode here, as the PML
 10339 * buffer is flushed at the beginning of every VMEXIT, so only vcpus
 10340 * currently running in guest mode can have unflushed GPAs in their
 10341 * PML buffer.
10342 */
10343 kvm_for_each_vcpu(i, vcpu, kvm)
10344 kvm_vcpu_kick(vcpu);
10345}
10346
4eb64dce
PB
10347static void vmx_dump_sel(char *name, uint32_t sel)
10348{
10349 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
96794e4e 10350 name, vmcs_read16(sel),
4eb64dce
PB
10351 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
10352 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
10353 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
10354}
10355
10356static void vmx_dump_dtsel(char *name, uint32_t limit)
10357{
10358 pr_err("%s limit=0x%08x, base=0x%016lx\n",
10359 name, vmcs_read32(limit),
10360 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
10361}
10362
10363static void dump_vmcs(void)
10364{
10365 u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
10366 u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
10367 u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
10368 u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
10369 u32 secondary_exec_control = 0;
10370 unsigned long cr4 = vmcs_readl(GUEST_CR4);
f3531054 10371 u64 efer = vmcs_read64(GUEST_IA32_EFER);
4eb64dce
PB
10372 int i, n;
10373
10374 if (cpu_has_secondary_exec_ctrls())
10375 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
10376
10377 pr_err("*** Guest State ***\n");
10378 pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
10379 vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
10380 vmcs_readl(CR0_GUEST_HOST_MASK));
10381 pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
10382 cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
10383 pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
10384 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
10385 (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
10386 {
845c5b40
PB
10387 pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
10388 vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
10389 pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
10390 vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
4eb64dce
PB
10391 }
10392 pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
10393 vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
10394 pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
10395 vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
10396 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
10397 vmcs_readl(GUEST_SYSENTER_ESP),
10398 vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
10399 vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);
10400 vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);
10401 vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);
10402 vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);
10403 vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);
10404 vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);
10405 vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
10406 vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
10407 vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
10408 vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
10409 if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
10410 (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
845c5b40
PB
10411 pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
10412 efer, vmcs_read64(GUEST_IA32_PAT));
10413 pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
10414 vmcs_read64(GUEST_IA32_DEBUGCTL),
4eb64dce 10415 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
773e8a04
VK
10416 if (cpu_has_load_perf_global_ctrl &&
10417 vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
845c5b40
PB
10418 pr_err("PerfGlobCtl = 0x%016llx\n",
10419 vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
4eb64dce 10420 if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
845c5b40 10421 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
4eb64dce
PB
10422 pr_err("Interruptibility = %08x ActivityState = %08x\n",
10423 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
10424 vmcs_read32(GUEST_ACTIVITY_STATE));
10425 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
10426 pr_err("InterruptStatus = %04x\n",
10427 vmcs_read16(GUEST_INTR_STATUS));
10428
10429 pr_err("*** Host State ***\n");
10430 pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
10431 vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
10432 pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
10433 vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
10434 vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
10435 vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
10436 vmcs_read16(HOST_TR_SELECTOR));
10437 pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
10438 vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
10439 vmcs_readl(HOST_TR_BASE));
10440 pr_err("GDTBase=%016lx IDTBase=%016lx\n",
10441 vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
10442 pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
10443 vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
10444 vmcs_readl(HOST_CR4));
10445 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
10446 vmcs_readl(HOST_IA32_SYSENTER_ESP),
10447 vmcs_read32(HOST_IA32_SYSENTER_CS),
10448 vmcs_readl(HOST_IA32_SYSENTER_EIP));
10449 if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
845c5b40
PB
10450 pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
10451 vmcs_read64(HOST_IA32_EFER),
10452 vmcs_read64(HOST_IA32_PAT));
773e8a04
VK
10453 if (cpu_has_load_perf_global_ctrl &&
10454 vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
845c5b40
PB
10455 pr_err("PerfGlobCtl = 0x%016llx\n",
10456 vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
4eb64dce
PB
10457
10458 pr_err("*** Control State ***\n");
10459 pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
10460 pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
10461 pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
10462 pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
10463 vmcs_read32(EXCEPTION_BITMAP),
10464 vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
10465 vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
10466 pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
10467 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
10468 vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
10469 vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
10470 pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
10471 vmcs_read32(VM_EXIT_INTR_INFO),
10472 vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
10473 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
10474 pr_err(" reason=%08x qualification=%016lx\n",
10475 vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
10476 pr_err("IDTVectoring: info=%08x errcode=%08x\n",
10477 vmcs_read32(IDT_VECTORING_INFO_FIELD),
10478 vmcs_read32(IDT_VECTORING_ERROR_CODE));
845c5b40 10479 pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
8cfe9866 10480 if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
845c5b40
PB
10481 pr_err("TSC Multiplier = 0x%016llx\n",
10482 vmcs_read64(TSC_MULTIPLIER));
4eb64dce
PB
10483 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
10484 pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
10485 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
10486 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
10487 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
845c5b40 10488 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
4eb64dce
PB
10489 n = vmcs_read32(CR3_TARGET_COUNT);
10490 for (i = 0; i + 1 < n; i += 4)
10491 pr_err("CR3 target%u=%016lx target%u=%016lx\n",
10492 i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
10493 i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
10494 if (i < n)
10495 pr_err("CR3 target%u=%016lx\n",
10496 i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
10497 if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
10498 pr_err("PLE Gap=%08x Window=%08x\n",
10499 vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
10500 if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
10501 pr_err("Virtual processor ID = 0x%04x\n",
10502 vmcs_read16(VIRTUAL_PROCESSOR_ID));
10503}
10504
6aa8b732
AK
10505/*
10506 * The guest has exited. See if we can fix it or if we need userspace
10507 * assistance.
10508 */
851ba692 10509static int vmx_handle_exit(struct kvm_vcpu *vcpu)
6aa8b732 10510{
29bd8a78 10511 struct vcpu_vmx *vmx = to_vmx(vcpu);
a0861c02 10512 u32 exit_reason = vmx->exit_reason;
1155f76a 10513 u32 vectoring_info = vmx->idt_vectoring_info;
29bd8a78 10514
8b89fe1f
PB
10515 trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
10516
843e4330
KH
10517 /*
 10518 * Flush the logged GPAs out of the PML buffer so that dirty_bitmap is
 10519 * up to date. Another benefit: in kvm_vm_ioctl_get_dirty_log, before
 10520 * querying dirty_bitmap we only need to kick all vcpus out of guest
 10521 * mode, because a vcpu in root mode must already have flushed its PML
 10522 * buffer.
10523 */
10524 if (enable_pml)
54bf36aa 10525 vmx_flush_pml_buffer(vcpu);
843e4330 10526
80ced186 10527 /* If guest state is invalid, start emulating */
14168786 10528 if (vmx->emulation_required)
80ced186 10529 return handle_invalid_guest_state(vcpu);
1d5a4d9b 10530
7313c698
PB
10531 if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
10532 return nested_vmx_reflect_vmexit(vcpu, exit_reason);
644d711a 10533
5120702e 10534 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
4eb64dce 10535 dump_vmcs();
5120702e
MG
10536 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
10537 vcpu->run->fail_entry.hardware_entry_failure_reason
10538 = exit_reason;
10539 return 0;
10540 }
10541
29bd8a78 10542 if (unlikely(vmx->fail)) {
851ba692
AK
10543 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
10544 vcpu->run->fail_entry.hardware_entry_failure_reason
29bd8a78
AK
10545 = vmcs_read32(VM_INSTRUCTION_ERROR);
10546 return 0;
10547 }
6aa8b732 10548
b9bf6882
XG
10549 /*
10550 * Note:
 10551 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by a
 10552 * delivery event, since that indicates the guest is accessing MMIO.
 10553 * The vm-exit would be triggered again after returning to the guest,
 10554 * causing an infinite loop.
10555 */
d77c26fc 10556 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
1439442c 10557 (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
60637aac 10558 exit_reason != EXIT_REASON_EPT_VIOLATION &&
b244c9fc 10559 exit_reason != EXIT_REASON_PML_FULL &&
b9bf6882
XG
10560 exit_reason != EXIT_REASON_TASK_SWITCH)) {
10561 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
10562 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
70bcd708 10563 vcpu->run->internal.ndata = 3;
b9bf6882
XG
10564 vcpu->run->internal.data[0] = vectoring_info;
10565 vcpu->run->internal.data[1] = exit_reason;
70bcd708
PB
10566 vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
10567 if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
10568 vcpu->run->internal.ndata++;
10569 vcpu->run->internal.data[3] =
10570 vmcs_read64(GUEST_PHYSICAL_ADDRESS);
10571 }
b9bf6882
XG
10572 return 0;
10573 }
3b86cd99 10574
d02fcf50 10575 if (unlikely(!enable_vnmi &&
8a1b4392
PB
10576 vmx->loaded_vmcs->soft_vnmi_blocked)) {
10577 if (vmx_interrupt_allowed(vcpu)) {
10578 vmx->loaded_vmcs->soft_vnmi_blocked = 0;
10579 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
10580 vcpu->arch.nmi_pending) {
10581 /*
 10582 * This CPU doesn't help us find the end of an
 10583 * NMI-blocked window if the guest runs with IRQs
 10584 * disabled. So we pull the trigger after 1 s of
 10585 * futile waiting, but inform the user about it.
10586 */
10587 printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
10588 "state on VCPU %d after 1 s timeout\n",
10589 __func__, vcpu->vcpu_id);
10590 vmx->loaded_vmcs->soft_vnmi_blocked = 0;
10591 }
10592 }
10593
6aa8b732
AK
10594 if (exit_reason < kvm_vmx_max_exit_handlers
10595 && kvm_vmx_exit_handlers[exit_reason])
851ba692 10596 return kvm_vmx_exit_handlers[exit_reason](vcpu);
6aa8b732 10597 else {
6c6c5e03
RK
10598 vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
10599 exit_reason);
2bc19dc3
MT
10600 kvm_queue_exception(vcpu, UD_VECTOR);
10601 return 1;
6aa8b732 10602 }
6aa8b732
AK
10603}
10604
a47dd5f0
PB
10605/*
10606 * Software based L1D cache flush which is used when microcode providing
10607 * the cache control MSR is not loaded.
10608 *
 10609 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but
 10610 * flushing it requires reading 64 KiB because the replacement algorithm
 10611 * is not exactly LRU. This could be sized at runtime via topology
 10612 * information, but as all relevant affected CPUs have a 32 KiB L1D cache
 10613 * there is no point in doing so.
10614 */
c595ceee 10615static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
a47dd5f0
PB
10616{
10617 int size = PAGE_SIZE << L1D_CACHE_ORDER;
c595ceee
PB
10618
10619 /*
2f055947
TG
 10620 * This code is only executed when the flush mode is 'cond' or
 10621 * 'always'.
c595ceee 10622 */
427362a1 10623 if (static_branch_likely(&vmx_l1d_flush_cond)) {
45b575c0 10624 bool flush_l1d;
5b6ccc6c 10625
379fd0c7 10626 /*
45b575c0
NS
 10627 * Clear the per-vcpu flush bit; it gets set again
 10628 * either from vcpu_run() or from one of the unsafe
 10629 * VMEXIT handlers.
379fd0c7 10630 */
45b575c0 10631 flush_l1d = vcpu->arch.l1tf_flush_l1d;
4c6523ec 10632 vcpu->arch.l1tf_flush_l1d = false;
45b575c0
NS
10633
10634 /*
 10635 * Clear the per-cpu flush bit; it gets set again from
 10636 * the interrupt handlers.
10637 */
10638 flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
10639 kvm_clear_cpu_l1tf_flush_l1d();
10640
5b6ccc6c
NS
10641 if (!flush_l1d)
10642 return;
379fd0c7 10643 }
c595ceee
PB
10644
10645 vcpu->stat.l1d_flush++;
a47dd5f0 10646
3fa045be
PB
10647 if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
10648 wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
10649 return;
10650 }
10651
a47dd5f0
PB
10652 asm volatile(
10653 /* First ensure the pages are in the TLB */
10654 "xorl %%eax, %%eax\n"
10655 ".Lpopulate_tlb:\n\t"
288d152c 10656 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
a47dd5f0
PB
10657 "addl $4096, %%eax\n\t"
10658 "cmpl %%eax, %[size]\n\t"
10659 "jne .Lpopulate_tlb\n\t"
10660 "xorl %%eax, %%eax\n\t"
10661 "cpuid\n\t"
10662 /* Now fill the cache */
10663 "xorl %%eax, %%eax\n"
10664 ".Lfill_cache:\n"
288d152c 10665 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
a47dd5f0
PB
10666 "addl $64, %%eax\n\t"
10667 "cmpl %%eax, %[size]\n\t"
10668 "jne .Lfill_cache\n\t"
10669 "lfence\n"
288d152c 10670 :: [flush_pages] "r" (vmx_l1d_flush_pages),
a47dd5f0
PB
10671 [size] "r" (size)
10672 : "eax", "ebx", "ecx", "edx");
10673}
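/*
 * Rough C equivalent of the assembly above (the asm additionally
 * serializes with CPUID between the passes and finishes with LFENCE):
 *
 *	const char *p = vmx_l1d_flush_pages;
 *	int offset;
 *
 *	for (offset = 0; offset < size; offset += PAGE_SIZE)
 *		(void)READ_ONCE(p[offset]);	// populate the TLB
 *	for (offset = 0; offset < size; offset += 64)
 *		(void)READ_ONCE(p[offset]);	// fill the L1D cache
 */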
10674
95ba8273 10675static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
6e5d865c 10676{
a7c0b07d
WL
10677 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
10678
10679 if (is_guest_mode(vcpu) &&
10680 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
10681 return;
10682
95ba8273 10683 if (irr == -1 || tpr < irr) {
6e5d865c
YS
10684 vmcs_write32(TPR_THRESHOLD, 0);
10685 return;
10686 }
10687
95ba8273 10688 vmcs_write32(TPR_THRESHOLD, irr);
6e5d865c
YS
10689}
10690
8d860bbe 10691static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
8d14695f
YZ
10692{
10693 u32 sec_exec_control;
10694
8d860bbe
JM
10695 if (!lapic_in_kernel(vcpu))
10696 return;
10697
fd6b6d9b
SC
10698 if (!flexpriority_enabled &&
10699 !cpu_has_vmx_virtualize_x2apic_mode())
10700 return;
10701
dccbfcf5
RK
10702 /* Postpone execution until vmcs01 is the current VMCS. */
10703 if (is_guest_mode(vcpu)) {
8d860bbe 10704 to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true;
dccbfcf5
RK
10705 return;
10706 }
10707
8d14695f 10708 sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
8d860bbe
JM
10709 sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
10710 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
8d14695f 10711
8d860bbe
JM
10712 switch (kvm_get_apic_mode(vcpu)) {
10713 case LAPIC_MODE_INVALID:
10714 WARN_ONCE(true, "Invalid local APIC state");
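		/* fall through */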
10715 case LAPIC_MODE_DISABLED:
10716 break;
10717 case LAPIC_MODE_XAPIC:
10718 if (flexpriority_enabled) {
10719 sec_exec_control |=
10720 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
10721 vmx_flush_tlb(vcpu, true);
10722 }
10723 break;
10724 case LAPIC_MODE_X2APIC:
10725 if (cpu_has_vmx_virtualize_x2apic_mode())
10726 sec_exec_control |=
10727 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
10728 break;
8d14695f
YZ
10729 }
10730 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
10731
904e14fb 10732 vmx_update_msr_bitmap(vcpu);
8d14695f
YZ
10733}
10734
38b99173
TC
10735static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
10736{
ab5df31c 10737 if (!is_guest_mode(vcpu)) {
38b99173 10738 vmcs_write64(APIC_ACCESS_ADDR, hpa);
a468f2db 10739 vmx_flush_tlb(vcpu, true);
fb6c8198 10740 }
38b99173
TC
10741}
10742
67c9dddc 10743static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
c7c9c56c
YZ
10744{
10745 u16 status;
10746 u8 old;
10747
67c9dddc
PB
10748 if (max_isr == -1)
10749 max_isr = 0;
c7c9c56c
YZ
10750
10751 status = vmcs_read16(GUEST_INTR_STATUS);
10752 old = status >> 8;
67c9dddc 10753 if (max_isr != old) {
c7c9c56c 10754 status &= 0xff;
67c9dddc 10755 status |= max_isr << 8;
c7c9c56c
YZ
10756 vmcs_write16(GUEST_INTR_STATUS, status);
10757 }
10758}
10759
10760static void vmx_set_rvi(int vector)
10761{
10762 u16 status;
10763 u8 old;
10764
4114c27d
WW
10765 if (vector == -1)
10766 vector = 0;
10767
c7c9c56c
YZ
10768 status = vmcs_read16(GUEST_INTR_STATUS);
10769 old = (u8)status & 0xff;
10770 if ((u8)vector != old) {
10771 status &= ~0xff;
10772 status |= (u8)vector;
10773 vmcs_write16(GUEST_INTR_STATUS, status);
10774 }
10775}
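/*
 * Note on the two helpers above: GUEST_INTR_STATUS is a 16-bit field
 * laid out as (SVI << 8) | RVI.  vmx_hwapic_isr_update() rewrites only
 * the high byte (SVI, the vector being serviced), while vmx_set_rvi()
 * rewrites only the low byte (RVI, the highest-priority requested
 * virtual interrupt).
 */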
10776
10777static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
10778{
963fee16 10779 /*
851c1a18
LA
 10780 * When running L2, updating RVI is only relevant when
 10781 * vmcs12 has virtual-interrupt-delivery enabled.
 10782 * However, that can be enabled only when L1 also
 10783 * intercepts external interrupts; in that case we
 10784 * should not update vmcs02's RVI but instead intercept
 10785 * the interrupt. Therefore, do nothing when running L2.
963fee16 10786 */
851c1a18
LA
10787 if (!is_guest_mode(vcpu))
10788 vmx_set_rvi(max_irr);
c7c9c56c
YZ
10789}
10790
76dfafd5 10791static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
810e6def
PB
10792{
10793 struct vcpu_vmx *vmx = to_vmx(vcpu);
76dfafd5 10794 int max_irr;
f27a85c4 10795 bool max_irr_updated;
810e6def 10796
76dfafd5
PB
10797 WARN_ON(!vcpu->arch.apicv_active);
10798 if (pi_test_on(&vmx->pi_desc)) {
10799 pi_clear_on(&vmx->pi_desc);
10800 /*
10801 * IOMMU can write to PIR.ON, so the barrier matters even on UP.
10802 * But on x86 this is just a compiler barrier anyway.
10803 */
10804 smp_mb__after_atomic();
f27a85c4
LA
10805 max_irr_updated =
10806 kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
10807
10808 /*
10809 * If we are running L2 and L1 has a new pending interrupt
10810 * which can be injected, we should re-evaluate
10811 * what should be done with this new L1 interrupt.
851c1a18
LA
 10812 * If L1 intercepts external interrupts, we should
 10813 * exit from L2 to L1. Otherwise, the interrupt should be
10814 * delivered directly to L2.
f27a85c4 10815 */
851c1a18
LA
10816 if (is_guest_mode(vcpu) && max_irr_updated) {
10817 if (nested_exit_on_intr(vcpu))
10818 kvm_vcpu_exiting_guest_mode(vcpu);
10819 else
10820 kvm_make_request(KVM_REQ_EVENT, vcpu);
10821 }
76dfafd5
PB
10822 } else {
10823 max_irr = kvm_lapic_find_highest_irr(vcpu);
10824 }
10825 vmx_hwapic_irr_update(vcpu, max_irr);
10826 return max_irr;
810e6def
PB
10827}
10828
7e712684
PB
10829static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
10830{
10831 u8 rvi = vmx_get_rvi();
10832 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
10833
10834 return ((rvi & 0xf0) > (vppr & 0xf0));
10835}
10836
6308630b 10837static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
c7c9c56c 10838{
d62caabb 10839 if (!kvm_vcpu_apicv_active(vcpu))
3d81bc7e
YZ
10840 return;
10841
c7c9c56c
YZ
10842 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
10843 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
10844 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
10845 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
10846}
10847
967235d3
PB
10848static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
10849{
10850 struct vcpu_vmx *vmx = to_vmx(vcpu);
10851
10852 pi_clear_on(&vmx->pi_desc);
10853 memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
10854}
10855
51aa01d1 10856static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
cf393f75 10857{
48ae0fb4
JM
10858 u32 exit_intr_info = 0;
10859 u16 basic_exit_reason = (u16)vmx->exit_reason;
00eba012 10860
48ae0fb4
JM
10861 if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
10862 || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI))
00eba012
AK
10863 return;
10864
48ae0fb4
JM
10865 if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
10866 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
10867 vmx->exit_intr_info = exit_intr_info;
a0861c02 10868
1261bfa3
WL
10869 /* if exit due to PF check for async PF */
10870 if (is_page_fault(exit_intr_info))
10871 vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
10872
a0861c02 10873 /* Handle machine checks before interrupts are enabled */
48ae0fb4
JM
10874 if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY ||
10875 is_machine_check(exit_intr_info))
a0861c02
AK
10876 kvm_machine_check();
10877
20f65983 10878 /* We need to handle NMIs before interrupts are enabled */
ef85b673 10879 if (is_nmi(exit_intr_info)) {
dd60d217 10880 kvm_before_interrupt(&vmx->vcpu);
20f65983 10881 asm("int $2");
dd60d217 10882 kvm_after_interrupt(&vmx->vcpu);
ff9d07a0 10883 }
51aa01d1 10884}
20f65983 10885
a547c6db
YZ
10886static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
10887{
10888 u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
10889
a547c6db
YZ
10890 if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
10891 == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
10892 unsigned int vector;
10893 unsigned long entry;
10894 gate_desc *desc;
10895 struct vcpu_vmx *vmx = to_vmx(vcpu);
10896#ifdef CONFIG_X86_64
10897 unsigned long tmp;
10898#endif
10899
10900 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
10901 desc = (gate_desc *)vmx->host_idt_base + vector;
64b163fa 10902 entry = gate_offset(desc);
a547c6db
YZ
10903 asm volatile(
10904#ifdef CONFIG_X86_64
10905 "mov %%" _ASM_SP ", %[sp]\n\t"
10906 "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
10907 "push $%c[ss]\n\t"
10908 "push %[sp]\n\t"
10909#endif
10910 "pushf\n\t"
a547c6db 10911 __ASM_SIZE(push) " $%c[cs]\n\t"
c940a3fb 10912 CALL_NOSPEC
a547c6db
YZ
10913 :
10914#ifdef CONFIG_X86_64
3f62de5f 10915 [sp]"=&r"(tmp),
a547c6db 10916#endif
f5caf621 10917 ASM_CALL_CONSTRAINT
a547c6db 10918 :
c940a3fb 10919 THUNK_TARGET(entry),
a547c6db
YZ
10920 [ss]"i"(__KERNEL_DS),
10921 [cs]"i"(__KERNEL_CS)
10922 );
f2485b3e 10923 }
a547c6db 10924}
c207aee4 10925STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
a547c6db 10926
bc226f07 10927static bool vmx_has_emulated_msr(int index)
6d396b55 10928{
bc226f07
TL
10929 switch (index) {
10930 case MSR_IA32_SMBASE:
10931 /*
10932 * We cannot do SMM unless we can run the guest in big
10933 * real mode.
10934 */
10935 return enable_unrestricted_guest || emulate_invalid_guest_state;
10936 case MSR_AMD64_VIRT_SPEC_CTRL:
10937 /* This is AMD only. */
10938 return false;
10939 default:
10940 return true;
10941 }
6d396b55
PB
10942}
10943
da8999d3
LJ
10944static bool vmx_mpx_supported(void)
10945{
10946 return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
10947 (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
10948}
10949
55412b2e
WL
10950static bool vmx_xsaves_supported(void)
10951{
10952 return vmcs_config.cpu_based_2nd_exec_ctrl &
10953 SECONDARY_EXEC_XSAVES;
10954}
10955
51aa01d1
AK
10956static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
10957{
c5ca8e57 10958 u32 exit_intr_info;
51aa01d1
AK
10959 bool unblock_nmi;
10960 u8 vector;
10961 bool idtv_info_valid;
10962
10963 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
20f65983 10964
d02fcf50 10965 if (enable_vnmi) {
8a1b4392
PB
10966 if (vmx->loaded_vmcs->nmi_known_unmasked)
10967 return;
10968 /*
10969 * Can't use vmx->exit_intr_info since we're not sure what
10970 * the exit reason is.
10971 */
10972 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
10973 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
10974 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
10975 /*
10976 * SDM 3: 27.7.1.2 (September 2008)
10977 * Re-set bit "block by NMI" before VM entry if vmexit caused by
10978 * a guest IRET fault.
10979 * SDM 3: 23.2.2 (September 2008)
10980 * Bit 12 is undefined in any of the following cases:
10981 * If the VM exit sets the valid bit in the IDT-vectoring
10982 * information field.
10983 * If the VM exit is due to a double fault.
10984 */
10985 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
10986 vector != DF_VECTOR && !idtv_info_valid)
10987 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
10988 GUEST_INTR_STATE_NMI);
10989 else
10990 vmx->loaded_vmcs->nmi_known_unmasked =
10991 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
10992 & GUEST_INTR_STATE_NMI);
10993 } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
10994 vmx->loaded_vmcs->vnmi_blocked_time +=
10995 ktime_to_ns(ktime_sub(ktime_get(),
10996 vmx->loaded_vmcs->entry_time));
51aa01d1
AK
10997}
10998
3ab66e8a 10999static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
83422e17
AK
11000 u32 idt_vectoring_info,
11001 int instr_len_field,
11002 int error_code_field)
51aa01d1 11003{
51aa01d1
AK
11004 u8 vector;
11005 int type;
11006 bool idtv_info_valid;
11007
11008 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
668f612f 11009
3ab66e8a
JK
11010 vcpu->arch.nmi_injected = false;
11011 kvm_clear_exception_queue(vcpu);
11012 kvm_clear_interrupt_queue(vcpu);
37b96e98
GN
11013
11014 if (!idtv_info_valid)
11015 return;
11016
3ab66e8a 11017 kvm_make_request(KVM_REQ_EVENT, vcpu);
3842d135 11018
668f612f
AK
11019 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
11020 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
37b96e98 11021
64a7ec06 11022 switch (type) {
37b96e98 11023 case INTR_TYPE_NMI_INTR:
3ab66e8a 11024 vcpu->arch.nmi_injected = true;
668f612f 11025 /*
7b4a25cb 11026 * SDM 3: 27.7.1.2 (September 2008)
37b96e98
GN
 11027 * Clear bit "block by NMI" before VM entry if an NMI
11028 * delivery faulted.
668f612f 11029 */
3ab66e8a 11030 vmx_set_nmi_mask(vcpu, false);
37b96e98 11031 break;
37b96e98 11032 case INTR_TYPE_SOFT_EXCEPTION:
3ab66e8a 11033 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
66fd3f7f
GN
11034 /* fall through */
11035 case INTR_TYPE_HARD_EXCEPTION:
35920a35 11036 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
83422e17 11037 u32 err = vmcs_read32(error_code_field);
851eb667 11038 kvm_requeue_exception_e(vcpu, vector, err);
35920a35 11039 } else
851eb667 11040 kvm_requeue_exception(vcpu, vector);
37b96e98 11041 break;
66fd3f7f 11042 case INTR_TYPE_SOFT_INTR:
3ab66e8a 11043 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
66fd3f7f 11044 /* fall through */
37b96e98 11045 case INTR_TYPE_EXT_INTR:
3ab66e8a 11046 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
37b96e98
GN
11047 break;
11048 default:
11049 break;
f7d9238f 11050 }
cf393f75
AK
11051}
11052
83422e17
AK
11053static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
11054{
3ab66e8a 11055 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
83422e17
AK
11056 VM_EXIT_INSTRUCTION_LEN,
11057 IDT_VECTORING_ERROR_CODE);
11058}
11059
b463a6f7
AK
11060static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
11061{
3ab66e8a 11062 __vmx_complete_interrupts(vcpu,
b463a6f7
AK
11063 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
11064 VM_ENTRY_INSTRUCTION_LEN,
11065 VM_ENTRY_EXCEPTION_ERROR_CODE);
11066
11067 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
11068}
11069
d7cd9796
GN
11070static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
11071{
11072 int i, nr_msrs;
11073 struct perf_guest_switch_msr *msrs;
11074
11075 msrs = perf_guest_get_msrs(&nr_msrs);
11076
11077 if (!msrs)
11078 return;
11079
11080 for (i = 0; i < nr_msrs; i++)
11081 if (msrs[i].host == msrs[i].guest)
11082 clear_atomic_switch_msr(vmx, msrs[i].msr);
11083 else
11084 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
989e3992 11085 msrs[i].host, false);
d7cd9796
GN
11086}
11087
f459a707
SC
11088static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val)
11089{
11090 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val);
11091 if (!vmx->loaded_vmcs->hv_timer_armed)
11092 vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
11093 PIN_BASED_VMX_PREEMPTION_TIMER);
11094 vmx->loaded_vmcs->hv_timer_armed = true;
11095}
11096
11097static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
64672c95
YJ
11098{
11099 struct vcpu_vmx *vmx = to_vmx(vcpu);
11100 u64 tscl;
11101 u32 delta_tsc;
11102
d264ee0c
SC
11103 if (vmx->req_immediate_exit) {
11104 vmx_arm_hv_timer(vmx, 0);
11105 return;
11106 }
11107
f459a707
SC
11108 if (vmx->hv_deadline_tsc != -1) {
11109 tscl = rdtsc();
11110 if (vmx->hv_deadline_tsc > tscl)
11111 /* set_hv_timer ensures the delta fits in 32-bits */
11112 delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
11113 cpu_preemption_timer_multi);
11114 else
11115 delta_tsc = 0;
64672c95 11116
f459a707
SC
11117 vmx_arm_hv_timer(vmx, delta_tsc);
11118 return;
11119 }
64672c95 11120
f459a707
SC
11121 if (vmx->loaded_vmcs->hv_timer_armed)
11122 vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
11123 PIN_BASED_VMX_PREEMPTION_TIMER);
11124 vmx->loaded_vmcs->hv_timer_armed = false;
64672c95
YJ
11125}
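/*
 * Worked example of the deadline conversion above: with the deadline
 * 10,000,000 TSC cycles in the future and cpu_preemption_timer_multi
 * of 5, VMX_PREEMPTION_TIMER_VALUE is programmed to 10,000,000 >> 5 =
 * 312,500, because the preemption timer ticks once every 2^N TSC
 * cycles, N being the rate reported in IA32_VMX_MISC.
 */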
11126
a3b5ba49 11127static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6aa8b732 11128{
a2fa3e9f 11129 struct vcpu_vmx *vmx = to_vmx(vcpu);
773e8a04 11130 unsigned long cr3, cr4, evmcs_rsp;
104f226b 11131
8a1b4392 11132 /* Record the guest's net vcpu time for enforced NMI injections. */
d02fcf50 11133 if (unlikely(!enable_vnmi &&
8a1b4392
PB
11134 vmx->loaded_vmcs->soft_vnmi_blocked))
11135 vmx->loaded_vmcs->entry_time = ktime_get();
11136
104f226b
AK
 11137 /* Don't enter VMX if the guest state is invalid; let the exit handler
 11138 start emulation until we arrive back at a valid state */
14168786 11139 if (vmx->emulation_required)
104f226b
AK
11140 return;
11141
a7653ecd
RK
11142 if (vmx->ple_window_dirty) {
11143 vmx->ple_window_dirty = false;
11144 vmcs_write32(PLE_WINDOW, vmx->ple_window);
11145 }
11146
945679e3 11147 if (vmx->nested.need_vmcs12_sync) {
8cab6507
VK
11148 /*
11149 * hv_evmcs may end up being not mapped after migration (when
11150 * L2 was running), map it here to make sure vmcs12 changes are
11151 * properly reflected.
11152 */
11153 if (vmx->nested.enlightened_vmcs_enabled &&
11154 !vmx->nested.hv_evmcs)
11155 nested_vmx_handle_enlightened_vmptrld(vcpu, false);
11156
945679e3
VK
11157 if (vmx->nested.hv_evmcs) {
11158 copy_vmcs12_to_enlightened(vmx);
11159 /* All fields are clean */
11160 vmx->nested.hv_evmcs->hv_clean_fields |=
11161 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
11162 } else {
11163 copy_vmcs12_to_shadow(vmx);
11164 }
11165 vmx->nested.need_vmcs12_sync = false;
012f83cb
AG
11166 }
11167
104f226b
AK
11168 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
11169 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
11170 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
11171 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
11172
d6e41f11 11173 cr3 = __get_current_cr3_fast();
d7ee039e 11174 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
d6e41f11 11175 vmcs_writel(HOST_CR3, cr3);
d7ee039e 11176 vmx->loaded_vmcs->host_state.cr3 = cr3;
d6e41f11
AL
11177 }
11178
1e02ce4c 11179 cr4 = cr4_read_shadow();
d7ee039e 11180 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
d974baa3 11181 vmcs_writel(HOST_CR4, cr4);
d7ee039e 11182 vmx->loaded_vmcs->host_state.cr4 = cr4;
d974baa3
AL
11183 }
11184
104f226b
AK
11185 /* When single-stepping over STI and MOV SS, we must clear the
11186 * corresponding interruptibility bits in the guest state. Otherwise
 11187 * vmentry fails as it then expects bit 14 (BS) in the pending debug
 11188 * exceptions field to be set, but that's not correct for the guest
 11189 * debugging case. */
11190 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
11191 vmx_set_interrupt_shadow(vcpu, 0);
11192
b9dd21e1
PB
11193 if (static_cpu_has(X86_FEATURE_PKU) &&
11194 kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
11195 vcpu->arch.pkru != vmx->host_pkru)
11196 __write_pkru(vcpu->arch.pkru);
1be0e61c 11197
d7cd9796
GN
11198 atomic_switch_perf_msrs(vmx);
11199
f459a707 11200 vmx_update_hv_timer(vcpu);
64672c95 11201
d28b387f
KA
11202 /*
11203 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
11204 * it's non-zero. Since vmentry is serialising on affected CPUs, there
11205 * is no need to worry about the conditional branch over the wrmsr
11206 * being speculatively taken.
11207 */
ccbcd267 11208 x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
d28b387f 11209
d462b819 11210 vmx->__launched = vmx->loaded_vmcs->launched;
773e8a04
VK
11211
11212 evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
11213 (unsigned long)&current_evmcs->host_rsp : 0;
11214
5b6ccc6c
NS
11215 if (static_branch_unlikely(&vmx_l1d_should_flush))
11216 vmx_l1d_flush(vcpu);
c595ceee 11217
104f226b 11218 asm(
6aa8b732 11219 /* Store host registers */
b188c81f
AK
11220 "push %%" _ASM_DX "; push %%" _ASM_BP ";"
11221 "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
11222 "push %%" _ASM_CX " \n\t"
11223 "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
313dbd49 11224 "je 1f \n\t"
b188c81f 11225 "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
773e8a04
VK
11226 /* Avoid VMWRITE when Enlightened VMCS is in use */
11227 "test %%" _ASM_SI ", %%" _ASM_SI " \n\t"
11228 "jz 2f \n\t"
11229 "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t"
11230 "jmp 1f \n\t"
11231 "2: \n\t"
4b1e5478 11232 __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
313dbd49 11233 "1: \n\t"
d3edefc0 11234 /* Reload cr2 if changed */
b188c81f
AK
11235 "mov %c[cr2](%0), %%" _ASM_AX " \n\t"
11236 "mov %%cr2, %%" _ASM_DX " \n\t"
11237 "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
773e8a04 11238 "je 3f \n\t"
b188c81f 11239 "mov %%" _ASM_AX", %%cr2 \n\t"
773e8a04 11240 "3: \n\t"
6aa8b732 11241 /* Check if vmlaunch or vmresume is needed */
e08aa78a 11242 "cmpl $0, %c[launched](%0) \n\t"
6aa8b732 11243 /* Load guest registers. Don't clobber flags. */
b188c81f
AK
11244 "mov %c[rax](%0), %%" _ASM_AX " \n\t"
11245 "mov %c[rbx](%0), %%" _ASM_BX " \n\t"
11246 "mov %c[rdx](%0), %%" _ASM_DX " \n\t"
11247 "mov %c[rsi](%0), %%" _ASM_SI " \n\t"
11248 "mov %c[rdi](%0), %%" _ASM_DI " \n\t"
11249 "mov %c[rbp](%0), %%" _ASM_BP " \n\t"
05b3e0c2 11250#ifdef CONFIG_X86_64
e08aa78a
AK
11251 "mov %c[r8](%0), %%r8 \n\t"
11252 "mov %c[r9](%0), %%r9 \n\t"
11253 "mov %c[r10](%0), %%r10 \n\t"
11254 "mov %c[r11](%0), %%r11 \n\t"
11255 "mov %c[r12](%0), %%r12 \n\t"
11256 "mov %c[r13](%0), %%r13 \n\t"
11257 "mov %c[r14](%0), %%r14 \n\t"
11258 "mov %c[r15](%0), %%r15 \n\t"
6aa8b732 11259#endif
b188c81f 11260 "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */
c801949d 11261
6aa8b732 11262 /* Enter guest mode */
83287ea4 11263 "jne 1f \n\t"
4b1e5478 11264 __ex("vmlaunch") "\n\t"
83287ea4 11265 "jmp 2f \n\t"
4b1e5478 11266 "1: " __ex("vmresume") "\n\t"
83287ea4 11267 "2: "
6aa8b732 11268 /* Save guest registers, load host registers, keep flags */
b188c81f 11269 "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
40712fae 11270 "pop %0 \n\t"
0cb5b306 11271 "setbe %c[fail](%0)\n\t"
b188c81f
AK
11272 "mov %%" _ASM_AX ", %c[rax](%0) \n\t"
11273 "mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
11274 __ASM_SIZE(pop) " %c[rcx](%0) \n\t"
11275 "mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
11276 "mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
11277 "mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
11278 "mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
05b3e0c2 11279#ifdef CONFIG_X86_64
e08aa78a
AK
11280 "mov %%r8, %c[r8](%0) \n\t"
11281 "mov %%r9, %c[r9](%0) \n\t"
11282 "mov %%r10, %c[r10](%0) \n\t"
11283 "mov %%r11, %c[r11](%0) \n\t"
11284 "mov %%r12, %c[r12](%0) \n\t"
11285 "mov %%r13, %c[r13](%0) \n\t"
11286 "mov %%r14, %c[r14](%0) \n\t"
11287 "mov %%r15, %c[r15](%0) \n\t"
43ce76ce
UB
11288 /*
11289 * Clear host registers marked as clobbered to prevent
11290 * speculative use.
11291 */
0cb5b306
JM
11292 "xor %%r8d, %%r8d \n\t"
11293 "xor %%r9d, %%r9d \n\t"
11294 "xor %%r10d, %%r10d \n\t"
11295 "xor %%r11d, %%r11d \n\t"
11296 "xor %%r12d, %%r12d \n\t"
11297 "xor %%r13d, %%r13d \n\t"
11298 "xor %%r14d, %%r14d \n\t"
11299 "xor %%r15d, %%r15d \n\t"
6aa8b732 11300#endif
b188c81f
AK
11301 "mov %%cr2, %%" _ASM_AX " \n\t"
11302 "mov %%" _ASM_AX ", %c[cr2](%0) \n\t"
c801949d 11303
0cb5b306
JM
11304 "xor %%eax, %%eax \n\t"
11305 "xor %%ebx, %%ebx \n\t"
11306 "xor %%esi, %%esi \n\t"
11307 "xor %%edi, %%edi \n\t"
b188c81f 11308 "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
83287ea4
AK
11309 ".pushsection .rodata \n\t"
11310 ".global vmx_return \n\t"
11311 "vmx_return: " _ASM_PTR " 2b \n\t"
11312 ".popsection"
773e8a04 11313 : : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp),
d462b819 11314 [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
e08aa78a 11315 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
313dbd49 11316 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
ad312c7c
ZX
11317 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
11318 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
11319 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
11320 [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
11321 [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
11322 [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
11323 [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
05b3e0c2 11324#ifdef CONFIG_X86_64
ad312c7c
ZX
11325 [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
11326 [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
11327 [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
11328 [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
11329 [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
11330 [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
11331 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
11332 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
6aa8b732 11333#endif
40712fae
AK
11334 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
11335 [wordsize]"i"(sizeof(ulong))
c2036300
LV
11336 : "cc", "memory"
11337#ifdef CONFIG_X86_64
773e8a04 11338 , "rax", "rbx", "rdi"
c2036300 11339 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
b188c81f 11340#else
773e8a04 11341 , "eax", "ebx", "edi"
c2036300
LV
11342#endif
11343 );
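	/*
	 * The HOST_RSP handling at the top of the asm block above is roughly
	 * equivalent to the following C sketch (illustration only, using the
	 * helpers and fields of this file):
	 *
	 *	if (current_rsp != vmx->host_rsp) {
	 *		vmx->host_rsp = current_rsp;
	 *		if (evmcs_rsp)
	 *			*(unsigned long *)evmcs_rsp = current_rsp; // enlightened VMCS path
	 *		else
	 *			vmcs_writel(HOST_RSP, current_rsp);        // VMWRITE path
	 *	}
	 *
	 * Caching host_rsp avoids a VMWRITE on every entry when the stack
	 * pointer has not moved since the last VMLAUNCH/VMRESUME.
	 */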
6aa8b732 11344
d28b387f
KA
11345 /*
11346 * We do not use IBRS in the kernel. If this vCPU has used the
11347 * SPEC_CTRL MSR it may have left it on; save the value and
11348 * turn it off. This is much more efficient than blindly adding
11349 * it to the atomic save/restore list, especially as such a
11350 * "save guest MSRs on vmexit" list doesn't even exist in KVM.
11351 *
11352 * For non-nested case:
11353 * If the L01 MSR bitmap does not intercept the MSR, then we need to
11354 * save it.
11355 *
11356 * For nested case:
11357 * If the L02 MSR bitmap does not intercept the MSR, then we need to
11358 * save it.
11359 */
946fbbc1 11360 if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
ecb586bd 11361 vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
d28b387f 11362
ccbcd267 11363 x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
d28b387f 11364
117cc7a9
DW
11365 /* Eliminate branch target predictions from guest mode */
11366 vmexit_fill_RSB();
11367
773e8a04
VK
11368 /* All fields are clean at this point */
11369 if (static_branch_unlikely(&enable_evmcs))
11370 current_evmcs->hv_clean_fields |=
11371 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
11372
2a7921b7 11373 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
74c55931
WL
11374 if (vmx->host_debugctlmsr)
11375 update_debugctlmsr(vmx->host_debugctlmsr);
2a7921b7 11376
aa67f609
AK
11377#ifndef CONFIG_X86_64
11378 /*
11379 * The sysexit path does not restore ds/es, so we must set them to
11380 * a reasonable value ourselves.
11381 *
6d6095bd
SC
11382 * We can't defer this to vmx_prepare_switch_to_host() since that
11383 * function may be executed in interrupt context, which saves and
11384 * restores segments around it, nullifying its effect.
aa67f609
AK
11385 */
11386 loadsegment(ds, __USER_DS);
11387 loadsegment(es, __USER_DS);
11388#endif
11389
6de4f3ad 11390 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
6de12732 11391 | (1 << VCPU_EXREG_RFLAGS)
aff48baa 11392 | (1 << VCPU_EXREG_PDPTR)
2fb92db1 11393 | (1 << VCPU_EXREG_SEGMENTS)
aff48baa 11394 | (1 << VCPU_EXREG_CR3));
5fdbf976
MT
11395 vcpu->arch.regs_dirty = 0;
11396
1be0e61c
XG
11397 /*
11398 * The eager FPU path is used when PKU is supported and CR4.PKE has been
11399 * switched back on in the host, so it is safe to read the guest's PKRU
11400 * from the current XSAVE state.
11401 */
b9dd21e1
PB
11402 if (static_cpu_has(X86_FEATURE_PKU) &&
11403 kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
11404 vcpu->arch.pkru = __read_pkru();
11405 if (vcpu->arch.pkru != vmx->host_pkru)
1be0e61c 11406 __write_pkru(vmx->host_pkru);
1be0e61c
XG
11407 }
11408
e0b890d3 11409 vmx->nested.nested_run_pending = 0;
b060ca3b
JM
11410 vmx->idt_vectoring_info = 0;
11411
11412 vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
11413 if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
11414 return;
11415
11416 vmx->loaded_vmcs->launched = 1;
11417 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
e0b890d3 11418
51aa01d1
AK
11419 vmx_complete_atomic_exit(vmx);
11420 vmx_recover_nmi_blocking(vmx);
cf393f75 11421 vmx_complete_interrupts(vmx);
6aa8b732 11422}
c207aee4 11423STACK_FRAME_NON_STANDARD(vmx_vcpu_run);
6aa8b732 11424
434a1e94
SC
11425static struct kvm *vmx_vm_alloc(void)
11426{
d1e5b0e9 11427 struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx));
40bbb9d0 11428 return &kvm_vmx->kvm;
434a1e94
SC
11429}
11430
11431static void vmx_vm_free(struct kvm *kvm)
11432{
d1e5b0e9 11433 vfree(to_kvm_vmx(kvm));
434a1e94
SC
11434}
11435
1279a6b1 11436static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
4fa7734c
PB
11437{
11438 struct vcpu_vmx *vmx = to_vmx(vcpu);
11439 int cpu;
11440
1279a6b1 11441 if (vmx->loaded_vmcs == vmcs)
4fa7734c
PB
11442 return;
11443
11444 cpu = get_cpu();
4fa7734c 11445 vmx_vcpu_put(vcpu);
bd9966de 11446 vmx->loaded_vmcs = vmcs;
4fa7734c 11447 vmx_vcpu_load(vcpu, cpu);
4fa7734c 11448 put_cpu();
b7031fd4
SC
11449
11450 vm_entry_controls_reset_shadow(vmx);
11451 vm_exit_controls_reset_shadow(vmx);
11452 vmx_segment_cache_clear(vmx);
4fa7734c
PB
11453}
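/*
 * A note on the sequencing in vmx_switch_vmcs(): get_cpu() disables
 * preemption so the task cannot migrate to another pCPU between
 * vmx_vcpu_put() and vmx_vcpu_load(), which guarantees that the new
 * loaded_vmcs is loaded (VMPTRLD) on the CPU the vCPU keeps running on.
 * The entry/exit control shadows and the segment cache are reset afterwards
 * because their cached values describe the previously loaded VMCS.
 */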
11454
2f1fe811
JM
11455/*
11456 * Ensure that the current vmcs of the logical processor is the
11457 * vmcs01 of the vcpu before calling free_nested().
11458 */
11459static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
11460{
14c07ad8
VK
11461 vcpu_load(vcpu);
11462 vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
11463 free_nested(vcpu);
11464 vcpu_put(vcpu);
2f1fe811
JM
11465}
11466
6aa8b732
AK
11467static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
11468{
fb3f0f51
RR
11469 struct vcpu_vmx *vmx = to_vmx(vcpu);
11470
843e4330 11471 if (enable_pml)
a3eaa864 11472 vmx_destroy_pml_buffer(vmx);
991e7a0e 11473 free_vpid(vmx->vpid);
4fa7734c 11474 leave_guest_mode(vcpu);
2f1fe811 11475 vmx_free_vcpu_nested(vcpu);
4fa7734c 11476 free_loaded_vmcs(vmx->loaded_vmcs);
fb3f0f51
RR
11477 kfree(vmx->guest_msrs);
11478 kvm_vcpu_uninit(vcpu);
a4770347 11479 kmem_cache_free(kvm_vcpu_cache, vmx);
6aa8b732
AK
11480}
11481
fb3f0f51 11482static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
6aa8b732 11483{
fb3f0f51 11484 int err;
c16f862d 11485 struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
904e14fb 11486 unsigned long *msr_bitmap;
15ad7146 11487 int cpu;
6aa8b732 11488
a2fa3e9f 11489 if (!vmx)
fb3f0f51
RR
11490 return ERR_PTR(-ENOMEM);
11491
991e7a0e 11492 vmx->vpid = allocate_vpid();
2384d2b3 11493
fb3f0f51
RR
11494 err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
11495 if (err)
11496 goto free_vcpu;
965b58a5 11497
4e59516a
PF
11498 err = -ENOMEM;
11499
11500 /*
11501 * If PML is turned on, failure on enabling PML just results in failure
11502 * of creating the vcpu, therefore we can simplify PML logic (by
11503 * avoiding dealing with cases, such as enabling PML partially on vcpus
11504 * for the guest, etc.).
11505 */
11506 if (enable_pml) {
11507 vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
11508 if (!vmx->pml_pg)
11509 goto uninit_vcpu;
11510 }
11511
a2fa3e9f 11512 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
03916db9
PB
11513 BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
11514 > PAGE_SIZE);
0123be42 11515
4e59516a
PF
11516 if (!vmx->guest_msrs)
11517 goto free_pml;
965b58a5 11518
f21f165e
PB
11519 err = alloc_loaded_vmcs(&vmx->vmcs01);
11520 if (err < 0)
fb3f0f51 11521 goto free_msrs;
a2fa3e9f 11522
904e14fb
PB
11523 msr_bitmap = vmx->vmcs01.msr_bitmap;
11524 vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
11525 vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
11526 vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
11527 vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
11528 vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
11529 vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
11530 vmx->msr_bitmap_mode = 0;
11531
f21f165e 11532 vmx->loaded_vmcs = &vmx->vmcs01;
15ad7146
AK
11533 cpu = get_cpu();
11534 vmx_vcpu_load(&vmx->vcpu, cpu);
e48672fa 11535 vmx->vcpu.cpu = cpu;
12d79917 11536 vmx_vcpu_setup(vmx);
fb3f0f51 11537 vmx_vcpu_put(&vmx->vcpu);
15ad7146 11538 put_cpu();
35754c98 11539 if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
be6d05cf
JK
11540 err = alloc_apic_access_page(kvm);
11541 if (err)
5e4a0b3c 11542 goto free_vmcs;
a63cb560 11543 }
fb3f0f51 11544
e90008df 11545 if (enable_ept && !enable_unrestricted_guest) {
f51770ed
TC
11546 err = init_rmode_identity_map(kvm);
11547 if (err)
93ea5388 11548 goto free_vmcs;
b927a3ce 11549 }
b7ebfb05 11550
63aff655 11551 if (nested)
6677f3da
PB
11552 nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
11553 kvm_vcpu_apicv_active(&vmx->vcpu));
b9c237bb 11554
705699a1 11555 vmx->nested.posted_intr_nv = -1;
a9d30f33 11556 vmx->nested.current_vmptr = -1ull;
a9d30f33 11557
37e4c997
HZ
11558 vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
11559
31afb2ea
PB
11560 /*
11561 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
11562 * or POSTED_INTR_WAKEUP_VECTOR.
11563 */
11564 vmx->pi_desc.nv = POSTED_INTR_VECTOR;
11565 vmx->pi_desc.sn = 1;
11566
fb3f0f51
RR
11567 return &vmx->vcpu;
11568
11569free_vmcs:
5f3fbc34 11570 free_loaded_vmcs(vmx->loaded_vmcs);
fb3f0f51 11571free_msrs:
fb3f0f51 11572 kfree(vmx->guest_msrs);
4e59516a
PF
11573free_pml:
11574 vmx_destroy_pml_buffer(vmx);
fb3f0f51
RR
11575uninit_vcpu:
11576 kvm_vcpu_uninit(&vmx->vcpu);
11577free_vcpu:
991e7a0e 11578 free_vpid(vmx->vpid);
a4770347 11579 kmem_cache_free(kvm_vcpu_cache, vmx);
fb3f0f51 11580 return ERR_PTR(err);
6aa8b732
AK
11581}
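/*
 * The error labels in vmx_create_vcpu() unwind in the reverse order of the
 * allocations above: free_vmcs releases the loaded VMCS, free_msrs the
 * guest_msrs page, free_pml the PML page, and uninit_vcpu/free_vcpu undo
 * kvm_vcpu_init() and the VPID/cache allocations. Keeping the labels in
 * allocation order is what makes each "goto" safe regardless of how far
 * setup progressed before failing.
 */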
11582
d90a7a0e
JK
11583#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
11584#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
26acfb66 11585
b31c114b
WL
11586static int vmx_vm_init(struct kvm *kvm)
11587{
877ad952
TL
11588 spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
11589
b31c114b
WL
11590 if (!ple_gap)
11591 kvm->arch.pause_in_guest = true;
26acfb66 11592
d90a7a0e
JK
11593 if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
11594 switch (l1tf_mitigation) {
11595 case L1TF_MITIGATION_OFF:
11596 case L1TF_MITIGATION_FLUSH_NOWARN:
11597 /* 'I explicitly don't care' is set */
11598 break;
11599 case L1TF_MITIGATION_FLUSH:
11600 case L1TF_MITIGATION_FLUSH_NOSMT:
11601 case L1TF_MITIGATION_FULL:
11602 /*
11603 * Warn upon starting the first VM in a potentially
11604 * insecure environment.
11605 */
11606 if (cpu_smt_control == CPU_SMT_ENABLED)
11607 pr_warn_once(L1TF_MSG_SMT);
11608 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
11609 pr_warn_once(L1TF_MSG_L1D);
11610 break;
11611 case L1TF_MITIGATION_FULL_FORCE:
11612 /* Flush is enforced */
11613 break;
26acfb66 11614 }
26acfb66 11615 }
b31c114b
WL
11616 return 0;
11617}
11618
002c7f7c
YS
11619static void __init vmx_check_processor_compat(void *rtn)
11620{
11621 struct vmcs_config vmcs_conf;
11622
11623 *(int *)rtn = 0;
11624 if (setup_vmcs_config(&vmcs_conf) < 0)
11625 *(int *)rtn = -EIO;
1389309c 11626 nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, enable_apicv);
002c7f7c
YS
11627 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
11628 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
11629 smp_processor_id());
11630 *(int *)rtn = -EIO;
11631 }
11632}
11633
4b12f0de 11634static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
64d4d521 11635{
b18d5431
XG
11636 u8 cache;
11637 u64 ipat = 0;
4b12f0de 11638
522c68c4 11639 /* For VT-d and EPT combination
606decd6 11640 * 1. MMIO: always map as UC
522c68c4
SY
11641 * 2. EPT with VT-d:
11642 * a. VT-d without snooping control feature: can't guarantee the
606decd6 11643 * result, try to trust guest.
522c68c4
SY
11644 * b. VT-d with snooping control feature: snooping control feature of
11645 * VT-d engine can guarantee the cache correctness. Just set it
11646 * to WB to keep it consistent with the host. So it is the same as item 3.
a19a6d11 11647 * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep it
522c68c4
SY
11648 * consistent with the host MTRRs.
11649 */
606decd6
PB
11650 if (is_mmio) {
11651 cache = MTRR_TYPE_UNCACHABLE;
11652 goto exit;
11653 }
11654
11655 if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
b18d5431
XG
11656 ipat = VMX_EPT_IPAT_BIT;
11657 cache = MTRR_TYPE_WRBACK;
11658 goto exit;
11659 }
11660
11661 if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
11662 ipat = VMX_EPT_IPAT_BIT;
0da029ed 11663 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
fb279950
XG
11664 cache = MTRR_TYPE_WRBACK;
11665 else
11666 cache = MTRR_TYPE_UNCACHABLE;
b18d5431
XG
11667 goto exit;
11668 }
11669
ff53604b 11670 cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
b18d5431
XG
11671
11672exit:
11673 return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
64d4d521
SY
11674}
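/*
 * A worked example of the memory-type encoding returned by vmx_get_mt_mask(),
 * assuming the usual definitions (MTRR_TYPE_WRBACK == 6,
 * MTRR_TYPE_UNCACHABLE == 0, VMX_EPT_MT_EPTE_SHIFT == 3,
 * VMX_EPT_IPAT_BIT == 1 << 6):
 *
 *	MMIO gfn:             (0 << 3) | 0        == 0x00  (UC, IPAT clear)
 *	no non-coherent DMA:  (6 << 3) | (1 << 6) == 0x70  (WB, IPAT set)
 *
 * The low three bits of an EPT leaf entry hold the R/W/X permissions, which
 * is why the memory type starts at bit 3 and the "ignore PAT" flag sits at
 * bit 6.
 */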
11675
17cc3935 11676static int vmx_get_lpage_level(void)
344f414f 11677{
878403b7
SY
11678 if (enable_ept && !cpu_has_vmx_ept_1g_page())
11679 return PT_DIRECTORY_LEVEL;
11680 else
11681 /* Shadow paging, or EPT with 1GB page support */
11682 return PT_PDPE_LEVEL;
344f414f
JR
11683}
11684
feda805f
XG
11685static void vmcs_set_secondary_exec_control(u32 new_ctl)
11686{
11687 /*
11688 * These bits in the secondary execution controls field
11689 * are dynamic; the others are mostly based on the hypervisor
11690 * architecture and the guest's CPUID. Do not touch the
11691 * dynamic bits.
11692 */
11693 u32 mask =
11694 SECONDARY_EXEC_SHADOW_VMCS |
11695 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
0367f205
PB
11696 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
11697 SECONDARY_EXEC_DESC;
feda805f
XG
11698
11699 u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
11700
11701 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
11702 (new_ctl & ~mask) | (cur_ctl & mask));
11703}
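/*
 * The merge in vmcs_set_secondary_exec_control() keeps the dynamic bits as
 * the hardware currently has them and takes everything else from new_ctl.
 * With a dynamic mask M the result is:
 *
 *	result = (new_ctl & ~M) | (cur_ctl & M);
 *
 * so a bit inside M (e.g. SECONDARY_EXEC_SHADOW_VMCS) is preserved from the
 * current VMCS value, while a bit outside M (e.g. SECONDARY_EXEC_ENABLE_EPT)
 * follows the freshly computed control value.
 */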
11704
8322ebbb
DM
11705/*
11706 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
11707 * (indicating "allowed-1") if they are supported in the guest's CPUID.
11708 */
11709static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
11710{
11711 struct vcpu_vmx *vmx = to_vmx(vcpu);
11712 struct kvm_cpuid_entry2 *entry;
11713
6677f3da
PB
11714 vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
11715 vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
8322ebbb
DM
11716
11717#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \
11718 if (entry && (entry->_reg & (_cpuid_mask))) \
6677f3da 11719 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \
8322ebbb
DM
11720} while (0)
11721
11722 entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
11723 cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME));
11724 cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME));
11725 cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC));
11726 cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE));
11727 cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE));
11728 cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE));
11729 cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE));
11730 cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE));
11731 cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR));
11732 cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM));
11733 cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX));
11734 cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX));
11735 cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID));
11736 cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE));
11737
11738 entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
11739 cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE));
11740 cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP));
11741 cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP));
11742 cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU));
c4ad77e0 11743 cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP));
8322ebbb
DM
11744
11745#undef cr4_fixed1_update
11746}
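/*
 * Example of the cr4_fixed1_update() expansion above: if the guest's CPUID
 * leaf 7 reports FSGSBASE, the macro ORs X86_CR4_FSGSBASE into
 * nested.msrs.cr4_fixed1, i.e. L1 is told it may set CR4.FSGSBASE while VMX
 * is on. Bits whose feature is absent from the guest CPUID stay fixed to 0;
 * only X86_CR4_PCE (and all of cr0_fixed1) starts out as allowed-1.
 */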
11747
5f76f6f5
LA
11748static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
11749{
11750 struct vcpu_vmx *vmx = to_vmx(vcpu);
11751
11752 if (kvm_mpx_supported()) {
11753 bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);
11754
11755 if (mpx_enabled) {
11756 vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
11757 vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
11758 } else {
11759 vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
11760 vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
11761 }
11762 }
11763}
11764
0e851880
SY
11765static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
11766{
4e47c7a6 11767 struct vcpu_vmx *vmx = to_vmx(vcpu);
4e47c7a6 11768
80154d77
PB
11769 if (cpu_has_secondary_exec_ctrls()) {
11770 vmx_compute_secondary_exec_control(vmx);
11771 vmcs_set_secondary_exec_control(vmx->secondary_exec_control);
ad756a16 11772 }
8b3e34e4 11773
37e4c997
HZ
11774 if (nested_vmx_allowed(vcpu))
11775 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
11776 FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
11777 else
11778 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
11779 ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
8322ebbb 11780
5f76f6f5 11781 if (nested_vmx_allowed(vcpu)) {
8322ebbb 11782 nested_vmx_cr_fixed1_bits_update(vcpu);
5f76f6f5
LA
11783 nested_vmx_entry_exit_ctls_update(vcpu);
11784 }
0e851880
SY
11785}
11786
d4330ef2
JR
11787static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
11788{
7b8050f5
NHE
11789 if (func == 1 && nested)
11790 entry->ecx |= bit(X86_FEATURE_VMX);
d4330ef2
JR
11791}
11792
25d92081
YZ
11793static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
11794 struct x86_exception *fault)
11795{
533558bc 11796 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
c5f983f6 11797 struct vcpu_vmx *vmx = to_vmx(vcpu);
533558bc 11798 u32 exit_reason;
c5f983f6 11799 unsigned long exit_qualification = vcpu->arch.exit_qualification;
25d92081 11800
c5f983f6
BD
11801 if (vmx->nested.pml_full) {
11802 exit_reason = EXIT_REASON_PML_FULL;
11803 vmx->nested.pml_full = false;
11804 exit_qualification &= INTR_INFO_UNBLOCK_NMI;
11805 } else if (fault->error_code & PFERR_RSVD_MASK)
533558bc 11806 exit_reason = EXIT_REASON_EPT_MISCONFIG;
25d92081 11807 else
533558bc 11808 exit_reason = EXIT_REASON_EPT_VIOLATION;
c5f983f6
BD
11809
11810 nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
25d92081
YZ
11811 vmcs12->guest_physical_address = fault->address;
11812}
11813
995f00a6
PF
11814static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
11815{
bb97a016 11816 return nested_ept_get_cr3(vcpu) & VMX_EPTP_AD_ENABLE_BIT;
995f00a6
PF
11817}
11818
155a97a3
NHE
11819/* Callbacks for nested_ept_init_mmu_context: */
11820
11821static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
11822{
11823 /* return the page table to be shadowed - in our case, EPT12 */
11824 return get_vmcs12(vcpu)->ept_pointer;
11825}
11826
5b8ba41d 11827static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
155a97a3 11828{
ad896af0 11829 WARN_ON(mmu_is_nested(vcpu));
ae1e2d10 11830
14c07ad8 11831 vcpu->arch.mmu = &vcpu->arch.guest_mmu;
ad896af0 11832 kvm_init_shadow_ept_mmu(vcpu,
6677f3da 11833 to_vmx(vcpu)->nested.msrs.ept_caps &
ae1e2d10 11834 VMX_EPT_EXECUTE_ONLY_BIT,
50c28f21
JS
11835 nested_ept_ad_enabled(vcpu),
11836 nested_ept_get_cr3(vcpu));
44dd3ffa
VK
11837 vcpu->arch.mmu->set_cr3 = vmx_set_cr3;
11838 vcpu->arch.mmu->get_cr3 = nested_ept_get_cr3;
11839 vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
3dc773e7 11840 vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;
155a97a3
NHE
11841
11842 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
155a97a3
NHE
11843}
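/*
 * In the nested-EPT setup above, vcpu->arch.mmu points at guest_mmu, which
 * shadows the EPT12 tables supplied by L1 (nested_ept_get_cr3() returns
 * vmcs12->ept_pointer), while vcpu->arch.walk_mmu points at nested_mmu so
 * that L2's own CR3/paging structures can still be walked for gva->gpa
 * translation. nested_ept_uninit_mmu_context() below restores both pointers
 * to root_mmu when L2 is torn down.
 */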
11844
11845static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
11846{
14c07ad8 11847 vcpu->arch.mmu = &vcpu->arch.root_mmu;
44dd3ffa 11848 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
155a97a3
NHE
11849}
11850
19d5f10b
EK
11851static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
11852 u16 error_code)
11853{
11854 bool inequality, bit;
11855
11856 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
11857 inequality =
11858 (error_code & vmcs12->page_fault_error_code_mask) !=
11859 vmcs12->page_fault_error_code_match;
11860 return inequality ^ bit;
11861}
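/*
 * The "inequality ^ bit" combination above decides whether a page fault
 * causes a VM exit to L1. With bit = EB.PF and
 * inequality = ((error_code & MASK) != MATCH):
 *
 *	EB.PF = 1, error code matches  -> 0 ^ 1 = 1  -> exit to L1
 *	EB.PF = 1, error code differs  -> 1 ^ 1 = 0  -> no exit
 *	EB.PF = 0, error code matches  -> 0 ^ 0 = 0  -> no exit
 *	EB.PF = 0, error code differs  -> 1 ^ 0 = 1  -> exit to L1
 *
 * i.e. EB.PF selects whether the mask/match pair whitelists or blacklists
 * the fault.
 */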
11862
feaf0c7d
GN
11863static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
11864 struct x86_exception *fault)
11865{
11866 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
11867
11868 WARN_ON(!is_guest_mode(vcpu));
11869
305d0ab4
WL
11870 if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
11871 !to_vmx(vcpu)->nested.nested_run_pending) {
b96fb439
PB
11872 vmcs12->vm_exit_intr_error_code = fault->error_code;
11873 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
11874 PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
11875 INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
11876 fault->address);
7313c698 11877 } else {
feaf0c7d 11878 kvm_inject_page_fault(vcpu, fault);
7313c698 11879 }
feaf0c7d
GN
11880}
11881
c992384b
PB
11882static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
11883 struct vmcs12 *vmcs12);
6beb7bd5 11884
7f7f1ba3 11885static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
a2bcba50 11886{
7f7f1ba3 11887 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
a2bcba50 11888 struct vcpu_vmx *vmx = to_vmx(vcpu);
5e2f30b7 11889 struct page *page;
6beb7bd5 11890 u64 hpa;
a2bcba50
WL
11891
11892 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
a2bcba50
WL
11893 /*
11894 * Translate L1 physical address to host physical
11895 * address for vmcs02. Keep the page pinned, so this
11896 * physical address remains valid. We keep a reference
11897 * to it so we can release it later.
11898 */
5e2f30b7 11899 if (vmx->nested.apic_access_page) { /* shouldn't happen */
53a70daf 11900 kvm_release_page_dirty(vmx->nested.apic_access_page);
5e2f30b7
DH
11901 vmx->nested.apic_access_page = NULL;
11902 }
11903 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
6beb7bd5
JM
11904 /*
11905 * If translation failed, no matter: This feature asks
11906 * to exit when accessing the given address, and if it
11907 * can never be accessed, this feature won't do
11908 * anything anyway.
11909 */
5e2f30b7
DH
11910 if (!is_error_page(page)) {
11911 vmx->nested.apic_access_page = page;
6beb7bd5
JM
11912 hpa = page_to_phys(vmx->nested.apic_access_page);
11913 vmcs_write64(APIC_ACCESS_ADDR, hpa);
11914 } else {
11915 vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
11916 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
11917 }
a2bcba50 11918 }
a7c0b07d
WL
11919
11920 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
5e2f30b7 11921 if (vmx->nested.virtual_apic_page) { /* shouldn't happen */
53a70daf 11922 kvm_release_page_dirty(vmx->nested.virtual_apic_page);
5e2f30b7
DH
11923 vmx->nested.virtual_apic_page = NULL;
11924 }
11925 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr);
a7c0b07d
WL
11926
11927 /*
6beb7bd5
JM
11928 * If translation failed, VM entry will fail because
11929 * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull.
11930 * Failing the vm entry is _not_ what the processor
11931 * does but it's basically the only possibility we
11932 * have. We could still enter the guest if CR8 load
11933 * exits are enabled, CR8 store exits are enabled, and
11934 * virtualize APIC access is disabled; in this case
11935 * the processor would never use the TPR shadow and we
11936 * could simply clear the bit from the execution
11937 * control. But such a configuration is useless, so
11938 * let's keep the code simple.
a7c0b07d 11939 */
5e2f30b7
DH
11940 if (!is_error_page(page)) {
11941 vmx->nested.virtual_apic_page = page;
6beb7bd5
JM
11942 hpa = page_to_phys(vmx->nested.virtual_apic_page);
11943 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
11944 }
a7c0b07d
WL
11945 }
11946
705699a1 11947 if (nested_cpu_has_posted_intr(vmcs12)) {
705699a1
WV
11948 if (vmx->nested.pi_desc_page) { /* shouldn't happen */
11949 kunmap(vmx->nested.pi_desc_page);
53a70daf 11950 kvm_release_page_dirty(vmx->nested.pi_desc_page);
5e2f30b7 11951 vmx->nested.pi_desc_page = NULL;
705699a1 11952 }
5e2f30b7
DH
11953 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr);
11954 if (is_error_page(page))
6beb7bd5 11955 return;
5e2f30b7
DH
11956 vmx->nested.pi_desc_page = page;
11957 vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page);
705699a1
WV
11958 vmx->nested.pi_desc =
11959 (struct pi_desc *)((void *)vmx->nested.pi_desc +
11960 (unsigned long)(vmcs12->posted_intr_desc_addr &
11961 (PAGE_SIZE - 1)));
6beb7bd5
JM
11962 vmcs_write64(POSTED_INTR_DESC_ADDR,
11963 page_to_phys(vmx->nested.pi_desc_page) +
11964 (unsigned long)(vmcs12->posted_intr_desc_addr &
11965 (PAGE_SIZE - 1)));
705699a1 11966 }
d4667ca1 11967 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
3712caeb
KA
11968 vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
11969 CPU_BASED_USE_MSR_BITMAPS);
6beb7bd5
JM
11970 else
11971 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
11972 CPU_BASED_USE_MSR_BITMAPS);
a2bcba50
WL
11973}
11974
f4124500
JK
11975static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
11976{
11977 u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
11978 struct vcpu_vmx *vmx = to_vmx(vcpu);
11979
4c008127
SC
11980 /*
11981 * A timer value of zero is architecturally guaranteed to cause
11982 * a VMExit prior to executing any instructions in the guest.
11983 */
11984 if (preemption_timeout == 0) {
f4124500
JK
11985 vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
11986 return;
11987 }
11988
4c008127
SC
11989 if (vcpu->arch.virtual_tsc_khz == 0)
11990 return;
11991
f4124500
JK
11992 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
11993 preemption_timeout *= 1000000;
11994 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
11995 hrtimer_start(&vmx->nested.preemption_timer,
11996 ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
11997}
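/*
 * Worked example of the conversion above, assuming the emulated preemption
 * timer rate shift (VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE) is 5 and
 * virtual_tsc_khz = 2000000 (a 2 GHz guest TSC):
 *
 *	vmcs12 timer value = 1000
 *	TSC ticks          = 1000 << 5                    = 32000
 *	nanoseconds        = 32000 * 1000000 / 2000000    = 16000 ns
 *
 * so the hrtimer fires after 16 us, at which point the timer callback
 * delivers the emulated preemption-timer exit to L1.
 */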
11998
56a20510
JM
11999static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
12000 struct vmcs12 *vmcs12)
12001{
12002 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
12003 return 0;
12004
12005 if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) ||
12006 !page_address_valid(vcpu, vmcs12->io_bitmap_b))
12007 return -EINVAL;
12008
12009 return 0;
12010}
12011
3af18d9c
WV
12012static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
12013 struct vmcs12 *vmcs12)
12014{
3af18d9c
WV
12015 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
12016 return 0;
12017
5fa99cbe 12018 if (!page_address_valid(vcpu, vmcs12->msr_bitmap))
3af18d9c
WV
12019 return -EINVAL;
12020
12021 return 0;
12022}
12023
712b12d7
JM
12024static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
12025 struct vmcs12 *vmcs12)
12026{
12027 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
12028 return 0;
12029
12030 if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))
12031 return -EINVAL;
12032
12033 return 0;
12034}
12035
3af18d9c
WV
12036/*
12037 * Merge L0's and L1's MSR bitmaps; return false to indicate that
12038 * the hardware MSR bitmap should not be used.
12039 */
c992384b
PB
12040static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
12041 struct vmcs12 *vmcs12)
3af18d9c 12042{
82f0dd4b 12043 int msr;
f2b93280 12044 struct page *page;
d048c098 12045 unsigned long *msr_bitmap_l1;
904e14fb 12046 unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
15d45071 12047 /*
d28b387f 12048 * pred_cmd & spec_ctrl are trying to verify two things:
15d45071
AR
12049 *
12050 * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
12051 * ensures that we do not accidentally generate an L02 MSR bitmap
12052 * from the L12 MSR bitmap that is too permissive.
12053 * 2. That L1 or L2s have actually used the MSR. This avoids
12055 * unnecessarily merging the bitmap if the MSR is unused. This
12055 * works properly because we only update the L01 MSR bitmap lazily.
12056 * So even if L0 should pass L1 these MSRs, the L01 bitmap is only
12057 * updated to reflect this when L1 (or its L2s) actually write to
12058 * the MSR.
12059 */
206587a9
KA
12060 bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
12061 bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL);
f2b93280 12062
c992384b
PB
12063 /* Nothing to do if the MSR bitmap is not in use. */
12064 if (!cpu_has_vmx_msr_bitmap() ||
12065 !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
12066 return false;
12067
15d45071 12068 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
d28b387f 12069 !pred_cmd && !spec_ctrl)
f2b93280
WV
12070 return false;
12071
5e2f30b7
DH
12072 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
12073 if (is_error_page(page))
f2b93280 12074 return false;
f2b93280 12075
c992384b
PB
12076 msr_bitmap_l1 = (unsigned long *)kmap(page);
12077 if (nested_cpu_has_apic_reg_virt(vmcs12)) {
12078 /*
12079 * L0 need not intercept reads for MSRs between 0x800 and 0x8ff; it
12080 * just lets the processor take the value from the virtual-APIC page;
12081 * take those 256 bits directly from the L1 bitmap.
12082 */
12083 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
12084 unsigned word = msr / BITS_PER_LONG;
12085 msr_bitmap_l0[word] = msr_bitmap_l1[word];
12086 msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0;
12087 }
12088 } else {
12089 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
12090 unsigned word = msr / BITS_PER_LONG;
12091 msr_bitmap_l0[word] = ~0;
12092 msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0;
12093 }
12094 }
d048c098 12095
c992384b
PB
12096 nested_vmx_disable_intercept_for_msr(
12097 msr_bitmap_l1, msr_bitmap_l0,
d7231e75 12098 X2APIC_MSR(APIC_TASKPRI),
c992384b 12099 MSR_TYPE_W);
d048c098 12100
c992384b 12101 if (nested_cpu_has_vid(vmcs12)) {
d048c098 12102 nested_vmx_disable_intercept_for_msr(
c992384b 12103 msr_bitmap_l1, msr_bitmap_l0,
d7231e75 12104 X2APIC_MSR(APIC_EOI),
c992384b
PB
12105 MSR_TYPE_W);
12106 nested_vmx_disable_intercept_for_msr(
12107 msr_bitmap_l1, msr_bitmap_l0,
d7231e75 12108 X2APIC_MSR(APIC_SELF_IPI),
c992384b 12109 MSR_TYPE_W);
82f0dd4b 12110 }
15d45071 12111
d28b387f
KA
12112 if (spec_ctrl)
12113 nested_vmx_disable_intercept_for_msr(
12114 msr_bitmap_l1, msr_bitmap_l0,
12115 MSR_IA32_SPEC_CTRL,
12116 MSR_TYPE_R | MSR_TYPE_W);
12117
15d45071
AR
12118 if (pred_cmd)
12119 nested_vmx_disable_intercept_for_msr(
12120 msr_bitmap_l1, msr_bitmap_l0,
12121 MSR_IA32_PRED_CMD,
12122 MSR_TYPE_W);
12123
f2b93280 12124 kunmap(page);
53a70daf 12125 kvm_release_page_clean(page);
f2b93280
WV
12126
12127 return true;
12128}
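/*
 * The word arithmetic above follows the hardware MSR-bitmap layout: the 4K
 * page holds four 1 KB regions - read-low (byte offset 0x000), read-high
 * (0x400), write-low (0x800) and write-high (0xc00). For the x2APIC range,
 * msr = 0x800 gives word = 0x800 / BITS_PER_LONG = 32 on a 64-bit build, so
 * msr_bitmap_l0[32] covers reads of MSRs 0x800-0x83f and
 * msr_bitmap_l0[32 + 0x800 / sizeof(long)] (index 32 + 256) covers the
 * corresponding write-intercept bits.
 */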
12129
61ada748
LA
12130static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
12131 struct vmcs12 *vmcs12)
12132{
12133 struct vmcs12 *shadow;
12134 struct page *page;
12135
12136 if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
12137 vmcs12->vmcs_link_pointer == -1ull)
12138 return;
12139
12140 shadow = get_shadow_vmcs12(vcpu);
12141 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
12142
12143 memcpy(shadow, kmap(page), VMCS12_SIZE);
12144
12145 kunmap(page);
12146 kvm_release_page_clean(page);
12147}
12148
12149static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
12150 struct vmcs12 *vmcs12)
12151{
12152 struct vcpu_vmx *vmx = to_vmx(vcpu);
12153
12154 if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
12155 vmcs12->vmcs_link_pointer == -1ull)
12156 return;
12157
12158 kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
12159 get_shadow_vmcs12(vcpu), VMCS12_SIZE);
12160}
12161
f0f4cf5b
KS
12162static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
12163 struct vmcs12 *vmcs12)
12164{
12165 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
12166 !page_address_valid(vcpu, vmcs12->apic_access_addr))
12167 return -EINVAL;
12168 else
12169 return 0;
12170}
12171
f2b93280
WV
12172static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
12173 struct vmcs12 *vmcs12)
12174{
82f0dd4b 12175 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
608406e2 12176 !nested_cpu_has_apic_reg_virt(vmcs12) &&
705699a1
WV
12177 !nested_cpu_has_vid(vmcs12) &&
12178 !nested_cpu_has_posted_intr(vmcs12))
f2b93280
WV
12179 return 0;
12180
12181 /*
12182 * If virtualize x2apic mode is enabled,
12183 * virtualize apic access must be disabled.
12184 */
82f0dd4b
WV
12185 if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
12186 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
f2b93280
WV
12187 return -EINVAL;
12188
608406e2
WV
12189 /*
12190 * If virtual interrupt delivery is enabled,
12191 * we must exit on external interrupts.
12192 */
12193 if (nested_cpu_has_vid(vmcs12) &&
12194 !nested_exit_on_intr(vcpu))
12195 return -EINVAL;
12196
705699a1
WV
12197 /*
12198 * bits 15:8 should be zero in posted_intr_nv,
12199 * the descriptor address has already been checked
12200 * in nested_get_vmcs12_pages.
6de84e58
KS
12201 *
12202 * bits 5:0 of posted_intr_desc_addr should be zero.
705699a1
WV
12203 */
12204 if (nested_cpu_has_posted_intr(vmcs12) &&
12205 (!nested_cpu_has_vid(vmcs12) ||
12206 !nested_exit_intr_ack_set(vcpu) ||
6de84e58
KS
12207 (vmcs12->posted_intr_nv & 0xff00) ||
12208 (vmcs12->posted_intr_desc_addr & 0x3f) ||
12209 (!page_address_valid(vcpu, vmcs12->posted_intr_desc_addr))))
705699a1
WV
12210 return -EINVAL;
12211
f2b93280
WV
12212 /* tpr shadow is needed by all apicv features. */
12213 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
12214 return -EINVAL;
12215
12216 return 0;
3af18d9c
WV
12217}
12218
e9ac033e
EK
12219static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
12220 unsigned long count_field,
92d71bc6 12221 unsigned long addr_field)
ff651cb6 12222{
e2536742 12223 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
92d71bc6 12224 int maxphyaddr;
e9ac033e
EK
12225 u64 count, addr;
12226
e2536742
LA
12227 if (vmcs12_read_any(vmcs12, count_field, &count) ||
12228 vmcs12_read_any(vmcs12, addr_field, &addr)) {
e9ac033e
EK
12229 WARN_ON(1);
12230 return -EINVAL;
12231 }
12232 if (count == 0)
12233 return 0;
92d71bc6 12234 maxphyaddr = cpuid_maxphyaddr(vcpu);
e9ac033e
EK
12235 if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
12236 (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
bbe41b95 12237 pr_debug_ratelimited(
e9ac033e
EK
12238 "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
12239 addr_field, maxphyaddr, count, addr);
12240 return -EINVAL;
12241 }
12242 return 0;
12243}
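/*
 * Example of the range check above: struct vmx_msr_entry is 16 bytes
 * (index, reserved, value), so with count = 4 and addr = 0x1000 the last
 * byte touched is 0x1000 + 4 * 16 - 1 = 0x103f. The check requires addr to
 * be 16-byte aligned and both the first and last byte to lie below the
 * guest's MAXPHYADDR; otherwise the MSR switch area is rejected with the
 * ratelimited debug message.
 */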
12244
12245static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
12246 struct vmcs12 *vmcs12)
12247{
e9ac033e
EK
12248 if (vmcs12->vm_exit_msr_load_count == 0 &&
12249 vmcs12->vm_exit_msr_store_count == 0 &&
12250 vmcs12->vm_entry_msr_load_count == 0)
12251 return 0; /* Fast path */
e9ac033e 12252 if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
92d71bc6 12253 VM_EXIT_MSR_LOAD_ADDR) ||
e9ac033e 12254 nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
92d71bc6 12255 VM_EXIT_MSR_STORE_ADDR) ||
e9ac033e 12256 nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
92d71bc6 12257 VM_ENTRY_MSR_LOAD_ADDR))
e9ac033e
EK
12258 return -EINVAL;
12259 return 0;
12260}
12261
c5f983f6
BD
12262static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
12263 struct vmcs12 *vmcs12)
12264{
55c1dcd8
KS
12265 if (!nested_cpu_has_pml(vmcs12))
12266 return 0;
c5f983f6 12267
55c1dcd8
KS
12268 if (!nested_cpu_has_ept(vmcs12) ||
12269 !page_address_valid(vcpu, vmcs12->pml_address))
12270 return -EINVAL;
c5f983f6
BD
12271
12272 return 0;
12273}
12274
a8a7c02b
LA
12275static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
12276 struct vmcs12 *vmcs12)
12277{
12278 if (!nested_cpu_has_shadow_vmcs(vmcs12))
12279 return 0;
12280
12281 if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) ||
12282 !page_address_valid(vcpu, vmcs12->vmwrite_bitmap))
12283 return -EINVAL;
12284
12285 return 0;
12286}
12287
e9ac033e
EK
12288static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
12289 struct vmx_msr_entry *e)
12290{
12291 /* x2APIC MSR accesses are not allowed */
8a9781f7 12292 if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)
e9ac033e
EK
12293 return -EINVAL;
12294 if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
12295 e->index == MSR_IA32_UCODE_REV)
12296 return -EINVAL;
12297 if (e->reserved != 0)
ff651cb6
WV
12298 return -EINVAL;
12299 return 0;
12300}
12301
e9ac033e
EK
12302static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
12303 struct vmx_msr_entry *e)
ff651cb6
WV
12304{
12305 if (e->index == MSR_FS_BASE ||
12306 e->index == MSR_GS_BASE ||
e9ac033e
EK
12307 e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
12308 nested_vmx_msr_check_common(vcpu, e))
12309 return -EINVAL;
12310 return 0;
12311}
12312
12313static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
12314 struct vmx_msr_entry *e)
12315{
12316 if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
12317 nested_vmx_msr_check_common(vcpu, e))
ff651cb6
WV
12318 return -EINVAL;
12319 return 0;
12320}
12321
12322/*
12323 * Load the guest's/host's MSRs at nested entry/exit.
12324 * Return 0 on success, or the 1-based index of the failing entry.
12325 */
12326static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
12327{
12328 u32 i;
12329 struct vmx_msr_entry e;
12330 struct msr_data msr;
12331
12332 msr.host_initiated = false;
12333 for (i = 0; i < count; i++) {
54bf36aa
PB
12334 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
12335 &e, sizeof(e))) {
bbe41b95 12336 pr_debug_ratelimited(
e9ac033e
EK
12337 "%s cannot read MSR entry (%u, 0x%08llx)\n",
12338 __func__, i, gpa + i * sizeof(e));
ff651cb6 12339 goto fail;
e9ac033e
EK
12340 }
12341 if (nested_vmx_load_msr_check(vcpu, &e)) {
bbe41b95 12342 pr_debug_ratelimited(
e9ac033e
EK
12343 "%s check failed (%u, 0x%x, 0x%x)\n",
12344 __func__, i, e.index, e.reserved);
12345 goto fail;
12346 }
ff651cb6
WV
12347 msr.index = e.index;
12348 msr.data = e.value;
e9ac033e 12349 if (kvm_set_msr(vcpu, &msr)) {
bbe41b95 12350 pr_debug_ratelimited(
e9ac033e
EK
12351 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
12352 __func__, i, e.index, e.value);
ff651cb6 12353 goto fail;
e9ac033e 12354 }
ff651cb6
WV
12355 }
12356 return 0;
12357fail:
12358 return i + 1;
12359}
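/*
 * Note the return convention of nested_vmx_load_msr(): 0 means every entry
 * was loaded, while a non-zero value is the 1-based index of the entry that
 * failed (i + 1 at the "fail" label). This mirrors how a failed VM-entry
 * MSR load is reported via the exit qualification on real hardware.
 */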
12360
12361static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
12362{
12363 u32 i;
12364 struct vmx_msr_entry e;
12365
12366 for (i = 0; i < count; i++) {
609e36d3 12367 struct msr_data msr_info;
54bf36aa
PB
12368 if (kvm_vcpu_read_guest(vcpu,
12369 gpa + i * sizeof(e),
12370 &e, 2 * sizeof(u32))) {
bbe41b95 12371 pr_debug_ratelimited(
e9ac033e
EK
12372 "%s cannot read MSR entry (%u, 0x%08llx)\n",
12373 __func__, i, gpa + i * sizeof(e));
ff651cb6 12374 return -EINVAL;
e9ac033e
EK
12375 }
12376 if (nested_vmx_store_msr_check(vcpu, &e)) {
bbe41b95 12377 pr_debug_ratelimited(
e9ac033e
EK
12378 "%s check failed (%u, 0x%x, 0x%x)\n",
12379 __func__, i, e.index, e.reserved);
ff651cb6 12380 return -EINVAL;
e9ac033e 12381 }
609e36d3
PB
12382 msr_info.host_initiated = false;
12383 msr_info.index = e.index;
12384 if (kvm_get_msr(vcpu, &msr_info)) {
bbe41b95 12385 pr_debug_ratelimited(
e9ac033e
EK
12386 "%s cannot read MSR (%u, 0x%x)\n",
12387 __func__, i, e.index);
12388 return -EINVAL;
12389 }
54bf36aa
PB
12390 if (kvm_vcpu_write_guest(vcpu,
12391 gpa + i * sizeof(e) +
12392 offsetof(struct vmx_msr_entry, value),
12393 &msr_info.data, sizeof(msr_info.data))) {
bbe41b95 12394 pr_debug_ratelimited(
e9ac033e 12395 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
609e36d3 12396 __func__, i, e.index, msr_info.data);
e9ac033e
EK
12397 return -EINVAL;
12398 }
ff651cb6
WV
12399 }
12400 return 0;
12401}
12402
1dc35dac
LP
12403static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
12404{
12405 unsigned long invalid_mask;
12406
12407 invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
12408 return (val & invalid_mask) == 0;
12409}
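/*
 * Example for nested_cr3_valid(): with cpuid_maxphyaddr(vcpu) == 46 the
 * invalid_mask is ~0ULL << 46, i.e. bits 63:46 must be clear. A CR3 value of
 * 0x0000400000001000 (bit 46 set) is therefore rejected, while any address
 * below 2^46 passes this check (lower reserved bits are validated elsewhere).
 */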
12410
9ed38ffa
LP
12411/*
12412 * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are
12413 * emulating VM entry into a guest with EPT enabled.
12414 * Returns 0 on success, 1 on failure. Invalid state exit qualification code
12415 * is assigned to entry_failure_code on failure.
12416 */
12417static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
ca0bde28 12418 u32 *entry_failure_code)
9ed38ffa 12419{
9ed38ffa 12420 if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
1dc35dac 12421 if (!nested_cr3_valid(vcpu, cr3)) {
9ed38ffa
LP
12422 *entry_failure_code = ENTRY_FAIL_DEFAULT;
12423 return 1;
12424 }
12425
12426 /*
12427 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
12428 * must not be dereferenced.
12429 */
12430 if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) &&
12431 !nested_ept) {
12432 if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) {
12433 *entry_failure_code = ENTRY_FAIL_PDPTE;
12434 return 1;
12435 }
12436 }
9ed38ffa
LP
12437 }
12438
50c28f21 12439 if (!nested_ept)
ade61e28 12440 kvm_mmu_new_cr3(vcpu, cr3, false);
50c28f21
JS
12441
12442 vcpu->arch.cr3 = cr3;
12443 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
12444
12445 kvm_init_mmu(vcpu, false);
12446
9ed38ffa
LP
12447 return 0;
12448}
12449
efebf0aa
LA
12450/*
12451 * Returns whether KVM is able to configure the CPU to tag TLB entries
12452 * populated by L2 differently than TLB entries populated
12453 * by L1.
12454 *
12455 * If L1 uses EPT, then TLB entries are tagged with different EPTP.
12456 *
12457 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
12458 * with different VPID (L1 entries are tagged with vmx->vpid
12459 * while L2 entries are tagged with vmx->nested.vpid02).
12460 */
12461static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
12462{
12463 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
12464
12465 return nested_cpu_has_ept(vmcs12) ||
12466 (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
12467}
12468
3df5c37e
SC
12469static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
12470{
12471 if (vmx->nested.nested_run_pending &&
12472 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
12473 return vmcs12->guest_ia32_efer;
12474 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
12475 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
12476 else
12477 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
12478}
12479
09abe320 12480static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
fe3ef05c 12481{
09abe320 12482 /*
9d6105b2 12483 * If vmcs02 hasn't been initialized, set the constant vmcs02 state
09abe320
SC
12484 * according to L0's settings (vmcs12 is irrelevant here). Host
12485 * fields that come from L0 and are not constant, e.g. HOST_CR3,
12486 * will be set as needed prior to VMLAUNCH/VMRESUME.
12487 */
9d6105b2 12488 if (vmx->nested.vmcs02_initialized)
09abe320 12489 return;
9d6105b2 12490 vmx->nested.vmcs02_initialized = true;
09abe320 12491
52017608
SC
12492 /*
12493 * We don't care what the EPTP value is we just need to guarantee
12494 * it's valid so we don't get a false positive when doing early
12495 * consistency checks.
12496 */
12497 if (enable_ept && nested_early_check)
12498 vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0));
12499
09abe320
SC
12500 /* All VMFUNCs are currently emulated through L0 vmexits. */
12501 if (cpu_has_vmx_vmfunc())
12502 vmcs_write64(VM_FUNCTION_CONTROL, 0);
12503
12504 if (cpu_has_vmx_posted_intr())
12505 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
12506
12507 if (cpu_has_vmx_msr_bitmap())
12508 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
12509
12510 if (enable_pml)
12511 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
12512
12513 /*
12514 * Set the MSR load/store lists to match L0's settings. Only the
12515 * addresses are constant (for vmcs02), the counts can change based
12516 * on L2's behavior, e.g. switching to/from long mode.
12517 */
12518 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
12519 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
12520 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
12521
12522 vmx_set_constant_host_state(vmx);
12523}
12524
12525static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx,
12526 struct vmcs12 *vmcs12)
12527{
12528 prepare_vmcs02_constant_state(vmx);
12529
12530 vmcs_write64(VMCS_LINK_POINTER, -1ull);
12531
12532 if (enable_vpid) {
12533 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
12534 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
12535 else
12536 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
12537 }
12538}
12539
12540static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
12541{
12542 u32 exec_control, vmcs12_exec_ctrl;
12543 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
12544
945679e3 12545 if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
09abe320
SC
12546 prepare_vmcs02_early_full(vmx, vmcs12);
12547
12548 /*
12549 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
12550 * entry, but only if the current (host) sp changed from the value
12551 * we wrote last (vmx->host_rsp). This cache is no longer relevant
12552 * if we switch vmcs, and rather than hold a separate cache per vmcs,
52017608
SC
12553 * here we just force the write to happen on entry. host_rsp will
12554 * also be written unconditionally by nested_vmx_check_vmentry_hw()
12555 * if we are doing early consistency checks via hardware.
09abe320
SC
12556 */
12557 vmx->host_rsp = 0;
12558
12559 /*
12560 * PIN CONTROLS
12561 */
12562 exec_control = vmcs12->pin_based_vm_exec_control;
12563
12564 /* Preemption timer setting is computed directly in vmx_vcpu_run. */
12565 exec_control |= vmcs_config.pin_based_exec_ctrl;
12566 exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
12567 vmx->loaded_vmcs->hv_timer_armed = false;
12568
12569 /* Posted interrupts setting is only taken from vmcs12. */
12570 if (nested_cpu_has_posted_intr(vmcs12)) {
12571 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
12572 vmx->nested.pi_pending = false;
12573 } else {
12574 exec_control &= ~PIN_BASED_POSTED_INTR;
12575 }
12576 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
12577
12578 /*
12579 * EXEC CONTROLS
12580 */
12581 exec_control = vmx_exec_control(vmx); /* L0's desires */
12582 exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
12583 exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
12584 exec_control &= ~CPU_BASED_TPR_SHADOW;
12585 exec_control |= vmcs12->cpu_based_vm_exec_control;
12586
12587 /*
12588 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
12589 * nested_get_vmcs12_pages can't fix it up, the illegal value
12590 * will result in a VM entry failure.
12591 */
12592 if (exec_control & CPU_BASED_TPR_SHADOW) {
12593 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
12594 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
12595 } else {
12596#ifdef CONFIG_X86_64
12597 exec_control |= CPU_BASED_CR8_LOAD_EXITING |
12598 CPU_BASED_CR8_STORE_EXITING;
12599#endif
12600 }
12601
12602 /*
12603 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
12604 * for I/O port accesses.
12605 */
12606 exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
12607 exec_control |= CPU_BASED_UNCOND_IO_EXITING;
12608 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
12609
12610 /*
12611 * SECONDARY EXEC CONTROLS
12612 */
12613 if (cpu_has_secondary_exec_ctrls()) {
12614 exec_control = vmx->secondary_exec_control;
12615
12616 /* Take the following fields only from vmcs12 */
12617 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
12618 SECONDARY_EXEC_ENABLE_INVPCID |
12619 SECONDARY_EXEC_RDTSCP |
12620 SECONDARY_EXEC_XSAVES |
12621 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
12622 SECONDARY_EXEC_APIC_REGISTER_VIRT |
12623 SECONDARY_EXEC_ENABLE_VMFUNC);
12624 if (nested_cpu_has(vmcs12,
12625 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
12626 vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
12627 ~SECONDARY_EXEC_ENABLE_PML;
12628 exec_control |= vmcs12_exec_ctrl;
12629 }
12630
12631 /* VMCS shadowing for L2 is emulated for now */
12632 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
12633
12634 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
12635 vmcs_write16(GUEST_INTR_STATUS,
12636 vmcs12->guest_intr_status);
12637
12638 /*
12639 * Write an illegal value to APIC_ACCESS_ADDR. Later,
12640 * nested_get_vmcs12_pages will either fix it up or
12641 * remove the VM execution control.
12642 */
12643 if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
12644 vmcs_write64(APIC_ACCESS_ADDR, -1ull);
12645
12646 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
12647 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
12648
12649 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
12650 }
12651
12652 /*
12653 * ENTRY CONTROLS
12654 *
12655 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
12656 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
12657 * on the related bits (if supported by the CPU) in the hope that
12658 * we can avoid VMWrites during vmx_set_efer().
12659 */
12660 exec_control = (vmcs12->vm_entry_controls | vmcs_config.vmentry_ctrl) &
12661 ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
12662 if (cpu_has_load_ia32_efer) {
12663 if (guest_efer & EFER_LMA)
12664 exec_control |= VM_ENTRY_IA32E_MODE;
12665 if (guest_efer != host_efer)
12666 exec_control |= VM_ENTRY_LOAD_IA32_EFER;
12667 }
12668 vm_entry_controls_init(vmx, exec_control);
12669
12670 /*
12671 * EXIT CONTROLS
12672 *
12673 * L2->L1 exit controls are emulated - the hardware exit is to L0 so
12674 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
12675 * bits may be modified by vmx_set_efer() in prepare_vmcs02().
12676 */
12677 exec_control = vmcs_config.vmexit_ctrl;
12678 if (cpu_has_load_ia32_efer && guest_efer != host_efer)
12679 exec_control |= VM_EXIT_LOAD_IA32_EFER;
12680 vm_exit_controls_init(vmx, exec_control);
12681
12682 /*
12683 * Conceptually we want to copy the PML address and index from
12684 * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
12685 * since we always flush the log on each vmexit and never change
12686 * the PML address (once set), this happens to be equivalent to
12687 * simply resetting the index in vmcs02.
12688 */
12689 if (enable_pml)
12690 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
12691
12692 /*
12693 * Interrupt/Exception Fields
12694 */
12695 if (vmx->nested.nested_run_pending) {
12696 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
12697 vmcs12->vm_entry_intr_info_field);
12698 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
12699 vmcs12->vm_entry_exception_error_code);
12700 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
12701 vmcs12->vm_entry_instruction_len);
12702 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
12703 vmcs12->guest_interruptibility_info);
12704 vmx->loaded_vmcs->nmi_known_unmasked =
12705 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
12706 } else {
12707 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
12708 }
12709}
fe3ef05c 12710
09abe320
SC
12711static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
12712{
c4ebd629
VK
12713 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
12714
12715 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
12716 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
12717 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
12718 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
12719 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
12720 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
12721 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
12722 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
12723 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
12724 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
12725 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
12726 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
12727 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
12728 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
12729 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
12730 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
12731 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
12732 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
12733 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
12734 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
12735 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
12736 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
12737 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
12738 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
12739 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
12740 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
12741 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
12742 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
12743 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
12744 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
12745 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
12746 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
12747 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
12748 }
12749
12750 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
12751 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
12752 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
12753 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
12754 vmcs12->guest_pending_dbg_exceptions);
12755 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
12756 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
12757
12758 /*
 12759		 * L1 may access L2's PDPTRs, so save them to construct
12760 * vmcs12
12761 */
12762 if (enable_ept) {
12763 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
12764 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
12765 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
12766 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
12767 }
12768 }
25a2e4fe
PB
12769
12770 if (nested_cpu_has_xsaves(vmcs12))
12771 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
25a2e4fe
PB
12772
12773 /*
12774 * Whether page-faults are trapped is determined by a combination of
12775 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
12776 * If enable_ept, L0 doesn't care about page faults and we should
12777 * set all of these to L1's desires. However, if !enable_ept, L0 does
12778 * care about (at least some) page faults, and because it is not easy
12779 * (if at all possible?) to merge L0 and L1's desires, we simply ask
12780 * to exit on each and every L2 page fault. This is done by setting
12781 * MASK=MATCH=0 and (see below) EB.PF=1.
12782 * Note that below we don't need special code to set EB.PF beyond the
12783 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
12784 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
12785 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
12786 */
12787 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
12788 enable_ept ? vmcs12->page_fault_error_code_mask : 0);
12789 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
12790 enable_ept ? vmcs12->page_fault_error_code_match : 0);
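	/*
	 * Worked example of the MASK=MATCH=0 trick described above: with
	 * both fields zero, (PFEC & MASK) == MATCH holds for every error
	 * code, so whether a #PF exits is decided solely by EB.PF, which
	 * the exception-bitmap merge forces to 1 whenever !enable_ept.
	 */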
12791
25a2e4fe
PB
12792 if (cpu_has_vmx_apicv()) {
12793 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
12794 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
12795 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
12796 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
12797 }
12798
33966dd6 12799 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
33966dd6 12800 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
25a2e4fe
PB
12801
12802 set_cr4_guest_host_mask(vmx);
12803
62cf9bd8
LA
12804 if (kvm_mpx_supported()) {
12805 if (vmx->nested.nested_run_pending &&
12806 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
12807 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
12808 else
12809 vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
12810 }
8665c3f9
PB
12811}
12812
12813/*
12814 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
12815 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
12816 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
12817 * guest in a way that will both be appropriate to L1's requests, and our
12818 * needs. In addition to modifying the active vmcs (which is vmcs02), this
12819 * function also has additional necessary side-effects, like setting various
12820 * vcpu->arch fields.
12821 * Returns 0 on success, 1 on failure. Invalid state exit qualification code
12822 * is assigned to entry_failure_code on failure.
12823 */
12824static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
6514dc38 12825 u32 *entry_failure_code)
8665c3f9
PB
12826{
12827 struct vcpu_vmx *vmx = to_vmx(vcpu);
c4ebd629 12828 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
8665c3f9 12829
945679e3 12830 if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) {
09abe320 12831 prepare_vmcs02_full(vmx, vmcs12);
9d1887ef
SC
12832 vmx->nested.dirty_vmcs12 = false;
12833 }
12834
8665c3f9
PB
12835 /*
12836 * First, the fields that are shadowed. This must be kept in sync
12837 * with vmx_shadow_fields.h.
12838 */
c4ebd629
VK
12839 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
12840 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
12841 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
12842 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
12843 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
12844 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
12845 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
12846 }
8665c3f9 12847
6514dc38 12848 if (vmx->nested.nested_run_pending &&
cf8b84f4 12849 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
2996fca0
JK
12850 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
12851 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
12852 } else {
12853 kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
12854 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
12855 }
63fbf59f 12856 vmx_set_rflags(vcpu, vmcs12->guest_rflags);
fe3ef05c 12857
f4124500
JK
12858 vmx->nested.preemption_timer_expired = false;
12859 if (nested_cpu_has_preemption_timer(vmcs12))
12860 vmx_start_preemption_timer(vcpu);
0238ea91 12861
fe3ef05c
NHE
12862 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
12863 * bitwise-or of what L1 wants to trap for L2, and what we want to
12864 * trap. Note that CR0.TS also needs updating - we do this later.
12865 */
12866 update_exception_bitmap(vcpu);
12867 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
12868 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
12869
6514dc38 12870 if (vmx->nested.nested_run_pending &&
cf8b84f4 12871 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
fe3ef05c 12872 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
44811c02 12873 vcpu->arch.pat = vmcs12->guest_ia32_pat;
cf8b84f4 12874 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
fe3ef05c 12875 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
cf8b84f4 12876 }
fe3ef05c 12877
e79f245d
KA
12878 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
12879
c95ba92a
PF
12880 if (kvm_has_tsc_control)
12881 decache_tsc_multiplier(vmx);
fe3ef05c
NHE
12882
12883 if (enable_vpid) {
12884 /*
5c614b35
WL
12885 * There is no direct mapping between vpid02 and vpid12, the
12886 * vpid02 is per-vCPU for L0 and reused while the value of
12887 * vpid12 is changed w/ one invvpid during nested vmentry.
12888 * The vpid12 is allocated by L1 for L2, so it will not
 12889		 * influence the global bitmap (for vpid01 and vpid02 allocation)
 12890		 * even if L1 spawns a lot of nested vCPUs.
fe3ef05c 12891 */
efebf0aa 12892 if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
5c614b35
WL
12893 if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
12894 vmx->nested.last_vpid = vmcs12->virtual_processor_id;
efebf0aa 12895 __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false);
5c614b35
WL
12896 }
12897 } else {
1438921c
LA
12898 /*
 12899			 * If L1 uses EPT, then L0 needs to execute INVEPT on
12900 * EPTP02 instead of EPTP01. Therefore, delay TLB
12901 * flush until vmcs02->eptp is fully updated by
12902 * KVM_REQ_LOAD_CR3. Note that this assumes
12903 * KVM_REQ_TLB_FLUSH is evaluated after
12904 * KVM_REQ_LOAD_CR3 in vcpu_enter_guest().
12905 */
12906 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
5c614b35 12907 }
fe3ef05c
NHE
12908 }
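	/*
	 * In short: a change of vpid12 only flushes the (reused) vpid02
	 * tag; it never allocates a new hardware VPID, which is what keeps
	 * the global VPID bitmap bounded no matter how many vpid12 values
	 * L1 hands out.
	 */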
12909
5b8ba41d
SC
12910 if (nested_cpu_has_ept(vmcs12))
12911 nested_ept_init_mmu_context(vcpu);
12912 else if (nested_cpu_has2(vmcs12,
12913 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
a468f2db 12914 vmx_flush_tlb(vcpu, true);
155a97a3 12915
fe3ef05c 12916 /*
bd7e5b08
PB
12917 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
12918 * bits which we consider mandatory enabled.
fe3ef05c
NHE
12919 * The CR0_READ_SHADOW is what L2 should have expected to read given
12920 * the specifications by L1; It's not enough to take
 12921	 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we may
12922 * have more bits than L1 expected.
12923 */
12924 vmx_set_cr0(vcpu, vmcs12->guest_cr0);
12925 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
12926
12927 vmx_set_cr4(vcpu, vmcs12->guest_cr4);
12928 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
12929
09abe320 12930 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
3df5c37e 12931 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
5a6a9748
DM
12932 vmx_set_efer(vcpu, vcpu->arch.efer);
12933
2bb8cafe
SC
12934 /*
12935 * Guest state is invalid and unrestricted guest is disabled,
12936 * which means L1 attempted VMEntry to L2 with invalid state.
12937 * Fail the VMEntry.
12938 */
3184a995
PB
12939 if (vmx->emulation_required) {
12940 *entry_failure_code = ENTRY_FAIL_DEFAULT;
2bb8cafe 12941 return 1;
3184a995 12942 }
2bb8cafe 12943
9ed38ffa 12944 /* Shadow page tables on either EPT or shadow page tables. */
7ad658b6 12945 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
9ed38ffa
LP
12946 entry_failure_code))
12947 return 1;
7ca29de2 12948
feaf0c7d
GN
12949 if (!enable_ept)
12950 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
12951
fe3ef05c
NHE
12952 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
12953 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
ee146c1c 12954 return 0;
fe3ef05c
NHE
12955}
12956
0c7f650e
KS
12957static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
12958{
12959 if (!nested_cpu_has_nmi_exiting(vmcs12) &&
12960 nested_cpu_has_virtual_nmis(vmcs12))
12961 return -EINVAL;
12962
12963 if (!nested_cpu_has_virtual_nmis(vmcs12) &&
12964 nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING))
12965 return -EINVAL;
12966
12967 return 0;
12968}
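/*
 * The two checks above encode the SDM's control-field consistency rules:
 * "virtual NMIs" requires "NMI exiting", and "NMI-window exiting"
 * (CPU_BASED_VIRTUAL_NMI_PENDING here) in turn requires "virtual NMIs".
 */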
12969
ca0bde28 12970static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
cd232ad0 12971{
cd232ad0 12972 struct vcpu_vmx *vmx = to_vmx(vcpu);
64a919f7 12973 bool ia32e;
7c177938 12974
6dfacadd 12975 if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
ca0bde28
JM
12976 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
12977 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
26539bd0 12978
ba8e23db
KS
12979 if (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)
12980 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12981
56a20510
JM
12982 if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12))
12983 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12984
ca0bde28
JM
12985 if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
12986 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
7c177938 12987
f0f4cf5b
KS
12988 if (nested_vmx_check_apic_access_controls(vcpu, vmcs12))
12989 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12990
712b12d7
JM
12991 if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12))
12992 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12993
ca0bde28
JM
12994 if (nested_vmx_check_apicv_controls(vcpu, vmcs12))
12995 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
f2b93280 12996
ca0bde28
JM
12997 if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12))
12998 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
e9ac033e 12999
c5f983f6
BD
13000 if (nested_vmx_check_pml_controls(vcpu, vmcs12))
13001 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13002
a8a7c02b
LA
13003 if (nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12))
13004 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13005
7c177938 13006 if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
6677f3da
PB
13007 vmx->nested.msrs.procbased_ctls_low,
13008 vmx->nested.msrs.procbased_ctls_high) ||
2e5b0bd9
JM
13009 (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
13010 !vmx_control_verify(vmcs12->secondary_vm_exec_control,
6677f3da
PB
13011 vmx->nested.msrs.secondary_ctls_low,
13012 vmx->nested.msrs.secondary_ctls_high)) ||
7c177938 13013 !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
6677f3da
PB
13014 vmx->nested.msrs.pinbased_ctls_low,
13015 vmx->nested.msrs.pinbased_ctls_high) ||
7c177938 13016 !vmx_control_verify(vmcs12->vm_exit_controls,
6677f3da
PB
13017 vmx->nested.msrs.exit_ctls_low,
13018 vmx->nested.msrs.exit_ctls_high) ||
7c177938 13019 !vmx_control_verify(vmcs12->vm_entry_controls,
6677f3da
PB
13020 vmx->nested.msrs.entry_ctls_low,
13021 vmx->nested.msrs.entry_ctls_high))
ca0bde28 13022 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
7c177938 13023
0c7f650e 13024 if (nested_vmx_check_nmi_controls(vmcs12))
ca0bde28 13025 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
7c177938 13026
41ab9372
BD
13027 if (nested_cpu_has_vmfunc(vmcs12)) {
13028 if (vmcs12->vm_function_control &
6677f3da 13029 ~vmx->nested.msrs.vmfunc_controls)
41ab9372
BD
13030 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13031
13032 if (nested_cpu_has_eptp_switching(vmcs12)) {
13033 if (!nested_cpu_has_ept(vmcs12) ||
13034 !page_address_valid(vcpu, vmcs12->eptp_list_address))
13035 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13036 }
13037 }
27c42a1b 13038
c7c2c709
JM
13039 if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu))
13040 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13041
3899152c 13042 if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) ||
1dc35dac 13043 !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
ca0bde28
JM
13044 !nested_cr3_valid(vcpu, vmcs12->host_cr3))
13045 return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
13046
64a919f7
SC
13047 /*
13048 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
13049 * IA32_EFER MSR must be 0 in the field for that register. In addition,
13050 * the values of the LMA and LME bits in the field must each be that of
13051 * the host address-space size VM-exit control.
13052 */
13053 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
13054 ia32e = (vmcs12->vm_exit_controls &
13055 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
13056 if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
13057 ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
13058 ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
13059 return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
13060 }
13061
0447378a
MO
13062 /*
13063 * From the Intel SDM, volume 3:
13064 * Fields relevant to VM-entry event injection must be set properly.
13065 * These fields are the VM-entry interruption-information field, the
13066 * VM-entry exception error code, and the VM-entry instruction length.
13067 */
13068 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
13069 u32 intr_info = vmcs12->vm_entry_intr_info_field;
13070 u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
13071 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
13072 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
13073 bool should_have_error_code;
13074 bool urg = nested_cpu_has2(vmcs12,
13075 SECONDARY_EXEC_UNRESTRICTED_GUEST);
13076 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
13077
13078 /* VM-entry interruption-info field: interruption type */
13079 if (intr_type == INTR_TYPE_RESERVED ||
13080 (intr_type == INTR_TYPE_OTHER_EVENT &&
13081 !nested_cpu_supports_monitor_trap_flag(vcpu)))
13082 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13083
13084 /* VM-entry interruption-info field: vector */
13085 if ((intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
13086 (intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
13087 (intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
13088 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13089
13090 /* VM-entry interruption-info field: deliver error code */
13091 should_have_error_code =
13092 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
13093 x86_exception_has_error_code(vector);
13094 if (has_error_code != should_have_error_code)
13095 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13096
13097 /* VM-entry exception error code */
13098 if (has_error_code &&
13099 vmcs12->vm_entry_exception_error_code & GENMASK(31, 15))
13100 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13101
13102 /* VM-entry interruption-info field: reserved bits */
13103 if (intr_info & INTR_INFO_RESVD_BITS_MASK)
13104 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13105
13106 /* VM-entry instruction length */
13107 switch (intr_type) {
13108 case INTR_TYPE_SOFT_EXCEPTION:
13109 case INTR_TYPE_SOFT_INTR:
13110 case INTR_TYPE_PRIV_SW_EXCEPTION:
13111 if ((vmcs12->vm_entry_instruction_len > 15) ||
13112 (vmcs12->vm_entry_instruction_len == 0 &&
13113 !nested_cpu_has_zero_length_injection(vcpu)))
13114 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13115 }
13116 }
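	/*
	 * Example of the rules above: injecting a #PF (vector 14, hard
	 * exception) into a protected-mode L2 must set the deliver-error-code
	 * bit and keep bits 31:15 of the error code clear, otherwise the
	 * VMEntry fails with an invalid control field.
	 */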
13117
5b8ba41d
SC
13118 if (nested_cpu_has_ept(vmcs12) &&
13119 !valid_ept_address(vcpu, vmcs12->ept_pointer))
13120 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13121
ca0bde28
JM
13122 return 0;
13123}
13124
f145d90d
LA
13125static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
13126 struct vmcs12 *vmcs12)
13127{
13128 int r;
13129 struct page *page;
13130 struct vmcs12 *shadow;
13131
13132 if (vmcs12->vmcs_link_pointer == -1ull)
13133 return 0;
13134
13135 if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))
13136 return -EINVAL;
13137
13138 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
13139 if (is_error_page(page))
13140 return -EINVAL;
13141
13142 r = 0;
13143 shadow = kmap(page);
13144 if (shadow->hdr.revision_id != VMCS12_REVISION ||
13145 shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
13146 r = -EINVAL;
13147 kunmap(page);
13148 kvm_release_page_clean(page);
13149 return r;
13150}
13151
ca0bde28
JM
13152static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
13153 u32 *exit_qual)
13154{
13155 bool ia32e;
13156
13157 *exit_qual = ENTRY_FAIL_DEFAULT;
7c177938 13158
3899152c 13159 if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
ca0bde28 13160 !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
b428018a 13161 return 1;
ca0bde28 13162
f145d90d 13163 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
ca0bde28 13164 *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
b428018a 13165 return 1;
7c177938
NHE
13166 }
13167
384bb783 13168 /*
cb0c8cda 13169 * If the load IA32_EFER VM-entry control is 1, the following checks
384bb783
JK
13170 * are performed on the field for the IA32_EFER MSR:
13171 * - Bits reserved in the IA32_EFER MSR must be 0.
13172 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
13173 * the IA-32e mode guest VM-exit control. It must also be identical
13174 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
13175 * CR0.PG) is 1.
13176 */
ca0bde28
JM
13177 if (to_vmx(vcpu)->nested.nested_run_pending &&
13178 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
384bb783
JK
13179 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
13180 if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
13181 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
13182 ((vmcs12->guest_cr0 & X86_CR0_PG) &&
ca0bde28 13183 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
b428018a 13184 return 1;
384bb783
JK
13185 }
13186
f1b026a3
WL
13187 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
13188 (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
13189 (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
13190 return 1;
13191
ca0bde28
JM
13192 return 0;
13193}
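/*
 * Example of the EFER check above: a VMEntry whose entry controls set
 * VM_ENTRY_IA32E_MODE while loading a guest EFER with LMA clear fails
 * here, and the caller reports it as a failed VMEntry with
 * ENTRY_FAIL_DEFAULT as the exit qualification.
 */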
13194
52017608
SC
13195static int __noclone nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
13196{
13197 struct vcpu_vmx *vmx = to_vmx(vcpu);
13198 unsigned long cr3, cr4;
13199
13200 if (!nested_early_check)
13201 return 0;
13202
13203 if (vmx->msr_autoload.host.nr)
13204 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
13205 if (vmx->msr_autoload.guest.nr)
13206 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
13207
13208 preempt_disable();
13209
13210 vmx_prepare_switch_to_guest(vcpu);
13211
13212 /*
13213 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
13214 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to
 13215	 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
13216 * there is no need to preserve other bits or save/restore the field.
13217 */
13218 vmcs_writel(GUEST_RFLAGS, 0);
13219
13220 vmcs_writel(HOST_RIP, vmx_early_consistency_check_return);
13221
13222 cr3 = __get_current_cr3_fast();
13223 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
13224 vmcs_writel(HOST_CR3, cr3);
13225 vmx->loaded_vmcs->host_state.cr3 = cr3;
13226 }
13227
13228 cr4 = cr4_read_shadow();
13229 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
13230 vmcs_writel(HOST_CR4, cr4);
13231 vmx->loaded_vmcs->host_state.cr4 = cr4;
13232 }
13233
13234 vmx->__launched = vmx->loaded_vmcs->launched;
13235
13236 asm(
13237 /* Set HOST_RSP */
4b1e5478 13238 __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
52017608
SC
13239 "mov %%" _ASM_SP ", %c[host_rsp](%0)\n\t"
13240
 13241		/* Check if vmlaunch or vmresume is needed */
13242 "cmpl $0, %c[launched](%0)\n\t"
13243 "je 1f\n\t"
4b1e5478 13244 __ex("vmresume") "\n\t"
52017608 13245 "jmp 2f\n\t"
4b1e5478 13246 "1: " __ex("vmlaunch") "\n\t"
52017608
SC
13247 "jmp 2f\n\t"
13248 "2: "
13249
13250 /* Set vmx->fail accordingly */
13251 "setbe %c[fail](%0)\n\t"
13252
13253 ".pushsection .rodata\n\t"
13254 ".global vmx_early_consistency_check_return\n\t"
13255 "vmx_early_consistency_check_return: " _ASM_PTR " 2b\n\t"
13256 ".popsection"
13257 :
13258 : "c"(vmx), "d"((unsigned long)HOST_RSP),
13259 [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
13260 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
13261 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp))
13262 : "rax", "cc", "memory"
13263 );
13264
13265 vmcs_writel(HOST_RIP, vmx_return);
13266
13267 preempt_enable();
13268
13269 if (vmx->msr_autoload.host.nr)
13270 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
13271 if (vmx->msr_autoload.guest.nr)
13272 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
13273
13274 if (vmx->fail) {
13275 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
13276 VMXERR_ENTRY_INVALID_CONTROL_FIELD);
13277 vmx->fail = 0;
13278 return 1;
13279 }
13280
13281 /*
13282 * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
13283 */
13284 local_irq_enable();
13285 if (hw_breakpoint_active())
13286 set_debugreg(__this_cpu_read(cpu_dr7), 7);
13287
13288 /*
13289 * A non-failing VMEntry means we somehow entered guest mode with
13290 * an illegal RIP, and that's just the tip of the iceberg. There
13291 * is no telling what memory has been modified or what state has
13292 * been exposed to unknown code. Hitting this all but guarantees
13293 * a (very critical) hardware issue.
13294 */
13295 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
13296 VMX_EXIT_REASONS_FAILED_VMENTRY));
13297
13298 return 0;
13299}
13300STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw);
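/*
 * Interpretation of the return values above: 0 means hardware accepted
 * vmcs02's control and host-state fields (the deliberately bogus
 * GUEST_RFLAGS turns the dry run into a failed-VMEntry VMExit instead of
 * actually running L2), while 1 means hardware signalled VMFail, which
 * the callers eventually turn into VMXERR_ENTRY_INVALID_CONTROL_FIELD.
 */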
13301
a633e41e
SC
13302static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
13303 struct vmcs12 *vmcs12);
13304
7f7f1ba3 13305/*
a633e41e 13306 * If from_vmentry is false, this is being called from state restore (either RSM
8fcc4b59 13307 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
52017608
SC
 13308 *
 13309 * Returns:
 13310 *	0 - success, i.e. proceed with actual VMEnter
 13311 *	1 - consistency check VMExit
 13312 *	-1 - consistency check VMFail
7f7f1ba3 13313 */
a633e41e
SC
13314static int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
13315 bool from_vmentry)
858e25c0
JM
13316{
13317 struct vcpu_vmx *vmx = to_vmx(vcpu);
13318 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7e712684 13319 bool evaluate_pending_interrupts;
a633e41e
SC
13320 u32 exit_reason = EXIT_REASON_INVALID_STATE;
13321 u32 exit_qual;
858e25c0 13322
7e712684
PB
13323 evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
13324 (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
13325 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
13326 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
b5861e5c 13327
858e25c0
JM
13328 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
13329 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
62cf9bd8
LA
13330 if (kvm_mpx_supported() &&
13331 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
13332 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
858e25c0 13333
de3a0021 13334 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
858e25c0 13335
16fb9a46
SC
13336 prepare_vmcs02_early(vmx, vmcs12);
13337
13338 if (from_vmentry) {
13339 nested_get_vmcs12_pages(vcpu);
13340
52017608
SC
13341 if (nested_vmx_check_vmentry_hw(vcpu)) {
13342 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
13343 return -1;
13344 }
13345
16fb9a46
SC
13346 if (check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
13347 goto vmentry_fail_vmexit;
13348 }
13349
13350 enter_guest_mode(vcpu);
e79f245d
KA
13351 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
13352 vcpu->arch.tsc_offset += vmcs12->tsc_offset;
13353
a633e41e 13354 if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
39f9c388 13355 goto vmentry_fail_vmexit_guest_mode;
858e25c0 13356
7f7f1ba3 13357 if (from_vmentry) {
a633e41e
SC
13358 exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
13359 exit_qual = nested_vmx_load_msr(vcpu,
13360 vmcs12->vm_entry_msr_load_addr,
13361 vmcs12->vm_entry_msr_load_count);
13362 if (exit_qual)
39f9c388 13363 goto vmentry_fail_vmexit_guest_mode;
7f7f1ba3
PB
13364 } else {
13365 /*
13366 * The MMU is not initialized to point at the right entities yet and
13367 * "get pages" would need to read data from the guest (i.e. we will
13368 * need to perform gpa to hpa translation). Request a call
13369 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs
13370 * have already been set at vmentry time and should not be reset.
13371 */
13372 kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
13373 }
858e25c0 13374
b5861e5c
LA
13375 /*
13376 * If L1 had a pending IRQ/NMI until it executed
13377 * VMLAUNCH/VMRESUME which wasn't delivered because it was
13378 * disallowed (e.g. interrupts disabled), L0 needs to
13379 * evaluate if this pending event should cause an exit from L2
 13380	 * to L1 or delivered directly to L2 (e.g. in case L1 doesn't
 13381	 * intercept EXTERNAL_INTERRUPT).
13382 *
7e712684
PB
13383 * Usually this would be handled by the processor noticing an
13384 * IRQ/NMI window request, or checking RVI during evaluation of
13385 * pending virtual interrupts. However, this setting was done
13386 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
13387 * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
b5861e5c 13388 */
7e712684 13389 if (unlikely(evaluate_pending_interrupts))
b5861e5c 13390 kvm_make_request(KVM_REQ_EVENT, vcpu);
b5861e5c 13391
858e25c0
JM
13392 /*
13393 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
13394 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
13395 * returned as far as L1 is concerned. It will only return (and set
13396 * the success flag) when L2 exits (see nested_vmx_vmexit()).
13397 */
13398 return 0;
e79f245d 13399
a633e41e
SC
13400 /*
13401 * A failed consistency check that leads to a VMExit during L1's
13402 * VMEnter to L2 is a variation of a normal VMexit, as explained in
13403 * 26.7 "VM-entry failures during or after loading guest state".
13404 */
39f9c388 13405vmentry_fail_vmexit_guest_mode:
e79f245d
KA
13406 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
13407 vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
13408 leave_guest_mode(vcpu);
16fb9a46
SC
13409
13410vmentry_fail_vmexit:
e79f245d 13411 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
a633e41e
SC
13412
13413 if (!from_vmentry)
13414 return 1;
13415
a633e41e
SC
13416 load_vmcs12_host_state(vcpu, vmcs12);
13417 vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
13418 vmcs12->exit_qualification = exit_qual;
945679e3
VK
13419 if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
13420 vmx->nested.need_vmcs12_sync = true;
a633e41e 13421 return 1;
858e25c0
JM
13422}
13423
ca0bde28
JM
13424/*
13425 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
13426 * for running an L2 nested guest.
13427 */
13428static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
13429{
13430 struct vmcs12 *vmcs12;
13431 struct vcpu_vmx *vmx = to_vmx(vcpu);
b3f1dfb6 13432 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
ca0bde28
JM
13433 int ret;
13434
13435 if (!nested_vmx_check_permission(vcpu))
13436 return 1;
13437
8cab6507 13438 if (!nested_vmx_handle_enlightened_vmptrld(vcpu, true))
b8bbab92
VK
13439 return 1;
13440
13441 if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
09abb5e3 13442 return nested_vmx_failInvalid(vcpu);
ca0bde28
JM
13443
13444 vmcs12 = get_vmcs12(vcpu);
13445
a6192d40
LA
13446 /*
13447 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
13448 * that there *is* a valid VMCS pointer, RFLAGS.CF is set
13449 * rather than RFLAGS.ZF, and no error number is stored to the
13450 * VM-instruction error field.
13451 */
09abb5e3
SC
13452 if (vmcs12->hdr.shadow_vmcs)
13453 return nested_vmx_failInvalid(vcpu);
a6192d40 13454
945679e3
VK
13455 if (vmx->nested.hv_evmcs) {
13456 copy_enlightened_to_vmcs12(vmx);
13457 /* Enlightened VMCS doesn't have launch state */
13458 vmcs12->launch_state = !launch;
13459 } else if (enable_shadow_vmcs) {
ca0bde28 13460 copy_shadow_to_vmcs12(vmx);
945679e3 13461 }
ca0bde28
JM
13462
13463 /*
13464 * The nested entry process starts with enforcing various prerequisites
 13465	 * on vmcs12 as required by the Intel SDM, and acting appropriately when
13466 * they fail: As the SDM explains, some conditions should cause the
13467 * instruction to fail, while others will cause the instruction to seem
13468 * to succeed, but return an EXIT_REASON_INVALID_STATE.
13469 * To speed up the normal (success) code path, we should avoid checking
13470 * for misconfigurations which will anyway be caught by the processor
13471 * when using the merged vmcs02.
13472 */
09abb5e3
SC
13473 if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
13474 return nested_vmx_failValid(vcpu,
13475 VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
b3f1dfb6 13476
09abb5e3
SC
13477 if (vmcs12->launch_state == launch)
13478 return nested_vmx_failValid(vcpu,
ca0bde28
JM
13479 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
13480 : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
ca0bde28
JM
13481
13482 ret = check_vmentry_prereqs(vcpu, vmcs12);
09abb5e3
SC
13483 if (ret)
13484 return nested_vmx_failValid(vcpu, ret);
ca0bde28 13485
7c177938
NHE
13486 /*
13487 * We're finally done with prerequisite checking, and can start with
13488 * the nested entry.
13489 */
6514dc38 13490 vmx->nested.nested_run_pending = 1;
a633e41e 13491 ret = nested_vmx_enter_non_root_mode(vcpu, true);
52017608
SC
13492 vmx->nested.nested_run_pending = !ret;
13493 if (ret > 0)
7f7f1ba3 13494 return 1;
52017608
SC
13495 else if (ret)
13496 return nested_vmx_failValid(vcpu,
13497 VMXERR_ENTRY_INVALID_CONTROL_FIELD);
ff651cb6 13498
c595ceee
PB
13499 /* Hide L1D cache contents from the nested guest. */
13500 vmx->vcpu.arch.l1tf_flush_l1d = true;
13501
61ada748 13502 /*
d63907dc 13503 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
61ada748
LA
13504 * also be used as part of restoring nVMX state for
13505 * snapshot restore (migration).
13506 *
13507 * In this flow, it is assumed that vmcs12 cache was
 13508	 * transferred as part of captured nVMX state and should
13509 * therefore not be read from guest memory (which may not
13510 * exist on destination host yet).
13511 */
13512 nested_cache_shadow_vmcs12(vcpu, vmcs12);
13513
135a06c3
CG
13514 /*
13515 * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
13516 * by event injection, halt vcpu.
13517 */
13518 if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
6514dc38
JM
13519 !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK)) {
13520 vmx->nested.nested_run_pending = 0;
5cb56059 13521 return kvm_vcpu_halt(vcpu);
6514dc38 13522 }
cd232ad0
NHE
13523 return 1;
13524}
13525
4704d0be
NHE
13526/*
13527 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
 13528 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
13529 * This function returns the new value we should put in vmcs12.guest_cr0.
13530 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
13531 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
13532 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
13533 * didn't trap the bit, because if L1 did, so would L0).
13534 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
13535 * been modified by L2, and L1 knows it. So just leave the old value of
13536 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
13537 * isn't relevant, because if L0 traps this bit it can set it to anything.
13538 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
13539 * changed these bits, and therefore they need to be updated, but L0
13540 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather
13541 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
13542 */
13543static inline unsigned long
13544vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
13545{
13546 return
13547 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
13548 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
13549 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
13550 vcpu->arch.cr0_guest_owned_bits));
13551}
13552
13553static inline unsigned long
13554vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
13555{
13556 return
13557 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
13558 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
13559 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
13560 vcpu->arch.cr4_guest_owned_bits));
13561}
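/*
 * Example of case 3 in the comment above vmcs12_guest_cr0(): L0 leaves
 * only CR0.TS guest-owned (see load_vmcs12_host_state() below), so a bit
 * such as CR0.WP that L1 chose not to trap is still trapped by L0; L2's
 * value of WP therefore lives in vmcs02's CR0_READ_SHADOW, which is where
 * the merge picks it up.
 */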
13562
5f3d5799
JK
13563static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
13564 struct vmcs12 *vmcs12)
13565{
13566 u32 idt_vectoring;
13567 unsigned int nr;
13568
664f8e26 13569 if (vcpu->arch.exception.injected) {
5f3d5799
JK
13570 nr = vcpu->arch.exception.nr;
13571 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
13572
13573 if (kvm_exception_is_soft(nr)) {
13574 vmcs12->vm_exit_instruction_len =
13575 vcpu->arch.event_exit_inst_len;
13576 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
13577 } else
13578 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
13579
13580 if (vcpu->arch.exception.has_error_code) {
13581 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
13582 vmcs12->idt_vectoring_error_code =
13583 vcpu->arch.exception.error_code;
13584 }
13585
13586 vmcs12->idt_vectoring_info_field = idt_vectoring;
cd2633c5 13587 } else if (vcpu->arch.nmi_injected) {
5f3d5799
JK
13588 vmcs12->idt_vectoring_info_field =
13589 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
04140b41 13590 } else if (vcpu->arch.interrupt.injected) {
5f3d5799
JK
13591 nr = vcpu->arch.interrupt.nr;
13592 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
13593
13594 if (vcpu->arch.interrupt.soft) {
13595 idt_vectoring |= INTR_TYPE_SOFT_INTR;
13596 vmcs12->vm_entry_instruction_len =
13597 vcpu->arch.event_exit_inst_len;
13598 } else
13599 idt_vectoring |= INTR_TYPE_EXT_INTR;
13600
13601 vmcs12->idt_vectoring_info_field = idt_vectoring;
13602 }
13603}
13604
b6b8a145
JK
13605static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
13606{
13607 struct vcpu_vmx *vmx = to_vmx(vcpu);
bfcf83b1 13608 unsigned long exit_qual;
917dc606
LA
13609 bool block_nested_events =
13610 vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
acc9ab60 13611
bfcf83b1
WL
13612 if (vcpu->arch.exception.pending &&
13613 nested_vmx_check_exception(vcpu, &exit_qual)) {
917dc606 13614 if (block_nested_events)
bfcf83b1
WL
13615 return -EBUSY;
13616 nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
bfcf83b1
WL
13617 return 0;
13618 }
13619
f4124500
JK
13620 if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
13621 vmx->nested.preemption_timer_expired) {
917dc606 13622 if (block_nested_events)
f4124500
JK
13623 return -EBUSY;
13624 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
13625 return 0;
13626 }
13627
b6b8a145 13628 if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
917dc606 13629 if (block_nested_events)
b6b8a145
JK
13630 return -EBUSY;
13631 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
13632 NMI_VECTOR | INTR_TYPE_NMI_INTR |
13633 INTR_INFO_VALID_MASK, 0);
13634 /*
13635 * The NMI-triggered VM exit counts as injection:
13636 * clear this one and block further NMIs.
13637 */
13638 vcpu->arch.nmi_pending = 0;
13639 vmx_set_nmi_mask(vcpu, true);
13640 return 0;
13641 }
13642
13643 if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
13644 nested_exit_on_intr(vcpu)) {
917dc606 13645 if (block_nested_events)
b6b8a145
JK
13646 return -EBUSY;
13647 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
705699a1 13648 return 0;
b6b8a145
JK
13649 }
13650
6342c50a
DH
13651 vmx_complete_nested_posted_interrupt(vcpu);
13652 return 0;
b6b8a145
JK
13653}
13654
d264ee0c
SC
13655static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
13656{
13657 to_vmx(vcpu)->req_immediate_exit = true;
13658}
13659
f4124500
JK
13660static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
13661{
13662 ktime_t remaining =
13663 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
13664 u64 value;
13665
13666 if (ktime_to_ns(remaining) <= 0)
13667 return 0;
13668
13669 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
13670 do_div(value, 1000000);
13671 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
13672}
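/*
 * Worked example of the conversion above, assuming a virtual TSC of
 * 2 GHz (virtual_tsc_khz == 2000000) and 1 ms left on the hrtimer:
 * 1000000 ns * 2000000 / 1000000 = 2000000 TSC ticks, which is then
 * shifted right by VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE to get the
 * value in the units the emulated VMX-preemption timer counts in.
 */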
13673
4704d0be 13674/*
cf8b84f4
JM
13675 * Update the guest state fields of vmcs12 to reflect changes that
13676 * occurred while L2 was running. (The "IA-32e mode guest" bit of the
13677 * VM-entry controls is also updated, since this is really a guest
13678 * state bit.)
4704d0be 13679 */
cf8b84f4 13680static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
4704d0be 13681{
4704d0be
NHE
13682 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
13683 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
13684
4704d0be
NHE
13685 vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
13686 vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
13687 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
13688
13689 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
13690 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
13691 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
13692 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
13693 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
13694 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
13695 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
13696 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
13697 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
13698 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
13699 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
13700 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
13701 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
13702 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
13703 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
13704 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
13705 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
13706 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
13707 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
13708 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
13709 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
13710 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
13711 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
13712 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
13713 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
13714 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
13715 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
13716 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
13717 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
13718 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
13719 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
13720 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
13721 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
13722 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
13723 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
13724 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
13725
4704d0be
NHE
13726 vmcs12->guest_interruptibility_info =
13727 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
13728 vmcs12->guest_pending_dbg_exceptions =
13729 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
3edf1e69
JK
13730 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
13731 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
13732 else
13733 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
4704d0be 13734
f4124500
JK
13735 if (nested_cpu_has_preemption_timer(vmcs12)) {
13736 if (vmcs12->vm_exit_controls &
13737 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
13738 vmcs12->vmx_preemption_timer_value =
13739 vmx_get_preemption_timer_value(vcpu);
13740 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
13741 }
7854cbca 13742
3633cfc3
NHE
13743 /*
13744 * In some cases (usually, nested EPT), L2 is allowed to change its
13745 * own CR3 without exiting. If it has changed it, we must keep it.
13746 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
13747 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
13748 *
13749 * Additionally, restore L2's PDPTR to vmcs12.
13750 */
13751 if (enable_ept) {
f3531054 13752 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
3633cfc3
NHE
13753 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
13754 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
13755 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
13756 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
13757 }
13758
d281e13b 13759 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
119a9c01 13760
608406e2
WV
13761 if (nested_cpu_has_vid(vmcs12))
13762 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
13763
c18911a2
JK
13764 vmcs12->vm_entry_controls =
13765 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
2961e876 13766 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
c18911a2 13767
2996fca0
JK
13768 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) {
13769 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
13770 vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
13771 }
13772
4704d0be
NHE
13773 /* TODO: These cannot have changed unless we have MSR bitmaps and
13774 * the relevant bit asks not to trap the change */
b8c07d55 13775 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
4704d0be 13776 vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
10ba54a5
JK
13777 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
13778 vmcs12->guest_ia32_efer = vcpu->arch.efer;
4704d0be
NHE
13779 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
13780 vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
13781 vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
a87036ad 13782 if (kvm_mpx_supported())
36be0b9d 13783 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
cf8b84f4
JM
13784}
13785
13786/*
13787 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
13788 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
13789 * and this function updates it to reflect the changes to the guest state while
13790 * L2 was running (and perhaps made some exits which were handled directly by L0
13791 * without going back to L1), and to reflect the exit reason.
13792 * Note that we do not have to copy here all VMCS fields, just those that
 13793 * could have been changed by the L2 guest or the exit - i.e., the guest-state and
13794 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
13795 * which already writes to vmcs12 directly.
13796 */
13797static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
13798 u32 exit_reason, u32 exit_intr_info,
13799 unsigned long exit_qualification)
13800{
13801 /* update guest state fields: */
13802 sync_vmcs12(vcpu, vmcs12);
4704d0be
NHE
13803
13804 /* update exit information fields: */
13805
533558bc
JK
13806 vmcs12->vm_exit_reason = exit_reason;
13807 vmcs12->exit_qualification = exit_qualification;
533558bc 13808 vmcs12->vm_exit_intr_info = exit_intr_info;
7313c698 13809
5f3d5799 13810 vmcs12->idt_vectoring_info_field = 0;
4704d0be
NHE
13811 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
13812 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
13813
5f3d5799 13814 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
7cdc2d62
JM
13815 vmcs12->launch_state = 1;
13816
5f3d5799
JK
13817 /* vm_entry_intr_info_field is cleared on exit. Emulate this
13818 * instead of reading the real value. */
4704d0be 13819 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
5f3d5799
JK
13820
13821 /*
 13822		 * Transfer the event that L0 or L1 may have wanted to inject into
13823 * L2 to IDT_VECTORING_INFO_FIELD.
13824 */
13825 vmcs12_save_pending_event(vcpu, vmcs12);
13826 }
13827
13828 /*
13829 * Drop what we picked up for L2 via vmx_complete_interrupts. It is
13830 * preserved above and would only end up incorrectly in L1.
13831 */
13832 vcpu->arch.nmi_injected = false;
13833 kvm_clear_exception_queue(vcpu);
13834 kvm_clear_interrupt_queue(vcpu);
4704d0be
NHE
13835}
13836
13837/*
 13838 * A part of what we need to do when the nested L2 guest exits and we want to
13839 * run its L1 parent, is to reset L1's guest state to the host state specified
13840 * in vmcs12.
13841 * This function is to be called not only on normal nested exit, but also on
13842 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
13843 * Failures During or After Loading Guest State").
13844 * This function should be called when the active VMCS is L1's (vmcs01).
13845 */
733568f9
JK
13846static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
13847 struct vmcs12 *vmcs12)
4704d0be 13848{
21feb4eb 13849 struct kvm_segment seg;
bd18bffc 13850 u32 entry_failure_code;
21feb4eb 13851
4704d0be
NHE
13852 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
13853 vcpu->arch.efer = vmcs12->host_ia32_efer;
d1fa0352 13854 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4704d0be
NHE
13855 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
13856 else
13857 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
13858 vmx_set_efer(vcpu, vcpu->arch.efer);
13859
13860 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
13861 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
1adfa76a 13862 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
cb61de2f
SC
13863 vmx_set_interrupt_shadow(vcpu, 0);
13864
4704d0be
NHE
13865 /*
13866 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
bd7e5b08
PB
13867 * actually changed, because vmx_set_cr0 refers to efer set above.
13868 *
13869 * CR0_GUEST_HOST_MASK is already set in the original vmcs01
13870 * (KVM doesn't change it);
4704d0be 13871 */
bd7e5b08 13872 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
9e3e4dbf 13873 vmx_set_cr0(vcpu, vmcs12->host_cr0);
4704d0be 13874
bd7e5b08 13875 /* Same as above - no reason to call set_cr4_guest_host_mask(). */
4704d0be 13876 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
8eb3f87d 13877 vmx_set_cr4(vcpu, vmcs12->host_cr4);
4704d0be 13878
bd18bffc
SC
13879 nested_ept_uninit_mmu_context(vcpu);
13880
13881 /*
13882 * Only PDPTE load can fail as the value of cr3 was checked on entry and
13883 * couldn't have changed.
13884 */
13885 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
13886 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
13887
13888 if (!enable_ept)
13889 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
feaf0c7d 13890
6f1e03bc 13891 /*
efebf0aa 13892 * If vmcs01 doesn't use VPID, CPU flushes TLB on every
6f1e03bc
LA
13893 * VMEntry/VMExit. Thus, no need to flush TLB.
13894 *
efebf0aa
LA
13895 * If vmcs12 doesn't use VPID, L1 expects TLB to be
13896 * flushed on every VMEntry/VMExit.
6f1e03bc 13897 *
efebf0aa
LA
13898 * Otherwise, we can preserve TLB entries as long as we are
13899 * able to tag L1 TLB entries differently than L2 TLB entries.
1438921c
LA
13900 *
13901 * If vmcs12 uses EPT, we need to execute this flush on EPTP01
13902 * and therefore we request the TLB flush to happen only after VMCS EPTP
13903 * has been set by KVM_REQ_LOAD_CR3.
6f1e03bc
LA
13904 */
13905 if (enable_vpid &&
efebf0aa 13906 (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
1438921c 13907 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
4704d0be 13908 }
4704d0be
NHE
13909
13910 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
13911 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
13912 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
13913 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
13914 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
21f2d551
LP
13915 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
13916 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
4704d0be 13917
36be0b9d
PB
13918 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
13919 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
13920 vmcs_write64(GUEST_BNDCFGS, 0);
13921
44811c02 13922 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
4704d0be 13923 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
44811c02
JK
13924 vcpu->arch.pat = vmcs12->host_ia32_pat;
13925 }
4704d0be
NHE
13926 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
13927 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
13928 vmcs12->host_ia32_perf_global_ctrl);
503cd0c5 13929
21feb4eb
ACL
13930 /* Set L1 segment info according to Intel SDM
13931 27.5.2 Loading Host Segment and Descriptor-Table Registers */
13932 seg = (struct kvm_segment) {
13933 .base = 0,
13934 .limit = 0xFFFFFFFF,
13935 .selector = vmcs12->host_cs_selector,
13936 .type = 11,
13937 .present = 1,
13938 .s = 1,
13939 .g = 1
13940 };
13941 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
13942 seg.l = 1;
13943 else
13944 seg.db = 1;
13945 vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
13946 seg = (struct kvm_segment) {
13947 .base = 0,
13948 .limit = 0xFFFFFFFF,
13949 .type = 3,
13950 .present = 1,
13951 .s = 1,
13952 .db = 1,
13953 .g = 1
13954 };
13955 seg.selector = vmcs12->host_ds_selector;
13956 vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
13957 seg.selector = vmcs12->host_es_selector;
13958 vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
13959 seg.selector = vmcs12->host_ss_selector;
13960 vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
13961 seg.selector = vmcs12->host_fs_selector;
13962 seg.base = vmcs12->host_fs_base;
13963 vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
13964 seg.selector = vmcs12->host_gs_selector;
13965 seg.base = vmcs12->host_gs_base;
13966 vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
13967 seg = (struct kvm_segment) {
205befd9 13968 .base = vmcs12->host_tr_base,
21feb4eb
ACL
13969 .limit = 0x67,
13970 .selector = vmcs12->host_tr_selector,
13971 .type = 11,
13972 .present = 1
13973 };
13974 vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
13975
503cd0c5
JK
13976 kvm_set_dr(vcpu, 7, 0x400);
13977 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
ff651cb6 13978
3af18d9c 13979 if (cpu_has_vmx_msr_bitmap())
904e14fb 13980 vmx_update_msr_bitmap(vcpu);
3af18d9c 13981
ff651cb6
WV
13982 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
13983 vmcs12->vm_exit_msr_load_count))
13984 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4704d0be
NHE
13985}
13986
bd18bffc
SC
13987static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
13988{
13989 struct shared_msr_entry *efer_msr;
13990 unsigned int i;
13991
13992 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
13993 return vmcs_read64(GUEST_IA32_EFER);
13994
13995 if (cpu_has_load_ia32_efer)
13996 return host_efer;
13997
13998 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
13999 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
14000 return vmx->msr_autoload.guest.val[i].value;
14001 }
14002
14003 efer_msr = find_msr_entry(vmx, MSR_EFER);
14004 if (efer_msr)
14005 return efer_msr->data;
14006
14007 return host_efer;
14008}
14009
14010static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
14011{
14012 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
14013 struct vcpu_vmx *vmx = to_vmx(vcpu);
14014 struct vmx_msr_entry g, h;
14015 struct msr_data msr;
14016 gpa_t gpa;
14017 u32 i, j;
14018
14019 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
14020
14021 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
14022 /*
14023 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
14024 * as vmcs01.GUEST_DR7 contains a userspace defined value
14025 * and vcpu->arch.dr7 is not squirreled away before the
14026 * nested VMENTER (not worth adding a variable in nested_vmx).
14027 */
14028 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
14029 kvm_set_dr(vcpu, 7, DR7_FIXED_1);
14030 else
14031 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
14032 }
14033
14034 /*
14035 * Note that calling vmx_set_{efer,cr0,cr4} is important as they
14036 * handle a variety of side effects to KVM's software model.
14037 */
14038 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
14039
14040 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
14041 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
14042
14043 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
14044 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
14045
14046 nested_ept_uninit_mmu_context(vcpu);
14047 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
14048 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
14049
14050 /*
14051 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
14052 * from vmcs01 (if necessary). The PDPTRs are not loaded on
14053 * VMFail, like everything else we just need to ensure our
14054 * software model is up-to-date.
14055 */
14056 ept_save_pdptrs(vcpu);
14057
14058 kvm_mmu_reset_context(vcpu);
14059
14060 if (cpu_has_vmx_msr_bitmap())
14061 vmx_update_msr_bitmap(vcpu);
14062
14063 /*
14064 * This nasty bit of open coding is a compromise between blindly
14065 * loading L1's MSRs using the exit load lists (incorrect emulation
14066 * of VMFail), leaving the nested VM's MSRs in the software model
14067 * (incorrect behavior) and snapshotting the modified MSRs (too
14068 * expensive since the lists are unbound by hardware). For each
14069 * MSR that was (prematurely) loaded from the nested VMEntry load
14070 * list, reload it from the exit load list if it exists and differs
14071 * from the guest value. The intent is to stuff host state as
14072 * silently as possible, not to fully process the exit load list.
14073 */
14074 msr.host_initiated = false;
14075 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
14076 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
14077 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
14078 pr_debug_ratelimited(
14079 "%s read MSR index failed (%u, 0x%08llx)\n",
14080 __func__, i, gpa);
14081 goto vmabort;
14082 }
14083
14084 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
14085 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
14086 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
14087 pr_debug_ratelimited(
14088 "%s read MSR failed (%u, 0x%08llx)\n",
14089 __func__, j, gpa);
14090 goto vmabort;
14091 }
14092 if (h.index != g.index)
14093 continue;
14094 if (h.value == g.value)
14095 break;
14096
14097 if (nested_vmx_load_msr_check(vcpu, &h)) {
14098 pr_debug_ratelimited(
14099 "%s check failed (%u, 0x%x, 0x%x)\n",
14100 __func__, j, h.index, h.reserved);
14101 goto vmabort;
14102 }
14103
14104 msr.index = h.index;
14105 msr.data = h.value;
14106 if (kvm_set_msr(vcpu, &msr)) {
14107 pr_debug_ratelimited(
14108 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
14109 __func__, j, h.index, h.value);
14110 goto vmabort;
14111 }
14112 }
14113 }
14114
14115 return;
14116
14117vmabort:
14118 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
14119}

/*
 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
 * and modify vmcs12 to make it see what it would expect to see there if
 * L2 was its real guest.  Must only be called when in L2 (is_guest_mode()).
 */
static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
			      u32 exit_intr_info,
			      unsigned long exit_qualification)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	/* trying to cancel vmlaunch/vmresume is a bug */
	WARN_ON_ONCE(vmx->nested.nested_run_pending);

	leave_guest_mode(vcpu);

	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;

	if (likely(!vmx->fail)) {
		if (exit_reason == -1)
			sync_vmcs12(vcpu, vmcs12);
		else
			prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
				       exit_qualification);

		/*
		 * Must happen outside of sync_vmcs12() as it will
		 * also be used to capture vmcs12 cache as part of
		 * capturing nVMX state for snapshot (migration).
		 *
		 * Otherwise, this flush will dirty guest memory at a
		 * point it is already assumed by user-space to be
		 * immutable.
		 */
		nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);

		if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
					 vmcs12->vm_exit_msr_store_count))
			nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
	} else {
		/*
		 * The only expected VM-instruction error is "VM entry with
		 * invalid control field(s)."  Anything else indicates a
		 * problem with L0.  And we should never get here with a
		 * VMFail of any type if early consistency checks are enabled.
		 */
		WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
			     VMXERR_ENTRY_INVALID_CONTROL_FIELD);
		WARN_ON_ONCE(nested_early_check);
	}

	vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	/* Update any VMCS fields that might have changed while L2 ran */
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);

	if (kvm_has_tsc_control)
		decache_tsc_multiplier(vmx);

	if (vmx->nested.change_vmcs01_virtual_apic_mode) {
		vmx->nested.change_vmcs01_virtual_apic_mode = false;
		vmx_set_virtual_apic_mode(vcpu);
	} else if (!nested_cpu_has_ept(vmcs12) &&
		   nested_cpu_has2(vmcs12,
				   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
		vmx_flush_tlb(vcpu, true);
	}

	/* This is needed for the same reason as it was needed in prepare_vmcs02 */
	vmx->host_rsp = 0;

	/* Unpin physical memory we referred to in vmcs02 */
	if (vmx->nested.apic_access_page) {
		kvm_release_page_dirty(vmx->nested.apic_access_page);
		vmx->nested.apic_access_page = NULL;
	}
	if (vmx->nested.virtual_apic_page) {
		kvm_release_page_dirty(vmx->nested.virtual_apic_page);
		vmx->nested.virtual_apic_page = NULL;
	}
	if (vmx->nested.pi_desc_page) {
		kunmap(vmx->nested.pi_desc_page);
		kvm_release_page_dirty(vmx->nested.pi_desc_page);
		vmx->nested.pi_desc_page = NULL;
		vmx->nested.pi_desc = NULL;
	}

	/*
	 * While L2 was running, the mmu_notifier may have forced a reload of
	 * the APIC-access page's hpa for the L2 vmcs.  Request a reload for
	 * L1 before entering it.
	 */
	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);

	if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs))
		vmx->nested.need_vmcs12_sync = true;

	/* in case we halted in L2 */
	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;

	if (likely(!vmx->fail)) {
		/*
		 * TODO: SDM says that with acknowledge interrupt on
		 * exit, bit 31 of the VM-exit interrupt information
		 * (valid interrupt) is always set to 1 on
		 * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't
		 * need kvm_cpu_has_interrupt().  See the commit
		 * message for details.
		 */
		if (nested_exit_intr_ack_set(vcpu) &&
		    exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
		    kvm_cpu_has_interrupt(vcpu)) {
			int irq = kvm_cpu_get_interrupt(vcpu);
			WARN_ON(irq < 0);
			vmcs12->vm_exit_intr_info = irq |
				INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
		}

		if (exit_reason != -1)
			trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
						       vmcs12->exit_qualification,
						       vmcs12->idt_vectoring_info_field,
						       vmcs12->vm_exit_intr_info,
						       vmcs12->vm_exit_intr_error_code,
						       KVM_ISA_VMX);

		load_vmcs12_host_state(vcpu, vmcs12);

		return;
	}

	/*
	 * After an early L2 VM-entry failure, we're now back
	 * in L1 which thinks it just finished a VMLAUNCH or
	 * VMRESUME instruction, so we need to set the failure
	 * flag and the VM-instruction error field of the VMCS
	 * accordingly, and skip the emulated instruction.
	 */
	(void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);

	/*
	 * Restore L1's host state to KVM's software model.  We're here
	 * because a consistency check was caught by hardware, which
	 * means some amount of guest state has been propagated to KVM's
	 * model and needs to be unwound to the host's state.
	 */
	nested_vmx_restore_host_state(vcpu);

	vmx->fail = 0;
}

/*
 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
 */
static void vmx_leave_nested(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu)) {
		to_vmx(vcpu)->nested.nested_run_pending = 0;
		nested_vmx_vmexit(vcpu, -1, 0, 0);
	}
	free_nested(vcpu);
}

static int vmx_check_intercept(struct kvm_vcpu *vcpu,
			       struct x86_instruction_info *info,
			       enum x86_intercept_stage stage)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;

	/*
	 * RDPID causes #UD if disabled through secondary execution controls.
	 * Because it is marked as EmulateOnUD, we need to intercept it here.
	 */
	if (info->intercept == x86_intercept_rdtscp &&
	    !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
		ctxt->exception.vector = UD_VECTOR;
		ctxt->exception.error_code_valid = false;
		return X86EMUL_PROPAGATE_FAULT;
	}

	/* TODO: check more intercepts... */
	return X86EMUL_CONTINUE;
}

#ifdef CONFIG_X86_64
/* (a << shift) / divisor, return 1 if overflow otherwise 0 */
static inline int u64_shl_div_u64(u64 a, unsigned int shift,
				  u64 divisor, u64 *result)
{
	u64 low = a << shift, high = a >> (64 - shift);

	/* To avoid the overflow on divq */
	if (high >= divisor)
		return 1;

	/* low holds the result, high holds the remainder, which is discarded */
	asm("divq %2\n\t" : "=a" (low), "=d" (high) :
	    "rm" (divisor), "0" (low), "1" (high));
	*result = low;

	return 0;
}
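
/*
 * Illustrative sketch (not part of the driver): with 128-bit arithmetic,
 * u64_shl_div_u64() above is equivalent to:
 *
 *	unsigned __int128 prod = (unsigned __int128)a << shift;
 *
 *	if (prod / divisor > (unsigned __int128)U64_MAX)
 *		return 1;			// quotient overflows 64 bits
 *	*result = prod / divisor;
 *	return 0;
 *
 * The divq-based version avoids the 128-bit division by rejecting the
 * case high >= divisor up front, which is exactly the condition under
 * which the 64-bit quotient would overflow (and divq would fault).
 */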

static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
{
	struct vcpu_vmx *vmx;
	u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;

	if (kvm_mwait_in_guest(vcpu->kvm))
		return -EOPNOTSUPP;

	vmx = to_vmx(vcpu);
	tscl = rdtsc();
	guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
	delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
	lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns);

	if (delta_tsc > lapic_timer_advance_cycles)
		delta_tsc -= lapic_timer_advance_cycles;
	else
		delta_tsc = 0;

	/* Convert to host delta tsc if tsc scaling is enabled */
	if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
	    u64_shl_div_u64(delta_tsc,
			    kvm_tsc_scaling_ratio_frac_bits,
			    vcpu->arch.tsc_scaling_ratio,
			    &delta_tsc))
		return -ERANGE;

	/*
	 * If the delta tsc can't fit in 32 bits after the preemption timer
	 * multiplier shift, we can't use the preemption timer.
	 * It's possible that it fits on later vmentries, but checking
	 * on every vmentry is costly so we just use an hrtimer.
	 */
	if (delta_tsc >> (cpu_preemption_timer_multi + 32))
		return -ERANGE;

	vmx->hv_deadline_tsc = tscl + delta_tsc;
	return delta_tsc == 0;
}
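
/*
 * Worked example (a sketch, with assumed numbers): the guard above rejects
 * any delta_tsc with bits set at or above (cpu_preemption_timer_multi + 32),
 * because the VMX-preemption timer counts down a 32-bit value that is
 * decremented once every 2^cpu_preemption_timer_multi TSC ticks.  Assuming
 * a 2.5 GHz TSC and cpu_preemption_timer_multi == 5, the longest
 * programmable deadline is roughly 2^(32 + 5) / 2.5e9 ~= 55 seconds;
 * anything further out falls back to the hrtimer path via -ERANGE.
 */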

static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
{
	to_vmx(vcpu)->hv_deadline_tsc = -1;
}
#endif

static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
	if (!kvm_pause_in_guest(vcpu->kvm))
		shrink_ple_window(vcpu);
}

static void vmx_slot_enable_log_dirty(struct kvm *kvm,
				      struct kvm_memory_slot *slot)
{
	kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
	kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
}

static void vmx_slot_disable_log_dirty(struct kvm *kvm,
				       struct kvm_memory_slot *slot)
{
	kvm_mmu_slot_set_dirty(kvm, slot);
}

static void vmx_flush_log_dirty(struct kvm *kvm)
{
	kvm_flush_pml_buffers(kvm);
}

static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	gpa_t gpa;
	struct page *page = NULL;
	u64 *pml_address;

	if (is_guest_mode(vcpu)) {
		WARN_ON_ONCE(vmx->nested.pml_full);

		/*
		 * Check if PML is enabled for the nested guest.
		 * Whether eptp bit 6 is set is already checked
		 * as part of A/D emulation.
		 */
		vmcs12 = get_vmcs12(vcpu);
		if (!nested_cpu_has_pml(vmcs12))
			return 0;

		if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
			vmx->nested.pml_full = true;
			return 1;
		}

		gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;

		page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address);
		if (is_error_page(page))
			return 0;

		pml_address = kmap(page);
		pml_address[vmcs12->guest_pml_index--] = gpa;
		kunmap(page);
		kvm_release_page_clean(page);
	}

	return 0;
}

static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
					   struct kvm_memory_slot *memslot,
					   gfn_t offset, unsigned long mask)
{
	kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
}

static void __pi_post_block(struct kvm_vcpu *vcpu)
{
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
	struct pi_desc old, new;
	unsigned int dest;

	do {
		old.control = new.control = pi_desc->control;
		WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
		     "Wakeup handler not enabled while the VCPU is blocked\n");

		dest = cpu_physical_id(vcpu->cpu);

		if (x2apic_enabled())
			new.ndst = dest;
		else
			new.ndst = (dest << 8) & 0xFF00;

		/* set 'NV' to 'notification vector' */
		new.nv = POSTED_INTR_VECTOR;
	} while (cmpxchg64(&pi_desc->control, old.control,
			   new.control) != old.control);

	if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
		spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
		list_del(&vcpu->blocked_vcpu_list);
		spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
		vcpu->pre_pcpu = -1;
	}
}
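
/*
 * Illustrative sketch (not driver code): the 'ndst' update above follows
 * the posted-interrupt descriptor's destination encoding.  In x2APIC mode
 * the full 32-bit APIC ID is stored as-is; in xAPIC mode the 8-bit APIC ID
 * occupies bits 15:8, hence the "(dest << 8) & 0xFF00":
 *
 *	static u32 pi_encode_ndst(u32 apic_id, bool x2apic)	// hypothetical helper
 *	{
 *		return x2apic ? apic_id : (apic_id << 8) & 0xFF00;
 *	}
 */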

/*
 * This routine does the following things for a vCPU that is about to be
 * blocked if VT-d PI is enabled:
 * - Store the vCPU on the wakeup list, so when interrupts happen
 *   we can find the right vCPU to wake up.
 * - Change the Posted-interrupt descriptor as below:
 *      'NDST' <-- vcpu->pre_pcpu
 *      'NV' <-- POSTED_INTR_WAKEUP_VECTOR
 * - If 'ON' is set during this process, which means at least one
 *   interrupt is posted for this vCPU, we cannot block it; in
 *   this case, return 1, otherwise return 0.
 */
static int pi_pre_block(struct kvm_vcpu *vcpu)
{
	unsigned int dest;
	struct pi_desc old, new;
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);

	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
	    !irq_remapping_cap(IRQ_POSTING_CAP) ||
	    !kvm_vcpu_apicv_active(vcpu))
		return 0;

	WARN_ON(irqs_disabled());
	local_irq_disable();
	if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
		vcpu->pre_pcpu = vcpu->cpu;
		spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
		list_add_tail(&vcpu->blocked_vcpu_list,
			      &per_cpu(blocked_vcpu_on_cpu,
				       vcpu->pre_pcpu));
		spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
	}

	do {
		old.control = new.control = pi_desc->control;

		WARN((pi_desc->sn == 1),
		     "Warning: SN field of posted-interrupts "
		     "is set before blocking\n");

		/*
		 * Since the vCPU can be preempted during this process,
		 * vcpu->cpu could differ from pre_pcpu.  Use pre_pcpu as
		 * the destination of the wakeup notification event so the
		 * wakeup handler can find the right vCPU if an interrupt
		 * arrives while the vCPU is blocked.
		 */
		dest = cpu_physical_id(vcpu->pre_pcpu);

		if (x2apic_enabled())
			new.ndst = dest;
		else
			new.ndst = (dest << 8) & 0xFF00;

		/* set 'NV' to 'wakeup vector' */
		new.nv = POSTED_INTR_WAKEUP_VECTOR;
	} while (cmpxchg64(&pi_desc->control, old.control,
			   new.control) != old.control);

	/* We should not block the vCPU if an interrupt is posted for it. */
	if (pi_test_on(pi_desc) == 1)
		__pi_post_block(vcpu);

	local_irq_enable();
	return (vcpu->pre_pcpu == -1);
}

static int vmx_pre_block(struct kvm_vcpu *vcpu)
{
	if (pi_pre_block(vcpu))
		return 1;

	if (kvm_lapic_hv_timer_in_use(vcpu))
		kvm_lapic_switch_to_sw_timer(vcpu);

	return 0;
}

static void pi_post_block(struct kvm_vcpu *vcpu)
{
	if (vcpu->pre_pcpu == -1)
		return;

	WARN_ON(irqs_disabled());
	local_irq_disable();
	__pi_post_block(vcpu);
	local_irq_enable();
}

static void vmx_post_block(struct kvm_vcpu *vcpu)
{
	if (kvm_x86_ops->set_hv_timer)
		kvm_lapic_switch_to_hv_timer(vcpu);

	pi_post_block(vcpu);
}

/*
 * vmx_update_pi_irte - set IRTE for Posted-Interrupts
 *
 * @kvm: kvm
 * @host_irq: host irq of the interrupt
 * @guest_irq: gsi of the interrupt
 * @set: set or unset PI
 * returns 0 on success, < 0 on failure
 */
static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
			      uint32_t guest_irq, bool set)
{
	struct kvm_kernel_irq_routing_entry *e;
	struct kvm_irq_routing_table *irq_rt;
	struct kvm_lapic_irq irq;
	struct kvm_vcpu *vcpu;
	struct vcpu_data vcpu_info;
	int idx, ret = 0;

	if (!kvm_arch_has_assigned_device(kvm) ||
	    !irq_remapping_cap(IRQ_POSTING_CAP) ||
	    !kvm_vcpu_apicv_active(kvm->vcpus[0]))
		return 0;

	idx = srcu_read_lock(&kvm->irq_srcu);
	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
	if (guest_irq >= irq_rt->nr_rt_entries ||
	    hlist_empty(&irq_rt->map[guest_irq])) {
		pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
			     guest_irq, irq_rt->nr_rt_entries);
		goto out;
	}

	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
		if (e->type != KVM_IRQ_ROUTING_MSI)
			continue;
		/*
		 * VT-d PI cannot post multicast/broadcast interrupts to a
		 * vCPU, so interrupt remapping is still used for those.
		 *
		 * For lowest-priority interrupts, we only support
		 * those with a single CPU as the destination, e.g. the user
		 * configures the interrupts via /proc/irq or uses
		 * irqbalance to make the interrupts single-CPU.
		 *
		 * We will support full lowest-priority interrupts later.
		 */
		kvm_set_msi_irq(kvm, e, &irq);
		if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
			/*
			 * Make sure the IRTE is in remapped mode if
			 * we don't handle it in posted mode.
			 */
			ret = irq_set_vcpu_affinity(host_irq, NULL);
			if (ret < 0) {
				printk(KERN_INFO
				       "failed to fall back to remapped mode, irq: %u\n",
				       host_irq);
				goto out;
			}

			continue;
		}

		vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
		vcpu_info.vector = irq.vector;

		trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
					 vcpu_info.vector, vcpu_info.pi_desc_addr, set);

		if (set)
			ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
		else
			ret = irq_set_vcpu_affinity(host_irq, NULL);

		if (ret < 0) {
			printk(KERN_INFO "%s: failed to update PI IRTE\n",
			       __func__);
			goto out;
		}
	}

	ret = 0;
out:
	srcu_read_unlock(&kvm->irq_srcu, idx);
	return ret;
}

static void vmx_setup_mce(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.mcg_cap & MCG_LMCE_P)
		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
			FEATURE_CONTROL_LMCE;
	else
		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
			~FEATURE_CONTROL_LMCE;
}

static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
{
	/* we need a nested vmexit to enter SMM, postpone if run is pending */
	if (to_vmx(vcpu)->nested.nested_run_pending)
		return 0;
	return 1;
}

static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
	if (vmx->nested.smm.guest_mode)
		nested_vmx_vmexit(vcpu, -1, 0, 0);

	vmx->nested.smm.vmxon = vmx->nested.vmxon;
	vmx->nested.vmxon = false;
	vmx_clear_hlt(vcpu);
	return 0;
}

static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int ret;

	if (vmx->nested.smm.vmxon) {
		vmx->nested.vmxon = true;
		vmx->nested.smm.vmxon = false;
	}

	if (vmx->nested.smm.guest_mode) {
		vcpu->arch.hflags &= ~HF_SMM_MASK;
		ret = nested_vmx_enter_non_root_mode(vcpu, false);
		vcpu->arch.hflags |= HF_SMM_MASK;
		if (ret)
			return ret;

		vmx->nested.smm.guest_mode = false;
	}
	return 0;
}

static int enable_smi_window(struct kvm_vcpu *vcpu)
{
	return 0;
}

static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * In case we do two consecutive get/set_nested_state()s while L2 was
	 * running, hv_evmcs may end up not being mapped (we map it from
	 * nested_vmx_run()/vmx_vcpu_run()).  Check is_guest_mode() as we
	 * always have a vmcs12 if it is true.
	 */
	return is_guest_mode(vcpu) || vmx->nested.current_vmptr != -1ull ||
		vmx->nested.hv_evmcs;
}

static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				u32 user_data_size)
{
	struct vcpu_vmx *vmx;
	struct vmcs12 *vmcs12;
	struct kvm_nested_state kvm_state = {
		.flags = 0,
		.format = 0,
		.size = sizeof(kvm_state),
		.vmx.vmxon_pa = -1ull,
		.vmx.vmcs_pa = -1ull,
	};

	if (!vcpu)
		return kvm_state.size + 2 * VMCS12_SIZE;

	vmx = to_vmx(vcpu);
	vmcs12 = get_vmcs12(vcpu);

	if (nested_vmx_allowed(vcpu) && vmx->nested.enlightened_vmcs_enabled)
		kvm_state.flags |= KVM_STATE_NESTED_EVMCS;

	if (nested_vmx_allowed(vcpu) &&
	    (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
		kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
		kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr;

		if (vmx_has_valid_vmcs12(vcpu)) {
			kvm_state.size += VMCS12_SIZE;

			if (is_guest_mode(vcpu) &&
			    nested_cpu_has_shadow_vmcs(vmcs12) &&
			    vmcs12->vmcs_link_pointer != -1ull)
				kvm_state.size += VMCS12_SIZE;
		}

		if (vmx->nested.smm.vmxon)
			kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;

		if (vmx->nested.smm.guest_mode)
			kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;

		if (is_guest_mode(vcpu)) {
			kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;

			if (vmx->nested.nested_run_pending)
				kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
		}
	}

	if (user_data_size < kvm_state.size)
		goto out;

	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
		return -EFAULT;

	if (!vmx_has_valid_vmcs12(vcpu))
		goto out;

	/*
	 * When running L2, the authoritative vmcs12 state is in the
	 * vmcs02.  When running L1, the authoritative vmcs12 state is
	 * in the shadow or enlightened vmcs linked to vmcs01, unless
	 * need_vmcs12_sync is set, in which case the authoritative
	 * vmcs12 state is in the vmcs12 already.
	 */
	if (is_guest_mode(vcpu)) {
		sync_vmcs12(vcpu, vmcs12);
	} else if (!vmx->nested.need_vmcs12_sync) {
		if (vmx->nested.hv_evmcs)
			copy_enlightened_to_vmcs12(vmx);
		else if (enable_shadow_vmcs)
			copy_shadow_to_vmcs12(vmx);
	}

	if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12)))
		return -EFAULT;

	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != -1ull) {
		if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE,
				 get_shadow_vmcs12(vcpu), sizeof(*vmcs12)))
			return -EFAULT;
	}

out:
	return kvm_state.size;
}
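
/*
 * Usage sketch (userspace side, assumptions noted): the handler above backs
 * the KVM_GET_NESTED_STATE vcpu ioctl.  A VMM would typically size the
 * buffer from KVM_CHECK_EXTENSION(KVM_CAP_NESTED_STATE) and then do
 * something like:
 *
 *	int max = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_NESTED_STATE);
 *	struct kvm_nested_state *state = calloc(1, max);
 *
 *	state->size = max;
 *	if (ioctl(vcpu_fd, KVM_GET_NESTED_STATE, state) < 0)
 *		err(1, "KVM_GET_NESTED_STATE");
 *
 * On success, the header copied back contains the actual size used, and the
 * same buffer can later be handed back through KVM_SET_NESTED_STATE.
 */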

static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				struct kvm_nested_state *kvm_state)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12;
	u32 exit_qual;
	int ret;

	if (kvm_state->format != 0)
		return -EINVAL;

	if (kvm_state->flags & KVM_STATE_NESTED_EVMCS)
		nested_enable_evmcs(vcpu, NULL);

	if (!nested_vmx_allowed(vcpu))
		return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL;

	if (kvm_state->vmx.vmxon_pa == -1ull) {
		if (kvm_state->vmx.smm.flags)
			return -EINVAL;

		if (kvm_state->vmx.vmcs_pa != -1ull)
			return -EINVAL;

		vmx_leave_nested(vcpu);
		return 0;
	}

	if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa))
		return -EINVAL;

	if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return -EINVAL;

	if (kvm_state->vmx.smm.flags &
	    ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	/*
	 * SMM temporarily disables VMX, so we cannot be in guest mode,
	 * nor can VMLAUNCH/VMRESUME be pending.  Outside SMM, SMM flags
	 * must be zero.
	 */
	if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags)
		return -EINVAL;

	if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	vmx_leave_nested(vcpu);
	if (kvm_state->vmx.vmxon_pa == -1ull)
		return 0;

	vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa;
	ret = enter_vmx_operation(vcpu);
	if (ret)
		return ret;

	/* Empty 'VMXON' state is permitted */
	if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12))
		return 0;

	if (kvm_state->vmx.vmcs_pa != -1ull) {
		if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||
		    !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))
			return -EINVAL;

		set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);
	} else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
		/*
		 * Sync eVMCS upon entry as we may not have
		 * HV_X64_MSR_VP_ASSIST_PAGE set up yet.
		 */
		vmx->nested.need_vmcs12_sync = true;
	} else {
		return -EINVAL;
	}

	if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
		vmx->nested.smm.vmxon = true;
		vmx->nested.vmxon = false;

		if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
			vmx->nested.smm.guest_mode = true;
	}

	vmcs12 = get_vmcs12(vcpu);
	if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12)))
		return -EFAULT;

	if (vmcs12->hdr.revision_id != VMCS12_REVISION)
		return -EINVAL;

	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return 0;

	vmx->nested.nested_run_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);

	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != -1ull) {
		struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);

		if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12))
			return -EINVAL;

		if (copy_from_user(shadow_vmcs12,
				   user_kvm_nested_state->data + VMCS12_SIZE,
				   sizeof(*vmcs12)))
			return -EFAULT;

		if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
		    !shadow_vmcs12->hdr.shadow_vmcs)
			return -EINVAL;
	}

	if (check_vmentry_prereqs(vcpu, vmcs12) ||
	    check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
		return -EINVAL;

	vmx->nested.dirty_vmcs12 = true;
	ret = nested_vmx_enter_non_root_mode(vcpu, false);
	if (ret)
		return -EINVAL;

	return 0;
}

static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
	.cpu_has_kvm_support = cpu_has_kvm_support,
	.disabled_by_bios = vmx_disabled_by_bios,
	.hardware_setup = hardware_setup,
	.hardware_unsetup = hardware_unsetup,
	.check_processor_compatibility = vmx_check_processor_compat,
	.hardware_enable = hardware_enable,
	.hardware_disable = hardware_disable,
	.cpu_has_accelerated_tpr = report_flexpriority,
	.has_emulated_msr = vmx_has_emulated_msr,

	.vm_init = vmx_vm_init,
	.vm_alloc = vmx_vm_alloc,
	.vm_free = vmx_vm_free,

	.vcpu_create = vmx_create_vcpu,
	.vcpu_free = vmx_free_vcpu,
	.vcpu_reset = vmx_vcpu_reset,

	.prepare_guest_switch = vmx_prepare_switch_to_guest,
	.vcpu_load = vmx_vcpu_load,
	.vcpu_put = vmx_vcpu_put,

	.update_bp_intercept = update_exception_bitmap,
	.get_msr_feature = vmx_get_msr_feature,
	.get_msr = vmx_get_msr,
	.set_msr = vmx_set_msr,
	.get_segment_base = vmx_get_segment_base,
	.get_segment = vmx_get_segment,
	.set_segment = vmx_set_segment,
	.get_cpl = vmx_get_cpl,
	.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
	.decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
	.decache_cr3 = vmx_decache_cr3,
	.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
	.set_cr0 = vmx_set_cr0,
	.set_cr3 = vmx_set_cr3,
	.set_cr4 = vmx_set_cr4,
	.set_efer = vmx_set_efer,
	.get_idt = vmx_get_idt,
	.set_idt = vmx_set_idt,
	.get_gdt = vmx_get_gdt,
	.set_gdt = vmx_set_gdt,
	.get_dr6 = vmx_get_dr6,
	.set_dr6 = vmx_set_dr6,
	.set_dr7 = vmx_set_dr7,
	.sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
	.cache_reg = vmx_cache_reg,
	.get_rflags = vmx_get_rflags,
	.set_rflags = vmx_set_rflags,

	.tlb_flush = vmx_flush_tlb,
	.tlb_flush_gva = vmx_flush_tlb_gva,

	.run = vmx_vcpu_run,
	.handle_exit = vmx_handle_exit,
	.skip_emulated_instruction = skip_emulated_instruction,
	.set_interrupt_shadow = vmx_set_interrupt_shadow,
	.get_interrupt_shadow = vmx_get_interrupt_shadow,
	.patch_hypercall = vmx_patch_hypercall,
	.set_irq = vmx_inject_irq,
	.set_nmi = vmx_inject_nmi,
	.queue_exception = vmx_queue_exception,
	.cancel_injection = vmx_cancel_injection,
	.interrupt_allowed = vmx_interrupt_allowed,
	.nmi_allowed = vmx_nmi_allowed,
	.get_nmi_mask = vmx_get_nmi_mask,
	.set_nmi_mask = vmx_set_nmi_mask,
	.enable_nmi_window = enable_nmi_window,
	.enable_irq_window = enable_irq_window,
	.update_cr8_intercept = update_cr8_intercept,
	.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
	.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
	.get_enable_apicv = vmx_get_enable_apicv,
	.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
	.load_eoi_exitmap = vmx_load_eoi_exitmap,
	.apicv_post_state_restore = vmx_apicv_post_state_restore,
	.hwapic_irr_update = vmx_hwapic_irr_update,
	.hwapic_isr_update = vmx_hwapic_isr_update,
	.guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
	.sync_pir_to_irr = vmx_sync_pir_to_irr,
	.deliver_posted_interrupt = vmx_deliver_posted_interrupt,

	.set_tss_addr = vmx_set_tss_addr,
	.set_identity_map_addr = vmx_set_identity_map_addr,
	.get_tdp_level = get_ept_level,
	.get_mt_mask = vmx_get_mt_mask,

	.get_exit_info = vmx_get_exit_info,

	.get_lpage_level = vmx_get_lpage_level,

	.cpuid_update = vmx_cpuid_update,

	.rdtscp_supported = vmx_rdtscp_supported,
	.invpcid_supported = vmx_invpcid_supported,

	.set_supported_cpuid = vmx_set_supported_cpuid,

	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,

	.read_l1_tsc_offset = vmx_read_l1_tsc_offset,
	.write_tsc_offset = vmx_write_tsc_offset,

	.set_tdp_cr3 = vmx_set_cr3,

	.check_intercept = vmx_check_intercept,
	.handle_external_intr = vmx_handle_external_intr,
	.mpx_supported = vmx_mpx_supported,
	.xsaves_supported = vmx_xsaves_supported,
	.umip_emulated = vmx_umip_emulated,

	.check_nested_events = vmx_check_nested_events,
	.request_immediate_exit = vmx_request_immediate_exit,

	.sched_in = vmx_sched_in,

	.slot_enable_log_dirty = vmx_slot_enable_log_dirty,
	.slot_disable_log_dirty = vmx_slot_disable_log_dirty,
	.flush_log_dirty = vmx_flush_log_dirty,
	.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
	.write_log_dirty = vmx_write_pml_buffer,

	.pre_block = vmx_pre_block,
	.post_block = vmx_post_block,

	.pmu_ops = &intel_pmu_ops,

	.update_pi_irte = vmx_update_pi_irte,

#ifdef CONFIG_X86_64
	.set_hv_timer = vmx_set_hv_timer,
	.cancel_hv_timer = vmx_cancel_hv_timer,
#endif

	.setup_mce = vmx_setup_mce,

	.get_nested_state = vmx_get_nested_state,
	.set_nested_state = vmx_set_nested_state,
	.get_vmcs12_pages = nested_get_vmcs12_pages,

	.smi_allowed = vmx_smi_allowed,
	.pre_enter_smm = vmx_pre_enter_smm,
	.pre_leave_smm = vmx_pre_leave_smm,
	.enable_smi_window = enable_smi_window,

	.nested_enable_evmcs = nested_enable_evmcs,
};
15107
72c6d2db 15108static void vmx_cleanup_l1d_flush(void)
a47dd5f0
PB
15109{
15110 if (vmx_l1d_flush_pages) {
15111 free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
15112 vmx_l1d_flush_pages = NULL;
15113 }
72c6d2db
TG
15114 /* Restore state so sysfs ignores VMX */
15115 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
a399477e
KRW
15116}
15117
a7b9020b
TG
15118static void vmx_exit(void)
15119{
15120#ifdef CONFIG_KEXEC_CORE
15121 RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
15122 synchronize_rcu();
15123#endif
15124
15125 kvm_exit();
15126
15127#if IS_ENABLED(CONFIG_HYPERV)
15128 if (static_branch_unlikely(&enable_evmcs)) {
15129 int cpu;
15130 struct hv_vp_assist_page *vp_ap;
15131 /*
15132 * Reset everything to support using non-enlightened VMCS
15133 * access later (e.g. when we reload the module with
15134 * enlightened_vmcs=0)
15135 */
15136 for_each_online_cpu(cpu) {
15137 vp_ap = hv_get_vp_assist_page(cpu);
15138
15139 if (!vp_ap)
15140 continue;
15141
15142 vp_ap->current_nested_vmcs = 0;
15143 vp_ap->enlighten_vmentry = 0;
15144 }
15145
15146 static_branch_disable(&enable_evmcs);
15147 }
15148#endif
15149 vmx_cleanup_l1d_flush();
15150}
15151module_exit(vmx_exit);
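
/*
 * Usage note (a sketch, not kernel code): the eVMCS reset in vmx_exit()
 * above exists so the module can be reloaded with enlightened VMCS turned
 * off, e.g. something like:
 *
 *	# rmmod kvm_intel
 *	# modprobe kvm_intel enlightened_vmcs=0
 *
 * The module and parameter names assumed here are the ones built from this
 * file (kvm_intel / enlightened_vmcs); adjust to the local configuration.
 */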
15152
6aa8b732
AK
15153static int __init vmx_init(void)
15154{
773e8a04
VK
15155 int r;
15156
15157#if IS_ENABLED(CONFIG_HYPERV)
15158 /*
15159 * Enlightened VMCS usage should be recommended and the host needs
15160 * to support eVMCS v1 or above. We can also disable eVMCS support
15161 * with module parameter.
15162 */
15163 if (enlightened_vmcs &&
15164 ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
15165 (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
15166 KVM_EVMCS_VERSION) {
15167 int cpu;
15168
15169 /* Check that we have assist pages on all online CPUs */
15170 for_each_online_cpu(cpu) {
15171 if (!hv_get_vp_assist_page(cpu)) {
15172 enlightened_vmcs = false;
15173 break;
15174 }
15175 }
15176
15177 if (enlightened_vmcs) {
15178 pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
15179 static_branch_enable(&enable_evmcs);
15180 }
15181 } else {
15182 enlightened_vmcs = false;
15183 }
15184#endif
15185
15186 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
a7b9020b 15187 __alignof__(struct vcpu_vmx), THIS_MODULE);
fdef3ad1 15188 if (r)
34a1cd60 15189 return r;
25c5f225 15190
a7b9020b 15191 /*
7db92e16
TG
15192 * Must be called after kvm_init() so enable_ept is properly set
15193 * up. Hand the parameter mitigation value in which was stored in
15194 * the pre module init parser. If no parameter was given, it will
15195 * contain 'auto' which will be turned into the default 'cond'
15196 * mitigation mode.
15197 */
15198 if (boot_cpu_has(X86_BUG_L1TF)) {
15199 r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
15200 if (r) {
15201 vmx_exit();
15202 return r;
15203 }
a47dd5f0 15204 }
25c5f225 15205
2965faa5 15206#ifdef CONFIG_KEXEC_CORE
8f536b76
ZY
15207 rcu_assign_pointer(crash_vmclear_loaded_vmcss,
15208 crash_vmclear_local_loaded_vmcss);
15209#endif
21ebf53b 15210 vmx_check_vmcs12_offsets();
8f536b76 15211
fdef3ad1 15212 return 0;
6aa8b732 15213}
a7b9020b 15214module_init(vmx_init);