Merge branches 'pm-devfreq', 'pm-qos', 'pm-tools' and 'pm-docs'
[linux-2.6-block.git] / arch / x86 / kvm / vmx / nested.c
CommitLineData
55d2375e
SC
1// SPDX-License-Identifier: GPL-2.0
2
00089c04 3#include <linux/objtool.h>
55d2375e
SC
4#include <linux/percpu.h>
5
6#include <asm/debugreg.h>
7#include <asm/mmu_context.h>
8
9#include "cpuid.h"
6cbbaab6 10#include "evmcs.h"
55d2375e
SC
11#include "hyperv.h"
12#include "mmu.h"
13#include "nested.h"
bfc6ad6a 14#include "pmu.h"
72add915 15#include "sgx.h"
55d2375e 16#include "trace.h"
150f17bf 17#include "vmx.h"
55d2375e
SC
18#include "x86.h"
19
20static bool __read_mostly enable_shadow_vmcs = 1;
21module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
22
23static bool __read_mostly nested_early_check = 0;
24module_param(nested_early_check, bool, S_IRUGO);
25
648fc8ae 26#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
5497b955 27
55d2375e
SC
28/*
29 * Hyper-V requires all of these, so mark them as supported even though
30 * they are just treated the same as all-context.
31 */
32#define VMX_VPID_EXTENT_SUPPORTED_MASK \
33 (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \
34 VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \
35 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \
36 VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
37
38#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
39
40enum {
41 VMX_VMREAD_BITMAP,
42 VMX_VMWRITE_BITMAP,
43 VMX_BITMAP_NR
44};
45static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
46
47#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
48#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])
49
1c6f0b47
SC
50struct shadow_vmcs_field {
51 u16 encoding;
52 u16 offset;
53};
54static struct shadow_vmcs_field shadow_read_only_fields[] = {
55#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
55d2375e
SC
56#include "vmcs_shadow_fields.h"
57};
58static int max_shadow_read_only_fields =
59 ARRAY_SIZE(shadow_read_only_fields);
60
1c6f0b47
SC
61static struct shadow_vmcs_field shadow_read_write_fields[] = {
62#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
55d2375e
SC
63#include "vmcs_shadow_fields.h"
64};
65static int max_shadow_read_write_fields =
66 ARRAY_SIZE(shadow_read_write_fields);
67
8997f657 68static void init_vmcs_shadow_fields(void)
55d2375e
SC
69{
70 int i, j;
71
72 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
73 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
74
75 for (i = j = 0; i < max_shadow_read_only_fields; i++) {
1c6f0b47
SC
76 struct shadow_vmcs_field entry = shadow_read_only_fields[i];
77 u16 field = entry.encoding;
55d2375e
SC
78
79 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
80 (i + 1 == max_shadow_read_only_fields ||
1c6f0b47 81 shadow_read_only_fields[i + 1].encoding != field + 1))
55d2375e
SC
82 pr_err("Missing field from shadow_read_only_field %x\n",
83 field + 1);
84
85 clear_bit(field, vmx_vmread_bitmap);
55d2375e 86 if (field & 1)
1c6f0b47 87#ifdef CONFIG_X86_64
55d2375e 88 continue;
1c6f0b47
SC
89#else
90 entry.offset += sizeof(u32);
55d2375e 91#endif
1c6f0b47 92 shadow_read_only_fields[j++] = entry;
55d2375e
SC
93 }
94 max_shadow_read_only_fields = j;
95
96 for (i = j = 0; i < max_shadow_read_write_fields; i++) {
1c6f0b47
SC
97 struct shadow_vmcs_field entry = shadow_read_write_fields[i];
98 u16 field = entry.encoding;
55d2375e
SC
99
100 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
101 (i + 1 == max_shadow_read_write_fields ||
1c6f0b47 102 shadow_read_write_fields[i + 1].encoding != field + 1))
55d2375e
SC
103 pr_err("Missing field from shadow_read_write_field %x\n",
104 field + 1);
105
b6437805
SC
106 WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
107 field <= GUEST_TR_AR_BYTES,
1c6f0b47 108 "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
b6437805 109
55d2375e
SC
110 /*
111 * PML and the preemption timer can be emulated, but the
112 * processor cannot vmwrite to fields that don't exist
113 * on bare metal.
114 */
115 switch (field) {
116 case GUEST_PML_INDEX:
117 if (!cpu_has_vmx_pml())
118 continue;
119 break;
120 case VMX_PREEMPTION_TIMER_VALUE:
121 if (!cpu_has_vmx_preemption_timer())
122 continue;
123 break;
124 case GUEST_INTR_STATUS:
125 if (!cpu_has_vmx_apicv())
126 continue;
127 break;
128 default:
129 break;
130 }
131
132 clear_bit(field, vmx_vmwrite_bitmap);
133 clear_bit(field, vmx_vmread_bitmap);
55d2375e 134 if (field & 1)
1c6f0b47 135#ifdef CONFIG_X86_64
55d2375e 136 continue;
1c6f0b47
SC
137#else
138 entry.offset += sizeof(u32);
55d2375e 139#endif
1c6f0b47 140 shadow_read_write_fields[j++] = entry;
55d2375e
SC
141 }
142 max_shadow_read_write_fields = j;
143}
144
145/*
146 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
147 * set the success or error code of an emulated VMX instruction (as specified
148 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
149 * instruction.
150 */
151static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
152{
153 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
154 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
155 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
156 return kvm_skip_emulated_instruction(vcpu);
157}
158
159static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
160{
161 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
162 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
163 X86_EFLAGS_SF | X86_EFLAGS_OF))
164 | X86_EFLAGS_CF);
165 return kvm_skip_emulated_instruction(vcpu);
166}
167
168static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
169 u32 vm_instruction_error)
170{
55d2375e
SC
171 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
172 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
173 X86_EFLAGS_SF | X86_EFLAGS_OF))
174 | X86_EFLAGS_ZF);
175 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
176 /*
b7685cfd
VK
177 * We don't need to force sync to shadow VMCS because
178 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
179 * fields and thus must be synced.
55d2375e 180 */
b7685cfd
VK
181 if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
182 to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;
183
55d2375e
SC
184 return kvm_skip_emulated_instruction(vcpu);
185}
186
b2656e4d
SC
187static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
188{
189 struct vcpu_vmx *vmx = to_vmx(vcpu);
190
191 /*
192 * failValid writes the error number to the current VMCS, which
193 * can't be done if there isn't a current VMCS.
194 */
64c78508 195 if (vmx->nested.current_vmptr == INVALID_GPA &&
1e9dfbd7 196 !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
b2656e4d
SC
197 return nested_vmx_failInvalid(vcpu);
198
199 return nested_vmx_failValid(vcpu, vm_instruction_error);
200}
201
55d2375e
SC
202static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
203{
204 /* TODO: not to reset guest simply here. */
205 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
206 pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
207}
208
f0b5105a
MO
209static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
210{
211 return fixed_bits_valid(control, low, high);
212}
213
214static inline u64 vmx_control_msr(u32 low, u32 high)
215{
216 return low | ((u64)high << 32);
217}
218
55d2375e
SC
219static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
220{
fe7f895d 221 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
64c78508 222 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
88dddc11 223 vmx->nested.need_vmcs12_to_shadow_sync = false;
55d2375e
SC
224}
225
226static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
227{
228 struct vcpu_vmx *vmx = to_vmx(vcpu);
229
1e9dfbd7
VK
230 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
231 kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
232 vmx->nested.hv_evmcs = NULL;
233 }
55d2375e 234
1e9dfbd7 235 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
55d2375e
SC
236}
237
c61ca2fc
SC
238static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
239 struct loaded_vmcs *prev)
240{
241 struct vmcs_host_state *dest, *src;
242
243 if (unlikely(!vmx->guest_state_loaded))
244 return;
245
246 src = &prev->host_state;
247 dest = &vmx->loaded_vmcs->host_state;
248
bca06b85 249 vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
c61ca2fc
SC
250 dest->ldt_sel = src->ldt_sel;
251#ifdef CONFIG_X86_64
252 dest->ds_sel = src->ds_sel;
253 dest->es_sel = src->es_sel;
254#endif
255}
256
257static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
258{
259 struct vcpu_vmx *vmx = to_vmx(vcpu);
260 struct loaded_vmcs *prev;
261 int cpu;
262
138534a8 263 if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
c61ca2fc
SC
264 return;
265
266 cpu = get_cpu();
267 prev = vmx->loaded_vmcs;
268 vmx->loaded_vmcs = vmcs;
269 vmx_vcpu_load_vmcs(vcpu, cpu, prev);
270 vmx_sync_vmcs_host_state(vmx, prev);
271 put_cpu();
272
41e68b69
PB
273 vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET;
274
275 /*
276 * All lazily updated registers will be reloaded from VMCS12 on both
277 * vmentry and vmexit.
278 */
279 vcpu->arch.regs_dirty = 0;
c61ca2fc
SC
280}
281
55d2375e
SC
282/*
283 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
284 * just stops using VMX.
285 */
286static void free_nested(struct kvm_vcpu *vcpu)
287{
288 struct vcpu_vmx *vmx = to_vmx(vcpu);
289
df82a24b
SC
290 if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
291 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
292
55d2375e
SC
293 if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
294 return;
295
729c15c2 296 kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
cf64527b 297
55d2375e
SC
298 vmx->nested.vmxon = false;
299 vmx->nested.smm.vmxon = false;
feb3162f 300 vmx->nested.vmxon_ptr = INVALID_GPA;
55d2375e
SC
301 free_vpid(vmx->nested.vpid02);
302 vmx->nested.posted_intr_nv = -1;
64c78508 303 vmx->nested.current_vmptr = INVALID_GPA;
55d2375e
SC
304 if (enable_shadow_vmcs) {
305 vmx_disable_shadow_vmcs(vmx);
306 vmcs_clear(vmx->vmcs01.shadow_vmcs);
307 free_vmcs(vmx->vmcs01.shadow_vmcs);
308 vmx->vmcs01.shadow_vmcs = NULL;
309 }
310 kfree(vmx->nested.cached_vmcs12);
c6bf2ae9 311 vmx->nested.cached_vmcs12 = NULL;
55d2375e 312 kfree(vmx->nested.cached_shadow_vmcs12);
c6bf2ae9 313 vmx->nested.cached_shadow_vmcs12 = NULL;
55d2375e
SC
314 /* Unpin physical memory we referred to in the vmcs02 */
315 if (vmx->nested.apic_access_page) {
b11494bc 316 kvm_release_page_clean(vmx->nested.apic_access_page);
55d2375e
SC
317 vmx->nested.apic_access_page = NULL;
318 }
96c66e87 319 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
3278e049
KA
320 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
321 vmx->nested.pi_desc = NULL;
55d2375e 322
0c1c92f1 323 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
55d2375e
SC
324
325 nested_release_evmcs(vcpu);
326
327 free_loaded_vmcs(&vmx->nested.vmcs02);
328}
329
55d2375e
SC
330/*
331 * Ensure that the current vmcs of the logical processor is the
332 * vmcs01 of the vcpu before calling free_nested().
333 */
334void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
335{
336 vcpu_load(vcpu);
b4b65b56 337 vmx_leave_nested(vcpu);
55d2375e
SC
338 vcpu_put(vcpu);
339}
340
85aa8889
JS
341#define EPTP_PA_MASK GENMASK_ULL(51, 12)
342
343static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
344{
345 return VALID_PAGE(root_hpa) &&
346 ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
347}
348
349static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
350 gpa_t addr)
351{
352 uint i;
353 struct kvm_mmu_root_info *cached_root;
354
355 WARN_ON_ONCE(!mmu_is_nested(vcpu));
356
357 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
358 cached_root = &vcpu->arch.mmu->prev_roots[i];
359
360 if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
361 eptp))
362 vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa);
363 }
364}
365
55d2375e
SC
366static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
367 struct x86_exception *fault)
368{
369 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
370 struct vcpu_vmx *vmx = to_vmx(vcpu);
4dcefa31 371 u32 vm_exit_reason;
55d2375e
SC
372 unsigned long exit_qualification = vcpu->arch.exit_qualification;
373
374 if (vmx->nested.pml_full) {
4dcefa31 375 vm_exit_reason = EXIT_REASON_PML_FULL;
55d2375e
SC
376 vmx->nested.pml_full = false;
377 exit_qualification &= INTR_INFO_UNBLOCK_NMI;
85aa8889
JS
378 } else {
379 if (fault->error_code & PFERR_RSVD_MASK)
380 vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
381 else
382 vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
383
384 /*
385 * Although the caller (kvm_inject_emulated_page_fault) would
386 * have already synced the faulting address in the shadow EPT
387 * tables for the current EPTP12, we also need to sync it for
388 * any other cached EPTP02s based on the same EP4TA, since the
389 * TLB associates mappings to the EP4TA rather than the full EPTP.
390 */
391 nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
392 fault->address);
393 }
55d2375e 394
4dcefa31 395 nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
55d2375e
SC
396 vmcs12->guest_physical_address = fault->address;
397}
398
39353ab5
SC
399static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
400{
cc022ae1
LJ
401 struct vcpu_vmx *vmx = to_vmx(vcpu);
402 bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT;
403 int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps);
404
405 kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
39353ab5
SC
406 nested_ept_ad_enabled(vcpu),
407 nested_ept_get_eptp(vcpu));
408}
409
55d2375e
SC
410static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
411{
412 WARN_ON(mmu_is_nested(vcpu));
413
414 vcpu->arch.mmu = &vcpu->arch.guest_mmu;
39353ab5 415 nested_ept_new_eptp(vcpu);
d8dd54e0 416 vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp;
55d2375e
SC
417 vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
418 vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;
419
420 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
421}
422
423static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
424{
425 vcpu->arch.mmu = &vcpu->arch.root_mmu;
426 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
427}
428
429static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
430 u16 error_code)
431{
432 bool inequality, bit;
433
434 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
435 inequality =
436 (error_code & vmcs12->page_fault_error_code_mask) !=
437 vmcs12->page_fault_error_code_match;
438 return inequality ^ bit;
439}
440
441
442/*
443 * KVM wants to inject page-faults which it got to the guest. This function
444 * checks whether in a nested guest, we need to inject them to L1 or L2.
445 */
446static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
447{
448 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
449 unsigned int nr = vcpu->arch.exception.nr;
450 bool has_payload = vcpu->arch.exception.has_payload;
451 unsigned long payload = vcpu->arch.exception.payload;
452
453 if (nr == PF_VECTOR) {
454 if (vcpu->arch.exception.nested_apf) {
455 *exit_qual = vcpu->arch.apf.nested_apf_token;
456 return 1;
457 }
458 if (nested_vmx_is_page_fault_vmexit(vmcs12,
459 vcpu->arch.exception.error_code)) {
460 *exit_qual = has_payload ? payload : vcpu->arch.cr2;
461 return 1;
462 }
463 } else if (vmcs12->exception_bitmap & (1u << nr)) {
464 if (nr == DB_VECTOR) {
465 if (!has_payload) {
466 payload = vcpu->arch.dr6;
9a3ecd5e
CQ
467 payload &= ~DR6_BT;
468 payload ^= DR6_ACTIVE_LOW;
55d2375e
SC
469 }
470 *exit_qual = payload;
471 } else
472 *exit_qual = 0;
473 return 1;
474 }
475
476 return 0;
477}
478
6819af75
SC
479static bool nested_vmx_handle_page_fault_workaround(struct kvm_vcpu *vcpu,
480 struct x86_exception *fault)
55d2375e
SC
481{
482 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
483
484 WARN_ON(!is_guest_mode(vcpu));
485
486 if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
6819af75 487 !WARN_ON_ONCE(to_vmx(vcpu)->nested.nested_run_pending)) {
55d2375e
SC
488 vmcs12->vm_exit_intr_error_code = fault->error_code;
489 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
490 PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
491 INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
492 fault->address);
6819af75 493 return true;
55d2375e 494 }
6819af75 495 return false;
55d2375e
SC
496}
497
55d2375e
SC
498static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
499 struct vmcs12 *vmcs12)
500{
501 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
502 return 0;
503
5497b955
SC
504 if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
505 CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
55d2375e
SC
506 return -EINVAL;
507
508 return 0;
509}
510
511static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
512 struct vmcs12 *vmcs12)
513{
514 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
515 return 0;
516
5497b955 517 if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
55d2375e
SC
518 return -EINVAL;
519
520 return 0;
521}
522
523static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
524 struct vmcs12 *vmcs12)
525{
526 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
527 return 0;
528
5497b955 529 if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
55d2375e
SC
530 return -EINVAL;
531
532 return 0;
533}
534
55d2375e 535/*
a5e0c252
SC
536 * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1
537 * itself utilizing x2APIC. All MSRs were previously set to be intercepted,
538 * only the "disable intercept" case needs to be handled.
55d2375e 539 */
a5e0c252
SC
540static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1,
541 unsigned long *msr_bitmap_l0,
542 u32 msr, int type)
55d2375e 543{
a5e0c252
SC
544 if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr))
545 vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr);
55d2375e 546
a5e0c252
SC
547 if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr))
548 vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr);
55d2375e
SC
549}
550
ffdbd50d
ML
551static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
552{
acff7847
MO
553 int msr;
554
555 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
556 unsigned word = msr / BITS_PER_LONG;
557
558 msr_bitmap[word] = ~0;
559 msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
560 }
561}
562
67f4b996
SC
563#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \
564static inline \
565void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \
566 unsigned long *msr_bitmap_l1, \
567 unsigned long *msr_bitmap_l0, u32 msr) \
568{ \
569 if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \
570 vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \
571 vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \
572 else \
573 vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \
574}
575BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
576BUILD_NVMX_MSR_INTERCEPT_HELPER(write)
577
578static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
579 unsigned long *msr_bitmap_l1,
580 unsigned long *msr_bitmap_l0,
581 u32 msr, int types)
582{
583 if (types & MSR_TYPE_R)
584 nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
585 msr_bitmap_l0, msr);
586 if (types & MSR_TYPE_W)
587 nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
588 msr_bitmap_l0, msr);
589}
590
55d2375e
SC
591/*
592 * Merge L0's and L1's MSR bitmap, return false to indicate that
593 * we do not use the hardware.
594 */
595static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
596 struct vmcs12 *vmcs12)
597{
67f4b996 598 struct vcpu_vmx *vmx = to_vmx(vcpu);
55d2375e 599 int msr;
55d2375e 600 unsigned long *msr_bitmap_l1;
67f4b996 601 unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
502d2bf5 602 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
67f4b996 603 struct kvm_host_map *map = &vmx->nested.msr_bitmap_map;
55d2375e
SC
604
605 /* Nothing to do if the MSR bitmap is not in use. */
606 if (!cpu_has_vmx_msr_bitmap() ||
607 !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
608 return false;
609
502d2bf5
VK
610 /*
611 * MSR bitmap update can be skipped when:
612 * - MSR bitmap for L1 hasn't changed.
613 * - Nested hypervisor (L1) is attempting to launch the same L2 as
614 * before.
615 * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature
616 * and tells KVM (L0) there were no changes in MSR bitmap for L2.
617 */
618 if (!vmx->nested.force_msr_bitmap_recalc && evmcs &&
619 evmcs->hv_enlightenments_control.msr_bitmap &&
620 evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
621 return true;
622
31f0b6c4 623 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
55d2375e
SC
624 return false;
625
31f0b6c4 626 msr_bitmap_l1 = (unsigned long *)map->hva;
55d2375e 627
acff7847
MO
628 /*
629 * To keep the control flow simple, pay eight 8-byte writes (sixteen
630 * 4-byte writes on 32-bit systems) up front to enable intercepts for
a5e0c252 631 * the x2APIC MSR range and selectively toggle those relevant to L2.
acff7847
MO
632 */
633 enable_x2apic_msr_intercepts(msr_bitmap_l0);
634
635 if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
636 if (nested_cpu_has_apic_reg_virt(vmcs12)) {
637 /*
638 * L0 need not intercept reads for MSRs between 0x800
639 * and 0x8ff, it just lets the processor take the value
640 * from the virtual-APIC page; take those 256 bits
641 * directly from the L1 bitmap.
642 */
643 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
644 unsigned word = msr / BITS_PER_LONG;
645
646 msr_bitmap_l0[word] = msr_bitmap_l1[word];
647 }
648 }
55d2375e 649
a5e0c252 650 nested_vmx_disable_intercept_for_x2apic_msr(
55d2375e 651 msr_bitmap_l1, msr_bitmap_l0,
acff7847 652 X2APIC_MSR(APIC_TASKPRI),
c73f4c99 653 MSR_TYPE_R | MSR_TYPE_W);
acff7847
MO
654
655 if (nested_cpu_has_vid(vmcs12)) {
a5e0c252 656 nested_vmx_disable_intercept_for_x2apic_msr(
acff7847
MO
657 msr_bitmap_l1, msr_bitmap_l0,
658 X2APIC_MSR(APIC_EOI),
659 MSR_TYPE_W);
a5e0c252 660 nested_vmx_disable_intercept_for_x2apic_msr(
acff7847
MO
661 msr_bitmap_l1, msr_bitmap_l0,
662 X2APIC_MSR(APIC_SELF_IPI),
663 MSR_TYPE_W);
664 }
55d2375e
SC
665 }
666
67f4b996
SC
667 /*
668 * Always check vmcs01's bitmap to honor userspace MSR filters and any
669 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
670 */
dbdd096a 671#ifdef CONFIG_X86_64
67f4b996
SC
672 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
673 MSR_FS_BASE, MSR_TYPE_RW);
d69129b4 674
67f4b996
SC
675 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
676 MSR_GS_BASE, MSR_TYPE_RW);
d69129b4 677
67f4b996
SC
678 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
679 MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
dbdd096a 680#endif
67f4b996
SC
681 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
682 MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
d69129b4 683
67f4b996
SC
684 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
685 MSR_IA32_PRED_CMD, MSR_TYPE_W);
55d2375e 686
67f4b996 687 kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);
55d2375e 688
ed2a4800
VK
689 vmx->nested.force_msr_bitmap_recalc = false;
690
55d2375e
SC
691 return true;
692}
693
694static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
695 struct vmcs12 *vmcs12)
696{
297d597a
DW
697 struct vcpu_vmx *vmx = to_vmx(vcpu);
698 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
55d2375e
SC
699
700 if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
64c78508 701 vmcs12->vmcs_link_pointer == INVALID_GPA)
55d2375e
SC
702 return;
703
297d597a
DW
704 if (ghc->gpa != vmcs12->vmcs_link_pointer &&
705 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
706 vmcs12->vmcs_link_pointer, VMCS12_SIZE))
88925305 707 return;
55d2375e 708
297d597a
DW
709 kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
710 VMCS12_SIZE);
55d2375e
SC
711}
712
713static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
714 struct vmcs12 *vmcs12)
715{
716 struct vcpu_vmx *vmx = to_vmx(vcpu);
297d597a 717 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
55d2375e
SC
718
719 if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
64c78508 720 vmcs12->vmcs_link_pointer == INVALID_GPA)
55d2375e
SC
721 return;
722
297d597a
DW
723 if (ghc->gpa != vmcs12->vmcs_link_pointer &&
724 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
725 vmcs12->vmcs_link_pointer, VMCS12_SIZE))
726 return;
727
728 kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
729 VMCS12_SIZE);
55d2375e
SC
730}
731
732/*
733 * In nested virtualization, check if L1 has set
734 * VM_EXIT_ACK_INTR_ON_EXIT
735 */
736static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
737{
738 return get_vmcs12(vcpu)->vm_exit_controls &
739 VM_EXIT_ACK_INTR_ON_EXIT;
740}
741
55d2375e
SC
742static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
743 struct vmcs12 *vmcs12)
744{
745 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
5497b955 746 CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
55d2375e
SC
747 return -EINVAL;
748 else
749 return 0;
750}
751
752static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
753 struct vmcs12 *vmcs12)
754{
755 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
756 !nested_cpu_has_apic_reg_virt(vmcs12) &&
757 !nested_cpu_has_vid(vmcs12) &&
758 !nested_cpu_has_posted_intr(vmcs12))
759 return 0;
760
761 /*
762 * If virtualize x2apic mode is enabled,
763 * virtualize apic access must be disabled.
764 */
5497b955
SC
765 if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
766 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
55d2375e
SC
767 return -EINVAL;
768
769 /*
770 * If virtual interrupt delivery is enabled,
771 * we must exit on external interrupts.
772 */
5497b955 773 if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
55d2375e
SC
774 return -EINVAL;
775
776 /*
777 * bits 15:8 should be zero in posted_intr_nv,
778 * the descriptor address has been already checked
779 * in nested_get_vmcs12_pages.
780 *
781 * bits 5:0 of posted_intr_desc_addr should be zero.
782 */
783 if (nested_cpu_has_posted_intr(vmcs12) &&
5497b955
SC
784 (CC(!nested_cpu_has_vid(vmcs12)) ||
785 CC(!nested_exit_intr_ack_set(vcpu)) ||
786 CC((vmcs12->posted_intr_nv & 0xff00)) ||
636e8b73 787 CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
55d2375e
SC
788 return -EINVAL;
789
790 /* tpr shadow is needed by all apicv features. */
5497b955 791 if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
55d2375e
SC
792 return -EINVAL;
793
794 return 0;
795}
796
797static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
f9b245e1 798 u32 count, u64 addr)
55d2375e 799{
55d2375e
SC
800 if (count == 0)
801 return 0;
636e8b73
SC
802
803 if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
804 !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
55d2375e 805 return -EINVAL;
f9b245e1 806
55d2375e
SC
807 return 0;
808}
809
61446ba7
KS
810static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
811 struct vmcs12 *vmcs12)
55d2375e 812{
5497b955
SC
813 if (CC(nested_vmx_check_msr_switch(vcpu,
814 vmcs12->vm_exit_msr_load_count,
815 vmcs12->vm_exit_msr_load_addr)) ||
816 CC(nested_vmx_check_msr_switch(vcpu,
817 vmcs12->vm_exit_msr_store_count,
818 vmcs12->vm_exit_msr_store_addr)))
55d2375e 819 return -EINVAL;
f9b245e1 820
55d2375e
SC
821 return 0;
822}
823
5fbf9634
KS
824static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
825 struct vmcs12 *vmcs12)
61446ba7 826{
5497b955
SC
827 if (CC(nested_vmx_check_msr_switch(vcpu,
828 vmcs12->vm_entry_msr_load_count,
829 vmcs12->vm_entry_msr_load_addr)))
61446ba7
KS
830 return -EINVAL;
831
832 return 0;
833}
834
55d2375e
SC
835static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
836 struct vmcs12 *vmcs12)
837{
838 if (!nested_cpu_has_pml(vmcs12))
839 return 0;
840
5497b955
SC
841 if (CC(!nested_cpu_has_ept(vmcs12)) ||
842 CC(!page_address_valid(vcpu, vmcs12->pml_address)))
55d2375e
SC
843 return -EINVAL;
844
845 return 0;
846}
847
848static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
849 struct vmcs12 *vmcs12)
850{
5497b955
SC
851 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
852 !nested_cpu_has_ept(vmcs12)))
55d2375e
SC
853 return -EINVAL;
854 return 0;
855}
856
857static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
858 struct vmcs12 *vmcs12)
859{
5497b955
SC
860 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
861 !nested_cpu_has_ept(vmcs12)))
55d2375e
SC
862 return -EINVAL;
863 return 0;
864}
865
866static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
867 struct vmcs12 *vmcs12)
868{
869 if (!nested_cpu_has_shadow_vmcs(vmcs12))
870 return 0;
871
5497b955
SC
872 if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
873 CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
55d2375e
SC
874 return -EINVAL;
875
876 return 0;
877}
878
879static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
880 struct vmx_msr_entry *e)
881{
882 /* x2APIC MSR accesses are not allowed */
5497b955 883 if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
55d2375e 884 return -EINVAL;
5497b955
SC
885 if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
886 CC(e->index == MSR_IA32_UCODE_REV))
55d2375e 887 return -EINVAL;
5497b955 888 if (CC(e->reserved != 0))
55d2375e
SC
889 return -EINVAL;
890 return 0;
891}
892
893static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
894 struct vmx_msr_entry *e)
895{
5497b955
SC
896 if (CC(e->index == MSR_FS_BASE) ||
897 CC(e->index == MSR_GS_BASE) ||
898 CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
55d2375e
SC
899 nested_vmx_msr_check_common(vcpu, e))
900 return -EINVAL;
901 return 0;
902}
903
904static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
905 struct vmx_msr_entry *e)
906{
5497b955 907 if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
55d2375e
SC
908 nested_vmx_msr_check_common(vcpu, e))
909 return -EINVAL;
910 return 0;
911}
912
f0b5105a
MO
913static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
914{
915 struct vcpu_vmx *vmx = to_vmx(vcpu);
916 u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
917 vmx->nested.msrs.misc_high);
918
919 return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
920}
921
55d2375e
SC
922/*
923 * Load guest's/host's msr at nested entry/exit.
924 * return 0 for success, entry index for failure.
f0b5105a
MO
925 *
926 * One of the failure modes for MSR load/store is when a list exceeds the
927 * virtual hardware's capacity. To maintain compatibility with hardware inasmuch
928 * as possible, process all valid entries before failing rather than precheck
929 * for a capacity violation.
55d2375e
SC
930 */
931static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
932{
933 u32 i;
934 struct vmx_msr_entry e;
f0b5105a 935 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
55d2375e 936
55d2375e 937 for (i = 0; i < count; i++) {
f0b5105a
MO
938 if (unlikely(i >= max_msr_list_size))
939 goto fail;
940
55d2375e
SC
941 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
942 &e, sizeof(e))) {
943 pr_debug_ratelimited(
944 "%s cannot read MSR entry (%u, 0x%08llx)\n",
945 __func__, i, gpa + i * sizeof(e));
946 goto fail;
947 }
948 if (nested_vmx_load_msr_check(vcpu, &e)) {
949 pr_debug_ratelimited(
950 "%s check failed (%u, 0x%x, 0x%x)\n",
951 __func__, i, e.index, e.reserved);
952 goto fail;
953 }
f20935d8 954 if (kvm_set_msr(vcpu, e.index, e.value)) {
55d2375e
SC
955 pr_debug_ratelimited(
956 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
957 __func__, i, e.index, e.value);
958 goto fail;
959 }
960 }
961 return 0;
962fail:
68cda40d 963 /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
55d2375e
SC
964 return i + 1;
965}
966
662f1d1d
AL
967static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
968 u32 msr_index,
969 u64 *data)
970{
971 struct vcpu_vmx *vmx = to_vmx(vcpu);
972
973 /*
974 * If the L0 hypervisor stored a more accurate value for the TSC that
975 * does not include the time taken for emulation of the L2->L1
976 * VM-exit in L0, use the more accurate value.
977 */
978 if (msr_index == MSR_IA32_TSC) {
a128a934
SC
979 int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
980 MSR_IA32_TSC);
662f1d1d 981
a128a934
SC
982 if (i >= 0) {
983 u64 val = vmx->msr_autostore.guest.val[i].value;
662f1d1d
AL
984
985 *data = kvm_read_l1_tsc(vcpu, val);
986 return true;
987 }
988 }
989
990 if (kvm_get_msr(vcpu, msr_index, data)) {
991 pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
992 msr_index);
993 return false;
994 }
995 return true;
996}
997
365d3d55
AL
998static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
999 struct vmx_msr_entry *e)
1000{
1001 if (kvm_vcpu_read_guest(vcpu,
1002 gpa + i * sizeof(*e),
1003 e, 2 * sizeof(u32))) {
1004 pr_debug_ratelimited(
1005 "%s cannot read MSR entry (%u, 0x%08llx)\n",
1006 __func__, i, gpa + i * sizeof(*e));
1007 return false;
1008 }
1009 if (nested_vmx_store_msr_check(vcpu, e)) {
1010 pr_debug_ratelimited(
1011 "%s check failed (%u, 0x%x, 0x%x)\n",
1012 __func__, i, e->index, e->reserved);
1013 return false;
1014 }
1015 return true;
1016}
1017
55d2375e
SC
1018static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
1019{
f20935d8 1020 u64 data;
55d2375e
SC
1021 u32 i;
1022 struct vmx_msr_entry e;
f0b5105a 1023 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
55d2375e
SC
1024
1025 for (i = 0; i < count; i++) {
f0b5105a
MO
1026 if (unlikely(i >= max_msr_list_size))
1027 return -EINVAL;
1028
365d3d55 1029 if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
55d2375e 1030 return -EINVAL;
365d3d55 1031
662f1d1d 1032 if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
55d2375e 1033 return -EINVAL;
662f1d1d 1034
55d2375e
SC
1035 if (kvm_vcpu_write_guest(vcpu,
1036 gpa + i * sizeof(e) +
1037 offsetof(struct vmx_msr_entry, value),
f20935d8 1038 &data, sizeof(data))) {
55d2375e
SC
1039 pr_debug_ratelimited(
1040 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
f20935d8 1041 __func__, i, e.index, data);
55d2375e
SC
1042 return -EINVAL;
1043 }
1044 }
1045 return 0;
1046}
1047
662f1d1d
AL
1048static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
1049{
1050 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1051 u32 count = vmcs12->vm_exit_msr_store_count;
1052 u64 gpa = vmcs12->vm_exit_msr_store_addr;
1053 struct vmx_msr_entry e;
1054 u32 i;
1055
1056 for (i = 0; i < count; i++) {
1057 if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
1058 return false;
1059
1060 if (e.index == msr_index)
1061 return true;
1062 }
1063 return false;
1064}
1065
1066static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
1067 u32 msr_index)
1068{
1069 struct vcpu_vmx *vmx = to_vmx(vcpu);
1070 struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
1071 bool in_vmcs12_store_list;
a128a934 1072 int msr_autostore_slot;
662f1d1d
AL
1073 bool in_autostore_list;
1074 int last;
1075
a128a934
SC
1076 msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
1077 in_autostore_list = msr_autostore_slot >= 0;
662f1d1d
AL
1078 in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);
1079
1080 if (in_vmcs12_store_list && !in_autostore_list) {
ce833b23 1081 if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
662f1d1d
AL
1082 /*
1083 * Emulated VMEntry does not fail here. Instead a less
1084 * accurate value will be returned by
1085 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
1086 * instead of reading the value from the vmcs02 VMExit
1087 * MSR-store area.
1088 */
1089 pr_warn_ratelimited(
1090 "Not enough msr entries in msr_autostore. Can't add msr %x\n",
1091 msr_index);
1092 return;
1093 }
1094 last = autostore->nr++;
1095 autostore->val[last].index = msr_index;
1096 } else if (!in_vmcs12_store_list && in_autostore_list) {
1097 last = --autostore->nr;
a128a934 1098 autostore->val[msr_autostore_slot] = autostore->val[last];
662f1d1d
AL
1099 }
1100}
1101
55d2375e 1102/*
ea79a750
SC
1103 * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are
1104 * emulating VM-Entry into a guest with EPT enabled. On failure, the expected
1105 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
1106 * @entry_failure_code.
55d2375e 1107 */
0f857223
ML
1108static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
1109 bool nested_ept, bool reload_pdptrs,
68cda40d 1110 enum vm_entry_failure_code *entry_failure_code)
55d2375e 1111{
636e8b73 1112 if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) {
0cc69204
SC
1113 *entry_failure_code = ENTRY_FAIL_DEFAULT;
1114 return -EINVAL;
1115 }
55d2375e 1116
0cc69204
SC
1117 /*
1118 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
1119 * must not be dereferenced.
1120 */
0f857223 1121 if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
2df4a5eb 1122 CC(!load_pdptrs(vcpu, cr3))) {
bcb72d06
SC
1123 *entry_failure_code = ENTRY_FAIL_PDPTE;
1124 return -EINVAL;
55d2375e
SC
1125 }
1126
55d2375e 1127 vcpu->arch.cr3 = cr3;
3883bc9d 1128 kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
55d2375e 1129
616007c8 1130 /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
c9060662 1131 kvm_init_mmu(vcpu);
55d2375e 1132
3cffc89d
PB
1133 if (!nested_ept)
1134 kvm_mmu_new_pgd(vcpu, cr3);
1135
55d2375e
SC
1136 return 0;
1137}
1138
1139/*
1140 * Returns if KVM is able to config CPU to tag TLB entries
1141 * populated by L2 differently than TLB entries populated
1142 * by L1.
1143 *
992edeae
LA
1144 * If L0 uses EPT, L1 and L2 run with different EPTP because
1145 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
1146 * are tagged with different EPTP.
55d2375e
SC
1147 *
1148 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
1149 * with different VPID (L1 entries are tagged with vmx->vpid
1150 * while L2 entries are tagged with vmx->nested.vpid02).
1151 */
1152static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
1153{
1154 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1155
992edeae 1156 return enable_ept ||
55d2375e
SC
1157 (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
1158}
1159
50b265a4
SC
1160static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
1161 struct vmcs12 *vmcs12,
1162 bool is_vmenter)
1163{
1164 struct vcpu_vmx *vmx = to_vmx(vcpu);
1165
1166 /*
50a41796
SC
1167 * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
1168 * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
1169 * full TLB flush from the guest's perspective. This is required even
1170 * if VPID is disabled in the host as KVM may need to synchronize the
1171 * MMU in response to the guest TLB flush.
1172 *
1173 * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
1174 * EPT is a special snowflake, as guest-physical mappings aren't
1175 * flushed on VPID invalidations, including VM-Enter or VM-Exit with
1176 * VPID disabled. As a result, KVM _never_ needs to sync nEPT
1177 * entries on VM-Enter because L1 can't rely on VM-Enter to flush
1178 * those mappings.
50b265a4 1179 */
50a41796
SC
1180 if (!nested_cpu_has_vpid(vmcs12)) {
1181 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
50b265a4 1182 return;
50a41796
SC
1183 }
1184
1185 /* L2 should never have a VPID if VPID is disabled. */
1186 WARN_ON(!enable_vpid);
50b265a4
SC
1187
1188 /*
712494de
SC
1189 * VPID is enabled and in use by vmcs12. If vpid12 is changing, then
1190 * emulate a guest TLB flush as KVM does not track vpid12 history nor
1191 * is the VPID incorporated into the MMU context. I.e. KVM must assume
1192 * that the new vpid12 has never been used and thus represents a new
1193 * guest ASID that cannot have entries in the TLB.
50b265a4 1194 */
712494de 1195 if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
50b265a4 1196 vmx->nested.last_vpid = vmcs12->virtual_processor_id;
712494de
SC
1197 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
1198 return;
50b265a4 1199 }
712494de
SC
1200
1201 /*
1202 * If VPID is enabled, used by vmc12, and vpid12 is not changing but
1203 * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
1204 * KVM was unable to allocate a VPID for L2, flush the current context
1205 * as the effective ASID is common to both L1 and L2.
1206 */
1207 if (!nested_has_guest_tlb_tag(vcpu))
1208 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
50b265a4
SC
1209}
1210
55d2375e
SC
1211static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
1212{
1213 superset &= mask;
1214 subset &= mask;
1215
1216 return (superset | subset) == superset;
1217}
1218
1219static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
1220{
1221 const u64 feature_and_reserved =
1222 /* feature (except bit 48; see below) */
1223 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
1224 /* reserved */
1225 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
1226 u64 vmx_basic = vmx->nested.msrs.basic;
1227
1228 if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
1229 return -EINVAL;
1230
1231 /*
1232 * KVM does not emulate a version of VMX that constrains physical
1233 * addresses of VMX structures (e.g. VMCS) to 32-bits.
1234 */
1235 if (data & BIT_ULL(48))
1236 return -EINVAL;
1237
1238 if (vmx_basic_vmcs_revision_id(vmx_basic) !=
1239 vmx_basic_vmcs_revision_id(data))
1240 return -EINVAL;
1241
1242 if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
1243 return -EINVAL;
1244
1245 vmx->nested.msrs.basic = data;
1246 return 0;
1247}
1248
1249static int
1250vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1251{
1252 u64 supported;
1253 u32 *lowp, *highp;
1254
1255 switch (msr_index) {
1256 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1257 lowp = &vmx->nested.msrs.pinbased_ctls_low;
1258 highp = &vmx->nested.msrs.pinbased_ctls_high;
1259 break;
1260 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1261 lowp = &vmx->nested.msrs.procbased_ctls_low;
1262 highp = &vmx->nested.msrs.procbased_ctls_high;
1263 break;
1264 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1265 lowp = &vmx->nested.msrs.exit_ctls_low;
1266 highp = &vmx->nested.msrs.exit_ctls_high;
1267 break;
1268 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1269 lowp = &vmx->nested.msrs.entry_ctls_low;
1270 highp = &vmx->nested.msrs.entry_ctls_high;
1271 break;
1272 case MSR_IA32_VMX_PROCBASED_CTLS2:
1273 lowp = &vmx->nested.msrs.secondary_ctls_low;
1274 highp = &vmx->nested.msrs.secondary_ctls_high;
1275 break;
1276 default:
1277 BUG();
1278 }
1279
1280 supported = vmx_control_msr(*lowp, *highp);
1281
1282 /* Check must-be-1 bits are still 1. */
1283 if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
1284 return -EINVAL;
1285
1286 /* Check must-be-0 bits are still 0. */
1287 if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
1288 return -EINVAL;
1289
1290 *lowp = data;
1291 *highp = data >> 32;
1292 return 0;
1293}
1294
1295static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
1296{
1297 const u64 feature_and_reserved_bits =
1298 /* feature */
1299 BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
1300 BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
1301 /* reserved */
1302 GENMASK_ULL(13, 9) | BIT_ULL(31);
1303 u64 vmx_misc;
1304
1305 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
1306 vmx->nested.msrs.misc_high);
1307
1308 if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
1309 return -EINVAL;
1310
1311 if ((vmx->nested.msrs.pinbased_ctls_high &
1312 PIN_BASED_VMX_PREEMPTION_TIMER) &&
1313 vmx_misc_preemption_timer_rate(data) !=
1314 vmx_misc_preemption_timer_rate(vmx_misc))
1315 return -EINVAL;
1316
1317 if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
1318 return -EINVAL;
1319
1320 if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
1321 return -EINVAL;
1322
1323 if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
1324 return -EINVAL;
1325
1326 vmx->nested.msrs.misc_low = data;
1327 vmx->nested.msrs.misc_high = data >> 32;
1328
55d2375e
SC
1329 return 0;
1330}
1331
1332static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
1333{
1334 u64 vmx_ept_vpid_cap;
1335
1336 vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
1337 vmx->nested.msrs.vpid_caps);
1338
1339 /* Every bit is either reserved or a feature bit. */
1340 if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
1341 return -EINVAL;
1342
1343 vmx->nested.msrs.ept_caps = data;
1344 vmx->nested.msrs.vpid_caps = data >> 32;
1345 return 0;
1346}
1347
1348static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1349{
1350 u64 *msr;
1351
1352 switch (msr_index) {
1353 case MSR_IA32_VMX_CR0_FIXED0:
1354 msr = &vmx->nested.msrs.cr0_fixed0;
1355 break;
1356 case MSR_IA32_VMX_CR4_FIXED0:
1357 msr = &vmx->nested.msrs.cr4_fixed0;
1358 break;
1359 default:
1360 BUG();
1361 }
1362
1363 /*
1364 * 1 bits (which indicates bits which "must-be-1" during VMX operation)
1365 * must be 1 in the restored value.
1366 */
1367 if (!is_bitwise_subset(data, *msr, -1ULL))
1368 return -EINVAL;
1369
1370 *msr = data;
1371 return 0;
1372}
1373
1374/*
1375 * Called when userspace is restoring VMX MSRs.
1376 *
1377 * Returns 0 on success, non-0 otherwise.
1378 */
1379int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1380{
1381 struct vcpu_vmx *vmx = to_vmx(vcpu);
1382
1383 /*
1384 * Don't allow changes to the VMX capability MSRs while the vCPU
1385 * is in VMX operation.
1386 */
1387 if (vmx->nested.vmxon)
1388 return -EBUSY;
1389
1390 switch (msr_index) {
1391 case MSR_IA32_VMX_BASIC:
1392 return vmx_restore_vmx_basic(vmx, data);
1393 case MSR_IA32_VMX_PINBASED_CTLS:
1394 case MSR_IA32_VMX_PROCBASED_CTLS:
1395 case MSR_IA32_VMX_EXIT_CTLS:
1396 case MSR_IA32_VMX_ENTRY_CTLS:
1397 /*
1398 * The "non-true" VMX capability MSRs are generated from the
1399 * "true" MSRs, so we do not support restoring them directly.
1400 *
1401 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
1402 * should restore the "true" MSRs with the must-be-1 bits
1403 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
1404 * DEFAULT SETTINGS".
1405 */
1406 return -EINVAL;
1407 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1408 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1409 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1410 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1411 case MSR_IA32_VMX_PROCBASED_CTLS2:
1412 return vmx_restore_control_msr(vmx, msr_index, data);
1413 case MSR_IA32_VMX_MISC:
1414 return vmx_restore_vmx_misc(vmx, data);
1415 case MSR_IA32_VMX_CR0_FIXED0:
1416 case MSR_IA32_VMX_CR4_FIXED0:
1417 return vmx_restore_fixed0_msr(vmx, msr_index, data);
1418 case MSR_IA32_VMX_CR0_FIXED1:
1419 case MSR_IA32_VMX_CR4_FIXED1:
1420 /*
1421 * These MSRs are generated based on the vCPU's CPUID, so we
1422 * do not support restoring them directly.
1423 */
1424 return -EINVAL;
1425 case MSR_IA32_VMX_EPT_VPID_CAP:
1426 return vmx_restore_vmx_ept_vpid_cap(vmx, data);
1427 case MSR_IA32_VMX_VMCS_ENUM:
1428 vmx->nested.msrs.vmcs_enum = data;
1429 return 0;
e8a70bd4
PB
1430 case MSR_IA32_VMX_VMFUNC:
1431 if (data & ~vmx->nested.msrs.vmfunc_controls)
1432 return -EINVAL;
1433 vmx->nested.msrs.vmfunc_controls = data;
1434 return 0;
55d2375e
SC
1435 default:
1436 /*
1437 * The rest of the VMX capability MSRs do not support restore.
1438 */
1439 return -EINVAL;
1440 }
1441}
1442
1443/* Returns 0 on success, non-0 otherwise. */
1444int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
1445{
1446 switch (msr_index) {
1447 case MSR_IA32_VMX_BASIC:
1448 *pdata = msrs->basic;
1449 break;
1450 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1451 case MSR_IA32_VMX_PINBASED_CTLS:
1452 *pdata = vmx_control_msr(
1453 msrs->pinbased_ctls_low,
1454 msrs->pinbased_ctls_high);
1455 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
1456 *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1457 break;
1458 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1459 case MSR_IA32_VMX_PROCBASED_CTLS:
1460 *pdata = vmx_control_msr(
1461 msrs->procbased_ctls_low,
1462 msrs->procbased_ctls_high);
1463 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
1464 *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1465 break;
1466 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1467 case MSR_IA32_VMX_EXIT_CTLS:
1468 *pdata = vmx_control_msr(
1469 msrs->exit_ctls_low,
1470 msrs->exit_ctls_high);
1471 if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
1472 *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
1473 break;
1474 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1475 case MSR_IA32_VMX_ENTRY_CTLS:
1476 *pdata = vmx_control_msr(
1477 msrs->entry_ctls_low,
1478 msrs->entry_ctls_high);
1479 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
1480 *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
1481 break;
1482 case MSR_IA32_VMX_MISC:
1483 *pdata = vmx_control_msr(
1484 msrs->misc_low,
1485 msrs->misc_high);
1486 break;
1487 case MSR_IA32_VMX_CR0_FIXED0:
1488 *pdata = msrs->cr0_fixed0;
1489 break;
1490 case MSR_IA32_VMX_CR0_FIXED1:
1491 *pdata = msrs->cr0_fixed1;
1492 break;
1493 case MSR_IA32_VMX_CR4_FIXED0:
1494 *pdata = msrs->cr4_fixed0;
1495 break;
1496 case MSR_IA32_VMX_CR4_FIXED1:
1497 *pdata = msrs->cr4_fixed1;
1498 break;
1499 case MSR_IA32_VMX_VMCS_ENUM:
1500 *pdata = msrs->vmcs_enum;
1501 break;
1502 case MSR_IA32_VMX_PROCBASED_CTLS2:
1503 *pdata = vmx_control_msr(
1504 msrs->secondary_ctls_low,
1505 msrs->secondary_ctls_high);
1506 break;
1507 case MSR_IA32_VMX_EPT_VPID_CAP:
1508 *pdata = msrs->ept_caps |
1509 ((u64)msrs->vpid_caps << 32);
1510 break;
1511 case MSR_IA32_VMX_VMFUNC:
1512 *pdata = msrs->vmfunc_controls;
1513 break;
1514 default:
1515 return 1;
1516 }
1517
1518 return 0;
1519}
1520
1521/*
fadcead0
SC
1522 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
1523 * been modified by the L1 guest. Note, "writable" in this context means
1524 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
1525 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
1526 * VM-exit information fields (which are actually writable if the vCPU is
1527 * configured to support "VMWRITE to any supported field in the VMCS").
55d2375e
SC
1528 */
1529static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
1530{
55d2375e 1531 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
fadcead0 1532 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1c6f0b47
SC
1533 struct shadow_vmcs_field field;
1534 unsigned long val;
fadcead0 1535 int i;
55d2375e 1536
88dddc11
PB
1537 if (WARN_ON(!shadow_vmcs))
1538 return;
1539
55d2375e
SC
1540 preempt_disable();
1541
1542 vmcs_load(shadow_vmcs);
1543
fadcead0
SC
1544 for (i = 0; i < max_shadow_read_write_fields; i++) {
1545 field = shadow_read_write_fields[i];
1c6f0b47
SC
1546 val = __vmcs_readl(field.encoding);
1547 vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
55d2375e
SC
1548 }
1549
1550 vmcs_clear(shadow_vmcs);
1551 vmcs_load(vmx->loaded_vmcs->vmcs);
1552
1553 preempt_enable();
1554}
1555
1556static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
1557{
1c6f0b47 1558 const struct shadow_vmcs_field *fields[] = {
55d2375e
SC
1559 shadow_read_write_fields,
1560 shadow_read_only_fields
1561 };
1562 const int max_fields[] = {
1563 max_shadow_read_write_fields,
1564 max_shadow_read_only_fields
1565 };
55d2375e 1566 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1c6f0b47
SC
1567 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1568 struct shadow_vmcs_field field;
1569 unsigned long val;
1570 int i, q;
55d2375e 1571
88dddc11
PB
1572 if (WARN_ON(!shadow_vmcs))
1573 return;
1574
55d2375e
SC
1575 vmcs_load(shadow_vmcs);
1576
1577 for (q = 0; q < ARRAY_SIZE(fields); q++) {
1578 for (i = 0; i < max_fields[q]; i++) {
1579 field = fields[q][i];
1c6f0b47
SC
1580 val = vmcs12_read_any(vmcs12, field.encoding,
1581 field.offset);
1582 __vmcs_writel(field.encoding, val);
55d2375e
SC
1583 }
1584 }
1585
1586 vmcs_clear(shadow_vmcs);
1587 vmcs_load(vmx->loaded_vmcs->vmcs);
1588}
1589
d6bf71a1 1590static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
55d2375e
SC
1591{
1592 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1593 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1594
1595 /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
1596 vmcs12->tpr_threshold = evmcs->tpr_threshold;
1597 vmcs12->guest_rip = evmcs->guest_rip;
1598
d6bf71a1 1599 if (unlikely(!(hv_clean_fields &
55d2375e
SC
1600 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
1601 vmcs12->guest_rsp = evmcs->guest_rsp;
1602 vmcs12->guest_rflags = evmcs->guest_rflags;
1603 vmcs12->guest_interruptibility_info =
1604 evmcs->guest_interruptibility_info;
1605 }
1606
d6bf71a1 1607 if (unlikely(!(hv_clean_fields &
55d2375e
SC
1608 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
1609 vmcs12->cpu_based_vm_exec_control =
1610 evmcs->cpu_based_vm_exec_control;
1611 }
1612
d6bf71a1 1613 if (unlikely(!(hv_clean_fields &
f9bc5227 1614 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
55d2375e
SC
1615 vmcs12->exception_bitmap = evmcs->exception_bitmap;
1616 }
1617
d6bf71a1 1618 if (unlikely(!(hv_clean_fields &
55d2375e
SC
1619 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
1620 vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
1621 }
1622
d6bf71a1 1623 if (unlikely(!(hv_clean_fields &
55d2375e
SC
1624 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
1625 vmcs12->vm_entry_intr_info_field =
1626 evmcs->vm_entry_intr_info_field;
1627 vmcs12->vm_entry_exception_error_code =
1628 evmcs->vm_entry_exception_error_code;
1629 vmcs12->vm_entry_instruction_len =
1630 evmcs->vm_entry_instruction_len;
1631 }
1632
d6bf71a1 1633 if (unlikely(!(hv_clean_fields &
55d2375e
SC
1634 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
1635 vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
1636 vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
1637 vmcs12->host_cr0 = evmcs->host_cr0;
1638 vmcs12->host_cr3 = evmcs->host_cr3;
1639 vmcs12->host_cr4 = evmcs->host_cr4;
1640 vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
1641 vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
1642 vmcs12->host_rip = evmcs->host_rip;
1643 vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
1644 vmcs12->host_es_selector = evmcs->host_es_selector;
1645 vmcs12->host_cs_selector = evmcs->host_cs_selector;
1646 vmcs12->host_ss_selector = evmcs->host_ss_selector;
1647 vmcs12->host_ds_selector = evmcs->host_ds_selector;
1648 vmcs12->host_fs_selector = evmcs->host_fs_selector;
1649 vmcs12->host_gs_selector = evmcs->host_gs_selector;
1650 vmcs12->host_tr_selector = evmcs->host_tr_selector;
1651 }
1652
d6bf71a1 1653 if (unlikely(!(hv_clean_fields &
f9bc5227 1654 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
55d2375e
SC
1655 vmcs12->pin_based_vm_exec_control =
1656 evmcs->pin_based_vm_exec_control;
1657 vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
1658 vmcs12->secondary_vm_exec_control =
1659 evmcs->secondary_vm_exec_control;
1660 }
1661
d6bf71a1 1662 if (unlikely(!(hv_clean_fields &
55d2375e
SC
1663 HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
1664 vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
1665 vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
1666 }
1667
d6bf71a1 1668 if (unlikely(!(hv_clean_fields &
55d2375e
SC
1669 HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
1670 vmcs12->msr_bitmap = evmcs->msr_bitmap;
1671 }
1672
d6bf71a1 1673 if (unlikely(!(hv_clean_fields &
55d2375e
SC
1674 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
1675 vmcs12->guest_es_base = evmcs->guest_es_base;
1676 vmcs12->guest_cs_base = evmcs->guest_cs_base;
1677 vmcs12->guest_ss_base = evmcs->guest_ss_base;
1678 vmcs12->guest_ds_base = evmcs->guest_ds_base;
1679 vmcs12->guest_fs_base = evmcs->guest_fs_base;
1680 vmcs12->guest_gs_base = evmcs->guest_gs_base;
1681 vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
1682 vmcs12->guest_tr_base = evmcs->guest_tr_base;
1683 vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
1684 vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
1685 vmcs12->guest_es_limit = evmcs->guest_es_limit;
1686 vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
1687 vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
1688 vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
1689 vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
1690 vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
1691 vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
1692 vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
1693 vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
1694 vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
1695 vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
1696 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
1697 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
1698 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
1699 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
1700 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
1701 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
1702 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
1703 vmcs12->guest_es_selector = evmcs->guest_es_selector;
1704 vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
1705 vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
1706 vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
1707 vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
1708 vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
1709 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
1710 vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
1711 }
1712
d6bf71a1 1713 if (unlikely(!(hv_clean_fields &
55d2375e
SC
1714 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
1715 vmcs12->tsc_offset = evmcs->tsc_offset;
1716 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
1717 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
1718 }
1719
d6bf71a1 1720 if (unlikely(!(hv_clean_fields &
55d2375e
SC
1721 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
1722 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
1723 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
1724 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
1725 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
1726 vmcs12->guest_cr0 = evmcs->guest_cr0;
1727 vmcs12->guest_cr3 = evmcs->guest_cr3;
1728 vmcs12->guest_cr4 = evmcs->guest_cr4;
1729 vmcs12->guest_dr7 = evmcs->guest_dr7;
1730 }
1731
d6bf71a1 1732 if (unlikely(!(hv_clean_fields &
55d2375e
SC
1733 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
1734 vmcs12->host_fs_base = evmcs->host_fs_base;
1735 vmcs12->host_gs_base = evmcs->host_gs_base;
1736 vmcs12->host_tr_base = evmcs->host_tr_base;
1737 vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
1738 vmcs12->host_idtr_base = evmcs->host_idtr_base;
1739 vmcs12->host_rsp = evmcs->host_rsp;
1740 }
1741
d6bf71a1 1742 if (unlikely(!(hv_clean_fields &
55d2375e
SC
1743 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
1744 vmcs12->ept_pointer = evmcs->ept_pointer;
1745 vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
1746 }
1747
d6bf71a1 1748 if (unlikely(!(hv_clean_fields &
55d2375e
SC
1749 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
1750 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
1751 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
1752 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
1753 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
1754 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
1755 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
1756 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
1757 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
1758 vmcs12->guest_pending_dbg_exceptions =
1759 evmcs->guest_pending_dbg_exceptions;
1760 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
1761 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
1762 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
1763 vmcs12->guest_activity_state = evmcs->guest_activity_state;
1764 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
1765 }
1766
1767 /*
1768 * Not used?
1769 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
1770 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
1771 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
55d2375e
SC
1772 * vmcs12->page_fault_error_code_mask =
1773 * evmcs->page_fault_error_code_mask;
1774 * vmcs12->page_fault_error_code_match =
1775 * evmcs->page_fault_error_code_match;
1776 * vmcs12->cr3_target_count = evmcs->cr3_target_count;
1777 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
1778 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
1779 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
1780 */
1781
1782 /*
1783 * Read only fields:
1784 * vmcs12->guest_physical_address = evmcs->guest_physical_address;
1785 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
1786 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
1787 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
1788 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
1789 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
1790 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
1791 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
1792 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
1793 * vmcs12->exit_qualification = evmcs->exit_qualification;
1794 * vmcs12->guest_linear_address = evmcs->guest_linear_address;
1795 *
1796 * Not present in struct vmcs12:
1797 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
1798 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
1799 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
1800 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
1801 */
1802
25641caf 1803 return;
55d2375e
SC
1804}
1805
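/*
 * Mirror of copy_enlightened_to_vmcs12(): write the fields that KVM may
 * have modified (guest state and exit information) from the cached vmcs12
 * back into L1's enlightened VMCS.
 */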
25641caf 1806static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
55d2375e
SC
1807{
1808 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1809 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1810
1811 /*
1812 * Should not be changed by KVM:
1813 *
1814 * evmcs->host_es_selector = vmcs12->host_es_selector;
1815 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
1816 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
1817 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
1818 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
1819 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
1820 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
1821 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
1822 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
1823 * evmcs->host_cr0 = vmcs12->host_cr0;
1824 * evmcs->host_cr3 = vmcs12->host_cr3;
1825 * evmcs->host_cr4 = vmcs12->host_cr4;
1826 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
1827 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
1828 * evmcs->host_rip = vmcs12->host_rip;
1829 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
1830 * evmcs->host_fs_base = vmcs12->host_fs_base;
1831 * evmcs->host_gs_base = vmcs12->host_gs_base;
1832 * evmcs->host_tr_base = vmcs12->host_tr_base;
1833 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
1834 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
1835 * evmcs->host_rsp = vmcs12->host_rsp;
3731905e 1836 * sync_vmcs02_to_vmcs12() doesn't read these:
55d2375e
SC
1837 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
1838 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
1839 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
1840 * evmcs->ept_pointer = vmcs12->ept_pointer;
1841 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
1842 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
1843 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
1844 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
55d2375e
SC
1845 * evmcs->tpr_threshold = vmcs12->tpr_threshold;
1846 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
1847 * evmcs->exception_bitmap = vmcs12->exception_bitmap;
1848 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
1849 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
1850 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
1851 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
1852 * evmcs->page_fault_error_code_mask =
1853 * vmcs12->page_fault_error_code_mask;
1854 * evmcs->page_fault_error_code_match =
1855 * vmcs12->page_fault_error_code_match;
1856 * evmcs->cr3_target_count = vmcs12->cr3_target_count;
1857 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
1858 * evmcs->tsc_offset = vmcs12->tsc_offset;
1859 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
1860 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
1861 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
1862 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
1863 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
1864 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
1865 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
1866 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
1867 *
1868 * Not present in struct vmcs12:
1869 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
1870 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
1871 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
1872 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
1873 */
1874
1875 evmcs->guest_es_selector = vmcs12->guest_es_selector;
1876 evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
1877 evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
1878 evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
1879 evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
1880 evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
1881 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
1882 evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
1883
1884 evmcs->guest_es_limit = vmcs12->guest_es_limit;
1885 evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
1886 evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
1887 evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
1888 evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
1889 evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
1890 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
1891 evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
1892 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
1893 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
1894
1895 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
1896 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
1897 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
1898 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
1899 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
1900 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
1901 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
1902 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
1903
1904 evmcs->guest_es_base = vmcs12->guest_es_base;
1905 evmcs->guest_cs_base = vmcs12->guest_cs_base;
1906 evmcs->guest_ss_base = vmcs12->guest_ss_base;
1907 evmcs->guest_ds_base = vmcs12->guest_ds_base;
1908 evmcs->guest_fs_base = vmcs12->guest_fs_base;
1909 evmcs->guest_gs_base = vmcs12->guest_gs_base;
1910 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
1911 evmcs->guest_tr_base = vmcs12->guest_tr_base;
1912 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
1913 evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
1914
1915 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
1916 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
1917
1918 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
1919 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
1920 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
1921 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
1922
1923 evmcs->guest_pending_dbg_exceptions =
1924 vmcs12->guest_pending_dbg_exceptions;
1925 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
1926 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
1927
1928 evmcs->guest_activity_state = vmcs12->guest_activity_state;
1929 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
1930
1931 evmcs->guest_cr0 = vmcs12->guest_cr0;
1932 evmcs->guest_cr3 = vmcs12->guest_cr3;
1933 evmcs->guest_cr4 = vmcs12->guest_cr4;
1934 evmcs->guest_dr7 = vmcs12->guest_dr7;
1935
1936 evmcs->guest_physical_address = vmcs12->guest_physical_address;
1937
1938 evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
1939 evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
1940 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
1941 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
1942 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
1943 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
1944 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
1945 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
1946
1947 evmcs->exit_qualification = vmcs12->exit_qualification;
1948
1949 evmcs->guest_linear_address = vmcs12->guest_linear_address;
1950 evmcs->guest_rsp = vmcs12->guest_rsp;
1951 evmcs->guest_rflags = vmcs12->guest_rflags;
1952
1953 evmcs->guest_interruptibility_info =
1954 vmcs12->guest_interruptibility_info;
1955 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
1956 evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
1957 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
1958 evmcs->vm_entry_exception_error_code =
1959 vmcs12->vm_entry_exception_error_code;
1960 evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
1961
1962 evmcs->guest_rip = vmcs12->guest_rip;
1963
1964 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
1965
25641caf 1966 return;
55d2375e
SC
1967}
1968
1969/*
 1970 * This is the equivalent of the nested hypervisor executing the vmptrld
1971 * instruction.
1972 */
b6a0653a
VK
1973static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
1974 struct kvm_vcpu *vcpu, bool from_launch)
55d2375e
SC
1975{
1976 struct vcpu_vmx *vmx = to_vmx(vcpu);
a21a39c2 1977 bool evmcs_gpa_changed = false;
11e34914 1978 u64 evmcs_gpa;
55d2375e
SC
1979
1980 if (likely(!vmx->nested.enlightened_vmcs_enabled))
b6a0653a 1981 return EVMPTRLD_DISABLED;
55d2375e 1982
02761716
VK
1983 if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) {
1984 nested_release_evmcs(vcpu);
b6a0653a 1985 return EVMPTRLD_DISABLED;
02761716 1986 }
55d2375e 1987
1e9dfbd7 1988 if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
64c78508 1989 vmx->nested.current_vmptr = INVALID_GPA;
55d2375e
SC
1990
1991 nested_release_evmcs(vcpu);
1992
11e34914 1993 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
dee9c049 1994 &vmx->nested.hv_evmcs_map))
b6a0653a 1995 return EVMPTRLD_ERROR;
55d2375e 1996
dee9c049 1997 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
55d2375e
SC
1998
1999 /*
 2000 * Currently, KVM only supports eVMCS version 1
 2001 * (== KVM_EVMCS_VERSION) and thus we expect the guest to set this
 2002 * value in the first u32 field of the eVMCS, which should specify
 2003 * the eVMCS VersionNumber.
 2004 *
 2005 * The guest should learn the eVMCS versions supported by the host
 2006 * by examining CPUID.0x4000000A.EAX[0:15]. The host userspace VMM
 2007 * is expected to set this CPUID leaf according to the value
 2008 * returned in vmcs_version from nested_enable_evmcs().
 2009 *
 2010 * However, it turns out that Microsoft Hyper-V fails to comply
 2011 * with its own invented interface: when Hyper-V uses eVMCS, it
 2012 * just sets the first u32 field of the eVMCS to the revision_id
 2013 * specified in MSR_IA32_VMX_BASIC, instead of the eVMCS version
 2014 * number, which is one of the supported versions specified in
 2015 * CPUID.0x4000000A.EAX[0:15].
 2016 *
 2017 * To overcome this Hyper-V bug, we accept here either a supported
 2018 * eVMCS version or the VMCS12 revision_id as valid values for the
 2019 * first u32 field of the eVMCS.
2020 */
2021 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
2022 (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
2023 nested_release_evmcs(vcpu);
b6a0653a 2024 return EVMPTRLD_VMFAIL;
55d2375e
SC
2025 }
2026
11e34914 2027 vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
55d2375e 2028
a21a39c2 2029 evmcs_gpa_changed = true;
55d2375e
SC
2030 /*
2031 * Unlike normal vmcs12, enlightened vmcs12 is not fully
 2032 * reloaded from the guest's memory (read-only fields, fields not
2033 * present in struct hv_enlightened_vmcs, ...). Make sure there
2034 * are no leftovers.
2035 */
2036 if (from_launch) {
2037 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2038 memset(vmcs12, 0, sizeof(*vmcs12));
2039 vmcs12->hdr.revision_id = VMCS12_REVISION;
2040 }
2041
2042 }
a21a39c2
VK
2043
2044 /*
ffdbd50d 2045 * Clean fields data can't be used on VMLAUNCH or when we switch
a21a39c2
VK
2046 * between different L2 guests as KVM keeps a single VMCS12 per L1.
2047 */
ed2a4800 2048 if (from_launch || evmcs_gpa_changed) {
a21a39c2
VK
2049 vmx->nested.hv_evmcs->hv_clean_fields &=
2050 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2051
ed2a4800
VK
2052 vmx->nested.force_msr_bitmap_recalc = true;
2053 }
2054
b6a0653a 2055 return EVMPTRLD_SUCCEEDED;
55d2375e
SC
2056}
2057
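/*
 * Flush the cached vmcs12 to whatever structure L1 actually reads from:
 * the enlightened VMCS when one is in use, the shadow VMCS otherwise.
 */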
3731905e 2058void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
55d2375e
SC
2059{
2060 struct vcpu_vmx *vmx = to_vmx(vcpu);
2061
dc313385 2062 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
55d2375e 2063 copy_vmcs12_to_enlightened(vmx);
dc313385 2064 else
55d2375e 2065 copy_vmcs12_to_shadow(vmx);
55d2375e 2066
3731905e 2067 vmx->nested.need_vmcs12_to_shadow_sync = false;
55d2375e
SC
2068}
2069
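/*
 * hrtimer callback that emulates expiry of the VMX-preemption timer for
 * L2: record the expiration and kick the vCPU so the event is processed.
 */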
2070static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
2071{
2072 struct vcpu_vmx *vmx =
2073 container_of(timer, struct vcpu_vmx, nested.preemption_timer);
2074
2075 vmx->nested.preemption_timer_expired = true;
2076 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
2077 kvm_vcpu_kick(&vmx->vcpu);
2078
2079 return HRTIMER_NORESTART;
2080}
2081
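/*
 * Compute the remaining VMX-preemption timer value in units of the scaled
 * L1 TSC (TSC >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE); the deadline is
 * latched on first use so it remains stable for the current nested run.
 */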
850448f3
PS
2082static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu)
2083{
2084 struct vcpu_vmx *vmx = to_vmx(vcpu);
2085 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
850448f3
PS
2086
2087 u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >>
2088 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
2089
2090 if (!vmx->nested.has_preemption_timer_deadline) {
8d7fbf01
MS
2091 vmx->nested.preemption_timer_deadline =
2092 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc;
850448f3 2093 vmx->nested.has_preemption_timer_deadline = true;
8d7fbf01
MS
2094 }
2095 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc;
850448f3
PS
2096}
2097
2098static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
2099 u64 preemption_timeout)
55d2375e 2100{
55d2375e
SC
2101 struct vcpu_vmx *vmx = to_vmx(vcpu);
2102
2103 /*
2104 * A timer value of zero is architecturally guaranteed to cause
2105 * a VMExit prior to executing any instructions in the guest.
2106 */
2107 if (preemption_timeout == 0) {
2108 vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
2109 return;
2110 }
2111
2112 if (vcpu->arch.virtual_tsc_khz == 0)
2113 return;
2114
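	/*
	 * The timer value is in units of 2^VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE
	 * TSC cycles; convert it to nanoseconds using the guest's virtual TSC
	 * frequency before arming the hrtimer.
	 */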
2115 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
2116 preemption_timeout *= 1000000;
2117 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
2118 hrtimer_start(&vmx->nested.preemption_timer,
ada0098d
JM
2119 ktime_add_ns(ktime_get(), preemption_timeout),
2120 HRTIMER_MODE_ABS_PINNED);
55d2375e
SC
2121}
2122
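/*
 * Determine the EFER value L2 will run with: taken from vmcs12 if L1
 * requested VM_ENTRY_LOAD_IA32_EFER for this entry, otherwise derived from
 * L1's EFER with LMA/LME forced to match the "IA-32e mode guest" control.
 */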
2123static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2124{
2125 if (vmx->nested.nested_run_pending &&
2126 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
2127 return vmcs12->guest_ia32_efer;
2128 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
2129 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
2130 else
2131 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
2132}
2133
2134static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
2135{
2136 /*
2137 * If vmcs02 hasn't been initialized, set the constant vmcs02 state
2138 * according to L0's settings (vmcs12 is irrelevant here). Host
2139 * fields that come from L0 and are not constant, e.g. HOST_CR3,
2140 * will be set as needed prior to VMLAUNCH/VMRESUME.
2141 */
2142 if (vmx->nested.vmcs02_initialized)
2143 return;
2144 vmx->nested.vmcs02_initialized = true;
2145
2146 /*
 2147 * We don't care what the EPTP value is; we just need to guarantee
2148 * it's valid so we don't get a false positive when doing early
2149 * consistency checks.
2150 */
2151 if (enable_ept && nested_early_check)
2a40b900
SC
2152 vmcs_write64(EPT_POINTER,
2153 construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));
55d2375e
SC
2154
2155 /* All VMFUNCs are currently emulated through L0 vmexits. */
2156 if (cpu_has_vmx_vmfunc())
2157 vmcs_write64(VM_FUNCTION_CONTROL, 0);
2158
2159 if (cpu_has_vmx_posted_intr())
2160 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
2161
2162 if (cpu_has_vmx_msr_bitmap())
2163 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
2164
4d6c9892 2165 /*
c3bb9a20
SC
2166 * PML is emulated for L2, but never enabled in hardware as the MMU
2167 * handles A/D emulation. Disabling PML for L2 also avoids having to
2168 * deal with filtering out L2 GPAs from the buffer.
4d6c9892
SC
2169 */
2170 if (enable_pml) {
c3bb9a20
SC
2171 vmcs_write64(PML_ADDRESS, 0);
2172 vmcs_write16(GUEST_PML_INDEX, -1);
4d6c9892 2173 }
55d2375e 2174
c538d57f 2175 if (cpu_has_vmx_encls_vmexit())
64c78508 2176 vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA);
55d2375e
SC
2177
2178 /*
2179 * Set the MSR load/store lists to match L0's settings. Only the
 2180 * addresses are constant (for vmcs02); the counts can change based
2181 * on L2's behavior, e.g. switching to/from long mode.
2182 */
662f1d1d 2183 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
55d2375e
SC
2184 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
2185 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
2186
2187 vmx_set_constant_host_state(vmx);
2188}
2189
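/*
 * vmcs02 fields that only need refreshing on the "rare" path, i.e. when
 * vmcs12 is dirty or an enlightened VMCS is in use: the constant state,
 * the VMCS link pointer and the VPID.
 */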
b1346ab2 2190static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
55d2375e
SC
2191 struct vmcs12 *vmcs12)
2192{
2193 prepare_vmcs02_constant_state(vmx);
2194
64c78508 2195 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
55d2375e
SC
2196
2197 if (enable_vpid) {
2198 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
2199 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
2200 else
2201 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2202 }
2203}
2204
389ab252
SC
2205static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
2206 struct vmcs12 *vmcs12)
55d2375e 2207{
c3bb9a20 2208 u32 exec_control;
55d2375e
SC
2209 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
2210
1e9dfbd7 2211 if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
b1346ab2 2212 prepare_vmcs02_early_rare(vmx, vmcs12);
55d2375e 2213
55d2375e
SC
2214 /*
2215 * PIN CONTROLS
2216 */
389ab252 2217 exec_control = __pin_controls_get(vmcs01);
804939ea
SC
2218 exec_control |= (vmcs12->pin_based_vm_exec_control &
2219 ~PIN_BASED_VMX_PREEMPTION_TIMER);
55d2375e
SC
2220
2221 /* Posted interrupts setting is only taken from vmcs12. */
f7782bb8
SC
2222 vmx->nested.pi_pending = false;
2223 if (nested_cpu_has_posted_intr(vmcs12))
55d2375e 2224 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
f7782bb8 2225 else
55d2375e 2226 exec_control &= ~PIN_BASED_POSTED_INTR;
3af80fec 2227 pin_controls_set(vmx, exec_control);
55d2375e
SC
2228
2229 /*
2230 * EXEC CONTROLS
2231 */
389ab252 2232 exec_control = __exec_controls_get(vmcs01); /* L0's desires */
9dadc2f9 2233 exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
4e2a0bc5 2234 exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
55d2375e
SC
2235 exec_control &= ~CPU_BASED_TPR_SHADOW;
2236 exec_control |= vmcs12->cpu_based_vm_exec_control;
2237
02d496cf 2238 vmx->nested.l1_tpr_threshold = -1;
ca2f5466 2239 if (exec_control & CPU_BASED_TPR_SHADOW)
55d2375e 2240 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
55d2375e 2241#ifdef CONFIG_X86_64
ca2f5466 2242 else
55d2375e
SC
2243 exec_control |= CPU_BASED_CR8_LOAD_EXITING |
2244 CPU_BASED_CR8_STORE_EXITING;
2245#endif
55d2375e
SC
2246
2247 /*
2248 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
2249 * for I/O port accesses.
2250 */
55d2375e 2251 exec_control |= CPU_BASED_UNCOND_IO_EXITING;
de0286b7
SC
2252 exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
2253
2254 /*
2255 * This bit will be computed in nested_get_vmcs12_pages, because
2256 * we do not have access to L1's MSR bitmap yet. For now, keep
2257 * the same bit as before, hoping to avoid multiple VMWRITEs that
2258 * only set/clear this bit.
2259 */
2260 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
2261 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;
2262
3af80fec 2263 exec_controls_set(vmx, exec_control);
55d2375e
SC
2264
2265 /*
2266 * SECONDARY EXEC CONTROLS
2267 */
2268 if (cpu_has_secondary_exec_ctrls()) {
389ab252 2269 exec_control = __secondary_exec_controls_get(vmcs01);
55d2375e
SC
2270
2271 /* Take the following fields only from vmcs12 */
2272 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
389ab252 2273 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
55d2375e 2274 SECONDARY_EXEC_ENABLE_INVPCID |
7f3603b6 2275 SECONDARY_EXEC_ENABLE_RDTSCP |
55d2375e 2276 SECONDARY_EXEC_XSAVES |
e69e72fa 2277 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
55d2375e
SC
2278 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2279 SECONDARY_EXEC_APIC_REGISTER_VIRT |
d041b5ea 2280 SECONDARY_EXEC_ENABLE_VMFUNC |
389ab252
SC
2281 SECONDARY_EXEC_DESC);
2282
55d2375e 2283 if (nested_cpu_has(vmcs12,
c3bb9a20
SC
2284 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
2285 exec_control |= vmcs12->secondary_vm_exec_control;
2286
2287 /* PML is emulated and never enabled in hardware for L2. */
2288 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
55d2375e
SC
2289
2290 /* VMCS shadowing for L2 is emulated for now */
2291 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
2292
55d2375e 2293 /*
469debdb
SC
2294 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
2295 * will not have to rewrite the controls just for this bit.
55d2375e 2296 */
469debdb
SC
2297 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() &&
2298 (vmcs12->guest_cr4 & X86_CR4_UMIP))
2299 exec_control |= SECONDARY_EXEC_DESC;
55d2375e 2300
55d2375e
SC
2301 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
2302 vmcs_write16(GUEST_INTR_STATUS,
2303 vmcs12->guest_intr_status);
55d2375e 2304
bddd82d1
KS
2305 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
2306 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
2307
72add915
SC
2308 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
2309 vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);
2310
3af80fec 2311 secondary_exec_controls_set(vmx, exec_control);
55d2375e
SC
2312 }
2313
2314 /*
2315 * ENTRY CONTROLS
2316 *
2317 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
2318 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
2319 * on the related bits (if supported by the CPU) in the hope that
2320 * we can avoid VMWrites during vmx_set_efer().
2321 */
389ab252
SC
2322 exec_control = __vm_entry_controls_get(vmcs01);
2323 exec_control |= vmcs12->vm_entry_controls;
2324 exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
55d2375e
SC
2325 if (cpu_has_load_ia32_efer()) {
2326 if (guest_efer & EFER_LMA)
2327 exec_control |= VM_ENTRY_IA32E_MODE;
2328 if (guest_efer != host_efer)
2329 exec_control |= VM_ENTRY_LOAD_IA32_EFER;
2330 }
3af80fec 2331 vm_entry_controls_set(vmx, exec_control);
55d2375e
SC
2332
2333 /*
2334 * EXIT CONTROLS
2335 *
2336 * L2->L1 exit controls are emulated - the hardware exit is to L0 so
2337 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
2338 * bits may be modified by vmx_set_efer() in prepare_vmcs02().
2339 */
389ab252 2340 exec_control = __vm_exit_controls_get(vmcs01);
55d2375e
SC
2341 if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
2342 exec_control |= VM_EXIT_LOAD_IA32_EFER;
389ab252
SC
2343 else
2344 exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
3af80fec 2345 vm_exit_controls_set(vmx, exec_control);
55d2375e
SC
2346
2347 /*
2348 * Interrupt/Exception Fields
2349 */
2350 if (vmx->nested.nested_run_pending) {
2351 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2352 vmcs12->vm_entry_intr_info_field);
2353 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2354 vmcs12->vm_entry_exception_error_code);
2355 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2356 vmcs12->vm_entry_instruction_len);
2357 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2358 vmcs12->guest_interruptibility_info);
2359 vmx->loaded_vmcs->nmi_known_unmasked =
2360 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
2361 } else {
2362 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
2363 }
2364}
2365
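/*
 * Copy the bulk of L2's guest state (segments, descriptor tables, PDPTRs,
 * SYSENTER state, etc.) from vmcs12 into vmcs02; groups of fields are
 * skipped when the corresponding eVMCS clean-fields bits say they are
 * unchanged.
 */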
b1346ab2 2366static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
55d2375e
SC
2367{
2368 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2369
2370 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2371 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
2372 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
2373 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
2374 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
2375 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
2376 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
2377 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
2378 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
2379 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
2380 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
2381 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
2382 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
2383 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
2384 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
2385 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
2386 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
2387 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
2388 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
2389 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
1c6f0b47
SC
2390 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
2391 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
55d2375e
SC
2392 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
2393 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
2394 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
2395 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
2396 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
2397 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
2398 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
2399 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
2400 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
2401 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
2402 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
2403 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
2404 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
2405 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
2406 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
2407 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
fc387d8d
SC
2408
2409 vmx->segment_cache.bitmask = 0;
55d2375e
SC
2410 }
2411
2412 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2413 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
2414 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
2415 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
2416 vmcs12->guest_pending_dbg_exceptions);
2417 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
2418 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
2419
2420 /*
 2421 * L1 may access L2's PDPTRs, so save them to construct
 2422 * vmcs12.
2423 */
2424 if (enable_ept) {
2425 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2426 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2427 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2428 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2429 }
c27e5b0d
SC
2430
2431 if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
2432 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
2433 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
55d2375e
SC
2434 }
2435
2436 if (nested_cpu_has_xsaves(vmcs12))
2437 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
2438
2439 /*
2440 * Whether page-faults are trapped is determined by a combination of
a0c13434
PB
2441 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0
2442 * doesn't care about page faults then we should set all of these to
2443 * L1's desires. However, if L0 does care about (some) page faults, it
 2444 * is not easy (if at all possible?) to merge L0 and L1's desires, so we
 2445 * simply ask to exit on each and every L2 page fault. This is done by
2446 * setting MASK=MATCH=0 and (see below) EB.PF=1.
55d2375e
SC
2447 * Note that below we don't need special code to set EB.PF beyond the
2448 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
2449 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
2450 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
2451 */
a0c13434
PB
2452 if (vmx_need_pf_intercept(&vmx->vcpu)) {
2453 /*
2454 * TODO: if both L0 and L1 need the same MASK and MATCH,
2455 * go ahead and use it?
2456 */
2457 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
2458 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
2459 } else {
2460 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
2461 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
2462 }
55d2375e
SC
2463
2464 if (cpu_has_vmx_apicv()) {
2465 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
2466 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
2467 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
2468 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
2469 }
2470
662f1d1d
AL
2471 /*
2472 * Make sure the msr_autostore list is up to date before we set the
2473 * count in the vmcs02.
2474 */
2475 prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);
2476
2477 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
55d2375e
SC
2478 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2479 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2480
2481 set_cr4_guest_host_mask(vmx);
55d2375e
SC
2482}
2483
2484/*
2485 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
2486 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
2487 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
2488 * guest in a way that will both be appropriate to L1's requests, and our
2489 * needs. In addition to modifying the active vmcs (which is vmcs02), this
2490 * function also has additional necessary side-effects, like setting various
2491 * vcpu->arch fields.
 2492 * Returns 0 on success and -EINVAL on failure; the VM-entry failure code
 2493 * is assigned to entry_failure_code on failure.
2494 */
2495static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
0f857223 2496 bool from_vmentry,
68cda40d 2497 enum vm_entry_failure_code *entry_failure_code)
55d2375e
SC
2498{
2499 struct vcpu_vmx *vmx = to_vmx(vcpu);
c7554efc 2500 bool load_guest_pdptrs_vmcs12 = false;
55d2375e 2501
1e9dfbd7 2502 if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
b1346ab2 2503 prepare_vmcs02_rare(vmx, vmcs12);
55d2375e 2504 vmx->nested.dirty_vmcs12 = false;
55d2375e 2505
1e9dfbd7
VK
2506 load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) ||
2507 !(vmx->nested.hv_evmcs->hv_clean_fields &
c7554efc 2508 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
55d2375e
SC
2509 }
2510
2511 if (vmx->nested.nested_run_pending &&
2512 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
2513 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
2514 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
2515 } else {
2516 kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
2517 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
2518 }
3b013a29
SC
2519 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
2520 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
2521 vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
55d2375e
SC
2522 vmx_set_rflags(vcpu, vmcs12->guest_rflags);
2523
55d2375e
SC
2524 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
2525 * bitwise-or of what L1 wants to trap for L2, and what we want to
2526 * trap. Note that CR0.TS also needs updating - we do this later.
2527 */
b6a7cc35 2528 vmx_update_exception_bitmap(vcpu);
55d2375e
SC
2529 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
2530 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2531
2532 if (vmx->nested.nested_run_pending &&
2533 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
2534 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
2535 vcpu->arch.pat = vmcs12->guest_ia32_pat;
2536 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2537 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
2538 }
2539
d041b5ea
IS
2540 vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
2541 vcpu->arch.l1_tsc_offset,
2542 vmx_get_l2_tsc_offset(vcpu),
2543 vmx_get_l2_tsc_multiplier(vcpu));
2544
2545 vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
2546 vcpu->arch.l1_tsc_scaling_ratio,
2547 vmx_get_l2_tsc_multiplier(vcpu));
2548
55d2375e 2549 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
55d2375e 2550 if (kvm_has_tsc_control)
1ab9287a 2551 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
55d2375e 2552
50b265a4 2553 nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
55d2375e
SC
2554
2555 if (nested_cpu_has_ept(vmcs12))
2556 nested_ept_init_mmu_context(vcpu);
55d2375e
SC
2557
2558 /*
2559 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
2560 * bits which we consider mandatory enabled.
2561 * The CR0_READ_SHADOW is what L2 should have expected to read given
 2562 * the specifications by L1; it's not enough to take
 2563 * vmcs12->cr0_read_shadow because our cr0_guest_host_mask may
 2564 * have more bits than L1 expected.
2565 */
2566 vmx_set_cr0(vcpu, vmcs12->guest_cr0);
2567 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2568
2569 vmx_set_cr4(vcpu, vmcs12->guest_cr4);
2570 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
2571
2572 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
2573 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
2574 vmx_set_efer(vcpu, vcpu->arch.efer);
2575
2576 /*
2577 * Guest state is invalid and unrestricted guest is disabled,
2578 * which means L1 attempted VMEntry to L2 with invalid state.
2579 * Fail the VMEntry.
c8607e4a
ML
2580 *
 2581 * However, when force loading the guest state (SMM exit or
 2582 * loading nested state after migration), it is possible to
 2583 * have invalid guest state now, which will be fixed later by
 2584 * restoring the L2 register state.
55d2375e 2585 */
c8607e4a 2586 if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
55d2375e 2587 *entry_failure_code = ENTRY_FAIL_DEFAULT;
c80add0f 2588 return -EINVAL;
55d2375e
SC
2589 }
2590
2591 /* Shadow page tables on either EPT or shadow page tables. */
2592 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
0f857223 2593 from_vmentry, entry_failure_code))
c80add0f 2594 return -EINVAL;
55d2375e 2595
04f11ef4
SC
2596 /*
2597 * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12
2598 * on nested VM-Exit, which can occur without actually running L2 and
727a7e27 2599 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
04f11ef4
SC
2600 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
2601 * transition to HLT instead of running L2.
2602 */
2603 if (enable_ept)
2604 vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
2605
c7554efc
SC
2606 /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
2607 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
2608 is_pae_paging(vcpu)) {
2609 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2610 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2611 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2612 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2613 }
2614
71f73470 2615 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
d1968421 2616 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
bfbb307c
DC
2617 vmcs12->guest_ia32_perf_global_ctrl))) {
2618 *entry_failure_code = ENTRY_FAIL_DEFAULT;
71f73470 2619 return -EINVAL;
bfbb307c 2620 }
71f73470 2621
e9c16c78
PB
2622 kvm_rsp_write(vcpu, vmcs12->guest_rsp);
2623 kvm_rip_write(vcpu, vmcs12->guest_rip);
dc313385
VK
2624
2625 /*
2626 * It was observed that genuine Hyper-V running in L1 doesn't reset
 2627 * 'hv_clean_fields' by itself; it only sets the corresponding dirty
2628 * bits when it changes a field in eVMCS. Mark all fields as clean
2629 * here.
2630 */
2631 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
2632 vmx->nested.hv_evmcs->hv_clean_fields |=
2633 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2634
55d2375e
SC
2635 return 0;
2636}
2637
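/*
 * Per the SDM, "virtual NMIs" requires "NMI exiting", and "NMI-window
 * exiting" requires "virtual NMIs".
 */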
2638static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
2639{
5497b955
SC
2640 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
2641 nested_cpu_has_virtual_nmis(vmcs12)))
55d2375e
SC
2642 return -EINVAL;
2643
5497b955 2644 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
4e2a0bc5 2645 nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
55d2375e
SC
2646 return -EINVAL;
2647
2648 return 0;
2649}
2650
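/*
 * Validate the EPTP L1 wants to use: the memory type, page-walk length,
 * reserved bits and A/D-enable bit are checked against the EPT
 * capabilities exposed to L1 in vmx->nested.msrs.ept_caps.
 */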
ac6389ab 2651static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
55d2375e
SC
2652{
2653 struct vcpu_vmx *vmx = to_vmx(vcpu);
55d2375e
SC
2654
2655 /* Check for memory type validity */
ac6389ab 2656 switch (new_eptp & VMX_EPTP_MT_MASK) {
55d2375e 2657 case VMX_EPTP_MT_UC:
5497b955 2658 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
55d2375e
SC
2659 return false;
2660 break;
2661 case VMX_EPTP_MT_WB:
5497b955 2662 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
55d2375e
SC
2663 return false;
2664 break;
2665 default:
2666 return false;
2667 }
2668
bb1fcc70 2669 /* Page-walk levels validity. */
ac6389ab 2670 switch (new_eptp & VMX_EPTP_PWL_MASK) {
bb1fcc70
SC
2671 case VMX_EPTP_PWL_5:
2672 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT)))
2673 return false;
2674 break;
2675 case VMX_EPTP_PWL_4:
2676 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT)))
2677 return false;
2678 break;
2679 default:
55d2375e 2680 return false;
bb1fcc70 2681 }
55d2375e
SC
2682
2683 /* Reserved bits should not be set */
636e8b73 2684 if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f)))
55d2375e
SC
2685 return false;
2686
2687 /* AD, if set, should be supported */
ac6389ab 2688 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) {
5497b955 2689 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
55d2375e
SC
2690 return false;
2691 }
2692
2693 return true;
2694}
2695
461b4ba4
KS
2696/*
2697 * Checks related to VM-Execution Control Fields
2698 */
2699static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
2700 struct vmcs12 *vmcs12)
55d2375e
SC
2701{
2702 struct vcpu_vmx *vmx = to_vmx(vcpu);
55d2375e 2703
5497b955
SC
2704 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
2705 vmx->nested.msrs.pinbased_ctls_low,
2706 vmx->nested.msrs.pinbased_ctls_high)) ||
2707 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
2708 vmx->nested.msrs.procbased_ctls_low,
2709 vmx->nested.msrs.procbased_ctls_high)))
461b4ba4 2710 return -EINVAL;
55d2375e 2711
461b4ba4 2712 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
5497b955
SC
2713 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control,
2714 vmx->nested.msrs.secondary_ctls_low,
2715 vmx->nested.msrs.secondary_ctls_high)))
461b4ba4
KS
2716 return -EINVAL;
2717
5497b955 2718 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) ||
461b4ba4
KS
2719 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
2720 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
2721 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
2722 nested_vmx_check_apic_access_controls(vcpu, vmcs12) ||
2723 nested_vmx_check_apicv_controls(vcpu, vmcs12) ||
2724 nested_vmx_check_nmi_controls(vmcs12) ||
2725 nested_vmx_check_pml_controls(vcpu, vmcs12) ||
2726 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
2727 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
2728 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
5497b955 2729 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
461b4ba4
KS
2730 return -EINVAL;
2731
bc441211
SC
2732 if (!nested_cpu_has_preemption_timer(vmcs12) &&
2733 nested_cpu_has_save_preemption_timer(vmcs12))
2734 return -EINVAL;
2735
461b4ba4 2736 if (nested_cpu_has_ept(vmcs12) &&
ac6389ab 2737 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer)))
461b4ba4 2738 return -EINVAL;
55d2375e
SC
2739
2740 if (nested_cpu_has_vmfunc(vmcs12)) {
5497b955
SC
2741 if (CC(vmcs12->vm_function_control &
2742 ~vmx->nested.msrs.vmfunc_controls))
461b4ba4 2743 return -EINVAL;
55d2375e
SC
2744
2745 if (nested_cpu_has_eptp_switching(vmcs12)) {
5497b955
SC
2746 if (CC(!nested_cpu_has_ept(vmcs12)) ||
2747 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address)))
461b4ba4 2748 return -EINVAL;
55d2375e
SC
2749 }
2750 }
2751
461b4ba4
KS
2752 return 0;
2753}
2754
61446ba7
KS
2755/*
2756 * Checks related to VM-Exit Control Fields
2757 */
2758static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
2759 struct vmcs12 *vmcs12)
2760{
2761 struct vcpu_vmx *vmx = to_vmx(vcpu);
2762
5497b955
SC
2763 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls,
2764 vmx->nested.msrs.exit_ctls_low,
2765 vmx->nested.msrs.exit_ctls_high)) ||
2766 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)))
61446ba7
KS
2767 return -EINVAL;
2768
2769 return 0;
2770}
2771
5fbf9634
KS
2772/*
2773 * Checks related to VM-Entry Control Fields
2774 */
2775static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
2776 struct vmcs12 *vmcs12)
461b4ba4
KS
2777{
2778 struct vcpu_vmx *vmx = to_vmx(vcpu);
55d2375e 2779
5497b955
SC
2780 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls,
2781 vmx->nested.msrs.entry_ctls_low,
2782 vmx->nested.msrs.entry_ctls_high)))
5fbf9634 2783 return -EINVAL;
55d2375e
SC
2784
2785 /*
2786 * From the Intel SDM, volume 3:
2787 * Fields relevant to VM-entry event injection must be set properly.
2788 * These fields are the VM-entry interruption-information field, the
2789 * VM-entry exception error code, and the VM-entry instruction length.
2790 */
2791 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
2792 u32 intr_info = vmcs12->vm_entry_intr_info_field;
2793 u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
2794 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
2795 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
2796 bool should_have_error_code;
2797 bool urg = nested_cpu_has2(vmcs12,
2798 SECONDARY_EXEC_UNRESTRICTED_GUEST);
2799 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
2800
2801 /* VM-entry interruption-info field: interruption type */
5497b955
SC
2802 if (CC(intr_type == INTR_TYPE_RESERVED) ||
2803 CC(intr_type == INTR_TYPE_OTHER_EVENT &&
2804 !nested_cpu_supports_monitor_trap_flag(vcpu)))
5fbf9634 2805 return -EINVAL;
55d2375e
SC
2806
2807 /* VM-entry interruption-info field: vector */
5497b955
SC
2808 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
2809 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
2810 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
5fbf9634 2811 return -EINVAL;
55d2375e
SC
2812
2813 /* VM-entry interruption-info field: deliver error code */
2814 should_have_error_code =
2815 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
2816 x86_exception_has_error_code(vector);
5497b955 2817 if (CC(has_error_code != should_have_error_code))
5fbf9634 2818 return -EINVAL;
55d2375e
SC
2819
2820 /* VM-entry exception error code */
5497b955 2821 if (CC(has_error_code &&
567926cc 2822 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
5fbf9634 2823 return -EINVAL;
55d2375e
SC
2824
2825 /* VM-entry interruption-info field: reserved bits */
5497b955 2826 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK))
5fbf9634 2827 return -EINVAL;
55d2375e
SC
2828
2829 /* VM-entry instruction length */
2830 switch (intr_type) {
2831 case INTR_TYPE_SOFT_EXCEPTION:
2832 case INTR_TYPE_SOFT_INTR:
2833 case INTR_TYPE_PRIV_SW_EXCEPTION:
5497b955
SC
2834 if (CC(vmcs12->vm_entry_instruction_len > 15) ||
2835 CC(vmcs12->vm_entry_instruction_len == 0 &&
2836 CC(!nested_cpu_has_zero_length_injection(vcpu))))
5fbf9634 2837 return -EINVAL;
55d2375e
SC
2838 }
2839 }
2840
5fbf9634
KS
2841 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
2842 return -EINVAL;
2843
2844 return 0;
2845}
2846
5478ba34
SC
2847static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
2848 struct vmcs12 *vmcs12)
2849{
2850 if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
2851 nested_check_vm_exit_controls(vcpu, vmcs12) ||
2852 nested_check_vm_entry_controls(vcpu, vmcs12))
98d9e858 2853 return -EINVAL;
5478ba34 2854
a8350231
VK
2855 if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled)
2856 return nested_evmcs_check_controls(vmcs12);
2857
5478ba34
SC
2858 return 0;
2859}
2860
af957eeb
ML
2861static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
2862 struct vmcs12 *vmcs12)
2863{
2864#ifdef CONFIG_X86_64
2865 if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) !=
2866 !!(vcpu->arch.efer & EFER_LMA)))
2867 return -EINVAL;
2868#endif
2869 return 0;
2870}
2871
98d9e858
PB
2872static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
2873 struct vmcs12 *vmcs12)
5fbf9634
KS
2874{
2875 bool ia32e;
2876
5497b955
SC
2877 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
2878 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
636e8b73 2879 CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3)))
254b2f3b 2880 return -EINVAL;
711eff3a 2881
5497b955
SC
2882 if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
2883 CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
711eff3a
KS
2884 return -EINVAL;
2885
f6b0db1f 2886 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
5497b955 2887 CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
f6b0db1f
KS
2888 return -EINVAL;
2889
c547cb6f
OU
2890 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2891 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
2892 vmcs12->host_ia32_perf_global_ctrl)))
2893 return -EINVAL;
2894
fd3edd4a 2895#ifdef CONFIG_X86_64
af957eeb 2896 ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE);
fd3edd4a
PB
2897#else
2898 ia32e = false;
2899#endif
2900
2901 if (ia32e) {
af957eeb 2902 if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
fd3edd4a
PB
2903 return -EINVAL;
2904 } else {
af957eeb 2905 if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
fd3edd4a
PB
2906 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
2907 CC((vmcs12->host_rip) >> 32))
2908 return -EINVAL;
2909 }
1ef23e1f 2910
5497b955
SC
2911 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2912 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2913 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2914 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2915 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2916 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2917 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2918 CC(vmcs12->host_cs_selector == 0) ||
2919 CC(vmcs12->host_tr_selector == 0) ||
2920 CC(vmcs12->host_ss_selector == 0 && !ia32e))
1ef23e1f
KS
2921 return -EINVAL;
2922
5497b955
SC
2923 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
2924 CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
2925 CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
2926 CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
fd3edd4a
PB
2927 CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
2928 CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
5845038c 2929 return -EINVAL;
1ef23e1f 2930
5fbf9634
KS
2931 /*
2932 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
2933 * IA32_EFER MSR must be 0 in the field for that register. In addition,
2934 * the values of the LMA and LME bits in the field must each be that of
2935 * the host address-space size VM-exit control.
2936 */
2937 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
5497b955
SC
2938 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) ||
2939 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) ||
2940 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)))
254b2f3b 2941 return -EINVAL;
5fbf9634
KS
2942 }
2943
55d2375e
SC
2944 return 0;
2945}
2946
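/*
 * An in-use VMCS link pointer (i.e. not INVALID_GPA) must be page-aligned,
 * within the guest-physical address width, and reference a VMCS whose
 * revision ID and shadow-VMCS indicator are consistent with vmcs12.
 */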
2947static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
2948 struct vmcs12 *vmcs12)
2949{
7d0172b3
DW
2950 struct vcpu_vmx *vmx = to_vmx(vcpu);
2951 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
2952 struct vmcs_hdr hdr;
55d2375e 2953
64c78508 2954 if (vmcs12->vmcs_link_pointer == INVALID_GPA)
55d2375e
SC
2955 return 0;
2956
5497b955 2957 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
55d2375e
SC
2958 return -EINVAL;
2959
7d0172b3
DW
2960 if (ghc->gpa != vmcs12->vmcs_link_pointer &&
2961 CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
2962 vmcs12->vmcs_link_pointer, VMCS12_SIZE)))
2963 return -EINVAL;
55d2375e 2964
7d0172b3
DW
2965 if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
2966 offsetof(struct vmcs12, hdr),
2967 sizeof(hdr))))
2968 return -EINVAL;
88925305 2969
7d0172b3
DW
2970 if (CC(hdr.revision_id != VMCS12_REVISION) ||
2971 CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
2972 return -EINVAL;
88925305 2973
7d0172b3 2974 return 0;
55d2375e
SC
2975}
2976
9c3e922b
SC
2977/*
2978 * Checks related to Guest Non-register State
2979 */
2980static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
2981{
5497b955 2982 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
bf0cd88c
YQ
2983 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT &&
2984 vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI))
9c3e922b
SC
2985 return -EINVAL;
2986
2987 return 0;
2988}
2989
5478ba34
SC
2990static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
2991 struct vmcs12 *vmcs12,
68cda40d 2992 enum vm_entry_failure_code *entry_failure_code)
55d2375e
SC
2993{
2994 bool ia32e;
2995
68cda40d 2996 *entry_failure_code = ENTRY_FAIL_DEFAULT;
55d2375e 2997
5497b955
SC
2998 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
2999 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
c80add0f 3000 return -EINVAL;
55d2375e 3001
b91991bf
KS
3002 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
3003 CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
3004 return -EINVAL;
3005
de2bc2bf 3006 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
5497b955 3007 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
c80add0f 3008 return -EINVAL;
55d2375e
SC
3009
3010 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
68cda40d 3011 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR;
c80add0f 3012 return -EINVAL;
55d2375e
SC
3013 }
3014
bfc6ad6a
OU
3015 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
3016 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
3017 vmcs12->guest_ia32_perf_global_ctrl)))
3018 return -EINVAL;
3019
55d2375e
SC
3020 /*
3021 * If the load IA32_EFER VM-entry control is 1, the following checks
3022 * are performed on the field for the IA32_EFER MSR:
3023 * - Bits reserved in the IA32_EFER MSR must be 0.
3024 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
3025 * the IA-32e mode guest VM-exit control. It must also be identical
3026 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
3027 * CR0.PG) is 1.
3028 */
3029 if (to_vmx(vcpu)->nested.nested_run_pending &&
3030 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
3031 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
5497b955
SC
3032 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
3033 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
3034 CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
3035 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
c80add0f 3036 return -EINVAL;
55d2375e
SC
3037 }
3038
3039 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
5497b955
SC
3040 (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
3041 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
c80add0f 3042 return -EINVAL;
55d2375e 3043
9c3e922b 3044 if (nested_check_guest_non_reg_state(vmcs12))
c80add0f 3045 return -EINVAL;
55d2375e
SC
3046
3047 return 0;
3048}
3049
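/*
 * When nested_early_check is enabled, perform a minimal hardware VM-Enter
 * so the CPU itself runs the VM-entry consistency checks before KVM
 * commits to the emulated VM-Enter; GUEST_RFLAGS is zeroed (clearing the
 * bit hardware reserves to '1') so the entry fails before L2 can execute.
 */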
453eafbe 3050static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
55d2375e
SC
3051{
3052 struct vcpu_vmx *vmx = to_vmx(vcpu);
1a715810 3053 unsigned long cr3, cr4;
f1727b49 3054 bool vm_fail;
55d2375e
SC
3055
3056 if (!nested_early_check)
3057 return 0;
3058
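	/*
	 * Zero the VM-entry/VM-exit MSR-load counts so the throwaway VM-Enter
	 * below doesn't load any MSRs; the counts are restored once the check
	 * completes.
	 */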
3059 if (vmx->msr_autoload.host.nr)
3060 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
3061 if (vmx->msr_autoload.guest.nr)
3062 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
3063
3064 preempt_disable();
3065
3066 vmx_prepare_switch_to_guest(vcpu);
3067
3068 /*
3069 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
3070 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to
49f933d4 3071 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
55d2375e
SC
3072 * there is no need to preserve other bits or save/restore the field.
3073 */
3074 vmcs_writel(GUEST_RFLAGS, 0);
3075
1a715810
SC
3076 cr3 = __get_current_cr3_fast();
3077 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
3078 vmcs_writel(HOST_CR3, cr3);
3079 vmx->loaded_vmcs->host_state.cr3 = cr3;
3080 }
3081
55d2375e
SC
3082 cr4 = cr4_read_shadow();
3083 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
3084 vmcs_writel(HOST_CR4, cr4);
3085 vmx->loaded_vmcs->host_state.cr4 = cr4;
3086 }
3087
150f17bf 3088 vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
bb066506 3089 __vmx_vcpu_run_flags(vmx));
55d2375e 3090
55d2375e
SC
3091 if (vmx->msr_autoload.host.nr)
3092 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
3093 if (vmx->msr_autoload.guest.nr)
3094 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
3095
f1727b49 3096 if (vm_fail) {
380e0055
SC
3097 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);
3098
541e886f 3099 preempt_enable();
380e0055
SC
3100
3101 trace_kvm_nested_vmenter_failed(
3102 "early hardware check VM-instruction error: ", error);
3103 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
55d2375e
SC
3104 return 1;
3105 }
3106
3107 /*
3108 * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
3109 */
55d2375e
SC
3110 if (hw_breakpoint_active())
3111 set_debugreg(__this_cpu_read(cpu_dr7), 7);
84b6a349 3112 local_irq_enable();
541e886f 3113 preempt_enable();
55d2375e
SC
3114
3115 /*
3116 * A non-failing VMEntry means we somehow entered guest mode with
3117 * an illegal RIP, and that's just the tip of the iceberg. There
3118 * is no telling what memory has been modified or what state has
3119 * been exposed to unknown code. Hitting this all but guarantees
3120 * a (very critical) hardware issue.
3121 */
3122 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
3123 VMX_EXIT_REASONS_FAILED_VMENTRY));
3124
3125 return 0;
3126}
55d2375e 3127
9a78e158 3128static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
55d2375e 3129{
55d2375e 3130 struct vcpu_vmx *vmx = to_vmx(vcpu);
55d2375e 3131
e942dbf8
VK
3132 /*
3133	 * hv_evmcs may end up unmapped after migration (when
3134	 * L2 was running); map it here to make sure vmcs12 changes are
3135	 * properly reflected.
3136 */
1e9dfbd7 3137 if (vmx->nested.enlightened_vmcs_enabled &&
27849968 3138 vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
b6a0653a
VK
3139 enum nested_evmptrld_status evmptrld_status =
3140 nested_vmx_handle_enlightened_vmptrld(vcpu, false);
3141
3142 if (evmptrld_status == EVMPTRLD_VMFAIL ||
f5c7e842 3143 evmptrld_status == EVMPTRLD_ERROR)
b6a0653a 3144 return false;
8629b625
VK
3145
3146 /*
3147	 * After migration, vmcs12 always provides the most up-to-date
3148	 * information; copy it to the eVMCS upon entry.
3149 */
3150 vmx->nested.need_vmcs12_to_shadow_sync = true;
b6a0653a 3151 }
e942dbf8 3152
9a78e158
PB
3153 return true;
3154}
3155
3156static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
3157{
3158 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3159 struct vcpu_vmx *vmx = to_vmx(vcpu);
3160 struct kvm_host_map *map;
3161 struct page *page;
3162 u64 hpa;
3163
158a48ec
ML
3164 if (!vcpu->arch.pdptrs_from_userspace &&
3165 !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
0f857223
ML
3166 /*
3167 * Reload the guest's PDPTRs since after a migration
3168	 * the guest CR3 might be restored prior to setting the nested
3169	 * state, which can lead to the wrong PDPTRs being loaded.
3170 */
2df4a5eb 3171 if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
0f857223
ML
3172 return false;
3173 }
3174
3175
55d2375e
SC
3176 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3177 /*
3178 * Translate L1 physical address to host physical
3179 * address for vmcs02. Keep the page pinned, so this
3180 * physical address remains valid. We keep a reference
3181 * to it so we can release it later.
3182 */
3183 if (vmx->nested.apic_access_page) { /* shouldn't happen */
b11494bc 3184 kvm_release_page_clean(vmx->nested.apic_access_page);
55d2375e
SC
3185 vmx->nested.apic_access_page = NULL;
3186 }
3187 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
55d2375e
SC
3188 if (!is_error_page(page)) {
3189 vmx->nested.apic_access_page = page;
3190 hpa = page_to_phys(vmx->nested.apic_access_page);
3191 vmcs_write64(APIC_ACCESS_ADDR, hpa);
3192 } else {
671ddc70
JM
3193 pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n",
3194 __func__);
3195 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3196 vcpu->run->internal.suberror =
3197 KVM_INTERNAL_ERROR_EMULATION;
3198 vcpu->run->internal.ndata = 0;
3199 return false;
55d2375e
SC
3200 }
3201 }
3202
3203 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
96c66e87 3204 map = &vmx->nested.virtual_apic_map;
55d2375e 3205
96c66e87
KA
3206 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
3207 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
69090810
PB
3208 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
3209 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
3210 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3211 /*
3212	 * The processor will never use the TPR shadow; simply
3213 * clear the bit from the execution control. Such a
3214 * configuration is useless, but it happens in tests.
3215 * For any other configuration, failing the vm entry is
3216 * _not_ what the processor does but it's basically the
3217 * only possibility we have.
3218 */
2183f564 3219 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
69090810 3220 } else {
ca2f5466
SC
3221 /*
3222 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
3223 * force VM-Entry to fail.
3224 */
64c78508 3225 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA);
55d2375e
SC
3226 }
3227 }
3228
3229 if (nested_cpu_has_posted_intr(vmcs12)) {
3278e049
KA
3230 map = &vmx->nested.pi_desc_map;
3231
3232 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
3233 vmx->nested.pi_desc =
3234 (struct pi_desc *)(((void *)map->hva) +
3235 offset_in_page(vmcs12->posted_intr_desc_addr));
3236 vmcs_write64(POSTED_INTR_DESC_ADDR,
3237 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
966eefb8
JM
3238 } else {
3239 /*
3240 * Defer the KVM_INTERNAL_EXIT until KVM tries to
3241 * access the contents of the VMCS12 posted interrupt
3242 * descriptor. (Note that KVM may do this when it
3243 * should not, per the architectural specification.)
3244 */
3245 vmx->nested.pi_desc = NULL;
3246 pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
55d2375e 3247 }
55d2375e
SC
3248 }
3249 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
2183f564 3250 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
55d2375e 3251 else
2183f564 3252 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
9a78e158
PB
3253
3254 return true;
3255}
3256
3257static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
3258{
f5c7e842
VK
3259 if (!nested_get_evmcs_page(vcpu)) {
3260 pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
3261 __func__);
3262 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3263 vcpu->run->internal.suberror =
3264 KVM_INTERNAL_ERROR_EMULATION;
3265 vcpu->run->internal.ndata = 0;
3266
9a78e158 3267 return false;
f5c7e842 3268 }
9a78e158
PB
3269
3270 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
3271 return false;
3272
671ddc70 3273 return true;
55d2375e
SC
3274}
3275
02f5fb2e
SC
3276static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
3277{
3278 struct vmcs12 *vmcs12;
3279 struct vcpu_vmx *vmx = to_vmx(vcpu);
3280 gpa_t dst;
3281
3282 if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
3283 return 0;
3284
3285 if (WARN_ON_ONCE(vmx->nested.pml_full))
3286 return 1;
3287
3288 /*
3289 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is
3290 * set is already checked as part of A/D emulation.
3291 */
3292 vmcs12 = get_vmcs12(vcpu);
3293 if (!nested_cpu_has_pml(vmcs12))
3294 return 0;
3295
3296 if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
3297 vmx->nested.pml_full = true;
3298 return 1;
3299 }
3300
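	/*
	 * PML records 4KiB-aligned guest-physical addresses; mirror the
	 * hardware behavior by clearing bits 11:0 before logging the GPA
	 * into L1's PML buffer.
	 */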
3301 gpa &= ~0xFFFull;
3302 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
3303
3304 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
3305 offset_in_page(dst), sizeof(gpa)))
3306 return 0;
3307
3308 vmcs12->guest_pml_index--;
3309
3310 return 0;
3311}
3312
55d2375e
SC
3313/*
3314 * Intel's VMX Instruction Reference specifies a common set of prerequisites
3315 * for running VMX instructions (except VMXON, whose prerequisites are
3316 * slightly different). It also specifies what exception to inject otherwise.
3317 * Note that many of these exceptions have priority over VM exits, so they
3318 * don't have to be checked again here.
3319 */
3320static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
3321{
3322 if (!to_vmx(vcpu)->nested.vmxon) {
3323 kvm_queue_exception(vcpu, UD_VECTOR);
3324 return 0;
3325 }
3326
3327 if (vmx_get_cpl(vcpu)) {
3328 kvm_inject_gp(vcpu, 0);
3329 return 0;
3330 }
3331
3332 return 1;
3333}
3334
3335static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
3336{
3337 u8 rvi = vmx_get_rvi();
3338 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
3339
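	/*
	 * A virtual interrupt is deliverable only if the priority class
	 * (bits 7:4) of RVI is higher than that of the virtual PPR.
	 */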
3340 return ((rvi & 0xf0) > (vppr & 0xf0));
3341}
3342
3343static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3344 struct vmcs12 *vmcs12);
3345
3346/*
3347 * If from_vmentry is false, this is being called from state restore (either RSM
3348 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
671ddc70
JM
3349 *
3350 * Returns:
463bfeee
ML
3351 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode
3352 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail
3353 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit
3354 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error
55d2375e 3355 */
671ddc70
JM
3356enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
3357 bool from_vmentry)
55d2375e
SC
3358{
3359 struct vcpu_vmx *vmx = to_vmx(vcpu);
3360 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
68cda40d 3361 enum vm_entry_failure_code entry_failure_code;
55d2375e 3362 bool evaluate_pending_interrupts;
8e533240
SC
3363 union vmx_exit_reason exit_reason = {
3364 .basic = EXIT_REASON_INVALID_STATE,
3365 .failed_vmentry = 1,
3366 };
3367 u32 failed_index;
55d2375e 3368
40e5f908 3369 kvm_service_local_tlb_flush_requests(vcpu);
eeeb4f67 3370
2183f564 3371 evaluate_pending_interrupts = exec_controls_get(vmx) &
4e2a0bc5 3372 (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
55d2375e
SC
3373 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
3374 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
3375
3376 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
3377 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
3378 if (kvm_mpx_supported() &&
3379 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
3380 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3381
f087a029
SC
3382 /*
3383 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
3384 * nested early checks are disabled. In the event of a "late" VM-Fail,
3385 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
3386 * software model to the pre-VMEntry host state. When EPT is disabled,
3387 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
3388 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing
3389 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
3390 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested
3391 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
3392 * guaranteed to be overwritten with a shadow CR3 prior to re-entering
3393 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
3394 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
3395 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
3396 * path would need to manually save/restore vmcs01.GUEST_CR3.
3397 */
3398 if (!enable_ept && !nested_early_check)
3399 vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3400
55d2375e
SC
3401 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
3402
389ab252 3403 prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12);
55d2375e
SC
3404
3405 if (from_vmentry) {
b89d5ad0
SC
3406 if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
3407 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
671ddc70 3408 return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
b89d5ad0 3409 }
55d2375e
SC
3410
3411 if (nested_vmx_check_vmentry_hw(vcpu)) {
3412 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
671ddc70 3413 return NVMX_VMENTRY_VMFAIL;
55d2375e
SC
3414 }
3415
68cda40d
SC
3416 if (nested_vmx_check_guest_state(vcpu, vmcs12,
3417 &entry_failure_code)) {
8e533240 3418 exit_reason.basic = EXIT_REASON_INVALID_STATE;
68cda40d 3419 vmcs12->exit_qualification = entry_failure_code;
55d2375e 3420 goto vmentry_fail_vmexit;
68cda40d 3421 }
55d2375e
SC
3422 }
3423
3424 enter_guest_mode(vcpu);
55d2375e 3425
0f857223 3426 if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) {
8e533240 3427 exit_reason.basic = EXIT_REASON_INVALID_STATE;
68cda40d 3428 vmcs12->exit_qualification = entry_failure_code;
55d2375e 3429 goto vmentry_fail_vmexit_guest_mode;
68cda40d 3430 }
55d2375e
SC
3431
3432 if (from_vmentry) {
68cda40d
SC
3433 failed_index = nested_vmx_load_msr(vcpu,
3434 vmcs12->vm_entry_msr_load_addr,
3435 vmcs12->vm_entry_msr_load_count);
3436 if (failed_index) {
8e533240 3437 exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
68cda40d 3438 vmcs12->exit_qualification = failed_index;
55d2375e 3439 goto vmentry_fail_vmexit_guest_mode;
68cda40d 3440 }
55d2375e
SC
3441 } else {
3442 /*
3443 * The MMU is not initialized to point at the right entities yet and
3444 * "get pages" would need to read data from the guest (i.e. we will
3445 * need to perform gpa to hpa translation). Request a call
3446 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs
3447 * have already been set at vmentry time and should not be reset.
3448 */
729c15c2 3449 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
55d2375e
SC
3450 }
3451
3452 /*
3453 * If L1 had a pending IRQ/NMI until it executed
3454 * VMLAUNCH/VMRESUME which wasn't delivered because it was
3455 * disallowed (e.g. interrupts disabled), L0 needs to
3456	 * evaluate whether this pending event should cause an exit from L2
3457	 * to L1 or be delivered directly to L2 (e.g. in case L1 doesn't
3458	 * intercept EXTERNAL_INTERRUPT).
3459 *
3460 * Usually this would be handled by the processor noticing an
3461 * IRQ/NMI window request, or checking RVI during evaluation of
3462 * pending virtual interrupts. However, this setting was done
3463 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
3464 * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
3465 */
3466 if (unlikely(evaluate_pending_interrupts))
3467 kvm_make_request(KVM_REQ_EVENT, vcpu);
3468
359a6c3d
PB
3469 /*
3470 * Do not start the preemption timer hrtimer until after we know
3471 * we are successful, so that only nested_vmx_vmexit needs to cancel
3472 * the timer.
3473 */
3474 vmx->nested.preemption_timer_expired = false;
850448f3
PS
3475 if (nested_cpu_has_preemption_timer(vmcs12)) {
3476 u64 timer_value = vmx_calc_preemption_timer_value(vcpu);
3477 vmx_start_preemption_timer(vcpu, timer_value);
3478 }
359a6c3d 3479
55d2375e
SC
3480 /*
3481 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
3482 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
3483 * returned as far as L1 is concerned. It will only return (and set
3484 * the success flag) when L2 exits (see nested_vmx_vmexit()).
3485 */
671ddc70 3486 return NVMX_VMENTRY_SUCCESS;
55d2375e
SC
3487
3488 /*
3489 * A failed consistency check that leads to a VMExit during L1's
3490 * VMEnter to L2 is a variation of a normal VMexit, as explained in
3491 * 26.7 "VM-entry failures during or after loading guest state".
3492 */
3493vmentry_fail_vmexit_guest_mode:
5e3d394f 3494 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
55d2375e
SC
3495 vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
3496 leave_guest_mode(vcpu);
3497
3498vmentry_fail_vmexit:
3499 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3500
3501 if (!from_vmentry)
671ddc70 3502 return NVMX_VMENTRY_VMEXIT;
55d2375e
SC
3503
3504 load_vmcs12_host_state(vcpu, vmcs12);
8e533240 3505 vmcs12->vm_exit_reason = exit_reason.full;
1e9dfbd7 3506 if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
3731905e 3507 vmx->nested.need_vmcs12_to_shadow_sync = true;
671ddc70 3508 return NVMX_VMENTRY_VMEXIT;
55d2375e
SC
3509}
3510
3511/*
3512 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
3513 * for running an L2 nested guest.
3514 */
3515static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
3516{
3517 struct vmcs12 *vmcs12;
671ddc70 3518 enum nvmx_vmentry_status status;
55d2375e
SC
3519 struct vcpu_vmx *vmx = to_vmx(vcpu);
3520 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
b6a0653a 3521 enum nested_evmptrld_status evmptrld_status;
55d2375e
SC
3522
3523 if (!nested_vmx_check_permission(vcpu))
3524 return 1;
3525
b6a0653a
VK
3526 evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
3527 if (evmptrld_status == EVMPTRLD_ERROR) {
3528 kvm_queue_exception(vcpu, UD_VECTOR);
55d2375e 3529 return 1;
b6a0653a 3530 }
55d2375e 3531
018d70ff
EH
3532 kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
3533
3534 if (CC(evmptrld_status == EVMPTRLD_VMFAIL))
3535 return nested_vmx_failInvalid(vcpu);
3536
1e9dfbd7 3537 if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) &&
64c78508 3538 vmx->nested.current_vmptr == INVALID_GPA))
55d2375e
SC
3539 return nested_vmx_failInvalid(vcpu);
3540
3541 vmcs12 = get_vmcs12(vcpu);
3542
3543 /*
3544 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
3545 * that there *is* a valid VMCS pointer, RFLAGS.CF is set
3546 * rather than RFLAGS.ZF, and no error number is stored to the
3547 * VM-instruction error field.
3548 */
fc595f35 3549 if (CC(vmcs12->hdr.shadow_vmcs))
55d2375e
SC
3550 return nested_vmx_failInvalid(vcpu);
3551
1e9dfbd7 3552 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
d6bf71a1 3553 copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields);
55d2375e
SC
3554 /* Enlightened VMCS doesn't have launch state */
3555 vmcs12->launch_state = !launch;
3556 } else if (enable_shadow_vmcs) {
3557 copy_shadow_to_vmcs12(vmx);
3558 }
3559
3560 /*
3561 * The nested entry process starts with enforcing various prerequisites
3562	 * on vmcs12 as required by the Intel SDM, and acting appropriately when
3563 * they fail: As the SDM explains, some conditions should cause the
3564 * instruction to fail, while others will cause the instruction to seem
3565 * to succeed, but return an EXIT_REASON_INVALID_STATE.
3566 * To speed up the normal (success) code path, we should avoid checking
3567	 * for misconfigurations that will be caught by the processor anyway
3568 * when using the merged vmcs02.
3569 */
fc595f35 3570 if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
b2656e4d 3571 return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
55d2375e 3572
fc595f35 3573 if (CC(vmcs12->launch_state == launch))
b2656e4d 3574 return nested_vmx_fail(vcpu,
55d2375e
SC
3575 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
3576 : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
3577
98d9e858 3578 if (nested_vmx_check_controls(vcpu, vmcs12))
b2656e4d 3579 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
5478ba34 3580
af957eeb
ML
3581 if (nested_vmx_check_address_space_size(vcpu, vmcs12))
3582 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
3583
98d9e858 3584 if (nested_vmx_check_host_state(vcpu, vmcs12))
b2656e4d 3585 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
55d2375e
SC
3586
3587 /*
3588 * We're finally done with prerequisite checking, and can start with
3589 * the nested entry.
3590 */
3591 vmx->nested.nested_run_pending = 1;
850448f3 3592 vmx->nested.has_preemption_timer_deadline = false;
671ddc70
JM
3593 status = nested_vmx_enter_non_root_mode(vcpu, true);
3594 if (unlikely(status != NVMX_VMENTRY_SUCCESS))
3595 goto vmentry_failed;
55d2375e 3596
25bb2cf9
SC
3597 /* Emulate processing of posted interrupts on VM-Enter. */
3598 if (nested_cpu_has_posted_intr(vmcs12) &&
3599 kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) {
3600 vmx->nested.pi_pending = true;
3601 kvm_make_request(KVM_REQ_EVENT, vcpu);
3602 kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv);
3603 }
3604
55d2375e
SC
3605 /* Hide L1D cache contents from the nested guest. */
3606 vmx->vcpu.arch.l1tf_flush_l1d = true;
3607
3608 /*
3609 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
3610 * also be used as part of restoring nVMX state for
3611 * snapshot restore (migration).
3612 *
3613 * In this flow, it is assumed that vmcs12 cache was
163b0991 3614 * transferred as part of captured nVMX state and should
55d2375e
SC
3615 * therefore not be read from guest memory (which may not
3616 * exist on destination host yet).
3617 */
3618 nested_cache_shadow_vmcs12(vcpu, vmcs12);
3619
bf0cd88c
YQ
3620 switch (vmcs12->guest_activity_state) {
3621 case GUEST_ACTIVITY_HLT:
3622 /*
3623 * If we're entering a halted L2 vcpu and the L2 vcpu won't be
3624 * awakened by event injection or by an NMI-window VM-exit or
3625 * by an interrupt-window VM-exit, halt the vcpu.
3626 */
3627 if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
3628 !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) &&
3629 !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
3630 (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
3631 vmx->nested.nested_run_pending = 0;
1460179d 3632 return kvm_emulate_halt_noskip(vcpu);
bf0cd88c
YQ
3633 }
3634 break;
3635 case GUEST_ACTIVITY_WAIT_SIPI:
55d2375e 3636 vmx->nested.nested_run_pending = 0;
bf0cd88c
YQ
3637 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
3638 break;
3639 default:
3640 break;
55d2375e 3641 }
bf0cd88c 3642
55d2375e 3643 return 1;
671ddc70
JM
3644
3645vmentry_failed:
3646 vmx->nested.nested_run_pending = 0;
3647 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
3648 return 0;
3649 if (status == NVMX_VMENTRY_VMEXIT)
3650 return 1;
3651 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
b2656e4d 3652 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
55d2375e
SC
3653}
3654
3655/*
3656 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
67b0ae43 3657 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
55d2375e
SC
3658 * This function returns the new value we should put in vmcs12.guest_cr0.
3659 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
3660 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
3661 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
3662 * didn't trap the bit, because if L1 did, so would L0).
3663 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
3664 * been modified by L2, and L1 knows it. So just leave the old value of
3665 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
3666 * isn't relevant, because if L0 traps this bit it can set it to anything.
3667 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
3668 * changed these bits, and therefore they need to be updated, but L0
3669 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather
3670 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
3671 */
3672static inline unsigned long
3673vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3674{
3675 return
3676 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
3677 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
3678 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
3679 vcpu->arch.cr0_guest_owned_bits));
3680}
3681
3682static inline unsigned long
3683vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3684{
3685 return
3686 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
3687 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
3688 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
3689 vcpu->arch.cr4_guest_owned_bits));
3690}
3691
3692static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
9bd1f0ef
SC
3693 struct vmcs12 *vmcs12,
3694 u32 vm_exit_reason, u32 exit_intr_info)
55d2375e
SC
3695{
3696 u32 idt_vectoring;
3697 unsigned int nr;
3698
9bd1f0ef
SC
3699 /*
3700 * Per the SDM, VM-Exits due to double and triple faults are never
3701 * considered to occur during event delivery, even if the double/triple
3702 * fault is the result of an escalating vectoring issue.
3703 *
3704 * Note, the SDM qualifies the double fault behavior with "The original
3705 * event results in a double-fault exception". It's unclear why the
3706 * qualification exists since exits due to double fault can occur only
3707 * while vectoring a different exception (injected events are never
3708 * subject to interception), i.e. there's _always_ an original event.
3709 *
3710 * The SDM also uses NMI as a confusing example for the "original event
3711 * causes the VM exit directly" clause. NMI isn't special in any way,
3712 * the same rule applies to all events that cause an exit directly.
3713 * NMI is an odd choice for the example because NMIs can only occur on
3714 * instruction boundaries, i.e. they _can't_ occur during vectoring.
3715 */
3716 if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT ||
3717 ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI &&
3718 is_double_fault(exit_intr_info))) {
3719 vmcs12->idt_vectoring_info_field = 0;
3720 } else if (vcpu->arch.exception.injected) {
55d2375e
SC
3721 nr = vcpu->arch.exception.nr;
3722 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3723
3724 if (kvm_exception_is_soft(nr)) {
3725 vmcs12->vm_exit_instruction_len =
3726 vcpu->arch.event_exit_inst_len;
3727 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
3728 } else
3729 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
3730
3731 if (vcpu->arch.exception.has_error_code) {
3732 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
3733 vmcs12->idt_vectoring_error_code =
3734 vcpu->arch.exception.error_code;
3735 }
3736
3737 vmcs12->idt_vectoring_info_field = idt_vectoring;
3738 } else if (vcpu->arch.nmi_injected) {
3739 vmcs12->idt_vectoring_info_field =
3740 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
3741 } else if (vcpu->arch.interrupt.injected) {
3742 nr = vcpu->arch.interrupt.nr;
3743 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3744
3745 if (vcpu->arch.interrupt.soft) {
3746 idt_vectoring |= INTR_TYPE_SOFT_INTR;
3747 vmcs12->vm_entry_instruction_len =
3748 vcpu->arch.event_exit_inst_len;
3749 } else
3750 idt_vectoring |= INTR_TYPE_EXT_INTR;
3751
3752 vmcs12->idt_vectoring_info_field = idt_vectoring;
9bd1f0ef
SC
3753 } else {
3754 vmcs12->idt_vectoring_info_field = 0;
55d2375e
SC
3755 }
3756}
3757
3758
96b100cd 3759void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
55d2375e
SC
3760{
3761 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3762 gfn_t gfn;
3763
3764 /*
3765 * Don't need to mark the APIC access page dirty; it is never
3766 * written to by the CPU during APIC virtualization.
3767 */
3768
3769 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3770 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
3771 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3772 }
3773
3774 if (nested_cpu_has_posted_intr(vmcs12)) {
3775 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
3776 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3777 }
3778}
3779
650293c3 3780static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
55d2375e
SC
3781{
3782 struct vcpu_vmx *vmx = to_vmx(vcpu);
3783 int max_irr;
3784 void *vapic_page;
3785 u16 status;
3786
966eefb8 3787 if (!vmx->nested.pi_pending)
650293c3 3788 return 0;
55d2375e 3789
966eefb8
JM
3790 if (!vmx->nested.pi_desc)
3791 goto mmio_needed;
3792
55d2375e 3793 vmx->nested.pi_pending = false;
966eefb8 3794
55d2375e 3795 if (!pi_test_and_clear_on(vmx->nested.pi_desc))
650293c3 3796 return 0;
55d2375e
SC
3797
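	/*
	 * Find the highest pending vector in the 256-bit posted-interrupt
	 * request bitmap; find_last_bit() returns 256 if the PIR is empty.
	 */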
3798 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
3799 if (max_irr != 256) {
96c66e87
KA
3800 vapic_page = vmx->nested.virtual_apic_map.hva;
3801 if (!vapic_page)
0fe998b2 3802 goto mmio_needed;
96c66e87 3803
55d2375e
SC
3804 __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
3805 vapic_page, &max_irr);
55d2375e
SC
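		/*
		 * RVI is the low byte of GUEST_INTR_STATUS; raise it if the
		 * PIR holds a higher-priority vector than is currently pending.
		 */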
3806 status = vmcs_read16(GUEST_INTR_STATUS);
3807 if ((u8)max_irr > ((u8)status & 0xff)) {
3808 status &= ~0xff;
3809 status |= (u8)max_irr;
3810 vmcs_write16(GUEST_INTR_STATUS, status);
3811 }
3812 }
3813
3814 nested_mark_vmcs12_pages_dirty(vcpu);
650293c3 3815 return 0;
0fe998b2
JM
3816
3817mmio_needed:
3818 kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL);
3819 return -ENXIO;
55d2375e
SC
3820}
3821
3822static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
3823 unsigned long exit_qual)
3824{
3825 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3826 unsigned int nr = vcpu->arch.exception.nr;
3827 u32 intr_info = nr | INTR_INFO_VALID_MASK;
3828
3829 if (vcpu->arch.exception.has_error_code) {
3830 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
3831 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3832 }
3833
3834 if (kvm_exception_is_soft(nr))
3835 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3836 else
3837 intr_info |= INTR_TYPE_HARD_EXCEPTION;
3838
3839 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
3840 vmx_get_nmi_mask(vcpu))
3841 intr_info |= INTR_INFO_UNBLOCK_NMI;
3842
3843 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
3844}
3845
684c0422
OU
3846/*
3847 * Returns true if a debug trap is pending delivery.
3848 *
3849 * In KVM, debug traps bear an exception payload. As such, the class of a #DB
3850 * exception may be inferred from the presence of an exception payload.
3851 */
3852static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu)
3853{
3854 return vcpu->arch.exception.pending &&
3855 vcpu->arch.exception.nr == DB_VECTOR &&
3856 vcpu->arch.exception.payload;
3857}
3858
3859/*
3860 * Certain VM-exits set the 'pending debug exceptions' field to indicate a
3861 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM
3862 * represents these debug traps with a payload that is said to be compatible
3863 * with the 'pending debug exceptions' field, write the payload to the VMCS
3864 * field if a VM-exit is delivered before the debug trap.
3865 */
3866static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
3867{
3868 if (vmx_pending_dbg_trap(vcpu))
3869 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
3870 vcpu->arch.exception.payload);
3871}
3872
d2060bd4
SC
3873static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
3874{
3875 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
3876 to_vmx(vcpu)->nested.preemption_timer_expired;
3877}
3878
a1c77abb 3879static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
55d2375e
SC
3880{
3881 struct vcpu_vmx *vmx = to_vmx(vcpu);
3882 unsigned long exit_qual;
3883 bool block_nested_events =
3884 vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
5ef8acbd 3885 bool mtf_pending = vmx->nested.mtf_pending;
4b9852f4
LA
3886 struct kvm_lapic *apic = vcpu->arch.apic;
3887
5ef8acbd
OU
3888 /*
3889 * Clear the MTF state. If a higher priority VM-exit is delivered first,
3890 * this state is discarded.
3891 */
5c8beb47
OU
3892 if (!block_nested_events)
3893 vmx->nested.mtf_pending = false;
5ef8acbd 3894
4b9852f4
LA
3895 if (lapic_in_kernel(vcpu) &&
3896 test_bit(KVM_APIC_INIT, &apic->pending_events)) {
3897 if (block_nested_events)
3898 return -EBUSY;
684c0422 3899 nested_vmx_update_pending_dbg(vcpu);
e64a8508 3900 clear_bit(KVM_APIC_INIT, &apic->pending_events);
bf0cd88c
YQ
3901 if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
3902 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
3903 return 0;
3904 }
3905
3906 if (lapic_in_kernel(vcpu) &&
3907 test_bit(KVM_APIC_SIPI, &apic->pending_events)) {
3908 if (block_nested_events)
3909 return -EBUSY;
3910
3911 clear_bit(KVM_APIC_SIPI, &apic->pending_events);
3912 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
3913 nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0,
3914 apic->sipi_vector & 0xFFUL);
4b9852f4
LA
3915 return 0;
3916 }
55d2375e 3917
5ef8acbd
OU
3918 /*
3919 * Process any exceptions that are not debug traps before MTF.
4020da3b
ML
3920 *
3921 * Note that only a pending nested run can block a pending exception.
3922 * Otherwise an injected NMI/interrupt should either be
3923 * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO,
3924 * while delivering the pending exception.
5ef8acbd 3925 */
4020da3b 3926
6ce347af 3927 if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) {
4020da3b 3928 if (vmx->nested.nested_run_pending)
5ef8acbd 3929 return -EBUSY;
6ce347af
SC
3930 if (!nested_vmx_check_exception(vcpu, &exit_qual))
3931 goto no_vmexit;
5ef8acbd
OU
3932 nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
3933 return 0;
3934 }
3935
3936 if (mtf_pending) {
3937 if (block_nested_events)
3938 return -EBUSY;
3939 nested_vmx_update_pending_dbg(vcpu);
3940 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
3941 return 0;
3942 }
3943
6ce347af 3944 if (vcpu->arch.exception.pending) {
4020da3b 3945 if (vmx->nested.nested_run_pending)
55d2375e 3946 return -EBUSY;
6ce347af
SC
3947 if (!nested_vmx_check_exception(vcpu, &exit_qual))
3948 goto no_vmexit;
55d2375e
SC
3949 nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
3950 return 0;
3951 }
3952
d2060bd4 3953 if (nested_vmx_preemption_timer_pending(vcpu)) {
55d2375e
SC
3954 if (block_nested_events)
3955 return -EBUSY;
3956 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
3957 return 0;
3958 }
3959
1cd2f0b0
SC
3960 if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
3961 if (block_nested_events)
3962 return -EBUSY;
3963 goto no_vmexit;
3964 }
3965
15ff0b45 3966 if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) {
55d2375e
SC
3967 if (block_nested_events)
3968 return -EBUSY;
15ff0b45
SC
3969 if (!nested_exit_on_nmi(vcpu))
3970 goto no_vmexit;
3971
55d2375e
SC
3972 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
3973 NMI_VECTOR | INTR_TYPE_NMI_INTR |
3974 INTR_INFO_VALID_MASK, 0);
3975 /*
3976 * The NMI-triggered VM exit counts as injection:
3977 * clear this one and block further NMIs.
3978 */
3979 vcpu->arch.nmi_pending = 0;
3980 vmx_set_nmi_mask(vcpu, true);
3981 return 0;
3982 }
3983
15ff0b45 3984 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
55d2375e
SC
3985 if (block_nested_events)
3986 return -EBUSY;
15ff0b45
SC
3987 if (!nested_exit_on_intr(vcpu))
3988 goto no_vmexit;
55d2375e
SC
3989 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
3990 return 0;
3991 }
3992
6ce347af 3993no_vmexit:
650293c3 3994 return vmx_complete_nested_posted_interrupt(vcpu);
55d2375e
SC
3995}
3996
3997static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
3998{
3999 ktime_t remaining =
4000 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
4001 u64 value;
4002
4003 if (ktime_to_ns(remaining) <= 0)
4004 return 0;
4005
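	/*
	 * Convert the remaining time to guest TSC ticks (ns * kHz / 10^6),
	 * then scale down by the emulated preemption-timer rate, i.e. the
	 * timer ticks once every 2^rate TSC cycles.
	 */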
4006 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
4007 do_div(value, 1000000);
4008 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
4009}
4010
7952d769 4011static bool is_vmcs12_ext_field(unsigned long field)
55d2375e 4012{
7952d769
SC
4013 switch (field) {
4014 case GUEST_ES_SELECTOR:
4015 case GUEST_CS_SELECTOR:
4016 case GUEST_SS_SELECTOR:
4017 case GUEST_DS_SELECTOR:
4018 case GUEST_FS_SELECTOR:
4019 case GUEST_GS_SELECTOR:
4020 case GUEST_LDTR_SELECTOR:
4021 case GUEST_TR_SELECTOR:
4022 case GUEST_ES_LIMIT:
4023 case GUEST_CS_LIMIT:
4024 case GUEST_SS_LIMIT:
4025 case GUEST_DS_LIMIT:
4026 case GUEST_FS_LIMIT:
4027 case GUEST_GS_LIMIT:
4028 case GUEST_LDTR_LIMIT:
4029 case GUEST_TR_LIMIT:
4030 case GUEST_GDTR_LIMIT:
4031 case GUEST_IDTR_LIMIT:
4032 case GUEST_ES_AR_BYTES:
4033 case GUEST_DS_AR_BYTES:
4034 case GUEST_FS_AR_BYTES:
4035 case GUEST_GS_AR_BYTES:
4036 case GUEST_LDTR_AR_BYTES:
4037 case GUEST_TR_AR_BYTES:
4038 case GUEST_ES_BASE:
4039 case GUEST_CS_BASE:
4040 case GUEST_SS_BASE:
4041 case GUEST_DS_BASE:
4042 case GUEST_FS_BASE:
4043 case GUEST_GS_BASE:
4044 case GUEST_LDTR_BASE:
4045 case GUEST_TR_BASE:
4046 case GUEST_GDTR_BASE:
4047 case GUEST_IDTR_BASE:
4048 case GUEST_PENDING_DBG_EXCEPTIONS:
4049 case GUEST_BNDCFGS:
4050 return true;
4051 default:
4052 break;
4053 }
55d2375e 4054
7952d769
SC
4055 return false;
4056}
4057
4058static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
4059 struct vmcs12 *vmcs12)
4060{
4061 struct vcpu_vmx *vmx = to_vmx(vcpu);
55d2375e
SC
4062
4063 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
4064 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
4065 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
4066 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
4067 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
4068 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
4069 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
4070 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
4071 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
4072 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
4073 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
4074 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
4075 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
4076 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
4077 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
4078 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
4079 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
4080 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
4081 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
55d2375e
SC
4082 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
4083 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
4084 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
4085 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
4086 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
4087 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
4088 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
4089 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
4090 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
4091 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
4092 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
4093 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
4094 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
4095 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
4096 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
7952d769
SC
4097 vmcs12->guest_pending_dbg_exceptions =
4098 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
4099 if (kvm_mpx_supported())
4100 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
4101
4102 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
4103}
4104
4105static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
4106 struct vmcs12 *vmcs12)
4107{
4108 struct vcpu_vmx *vmx = to_vmx(vcpu);
4109 int cpu;
4110
4111 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
4112 return;
4113
4114
4115 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
4116
4117 cpu = get_cpu();
4118 vmx->loaded_vmcs = &vmx->nested.vmcs02;
1af1bb05 4119 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01);
7952d769
SC
4120
4121 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4122
4123 vmx->loaded_vmcs = &vmx->vmcs01;
1af1bb05 4124 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02);
7952d769
SC
4125 put_cpu();
4126}
4127
4128/*
4129 * Update the guest state fields of vmcs12 to reflect changes that
4130 * occurred while L2 was running. (The "IA-32e mode guest" bit of the
4131 * VM-entry controls is also updated, since this is really a guest
4132 * state bit.)
4133 */
4134static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
4135{
4136 struct vcpu_vmx *vmx = to_vmx(vcpu);
4137
1e9dfbd7 4138 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
7952d769
SC
4139 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4140
1e9dfbd7
VK
4141 vmx->nested.need_sync_vmcs02_to_vmcs12_rare =
4142 !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr);
7952d769
SC
4143
4144 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
4145 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
4146
4147 vmcs12->guest_rsp = kvm_rsp_read(vcpu);
4148 vmcs12->guest_rip = kvm_rip_read(vcpu);
4149 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
4150
4151 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
4152 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
55d2375e
SC
4153
4154 vmcs12->guest_interruptibility_info =
4155 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
7952d769 4156
55d2375e
SC
4157 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
4158 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
bf0cd88c
YQ
4159 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
4160 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI;
55d2375e
SC
4161 else
4162 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
4163
b4b65b56 4164 if (nested_cpu_has_preemption_timer(vmcs12) &&
850448f3
PS
4165 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER &&
4166 !vmx->nested.nested_run_pending)
4167 vmcs12->vmx_preemption_timer_value =
4168 vmx_get_preemption_timer_value(vcpu);
55d2375e
SC
4169
4170 /*
4171 * In some cases (usually, nested EPT), L2 is allowed to change its
4172 * own CR3 without exiting. If it has changed it, we must keep it.
4173 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
4174 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
4175 *
4176 * Additionally, restore L2's PDPTR to vmcs12.
4177 */
4178 if (enable_ept) {
4179 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
c7554efc
SC
4180 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
4181 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
4182 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
4183 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
4184 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
4185 }
55d2375e
SC
4186 }
4187
4188 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
4189
4190 if (nested_cpu_has_vid(vmcs12))
4191 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
4192
4193 vmcs12->vm_entry_controls =
4194 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
4195 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
4196
699a1ac2 4197 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
55d2375e 4198 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
55d2375e 4199
55d2375e
SC
4200 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
4201 vmcs12->guest_ia32_efer = vcpu->arch.efer;
55d2375e
SC
4202}
4203
4204/*
4205 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
4206 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
4207 * and this function updates it to reflect the changes to the guest state while
4208 * L2 was running (and perhaps made some exits which were handled directly by L0
4209 * without going back to L1), and to reflect the exit reason.
4210	 * Note that we do not have to copy all VMCS fields here, just those that
4211 * could have changed by the L2 guest or the exit - i.e., the guest-state and
4212 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
4213 * which already writes to vmcs12 directly.
4214 */
4215static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
4dcefa31 4216 u32 vm_exit_reason, u32 exit_intr_info,
55d2375e
SC
4217 unsigned long exit_qualification)
4218{
55d2375e 4219 /* update exit information fields: */
4dcefa31 4220 vmcs12->vm_exit_reason = vm_exit_reason;
3c0c2ad1
SC
4221 if (to_vmx(vcpu)->exit_reason.enclave_mode)
4222 vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
55d2375e 4223 vmcs12->exit_qualification = exit_qualification;
55d2375e 4224
c3634d25
SC
4225 /*
4226 * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched
4227 * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other
4228 * exit info fields are unmodified.
4229 */
55d2375e
SC
4230 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
4231 vmcs12->launch_state = 1;
4232
4233 /* vm_entry_intr_info_field is cleared on exit. Emulate this
4234 * instead of reading the real value. */
4235 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
4236
4237 /*
4238	 * Transfer the event that L0 or L1 may have wanted to inject into
4239 * L2 to IDT_VECTORING_INFO_FIELD.
4240 */
9bd1f0ef
SC
4241 vmcs12_save_pending_event(vcpu, vmcs12,
4242 vm_exit_reason, exit_intr_info);
a0d4f803 4243
c3634d25
SC
4244 vmcs12->vm_exit_intr_info = exit_intr_info;
4245 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
4246 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4247
a0d4f803
KS
4248 /*
4249	 * According to the spec, there's no need to store the guest's
4250 * MSRs if the exit is due to a VM-entry failure that occurs
4251 * during or after loading the guest state. Since this exit
4252 * does not fall in that category, we need to save the MSRs.
4253 */
4254 if (nested_vmx_store_msr(vcpu,
4255 vmcs12->vm_exit_msr_store_addr,
4256 vmcs12->vm_exit_msr_store_count))
4257 nested_vmx_abort(vcpu,
4258 VMX_ABORT_SAVE_GUEST_MSR_FAIL);
55d2375e
SC
4259 }
4260
4261 /*
4262 * Drop what we picked up for L2 via vmx_complete_interrupts. It is
4263 * preserved above and would only end up incorrectly in L1.
4264 */
4265 vcpu->arch.nmi_injected = false;
4266 kvm_clear_exception_queue(vcpu);
4267 kvm_clear_interrupt_queue(vcpu);
4268}
4269
4270/*
4271 * A part of what we need to when the nested L2 guest exits and we want to
4272 * run its L1 parent, is to reset L1's guest state to the host state specified
4273 * in vmcs12.
4274 * This function is to be called not only on normal nested exit, but also on
4275 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
4276 * Failures During or After Loading Guest State").
4277 * This function should be called when the active VMCS is L1's (vmcs01).
4278 */
4279static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
4280 struct vmcs12 *vmcs12)
4281{
68cda40d 4282 enum vm_entry_failure_code ignored;
55d2375e 4283 struct kvm_segment seg;
55d2375e
SC
4284
4285 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
4286 vcpu->arch.efer = vmcs12->host_ia32_efer;
4287 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4288 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
4289 else
4290 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
4291 vmx_set_efer(vcpu, vcpu->arch.efer);
4292
e9c16c78
PB
4293 kvm_rsp_write(vcpu, vmcs12->host_rsp);
4294 kvm_rip_write(vcpu, vmcs12->host_rip);
55d2375e
SC
4295 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
4296 vmx_set_interrupt_shadow(vcpu, 0);
4297
4298 /*
4299 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
4300	 * actually changed, because vmx_set_cr0 refers to the EFER value set above.
4301 *
4302 * CR0_GUEST_HOST_MASK is already set in the original vmcs01
4303	 * (KVM doesn't change it).
4304 */
fa71e952 4305 vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
55d2375e
SC
4306 vmx_set_cr0(vcpu, vmcs12->host_cr0);
4307
4308 /* Same as above - no reason to call set_cr4_guest_host_mask(). */
4309 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
4310 vmx_set_cr4(vcpu, vmcs12->host_cr4);
4311
4312 nested_ept_uninit_mmu_context(vcpu);
4313
4314 /*
4315	 * Only the PDPTE load can fail, as the value of CR3 was checked on entry and
4316 * couldn't have changed.
4317 */
0f857223 4318 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored))
55d2375e
SC
4319 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
4320
50b265a4 4321 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
55d2375e
SC
4322
4323 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
4324 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
4325 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
4326 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
4327 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
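	/* Per the SDM, VM-exit sets the host GDTR and IDTR limits to 0xFFFF. */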
4328 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
4329 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
4330
4331 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
4332 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
4333 vmcs_write64(GUEST_BNDCFGS, 0);
4334
4335 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
4336 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
4337 vcpu->arch.pat = vmcs12->host_ia32_pat;
4338 }
4339 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
d1968421
OU
4340 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
4341 vmcs12->host_ia32_perf_global_ctrl));
55d2375e
SC
4342
4343 /* Set L1 segment info according to Intel SDM
4344 27.5.2 Loading Host Segment and Descriptor-Table Registers */
4345 seg = (struct kvm_segment) {
4346 .base = 0,
4347 .limit = 0xFFFFFFFF,
4348 .selector = vmcs12->host_cs_selector,
4349 .type = 11,
4350 .present = 1,
4351 .s = 1,
4352 .g = 1
4353 };
4354 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4355 seg.l = 1;
4356 else
4357 seg.db = 1;
816be9e9 4358 __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
55d2375e
SC
4359 seg = (struct kvm_segment) {
4360 .base = 0,
4361 .limit = 0xFFFFFFFF,
4362 .type = 3,
4363 .present = 1,
4364 .s = 1,
4365 .db = 1,
4366 .g = 1
4367 };
4368 seg.selector = vmcs12->host_ds_selector;
816be9e9 4369 __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
55d2375e 4370 seg.selector = vmcs12->host_es_selector;
816be9e9 4371 __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
55d2375e 4372 seg.selector = vmcs12->host_ss_selector;
816be9e9 4373 __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
55d2375e
SC
4374 seg.selector = vmcs12->host_fs_selector;
4375 seg.base = vmcs12->host_fs_base;
816be9e9 4376 __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
55d2375e
SC
4377 seg.selector = vmcs12->host_gs_selector;
4378 seg.base = vmcs12->host_gs_base;
816be9e9 4379 __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
55d2375e
SC
4380 seg = (struct kvm_segment) {
4381 .base = vmcs12->host_tr_base,
4382 .limit = 0x67,
4383 .selector = vmcs12->host_tr_selector,
4384 .type = 11,
4385 .present = 1
4386 };
816be9e9 4387 __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
55d2375e 4388
afc8de01
SC
4389 memset(&seg, 0, sizeof(seg));
4390 seg.unusable = 1;
816be9e9 4391 __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
55d2375e
SC
4392
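	/* VM-exit loads DR7 with 0x400 and clears the IA32_DEBUGCTL MSR. */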
4393 kvm_set_dr(vcpu, 7, 0x400);
4394 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4395
55d2375e
SC
4396 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
4397 vmcs12->vm_exit_msr_load_count))
4398 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
dbab610a
ML
4399
4400 to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
55d2375e
SC
4401}
4402
4403static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
4404{
eb3db1b1 4405 struct vmx_uret_msr *efer_msr;
55d2375e
SC
4406 unsigned int i;
4407
4408 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
4409 return vmcs_read64(GUEST_IA32_EFER);
4410
4411 if (cpu_has_load_ia32_efer())
4412 return host_efer;
4413
4414 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
4415 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
4416 return vmx->msr_autoload.guest.val[i].value;
4417 }
4418
d85a8034 4419 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER);
55d2375e
SC
4420 if (efer_msr)
4421 return efer_msr->data;
4422
4423 return host_efer;
4424}
4425
4426static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
4427{
4428 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4429 struct vcpu_vmx *vmx = to_vmx(vcpu);
4430 struct vmx_msr_entry g, h;
55d2375e
SC
4431 gpa_t gpa;
4432 u32 i, j;
4433
4434 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
4435
4436 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
4437 /*
4438 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
4439 * as vmcs01.GUEST_DR7 contains a userspace defined value
4440 * and vcpu->arch.dr7 is not squirreled away before the
4441 * nested VMENTER (not worth adding a variable in nested_vmx).
4442 */
4443 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
4444 kvm_set_dr(vcpu, 7, DR7_FIXED_1);
4445 else
4446 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
4447 }
4448
4449 /*
4450 * Note that calling vmx_set_{efer,cr0,cr4} is important as they
4451 * handle a variety of side effects to KVM's software model.
4452 */
4453 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
4454
fa71e952 4455 vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
55d2375e
SC
4456 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
4457
4458 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
4459 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
4460
4461 nested_ept_uninit_mmu_context(vcpu);
f087a029 4462 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
cb3c1e2f 4463 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
55d2375e
SC
4464
4465 /*
4466 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
4467 * from vmcs01 (if necessary). The PDPTRs are not loaded on
4468 * VMFail, like everything else we just need to ensure our
4469 * software model is up-to-date.
4470 */
9932b49e 4471 if (enable_ept && is_pae_paging(vcpu))
f087a029 4472 ept_save_pdptrs(vcpu);
55d2375e
SC
4473
4474 kvm_mmu_reset_context(vcpu);
4475
55d2375e
SC
4476 /*
4477 * This nasty bit of open coding is a compromise between blindly
4478 * loading L1's MSRs using the exit load lists (incorrect emulation
4479 * of VMFail), leaving the nested VM's MSRs in the software model
4480 * (incorrect behavior) and snapshotting the modified MSRs (too
4481	 * expensive since the lists are unbounded by hardware). For each
4482 * MSR that was (prematurely) loaded from the nested VMEntry load
4483 * list, reload it from the exit load list if it exists and differs
4484 * from the guest value. The intent is to stuff host state as
4485 * silently as possible, not to fully process the exit load list.
4486 */
55d2375e
SC
4487 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
4488 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
4489 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
4490 pr_debug_ratelimited(
4491 "%s read MSR index failed (%u, 0x%08llx)\n",
4492 __func__, i, gpa);
4493 goto vmabort;
4494 }
4495
4496 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
4497 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
4498 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
4499 pr_debug_ratelimited(
4500 "%s read MSR failed (%u, 0x%08llx)\n",
4501 __func__, j, gpa);
4502 goto vmabort;
4503 }
4504 if (h.index != g.index)
4505 continue;
4506 if (h.value == g.value)
4507 break;
4508
4509 if (nested_vmx_load_msr_check(vcpu, &h)) {
4510 pr_debug_ratelimited(
4511 "%s check failed (%u, 0x%x, 0x%x)\n",
4512 __func__, j, h.index, h.reserved);
4513 goto vmabort;
4514 }
4515
f20935d8 4516 if (kvm_set_msr(vcpu, h.index, h.value)) {
55d2375e
SC
4517 pr_debug_ratelimited(
4518 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
4519 __func__, j, h.index, h.value);
4520 goto vmabort;
4521 }
4522 }
4523 }
4524
4525 return;
4526
4527vmabort:
4528 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4529}
4530
4531/*
4532 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
4533 * and modify vmcs12 to make it see what it would expect to see there if
4534 * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
4535 */
4dcefa31 4536void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
55d2375e
SC
4537 u32 exit_intr_info, unsigned long exit_qualification)
4538{
4539 struct vcpu_vmx *vmx = to_vmx(vcpu);
4540 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4541
4542 /* trying to cancel vmlaunch/vmresume is a bug */
4543 WARN_ON_ONCE(vmx->nested.nested_run_pending);
4544
f5c7e842
VK
4545 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
4546 /*
4547 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
4548 * Enlightened VMCS after migration and we still need to
4549 * do that when something is forcing L2->L1 exit prior to
4550 * the first L2 run.
4551 */
4552 (void)nested_get_evmcs_page(vcpu);
4553 }
f2c7ef3b 4554
40e5f908
SC
4555 /* Service pending TLB flush requests for L2 before switching to L1. */
4556 kvm_service_local_tlb_flush_requests(vcpu);
eeeb4f67 4557
43fea4e4
PS
4558 /*
4559 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
4560 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are
4561 * up-to-date before switching to L1.
4562 */
4563 if (enable_ept && is_pae_paging(vcpu))
4564 vmx_ept_load_pdptrs(vcpu);
4565
55d2375e
SC
4566 leave_guest_mode(vcpu);
4567
b4b65b56
PB
4568 if (nested_cpu_has_preemption_timer(vmcs12))
4569 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
4570
d041b5ea
IS
4571 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) {
4572 vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset;
4573 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
4574 vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
4575 }
55d2375e
SC
4576
4577 if (likely(!vmx->fail)) {
3731905e 4578 sync_vmcs02_to_vmcs12(vcpu, vmcs12);
f4f8316d 4579
4dcefa31
SC
4580 if (vm_exit_reason != -1)
4581 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason,
4582 exit_intr_info, exit_qualification);
55d2375e
SC
4583
4584 /*
3731905e 4585 * Must happen outside of sync_vmcs02_to_vmcs12() as it will
55d2375e
SC
4586 * also be used to capture vmcs12 cache as part of
4587 * capturing nVMX state for snapshot (migration).
4588 *
4589 * Otherwise, this flush will dirty guest memory at a
4590 * point it is already assumed by user-space to be
4591 * immutable.
4592 */
4593 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
55d2375e
SC
4594 } else {
4595 /*
4596 * The only expected VM-instruction error is "VM entry with
4597 * invalid control field(s)." Anything else indicates a
4598 * problem with L0. And we should never get here with a
4599 * VMFail of any type if early consistency checks are enabled.
4600 */
4601 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
4602 VMXERR_ENTRY_INVALID_CONTROL_FIELD);
4603 WARN_ON_ONCE(nested_early_check);
4604 }
4605
4606 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
4607
4608 /* Update any VMCS fields that might have changed while L2 ran */
4609 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
4610 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
4611 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
1ab9287a
IS
4612 if (kvm_has_tsc_control)
4613 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
4614
02d496cf
LA
4615 if (vmx->nested.l1_tpr_threshold != -1)
4616 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
55d2375e 4617
55d2375e
SC
4618 if (vmx->nested.change_vmcs01_virtual_apic_mode) {
4619 vmx->nested.change_vmcs01_virtual_apic_mode = false;
4620 vmx_set_virtual_apic_mode(vcpu);
55d2375e
SC
4621 }
4622
a85863c2
MS
4623 if (vmx->nested.update_vmcs01_cpu_dirty_logging) {
4624 vmx->nested.update_vmcs01_cpu_dirty_logging = false;
4625 vmx_update_cpu_dirty_logging(vcpu);
4626 }
4627
55d2375e
SC
4628 /* Unpin physical memory we referred to in vmcs02 */
4629 if (vmx->nested.apic_access_page) {
b11494bc 4630 kvm_release_page_clean(vmx->nested.apic_access_page);
55d2375e
SC
4631 vmx->nested.apic_access_page = NULL;
4632 }
96c66e87 4633 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
3278e049
KA
4634 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
4635 vmx->nested.pi_desc = NULL;
55d2375e 4636
1196cb97
SC
4637 if (vmx->nested.reload_vmcs01_apic_access_page) {
4638 vmx->nested.reload_vmcs01_apic_access_page = false;
4639 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
4640 }
55d2375e 4641
7c69661e
SC
4642 if (vmx->nested.update_vmcs01_apicv_status) {
4643 vmx->nested.update_vmcs01_apicv_status = false;
4644 kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
4645 }
4646
4dcefa31 4647 if ((vm_exit_reason != -1) &&
1e9dfbd7 4648 (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)))
3731905e 4649 vmx->nested.need_vmcs12_to_shadow_sync = true;
55d2375e
SC
4650
4651 /* in case we halted in L2 */
4652 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4653
4654 if (likely(!vmx->fail)) {
4dcefa31 4655 if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
a1c77abb 4656 nested_exit_intr_ack_set(vcpu)) {
55d2375e
SC
4657 int irq = kvm_cpu_get_interrupt(vcpu);
4658 WARN_ON(irq < 0);
4659 vmcs12->vm_exit_intr_info = irq |
4660 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
4661 }
4662
4dcefa31 4663 if (vm_exit_reason != -1)
55d2375e
SC
4664 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
4665 vmcs12->exit_qualification,
4666 vmcs12->idt_vectoring_info_field,
4667 vmcs12->vm_exit_intr_info,
4668 vmcs12->vm_exit_intr_error_code,
4669 KVM_ISA_VMX);
4670
4671 load_vmcs12_host_state(vcpu, vmcs12);
4672
4673 return;
4674 }
4675
4676 /*
4677 * After an early L2 VM-entry failure, we're now back
4678 * in L1 which thinks it just finished a VMLAUNCH or
4679 * VMRESUME instruction, so we need to set the failure
4680 * flag and the VM-instruction error field of the VMCS
4681 * accordingly, and skip the emulated instruction.
4682 */
b2656e4d 4683 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
55d2375e
SC
4684
4685 /*
4686 * Restore L1's host state to KVM's software model. We're here
4687 * because a consistency check was caught by hardware, which
4688 * means some amount of guest state has been propagated to KVM's
4689 * model and needs to be unwound to the host's state.
4690 */
4691 nested_vmx_restore_host_state(vcpu);
4692
4693 vmx->fail = 0;
4694}
4695
cb6a32c2
SC
4696static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
4697{
4698 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
4699}
4700
55d2375e
SC
4701/*
4702 * Decode the memory-address operand of a vmx instruction, as recorded on an
4703 * exit caused by such an instruction (run by a guest hypervisor).
4704 * On success, returns 0. When the operand is invalid, returns 1 and throws
49f933d4 4705 * #UD, #GP, or #SS.
55d2375e
SC
4706 */
4707int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
fdb28619 4708 u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
55d2375e
SC
4709{
4710 gva_t off;
4711 bool exn;
4712 struct kvm_segment s;
4713
4714 /*
4715 * According to Vol. 3B, "Information for VM Exits Due to Instruction
4716 * Execution", on an exit, vmx_instruction_info holds most of the
4717 * addressing components of the operand. Only the displacement part
4718 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
4719 * For how an actual address is calculated from all these components,
4720 * refer to Vol. 1, "Operand Addressing".
4721 */
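	/*
	 * Bit layout of the VMX-instruction information field, as consumed
	 * below: [1:0] scaling, [9:7] address size, [10] register operand,
	 * [17:15] segment register, [21:18] index register, [22] index
	 * invalid, [26:23] base register, [27] base invalid, [31:28] reg2.
	 */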
4722 int scaling = vmx_instruction_info & 3;
4723 int addr_size = (vmx_instruction_info >> 7) & 7;
4724 bool is_reg = vmx_instruction_info & (1u << 10);
4725 int seg_reg = (vmx_instruction_info >> 15) & 7;
4726 int index_reg = (vmx_instruction_info >> 18) & 0xf;
4727 bool index_is_valid = !(vmx_instruction_info & (1u << 22));
4728 int base_reg = (vmx_instruction_info >> 23) & 0xf;
4729 bool base_is_valid = !(vmx_instruction_info & (1u << 27));
4730
4731 if (is_reg) {
4732 kvm_queue_exception(vcpu, UD_VECTOR);
4733 return 1;
4734 }
4735
4736 /* Addr = segment_base + offset */
4737 /* offset = base + [index * scale] + displacement */
4738 off = exit_qualification; /* holds the displacement */
946c522b
SC
4739 if (addr_size == 1)
4740 off = (gva_t)sign_extend64(off, 31);
4741 else if (addr_size == 0)
4742 off = (gva_t)sign_extend64(off, 15);
55d2375e
SC
4743 if (base_is_valid)
4744 off += kvm_register_read(vcpu, base_reg);
4745 if (index_is_valid)
e6302698 4746 off += kvm_register_read(vcpu, index_reg) << scaling;
55d2375e 4747 vmx_get_segment(vcpu, &s, seg_reg);
55d2375e 4748
8570f9e8
SC
4749 /*
4750 * The effective address, i.e. @off, of a memory operand is truncated
4751 * based on the address size of the instruction. Note that this is
4752 * the *effective address*, i.e. the address prior to accounting for
4753 * the segment's base.
4754 */
55d2375e 4755 if (addr_size == 1) /* 32 bit */
8570f9e8
SC
4756 off &= 0xffffffff;
4757 else if (addr_size == 0) /* 16 bit */
4758 off &= 0xffff;
55d2375e
SC
4759
4760 /* Checks for #GP/#SS exceptions. */
4761 exn = false;
4762 if (is_long_mode(vcpu)) {
8570f9e8
SC
4763 /*
4764 * The virtual/linear address is never truncated in 64-bit
4765 * mode, e.g. a 32-bit address size can yield a 64-bit virtual
4766 * address when using FS/GS with a non-zero base.
4767 */
6694e480
LA
4768 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS)
4769 *ret = s.base + off;
4770 else
4771 *ret = off;
8570f9e8 4772
55d2375e
SC
4773 /* Long mode: #GP(0)/#SS(0) if the memory address is in a
4774 * non-canonical form. This is the only check on the memory
4775 * destination for long mode!
4776 */
4777 exn = is_noncanonical_address(*ret, vcpu);
e0dfacbf 4778 } else {
8570f9e8
SC
4779 /*
4780 * When not in long mode, the virtual/linear address is
4781 * unconditionally truncated to 32 bits regardless of the
4782 * address size.
4783 */
4784 *ret = (s.base + off) & 0xffffffff;
4785
55d2375e
SC
4786 /* Protected mode: apply checks for segment validity in the
4787 * following order:
4788 * - segment type check (#GP(0) may be thrown)
4789 * - usability check (#GP(0)/#SS(0))
4790 * - limit check (#GP(0)/#SS(0))
4791 */
4792 if (wr)
4793 /* #GP(0) if the destination operand is located in a
4794 * read-only data segment or any code segment.
4795 */
4796 exn = ((s.type & 0xa) == 0 || (s.type & 8));
4797 else
4798 /* #GP(0) if the source operand is located in an
4799 * execute-only code segment
4800 */
4801 exn = ((s.type & 0xa) == 8);
4802 if (exn) {
4803 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
4804 return 1;
4805 }
4806 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
4807 */
4808 exn = (s.unusable != 0);
34333cc6
SC
4809
4810 /*
4811 * Protected mode: #GP(0)/#SS(0) if the memory operand is
4812 * outside the segment limit. All CPUs that support VMX ignore
4813 * limit checks for flat segments, i.e. segments with base==0,
4814 * limit==0xffffffff and of type expand-up data or code.
55d2375e 4815 */
34333cc6
SC
4816 if (!(s.base == 0 && s.limit == 0xffffffff &&
4817 ((s.type & 8) || !(s.type & 4))))
fdb28619 4818 exn = exn || ((u64)off + len - 1 > s.limit);
55d2375e
SC
4819 }
4820 if (exn) {
4821 kvm_queue_exception_e(vcpu,
4822 seg_reg == VCPU_SREG_SS ?
4823 SS_VECTOR : GP_VECTOR,
4824 0);
4825 return 1;
4826 }
4827
4828 return 0;
4829}
4830
0bcd556e
SC
4831void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu,
4832 bool vcpu_has_perf_global_ctrl)
03a8871a
OU
4833{
4834 struct vcpu_vmx *vmx;
4835
4836 if (!nested_vmx_allowed(vcpu))
4837 return;
4838
4839 vmx = to_vmx(vcpu);
0bcd556e 4840 if (vcpu_has_perf_global_ctrl) {
03a8871a
OU
4841 vmx->nested.msrs.entry_ctls_high |=
4842 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4843 vmx->nested.msrs.exit_ctls_high |=
4844 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
4845 } else {
4846 vmx->nested.msrs.entry_ctls_high &=
4847 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4848 vmx->nested.msrs.exit_ctls_high &=
c6b177a3 4849 ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
03a8871a
OU
4850 }
4851}
4852
7a35e515
VK
4853static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer,
4854 int *ret)
55d2375e
SC
4855{
4856 gva_t gva;
4857 struct x86_exception e;
7a35e515 4858 int r;
55d2375e 4859
5addc235 4860 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
fdb28619 4861 vmcs_read32(VMX_INSTRUCTION_INFO), false,
7a35e515
VK
4862 sizeof(*vmpointer), &gva)) {
4863 *ret = 1;
4864 return -EINVAL;
4865 }
55d2375e 4866
7a35e515
VK
4867 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e);
4868 if (r != X86EMUL_CONTINUE) {
3f3393b3 4869 *ret = kvm_handle_memory_failure(vcpu, r, &e);
7a35e515 4870 return -EINVAL;
55d2375e
SC
4871 }
4872
4873 return 0;
4874}
4875
4876/*
4877 * Allocate a shadow VMCS and associate it with the currently loaded
4878 * VMCS, unless such a shadow VMCS already exists. The newly allocated
4879 * VMCS is also VMCLEARed, so that it is ready for use.
4880 */
4881static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
4882{
4883 struct vcpu_vmx *vmx = to_vmx(vcpu);
4884 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
4885
4886 /*
d6e656cd
SC
4887 * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it
4888 * when L1 executes VMXOFF or the vCPU is forced out of nested
4889 * operation. VMXON faults if the CPU is already post-VMXON, so it
4890 * should be impossible to already have an allocated shadow VMCS. KVM
4891 * doesn't support virtualization of VMCS shadowing, so vmcs01 should
4892 * always be the loaded VMCS.
55d2375e 4893 */
d6e656cd
SC
4894 if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs))
4895 return loaded_vmcs->shadow_vmcs;
4896
4897 loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
4898 if (loaded_vmcs->shadow_vmcs)
4899 vmcs_clear(loaded_vmcs->shadow_vmcs);
55d2375e 4900
55d2375e
SC
4901 return loaded_vmcs->shadow_vmcs;
4902}
4903
4904static int enter_vmx_operation(struct kvm_vcpu *vcpu)
4905{
4906 struct vcpu_vmx *vmx = to_vmx(vcpu);
4907 int r;
4908
4909 r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
4910 if (r < 0)
4911 goto out_vmcs02;
4912
41836839 4913 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
55d2375e
SC
4914 if (!vmx->nested.cached_vmcs12)
4915 goto out_cached_vmcs12;
4916
8503fea6 4917 vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA;
41836839 4918 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
55d2375e
SC
4919 if (!vmx->nested.cached_shadow_vmcs12)
4920 goto out_cached_shadow_vmcs12;
4921
4922 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
4923 goto out_shadow_vmcs;
4924
4925 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
ada0098d 4926 HRTIMER_MODE_ABS_PINNED);
55d2375e
SC
4927 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
4928
4929 vmx->nested.vpid02 = allocate_vpid();
4930
4931 vmx->nested.vmcs02_initialized = false;
4932 vmx->nested.vmxon = true;
ee85dec2 4933
2ef7619d 4934 if (vmx_pt_mode_is_host_guest()) {
ee85dec2 4935 vmx->pt_desc.guest.ctl = 0;
476c9bd8 4936 pt_update_intercept_for_msr(vcpu);
ee85dec2
LK
4937 }
4938
55d2375e
SC
4939 return 0;
4940
4941out_shadow_vmcs:
4942 kfree(vmx->nested.cached_shadow_vmcs12);
4943
4944out_cached_shadow_vmcs12:
4945 kfree(vmx->nested.cached_vmcs12);
4946
4947out_cached_vmcs12:
4948 free_loaded_vmcs(&vmx->nested.vmcs02);
4949
4950out_vmcs02:
4951 return -ENOMEM;
4952}
4953
ed7023a1 4954/* Emulate the VMXON instruction. */
55d2375e
SC
4955static int handle_vmon(struct kvm_vcpu *vcpu)
4956{
4957 int ret;
4958 gpa_t vmptr;
2e408936 4959 uint32_t revision;
55d2375e 4960 struct vcpu_vmx *vmx = to_vmx(vcpu);
32ad73db
SC
4961 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED
4962 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
55d2375e
SC
4963
4964 /*
4965 * The Intel VMX Instruction Reference lists a bunch of bits that are
4966 * prerequisite to running VMXON, most notably cr4.VMXE must be set to
c2fe3cd4 4967 * 1 (see vmx_is_valid_cr4() for when we allow the guest to set this).
55d2375e
SC
4968 * Otherwise, we should fail with #UD. But most faulting conditions
4969 * have already been checked by hardware, prior to the VM-exit for
4970 * VMXON. We do test guest cr4.VMXE because processor CR4 always has
4971 * that bit set to 1 in non-root mode.
4972 */
4973 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
4974 kvm_queue_exception(vcpu, UD_VECTOR);
4975 return 1;
4976 }
4977
4978 /* CPL=0 must be checked manually. */
4979 if (vmx_get_cpl(vcpu)) {
4980 kvm_inject_gp(vcpu, 0);
4981 return 1;
4982 }
4983
4984 if (vmx->nested.vmxon)
b2656e4d 4985 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
55d2375e
SC
4986
4987 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
4988 != VMXON_NEEDED_FEATURES) {
4989 kvm_inject_gp(vcpu, 0);
4990 return 1;
4991 }
4992
7a35e515
VK
4993 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret))
4994 return ret;
55d2375e
SC
4995
4996 /*
4997 * SDM 3: 24.11.5
4998 * The first 4 bytes of VMXON region contain the supported
4999 * VMCS revision identifier
5000 *
 5001	 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case,
5002 * which replaces physical address width with 32
5003 */
e0bf2665 5004 if (!page_address_valid(vcpu, vmptr))
55d2375e
SC
5005 return nested_vmx_failInvalid(vcpu);
5006
2e408936
KA
5007 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
5008 revision != VMCS12_REVISION)
55d2375e 5009 return nested_vmx_failInvalid(vcpu);
55d2375e
SC
5010
5011 vmx->nested.vmxon_ptr = vmptr;
5012 ret = enter_vmx_operation(vcpu);
5013 if (ret)
5014 return ret;
5015
5016 return nested_vmx_succeed(vcpu);
5017}
5018
5019static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
5020{
5021 struct vcpu_vmx *vmx = to_vmx(vcpu);
5022
64c78508 5023 if (vmx->nested.current_vmptr == INVALID_GPA)
55d2375e
SC
5024 return;
5025
7952d769
SC
5026 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
5027
55d2375e
SC
5028 if (enable_shadow_vmcs) {
5029 /* copy to memory all shadowed fields in case
5030 they were modified */
5031 copy_shadow_to_vmcs12(vmx);
55d2375e
SC
5032 vmx_disable_shadow_vmcs(vmx);
5033 }
5034 vmx->nested.posted_intr_nv = -1;
5035
5036 /* Flush VMCS12 to guest memory */
5037 kvm_vcpu_write_guest_page(vcpu,
5038 vmx->nested.current_vmptr >> PAGE_SHIFT,
5039 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
5040
0c1c92f1 5041 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
55d2375e 5042
64c78508 5043 vmx->nested.current_vmptr = INVALID_GPA;
55d2375e
SC
5044}
5045
5046/* Emulate the VMXOFF instruction */
5047static int handle_vmoff(struct kvm_vcpu *vcpu)
5048{
5049 if (!nested_vmx_check_permission(vcpu))
5050 return 1;
4b9852f4 5051
55d2375e 5052 free_nested(vcpu);
4b9852f4
LA
5053
5054 /* Process a latched INIT during time CPU was in VMX operation */
5055 kvm_make_request(KVM_REQ_EVENT, vcpu);
5056
55d2375e
SC
5057 return nested_vmx_succeed(vcpu);
5058}
5059
5060/* Emulate the VMCLEAR instruction */
5061static int handle_vmclear(struct kvm_vcpu *vcpu)
5062{
5063 struct vcpu_vmx *vmx = to_vmx(vcpu);
5064 u32 zero = 0;
5065 gpa_t vmptr;
11e34914 5066 u64 evmcs_gpa;
7a35e515 5067 int r;
55d2375e
SC
5068
5069 if (!nested_vmx_check_permission(vcpu))
5070 return 1;
5071
7a35e515
VK
5072 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
5073 return r;
55d2375e 5074
e0bf2665 5075 if (!page_address_valid(vcpu, vmptr))
b2656e4d 5076 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
55d2375e
SC
5077
5078 if (vmptr == vmx->nested.vmxon_ptr)
b2656e4d 5079 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
55d2375e 5080
11e34914
VK
5081 /*
5082 * When Enlightened VMEntry is enabled on the calling CPU we treat
 5083	 * memory area pointed to by vmptr as an Enlightened VMCS (as there's no good
5084 * way to distinguish it from VMCS12) and we must not corrupt it by
5085 * writing to the non-existent 'launch_state' field. The area doesn't
5086 * have to be the currently active EVMCS on the calling CPU and there's
5087 * nothing KVM has to do to transition it from 'active' to 'non-active'
5088 * state. It is possible that the area will stay mapped as
5089 * vmx->nested.hv_evmcs but this shouldn't be a problem.
5090 */
5091 if (likely(!vmx->nested.enlightened_vmcs_enabled ||
5092 !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
55d2375e
SC
5093 if (vmptr == vmx->nested.current_vmptr)
5094 nested_release_vmcs12(vcpu);
5095
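		/*
		 * Emulated VMCLEAR only needs to reset launch_state in guest
		 * memory; nested_release_vmcs12() above already flushed the
		 * cached vmcs12 back to guest memory if it was current.
		 */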
5096 kvm_vcpu_write_guest(vcpu,
5097 vmptr + offsetof(struct vmcs12,
5098 launch_state),
5099 &zero, sizeof(zero));
3b19b81a
VK
5100 } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) {
5101 nested_release_evmcs(vcpu);
55d2375e
SC
5102 }
5103
5104 return nested_vmx_succeed(vcpu);
5105}
5106
55d2375e
SC
5107/* Emulate the VMLAUNCH instruction */
5108static int handle_vmlaunch(struct kvm_vcpu *vcpu)
5109{
5110 return nested_vmx_run(vcpu, true);
5111}
5112
5113/* Emulate the VMRESUME instruction */
5114static int handle_vmresume(struct kvm_vcpu *vcpu)
5115{
5116
5117 return nested_vmx_run(vcpu, false);
5118}
5119
5120static int handle_vmread(struct kvm_vcpu *vcpu)
5121{
dd2d6042
JM
5122 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
5123 : get_vmcs12(vcpu);
5addc235 5124 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
c90f4d03
JM
5125 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5126 struct vcpu_vmx *vmx = to_vmx(vcpu);
f7eea636 5127 struct x86_exception e;
c90f4d03
JM
5128 unsigned long field;
5129 u64 value;
5130 gva_t gva = 0;
1c6f0b47 5131 short offset;
7a35e515 5132 int len, r;
55d2375e
SC
5133
5134 if (!nested_vmx_check_permission(vcpu))
5135 return 1;
5136
55d2375e 5137 /* Decode instruction info and find the field to read */
27b4a9c4 5138 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
1c6f0b47 5139
6cbbaab6
VK
5140 if (!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
5141 /*
5142 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
5143 * any VMREAD sets the ALU flags for VMfailInvalid.
5144 */
5145 if (vmx->nested.current_vmptr == INVALID_GPA ||
5146 (is_guest_mode(vcpu) &&
5147 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
5148 return nested_vmx_failInvalid(vcpu);
5149
5150 offset = get_vmcs12_field_offset(field);
5151 if (offset < 0)
5152 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
55d2375e 5153
6cbbaab6
VK
5154 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
5155 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
7952d769 5156
6cbbaab6
VK
5157 /* Read the field, zero-extended to a u64 value */
5158 value = vmcs12_read_any(vmcs12, field, offset);
5159 } else {
5160 /*
5161 * Hyper-V TLFS (as of 6.0b) explicitly states, that while an
5162 * enlightened VMCS is active VMREAD/VMWRITE instructions are
5163 * unsupported. Unfortunately, certain versions of Windows 11
 5164		 * don't comply with this requirement, which is not enforced in
5165 * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a
5166 * workaround, as misbehaving guests will panic on VM-Fail.
5167 * Note, enlightened VMCS is incompatible with shadow VMCS so
5168 * all VMREADs from L2 should go to L1.
5169 */
5170 if (WARN_ON_ONCE(is_guest_mode(vcpu)))
5171 return nested_vmx_failInvalid(vcpu);
5172
5173 offset = evmcs_field_offset(field, NULL);
5174 if (offset < 0)
5175 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5176
5177 /* Read the field, zero-extended to a u64 value */
5178 value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset);
5179 }
1c6f0b47 5180
55d2375e
SC
5181 /*
5182 * Now copy part of this value to register or memory, as requested.
5183 * Note that the number of bits actually copied is 32 or 64 depending
5184 * on the guest's mode (32 or 64 bit), not on the given field's length.
5185 */
c90f4d03 5186 if (instr_info & BIT(10)) {
27b4a9c4 5187 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value);
55d2375e 5188 } else {
fdb28619 5189 len = is_64_bit_mode(vcpu) ? 8 : 4;
55d2375e 5190 if (get_vmx_mem_address(vcpu, exit_qualification,
c90f4d03 5191 instr_info, true, len, &gva))
55d2375e
SC
5192 return 1;
5193 /* _system ok, nested_vmx_check_permission has verified cpl=0 */
7a35e515
VK
5194 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e);
5195 if (r != X86EMUL_CONTINUE)
3f3393b3 5196 return kvm_handle_memory_failure(vcpu, r, &e);
55d2375e
SC
5197 }
5198
5199 return nested_vmx_succeed(vcpu);
5200}
5201
e2174295
SC
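/*
 * The two helpers below expand vmcs_shadow_fields.h into case labels to test
 * whether a VMCS field encoding is shadowed read/write (or read-only).
 */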
5202static bool is_shadow_field_rw(unsigned long field)
5203{
5204 switch (field) {
5205#define SHADOW_FIELD_RW(x, y) case x:
5206#include "vmcs_shadow_fields.h"
5207 return true;
5208 default:
5209 break;
5210 }
5211 return false;
5212}
5213
5214static bool is_shadow_field_ro(unsigned long field)
5215{
5216 switch (field) {
5217#define SHADOW_FIELD_RO(x, y) case x:
5218#include "vmcs_shadow_fields.h"
5219 return true;
5220 default:
5221 break;
5222 }
5223 return false;
5224}
55d2375e
SC
5225
5226static int handle_vmwrite(struct kvm_vcpu *vcpu)
5227{
c90f4d03
JM
5228 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
5229 : get_vmcs12(vcpu);
5addc235 5230 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
c90f4d03
JM
5231 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5232 struct vcpu_vmx *vmx = to_vmx(vcpu);
5233 struct x86_exception e;
55d2375e 5234 unsigned long field;
c90f4d03 5235 short offset;
55d2375e 5236 gva_t gva;
7a35e515 5237 int len, r;
55d2375e 5238
c90f4d03
JM
5239 /*
5240 * The value to write might be 32 or 64 bits, depending on L1's long
55d2375e
SC
5241 * mode, and eventually we need to write that into a field of several
5242 * possible lengths. The code below first zero-extends the value to 64
c90f4d03 5243	 * bits (value), and then copies only the appropriate number of
55d2375e
SC
5244 * bits into the vmcs12 field.
5245 */
c90f4d03 5246 u64 value = 0;
55d2375e
SC
5247
5248 if (!nested_vmx_check_permission(vcpu))
5249 return 1;
5250
dd2d6042 5251 /*
64c78508 5252 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
dd2d6042
JM
5253 * any VMWRITE sets the ALU flags for VMfailInvalid.
5254 */
64c78508 5255 if (vmx->nested.current_vmptr == INVALID_GPA ||
dd2d6042 5256 (is_guest_mode(vcpu) &&
64c78508 5257 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
55d2375e
SC
5258 return nested_vmx_failInvalid(vcpu);
5259
c90f4d03 5260 if (instr_info & BIT(10))
27b4a9c4 5261 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf));
55d2375e 5262 else {
fdb28619 5263 len = is_64_bit_mode(vcpu) ? 8 : 4;
55d2375e 5264 if (get_vmx_mem_address(vcpu, exit_qualification,
c90f4d03 5265 instr_info, false, len, &gva))
55d2375e 5266 return 1;
7a35e515
VK
5267 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e);
5268 if (r != X86EMUL_CONTINUE)
3f3393b3 5269 return kvm_handle_memory_failure(vcpu, r, &e);
55d2375e
SC
5270 }
5271
27b4a9c4 5272 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
693e02cc 5273
2423a4c0 5274 offset = get_vmcs12_field_offset(field);
693e02cc 5275 if (offset < 0)
b2656e4d 5276 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
55d2375e 5277
55d2375e
SC
5278 /*
5279 * If the vCPU supports "VMWRITE to any supported field in the
5280 * VMCS," then the "read-only" fields are actually read/write.
5281 */
5282 if (vmcs_field_readonly(field) &&
5283 !nested_cpu_has_vmwrite_any_field(vcpu))
b2656e4d 5284 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
55d2375e 5285
dd2d6042
JM
5286 /*
5287 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
 5288	 * vmcs12, else we may clobber a field or consume a stale value.
5289 */
5290 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field))
5291 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
55d2375e
SC
5292
5293 /*
b6437805
SC
5294 * Some Intel CPUs intentionally drop the reserved bits of the AR byte
5295 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM
5296 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
5297 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
5298 * from L1 will return a different value than VMREAD from L2 (L1 sees
5299 * the stripped down value, L2 sees the full value as stored by KVM).
55d2375e 5300 */
b6437805 5301 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
c90f4d03 5302 value &= 0x1f0ff;
b6437805 5303
c90f4d03 5304 vmcs12_write_any(vmcs12, field, offset, value);
55d2375e
SC
5305
5306 /*
e2174295
SC
5307 * Do not track vmcs12 dirty-state if in guest-mode as we actually
5308 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated
5309 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
5310 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
55d2375e 5311 */
e2174295
SC
5312 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
5313 /*
5314 * L1 can read these fields without exiting, ensure the
5315 * shadow VMCS is up-to-date.
5316 */
5317 if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
5318 preempt_disable();
5319 vmcs_load(vmx->vmcs01.shadow_vmcs);
fadcead0 5320
c90f4d03 5321 __vmcs_writel(field, value);
fadcead0 5322
e2174295
SC
5323 vmcs_clear(vmx->vmcs01.shadow_vmcs);
5324 vmcs_load(vmx->loaded_vmcs->vmcs);
5325 preempt_enable();
55d2375e 5326 }
e2174295 5327 vmx->nested.dirty_vmcs12 = true;
55d2375e
SC
5328 }
5329
5330 return nested_vmx_succeed(vcpu);
5331}
5332
5333static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
5334{
5335 vmx->nested.current_vmptr = vmptr;
5336 if (enable_shadow_vmcs) {
fe7f895d 5337 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
55d2375e
SC
5338 vmcs_write64(VMCS_LINK_POINTER,
5339 __pa(vmx->vmcs01.shadow_vmcs));
3731905e 5340 vmx->nested.need_vmcs12_to_shadow_sync = true;
55d2375e
SC
5341 }
5342 vmx->nested.dirty_vmcs12 = true;
ed2a4800 5343 vmx->nested.force_msr_bitmap_recalc = true;
55d2375e
SC
5344}
5345
5346/* Emulate the VMPTRLD instruction */
5347static int handle_vmptrld(struct kvm_vcpu *vcpu)
5348{
5349 struct vcpu_vmx *vmx = to_vmx(vcpu);
5350 gpa_t vmptr;
7a35e515 5351 int r;
55d2375e
SC
5352
5353 if (!nested_vmx_check_permission(vcpu))
5354 return 1;
5355
7a35e515
VK
5356 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
5357 return r;
55d2375e 5358
e0bf2665 5359 if (!page_address_valid(vcpu, vmptr))
b2656e4d 5360 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
55d2375e
SC
5361
5362 if (vmptr == vmx->nested.vmxon_ptr)
b2656e4d 5363 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
55d2375e
SC
5364
5365 /* Forbid normal VMPTRLD if Enlightened version was used */
1e9dfbd7 5366 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
55d2375e
SC
5367 return 1;
5368
5369 if (vmx->nested.current_vmptr != vmptr) {
cee66664
DW
5370 struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache;
5371 struct vmcs_hdr hdr;
55d2375e 5372
8503fea6 5373 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
55d2375e
SC
5374 /*
5375 * Reads from an unbacked page return all 1s,
5376 * which means that the 32 bits located at the
5377 * given physical address won't match the required
5378 * VMCS12_REVISION identifier.
5379 */
b2656e4d 5380 return nested_vmx_fail(vcpu,
55d2375e 5381 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
55d2375e 5382 }
b146b839 5383
cee66664
DW
5384 if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
5385 offsetof(struct vmcs12, hdr),
5386 sizeof(hdr))) {
5387 return nested_vmx_fail(vcpu,
5388 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5389 }
b146b839 5390
cee66664
DW
5391 if (hdr.revision_id != VMCS12_REVISION ||
5392 (hdr.shadow_vmcs &&
55d2375e 5393 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
b2656e4d 5394 return nested_vmx_fail(vcpu,
55d2375e
SC
5395 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5396 }
5397
5398 nested_release_vmcs12(vcpu);
5399
5400 /*
5401 * Load VMCS12 from guest memory since it is not already
5402 * cached.
5403 */
cee66664
DW
5404 if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12,
5405 VMCS12_SIZE)) {
5406 return nested_vmx_fail(vcpu,
5407 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5408 }
55d2375e
SC
5409
5410 set_current_vmptr(vmx, vmptr);
5411 }
5412
5413 return nested_vmx_succeed(vcpu);
5414}
5415
5416/* Emulate the VMPTRST instruction */
5417static int handle_vmptrst(struct kvm_vcpu *vcpu)
5418{
5addc235 5419 unsigned long exit_qual = vmx_get_exit_qual(vcpu);
55d2375e
SC
5420 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5421 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
5422 struct x86_exception e;
5423 gva_t gva;
7a35e515 5424 int r;
55d2375e
SC
5425
5426 if (!nested_vmx_check_permission(vcpu))
5427 return 1;
5428
1e9dfbd7 5429 if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr)))
55d2375e
SC
5430 return 1;
5431
fdb28619
EK
5432 if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
5433 true, sizeof(gpa_t), &gva))
55d2375e
SC
5434 return 1;
5435 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
7a35e515
VK
5436 r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
5437 sizeof(gpa_t), &e);
5438 if (r != X86EMUL_CONTINUE)
3f3393b3 5439 return kvm_handle_memory_failure(vcpu, r, &e);
7a35e515 5440
55d2375e
SC
5441 return nested_vmx_succeed(vcpu);
5442}
5443
5444/* Emulate the INVEPT instruction */
5445static int handle_invept(struct kvm_vcpu *vcpu)
5446{
5447 struct vcpu_vmx *vmx = to_vmx(vcpu);
5448 u32 vmx_instruction_info, types;
ce8fe7b7
SC
5449 unsigned long type, roots_to_free;
5450 struct kvm_mmu *mmu;
55d2375e
SC
5451 gva_t gva;
5452 struct x86_exception e;
5453 struct {
5454 u64 eptp, gpa;
5455 } operand;
329bd56c 5456 int i, r, gpr_index;
55d2375e
SC
5457
5458 if (!(vmx->nested.msrs.secondary_ctls_high &
5459 SECONDARY_EXEC_ENABLE_EPT) ||
5460 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
5461 kvm_queue_exception(vcpu, UD_VECTOR);
5462 return 1;
5463 }
5464
5465 if (!nested_vmx_check_permission(vcpu))
5466 return 1;
5467
5468 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
329bd56c
VS
5469 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
5470 type = kvm_register_read(vcpu, gpr_index);
55d2375e
SC
5471
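	/*
	 * The EPT capability MSR advertises single-context and global INVEPT
	 * support; after the shift, bit N of 'types' is set iff INVEPT type N
	 * (1 or 2) is supported.
	 */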
5472 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
5473
5474 if (type >= 32 || !(types & (1 << type)))
b2656e4d 5475 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
55d2375e
SC
5476
5477 /* According to the Intel VMX instruction reference, the memory
5478 * operand is read even if it isn't needed (e.g., for type==global)
5479 */
5addc235 5480 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
fdb28619 5481 vmx_instruction_info, false, sizeof(operand), &gva))
55d2375e 5482 return 1;
7a35e515
VK
5483 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
5484 if (r != X86EMUL_CONTINUE)
3f3393b3 5485 return kvm_handle_memory_failure(vcpu, r, &e);
55d2375e 5486
ce8fe7b7
SC
5487 /*
5488 * Nested EPT roots are always held through guest_mmu,
5489 * not root_mmu.
5490 */
5491 mmu = &vcpu->arch.guest_mmu;
5492
55d2375e 5493 switch (type) {
b1190198 5494 case VMX_EPT_EXTENT_CONTEXT:
eed0030e 5495 if (!nested_vmx_check_eptp(vcpu, operand.eptp))
b2656e4d 5496 return nested_vmx_fail(vcpu,
eed0030e 5497 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
f8aa7e39 5498
ce8fe7b7 5499 roots_to_free = 0;
b9e5603c 5500 if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd,
ce8fe7b7
SC
5501 operand.eptp))
5502 roots_to_free |= KVM_MMU_ROOT_CURRENT;
5503
5504 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5505 if (nested_ept_root_matches(mmu->prev_roots[i].hpa,
be01e8e2 5506 mmu->prev_roots[i].pgd,
ce8fe7b7
SC
5507 operand.eptp))
5508 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
5509 }
5510 break;
eed0030e 5511 case VMX_EPT_EXTENT_GLOBAL:
ce8fe7b7 5512 roots_to_free = KVM_MMU_ROOTS_ALL;
55d2375e
SC
5513 break;
5514 default:
f9336e32 5515 BUG();
55d2375e
SC
5516 break;
5517 }
5518
ce8fe7b7 5519 if (roots_to_free)
0c1c92f1 5520 kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
ce8fe7b7 5521
55d2375e
SC
5522 return nested_vmx_succeed(vcpu);
5523}
5524
5525static int handle_invvpid(struct kvm_vcpu *vcpu)
5526{
5527 struct vcpu_vmx *vmx = to_vmx(vcpu);
5528 u32 vmx_instruction_info;
5529 unsigned long type, types;
5530 gva_t gva;
5531 struct x86_exception e;
5532 struct {
5533 u64 vpid;
5534 u64 gla;
5535 } operand;
5536 u16 vpid02;
329bd56c 5537 int r, gpr_index;
55d2375e
SC
5538
5539 if (!(vmx->nested.msrs.secondary_ctls_high &
5540 SECONDARY_EXEC_ENABLE_VPID) ||
5541 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
5542 kvm_queue_exception(vcpu, UD_VECTOR);
5543 return 1;
5544 }
5545
5546 if (!nested_vmx_check_permission(vcpu))
5547 return 1;
5548
5549 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
329bd56c
VS
5550 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
5551 type = kvm_register_read(vcpu, gpr_index);
55d2375e
SC
5552
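	/*
	 * VPID capability bits 8-11 (INDIVIDUAL_ADDR, SINGLE_CONTEXT,
	 * GLOBAL_CONTEXT, SINGLE_NON_GLOBAL) map to INVVPID types 0-3; the
	 * shift turns them into a bitmask indexed directly by type.
	 */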
5553 types = (vmx->nested.msrs.vpid_caps &
5554 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
5555
5556 if (type >= 32 || !(types & (1 << type)))
b2656e4d 5557 return nested_vmx_fail(vcpu,
55d2375e
SC
5558 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5559
 5560	/* According to the Intel VMX instruction reference, the memory
5561 * operand is read even if it isn't needed (e.g., for type==global)
5562 */
5addc235 5563 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
fdb28619 5564 vmx_instruction_info, false, sizeof(operand), &gva))
55d2375e 5565 return 1;
7a35e515
VK
5566 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
5567 if (r != X86EMUL_CONTINUE)
3f3393b3 5568 return kvm_handle_memory_failure(vcpu, r, &e);
7a35e515 5569
55d2375e 5570 if (operand.vpid >> 16)
b2656e4d 5571 return nested_vmx_fail(vcpu,
55d2375e
SC
5572 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5573
5574 vpid02 = nested_get_vpid02(vcpu);
5575 switch (type) {
5576 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
5577 if (!operand.vpid ||
5578 is_noncanonical_address(operand.gla, vcpu))
b2656e4d 5579 return nested_vmx_fail(vcpu,
55d2375e 5580 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
bc41d0c4 5581 vpid_sync_vcpu_addr(vpid02, operand.gla);
55d2375e
SC
5582 break;
5583 case VMX_VPID_EXTENT_SINGLE_CONTEXT:
5584 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
5585 if (!operand.vpid)
b2656e4d 5586 return nested_vmx_fail(vcpu,
55d2375e 5587 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
446ace4b 5588 vpid_sync_context(vpid02);
55d2375e
SC
5589 break;
5590 case VMX_VPID_EXTENT_ALL_CONTEXT:
446ace4b 5591 vpid_sync_context(vpid02);
55d2375e
SC
5592 break;
5593 default:
5594 WARN_ON_ONCE(1);
5595 return kvm_skip_emulated_instruction(vcpu);
5596 }
5597
d6e3f838
JS
5598 /*
5599 * Sync the shadow page tables if EPT is disabled, L1 is invalidating
25b62c62
SC
5600 * linear mappings for L2 (tagged with L2's VPID). Free all guest
5601 * roots as VPIDs are not tracked in the MMU role.
d6e3f838
JS
5602 *
5603 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share
5604 * an MMU when EPT is disabled.
5605 *
 5606	 * TODO: sync only the affected SPTEs for INDIVIDUAL_ADDR.
5607 */
5608 if (!enable_ept)
0c1c92f1 5609 kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu);
d6e3f838 5610
55d2375e
SC
5611 return nested_vmx_succeed(vcpu);
5612}
5613
5614static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
5615 struct vmcs12 *vmcs12)
5616{
2b3eaf81 5617 u32 index = kvm_rcx_read(vcpu);
ac6389ab 5618 u64 new_eptp;
55d2375e 5619
c5ffd408 5620 if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))
55d2375e 5621 return 1;
55d2375e
SC
5622 if (index >= VMFUNC_EPTP_ENTRIES)
5623 return 1;
5624
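	/*
	 * The EPTP list is a single guest page holding up to 512 8-byte EPTP
	 * values; fetch the entry selected by ECX.
	 */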
55d2375e 5625 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
ac6389ab 5626 &new_eptp, index * 8, 8))
55d2375e
SC
5627 return 1;
5628
55d2375e
SC
5629 /*
5630 * If the (L2) guest does a vmfunc to the currently
5631 * active ept pointer, we don't have to do anything else
5632 */
ac6389ab
SC
5633 if (vmcs12->ept_pointer != new_eptp) {
5634 if (!nested_vmx_check_eptp(vcpu, new_eptp))
55d2375e
SC
5635 return 1;
5636
ac6389ab 5637 vmcs12->ept_pointer = new_eptp;
39353ab5 5638 nested_ept_new_eptp(vcpu);
c805f5d5 5639
39353ab5
SC
5640 if (!nested_cpu_has_vpid(vmcs12))
5641 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
55d2375e
SC
5642 }
5643
5644 return 0;
5645}
5646
5647static int handle_vmfunc(struct kvm_vcpu *vcpu)
5648{
5649 struct vcpu_vmx *vmx = to_vmx(vcpu);
5650 struct vmcs12 *vmcs12;
2b3eaf81 5651 u32 function = kvm_rax_read(vcpu);
55d2375e
SC
5652
5653 /*
5654 * VMFUNC is only supported for nested guests, but we always enable the
5655 * secondary control for simplicity; for non-nested mode, fake that we
5656 * didn't by injecting #UD.
5657 */
5658 if (!is_guest_mode(vcpu)) {
5659 kvm_queue_exception(vcpu, UD_VECTOR);
5660 return 1;
5661 }
5662
5663 vmcs12 = get_vmcs12(vcpu);
546e8398
SC
5664
5665 /*
5666 * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC
5667 * is enabled in vmcs02 if and only if it's enabled in vmcs12.
5668 */
5669 if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) {
5670 kvm_queue_exception(vcpu, UD_VECTOR);
5671 return 1;
5672 }
5673
0e75225d 5674 if (!(vmcs12->vm_function_control & BIT_ULL(function)))
55d2375e
SC
5675 goto fail;
5676
5677 switch (function) {
5678 case 0:
5679 if (nested_vmx_eptp_switching(vcpu, vmcs12))
5680 goto fail;
5681 break;
5682 default:
5683 goto fail;
5684 }
5685 return kvm_skip_emulated_instruction(vcpu);
5686
5687fail:
8e533240
SC
5688 /*
5689 * This is effectively a reflected VM-Exit, as opposed to a synthesized
5690 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode
5691 * EXIT_REASON_VMFUNC as the exit reason.
5692 */
5693 nested_vmx_vmexit(vcpu, vmx->exit_reason.full,
87915858 5694 vmx_get_intr_info(vcpu),
5addc235 5695 vmx_get_exit_qual(vcpu));
55d2375e
SC
5696 return 1;
5697}
5698
e71237d3
OU
5699/*
5700 * Return true if an IO instruction with the specified port and size should cause
5701 * a VM-exit into L1.
5702 */
5703bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
5704 int size)
55d2375e 5705{
e71237d3 5706 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
55d2375e 5707 gpa_t bitmap, last_bitmap;
55d2375e
SC
5708 u8 b;
5709
64c78508 5710 last_bitmap = INVALID_GPA;
55d2375e
SC
5711 b = -1;
5712
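	/*
	 * One bit per I/O port: io_bitmap_a covers ports 0x0000-0x7fff and
	 * io_bitmap_b covers 0x8000-0xffff. A multi-byte access exits if any
	 * touched port has its bit set.
	 */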
5713 while (size > 0) {
5714 if (port < 0x8000)
5715 bitmap = vmcs12->io_bitmap_a;
5716 else if (port < 0x10000)
5717 bitmap = vmcs12->io_bitmap_b;
5718 else
5719 return true;
5720 bitmap += (port & 0x7fff) / 8;
5721
5722 if (last_bitmap != bitmap)
5723 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
5724 return true;
5725 if (b & (1 << (port & 7)))
5726 return true;
5727
5728 port++;
5729 size--;
5730 last_bitmap = bitmap;
5731 }
5732
5733 return false;
5734}
5735
e71237d3
OU
5736static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
5737 struct vmcs12 *vmcs12)
5738{
5739 unsigned long exit_qualification;
35a57134 5740 unsigned short port;
e71237d3
OU
5741 int size;
5742
5743 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
5744 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
5745
5addc235 5746 exit_qualification = vmx_get_exit_qual(vcpu);
e71237d3
OU
5747
5748 port = exit_qualification >> 16;
5749 size = (exit_qualification & 7) + 1;
5750
5751 return nested_vmx_check_io_bitmaps(vcpu, port, size);
5752}
5753
55d2375e 5754/*
463bfeee 5755 * Return true if we should exit from L2 to L1 to handle an MSR access,
55d2375e
SC
5756 * rather than handle it ourselves in L0. I.e., check whether L1 expressed
5757 * disinterest in the current event (read or write a specific MSR) by using an
5758 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
5759 */
5760static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
8e533240
SC
5761 struct vmcs12 *vmcs12,
5762 union vmx_exit_reason exit_reason)
55d2375e 5763{
2b3eaf81 5764 u32 msr_index = kvm_rcx_read(vcpu);
55d2375e
SC
5765 gpa_t bitmap;
5766
5767 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
5768 return true;
5769
5770 /*
5771 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
5772 * for the four combinations of read/write and low/high MSR numbers.
5773 * First we need to figure out which of the four to use:
5774 */
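	/*
	 * For example, a WRMSR to MSR 0xc0000080 (EFER) is looked up in the
	 * "write high" bitmap: 2048 (write) + 1024 (high) + 0x80 / 8 gives
	 * byte 3088 of the page, bit 0.
	 */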
5775 bitmap = vmcs12->msr_bitmap;
8e533240 5776 if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
55d2375e
SC
5777 bitmap += 2048;
5778 if (msr_index >= 0xc0000000) {
5779 msr_index -= 0xc0000000;
5780 bitmap += 1024;
5781 }
5782
5783 /* Then read the msr_index'th bit from this bitmap: */
5784 if (msr_index < 1024*8) {
5785 unsigned char b;
5786 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
5787 return true;
5788 return 1 & (b >> (msr_index & 7));
5789 } else
5790 return true; /* let L1 handle the wrong parameter */
5791}
5792
5793/*
 5794 * Return true if we should exit from L2 to L1 to handle a CR access exit,
5795 * rather than handle it ourselves in L0. I.e., check if L1 wanted to
5796 * intercept (via guest_host_mask etc.) the current event.
5797 */
5798static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
5799 struct vmcs12 *vmcs12)
5800{
5addc235 5801 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
55d2375e
SC
5802 int cr = exit_qualification & 15;
5803 int reg;
5804 unsigned long val;
5805
5806 switch ((exit_qualification >> 4) & 3) {
5807 case 0: /* mov to cr */
5808 reg = (exit_qualification >> 8) & 15;
27b4a9c4 5809 val = kvm_register_read(vcpu, reg);
55d2375e
SC
5810 switch (cr) {
5811 case 0:
5812 if (vmcs12->cr0_guest_host_mask &
5813 (val ^ vmcs12->cr0_read_shadow))
5814 return true;
5815 break;
5816 case 3:
55d2375e
SC
5817 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
5818 return true;
5819 break;
5820 case 4:
5821 if (vmcs12->cr4_guest_host_mask &
5822 (vmcs12->cr4_read_shadow ^ val))
5823 return true;
5824 break;
5825 case 8:
5826 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
5827 return true;
5828 break;
5829 }
5830 break;
5831 case 2: /* clts */
5832 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
5833 (vmcs12->cr0_read_shadow & X86_CR0_TS))
5834 return true;
5835 break;
5836 case 1: /* mov from cr */
5837 switch (cr) {
5838 case 3:
5839 if (vmcs12->cpu_based_vm_exec_control &
5840 CPU_BASED_CR3_STORE_EXITING)
5841 return true;
5842 break;
5843 case 8:
5844 if (vmcs12->cpu_based_vm_exec_control &
5845 CPU_BASED_CR8_STORE_EXITING)
5846 return true;
5847 break;
5848 }
5849 break;
5850 case 3: /* lmsw */
5851 /*
5852 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
5853 * cr0. Other attempted changes are ignored, with no exit.
5854 */
5855 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5856 if (vmcs12->cr0_guest_host_mask & 0xe &
5857 (val ^ vmcs12->cr0_read_shadow))
5858 return true;
5859 if ((vmcs12->cr0_guest_host_mask & 0x1) &&
5860 !(vmcs12->cr0_read_shadow & 0x1) &&
5861 (val & 0x1))
5862 return true;
5863 break;
5864 }
5865 return false;
5866}
5867
72add915
SC
5868static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
5869 struct vmcs12 *vmcs12)
5870{
5871 u32 encls_leaf;
5872
5873 if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) ||
5874 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
5875 return false;
5876
5877 encls_leaf = kvm_rax_read(vcpu);
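	/* ENCLS leaves 63 and above are all controlled by bit 63 of the bitmap. */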
5878 if (encls_leaf > 62)
5879 encls_leaf = 63;
5880 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf);
5881}
5882
55d2375e
SC
5883static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
5884 struct vmcs12 *vmcs12, gpa_t bitmap)
5885{
5886 u32 vmx_instruction_info;
5887 unsigned long field;
5888 u8 b;
5889
5890 if (!nested_cpu_has_shadow_vmcs(vmcs12))
5891 return true;
5892
5893 /* Decode instruction info and find the field to access */
5894 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5895 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
5896
5897 /* Out-of-range fields always cause a VM exit from L2 to L1 */
5898 if (field >> 15)
5899 return true;
5900
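	/* One bit per field encoding; a set bit means L1 intercepts the access. */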
5901 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
5902 return true;
5903
5904 return 1 & (b >> (field & 7));
5905}
5906
b045ae90
OU
5907static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
5908{
5909 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field;
5910
5911 if (nested_cpu_has_mtf(vmcs12))
5912 return true;
5913
5914 /*
5915 * An MTF VM-exit may be injected into the guest by setting the
5916 * interruption-type to 7 (other event) and the vector field to 0. Such
5917 * is the case regardless of the 'monitor trap flag' VM-execution
5918 * control.
5919 */
5920 return entry_intr_info == (INTR_INFO_VALID_MASK
5921 | INTR_TYPE_OTHER_EVENT);
5922}
5923
55d2375e 5924/*
2c1f3323
SC
5925 * Return true if L0 wants to handle an exit from L2 regardless of whether or not
5926 * L1 wants the exit. Only call this when in is_guest_mode (L2).
55d2375e 5927 */
8e533240
SC
5928static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
5929 union vmx_exit_reason exit_reason)
55d2375e 5930{
236871b6 5931 u32 intr_info;
55d2375e 5932
8e533240 5933 switch ((u16)exit_reason.basic) {
55d2375e 5934 case EXIT_REASON_EXCEPTION_NMI:
87915858 5935 intr_info = vmx_get_intr_info(vcpu);
55d2375e 5936 if (is_nmi(intr_info))
2c1f3323 5937 return true;
55d2375e 5938 else if (is_page_fault(intr_info))
18712c13
SC
5939 return vcpu->arch.apf.host_apf_flags ||
5940 vmx_need_pf_intercept(vcpu);
55d2375e
SC
5941 else if (is_debug(intr_info) &&
5942 vcpu->guest_debug &
5943 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
2c1f3323 5944 return true;
55d2375e
SC
5945 else if (is_breakpoint(intr_info) &&
5946 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
2c1f3323 5947 return true;
b33bb78a
SC
5948 else if (is_alignment_check(intr_info) &&
5949 !vmx_guest_inject_ac(vcpu))
5950 return true;
2c1f3323
SC
5951 return false;
5952 case EXIT_REASON_EXTERNAL_INTERRUPT:
5953 return true;
5954 case EXIT_REASON_MCE_DURING_VMENTRY:
5955 return true;
5956 case EXIT_REASON_EPT_VIOLATION:
5957 /*
5958 * L0 always deals with the EPT violation. If nested EPT is
5959 * used, and the nested mmu code discovers that the address is
5960 * missing in the guest EPT table (EPT12), the EPT violation
5961 * will be injected with nested_ept_inject_page_fault()
5962 */
5963 return true;
5964 case EXIT_REASON_EPT_MISCONFIG:
5965 /*
5966 * L2 never uses directly L1's EPT, but rather L0's own EPT
5967 * table (shadow on EPT) or a merged EPT table that L0 built
5968 * (EPT on EPT). So any problems with the structure of the
5969 * table is L0's fault.
5970 */
5971 return true;
5972 case EXIT_REASON_PREEMPTION_TIMER:
5973 return true;
5974 case EXIT_REASON_PML_FULL:
c3bb9a20
SC
5975 /*
5976 * PML is emulated for an L1 VMM and should never be enabled in
5977 * vmcs02, always "handle" PML_FULL by exiting to userspace.
5978 */
2c1f3323
SC
5979 return true;
5980 case EXIT_REASON_VMFUNC:
5981 /* VM functions are emulated through L2->L0 vmexits. */
5982 return true;
24a996ad
CQ
5983 case EXIT_REASON_BUS_LOCK:
5984 /*
5985 * At present, bus lock VM exit is never exposed to L1.
5986 * Handle L2's bus locks in L0 directly.
5987 */
5988 return true;
2c1f3323
SC
5989 default:
5990 break;
5991 }
5992 return false;
5993}
5994
5995/*
 5996 * Return true if L1 wants to intercept an exit from L2. Only call this when in
5997 * is_guest_mode (L2).
5998 */
8e533240
SC
5999static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
6000 union vmx_exit_reason exit_reason)
2c1f3323
SC
6001{
6002 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
9bd4af24 6003 u32 intr_info;
2c1f3323 6004
8e533240 6005 switch ((u16)exit_reason.basic) {
2c1f3323 6006 case EXIT_REASON_EXCEPTION_NMI:
87915858 6007 intr_info = vmx_get_intr_info(vcpu);
2c1f3323
SC
6008 if (is_nmi(intr_info))
6009 return true;
6010 else if (is_page_fault(intr_info))
6011 return true;
55d2375e
SC
6012 return vmcs12->exception_bitmap &
6013 (1u << (intr_info & INTR_INFO_VECTOR_MASK));
6014 case EXIT_REASON_EXTERNAL_INTERRUPT:
2c1f3323 6015 return nested_exit_on_intr(vcpu);
55d2375e
SC
6016 case EXIT_REASON_TRIPLE_FAULT:
6017 return true;
9dadc2f9
XL
6018 case EXIT_REASON_INTERRUPT_WINDOW:
6019 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING);
55d2375e 6020 case EXIT_REASON_NMI_WINDOW:
4e2a0bc5 6021 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING);
55d2375e
SC
6022 case EXIT_REASON_TASK_SWITCH:
6023 return true;
6024 case EXIT_REASON_CPUID:
6025 return true;
6026 case EXIT_REASON_HLT:
6027 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
6028 case EXIT_REASON_INVD:
6029 return true;
6030 case EXIT_REASON_INVLPG:
6031 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
6032 case EXIT_REASON_RDPMC:
6033 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
6034 case EXIT_REASON_RDRAND:
6035 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
6036 case EXIT_REASON_RDSEED:
6037 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
6038 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
6039 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
6040 case EXIT_REASON_VMREAD:
6041 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
6042 vmcs12->vmread_bitmap);
6043 case EXIT_REASON_VMWRITE:
6044 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
6045 vmcs12->vmwrite_bitmap);
6046 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
6047 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
6048 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
6049 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
6050 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
6051 /*
6052 * VMX instructions trap unconditionally. This allows L1 to
6053 * emulate them for its L2 guest, i.e., allows 3-level nesting!
6054 */
6055 return true;
6056 case EXIT_REASON_CR_ACCESS:
6057 return nested_vmx_exit_handled_cr(vcpu, vmcs12);
6058 case EXIT_REASON_DR_ACCESS:
6059 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
6060 case EXIT_REASON_IO_INSTRUCTION:
6061 return nested_vmx_exit_handled_io(vcpu, vmcs12);
6062 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
6063 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
6064 case EXIT_REASON_MSR_READ:
6065 case EXIT_REASON_MSR_WRITE:
6066 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
6067 case EXIT_REASON_INVALID_STATE:
6068 return true;
6069 case EXIT_REASON_MWAIT_INSTRUCTION:
6070 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
6071 case EXIT_REASON_MONITOR_TRAP_FLAG:
b045ae90 6072 return nested_vmx_exit_handled_mtf(vmcs12);
55d2375e
SC
6073 case EXIT_REASON_MONITOR_INSTRUCTION:
6074 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
6075 case EXIT_REASON_PAUSE_INSTRUCTION:
6076 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
6077 nested_cpu_has2(vmcs12,
6078 SECONDARY_EXEC_PAUSE_LOOP_EXITING);
6079 case EXIT_REASON_MCE_DURING_VMENTRY:
2c1f3323 6080 return true;
55d2375e
SC
6081 case EXIT_REASON_TPR_BELOW_THRESHOLD:
6082 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
6083 case EXIT_REASON_APIC_ACCESS:
6084 case EXIT_REASON_APIC_WRITE:
6085 case EXIT_REASON_EOI_INDUCED:
6086 /*
6087 * The controls for "virtualize APIC accesses," "APIC-
6088 * register virtualization," and "virtual-interrupt
6089 * delivery" only come from vmcs12.
6090 */
6091 return true;
55d2375e
SC
6092 case EXIT_REASON_INVPCID:
6093 return
6094 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
6095 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
6096 case EXIT_REASON_WBINVD:
6097 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
6098 case EXIT_REASON_XSETBV:
6099 return true;
6100 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
6101 /*
6102 * This should never happen, since it is not possible to
6103 * set XSS to a non-zero value---neither in L1 nor in L2.
 6104		 * If it were, XSS would have to be checked against
6105 * the XSS exit bitmap in vmcs12.
6106 */
6107 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
bf653b78
TX
6108 case EXIT_REASON_UMWAIT:
6109 case EXIT_REASON_TPAUSE:
6110 return nested_cpu_has2(vmcs12,
6111 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
72add915
SC
6112 case EXIT_REASON_ENCLS:
6113 return nested_vmx_exit_handled_encls(vcpu, vmcs12);
55d2375e
SC
6114 default:
6115 return true;
6116 }
6117}
6118
7b7bd87d
SC
6119/*
6120 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was
6121 * reflected into L1.
6122 */
f47baaed 6123bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
7b7bd87d 6124{
fbdd5025 6125 struct vcpu_vmx *vmx = to_vmx(vcpu);
8e533240 6126 union vmx_exit_reason exit_reason = vmx->exit_reason;
87796555
SC
6127 unsigned long exit_qual;
6128 u32 exit_intr_info;
fbdd5025
SC
6129
6130 WARN_ON_ONCE(vmx->nested.nested_run_pending);
6131
6132 /*
6133 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
6134 * has already loaded L2's state.
6135 */
6136 if (unlikely(vmx->fail)) {
6137 trace_kvm_nested_vmenter_failed(
6138 "hardware VM-instruction error: ",
6139 vmcs_read32(VM_INSTRUCTION_ERROR));
6140 exit_intr_info = 0;
6141 exit_qual = 0;
6142 goto reflect_vmexit;
6143 }
7b7bd87d 6144
0a62a031 6145 trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX);
236871b6 6146
6147 /* If L0 (KVM) wants the exit, it trumps L1's desires. */
6148 if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
6149 return false;
6150
6151 /* If L1 doesn't want the exit, handle it in L0. */
6152 if (!nested_vmx_l1_wants_exit(vcpu, exit_reason))
6153 return false;
6154
6155 /*
6156 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For
6157 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would
6158 * need to be synthesized by querying the in-kernel LAPIC, but external
6159 * interrupts are never reflected to L1 so it's a non-issue.
7b7bd87d 6160 */
02f1965f 6161 exit_intr_info = vmx_get_intr_info(vcpu);
f315f2b1 6162 if (is_exception_with_error_code(exit_intr_info)) {
6163 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6164
6165 vmcs12->vm_exit_intr_error_code =
6166 vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
6167 }
02f1965f 6168 exit_qual = vmx_get_exit_qual(vcpu);
7b7bd87d 6169
fbdd5025 6170reflect_vmexit:
8e533240 6171 nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual);
6172 return true;
6173}
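/*
 * The call site in vmx.c's exit handling uses this roughly as:
 *
 *	if (is_guest_mode(vcpu) && nested_vmx_reflect_vmexit(vcpu))
 *		return 1;
 *
 * i.e. a VM-Exit that was reflected into L1 is treated as fully handled
 * from L0's point of view.
 */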
6174
6175static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
6176 struct kvm_nested_state __user *user_kvm_nested_state,
6177 u32 user_data_size)
6178{
6179 struct vcpu_vmx *vmx;
6180 struct vmcs12 *vmcs12;
6181 struct kvm_nested_state kvm_state = {
6182 .flags = 0,
6ca00dfa 6183 .format = KVM_STATE_NESTED_FORMAT_VMX,
55d2375e 6184 .size = sizeof(kvm_state),
850448f3 6185 .hdr.vmx.flags = 0,
6186 .hdr.vmx.vmxon_pa = INVALID_GPA,
6187 .hdr.vmx.vmcs12_pa = INVALID_GPA,
850448f3 6188 .hdr.vmx.preemption_timer_deadline = 0,
55d2375e 6189 };
6190 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
6191 &user_kvm_nested_state->data.vmx[0];
6192
6193 if (!vcpu)
6ca00dfa 6194 return kvm_state.size + sizeof(*user_vmx_nested_state);
6195
6196 vmx = to_vmx(vcpu);
6197 vmcs12 = get_vmcs12(vcpu);
6198
6199 if (nested_vmx_allowed(vcpu) &&
6200 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
6201 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
6202 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;
6203
6204 if (vmx_has_valid_vmcs12(vcpu)) {
6ca00dfa 6205 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
55d2375e 6206
6207 /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
6208 if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
6209 kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
6210
6211 if (is_guest_mode(vcpu) &&
6212 nested_cpu_has_shadow_vmcs(vmcs12) &&
64c78508 6213 vmcs12->vmcs_link_pointer != INVALID_GPA)
6ca00dfa 6214 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
6215 }
6216
6217 if (vmx->nested.smm.vmxon)
6ca00dfa 6218 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
6219
6220 if (vmx->nested.smm.guest_mode)
6ca00dfa 6221 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
6222
6223 if (is_guest_mode(vcpu)) {
6224 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
6225
6226 if (vmx->nested.nested_run_pending)
6227 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
6228
6229 if (vmx->nested.mtf_pending)
6230 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;
6231
6232 if (nested_cpu_has_preemption_timer(vmcs12) &&
6233 vmx->nested.has_preemption_timer_deadline) {
6234 kvm_state.hdr.vmx.flags |=
6235 KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE;
6236 kvm_state.hdr.vmx.preemption_timer_deadline =
6237 vmx->nested.preemption_timer_deadline;
6238 }
6239 }
6240 }
6241
6242 if (user_data_size < kvm_state.size)
6243 goto out;
6244
6245 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
6246 return -EFAULT;
6247
6248 if (!vmx_has_valid_vmcs12(vcpu))
6249 goto out;
6250
6251 /*
6252 * When running L2, the authoritative vmcs12 state is in the
6253 * vmcs02. When running L1, the authoritative vmcs12 state is
6254 * in the shadow or enlightened vmcs linked to vmcs01, unless
3731905e 6255 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
6256 * vmcs12 state is in the vmcs12 already.
6257 */
6258 if (is_guest_mode(vcpu)) {
3731905e 6259 sync_vmcs02_to_vmcs12(vcpu, vmcs12);
7952d769 6260 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
6261 } else {
6262 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
6263 if (!vmx->nested.need_vmcs12_to_shadow_sync) {
1e9dfbd7 6264 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
6265 /*
6266 * The L1 hypervisor is not obliged to keep the eVMCS
6267 * clean-fields data up-to-date while it is not in guest
6268 * mode; 'hv_clean_fields' is only guaranteed to be
6269 * accurate at VM-entry, so ignore it here and do a
6270 * full copy.
6271 */
6272 copy_enlightened_to_vmcs12(vmx, 0);
6273 else if (enable_shadow_vmcs)
6274 copy_shadow_to_vmcs12(vmx);
6275 }
6276 }
6277
6278 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
6279 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);
6280
6281 /*
6282 * Copy over the full allocated size of vmcs12 rather than just the size
6283 * of the struct.
6284 */
6ca00dfa 6285 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
6286 return -EFAULT;
6287
6288 if (nested_cpu_has_shadow_vmcs(vmcs12) &&
64c78508 6289 vmcs12->vmcs_link_pointer != INVALID_GPA) {
6ca00dfa 6290 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
3a33d030 6291 get_shadow_vmcs12(vcpu), VMCS12_SIZE))
6292 return -EFAULT;
6293 }
6294out:
6295 return kvm_state.size;
6296}
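/*
 * Userspace contract (approximate): vmx_get_nested_state() always returns
 * the total number of bytes required.  If the supplied buffer is too
 * small, nothing is copied here; the generic KVM_GET_NESTED_STATE ioctl
 * code then reports the required size back in the size field and fails
 * with -E2BIG so that userspace can retry with a larger buffer.
 */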
6297
6298/*
6299 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
6300 */
6301void vmx_leave_nested(struct kvm_vcpu *vcpu)
6302{
6303 if (is_guest_mode(vcpu)) {
6304 to_vmx(vcpu)->nested.nested_run_pending = 0;
6305 nested_vmx_vmexit(vcpu, -1, 0, 0);
6306 }
6307 free_nested(vcpu);
6308}
6309
6310static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
6311 struct kvm_nested_state __user *user_kvm_nested_state,
6312 struct kvm_nested_state *kvm_state)
6313{
6314 struct vcpu_vmx *vmx = to_vmx(vcpu);
6315 struct vmcs12 *vmcs12;
68cda40d 6316 enum vm_entry_failure_code ignored;
6317 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
6318 &user_kvm_nested_state->data.vmx[0];
6319 int ret;
6320
6ca00dfa 6321 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
6322 return -EINVAL;
6323
64c78508 6324 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) {
6ca00dfa 6325 if (kvm_state->hdr.vmx.smm.flags)
6326 return -EINVAL;
6327
64c78508 6328 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)
6329 return -EINVAL;
6330
6331 /*
6332 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
6333 * enable eVMCS capability on vCPU. However, since then
6334 * code was changed such that the flag signals that vmcs12
6335 * should be copied into the eVMCS in guest memory.
6336 *
6337 * To preserve backwards compatibility, allow userspace
6338 * to set this flag even when there is no VMXON region.
6339 */
6340 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
6341 return -EINVAL;
6342 } else {
6343 if (!nested_vmx_allowed(vcpu))
6344 return -EINVAL;
55d2375e 6345
6346 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
6347 return -EINVAL;
323d73a8 6348 }
55d2375e 6349
6ca00dfa 6350 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
6351 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
6352 return -EINVAL;
6353
6ca00dfa 6354 if (kvm_state->hdr.vmx.smm.flags &
6355 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
6356 return -EINVAL;
6357
6358 if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
6359 return -EINVAL;
6360
6361 /*
6362 * SMM temporarily disables VMX, so we cannot be in guest mode,
6363 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
6364 * must be zero.
6365 */
6366 if (is_smm(vcpu) ?
6367 (kvm_state->flags &
6368 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
6369 : kvm_state->hdr.vmx.smm.flags)
6370 return -EINVAL;
6371
6372 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
6373 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
6374 return -EINVAL;
6375
6376 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
6377 (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled))
9fd58877 6378 return -EINVAL;
55d2375e 6379
323d73a8 6380 vmx_leave_nested(vcpu);
9fd58877 6381
64c78508 6382 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA)
9fd58877 6383 return 0;
332d0797 6384
6ca00dfa 6385 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
6386 ret = enter_vmx_operation(vcpu);
6387 if (ret)
6388 return ret;
6389
6390 /* Empty 'VMXON' state is permitted if no VMCS loaded */
6391 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
6392 /* See vmx_has_valid_vmcs12. */
6393 if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
6394 (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
64c78508 6395 (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA))
6396 return -EINVAL;
6397 else
6398 return 0;
6399 }
55d2375e 6400
64c78508 6401 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) {
6402 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
6403 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
6404 return -EINVAL;
6405
6ca00dfa 6406 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
6407 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
6408 /*
6409 * nested_vmx_handle_enlightened_vmptrld() cannot be called
6410 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be
6411 * restored yet. EVMCS will be mapped from
6412 * nested_get_vmcs12_pages().
55d2375e 6413 */
27849968 6414 vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
729c15c2 6415 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
6416 } else {
6417 return -EINVAL;
6418 }
6419
6ca00dfa 6420 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
6421 vmx->nested.smm.vmxon = true;
6422 vmx->nested.vmxon = false;
6423
6ca00dfa 6424 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
6425 vmx->nested.smm.guest_mode = true;
6426 }
6427
6428 vmcs12 = get_vmcs12(vcpu);
6ca00dfa 6429 if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
6430 return -EFAULT;
6431
6432 if (vmcs12->hdr.revision_id != VMCS12_REVISION)
6433 return -EINVAL;
6434
6435 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
6436 return 0;
6437
6438 vmx->nested.nested_run_pending =
6439 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
6440
6441 vmx->nested.mtf_pending =
6442 !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);
6443
21be4ca1 6444 ret = -EINVAL;
55d2375e 6445 if (nested_cpu_has_shadow_vmcs(vmcs12) &&
64c78508 6446 vmcs12->vmcs_link_pointer != INVALID_GPA) {
6447 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
6448
6449 if (kvm_state->size <
6450 sizeof(*kvm_state) +
6451 sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
21be4ca1 6452 goto error_guest_mode;
6453
6454 if (copy_from_user(shadow_vmcs12,
6455 user_vmx_nested_state->shadow_vmcs12,
6456 sizeof(*shadow_vmcs12))) {
6457 ret = -EFAULT;
6458 goto error_guest_mode;
6459 }
6460
6461 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
6462 !shadow_vmcs12->hdr.shadow_vmcs)
21be4ca1 6463 goto error_guest_mode;
6464 }
6465
83d31e52 6466 vmx->nested.has_preemption_timer_deadline = false;
6467 if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
6468 vmx->nested.has_preemption_timer_deadline = true;
6469 vmx->nested.preemption_timer_deadline =
6470 kvm_state->hdr.vmx.preemption_timer_deadline;
6471 }
6472
6473 if (nested_vmx_check_controls(vcpu, vmcs12) ||
6474 nested_vmx_check_host_state(vcpu, vmcs12) ||
68cda40d 6475 nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
21be4ca1 6476 goto error_guest_mode;
6477
6478 vmx->nested.dirty_vmcs12 = true;
ed2a4800 6479 vmx->nested.force_msr_bitmap_recalc = true;
55d2375e 6480 ret = nested_vmx_enter_non_root_mode(vcpu, false);
6481 if (ret)
6482 goto error_guest_mode;
6483
6484 return 0;
6485
6486error_guest_mode:
6487 vmx->nested.nested_run_pending = 0;
6488 return ret;
6489}
6490
1b84292b 6491void nested_vmx_set_vmcs_shadowing_bitmap(void)
6492{
6493 if (enable_shadow_vmcs) {
55d2375e 6494 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
fadcead0 6495 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
6496 }
6497}
6498
6499/*
6500 * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo
6501 * that madness to get the encoding for comparison.
6502 */
6503#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))
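/*
 * Worked example: VMCS_LINK_POINTER has encoding 0x2800; rotated left by
 * 6 (as the vmcs12 offset table does) that becomes index 0xa, and
 * VMCS12_IDX_TO_ENC(0xa) == (0xa >> 6) | (0xa << 10) == 0x2800 recovers
 * the original encoding.
 */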
6504
6505static u64 nested_vmx_calc_vmcs_enum_msr(void)
6506{
6507 /*
6508 * Note that these are the so-called "index" of the VMCS field encoding, not
6509 * the index into vmcs12.
6510 */
6511 unsigned int max_idx, idx;
6512 int i;
6513
6514 /*
6515 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in
6516 * vmcs12, regardless of whether or not the associated feature is
6517 * exposed to L1. Simply find the field with the highest index.
6518 */
6519 max_idx = 0;
6520 for (i = 0; i < nr_vmcs12_fields; i++) {
6521 /* The vmcs12 table is very, very sparsely populated. */
2423a4c0 6522 if (!vmcs12_field_offsets[i])
6523 continue;
6524
6525 idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
6526 if (idx > max_idx)
6527 max_idx = idx;
6528 }
6529
6530 return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
6531}
6532
6533/*
6534 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
6535 * returned for the various VMX controls MSRs when nested VMX is enabled.
6536 * The same values should also be used to verify that vmcs12 control fields are
6537 * valid during nested entry from L1 to L2.
6538 * Each of these control msrs has a low and high 32-bit half: A low bit is on
6539 * if the corresponding bit in the (32-bit) control field *must* be on, and a
6540 * bit in the high half is on if the corresponding bit in the control field
6541 * may be on. See also vmx_control_verify().
6542 */
a4443267 6543void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
6544{
6545 /*
6546 * Note that as a general rule, the high half of the MSRs (bits in
6547 * the control fields which may be 1) should be initialized by the
6548 * intersection of the underlying hardware's MSR (i.e., features which
6549 * can be supported) and the list of features we want to expose -
6550 * because they are known to be properly supported in our code.
6551 * Also, usually, the low half of the MSRs (bits which must be 1) can
6552 * be set to 0, meaning that L1 may turn off any of these bits. The
6553 * reason is that if one of these bits is necessary, it will appear
6554 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
6555 * fields of vmcs01 and vmcs02, will turn these bits off - and
2c1f3323 6556 * nested_vmx_l1_wants_exit() will not pass related exits to L1.
6557 * These rules have exceptions below.
6558 */
6559
6560 /* pin-based controls */
6561 rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
6562 msrs->pinbased_ctls_low,
6563 msrs->pinbased_ctls_high);
6564 msrs->pinbased_ctls_low |=
6565 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
6566 msrs->pinbased_ctls_high &=
6567 PIN_BASED_EXT_INTR_MASK |
6568 PIN_BASED_NMI_EXITING |
6569 PIN_BASED_VIRTUAL_NMIS |
a4443267 6570 (enable_apicv ? PIN_BASED_POSTED_INTR : 0);
6571 msrs->pinbased_ctls_high |=
6572 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
6573 PIN_BASED_VMX_PREEMPTION_TIMER;
6574
6575 /* exit controls */
6576 rdmsr(MSR_IA32_VMX_EXIT_CTLS,
6577 msrs->exit_ctls_low,
6578 msrs->exit_ctls_high);
6579 msrs->exit_ctls_low =
6580 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
6581
6582 msrs->exit_ctls_high &=
6583#ifdef CONFIG_X86_64
6584 VM_EXIT_HOST_ADDR_SPACE_SIZE |
6585#endif
6586 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
6587 VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
6588 msrs->exit_ctls_high |=
6589 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
6590 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
6591 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
6592
6593 /* We support free control of debug control saving. */
6594 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
6595
6596 /* entry controls */
6597 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
6598 msrs->entry_ctls_low,
6599 msrs->entry_ctls_high);
6600 msrs->entry_ctls_low =
6601 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
6602 msrs->entry_ctls_high &=
6603#ifdef CONFIG_X86_64
6604 VM_ENTRY_IA32E_MODE |
6605#endif
6606 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS |
6607 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
6608 msrs->entry_ctls_high |=
6609 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
6610
6611 /* We support free control of debug control loading. */
6612 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
6613
6614 /* cpu-based controls */
6615 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
6616 msrs->procbased_ctls_low,
6617 msrs->procbased_ctls_high);
6618 msrs->procbased_ctls_low =
6619 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
6620 msrs->procbased_ctls_high &=
9dadc2f9 6621 CPU_BASED_INTR_WINDOW_EXITING |
5e3d394f 6622 CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
6623 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
6624 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
6625 CPU_BASED_CR3_STORE_EXITING |
6626#ifdef CONFIG_X86_64
6627 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
6628#endif
6629 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
6630 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
6631 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
6632 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
6633 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
6634 /*
6635 * We can allow some features even when not supported by the
6636 * hardware. For example, L1 can specify an MSR bitmap - and we
6637 * can use it to avoid exits to L1 - even when L0 runs L2
6638 * without MSR bitmaps.
6639 */
6640 msrs->procbased_ctls_high |=
6641 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
6642 CPU_BASED_USE_MSR_BITMAPS;
6643
6644 /* We support free control of CR3 access interception. */
6645 msrs->procbased_ctls_low &=
6646 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
6647
6648 /*
6649 * secondary cpu-based controls.  Do not include those that
6650 * depend on CPUID bits; they are added later by
6651 * vmx_vcpu_after_set_cpuid().
55d2375e 6652 */
6653 if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
6654 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
6655 msrs->secondary_ctls_low,
6656 msrs->secondary_ctls_high);
6657
6658 msrs->secondary_ctls_low = 0;
6659 msrs->secondary_ctls_high &=
6660 SECONDARY_EXEC_DESC |
7f3603b6 6661 SECONDARY_EXEC_ENABLE_RDTSCP |
55d2375e 6662 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
6defc591 6663 SECONDARY_EXEC_WBINVD_EXITING |
6664 SECONDARY_EXEC_APIC_REGISTER_VIRT |
6665 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
6666 SECONDARY_EXEC_RDRAND_EXITING |
6667 SECONDARY_EXEC_ENABLE_INVPCID |
6668 SECONDARY_EXEC_RDSEED_EXITING |
6669 SECONDARY_EXEC_XSAVES |
6670 SECONDARY_EXEC_TSC_SCALING;
6671
6672 /*
6673 * We can emulate "VMCS shadowing," even if the hardware
6674 * doesn't support it.
6675 */
6676 msrs->secondary_ctls_high |=
6677 SECONDARY_EXEC_SHADOW_VMCS;
6678
6679 if (enable_ept) {
6680 /* nested EPT: emulate EPT also to L1 */
6681 msrs->secondary_ctls_high |=
6682 SECONDARY_EXEC_ENABLE_EPT;
6683 msrs->ept_caps =
6684 VMX_EPT_PAGE_WALK_4_BIT |
6685 VMX_EPT_PAGE_WALK_5_BIT |
6686 VMX_EPTP_WB_BIT |
6687 VMX_EPT_INVEPT_BIT |
6688 VMX_EPT_EXECUTE_ONLY_BIT;
6689
6690 msrs->ept_caps &= ept_caps;
6691 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
6692 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
6693 VMX_EPT_1GB_PAGE_BIT;
6694 if (enable_ept_ad_bits) {
6695 msrs->secondary_ctls_high |=
6696 SECONDARY_EXEC_ENABLE_PML;
6697 msrs->ept_caps |= VMX_EPT_AD_BIT;
6698 }
6699 }
6700
6701 if (cpu_has_vmx_vmfunc()) {
6702 msrs->secondary_ctls_high |=
6703 SECONDARY_EXEC_ENABLE_VMFUNC;
6704 /*
6705 * Advertise EPTP switching unconditionally
6706 * since we emulate it
6707 */
6708 if (enable_ept)
6709 msrs->vmfunc_controls =
6710 VMX_VMFUNC_EPTP_SWITCHING;
6711 }
6712
6713 /*
6714 * Old versions of KVM use the single-context version without
6715 * checking for support, so declare that it is supported even
6716 * though it is treated as global context. The alternative is
6717 * not failing the single-context invvpid, and it is worse.
6718 */
6719 if (enable_vpid) {
6720 msrs->secondary_ctls_high |=
6721 SECONDARY_EXEC_ENABLE_VPID;
6722 msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
6723 VMX_VPID_EXTENT_SUPPORTED_MASK;
6724 }
6725
6726 if (enable_unrestricted_guest)
6727 msrs->secondary_ctls_high |=
6728 SECONDARY_EXEC_UNRESTRICTED_GUEST;
6729
6730 if (flexpriority_enabled)
6731 msrs->secondary_ctls_high |=
6732 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6733
6734 if (enable_sgx)
6735 msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
6736
6737 /* miscellaneous data */
6738 rdmsr(MSR_IA32_VMX_MISC,
6739 msrs->misc_low,
6740 msrs->misc_high);
6741 msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
6742 msrs->misc_low |=
6743 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
6744 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
6745 VMX_MISC_ACTIVITY_HLT |
6746 VMX_MISC_ACTIVITY_WAIT_SIPI;
6747 msrs->misc_high = 0;
6748
6749 /*
6750 * This MSR reports some information about VMX support. We
6751 * should return information about the VMX we emulate for the
6752 * guest, and the VMCS structure we give it - not about the
6753 * VMX support of the underlying hardware.
6754 */
6755 msrs->basic =
6756 VMCS12_REVISION |
6757 VMX_BASIC_TRUE_CTLS |
6758 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
6759 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
6760
6761 if (cpu_has_vmx_basic_inout())
6762 msrs->basic |= VMX_BASIC_INOUT;
6763
6764 /*
6765 * These MSRs specify bits which the guest must keep fixed on
6766 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
6767 * We picked the standard core2 setting.
6768 */
6769#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
6770#define VMXON_CR4_ALWAYSON X86_CR4_VMXE
6771 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
6772 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
6773
6774 /* These MSRs specify bits which the guest must keep fixed off. */
6775 rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
6776 rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
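	/*
	 * The fixed0/fixed1 pairs are consumed as "bits that must be 1" and
	 * "bits that may be 1" respectively: roughly, a CR value is legal
	 * for the nested guest iff (val & fixed0) == fixed0 and
	 * (val & ~fixed1) == 0.
	 */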
6777
ba1f8245 6778 msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
6779}
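
/*
 * Illustrative only: the "must be on" (low) / "may be on" (high) semantics
 * described above boil down to a check along these lines.  The real check
 * is vmx_control_verify(); the helper name below is made up for this
 * sketch.
 */
static inline bool __maybe_unused nested_ctl_sketch_is_valid(u32 control,
							     u32 low, u32 high)
{
	/* Every required bit is set and no bit outside the allowed set is. */
	return (control & low) == low && (control & ~high) == 0;
}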
6780
6781void nested_vmx_hardware_unsetup(void)
6782{
6783 int i;
6784
6785 if (enable_shadow_vmcs) {
6786 for (i = 0; i < VMX_BITMAP_NR; i++)
6787 free_page((unsigned long)vmx_bitmap[i]);
6788 }
6789}
6790
6c1c6e58 6791__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
6792{
6793 int i;
6794
6795 if (!cpu_has_vmx_shadow_vmcs())
6796 enable_shadow_vmcs = 0;
6797 if (enable_shadow_vmcs) {
6798 for (i = 0; i < VMX_BITMAP_NR; i++) {
6799 /*
6800 * The vmx_bitmap is not tied to a VM and so should
6801 * not be charged to a memcg.
6802 */
6803 vmx_bitmap[i] = (unsigned long *)
6804 __get_free_page(GFP_KERNEL);
6805 if (!vmx_bitmap[i]) {
6806 nested_vmx_hardware_unsetup();
6807 return -ENOMEM;
6808 }
6809 }
6810
6811 init_vmcs_shadow_fields();
6812 }
6813
6814 exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear;
6815 exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch;
6816 exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld;
6817 exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst;
6818 exit_handlers[EXIT_REASON_VMREAD] = handle_vmread;
6819 exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume;
6820 exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite;
6821 exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff;
6822 exit_handlers[EXIT_REASON_VMON] = handle_vmon;
6823 exit_handlers[EXIT_REASON_INVEPT] = handle_invept;
6824 exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid;
6825 exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc;
55d2375e 6826
6827 return 0;
6828}
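/*
 * nested_vmx_hardware_setup() is expected to be called from VMX's
 * hardware_setup() path (roughly), with exit_handlers being VMX's table of
 * L1 exit handlers; the VMX-instruction entries installed above are only
 * wired up when nested virtualization is enabled.
 */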
6829
6830struct kvm_x86_nested_ops vmx_nested_ops = {
f7e57078 6831 .leave_nested = vmx_leave_nested,
33b22172 6832 .check_events = vmx_check_nested_events,
6819af75 6833 .handle_page_fault_workaround = nested_vmx_handle_page_fault_workaround,
d2060bd4 6834 .hv_timer_pending = nested_vmx_preemption_timer_pending,
cb6a32c2 6835 .triple_fault = nested_vmx_triple_fault,
6836 .get_state = vmx_get_nested_state,
6837 .set_state = vmx_set_nested_state,
9a78e158 6838 .get_nested_state_pages = vmx_get_nested_state_pages,
02f5fb2e 6839 .write_log_dirty = nested_vmx_write_pml_buffer,
6840 .enable_evmcs = nested_enable_evmcs,
6841 .get_evmcs_version = nested_get_evmcs_version,
6842};
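/*
 * vmx_nested_ops is hooked into common x86 code via kvm_x86_ops.nested_ops
 * (set from vmx.c), letting arch-generic paths drive nested state
 * save/restore, nested event checks and eVMCS handling without knowing
 * VMX internals.
 */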