// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/objtool.h>
#include <linux/percpu.h>

#include <asm/debugreg.h>
#include <asm/mmu_context.h>

#include "cpuid.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "posted_intr.h"
#include "sgx.h"
#include "trace.h"
#include "vmx.h"
#include "x86.h"
#include "smm.h"

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);

#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK

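/*
 * CC() evaluates a consistency check and, when the check fails, records the
 * stringified expression via the KVM_NESTED_VMENTER_CONSISTENCY_CHECK
 * tracepoint so the exact check that rejected a nested VM-Enter can be
 * identified, e.g.:
 *
 *	if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
 *		return -EINVAL;
 */
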
/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap	(vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap	(vmx_bitmap[VMX_VMWRITE_BITMAP])

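/*
 * Bitmap semantics (per the SDM's VMCS-shadowing rules): a *set* bit means a
 * VMREAD/VMWRITE of that field encoding causes a VM-exit to L0; a *clear* bit
 * lets the CPU satisfy the access directly from the shadow VMCS.
 * init_vmcs_shadow_fields() starts with every bit set and clears bits only
 * for the fields KVM chooses to shadow.
 */
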
struct shadow_vmcs_field {
	u16	encoding;
	u16	offset;
};
static struct shadow_vmcs_field shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static struct shadow_vmcs_field shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

static void init_vmcs_shadow_fields(void)
{
	int i, j;

	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_only_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_only_fields ||
		     shadow_read_only_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_only_field %x\n",
			       field + 1);

		clear_bit(field, vmx_vmread_bitmap);
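		/*
		 * 64-bit fields have two encodings: the even encoding (bit 0
		 * clear) accesses the full field, the odd encoding (bit 0
		 * set) its high 32 bits.  64-bit hosts go through the even
		 * encoding only, so the "high" entry is dropped; 32-bit
		 * hosts keep it but point its offset at the upper u32 of
		 * the vmcs12 member.
		 */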
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_only_fields[j++] = entry;
	}
	max_shadow_read_only_fields = j;

	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_write_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_write_fields ||
		     shadow_read_write_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_write_field %x\n",
			       field + 1);

		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
			  field <= GUEST_TR_AR_BYTES,
			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");

		/*
		 * PML and the preemption timer can be emulated, but the
		 * processor cannot vmwrite to fields that don't exist
		 * on bare metal.
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		case GUEST_INTR_STATUS:
			if (!cpu_has_vmx_apicv())
				continue;
			break;
		default:
			break;
		}

		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */
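/*
 * Per those conventions: VMsucceed clears all six status flags, VMfailInvalid
 * sets only CF (there is no current VMCS to hold an error number), and
 * VMfailValid sets only ZF and stores the error number in the current VMCS's
 * VM-instruction error field.
 */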
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_CF);
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
				u32 vm_instruction_error)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_ZF);
	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
	/*
	 * We don't need to force sync to shadow VMCS because
	 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
	 * fields and thus must be synced.
	 */
	if (nested_vmx_is_evmptr12_set(to_vmx(vcpu)))
		to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;

	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * failValid writes the error number to the current VMCS, which
	 * can't be done if there isn't a current VMCS.
	 */
	if (vmx->nested.current_vmptr == INVALID_GPA &&
	    !nested_vmx_is_evmptr12_valid(vmx))
		return nested_vmx_failInvalid(vcpu);

	return nested_vmx_failValid(vcpu, vm_instruction_error);
}

static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: not to reset guest simply here. */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator);
}

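/*
 * VMX capability MSR convention (SDM appendix A): the low 32 bits give the
 * allowed 0-settings (a bit set there must be 1 in the control) and the high
 * 32 bits give the allowed 1-settings (a bit clear there must be 0).  E.g.
 * low = 0x16, high = 0xff fixes bits 1, 2 and 4 to 1 and bits 8-31 to 0.
 */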
static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return fixed_bits_valid(control, low, high);
}

static inline u64 vmx_control_msr(u32 low, u32 high)
{
	return low | ((u64)high << 32);
}

static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
	vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_KVM_HYPERV
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (nested_vmx_is_evmptr12_valid(vmx)) {
		kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
		vmx->nested.hv_evmcs = NULL;
	}

	vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;

	if (hv_vcpu) {
		hv_vcpu->nested.pa_page_gpa = INVALID_GPA;
		hv_vcpu->nested.vm_id = 0;
		hv_vcpu->nested.vp_id = 0;
	}
#endif
}

static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr)
{
#ifdef CONFIG_KVM_HYPERV
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	/*
	 * When Enlightened VMEntry is enabled on the calling CPU we treat the
	 * memory area pointed to by vmptr as an Enlightened VMCS (as there's
	 * no good way to distinguish it from a VMCS12) and we must not corrupt
	 * it by writing to the non-existent 'launch_state' field. The area
	 * doesn't have to be the currently active EVMCS on the calling CPU
	 * and there's nothing KVM has to do to transition it from 'active' to
	 * 'non-active' state. It is possible that the area will stay mapped
	 * as vmx->nested.hv_evmcs but this shouldn't be a problem.
	 */
	if (!guest_cpuid_has_evmcs(vcpu) ||
	    !evmptr_is_valid(nested_get_evmptr(vcpu)))
		return false;

	if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr)
		nested_release_evmcs(vcpu);

	return true;
#else
	return false;
#endif
}

static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
				     struct loaded_vmcs *prev)
{
	struct vmcs_host_state *dest, *src;

	if (unlikely(!vmx->guest_state_loaded))
		return;

	src = &prev->host_state;
	dest = &vmx->loaded_vmcs->host_state;

	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
	dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
	dest->ds_sel = src->ds_sel;
	dest->es_sel = src->es_sel;
#endif
}

static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *prev;
	int cpu;

	if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
		return;

	cpu = get_cpu();
	prev = vmx->loaded_vmcs;
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_load_vmcs(vcpu, cpu, prev);
	vmx_sync_vmcs_host_state(vmx, prev);
	put_cpu();

	vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET;

	/*
	 * All lazily updated registers will be reloaded from VMCS12 on both
	 * vmentry and vmexit.
	 */
	vcpu->arch.regs_dirty = 0;
}

/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
 */
static void free_nested(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
		vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
		return;

	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);

	vmx->nested.vmxon = false;
	vmx->nested.smm.vmxon = false;
	vmx->nested.vmxon_ptr = INVALID_GPA;
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = INVALID_GPA;
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	kfree(vmx->nested.cached_vmcs12);
	vmx->nested.cached_vmcs12 = NULL;
	kfree(vmx->nested.cached_shadow_vmcs12);
	vmx->nested.cached_shadow_vmcs12 = NULL;
	/*
	 * Unpin physical memory we referred to in the vmcs02. The APIC access
	 * page's backing page (yeah, confusing) shouldn't actually be
	 * accessed, and if it is written, the contents are irrelevant.
	 */
	kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false);
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
	vmx->nested.pi_desc = NULL;

	kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	nested_release_evmcs(vcpu);

	free_loaded_vmcs(&vmx->nested.vmcs02);
}

/*
 * Ensure that the current vmcs of the logical processor is the
 * vmcs01 of the vcpu before calling free_nested().
 */
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	vmx_leave_nested(vcpu);
	vcpu_put(vcpu);
}

#define EPTP_PA_MASK	GENMASK_ULL(51, 12)

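/*
 * Bits 51:12 of the EPTP hold the physical base of the EPT PML4 table (the
 * EP4TA); the low bits carry attributes (memory type, walk length, A/D
 * enable) that don't change which paging structures the EPTP names, so
 * cached roots are matched on the EP4TA alone.
 */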
static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
{
	return VALID_PAGE(root_hpa) &&
	       ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
}

static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
				       gpa_t addr)
{
	unsigned long roots = 0;
	uint i;
	struct kvm_mmu_root_info *cached_root;

	WARN_ON_ONCE(!mmu_is_nested(vcpu));

	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
		cached_root = &vcpu->arch.mmu->prev_roots[i];

		if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
					    eptp))
			roots |= KVM_MMU_ROOT_PREVIOUS(i);
	}
	if (roots)
		kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots);
}

static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long exit_qualification;
	u32 vm_exit_reason;

	if (vmx->nested.pml_full) {
		vm_exit_reason = EXIT_REASON_PML_FULL;
		vmx->nested.pml_full = false;

		/*
		 * It should be impossible to trigger a nested PML Full VM-Exit
		 * for anything other than an EPT Violation from L2.  KVM *can*
		 * trigger nEPT page fault injection in response to an EPT
		 * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT
		 * tables also changed, but KVM should not treat EPT Misconfig
		 * VM-Exits as writes.
		 */
		WARN_ON_ONCE(vmx->exit_reason.basic != EXIT_REASON_EPT_VIOLATION);

		/*
		 * PML Full and EPT Violation VM-Exits both use bit 12 to report
		 * "NMI unblocking due to IRET", i.e. the bit can be propagated
		 * as-is from the original EXIT_QUALIFICATION.
		 */
		exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI;
	} else {
		if (fault->error_code & PFERR_RSVD_MASK) {
			vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
			exit_qualification = 0;
		} else {
			exit_qualification = fault->exit_qualification;
			exit_qualification |= vmx_get_exit_qual(vcpu) &
					      (EPT_VIOLATION_GVA_IS_VALID |
					       EPT_VIOLATION_GVA_TRANSLATED);
			vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
		}

		/*
		 * Although the caller (kvm_inject_emulated_page_fault) would
		 * have already synced the faulting address in the shadow EPT
		 * tables for the current EPTP12, we also need to sync it for
		 * any other cached EPTP02s based on the same EP4TA, since the
		 * TLB associates mappings to the EP4TA rather than the full EPTP.
		 */
		nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
					   fault->address);
	}

	nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
	vmcs12->guest_physical_address = fault->address;
}

static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT;
	int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps);

	kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
				nested_ept_ad_enabled(vcpu),
				nested_ept_get_eptp(vcpu));
}

static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	nested_ept_new_eptp(vcpu);
	vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp;
	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
	vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;

	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}

static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

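/*
 * PFEC mask/match semantics: when the #PF bit is set in the exception
 * bitmap, a page fault exits to L1 iff (error_code & mask) == match; when
 * the bit is clear the polarity flips and the fault exits iff the comparison
 * fails.  E.g. mask = match = 0 with the #PF bit set intercepts every #PF.
 */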
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code)
{
	bool inequality, bit;

	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
	inequality =
		(error_code & vmcs12->page_fault_error_code_mask) !=
		 vmcs12->page_fault_error_code_match;
	return inequality ^ bit;
}

static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
					   u32 error_code)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	/*
	 * Drop bits 31:16 of the error code when performing the #PF mask+match
	 * check.  All VMCS fields involved are 32 bits, but Intel CPUs never
	 * set bits 31:16 and VMX disallows setting bits 31:16 in the injected
	 * error code.  Including the to-be-dropped bits in the check might
	 * result in an "impossible" or missed exit from L1's perspective.
	 */
	if (vector == PF_VECTOR)
		return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code);

	return (vmcs12->exception_bitmap & (1u << vector));
}

static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
	    CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
		return -EINVAL;

	return 0;
}

/*
 * For x2APIC MSRs, ignore the vmcs01 bitmap.  L1 can enable x2APIC without L1
 * itself utilizing x2APIC.  All MSRs were previously set to be intercepted,
 * only the "disable intercept" case needs to be handled.
 */
static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1,
							unsigned long *msr_bitmap_l0,
							u32 msr, int type)
{
	if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr))
		vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr);

	if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr))
		vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr);
}

static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
{
	int msr;

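	/*
	 * MSR-bitmap layout: the read bitmap for MSRs 0x0-0x1fff occupies the
	 * first 0x400 bytes of the page and the matching write bitmap starts
	 * 0x800 bytes in, hence the mirrored store at
	 * "word + (0x800 / sizeof(long))" below.
	 */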
	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
		unsigned word = msr / BITS_PER_LONG;

		msr_bitmap[word] = ~0;
		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
	}
}

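/*
 * L0 must intercept an MSR access from L2 if *either* hypervisor wants it
 * intercepted, so the merged vmcs02 bit below is the logical OR of the
 * vmcs01 (L0) and vmcs12 (L1) bits.
 */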
#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw)					\
static inline								\
void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx,		\
					 unsigned long *msr_bitmap_l1,	\
					 unsigned long *msr_bitmap_l0, u32 msr)	\
{									\
	if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) ||	\
	    vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr))		\
		vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr);		\
	else								\
		vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr);		\
}
BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
BUILD_NVMX_MSR_INTERCEPT_HELPER(write)

static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
						    unsigned long *msr_bitmap_l1,
						    unsigned long *msr_bitmap_l0,
						    u32 msr, int types)
{
	if (types & MSR_TYPE_R)
		nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
						  msr_bitmap_l0, msr);
	if (types & MSR_TYPE_W)
		nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
						   msr_bitmap_l0, msr);
}

/*
 * Merge L0's and L1's MSR bitmap, return false to indicate that
 * we do not use the hardware.
 */
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int msr;
	unsigned long *msr_bitmap_l1;
	unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
	struct kvm_host_map *map = &vmx->nested.msr_bitmap_map;

	/* Nothing to do if the MSR bitmap is not in use. */
	if (!cpu_has_vmx_msr_bitmap() ||
	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return false;

	/*
	 * MSR bitmap update can be skipped when:
	 * - MSR bitmap for L1 hasn't changed.
	 * - Nested hypervisor (L1) is attempting to launch the same L2 as
	 *   before.
	 * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature
	 *   and tells KVM (L0) there were no changes in MSR bitmap for L2.
	 */
	if (!vmx->nested.force_msr_bitmap_recalc) {
		struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);

		if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap &&
		    evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
			return true;
	}

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
		return false;

	msr_bitmap_l1 = (unsigned long *)map->hva;

	/*
	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
	 * 4-byte writes on 32-bit systems) up front to enable intercepts for
	 * the x2APIC MSR range and selectively toggle those relevant to L2.
	 */
	enable_x2apic_msr_intercepts(msr_bitmap_l0);

	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
		if (nested_cpu_has_apic_reg_virt(vmcs12)) {
			/*
			 * L0 need not intercept reads for MSRs between 0x800
			 * and 0x8ff, it just lets the processor take the value
			 * from the virtual-APIC page; take those 256 bits
			 * directly from the L1 bitmap.
			 */
			for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
				unsigned word = msr / BITS_PER_LONG;

				msr_bitmap_l0[word] = msr_bitmap_l1[word];
			}
		}

		nested_vmx_disable_intercept_for_x2apic_msr(
			msr_bitmap_l1, msr_bitmap_l0,
			X2APIC_MSR(APIC_TASKPRI),
			MSR_TYPE_R | MSR_TYPE_W);

		if (nested_cpu_has_vid(vmcs12)) {
			nested_vmx_disable_intercept_for_x2apic_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_EOI),
				MSR_TYPE_W);
			nested_vmx_disable_intercept_for_x2apic_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_SELF_IPI),
				MSR_TYPE_W);
		}
	}

	/*
	 * Always check vmcs01's bitmap to honor userspace MSR filters and any
	 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
	 */
#ifdef CONFIG_X86_64
	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_FS_BASE, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_GS_BASE, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
#endif
	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_PRED_CMD, MSR_TYPE_W);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_FLUSH_CMD, MSR_TYPE_W);

	kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);

	vmx->nested.force_msr_bitmap_recalc = false;

	return true;
}

static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == INVALID_GPA)
		return;

	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
			      VMCS12_SIZE);
}

static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == INVALID_GPA)
		return;

	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
			       VMCS12_SIZE);
}

/*
 * In nested virtualization, check if L1 has set
 * VM_EXIT_ACK_INTR_ON_EXIT
 */
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
{
	return get_vmcs12(vcpu)->vm_exit_controls &
		VM_EXIT_ACK_INTR_ON_EXIT;
}

static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
		return -EINVAL;
	else
		return 0;
}

static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has_posted_intr(vmcs12))
		return 0;

	/*
	 * If virtualize x2apic mode is enabled,
	 * virtualize apic access must be disabled.
	 */
	if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	       nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
		return -EINVAL;

	/*
	 * If virtual interrupt delivery is enabled,
	 * we must exit on external interrupts.
	 */
	if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
		return -EINVAL;

	/*
	 * Bits 15:8 of posted_intr_nv must be zero; the descriptor address
	 * itself has already been checked in nested_get_vmcs12_pages().
	 *
	 * Bits 5:0 of posted_intr_desc_addr must be zero.
	 */
	if (nested_cpu_has_posted_intr(vmcs12) &&
	   (CC(!nested_cpu_has_vid(vmcs12)) ||
	    CC(!nested_exit_intr_ack_set(vcpu)) ||
	    CC((vmcs12->posted_intr_nv & 0xff00)) ||
	    CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
		return -EINVAL;

	/* tpr shadow is needed by all apicv features. */
	if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
				       u32 count, u64 addr)
{
	if (count == 0)
		return 0;

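	/*
	 * Each vmx_msr_entry is 16 bytes, so the list base must be 16-byte
	 * aligned and the last byte of the final entry must still be a legal
	 * guest physical address.
	 */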
	if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
	    !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
						     struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_load_count,
					   vmcs12->vm_exit_msr_load_addr)) ||
	    CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_store_count,
					   vmcs12->vm_exit_msr_store_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
						      struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_entry_msr_load_count,
					   vmcs12->vm_entry_msr_load_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
					 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_pml(vmcs12))
		return 0;

	if (CC(!nested_cpu_has_ept(vmcs12)) ||
	    CC(!page_address_valid(vcpu, vmcs12->pml_address)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
							struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
							 struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
	    CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
				       struct vmx_msr_entry *e)
{
	/* x2APIC MSR accesses are not allowed */
	if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
		return -EINVAL;
	if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
	    CC(e->index == MSR_IA32_UCODE_REV))
		return -EINVAL;
	if (CC(e->reserved != 0))
		return -EINVAL;
	return 0;
}

static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
				     struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_FS_BASE) ||
	    CC(e->index == MSR_GS_BASE) ||
	    CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
				      struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				       vmx->nested.msrs.misc_high);

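	/*
	 * MSR_IA32_VMX_MISC[27:25] encodes N, and the recommended maximum
	 * for each load/store list is 512 * (N + 1) entries (SDM appendix
	 * A.6), e.g. the common N = 0 allows 512 entries per list.
	 */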
	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}

/*
 * Load guest's/host's msr at nested entry/exit.
 * return 0 for success, entry index for failure.
 *
 * One of the failure modes for MSR load/store is when a list exceeds the
 * virtual hardware's capacity. To maintain compatibility with hardware inasmuch
 * as possible, process all valid entries before failing rather than precheck
 * for a capacity violation.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			goto fail;

		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		if (kvm_set_msr(vcpu, e.index, e.value)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	/* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
	return i + 1;
}

static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
					    u32 msr_index,
					    u64 *data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If the L0 hypervisor stored a more accurate value for the TSC that
	 * does not include the time taken for emulation of the L2->L1
	 * VM-exit in L0, use the more accurate value.
	 */
	if (msr_index == MSR_IA32_TSC) {
		int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
						    MSR_IA32_TSC);

		if (i >= 0) {
			u64 val = vmx->msr_autostore.guest.val[i].value;

			*data = kvm_read_l1_tsc(vcpu, val);
			return true;
		}
	}

	if (kvm_get_msr(vcpu, msr_index, data)) {
		pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
				     msr_index);
		return false;
	}
	return true;
}

static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
				     struct vmx_msr_entry *e)
{
	if (kvm_vcpu_read_guest(vcpu,
				gpa + i * sizeof(*e),
				e, 2 * sizeof(u32))) {
		pr_debug_ratelimited(
			"%s cannot read MSR entry (%u, 0x%08llx)\n",
			__func__, i, gpa + i * sizeof(*e));
		return false;
	}
	if (nested_vmx_store_msr_check(vcpu, e)) {
		pr_debug_ratelimited(
			"%s check failed (%u, 0x%x, 0x%x)\n",
			__func__, i, e->index, e->reserved);
		return false;
	}
	return true;
}

static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u64 data;
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			return -EINVAL;

		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return -EINVAL;

		if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
			return -EINVAL;

		if (kvm_vcpu_write_guest(vcpu,
					 gpa + i * sizeof(e) +
					     offsetof(struct vmx_msr_entry, value),
					 &data, sizeof(data))) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, data);
			return -EINVAL;
		}
	}
	return 0;
}

static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u32 count = vmcs12->vm_exit_msr_store_count;
	u64 gpa = vmcs12->vm_exit_msr_store_addr;
	struct vmx_msr_entry e;
	u32 i;

	for (i = 0; i < count; i++) {
		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return false;

		if (e.index == msr_index)
			return true;
	}
	return false;
}

static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
					   u32 msr_index)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
	bool in_vmcs12_store_list;
	int msr_autostore_slot;
	bool in_autostore_list;
	int last;

	msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
	in_autostore_list = msr_autostore_slot >= 0;
	in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);

	if (in_vmcs12_store_list && !in_autostore_list) {
		if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
			/*
			 * Emulated VMEntry does not fail here. Instead a less
			 * accurate value will be returned by
			 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
			 * instead of reading the value from the vmcs02 VMExit
			 * MSR-store area.
			 */
			pr_warn_ratelimited(
				"Not enough msr entries in msr_autostore. Can't add msr %x\n",
				msr_index);
			return;
		}
		last = autostore->nr++;
		autostore->val[last].index = msr_index;
	} else if (!in_vmcs12_store_list && in_autostore_list) {
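		/* Swap-remove: move the last entry into the vacated slot. */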
		last = --autostore->nr;
		autostore->val[msr_autostore_slot] = autostore->val[last];
	}
}

/*
 * Load guest's/host's cr3 at nested entry/exit.  @nested_ept is true if we are
 * emulating VM-Entry into a guest with EPT enabled.  On failure, the expected
 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
 * @entry_failure_code.
 */
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
			       bool nested_ept, bool reload_pdptrs,
			       enum vm_entry_failure_code *entry_failure_code)
{
	if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/*
	 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
	 * must not be dereferenced.
	 */
	if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
	    CC(!load_pdptrs(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_PDPTE;
		return -EINVAL;
	}

	vcpu->arch.cr3 = cr3;
	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);

	/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
	kvm_init_mmu(vcpu);

	if (!nested_ept)
		kvm_mmu_new_pgd(vcpu, cr3);

	return 0;
}

/*
 * Returns true if KVM is able to configure the CPU to tag TLB entries
 * populated by L2 differently from TLB entries populated by L1.
 *
 * If L0 uses EPT, L1 and L2 run with different EPTPs because guest_mode
 * is part of kvm_mmu_page_role, and thus their TLB entries are tagged
 * with different EPTPs.
 *
 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
 * with different VPIDs (L1 entries are tagged with vmx->vpid
 * while L2 entries are tagged with vmx->nested.vpid02).
 */
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	return enable_ept ||
	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}

static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
					    struct vmcs12 *vmcs12,
					    bool is_vmenter)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Handle pending Hyper-V TLB flush requests */
	kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept);

	/*
	 * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
	 * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
	 * full TLB flush from the guest's perspective.  This is required even
	 * if VPID is disabled in the host as KVM may need to synchronize the
	 * MMU in response to the guest TLB flush.
	 *
	 * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
	 * EPT is a special snowflake, as guest-physical mappings aren't
	 * flushed on VPID invalidations, including VM-Enter or VM-Exit with
	 * VPID disabled.  As a result, KVM _never_ needs to sync nEPT
	 * entries on VM-Enter because L1 can't rely on VM-Enter to flush
	 * those mappings.
	 */
	if (!nested_cpu_has_vpid(vmcs12)) {
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/* L2 should never have a VPID if VPID is disabled. */
	WARN_ON(!enable_vpid);

	/*
	 * VPID is enabled and in use by vmcs12.  If vpid12 is changing, then
	 * emulate a guest TLB flush as KVM does not track vpid12 history nor
	 * is the VPID incorporated into the MMU context.  I.e. KVM must assume
	 * that the new vpid12 has never been used and thus represents a new
	 * guest ASID that cannot have entries in the TLB.
	 */
	if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
		vmx->nested.last_vpid = vmcs12->virtual_processor_id;
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/*
	 * If VPID is enabled, used by vmcs12, and vpid12 is not changing but
	 * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
	 * KVM was unable to allocate a VPID for L2, flush the current context
	 * as the effective ASID is common to both L1 and L2.
	 */
	if (!nested_has_guest_tlb_tag(vcpu))
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}

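/*
 * E.g. is_bitwise_subset(0x7, 0x5, -1ULL) is true: within @mask, @subset
 * sets no bit that @superset does not already have set.
 */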
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
	superset &= mask;
	subset &= mask;

	return (superset | subset) == superset;
}

static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
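	/*
	 * Feature bits here are bit 49 (dual-monitor treatment of SMIs/SMM),
	 * bit 54 (INS/OUTS exit information) and bit 55 ("true" control
	 * MSRs); bit 48 (32-bit physical-address limit) is special-cased
	 * below.
	 */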
	const u64 feature_and_reserved =
		/* feature (except bit 48; see below) */
		BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
		/* reserved */
		BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
	u64 vmx_basic = vmcs_config.nested.basic;

	if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
		return -EINVAL;

	/*
	 * KVM does not emulate a version of VMX that constrains physical
	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
	 */
	if (data & BIT_ULL(48))
		return -EINVAL;

	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
	    vmx_basic_vmcs_revision_id(data))
		return -EINVAL;

	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
		return -EINVAL;

	vmx->nested.msrs.basic = data;
	return 0;
}

static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index,
				u32 **low, u32 **high)
{
	switch (msr_index) {
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
		*low = &msrs->pinbased_ctls_low;
		*high = &msrs->pinbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
		*low = &msrs->procbased_ctls_low;
		*high = &msrs->procbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
		*low = &msrs->exit_ctls_low;
		*high = &msrs->exit_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
		*low = &msrs->entry_ctls_low;
		*high = &msrs->entry_ctls_high;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*low = &msrs->secondary_ctls_low;
		*high = &msrs->secondary_ctls_high;
		break;
	default:
		BUG();
	}
}

static int
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u32 *lowp, *highp;
	u64 supported;

	vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);

	supported = vmx_control_msr(*lowp, *highp);

	/* Check must-be-1 bits are still 1. */
	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
		return -EINVAL;

	/* Check must-be-0 bits are still 0. */
	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
		return -EINVAL;

	vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
	*lowp = data;
	*highp = data >> 32;
	return 0;
}

static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved_bits =
		/* feature */
		BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
		BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
		/* reserved */
		GENMASK_ULL(13, 9) | BIT_ULL(31);
	u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
				       vmcs_config.nested.misc_high);

	if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
		return -EINVAL;

	if ((vmx->nested.msrs.pinbased_ctls_high &
	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
	    vmx_misc_preemption_timer_rate(data) !=
	    vmx_misc_preemption_timer_rate(vmx_misc))
		return -EINVAL;

	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
		return -EINVAL;

	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
		return -EINVAL;

	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
		return -EINVAL;

	vmx->nested.msrs.misc_low = data;
	vmx->nested.msrs.misc_high = data >> 32;

	return 0;
}

static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{
	u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps,
					       vmcs_config.nested.vpid_caps);

	/* Every bit is either reserved or a feature bit. */
	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
		return -EINVAL;

	vmx->nested.msrs.ept_caps = data;
	vmx->nested.msrs.vpid_caps = data >> 32;
	return 0;
}

static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index)
{
	switch (msr_index) {
	case MSR_IA32_VMX_CR0_FIXED0:
		return &msrs->cr0_fixed0;
	case MSR_IA32_VMX_CR4_FIXED0:
		return &msrs->cr4_fixed0;
	default:
		BUG();
	}
}

static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index);

	/*
	 * Set bits (i.e. bits that "must be 1" during VMX operation)
	 * must remain 1 in the restored value.
	 */
1404 | if (!is_bitwise_subset(data, *msr, -1ULL)) | |
1405 | return -EINVAL; | |
1406 | ||
f8ae08f9 | 1407 | *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data; |
55d2375e SC |
1408 | return 0; |
1409 | } | |
1410 | ||
1411 | /* | |
1412 | * Called when userspace is restoring VMX MSRs. | |
1413 | * | |
1414 | * Returns 0 on success, non-0 otherwise. | |
1415 | */ | |
1416 | int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |
1417 | { | |
1418 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
1419 | ||
1420 | /* | |
1421 | * Don't allow changes to the VMX capability MSRs while the vCPU | |
1422 | * is in VMX operation. | |
1423 | */ | |
1424 | if (vmx->nested.vmxon) | |
1425 | return -EBUSY; | |
1426 | ||
1427 | switch (msr_index) { | |
1428 | case MSR_IA32_VMX_BASIC: | |
1429 | return vmx_restore_vmx_basic(vmx, data); | |
1430 | case MSR_IA32_VMX_PINBASED_CTLS: | |
1431 | case MSR_IA32_VMX_PROCBASED_CTLS: | |
1432 | case MSR_IA32_VMX_EXIT_CTLS: | |
1433 | case MSR_IA32_VMX_ENTRY_CTLS: | |
1434 | /* | |
1435 | * The "non-true" VMX capability MSRs are generated from the | |
1436 | * "true" MSRs, so we do not support restoring them directly. | |
1437 | * | |
1438 | * If userspace wants to emulate VMX_BASIC[55]=0, userspace | |
1439 | * should restore the "true" MSRs with the must-be-1 bits | |
1440 | * set according to the SDM Vol. 3, Appendix A.2, "RESERVED CONTROLS AND | |
1441 | * DEFAULT SETTINGS". | |
1442 | */ | |
1443 | return -EINVAL; | |
1444 | case MSR_IA32_VMX_TRUE_PINBASED_CTLS: | |
1445 | case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: | |
1446 | case MSR_IA32_VMX_TRUE_EXIT_CTLS: | |
1447 | case MSR_IA32_VMX_TRUE_ENTRY_CTLS: | |
1448 | case MSR_IA32_VMX_PROCBASED_CTLS2: | |
1449 | return vmx_restore_control_msr(vmx, msr_index, data); | |
1450 | case MSR_IA32_VMX_MISC: | |
1451 | return vmx_restore_vmx_misc(vmx, data); | |
1452 | case MSR_IA32_VMX_CR0_FIXED0: | |
1453 | case MSR_IA32_VMX_CR4_FIXED0: | |
1454 | return vmx_restore_fixed0_msr(vmx, msr_index, data); | |
1455 | case MSR_IA32_VMX_CR0_FIXED1: | |
1456 | case MSR_IA32_VMX_CR4_FIXED1: | |
1457 | /* | |
1458 | * These MSRs are generated based on the vCPU's CPUID, so we | |
1459 | * do not support restoring them directly. | |
1460 | */ | |
1461 | return -EINVAL; | |
1462 | case MSR_IA32_VMX_EPT_VPID_CAP: | |
1463 | return vmx_restore_vmx_ept_vpid_cap(vmx, data); | |
1464 | case MSR_IA32_VMX_VMCS_ENUM: | |
1465 | vmx->nested.msrs.vmcs_enum = data; | |
1466 | return 0; | |
e8a70bd4 | 1467 | case MSR_IA32_VMX_VMFUNC: |
f8ae08f9 | 1468 | if (data & ~vmcs_config.nested.vmfunc_controls) |
e8a70bd4 PB |
1469 | return -EINVAL; |
1470 | vmx->nested.msrs.vmfunc_controls = data; | |
1471 | return 0; | |
55d2375e SC |
1472 | default: |
1473 | /* | |
1474 | * The rest of the VMX capability MSRs do not support restore. | |
1475 | */ | |
1476 | return -EINVAL; | |
1477 | } | |
1478 | } | |
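/*
 * One plausible restore flow from userspace (a sketch; the exact
 * sequencing is up to the VMM): read the capability MSRs with
 * KVM_GET_MSRS on the source, write them back with KVM_SET_MSRS on
 * the destination, and only then let the vCPU run and execute VMXON,
 * as the vmx->nested.vmxon check above rejects any later change.
 */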
1479 | ||
1480 | /* Returns 0 on success, non-0 otherwise. */ | |
1481 | int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) | |
1482 | { | |
1483 | switch (msr_index) { | |
1484 | case MSR_IA32_VMX_BASIC: | |
1485 | *pdata = msrs->basic; | |
1486 | break; | |
1487 | case MSR_IA32_VMX_TRUE_PINBASED_CTLS: | |
1488 | case MSR_IA32_VMX_PINBASED_CTLS: | |
1489 | *pdata = vmx_control_msr( | |
1490 | msrs->pinbased_ctls_low, | |
1491 | msrs->pinbased_ctls_high); | |
1492 | if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) | |
1493 | *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; | |
1494 | break; | |
1495 | case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: | |
1496 | case MSR_IA32_VMX_PROCBASED_CTLS: | |
1497 | *pdata = vmx_control_msr( | |
1498 | msrs->procbased_ctls_low, | |
1499 | msrs->procbased_ctls_high); | |
1500 | if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) | |
1501 | *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; | |
1502 | break; | |
1503 | case MSR_IA32_VMX_TRUE_EXIT_CTLS: | |
1504 | case MSR_IA32_VMX_EXIT_CTLS: | |
1505 | *pdata = vmx_control_msr( | |
1506 | msrs->exit_ctls_low, | |
1507 | msrs->exit_ctls_high); | |
1508 | if (msr_index == MSR_IA32_VMX_EXIT_CTLS) | |
1509 | *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; | |
1510 | break; | |
1511 | case MSR_IA32_VMX_TRUE_ENTRY_CTLS: | |
1512 | case MSR_IA32_VMX_ENTRY_CTLS: | |
1513 | *pdata = vmx_control_msr( | |
1514 | msrs->entry_ctls_low, | |
1515 | msrs->entry_ctls_high); | |
1516 | if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) | |
1517 | *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; | |
1518 | break; | |
1519 | case MSR_IA32_VMX_MISC: | |
1520 | *pdata = vmx_control_msr( | |
1521 | msrs->misc_low, | |
1522 | msrs->misc_high); | |
1523 | break; | |
1524 | case MSR_IA32_VMX_CR0_FIXED0: | |
1525 | *pdata = msrs->cr0_fixed0; | |
1526 | break; | |
1527 | case MSR_IA32_VMX_CR0_FIXED1: | |
1528 | *pdata = msrs->cr0_fixed1; | |
1529 | break; | |
1530 | case MSR_IA32_VMX_CR4_FIXED0: | |
1531 | *pdata = msrs->cr4_fixed0; | |
1532 | break; | |
1533 | case MSR_IA32_VMX_CR4_FIXED1: | |
1534 | *pdata = msrs->cr4_fixed1; | |
1535 | break; | |
1536 | case MSR_IA32_VMX_VMCS_ENUM: | |
1537 | *pdata = msrs->vmcs_enum; | |
1538 | break; | |
1539 | case MSR_IA32_VMX_PROCBASED_CTLS2: | |
1540 | *pdata = vmx_control_msr( | |
1541 | msrs->secondary_ctls_low, | |
1542 | msrs->secondary_ctls_high); | |
1543 | break; | |
1544 | case MSR_IA32_VMX_EPT_VPID_CAP: | |
1545 | *pdata = msrs->ept_caps | | |
1546 | ((u64)msrs->vpid_caps << 32); | |
1547 | break; | |
1548 | case MSR_IA32_VMX_VMFUNC: | |
1549 | *pdata = msrs->vmfunc_controls; | |
1550 | break; | |
1551 | default: | |
1552 | return 1; | |
1553 | } | |
1554 | ||
1555 | return 0; | |
1556 | } | |
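/*
 * The non-true control MSRs must always report the "default1" class
 * of bits as must-be-1, hence the unconditional OR of the
 * *_ALWAYSON_WITHOUT_TRUE_MSR constants into the low dword when the
 * non-true variants are read.
 */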
1557 | ||
1558 | /* | |
fadcead0 SC |
1559 | * Copy the writable VMCS shadow fields back to the VMCS12, in case they have |
1560 | * been modified by the L1 guest. Note, "writable" in this context means | |
1561 | * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of | |
1562 | * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only" | |
1563 | * VM-exit information fields (which are actually writable if the vCPU is | |
1564 | * configured to support "VMWRITE to any supported field in the VMCS"). | |
55d2375e SC |
1565 | */ |
1566 | static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) | |
1567 | { | |
55d2375e | 1568 | struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; |
fadcead0 | 1569 | struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); |
1c6f0b47 SC |
1570 | struct shadow_vmcs_field field; |
1571 | unsigned long val; | |
fadcead0 | 1572 | int i; |
55d2375e | 1573 | |
88dddc11 PB |
1574 | if (WARN_ON(!shadow_vmcs)) |
1575 | return; | |
1576 | ||
55d2375e SC |
1577 | preempt_disable(); |
1578 | ||
1579 | vmcs_load(shadow_vmcs); | |
1580 | ||
fadcead0 SC |
1581 | for (i = 0; i < max_shadow_read_write_fields; i++) { |
1582 | field = shadow_read_write_fields[i]; | |
1c6f0b47 SC |
1583 | val = __vmcs_readl(field.encoding); |
1584 | vmcs12_write_any(vmcs12, field.encoding, field.offset, val); | |
55d2375e SC |
1585 | } |
1586 | ||
1587 | vmcs_clear(shadow_vmcs); | |
1588 | vmcs_load(vmx->loaded_vmcs->vmcs); | |
1589 | ||
1590 | preempt_enable(); | |
1591 | } | |
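/*
 * Preemption is disabled across the sequence above because vmcs_load()
 * and vmcs_clear() operate on the current CPU's working-VMCS pointer;
 * migrating to another CPU between loading shadow_vmcs and restoring
 * vmx->loaded_vmcs->vmcs would leave the wrong VMCS active.
 */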
1592 | ||
1593 | static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) | |
1594 | { | |
1c6f0b47 | 1595 | const struct shadow_vmcs_field *fields[] = { |
55d2375e SC |
1596 | shadow_read_write_fields, |
1597 | shadow_read_only_fields | |
1598 | }; | |
1599 | const int max_fields[] = { | |
1600 | max_shadow_read_write_fields, | |
1601 | max_shadow_read_only_fields | |
1602 | }; | |
55d2375e | 1603 | struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; |
1c6f0b47 SC |
1604 | struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); |
1605 | struct shadow_vmcs_field field; | |
1606 | unsigned long val; | |
1607 | int i, q; | |
55d2375e | 1608 | |
88dddc11 PB |
1609 | if (WARN_ON(!shadow_vmcs)) |
1610 | return; | |
1611 | ||
55d2375e SC |
1612 | vmcs_load(shadow_vmcs); |
1613 | ||
1614 | for (q = 0; q < ARRAY_SIZE(fields); q++) { | |
1615 | for (i = 0; i < max_fields[q]; i++) { | |
1616 | field = fields[q][i]; | |
1c6f0b47 SC |
1617 | val = vmcs12_read_any(vmcs12, field.encoding, |
1618 | field.offset); | |
1619 | __vmcs_writel(field.encoding, val); | |
55d2375e SC |
1620 | } |
1621 | } | |
1622 | ||
1623 | vmcs_clear(shadow_vmcs); | |
1624 | vmcs_load(vmx->loaded_vmcs->vmcs); | |
1625 | } | |
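/*
 * Both the read/write and the read-only tables are flushed into the
 * shadow VMCS so that L1's VMREADs of any shadowed field, including
 * the exit information fields, are satisfied in hardware without a
 * VM-exit to L0.
 */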
1626 | ||
d6bf71a1 | 1627 | static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields) |
55d2375e | 1628 | { |
b4f69df0 | 1629 | #ifdef CONFIG_KVM_HYPERV |
55d2375e | 1630 | struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; |
c98842b2 | 1631 | struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); |
38edb452 | 1632 | struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu); |
55d2375e SC |
1633 | |
1634 | /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ | |
1635 | vmcs12->tpr_threshold = evmcs->tpr_threshold; | |
1636 | vmcs12->guest_rip = evmcs->guest_rip; | |
1637 | ||
38edb452 VK |
1638 | if (unlikely(!(hv_clean_fields & |
1639 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) { | |
1640 | hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page; | |
1641 | hv_vcpu->nested.vm_id = evmcs->hv_vm_id; | |
1642 | hv_vcpu->nested.vp_id = evmcs->hv_vp_id; | |
1643 | } | |
1644 | ||
d6bf71a1 | 1645 | if (unlikely(!(hv_clean_fields & |
55d2375e SC |
1646 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { |
1647 | vmcs12->guest_rsp = evmcs->guest_rsp; | |
1648 | vmcs12->guest_rflags = evmcs->guest_rflags; | |
1649 | vmcs12->guest_interruptibility_info = | |
1650 | evmcs->guest_interruptibility_info; | |
c9d31986 VK |
1651 | /* |
1652 | * Not present in struct vmcs12: | |
1653 | * vmcs12->guest_ssp = evmcs->guest_ssp; | |
1654 | */ | |
55d2375e SC |
1655 | } |
1656 | ||
d6bf71a1 | 1657 | if (unlikely(!(hv_clean_fields & |
55d2375e SC |
1658 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { |
1659 | vmcs12->cpu_based_vm_exec_control = | |
1660 | evmcs->cpu_based_vm_exec_control; | |
1661 | } | |
1662 | ||
d6bf71a1 | 1663 | if (unlikely(!(hv_clean_fields & |
f9bc5227 | 1664 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) { |
55d2375e SC |
1665 | vmcs12->exception_bitmap = evmcs->exception_bitmap; |
1666 | } | |
1667 | ||
d6bf71a1 | 1668 | if (unlikely(!(hv_clean_fields & |
55d2375e SC |
1669 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { |
1670 | vmcs12->vm_entry_controls = evmcs->vm_entry_controls; | |
1671 | } | |
1672 | ||
d6bf71a1 | 1673 | if (unlikely(!(hv_clean_fields & |
55d2375e SC |
1674 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { |
1675 | vmcs12->vm_entry_intr_info_field = | |
1676 | evmcs->vm_entry_intr_info_field; | |
1677 | vmcs12->vm_entry_exception_error_code = | |
1678 | evmcs->vm_entry_exception_error_code; | |
1679 | vmcs12->vm_entry_instruction_len = | |
1680 | evmcs->vm_entry_instruction_len; | |
1681 | } | |
1682 | ||
d6bf71a1 | 1683 | if (unlikely(!(hv_clean_fields & |
55d2375e SC |
1684 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { |
1685 | vmcs12->host_ia32_pat = evmcs->host_ia32_pat; | |
1686 | vmcs12->host_ia32_efer = evmcs->host_ia32_efer; | |
1687 | vmcs12->host_cr0 = evmcs->host_cr0; | |
1688 | vmcs12->host_cr3 = evmcs->host_cr3; | |
1689 | vmcs12->host_cr4 = evmcs->host_cr4; | |
1690 | vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; | |
1691 | vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip; | |
1692 | vmcs12->host_rip = evmcs->host_rip; | |
1693 | vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; | |
1694 | vmcs12->host_es_selector = evmcs->host_es_selector; | |
1695 | vmcs12->host_cs_selector = evmcs->host_cs_selector; | |
1696 | vmcs12->host_ss_selector = evmcs->host_ss_selector; | |
1697 | vmcs12->host_ds_selector = evmcs->host_ds_selector; | |
1698 | vmcs12->host_fs_selector = evmcs->host_fs_selector; | |
1699 | vmcs12->host_gs_selector = evmcs->host_gs_selector; | |
1700 | vmcs12->host_tr_selector = evmcs->host_tr_selector; | |
c9d31986 VK |
1701 | vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl; |
1702 | /* | |
1703 | * Not present in struct vmcs12: | |
1704 | * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet; | |
1705 | * vmcs12->host_ssp = evmcs->host_ssp; | |
1706 | * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr; | |
1707 | */ | |
55d2375e SC |
1708 | } |
1709 | ||
d6bf71a1 | 1710 | if (unlikely(!(hv_clean_fields & |
f9bc5227 | 1711 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) { |
55d2375e SC |
1712 | vmcs12->pin_based_vm_exec_control = |
1713 | evmcs->pin_based_vm_exec_control; | |
1714 | vmcs12->vm_exit_controls = evmcs->vm_exit_controls; | |
1715 | vmcs12->secondary_vm_exec_control = | |
1716 | evmcs->secondary_vm_exec_control; | |
1717 | } | |
1718 | ||
d6bf71a1 | 1719 | if (unlikely(!(hv_clean_fields & |
55d2375e SC |
1720 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { |
1721 | vmcs12->io_bitmap_a = evmcs->io_bitmap_a; | |
1722 | vmcs12->io_bitmap_b = evmcs->io_bitmap_b; | |
1723 | } | |
1724 | ||
d6bf71a1 | 1725 | if (unlikely(!(hv_clean_fields & |
55d2375e SC |
1726 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { |
1727 | vmcs12->msr_bitmap = evmcs->msr_bitmap; | |
1728 | } | |
1729 | ||
d6bf71a1 | 1730 | if (unlikely(!(hv_clean_fields & |
55d2375e SC |
1731 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { |
1732 | vmcs12->guest_es_base = evmcs->guest_es_base; | |
1733 | vmcs12->guest_cs_base = evmcs->guest_cs_base; | |
1734 | vmcs12->guest_ss_base = evmcs->guest_ss_base; | |
1735 | vmcs12->guest_ds_base = evmcs->guest_ds_base; | |
1736 | vmcs12->guest_fs_base = evmcs->guest_fs_base; | |
1737 | vmcs12->guest_gs_base = evmcs->guest_gs_base; | |
1738 | vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; | |
1739 | vmcs12->guest_tr_base = evmcs->guest_tr_base; | |
1740 | vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; | |
1741 | vmcs12->guest_idtr_base = evmcs->guest_idtr_base; | |
1742 | vmcs12->guest_es_limit = evmcs->guest_es_limit; | |
1743 | vmcs12->guest_cs_limit = evmcs->guest_cs_limit; | |
1744 | vmcs12->guest_ss_limit = evmcs->guest_ss_limit; | |
1745 | vmcs12->guest_ds_limit = evmcs->guest_ds_limit; | |
1746 | vmcs12->guest_fs_limit = evmcs->guest_fs_limit; | |
1747 | vmcs12->guest_gs_limit = evmcs->guest_gs_limit; | |
1748 | vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; | |
1749 | vmcs12->guest_tr_limit = evmcs->guest_tr_limit; | |
1750 | vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; | |
1751 | vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; | |
1752 | vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; | |
1753 | vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; | |
1754 | vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; | |
1755 | vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; | |
1756 | vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; | |
1757 | vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; | |
1758 | vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; | |
1759 | vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; | |
1760 | vmcs12->guest_es_selector = evmcs->guest_es_selector; | |
1761 | vmcs12->guest_cs_selector = evmcs->guest_cs_selector; | |
1762 | vmcs12->guest_ss_selector = evmcs->guest_ss_selector; | |
1763 | vmcs12->guest_ds_selector = evmcs->guest_ds_selector; | |
1764 | vmcs12->guest_fs_selector = evmcs->guest_fs_selector; | |
1765 | vmcs12->guest_gs_selector = evmcs->guest_gs_selector; | |
1766 | vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; | |
1767 | vmcs12->guest_tr_selector = evmcs->guest_tr_selector; | |
1768 | } | |
1769 | ||
d6bf71a1 | 1770 | if (unlikely(!(hv_clean_fields & |
55d2375e SC |
1771 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { |
1772 | vmcs12->tsc_offset = evmcs->tsc_offset; | |
1773 | vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; | |
1774 | vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; | |
c9d31986 VK |
1775 | vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap; |
1776 | vmcs12->tsc_multiplier = evmcs->tsc_multiplier; | |
55d2375e SC |
1777 | } |
1778 | ||
d6bf71a1 | 1779 | if (unlikely(!(hv_clean_fields & |
55d2375e SC |
1780 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { |
1781 | vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; | |
1782 | vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; | |
1783 | vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; | |
1784 | vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; | |
1785 | vmcs12->guest_cr0 = evmcs->guest_cr0; | |
1786 | vmcs12->guest_cr3 = evmcs->guest_cr3; | |
1787 | vmcs12->guest_cr4 = evmcs->guest_cr4; | |
1788 | vmcs12->guest_dr7 = evmcs->guest_dr7; | |
1789 | } | |
1790 | ||
d6bf71a1 | 1791 | if (unlikely(!(hv_clean_fields & |
55d2375e SC |
1792 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { |
1793 | vmcs12->host_fs_base = evmcs->host_fs_base; | |
1794 | vmcs12->host_gs_base = evmcs->host_gs_base; | |
1795 | vmcs12->host_tr_base = evmcs->host_tr_base; | |
1796 | vmcs12->host_gdtr_base = evmcs->host_gdtr_base; | |
1797 | vmcs12->host_idtr_base = evmcs->host_idtr_base; | |
1798 | vmcs12->host_rsp = evmcs->host_rsp; | |
1799 | } | |
1800 | ||
d6bf71a1 | 1801 | if (unlikely(!(hv_clean_fields & |
55d2375e SC |
1802 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { |
1803 | vmcs12->ept_pointer = evmcs->ept_pointer; | |
1804 | vmcs12->virtual_processor_id = evmcs->virtual_processor_id; | |
1805 | } | |
1806 | ||
d6bf71a1 | 1807 | if (unlikely(!(hv_clean_fields & |
55d2375e SC |
1808 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { |
1809 | vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; | |
1810 | vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; | |
1811 | vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; | |
1812 | vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; | |
1813 | vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; | |
1814 | vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; | |
1815 | vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; | |
1816 | vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; | |
1817 | vmcs12->guest_pending_dbg_exceptions = | |
1818 | evmcs->guest_pending_dbg_exceptions; | |
1819 | vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; | |
1820 | vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; | |
1821 | vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; | |
1822 | vmcs12->guest_activity_state = evmcs->guest_activity_state; | |
1823 | vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; | |
c9d31986 VK |
1824 | vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl; |
1825 | /* | |
1826 | * Not present in struct vmcs12: | |
1827 | * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet; | |
1828 | * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl; | |
1829 | * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr; | |
1830 | */ | |
55d2375e SC |
1831 | } |
1832 | ||
1833 | /* | |
1834 | * Not used? | |
1835 | * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; | |
1836 | * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; | |
1837 | * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; | |
55d2375e SC |
1838 | * vmcs12->page_fault_error_code_mask = |
1839 | * evmcs->page_fault_error_code_mask; | |
1840 | * vmcs12->page_fault_error_code_match = | |
1841 | * evmcs->page_fault_error_code_match; | |
1842 | * vmcs12->cr3_target_count = evmcs->cr3_target_count; | |
1843 | * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; | |
1844 | * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; | |
1845 | * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; | |
1846 | */ | |
1847 | ||
1848 | /* | |
1849 | * Read only fields: | |
1850 | * vmcs12->guest_physical_address = evmcs->guest_physical_address; | |
1851 | * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; | |
1852 | * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; | |
1853 | * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; | |
1854 | * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; | |
1855 | * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; | |
1856 | * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; | |
1857 | * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; | |
1858 | * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; | |
1859 | * vmcs12->exit_qualification = evmcs->exit_qualification; | |
1860 | * vmcs12->guest_linear_address = evmcs->guest_linear_address; | |
1861 | * | |
1862 | * Not present in struct vmcs12: | |
1863 | * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; | |
1864 | * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; | |
1865 | * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; | |
1866 | * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; | |
1867 | */ | |
1868 | ||
25641caf | 1869 | return; |
b4f69df0 VK |
1870 | #else /* CONFIG_KVM_HYPERV */ |
1871 | KVM_BUG_ON(1, vmx->vcpu.kvm); | |
1872 | #endif /* CONFIG_KVM_HYPERV */ | |
55d2375e SC |
1873 | } |
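/*
 * Each clean-field bit that L1 leaves set in hv_clean_fields is a
 * promise that the corresponding group of eVMCS fields is unchanged
 * since the last sync, which is what allows the copies above to be
 * skipped wholesale.
 */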
1874 | ||
25641caf | 1875 | static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) |
55d2375e | 1876 | { |
b4f69df0 | 1877 | #ifdef CONFIG_KVM_HYPERV |
55d2375e | 1878 | struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; |
c98842b2 | 1879 | struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); |
55d2375e SC |
1880 | |
1881 | /* | |
1882 | * Should not be changed by KVM: | |
1883 | * | |
1884 | * evmcs->host_es_selector = vmcs12->host_es_selector; | |
1885 | * evmcs->host_cs_selector = vmcs12->host_cs_selector; | |
1886 | * evmcs->host_ss_selector = vmcs12->host_ss_selector; | |
1887 | * evmcs->host_ds_selector = vmcs12->host_ds_selector; | |
1888 | * evmcs->host_fs_selector = vmcs12->host_fs_selector; | |
1889 | * evmcs->host_gs_selector = vmcs12->host_gs_selector; | |
1890 | * evmcs->host_tr_selector = vmcs12->host_tr_selector; | |
1891 | * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; | |
1892 | * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; | |
1893 | * evmcs->host_cr0 = vmcs12->host_cr0; | |
1894 | * evmcs->host_cr3 = vmcs12->host_cr3; | |
1895 | * evmcs->host_cr4 = vmcs12->host_cr4; | |
1896 | * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; | |
1897 | * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; | |
1898 | * evmcs->host_rip = vmcs12->host_rip; | |
1899 | * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; | |
1900 | * evmcs->host_fs_base = vmcs12->host_fs_base; | |
1901 | * evmcs->host_gs_base = vmcs12->host_gs_base; | |
1902 | * evmcs->host_tr_base = vmcs12->host_tr_base; | |
1903 | * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; | |
1904 | * evmcs->host_idtr_base = vmcs12->host_idtr_base; | |
1905 | * evmcs->host_rsp = vmcs12->host_rsp; | |
3731905e | 1906 | * sync_vmcs02_to_vmcs12() doesn't read these: |
55d2375e SC |
1907 | * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; |
1908 | * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; | |
1909 | * evmcs->msr_bitmap = vmcs12->msr_bitmap; | |
1910 | * evmcs->ept_pointer = vmcs12->ept_pointer; | |
1911 | * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; | |
1912 | * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; | |
1913 | * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; | |
1914 | * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; | |
55d2375e SC |
1915 | * evmcs->tpr_threshold = vmcs12->tpr_threshold; |
1916 | * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; | |
1917 | * evmcs->exception_bitmap = vmcs12->exception_bitmap; | |
1918 | * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; | |
1919 | * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; | |
1920 | * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; | |
1921 | * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; | |
1922 | * evmcs->page_fault_error_code_mask = | |
1923 | * vmcs12->page_fault_error_code_mask; | |
1924 | * evmcs->page_fault_error_code_match = | |
1925 | * vmcs12->page_fault_error_code_match; | |
1926 | * evmcs->cr3_target_count = vmcs12->cr3_target_count; | |
1927 | * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; | |
1928 | * evmcs->tsc_offset = vmcs12->tsc_offset; | |
1929 | * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; | |
1930 | * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; | |
1931 | * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; | |
1932 | * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; | |
1933 | * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; | |
1934 | * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; | |
1935 | * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; | |
1936 | * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; | |
c9d31986 VK |
1937 | * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl; |
1938 | * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl; | |
1939 | * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap; | |
1940 | * evmcs->tsc_multiplier = vmcs12->tsc_multiplier; | |
55d2375e SC |
1941 | * |
1942 | * Not present in struct vmcs12: | |
1943 | * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; | |
1944 | * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; | |
1945 | * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; | |
1946 | * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; | |
c9d31986 VK |
1947 | * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet; |
1948 | * evmcs->host_ssp = vmcs12->host_ssp; | |
1949 | * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr; | |
1950 | * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet; | |
1951 | * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl; | |
1952 | * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr; | |
1953 | * evmcs->guest_ssp = vmcs12->guest_ssp; | |
55d2375e SC |
1954 | */ |
1955 | ||
1956 | evmcs->guest_es_selector = vmcs12->guest_es_selector; | |
1957 | evmcs->guest_cs_selector = vmcs12->guest_cs_selector; | |
1958 | evmcs->guest_ss_selector = vmcs12->guest_ss_selector; | |
1959 | evmcs->guest_ds_selector = vmcs12->guest_ds_selector; | |
1960 | evmcs->guest_fs_selector = vmcs12->guest_fs_selector; | |
1961 | evmcs->guest_gs_selector = vmcs12->guest_gs_selector; | |
1962 | evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; | |
1963 | evmcs->guest_tr_selector = vmcs12->guest_tr_selector; | |
1964 | ||
1965 | evmcs->guest_es_limit = vmcs12->guest_es_limit; | |
1966 | evmcs->guest_cs_limit = vmcs12->guest_cs_limit; | |
1967 | evmcs->guest_ss_limit = vmcs12->guest_ss_limit; | |
1968 | evmcs->guest_ds_limit = vmcs12->guest_ds_limit; | |
1969 | evmcs->guest_fs_limit = vmcs12->guest_fs_limit; | |
1970 | evmcs->guest_gs_limit = vmcs12->guest_gs_limit; | |
1971 | evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; | |
1972 | evmcs->guest_tr_limit = vmcs12->guest_tr_limit; | |
1973 | evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; | |
1974 | evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; | |
1975 | ||
1976 | evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; | |
1977 | evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; | |
1978 | evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; | |
1979 | evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; | |
1980 | evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; | |
1981 | evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; | |
1982 | evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; | |
1983 | evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; | |
1984 | ||
1985 | evmcs->guest_es_base = vmcs12->guest_es_base; | |
1986 | evmcs->guest_cs_base = vmcs12->guest_cs_base; | |
1987 | evmcs->guest_ss_base = vmcs12->guest_ss_base; | |
1988 | evmcs->guest_ds_base = vmcs12->guest_ds_base; | |
1989 | evmcs->guest_fs_base = vmcs12->guest_fs_base; | |
1990 | evmcs->guest_gs_base = vmcs12->guest_gs_base; | |
1991 | evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; | |
1992 | evmcs->guest_tr_base = vmcs12->guest_tr_base; | |
1993 | evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; | |
1994 | evmcs->guest_idtr_base = vmcs12->guest_idtr_base; | |
1995 | ||
1996 | evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; | |
1997 | evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; | |
1998 | ||
1999 | evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; | |
2000 | evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; | |
2001 | evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; | |
2002 | evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; | |
2003 | ||
2004 | evmcs->guest_pending_dbg_exceptions = | |
2005 | vmcs12->guest_pending_dbg_exceptions; | |
2006 | evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; | |
2007 | evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; | |
2008 | ||
2009 | evmcs->guest_activity_state = vmcs12->guest_activity_state; | |
2010 | evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; | |
2011 | ||
2012 | evmcs->guest_cr0 = vmcs12->guest_cr0; | |
2013 | evmcs->guest_cr3 = vmcs12->guest_cr3; | |
2014 | evmcs->guest_cr4 = vmcs12->guest_cr4; | |
2015 | evmcs->guest_dr7 = vmcs12->guest_dr7; | |
2016 | ||
2017 | evmcs->guest_physical_address = vmcs12->guest_physical_address; | |
2018 | ||
2019 | evmcs->vm_instruction_error = vmcs12->vm_instruction_error; | |
2020 | evmcs->vm_exit_reason = vmcs12->vm_exit_reason; | |
2021 | evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; | |
2022 | evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; | |
2023 | evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; | |
2024 | evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; | |
2025 | evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; | |
2026 | evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; | |
2027 | ||
2028 | evmcs->exit_qualification = vmcs12->exit_qualification; | |
2029 | ||
2030 | evmcs->guest_linear_address = vmcs12->guest_linear_address; | |
2031 | evmcs->guest_rsp = vmcs12->guest_rsp; | |
2032 | evmcs->guest_rflags = vmcs12->guest_rflags; | |
2033 | ||
2034 | evmcs->guest_interruptibility_info = | |
2035 | vmcs12->guest_interruptibility_info; | |
2036 | evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; | |
2037 | evmcs->vm_entry_controls = vmcs12->vm_entry_controls; | |
2038 | evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; | |
2039 | evmcs->vm_entry_exception_error_code = | |
2040 | vmcs12->vm_entry_exception_error_code; | |
2041 | evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; | |
2042 | ||
2043 | evmcs->guest_rip = vmcs12->guest_rip; | |
2044 | ||
2045 | evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; | |
2046 | ||
25641caf | 2047 | return; |
b4f69df0 VK |
2048 | #else /* CONFIG_KVM_HYPERV */ |
2049 | KVM_BUG_ON(1, vmx->vcpu.kvm); | |
2050 | #endif /* CONFIG_KVM_HYPERV */ | |
55d2375e SC |
2051 | } |
2052 | ||
2053 | /* | |
2054 | * This is the equivalent of the nested hypervisor executing the vmptrld | |
2055 | * instruction. | |
2056 | */ | |
b6a0653a VK |
2057 | static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( |
2058 | struct kvm_vcpu *vcpu, bool from_launch) | |
55d2375e | 2059 | { |
b4f69df0 | 2060 | #ifdef CONFIG_KVM_HYPERV |
55d2375e | 2061 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
a21a39c2 | 2062 | bool evmcs_gpa_changed = false; |
11e34914 | 2063 | u64 evmcs_gpa; |
55d2375e | 2064 | |
85ab071a | 2065 | if (likely(!guest_cpuid_has_evmcs(vcpu))) |
b6a0653a | 2066 | return EVMPTRLD_DISABLED; |
55d2375e | 2067 | |
046f5756 VK |
2068 | evmcs_gpa = nested_get_evmptr(vcpu); |
2069 | if (!evmptr_is_valid(evmcs_gpa)) { | |
02761716 | 2070 | nested_release_evmcs(vcpu); |
b6a0653a | 2071 | return EVMPTRLD_DISABLED; |
02761716 | 2072 | } |
55d2375e | 2073 | |
1e9dfbd7 | 2074 | if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { |
64c78508 | 2075 | vmx->nested.current_vmptr = INVALID_GPA; |
55d2375e SC |
2076 | |
2077 | nested_release_evmcs(vcpu); | |
2078 | ||
11e34914 | 2079 | if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa), |
dee9c049 | 2080 | &vmx->nested.hv_evmcs_map)) |
b6a0653a | 2081 | return EVMPTRLD_ERROR; |
55d2375e | 2082 | |
dee9c049 | 2083 | vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva; |
55d2375e SC |
2084 | |
2085 | /* | |
2086 | * Currently, KVM only supports eVMCS version 1 | |
2087 | * (== KVM_EVMCS_VERSION) and thus expects the guest to set the | |
2088 | * first u32 field of the eVMCS, which specifies the eVMCS | |
2089 | * VersionNumber, to that value. | |
2090 | * | |
2091 | * The guest should learn the host's supported eVMCS versions by | |
2092 | * examining CPUID.0x4000000A.EAX[0:15]. The host userspace VMM is | |
2093 | * expected to set this CPUID leaf according to the value | |
2094 | * returned in vmcs_version from nested_enable_evmcs(). | |
2095 | * | |
2096 | * However, it turns out that Microsoft Hyper-V fails to comply | |
2097 | * with its own invented interface: when Hyper-V uses eVMCS, it | |
2098 | * simply sets the first u32 field of the eVMCS to the revision_id | |
2099 | * specified in MSR_IA32_VMX_BASIC, instead of an eVMCS version | |
2100 | * number, i.e. one of the supported versions specified in | |
2101 | * CPUID.0x4000000A.EAX[0:15]. | |
2102 | * | |
2103 | * To work around this Hyper-V bug, we accept either a supported | |
2104 | * eVMCS version or the VMCS12 revision_id as a valid value for | |
2105 | * the first u32 field of the eVMCS. | |
2106 | */ | |
2107 | if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && | |
2108 | (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { | |
2109 | nested_release_evmcs(vcpu); | |
b6a0653a | 2110 | return EVMPTRLD_VMFAIL; |
55d2375e SC |
2111 | } |
2112 | ||
11e34914 | 2113 | vmx->nested.hv_evmcs_vmptr = evmcs_gpa; |
55d2375e | 2114 | |
a21a39c2 | 2115 | evmcs_gpa_changed = true; |
55d2375e SC |
2116 | /* |
2117 | * Unlike a normal vmcs12, an enlightened vmcs12 is not fully | |
2118 | * reloaded from the guest's memory (read-only fields, fields not | |
2119 | * present in struct hv_enlightened_vmcs, ...). Make sure there | |
2120 | * are no leftovers. | |
2121 | */ | |
2122 | if (from_launch) { | |
2123 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | |
2124 | memset(vmcs12, 0, sizeof(*vmcs12)); | |
2125 | vmcs12->hdr.revision_id = VMCS12_REVISION; | |
2126 | } | |
2127 | ||
2128 | } | |
a21a39c2 VK |
2129 | |
2130 | /* | |
ffdbd50d | 2131 | * Clean fields data can't be used on VMLAUNCH, nor when we switch
a21a39c2 VK |
2132 | * between different L2 guests, as KVM keeps a single VMCS12 per L1. | |
2133 | */ | |
ed2a4800 | 2134 | if (from_launch || evmcs_gpa_changed) { |
a21a39c2 VK |
2135 | vmx->nested.hv_evmcs->hv_clean_fields &= |
2136 | ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; | |
2137 | ||
ed2a4800 VK |
2138 | vmx->nested.force_msr_bitmap_recalc = true; |
2139 | } | |
2140 | ||
b6a0653a | 2141 | return EVMPTRLD_SUCCEEDED; |
b4f69df0 VK |
2142 | #else |
2143 | return EVMPTRLD_DISABLED; | |
2144 | #endif | |
55d2375e SC |
2145 | } |
2146 | ||
3731905e | 2147 | void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) |
55d2375e SC |
2148 | { |
2149 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
2150 | ||
453e42b0 | 2151 | if (nested_vmx_is_evmptr12_valid(vmx)) |
55d2375e | 2152 | copy_vmcs12_to_enlightened(vmx); |
dc313385 | 2153 | else |
55d2375e | 2154 | copy_vmcs12_to_shadow(vmx); |
55d2375e | 2155 | |
3731905e | 2156 | vmx->nested.need_vmcs12_to_shadow_sync = false; |
55d2375e SC |
2157 | } |
2158 | ||
2159 | static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) | |
2160 | { | |
2161 | struct vcpu_vmx *vmx = | |
2162 | container_of(timer, struct vcpu_vmx, nested.preemption_timer); | |
2163 | ||
2164 | vmx->nested.preemption_timer_expired = true; | |
2165 | kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); | |
2166 | kvm_vcpu_kick(&vmx->vcpu); | |
2167 | ||
2168 | return HRTIMER_NORESTART; | |
2169 | } | |
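/*
 * The hrtimer callback runs in interrupt context, hence the
 * request-plus-kick pattern above: the nested VM-exit itself is
 * injected later, from vCPU context, when KVM_REQ_EVENT is processed.
 */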
2170 | ||
850448f3 PS |
2171 | static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) |
2172 | { | |
2173 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
2174 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | |
850448f3 PS |
2175 | |
2176 | u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> | |
2177 | VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; | |
2178 | ||
2179 | if (!vmx->nested.has_preemption_timer_deadline) { | |
8d7fbf01 MS |
2180 | vmx->nested.preemption_timer_deadline = |
2181 | vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; | |
850448f3 | 2182 | vmx->nested.has_preemption_timer_deadline = true; |
8d7fbf01 MS |
2183 | } |
2184 | return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; | |
850448f3 PS |
2185 | } |
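/*
 * The emulated timer ticks at the L1 TSC rate divided by
 * 2^VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE, i.e. TSC/32.  E.g. a
 * vmx_preemption_timer_value of 1000 observed when the scaled L1 TSC
 * reads 5000 fixes the deadline at 6000, and subsequent calls return
 * however many of those 1000 ticks are still outstanding.
 */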
2186 | ||
2187 | static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, | |
2188 | u64 preemption_timeout) | |
55d2375e | 2189 | { |
55d2375e SC |
2190 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2191 | ||
2192 | /* | |
2193 | * A timer value of zero is architecturally guaranteed to cause | |
2194 | * a VMExit prior to executing any instructions in the guest. | |
2195 | */ | |
2196 | if (preemption_timeout == 0) { | |
2197 | vmx_preemption_timer_fn(&vmx->nested.preemption_timer); | |
2198 | return; | |
2199 | } | |
2200 | ||
2201 | if (vcpu->arch.virtual_tsc_khz == 0) | |
2202 | return; | |
2203 | ||
2204 | preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; | |
2205 | preemption_timeout *= 1000000; | |
2206 | do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); | |
2207 | hrtimer_start(&vmx->nested.preemption_timer, | |
ada0098d JM |
2208 | ktime_add_ns(ktime_get(), preemption_timeout), |
2209 | HRTIMER_MODE_ABS_PINNED); | |
55d2375e SC |
2210 | } |
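/*
 * The conversion above is ns = (ticks << rate) * 1000000 / tsc_khz,
 * with rate = VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE = 5.  E.g.
 * 10000 timer ticks on a 2 GHz (2000000 kHz) virtual TSC is 320000
 * TSC cycles, i.e. 160000 ns, so the hrtimer is armed to fire 160 us
 * from now.
 */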
2211 | ||
2212 | static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) | |
2213 | { | |
2214 | if (vmx->nested.nested_run_pending && | |
2215 | (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) | |
2216 | return vmcs12->guest_ia32_efer; | |
2217 | else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) | |
2218 | return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); | |
2219 | else | |
2220 | return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); | |
2221 | } | |
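/*
 * Three cases, in priority order: take EFER verbatim from vmcs12 when
 * this VM-Enter is architecturally required to load it, else derive
 * EFER.LMA/LME from the "IA-32e mode guest" entry control while
 * keeping the remaining bits of L1's current EFER.
 */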
2222 | ||
2223 | static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) | |
2224 | { | |
2f4073e0 TX |
2225 | struct kvm *kvm = vmx->vcpu.kvm; |
2226 | ||
55d2375e SC |
2227 | /* |
2228 | * If vmcs02 hasn't been initialized, set the constant vmcs02 state | |
2229 | * according to L0's settings (vmcs12 is irrelevant here). Host | |
2230 | * fields that come from L0 and are not constant, e.g. HOST_CR3, | |
2231 | * will be set as needed prior to VMLAUNCH/VMRESUME. | |
2232 | */ | |
2233 | if (vmx->nested.vmcs02_initialized) | |
2234 | return; | |
2235 | vmx->nested.vmcs02_initialized = true; | |
2236 | ||
2237 | /* | |
2238 | * We don't care what the EPTP value is; we just need to guarantee | |
2239 | * it's valid so we don't get a false positive when doing early | |
2240 | * consistency checks. | |
2241 | */ | |
2242 | if (enable_ept && nested_early_check) | |
2a40b900 SC |
2243 | vmcs_write64(EPT_POINTER, |
2244 | construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); | |
55d2375e | 2245 | |
d1b32ecd SC |
2246 | if (vmx->ve_info) |
2247 | vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info)); | |
2248 | ||
55d2375e SC |
2249 | /* All VMFUNCs are currently emulated through L0 vmexits. */ |
2250 | if (cpu_has_vmx_vmfunc()) | |
2251 | vmcs_write64(VM_FUNCTION_CONTROL, 0); | |
2252 | ||
2253 | if (cpu_has_vmx_posted_intr()) | |
2254 | vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); | |
2255 | ||
2256 | if (cpu_has_vmx_msr_bitmap()) | |
2257 | vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); | |
2258 | ||
4d6c9892 | 2259 | /* |
c3bb9a20 SC |
2260 | * PML is emulated for L2, but never enabled in hardware as the MMU |
2261 | * handles A/D emulation. Disabling PML for L2 also avoids having to | |
2262 | * deal with filtering out L2 GPAs from the buffer. | |
4d6c9892 SC |
2263 | */ |
2264 | if (enable_pml) { | |
c3bb9a20 SC |
2265 | vmcs_write64(PML_ADDRESS, 0); |
2266 | vmcs_write16(GUEST_PML_INDEX, -1); | |
4d6c9892 | 2267 | } |
55d2375e | 2268 | |
c538d57f | 2269 | if (cpu_has_vmx_encls_vmexit()) |
64c78508 | 2270 | vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); |
55d2375e | 2271 | |
2f4073e0 TX |
2272 | if (kvm_notify_vmexit_enabled(kvm)) |
2273 | vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); | |
2274 | ||
55d2375e SC |
2275 | /* |
2276 | * Set the MSR load/store lists to match L0's settings. Only the | |
2277 | * addresses are constant (for vmcs02); the counts can change based | |
2278 | * on L2's behavior, e.g. switching to/from long mode. | |
2279 | */ | |
662f1d1d | 2280 | vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); |
55d2375e SC |
2281 | vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); |
2282 | vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); | |
2283 | ||
2284 | vmx_set_constant_host_state(vmx); | |
2285 | } | |
2286 | ||
b1346ab2 | 2287 | static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, |
55d2375e SC |
2288 | struct vmcs12 *vmcs12) |
2289 | { | |
2290 | prepare_vmcs02_constant_state(vmx); | |
2291 | ||
64c78508 | 2292 | vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); |
55d2375e SC |
2293 | |
2294 | if (enable_vpid) { | |
2295 | if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) | |
2296 | vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); | |
2297 | else | |
2298 | vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); | |
2299 | } | |
2300 | } | |
2301 | ||
389ab252 SC |
2302 | static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, |
2303 | struct vmcs12 *vmcs12) | |
55d2375e | 2304 | { |
c3bb9a20 | 2305 | u32 exec_control; |
55d2375e SC |
2306 | u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); |
2307 | ||
453e42b0 | 2308 | if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) |
b1346ab2 | 2309 | prepare_vmcs02_early_rare(vmx, vmcs12); |
55d2375e | 2310 | |
55d2375e SC |
2311 | /* |
2312 | * PIN CONTROLS | |
2313 | */ | |
389ab252 | 2314 | exec_control = __pin_controls_get(vmcs01); |
804939ea SC |
2315 | exec_control |= (vmcs12->pin_based_vm_exec_control & |
2316 | ~PIN_BASED_VMX_PREEMPTION_TIMER); | |
55d2375e SC |
2317 | |
2318 | /* Posted interrupts setting is only taken from vmcs12. */ | |
f7782bb8 SC |
2319 | vmx->nested.pi_pending = false; |
2320 | if (nested_cpu_has_posted_intr(vmcs12)) | |
55d2375e | 2321 | vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; |
f7782bb8 | 2322 | else |
55d2375e | 2323 | exec_control &= ~PIN_BASED_POSTED_INTR; |
3af80fec | 2324 | pin_controls_set(vmx, exec_control); |
55d2375e SC |
2325 | |
2326 | /* | |
2327 | * EXEC CONTROLS | |
2328 | */ | |
389ab252 | 2329 | exec_control = __exec_controls_get(vmcs01); /* L0's desires */ |
9dadc2f9 | 2330 | exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; |
4e2a0bc5 | 2331 | exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; |
55d2375e SC |
2332 | exec_control &= ~CPU_BASED_TPR_SHADOW; |
2333 | exec_control |= vmcs12->cpu_based_vm_exec_control; | |
2334 | ||
02d496cf | 2335 | vmx->nested.l1_tpr_threshold = -1; |
ca2f5466 | 2336 | if (exec_control & CPU_BASED_TPR_SHADOW) |
55d2375e | 2337 | vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); |
55d2375e | 2338 | #ifdef CONFIG_X86_64 |
ca2f5466 | 2339 | else |
55d2375e SC |
2340 | exec_control |= CPU_BASED_CR8_LOAD_EXITING | |
2341 | CPU_BASED_CR8_STORE_EXITING; | |
2342 | #endif | |
55d2375e SC |
2343 | |
2344 | /* | |
2345 | * A vmexit (to either the L1 hypervisor or L0 userspace) is always needed | |
2346 | * for I/O port accesses. | |
2347 | */ | |
55d2375e | 2348 | exec_control |= CPU_BASED_UNCOND_IO_EXITING; |
de0286b7 SC |
2349 | exec_control &= ~CPU_BASED_USE_IO_BITMAPS; |
2350 | ||
2351 | /* | |
2352 | * This bit will be computed in nested_get_vmcs12_pages, because | |
2353 | * we do not have access to L1's MSR bitmap yet. For now, keep | |
2354 | * the same bit as before, hoping to avoid multiple VMWRITEs that | |
2355 | * only set/clear this bit. | |
2356 | */ | |
2357 | exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; | |
2358 | exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; | |
2359 | ||
3af80fec | 2360 | exec_controls_set(vmx, exec_control); |
55d2375e SC |
2361 | |
2362 | /* | |
2363 | * SECONDARY EXEC CONTROLS | |
2364 | */ | |
2365 | if (cpu_has_secondary_exec_ctrls()) { | |
389ab252 | 2366 | exec_control = __secondary_exec_controls_get(vmcs01); |
55d2375e SC |
2367 | |
2368 | /* Take the following fields only from vmcs12 */ | |
2369 | exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | | |
389ab252 | 2370 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | |
55d2375e | 2371 | SECONDARY_EXEC_ENABLE_INVPCID | |
7f3603b6 | 2372 | SECONDARY_EXEC_ENABLE_RDTSCP | |
662f6815 | 2373 | SECONDARY_EXEC_ENABLE_XSAVES | |
e69e72fa | 2374 | SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | |
55d2375e SC |
2375 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | |
2376 | SECONDARY_EXEC_APIC_REGISTER_VIRT | | |
d041b5ea | 2377 | SECONDARY_EXEC_ENABLE_VMFUNC | |
389ab252 SC |
2378 | SECONDARY_EXEC_DESC); |
2379 | ||
55d2375e | 2380 | if (nested_cpu_has(vmcs12, |
c3bb9a20 SC |
2381 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) |
2382 | exec_control |= vmcs12->secondary_vm_exec_control; | |
2383 | ||
2384 | /* PML is emulated and never enabled in hardware for L2. */ | |
2385 | exec_control &= ~SECONDARY_EXEC_ENABLE_PML; | |
55d2375e SC |
2386 | |
2387 | /* VMCS shadowing for L2 is emulated for now */ | |
2388 | exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; | |
2389 | ||
55d2375e | 2390 | /* |
469debdb SC |
2391 | * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() |
2392 | * will not have to rewrite the controls just for this bit. | |
55d2375e | 2393 | */ |
3243b93c | 2394 | if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP)) |
469debdb | 2395 | exec_control |= SECONDARY_EXEC_DESC; |
55d2375e | 2396 | |
55d2375e SC |
2397 | if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) |
2398 | vmcs_write16(GUEST_INTR_STATUS, | |
2399 | vmcs12->guest_intr_status); | |
55d2375e | 2400 | |
bddd82d1 KS |
2401 | if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) |
2402 | exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; | |
2403 | ||
72add915 SC |
2404 | if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) |
2405 | vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); | |
2406 | ||
3af80fec | 2407 | secondary_exec_controls_set(vmx, exec_control); |
55d2375e SC |
2408 | } |
2409 | ||
2410 | /* | |
2411 | * ENTRY CONTROLS | |
2412 | * | |
2413 | * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE | |
2414 | * are emulated by vmx_set_efer() in prepare_vmcs02(), but KVM speculates | |
2415 | * on the related bits (if supported by the CPU) in the hope that | |
2416 | * we can avoid VMWrites during vmx_set_efer(). | |
def9d705 SC |
2417 | * |
2418 | * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is | |
2419 | * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to | |
2420 | * do the same for L2. | |
55d2375e | 2421 | */ |
389ab252 | 2422 | exec_control = __vm_entry_controls_get(vmcs01); |
def9d705 SC |
2423 | exec_control |= (vmcs12->vm_entry_controls & |
2424 | ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); | |
389ab252 | 2425 | exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); |
55d2375e SC |
2426 | if (cpu_has_load_ia32_efer()) { |
2427 | if (guest_efer & EFER_LMA) | |
2428 | exec_control |= VM_ENTRY_IA32E_MODE; | |
7974c064 | 2429 | if (guest_efer != kvm_host.efer) |
55d2375e SC |
2430 | exec_control |= VM_ENTRY_LOAD_IA32_EFER; |
2431 | } | |
3af80fec | 2432 | vm_entry_controls_set(vmx, exec_control); |
55d2375e SC |
2433 | |
2434 | /* | |
2435 | * EXIT CONTROLS | |
2436 | * | |
2437 | * L2->L1 exit controls are emulated - the hardware exit is to L0 so | |
2438 | * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER | |
2439 | * bits may be modified by vmx_set_efer() in prepare_vmcs02(). | |
2440 | */ | |
389ab252 | 2441 | exec_control = __vm_exit_controls_get(vmcs01); |
7974c064 | 2442 | if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer) |
55d2375e | 2443 | exec_control |= VM_EXIT_LOAD_IA32_EFER; |
389ab252 SC |
2444 | else |
2445 | exec_control &= ~VM_EXIT_LOAD_IA32_EFER; | |
3af80fec | 2446 | vm_exit_controls_set(vmx, exec_control); |
55d2375e SC |
2447 | |
2448 | /* | |
2449 | * Interrupt/Exception Fields | |
2450 | */ | |
2451 | if (vmx->nested.nested_run_pending) { | |
2452 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | |
2453 | vmcs12->vm_entry_intr_info_field); | |
2454 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, | |
2455 | vmcs12->vm_entry_exception_error_code); | |
2456 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, | |
2457 | vmcs12->vm_entry_instruction_len); | |
2458 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, | |
2459 | vmcs12->guest_interruptibility_info); | |
2460 | vmx->loaded_vmcs->nmi_known_unmasked = | |
2461 | !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); | |
2462 | } else { | |
2463 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); | |
2464 | } | |
2465 | } | |
2466 | ||
b1346ab2 | 2467 | static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) |
55d2375e | 2468 | { |
c98842b2 | 2469 | struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx); |
55d2375e SC |
2470 | |
2471 | if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & | |
2472 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { | |
2473 | vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); | |
2474 | vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); | |
2475 | vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); | |
2476 | vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); | |
2477 | vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); | |
2478 | vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); | |
2479 | vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); | |
2480 | vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); | |
2481 | vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); | |
2482 | vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); | |
2483 | vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); | |
2484 | vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); | |
2485 | vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); | |
2486 | vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); | |
2487 | vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); | |
2488 | vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); | |
2489 | vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); | |
2490 | vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); | |
1c6f0b47 SC |
2491 | vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); |
2492 | vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); | |
55d2375e SC |
2493 | vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); |
2494 | vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); | |
2495 | vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); | |
2496 | vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); | |
2497 | vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); | |
2498 | vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); | |
2499 | vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); | |
2500 | vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); | |
2501 | vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); | |
2502 | vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); | |
2503 | vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); | |
2504 | vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); | |
2505 | vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); | |
2506 | vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); | |
2507 | vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); | |
2508 | vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); | |
fc387d8d SC |
2509 | |
2510 | vmx->segment_cache.bitmask = 0; | |
55d2375e SC |
2511 | } |
2512 | ||
2513 | if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & | |
2514 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { | |
2515 | vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); | |
2516 | vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, | |
2517 | vmcs12->guest_pending_dbg_exceptions); | |
2518 | vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); | |
2519 | vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); | |
2520 | ||
2521 | /* | |
2522 | * L1 may access L2's PDPTRs, so save them to construct | |
2523 | * vmcs12 | |
2524 | */ | |
2525 | if (enable_ept) { | |
2526 | vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); | |
2527 | vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); | |
2528 | vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); | |
2529 | vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); | |
2530 | } | |
c27e5b0d SC |
2531 | |
2532 | if (kvm_mpx_supported() && vmx->nested.nested_run_pending && | |
2533 | (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) | |
2534 | vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); | |
55d2375e SC |
2535 | } |
2536 | ||
2537 | if (nested_cpu_has_xsaves(vmcs12)) | |
2538 | vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); | |
2539 | ||
2540 | /* | |
2541 | * Whether page-faults are trapped is determined by a combination of | |
a0c13434 PB |
2542 | * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0 |
2543 | * doesn't care about page faults then we should set all of these to | |
2544 | * L1's desires. However, if L0 does care about (some) page faults, it | |
2545 | * is not easy (if at all possible?) to merge L0 and L1's desires, so we | |
2546 | * simply ask to exit on each and every L2 page fault. This is done by | |
2547 | * setting MASK=MATCH=0 and (see below) EB.PF=1. | |
55d2375e SC |
2548 | * Note that below we don't need special code to set EB.PF beyond the |
2549 | * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, | |
2550 | * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when | |
2551 | * !enable_ept, EB.PF is 1, so the "or" will always be 1. | |
2552 | */ | |
a0c13434 PB |
2553 | if (vmx_need_pf_intercept(&vmx->vcpu)) { |
2554 | /* | |
2555 | * TODO: if both L0 and L1 need the same MASK and MATCH, | |
2556 | * go ahead and use it? | |
2557 | */ | |
2558 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); | |
2559 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); | |
2560 | } else { | |
2561 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask); | |
2562 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match); | |
2563 | } | |
55d2375e SC |
2564 | |
2565 | if (cpu_has_vmx_apicv()) { | |
2566 | vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); | |
2567 | vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); | |
2568 | vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); | |
2569 | vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); | |
2570 | } | |
2571 | ||
662f1d1d AL |
2572 | /* |
2573 | * Make sure the msr_autostore list is up to date before we set the | |
2574 | * count in the vmcs02. | |
2575 | */ | |
2576 | prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); | |
2577 | ||
2578 | vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr); | |
55d2375e SC |
2579 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); |
2580 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); | |
2581 | ||
2582 | set_cr4_guest_host_mask(vmx); | |
55d2375e SC |
2583 | } |
2584 | ||
2585 | /* | |
2586 | * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested | |
2587 | * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it | |
2588 | * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 | |
2589 | * guest in a way that is appropriate both to L1's requests and to our | |
2590 | * needs. In addition to modifying the active vmcs (which is vmcs02), this | |
2591 | * function also has necessary side effects, like setting various | |
2592 | * vcpu->arch fields. | |
2593 | * Returns 0 on success and 1 on failure; on failure, the invalid-state exit | |
2594 | * qualification code is assigned to *entry_failure_code. | |
2595 | */ | |
2596 | static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, | |
0f857223 | 2597 | bool from_vmentry, |
68cda40d | 2598 | enum vm_entry_failure_code *entry_failure_code) |
55d2375e SC |
2599 | { |
2600 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
c98842b2 | 2601 | struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); |
c7554efc | 2602 | bool load_guest_pdptrs_vmcs12 = false; |
55d2375e | 2603 | |
453e42b0 | 2604 | if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) { |
b1346ab2 | 2605 | prepare_vmcs02_rare(vmx, vmcs12); |
55d2375e | 2606 | vmx->nested.dirty_vmcs12 = false; |
55d2375e | 2607 | |
453e42b0 | 2608 | load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) || |
c98842b2 | 2609 | !(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); |
55d2375e SC |
2610 | } |
2611 | ||
2612 | if (vmx->nested.nested_run_pending && | |
2613 | (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { | |
2614 | kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); | |
2615 | vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); | |
2616 | } else { | |
2617 | kvm_set_dr(vcpu, 7, vcpu->arch.dr7); | |
5d76b1f8 | 2618 | vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); |
55d2375e | 2619 | } |
3b013a29 SC |
2620 | if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || |
2621 | !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) | |
5d76b1f8 | 2622 | vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); |
55d2375e SC |
2623 | vmx_set_rflags(vcpu, vmcs12->guest_rflags); |
2624 | ||
55d2375e SC |
2625 | /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the |
2626 | * bitwise-or of what L1 wants to trap for L2, and what we want to | |
2627 | * trap. Note that CR0.TS also needs updating - we do this later. | |
2628 | */ | |
b6a7cc35 | 2629 | vmx_update_exception_bitmap(vcpu); |
55d2375e SC |
2630 | vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; |
2631 | vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); | |
2632 | ||
2633 | if (vmx->nested.nested_run_pending && | |
2634 | (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { | |
2635 | vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); | |
2636 | vcpu->arch.pat = vmcs12->guest_ia32_pat; | |
2637 | } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { | |
2638 | vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); | |
2639 | } | |
2640 | ||
d041b5ea IS |
2641 | vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( |
2642 | vcpu->arch.l1_tsc_offset, | |
2643 | vmx_get_l2_tsc_offset(vcpu), | |
2644 | vmx_get_l2_tsc_multiplier(vcpu)); | |
2645 | ||
2646 | vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( | |
2647 | vcpu->arch.l1_tsc_scaling_ratio, | |
2648 | vmx_get_l2_tsc_multiplier(vcpu)); | |
2649 | ||
55d2375e | 2650 | vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); |
938c8745 | 2651 | if (kvm_caps.has_tsc_control) |
1ab9287a | 2652 | vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); |
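/*
 * Sketch of the combined scaling math, assuming VMX's 48 fractional
 * multiplier bits (kvm_caps.tsc_scaling_ratio_frac_bits):
 *
 *	L2 TSC   = ((host TSC * mult02) >> 48) + offset02
 *	mult02   = (mult01 * mult12) >> 48
 *	offset02 = ((offset01 * mult12) >> 48) + offset12
 *
 * which is what the kvm_calc_nested_tsc_*() helpers above compute.
 */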
55d2375e | 2653 | |
50b265a4 | 2654 | nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); |
55d2375e SC |
2655 | |
2656 | if (nested_cpu_has_ept(vmcs12)) | |
2657 | nested_ept_init_mmu_context(vcpu); | |
55d2375e SC |
2658 | |
2659 | /* | |
4a8fd4a7 SC |
2660 | * Override the CR0/CR4 read shadows after setting the effective guest |
2661 | * CR0/CR4. The common helpers also set the shadows, but they don't | |
2662 | * account for vmcs12's cr0/4_guest_host_mask. | |
55d2375e SC |
2663 | */ |
2664 | vmx_set_cr0(vcpu, vmcs12->guest_cr0); | |
2665 | vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); | |
2666 | ||
2667 | vmx_set_cr4(vcpu, vmcs12->guest_cr4); | |
2668 | vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); | |
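/*
 * Recall the read-shadow semantics: for bits set in cr0/4_guest_host_mask
 * a guest read of CR0/CR4 returns the shadow bit, and for clear bits it
 * returns the real register, i.e.
 *
 *	guest_sees = (shadow & mask) | (real & ~mask)
 *
 * hence the shadows must hold vmcs12's view, not vmcs01's.
 */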
2669 | ||
2670 | vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); | |
2671 | /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ | |
2672 | vmx_set_efer(vcpu, vcpu->arch.efer); | |
2673 | ||
2674 | /* | |
2675 | * Guest state is invalid and unrestricted guest is disabled, | |
2676 | * which means L1 attempted VMEntry to L2 with invalid state. | |
2677 | * Fail the VMEntry. | |
c8607e4a ML |
2678 | * |
2679 | * However, when force loading the guest state (SMM exit or | |
2680 | * loading nested state after migration), it is possible to | |
2681 | * have invalid guest state now, which will be fixed later by | |
2682 | * restoring the L2 register state. | |
55d2375e | 2683 | */ |
c8607e4a | 2684 | if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) { |
55d2375e | 2685 | *entry_failure_code = ENTRY_FAIL_DEFAULT; |
c80add0f | 2686 | return -EINVAL; |
55d2375e SC |
2687 | } |
2688 | ||
2689 | /* Load the guest's CR3, with either EPT or shadow page tables. */ | |
2690 | if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), | |
0f857223 | 2691 | from_vmentry, entry_failure_code)) |
c80add0f | 2692 | return -EINVAL; |
55d2375e | 2693 | |
04f11ef4 SC |
2694 | /* |
2695 | * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12 | |
2696 | * on nested VM-Exit, which can occur without actually running L2 and | |
727a7e27 | 2697 | * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with |
04f11ef4 SC |
2698 | * vmcs12.GUEST_ACTIVITY_STATE=HLT, in which case KVM will intercept the | |
2699 | * transition to HLT instead of running L2. | |
2700 | */ | |
2701 | if (enable_ept) | |
2702 | vmcs_writel(GUEST_CR3, vmcs12->guest_cr3); | |
2703 | ||
c7554efc SC |
2704 | /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */ |
2705 | if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && | |
2706 | is_pae_paging(vcpu)) { | |
2707 | vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); | |
2708 | vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); | |
2709 | vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); | |
2710 | vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); | |
2711 | } | |
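/*
 * With PAE paging the four PDPTEs are architecturally loaded from memory
 * only on MOV to CR0/CR3/CR4, so they are independent guest state here
 * and cannot simply be re-derived from vmcs12->guest_cr3.
 */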
2712 | ||
71f73470 | 2713 | if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && |
c85cdc1c | 2714 | kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && |
d1968421 | 2715 | WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, |
bfbb307c DC |
2716 | vmcs12->guest_ia32_perf_global_ctrl))) { |
2717 | *entry_failure_code = ENTRY_FAIL_DEFAULT; | |
71f73470 | 2718 | return -EINVAL; |
bfbb307c | 2719 | } |
71f73470 | 2720 | |
e9c16c78 PB |
2721 | kvm_rsp_write(vcpu, vmcs12->guest_rsp); |
2722 | kvm_rip_write(vcpu, vmcs12->guest_rip); | |
dc313385 VK |
2723 | |
2724 | /* | |
2725 | * It was observed that genuine Hyper-V running in L1 doesn't reset | |
2726 | * 'hv_clean_fields' by itself; it only sets the corresponding dirty | |
2727 | * bits when it changes a field in eVMCS. Mark all fields as clean | |
2728 | * here. | |
2729 | */ | |
453e42b0 | 2730 | if (nested_vmx_is_evmptr12_valid(vmx)) |
c98842b2 | 2731 | evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; |
dc313385 | 2732 | |
55d2375e SC |
2733 | return 0; |
2734 | } | |
2735 | ||
2736 | static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) | |
2737 | { | |
5497b955 SC |
2738 | if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && |
2739 | nested_cpu_has_virtual_nmis(vmcs12))) | |
55d2375e SC |
2740 | return -EINVAL; |
2741 | ||
5497b955 | 2742 | if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && |
4e2a0bc5 | 2743 | nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) |
55d2375e SC |
2744 | return -EINVAL; |
2745 | ||
2746 | return 0; | |
2747 | } | |
2748 | ||
ac6389ab | 2749 | static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) |
55d2375e SC |
2750 | { |
2751 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
55d2375e SC |
2752 | |
2753 | /* Check for memory type validity */ | |
ac6389ab | 2754 | switch (new_eptp & VMX_EPTP_MT_MASK) { |
55d2375e | 2755 | case VMX_EPTP_MT_UC: |
5497b955 | 2756 | if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) |
55d2375e SC |
2757 | return false; |
2758 | break; | |
2759 | case VMX_EPTP_MT_WB: | |
5497b955 | 2760 | if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) |
55d2375e SC |
2761 | return false; |
2762 | break; | |
2763 | default: | |
2764 | return false; | |
2765 | } | |
2766 | ||
bb1fcc70 | 2767 | /* Check page-walk level validity. */ | |
ac6389ab | 2768 | switch (new_eptp & VMX_EPTP_PWL_MASK) { |
bb1fcc70 SC |
2769 | case VMX_EPTP_PWL_5: |
2770 | if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) | |
2771 | return false; | |
2772 | break; | |
2773 | case VMX_EPTP_PWL_4: | |
2774 | if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) | |
2775 | return false; | |
2776 | break; | |
2777 | default: | |
55d2375e | 2778 | return false; |
bb1fcc70 | 2779 | } |
55d2375e SC |
2780 | |
2781 | /* Reserved bits should not be set */ | |
9c8021d4 | 2782 | if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) |
55d2375e SC |
2783 | return false; |
2784 | ||
2785 | /* AD, if set, should be supported */ | |
ac6389ab | 2786 | if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { |
5497b955 | 2787 | if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) |
55d2375e SC |
2788 | return false; |
2789 | } | |
2790 | ||
2791 | return true; | |
2792 | } | |
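/*
 * For illustration, an EPTP value that passes all of the checks above,
 * using a hypothetical 4KB-aligned EPT root 'root_hpa':
 *
 *	u64 eptp = root_hpa | VMX_EPTP_MT_WB | VMX_EPTP_PWL_4 |
 *		   VMX_EPTP_AD_ENABLE_BIT;
 */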
2793 | ||
461b4ba4 KS |
2794 | /* |
2795 | * Checks related to VM-Execution Control Fields | |
2796 | */ | |
2797 | static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, | |
2798 | struct vmcs12 *vmcs12) | |
55d2375e SC |
2799 | { |
2800 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
55d2375e | 2801 | |
5497b955 SC |
2802 | if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, |
2803 | vmx->nested.msrs.pinbased_ctls_low, | |
2804 | vmx->nested.msrs.pinbased_ctls_high)) || | |
2805 | CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, | |
2806 | vmx->nested.msrs.procbased_ctls_low, | |
2807 | vmx->nested.msrs.procbased_ctls_high))) | |
461b4ba4 | 2808 | return -EINVAL; |
55d2375e | 2809 | |
461b4ba4 | 2810 | if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && |
5497b955 SC |
2811 | CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, |
2812 | vmx->nested.msrs.secondary_ctls_low, | |
2813 | vmx->nested.msrs.secondary_ctls_high))) | |
461b4ba4 KS |
2814 | return -EINVAL; |
2815 | ||
5497b955 | 2816 | if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || |
461b4ba4 KS |
2817 | nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || |
2818 | nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || | |
2819 | nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || | |
2820 | nested_vmx_check_apic_access_controls(vcpu, vmcs12) || | |
2821 | nested_vmx_check_apicv_controls(vcpu, vmcs12) || | |
2822 | nested_vmx_check_nmi_controls(vmcs12) || | |
2823 | nested_vmx_check_pml_controls(vcpu, vmcs12) || | |
2824 | nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || | |
2825 | nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || | |
2826 | nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || | |
5497b955 | 2827 | CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) |
461b4ba4 KS |
2828 | return -EINVAL; |
2829 | ||
bc441211 SC |
2830 | if (!nested_cpu_has_preemption_timer(vmcs12) && |
2831 | nested_cpu_has_save_preemption_timer(vmcs12)) | |
2832 | return -EINVAL; | |
2833 | ||
461b4ba4 | 2834 | if (nested_cpu_has_ept(vmcs12) && |
ac6389ab | 2835 | CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) |
461b4ba4 | 2836 | return -EINVAL; |
55d2375e SC |
2837 | |
2838 | if (nested_cpu_has_vmfunc(vmcs12)) { | |
5497b955 SC |
2839 | if (CC(vmcs12->vm_function_control & |
2840 | ~vmx->nested.msrs.vmfunc_controls)) | |
461b4ba4 | 2841 | return -EINVAL; |
55d2375e SC |
2842 | |
2843 | if (nested_cpu_has_eptp_switching(vmcs12)) { | |
5497b955 SC |
2844 | if (CC(!nested_cpu_has_ept(vmcs12)) || |
2845 | CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) | |
461b4ba4 | 2846 | return -EINVAL; |
55d2375e SC |
2847 | } |
2848 | } | |
2849 | ||
461b4ba4 KS |
2850 | return 0; |
2851 | } | |
2852 | ||
61446ba7 KS |
2853 | /* |
2854 | * Checks related to VM-Exit Control Fields | |
2855 | */ | |
2856 | static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, | |
2857 | struct vmcs12 *vmcs12) | |
2858 | { | |
2859 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
2860 | ||
5497b955 SC |
2861 | if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, |
2862 | vmx->nested.msrs.exit_ctls_low, | |
2863 | vmx->nested.msrs.exit_ctls_high)) || | |
2864 | CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) | |
61446ba7 KS |
2865 | return -EINVAL; |
2866 | ||
2867 | return 0; | |
2868 | } | |
2869 | ||
5fbf9634 KS |
2870 | /* |
2871 | * Checks related to VM-Entry Control Fields | |
2872 | */ | |
2873 | static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, | |
2874 | struct vmcs12 *vmcs12) | |
461b4ba4 KS |
2875 | { |
2876 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
55d2375e | 2877 | |
5497b955 SC |
2878 | if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, |
2879 | vmx->nested.msrs.entry_ctls_low, | |
2880 | vmx->nested.msrs.entry_ctls_high))) | |
5fbf9634 | 2881 | return -EINVAL; |
55d2375e SC |
2882 | |
2883 | /* | |
2884 | * From the Intel SDM, volume 3: | |
2885 | * Fields relevant to VM-entry event injection must be set properly. | |
2886 | * These fields are the VM-entry interruption-information field, the | |
2887 | * VM-entry exception error code, and the VM-entry instruction length. | |
2888 | */ | |
2889 | if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { | |
2890 | u32 intr_info = vmcs12->vm_entry_intr_info_field; | |
2891 | u8 vector = intr_info & INTR_INFO_VECTOR_MASK; | |
2892 | u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; | |
2893 | bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; | |
2894 | bool should_have_error_code; | |
2895 | bool urg = nested_cpu_has2(vmcs12, | |
2896 | SECONDARY_EXEC_UNRESTRICTED_GUEST); | |
2897 | bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; | |
2898 | ||
2899 | /* VM-entry interruption-info field: interruption type */ | |
5497b955 SC |
2900 | if (CC(intr_type == INTR_TYPE_RESERVED) || |
2901 | CC(intr_type == INTR_TYPE_OTHER_EVENT && | |
2902 | !nested_cpu_supports_monitor_trap_flag(vcpu))) | |
5fbf9634 | 2903 | return -EINVAL; |
55d2375e SC |
2904 | |
2905 | /* VM-entry interruption-info field: vector */ | |
5497b955 SC |
2906 | if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || |
2907 | CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || | |
2908 | CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) | |
5fbf9634 | 2909 | return -EINVAL; |
55d2375e SC |
2910 | |
2911 | /* VM-entry interruption-info field: deliver error code */ | |
2912 | should_have_error_code = | |
2913 | intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && | |
2914 | x86_exception_has_error_code(vector); | |
5497b955 | 2915 | if (CC(has_error_code != should_have_error_code)) |
5fbf9634 | 2916 | return -EINVAL; |
55d2375e SC |
2917 | |
2918 | /* VM-entry exception error code */ | |
5497b955 | 2919 | if (CC(has_error_code && |
567926cc | 2920 | vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) |
5fbf9634 | 2921 | return -EINVAL; |
55d2375e SC |
2922 | |
2923 | /* VM-entry interruption-info field: reserved bits */ | |
5497b955 | 2924 | if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) |
5fbf9634 | 2925 | return -EINVAL; |
55d2375e SC |
2926 | |
2927 | /* VM-entry instruction length */ | |
2928 | switch (intr_type) { | |
2929 | case INTR_TYPE_SOFT_EXCEPTION: | |
2930 | case INTR_TYPE_SOFT_INTR: | |
2931 | case INTR_TYPE_PRIV_SW_EXCEPTION: | |
5497b955 SC |
2932 | if (CC(vmcs12->vm_entry_instruction_len > 15) || |
2933 | CC(vmcs12->vm_entry_instruction_len == 0 && | |
2934 | !nested_cpu_has_zero_length_injection(vcpu))) | |
5fbf9634 | 2935 | return -EINVAL; |
55d2375e SC |
2936 | } |
2937 | } | |
2938 | ||
5fbf9634 KS |
2939 | if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) |
2940 | return -EINVAL; | |
2941 | ||
2942 | return 0; | |
2943 | } | |
2944 | ||
5478ba34 SC |
2945 | static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, |
2946 | struct vmcs12 *vmcs12) | |
2947 | { | |
2948 | if (nested_check_vm_execution_controls(vcpu, vmcs12) || | |
2949 | nested_check_vm_exit_controls(vcpu, vmcs12) || | |
2950 | nested_check_vm_entry_controls(vcpu, vmcs12)) | |
98d9e858 | 2951 | return -EINVAL; |
5478ba34 | 2952 | |
b4f69df0 | 2953 | #ifdef CONFIG_KVM_HYPERV |
85ab071a | 2954 | if (guest_cpuid_has_evmcs(vcpu)) |
a8350231 | 2955 | return nested_evmcs_check_controls(vmcs12); |
b4f69df0 | 2956 | #endif |
a8350231 | 2957 | |
5478ba34 SC |
2958 | return 0; |
2959 | } | |
2960 | ||
af957eeb ML |
2961 | static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, |
2962 | struct vmcs12 *vmcs12) | |
2963 | { | |
2964 | #ifdef CONFIG_X86_64 | |
2965 | if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != | |
2966 | !!(vcpu->arch.efer & EFER_LMA))) | |
2967 | return -EINVAL; | |
2968 | #endif | |
2969 | return 0; | |
2970 | } | |
2971 | ||
98d9e858 PB |
2972 | static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, |
2973 | struct vmcs12 *vmcs12) | |
5fbf9634 | 2974 | { |
77900bff | 2975 | bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); |
5fbf9634 | 2976 | |
5497b955 SC |
2977 | if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || |
2978 | CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || | |
2c49db45 | 2979 | CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) |
254b2f3b | 2980 | return -EINVAL; |
711eff3a | 2981 | |
5497b955 SC |
2982 | if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || |
2983 | CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) | |
711eff3a KS |
2984 | return -EINVAL; |
2985 | ||
f6b0db1f | 2986 | if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && |
5497b955 | 2987 | CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) |
f6b0db1f KS |
2988 | return -EINVAL; |
2989 | ||
c547cb6f OU |
2990 | if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && |
2991 | CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), | |
2992 | vmcs12->host_ia32_perf_global_ctrl))) | |
2993 | return -EINVAL; | |
2994 | ||
fd3edd4a | 2995 | if (ia32e) { |
af957eeb | 2996 | if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) |
fd3edd4a PB |
2997 | return -EINVAL; |
2998 | } else { | |
af957eeb | 2999 | if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || |
fd3edd4a PB |
3000 | CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || |
3001 | CC((vmcs12->host_rip) >> 32)) | |
3002 | return -EINVAL; | |
3003 | } | |
1ef23e1f | 3004 | |
5497b955 SC |
3005 | if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || |
3006 | CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || | |
3007 | CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || | |
3008 | CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || | |
3009 | CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || | |
3010 | CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || | |
3011 | CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || | |
3012 | CC(vmcs12->host_cs_selector == 0) || | |
3013 | CC(vmcs12->host_tr_selector == 0) || | |
3014 | CC(vmcs12->host_ss_selector == 0 && !ia32e)) | |
1ef23e1f KS |
3015 | return -EINVAL; |
3016 | ||
5497b955 SC |
3017 | if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || |
3018 | CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || | |
3019 | CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || | |
3020 | CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || | |
fd3edd4a PB |
3021 | CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || |
3022 | CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) | |
5845038c | 3023 | return -EINVAL; |
1ef23e1f | 3024 | |
5fbf9634 KS |
3025 | /* |
3026 | * If the load IA32_EFER VM-exit control is 1, bits reserved in the | |
3027 | * IA32_EFER MSR must be 0 in the field for that register. In addition, | |
3028 | * the values of the LMA and LME bits in the field must each be that of | |
3029 | * the host address-space size VM-exit control. | |
3030 | */ | |
3031 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { | |
5497b955 SC |
3032 | if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || |
3033 | CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || | |
3034 | CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) | |
254b2f3b | 3035 | return -EINVAL; |
5fbf9634 KS |
3036 | } |
3037 | ||
55d2375e SC |
3038 | return 0; |
3039 | } | |
3040 | ||
3041 | static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, | |
3042 | struct vmcs12 *vmcs12) | |
3043 | { | |
7d0172b3 DW |
3044 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3045 | struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; | |
3046 | struct vmcs_hdr hdr; | |
55d2375e | 3047 | |
64c78508 | 3048 | if (vmcs12->vmcs_link_pointer == INVALID_GPA) |
55d2375e SC |
3049 | return 0; |
3050 | ||
5497b955 | 3051 | if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) |
55d2375e SC |
3052 | return -EINVAL; |
3053 | ||
7d0172b3 DW |
3054 | if (ghc->gpa != vmcs12->vmcs_link_pointer && |
3055 | CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, | |
3056 | vmcs12->vmcs_link_pointer, VMCS12_SIZE))) | |
3057 | return -EINVAL; | |
55d2375e | 3058 | |
7d0172b3 DW |
3059 | if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, |
3060 | offsetof(struct vmcs12, hdr), | |
3061 | sizeof(hdr)))) | |
3062 | return -EINVAL; | |
88925305 | 3063 | |
7d0172b3 DW |
3064 | if (CC(hdr.revision_id != VMCS12_REVISION) || |
3065 | CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) | |
3066 | return -EINVAL; | |
88925305 | 3067 | |
7d0172b3 | 3068 | return 0; |
55d2375e SC |
3069 | } |
3070 | ||
9c3e922b SC |
3071 | /* |
3072 | * Checks related to Guest Non-register State | |
3073 | */ | |
3074 | static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) | |
3075 | { | |
5497b955 | 3076 | if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && |
bf0cd88c YQ |
3077 | vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && |
3078 | vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) | |
9c3e922b SC |
3079 | return -EINVAL; |
3080 | ||
3081 | return 0; | |
3082 | } | |
3083 | ||
5478ba34 SC |
3084 | static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, |
3085 | struct vmcs12 *vmcs12, | |
68cda40d | 3086 | enum vm_entry_failure_code *entry_failure_code) |
55d2375e | 3087 | { |
112e6601 | 3088 | bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE); |
55d2375e | 3089 | |
68cda40d | 3090 | *entry_failure_code = ENTRY_FAIL_DEFAULT; |
55d2375e | 3091 | |
5497b955 SC |
3092 | if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || |
3093 | CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) | |
c80add0f | 3094 | return -EINVAL; |
55d2375e | 3095 | |
b91991bf KS |
3096 | if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && |
3097 | CC(!kvm_dr7_valid(vmcs12->guest_dr7))) | |
3098 | return -EINVAL; | |
3099 | ||
de2bc2bf | 3100 | if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && |
5497b955 | 3101 | CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) |
c80add0f | 3102 | return -EINVAL; |
55d2375e SC |
3103 | |
3104 | if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { | |
68cda40d | 3105 | *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; |
c80add0f | 3106 | return -EINVAL; |
55d2375e SC |
3107 | } |
3108 | ||
bfc6ad6a OU |
3109 | if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && |
3110 | CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), | |
3111 | vmcs12->guest_ia32_perf_global_ctrl))) | |
3112 | return -EINVAL; | |
3113 | ||
112e6601 PB |
3114 | if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG)) |
3115 | return -EINVAL; | |
3116 | ||
3117 | if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) || | |
3118 | CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG))) | |
3119 | return -EINVAL; | |
3120 | ||
55d2375e SC |
3121 | /* |
3122 | * If the load IA32_EFER VM-entry control is 1, the following checks | |
3123 | * are performed on the field for the IA32_EFER MSR: | |
3124 | * - Bits reserved in the IA32_EFER MSR must be 0. | |
3125 | * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of | |
3126 | * the IA-32e mode guest VM-exit control. It must also be identical | |
3127 | * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to | |
3128 | * CR0.PG) is 1. | |
3129 | */ | |
3130 | if (to_vmx(vcpu)->nested.nested_run_pending && | |
3131 | (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { | |
5497b955 SC |
3132 | if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || |
3133 | CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || | |
3134 | CC(((vmcs12->guest_cr0 & X86_CR0_PG) && | |
3135 | ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) | |
c80add0f | 3136 | return -EINVAL; |
55d2375e SC |
3137 | } |
3138 | ||
3139 | if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && | |
5497b955 SC |
3140 | (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || |
3141 | CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) | |
c80add0f | 3142 | return -EINVAL; |
55d2375e | 3143 | |
9c3e922b | 3144 | if (nested_check_guest_non_reg_state(vmcs12)) |
c80add0f | 3145 | return -EINVAL; |
55d2375e SC |
3146 | |
3147 | return 0; | |
3148 | } | |
3149 | ||
453eafbe | 3150 | static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) |
55d2375e SC |
3151 | { |
3152 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
1a715810 | 3153 | unsigned long cr3, cr4; |
f1727b49 | 3154 | bool vm_fail; |
55d2375e SC |
3155 | |
3156 | if (!nested_early_check) | |
3157 | return 0; | |
3158 | ||
3159 | if (vmx->msr_autoload.host.nr) | |
3160 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); | |
3161 | if (vmx->msr_autoload.guest.nr) | |
3162 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); | |
3163 | ||
3164 | preempt_disable(); | |
3165 | ||
3166 | vmx_prepare_switch_to_guest(vcpu); | |
3167 | ||
3168 | /* | |
3169 | * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, | |
3170 | * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to | |
49f933d4 | 3171 | * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. |
55d2375e SC |
3172 | * there is no need to preserve other bits or save/restore the field. |
3173 | */ | |
3174 | vmcs_writel(GUEST_RFLAGS, 0); | |
3175 | ||
1a715810 SC |
3176 | cr3 = __get_current_cr3_fast(); |
3177 | if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { | |
3178 | vmcs_writel(HOST_CR3, cr3); | |
3179 | vmx->loaded_vmcs->host_state.cr3 = cr3; | |
3180 | } | |
3181 | ||
55d2375e SC |
3182 | cr4 = cr4_read_shadow(); |
3183 | if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { | |
3184 | vmcs_writel(HOST_CR4, cr4); | |
3185 | vmx->loaded_vmcs->host_state.cr4 = cr4; | |
3186 | } | |
3187 | ||
150f17bf | 3188 | vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, |
bb066506 | 3189 | __vmx_vcpu_run_flags(vmx)); |
55d2375e | 3190 | |
55d2375e SC |
3191 | if (vmx->msr_autoload.host.nr) |
3192 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); | |
3193 | if (vmx->msr_autoload.guest.nr) | |
3194 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); | |
3195 | ||
f1727b49 | 3196 | if (vm_fail) { |
380e0055 SC |
3197 | u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); |
3198 | ||
541e886f | 3199 | preempt_enable(); |
380e0055 SC |
3200 | |
3201 | trace_kvm_nested_vmenter_failed( | |
3202 | "early hardware check VM-instruction error: ", error); | |
3203 | WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); | |
55d2375e SC |
3204 | return 1; |
3205 | } | |
3206 | ||
3207 | /* | |
3208 | * VMExit clears RFLAGS.IF and DR7, even on a consistency check. | |
3209 | */ | |
55d2375e SC |
3210 | if (hw_breakpoint_active()) |
3211 | set_debugreg(__this_cpu_read(cpu_dr7), 7); | |
84b6a349 | 3212 | local_irq_enable(); |
541e886f | 3213 | preempt_enable(); |
55d2375e SC |
3214 | |
3215 | /* | |
3216 | * A non-failing VMEntry means we somehow entered guest mode with | |
3217 | * an illegal RIP, and that's just the tip of the iceberg. There | |
3218 | * is no telling what memory has been modified or what state has | |
3219 | * been exposed to unknown code. Hitting this all but guarantees | |
3220 | * a (very critical) hardware issue. | |
3221 | */ | |
3222 | WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & | |
3223 | VMX_EXIT_REASONS_FAILED_VMENTRY)); | |
3224 | ||
3225 | return 0; | |
3226 | } | |
55d2375e | 3227 | |
b4f69df0 | 3228 | #ifdef CONFIG_KVM_HYPERV |
9a78e158 | 3229 | static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) |
55d2375e | 3230 | { |
55d2375e | 3231 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
55d2375e | 3232 | |
e942dbf8 VK |
3233 | /* |
3234 | * hv_evmcs may end up being not mapped after migration (when | |
3235 | * L2 was running), map it here to make sure vmcs12 changes are | |
3236 | * properly reflected. | |
3237 | */ | |
85ab071a | 3238 | if (guest_cpuid_has_evmcs(vcpu) && |
27849968 | 3239 | vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { |
b6a0653a VK |
3240 | enum nested_evmptrld_status evmptrld_status = |
3241 | nested_vmx_handle_enlightened_vmptrld(vcpu, false); | |
3242 | ||
3243 | if (evmptrld_status == EVMPTRLD_VMFAIL || | |
f5c7e842 | 3244 | evmptrld_status == EVMPTRLD_ERROR) |
b6a0653a | 3245 | return false; |
8629b625 VK |
3246 | |
3247 | /* | |
3248 | * Post migration, VMCS12 always provides the most up-to-date | |
3249 | * information; copy it to eVMCS upon entry. | |
3250 | */ | |
3251 | vmx->nested.need_vmcs12_to_shadow_sync = true; | |
b6a0653a | 3252 | } |
e942dbf8 | 3253 | |
9a78e158 PB |
3254 | return true; |
3255 | } | |
b4f69df0 | 3256 | #endif |
9a78e158 PB |
3257 | |
3258 | static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) | |
3259 | { | |
3260 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | |
3261 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
3262 | struct kvm_host_map *map; | |
9a78e158 | 3263 | |
158a48ec ML |
3264 | if (!vcpu->arch.pdptrs_from_userspace && |
3265 | !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { | |
0f857223 ML |
3266 | /* |
3267 | * Reload the guest's PDPTRs since after a migration | |
3268 | * the guest CR3 might be restored prior to setting the nested | |
3269 | * state which can lead to a load of wrong PDPTRs. | |
3270 | */ | |
2df4a5eb | 3271 | if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) |
0f857223 ML |
3272 | return false; |
3273 | } | |
3274 | ||
3275 | ||
55d2375e | 3276 | if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { |
fe1911aa SC |
3277 | map = &vmx->nested.apic_access_page_map; |
3278 | ||
3279 | if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) { | |
3280 | vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn)); | |
55d2375e | 3281 | } else { |
fe1911aa | 3282 | pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n", |
671ddc70 JM |
3283 | __func__); |
3284 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; | |
3285 | vcpu->run->internal.suberror = | |
3286 | KVM_INTERNAL_ERROR_EMULATION; | |
3287 | vcpu->run->internal.ndata = 0; | |
3288 | return false; | |
55d2375e SC |
3289 | } |
3290 | } | |
3291 | ||
3292 | if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { | |
96c66e87 | 3293 | map = &vmx->nested.virtual_apic_map; |
55d2375e | 3294 | |
96c66e87 KA |
3295 | if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { |
3296 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); | |
69090810 PB |
3297 | } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && |
3298 | nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && | |
3299 | !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { | |
3300 | /* | |
3301 | * The processor will never use the TPR shadow; simply | |
3302 | * clear the bit from the execution control. Such a | |
3303 | * configuration is useless, but it happens in tests. | |
3304 | * For any other configuration, failing the vm entry is | |
3305 | * _not_ what the processor does but it's basically the | |
3306 | * only possibility we have. | |
3307 | */ | |
2183f564 | 3308 | exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); |
69090810 | 3309 | } else { |
ca2f5466 SC |
3310 | /* |
3311 | * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to | |
3312 | * force VM-Entry to fail. | |
3313 | */ | |
64c78508 | 3314 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); |
55d2375e SC |
3315 | } |
3316 | } | |
3317 | ||
3318 | if (nested_cpu_has_posted_intr(vmcs12)) { | |
3278e049 KA |
3319 | map = &vmx->nested.pi_desc_map; |
3320 | ||
3321 | if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { | |
3322 | vmx->nested.pi_desc = | |
3323 | (struct pi_desc *)(((void *)map->hva) + | |
3324 | offset_in_page(vmcs12->posted_intr_desc_addr)); | |
3325 | vmcs_write64(POSTED_INTR_DESC_ADDR, | |
3326 | pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); | |
966eefb8 JM |
3327 | } else { |
3328 | /* | |
3329 | * Defer the KVM_INTERNAL_EXIT until KVM tries to | |
3330 | * access the contents of the VMCS12 posted interrupt | |
3331 | * descriptor. (Note that KVM may do this when it | |
3332 | * should not, per the architectural specification.) | |
3333 | */ | |
3334 | vmx->nested.pi_desc = NULL; | |
3335 | pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); | |
55d2375e | 3336 | } |
55d2375e SC |
3337 | } |
3338 | if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) | |
2183f564 | 3339 | exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); |
55d2375e | 3340 | else |
2183f564 | 3341 | exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); |
9a78e158 PB |
3342 | |
3343 | return true; | |
3344 | } | |
3345 | ||
3346 | static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) | |
3347 | { | |
b4f69df0 | 3348 | #ifdef CONFIG_KVM_HYPERV |
c30e9bc8 VK |
3349 | /* |
3350 | * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy | |
3351 | * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory | |
3352 | * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post | |
3353 | * migration. | |
3354 | */ | |
f5c7e842 VK |
3355 | if (!nested_get_evmcs_page(vcpu)) { |
3356 | pr_debug_ratelimited("%s: enlightened vmptrld failed\n", | |
3357 | __func__); | |
3358 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; | |
3359 | vcpu->run->internal.suberror = | |
3360 | KVM_INTERNAL_ERROR_EMULATION; | |
3361 | vcpu->run->internal.ndata = 0; | |
3362 | ||
9a78e158 | 3363 | return false; |
f5c7e842 | 3364 | } |
b4f69df0 | 3365 | #endif |
9a78e158 PB |
3366 | |
3367 | if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) | |
3368 | return false; | |
3369 | ||
671ddc70 | 3370 | return true; |
55d2375e SC |
3371 | } |
3372 | ||
02f5fb2e SC |
3373 | static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) |
3374 | { | |
3375 | struct vmcs12 *vmcs12; | |
3376 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
3377 | gpa_t dst; | |
3378 | ||
3379 | if (WARN_ON_ONCE(!is_guest_mode(vcpu))) | |
3380 | return 0; | |
3381 | ||
3382 | if (WARN_ON_ONCE(vmx->nested.pml_full)) | |
3383 | return 1; | |
3384 | ||
3385 | /* | |
3386 | * Check if PML is enabled for the nested guest. Whether eptp bit 6 is | |
3387 | * set is already checked as part of A/D emulation. | |
3388 | */ | |
3389 | vmcs12 = get_vmcs12(vcpu); | |
3390 | if (!nested_cpu_has_pml(vmcs12)) | |
3391 | return 0; | |
3392 | ||
3393 | if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { | |
3394 | vmx->nested.pml_full = true; | |
3395 | return 1; | |
3396 | } | |
3397 | ||
3398 | gpa &= ~0xFFFull; | |
3399 | dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; | |
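/*
 * The PML log fills from the top down: guest_pml_index starts at
 * PML_ENTITY_NUM - 1 and is decremented per logged GPA, so the u16
 * index wrapping past zero is what trips the "buffer full" check above.
 */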
3400 | ||
3401 | if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, | |
3402 | offset_in_page(dst), sizeof(gpa))) | |
3403 | return 0; | |
3404 | ||
3405 | vmcs12->guest_pml_index--; | |
3406 | ||
3407 | return 0; | |
3408 | } | |
3409 | ||
55d2375e SC |
3410 | /* |
3411 | * Intel's VMX Instruction Reference specifies a common set of prerequisites | |
3412 | * for running VMX instructions (except VMXON, whose prerequisites are | |
3413 | * slightly different). It also specifies what exception to inject otherwise. | |
3414 | * Note that many of these exceptions have priority over VM exits, so they | |
3415 | * don't have to be checked again here. | |
3416 | */ | |
3417 | static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) | |
3418 | { | |
3419 | if (!to_vmx(vcpu)->nested.vmxon) { | |
3420 | kvm_queue_exception(vcpu, UD_VECTOR); | |
3421 | return 0; | |
3422 | } | |
3423 | ||
3424 | if (vmx_get_cpl(vcpu)) { | |
3425 | kvm_inject_gp(vcpu, 0); | |
3426 | return 0; | |
3427 | } | |
3428 | ||
3429 | return 1; | |
3430 | } | |
3431 | ||
3432 | static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) | |
3433 | { | |
3434 | u8 rvi = vmx_get_rvi(); | |
3435 | u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); | |
3436 | ||
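/*
 * An interrupt is deliverable only if its priority class (vector bits
 * 7:4) is strictly greater than PPR's class, so comparing the high
 * nibbles of RVI and VPPR suffices.
 */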
3437 | return ((rvi & 0xf0) > (vppr & 0xf0)); | |
3438 | } | |
3439 | ||
3440 | static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, | |
3441 | struct vmcs12 *vmcs12); | |
3442 | ||
3443 | /* | |
3444 | * If from_vmentry is false, this is being called from state restore (either RSM | |
3445 | * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. | |
671ddc70 JM |
3446 | * |
3447 | * Returns: | |
463bfeee ML |
3448 | * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode |
3449 | * NVMX_VMENTRY_VMFAIL: Consistency check VMFail | |
3450 | * NVMX_VMENTRY_VMEXIT: Consistency check VMExit | |
3451 | * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error | |
55d2375e | 3452 | */ |
671ddc70 JM |
3453 | enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, |
3454 | bool from_vmentry) | |
55d2375e SC |
3455 | { |
3456 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
3457 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | |
68cda40d | 3458 | enum vm_entry_failure_code entry_failure_code; |
55d2375e | 3459 | bool evaluate_pending_interrupts; |
8e533240 SC |
3460 | union vmx_exit_reason exit_reason = { |
3461 | .basic = EXIT_REASON_INVALID_STATE, | |
3462 | .failed_vmentry = 1, | |
3463 | }; | |
3464 | u32 failed_index; | |
55d2375e | 3465 | |
37ef0be2 DM |
3466 | trace_kvm_nested_vmenter(kvm_rip_read(vcpu), |
3467 | vmx->nested.current_vmptr, | |
3468 | vmcs12->guest_rip, | |
3469 | vmcs12->guest_intr_status, | |
3470 | vmcs12->vm_entry_intr_info_field, | |
3471 | vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT, | |
02dfc44f MZ |
3472 | vmcs12->ept_pointer, |
3473 | vmcs12->guest_cr3, | |
37ef0be2 DM |
3474 | KVM_ISA_VMX); |
3475 | ||
40e5f908 | 3476 | kvm_service_local_tlb_flush_requests(vcpu); |
eeeb4f67 | 3477 | |
2183f564 | 3478 | evaluate_pending_interrupts = exec_controls_get(vmx) & |
4e2a0bc5 | 3479 | (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); |
55d2375e SC |
3480 | if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) |
3481 | evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); | |
a56953e9 SC |
3482 | if (!evaluate_pending_interrupts) |
3483 | evaluate_pending_interrupts |= kvm_apic_has_pending_init_or_sipi(vcpu); | |
55d2375e | 3484 | |
764643a6 SC |
3485 | if (!vmx->nested.nested_run_pending || |
3486 | !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) | |
5d76b1f8 | 3487 | vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); |
55d2375e | 3488 | if (kvm_mpx_supported() && |
fa578398 SC |
3489 | (!vmx->nested.nested_run_pending || |
3490 | !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) | |
5d76b1f8 | 3491 | vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); |
55d2375e | 3492 | |
f087a029 SC |
3493 | /* |
3494 | * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* | |
3495 | * nested early checks are disabled. In the event of a "late" VM-Fail, | |
3496 | * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its | |
3497 | * software model to the pre-VMEntry host state. When EPT is disabled, | |
3498 | * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes | |
3499 | * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing | |
3500 | * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to | |
3501 | * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested | |
3502 | * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is | |
3503 | * guaranteed to be overwritten with a shadow CR3 prior to re-entering | |
3504 | * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as | |
3505 | * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks | |
3506 | * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail | |
3507 | * path would need to manually save/restore vmcs01.GUEST_CR3. | |
3508 | */ | |
3509 | if (!enable_ept && !nested_early_check) | |
3510 | vmcs_writel(GUEST_CR3, vcpu->arch.cr3); | |
3511 | ||
55d2375e SC |
3512 | vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); |
3513 | ||
389ab252 | 3514 | prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); |
55d2375e SC |
3515 | |
3516 | if (from_vmentry) { | |
b89d5ad0 SC |
3517 | if (unlikely(!nested_get_vmcs12_pages(vcpu))) { |
3518 | vmx_switch_vmcs(vcpu, &vmx->vmcs01); | |
671ddc70 | 3519 | return NVMX_VMENTRY_KVM_INTERNAL_ERROR; |
b89d5ad0 | 3520 | } |
55d2375e SC |
3521 | |
3522 | if (nested_vmx_check_vmentry_hw(vcpu)) { | |
3523 | vmx_switch_vmcs(vcpu, &vmx->vmcs01); | |
671ddc70 | 3524 | return NVMX_VMENTRY_VMFAIL; |
55d2375e SC |
3525 | } |
3526 | ||
68cda40d SC |
3527 | if (nested_vmx_check_guest_state(vcpu, vmcs12, |
3528 | &entry_failure_code)) { | |
8e533240 | 3529 | exit_reason.basic = EXIT_REASON_INVALID_STATE; |
68cda40d | 3530 | vmcs12->exit_qualification = entry_failure_code; |
55d2375e | 3531 | goto vmentry_fail_vmexit; |
68cda40d | 3532 | } |
55d2375e SC |
3533 | } |
3534 | ||
3535 | enter_guest_mode(vcpu); | |
55d2375e | 3536 | |
0f857223 | 3537 | if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) { |
8e533240 | 3538 | exit_reason.basic = EXIT_REASON_INVALID_STATE; |
68cda40d | 3539 | vmcs12->exit_qualification = entry_failure_code; |
55d2375e | 3540 | goto vmentry_fail_vmexit_guest_mode; |
68cda40d | 3541 | } |
55d2375e SC |
3542 | |
3543 | if (from_vmentry) { | |
68cda40d SC |
3544 | failed_index = nested_vmx_load_msr(vcpu, |
3545 | vmcs12->vm_entry_msr_load_addr, | |
3546 | vmcs12->vm_entry_msr_load_count); | |
3547 | if (failed_index) { | |
8e533240 | 3548 | exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; |
68cda40d | 3549 | vmcs12->exit_qualification = failed_index; |
55d2375e | 3550 | goto vmentry_fail_vmexit_guest_mode; |
68cda40d | 3551 | } |
55d2375e SC |
3552 | } else { |
3553 | /* | |
3554 | * The MMU is not initialized to point at the right entities yet and | |
3555 | * "get pages" would need to read data from the guest (i.e. we will | |
3556 | * need to perform gpa to hpa translation). Request a call | |
3557 | * to nested_get_vmcs12_pages before the next VM-entry. The MSRs | |
3558 | * have already been set at vmentry time and should not be reset. | |
3559 | */ | |
729c15c2 | 3560 | kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); |
55d2375e SC |
3561 | } |
3562 | ||
3563 | /* | |
a56953e9 SC |
3564 | * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI |
3565 | * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can | |
3566 | * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit | |
3567 | * unconditionally. | |
55d2375e SC |
3568 | */ |
3569 | if (unlikely(evaluate_pending_interrupts)) | |
3570 | kvm_make_request(KVM_REQ_EVENT, vcpu); | |
3571 | ||
359a6c3d PB |
3572 | /* |
3573 | * Do not start the preemption timer hrtimer until after we know | |
3574 | * we are successful, so that only nested_vmx_vmexit needs to cancel | |
3575 | * the timer. | |
3576 | */ | |
3577 | vmx->nested.preemption_timer_expired = false; | |
850448f3 PS |
3578 | if (nested_cpu_has_preemption_timer(vmcs12)) { |
3579 | u64 timer_value = vmx_calc_preemption_timer_value(vcpu); | |
3580 | vmx_start_preemption_timer(vcpu, timer_value); | |
3581 | } | |
359a6c3d | 3582 | |
55d2375e SC |
3583 | /* |
3584 | * Note no nested_vmx_succeed or nested_vmx_fail here. At this point | |
3585 | * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet | |
3586 | * returned as far as L1 is concerned. It will only return (and set | |
3587 | * the success flag) when L2 exits (see nested_vmx_vmexit()). | |
3588 | */ | |
671ddc70 | 3589 | return NVMX_VMENTRY_SUCCESS; |
55d2375e SC |
3590 | |
3591 | /* | |
3592 | * A failed consistency check that leads to a VMExit during L1's | |
3593 | * VMEnter to L2 is a variation of a normal VMexit, as explained in | |
3594 | * 26.7 "VM-entry failures during or after loading guest state". | |
3595 | */ | |
3596 | vmentry_fail_vmexit_guest_mode: | |
5e3d394f | 3597 | if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) |
55d2375e SC |
3598 | vcpu->arch.tsc_offset -= vmcs12->tsc_offset; |
3599 | leave_guest_mode(vcpu); | |
3600 | ||
3601 | vmentry_fail_vmexit: | |
3602 | vmx_switch_vmcs(vcpu, &vmx->vmcs01); | |
3603 | ||
3604 | if (!from_vmentry) | |
671ddc70 | 3605 | return NVMX_VMENTRY_VMEXIT; |
55d2375e SC |
3606 | |
3607 | load_vmcs12_host_state(vcpu, vmcs12); | |
8e533240 | 3608 | vmcs12->vm_exit_reason = exit_reason.full; |
453e42b0 | 3609 | if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)) |
3731905e | 3610 | vmx->nested.need_vmcs12_to_shadow_sync = true; |
671ddc70 | 3611 | return NVMX_VMENTRY_VMEXIT; |
55d2375e SC |
3612 | } |
3613 | ||
3614 | /* | |
3615 | * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 | |
3616 | * for running an L2 nested guest. | |
3617 | */ | |
3618 | static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) | |
3619 | { | |
3620 | struct vmcs12 *vmcs12; | |
671ddc70 | 3621 | enum nvmx_vmentry_status status; |
55d2375e SC |
3622 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3623 | u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); | |
b6a0653a | 3624 | enum nested_evmptrld_status evmptrld_status; |
55d2375e SC |
3625 | |
3626 | if (!nested_vmx_check_permission(vcpu)) | |
3627 | return 1; | |
3628 | ||
b6a0653a VK |
3629 | evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); |
3630 | if (evmptrld_status == EVMPTRLD_ERROR) { | |
3631 | kvm_queue_exception(vcpu, UD_VECTOR); | |
55d2375e | 3632 | return 1; |
b6a0653a | 3633 | } |
55d2375e | 3634 | |
f19063b1 | 3635 | kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); |
018d70ff EH |
3636 | |
3637 | if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) | |
3638 | return nested_vmx_failInvalid(vcpu); | |
3639 | ||
453e42b0 | 3640 | if (CC(!nested_vmx_is_evmptr12_valid(vmx) && |
64c78508 | 3641 | vmx->nested.current_vmptr == INVALID_GPA)) |
55d2375e SC |
3642 | return nested_vmx_failInvalid(vcpu); |
3643 | ||
3644 | vmcs12 = get_vmcs12(vcpu); | |
3645 | ||
3646 | /* | |
3647 | * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact | |
3648 | * that there *is* a valid VMCS pointer, RFLAGS.CF is set | |
3649 | * rather than RFLAGS.ZF, and no error number is stored to the | |
3650 | * VM-instruction error field. | |
3651 | */ | |
fc595f35 | 3652 | if (CC(vmcs12->hdr.shadow_vmcs)) |
55d2375e SC |
3653 | return nested_vmx_failInvalid(vcpu); |
3654 | ||
453e42b0 | 3655 | if (nested_vmx_is_evmptr12_valid(vmx)) { |
c98842b2 VK |
3656 | struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); |
3657 | ||
3658 | copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields); | |
55d2375e SC |
3659 | /* Enlightened VMCS doesn't have launch state */ |
3660 | vmcs12->launch_state = !launch; | |
3661 | } else if (enable_shadow_vmcs) { | |
3662 | copy_shadow_to_vmcs12(vmx); | |
3663 | } | |
3664 | ||
3665 | /* | |
3666 | * The nested entry process starts with enforcing various prerequisites | |
3667 | * on vmcs12 as required by the Intel SDM, acting appropriately when | |
3668 | * they fail: As the SDM explains, some conditions should cause the | |
3669 | * instruction to fail, while others will cause the instruction to seem | |
3670 | * to succeed, but return an EXIT_REASON_INVALID_STATE. | |
3671 | * To speed up the normal (success) code path, we should avoid checking | |
3672 | * for misconfigurations which will anyway be caught by the processor | |
3673 | * when using the merged vmcs02. | |
3674 | */ | |
fc595f35 | 3675 | if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) |
b2656e4d | 3676 | return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); |
55d2375e | 3677 | |
fc595f35 | 3678 | if (CC(vmcs12->launch_state == launch)) |
b2656e4d | 3679 | return nested_vmx_fail(vcpu, |
55d2375e SC |
3680 | launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS |
3681 | : VMXERR_VMRESUME_NONLAUNCHED_VMCS); | |
3682 | ||
98d9e858 | 3683 | if (nested_vmx_check_controls(vcpu, vmcs12)) |
b2656e4d | 3684 | return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); |
5478ba34 | 3685 | |
af957eeb ML |
3686 | if (nested_vmx_check_address_space_size(vcpu, vmcs12)) |
3687 | return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); | |
3688 | ||
98d9e858 | 3689 | if (nested_vmx_check_host_state(vcpu, vmcs12)) |
b2656e4d | 3690 | return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); |
55d2375e SC |
3691 | |
3692 | /* | |
3693 | * We're finally done with prerequisite checking, and can start with | |
3694 | * the nested entry. | |
3695 | */ | |
3696 | vmx->nested.nested_run_pending = 1; | |
850448f3 | 3697 | vmx->nested.has_preemption_timer_deadline = false; |
671ddc70 JM |
3698 | status = nested_vmx_enter_non_root_mode(vcpu, true); |
3699 | if (unlikely(status != NVMX_VMENTRY_SUCCESS)) | |
3700 | goto vmentry_failed; | |
55d2375e | 3701 | |
25bb2cf9 SC |
3702 | /* Emulate processing of posted interrupts on VM-Enter. */ |
3703 | if (nested_cpu_has_posted_intr(vmcs12) && | |
3704 | kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { | |
3705 | vmx->nested.pi_pending = true; | |
3706 | kvm_make_request(KVM_REQ_EVENT, vcpu); | |
3707 | kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv); | |
3708 | } | |
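/*
 * I.e. consume L1's pending notification vector and instead process the
 * vmcs12 posted-interrupt descriptor, roughly as the CPU would have done
 * had the notification arrived in VMX non-root mode;
 * vmx_complete_nested_posted_interrupt() finishes the PIR->vIRR sync.
 */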
3709 | ||
55d2375e SC |
3710 | /* Hide L1D cache contents from the nested guest. */ |
3711 | vmx->vcpu.arch.l1tf_flush_l1d = true; | |
3712 | ||
3713 | /* | |
3714 | * Must happen outside of nested_vmx_enter_non_root_mode() as it will | |
3715 | * also be used as part of restoring nVMX state for | |
3716 | * snapshot restore (migration). | |
3717 | * | |
3718 | * In this flow, it is assumed that the vmcs12 cache was | |
163b0991 | 3719 | * transferred as part of the captured nVMX state and should | |
55d2375e SC |
3720 | * therefore not be read from guest memory (which may not | |
3721 | * exist on the destination host yet). | |
3722 | */ | |
3723 | nested_cache_shadow_vmcs12(vcpu, vmcs12); | |
3724 | ||
bf0cd88c YQ |
3725 | switch (vmcs12->guest_activity_state) { |
3726 | case GUEST_ACTIVITY_HLT: | |
3727 | /* | |
3728 | * If we're entering a halted L2 vcpu and the L2 vcpu won't be | |
3729 | * awakened by event injection or by an NMI-window VM-exit or | |
3730 | * by an interrupt-window VM-exit, halt the vcpu. | |
3731 | */ | |
3732 | if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && | |
3733 | !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && | |
3734 | !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && | |
3735 | (vmcs12->guest_rflags & X86_EFLAGS_IF))) { | |
3736 | vmx->nested.nested_run_pending = 0; | |
1460179d | 3737 | return kvm_emulate_halt_noskip(vcpu); |
bf0cd88c YQ |
3738 | } |
3739 | break; | |
3740 | case GUEST_ACTIVITY_WAIT_SIPI: | |
55d2375e | 3741 | vmx->nested.nested_run_pending = 0; |
bf0cd88c YQ |
3742 | vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; |
3743 | break; | |
3744 | default: | |
3745 | break; | |
55d2375e | 3746 | } |
bf0cd88c | 3747 | |
55d2375e | 3748 | return 1; |
671ddc70 JM |
3749 | |
3750 | vmentry_failed: | |
3751 | vmx->nested.nested_run_pending = 0; | |
3752 | if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) | |
3753 | return 0; | |
3754 | if (status == NVMX_VMENTRY_VMEXIT) | |
3755 | return 1; | |
3756 | WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); | |
b2656e4d | 3757 | return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); |
55d2375e SC |
3758 | } |
3759 | ||
3760 | /* | |
3761 | * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date | |
67b0ae43 | 3762 | * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). |
55d2375e SC |
3763 | * This function returns the new value we should put in vmcs12.guest_cr0. |
3764 | * It's not enough to just return the vmcs02 GUEST_CR0. Rather, | |
3765 | * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now | |
3766 | * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 | |
3767 | * didn't trap the bit, because if L1 did, so would L0). | |
3768 | * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have | |
3769 | * been modified by L2, and L1 knows it. So just leave the old value of | |
3770 | * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 | |
3771 | * isn't relevant, because if L0 traps this bit it can set it to anything. | |
3772 | * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have | |
3773 | * changed these bits, and therefore they need to be updated, but L0 | |
3774 | * didn't necessarily allow them to be changed in GUEST_CR0 - and rather | |
3775 | * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. | |
3776 | */ | |
3777 | static inline unsigned long | |
3778 | vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | |
3779 | { | |
3780 | return | |
3781 | /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | | |
3782 | /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | | |
3783 | /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | | |
3784 | vcpu->arch.cr0_guest_owned_bits)); | |
3785 | } | |
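/*
 * Worked example for a single bit, e.g. CR0.TS: if neither L0 nor L1
 * owns it (case 1), L2's writes landed directly in vmcs02 GUEST_CR0, so
 * take it from there.  If L1 owns it (case 2), L2 could never have
 * changed it, so vmcs12.guest_cr0 is still authoritative.  If only L0
 * owns it (case 3), L2's attempted writes were reflected into vmcs02's
 * CR0_READ_SHADOW, which is therefore the value L1 expects to observe.
 */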
3786 | ||
3787 | static inline unsigned long | |
3788 | vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | |
3789 | { | |
3790 | return | |
3791 | /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | | |
3792 | /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | | |
3793 | /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | | |
3794 | vcpu->arch.cr4_guest_owned_bits)); | |
3795 | } | |
3796 | ||
3797 | static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, | |
9bd1f0ef SC |
3798 | struct vmcs12 *vmcs12, |
3799 | u32 vm_exit_reason, u32 exit_intr_info) | |
55d2375e SC |
3800 | { |
3801 | u32 idt_vectoring; | |
3802 | unsigned int nr; | |
3803 | ||
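/*
 * The idt_vectoring_info_field built below mirrors the VM-entry
 * interruption-information format: bits 7:0 vector, bits 10:8 type,
 * bit 11 "deliver error code", bit 31 valid.
 */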
9bd1f0ef SC |
3804 | /* |
3805 | * Per the SDM, VM-Exits due to double and triple faults are never | |
3806 | * considered to occur during event delivery, even if the double/triple | |
3807 | * fault is the result of an escalating vectoring issue. | |
3808 | * | |
3809 | * Note, the SDM qualifies the double fault behavior with "The original | |
3810 | * event results in a double-fault exception". It's unclear why the | |
3811 | * qualification exists since exits due to double fault can occur only | |
3812 | * while vectoring a different exception (injected events are never | |
3813 | * subject to interception), i.e. there's _always_ an original event. | |
3814 | * | |
3815 | * The SDM also uses NMI as a confusing example for the "original event | |
3816 | * causes the VM exit directly" clause. NMI isn't special in any way, | |
3817 | * the same rule applies to all events that cause an exit directly. | |
3818 | * NMI is an odd choice for the example because NMIs can only occur on | |
3819 | * instruction boundaries, i.e. they _can't_ occur during vectoring. | |
3820 | */ | |
3821 | if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT || | |
3822 | ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI && | |
3823 | is_double_fault(exit_intr_info))) { | |
3824 | vmcs12->idt_vectoring_info_field = 0; | |
3825 | } else if (vcpu->arch.exception.injected) { | |
d4963e31 | 3826 | nr = vcpu->arch.exception.vector; |
55d2375e SC |
3827 | idt_vectoring = nr | VECTORING_INFO_VALID_MASK; |
3828 | ||
3829 | if (kvm_exception_is_soft(nr)) { | |
3830 | vmcs12->vm_exit_instruction_len = | |
3831 | vcpu->arch.event_exit_inst_len; | |
3832 | idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; | |
3833 | } else | |
3834 | idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; | |
3835 | ||
3836 | if (vcpu->arch.exception.has_error_code) { | |
3837 | idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; | |
3838 | vmcs12->idt_vectoring_error_code = | |
3839 | vcpu->arch.exception.error_code; | |
3840 | } | |
3841 | ||
3842 | vmcs12->idt_vectoring_info_field = idt_vectoring; | |
3843 | } else if (vcpu->arch.nmi_injected) { | |
3844 | vmcs12->idt_vectoring_info_field = | |
3845 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; | |
3846 | } else if (vcpu->arch.interrupt.injected) { | |
3847 | nr = vcpu->arch.interrupt.nr; | |
3848 | idt_vectoring = nr | VECTORING_INFO_VALID_MASK; | |
3849 | ||
3850 | if (vcpu->arch.interrupt.soft) { | |
3851 | idt_vectoring |= INTR_TYPE_SOFT_INTR; | |
3852 | vmcs12->vm_entry_instruction_len = | |
3853 | vcpu->arch.event_exit_inst_len; | |
3854 | } else | |
3855 | idt_vectoring |= INTR_TYPE_EXT_INTR; | |
3856 | ||
3857 | vmcs12->idt_vectoring_info_field = idt_vectoring; | |
9bd1f0ef SC |
3858 | } else { |
3859 | vmcs12->idt_vectoring_info_field = 0; | |
55d2375e SC |
3860 | } |
3861 | } | |
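For concreteness, a sketch of the encoding produced by the exception branch above (the flags follow the IDT-vectoring format used in the function; the #PF choice is hypothetical):

	/*
	 * Example: re-injecting a hardware #PF (vector 14) with an error
	 * code yields
	 *   vmcs12->idt_vectoring_info_field = 14 | INTR_TYPE_HARD_EXCEPTION |
	 *                                      VECTORING_INFO_DELIVER_CODE_MASK |
	 *                                      VECTORING_INFO_VALID_MASK;
	 * with the error code itself placed in idt_vectoring_error_code.
	 */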
3862 | ||
3863 | ||
96b100cd | 3864 | void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) |
55d2375e SC |
3865 | { |
3866 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | |
3867 | gfn_t gfn; | |
3868 | ||
3869 | /* | |
3870 | * Don't need to mark the APIC access page dirty; it is never | |
3871 | * written to by the CPU during APIC virtualization. | |
3872 | */ | |
3873 | ||
3874 | if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { | |
3875 | gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; | |
3876 | kvm_vcpu_mark_page_dirty(vcpu, gfn); | |
3877 | } | |
3878 | ||
3879 | if (nested_cpu_has_posted_intr(vmcs12)) { | |
3880 | gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; | |
3881 | kvm_vcpu_mark_page_dirty(vcpu, gfn); | |
3882 | } | |
3883 | } | |
3884 | ||
650293c3 | 3885 | static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) |
55d2375e SC |
3886 | { |
3887 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
3888 | int max_irr; | |
3889 | void *vapic_page; | |
3890 | u16 status; | |
3891 | ||
966eefb8 | 3892 | if (!vmx->nested.pi_pending) |
650293c3 | 3893 | return 0; |
55d2375e | 3894 | |
966eefb8 JM |
3895 | if (!vmx->nested.pi_desc) |
3896 | goto mmio_needed; | |
3897 | ||
55d2375e | 3898 | vmx->nested.pi_pending = false; |
966eefb8 | 3899 | |
55d2375e | 3900 | if (!pi_test_and_clear_on(vmx->nested.pi_desc)) |
650293c3 | 3901 | return 0; |
55d2375e | 3902 | |
d83c36d8 SC |
3903 | max_irr = pi_find_highest_vector(vmx->nested.pi_desc); |
3904 | if (max_irr > 0) { | |
96c66e87 KA |
3905 | vapic_page = vmx->nested.virtual_apic_map.hva; |
3906 | if (!vapic_page) | |
0fe998b2 | 3907 | goto mmio_needed; |
96c66e87 | 3908 | |
55d2375e SC |
3909 | __kvm_apic_update_irr(vmx->nested.pi_desc->pir, |
3910 | vapic_page, &max_irr); | |
55d2375e SC |
3911 | status = vmcs_read16(GUEST_INTR_STATUS); |
3912 | if ((u8)max_irr > ((u8)status & 0xff)) { | |
3913 | status &= ~0xff; | |
3914 | status |= (u8)max_irr; | |
3915 | vmcs_write16(GUEST_INTR_STATUS, status); | |
3916 | } | |
3917 | } | |
3918 | ||
3919 | nested_mark_vmcs12_pages_dirty(vcpu); | |
650293c3 | 3920 | return 0; |
0fe998b2 JM |
3921 | |
3922 | mmio_needed: | |
3923 | kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); | |
3924 | return -ENXIO; | |
55d2375e SC |
3925 | } |
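A short sketch of the GUEST_INTR_STATUS update above, with hypothetical vector values:

	/*
	 * The field packs {SVI[15:8], RVI[7:0]}.  With status = 0x0030
	 * (RVI 0x30) and max_irr = 0x51, the write produces 0x0051; with
	 * max_irr = 0x21 the field is left untouched, as the already
	 * pending RVI is higher.
	 */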
3926 | ||
7709aba8 | 3927 | static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) |
55d2375e | 3928 | { |
7709aba8 | 3929 | struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; |
d4963e31 | 3930 | u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; |
55d2375e | 3931 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); |
7709aba8 SC |
3932 | unsigned long exit_qual; |
3933 | ||
3934 | if (ex->has_payload) { | |
3935 | exit_qual = ex->payload; | |
3936 | } else if (ex->vector == PF_VECTOR) { | |
3937 | exit_qual = vcpu->arch.cr2; | |
3938 | } else if (ex->vector == DB_VECTOR) { | |
3939 | exit_qual = vcpu->arch.dr6; | |
3940 | exit_qual &= ~DR6_BT; | |
3941 | exit_qual ^= DR6_ACTIVE_LOW; | |
3942 | } else { | |
3943 | exit_qual = 0; | |
3944 | } | |
55d2375e | 3945 | |
80962ec9 SC |
3946 | /* |
3947 | * Unlike AMD's Paged Real Mode, which reports an error code on #PF | |
3948 | * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the | |
3949 | * "has error code" flags on VM-Exit if the CPU is in Real Mode. | |
3950 | */ | |
3951 | if (ex->has_error_code && is_protmode(vcpu)) { | |
eba9799b SC |
3952 | /* |
3953 | * Intel CPUs do not generate error codes with bits 31:16 set, | |
3954 | * and more importantly VMX disallows setting bits 31:16 in the | |
3955 | * injected error code for VM-Entry. Drop the bits to mimic | |
3956 | * hardware and avoid inducing failure on nested VM-Entry if L1 | |
3957 | * chooses to inject the exception back to L2. AMD CPUs _do_ | |
3958 | * generate "full" 32-bit error codes, so KVM allows userspace | |
3959 | * to inject exception error codes with bits 31:16 set. | |
3960 | */ | |
d4963e31 | 3961 | vmcs12->vm_exit_intr_error_code = (u16)ex->error_code; |
55d2375e SC |
3962 | intr_info |= INTR_INFO_DELIVER_CODE_MASK; |
3963 | } | |
3964 | ||
d4963e31 | 3965 | if (kvm_exception_is_soft(ex->vector)) |
55d2375e SC |
3966 | intr_info |= INTR_TYPE_SOFT_EXCEPTION; |
3967 | else | |
3968 | intr_info |= INTR_TYPE_HARD_EXCEPTION; | |
3969 | ||
3970 | if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && | |
3971 | vmx_get_nmi_mask(vcpu)) | |
3972 | intr_info |= INTR_INFO_UNBLOCK_NMI; | |
3973 | ||
3974 | nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); | |
3975 | } | |
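A hedged note on the DR6 handling in the no-payload #DB path above:

	/*
	 * vcpu->arch.dr6 holds the architectural value, in which the
	 * "active low" bits read as 1 when not asserted.  XORing with
	 * DR6_ACTIVE_LOW converts to the positive-logic form expected in
	 * the exit qualification, e.g. a single-step trap is reported
	 * with (essentially) only DR6.BS set.
	 */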
3976 | ||
684c0422 | 3977 | /* |
8d178f46 SC |
3978 | * Returns the #DB payload (non-zero) if a debug trap is (likely) pending delivery. Infer the class | |
3979 | * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6). | |
3980 | * Using the payload is flawed because code breakpoints (fault-like) and data | |
3981 | * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e. | |
3982 | * this will return false positives if a to-be-injected code breakpoint #DB is | |
3983 | * pending (from KVM's perspective, but not "pending" across an instruction | |
3984 | * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it | |
3985 | * too is trap-like. | |
684c0422 | 3986 | * |
8d178f46 SC |
3987 | * KVM "works" despite these flaws as ICEBP isn't currently supported by the |
3988 | * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the | |
3989 | * #DB has already happened), and MTF isn't marked pending on code breakpoints | |
3990 | * from the emulator (because such #DBs are fault-like and thus don't trigger | |
3991 | * actions that fire on instruction retire). | |
684c0422 | 3992 | */ |
2b384165 | 3993 | static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex) |
684c0422 | 3994 | { |
2b384165 | 3995 | if (!ex->pending || ex->vector != DB_VECTOR) |
8d178f46 SC |
3996 | return 0; |
3997 | ||
3998 | /* General Detect #DBs are always fault-like. */ | |
2b384165 SC |
3999 | return ex->payload & ~DR6_BD; |
4000 | } | |
4001 | ||
4002 | /* | |
4003 | * Returns true if there's a pending #DB exception that is lower priority than | |
4004 | * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by | |
4005 | * KVM, but could theoretically be injected by userspace. Note, this code is | |
4006 | * imperfect, see above. | |
684c0422 | 4007 | */ |
2b384165 | 4008 | static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex) |
684c0422 | 4009 | { |
2b384165 | 4010 | return vmx_get_pending_dbg_trap(ex) & ~DR6_BT; |
684c0422 OU |
4011 | } |
4012 | ||
4013 | /* | |
4014 | * Certain VM-exits set the 'pending debug exceptions' field to indicate a | |
4015 | * recognized #DB (data or single-step) that has yet to be delivered. Since KVM | |
4016 | * represents these debug traps with a payload that is said to be compatible | |
4017 | * with the 'pending debug exceptions' field, write the payload to the VMCS | |
4018 | * field if a VM-exit is delivered before the debug trap. | |
4019 | */ | |
4020 | static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) | |
4021 | { | |
2b384165 | 4022 | unsigned long pending_dbg; |
8d178f46 | 4023 | |
2b384165 | 4024 | pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception); |
8d178f46 SC |
4025 | if (pending_dbg) |
4026 | vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg); | |
684c0422 OU |
4027 | } |
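An illustrative walk-through of the payload propagation above (the single-step scenario is hypothetical):

	/*
	 * A pending single-step #DB carries a to-be-DR6 payload with BS
	 * (bit 14) set.  vmx_get_pending_dbg_trap() returns that payload,
	 * and writing it to GUEST_PENDING_DBG_EXCEPTIONS lets the trap
	 * survive a higher-priority VM-exit (e.g. MTF) and be delivered
	 * afterwards.
	 */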
4028 | ||
d2060bd4 SC |
4029 | static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) |
4030 | { | |
4031 | return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && | |
4032 | to_vmx(vcpu)->nested.preemption_timer_expired; | |
4033 | } | |
4034 | ||
32f55e47 | 4035 | static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection) |
5b4ac1a1 | 4036 | { |
27c4fa42 SC |
4037 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
4038 | void *vapic = vmx->nested.virtual_apic_map.hva; | |
4039 | int max_irr, vppr; | |
4040 | ||
4041 | if (nested_vmx_preemption_timer_pending(vcpu) || | |
4042 | vmx->nested.mtf_pending) | |
4043 | return true; | |
4044 | ||
4045 | /* | |
4046 | * Virtual Interrupt Delivery doesn't require manual injection. Either | |
4047 | * the interrupt is already in GUEST_RVI and will be recognized by CPU | |
4048 | * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move | |
4049 | * the interrupt from the PIR to RVI prior to entering the guest. | |
4050 | */ | |
4051 | if (for_injection) | |
4052 | return false; | |
4053 | ||
4054 | if (!nested_cpu_has_vid(get_vmcs12(vcpu)) || | |
4055 | __vmx_interrupt_blocked(vcpu)) | |
4056 | return false; | |
4057 | ||
4058 | if (!vapic) | |
4059 | return false; | |
4060 | ||
4061 | vppr = *((u32 *)(vapic + APIC_PROCPRI)); | |
4062 | ||
321ef62b SC |
4063 | max_irr = vmx_get_rvi(); |
4064 | if ((max_irr & 0xf0) > (vppr & 0xf0)) | |
4065 | return true; | |
4066 | ||
27c4fa42 SC |
4067 | if (vmx->nested.pi_pending && vmx->nested.pi_desc && |
4068 | pi_test_on(vmx->nested.pi_desc)) { | |
4069 | max_irr = pi_find_highest_vector(vmx->nested.pi_desc); | |
4070 | if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0)) | |
4071 | return true; | |
4072 | } | |
4073 | ||
4074 | return false; | |
5b4ac1a1 PB |
4075 | } |
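A worked example of the 0xf0 priority-class comparisons above, with hypothetical vectors:

	/*
	 * Vectors compare by their high nibble (priority class).  With
	 * max_irr = 0x51 (class 5) and vppr = 0x4f (class 4), an event is
	 * pending; max_irr = 0x45 shares the PPR's class and would not be
	 * delivered.
	 */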
4076 | ||
f43f8a3b SC |
4077 | /* |
4078 | * Per the Intel SDM's table "Priority Among Concurrent Events", with minor | |
4079 | * edits to fill in missing examples, e.g. #DB due to split-lock accesses, | |
4080 | * and less minor edits to splice in the priority of VMX Non-Root specific | |
4081 | * events, e.g. MTF and NMI/INTR-window exiting. | |
4082 | * | |
4083 | * 1 Hardware Reset and Machine Checks | |
4084 | * - RESET | |
4085 | * - Machine Check | |
4086 | * | |
4087 | * 2 Trap on Task Switch | |
4088 | * - T flag in TSS is set (on task switch) | |
4089 | * | |
4090 | * 3 External Hardware Interventions | |
4091 | * - FLUSH | |
4092 | * - STOPCLK | |
4093 | * - SMI | |
4094 | * - INIT | |
4095 | * | |
4096 | * 3.5 Monitor Trap Flag (MTF) VM-exit[1] | |
4097 | * | |
4098 | * 4 Traps on Previous Instruction | |
4099 | * - Breakpoints | |
4100 | * - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O | |
4101 | * breakpoint, or #DB due to a split-lock access) | |
4102 | * | |
4103 | * 4.3 VMX-preemption timer expired VM-exit[2] | |
4104 | * | |
4105 | * 4.6 NMI-window exiting VM-exit[3] | |
4106 | * | |
4107 | * 5 Nonmaskable Interrupts (NMI) | |
4108 | * | |
4109 | * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery[4] | |
4110 | * | |
4111 | * 6 Maskable Hardware Interrupts | |
4112 | * | |
4113 | * 7 Code Breakpoint Fault | |
4114 | * | |
4115 | * 8 Faults from Fetching Next Instruction | |
4116 | * - Code-Segment Limit Violation | |
4117 | * - Code Page Fault | |
4118 | * - Control protection exception (missing ENDBRANCH at target of indirect | |
4119 | * call or jump) | |
4120 | * | |
4121 | * 9 Faults from Decoding Next Instruction | |
4122 | * - Instruction length > 15 bytes | |
4123 | * - Invalid Opcode | |
4124 | * - Coprocessor Not Available | |
4125 | * | |
4126 | *10 Faults on Executing Instruction | |
4127 | * - Overflow | |
4128 | * - Bound error | |
4129 | * - Invalid TSS | |
4130 | * - Segment Not Present | |
4131 | * - Stack fault | |
4132 | * - General Protection | |
4133 | * - Data Page Fault | |
4134 | * - Alignment Check | |
4135 | * - x86 FPU Floating-point exception | |
4136 | * - SIMD floating-point exception | |
4137 | * - Virtualization exception | |
4138 | * - Control protection exception | |
4139 | * | |
4140 | * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs), | |
4141 | * INIT signals, and higher priority events take priority over MTF VM exits. | |
4142 | * MTF VM exits take priority over debug-trap exceptions and lower priority | |
4143 | * events. | |
4144 | * | |
4145 | * [2] Debug-trap exceptions and higher priority events take priority over VM exits | |
4146 | * caused by the VMX-preemption timer. VM exits caused by the VMX-preemption | |
4147 | * timer take priority over VM exits caused by the "NMI-window exiting" | |
4148 | * VM-execution control and lower priority events. | |
4149 | * | |
4150 | * [3] Debug-trap exceptions and higher priority events take priority over VM exits | |
4151 | * caused by "NMI-window exiting". VM exits caused by this control take | |
4152 | * priority over non-maskable interrupts (NMIs) and lower priority events. | |
4153 | * | |
4154 | * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to | |
4155 | * the 1-setting of the "interrupt-window exiting" VM-execution control. Thus, | |
4156 | * non-maskable interrupts (NMIs) and higher priority events take priority over | |
4157 | * delivery of a virtual interrupt; delivery of a virtual interrupt takes | |
4158 | * priority over external interrupts and lower priority events. | |
4159 | */ | |
a1c77abb | 4160 | static int vmx_check_nested_events(struct kvm_vcpu *vcpu) |
55d2375e | 4161 | { |
4b9852f4 | 4162 | struct kvm_lapic *apic = vcpu->arch.apic; |
55d2375e | 4163 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
5ef8acbd | 4164 | /* |
72c14e00 SC |
4165 | * Only a pending nested run blocks a pending exception. If there is a |
4166 | * previously injected event, the pending exception occurred while said | |
4167 | * event was being delivered and thus needs to be handled. | |
5ef8acbd | 4168 | */ |
72c14e00 SC |
4169 | bool block_nested_exceptions = vmx->nested.nested_run_pending; |
4170 | /* | |
4171 | * New events (not exceptions) are only recognized at instruction | |
4172 | * boundaries. If an event needs reinjection, then KVM is handling a | |
4173 | * VM-Exit that occurred _during_ instruction execution; new events are | |
4174 | * blocked until the instruction completes. | |
4175 | */ | |
4176 | bool block_nested_events = block_nested_exceptions || | |
4177 | kvm_event_needs_reinjection(vcpu); | |
5ef8acbd | 4178 | |
4b9852f4 LA |
4179 | if (lapic_in_kernel(vcpu) && |
4180 | test_bit(KVM_APIC_INIT, &apic->pending_events)) { | |
4181 | if (block_nested_events) | |
4182 | return -EBUSY; | |
684c0422 | 4183 | nested_vmx_update_pending_dbg(vcpu); |
e64a8508 | 4184 | clear_bit(KVM_APIC_INIT, &apic->pending_events); |
bf0cd88c YQ |
4185 | if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) |
4186 | nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); | |
593a5c2e SC |
4187 | |
4188 | /* MTF is discarded if the vCPU is in WFS. */ | |
4189 | vmx->nested.mtf_pending = false; | |
bf0cd88c YQ |
4190 | return 0; |
4191 | } | |
4192 | ||
4193 | if (lapic_in_kernel(vcpu) && | |
4194 | test_bit(KVM_APIC_SIPI, &apic->pending_events)) { | |
4195 | if (block_nested_events) | |
4196 | return -EBUSY; | |
4197 | ||
4198 | clear_bit(KVM_APIC_SIPI, &apic->pending_events); | |
c2086eca | 4199 | if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { |
bf0cd88c YQ |
4200 | nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, |
4201 | apic->sipi_vector & 0xFFUL); | |
c2086eca SC |
4202 | return 0; |
4203 | } | |
4204 | /* Fallthrough, the SIPI is completely ignored. */ | |
4b9852f4 | 4205 | } |
55d2375e | 4206 | |
5ef8acbd | 4207 | /* |
b9d44f90 SC |
4208 | * Process exceptions that are higher priority than Monitor Trap Flag: |
4209 | * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but | |
4210 | * could theoretically come in from userspace), and ICEBP (INT1). | |
4020da3b | 4211 | * |
593a5c2e SC |
4212 | * TODO: SMIs have higher priority than MTF and trap-like #DBs (except |
4213 | * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF | |
4214 | * across SMI/RSM as it should; that needs to be addressed in order to | |
4215 | * prioritize SMI over MTF and trap-like #DBs. | |
5ef8acbd | 4216 | */ |
7709aba8 SC |
4217 | if (vcpu->arch.exception_vmexit.pending && |
4218 | !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) { | |
4219 | if (block_nested_exceptions) | |
5ef8acbd | 4220 | return -EBUSY; |
7709aba8 SC |
4221 | |
4222 | nested_vmx_inject_exception_vmexit(vcpu); | |
5ef8acbd OU |
4223 | return 0; |
4224 | } | |
4225 | ||
b9d44f90 | 4226 | if (vcpu->arch.exception.pending && |
2b384165 | 4227 | !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) { |
72c14e00 | 4228 | if (block_nested_exceptions) |
5ef8acbd | 4229 | return -EBUSY; |
7709aba8 | 4230 | goto no_vmexit; |
5ef8acbd OU |
4231 | } |
4232 | ||
593a5c2e | 4233 | if (vmx->nested.mtf_pending) { |
5ef8acbd OU |
4234 | if (block_nested_events) |
4235 | return -EBUSY; | |
4236 | nested_vmx_update_pending_dbg(vcpu); | |
4237 | nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); | |
4238 | return 0; | |
4239 | } | |
4240 | ||
7709aba8 | 4241 | if (vcpu->arch.exception_vmexit.pending) { |
72c14e00 | 4242 | if (block_nested_exceptions) |
55d2375e | 4243 | return -EBUSY; |
7709aba8 SC |
4244 | |
4245 | nested_vmx_inject_exception_vmexit(vcpu); | |
55d2375e SC |
4246 | return 0; |
4247 | } | |
4248 | ||
7709aba8 SC |
4249 | if (vcpu->arch.exception.pending) { |
4250 | if (block_nested_exceptions) | |
4251 | return -EBUSY; | |
4252 | goto no_vmexit; | |
4253 | } | |
4254 | ||
d2060bd4 | 4255 | if (nested_vmx_preemption_timer_pending(vcpu)) { |
55d2375e SC |
4256 | if (block_nested_events) |
4257 | return -EBUSY; | |
4258 | nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); | |
4259 | return 0; | |
4260 | } | |
4261 | ||
1cd2f0b0 SC |
4262 | if (vcpu->arch.smi_pending && !is_smm(vcpu)) { |
4263 | if (block_nested_events) | |
4264 | return -EBUSY; | |
4265 | goto no_vmexit; | |
4266 | } | |
4267 | ||
15ff0b45 | 4268 | if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { |
55d2375e SC |
4269 | if (block_nested_events) |
4270 | return -EBUSY; | |
15ff0b45 SC |
4271 | if (!nested_exit_on_nmi(vcpu)) |
4272 | goto no_vmexit; | |
4273 | ||
55d2375e SC |
4274 | nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, |
4275 | NMI_VECTOR | INTR_TYPE_NMI_INTR | | |
4276 | INTR_INFO_VALID_MASK, 0); | |
4277 | /* | |
4278 | * The NMI-triggered VM exit counts as injection: | |
4279 | * clear this one and block further NMIs. | |
4280 | */ | |
4281 | vcpu->arch.nmi_pending = 0; | |
4282 | vmx_set_nmi_mask(vcpu, true); | |
4283 | return 0; | |
4284 | } | |
4285 | ||
15ff0b45 | 4286 | if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { |
55d2375e SC |
4287 | if (block_nested_events) |
4288 | return -EBUSY; | |
15ff0b45 SC |
4289 | if (!nested_exit_on_intr(vcpu)) |
4290 | goto no_vmexit; | |
55d2375e SC |
4291 | nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); |
4292 | return 0; | |
4293 | } | |
4294 | ||
6ce347af | 4295 | no_vmexit: |
650293c3 | 4296 | return vmx_complete_nested_posted_interrupt(vcpu); |
55d2375e SC |
4297 | } |
4298 | ||
4299 | static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) | |
4300 | { | |
4301 | ktime_t remaining = | |
4302 | hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); | |
4303 | u64 value; | |
4304 | ||
4305 | if (ktime_to_ns(remaining) <= 0) | |
4306 | return 0; | |
4307 | ||
4308 | value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; | |
4309 | do_div(value, 1000000); | |
4310 | return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; | |
4311 | } | |
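A worked example of the conversion above, with hypothetical numbers:

	/*
	 * remaining = 1,000,000 ns and virtual_tsc_khz = 2,000,000 (a
	 * 2 GHz guest TSC) give
	 *   value = 1,000,000 * 2,000,000 / 1,000,000 = 2,000,000 TSC ticks,
	 * and because the emulated preemption timer ticks once per 2^5 TSC
	 * cycles, the function returns 2,000,000 >> 5 = 62,500.
	 */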
4312 | ||
7952d769 | 4313 | static bool is_vmcs12_ext_field(unsigned long field) |
55d2375e | 4314 | { |
7952d769 SC |
4315 | switch (field) { |
4316 | case GUEST_ES_SELECTOR: | |
4317 | case GUEST_CS_SELECTOR: | |
4318 | case GUEST_SS_SELECTOR: | |
4319 | case GUEST_DS_SELECTOR: | |
4320 | case GUEST_FS_SELECTOR: | |
4321 | case GUEST_GS_SELECTOR: | |
4322 | case GUEST_LDTR_SELECTOR: | |
4323 | case GUEST_TR_SELECTOR: | |
4324 | case GUEST_ES_LIMIT: | |
4325 | case GUEST_CS_LIMIT: | |
4326 | case GUEST_SS_LIMIT: | |
4327 | case GUEST_DS_LIMIT: | |
4328 | case GUEST_FS_LIMIT: | |
4329 | case GUEST_GS_LIMIT: | |
4330 | case GUEST_LDTR_LIMIT: | |
4331 | case GUEST_TR_LIMIT: | |
4332 | case GUEST_GDTR_LIMIT: | |
4333 | case GUEST_IDTR_LIMIT: | |
4334 | case GUEST_ES_AR_BYTES: | |
4335 | case GUEST_DS_AR_BYTES: | |
4336 | case GUEST_FS_AR_BYTES: | |
4337 | case GUEST_GS_AR_BYTES: | |
4338 | case GUEST_LDTR_AR_BYTES: | |
4339 | case GUEST_TR_AR_BYTES: | |
4340 | case GUEST_ES_BASE: | |
4341 | case GUEST_CS_BASE: | |
4342 | case GUEST_SS_BASE: | |
4343 | case GUEST_DS_BASE: | |
4344 | case GUEST_FS_BASE: | |
4345 | case GUEST_GS_BASE: | |
4346 | case GUEST_LDTR_BASE: | |
4347 | case GUEST_TR_BASE: | |
4348 | case GUEST_GDTR_BASE: | |
4349 | case GUEST_IDTR_BASE: | |
4350 | case GUEST_PENDING_DBG_EXCEPTIONS: | |
4351 | case GUEST_BNDCFGS: | |
4352 | return true; | |
4353 | default: | |
4354 | break; | |
4355 | } | |
55d2375e | 4356 | |
7952d769 SC |
4357 | return false; |
4358 | } | |
4359 | ||
4360 | static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, | |
4361 | struct vmcs12 *vmcs12) | |
4362 | { | |
4363 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
55d2375e SC |
4364 | |
4365 | vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); | |
4366 | vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); | |
4367 | vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); | |
4368 | vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); | |
4369 | vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); | |
4370 | vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); | |
4371 | vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); | |
4372 | vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); | |
4373 | vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); | |
4374 | vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); | |
4375 | vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); | |
4376 | vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); | |
4377 | vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); | |
4378 | vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); | |
4379 | vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); | |
4380 | vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); | |
4381 | vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); | |
4382 | vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); | |
4383 | vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); | |
55d2375e SC |
4384 | vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); |
4385 | vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); | |
4386 | vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); | |
4387 | vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); | |
4388 | vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); | |
4389 | vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); | |
4390 | vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); | |
4391 | vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); | |
4392 | vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); | |
4393 | vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); | |
4394 | vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); | |
4395 | vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); | |
4396 | vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); | |
4397 | vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); | |
4398 | vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); | |
7952d769 SC |
4399 | vmcs12->guest_pending_dbg_exceptions = |
4400 | vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); | |
7952d769 SC |
4401 | |
4402 | vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; | |
4403 | } | |
4404 | ||
4405 | static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, | |
4406 | struct vmcs12 *vmcs12) | |
4407 | { | |
4408 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
4409 | int cpu; | |
4410 | ||
4411 | if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) | |
4412 | return; | |
4413 | ||
4414 | ||
4415 | WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); | |
4416 | ||
4417 | cpu = get_cpu(); | |
4418 | vmx->loaded_vmcs = &vmx->nested.vmcs02; | |
1af1bb05 | 4419 | vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); |
7952d769 SC |
4420 | |
4421 | sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); | |
4422 | ||
4423 | vmx->loaded_vmcs = &vmx->vmcs01; | |
1af1bb05 | 4424 | vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); |
7952d769 SC |
4425 | put_cpu(); |
4426 | } | |
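A brief descriptive note on the VMCS juggling above:

	/*
	 * vmcs02 is made the current VMCS just long enough to VMREAD the
	 * rare fields, then vmcs01 is reloaded.  get_cpu() disables
	 * preemption so the per-CPU current-VMCS pointer cannot be
	 * switched out from under the temporary vmcs02 load.
	 */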
4427 | ||
4428 | /* | |
4429 | * Update the guest state fields of vmcs12 to reflect changes that | |
4430 | * occurred while L2 was running. (The "IA-32e mode guest" bit of the | |
4431 | * VM-entry controls is also updated, since this is really a guest | |
4432 | * state bit.) | |
4433 | */ | |
4434 | static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | |
4435 | { | |
4436 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
4437 | ||
453e42b0 | 4438 | if (nested_vmx_is_evmptr12_valid(vmx)) |
7952d769 SC |
4439 | sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); |
4440 | ||
1e9dfbd7 | 4441 | vmx->nested.need_sync_vmcs02_to_vmcs12_rare = |
453e42b0 | 4442 | !nested_vmx_is_evmptr12_valid(vmx); |
7952d769 SC |
4443 | |
4444 | vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); | |
4445 | vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); | |
4446 | ||
4447 | vmcs12->guest_rsp = kvm_rsp_read(vcpu); | |
4448 | vmcs12->guest_rip = kvm_rip_read(vcpu); | |
4449 | vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); | |
4450 | ||
4451 | vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); | |
4452 | vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); | |
55d2375e SC |
4453 | |
4454 | vmcs12->guest_interruptibility_info = | |
4455 | vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); | |
7952d769 | 4456 | |
55d2375e SC |
4457 | if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) |
4458 | vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; | |
bf0cd88c YQ |
4459 | else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) |
4460 | vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; | |
55d2375e SC |
4461 | else |
4462 | vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; | |
4463 | ||
b4b65b56 | 4464 | if (nested_cpu_has_preemption_timer(vmcs12) && |
850448f3 PS |
4465 | vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && |
4466 | !vmx->nested.nested_run_pending) | |
4467 | vmcs12->vmx_preemption_timer_value = | |
4468 | vmx_get_preemption_timer_value(vcpu); | |
55d2375e SC |
4469 | |
4470 | /* | |
4471 | * In some cases (usually, nested EPT), L2 is allowed to change its | |
4472 | * own CR3 without exiting. If it has changed it, we must keep it. | |
4473 | * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined | |
4474 | * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. | |
4475 | * | |
4476 | * Additionally, restore L2's PDPTR to vmcs12. | |
4477 | */ | |
4478 | if (enable_ept) { | |
4479 | vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); | |
c7554efc SC |
4480 | if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { |
4481 | vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); | |
4482 | vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); | |
4483 | vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); | |
4484 | vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); | |
4485 | } | |
55d2375e SC |
4486 | } |
4487 | ||
4488 | vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); | |
4489 | ||
4490 | if (nested_cpu_has_vid(vmcs12)) | |
4491 | vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); | |
4492 | ||
4493 | vmcs12->vm_entry_controls = | |
4494 | (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | | |
4495 | (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); | |
4496 | ||
699a1ac2 | 4497 | if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) |
2a5f091c | 4498 | vmcs12->guest_dr7 = vcpu->arch.dr7; |
55d2375e | 4499 | |
55d2375e SC |
4500 | if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) |
4501 | vmcs12->guest_ia32_efer = vcpu->arch.efer; | |
55d2375e SC |
4502 | } |
4503 | ||
4504 | /* | |
4505 | * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits | |
4506 | * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), | |
4507 | * and this function updates it to reflect the changes to the guest state while | |
4508 | * L2 was running (and perhaps made some exits which were handled directly by L0 | |
4509 | * without going back to L1), and to reflect the exit reason. | |
4510 | * Note that we do not have to copy all VMCS fields here, just those that | |
4511 | * could have been changed by the L2 guest or the exit - i.e., the guest-state and | |
4512 | * exit-information fields only. Other fields are modified by L1 with VMWRITE, | |
4513 | * which already writes to vmcs12 directly. | |
4514 | */ | |
4515 | static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, | |
4dcefa31 | 4516 | u32 vm_exit_reason, u32 exit_intr_info, |
55d2375e SC |
4517 | unsigned long exit_qualification) |
4518 | { | |
55d2375e | 4519 | /* update exit information fields: */ |
4dcefa31 | 4520 | vmcs12->vm_exit_reason = vm_exit_reason; |
3c0c2ad1 SC |
4521 | if (to_vmx(vcpu)->exit_reason.enclave_mode) |
4522 | vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; | |
55d2375e | 4523 | vmcs12->exit_qualification = exit_qualification; |
55d2375e | 4524 | |
c3634d25 SC |
4525 | /* |
4526 | * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched | |
4527 | * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other | |
4528 | * exit info fields are unmodified. | |
4529 | */ | |
55d2375e SC |
4530 | if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { |
4531 | vmcs12->launch_state = 1; | |
4532 | ||
4533 | /* vm_entry_intr_info_field is cleared on exit. Emulate this | |
4534 | * instead of reading the real value. */ | |
4535 | vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; | |
4536 | ||
4537 | /* | |
4538 | * Transfer the event that L0 or L1 may have wanted to inject into | |
4539 | * L2 to IDT_VECTORING_INFO_FIELD. | |
4540 | */ | |
9bd1f0ef SC |
4541 | vmcs12_save_pending_event(vcpu, vmcs12, |
4542 | vm_exit_reason, exit_intr_info); | |
a0d4f803 | 4543 | |
c3634d25 SC |
4544 | vmcs12->vm_exit_intr_info = exit_intr_info; |
4545 | vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | |
4546 | vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | |
4547 | ||
a0d4f803 KS |
4548 | /* |
4549 | * According to spec, there's no need to store the guest's | |
4550 | * MSRs if the exit is due to a VM-entry failure that occurs | |
4551 | * during or after loading the guest state. Since this exit | |
4552 | * does not fall in that category, we need to save the MSRs. | |
4553 | */ | |
4554 | if (nested_vmx_store_msr(vcpu, | |
4555 | vmcs12->vm_exit_msr_store_addr, | |
4556 | vmcs12->vm_exit_msr_store_count)) | |
4557 | nested_vmx_abort(vcpu, | |
4558 | VMX_ABORT_SAVE_GUEST_MSR_FAIL); | |
55d2375e | 4559 | } |
55d2375e SC |
4560 | } |
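An example of the failed-entry case handled above (the particular exit reason is hypothetical):

	/*
	 * A VM-entry that fails its guest-state checks is reported as,
	 * e.g., vm_exit_reason = EXIT_REASON_INVALID_STATE with bit 31
	 * (VMX_EXIT_REASONS_FAILED_VMENTRY) set, so launch_state stays 0
	 * and no event or MSR state is saved into vmcs12.
	 */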
4561 | ||
4562 | /* | |
4563 | * A part of what we need to do when the nested L2 guest exits and we want to | |
4564 | * run its L1 parent is to reset L1's guest state to the host state specified | |
4565 | * in vmcs12. | |
4566 | * This function is to be called not only on normal nested exit, but also on | |
4567 | * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry | |
4568 | * Failures During or After Loading Guest State"). | |
4569 | * This function should be called when the active VMCS is L1's (vmcs01). | |
4570 | */ | |
4571 | static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, | |
4572 | struct vmcs12 *vmcs12) | |
4573 | { | |
68cda40d | 4574 | enum vm_entry_failure_code ignored; |
55d2375e | 4575 | struct kvm_segment seg; |
55d2375e SC |
4576 | |
4577 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) | |
4578 | vcpu->arch.efer = vmcs12->host_ia32_efer; | |
4579 | else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) | |
4580 | vcpu->arch.efer |= (EFER_LMA | EFER_LME); | |
4581 | else | |
4582 | vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); | |
4583 | vmx_set_efer(vcpu, vcpu->arch.efer); | |
4584 | ||
e9c16c78 PB |
4585 | kvm_rsp_write(vcpu, vmcs12->host_rsp); |
4586 | kvm_rip_write(vcpu, vmcs12->host_rip); | |
55d2375e SC |
4587 | vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); |
4588 | vmx_set_interrupt_shadow(vcpu, 0); | |
4589 | ||
4590 | /* | |
4591 | * Note that calling vmx_set_cr0 is important, even if cr0 hasn't | |
4592 | * actually changed, because vmx_set_cr0 refers to efer set above. | |
4593 | * | |
4594 | * CR0_GUEST_HOST_MASK is already set in the original vmcs01 | |
4595 | * (KVM doesn't change it). | |
4596 | */ | |
fb509f76 | 4597 | vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); |
55d2375e SC |
4598 | vmx_set_cr0(vcpu, vmcs12->host_cr0); |
4599 | ||
4600 | /* Same as above - no reason to call set_cr4_guest_host_mask(). */ | |
4601 | vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); | |
4602 | vmx_set_cr4(vcpu, vmcs12->host_cr4); | |
4603 | ||
4604 | nested_ept_uninit_mmu_context(vcpu); | |
4605 | ||
4606 | /* | |
4607 | * Only PDPTE load can fail as the value of cr3 was checked on entry and | |
4608 | * couldn't have changed. | |
4609 | */ | |
0f857223 | 4610 | if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) |
55d2375e SC |
4611 | nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); |
4612 | ||
50b265a4 | 4613 | nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); |
55d2375e SC |
4614 | |
4615 | vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); | |
4616 | vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); | |
4617 | vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); | |
4618 | vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); | |
4619 | vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); | |
4620 | vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); | |
4621 | vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); | |
4622 | ||
4623 | /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ | |
4624 | if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) | |
4625 | vmcs_write64(GUEST_BNDCFGS, 0); | |
4626 | ||
4627 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { | |
4628 | vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); | |
4629 | vcpu->arch.pat = vmcs12->host_ia32_pat; | |
4630 | } | |
4496a6f9 | 4631 | if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && |
c85cdc1c | 4632 | kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) |
d1968421 OU |
4633 | WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, |
4634 | vmcs12->host_ia32_perf_global_ctrl)); | |
55d2375e SC |
4635 | |
4636 | /* Set L1 segment info according to Intel SDM | |
4637 | 27.5.2 Loading Host Segment and Descriptor-Table Registers */ | |
4638 | seg = (struct kvm_segment) { | |
4639 | .base = 0, | |
4640 | .limit = 0xFFFFFFFF, | |
4641 | .selector = vmcs12->host_cs_selector, | |
4642 | .type = 11, | |
4643 | .present = 1, | |
4644 | .s = 1, | |
4645 | .g = 1 | |
4646 | }; | |
4647 | if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) | |
4648 | seg.l = 1; | |
4649 | else | |
4650 | seg.db = 1; | |
816be9e9 | 4651 | __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); |
55d2375e SC |
4652 | seg = (struct kvm_segment) { |
4653 | .base = 0, | |
4654 | .limit = 0xFFFFFFFF, | |
4655 | .type = 3, | |
4656 | .present = 1, | |
4657 | .s = 1, | |
4658 | .db = 1, | |
4659 | .g = 1 | |
4660 | }; | |
4661 | seg.selector = vmcs12->host_ds_selector; | |
816be9e9 | 4662 | __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); |
55d2375e | 4663 | seg.selector = vmcs12->host_es_selector; |
816be9e9 | 4664 | __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); |
55d2375e | 4665 | seg.selector = vmcs12->host_ss_selector; |
816be9e9 | 4666 | __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); |
55d2375e SC |
4667 | seg.selector = vmcs12->host_fs_selector; |
4668 | seg.base = vmcs12->host_fs_base; | |
816be9e9 | 4669 | __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); |
55d2375e SC |
4670 | seg.selector = vmcs12->host_gs_selector; |
4671 | seg.base = vmcs12->host_gs_base; | |
816be9e9 | 4672 | __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); |
55d2375e SC |
4673 | seg = (struct kvm_segment) { |
4674 | .base = vmcs12->host_tr_base, | |
4675 | .limit = 0x67, | |
4676 | .selector = vmcs12->host_tr_selector, | |
4677 | .type = 11, | |
4678 | .present = 1 | |
4679 | }; | |
816be9e9 | 4680 | __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); |
55d2375e | 4681 | |
afc8de01 SC |
4682 | memset(&seg, 0, sizeof(seg)); |
4683 | seg.unusable = 1; | |
816be9e9 | 4684 | __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); |
55d2375e SC |
4685 | |
4686 | kvm_set_dr(vcpu, 7, 0x400); | |
4687 | vmcs_write64(GUEST_IA32_DEBUGCTL, 0); | |
4688 | ||
55d2375e SC |
4689 | if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, |
4690 | vmcs12->vm_exit_msr_load_count)) | |
4691 | nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); | |
dbab610a ML |
4692 | |
4693 | to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); | |
55d2375e SC |
4694 | } |
4695 | ||
4696 | static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) | |
4697 | { | |
eb3db1b1 | 4698 | struct vmx_uret_msr *efer_msr; |
55d2375e SC |
4699 | unsigned int i; |
4700 | ||
4701 | if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) | |
4702 | return vmcs_read64(GUEST_IA32_EFER); | |
4703 | ||
4704 | if (cpu_has_load_ia32_efer()) | |
7974c064 | 4705 | return kvm_host.efer; |
55d2375e SC |
4706 | |
4707 | for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { | |
4708 | if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) | |
4709 | return vmx->msr_autoload.guest.val[i].value; | |
4710 | } | |
4711 | ||
d85a8034 | 4712 | efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); |
55d2375e SC |
4713 | if (efer_msr) |
4714 | return efer_msr->data; | |
4715 | ||
7974c064 | 4716 | return kvm_host.efer; |
55d2375e SC |
4717 | } |
4718 | ||
4719 | static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) | |
4720 | { | |
4721 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | |
4722 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
4723 | struct vmx_msr_entry g, h; | |
55d2375e SC |
4724 | gpa_t gpa; |
4725 | u32 i, j; | |
4726 | ||
4727 | vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); | |
4728 | ||
4729 | if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { | |
4730 | /* | |
4731 | * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set | |
4732 | * as vmcs01.GUEST_DR7 contains a userspace defined value | |
4733 | * and vcpu->arch.dr7 is not squirreled away before the | |
4734 | * nested VMENTER (not worth adding a variable in nested_vmx). | |
4735 | */ | |
4736 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) | |
4737 | kvm_set_dr(vcpu, 7, DR7_FIXED_1); | |
4738 | else | |
4739 | WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); | |
4740 | } | |
4741 | ||
4742 | /* | |
4743 | * Note that calling vmx_set_{efer,cr0,cr4} is important as they | |
4744 | * handle a variety of side effects to KVM's software model. | |
4745 | */ | |
4746 | vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); | |
4747 | ||
fb509f76 | 4748 | vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); |
55d2375e SC |
4749 | vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); |
4750 | ||
4751 | vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); | |
4752 | vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); | |
4753 | ||
4754 | nested_ept_uninit_mmu_context(vcpu); | |
f087a029 | 4755 | vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); |
cb3c1e2f | 4756 | kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); |
55d2375e SC |
4757 | |
4758 | /* | |
4759 | * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs | |
4760 | * from vmcs01 (if necessary). The PDPTRs are not loaded on | |
4761 | * VMFail; like everything else, we just need to ensure our | |
4762 | * software model is up-to-date. | |
4763 | */ | |
9932b49e | 4764 | if (enable_ept && is_pae_paging(vcpu)) |
f087a029 | 4765 | ept_save_pdptrs(vcpu); |
55d2375e SC |
4766 | |
4767 | kvm_mmu_reset_context(vcpu); | |
4768 | ||
55d2375e SC |
4769 | /* |
4770 | * This nasty bit of open coding is a compromise between blindly | |
4771 | * loading L1's MSRs using the exit load lists (incorrect emulation | |
4772 | * of VMFail), leaving the nested VM's MSRs in the software model | |
4773 | * (incorrect behavior) and snapshotting the modified MSRs (too | |
4774 | * expensive since the lists are unbounded by hardware). For each | |
4775 | * MSR that was (prematurely) loaded from the nested VMEntry load | |
4776 | * list, reload it from the exit load list if it exists and differs | |
4777 | * from the guest value. The intent is to stuff host state as | |
4778 | * silently as possible, not to fully process the exit load list. | |
4779 | */ | |
55d2375e SC |
4780 | for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { |
4781 | gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); | |
4782 | if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { | |
4783 | pr_debug_ratelimited( | |
4784 | "%s read MSR index failed (%u, 0x%08llx)\n", | |
4785 | __func__, i, gpa); | |
4786 | goto vmabort; | |
4787 | } | |
4788 | ||
4789 | for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { | |
4790 | gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); | |
4791 | if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { | |
4792 | pr_debug_ratelimited( | |
4793 | "%s read MSR failed (%u, 0x%08llx)\n", | |
4794 | __func__, j, gpa); | |
4795 | goto vmabort; | |
4796 | } | |
4797 | if (h.index != g.index) | |
4798 | continue; | |
4799 | if (h.value == g.value) | |
4800 | break; | |
4801 | ||
4802 | if (nested_vmx_load_msr_check(vcpu, &h)) { | |
4803 | pr_debug_ratelimited( | |
4804 | "%s check failed (%u, 0x%x, 0x%x)\n", | |
4805 | __func__, j, h.index, h.reserved); | |
4806 | goto vmabort; | |
4807 | } | |
4808 | ||
f20935d8 | 4809 | if (kvm_set_msr(vcpu, h.index, h.value)) { |
55d2375e SC |
4810 | pr_debug_ratelimited( |
4811 | "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", | |
4812 | __func__, j, h.index, h.value); | |
4813 | goto vmabort; | |
4814 | } | |
4815 | } | |
4816 | } | |
4817 | ||
4818 | return; | |
4819 | ||
4820 | vmabort: | |
4821 | nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); | |
4822 | } | |
4823 | ||
4824 | /* | |
4825 | * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 | |
4826 | * and modify vmcs12 to make it see what it would expect to see there if | |
4827 | * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) | |
4828 | */ | |
4dcefa31 | 4829 | void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, |
55d2375e SC |
4830 | u32 exit_intr_info, unsigned long exit_qualification) |
4831 | { | |
4832 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
4833 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | |
4834 | ||
593a5c2e SC |
4835 | /* Pending MTF traps are discarded on VM-Exit. */ |
4836 | vmx->nested.mtf_pending = false; | |
4837 | ||
55d2375e SC |
4838 | /* trying to cancel vmlaunch/vmresume is a bug */ |
4839 | WARN_ON_ONCE(vmx->nested.nested_run_pending); | |
4840 | ||
b4f69df0 | 4841 | #ifdef CONFIG_KVM_HYPERV |
f5c7e842 VK |
4842 | if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { |
4843 | /* | |
4844 | * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map | |
4845 | * Enlightened VMCS after migration and we still need to | |
4846 | * do that when something is forcing L2->L1 exit prior to | |
4847 | * the first L2 run. | |
4848 | */ | |
4849 | (void)nested_get_evmcs_page(vcpu); | |
4850 | } | |
b4f69df0 | 4851 | #endif |
f2c7ef3b | 4852 | |
40e5f908 SC |
4853 | /* Service pending TLB flush requests for L2 before switching to L1. */ |
4854 | kvm_service_local_tlb_flush_requests(vcpu); | |
eeeb4f67 | 4855 | |
43fea4e4 PS |
4856 | /* |
4857 | * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between | |
4858 | * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are | |
4859 | * up-to-date before switching to L1. | |
4860 | */ | |
4861 | if (enable_ept && is_pae_paging(vcpu)) | |
4862 | vmx_ept_load_pdptrs(vcpu); | |
4863 | ||
55d2375e SC |
4864 | leave_guest_mode(vcpu); |
4865 | ||
b4b65b56 PB |
4866 | if (nested_cpu_has_preemption_timer(vmcs12)) |
4867 | hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); | |
4868 | ||
d041b5ea IS |
4869 | if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { |
4870 | vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; | |
4871 | if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) | |
4872 | vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; | |
4873 | } | |
55d2375e SC |
4874 | |
4875 | if (likely(!vmx->fail)) { | |
3731905e | 4876 | sync_vmcs02_to_vmcs12(vcpu, vmcs12); |
f4f8316d | 4877 | |
4dcefa31 SC |
4878 | if (vm_exit_reason != -1) |
4879 | prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, | |
4880 | exit_intr_info, exit_qualification); | |
55d2375e SC |
4881 | |
4882 | /* | |
3731905e | 4883 | * Must happen outside of sync_vmcs02_to_vmcs12() as it will |
55d2375e SC |
4884 | * also be used to capture vmcs12 cache as part of |
4885 | * capturing nVMX state for snapshot (migration). | |
4886 | * | |
4887 | * Otherwise, this flush will dirty guest memory at a | |
4888 | * point it is already assumed by user-space to be | |
4889 | * immutable. | |
4890 | */ | |
4891 | nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); | |
55d2375e SC |
4892 | } else { |
4893 | /* | |
4894 | * The only expected VM-instruction error is "VM entry with | |
4895 | * invalid control field(s)." Anything else indicates a | |
4896 | * problem with L0. And we should never get here with a | |
4897 | * VMFail of any type if early consistency checks are enabled. | |
4898 | */ | |
4899 | WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != | |
4900 | VMXERR_ENTRY_INVALID_CONTROL_FIELD); | |
4901 | WARN_ON_ONCE(nested_early_check); | |
4902 | } | |
4903 | ||
d9535404 SC |
4904 | /* |
4905 | * Drop events/exceptions that were queued for re-injection to L2 | |
4906 | * (picked up via vmx_complete_interrupts()), as well as exceptions | |
4907 | * that were pending for L2. Note, this must NOT be hoisted above | |
4908 | * prepare_vmcs12(), as events/exceptions queued for re-injection need to | |
4909 | * be captured in vmcs12 (see vmcs12_save_pending_event()). | |
4910 | */ | |
4911 | vcpu->arch.nmi_injected = false; | |
4912 | kvm_clear_exception_queue(vcpu); | |
4913 | kvm_clear_interrupt_queue(vcpu); | |
4914 | ||
55d2375e SC |
4915 | vmx_switch_vmcs(vcpu, &vmx->vmcs01); |
4916 | ||
2e7eab81 JM |
4917 | /* |
4918 | * If IBRS is advertised to the vCPU, KVM must flush the indirect | |
4919 | * branch predictors when transitioning from L2 to L1, as L1 expects | |
4920 | * hardware (KVM in this case) to provide separate predictor modes. | |
4921 | * Bare metal isolates VMX root (host) from VMX non-root (guest), but | |
4922 | * doesn't isolate different VMCSs, i.e. in this case, doesn't provide | |
4923 | * separate modes for L2 vs L1. | |
4924 | */ | |
4925 | if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) | |
4926 | indirect_branch_prediction_barrier(); | |
4927 | ||
55d2375e SC |
4928 | /* Update any VMCS fields that might have changed while L2 ran */ |
4929 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); | |
4930 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); | |
4931 | vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); | |
938c8745 | 4932 | if (kvm_caps.has_tsc_control) |
1ab9287a IS |
4933 | vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); |
4934 | ||
02d496cf LA |
4935 | if (vmx->nested.l1_tpr_threshold != -1) |
4936 | vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); | |
55d2375e | 4937 | |
55d2375e SC |
4938 | if (vmx->nested.change_vmcs01_virtual_apic_mode) { |
4939 | vmx->nested.change_vmcs01_virtual_apic_mode = false; | |
4940 | vmx_set_virtual_apic_mode(vcpu); | |
55d2375e SC |
4941 | } |
4942 | ||
a85863c2 MS |
4943 | if (vmx->nested.update_vmcs01_cpu_dirty_logging) { |
4944 | vmx->nested.update_vmcs01_cpu_dirty_logging = false; | |
4945 | vmx_update_cpu_dirty_logging(vcpu); | |
4946 | } | |
4947 | ||
55d2375e | 4948 | /* Unpin physical memory we referred to in vmcs02 */ |
fe1911aa | 4949 | kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); |
96c66e87 | 4950 | kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); |
3278e049 KA |
4951 | kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); |
4952 | vmx->nested.pi_desc = NULL; | |
55d2375e | 4953 | |
1196cb97 SC |
4954 | if (vmx->nested.reload_vmcs01_apic_access_page) { |
4955 | vmx->nested.reload_vmcs01_apic_access_page = false; | |
4956 | kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); | |
4957 | } | |
55d2375e | 4958 | |
7c69661e SC |
4959 | if (vmx->nested.update_vmcs01_apicv_status) { |
4960 | vmx->nested.update_vmcs01_apicv_status = false; | |
4961 | kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); | |
4962 | } | |
4963 | ||
4dcefa31 | 4964 | if ((vm_exit_reason != -1) && |
453e42b0 | 4965 | (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))) |
3731905e | 4966 | vmx->nested.need_vmcs12_to_shadow_sync = true; |
55d2375e SC |
4967 | |
4968 | /* in case we halted in L2 */ | |
4969 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | |
4970 | ||
4971 | if (likely(!vmx->fail)) { | |
4dcefa31 | 4972 | if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && |
a1c77abb | 4973 | nested_exit_intr_ack_set(vcpu)) { |
55d2375e SC |
4974 | int irq = kvm_cpu_get_interrupt(vcpu); |
4975 | WARN_ON(irq < 0); | |
4976 | vmcs12->vm_exit_intr_info = irq | | |
4977 | INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; | |
4978 | } | |
4979 | ||
4dcefa31 | 4980 | if (vm_exit_reason != -1) |
55d2375e SC |
4981 | trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, |
4982 | vmcs12->exit_qualification, | |
4983 | vmcs12->idt_vectoring_info_field, | |
4984 | vmcs12->vm_exit_intr_info, | |
4985 | vmcs12->vm_exit_intr_error_code, | |
4986 | KVM_ISA_VMX); | |
4987 | ||
4988 | load_vmcs12_host_state(vcpu, vmcs12); | |
4989 | ||
4990 | return; | |
4991 | } | |
4992 | ||
4993 | /* | |
4994 | * After an early L2 VM-entry failure, we're now back | |
4995 | * in L1 which thinks it just finished a VMLAUNCH or | |
4996 | * VMRESUME instruction, so we need to set the failure | |
4997 | * flag and the VM-instruction error field of the VMCS | |
4998 | * accordingly, and skip the emulated instruction. | |
4999 | */ | |
b2656e4d | 5000 | (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); |
55d2375e SC |
5001 | |
5002 | /* | |
5003 | * Restore L1's host state to KVM's software model. We're here | |
5004 | * because a consistency check was caught by hardware, which | |
5005 | * means some amount of guest state has been propagated to KVM's | |
5006 | * model and needs to be unwound to the host's state. | |
5007 | */ | |
5008 | nested_vmx_restore_host_state(vcpu); | |
5009 | ||
5010 | vmx->fail = 0; | |
5011 | } | |
5012 | ||
cb6a32c2 SC |
5013 | static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) |
5014 | { | |
92e7d5c8 | 5015 | kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); |
cb6a32c2 SC |
5016 | nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); |
5017 | } | |
5018 | ||
55d2375e SC |
5019 | /* |
5020 | * Decode the memory-address operand of a vmx instruction, as recorded on an | |
5021 | * exit caused by such an instruction (run by a guest hypervisor). | |
5022 | * On success, returns 0. When the operand is invalid, returns 1 and throws | |
49f933d4 | 5023 | * #UD, #GP, or #SS. |
55d2375e SC |
5024 | */ |
5025 | int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, | |
fdb28619 | 5026 | u32 vmx_instruction_info, bool wr, int len, gva_t *ret) |
55d2375e SC |
5027 | { |
5028 | gva_t off; | |
5029 | bool exn; | |
5030 | struct kvm_segment s; | |
5031 | ||
5032 | /* | |
5033 | * According to Vol. 3B, "Information for VM Exits Due to Instruction | |
5034 | * Execution", on an exit, vmx_instruction_info holds most of the | |
5035 | * addressing components of the operand. Only the displacement part | |
5036 | * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). | |
5037 | * For how an actual address is calculated from all these components, | |
5038 | * refer to Vol. 1, "Operand Addressing". | |
5039 | */ | |
5040 | int scaling = vmx_instruction_info & 3; | |
5041 | int addr_size = (vmx_instruction_info >> 7) & 7; | |
5042 | bool is_reg = vmx_instruction_info & (1u << 10); | |
5043 | int seg_reg = (vmx_instruction_info >> 15) & 7; | |
5044 | int index_reg = (vmx_instruction_info >> 18) & 0xf; | |
5045 | bool index_is_valid = !(vmx_instruction_info & (1u << 22)); | |
5046 | int base_reg = (vmx_instruction_info >> 23) & 0xf; | |
5047 | bool base_is_valid = !(vmx_instruction_info & (1u << 27)); | |
5048 | ||
5049 | if (is_reg) { | |
5050 | kvm_queue_exception(vcpu, UD_VECTOR); | |
5051 | return 1; | |
5052 | } | |
5053 | ||
5054 | /* Addr = segment_base + offset */ | |
5055 | /* offset = base + [index * scale] + displacement */ | |
5056 | off = exit_qualification; /* holds the displacement */ | |
946c522b SC |
5057 | if (addr_size == 1) |
5058 | off = (gva_t)sign_extend64(off, 31); | |
5059 | else if (addr_size == 0) | |
5060 | off = (gva_t)sign_extend64(off, 15); | |
55d2375e SC |
5061 | if (base_is_valid) |
5062 | off += kvm_register_read(vcpu, base_reg); | |
5063 | if (index_is_valid) | |
e6302698 | 5064 | off += kvm_register_read(vcpu, index_reg) << scaling; |
55d2375e | 5065 | vmx_get_segment(vcpu, &s, seg_reg); |
55d2375e | 5066 | |
8570f9e8 SC |
5067 | /* |
5068 | * The effective address, i.e. @off, of a memory operand is truncated | |
5069 | * based on the address size of the instruction. Note that this is | |
5070 | * the *effective address*, i.e. the address prior to accounting for | |
5071 | * the segment's base. | |
5072 | */ | |
55d2375e | 5073 | if (addr_size == 1) /* 32 bit */ |
8570f9e8 SC |
5074 | off &= 0xffffffff; |
5075 | else if (addr_size == 0) /* 16 bit */ | |
5076 | off &= 0xffff; | |
55d2375e SC |
5077 | |
5078 | /* Checks for #GP/#SS exceptions. */ | |
5079 | exn = false; | |
5080 | if (is_long_mode(vcpu)) { | |
8570f9e8 SC |
5081 | /* |
5082 | * The virtual/linear address is never truncated in 64-bit | |
5083 | * mode, e.g. a 32-bit address size can yield a 64-bit virtual | |
5084 | * address when using FS/GS with a non-zero base. | |
5085 | */ | |
6694e480 LA |
5086 | if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) |
5087 | *ret = s.base + off; | |
5088 | else | |
5089 | *ret = off; | |
8570f9e8 | 5090 | |
b39bd520 | 5091 | *ret = vmx_get_untagged_addr(vcpu, *ret, 0); |
55d2375e SC |
5092 | /* Long mode: #GP(0)/#SS(0) if the memory address is in a |
5093 | * non-canonical form. This is the only check on the memory | |
5094 | * destination for long mode! | |
5095 | */ | |
5096 | exn = is_noncanonical_address(*ret, vcpu); | |
e0dfacbf | 5097 | } else { |
8570f9e8 SC |
5098 | /* |
5099 | * When not in long mode, the virtual/linear address is | |
5100 | * unconditionally truncated to 32 bits regardless of the | |
5101 | * address size. | |
5102 | */ | |
5103 | *ret = (s.base + off) & 0xffffffff; | |
5104 | ||
55d2375e SC |
5105 | /* Protected mode: apply checks for segment validity in the |
5106 | * following order: | |
5107 | * - segment type check (#GP(0) may be thrown) | |
5108 | * - usability check (#GP(0)/#SS(0)) | |
5109 | * - limit check (#GP(0)/#SS(0)) | |
5110 | */ | |
5111 | if (wr) | |
5112 | /* #GP(0) if the destination operand is located in a | |
5113 | * read-only data segment or any code segment. | |
5114 | */ | |
5115 | exn = ((s.type & 0xa) == 0 || (s.type & 8)); | |
5116 | else | |
5117 | /* #GP(0) if the source operand is located in an | |
5118 | * execute-only code segment | |
5119 | */ | |
5120 | exn = ((s.type & 0xa) == 8); | |
5121 | if (exn) { | |
5122 | kvm_queue_exception_e(vcpu, GP_VECTOR, 0); | |
5123 | return 1; | |
5124 | } | |
5125 | /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. | |
5126 | */ | |
5127 | exn = (s.unusable != 0); | |
34333cc6 SC |
5128 | |
5129 | /* | |
5130 | * Protected mode: #GP(0)/#SS(0) if the memory operand is | |
5131 | * outside the segment limit. All CPUs that support VMX ignore | |
5132 | * limit checks for flat segments, i.e. segments with base==0, | |
5133 | * limit==0xffffffff and of type expand-up data or code. | |
55d2375e | 5134 | */ |
34333cc6 SC |
5135 | if (!(s.base == 0 && s.limit == 0xffffffff && |
5136 | ((s.type & 8) || !(s.type & 4)))) | |
fdb28619 | 5137 | exn = exn || ((u64)off + len - 1 > s.limit); |
55d2375e SC |
5138 | } |
5139 | if (exn) { | |
5140 | kvm_queue_exception_e(vcpu, | |
5141 | seg_reg == VCPU_SREG_SS ? | |
5142 | SS_VECTOR : GP_VECTOR, | |
5143 | 0); | |
5144 | return 1; | |
5145 | } | |
5146 | ||
5147 | return 0; | |
5148 | } | |
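
The displacement/base/index/truncation dance above is easy to get wrong, so here is a minimal user-space sketch of the same effective-address math. The function name and the addr_size encoding (0 = 16-bit, 1 = 32-bit, otherwise 64-bit) mirror the code above; everything else is a simplified stand-in, not the KVM API.

```c
#include <stdint.h>
#include <stdio.h>

/* Sketch of the effective-address math in get_vmx_mem_address(); the
 * segment base is applied separately, exactly as in the code above. */
static uint64_t vmx_effective_addr(uint64_t disp, int addr_size,
				   uint64_t base, uint64_t index, int scaling)
{
	int64_t off = (int64_t)disp;

	/* Sign-extend the displacement per the instruction's address size. */
	if (addr_size == 1)		/* 32 bit */
		off = (int32_t)off;
	else if (addr_size == 0)	/* 16 bit */
		off = (int16_t)off;

	off += base;
	off += index << scaling;

	/* Truncate the *effective* address, i.e. pre-segment-base. */
	if (addr_size == 1)
		off &= 0xffffffff;
	else if (addr_size == 0)
		off &= 0xffff;

	return (uint64_t)off;
}

int main(void)
{
	/* 32-bit address size: 0xfffffff0 acts as a -16 displacement,
	 * so base 0x1000 + (index 2 << scaling 3) - 16 prints 0x1000. */
	printf("%#llx\n", (unsigned long long)
	       vmx_effective_addr(0xfffffff0, 1, 0x1000, 2, 3));
	return 0;
}
```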
5149 | ||
7a35e515 VK |
5150 | static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, |
5151 | int *ret) | |
55d2375e SC |
5152 | { |
5153 | gva_t gva; | |
5154 | struct x86_exception e; | |
7a35e515 | 5155 | int r; |
55d2375e | 5156 | |
5addc235 | 5157 | if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), |
fdb28619 | 5158 | vmcs_read32(VMX_INSTRUCTION_INFO), false, |
7a35e515 VK |
5159 | sizeof(*vmpointer), &gva)) { |
5160 | *ret = 1; | |
5161 | return -EINVAL; | |
5162 | } | |
55d2375e | 5163 | |
7a35e515 VK |
5164 | r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); |
5165 | if (r != X86EMUL_CONTINUE) { | |
3f3393b3 | 5166 | *ret = kvm_handle_memory_failure(vcpu, r, &e); |
7a35e515 | 5167 | return -EINVAL; |
55d2375e SC |
5168 | } |
5169 | ||
5170 | return 0; | |
5171 | } | |
5172 | ||
5173 | /* | |
5174 | * Allocate a shadow VMCS and associate it with the currently loaded | |
5175 | * VMCS, unless such a shadow VMCS already exists. The newly allocated | |
5176 | * VMCS is also VMCLEARed, so that it is ready for use. | |
5177 | */ | |
5178 | static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) | |
5179 | { | |
5180 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
5181 | struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; | |
5182 | ||
5183 | /* | |
d6e656cd SC |
5184 | * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it |
5185 | * when L1 executes VMXOFF or the vCPU is forced out of nested | |
5186 | * operation. VMXON faults if the CPU is already post-VMXON, so it | |
5187 | * should be impossible to already have an allocated shadow VMCS. KVM | |
5188 | * doesn't support virtualization of VMCS shadowing, so vmcs01 should | |
5189 | * always be the loaded VMCS. | |
55d2375e | 5190 | */ |
d6e656cd SC |
5191 | if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) |
5192 | return loaded_vmcs->shadow_vmcs; | |
5193 | ||
5194 | loaded_vmcs->shadow_vmcs = alloc_vmcs(true); | |
5195 | if (loaded_vmcs->shadow_vmcs) | |
5196 | vmcs_clear(loaded_vmcs->shadow_vmcs); | |
55d2375e | 5197 | |
55d2375e SC |
5198 | return loaded_vmcs->shadow_vmcs; |
5199 | } | |
5200 | ||
5201 | static int enter_vmx_operation(struct kvm_vcpu *vcpu) | |
5202 | { | |
5203 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
5204 | int r; | |
5205 | ||
5206 | r = alloc_loaded_vmcs(&vmx->nested.vmcs02); | |
5207 | if (r < 0) | |
5208 | goto out_vmcs02; | |
5209 | ||
41836839 | 5210 | vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); |
55d2375e SC |
5211 | if (!vmx->nested.cached_vmcs12) |
5212 | goto out_cached_vmcs12; | |
5213 | ||
8503fea6 | 5214 | vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; |
41836839 | 5215 | vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); |
55d2375e SC |
5216 | if (!vmx->nested.cached_shadow_vmcs12) |
5217 | goto out_cached_shadow_vmcs12; | |
5218 | ||
5219 | if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) | |
5220 | goto out_shadow_vmcs; | |
5221 | ||
5222 | hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, | |
ada0098d | 5223 | HRTIMER_MODE_ABS_PINNED); |
55d2375e SC |
5224 | vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; |
5225 | ||
5226 | vmx->nested.vpid02 = allocate_vpid(); | |
5227 | ||
5228 | vmx->nested.vmcs02_initialized = false; | |
5229 | vmx->nested.vmxon = true; | |
ee85dec2 | 5230 | |
2ef7619d | 5231 | if (vmx_pt_mode_is_host_guest()) { |
ee85dec2 | 5232 | vmx->pt_desc.guest.ctl = 0; |
476c9bd8 | 5233 | pt_update_intercept_for_msr(vcpu); |
ee85dec2 LK |
5234 | } |
5235 | ||
55d2375e SC |
5236 | return 0; |
5237 | ||
5238 | out_shadow_vmcs: | |
5239 | kfree(vmx->nested.cached_shadow_vmcs12); | |
5240 | ||
5241 | out_cached_shadow_vmcs12: | |
5242 | kfree(vmx->nested.cached_vmcs12); | |
5243 | ||
5244 | out_cached_vmcs12: | |
5245 | free_loaded_vmcs(&vmx->nested.vmcs02); | |
5246 | ||
5247 | out_vmcs02: | |
5248 | return -ENOMEM; | |
5249 | } | |
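
enter_vmx_operation() is a textbook instance of the kernel's goto-unwind idiom: each allocation gets a matching label, and a failure jumps to the label that frees exactly what was already set up, in reverse order of acquisition. A reduced sketch of the pattern (the two resources are hypothetical, not KVM objects):

```c
#include <stdlib.h>

/* Goto-unwind sketch: on failure, jump to the label that releases
 * everything allocated so far, most recent first. */
static int setup_two_resources(void **a, void **b)
{
	*a = malloc(64);
	if (!*a)
		goto out;

	*b = malloc(64);
	if (!*b)
		goto out_free_a;

	return 0;

out_free_a:
	free(*a);	/* unwind only what was already acquired */
out:
	return -1;	/* the kernel code returns -ENOMEM here */
}
```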
5250 | ||
ed7023a1 | 5251 | /* Emulate the VMXON instruction. */ |
a645c2b5 | 5252 | static int handle_vmxon(struct kvm_vcpu *vcpu) |
55d2375e SC |
5253 | { |
5254 | int ret; | |
5255 | gpa_t vmptr; | |
2e408936 | 5256 | uint32_t revision; |
55d2375e | 5257 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
32ad73db SC |
5258 | const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED |
5259 | | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; | |
55d2375e SC |
5260 | |
5261 | /* | |
9cc40932 SC |
5262 | * Manually perform the CR4.VMXE check; KVM must force CR4.VMXE=1 to enter
5263 | * the guest and so cannot rely on hardware to perform the check, | |
5264 | * which has higher priority than VM-Exit (see Intel SDM's pseudocode | |
5265 | * for VMXON). | |
c7d855c2 | 5266 | * |
9cc40932 SC |
5267 | * Rely on hardware for the other pre-VM-Exit checks: CR0.PE=1, !VM86
5268 | * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't | |
5269 | * force any of the relevant guest state. For a restricted guest, KVM | |
5270 | * does force CR0.PE=1, but only to also force VM86 in order to emulate | |
5271 | * Real Mode, and so there's no need to check CR0.PE manually. | |
55d2375e | 5272 | */ |
607475cf | 5273 | if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) { |
55d2375e SC |
5274 | kvm_queue_exception(vcpu, UD_VECTOR); |
5275 | return 1; | |
5276 | } | |
5277 | ||
c7d855c2 | 5278 | /* |
9cc40932 SC |
5279 | * The CPL is checked for "not in VMX operation" and for "in VMX root", |
5280 | * and has higher priority than the VM-Fail due to being post-VMXON, | |
5281 | * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root, | |
5282 | * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits | |
5283 | * from L2 to L1, i.e. there's no need to check for the vCPU being in | |
5284 | * VMX non-root. | |
5285 | * | |
5286 | * Forwarding the VM-Exit unconditionally, i.e. without performing the | |
5287 | * #UD checks (see above), is functionally ok because KVM doesn't allow | |
5288 | * L1 to run L2 without CR4.VMXE=1, and because KVM never modifies L2's
5289 | * CR0 or CR4, i.e. it's L1's responsibility to emulate #UDs that are
5290 | * missed by hardware due to shadowing CR0 and/or CR4. | |
c7d855c2 | 5291 | */ |
55d2375e SC |
5292 | if (vmx_get_cpl(vcpu)) { |
5293 | kvm_inject_gp(vcpu, 0); | |
5294 | return 1; | |
5295 | } | |
5296 | ||
5297 | if (vmx->nested.vmxon) | |
b2656e4d | 5298 | return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); |
55d2375e | 5299 | |
9cc40932 SC |
5300 | /* |
5301 | * Invalid CR0/CR4 generates #GP. These checks are performed if and | |
5302 | * only if the vCPU isn't already in VMX operation, i.e. effectively | |
5303 | * have lower priority than the VM-Fail above. | |
5304 | */ | |
5305 | if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || | |
5306 | !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { | |
5307 | kvm_inject_gp(vcpu, 0); | |
5308 | return 1; | |
5309 | } | |
5310 | ||
55d2375e SC |
5311 | if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) |
5312 | != VMXON_NEEDED_FEATURES) { | |
5313 | kvm_inject_gp(vcpu, 0); | |
5314 | return 1; | |
5315 | } | |
5316 | ||
7a35e515 VK |
5317 | if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) |
5318 | return ret; | |
55d2375e SC |
5319 | |
5320 | /* | |
5321 | * SDM 3: 24.11.5 | |
5322 | * The first 4 bytes of the VMXON region contain the supported
5323 | * VMCS revision identifier.
5324 | *
5325 | * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case,
5326 | * which would limit the physical address width to 32 bits.
5327 | */ | |
e0bf2665 | 5328 | if (!page_address_valid(vcpu, vmptr)) |
55d2375e SC |
5329 | return nested_vmx_failInvalid(vcpu); |
5330 | ||
2e408936 KA |
5331 | if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || |
5332 | revision != VMCS12_REVISION) | |
55d2375e | 5333 | return nested_vmx_failInvalid(vcpu); |
55d2375e SC |
5334 | |
5335 | vmx->nested.vmxon_ptr = vmptr; | |
5336 | ret = enter_vmx_operation(vcpu); | |
5337 | if (ret) | |
5338 | return ret; | |
5339 | ||
5340 | return nested_vmx_succeed(vcpu); | |
5341 | } | |
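
The IA32_FEATURE_CONTROL test above uses the usual mask-and-compare idiom for requiring several bits at once: masking with the full set and comparing against it rejects the case where only some of the bits are set. A self-contained sketch; the bit positions match the architectural MSR layout (bit 0 is the lock bit, bit 2 enables VMX outside SMX), the helper name is illustrative:

```c
#include <stdbool.h>
#include <stdint.h>

#define FEAT_CTL_LOCKED				(1ULL << 0)
#define FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX	(1ULL << 2)

/* VMXON requires *both* bits; (x & m) == m is the standard
 * "all required bits present" test used by handle_vmxon(). */
static bool vmxon_feature_control_ok(uint64_t feature_control)
{
	const uint64_t needed = FEAT_CTL_LOCKED |
				FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;

	return (feature_control & needed) == needed;
}
```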
5342 | ||
5343 | static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) | |
5344 | { | |
5345 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
5346 | ||
64c78508 | 5347 | if (vmx->nested.current_vmptr == INVALID_GPA) |
55d2375e SC |
5348 | return; |
5349 | ||
7952d769 SC |
5350 | copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); |
5351 | ||
55d2375e SC |
5352 | if (enable_shadow_vmcs) { |
5353 | /* copy to memory all shadowed fields in case | |
5354 | they were modified */ | |
5355 | copy_shadow_to_vmcs12(vmx); | |
55d2375e SC |
5356 | vmx_disable_shadow_vmcs(vmx); |
5357 | } | |
5358 | vmx->nested.posted_intr_nv = -1; | |
5359 | ||
5360 | /* Flush VMCS12 to guest memory */ | |
5361 | kvm_vcpu_write_guest_page(vcpu, | |
5362 | vmx->nested.current_vmptr >> PAGE_SHIFT, | |
5363 | vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); | |
5364 | ||
0c1c92f1 | 5365 | kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); |
55d2375e | 5366 | |
64c78508 | 5367 | vmx->nested.current_vmptr = INVALID_GPA; |
55d2375e SC |
5368 | } |
5369 | ||
5370 | /* Emulate the VMXOFF instruction */ | |
a645c2b5 | 5371 | static int handle_vmxoff(struct kvm_vcpu *vcpu) |
55d2375e SC |
5372 | { |
5373 | if (!nested_vmx_check_permission(vcpu)) | |
5374 | return 1; | |
4b9852f4 | 5375 | |
55d2375e | 5376 | free_nested(vcpu); |
4b9852f4 | 5377 | |
ea2f00c6 SC |
5378 | if (kvm_apic_has_pending_init_or_sipi(vcpu)) |
5379 | kvm_make_request(KVM_REQ_EVENT, vcpu); | |
4b9852f4 | 5380 | |
55d2375e SC |
5381 | return nested_vmx_succeed(vcpu); |
5382 | } | |
5383 | ||
5384 | /* Emulate the VMCLEAR instruction */ | |
5385 | static int handle_vmclear(struct kvm_vcpu *vcpu) | |
5386 | { | |
5387 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
5388 | u32 zero = 0; | |
5389 | gpa_t vmptr; | |
7a35e515 | 5390 | int r; |
55d2375e SC |
5391 | |
5392 | if (!nested_vmx_check_permission(vcpu)) | |
5393 | return 1; | |
5394 | ||
7a35e515 VK |
5395 | if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) |
5396 | return r; | |
55d2375e | 5397 | |
e0bf2665 | 5398 | if (!page_address_valid(vcpu, vmptr)) |
b2656e4d | 5399 | return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); |
55d2375e SC |
5400 | |
5401 | if (vmptr == vmx->nested.vmxon_ptr) | |
b2656e4d | 5402 | return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); |
55d2375e | 5403 | |
b2e02f82 | 5404 | if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) { |
55d2375e SC |
5405 | if (vmptr == vmx->nested.current_vmptr) |
5406 | nested_release_vmcs12(vcpu); | |
5407 | ||
057b1875 SC |
5408 | /* |
5409 | * Silently ignore memory errors on VMCLEAR, Intel's pseudocode | |
5410 | * for VMCLEAR includes an "ensure that data for VMCS referenced
5411 | * by the operand is in memory" clause that guards writes to | |
5412 | * memory, i.e. doing nothing for I/O is architecturally valid. | |
5413 | * | |
5414 | * FIXME: Suppress failures if and only if no memslot is found, | |
5415 | * i.e. exit to userspace if __copy_to_user() fails. | |
5416 | */ | |
5417 | (void)kvm_vcpu_write_guest(vcpu, | |
5418 | vmptr + offsetof(struct vmcs12, | |
5419 | launch_state), | |
5420 | &zero, sizeof(zero)); | |
55d2375e SC |
5421 | } |
5422 | ||
5423 | return nested_vmx_succeed(vcpu); | |
5424 | } | |
5425 | ||
55d2375e SC |
5426 | /* Emulate the VMLAUNCH instruction */ |
5427 | static int handle_vmlaunch(struct kvm_vcpu *vcpu) | |
5428 | { | |
5429 | return nested_vmx_run(vcpu, true); | |
5430 | } | |
5431 | ||
5432 | /* Emulate the VMRESUME instruction */ | |
5433 | static int handle_vmresume(struct kvm_vcpu *vcpu) | |
5434 | { | |
5435 | ||
5436 | return nested_vmx_run(vcpu, false); | |
5437 | } | |
5438 | ||
5439 | static int handle_vmread(struct kvm_vcpu *vcpu) | |
5440 | { | |
dd2d6042 JM |
5441 | struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) |
5442 | : get_vmcs12(vcpu); | |
5addc235 | 5443 | unsigned long exit_qualification = vmx_get_exit_qual(vcpu); |
c90f4d03 JM |
5444 | u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); |
5445 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
f7eea636 | 5446 | struct x86_exception e; |
c90f4d03 JM |
5447 | unsigned long field; |
5448 | u64 value; | |
5449 | gva_t gva = 0; | |
1c6f0b47 | 5450 | short offset; |
7a35e515 | 5451 | int len, r; |
55d2375e SC |
5452 | |
5453 | if (!nested_vmx_check_permission(vcpu)) | |
5454 | return 1; | |
5455 | ||
55d2375e | 5456 | /* Decode instruction info and find the field to read */ |
27b4a9c4 | 5457 | field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); |
1c6f0b47 | 5458 | |
453e42b0 | 5459 | if (!nested_vmx_is_evmptr12_valid(vmx)) { |
6cbbaab6 VK |
5460 | /* |
5461 | * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, | |
5462 | * any VMREAD sets the ALU flags for VMfailInvalid. | |
5463 | */ | |
5464 | if (vmx->nested.current_vmptr == INVALID_GPA || | |
5465 | (is_guest_mode(vcpu) && | |
5466 | get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) | |
5467 | return nested_vmx_failInvalid(vcpu); | |
5468 | ||
5469 | offset = get_vmcs12_field_offset(field); | |
5470 | if (offset < 0) | |
5471 | return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); | |
55d2375e | 5472 | |
6cbbaab6 VK |
5473 | if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) |
5474 | copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); | |
7952d769 | 5475 | |
6cbbaab6 VK |
5476 | /* Read the field, zero-extended to a u64 value */ |
5477 | value = vmcs12_read_any(vmcs12, field, offset); | |
5478 | } else { | |
5479 | /* | |
5480 | * Hyper-V TLFS (as of 6.0b) explicitly states that while an
5481 | * enlightened VMCS is active, VMREAD/VMWRITE instructions are
5482 | * unsupported. Unfortunately, certain versions of Windows 11
5483 | * don't comply with this requirement, which is not enforced in
5484 | * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a | |
5485 | * workaround, as misbehaving guests will panic on VM-Fail. | |
5486 | * Note, enlightened VMCS is incompatible with shadow VMCS so | |
5487 | * all VMREADs from L2 should go to L1. | |
5488 | */ | |
5489 | if (WARN_ON_ONCE(is_guest_mode(vcpu))) | |
5490 | return nested_vmx_failInvalid(vcpu); | |
5491 | ||
5492 | offset = evmcs_field_offset(field, NULL); | |
5493 | if (offset < 0) | |
5494 | return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); | |
5495 | ||
5496 | /* Read the field, zero-extended to a u64 value */ | |
c98842b2 | 5497 | value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset); |
6cbbaab6 | 5498 | } |
1c6f0b47 | 5499 | |
55d2375e SC |
5500 | /* |
5501 | * Now copy part of this value to register or memory, as requested. | |
5502 | * Note that the number of bits actually copied is 32 or 64 depending | |
5503 | * on the guest's mode (32 or 64 bit), not on the given field's length. | |
5504 | */ | |
c90f4d03 | 5505 | if (instr_info & BIT(10)) { |
27b4a9c4 | 5506 | kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); |
55d2375e | 5507 | } else { |
fdb28619 | 5508 | len = is_64_bit_mode(vcpu) ? 8 : 4; |
55d2375e | 5509 | if (get_vmx_mem_address(vcpu, exit_qualification, |
c90f4d03 | 5510 | instr_info, true, len, &gva)) |
55d2375e SC |
5511 | return 1; |
5512 | /* _system ok, nested_vmx_check_permission has verified cpl=0 */ | |
7a35e515 VK |
5513 | r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); |
5514 | if (r != X86EMUL_CONTINUE) | |
3f3393b3 | 5515 | return kvm_handle_memory_failure(vcpu, r, &e); |
55d2375e SC |
5516 | } |
5517 | ||
5518 | return nested_vmx_succeed(vcpu); | |
5519 | } | |
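
handle_vmread() pulls three things out of VMX_INSTRUCTION_INFO: bit 10 selects a register versus memory destination, bits 6:3 name the destination register, and bits 31:28 name the register holding the field encoding. A small decoder mirroring exactly those accesses (the struct is illustrative, not a KVM type):

```c
#include <stdbool.h>
#include <stdint.h>

/* Illustrative decode of the VMX_INSTRUCTION_INFO bits used above. */
struct vmread_operands {
	bool	reg_operand;	/* bit 10: destination is a register   */
	int	reg1;		/* bits 6:3: destination register      */
	int	reg2;		/* bits 31:28: register holding field  */
};

static struct vmread_operands decode_vmread(uint32_t instr_info)
{
	return (struct vmread_operands){
		.reg_operand	= instr_info & (1u << 10),
		.reg1		= (instr_info >> 3) & 0xf,
		.reg2		= (instr_info >> 28) & 0xf,
	};
}
```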
5520 | ||
e2174295 SC |
5521 | static bool is_shadow_field_rw(unsigned long field) |
5522 | { | |
5523 | switch (field) { | |
5524 | #define SHADOW_FIELD_RW(x, y) case x: | |
5525 | #include "vmcs_shadow_fields.h" | |
5526 | return true; | |
5527 | default: | |
5528 | break; | |
5529 | } | |
5530 | return false; | |
5531 | } | |
5532 | ||
5533 | static bool is_shadow_field_ro(unsigned long field) | |
5534 | { | |
5535 | switch (field) { | |
5536 | #define SHADOW_FIELD_RO(x, y) case x: | |
5537 | #include "vmcs_shadow_fields.h" | |
5538 | return true; | |
5539 | default: | |
5540 | break; | |
5541 | } | |
5542 | return false; | |
5543 | } | |
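
is_shadow_field_rw() and is_shadow_field_ro() are generated with the X-macro trick: vmcs_shadow_fields.h is included repeatedly under different definitions of SHADOW_FIELD_RW/SHADOW_FIELD_RO, so a single field list yields both the offset tables near the top of the file and these case labels. A generic miniature of the idiom, with a hypothetical fields.h standing in for vmcs_shadow_fields.h:

```c
/*
 * fields.h (hypothetical) contains only X-macro invocations:
 *     FIELD(FIELD_FOO, 0x100)
 *     FIELD(FIELD_BAR, 0x104)
 * Each include expands the same list differently.
 */

/* Expansion 1: generate the encodings. */
enum field_encoding {
#define FIELD(name, enc) name = (enc),
#include "fields.h"
#undef FIELD
};

/* Expansion 2: generate a membership test from the same list. */
static int is_known_field(unsigned long f)
{
	switch (f) {
#define FIELD(name, enc) case (enc):
#include "fields.h"
#undef FIELD
		return 1;
	default:
		return 0;
	}
}
```

The payoff is the same as in nested.c: adding one line to the shared header updates every consumer, so the bitmap setup, the offset tables, and these predicates can never drift out of sync.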
55d2375e SC |
5544 | |
5545 | static int handle_vmwrite(struct kvm_vcpu *vcpu) | |
5546 | { | |
c90f4d03 JM |
5547 | struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) |
5548 | : get_vmcs12(vcpu); | |
5addc235 | 5549 | unsigned long exit_qualification = vmx_get_exit_qual(vcpu); |
c90f4d03 JM |
5550 | u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); |
5551 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
5552 | struct x86_exception e; | |
55d2375e | 5553 | unsigned long field; |
c90f4d03 | 5554 | short offset; |
55d2375e | 5555 | gva_t gva; |
7a35e515 | 5556 | int len, r; |
55d2375e | 5557 | |
c90f4d03 JM |
5558 | /* |
5559 | * The value to write might be 32 or 64 bits, depending on L1's long | |
55d2375e SC |
5560 | * mode, and eventually we need to write that into a field of several |
5561 | * possible lengths. The code below first zero-extends the value to 64 | |
c90f4d03 | 5562 | * bit (value), and then copies only the appropriate number of |
55d2375e SC |
5563 | * bits into the vmcs12 field. |
5564 | */ | |
c90f4d03 | 5565 | u64 value = 0; |
55d2375e SC |
5566 | |
5567 | if (!nested_vmx_check_permission(vcpu)) | |
5568 | return 1; | |
5569 | ||
dd2d6042 | 5570 | /* |
64c78508 | 5571 | * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, |
dd2d6042 JM |
5572 | * any VMWRITE sets the ALU flags for VMfailInvalid. |
5573 | */ | |
64c78508 | 5574 | if (vmx->nested.current_vmptr == INVALID_GPA || |
dd2d6042 | 5575 | (is_guest_mode(vcpu) && |
64c78508 | 5576 | get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) |
55d2375e SC |
5577 | return nested_vmx_failInvalid(vcpu); |
5578 | ||
c90f4d03 | 5579 | if (instr_info & BIT(10)) |
27b4a9c4 | 5580 | value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); |
55d2375e | 5581 | else { |
fdb28619 | 5582 | len = is_64_bit_mode(vcpu) ? 8 : 4; |
55d2375e | 5583 | if (get_vmx_mem_address(vcpu, exit_qualification, |
c90f4d03 | 5584 | instr_info, false, len, &gva)) |
55d2375e | 5585 | return 1; |
7a35e515 VK |
5586 | r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); |
5587 | if (r != X86EMUL_CONTINUE) | |
3f3393b3 | 5588 | return kvm_handle_memory_failure(vcpu, r, &e); |
55d2375e SC |
5589 | } |
5590 | ||
27b4a9c4 | 5591 | field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); |
693e02cc | 5592 | |
2423a4c0 | 5593 | offset = get_vmcs12_field_offset(field); |
693e02cc | 5594 | if (offset < 0) |
b2656e4d | 5595 | return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); |
55d2375e | 5596 | |
55d2375e SC |
5597 | /* |
5598 | * If the vCPU supports "VMWRITE to any supported field in the | |
5599 | * VMCS," then the "read-only" fields are actually read/write. | |
5600 | */ | |
5601 | if (vmcs_field_readonly(field) && | |
5602 | !nested_cpu_has_vmwrite_any_field(vcpu)) | |
b2656e4d | 5603 | return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); |
55d2375e | 5604 | |
dd2d6042 JM |
5605 | /* |
5606 | * Ensure vmcs12 is up-to-date before any VMWRITE that dirties | |
5607 | * vmcs12, else we may clobber a field or consume a stale value.
5608 | */ | |
5609 | if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) | |
5610 | copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); | |
55d2375e SC |
5611 | |
5612 | /* | |
b6437805 SC |
5613 | * Some Intel CPUs intentionally drop the reserved bits of the AR byte |
5614 | * fields on VMWRITE. Emulate this behavior to ensure consistent KVM | |
5615 | * behavior regardless of the underlying hardware, e.g. if an AR_BYTE | |
5616 | * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD | |
5617 | * from L1 will return a different value than VMREAD from L2 (L1 sees | |
5618 | * the stripped down value, L2 sees the full value as stored by KVM). | |
55d2375e | 5619 | */ |
b6437805 | 5620 | if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) |
c90f4d03 | 5621 | value &= 0x1f0ff; |
b6437805 | 5622 | |
c90f4d03 | 5623 | vmcs12_write_any(vmcs12, field, offset, value); |
55d2375e SC |
5624 | |
5625 | /* | |
e2174295 SC |
5626 | * Do not track vmcs12 dirty-state if in guest-mode as we actually |
5627 | * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated | |
5628 | * by L1 without a vmexit are always updated in the vmcs02, i.e. don't | |
5629 | * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. | |
55d2375e | 5630 | */ |
e2174295 SC |
5631 | if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { |
5632 | /* | |
5633 | * L1 can read these fields without exiting, ensure the | |
5634 | * shadow VMCS is up-to-date. | |
5635 | */ | |
5636 | if (enable_shadow_vmcs && is_shadow_field_ro(field)) { | |
5637 | preempt_disable(); | |
5638 | vmcs_load(vmx->vmcs01.shadow_vmcs); | |
fadcead0 | 5639 | |
c90f4d03 | 5640 | __vmcs_writel(field, value); |
fadcead0 | 5641 | |
e2174295 SC |
5642 | vmcs_clear(vmx->vmcs01.shadow_vmcs); |
5643 | vmcs_load(vmx->loaded_vmcs->vmcs); | |
5644 | preempt_enable(); | |
55d2375e | 5645 | } |
e2174295 | 5646 | vmx->nested.dirty_vmcs12 = true; |
55d2375e SC |
5647 | } |
5648 | ||
5649 | return nested_vmx_succeed(vcpu); | |
5650 | } | |
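
The 0x1f0ff mask above corresponds to the VMX segment access-rights layout: it keeps type/S/DPL/P in bits 7:0 and AVL/L/D-B/G/unusable in bits 16:12, and strips the reserved bits 11:8 and 31:17 that some CPUs drop on VMWRITE. A one-line check makes the bit arithmetic concrete:

```c
#include <assert.h>
#include <stdint.h>

#define VMX_AR_KEPT_BITS	0x1f0ffu	/* bits 7:0 and 16:12 */

int main(void)
{
	/* An all-ones AR value written by L1 is reduced to the
	 * architecturally meaningful bits, as handle_vmwrite() does. */
	uint32_t value = 0xffffffff;

	value &= VMX_AR_KEPT_BITS;
	assert(value == 0x1f0ff);
	return 0;
}
```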
5651 | ||
5652 | static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) | |
5653 | { | |
5654 | vmx->nested.current_vmptr = vmptr; | |
5655 | if (enable_shadow_vmcs) { | |
fe7f895d | 5656 | secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); |
55d2375e SC |
5657 | vmcs_write64(VMCS_LINK_POINTER, |
5658 | __pa(vmx->vmcs01.shadow_vmcs)); | |
3731905e | 5659 | vmx->nested.need_vmcs12_to_shadow_sync = true; |
55d2375e SC |
5660 | } |
5661 | vmx->nested.dirty_vmcs12 = true; | |
ed2a4800 | 5662 | vmx->nested.force_msr_bitmap_recalc = true; |
55d2375e SC |
5663 | } |
5664 | ||
5665 | /* Emulate the VMPTRLD instruction */ | |
5666 | static int handle_vmptrld(struct kvm_vcpu *vcpu) | |
5667 | { | |
5668 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
5669 | gpa_t vmptr; | |
7a35e515 | 5670 | int r; |
55d2375e SC |
5671 | |
5672 | if (!nested_vmx_check_permission(vcpu)) | |
5673 | return 1; | |
5674 | ||
7a35e515 VK |
5675 | if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) |
5676 | return r; | |
55d2375e | 5677 | |
e0bf2665 | 5678 | if (!page_address_valid(vcpu, vmptr)) |
b2656e4d | 5679 | return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); |
55d2375e SC |
5680 | |
5681 | if (vmptr == vmx->nested.vmxon_ptr) | |
b2656e4d | 5682 | return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); |
55d2375e SC |
5683 | |
5684 | /* Forbid normal VMPTRLD if Enlightened version was used */ | |
453e42b0 | 5685 | if (nested_vmx_is_evmptr12_valid(vmx)) |
55d2375e SC |
5686 | return 1; |
5687 | ||
5688 | if (vmx->nested.current_vmptr != vmptr) { | |
cee66664 DW |
5689 | struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; |
5690 | struct vmcs_hdr hdr; | |
55d2375e | 5691 | |
8503fea6 | 5692 | if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { |
55d2375e SC |
5693 | /* |
5694 | * Reads from an unbacked page return all 1s, | |
5695 | * which means that the 32 bits located at the | |
5696 | * given physical address won't match the required | |
5697 | * VMCS12_REVISION identifier. | |
5698 | */ | |
b2656e4d | 5699 | return nested_vmx_fail(vcpu, |
55d2375e | 5700 | VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); |
55d2375e | 5701 | } |
b146b839 | 5702 | |
cee66664 DW |
5703 | if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, |
5704 | offsetof(struct vmcs12, hdr), | |
5705 | sizeof(hdr))) { | |
5706 | return nested_vmx_fail(vcpu, | |
5707 | VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); | |
5708 | } | |
b146b839 | 5709 | |
cee66664 DW |
5710 | if (hdr.revision_id != VMCS12_REVISION || |
5711 | (hdr.shadow_vmcs && | |
55d2375e | 5712 | !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { |
b2656e4d | 5713 | return nested_vmx_fail(vcpu, |
55d2375e SC |
5714 | VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); |
5715 | } | |
5716 | ||
5717 | nested_release_vmcs12(vcpu); | |
5718 | ||
5719 | /* | |
5720 | * Load VMCS12 from guest memory since it is not already | |
5721 | * cached. | |
5722 | */ | |
cee66664 DW |
5723 | if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, |
5724 | VMCS12_SIZE)) { | |
5725 | return nested_vmx_fail(vcpu, | |
5726 | VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); | |
5727 | } | |
55d2375e SC |
5728 | |
5729 | set_current_vmptr(vmx, vmptr); | |
5730 | } | |
5731 | ||
5732 | return nested_vmx_succeed(vcpu); | |
5733 | } | |
5734 | ||
5735 | /* Emulate the VMPTRST instruction */ | |
5736 | static int handle_vmptrst(struct kvm_vcpu *vcpu) | |
5737 | { | |
5addc235 | 5738 | unsigned long exit_qual = vmx_get_exit_qual(vcpu); |
55d2375e SC |
5739 | u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); |
5740 | gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; | |
5741 | struct x86_exception e; | |
5742 | gva_t gva; | |
7a35e515 | 5743 | int r; |
55d2375e SC |
5744 | |
5745 | if (!nested_vmx_check_permission(vcpu)) | |
5746 | return 1; | |
5747 | ||
453e42b0 | 5748 | if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu)))) |
55d2375e SC |
5749 | return 1; |
5750 | ||
fdb28619 EK |
5751 | if (get_vmx_mem_address(vcpu, exit_qual, instr_info, |
5752 | true, sizeof(gpa_t), &gva)) | |
55d2375e SC |
5753 | return 1; |
5754 | /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ | |
7a35e515 VK |
5755 | r = kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, |
5756 | sizeof(gpa_t), &e); | |
5757 | if (r != X86EMUL_CONTINUE) | |
3f3393b3 | 5758 | return kvm_handle_memory_failure(vcpu, r, &e); |
7a35e515 | 5759 | |
55d2375e SC |
5760 | return nested_vmx_succeed(vcpu); |
5761 | } | |
5762 | ||
5763 | /* Emulate the INVEPT instruction */ | |
5764 | static int handle_invept(struct kvm_vcpu *vcpu) | |
5765 | { | |
5766 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
5767 | u32 vmx_instruction_info, types; | |
ce8fe7b7 SC |
5768 | unsigned long type, roots_to_free; |
5769 | struct kvm_mmu *mmu; | |
55d2375e SC |
5770 | gva_t gva; |
5771 | struct x86_exception e; | |
5772 | struct { | |
5773 | u64 eptp, gpa; | |
5774 | } operand; | |
329bd56c | 5775 | int i, r, gpr_index; |
55d2375e SC |
5776 | |
5777 | if (!(vmx->nested.msrs.secondary_ctls_high & | |
5778 | SECONDARY_EXEC_ENABLE_EPT) || | |
5779 | !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { | |
5780 | kvm_queue_exception(vcpu, UD_VECTOR); | |
5781 | return 1; | |
5782 | } | |
5783 | ||
5784 | if (!nested_vmx_check_permission(vcpu)) | |
5785 | return 1; | |
5786 | ||
5787 | vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | |
329bd56c VS |
5788 | gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); |
5789 | type = kvm_register_read(vcpu, gpr_index); | |
55d2375e SC |
5790 | |
5791 | types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; | |
5792 | ||
5793 | if (type >= 32 || !(types & (1 << type))) | |
b2656e4d | 5794 | return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); |
55d2375e SC |
5795 | |
5796 | /* According to the Intel VMX instruction reference, the memory | |
5797 | * operand is read even if it isn't needed (e.g., for type==global) | |
5798 | */ | |
5addc235 | 5799 | if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), |
fdb28619 | 5800 | vmx_instruction_info, false, sizeof(operand), &gva)) |
55d2375e | 5801 | return 1; |
7a35e515 VK |
5802 | r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); |
5803 | if (r != X86EMUL_CONTINUE) | |
3f3393b3 | 5804 | return kvm_handle_memory_failure(vcpu, r, &e); |
55d2375e | 5805 | |
ce8fe7b7 SC |
5806 | /* |
5807 | * Nested EPT roots are always held through guest_mmu, | |
5808 | * not root_mmu. | |
5809 | */ | |
5810 | mmu = &vcpu->arch.guest_mmu; | |
5811 | ||
55d2375e | 5812 | switch (type) { |
b1190198 | 5813 | case VMX_EPT_EXTENT_CONTEXT: |
eed0030e | 5814 | if (!nested_vmx_check_eptp(vcpu, operand.eptp)) |
b2656e4d | 5815 | return nested_vmx_fail(vcpu, |
eed0030e | 5816 | VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); |
f8aa7e39 | 5817 | |
ce8fe7b7 | 5818 | roots_to_free = 0; |
b9e5603c | 5819 | if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd, |
ce8fe7b7 SC |
5820 | operand.eptp)) |
5821 | roots_to_free |= KVM_MMU_ROOT_CURRENT; | |
5822 | ||
5823 | for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { | |
5824 | if (nested_ept_root_matches(mmu->prev_roots[i].hpa, | |
be01e8e2 | 5825 | mmu->prev_roots[i].pgd, |
ce8fe7b7 SC |
5826 | operand.eptp)) |
5827 | roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); | |
5828 | } | |
5829 | break; | |
eed0030e | 5830 | case VMX_EPT_EXTENT_GLOBAL: |
ce8fe7b7 | 5831 | roots_to_free = KVM_MMU_ROOTS_ALL; |
55d2375e SC |
5832 | break; |
5833 | default: | |
f9336e32 | 5834 | BUG(); |
55d2375e SC |
5835 | break; |
5836 | } | |
5837 | ||
ce8fe7b7 | 5838 | if (roots_to_free) |
0c1c92f1 | 5839 | kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); |
ce8fe7b7 | 5840 | |
55d2375e SC |
5841 | return nested_vmx_succeed(vcpu); |
5842 | } | |
5843 | ||
5844 | static int handle_invvpid(struct kvm_vcpu *vcpu) | |
5845 | { | |
5846 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
5847 | u32 vmx_instruction_info; | |
5848 | unsigned long type, types; | |
5849 | gva_t gva; | |
5850 | struct x86_exception e; | |
5851 | struct { | |
5852 | u64 vpid; | |
5853 | u64 gla; | |
5854 | } operand; | |
5855 | u16 vpid02; | |
329bd56c | 5856 | int r, gpr_index; |
55d2375e SC |
5857 | |
5858 | if (!(vmx->nested.msrs.secondary_ctls_high & | |
5859 | SECONDARY_EXEC_ENABLE_VPID) || | |
5860 | !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { | |
5861 | kvm_queue_exception(vcpu, UD_VECTOR); | |
5862 | return 1; | |
5863 | } | |
5864 | ||
5865 | if (!nested_vmx_check_permission(vcpu)) | |
5866 | return 1; | |
5867 | ||
5868 | vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | |
329bd56c VS |
5869 | gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); |
5870 | type = kvm_register_read(vcpu, gpr_index); | |
55d2375e SC |
5871 | |
5872 | types = (vmx->nested.msrs.vpid_caps & | |
5873 | VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; | |
5874 | ||
5875 | if (type >= 32 || !(types & (1 << type))) | |
b2656e4d | 5876 | return nested_vmx_fail(vcpu, |
55d2375e SC |
5877 | VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); |
5878 | ||
5879 | /* According to the Intel VMX instruction reference, the memory
5880 | * operand is read even if it isn't needed (e.g., for type==global) | |
5881 | */ | |
5addc235 | 5882 | if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), |
fdb28619 | 5883 | vmx_instruction_info, false, sizeof(operand), &gva)) |
55d2375e | 5884 | return 1; |
7a35e515 VK |
5885 | r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); |
5886 | if (r != X86EMUL_CONTINUE) | |
3f3393b3 | 5887 | return kvm_handle_memory_failure(vcpu, r, &e); |
7a35e515 | 5888 | |
55d2375e | 5889 | if (operand.vpid >> 16) |
b2656e4d | 5890 | return nested_vmx_fail(vcpu, |
55d2375e SC |
5891 | VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); |
5892 | ||
5893 | vpid02 = nested_get_vpid02(vcpu); | |
5894 | switch (type) { | |
5895 | case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: | |
b39bd520 BW |
5896 | /* |
5897 | * LAM doesn't apply to addresses that are inputs to TLB | |
5898 | * invalidation. | |
5899 | */ | |
55d2375e SC |
5900 | if (!operand.vpid || |
5901 | is_noncanonical_address(operand.gla, vcpu)) | |
b2656e4d | 5902 | return nested_vmx_fail(vcpu, |
55d2375e | 5903 | VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); |
bc41d0c4 | 5904 | vpid_sync_vcpu_addr(vpid02, operand.gla); |
55d2375e SC |
5905 | break; |
5906 | case VMX_VPID_EXTENT_SINGLE_CONTEXT: | |
5907 | case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: | |
5908 | if (!operand.vpid) | |
b2656e4d | 5909 | return nested_vmx_fail(vcpu, |
55d2375e | 5910 | VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); |
446ace4b | 5911 | vpid_sync_context(vpid02); |
55d2375e SC |
5912 | break; |
5913 | case VMX_VPID_EXTENT_ALL_CONTEXT: | |
446ace4b | 5914 | vpid_sync_context(vpid02); |
55d2375e SC |
5915 | break; |
5916 | default: | |
5917 | WARN_ON_ONCE(1); | |
5918 | return kvm_skip_emulated_instruction(vcpu); | |
5919 | } | |
5920 | ||
d6e3f838 JS |
5921 | /* |
5922 | * Sync the shadow page tables if EPT is disabled, L1 is invalidating | |
25b62c62 SC |
5923 | * linear mappings for L2 (tagged with L2's VPID). Free all guest |
5924 | * roots as VPIDs are not tracked in the MMU role. | |
d6e3f838 JS |
5925 | * |
5926 | * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share | |
5927 | * an MMU when EPT is disabled. | |
5928 | * | |
5929 | * TODO: sync only the affected SPTEs for INDIVIDUAL_ADDR.
5930 | */ | |
5931 | if (!enable_ept) | |
0c1c92f1 | 5932 | kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu); |
d6e3f838 | 5933 | |
55d2375e SC |
5934 | return nested_vmx_succeed(vcpu); |
5935 | } | |
5936 | ||
5937 | static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, | |
5938 | struct vmcs12 *vmcs12) | |
5939 | { | |
2b3eaf81 | 5940 | u32 index = kvm_rcx_read(vcpu); |
ac6389ab | 5941 | u64 new_eptp; |
55d2375e | 5942 | |
c5ffd408 | 5943 | if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) |
55d2375e | 5944 | return 1; |
55d2375e SC |
5945 | if (index >= VMFUNC_EPTP_ENTRIES) |
5946 | return 1; | |
5947 | ||
55d2375e | 5948 | if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, |
ac6389ab | 5949 | &new_eptp, index * 8, 8)) |
55d2375e SC |
5950 | return 1; |
5951 | ||
55d2375e SC |
5952 | /* |
5953 | * If the (L2) guest does a VMFUNC to the currently
5954 | * active EPT pointer, we don't have to do anything else.
5955 | */ | |
ac6389ab SC |
5956 | if (vmcs12->ept_pointer != new_eptp) { |
5957 | if (!nested_vmx_check_eptp(vcpu, new_eptp)) | |
55d2375e SC |
5958 | return 1; |
5959 | ||
ac6389ab | 5960 | vmcs12->ept_pointer = new_eptp; |
39353ab5 | 5961 | nested_ept_new_eptp(vcpu); |
c805f5d5 | 5962 | |
39353ab5 SC |
5963 | if (!nested_cpu_has_vpid(vmcs12)) |
5964 | kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); | |
55d2375e SC |
5965 | } |
5966 | ||
5967 | return 0; | |
5968 | } | |
5969 | ||
5970 | static int handle_vmfunc(struct kvm_vcpu *vcpu) | |
5971 | { | |
5972 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
5973 | struct vmcs12 *vmcs12; | |
2b3eaf81 | 5974 | u32 function = kvm_rax_read(vcpu); |
55d2375e SC |
5975 | |
5976 | /* | |
41acdd41 YZ |
5977 | * VMFUNC should never execute cleanly while L1 is active; KVM supports |
5978 | * VMFUNC for nested VMs, but not for L1. | |
55d2375e | 5979 | */ |
41acdd41 | 5980 | if (WARN_ON_ONCE(!is_guest_mode(vcpu))) { |
55d2375e SC |
5981 | kvm_queue_exception(vcpu, UD_VECTOR); |
5982 | return 1; | |
5983 | } | |
5984 | ||
5985 | vmcs12 = get_vmcs12(vcpu); | |
546e8398 SC |
5986 | |
5987 | /* | |
5988 | * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC | |
5989 | * is enabled in vmcs02 if and only if it's enabled in vmcs12. | |
5990 | */ | |
5991 | if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { | |
5992 | kvm_queue_exception(vcpu, UD_VECTOR); | |
5993 | return 1; | |
5994 | } | |
5995 | ||
0e75225d | 5996 | if (!(vmcs12->vm_function_control & BIT_ULL(function))) |
55d2375e SC |
5997 | goto fail; |
5998 | ||
5999 | switch (function) { | |
6000 | case 0: | |
6001 | if (nested_vmx_eptp_switching(vcpu, vmcs12)) | |
6002 | goto fail; | |
6003 | break; | |
6004 | default: | |
6005 | goto fail; | |
6006 | } | |
6007 | return kvm_skip_emulated_instruction(vcpu); | |
6008 | ||
6009 | fail: | |
8e533240 SC |
6010 | /* |
6011 | * This is effectively a reflected VM-Exit, as opposed to a synthesized | |
6012 | * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode | |
6013 | * EXIT_REASON_VMFUNC as the exit reason. | |
6014 | */ | |
6015 | nested_vmx_vmexit(vcpu, vmx->exit_reason.full, | |
87915858 | 6016 | vmx_get_intr_info(vcpu), |
5addc235 | 6017 | vmx_get_exit_qual(vcpu)); |
55d2375e SC |
6018 | return 1; |
6019 | } | |
6020 | ||
e71237d3 OU |
6021 | /* |
6022 | * Return true if an IO instruction with the specified port and size should cause | |
6023 | * a VM-exit into L1. | |
6024 | */ | |
6025 | bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, | |
6026 | int size) | |
55d2375e | 6027 | { |
e71237d3 | 6028 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); |
55d2375e | 6029 | gpa_t bitmap, last_bitmap; |
55d2375e SC |
6030 | u8 b; |
6031 | ||
64c78508 | 6032 | last_bitmap = INVALID_GPA; |
55d2375e SC |
6033 | b = -1; |
6034 | ||
6035 | while (size > 0) { | |
6036 | if (port < 0x8000) | |
6037 | bitmap = vmcs12->io_bitmap_a; | |
6038 | else if (port < 0x10000) | |
6039 | bitmap = vmcs12->io_bitmap_b; | |
6040 | else | |
6041 | return true; | |
6042 | bitmap += (port & 0x7fff) / 8; | |
6043 | ||
6044 | if (last_bitmap != bitmap) | |
6045 | if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) | |
6046 | return true; | |
6047 | if (b & (1 << (port & 7))) | |
6048 | return true; | |
6049 | ||
6050 | port++; | |
6051 | size--; | |
6052 | last_bitmap = bitmap; | |
6053 | } | |
6054 | ||
6055 | return false; | |
6056 | } | |
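
A user-space model of the lookup above may help: bitmap A covers ports 0x0000-0x7fff and bitmap B covers 0x8000-0xffff, one bit per port, and a multi-byte access exits if any of its ports is intercepted or if it runs past port 0xffff. The arrays here are plain buffers standing in for the guest pages:

```c
#include <stdbool.h>
#include <stdint.h>

static bool io_exit_wanted(const uint8_t bitmap_a[4096],
			   const uint8_t bitmap_b[4096],
			   unsigned int port, int size)
{
	while (size > 0) {
		const uint8_t *bitmap;

		if (port >= 0x10000)
			return true;	/* past both bitmaps: always exit */

		bitmap = port < 0x8000 ? bitmap_a : bitmap_b;
		if (bitmap[(port & 0x7fff) / 8] & (1 << (port & 7)))
			return true;

		port++;
		size--;
	}
	return false;
}
```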
6057 | ||
e71237d3 OU |
6058 | static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, |
6059 | struct vmcs12 *vmcs12) | |
6060 | { | |
6061 | unsigned long exit_qualification; | |
35a57134 | 6062 | unsigned short port; |
e71237d3 OU |
6063 | int size; |
6064 | ||
6065 | if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) | |
6066 | return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); | |
6067 | ||
5addc235 | 6068 | exit_qualification = vmx_get_exit_qual(vcpu); |
e71237d3 OU |
6069 | |
6070 | port = exit_qualification >> 16; | |
6071 | size = (exit_qualification & 7) + 1; | |
6072 | ||
6073 | return nested_vmx_check_io_bitmaps(vcpu, port, size); | |
6074 | } | |
6075 | ||
55d2375e | 6076 | /* |
463bfeee | 6077 | * Return true if we should exit from L2 to L1 to handle an MSR access,
55d2375e SC |
6078 | * rather than handle it ourselves in L0. I.e., check whether L1 expressed |
6079 | * disinterest in the current event (read or write a specific MSR) by using an | |
6080 | * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. | |
6081 | */ | |
6082 | static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, | |
8e533240 SC |
6083 | struct vmcs12 *vmcs12, |
6084 | union vmx_exit_reason exit_reason) | |
55d2375e | 6085 | { |
2b3eaf81 | 6086 | u32 msr_index = kvm_rcx_read(vcpu); |
55d2375e SC |
6087 | gpa_t bitmap; |
6088 | ||
6089 | if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) | |
6090 | return true; | |
6091 | ||
6092 | /* | |
6093 | * The MSR_BITMAP page is divided into four 1024-byte bitmaps, | |
6094 | * for the four combinations of read/write and low/high MSR numbers. | |
6095 | * First we need to figure out which of the four to use: | |
6096 | */ | |
6097 | bitmap = vmcs12->msr_bitmap; | |
8e533240 | 6098 | if (exit_reason.basic == EXIT_REASON_MSR_WRITE) |
55d2375e SC |
6099 | bitmap += 2048; |
6100 | if (msr_index >= 0xc0000000) { | |
6101 | msr_index -= 0xc0000000; | |
6102 | bitmap += 1024; | |
6103 | } | |
6104 | ||
6105 | /* Then read the msr_index'th bit from this bitmap: */ | |
6106 | if (msr_index < 1024*8) { | |
6107 | unsigned char b; | |
6108 | if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) | |
6109 | return true; | |
6110 | return 1 & (b >> (msr_index & 7)); | |
6111 | } else | |
6112 | return true; /* let L1 handle the wrong parameter */ | |
6113 | } | |
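
The 4 KiB MSR-bitmap layout described in the comment above translates into a small offset computation: read-low at byte 0, read-high at 1024, write-low at 2048, write-high at 3072, with the low range covering MSRs 0x0-0x1fff and the high range 0xc0000000-0xc0001fff. A sketch of that mapping (the helper name is illustrative):

```c
#include <stdbool.h>
#include <stdint.h>

/* Locate the intercept bit for an MSR in the 4 KiB bitmap; returns
 * false for MSRs outside the bitmapped ranges, which the caller treats
 * as "always exit to L1", matching the function above. */
static bool msr_bitmap_slot(uint32_t msr, bool write,
			    unsigned int *byte, unsigned int *bit)
{
	unsigned int base = write ? 2048 : 0;	/* write halves at +2048 */

	if (msr >= 0xc0000000) {
		msr -= 0xc0000000;
		base += 1024;			/* high halves at +1024  */
	}
	if (msr >= 1024 * 8)			/* 8192 MSRs per bitmap  */
		return false;

	*byte = base + msr / 8;
	*bit  = msr & 7;
	return true;
}
```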
6114 | ||
6115 | /* | |
6116 | * Return true if we should exit from L2 to L1 to handle a CR access exit,
6117 | * rather than handle it ourselves in L0. I.e., check if L1 wanted to | |
6118 | * intercept (via guest_host_mask etc.) the current event. | |
6119 | */ | |
6120 | static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, | |
6121 | struct vmcs12 *vmcs12) | |
6122 | { | |
5addc235 | 6123 | unsigned long exit_qualification = vmx_get_exit_qual(vcpu); |
55d2375e SC |
6124 | int cr = exit_qualification & 15; |
6125 | int reg; | |
6126 | unsigned long val; | |
6127 | ||
6128 | switch ((exit_qualification >> 4) & 3) { | |
6129 | case 0: /* mov to cr */ | |
6130 | reg = (exit_qualification >> 8) & 15; | |
27b4a9c4 | 6131 | val = kvm_register_read(vcpu, reg); |
55d2375e SC |
6132 | switch (cr) { |
6133 | case 0: | |
6134 | if (vmcs12->cr0_guest_host_mask & | |
6135 | (val ^ vmcs12->cr0_read_shadow)) | |
6136 | return true; | |
6137 | break; | |
6138 | case 3: | |
55d2375e SC |
6139 | if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) |
6140 | return true; | |
6141 | break; | |
6142 | case 4: | |
6143 | if (vmcs12->cr4_guest_host_mask & | |
6144 | (vmcs12->cr4_read_shadow ^ val)) | |
6145 | return true; | |
6146 | break; | |
6147 | case 8: | |
6148 | if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) | |
6149 | return true; | |
6150 | break; | |
6151 | } | |
6152 | break; | |
6153 | case 2: /* clts */ | |
6154 | if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && | |
6155 | (vmcs12->cr0_read_shadow & X86_CR0_TS)) | |
6156 | return true; | |
6157 | break; | |
6158 | case 1: /* mov from cr */ | |
6159 | switch (cr) { | |
6160 | case 3: | |
6161 | if (vmcs12->cpu_based_vm_exec_control & | |
6162 | CPU_BASED_CR3_STORE_EXITING) | |
6163 | return true; | |
6164 | break; | |
6165 | case 8: | |
6166 | if (vmcs12->cpu_based_vm_exec_control & | |
6167 | CPU_BASED_CR8_STORE_EXITING) | |
6168 | return true; | |
6169 | break; | |
6170 | } | |
6171 | break; | |
6172 | case 3: /* lmsw */ | |
6173 | /* | |
6174 | * lmsw can change bits 1..3 of cr0, and only set bit 0 of | |
6175 | * cr0. Other attempted changes are ignored, with no exit. | |
6176 | */ | |
6177 | val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; | |
6178 | if (vmcs12->cr0_guest_host_mask & 0xe & | |
6179 | (val ^ vmcs12->cr0_read_shadow)) | |
6180 | return true; | |
6181 | if ((vmcs12->cr0_guest_host_mask & 0x1) && | |
6182 | !(vmcs12->cr0_read_shadow & 0x1) && | |
6183 | (val & 0x1)) | |
6184 | return true; | |
6185 | break; | |
6186 | } | |
6187 | return false; | |
6188 | } | |
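
The LMSW case is the subtlest branch above: LMSW can toggle CR0 bits 3:1 (MP/EM/TS) in either direction but can only set bit 0 (PE), so a 0-to-1 transition is the only interesting event for a guest-owned PE bit. A stand-alone model of that check:

```c
#include <stdbool.h>
#include <stdint.h>

/* 'mask' models vmcs12->cr0_guest_host_mask, 'shadow' models
 * cr0_read_shadow, 'val' is the 4-bit LMSW source (PE/MP/EM/TS). */
static bool lmsw_wants_exit(uint64_t mask, uint64_t shadow, uint64_t val)
{
	/* Bits 3:1 can change freely, so any guest-owned difference exits. */
	if (mask & 0xe & (val ^ shadow))
		return true;

	/* Bit 0 (PE) can only be set, so only a 0 -> 1 transition exits. */
	if ((mask & 0x1) && !(shadow & 0x1) && (val & 0x1))
		return true;

	return false;
}
```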
6189 | ||
72add915 SC |
6190 | static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, |
6191 | struct vmcs12 *vmcs12) | |
6192 | { | |
6193 | u32 encls_leaf; | |
6194 | ||
6195 | if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || | |
6196 | !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) | |
6197 | return false; | |
6198 | ||
6199 | encls_leaf = kvm_rax_read(vcpu); | |
6200 | if (encls_leaf > 62) | |
6201 | encls_leaf = 63; | |
6202 | return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); | |
6203 | } | |
6204 | ||
55d2375e SC |
6205 | static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, |
6206 | struct vmcs12 *vmcs12, gpa_t bitmap) | |
6207 | { | |
6208 | u32 vmx_instruction_info; | |
6209 | unsigned long field; | |
6210 | u8 b; | |
6211 | ||
6212 | if (!nested_cpu_has_shadow_vmcs(vmcs12)) | |
6213 | return true; | |
6214 | ||
6215 | /* Decode instruction info and find the field to access */ | |
6216 | vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | |
6217 | field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); | |
6218 | ||
6219 | /* Out-of-range fields always cause a VM exit from L2 to L1 */ | |
6220 | if (field >> 15) | |
6221 | return true; | |
6222 | ||
6223 | if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) | |
6224 | return true; | |
6225 | ||
6226 | return 1 & (b >> (field & 7)); | |
6227 | } | |
6228 | ||
b045ae90 OU |
6229 | static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) |
6230 | { | |
6231 | u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; | |
6232 | ||
6233 | if (nested_cpu_has_mtf(vmcs12)) | |
6234 | return true; | |
6235 | ||
6236 | /* | |
6237 | * An MTF VM-exit may be injected into the guest by setting the | |
6238 | * interruption-type to 7 (other event) and the vector field to 0. Such | |
6239 | * is the case regardless of the 'monitor trap flag' VM-execution | |
6240 | * control. | |
6241 | */ | |
6242 | return entry_intr_info == (INTR_INFO_VALID_MASK | |
6243 | | INTR_TYPE_OTHER_EVENT); | |
6244 | } | |
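
The encoding compared against above comes from the VM-entry interruption-information format: bit 31 marks the field valid, bits 10:8 hold the event type (7 = other event), and bits 7:0 hold the vector, which is 0 for a synthesized MTF. Spelled out:

```c
#include <stdbool.h>
#include <stdint.h>

#define INTR_INFO_VALID_MASK	(1u << 31)	/* bit 31: valid     */
#define INTR_TYPE_OTHER_EVENT	(7u << 8)	/* bits 10:8: type 7 */

/* An injected MTF is "valid, type 7 (other event), vector 0"; this is
 * the exact comparison made in nested_vmx_exit_handled_mtf(). */
static bool is_injected_mtf(uint32_t entry_intr_info)
{
	return entry_intr_info == (INTR_INFO_VALID_MASK |
				   INTR_TYPE_OTHER_EVENT);
}
```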
6245 | ||
55d2375e | 6246 | /* |
2c1f3323 SC |
6247 | * Return true if L0 wants to handle an exit from L2 regardless of whether or not |
6248 | * L1 wants the exit. Only call this when in is_guest_mode (L2). | |
55d2375e | 6249 | */ |
8e533240 SC |
6250 | static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, |
6251 | union vmx_exit_reason exit_reason) | |
55d2375e | 6252 | { |
236871b6 | 6253 | u32 intr_info; |
55d2375e | 6254 | |
8e533240 | 6255 | switch ((u16)exit_reason.basic) { |
55d2375e | 6256 | case EXIT_REASON_EXCEPTION_NMI: |
87915858 | 6257 | intr_info = vmx_get_intr_info(vcpu); |
55d2375e | 6258 | if (is_nmi(intr_info)) |
2c1f3323 | 6259 | return true; |
55d2375e | 6260 | else if (is_page_fault(intr_info)) |
18712c13 SC |
6261 | return vcpu->arch.apf.host_apf_flags || |
6262 | vmx_need_pf_intercept(vcpu); | |
55d2375e SC |
6263 | else if (is_debug(intr_info) && |
6264 | vcpu->guest_debug & | |
6265 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) | |
2c1f3323 | 6266 | return true; |
55d2375e SC |
6267 | else if (is_breakpoint(intr_info) && |
6268 | vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) | |
2c1f3323 | 6269 | return true; |
b33bb78a SC |
6270 | else if (is_alignment_check(intr_info) && |
6271 | !vmx_guest_inject_ac(vcpu)) | |
6272 | return true; | |
9031b421 SC |
6273 | else if (is_ve_fault(intr_info)) |
6274 | return true; | |
2c1f3323 SC |
6275 | return false; |
6276 | case EXIT_REASON_EXTERNAL_INTERRUPT: | |
6277 | return true; | |
6278 | case EXIT_REASON_MCE_DURING_VMENTRY: | |
6279 | return true; | |
6280 | case EXIT_REASON_EPT_VIOLATION: | |
6281 | /* | |
6282 | * L0 always deals with the EPT violation. If nested EPT is | |
6283 | * used, and the nested mmu code discovers that the address is | |
6284 | * missing in the guest EPT table (EPT12), the EPT violation | |
6285 | * will be injected with nested_ept_inject_page_fault() | |
6286 | */ | |
6287 | return true; | |
6288 | case EXIT_REASON_EPT_MISCONFIG: | |
6289 | /* | |
6290 | * L2 never directly uses L1's EPT, but rather L0's own EPT
6291 | * table (shadow on EPT) or a merged EPT table that L0 built
6292 | * (EPT on EPT). So any problems with the structure of the
6293 | * table are L0's fault.
6294 | */ | |
6295 | return true; | |
6296 | case EXIT_REASON_PREEMPTION_TIMER: | |
6297 | return true; | |
6298 | case EXIT_REASON_PML_FULL: | |
c3bb9a20 SC |
6299 | /* |
6300 | * PML is emulated for an L1 VMM and should never be enabled in | |
6301 | * vmcs02, always "handle" PML_FULL by exiting to userspace. | |
6302 | */ | |
2c1f3323 SC |
6303 | return true; |
6304 | case EXIT_REASON_VMFUNC: | |
6305 | /* VM functions are emulated through L2->L0 vmexits. */ | |
6306 | return true; | |
24a996ad CQ |
6307 | case EXIT_REASON_BUS_LOCK: |
6308 | /* | |
6309 | * At present, bus lock VM exit is never exposed to L1. | |
6310 | * Handle L2's bus locks in L0 directly. | |
6311 | */ | |
6312 | return true; | |
b4f69df0 | 6313 | #ifdef CONFIG_KVM_HYPERV |
c30e9bc8 VK |
6314 | case EXIT_REASON_VMCALL: |
6315 | /* Hyper-V L2 TLB flush hypercall is handled by L0 */ | |
6316 | return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && | |
6317 | nested_evmcs_l2_tlb_flush_enabled(vcpu) && | |
6318 | kvm_hv_is_tlb_flush_hcall(vcpu); | |
b4f69df0 | 6319 | #endif |
2c1f3323 SC |
6320 | default: |
6321 | break; | |
6322 | } | |
6323 | return false; | |
6324 | } | |
6325 | ||
6326 | /* | |
6327 | * Return true if L1 wants to intercept an exit from L2. Only call this when in
6328 | * is_guest_mode (L2). | |
6329 | */ | |
8e533240 SC |
6330 | static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, |
6331 | union vmx_exit_reason exit_reason) | |
2c1f3323 SC |
6332 | { |
6333 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | |
9bd4af24 | 6334 | u32 intr_info; |
2c1f3323 | 6335 | |
8e533240 | 6336 | switch ((u16)exit_reason.basic) { |
2c1f3323 | 6337 | case EXIT_REASON_EXCEPTION_NMI: |
87915858 | 6338 | intr_info = vmx_get_intr_info(vcpu); |
2c1f3323 SC |
6339 | if (is_nmi(intr_info)) |
6340 | return true; | |
6341 | else if (is_page_fault(intr_info)) | |
6342 | return true; | |
55d2375e SC |
6343 | return vmcs12->exception_bitmap & |
6344 | (1u << (intr_info & INTR_INFO_VECTOR_MASK)); | |
6345 | case EXIT_REASON_EXTERNAL_INTERRUPT: | |
2c1f3323 | 6346 | return nested_exit_on_intr(vcpu); |
55d2375e SC |
6347 | case EXIT_REASON_TRIPLE_FAULT: |
6348 | return true; | |
9dadc2f9 XL |
6349 | case EXIT_REASON_INTERRUPT_WINDOW: |
6350 | return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); | |
55d2375e | 6351 | case EXIT_REASON_NMI_WINDOW: |
4e2a0bc5 | 6352 | return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); |
55d2375e SC |
6353 | case EXIT_REASON_TASK_SWITCH: |
6354 | return true; | |
6355 | case EXIT_REASON_CPUID: | |
6356 | return true; | |
6357 | case EXIT_REASON_HLT: | |
6358 | return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); | |
6359 | case EXIT_REASON_INVD: | |
6360 | return true; | |
6361 | case EXIT_REASON_INVLPG: | |
6362 | return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); | |
6363 | case EXIT_REASON_RDPMC: | |
6364 | return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); | |
6365 | case EXIT_REASON_RDRAND: | |
6366 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); | |
6367 | case EXIT_REASON_RDSEED: | |
6368 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); | |
6369 | case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: | |
6370 | return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); | |
6371 | case EXIT_REASON_VMREAD: | |
6372 | return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, | |
6373 | vmcs12->vmread_bitmap); | |
6374 | case EXIT_REASON_VMWRITE: | |
6375 | return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, | |
6376 | vmcs12->vmwrite_bitmap); | |
6377 | case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: | |
6378 | case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: | |
6379 | case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: | |
6380 | case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: | |
6381 | case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: | |
6382 | /* | |
6383 | * VMX instructions trap unconditionally. This allows L1 to | |
6384 | * emulate them for its L2 guest, i.e., allows 3-level nesting! | |
6385 | */ | |
6386 | return true; | |
6387 | case EXIT_REASON_CR_ACCESS: | |
6388 | return nested_vmx_exit_handled_cr(vcpu, vmcs12); | |
6389 | case EXIT_REASON_DR_ACCESS: | |
6390 | return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); | |
6391 | case EXIT_REASON_IO_INSTRUCTION: | |
6392 | return nested_vmx_exit_handled_io(vcpu, vmcs12); | |
6393 | case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: | |
6394 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); | |
6395 | case EXIT_REASON_MSR_READ: | |
6396 | case EXIT_REASON_MSR_WRITE: | |
6397 | return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); | |
6398 | case EXIT_REASON_INVALID_STATE: | |
6399 | return true; | |
6400 | case EXIT_REASON_MWAIT_INSTRUCTION: | |
6401 | return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); | |
6402 | case EXIT_REASON_MONITOR_TRAP_FLAG: | |
b045ae90 | 6403 | return nested_vmx_exit_handled_mtf(vmcs12); |
55d2375e SC |
6404 | case EXIT_REASON_MONITOR_INSTRUCTION: |
6405 | return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); | |
6406 | case EXIT_REASON_PAUSE_INSTRUCTION: | |
6407 | return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || | |
6408 | nested_cpu_has2(vmcs12, | |
6409 | SECONDARY_EXEC_PAUSE_LOOP_EXITING); | |
6410 | case EXIT_REASON_MCE_DURING_VMENTRY: | |
2c1f3323 | 6411 | return true; |
55d2375e SC |
6412 | case EXIT_REASON_TPR_BELOW_THRESHOLD: |
6413 | return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); | |
6414 | case EXIT_REASON_APIC_ACCESS: | |
6415 | case EXIT_REASON_APIC_WRITE: | |
6416 | case EXIT_REASON_EOI_INDUCED: | |
6417 | /* | |
6418 | * The controls for "virtualize APIC accesses," "APIC- | |
6419 | * register virtualization," and "virtual-interrupt | |
6420 | * delivery" only come from vmcs12. | |
6421 | */ | |
6422 | return true; | |
55d2375e SC |
6423 | case EXIT_REASON_INVPCID: |
6424 | return | |
6425 | nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && | |
6426 | nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); | |
6427 | case EXIT_REASON_WBINVD: | |
6428 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); | |
6429 | case EXIT_REASON_XSETBV: | |
6430 | return true; | |
6431 | case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: | |
6432 | /* | |
6433 | * This should never happen, since it is not possible to | |
6434 | * set XSS to a non-zero value, neither in L1 nor in L2.
6435 | * If it were, XSS would have to be checked against
6436 | * the XSS exit bitmap in vmcs12. | |
6437 | */ | |
662f6815 | 6438 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES); |
bf653b78 TX |
6439 | case EXIT_REASON_UMWAIT: |
6440 | case EXIT_REASON_TPAUSE: | |
6441 | return nested_cpu_has2(vmcs12, | |
6442 | SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); | |
72add915 SC |
6443 | case EXIT_REASON_ENCLS: |
6444 | return nested_vmx_exit_handled_encls(vcpu, vmcs12); | |
2f4073e0 TX |
6445 | case EXIT_REASON_NOTIFY: |
6446 | /* Notify VM exit is not exposed to L1 */ | |
6447 | return false; | |
55d2375e SC |
6448 | default: |
6449 | return true; | |
6450 | } | |
6451 | } | |
6452 | ||
7b7bd87d SC |
6453 | /* |
6454 | * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was | |
6455 | * reflected into L1. | |
6456 | */ | |
f47baaed | 6457 | bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) |
7b7bd87d | 6458 | { |
fbdd5025 | 6459 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
8e533240 | 6460 | union vmx_exit_reason exit_reason = vmx->exit_reason; |
87796555 SC |
6461 | unsigned long exit_qual; |
6462 | u32 exit_intr_info; | |
fbdd5025 SC |
6463 | |
6464 | WARN_ON_ONCE(vmx->nested.nested_run_pending); | |
6465 | ||
6466 | /* | |
6467 | * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM | |
6468 | * has already loaded L2's state. | |
6469 | */ | |
6470 | if (unlikely(vmx->fail)) { | |
6471 | trace_kvm_nested_vmenter_failed( | |
6472 | "hardware VM-instruction error: ", | |
6473 | vmcs_read32(VM_INSTRUCTION_ERROR)); | |
6474 | exit_intr_info = 0; | |
6475 | exit_qual = 0; | |
6476 | goto reflect_vmexit; | |
6477 | } | |
7b7bd87d | 6478 | |
0a62a031 | 6479 | trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); |
236871b6 | 6480 | |
2c1f3323 SC |
6481 | /* If L0 (KVM) wants the exit, it trumps L1's desires. */ |
6482 | if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) | |
6483 | return false; | |
6484 | ||
6485 | /* If L1 doesn't want the exit, handle it in L0. */ | |
6486 | if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) | |
7b7bd87d SC |
6487 | return false; |
6488 | ||
6489 | /* | |
1d283062 SC |
6490 | * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For |
6491 | * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would | |
6492 | * need to be synthesized by querying the in-kernel LAPIC, but external | |
6493 | * interrupts are never reflected to L1 so it's a non-issue. | |
7b7bd87d | 6494 | */ |
02f1965f | 6495 | exit_intr_info = vmx_get_intr_info(vcpu); |
f315f2b1 | 6496 | if (is_exception_with_error_code(exit_intr_info)) { |
7b7bd87d SC |
6497 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); |
6498 | ||
6499 | vmcs12->vm_exit_intr_error_code = | |
6500 | vmcs_read32(VM_EXIT_INTR_ERROR_CODE); | |
6501 | } | |
02f1965f | 6502 | exit_qual = vmx_get_exit_qual(vcpu); |
7b7bd87d | 6503 | |
fbdd5025 | 6504 | reflect_vmexit: |
8e533240 | 6505 | nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); |
7b7bd87d SC |
6506 | return true; |
6507 | } | |
55d2375e SC |
6508 | |
6509 | static int vmx_get_nested_state(struct kvm_vcpu *vcpu, | |
6510 | struct kvm_nested_state __user *user_kvm_nested_state, | |
6511 | u32 user_data_size) | |
6512 | { | |
6513 | struct vcpu_vmx *vmx; | |
6514 | struct vmcs12 *vmcs12; | |
6515 | struct kvm_nested_state kvm_state = { | |
6516 | .flags = 0, | |
6ca00dfa | 6517 | .format = KVM_STATE_NESTED_FORMAT_VMX, |
55d2375e | 6518 | .size = sizeof(kvm_state), |
850448f3 | 6519 | .hdr.vmx.flags = 0, |
64c78508 YZ |
6520 | .hdr.vmx.vmxon_pa = INVALID_GPA, |
6521 | .hdr.vmx.vmcs12_pa = INVALID_GPA, | |
850448f3 | 6522 | .hdr.vmx.preemption_timer_deadline = 0, |
55d2375e | 6523 | }; |
6ca00dfa LA |
6524 | struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = |
6525 | &user_kvm_nested_state->data.vmx[0]; | |
55d2375e SC |
6526 | |
6527 | if (!vcpu) | |
6ca00dfa | 6528 | return kvm_state.size + sizeof(*user_vmx_nested_state); |
55d2375e SC |
6529 | |
6530 | vmx = to_vmx(vcpu); | |
6531 | vmcs12 = get_vmcs12(vcpu); | |
6532 | ||
1c18efda | 6533 | if (guest_can_use(vcpu, X86_FEATURE_VMX) && |
55d2375e | 6534 | (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { |
6ca00dfa LA |
6535 | kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; |
6536 | kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; | |
55d2375e SC |
6537 | |
6538 | if (vmx_has_valid_vmcs12(vcpu)) { | |
6ca00dfa | 6539 | kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); |
55d2375e | 6540 | |
27849968 | 6541 | /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ |
453e42b0 | 6542 | if (nested_vmx_is_evmptr12_set(vmx)) |
323d73a8 LA |
6543 | kvm_state.flags |= KVM_STATE_NESTED_EVMCS; |
6544 | ||
55d2375e SC |
6545 | if (is_guest_mode(vcpu) && |
6546 | nested_cpu_has_shadow_vmcs(vmcs12) && | |
64c78508 | 6547 | vmcs12->vmcs_link_pointer != INVALID_GPA) |
6ca00dfa | 6548 | kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); |
55d2375e SC |
6549 | } |
6550 | ||
6551 | if (vmx->nested.smm.vmxon) | |
6ca00dfa | 6552 | kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; |
55d2375e SC |
6553 | |
6554 | if (vmx->nested.smm.guest_mode) | |
6ca00dfa | 6555 | kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; |
55d2375e SC |
6556 | |
6557 | if (is_guest_mode(vcpu)) { | |
6558 | kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; | |
6559 | ||
6560 | if (vmx->nested.nested_run_pending) | |
6561 | kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; | |
5ef8acbd OU |
6562 | |
6563 | if (vmx->nested.mtf_pending) | |
6564 | kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; | |
850448f3 PS |
6565 | |
6566 | if (nested_cpu_has_preemption_timer(vmcs12) && | |
6567 | vmx->nested.has_preemption_timer_deadline) { | |
6568 | kvm_state.hdr.vmx.flags |= | |
6569 | KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; | |
6570 | kvm_state.hdr.vmx.preemption_timer_deadline = | |
6571 | vmx->nested.preemption_timer_deadline; | |
6572 | } | |
55d2375e SC |
6573 | } |
6574 | } | |
6575 | ||
6576 | if (user_data_size < kvm_state.size) | |
6577 | goto out; | |
6578 | ||
6579 | if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) | |
6580 | return -EFAULT; | |
6581 | ||
6582 | if (!vmx_has_valid_vmcs12(vcpu)) | |
6583 | goto out; | |
6584 | ||
6585 | /* | |
6586 | * When running L2, the authoritative vmcs12 state is in the | |
6587 | * vmcs02. When running L1, the authoritative vmcs12 state is | |
6588 | * in the shadow or enlightened vmcs linked to vmcs01, unless | |
3731905e | 6589 | * need_vmcs12_to_shadow_sync is set, in which case, the authoritative |
55d2375e SC |
6590 | * vmcs12 state is in the vmcs12 already. |
6591 | */ | |
6592 | if (is_guest_mode(vcpu)) { | |
3731905e | 6593 | sync_vmcs02_to_vmcs12(vcpu, vmcs12); |
7952d769 | 6594 | sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); |
d51e1d3f ML |
6595 | } else { |
6596 | copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); | |
6597 | if (!vmx->nested.need_vmcs12_to_shadow_sync) { | |
453e42b0 | 6598 | if (nested_vmx_is_evmptr12_valid(vmx)) |
d6bf71a1 VK |
6599 | /* |
6600 | * The L1 hypervisor is not obliged to keep the eVMCS |
6601 | * clean-fields data up-to-date while L1 is not in |
6602 | * guest mode; 'hv_clean_fields' is only guaranteed to |
6603 | * be valid at VM-entry, so ignore it here and do a |
6604 | * full copy. |
6605 | */ | |
6606 | copy_enlightened_to_vmcs12(vmx, 0); | |
d51e1d3f ML |
6607 | else if (enable_shadow_vmcs) |
6608 | copy_shadow_to_vmcs12(vmx); | |
6609 | } | |
55d2375e SC |
6610 | } |
6611 | ||
6ca00dfa LA |
6612 | BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); |
6613 | BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); | |
6614 | ||
3a33d030 TR |
6615 | /* |
6616 | * Copy over the full allocated size of vmcs12 rather than just the size | |
6617 | * of the struct. | |
6618 | */ | |
6ca00dfa | 6619 | if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) |
55d2375e SC |
6620 | return -EFAULT; |
6621 | ||
6622 | if (nested_cpu_has_shadow_vmcs(vmcs12) && | |
64c78508 | 6623 | vmcs12->vmcs_link_pointer != INVALID_GPA) { |
6ca00dfa | 6624 | if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, |
3a33d030 | 6625 | get_shadow_vmcs12(vcpu), VMCS12_SIZE)) |
55d2375e SC |
6626 | return -EFAULT; |
6627 | } | |
55d2375e SC |
6628 | out: |
6629 | return kvm_state.size; | |
6630 | } | |
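
/*
 * A minimal user-space sketch of the size negotiation implied by the
 * "return kvm_state.size" contract above (helper name and error handling
 * are illustrative; assumes an open vCPU fd with KVM_CAP_NESTED_STATE):
 * when the caller's buffer is too small, the KVM_GET_NESTED_STATE ioctl
 * fails with -E2BIG and writes the required size back into 'size'.
 */
#include <errno.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static struct kvm_nested_state *save_nested_state(int vcpu_fd)
{
	struct kvm_nested_state hdr = { .size = sizeof(hdr) };
	struct kvm_nested_state *state;

	/* Probe with a header-sized buffer to learn the full size. */
	if (ioctl(vcpu_fd, KVM_GET_NESTED_STATE, &hdr) < 0 && errno != E2BIG)
		return NULL;

	state = calloc(1, hdr.size);
	if (!state)
		return NULL;
	state->size = hdr.size;

	if (ioctl(vcpu_fd, KVM_GET_NESTED_STATE, state) < 0) {
		free(state);
		return NULL;
	}
	return state;
}
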
6631 | ||
55d2375e SC |
6632 | void vmx_leave_nested(struct kvm_vcpu *vcpu) |
6633 | { | |
6634 | if (is_guest_mode(vcpu)) { | |
6635 | to_vmx(vcpu)->nested.nested_run_pending = 0; | |
6636 | nested_vmx_vmexit(vcpu, -1, 0, 0); | |
6637 | } | |
6638 | free_nested(vcpu); | |
6639 | } | |
6640 | ||
6641 | static int vmx_set_nested_state(struct kvm_vcpu *vcpu, | |
6642 | struct kvm_nested_state __user *user_kvm_nested_state, | |
6643 | struct kvm_nested_state *kvm_state) | |
6644 | { | |
6645 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
6646 | struct vmcs12 *vmcs12; | |
68cda40d | 6647 | enum vm_entry_failure_code ignored; |
6ca00dfa LA |
6648 | struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = |
6649 | &user_kvm_nested_state->data.vmx[0]; | |
55d2375e SC |
6650 | int ret; |
6651 | ||
6ca00dfa | 6652 | if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) |
55d2375e SC |
6653 | return -EINVAL; |
6654 | ||
64c78508 | 6655 | if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { |
6ca00dfa | 6656 | if (kvm_state->hdr.vmx.smm.flags) |
55d2375e SC |
6657 | return -EINVAL; |
6658 | ||
64c78508 | 6659 | if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) |
55d2375e SC |
6660 | return -EINVAL; |
6661 | ||
323d73a8 LA |
6662 | /* |
6663 | * KVM_STATE_NESTED_EVMCS used to signal that KVM should |
6664 | * enable the eVMCS capability on the vCPU. However, the |
6665 | * code has since changed such that the flag now signals that |
6666 | * vmcs12 should be copied into the eVMCS in guest memory. |
6667 | * | |
54aa699e | 6668 | * To preserve backwards compatibility, allow user |
323d73a8 LA |
6669 | * to set this flag even when there is no VMXON region. |
6670 | */ | |
9fd58877 PB |
6671 | if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) |
6672 | return -EINVAL; | |
6673 | } else { | |
1c18efda | 6674 | if (!guest_can_use(vcpu, X86_FEATURE_VMX)) |
9fd58877 | 6675 | return -EINVAL; |
55d2375e | 6676 | |
9fd58877 PB |
6677 | if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) |
6678 | return -EINVAL; | |
323d73a8 | 6679 | } |
55d2375e | 6680 | |
6ca00dfa | 6681 | if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && |
55d2375e SC |
6682 | (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) |
6683 | return -EINVAL; | |
6684 | ||
6ca00dfa | 6685 | if (kvm_state->hdr.vmx.smm.flags & |
55d2375e SC |
6686 | ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) |
6687 | return -EINVAL; | |
6688 | ||
5e105c88 PB |
6689 | if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) |
6690 | return -EINVAL; | |
6691 | ||
55d2375e SC |
6692 | /* |
6693 | * SMM temporarily disables VMX, so we cannot be in guest mode, | |
6694 | * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags | |
6695 | * must be zero. | |
6696 | */ | |
65b712f1 LA |
6697 | if (is_smm(vcpu) ? |
6698 | (kvm_state->flags & | |
6699 | (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) | |
6700 | : kvm_state->hdr.vmx.smm.flags) | |
55d2375e SC |
6701 | return -EINVAL; |
6702 | ||
6ca00dfa LA |
6703 | if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && |
6704 | !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) | |
55d2375e SC |
6705 | return -EINVAL; |
6706 | ||
323d73a8 | 6707 | if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && |
1c18efda SC |
6708 | (!guest_can_use(vcpu, X86_FEATURE_VMX) || |
6709 | !vmx->nested.enlightened_vmcs_enabled)) | |
9fd58877 | 6710 | return -EINVAL; |
55d2375e | 6711 | |
323d73a8 | 6712 | vmx_leave_nested(vcpu); |
9fd58877 | 6713 | |
64c78508 | 6714 | if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) |
9fd58877 | 6715 | return 0; |
332d0797 | 6716 | |
6ca00dfa | 6717 | vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; |
55d2375e SC |
6718 | ret = enter_vmx_operation(vcpu); |
6719 | if (ret) | |
6720 | return ret; | |
6721 | ||
0f02bd0a PB |
6722 | /* Empty 'VMXON' state is permitted if no VMCS is loaded */ |
6723 | if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { | |
6724 | /* See vmx_has_valid_vmcs12. */ | |
6725 | if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || | |
6726 | (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || | |
64c78508 | 6727 | (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)) |
0f02bd0a PB |
6728 | return -EINVAL; |
6729 | else | |
6730 | return 0; | |
6731 | } | |
55d2375e | 6732 | |
64c78508 | 6733 | if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) { |
6ca00dfa LA |
6734 | if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || |
6735 | !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) | |
55d2375e SC |
6736 | return -EINVAL; |
6737 | ||
6ca00dfa | 6738 | set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); |
5a30f976 | 6739 | #ifdef CONFIG_KVM_HYPERV |
55d2375e SC |
6740 | } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { |
6741 | /* | |
e942dbf8 VK |
6742 | * nested_vmx_handle_enlightened_vmptrld() cannot be called |
6743 | * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be | |
6744 | * restored yet. EVMCS will be mapped from | |
6745 | * nested_get_vmcs12_pages(). | |
55d2375e | 6746 | */ |
27849968 | 6747 | vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING; |
729c15c2 | 6748 | kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); |
5a30f976 | 6749 | #endif |
55d2375e SC |
6750 | } else { |
6751 | return -EINVAL; | |
6752 | } | |
6753 | ||
6ca00dfa | 6754 | if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { |
55d2375e SC |
6755 | vmx->nested.smm.vmxon = true; |
6756 | vmx->nested.vmxon = false; | |
6757 | ||
6ca00dfa | 6758 | if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) |
55d2375e SC |
6759 | vmx->nested.smm.guest_mode = true; |
6760 | } | |
6761 | ||
6762 | vmcs12 = get_vmcs12(vcpu); | |
6ca00dfa | 6763 | if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12))) |
55d2375e SC |
6764 | return -EFAULT; |
6765 | ||
6766 | if (vmcs12->hdr.revision_id != VMCS12_REVISION) | |
6767 | return -EINVAL; | |
6768 | ||
6769 | if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) | |
6770 | return 0; | |
6771 | ||
21be4ca1 SC |
6772 | vmx->nested.nested_run_pending = |
6773 | !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); | |
6774 | ||
5ef8acbd OU |
6775 | vmx->nested.mtf_pending = |
6776 | !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING); | |
6777 | ||
21be4ca1 | 6778 | ret = -EINVAL; |
55d2375e | 6779 | if (nested_cpu_has_shadow_vmcs(vmcs12) && |
64c78508 | 6780 | vmcs12->vmcs_link_pointer != INVALID_GPA) { |
55d2375e SC |
6781 | struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); |
6782 | ||
6ca00dfa LA |
6783 | if (kvm_state->size < |
6784 | sizeof(*kvm_state) + | |
6785 | sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12)) | |
21be4ca1 | 6786 | goto error_guest_mode; |
55d2375e SC |
6787 | |
6788 | if (copy_from_user(shadow_vmcs12, | |
6ca00dfa LA |
6789 | user_vmx_nested_state->shadow_vmcs12, |
6790 | sizeof(*shadow_vmcs12))) { | |
21be4ca1 SC |
6791 | ret = -EFAULT; |
6792 | goto error_guest_mode; | |
6793 | } | |
55d2375e SC |
6794 | |
6795 | if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || | |
6796 | !shadow_vmcs12->hdr.shadow_vmcs) | |
21be4ca1 | 6797 | goto error_guest_mode; |
55d2375e SC |
6798 | } |
6799 | ||
83d31e52 | 6800 | vmx->nested.has_preemption_timer_deadline = false; |
850448f3 PS |
6801 | if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) { |
6802 | vmx->nested.has_preemption_timer_deadline = true; | |
6803 | vmx->nested.preemption_timer_deadline = | |
6804 | kvm_state->hdr.vmx.preemption_timer_deadline; | |
6805 | } | |
6806 | ||
5478ba34 SC |
6807 | if (nested_vmx_check_controls(vcpu, vmcs12) || |
6808 | nested_vmx_check_host_state(vcpu, vmcs12) || | |
68cda40d | 6809 | nested_vmx_check_guest_state(vcpu, vmcs12, &ignored)) |
21be4ca1 | 6810 | goto error_guest_mode; |
55d2375e SC |
6811 | |
6812 | vmx->nested.dirty_vmcs12 = true; | |
ed2a4800 | 6813 | vmx->nested.force_msr_bitmap_recalc = true; |
55d2375e | 6814 | ret = nested_vmx_enter_non_root_mode(vcpu, false); |
21be4ca1 SC |
6815 | if (ret) |
6816 | goto error_guest_mode; | |
55d2375e | 6817 | |
2ea89c7f SC |
6818 | if (vmx->nested.mtf_pending) |
6819 | kvm_make_request(KVM_REQ_EVENT, vcpu); | |
6820 | ||
55d2375e | 6821 | return 0; |
21be4ca1 SC |
6822 | |
6823 | error_guest_mode: | |
6824 | vmx->nested.nested_run_pending = 0; | |
6825 | return ret; | |
55d2375e SC |
6826 | } |
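
/*
 * The restore-side counterpart to the sketch after vmx_get_nested_state()
 * (again illustrative user-space code, not part of the kernel): a blob
 * captured there is replayed into a freshly created vCPU. Guest CPUID and
 * the relevant MSRs must be restored first, since the checks above reject
 * the state with -EINVAL when nested VMX is not usable by the guest.
 */
static int restore_nested_state(int vcpu_fd, struct kvm_nested_state *state)
{
	return ioctl(vcpu_fd, KVM_SET_NESTED_STATE, state);
}
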
6827 | ||
1b84292b | 6828 | void nested_vmx_set_vmcs_shadowing_bitmap(void) |
55d2375e SC |
6829 | { |
6830 | if (enable_shadow_vmcs) { | |
55d2375e | 6831 | vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); |
fadcead0 | 6832 | vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); |
55d2375e SC |
6833 | } |
6834 | } | |
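
/*
 * The bitmap semantics, sketched as a hypothetical helper (not a kernel
 * function): with VMCS shadowing enabled, the CPU consults the bit whose
 * index is bits 14:0 of the field encoding and exits only when it is 1.
 * init_vmcs_shadow_fields() therefore starts from all-ones bitmaps and
 * clears the bits of the fields it wants handled by the shadow VMCS.
 */
static bool vmread_would_exit(const unsigned long *vmread_bitmap, u16 field)
{
	return test_bit(field & 0x7fff, vmread_bitmap);
}
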
6835 | ||
ba1f8245 SC |
6836 | /* |
6837 | * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo | |
6838 | * that madness to get the encoding for comparison. | |
6839 | */ | |
6840 | #define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10))) | |
6841 | ||
6842 | static u64 nested_vmx_calc_vmcs_enum_msr(void) | |
6843 | { | |
6844 | /* | |
6845 | * Note these are the so called "index" of the VMCS field encoding, not | |
6846 | * the index into vmcs12. | |
6847 | */ | |
6848 | unsigned int max_idx, idx; | |
6849 | int i; | |
6850 | ||
6851 | /* | |
6852 | * For better or worse, KVM allows VMREAD/VMWRITE to all fields in | |
6853 | * vmcs12, regardless of whether or not the associated feature is | |
6854 | * exposed to L1. Simply find the field with the highest index. | |
6855 | */ | |
6856 | max_idx = 0; | |
6857 | for (i = 0; i < nr_vmcs12_fields; i++) { | |
6858 | /* The vmcs12 table is very, very sparsely populated. */ | |
2423a4c0 | 6859 | if (!vmcs12_field_offsets[i]) |
ba1f8245 SC |
6860 | continue; |
6861 | ||
6862 | idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i)); | |
6863 | if (idx > max_idx) | |
6864 | max_idx = idx; | |
6865 | } | |
6866 | ||
6867 | return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT; | |
6868 | } | |
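
/*
 * A stand-alone check of the rotation above (user-space, with local copies
 * of the macros): vmcs12_field_offsets[] is indexed by the field encoding
 * rotated left by 6, so VMCS12_IDX_TO_ENC() must be the inverse
 * rotate-right by 6. VM_INSTRUCTION_ERROR (encoding 0x4400) is used as
 * the example field.
 */
#include <assert.h>
#include <stdint.h>

#define ROL16(val, n)	((uint16_t)(((uint16_t)(val) << (n)) | \
				    ((uint16_t)(val) >> (16 - (n)))))
#define IDX_TO_ENC(idx)	((uint16_t)(((uint16_t)(idx) >> 6) | \
				    ((uint16_t)(idx) << 10)))

int main(void)
{
	uint16_t enc = 0x4400;		/* VM_INSTRUCTION_ERROR */
	uint16_t idx = ROL16(enc, 6);	/* 0x0011, the table index */

	assert(IDX_TO_ENC(idx) == enc);	/* rotate-right undoes rotate-left */
	return 0;
}
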
6869 | ||
f6cde920 YZ |
6870 | static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf, |
6871 | struct nested_vmx_msrs *msrs) | |
55d2375e | 6872 | { |
66a329be | 6873 | msrs->pinbased_ctls_low = |
55d2375e | 6874 | PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; |
bcdf201f VK |
6875 | |
6876 | msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl; | |
55d2375e SC |
6877 | msrs->pinbased_ctls_high &= |
6878 | PIN_BASED_EXT_INTR_MASK | | |
6879 | PIN_BASED_NMI_EXITING | | |
6880 | PIN_BASED_VIRTUAL_NMIS | | |
a4443267 | 6881 | (enable_apicv ? PIN_BASED_POSTED_INTR : 0); |
55d2375e SC |
6882 | msrs->pinbased_ctls_high |= |
6883 | PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | | |
6884 | PIN_BASED_VMX_PREEMPTION_TIMER; | |
f6cde920 | 6885 | } |
55d2375e | 6886 | |
f6cde920 YZ |
6887 | static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf, |
6888 | struct nested_vmx_msrs *msrs) | |
6889 | { | |
55d2375e SC |
6890 | msrs->exit_ctls_low = |
6891 | VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; | |
6892 | ||
bcdf201f | 6893 | msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl; |
55d2375e SC |
6894 | msrs->exit_ctls_high &= |
6895 | #ifdef CONFIG_X86_64 | |
6896 | VM_EXIT_HOST_ADDR_SPACE_SIZE | | |
6897 | #endif | |
efc83133 | 6898 | VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | |
f4c93d1a | 6899 | VM_EXIT_CLEAR_BNDCFGS; |
55d2375e SC |
6900 | msrs->exit_ctls_high |= |
6901 | VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | | |
6902 | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | | |
f4c93d1a SC |
6903 | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT | |
6904 | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; | |
55d2375e SC |
6905 | |
6906 | /* We support free control of debug control saving. */ | |
6907 | msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; | |
f6cde920 | 6908 | } |
55d2375e | 6909 | |
f6cde920 YZ |
6910 | static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf, |
6911 | struct nested_vmx_msrs *msrs) | |
6912 | { | |
55d2375e SC |
6913 | msrs->entry_ctls_low = |
6914 | VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; | |
bcdf201f VK |
6915 | |
6916 | msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl; | |
55d2375e SC |
6917 | msrs->entry_ctls_high &= |
6918 | #ifdef CONFIG_X86_64 | |
6919 | VM_ENTRY_IA32E_MODE | | |
6920 | #endif | |
f4c93d1a | 6921 | VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; |
55d2375e | 6922 | msrs->entry_ctls_high |= |
f4c93d1a SC |
6923 | (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER | |
6924 | VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); | |
55d2375e SC |
6925 | |
6926 | /* We support free control of debug control loading. */ | |
6927 | msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; | |
f6cde920 | 6928 | } |
55d2375e | 6929 | |
f6cde920 YZ |
6930 | static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf, |
6931 | struct nested_vmx_msrs *msrs) | |
6932 | { | |
55d2375e SC |
6933 | msrs->procbased_ctls_low = |
6934 | CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; | |
bcdf201f VK |
6935 | |
6936 | msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl; | |
55d2375e | 6937 | msrs->procbased_ctls_high &= |
9dadc2f9 | 6938 | CPU_BASED_INTR_WINDOW_EXITING | |
5e3d394f | 6939 | CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING | |
55d2375e SC |
6940 | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | |
6941 | CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | | |
6942 | CPU_BASED_CR3_STORE_EXITING | | |
6943 | #ifdef CONFIG_X86_64 | |
6944 | CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | | |
6945 | #endif | |
6946 | CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | | |
6947 | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | | |
6948 | CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | | |
6949 | CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | | |
6950 | CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; | |
6951 | /* | |
6952 | * We can allow some features even when not supported by the | |
6953 | * hardware. For example, L1 can specify an MSR bitmap - and we | |
6954 | * can use it to avoid exits to L1 - even when L0 runs L2 | |
6955 | * without MSR bitmaps. | |
6956 | */ | |
6957 | msrs->procbased_ctls_high |= | |
6958 | CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | | |
6959 | CPU_BASED_USE_MSR_BITMAPS; | |
6960 | ||
6961 | /* We support free control of CR3 access interception. */ | |
6962 | msrs->procbased_ctls_low &= | |
6963 | ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); | |
f6cde920 | 6964 | } |
55d2375e | 6965 | |
f6cde920 YZ |
6966 | static void nested_vmx_setup_secondary_ctls(u32 ept_caps, |
6967 | struct vmcs_config *vmcs_conf, | |
6968 | struct nested_vmx_msrs *msrs) | |
6969 | { | |
55d2375e | 6970 | msrs->secondary_ctls_low = 0; |
bcdf201f VK |
6971 | |
6972 | msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl; | |
55d2375e SC |
6973 | msrs->secondary_ctls_high &= |
6974 | SECONDARY_EXEC_DESC | | |
7f3603b6 | 6975 | SECONDARY_EXEC_ENABLE_RDTSCP | |
55d2375e | 6976 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | |
6defc591 | 6977 | SECONDARY_EXEC_WBINVD_EXITING | |
55d2375e SC |
6978 | SECONDARY_EXEC_APIC_REGISTER_VIRT | |
6979 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | | |
6defc591 PB |
6980 | SECONDARY_EXEC_RDRAND_EXITING | |
6981 | SECONDARY_EXEC_ENABLE_INVPCID | | |
496c917b | 6982 | SECONDARY_EXEC_ENABLE_VMFUNC | |
6defc591 | 6983 | SECONDARY_EXEC_RDSEED_EXITING | |
662f6815 | 6984 | SECONDARY_EXEC_ENABLE_XSAVES | |
31de69f4 SC |
6985 | SECONDARY_EXEC_TSC_SCALING | |
6986 | SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; | |
55d2375e SC |
6987 | |
6988 | /* | |
6989 | * We can emulate "VMCS shadowing," even if the hardware | |
6990 | * doesn't support it. | |
6991 | */ | |
6992 | msrs->secondary_ctls_high |= | |
6993 | SECONDARY_EXEC_SHADOW_VMCS; | |
6994 | ||
6995 | if (enable_ept) { | |
6996 | /* nested EPT: emulate EPT also to L1 */ | |
6997 | msrs->secondary_ctls_high |= | |
6998 | SECONDARY_EXEC_ENABLE_EPT; | |
bb1fcc70 SC |
6999 | msrs->ept_caps = |
7000 | VMX_EPT_PAGE_WALK_4_BIT | | |
7001 | VMX_EPT_PAGE_WALK_5_BIT | | |
7002 | VMX_EPTP_WB_BIT | | |
96d47010 SC |
7003 | VMX_EPT_INVEPT_BIT | |
7004 | VMX_EPT_EXECUTE_ONLY_BIT; | |
7005 | ||
55d2375e SC |
7006 | msrs->ept_caps &= ept_caps; |
7007 | msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | | |
7008 | VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | | |
7009 | VMX_EPT_1GB_PAGE_BIT; | |
7010 | if (enable_ept_ad_bits) { | |
7011 | msrs->secondary_ctls_high |= | |
7012 | SECONDARY_EXEC_ENABLE_PML; | |
7013 | msrs->ept_caps |= VMX_EPT_AD_BIT; | |
7014 | } | |
55d2375e | 7015 | |
55d2375e | 7016 | /* |
496c917b YZ |
7017 | * Advertise EPTP switching irrespective of hardware support, |
7018 | * KVM emulates it in software so long as VMFUNC is supported. | |
55d2375e | 7019 | */ |
496c917b YZ |
7020 | if (cpu_has_vmx_vmfunc()) |
7021 | msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING; | |
55d2375e SC |
7022 | } |
7023 | ||
7024 | /* | |
7025 | * Old versions of KVM use the single-context version without | |
7026 | * checking for support, so declare that it is supported even | |
7027 | * though it is treated as global context. The alternative - |
7028 | * accepting single-context INVVPID without advertising it - is worse. |
7029 | */ | |
7030 | if (enable_vpid) { | |
7031 | msrs->secondary_ctls_high |= | |
7032 | SECONDARY_EXEC_ENABLE_VPID; | |
7033 | msrs->vpid_caps = VMX_VPID_INVVPID_BIT | | |
7034 | VMX_VPID_EXTENT_SUPPORTED_MASK; | |
7035 | } | |
7036 | ||
7037 | if (enable_unrestricted_guest) | |
7038 | msrs->secondary_ctls_high |= | |
7039 | SECONDARY_EXEC_UNRESTRICTED_GUEST; | |
7040 | ||
7041 | if (flexpriority_enabled) | |
7042 | msrs->secondary_ctls_high |= | |
7043 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | |
7044 | ||
72add915 SC |
7045 | if (enable_sgx) |
7046 | msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; | |
f6cde920 | 7047 | } |
72add915 | 7048 | |
f6cde920 YZ |
7049 | static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf, |
7050 | struct nested_vmx_msrs *msrs) | |
7051 | { | |
37d145ef | 7052 | msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA; |
55d2375e SC |
7053 | msrs->misc_low |= |
7054 | MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | | |
7055 | VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | | |
bf0cd88c YQ |
7056 | VMX_MISC_ACTIVITY_HLT | |
7057 | VMX_MISC_ACTIVITY_WAIT_SIPI; | |
55d2375e | 7058 | msrs->misc_high = 0; |
f6cde920 | 7059 | } |
55d2375e | 7060 | |
f6cde920 YZ |
7061 | static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs) |
7062 | { | |
55d2375e SC |
7063 | /* |
7064 | * This MSR reports some information about VMX support. We | |
7065 | * should return information about the VMX we emulate for the | |
7066 | * guest, and the VMCS structure we give it - not about the | |
7067 | * VMX support of the underlying hardware. | |
7068 | */ | |
7069 | msrs->basic = | |
7070 | VMCS12_REVISION | | |
7071 | VMX_BASIC_TRUE_CTLS | | |
7072 | ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | | |
7073 | (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); | |
7074 | ||
7075 | if (cpu_has_vmx_basic_inout()) | |
7076 | msrs->basic |= VMX_BASIC_INOUT; | |
f6cde920 | 7077 | } |
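
/*
 * For illustration, how L1 would decode the synthesized MSR (bit layout
 * per the SDM; 'decode_vmx_basic' is a sketch, not a kernel helper):
 */
static void decode_vmx_basic(u64 basic)
{
	pr_info("rev %llu size %llu memtype %llu inout %d true_ctls %d\n",
		basic & 0x7fffffffULL,		/* bits 30:0, VMCS12_REVISION */
		(basic >> 32) & 0x1fff,		/* bits 44:32, VMCS12_SIZE */
		(basic >> 50) & 0xf,		/* bits 53:50, 6 == write-back */
		!!(basic & BIT_ULL(54)),	/* VMX_BASIC_INOUT */
		!!(basic & BIT_ULL(55)));	/* VMX_BASIC_TRUE_CTLS */
}
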
55d2375e | 7078 | |
f6cde920 YZ |
7079 | static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs) |
7080 | { | |
55d2375e SC |
7081 | /* |
7082 | * These MSRs specify bits which the guest must keep fixed on | |
7083 | * while L1 is in VMXON mode (in L1's root mode, or running an L2). | |
7084 | * We picked the standard core2 setting. | |
7085 | */ | |
7086 | #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) | |
7087 | #define VMXON_CR4_ALWAYSON X86_CR4_VMXE | |
7088 | msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; | |
7089 | msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; | |
7090 | ||
7091 | /* Bits that are clear in these MSRs must be kept fixed off by the guest. */ |
7092 | rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); | |
7093 | rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); | |
7094 | ||
a910b5ab SC |
7095 | if (vmx_umip_emulated()) |
7096 | msrs->cr4_fixed1 |= X86_CR4_UMIP; | |
f6cde920 YZ |
7097 | } |
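
/*
 * How the four fixed-bit MSRs are consumed, sketched after the
 * fixed_bits_valid() pattern used by the nested CR checks: a CR value is
 * legal under VMXON iff every bit set in FIXED0 is set and every bit
 * clear in FIXED1 is clear.
 */
static bool cr_fixed_bits_ok(u64 val, u64 fixed0, u64 fixed1)
{
	/* Masking with fixed1 drops illegal 1s; OR-ing fixed0 adds missing 1s. */
	return ((val & fixed1) | fixed0) == val;
}
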
7098 | ||
7099 | /* | |
7100 | * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be | |
7101 | * returned for the various VMX controls MSRs when nested VMX is enabled. | |
7102 | * The same values should also be used to verify that vmcs12 control fields are | |
7103 | * valid during nested entry from L1 to L2. | |
7104 | * Each of these control MSRs has a low and a high 32-bit half: a low bit is on |
7105 | * if the corresponding bit in the (32-bit) control field *must* be on, and a | |
7106 | * bit in the high half is on if the corresponding bit in the control field | |
7107 | * may be on. See also vmx_control_verify(). | |
7108 | */ | |
7109 | void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) | |
7110 | { | |
7111 | struct nested_vmx_msrs *msrs = &vmcs_conf->nested; | |
7112 | ||
7113 | /* | |
7114 | * Note that as a general rule, the high half of the MSRs (bits in | |
7115 | * the control fields which may be 1) should be initialized by the | |
7116 | * intersection of the underlying hardware's MSR (i.e., features which | |
7117 | * can be supported) and the list of features we want to expose - | |
7118 | * because they are known to be properly supported in our code. | |
7119 | * Also, usually, the low half of the MSRs (bits which must be 1) can | |
7120 | * be set to 0, meaning that L1 may turn off any of these bits. The | |
7121 | * reason is that if one of these bits is necessary, it will appear |
7122 | * in vmcs01, and prepare_vmcs02, which bitwise-ORs the control |
7123 | * fields of vmcs01 and vmcs12, will keep these bits set - and |
7124 | * nested_vmx_l1_wants_exit() will not pass related exits to L1. | |
7125 | * These rules have exceptions below. | |
7126 | */ | |
7127 | nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs); | |
7128 | ||
7129 | nested_vmx_setup_exit_ctls(vmcs_conf, msrs); | |
7130 | ||
7131 | nested_vmx_setup_entry_ctls(vmcs_conf, msrs); | |
7132 | ||
7133 | nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs); | |
7134 | ||
7135 | nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs); | |
7136 | ||
7137 | nested_vmx_setup_misc_data(vmcs_conf, msrs); | |
7138 | ||
7139 | nested_vmx_setup_basic(msrs); | |
7140 | ||
7141 | nested_vmx_setup_cr_fixed(msrs); | |
a910b5ab | 7142 | |
ba1f8245 | 7143 | msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); |
55d2375e SC |
7144 | } |
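
/*
 * Sketch of the verification side referenced in the comment above
 * (vmx_control_verify() has this shape): a 32-bit control field from
 * vmcs12 is consistent iff all must-be-1 (low) bits are set and no bit
 * outside the may-be-1 (high) mask is set.
 */
static bool control_field_ok(u32 control, u32 low, u32 high)
{
	return (control & low) == low && !(control & ~high);
}
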
7145 | ||
7146 | void nested_vmx_hardware_unsetup(void) | |
7147 | { | |
7148 | int i; | |
7149 | ||
7150 | if (enable_shadow_vmcs) { | |
7151 | for (i = 0; i < VMX_BITMAP_NR; i++) | |
7152 | free_page((unsigned long)vmx_bitmap[i]); | |
7153 | } | |
7154 | } | |
7155 | ||
6c1c6e58 | 7156 | __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) |
55d2375e SC |
7157 | { |
7158 | int i; | |
7159 | ||
7160 | if (!cpu_has_vmx_shadow_vmcs()) | |
7161 | enable_shadow_vmcs = 0; | |
7162 | if (enable_shadow_vmcs) { | |
7163 | for (i = 0; i < VMX_BITMAP_NR; i++) { | |
41836839 BG |
7164 | /* |
7165 | * The vmx_bitmap is not tied to a VM and so should | |
7166 | * not be charged to a memcg. | |
7167 | */ | |
55d2375e SC |
7168 | vmx_bitmap[i] = (unsigned long *) |
7169 | __get_free_page(GFP_KERNEL); | |
7170 | if (!vmx_bitmap[i]) { | |
7171 | nested_vmx_hardware_unsetup(); | |
7172 | return -ENOMEM; | |
7173 | } | |
7174 | } | |
7175 | ||
7176 | init_vmcs_shadow_fields(); | |
7177 | } | |
7178 | ||
cc877670 LA |
7179 | exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; |
7180 | exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; | |
7181 | exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; | |
7182 | exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; | |
7183 | exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; | |
7184 | exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; | |
7185 | exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; | |
a645c2b5 SC |
7186 | exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff; |
7187 | exit_handlers[EXIT_REASON_VMON] = handle_vmxon; | |
cc877670 LA |
7188 | exit_handlers[EXIT_REASON_INVEPT] = handle_invept; |
7189 | exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; | |
7190 | exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; | |
55d2375e | 7191 | |
55d2375e SC |
7192 | return 0; |
7193 | } | |
33b22172 PB |
7194 | |
7195 | struct kvm_x86_nested_ops vmx_nested_ops = { | |
f7e57078 | 7196 | .leave_nested = vmx_leave_nested, |
7709aba8 | 7197 | .is_exception_vmexit = nested_vmx_is_exception_vmexit, |
33b22172 | 7198 | .check_events = vmx_check_nested_events, |
5b4ac1a1 | 7199 | .has_events = vmx_has_nested_events, |
cb6a32c2 | 7200 | .triple_fault = nested_vmx_triple_fault, |
33b22172 PB |
7201 | .get_state = vmx_get_nested_state, |
7202 | .set_state = vmx_set_nested_state, | |
9a78e158 | 7203 | .get_nested_state_pages = vmx_get_nested_state_pages, |
02f5fb2e | 7204 | .write_log_dirty = nested_vmx_write_pml_buffer, |
b4f69df0 | 7205 | #ifdef CONFIG_KVM_HYPERV |
33b22172 PB |
7206 | .enable_evmcs = nested_enable_evmcs, |
7207 | .get_evmcs_version = nested_get_evmcs_version, | |
b0c9c25e | 7208 | .hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush, |
b4f69df0 | 7209 | #endif |
33b22172 | 7210 | }; |