c6f5e5821d4c34ba27f226f9f1b85672ce84a8c7
[linux-2.6-block.git] / arch / x86 / kvm / vmx / nested.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include <linux/frame.h>
4 #include <linux/percpu.h>
5
6 #include <asm/debugreg.h>
7 #include <asm/mmu_context.h>
8
9 #include "cpuid.h"
10 #include "hyperv.h"
11 #include "mmu.h"
12 #include "nested.h"
13 #include "pmu.h"
14 #include "trace.h"
15 #include "x86.h"
16
17 static bool __read_mostly enable_shadow_vmcs = 1;
18 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
19
20 static bool __read_mostly nested_early_check = 0;
21 module_param(nested_early_check, bool, S_IRUGO);
22
23 #define CC(consistency_check)                                           \
24 ({                                                                      \
25         bool failed = (consistency_check);                              \
26         if (failed)                                                     \
27                 trace_kvm_nested_vmenter_failed(#consistency_check, 0); \
28         failed;                                                         \
29 })
30
31 #define SET_MSR_OR_WARN(vcpu, idx, data)                                \
32 ({                                                                      \
33         bool failed = kvm_set_msr(vcpu, idx, data);                     \
34         if (failed)                                                     \
35                 pr_warn_ratelimited(                                    \
36                                 "%s cannot write MSR (0x%x, 0x%llx)\n", \
37                                 __func__, idx, data);                   \
38         failed;                                                         \
39 })
40
41 /*
42  * Hyper-V requires all of these, so mark them as supported even though
43  * they are just treated the same as all-context.
44  */
45 #define VMX_VPID_EXTENT_SUPPORTED_MASK          \
46         (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |  \
47         VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |    \
48         VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |    \
49         VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
50
51 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
52
53 enum {
54         VMX_VMREAD_BITMAP,
55         VMX_VMWRITE_BITMAP,
56         VMX_BITMAP_NR
57 };
58 static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
59
60 #define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
61 #define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])
62
63 struct shadow_vmcs_field {
64         u16     encoding;
65         u16     offset;
66 };
67 static struct shadow_vmcs_field shadow_read_only_fields[] = {
68 #define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
69 #include "vmcs_shadow_fields.h"
70 };
71 static int max_shadow_read_only_fields =
72         ARRAY_SIZE(shadow_read_only_fields);
73
74 static struct shadow_vmcs_field shadow_read_write_fields[] = {
75 #define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
76 #include "vmcs_shadow_fields.h"
77 };
78 static int max_shadow_read_write_fields =
79         ARRAY_SIZE(shadow_read_write_fields);
80
81 static void init_vmcs_shadow_fields(void)
82 {
83         int i, j;
84
85         memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
86         memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
87
88         for (i = j = 0; i < max_shadow_read_only_fields; i++) {
89                 struct shadow_vmcs_field entry = shadow_read_only_fields[i];
90                 u16 field = entry.encoding;
91
92                 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
93                     (i + 1 == max_shadow_read_only_fields ||
94                      shadow_read_only_fields[i + 1].encoding != field + 1))
95                         pr_err("Missing field from shadow_read_only_field %x\n",
96                                field + 1);
97
98                 clear_bit(field, vmx_vmread_bitmap);
99                 if (field & 1)
100 #ifdef CONFIG_X86_64
101                         continue;
102 #else
103                         entry.offset += sizeof(u32);
104 #endif
105                 shadow_read_only_fields[j++] = entry;
106         }
107         max_shadow_read_only_fields = j;
108
109         for (i = j = 0; i < max_shadow_read_write_fields; i++) {
110                 struct shadow_vmcs_field entry = shadow_read_write_fields[i];
111                 u16 field = entry.encoding;
112
113                 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
114                     (i + 1 == max_shadow_read_write_fields ||
115                      shadow_read_write_fields[i + 1].encoding != field + 1))
116                         pr_err("Missing field from shadow_read_write_field %x\n",
117                                field + 1);
118
119                 WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
120                           field <= GUEST_TR_AR_BYTES,
121                           "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
122
123                 /*
124                  * PML and the preemption timer can be emulated, but the
125                  * processor cannot vmwrite to fields that don't exist
126                  * on bare metal.
127                  */
128                 switch (field) {
129                 case GUEST_PML_INDEX:
130                         if (!cpu_has_vmx_pml())
131                                 continue;
132                         break;
133                 case VMX_PREEMPTION_TIMER_VALUE:
134                         if (!cpu_has_vmx_preemption_timer())
135                                 continue;
136                         break;
137                 case GUEST_INTR_STATUS:
138                         if (!cpu_has_vmx_apicv())
139                                 continue;
140                         break;
141                 default:
142                         break;
143                 }
144
145                 clear_bit(field, vmx_vmwrite_bitmap);
146                 clear_bit(field, vmx_vmread_bitmap);
147                 if (field & 1)
148 #ifdef CONFIG_X86_64
149                         continue;
150 #else
151                         entry.offset += sizeof(u32);
152 #endif
153                 shadow_read_write_fields[j++] = entry;
154         }
155         max_shadow_read_write_fields = j;
156 }
157
158 /*
159  * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
160  * set the success or error code of an emulated VMX instruction (as specified
161  * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
162  * instruction.
163  */
164 static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
165 {
166         vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
167                         & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
168                             X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
169         return kvm_skip_emulated_instruction(vcpu);
170 }
171
172 static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
173 {
174         vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
175                         & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
176                             X86_EFLAGS_SF | X86_EFLAGS_OF))
177                         | X86_EFLAGS_CF);
178         return kvm_skip_emulated_instruction(vcpu);
179 }
180
181 static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
182                                 u32 vm_instruction_error)
183 {
184         struct vcpu_vmx *vmx = to_vmx(vcpu);
185
186         /*
187          * failValid writes the error number to the current VMCS, which
188          * can't be done if there isn't a current VMCS.
189          */
190         if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
191                 return nested_vmx_failInvalid(vcpu);
192
193         vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
194                         & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
195                             X86_EFLAGS_SF | X86_EFLAGS_OF))
196                         | X86_EFLAGS_ZF);
197         get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
198         /*
199          * We don't need to force a shadow sync because
200          * VM_INSTRUCTION_ERROR is not shadowed
201          */
202         return kvm_skip_emulated_instruction(vcpu);
203 }
204
205 static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
206 {
207         /* TODO: not to reset guest simply here. */
208         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
209         pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
210 }
211
212 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
213 {
214         return fixed_bits_valid(control, low, high);
215 }
216
217 static inline u64 vmx_control_msr(u32 low, u32 high)
218 {
219         return low | ((u64)high << 32);
220 }
221
222 static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
223 {
224         secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
225         vmcs_write64(VMCS_LINK_POINTER, -1ull);
226         vmx->nested.need_vmcs12_to_shadow_sync = false;
227 }
228
229 static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
230 {
231         struct vcpu_vmx *vmx = to_vmx(vcpu);
232
233         if (!vmx->nested.hv_evmcs)
234                 return;
235
236         kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
237         vmx->nested.hv_evmcs_vmptr = -1ull;
238         vmx->nested.hv_evmcs = NULL;
239 }
240
241 /*
242  * Free whatever needs to be freed from vmx->nested when L1 goes down, or
243  * just stops using VMX.
244  */
245 static void free_nested(struct kvm_vcpu *vcpu)
246 {
247         struct vcpu_vmx *vmx = to_vmx(vcpu);
248
249         if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
250                 return;
251
252         kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
253
254         vmx->nested.vmxon = false;
255         vmx->nested.smm.vmxon = false;
256         free_vpid(vmx->nested.vpid02);
257         vmx->nested.posted_intr_nv = -1;
258         vmx->nested.current_vmptr = -1ull;
259         if (enable_shadow_vmcs) {
260                 vmx_disable_shadow_vmcs(vmx);
261                 vmcs_clear(vmx->vmcs01.shadow_vmcs);
262                 free_vmcs(vmx->vmcs01.shadow_vmcs);
263                 vmx->vmcs01.shadow_vmcs = NULL;
264         }
265         kfree(vmx->nested.cached_vmcs12);
266         vmx->nested.cached_vmcs12 = NULL;
267         kfree(vmx->nested.cached_shadow_vmcs12);
268         vmx->nested.cached_shadow_vmcs12 = NULL;
269         /* Unpin physical memory we referred to in the vmcs02 */
270         if (vmx->nested.apic_access_page) {
271                 kvm_release_page_dirty(vmx->nested.apic_access_page);
272                 vmx->nested.apic_access_page = NULL;
273         }
274         kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
275         kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
276         vmx->nested.pi_desc = NULL;
277
278         kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
279
280         nested_release_evmcs(vcpu);
281
282         free_loaded_vmcs(&vmx->nested.vmcs02);
283 }
284
285 static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
286                                      struct loaded_vmcs *prev)
287 {
288         struct vmcs_host_state *dest, *src;
289
290         if (unlikely(!vmx->guest_state_loaded))
291                 return;
292
293         src = &prev->host_state;
294         dest = &vmx->loaded_vmcs->host_state;
295
296         vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
297         dest->ldt_sel = src->ldt_sel;
298 #ifdef CONFIG_X86_64
299         dest->ds_sel = src->ds_sel;
300         dest->es_sel = src->es_sel;
301 #endif
302 }
303
304 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
305 {
306         struct vcpu_vmx *vmx = to_vmx(vcpu);
307         struct loaded_vmcs *prev;
308         int cpu;
309
310         if (vmx->loaded_vmcs == vmcs)
311                 return;
312
313         cpu = get_cpu();
314         prev = vmx->loaded_vmcs;
315         vmx->loaded_vmcs = vmcs;
316         vmx_vcpu_load_vmcs(vcpu, cpu);
317         vmx_sync_vmcs_host_state(vmx, prev);
318         put_cpu();
319
320         vmx_segment_cache_clear(vmx);
321 }
322
323 /*
324  * Ensure that the current vmcs of the logical processor is the
325  * vmcs01 of the vcpu before calling free_nested().
326  */
327 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
328 {
329         vcpu_load(vcpu);
330         vmx_leave_nested(vcpu);
331         vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
332         free_nested(vcpu);
333         vcpu_put(vcpu);
334 }
335
336 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
337                 struct x86_exception *fault)
338 {
339         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
340         struct vcpu_vmx *vmx = to_vmx(vcpu);
341         u32 exit_reason;
342         unsigned long exit_qualification = vcpu->arch.exit_qualification;
343
344         if (vmx->nested.pml_full) {
345                 exit_reason = EXIT_REASON_PML_FULL;
346                 vmx->nested.pml_full = false;
347                 exit_qualification &= INTR_INFO_UNBLOCK_NMI;
348         } else if (fault->error_code & PFERR_RSVD_MASK)
349                 exit_reason = EXIT_REASON_EPT_MISCONFIG;
350         else
351                 exit_reason = EXIT_REASON_EPT_VIOLATION;
352
353         nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
354         vmcs12->guest_physical_address = fault->address;
355 }
356
357 static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
358 {
359         WARN_ON(mmu_is_nested(vcpu));
360
361         vcpu->arch.mmu = &vcpu->arch.guest_mmu;
362         kvm_init_shadow_ept_mmu(vcpu,
363                         to_vmx(vcpu)->nested.msrs.ept_caps &
364                         VMX_EPT_EXECUTE_ONLY_BIT,
365                         nested_ept_ad_enabled(vcpu),
366                         nested_ept_get_cr3(vcpu));
367         vcpu->arch.mmu->set_cr3           = vmx_set_cr3;
368         vcpu->arch.mmu->get_cr3           = nested_ept_get_cr3;
369         vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
370         vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;
371
372         vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
373 }
374
375 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
376 {
377         vcpu->arch.mmu = &vcpu->arch.root_mmu;
378         vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
379 }
380
381 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
382                                             u16 error_code)
383 {
384         bool inequality, bit;
385
386         bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
387         inequality =
388                 (error_code & vmcs12->page_fault_error_code_mask) !=
389                  vmcs12->page_fault_error_code_match;
390         return inequality ^ bit;
391 }
392
393
394 /*
395  * KVM wants to inject page-faults which it got to the guest. This function
396  * checks whether in a nested guest, we need to inject them to L1 or L2.
397  */
398 static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
399 {
400         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
401         unsigned int nr = vcpu->arch.exception.nr;
402         bool has_payload = vcpu->arch.exception.has_payload;
403         unsigned long payload = vcpu->arch.exception.payload;
404
405         if (nr == PF_VECTOR) {
406                 if (vcpu->arch.exception.nested_apf) {
407                         *exit_qual = vcpu->arch.apf.nested_apf_token;
408                         return 1;
409                 }
410                 if (nested_vmx_is_page_fault_vmexit(vmcs12,
411                                                     vcpu->arch.exception.error_code)) {
412                         *exit_qual = has_payload ? payload : vcpu->arch.cr2;
413                         return 1;
414                 }
415         } else if (vmcs12->exception_bitmap & (1u << nr)) {
416                 if (nr == DB_VECTOR) {
417                         if (!has_payload) {
418                                 payload = vcpu->arch.dr6;
419                                 payload &= ~(DR6_FIXED_1 | DR6_BT);
420                                 payload ^= DR6_RTM;
421                         }
422                         *exit_qual = payload;
423                 } else
424                         *exit_qual = 0;
425                 return 1;
426         }
427
428         return 0;
429 }
430
431
432 static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
433                 struct x86_exception *fault)
434 {
435         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
436
437         WARN_ON(!is_guest_mode(vcpu));
438
439         if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
440                 !to_vmx(vcpu)->nested.nested_run_pending) {
441                 vmcs12->vm_exit_intr_error_code = fault->error_code;
442                 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
443                                   PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
444                                   INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
445                                   fault->address);
446         } else {
447                 kvm_inject_page_fault(vcpu, fault);
448         }
449 }
450
451 static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
452 {
453         return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
454 }
455
456 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
457                                                struct vmcs12 *vmcs12)
458 {
459         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
460                 return 0;
461
462         if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
463             CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
464                 return -EINVAL;
465
466         return 0;
467 }
468
469 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
470                                                 struct vmcs12 *vmcs12)
471 {
472         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
473                 return 0;
474
475         if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
476                 return -EINVAL;
477
478         return 0;
479 }
480
481 static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
482                                                 struct vmcs12 *vmcs12)
483 {
484         if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
485                 return 0;
486
487         if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
488                 return -EINVAL;
489
490         return 0;
491 }
492
493 /*
494  * Check if MSR is intercepted for L01 MSR bitmap.
495  */
496 static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
497 {
498         unsigned long *msr_bitmap;
499         int f = sizeof(unsigned long);
500
501         if (!cpu_has_vmx_msr_bitmap())
502                 return true;
503
504         msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
505
506         if (msr <= 0x1fff) {
507                 return !!test_bit(msr, msr_bitmap + 0x800 / f);
508         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
509                 msr &= 0x1fff;
510                 return !!test_bit(msr, msr_bitmap + 0xc00 / f);
511         }
512
513         return true;
514 }
515
516 /*
517  * If a msr is allowed by L0, we should check whether it is allowed by L1.
518  * The corresponding bit will be cleared unless both of L0 and L1 allow it.
519  */
520 static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
521                                                unsigned long *msr_bitmap_nested,
522                                                u32 msr, int type)
523 {
524         int f = sizeof(unsigned long);
525
526         /*
527          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
528          * have the write-low and read-high bitmap offsets the wrong way round.
529          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
530          */
531         if (msr <= 0x1fff) {
532                 if (type & MSR_TYPE_R &&
533                    !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
534                         /* read-low */
535                         __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
536
537                 if (type & MSR_TYPE_W &&
538                    !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
539                         /* write-low */
540                         __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
541
542         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
543                 msr &= 0x1fff;
544                 if (type & MSR_TYPE_R &&
545                    !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
546                         /* read-high */
547                         __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
548
549                 if (type & MSR_TYPE_W &&
550                    !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
551                         /* write-high */
552                         __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
553
554         }
555 }
556
557 static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) {
558         int msr;
559
560         for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
561                 unsigned word = msr / BITS_PER_LONG;
562
563                 msr_bitmap[word] = ~0;
564                 msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
565         }
566 }
567
568 /*
569  * Merge L0's and L1's MSR bitmap, return false to indicate that
570  * we do not use the hardware.
571  */
572 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
573                                                  struct vmcs12 *vmcs12)
574 {
575         int msr;
576         unsigned long *msr_bitmap_l1;
577         unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
578         struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;
579
580         /* Nothing to do if the MSR bitmap is not in use.  */
581         if (!cpu_has_vmx_msr_bitmap() ||
582             !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
583                 return false;
584
585         if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
586                 return false;
587
588         msr_bitmap_l1 = (unsigned long *)map->hva;
589
590         /*
591          * To keep the control flow simple, pay eight 8-byte writes (sixteen
592          * 4-byte writes on 32-bit systems) up front to enable intercepts for
593          * the x2APIC MSR range and selectively disable them below.
594          */
595         enable_x2apic_msr_intercepts(msr_bitmap_l0);
596
597         if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
598                 if (nested_cpu_has_apic_reg_virt(vmcs12)) {
599                         /*
600                          * L0 need not intercept reads for MSRs between 0x800
601                          * and 0x8ff, it just lets the processor take the value
602                          * from the virtual-APIC page; take those 256 bits
603                          * directly from the L1 bitmap.
604                          */
605                         for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
606                                 unsigned word = msr / BITS_PER_LONG;
607
608                                 msr_bitmap_l0[word] = msr_bitmap_l1[word];
609                         }
610                 }
611
612                 nested_vmx_disable_intercept_for_msr(
613                         msr_bitmap_l1, msr_bitmap_l0,
614                         X2APIC_MSR(APIC_TASKPRI),
615                         MSR_TYPE_R | MSR_TYPE_W);
616
617                 if (nested_cpu_has_vid(vmcs12)) {
618                         nested_vmx_disable_intercept_for_msr(
619                                 msr_bitmap_l1, msr_bitmap_l0,
620                                 X2APIC_MSR(APIC_EOI),
621                                 MSR_TYPE_W);
622                         nested_vmx_disable_intercept_for_msr(
623                                 msr_bitmap_l1, msr_bitmap_l0,
624                                 X2APIC_MSR(APIC_SELF_IPI),
625                                 MSR_TYPE_W);
626                 }
627         }
628
629         /* KVM unconditionally exposes the FS/GS base MSRs to L1. */
630         nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
631                                              MSR_FS_BASE, MSR_TYPE_RW);
632
633         nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
634                                              MSR_GS_BASE, MSR_TYPE_RW);
635
636         nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
637                                              MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
638
639         /*
640          * Checking the L0->L1 bitmap is trying to verify two things:
641          *
642          * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
643          *    ensures that we do not accidentally generate an L02 MSR bitmap
644          *    from the L12 MSR bitmap that is too permissive.
645          * 2. That L1 or L2s have actually used the MSR. This avoids
646          *    unnecessarily merging of the bitmap if the MSR is unused. This
647          *    works properly because we only update the L01 MSR bitmap lazily.
648          *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
649          *    updated to reflect this when L1 (or its L2s) actually write to
650          *    the MSR.
651          */
652         if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
653                 nested_vmx_disable_intercept_for_msr(
654                                         msr_bitmap_l1, msr_bitmap_l0,
655                                         MSR_IA32_SPEC_CTRL,
656                                         MSR_TYPE_R | MSR_TYPE_W);
657
658         if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
659                 nested_vmx_disable_intercept_for_msr(
660                                         msr_bitmap_l1, msr_bitmap_l0,
661                                         MSR_IA32_PRED_CMD,
662                                         MSR_TYPE_W);
663
664         kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);
665
666         return true;
667 }
668
669 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
670                                        struct vmcs12 *vmcs12)
671 {
672         struct kvm_host_map map;
673         struct vmcs12 *shadow;
674
675         if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
676             vmcs12->vmcs_link_pointer == -1ull)
677                 return;
678
679         shadow = get_shadow_vmcs12(vcpu);
680
681         if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
682                 return;
683
684         memcpy(shadow, map.hva, VMCS12_SIZE);
685         kvm_vcpu_unmap(vcpu, &map, false);
686 }
687
688 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
689                                               struct vmcs12 *vmcs12)
690 {
691         struct vcpu_vmx *vmx = to_vmx(vcpu);
692
693         if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
694             vmcs12->vmcs_link_pointer == -1ull)
695                 return;
696
697         kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
698                         get_shadow_vmcs12(vcpu), VMCS12_SIZE);
699 }
700
701 /*
702  * In nested virtualization, check if L1 has set
703  * VM_EXIT_ACK_INTR_ON_EXIT
704  */
705 static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
706 {
707         return get_vmcs12(vcpu)->vm_exit_controls &
708                 VM_EXIT_ACK_INTR_ON_EXIT;
709 }
710
711 static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
712 {
713         return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
714 }
715
716 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
717                                           struct vmcs12 *vmcs12)
718 {
719         if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
720             CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
721                 return -EINVAL;
722         else
723                 return 0;
724 }
725
726 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
727                                            struct vmcs12 *vmcs12)
728 {
729         if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
730             !nested_cpu_has_apic_reg_virt(vmcs12) &&
731             !nested_cpu_has_vid(vmcs12) &&
732             !nested_cpu_has_posted_intr(vmcs12))
733                 return 0;
734
735         /*
736          * If virtualize x2apic mode is enabled,
737          * virtualize apic access must be disabled.
738          */
739         if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
740                nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
741                 return -EINVAL;
742
743         /*
744          * If virtual interrupt delivery is enabled,
745          * we must exit on external interrupts.
746          */
747         if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
748                 return -EINVAL;
749
750         /*
751          * bits 15:8 should be zero in posted_intr_nv,
752          * the descriptor address has been already checked
753          * in nested_get_vmcs12_pages.
754          *
755          * bits 5:0 of posted_intr_desc_addr should be zero.
756          */
757         if (nested_cpu_has_posted_intr(vmcs12) &&
758            (CC(!nested_cpu_has_vid(vmcs12)) ||
759             CC(!nested_exit_intr_ack_set(vcpu)) ||
760             CC((vmcs12->posted_intr_nv & 0xff00)) ||
761             CC((vmcs12->posted_intr_desc_addr & 0x3f)) ||
762             CC((vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))))
763                 return -EINVAL;
764
765         /* tpr shadow is needed by all apicv features. */
766         if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
767                 return -EINVAL;
768
769         return 0;
770 }
771
772 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
773                                        u32 count, u64 addr)
774 {
775         int maxphyaddr;
776
777         if (count == 0)
778                 return 0;
779         maxphyaddr = cpuid_maxphyaddr(vcpu);
780         if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
781             (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr)
782                 return -EINVAL;
783
784         return 0;
785 }
786
787 static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
788                                                      struct vmcs12 *vmcs12)
789 {
790         if (CC(nested_vmx_check_msr_switch(vcpu,
791                                            vmcs12->vm_exit_msr_load_count,
792                                            vmcs12->vm_exit_msr_load_addr)) ||
793             CC(nested_vmx_check_msr_switch(vcpu,
794                                            vmcs12->vm_exit_msr_store_count,
795                                            vmcs12->vm_exit_msr_store_addr)))
796                 return -EINVAL;
797
798         return 0;
799 }
800
801 static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
802                                                       struct vmcs12 *vmcs12)
803 {
804         if (CC(nested_vmx_check_msr_switch(vcpu,
805                                            vmcs12->vm_entry_msr_load_count,
806                                            vmcs12->vm_entry_msr_load_addr)))
807                 return -EINVAL;
808
809         return 0;
810 }
811
812 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
813                                          struct vmcs12 *vmcs12)
814 {
815         if (!nested_cpu_has_pml(vmcs12))
816                 return 0;
817
818         if (CC(!nested_cpu_has_ept(vmcs12)) ||
819             CC(!page_address_valid(vcpu, vmcs12->pml_address)))
820                 return -EINVAL;
821
822         return 0;
823 }
824
825 static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
826                                                         struct vmcs12 *vmcs12)
827 {
828         if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
829                !nested_cpu_has_ept(vmcs12)))
830                 return -EINVAL;
831         return 0;
832 }
833
834 static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
835                                                          struct vmcs12 *vmcs12)
836 {
837         if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
838                !nested_cpu_has_ept(vmcs12)))
839                 return -EINVAL;
840         return 0;
841 }
842
843 static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
844                                                  struct vmcs12 *vmcs12)
845 {
846         if (!nested_cpu_has_shadow_vmcs(vmcs12))
847                 return 0;
848
849         if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
850             CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
851                 return -EINVAL;
852
853         return 0;
854 }
855
856 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
857                                        struct vmx_msr_entry *e)
858 {
859         /* x2APIC MSR accesses are not allowed */
860         if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
861                 return -EINVAL;
862         if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
863             CC(e->index == MSR_IA32_UCODE_REV))
864                 return -EINVAL;
865         if (CC(e->reserved != 0))
866                 return -EINVAL;
867         return 0;
868 }
869
870 static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
871                                      struct vmx_msr_entry *e)
872 {
873         if (CC(e->index == MSR_FS_BASE) ||
874             CC(e->index == MSR_GS_BASE) ||
875             CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
876             nested_vmx_msr_check_common(vcpu, e))
877                 return -EINVAL;
878         return 0;
879 }
880
881 static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
882                                       struct vmx_msr_entry *e)
883 {
884         if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
885             nested_vmx_msr_check_common(vcpu, e))
886                 return -EINVAL;
887         return 0;
888 }
889
890 static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
891 {
892         struct vcpu_vmx *vmx = to_vmx(vcpu);
893         u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
894                                        vmx->nested.msrs.misc_high);
895
896         return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
897 }
898
899 /*
900  * Load guest's/host's msr at nested entry/exit.
901  * return 0 for success, entry index for failure.
902  *
903  * One of the failure modes for MSR load/store is when a list exceeds the
904  * virtual hardware's capacity. To maintain compatibility with hardware inasmuch
905  * as possible, process all valid entries before failing rather than precheck
906  * for a capacity violation.
907  */
908 static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
909 {
910         u32 i;
911         struct vmx_msr_entry e;
912         u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
913
914         for (i = 0; i < count; i++) {
915                 if (unlikely(i >= max_msr_list_size))
916                         goto fail;
917
918                 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
919                                         &e, sizeof(e))) {
920                         pr_debug_ratelimited(
921                                 "%s cannot read MSR entry (%u, 0x%08llx)\n",
922                                 __func__, i, gpa + i * sizeof(e));
923                         goto fail;
924                 }
925                 if (nested_vmx_load_msr_check(vcpu, &e)) {
926                         pr_debug_ratelimited(
927                                 "%s check failed (%u, 0x%x, 0x%x)\n",
928                                 __func__, i, e.index, e.reserved);
929                         goto fail;
930                 }
931                 if (kvm_set_msr(vcpu, e.index, e.value)) {
932                         pr_debug_ratelimited(
933                                 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
934                                 __func__, i, e.index, e.value);
935                         goto fail;
936                 }
937         }
938         return 0;
939 fail:
940         return i + 1;
941 }
942
943 static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
944 {
945         u64 data;
946         u32 i;
947         struct vmx_msr_entry e;
948         u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
949
950         for (i = 0; i < count; i++) {
951                 if (unlikely(i >= max_msr_list_size))
952                         return -EINVAL;
953
954                 if (kvm_vcpu_read_guest(vcpu,
955                                         gpa + i * sizeof(e),
956                                         &e, 2 * sizeof(u32))) {
957                         pr_debug_ratelimited(
958                                 "%s cannot read MSR entry (%u, 0x%08llx)\n",
959                                 __func__, i, gpa + i * sizeof(e));
960                         return -EINVAL;
961                 }
962                 if (nested_vmx_store_msr_check(vcpu, &e)) {
963                         pr_debug_ratelimited(
964                                 "%s check failed (%u, 0x%x, 0x%x)\n",
965                                 __func__, i, e.index, e.reserved);
966                         return -EINVAL;
967                 }
968                 if (kvm_get_msr(vcpu, e.index, &data)) {
969                         pr_debug_ratelimited(
970                                 "%s cannot read MSR (%u, 0x%x)\n",
971                                 __func__, i, e.index);
972                         return -EINVAL;
973                 }
974                 if (kvm_vcpu_write_guest(vcpu,
975                                          gpa + i * sizeof(e) +
976                                              offsetof(struct vmx_msr_entry, value),
977                                          &data, sizeof(data))) {
978                         pr_debug_ratelimited(
979                                 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
980                                 __func__, i, e.index, data);
981                         return -EINVAL;
982                 }
983         }
984         return 0;
985 }
986
987 static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
988 {
989         unsigned long invalid_mask;
990
991         invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
992         return (val & invalid_mask) == 0;
993 }
994
995 /*
996  * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are
997  * emulating VM entry into a guest with EPT enabled.
998  * Returns 0 on success, 1 on failure. Invalid state exit qualification code
999  * is assigned to entry_failure_code on failure.
1000  */
1001 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
1002                                u32 *entry_failure_code)
1003 {
1004         if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
1005                 if (CC(!nested_cr3_valid(vcpu, cr3))) {
1006                         *entry_failure_code = ENTRY_FAIL_DEFAULT;
1007                         return -EINVAL;
1008                 }
1009
1010                 /*
1011                  * If PAE paging and EPT are both on, CR3 is not used by the CPU and
1012                  * must not be dereferenced.
1013                  */
1014                 if (is_pae_paging(vcpu) && !nested_ept) {
1015                         if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
1016                                 *entry_failure_code = ENTRY_FAIL_PDPTE;
1017                                 return -EINVAL;
1018                         }
1019                 }
1020         }
1021
1022         if (!nested_ept)
1023                 kvm_mmu_new_cr3(vcpu, cr3, false);
1024
1025         vcpu->arch.cr3 = cr3;
1026         kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
1027
1028         kvm_init_mmu(vcpu, false);
1029
1030         return 0;
1031 }
1032
1033 /*
1034  * Returns if KVM is able to config CPU to tag TLB entries
1035  * populated by L2 differently than TLB entries populated
1036  * by L1.
1037  *
1038  * If L1 uses EPT, then TLB entries are tagged with different EPTP.
1039  *
1040  * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
1041  * with different VPID (L1 entries are tagged with vmx->vpid
1042  * while L2 entries are tagged with vmx->nested.vpid02).
1043  */
1044 static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
1045 {
1046         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1047
1048         return nested_cpu_has_ept(vmcs12) ||
1049                (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
1050 }
1051
1052 static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
1053 {
1054         struct vcpu_vmx *vmx = to_vmx(vcpu);
1055
1056         return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
1057 }
1058
1059 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
1060 {
1061         superset &= mask;
1062         subset &= mask;
1063
1064         return (superset | subset) == superset;
1065 }
1066
1067 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
1068 {
1069         const u64 feature_and_reserved =
1070                 /* feature (except bit 48; see below) */
1071                 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
1072                 /* reserved */
1073                 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
1074         u64 vmx_basic = vmx->nested.msrs.basic;
1075
1076         if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
1077                 return -EINVAL;
1078
1079         /*
1080          * KVM does not emulate a version of VMX that constrains physical
1081          * addresses of VMX structures (e.g. VMCS) to 32-bits.
1082          */
1083         if (data & BIT_ULL(48))
1084                 return -EINVAL;
1085
1086         if (vmx_basic_vmcs_revision_id(vmx_basic) !=
1087             vmx_basic_vmcs_revision_id(data))
1088                 return -EINVAL;
1089
1090         if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
1091                 return -EINVAL;
1092
1093         vmx->nested.msrs.basic = data;
1094         return 0;
1095 }
1096
1097 static int
1098 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1099 {
1100         u64 supported;
1101         u32 *lowp, *highp;
1102
1103         switch (msr_index) {
1104         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1105                 lowp = &vmx->nested.msrs.pinbased_ctls_low;
1106                 highp = &vmx->nested.msrs.pinbased_ctls_high;
1107                 break;
1108         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1109                 lowp = &vmx->nested.msrs.procbased_ctls_low;
1110                 highp = &vmx->nested.msrs.procbased_ctls_high;
1111                 break;
1112         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1113                 lowp = &vmx->nested.msrs.exit_ctls_low;
1114                 highp = &vmx->nested.msrs.exit_ctls_high;
1115                 break;
1116         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1117                 lowp = &vmx->nested.msrs.entry_ctls_low;
1118                 highp = &vmx->nested.msrs.entry_ctls_high;
1119                 break;
1120         case MSR_IA32_VMX_PROCBASED_CTLS2:
1121                 lowp = &vmx->nested.msrs.secondary_ctls_low;
1122                 highp = &vmx->nested.msrs.secondary_ctls_high;
1123                 break;
1124         default:
1125                 BUG();
1126         }
1127
1128         supported = vmx_control_msr(*lowp, *highp);
1129
1130         /* Check must-be-1 bits are still 1. */
1131         if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
1132                 return -EINVAL;
1133
1134         /* Check must-be-0 bits are still 0. */
1135         if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
1136                 return -EINVAL;
1137
1138         *lowp = data;
1139         *highp = data >> 32;
1140         return 0;
1141 }
1142
1143 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
1144 {
1145         const u64 feature_and_reserved_bits =
1146                 /* feature */
1147                 BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
1148                 BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
1149                 /* reserved */
1150                 GENMASK_ULL(13, 9) | BIT_ULL(31);
1151         u64 vmx_misc;
1152
1153         vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
1154                                    vmx->nested.msrs.misc_high);
1155
1156         if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
1157                 return -EINVAL;
1158
1159         if ((vmx->nested.msrs.pinbased_ctls_high &
1160              PIN_BASED_VMX_PREEMPTION_TIMER) &&
1161             vmx_misc_preemption_timer_rate(data) !=
1162             vmx_misc_preemption_timer_rate(vmx_misc))
1163                 return -EINVAL;
1164
1165         if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
1166                 return -EINVAL;
1167
1168         if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
1169                 return -EINVAL;
1170
1171         if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
1172                 return -EINVAL;
1173
1174         vmx->nested.msrs.misc_low = data;
1175         vmx->nested.msrs.misc_high = data >> 32;
1176
1177         return 0;
1178 }
1179
1180 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
1181 {
1182         u64 vmx_ept_vpid_cap;
1183
1184         vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
1185                                            vmx->nested.msrs.vpid_caps);
1186
1187         /* Every bit is either reserved or a feature bit. */
1188         if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
1189                 return -EINVAL;
1190
1191         vmx->nested.msrs.ept_caps = data;
1192         vmx->nested.msrs.vpid_caps = data >> 32;
1193         return 0;
1194 }
1195
1196 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1197 {
1198         u64 *msr;
1199
1200         switch (msr_index) {
1201         case MSR_IA32_VMX_CR0_FIXED0:
1202                 msr = &vmx->nested.msrs.cr0_fixed0;
1203                 break;
1204         case MSR_IA32_VMX_CR4_FIXED0:
1205                 msr = &vmx->nested.msrs.cr4_fixed0;
1206                 break;
1207         default:
1208                 BUG();
1209         }
1210
1211         /*
1212          * 1 bits (which indicates bits which "must-be-1" during VMX operation)
1213          * must be 1 in the restored value.
1214          */
1215         if (!is_bitwise_subset(data, *msr, -1ULL))
1216                 return -EINVAL;
1217
1218         *msr = data;
1219         return 0;
1220 }
1221
1222 /*
1223  * Called when userspace is restoring VMX MSRs.
1224  *
1225  * Returns 0 on success, non-0 otherwise.
1226  */
1227 int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1228 {
1229         struct vcpu_vmx *vmx = to_vmx(vcpu);
1230
1231         /*
1232          * Don't allow changes to the VMX capability MSRs while the vCPU
1233          * is in VMX operation.
1234          */
1235         if (vmx->nested.vmxon)
1236                 return -EBUSY;
1237
1238         switch (msr_index) {
1239         case MSR_IA32_VMX_BASIC:
1240                 return vmx_restore_vmx_basic(vmx, data);
1241         case MSR_IA32_VMX_PINBASED_CTLS:
1242         case MSR_IA32_VMX_PROCBASED_CTLS:
1243         case MSR_IA32_VMX_EXIT_CTLS:
1244         case MSR_IA32_VMX_ENTRY_CTLS:
1245                 /*
1246                  * The "non-true" VMX capability MSRs are generated from the
1247                  * "true" MSRs, so we do not support restoring them directly.
1248                  *
1249                  * If userspace wants to emulate VMX_BASIC[55]=0, userspace
1250                  * should restore the "true" MSRs with the must-be-1 bits
1251                  * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
1252                  * DEFAULT SETTINGS".
1253                  */
1254                 return -EINVAL;
1255         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1256         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1257         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1258         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1259         case MSR_IA32_VMX_PROCBASED_CTLS2:
1260                 return vmx_restore_control_msr(vmx, msr_index, data);
1261         case MSR_IA32_VMX_MISC:
1262                 return vmx_restore_vmx_misc(vmx, data);
1263         case MSR_IA32_VMX_CR0_FIXED0:
1264         case MSR_IA32_VMX_CR4_FIXED0:
1265                 return vmx_restore_fixed0_msr(vmx, msr_index, data);
1266         case MSR_IA32_VMX_CR0_FIXED1:
1267         case MSR_IA32_VMX_CR4_FIXED1:
1268                 /*
1269                  * These MSRs are generated based on the vCPU's CPUID, so we
1270                  * do not support restoring them directly.
1271                  */
1272                 return -EINVAL;
1273         case MSR_IA32_VMX_EPT_VPID_CAP:
1274                 return vmx_restore_vmx_ept_vpid_cap(vmx, data);
1275         case MSR_IA32_VMX_VMCS_ENUM:
1276                 vmx->nested.msrs.vmcs_enum = data;
1277                 return 0;
1278         case MSR_IA32_VMX_VMFUNC:
1279                 if (data & ~vmx->nested.msrs.vmfunc_controls)
1280                         return -EINVAL;
1281                 vmx->nested.msrs.vmfunc_controls = data;
1282                 return 0;
1283         default:
1284                 /*
1285                  * The rest of the VMX capability MSRs do not support restore.
1286                  */
1287                 return -EINVAL;
1288         }
1289 }
1290
1291 /* Returns 0 on success, non-0 otherwise. */
1292 int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
1293 {
1294         switch (msr_index) {
1295         case MSR_IA32_VMX_BASIC:
1296                 *pdata = msrs->basic;
1297                 break;
1298         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1299         case MSR_IA32_VMX_PINBASED_CTLS:
1300                 *pdata = vmx_control_msr(
1301                         msrs->pinbased_ctls_low,
1302                         msrs->pinbased_ctls_high);
1303                 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
1304                         *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1305                 break;
1306         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1307         case MSR_IA32_VMX_PROCBASED_CTLS:
1308                 *pdata = vmx_control_msr(
1309                         msrs->procbased_ctls_low,
1310                         msrs->procbased_ctls_high);
1311                 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
1312                         *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1313                 break;
1314         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1315         case MSR_IA32_VMX_EXIT_CTLS:
1316                 *pdata = vmx_control_msr(
1317                         msrs->exit_ctls_low,
1318                         msrs->exit_ctls_high);
1319                 if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
1320                         *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
1321                 break;
1322         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1323         case MSR_IA32_VMX_ENTRY_CTLS:
1324                 *pdata = vmx_control_msr(
1325                         msrs->entry_ctls_low,
1326                         msrs->entry_ctls_high);
1327                 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
1328                         *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
1329                 break;
1330         case MSR_IA32_VMX_MISC:
1331                 *pdata = vmx_control_msr(
1332                         msrs->misc_low,
1333                         msrs->misc_high);
1334                 break;
1335         case MSR_IA32_VMX_CR0_FIXED0:
1336                 *pdata = msrs->cr0_fixed0;
1337                 break;
1338         case MSR_IA32_VMX_CR0_FIXED1:
1339                 *pdata = msrs->cr0_fixed1;
1340                 break;
1341         case MSR_IA32_VMX_CR4_FIXED0:
1342                 *pdata = msrs->cr4_fixed0;
1343                 break;
1344         case MSR_IA32_VMX_CR4_FIXED1:
1345                 *pdata = msrs->cr4_fixed1;
1346                 break;
1347         case MSR_IA32_VMX_VMCS_ENUM:
1348                 *pdata = msrs->vmcs_enum;
1349                 break;
1350         case MSR_IA32_VMX_PROCBASED_CTLS2:
1351                 *pdata = vmx_control_msr(
1352                         msrs->secondary_ctls_low,
1353                         msrs->secondary_ctls_high);
1354                 break;
1355         case MSR_IA32_VMX_EPT_VPID_CAP:
1356                 *pdata = msrs->ept_caps |
1357                         ((u64)msrs->vpid_caps << 32);
1358                 break;
1359         case MSR_IA32_VMX_VMFUNC:
1360                 *pdata = msrs->vmfunc_controls;
1361                 break;
1362         default:
1363                 return 1;
1364         }
1365
1366         return 0;
1367 }
1368
1369 /*
1370  * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
1371  * been modified by the L1 guest.  Note, "writable" in this context means
1372  * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
1373  * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
1374  * VM-exit information fields (which are actually writable if the vCPU is
1375  * configured to support "VMWRITE to any supported field in the VMCS").
1376  */
1377 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
1378 {
1379         struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1380         struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1381         struct shadow_vmcs_field field;
1382         unsigned long val;
1383         int i;
1384
1385         if (WARN_ON(!shadow_vmcs))
1386                 return;
1387
1388         preempt_disable();
1389
1390         vmcs_load(shadow_vmcs);
1391
1392         for (i = 0; i < max_shadow_read_write_fields; i++) {
1393                 field = shadow_read_write_fields[i];
1394                 val = __vmcs_readl(field.encoding);
1395                 vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
1396         }
1397
1398         vmcs_clear(shadow_vmcs);
1399         vmcs_load(vmx->loaded_vmcs->vmcs);
1400
1401         preempt_enable();
1402 }
1403
1404 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
1405 {
1406         const struct shadow_vmcs_field *fields[] = {
1407                 shadow_read_write_fields,
1408                 shadow_read_only_fields
1409         };
1410         const int max_fields[] = {
1411                 max_shadow_read_write_fields,
1412                 max_shadow_read_only_fields
1413         };
1414         struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1415         struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1416         struct shadow_vmcs_field field;
1417         unsigned long val;
1418         int i, q;
1419
1420         if (WARN_ON(!shadow_vmcs))
1421                 return;
1422
1423         vmcs_load(shadow_vmcs);
1424
1425         for (q = 0; q < ARRAY_SIZE(fields); q++) {
1426                 for (i = 0; i < max_fields[q]; i++) {
1427                         field = fields[q][i];
1428                         val = vmcs12_read_any(vmcs12, field.encoding,
1429                                               field.offset);
1430                         __vmcs_writel(field.encoding, val);
1431                 }
1432         }
1433
1434         vmcs_clear(shadow_vmcs);
1435         vmcs_load(vmx->loaded_vmcs->vmcs);
1436 }
1437
1438 static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
1439 {
1440         struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1441         struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1442
1443         /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
1444         vmcs12->tpr_threshold = evmcs->tpr_threshold;
1445         vmcs12->guest_rip = evmcs->guest_rip;
1446
1447         if (unlikely(!(evmcs->hv_clean_fields &
1448                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
1449                 vmcs12->guest_rsp = evmcs->guest_rsp;
1450                 vmcs12->guest_rflags = evmcs->guest_rflags;
1451                 vmcs12->guest_interruptibility_info =
1452                         evmcs->guest_interruptibility_info;
1453         }
1454
1455         if (unlikely(!(evmcs->hv_clean_fields &
1456                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
1457                 vmcs12->cpu_based_vm_exec_control =
1458                         evmcs->cpu_based_vm_exec_control;
1459         }
1460
1461         if (unlikely(!(evmcs->hv_clean_fields &
1462                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
1463                 vmcs12->exception_bitmap = evmcs->exception_bitmap;
1464         }
1465
1466         if (unlikely(!(evmcs->hv_clean_fields &
1467                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
1468                 vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
1469         }
1470
1471         if (unlikely(!(evmcs->hv_clean_fields &
1472                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
1473                 vmcs12->vm_entry_intr_info_field =
1474                         evmcs->vm_entry_intr_info_field;
1475                 vmcs12->vm_entry_exception_error_code =
1476                         evmcs->vm_entry_exception_error_code;
1477                 vmcs12->vm_entry_instruction_len =
1478                         evmcs->vm_entry_instruction_len;
1479         }
1480
1481         if (unlikely(!(evmcs->hv_clean_fields &
1482                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
1483                 vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
1484                 vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
1485                 vmcs12->host_cr0 = evmcs->host_cr0;
1486                 vmcs12->host_cr3 = evmcs->host_cr3;
1487                 vmcs12->host_cr4 = evmcs->host_cr4;
1488                 vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
1489                 vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
1490                 vmcs12->host_rip = evmcs->host_rip;
1491                 vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
1492                 vmcs12->host_es_selector = evmcs->host_es_selector;
1493                 vmcs12->host_cs_selector = evmcs->host_cs_selector;
1494                 vmcs12->host_ss_selector = evmcs->host_ss_selector;
1495                 vmcs12->host_ds_selector = evmcs->host_ds_selector;
1496                 vmcs12->host_fs_selector = evmcs->host_fs_selector;
1497                 vmcs12->host_gs_selector = evmcs->host_gs_selector;
1498                 vmcs12->host_tr_selector = evmcs->host_tr_selector;
1499         }
1500
1501         if (unlikely(!(evmcs->hv_clean_fields &
1502                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
1503                 vmcs12->pin_based_vm_exec_control =
1504                         evmcs->pin_based_vm_exec_control;
1505                 vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
1506                 vmcs12->secondary_vm_exec_control =
1507                         evmcs->secondary_vm_exec_control;
1508         }
1509
1510         if (unlikely(!(evmcs->hv_clean_fields &
1511                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
1512                 vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
1513                 vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
1514         }
1515
1516         if (unlikely(!(evmcs->hv_clean_fields &
1517                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
1518                 vmcs12->msr_bitmap = evmcs->msr_bitmap;
1519         }
1520
1521         if (unlikely(!(evmcs->hv_clean_fields &
1522                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
1523                 vmcs12->guest_es_base = evmcs->guest_es_base;
1524                 vmcs12->guest_cs_base = evmcs->guest_cs_base;
1525                 vmcs12->guest_ss_base = evmcs->guest_ss_base;
1526                 vmcs12->guest_ds_base = evmcs->guest_ds_base;
1527                 vmcs12->guest_fs_base = evmcs->guest_fs_base;
1528                 vmcs12->guest_gs_base = evmcs->guest_gs_base;
1529                 vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
1530                 vmcs12->guest_tr_base = evmcs->guest_tr_base;
1531                 vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
1532                 vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
1533                 vmcs12->guest_es_limit = evmcs->guest_es_limit;
1534                 vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
1535                 vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
1536                 vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
1537                 vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
1538                 vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
1539                 vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
1540                 vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
1541                 vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
1542                 vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
1543                 vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
1544                 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
1545                 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
1546                 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
1547                 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
1548                 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
1549                 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
1550                 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
1551                 vmcs12->guest_es_selector = evmcs->guest_es_selector;
1552                 vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
1553                 vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
1554                 vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
1555                 vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
1556                 vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
1557                 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
1558                 vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
1559         }
1560
1561         if (unlikely(!(evmcs->hv_clean_fields &
1562                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
1563                 vmcs12->tsc_offset = evmcs->tsc_offset;
1564                 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
1565                 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
1566         }
1567
1568         if (unlikely(!(evmcs->hv_clean_fields &
1569                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
1570                 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
1571                 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
1572                 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
1573                 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
1574                 vmcs12->guest_cr0 = evmcs->guest_cr0;
1575                 vmcs12->guest_cr3 = evmcs->guest_cr3;
1576                 vmcs12->guest_cr4 = evmcs->guest_cr4;
1577                 vmcs12->guest_dr7 = evmcs->guest_dr7;
1578         }
1579
1580         if (unlikely(!(evmcs->hv_clean_fields &
1581                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
1582                 vmcs12->host_fs_base = evmcs->host_fs_base;
1583                 vmcs12->host_gs_base = evmcs->host_gs_base;
1584                 vmcs12->host_tr_base = evmcs->host_tr_base;
1585                 vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
1586                 vmcs12->host_idtr_base = evmcs->host_idtr_base;
1587                 vmcs12->host_rsp = evmcs->host_rsp;
1588         }
1589
1590         if (unlikely(!(evmcs->hv_clean_fields &
1591                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
1592                 vmcs12->ept_pointer = evmcs->ept_pointer;
1593                 vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
1594         }
1595
1596         if (unlikely(!(evmcs->hv_clean_fields &
1597                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
1598                 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
1599                 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
1600                 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
1601                 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
1602                 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
1603                 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
1604                 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
1605                 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
1606                 vmcs12->guest_pending_dbg_exceptions =
1607                         evmcs->guest_pending_dbg_exceptions;
1608                 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
1609                 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
1610                 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
1611                 vmcs12->guest_activity_state = evmcs->guest_activity_state;
1612                 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
1613         }
1614
1615         /*
1616          * Not used?
1617          * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
1618          * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
1619          * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
1620          * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0;
1621          * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1;
1622          * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2;
1623          * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3;
1624          * vmcs12->page_fault_error_code_mask =
1625          *              evmcs->page_fault_error_code_mask;
1626          * vmcs12->page_fault_error_code_match =
1627          *              evmcs->page_fault_error_code_match;
1628          * vmcs12->cr3_target_count = evmcs->cr3_target_count;
1629          * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
1630          * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
1631          * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
1632          */
1633
1634         /*
1635          * Read only fields:
1636          * vmcs12->guest_physical_address = evmcs->guest_physical_address;
1637          * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
1638          * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
1639          * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
1640          * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
1641          * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
1642          * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
1643          * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
1644          * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
1645          * vmcs12->exit_qualification = evmcs->exit_qualification;
1646          * vmcs12->guest_linear_address = evmcs->guest_linear_address;
1647          *
1648          * Not present in struct vmcs12:
1649          * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
1650          * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
1651          * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
1652          * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
1653          */
1654
1655         return 0;
1656 }
1657
1658 static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
1659 {
1660         struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1661         struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1662
1663         /*
1664          * Should not be changed by KVM:
1665          *
1666          * evmcs->host_es_selector = vmcs12->host_es_selector;
1667          * evmcs->host_cs_selector = vmcs12->host_cs_selector;
1668          * evmcs->host_ss_selector = vmcs12->host_ss_selector;
1669          * evmcs->host_ds_selector = vmcs12->host_ds_selector;
1670          * evmcs->host_fs_selector = vmcs12->host_fs_selector;
1671          * evmcs->host_gs_selector = vmcs12->host_gs_selector;
1672          * evmcs->host_tr_selector = vmcs12->host_tr_selector;
1673          * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
1674          * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
1675          * evmcs->host_cr0 = vmcs12->host_cr0;
1676          * evmcs->host_cr3 = vmcs12->host_cr3;
1677          * evmcs->host_cr4 = vmcs12->host_cr4;
1678          * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
1679          * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
1680          * evmcs->host_rip = vmcs12->host_rip;
1681          * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
1682          * evmcs->host_fs_base = vmcs12->host_fs_base;
1683          * evmcs->host_gs_base = vmcs12->host_gs_base;
1684          * evmcs->host_tr_base = vmcs12->host_tr_base;
1685          * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
1686          * evmcs->host_idtr_base = vmcs12->host_idtr_base;
1687          * evmcs->host_rsp = vmcs12->host_rsp;
1688          * sync_vmcs02_to_vmcs12() doesn't read these:
1689          * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
1690          * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
1691          * evmcs->msr_bitmap = vmcs12->msr_bitmap;
1692          * evmcs->ept_pointer = vmcs12->ept_pointer;
1693          * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
1694          * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
1695          * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
1696          * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
1697          * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0;
1698          * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1;
1699          * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2;
1700          * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3;
1701          * evmcs->tpr_threshold = vmcs12->tpr_threshold;
1702          * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
1703          * evmcs->exception_bitmap = vmcs12->exception_bitmap;
1704          * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
1705          * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
1706          * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
1707          * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
1708          * evmcs->page_fault_error_code_mask =
1709          *              vmcs12->page_fault_error_code_mask;
1710          * evmcs->page_fault_error_code_match =
1711          *              vmcs12->page_fault_error_code_match;
1712          * evmcs->cr3_target_count = vmcs12->cr3_target_count;
1713          * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
1714          * evmcs->tsc_offset = vmcs12->tsc_offset;
1715          * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
1716          * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
1717          * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
1718          * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
1719          * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
1720          * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
1721          * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
1722          * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
1723          *
1724          * Not present in struct vmcs12:
1725          * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
1726          * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
1727          * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
1728          * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
1729          */
1730
1731         evmcs->guest_es_selector = vmcs12->guest_es_selector;
1732         evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
1733         evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
1734         evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
1735         evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
1736         evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
1737         evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
1738         evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
1739
1740         evmcs->guest_es_limit = vmcs12->guest_es_limit;
1741         evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
1742         evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
1743         evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
1744         evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
1745         evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
1746         evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
1747         evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
1748         evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
1749         evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
1750
1751         evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
1752         evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
1753         evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
1754         evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
1755         evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
1756         evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
1757         evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
1758         evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
1759
1760         evmcs->guest_es_base = vmcs12->guest_es_base;
1761         evmcs->guest_cs_base = vmcs12->guest_cs_base;
1762         evmcs->guest_ss_base = vmcs12->guest_ss_base;
1763         evmcs->guest_ds_base = vmcs12->guest_ds_base;
1764         evmcs->guest_fs_base = vmcs12->guest_fs_base;
1765         evmcs->guest_gs_base = vmcs12->guest_gs_base;
1766         evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
1767         evmcs->guest_tr_base = vmcs12->guest_tr_base;
1768         evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
1769         evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
1770
1771         evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
1772         evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
1773
1774         evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
1775         evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
1776         evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
1777         evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
1778
1779         evmcs->guest_pending_dbg_exceptions =
1780                 vmcs12->guest_pending_dbg_exceptions;
1781         evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
1782         evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
1783
1784         evmcs->guest_activity_state = vmcs12->guest_activity_state;
1785         evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
1786
1787         evmcs->guest_cr0 = vmcs12->guest_cr0;
1788         evmcs->guest_cr3 = vmcs12->guest_cr3;
1789         evmcs->guest_cr4 = vmcs12->guest_cr4;
1790         evmcs->guest_dr7 = vmcs12->guest_dr7;
1791
1792         evmcs->guest_physical_address = vmcs12->guest_physical_address;
1793
1794         evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
1795         evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
1796         evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
1797         evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
1798         evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
1799         evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
1800         evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
1801         evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
1802
1803         evmcs->exit_qualification = vmcs12->exit_qualification;
1804
1805         evmcs->guest_linear_address = vmcs12->guest_linear_address;
1806         evmcs->guest_rsp = vmcs12->guest_rsp;
1807         evmcs->guest_rflags = vmcs12->guest_rflags;
1808
1809         evmcs->guest_interruptibility_info =
1810                 vmcs12->guest_interruptibility_info;
1811         evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
1812         evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
1813         evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
1814         evmcs->vm_entry_exception_error_code =
1815                 vmcs12->vm_entry_exception_error_code;
1816         evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
1817
1818         evmcs->guest_rip = vmcs12->guest_rip;
1819
1820         evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
1821
1822         return 0;
1823 }
1824
1825 /*
1826  * This is an equivalent of the nested hypervisor executing the vmptrld
1827  * instruction.
1828  */
1829 static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
1830                                                  bool from_launch)
1831 {
1832         struct vcpu_vmx *vmx = to_vmx(vcpu);
1833         bool evmcs_gpa_changed = false;
1834         u64 evmcs_gpa;
1835
1836         if (likely(!vmx->nested.enlightened_vmcs_enabled))
1837                 return 1;
1838
1839         if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
1840                 return 1;
1841
1842         if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
1843                 if (!vmx->nested.hv_evmcs)
1844                         vmx->nested.current_vmptr = -1ull;
1845
1846                 nested_release_evmcs(vcpu);
1847
1848                 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
1849                                  &vmx->nested.hv_evmcs_map))
1850                         return 0;
1851
1852                 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
1853
1854                 /*
1855                  * Currently, KVM only supports eVMCS version 1
1856                  * (== KVM_EVMCS_VERSION) and thus we expect guest to set this
1857                  * value to first u32 field of eVMCS which should specify eVMCS
1858                  * VersionNumber.
1859                  *
1860                  * Guest should be aware of supported eVMCS versions by host by
1861                  * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is
1862                  * expected to set this CPUID leaf according to the value
1863                  * returned in vmcs_version from nested_enable_evmcs().
1864                  *
1865                  * However, it turns out that Microsoft Hyper-V fails to comply
1866                  * to their own invented interface: When Hyper-V use eVMCS, it
1867                  * just sets first u32 field of eVMCS to revision_id specified
1868                  * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number
1869                  * which is one of the supported versions specified in
1870                  * CPUID.0x4000000A.EAX[0:15].
1871                  *
1872                  * To overcome Hyper-V bug, we accept here either a supported
1873                  * eVMCS version or VMCS12 revision_id as valid values for first
1874                  * u32 field of eVMCS.
1875                  */
1876                 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
1877                     (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
1878                         nested_release_evmcs(vcpu);
1879                         return 0;
1880                 }
1881
1882                 vmx->nested.dirty_vmcs12 = true;
1883                 vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
1884
1885                 evmcs_gpa_changed = true;
1886                 /*
1887                  * Unlike normal vmcs12, enlightened vmcs12 is not fully
1888                  * reloaded from guest's memory (read only fields, fields not
1889                  * present in struct hv_enlightened_vmcs, ...). Make sure there
1890                  * are no leftovers.
1891                  */
1892                 if (from_launch) {
1893                         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1894                         memset(vmcs12, 0, sizeof(*vmcs12));
1895                         vmcs12->hdr.revision_id = VMCS12_REVISION;
1896                 }
1897
1898         }
1899
1900         /*
1901          * Clean fields data can't de used on VMLAUNCH and when we switch
1902          * between different L2 guests as KVM keeps a single VMCS12 per L1.
1903          */
1904         if (from_launch || evmcs_gpa_changed)
1905                 vmx->nested.hv_evmcs->hv_clean_fields &=
1906                         ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
1907
1908         return 1;
1909 }
1910
1911 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
1912 {
1913         struct vcpu_vmx *vmx = to_vmx(vcpu);
1914
1915         /*
1916          * hv_evmcs may end up being not mapped after migration (when
1917          * L2 was running), map it here to make sure vmcs12 changes are
1918          * properly reflected.
1919          */
1920         if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs)
1921                 nested_vmx_handle_enlightened_vmptrld(vcpu, false);
1922
1923         if (vmx->nested.hv_evmcs) {
1924                 copy_vmcs12_to_enlightened(vmx);
1925                 /* All fields are clean */
1926                 vmx->nested.hv_evmcs->hv_clean_fields |=
1927                         HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
1928         } else {
1929                 copy_vmcs12_to_shadow(vmx);
1930         }
1931
1932         vmx->nested.need_vmcs12_to_shadow_sync = false;
1933 }
1934
1935 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
1936 {
1937         struct vcpu_vmx *vmx =
1938                 container_of(timer, struct vcpu_vmx, nested.preemption_timer);
1939
1940         vmx->nested.preemption_timer_expired = true;
1941         kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
1942         kvm_vcpu_kick(&vmx->vcpu);
1943
1944         return HRTIMER_NORESTART;
1945 }
1946
1947 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
1948 {
1949         u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
1950         struct vcpu_vmx *vmx = to_vmx(vcpu);
1951
1952         /*
1953          * A timer value of zero is architecturally guaranteed to cause
1954          * a VMExit prior to executing any instructions in the guest.
1955          */
1956         if (preemption_timeout == 0) {
1957                 vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
1958                 return;
1959         }
1960
1961         if (vcpu->arch.virtual_tsc_khz == 0)
1962                 return;
1963
1964         preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
1965         preemption_timeout *= 1000000;
1966         do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
1967         hrtimer_start(&vmx->nested.preemption_timer,
1968                       ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
1969 }
1970
1971 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
1972 {
1973         if (vmx->nested.nested_run_pending &&
1974             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
1975                 return vmcs12->guest_ia32_efer;
1976         else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
1977                 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
1978         else
1979                 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
1980 }
1981
1982 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
1983 {
1984         /*
1985          * If vmcs02 hasn't been initialized, set the constant vmcs02 state
1986          * according to L0's settings (vmcs12 is irrelevant here).  Host
1987          * fields that come from L0 and are not constant, e.g. HOST_CR3,
1988          * will be set as needed prior to VMLAUNCH/VMRESUME.
1989          */
1990         if (vmx->nested.vmcs02_initialized)
1991                 return;
1992         vmx->nested.vmcs02_initialized = true;
1993
1994         /*
1995          * We don't care what the EPTP value is we just need to guarantee
1996          * it's valid so we don't get a false positive when doing early
1997          * consistency checks.
1998          */
1999         if (enable_ept && nested_early_check)
2000                 vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0));
2001
2002         /* All VMFUNCs are currently emulated through L0 vmexits.  */
2003         if (cpu_has_vmx_vmfunc())
2004                 vmcs_write64(VM_FUNCTION_CONTROL, 0);
2005
2006         if (cpu_has_vmx_posted_intr())
2007                 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
2008
2009         if (cpu_has_vmx_msr_bitmap())
2010                 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
2011
2012         /*
2013          * The PML address never changes, so it is constant in vmcs02.
2014          * Conceptually we want to copy the PML index from vmcs01 here,
2015          * and then back to vmcs01 on nested vmexit.  But since we flush
2016          * the log and reset GUEST_PML_INDEX on each vmexit, the PML
2017          * index is also effectively constant in vmcs02.
2018          */
2019         if (enable_pml) {
2020                 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
2021                 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
2022         }
2023
2024         if (cpu_has_vmx_encls_vmexit())
2025                 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
2026
2027         /*
2028          * Set the MSR load/store lists to match L0's settings.  Only the
2029          * addresses are constant (for vmcs02), the counts can change based
2030          * on L2's behavior, e.g. switching to/from long mode.
2031          */
2032         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
2033         vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
2034         vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
2035
2036         vmx_set_constant_host_state(vmx);
2037 }
2038
2039 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
2040                                       struct vmcs12 *vmcs12)
2041 {
2042         prepare_vmcs02_constant_state(vmx);
2043
2044         vmcs_write64(VMCS_LINK_POINTER, -1ull);
2045
2046         if (enable_vpid) {
2047                 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
2048                         vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
2049                 else
2050                         vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2051         }
2052 }
2053
2054 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2055 {
2056         u32 exec_control, vmcs12_exec_ctrl;
2057         u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
2058
2059         if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
2060                 prepare_vmcs02_early_rare(vmx, vmcs12);
2061
2062         /*
2063          * PIN CONTROLS
2064          */
2065         exec_control = vmx_pin_based_exec_ctrl(vmx);
2066         exec_control |= (vmcs12->pin_based_vm_exec_control &
2067                          ~PIN_BASED_VMX_PREEMPTION_TIMER);
2068
2069         /* Posted interrupts setting is only taken from vmcs12.  */
2070         if (nested_cpu_has_posted_intr(vmcs12)) {
2071                 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
2072                 vmx->nested.pi_pending = false;
2073         } else {
2074                 exec_control &= ~PIN_BASED_POSTED_INTR;
2075         }
2076         pin_controls_set(vmx, exec_control);
2077
2078         /*
2079          * EXEC CONTROLS
2080          */
2081         exec_control = vmx_exec_control(vmx); /* L0's desires */
2082         exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2083         exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
2084         exec_control &= ~CPU_BASED_TPR_SHADOW;
2085         exec_control |= vmcs12->cpu_based_vm_exec_control;
2086
2087         vmx->nested.l1_tpr_threshold = -1;
2088         if (exec_control & CPU_BASED_TPR_SHADOW)
2089                 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
2090 #ifdef CONFIG_X86_64
2091         else
2092                 exec_control |= CPU_BASED_CR8_LOAD_EXITING |
2093                                 CPU_BASED_CR8_STORE_EXITING;
2094 #endif
2095
2096         /*
2097          * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
2098          * for I/O port accesses.
2099          */
2100         exec_control |= CPU_BASED_UNCOND_IO_EXITING;
2101         exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
2102
2103         /*
2104          * This bit will be computed in nested_get_vmcs12_pages, because
2105          * we do not have access to L1's MSR bitmap yet.  For now, keep
2106          * the same bit as before, hoping to avoid multiple VMWRITEs that
2107          * only set/clear this bit.
2108          */
2109         exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
2110         exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;
2111
2112         exec_controls_set(vmx, exec_control);
2113
2114         /*
2115          * SECONDARY EXEC CONTROLS
2116          */
2117         if (cpu_has_secondary_exec_ctrls()) {
2118                 exec_control = vmx->secondary_exec_control;
2119
2120                 /* Take the following fields only from vmcs12 */
2121                 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2122                                   SECONDARY_EXEC_ENABLE_INVPCID |
2123                                   SECONDARY_EXEC_RDTSCP |
2124                                   SECONDARY_EXEC_XSAVES |
2125                                   SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
2126                                   SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2127                                   SECONDARY_EXEC_APIC_REGISTER_VIRT |
2128                                   SECONDARY_EXEC_ENABLE_VMFUNC);
2129                 if (nested_cpu_has(vmcs12,
2130                                    CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
2131                         vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
2132                                 ~SECONDARY_EXEC_ENABLE_PML;
2133                         exec_control |= vmcs12_exec_ctrl;
2134                 }
2135
2136                 /* VMCS shadowing for L2 is emulated for now */
2137                 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
2138
2139                 /*
2140                  * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
2141                  * will not have to rewrite the controls just for this bit.
2142                  */
2143                 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() &&
2144                     (vmcs12->guest_cr4 & X86_CR4_UMIP))
2145                         exec_control |= SECONDARY_EXEC_DESC;
2146
2147                 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
2148                         vmcs_write16(GUEST_INTR_STATUS,
2149                                 vmcs12->guest_intr_status);
2150
2151                 secondary_exec_controls_set(vmx, exec_control);
2152         }
2153
2154         /*
2155          * ENTRY CONTROLS
2156          *
2157          * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
2158          * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
2159          * on the related bits (if supported by the CPU) in the hope that
2160          * we can avoid VMWrites during vmx_set_efer().
2161          */
2162         exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) &
2163                         ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
2164         if (cpu_has_load_ia32_efer()) {
2165                 if (guest_efer & EFER_LMA)
2166                         exec_control |= VM_ENTRY_IA32E_MODE;
2167                 if (guest_efer != host_efer)
2168                         exec_control |= VM_ENTRY_LOAD_IA32_EFER;
2169         }
2170         vm_entry_controls_set(vmx, exec_control);
2171
2172         /*
2173          * EXIT CONTROLS
2174          *
2175          * L2->L1 exit controls are emulated - the hardware exit is to L0 so
2176          * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
2177          * bits may be modified by vmx_set_efer() in prepare_vmcs02().
2178          */
2179         exec_control = vmx_vmexit_ctrl();
2180         if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
2181                 exec_control |= VM_EXIT_LOAD_IA32_EFER;
2182         vm_exit_controls_set(vmx, exec_control);
2183
2184         /*
2185          * Interrupt/Exception Fields
2186          */
2187         if (vmx->nested.nested_run_pending) {
2188                 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2189                              vmcs12->vm_entry_intr_info_field);
2190                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2191                              vmcs12->vm_entry_exception_error_code);
2192                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2193                              vmcs12->vm_entry_instruction_len);
2194                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2195                              vmcs12->guest_interruptibility_info);
2196                 vmx->loaded_vmcs->nmi_known_unmasked =
2197                         !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
2198         } else {
2199                 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
2200         }
2201 }
2202
2203 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2204 {
2205         struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2206
2207         if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2208                            HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
2209                 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
2210                 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
2211                 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
2212                 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
2213                 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
2214                 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
2215                 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
2216                 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
2217                 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
2218                 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
2219                 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
2220                 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
2221                 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
2222                 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
2223                 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
2224                 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
2225                 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
2226                 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
2227                 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
2228                 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
2229                 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
2230                 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
2231                 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
2232                 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
2233                 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
2234                 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
2235                 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
2236                 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
2237                 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
2238                 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
2239                 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
2240                 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
2241                 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
2242                 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
2243                 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
2244                 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
2245         }
2246
2247         if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2248                            HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
2249                 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
2250                 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
2251                             vmcs12->guest_pending_dbg_exceptions);
2252                 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
2253                 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
2254
2255                 /*
2256                  * L1 may access the L2's PDPTR, so save them to construct
2257                  * vmcs12
2258                  */
2259                 if (enable_ept) {
2260                         vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2261                         vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2262                         vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2263                         vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2264                 }
2265
2266                 if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
2267                     (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
2268                         vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
2269         }
2270
2271         if (nested_cpu_has_xsaves(vmcs12))
2272                 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
2273
2274         /*
2275          * Whether page-faults are trapped is determined by a combination of
2276          * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
2277          * If enable_ept, L0 doesn't care about page faults and we should
2278          * set all of these to L1's desires. However, if !enable_ept, L0 does
2279          * care about (at least some) page faults, and because it is not easy
2280          * (if at all possible?) to merge L0 and L1's desires, we simply ask
2281          * to exit on each and every L2 page fault. This is done by setting
2282          * MASK=MATCH=0 and (see below) EB.PF=1.
2283          * Note that below we don't need special code to set EB.PF beyond the
2284          * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
2285          * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
2286          * !enable_ept, EB.PF is 1, so the "or" will always be 1.
2287          */
2288         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
2289                 enable_ept ? vmcs12->page_fault_error_code_mask : 0);
2290         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
2291                 enable_ept ? vmcs12->page_fault_error_code_match : 0);
2292
2293         if (cpu_has_vmx_apicv()) {
2294                 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
2295                 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
2296                 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
2297                 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
2298         }
2299
2300         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2301         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2302
2303         set_cr4_guest_host_mask(vmx);
2304 }
2305
2306 /*
2307  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
2308  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
2309  * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
2310  * guest in a way that will both be appropriate to L1's requests, and our
2311  * needs. In addition to modifying the active vmcs (which is vmcs02), this
2312  * function also has additional necessary side-effects, like setting various
2313  * vcpu->arch fields.
2314  * Returns 0 on success, 1 on failure. Invalid state exit qualification code
2315  * is assigned to entry_failure_code on failure.
2316  */
2317 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2318                           u32 *entry_failure_code)
2319 {
2320         struct vcpu_vmx *vmx = to_vmx(vcpu);
2321         struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2322         bool load_guest_pdptrs_vmcs12 = false;
2323
2324         if (vmx->nested.dirty_vmcs12 || hv_evmcs) {
2325                 prepare_vmcs02_rare(vmx, vmcs12);
2326                 vmx->nested.dirty_vmcs12 = false;
2327
2328                 load_guest_pdptrs_vmcs12 = !hv_evmcs ||
2329                         !(hv_evmcs->hv_clean_fields &
2330                           HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
2331         }
2332
2333         if (vmx->nested.nested_run_pending &&
2334             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
2335                 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
2336                 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
2337         } else {
2338                 kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
2339                 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
2340         }
2341         if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
2342             !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
2343                 vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
2344         vmx_set_rflags(vcpu, vmcs12->guest_rflags);
2345
2346         /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
2347          * bitwise-or of what L1 wants to trap for L2, and what we want to
2348          * trap. Note that CR0.TS also needs updating - we do this later.
2349          */
2350         update_exception_bitmap(vcpu);
2351         vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
2352         vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2353
2354         if (vmx->nested.nested_run_pending &&
2355             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
2356                 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
2357                 vcpu->arch.pat = vmcs12->guest_ia32_pat;
2358         } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2359                 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
2360         }
2361
2362         vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
2363
2364         if (kvm_has_tsc_control)
2365                 decache_tsc_multiplier(vmx);
2366
2367         if (enable_vpid) {
2368                 /*
2369                  * There is no direct mapping between vpid02 and vpid12, the
2370                  * vpid02 is per-vCPU for L0 and reused while the value of
2371                  * vpid12 is changed w/ one invvpid during nested vmentry.
2372                  * The vpid12 is allocated by L1 for L2, so it will not
2373                  * influence global bitmap(for vpid01 and vpid02 allocation)
2374                  * even if spawn a lot of nested vCPUs.
2375                  */
2376                 if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
2377                         if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
2378                                 vmx->nested.last_vpid = vmcs12->virtual_processor_id;
2379                                 __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false);
2380                         }
2381                 } else {
2382                         /*
2383                          * If L1 use EPT, then L0 needs to execute INVEPT on
2384                          * EPTP02 instead of EPTP01. Therefore, delay TLB
2385                          * flush until vmcs02->eptp is fully updated by
2386                          * KVM_REQ_LOAD_CR3. Note that this assumes
2387                          * KVM_REQ_TLB_FLUSH is evaluated after
2388                          * KVM_REQ_LOAD_CR3 in vcpu_enter_guest().
2389                          */
2390                         kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2391                 }
2392         }
2393
2394         if (nested_cpu_has_ept(vmcs12))
2395                 nested_ept_init_mmu_context(vcpu);
2396         else if (nested_cpu_has2(vmcs12,
2397                                  SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2398                 vmx_flush_tlb(vcpu, true);
2399
2400         /*
2401          * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
2402          * bits which we consider mandatory enabled.
2403          * The CR0_READ_SHADOW is what L2 should have expected to read given
2404          * the specifications by L1; It's not enough to take
2405          * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we
2406          * have more bits than L1 expected.
2407          */
2408         vmx_set_cr0(vcpu, vmcs12->guest_cr0);
2409         vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2410
2411         vmx_set_cr4(vcpu, vmcs12->guest_cr4);
2412         vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
2413
2414         vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
2415         /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
2416         vmx_set_efer(vcpu, vcpu->arch.efer);
2417
2418         /*
2419          * Guest state is invalid and unrestricted guest is disabled,
2420          * which means L1 attempted VMEntry to L2 with invalid state.
2421          * Fail the VMEntry.
2422          */
2423         if (vmx->emulation_required) {
2424                 *entry_failure_code = ENTRY_FAIL_DEFAULT;
2425                 return -EINVAL;
2426         }
2427
2428         /* Shadow page tables on either EPT or shadow page tables. */
2429         if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
2430                                 entry_failure_code))
2431                 return -EINVAL;
2432
2433         /*
2434          * Immediately write vmcs02.GUEST_CR3.  It will be propagated to vmcs12
2435          * on nested VM-Exit, which can occur without actually running L2 and
2436          * thus without hitting vmx_set_cr3(), e.g. if L1 is entering L2 with
2437          * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
2438          * transition to HLT instead of running L2.
2439          */
2440         if (enable_ept)
2441                 vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
2442
2443         /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
2444         if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
2445             is_pae_paging(vcpu)) {
2446                 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2447                 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2448                 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2449                 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2450         }
2451
2452         if (!enable_ept)
2453                 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
2454
2455         if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2456             SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
2457                             vmcs12->guest_ia32_perf_global_ctrl))
2458                 return -EINVAL;
2459
2460         kvm_rsp_write(vcpu, vmcs12->guest_rsp);
2461         kvm_rip_write(vcpu, vmcs12->guest_rip);
2462         return 0;
2463 }
2464
2465 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
2466 {
2467         if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
2468                nested_cpu_has_virtual_nmis(vmcs12)))
2469                 return -EINVAL;
2470
2471         if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
2472                nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING)))
2473                 return -EINVAL;
2474
2475         return 0;
2476 }
2477
2478 static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
2479 {
2480         struct vcpu_vmx *vmx = to_vmx(vcpu);
2481         int maxphyaddr = cpuid_maxphyaddr(vcpu);
2482
2483         /* Check for memory type validity */
2484         switch (address & VMX_EPTP_MT_MASK) {
2485         case VMX_EPTP_MT_UC:
2486                 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
2487                         return false;
2488                 break;
2489         case VMX_EPTP_MT_WB:
2490                 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
2491                         return false;
2492                 break;
2493         default:
2494                 return false;
2495         }
2496
2497         /* only 4 levels page-walk length are valid */
2498         if (CC((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4))
2499                 return false;
2500
2501         /* Reserved bits should not be set */
2502         if (CC(address >> maxphyaddr || ((address >> 7) & 0x1f)))
2503                 return false;
2504
2505         /* AD, if set, should be supported */
2506         if (address & VMX_EPTP_AD_ENABLE_BIT) {
2507                 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
2508                         return false;
2509         }
2510
2511         return true;
2512 }
2513
2514 /*
2515  * Checks related to VM-Execution Control Fields
2516  */
2517 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
2518                                               struct vmcs12 *vmcs12)
2519 {
2520         struct vcpu_vmx *vmx = to_vmx(vcpu);
2521
2522         if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
2523                                    vmx->nested.msrs.pinbased_ctls_low,
2524                                    vmx->nested.msrs.pinbased_ctls_high)) ||
2525             CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
2526                                    vmx->nested.msrs.procbased_ctls_low,
2527                                    vmx->nested.msrs.procbased_ctls_high)))
2528                 return -EINVAL;
2529
2530         if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
2531             CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control,
2532                                    vmx->nested.msrs.secondary_ctls_low,
2533                                    vmx->nested.msrs.secondary_ctls_high)))
2534                 return -EINVAL;
2535
2536         if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) ||
2537             nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
2538             nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
2539             nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
2540             nested_vmx_check_apic_access_controls(vcpu, vmcs12) ||
2541             nested_vmx_check_apicv_controls(vcpu, vmcs12) ||
2542             nested_vmx_check_nmi_controls(vmcs12) ||
2543             nested_vmx_check_pml_controls(vcpu, vmcs12) ||
2544             nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
2545             nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
2546             nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
2547             CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
2548                 return -EINVAL;
2549
2550         if (!nested_cpu_has_preemption_timer(vmcs12) &&
2551             nested_cpu_has_save_preemption_timer(vmcs12))
2552                 return -EINVAL;
2553
2554         if (nested_cpu_has_ept(vmcs12) &&
2555             CC(!valid_ept_address(vcpu, vmcs12->ept_pointer)))
2556                 return -EINVAL;
2557
2558         if (nested_cpu_has_vmfunc(vmcs12)) {
2559                 if (CC(vmcs12->vm_function_control &
2560                        ~vmx->nested.msrs.vmfunc_controls))
2561                         return -EINVAL;
2562
2563                 if (nested_cpu_has_eptp_switching(vmcs12)) {
2564                         if (CC(!nested_cpu_has_ept(vmcs12)) ||
2565                             CC(!page_address_valid(vcpu, vmcs12->eptp_list_address)))
2566                                 return -EINVAL;
2567                 }
2568         }
2569
2570         return 0;
2571 }
2572
2573 /*
2574  * Checks related to VM-Exit Control Fields
2575  */
2576 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
2577                                          struct vmcs12 *vmcs12)
2578 {
2579         struct vcpu_vmx *vmx = to_vmx(vcpu);
2580
2581         if (CC(!vmx_control_verify(vmcs12->vm_exit_controls,
2582                                     vmx->nested.msrs.exit_ctls_low,
2583                                     vmx->nested.msrs.exit_ctls_high)) ||
2584             CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)))
2585                 return -EINVAL;
2586
2587         return 0;
2588 }
2589
2590 /*
2591  * Checks related to VM-Entry Control Fields
2592  */
2593 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
2594                                           struct vmcs12 *vmcs12)
2595 {
2596         struct vcpu_vmx *vmx = to_vmx(vcpu);
2597
2598         if (CC(!vmx_control_verify(vmcs12->vm_entry_controls,
2599                                     vmx->nested.msrs.entry_ctls_low,
2600                                     vmx->nested.msrs.entry_ctls_high)))
2601                 return -EINVAL;
2602
2603         /*
2604          * From the Intel SDM, volume 3:
2605          * Fields relevant to VM-entry event injection must be set properly.
2606          * These fields are the VM-entry interruption-information field, the
2607          * VM-entry exception error code, and the VM-entry instruction length.
2608          */
2609         if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
2610                 u32 intr_info = vmcs12->vm_entry_intr_info_field;
2611                 u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
2612                 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
2613                 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
2614                 bool should_have_error_code;
2615                 bool urg = nested_cpu_has2(vmcs12,
2616                                            SECONDARY_EXEC_UNRESTRICTED_GUEST);
2617                 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
2618
2619                 /* VM-entry interruption-info field: interruption type */
2620                 if (CC(intr_type == INTR_TYPE_RESERVED) ||
2621                     CC(intr_type == INTR_TYPE_OTHER_EVENT &&
2622                        !nested_cpu_supports_monitor_trap_flag(vcpu)))
2623                         return -EINVAL;
2624
2625                 /* VM-entry interruption-info field: vector */
2626                 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
2627                     CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
2628                     CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
2629                         return -EINVAL;
2630
2631                 /* VM-entry interruption-info field: deliver error code */
2632                 should_have_error_code =
2633                         intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
2634                         x86_exception_has_error_code(vector);
2635                 if (CC(has_error_code != should_have_error_code))
2636                         return -EINVAL;
2637
2638                 /* VM-entry exception error code */
2639                 if (CC(has_error_code &&
2640                        vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
2641                         return -EINVAL;
2642
2643                 /* VM-entry interruption-info field: reserved bits */
2644                 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK))
2645                         return -EINVAL;
2646
2647                 /* VM-entry instruction length */
2648                 switch (intr_type) {
2649                 case INTR_TYPE_SOFT_EXCEPTION:
2650                 case INTR_TYPE_SOFT_INTR:
2651                 case INTR_TYPE_PRIV_SW_EXCEPTION:
2652                         if (CC(vmcs12->vm_entry_instruction_len > 15) ||
2653                             CC(vmcs12->vm_entry_instruction_len == 0 &&
2654                             CC(!nested_cpu_has_zero_length_injection(vcpu))))
2655                                 return -EINVAL;
2656                 }
2657         }
2658
2659         if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
2660                 return -EINVAL;
2661
2662         return 0;
2663 }
2664
2665 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
2666                                      struct vmcs12 *vmcs12)
2667 {
2668         if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
2669             nested_check_vm_exit_controls(vcpu, vmcs12) ||
2670             nested_check_vm_entry_controls(vcpu, vmcs12))
2671                 return -EINVAL;
2672
2673         return 0;
2674 }
2675
2676 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
2677                                        struct vmcs12 *vmcs12)
2678 {
2679         bool ia32e;
2680
2681         if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
2682             CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
2683             CC(!nested_cr3_valid(vcpu, vmcs12->host_cr3)))
2684                 return -EINVAL;
2685
2686         if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
2687             CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
2688                 return -EINVAL;
2689
2690         if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
2691             CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
2692                 return -EINVAL;
2693
2694         if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2695             CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
2696                                            vmcs12->host_ia32_perf_global_ctrl)))
2697                 return -EINVAL;
2698
2699 #ifdef CONFIG_X86_64
2700         ia32e = !!(vcpu->arch.efer & EFER_LMA);
2701 #else
2702         ia32e = false;
2703 #endif
2704
2705         if (ia32e) {
2706                 if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) ||
2707                     CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
2708                         return -EINVAL;
2709         } else {
2710                 if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) ||
2711                     CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
2712                     CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
2713                     CC((vmcs12->host_rip) >> 32))
2714                         return -EINVAL;
2715         }
2716
2717         if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2718             CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2719             CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2720             CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2721             CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2722             CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2723             CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2724             CC(vmcs12->host_cs_selector == 0) ||
2725             CC(vmcs12->host_tr_selector == 0) ||
2726             CC(vmcs12->host_ss_selector == 0 && !ia32e))
2727                 return -EINVAL;
2728
2729 #ifdef CONFIG_X86_64
2730         if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
2731             CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
2732             CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
2733             CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
2734             CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
2735             CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
2736                 return -EINVAL;
2737 #endif
2738
2739         /*
2740          * If the load IA32_EFER VM-exit control is 1, bits reserved in the
2741          * IA32_EFER MSR must be 0 in the field for that register. In addition,
2742          * the values of the LMA and LME bits in the field must each be that of
2743          * the host address-space size VM-exit control.
2744          */
2745         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
2746                 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) ||
2747                     CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) ||
2748                     CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)))
2749                         return -EINVAL;
2750         }
2751
2752         return 0;
2753 }
2754
2755 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
2756                                           struct vmcs12 *vmcs12)
2757 {
2758         int r = 0;
2759         struct vmcs12 *shadow;
2760         struct kvm_host_map map;
2761
2762         if (vmcs12->vmcs_link_pointer == -1ull)
2763                 return 0;
2764
2765         if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
2766                 return -EINVAL;
2767
2768         if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)))
2769                 return -EINVAL;
2770
2771         shadow = map.hva;
2772
2773         if (CC(shadow->hdr.revision_id != VMCS12_REVISION) ||
2774             CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
2775                 r = -EINVAL;
2776
2777         kvm_vcpu_unmap(vcpu, &map, false);
2778         return r;
2779 }
2780
2781 /*
2782  * Checks related to Guest Non-register State
2783  */
2784 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
2785 {
2786         if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
2787                vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT))
2788                 return -EINVAL;
2789
2790         return 0;
2791 }
2792
2793 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
2794                                         struct vmcs12 *vmcs12,
2795                                         u32 *exit_qual)
2796 {
2797         bool ia32e;
2798
2799         *exit_qual = ENTRY_FAIL_DEFAULT;
2800
2801         if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
2802             CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
2803                 return -EINVAL;
2804
2805         if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
2806             CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
2807                 return -EINVAL;
2808
2809         if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
2810                 *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
2811                 return -EINVAL;
2812         }
2813
2814         if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2815             CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
2816                                            vmcs12->guest_ia32_perf_global_ctrl)))
2817                 return -EINVAL;
2818
2819         /*
2820          * If the load IA32_EFER VM-entry control is 1, the following checks
2821          * are performed on the field for the IA32_EFER MSR:
2822          * - Bits reserved in the IA32_EFER MSR must be 0.
2823          * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
2824          *   the IA-32e mode guest VM-exit control. It must also be identical
2825          *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
2826          *   CR0.PG) is 1.
2827          */
2828         if (to_vmx(vcpu)->nested.nested_run_pending &&
2829             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
2830                 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
2831                 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
2832                     CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
2833                     CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
2834                      ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
2835                         return -EINVAL;
2836         }
2837
2838         if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
2839             (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
2840              CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
2841                 return -EINVAL;
2842
2843         if (nested_check_guest_non_reg_state(vmcs12))
2844                 return -EINVAL;
2845
2846         return 0;
2847 }
2848
2849 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
2850 {
2851         struct vcpu_vmx *vmx = to_vmx(vcpu);
2852         unsigned long cr3, cr4;
2853         bool vm_fail;
2854
2855         if (!nested_early_check)
2856                 return 0;
2857
2858         if (vmx->msr_autoload.host.nr)
2859                 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
2860         if (vmx->msr_autoload.guest.nr)
2861                 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
2862
2863         preempt_disable();
2864
2865         vmx_prepare_switch_to_guest(vcpu);
2866
2867         /*
2868          * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
2869          * which is reserved to '1' by hardware.  GUEST_RFLAGS is guaranteed to
2870          * be written (by preparve_vmcs02()) before the "real" VMEnter, i.e.
2871          * there is no need to preserve other bits or save/restore the field.
2872          */
2873         vmcs_writel(GUEST_RFLAGS, 0);
2874
2875         cr3 = __get_current_cr3_fast();
2876         if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
2877                 vmcs_writel(HOST_CR3, cr3);
2878                 vmx->loaded_vmcs->host_state.cr3 = cr3;
2879         }
2880
2881         cr4 = cr4_read_shadow();
2882         if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
2883                 vmcs_writel(HOST_CR4, cr4);
2884                 vmx->loaded_vmcs->host_state.cr4 = cr4;
2885         }
2886
2887         asm(
2888                 "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
2889                 "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
2890                 "je 1f \n\t"
2891                 __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
2892                 "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
2893                 "1: \n\t"
2894                 "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
2895
2896                 /* Check if vmlaunch or vmresume is needed */
2897                 "cmpb $0, %c[launched](%[loaded_vmcs])\n\t"
2898
2899                 /*
2900                  * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
2901                  * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
2902                  * Valid.  vmx_vmenter() directly "returns" RFLAGS, and so the
2903                  * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
2904                  */
2905                 "call vmx_vmenter\n\t"
2906
2907                 CC_SET(be)
2908               : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail)
2909               : [HOST_RSP]"r"((unsigned long)HOST_RSP),
2910                 [loaded_vmcs]"r"(vmx->loaded_vmcs),
2911                 [launched]"i"(offsetof(struct loaded_vmcs, launched)),
2912                 [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
2913                 [wordsize]"i"(sizeof(ulong))
2914               : "memory"
2915         );
2916
2917         if (vmx->msr_autoload.host.nr)
2918                 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2919         if (vmx->msr_autoload.guest.nr)
2920                 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2921
2922         if (vm_fail) {
2923                 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);
2924
2925                 preempt_enable();
2926
2927                 trace_kvm_nested_vmenter_failed(
2928                         "early hardware check VM-instruction error: ", error);
2929                 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
2930                 return 1;
2931         }
2932
2933         /*
2934          * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
2935          */
2936         local_irq_enable();
2937         if (hw_breakpoint_active())
2938                 set_debugreg(__this_cpu_read(cpu_dr7), 7);
2939         preempt_enable();
2940
2941         /*
2942          * A non-failing VMEntry means we somehow entered guest mode with
2943          * an illegal RIP, and that's just the tip of the iceberg.  There
2944          * is no telling what memory has been modified or what state has
2945          * been exposed to unknown code.  Hitting this all but guarantees
2946          * a (very critical) hardware issue.
2947          */
2948         WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
2949                 VMX_EXIT_REASONS_FAILED_VMENTRY));
2950
2951         return 0;
2952 }
2953
2954 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
2955                                                  struct vmcs12 *vmcs12);
2956
2957 static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
2958 {
2959         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2960         struct vcpu_vmx *vmx = to_vmx(vcpu);
2961         struct kvm_host_map *map;
2962         struct page *page;
2963         u64 hpa;
2964
2965         if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
2966                 /*
2967                  * Translate L1 physical address to host physical
2968                  * address for vmcs02. Keep the page pinned, so this
2969                  * physical address remains valid. We keep a reference
2970                  * to it so we can release it later.
2971                  */
2972                 if (vmx->nested.apic_access_page) { /* shouldn't happen */
2973                         kvm_release_page_dirty(vmx->nested.apic_access_page);
2974                         vmx->nested.apic_access_page = NULL;
2975                 }
2976                 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
2977                 /*
2978                  * If translation failed, no matter: This feature asks
2979                  * to exit when accessing the given address, and if it
2980                  * can never be accessed, this feature won't do
2981                  * anything anyway.
2982                  */
2983                 if (!is_error_page(page)) {
2984                         vmx->nested.apic_access_page = page;
2985                         hpa = page_to_phys(vmx->nested.apic_access_page);
2986                         vmcs_write64(APIC_ACCESS_ADDR, hpa);
2987                 } else {
2988                         secondary_exec_controls_clearbit(vmx,
2989                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
2990                 }
2991         }
2992
2993         if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
2994                 map = &vmx->nested.virtual_apic_map;
2995
2996                 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
2997                         vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
2998                 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
2999                            nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
3000                            !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3001                         /*
3002                          * The processor will never use the TPR shadow, simply
3003                          * clear the bit from the execution control.  Such a
3004                          * configuration is useless, but it happens in tests.
3005                          * For any other configuration, failing the vm entry is
3006                          * _not_ what the processor does but it's basically the
3007                          * only possibility we have.
3008                          */
3009                         exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
3010                 } else {
3011                         /*
3012                          * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
3013                          * force VM-Entry to fail.
3014                          */
3015                         vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
3016                 }
3017         }
3018
3019         if (nested_cpu_has_posted_intr(vmcs12)) {
3020                 map = &vmx->nested.pi_desc_map;
3021
3022                 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
3023                         vmx->nested.pi_desc =
3024                                 (struct pi_desc *)(((void *)map->hva) +
3025                                 offset_in_page(vmcs12->posted_intr_desc_addr));
3026                         vmcs_write64(POSTED_INTR_DESC_ADDR,
3027                                      pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
3028                 }
3029         }
3030         if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
3031                 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
3032         else
3033                 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
3034 }
3035
3036 /*
3037  * Intel's VMX Instruction Reference specifies a common set of prerequisites
3038  * for running VMX instructions (except VMXON, whose prerequisites are
3039  * slightly different). It also specifies what exception to inject otherwise.
3040  * Note that many of these exceptions have priority over VM exits, so they
3041  * don't have to be checked again here.
3042  */
3043 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
3044 {
3045         if (!to_vmx(vcpu)->nested.vmxon) {
3046                 kvm_queue_exception(vcpu, UD_VECTOR);
3047                 return 0;
3048         }
3049
3050         if (vmx_get_cpl(vcpu)) {
3051                 kvm_inject_gp(vcpu, 0);
3052                 return 0;
3053         }
3054
3055         return 1;
3056 }
3057
3058 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
3059 {
3060         u8 rvi = vmx_get_rvi();
3061         u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
3062
3063         return ((rvi & 0xf0) > (vppr & 0xf0));
3064 }
3065
3066 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3067                                    struct vmcs12 *vmcs12);
3068
3069 /*
3070  * If from_vmentry is false, this is being called from state restore (either RSM
3071  * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
3072 + *
3073 + * Returns:
3074 + *   0 - success, i.e. proceed with actual VMEnter
3075 + *   1 - consistency check VMExit
3076 + *  -1 - consistency check VMFail
3077  */
3078 int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
3079 {
3080         struct vcpu_vmx *vmx = to_vmx(vcpu);
3081         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3082         bool evaluate_pending_interrupts;
3083         u32 exit_reason = EXIT_REASON_INVALID_STATE;
3084         u32 exit_qual;
3085
3086         evaluate_pending_interrupts = exec_controls_get(vmx) &
3087                 (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
3088         if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
3089                 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
3090
3091         if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
3092                 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
3093         if (kvm_mpx_supported() &&
3094                 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
3095                 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3096
3097         /*
3098          * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
3099          * nested early checks are disabled.  In the event of a "late" VM-Fail,
3100          * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
3101          * software model to the pre-VMEntry host state.  When EPT is disabled,
3102          * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
3103          * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3.  Stuffing
3104          * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
3105          * the correct value.  Smashing vmcs01.GUEST_CR3 is safe because nested
3106          * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
3107          * guaranteed to be overwritten with a shadow CR3 prior to re-entering
3108          * L1.  Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
3109          * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
3110          * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
3111          * path would need to manually save/restore vmcs01.GUEST_CR3.
3112          */
3113         if (!enable_ept && !nested_early_check)
3114                 vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3115
3116         vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
3117
3118         prepare_vmcs02_early(vmx, vmcs12);
3119
3120         if (from_vmentry) {
3121                 nested_get_vmcs12_pages(vcpu);
3122
3123                 if (nested_vmx_check_vmentry_hw(vcpu)) {
3124                         vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3125                         return -1;
3126                 }
3127
3128                 if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
3129                         goto vmentry_fail_vmexit;
3130         }
3131
3132         enter_guest_mode(vcpu);
3133         if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
3134                 vcpu->arch.tsc_offset += vmcs12->tsc_offset;
3135
3136         if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
3137                 goto vmentry_fail_vmexit_guest_mode;
3138
3139         if (from_vmentry) {
3140                 exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
3141                 exit_qual = nested_vmx_load_msr(vcpu,
3142                                                 vmcs12->vm_entry_msr_load_addr,
3143                                                 vmcs12->vm_entry_msr_load_count);
3144                 if (exit_qual)
3145                         goto vmentry_fail_vmexit_guest_mode;
3146         } else {
3147                 /*
3148                  * The MMU is not initialized to point at the right entities yet and
3149                  * "get pages" would need to read data from the guest (i.e. we will
3150                  * need to perform gpa to hpa translation). Request a call
3151                  * to nested_get_vmcs12_pages before the next VM-entry.  The MSRs
3152                  * have already been set at vmentry time and should not be reset.
3153                  */
3154                 kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
3155         }
3156
3157         /*
3158          * If L1 had a pending IRQ/NMI until it executed
3159          * VMLAUNCH/VMRESUME which wasn't delivered because it was
3160          * disallowed (e.g. interrupts disabled), L0 needs to
3161          * evaluate if this pending event should cause an exit from L2
3162          * to L1 or delivered directly to L2 (e.g. In case L1 don't
3163          * intercept EXTERNAL_INTERRUPT).
3164          *
3165          * Usually this would be handled by the processor noticing an
3166          * IRQ/NMI window request, or checking RVI during evaluation of
3167          * pending virtual interrupts.  However, this setting was done
3168          * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
3169          * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
3170          */
3171         if (unlikely(evaluate_pending_interrupts))
3172                 kvm_make_request(KVM_REQ_EVENT, vcpu);
3173
3174         /*
3175          * Do not start the preemption timer hrtimer until after we know
3176          * we are successful, so that only nested_vmx_vmexit needs to cancel
3177          * the timer.
3178          */
3179         vmx->nested.preemption_timer_expired = false;
3180         if (nested_cpu_has_preemption_timer(vmcs12))
3181                 vmx_start_preemption_timer(vcpu);
3182
3183         /*
3184          * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
3185          * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
3186          * returned as far as L1 is concerned. It will only return (and set
3187          * the success flag) when L2 exits (see nested_vmx_vmexit()).
3188          */
3189         return 0;
3190
3191         /*
3192          * A failed consistency check that leads to a VMExit during L1's
3193          * VMEnter to L2 is a variation of a normal VMexit, as explained in
3194          * 26.7 "VM-entry failures during or after loading guest state".
3195          */
3196 vmentry_fail_vmexit_guest_mode:
3197         if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
3198                 vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
3199         leave_guest_mode(vcpu);
3200
3201 vmentry_fail_vmexit:
3202         vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3203
3204         if (!from_vmentry)
3205                 return 1;
3206
3207         load_vmcs12_host_state(vcpu, vmcs12);
3208         vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
3209         vmcs12->exit_qualification = exit_qual;
3210         if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
3211                 vmx->nested.need_vmcs12_to_shadow_sync = true;
3212         return 1;
3213 }
3214
3215 /*
3216  * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
3217  * for running an L2 nested guest.
3218  */
3219 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
3220 {
3221         struct vmcs12 *vmcs12;
3222         struct vcpu_vmx *vmx = to_vmx(vcpu);
3223         u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
3224         int ret;
3225
3226         if (!nested_vmx_check_permission(vcpu))
3227                 return 1;
3228
3229         if (!nested_vmx_handle_enlightened_vmptrld(vcpu, launch))
3230                 return 1;
3231
3232         if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
3233                 return nested_vmx_failInvalid(vcpu);
3234
3235         vmcs12 = get_vmcs12(vcpu);
3236
3237         /*
3238          * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
3239          * that there *is* a valid VMCS pointer, RFLAGS.CF is set
3240          * rather than RFLAGS.ZF, and no error number is stored to the
3241          * VM-instruction error field.
3242          */
3243         if (vmcs12->hdr.shadow_vmcs)
3244                 return nested_vmx_failInvalid(vcpu);
3245
3246         if (vmx->nested.hv_evmcs) {
3247                 copy_enlightened_to_vmcs12(vmx);
3248                 /* Enlightened VMCS doesn't have launch state */
3249                 vmcs12->launch_state = !launch;
3250         } else if (enable_shadow_vmcs) {
3251                 copy_shadow_to_vmcs12(vmx);
3252         }
3253
3254         /*
3255          * The nested entry process starts with enforcing various prerequisites
3256          * on vmcs12 as required by the Intel SDM, and act appropriately when
3257          * they fail: As the SDM explains, some conditions should cause the
3258          * instruction to fail, while others will cause the instruction to seem
3259          * to succeed, but return an EXIT_REASON_INVALID_STATE.
3260          * To speed up the normal (success) code path, we should avoid checking
3261          * for misconfigurations which will anyway be caught by the processor
3262          * when using the merged vmcs02.
3263          */
3264         if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
3265                 return nested_vmx_failValid(vcpu,
3266                         VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
3267
3268         if (vmcs12->launch_state == launch)
3269                 return nested_vmx_failValid(vcpu,
3270                         launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
3271                                : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
3272
3273         if (nested_vmx_check_controls(vcpu, vmcs12))
3274                 return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3275
3276         if (nested_vmx_check_host_state(vcpu, vmcs12))
3277                 return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
3278
3279         /*
3280          * We're finally done with prerequisite checking, and can start with
3281          * the nested entry.
3282          */
3283         vmx->nested.nested_run_pending = 1;
3284         ret = nested_vmx_enter_non_root_mode(vcpu, true);
3285         vmx->nested.nested_run_pending = !ret;
3286         if (ret > 0)
3287                 return 1;
3288         else if (ret)
3289                 return nested_vmx_failValid(vcpu,
3290                         VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3291
3292         /* Hide L1D cache contents from the nested guest.  */
3293         vmx->vcpu.arch.l1tf_flush_l1d = true;
3294
3295         /*
3296          * Must happen outside of nested_vmx_enter_non_root_mode() as it will
3297          * also be used as part of restoring nVMX state for
3298          * snapshot restore (migration).
3299          *
3300          * In this flow, it is assumed that vmcs12 cache was
3301          * trasferred as part of captured nVMX state and should
3302          * therefore not be read from guest memory (which may not
3303          * exist on destination host yet).
3304          */
3305         nested_cache_shadow_vmcs12(vcpu, vmcs12);
3306
3307         /*
3308          * If we're entering a halted L2 vcpu and the L2 vcpu won't be
3309          * awakened by event injection or by an NMI-window VM-exit or
3310          * by an interrupt-window VM-exit, halt the vcpu.
3311          */
3312         if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
3313             !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
3314             !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) &&
3315             !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) &&
3316               (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
3317                 vmx->nested.nested_run_pending = 0;
3318                 return kvm_vcpu_halt(vcpu);
3319         }
3320         return 1;
3321 }
3322
3323 /*
3324  * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
3325  * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK).
3326  * This function returns the new value we should put in vmcs12.guest_cr0.
3327  * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
3328  *  1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
3329  *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
3330  *     didn't trap the bit, because if L1 did, so would L0).
3331  *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
3332  *     been modified by L2, and L1 knows it. So just leave the old value of
3333  *     the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
3334  *     isn't relevant, because if L0 traps this bit it can set it to anything.
3335  *  3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
3336  *     changed these bits, and therefore they need to be updated, but L0
3337  *     didn't necessarily allow them to be changed in GUEST_CR0 - and rather
3338  *     put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
3339  */
3340 static inline unsigned long
3341 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3342 {
3343         return
3344         /*1*/   (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
3345         /*2*/   (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
3346         /*3*/   (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
3347                         vcpu->arch.cr0_guest_owned_bits));
3348 }
3349
3350 static inline unsigned long
3351 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3352 {
3353         return
3354         /*1*/   (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
3355         /*2*/   (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
3356         /*3*/   (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
3357                         vcpu->arch.cr4_guest_owned_bits));
3358 }
3359
3360 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
3361                                       struct vmcs12 *vmcs12)
3362 {
3363         u32 idt_vectoring;
3364         unsigned int nr;
3365
3366         if (vcpu->arch.exception.injected) {
3367                 nr = vcpu->arch.exception.nr;
3368                 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3369
3370                 if (kvm_exception_is_soft(nr)) {
3371                         vmcs12->vm_exit_instruction_len =
3372                                 vcpu->arch.event_exit_inst_len;
3373                         idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
3374                 } else
3375                         idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
3376
3377                 if (vcpu->arch.exception.has_error_code) {
3378                         idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
3379                         vmcs12->idt_vectoring_error_code =
3380                                 vcpu->arch.exception.error_code;
3381                 }
3382
3383                 vmcs12->idt_vectoring_info_field = idt_vectoring;
3384         } else if (vcpu->arch.nmi_injected) {
3385                 vmcs12->idt_vectoring_info_field =
3386                         INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
3387         } else if (vcpu->arch.interrupt.injected) {
3388                 nr = vcpu->arch.interrupt.nr;
3389                 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3390
3391                 if (vcpu->arch.interrupt.soft) {
3392                         idt_vectoring |= INTR_TYPE_SOFT_INTR;
3393                         vmcs12->vm_entry_instruction_len =
3394                                 vcpu->arch.event_exit_inst_len;
3395                 } else
3396                         idt_vectoring |= INTR_TYPE_EXT_INTR;
3397
3398                 vmcs12->idt_vectoring_info_field = idt_vectoring;
3399         }
3400 }
3401
3402
3403 static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
3404 {
3405         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3406         gfn_t gfn;
3407
3408         /*
3409          * Don't need to mark the APIC access page dirty; it is never
3410          * written to by the CPU during APIC virtualization.
3411          */
3412
3413         if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3414                 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
3415                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3416         }
3417
3418         if (nested_cpu_has_posted_intr(vmcs12)) {
3419                 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
3420                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3421         }
3422 }
3423
3424 static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
3425 {
3426         struct vcpu_vmx *vmx = to_vmx(vcpu);
3427         int max_irr;
3428         void *vapic_page;
3429         u16 status;
3430
3431         if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
3432                 return;
3433
3434         vmx->nested.pi_pending = false;
3435         if (!pi_test_and_clear_on(vmx->nested.pi_desc))
3436                 return;
3437
3438         max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
3439         if (max_irr != 256) {
3440                 vapic_page = vmx->nested.virtual_apic_map.hva;
3441                 if (!vapic_page)
3442                         return;
3443
3444                 __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
3445                         vapic_page, &max_irr);
3446                 status = vmcs_read16(GUEST_INTR_STATUS);
3447                 if ((u8)max_irr > ((u8)status & 0xff)) {
3448                         status &= ~0xff;
3449                         status |= (u8)max_irr;
3450                         vmcs_write16(GUEST_INTR_STATUS, status);
3451                 }
3452         }
3453
3454         nested_mark_vmcs12_pages_dirty(vcpu);
3455 }
3456
3457 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
3458                                                unsigned long exit_qual)
3459 {
3460         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3461         unsigned int nr = vcpu->arch.exception.nr;
3462         u32 intr_info = nr | INTR_INFO_VALID_MASK;
3463
3464         if (vcpu->arch.exception.has_error_code) {
3465                 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
3466                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3467         }
3468
3469         if (kvm_exception_is_soft(nr))
3470                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3471         else
3472                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
3473
3474         if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
3475             vmx_get_nmi_mask(vcpu))
3476                 intr_info |= INTR_INFO_UNBLOCK_NMI;
3477
3478         nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
3479 }
3480
3481 static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
3482 {
3483         struct vcpu_vmx *vmx = to_vmx(vcpu);
3484         unsigned long exit_qual;
3485         bool block_nested_events =
3486             vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
3487         struct kvm_lapic *apic = vcpu->arch.apic;
3488
3489         if (lapic_in_kernel(vcpu) &&
3490                 test_bit(KVM_APIC_INIT, &apic->pending_events)) {
3491                 if (block_nested_events)
3492                         return -EBUSY;
3493                 clear_bit(KVM_APIC_INIT, &apic->pending_events);
3494                 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
3495                 return 0;
3496         }
3497
3498         if (vcpu->arch.exception.pending &&
3499                 nested_vmx_check_exception(vcpu, &exit_qual)) {
3500                 if (block_nested_events)
3501                         return -EBUSY;
3502                 nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
3503                 return 0;
3504         }
3505
3506         if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
3507             vmx->nested.preemption_timer_expired) {
3508                 if (block_nested_events)
3509                         return -EBUSY;
3510                 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
3511                 return 0;
3512         }
3513
3514         if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
3515                 if (block_nested_events)
3516                         return -EBUSY;
3517                 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
3518                                   NMI_VECTOR | INTR_TYPE_NMI_INTR |
3519                                   INTR_INFO_VALID_MASK, 0);
3520                 /*
3521                  * The NMI-triggered VM exit counts as injection:
3522                  * clear this one and block further NMIs.
3523                  */
3524                 vcpu->arch.nmi_pending = 0;
3525                 vmx_set_nmi_mask(vcpu, true);
3526                 return 0;
3527         }
3528
3529         if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
3530             nested_exit_on_intr(vcpu)) {
3531                 if (block_nested_events)
3532                         return -EBUSY;
3533                 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
3534                 return 0;
3535         }
3536
3537         vmx_complete_nested_posted_interrupt(vcpu);
3538         return 0;
3539 }
3540
3541 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
3542 {
3543         ktime_t remaining =
3544                 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
3545         u64 value;
3546
3547         if (ktime_to_ns(remaining) <= 0)
3548                 return 0;
3549
3550         value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
3551         do_div(value, 1000000);
3552         return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
3553 }
3554
3555 static bool is_vmcs12_ext_field(unsigned long field)
3556 {
3557         switch (field) {
3558         case GUEST_ES_SELECTOR:
3559         case GUEST_CS_SELECTOR:
3560         case GUEST_SS_SELECTOR:
3561         case GUEST_DS_SELECTOR:
3562         case GUEST_FS_SELECTOR:
3563         case GUEST_GS_SELECTOR:
3564         case GUEST_LDTR_SELECTOR:
3565         case GUEST_TR_SELECTOR:
3566         case GUEST_ES_LIMIT:
3567         case GUEST_CS_LIMIT:
3568         case GUEST_SS_LIMIT:
3569         case GUEST_DS_LIMIT:
3570         case GUEST_FS_LIMIT:
3571         case GUEST_GS_LIMIT:
3572         case GUEST_LDTR_LIMIT:
3573         case GUEST_TR_LIMIT:
3574         case GUEST_GDTR_LIMIT:
3575         case GUEST_IDTR_LIMIT:
3576         case GUEST_ES_AR_BYTES:
3577         case GUEST_DS_AR_BYTES:
3578         case GUEST_FS_AR_BYTES:
3579         case GUEST_GS_AR_BYTES:
3580         case GUEST_LDTR_AR_BYTES:
3581         case GUEST_TR_AR_BYTES:
3582         case GUEST_ES_BASE:
3583         case GUEST_CS_BASE:
3584         case GUEST_SS_BASE:
3585         case GUEST_DS_BASE:
3586         case GUEST_FS_BASE:
3587         case GUEST_GS_BASE:
3588         case GUEST_LDTR_BASE:
3589         case GUEST_TR_BASE:
3590         case GUEST_GDTR_BASE:
3591         case GUEST_IDTR_BASE:
3592         case GUEST_PENDING_DBG_EXCEPTIONS:
3593         case GUEST_BNDCFGS:
3594                 return true;
3595         default:
3596                 break;
3597         }
3598
3599         return false;
3600 }
3601
3602 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
3603                                        struct vmcs12 *vmcs12)
3604 {
3605         struct vcpu_vmx *vmx = to_vmx(vcpu);
3606
3607         vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
3608         vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
3609         vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
3610         vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
3611         vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
3612         vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
3613         vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
3614         vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
3615         vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
3616         vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
3617         vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
3618         vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
3619         vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
3620         vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
3621         vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
3622         vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
3623         vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
3624         vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
3625         vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
3626         vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
3627         vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
3628         vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
3629         vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
3630         vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
3631         vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
3632         vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
3633         vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
3634         vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
3635         vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
3636         vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
3637         vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
3638         vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
3639         vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
3640         vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
3641         vmcs12->guest_pending_dbg_exceptions =
3642                 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
3643         if (kvm_mpx_supported())
3644                 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3645
3646         vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
3647 }
3648
3649 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
3650                                        struct vmcs12 *vmcs12)
3651 {
3652         struct vcpu_vmx *vmx = to_vmx(vcpu);
3653         int cpu;
3654
3655         if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
3656                 return;
3657
3658
3659         WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
3660
3661         cpu = get_cpu();
3662         vmx->loaded_vmcs = &vmx->nested.vmcs02;
3663         vmx_vcpu_load(&vmx->vcpu, cpu);
3664
3665         sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
3666
3667         vmx->loaded_vmcs = &vmx->vmcs01;
3668         vmx_vcpu_load(&vmx->vcpu, cpu);
3669         put_cpu();
3670 }
3671
3672 /*
3673  * Update the guest state fields of vmcs12 to reflect changes that
3674  * occurred while L2 was running. (The "IA-32e mode guest" bit of the
3675  * VM-entry controls is also updated, since this is really a guest
3676  * state bit.)
3677  */
3678 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3679 {
3680         struct vcpu_vmx *vmx = to_vmx(vcpu);
3681
3682         if (vmx->nested.hv_evmcs)
3683                 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
3684
3685         vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs;
3686
3687         vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
3688         vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
3689
3690         vmcs12->guest_rsp = kvm_rsp_read(vcpu);
3691         vmcs12->guest_rip = kvm_rip_read(vcpu);
3692         vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
3693
3694         vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
3695         vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
3696
3697         vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
3698         vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
3699         vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
3700
3701         vmcs12->guest_interruptibility_info =
3702                 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3703
3704         if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
3705                 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
3706         else
3707                 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
3708
3709         if (nested_cpu_has_preemption_timer(vmcs12) &&
3710             vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
3711                         vmcs12->vmx_preemption_timer_value =
3712                                 vmx_get_preemption_timer_value(vcpu);
3713
3714         /*
3715          * In some cases (usually, nested EPT), L2 is allowed to change its
3716          * own CR3 without exiting. If it has changed it, we must keep it.
3717          * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
3718          * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
3719          *
3720          * Additionally, restore L2's PDPTR to vmcs12.
3721          */
3722         if (enable_ept) {
3723                 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
3724                 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
3725                         vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
3726                         vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
3727                         vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
3728                         vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
3729                 }
3730         }
3731
3732         vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
3733
3734         if (nested_cpu_has_vid(vmcs12))
3735                 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
3736
3737         vmcs12->vm_entry_controls =
3738                 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
3739                 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
3740
3741         if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
3742                 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
3743
3744         if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
3745                 vmcs12->guest_ia32_efer = vcpu->arch.efer;
3746 }
3747
3748 /*
3749  * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
3750  * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
3751  * and this function updates it to reflect the changes to the guest state while
3752  * L2 was running (and perhaps made some exits which were handled directly by L0
3753  * without going back to L1), and to reflect the exit reason.
3754  * Note that we do not have to copy here all VMCS fields, just those that
3755  * could have changed by the L2 guest or the exit - i.e., the guest-state and
3756  * exit-information fields only. Other fields are modified by L1 with VMWRITE,
3757  * which already writes to vmcs12 directly.
3758  */
3759 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
3760                            u32 exit_reason, u32 exit_intr_info,
3761                            unsigned long exit_qualification)
3762 {
3763         /* update exit information fields: */
3764         vmcs12->vm_exit_reason = exit_reason;
3765         vmcs12->exit_qualification = exit_qualification;
3766         vmcs12->vm_exit_intr_info = exit_intr_info;
3767
3768         vmcs12->idt_vectoring_info_field = 0;
3769         vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3770         vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
3771
3772         if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
3773                 vmcs12->launch_state = 1;
3774
3775                 /* vm_entry_intr_info_field is cleared on exit. Emulate this
3776                  * instead of reading the real value. */
3777                 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
3778
3779                 /*
3780                  * Transfer the event that L0 or L1 may wanted to inject into
3781                  * L2 to IDT_VECTORING_INFO_FIELD.
3782                  */
3783                 vmcs12_save_pending_event(vcpu, vmcs12);
3784
3785                 /*
3786                  * According to spec, there's no need to store the guest's
3787                  * MSRs if the exit is due to a VM-entry failure that occurs
3788                  * during or after loading the guest state. Since this exit
3789                  * does not fall in that category, we need to save the MSRs.
3790                  */
3791                 if (nested_vmx_store_msr(vcpu,
3792                                          vmcs12->vm_exit_msr_store_addr,
3793                                          vmcs12->vm_exit_msr_store_count))
3794                         nested_vmx_abort(vcpu,
3795                                          VMX_ABORT_SAVE_GUEST_MSR_FAIL);
3796         }
3797
3798         /*
3799          * Drop what we picked up for L2 via vmx_complete_interrupts. It is
3800          * preserved above and would only end up incorrectly in L1.
3801          */
3802         vcpu->arch.nmi_injected = false;
3803         kvm_clear_exception_queue(vcpu);
3804         kvm_clear_interrupt_queue(vcpu);
3805 }
3806
3807 /*
3808  * A part of what we need to when the nested L2 guest exits and we want to
3809  * run its L1 parent, is to reset L1's guest state to the host state specified
3810  * in vmcs12.
3811  * This function is to be called not only on normal nested exit, but also on
3812  * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
3813  * Failures During or After Loading Guest State").
3814  * This function should be called when the active VMCS is L1's (vmcs01).
3815  */
3816 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3817                                    struct vmcs12 *vmcs12)
3818 {
3819         struct kvm_segment seg;
3820         u32 entry_failure_code;
3821
3822         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
3823                 vcpu->arch.efer = vmcs12->host_ia32_efer;
3824         else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
3825                 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
3826         else
3827                 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
3828         vmx_set_efer(vcpu, vcpu->arch.efer);
3829
3830         kvm_rsp_write(vcpu, vmcs12->host_rsp);
3831         kvm_rip_write(vcpu, vmcs12->host_rip);
3832         vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
3833         vmx_set_interrupt_shadow(vcpu, 0);
3834
3835         /*
3836          * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
3837          * actually changed, because vmx_set_cr0 refers to efer set above.
3838          *
3839          * CR0_GUEST_HOST_MASK is already set in the original vmcs01
3840          * (KVM doesn't change it);
3841          */
3842         vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
3843         vmx_set_cr0(vcpu, vmcs12->host_cr0);
3844
3845         /* Same as above - no reason to call set_cr4_guest_host_mask().  */
3846         vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
3847         vmx_set_cr4(vcpu, vmcs12->host_cr4);
3848
3849         nested_ept_uninit_mmu_context(vcpu);
3850
3851         /*
3852          * Only PDPTE load can fail as the value of cr3 was checked on entry and
3853          * couldn't have changed.
3854          */
3855         if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
3856                 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
3857
3858         if (!enable_ept)
3859                 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
3860
3861         /*
3862          * If vmcs01 doesn't use VPID, CPU flushes TLB on every
3863          * VMEntry/VMExit. Thus, no need to flush TLB.
3864          *
3865          * If vmcs12 doesn't use VPID, L1 expects TLB to be
3866          * flushed on every VMEntry/VMExit.
3867          *
3868          * Otherwise, we can preserve TLB entries as long as we are
3869          * able to tag L1 TLB entries differently than L2 TLB entries.
3870          *
3871          * If vmcs12 uses EPT, we need to execute this flush on EPTP01
3872          * and therefore we request the TLB flush to happen only after VMCS EPTP
3873          * has been set by KVM_REQ_LOAD_CR3.
3874          */
3875         if (enable_vpid &&
3876             (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
3877                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3878         }
3879
3880         vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
3881         vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
3882         vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
3883         vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
3884         vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
3885         vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
3886         vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
3887
3888         /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.  */
3889         if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
3890                 vmcs_write64(GUEST_BNDCFGS, 0);
3891
3892         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
3893                 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
3894                 vcpu->arch.pat = vmcs12->host_ia32_pat;
3895         }
3896         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
3897                 SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
3898                                 vmcs12->host_ia32_perf_global_ctrl);
3899
3900         /* Set L1 segment info according to Intel SDM
3901             27.5.2 Loading Host Segment and Descriptor-Table Registers */
3902         seg = (struct kvm_segment) {
3903                 .base = 0,
3904                 .limit = 0xFFFFFFFF,
3905                 .selector = vmcs12->host_cs_selector,
3906                 .type = 11,
3907                 .present = 1,
3908                 .s = 1,
3909                 .g = 1
3910         };
3911         if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
3912                 seg.l = 1;
3913         else
3914                 seg.db = 1;
3915         vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
3916         seg = (struct kvm_segment) {
3917                 .base = 0,
3918                 .limit = 0xFFFFFFFF,
3919                 .type = 3,
3920                 .present = 1,
3921                 .s = 1,
3922                 .db = 1,
3923                 .g = 1
3924         };
3925         seg.selector = vmcs12->host_ds_selector;
3926         vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
3927         seg.selector = vmcs12->host_es_selector;
3928         vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
3929         seg.selector = vmcs12->host_ss_selector;
3930         vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
3931         seg.selector = vmcs12->host_fs_selector;
3932         seg.base = vmcs12->host_fs_base;
3933         vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
3934         seg.selector = vmcs12->host_gs_selector;
3935         seg.base = vmcs12->host_gs_base;
3936         vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
3937         seg = (struct kvm_segment) {
3938                 .base = vmcs12->host_tr_base,
3939                 .limit = 0x67,
3940                 .selector = vmcs12->host_tr_selector,
3941                 .type = 11,
3942                 .present = 1
3943         };
3944         vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
3945
3946         kvm_set_dr(vcpu, 7, 0x400);
3947         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
3948
3949         if (cpu_has_vmx_msr_bitmap())
3950                 vmx_update_msr_bitmap(vcpu);
3951
3952         if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
3953                                 vmcs12->vm_exit_msr_load_count))
3954                 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
3955 }
3956
3957 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
3958 {
3959         struct shared_msr_entry *efer_msr;
3960         unsigned int i;
3961
3962         if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
3963                 return vmcs_read64(GUEST_IA32_EFER);
3964
3965         if (cpu_has_load_ia32_efer())
3966                 return host_efer;
3967
3968         for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
3969                 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
3970                         return vmx->msr_autoload.guest.val[i].value;
3971         }
3972
3973         efer_msr = find_msr_entry(vmx, MSR_EFER);
3974         if (efer_msr)
3975                 return efer_msr->data;
3976
3977         return host_efer;
3978 }
3979
3980 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
3981 {
3982         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3983         struct vcpu_vmx *vmx = to_vmx(vcpu);
3984         struct vmx_msr_entry g, h;
3985         gpa_t gpa;
3986         u32 i, j;
3987
3988         vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
3989
3990         if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
3991                 /*
3992                  * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
3993                  * as vmcs01.GUEST_DR7 contains a userspace defined value
3994                  * and vcpu->arch.dr7 is not squirreled away before the
3995                  * nested VMENTER (not worth adding a variable in nested_vmx).
3996                  */
3997                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
3998                         kvm_set_dr(vcpu, 7, DR7_FIXED_1);
3999                 else
4000                         WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
4001         }
4002
4003         /*
4004          * Note that calling vmx_set_{efer,cr0,cr4} is important as they
4005          * handle a variety of side effects to KVM's software model.
4006          */
4007         vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
4008
4009         vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
4010         vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
4011
4012         vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
4013         vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
4014
4015         nested_ept_uninit_mmu_context(vcpu);
4016         vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
4017         kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
4018
4019         /*
4020          * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
4021          * from vmcs01 (if necessary).  The PDPTRs are not loaded on
4022          * VMFail, like everything else we just need to ensure our
4023          * software model is up-to-date.
4024          */
4025         if (enable_ept)
4026                 ept_save_pdptrs(vcpu);
4027
4028         kvm_mmu_reset_context(vcpu);
4029
4030         if (cpu_has_vmx_msr_bitmap())
4031                 vmx_update_msr_bitmap(vcpu);
4032
4033         /*
4034          * This nasty bit of open coding is a compromise between blindly
4035          * loading L1's MSRs using the exit load lists (incorrect emulation
4036          * of VMFail), leaving the nested VM's MSRs in the software model
4037          * (incorrect behavior) and snapshotting the modified MSRs (too
4038          * expensive since the lists are unbound by hardware).  For each
4039          * MSR that was (prematurely) loaded from the nested VMEntry load
4040          * list, reload it from the exit load list if it exists and differs
4041          * from the guest value.  The intent is to stuff host state as
4042          * silently as possible, not to fully process the exit load list.
4043          */
4044         for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
4045                 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
4046                 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
4047                         pr_debug_ratelimited(
4048                                 "%s read MSR index failed (%u, 0x%08llx)\n",
4049                                 __func__, i, gpa);
4050                         goto vmabort;
4051                 }
4052
4053                 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
4054                         gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
4055                         if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
4056                                 pr_debug_ratelimited(
4057                                         "%s read MSR failed (%u, 0x%08llx)\n",
4058                                         __func__, j, gpa);
4059                                 goto vmabort;
4060                         }
4061                         if (h.index != g.index)
4062                                 continue;
4063                         if (h.value == g.value)
4064                                 break;
4065
4066                         if (nested_vmx_load_msr_check(vcpu, &h)) {
4067                                 pr_debug_ratelimited(
4068                                         "%s check failed (%u, 0x%x, 0x%x)\n",
4069                                         __func__, j, h.index, h.reserved);
4070                                 goto vmabort;
4071                         }
4072
4073                         if (kvm_set_msr(vcpu, h.index, h.value)) {
4074                                 pr_debug_ratelimited(
4075                                         "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
4076                                         __func__, j, h.index, h.value);
4077                                 goto vmabort;
4078                         }
4079                 }
4080         }
4081
4082         return;
4083
4084 vmabort:
4085         nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4086 }
4087
4088 /*
4089  * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
4090  * and modify vmcs12 to make it see what it would expect to see there if
4091  * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
4092  */
4093 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
4094                        u32 exit_intr_info, unsigned long exit_qualification)
4095 {
4096         struct vcpu_vmx *vmx = to_vmx(vcpu);
4097         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4098
4099         /* trying to cancel vmlaunch/vmresume is a bug */
4100         WARN_ON_ONCE(vmx->nested.nested_run_pending);
4101
4102         leave_guest_mode(vcpu);
4103
4104         if (nested_cpu_has_preemption_timer(vmcs12))
4105                 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
4106
4107         if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
4108                 vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
4109
4110         if (likely(!vmx->fail)) {
4111                 sync_vmcs02_to_vmcs12(vcpu, vmcs12);
4112
4113                 if (exit_reason != -1)
4114                         prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
4115                                        exit_qualification);
4116
4117                 /*
4118                  * Must happen outside of sync_vmcs02_to_vmcs12() as it will
4119                  * also be used to capture vmcs12 cache as part of
4120                  * capturing nVMX state for snapshot (migration).
4121                  *
4122                  * Otherwise, this flush will dirty guest memory at a
4123                  * point it is already assumed by user-space to be
4124                  * immutable.
4125                  */
4126                 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
4127         } else {
4128                 /*
4129                  * The only expected VM-instruction error is "VM entry with
4130                  * invalid control field(s)." Anything else indicates a
4131                  * problem with L0.  And we should never get here with a
4132                  * VMFail of any type if early consistency checks are enabled.
4133                  */
4134                 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
4135                              VMXERR_ENTRY_INVALID_CONTROL_FIELD);
4136                 WARN_ON_ONCE(nested_early_check);
4137         }
4138
4139         vmx_switch_vmcs(vcpu, &vmx->vmcs01);
4140
4141         /* Update any VMCS fields that might have changed while L2 ran */
4142         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
4143         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
4144         vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
4145         if (vmx->nested.l1_tpr_threshold != -1)
4146                 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
4147
4148         if (kvm_has_tsc_control)
4149                 decache_tsc_multiplier(vmx);
4150
4151         if (vmx->nested.change_vmcs01_virtual_apic_mode) {
4152                 vmx->nested.change_vmcs01_virtual_apic_mode = false;
4153                 vmx_set_virtual_apic_mode(vcpu);
4154         } else if (!nested_cpu_has_ept(vmcs12) &&
4155                    nested_cpu_has2(vmcs12,
4156                                    SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
4157                 vmx_flush_tlb(vcpu, true);
4158         }
4159
4160         /* Unpin physical memory we referred to in vmcs02 */
4161         if (vmx->nested.apic_access_page) {
4162                 kvm_release_page_dirty(vmx->nested.apic_access_page);
4163                 vmx->nested.apic_access_page = NULL;
4164         }
4165         kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
4166         kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
4167         vmx->nested.pi_desc = NULL;
4168
4169         /*
4170          * We are now running in L2, mmu_notifier will force to reload the
4171          * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1.
4172          */
4173         kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
4174
4175         if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs))
4176                 vmx->nested.need_vmcs12_to_shadow_sync = true;
4177
4178         /* in case we halted in L2 */
4179         vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4180
4181         if (likely(!vmx->fail)) {
4182                 /*
4183                  * TODO: SDM says that with acknowledge interrupt on
4184                  * exit, bit 31 of the VM-exit interrupt information
4185                  * (valid interrupt) is always set to 1 on
4186                  * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't
4187                  * need kvm_cpu_has_interrupt().  See the commit
4188                  * message for details.
4189                  */
4190                 if (nested_exit_intr_ack_set(vcpu) &&
4191                     exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
4192                     kvm_cpu_has_interrupt(vcpu)) {
4193                         int irq = kvm_cpu_get_interrupt(vcpu);
4194                         WARN_ON(irq < 0);
4195                         vmcs12->vm_exit_intr_info = irq |
4196                                 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
4197                 }
4198
4199                 if (exit_reason != -1)
4200                         trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
4201                                                        vmcs12->exit_qualification,
4202                                                        vmcs12->idt_vectoring_info_field,
4203                                                        vmcs12->vm_exit_intr_info,
4204                                                        vmcs12->vm_exit_intr_error_code,
4205                                                        KVM_ISA_VMX);
4206
4207                 load_vmcs12_host_state(vcpu, vmcs12);
4208
4209                 return;
4210         }
4211
4212         /*
4213          * After an early L2 VM-entry failure, we're now back
4214          * in L1 which thinks it just finished a VMLAUNCH or
4215          * VMRESUME instruction, so we need to set the failure
4216          * flag and the VM-instruction error field of the VMCS
4217          * accordingly, and skip the emulated instruction.
4218          */
4219         (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
4220
4221         /*
4222          * Restore L1's host state to KVM's software model.  We're here
4223          * because a consistency check was caught by hardware, which
4224          * means some amount of guest state has been propagated to KVM's
4225          * model and needs to be unwound to the host's state.
4226          */
4227         nested_vmx_restore_host_state(vcpu);
4228
4229         vmx->fail = 0;
4230 }
4231
4232 /*
4233  * Decode the memory-address operand of a vmx instruction, as recorded on an
4234  * exit caused by such an instruction (run by a guest hypervisor).
4235  * On success, returns 0. When the operand is invalid, returns 1 and throws
4236  * #UD or #GP.
4237  */
4238 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
4239                         u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
4240 {
4241         gva_t off;
4242         bool exn;
4243         struct kvm_segment s;
4244
4245         /*
4246          * According to Vol. 3B, "Information for VM Exits Due to Instruction
4247          * Execution", on an exit, vmx_instruction_info holds most of the
4248          * addressing components of the operand. Only the displacement part
4249          * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
4250          * For how an actual address is calculated from all these components,
4251          * refer to Vol. 1, "Operand Addressing".
4252          */
4253         int  scaling = vmx_instruction_info & 3;
4254         int  addr_size = (vmx_instruction_info >> 7) & 7;
4255         bool is_reg = vmx_instruction_info & (1u << 10);
4256         int  seg_reg = (vmx_instruction_info >> 15) & 7;
4257         int  index_reg = (vmx_instruction_info >> 18) & 0xf;
4258         bool index_is_valid = !(vmx_instruction_info & (1u << 22));
4259         int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
4260         bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
4261
4262         if (is_reg) {
4263                 kvm_queue_exception(vcpu, UD_VECTOR);
4264                 return 1;
4265         }
4266
4267         /* Addr = segment_base + offset */
4268         /* offset = base + [index * scale] + displacement */
4269         off = exit_qualification; /* holds the displacement */
4270         if (addr_size == 1)
4271                 off = (gva_t)sign_extend64(off, 31);
4272         else if (addr_size == 0)
4273                 off = (gva_t)sign_extend64(off, 15);
4274         if (base_is_valid)
4275                 off += kvm_register_read(vcpu, base_reg);
4276         if (index_is_valid)
4277                 off += kvm_register_read(vcpu, index_reg)<<scaling;
4278         vmx_get_segment(vcpu, &s, seg_reg);
4279
4280         /*
4281          * The effective address, i.e. @off, of a memory operand is truncated
4282          * based on the address size of the instruction.  Note that this is
4283          * the *effective address*, i.e. the address prior to accounting for
4284          * the segment's base.
4285          */
4286         if (addr_size == 1) /* 32 bit */
4287                 off &= 0xffffffff;
4288         else if (addr_size == 0) /* 16 bit */
4289                 off &= 0xffff;
4290
4291         /* Checks for #GP/#SS exceptions. */
4292         exn = false;
4293         if (is_long_mode(vcpu)) {
4294                 /*
4295                  * The virtual/linear address is never truncated in 64-bit
4296                  * mode, e.g. a 32-bit address size can yield a 64-bit virtual
4297                  * address when using FS/GS with a non-zero base.
4298                  */
4299                 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS)
4300                         *ret = s.base + off;
4301                 else
4302                         *ret = off;
4303
4304                 /* Long mode: #GP(0)/#SS(0) if the memory address is in a
4305                  * non-canonical form. This is the only check on the memory
4306                  * destination for long mode!
4307                  */
4308                 exn = is_noncanonical_address(*ret, vcpu);
4309         } else {
4310                 /*
4311                  * When not in long mode, the virtual/linear address is
4312                  * unconditionally truncated to 32 bits regardless of the
4313                  * address size.
4314                  */
4315                 *ret = (s.base + off) & 0xffffffff;
4316
4317                 /* Protected mode: apply checks for segment validity in the
4318                  * following order:
4319                  * - segment type check (#GP(0) may be thrown)
4320                  * - usability check (#GP(0)/#SS(0))
4321                  * - limit check (#GP(0)/#SS(0))
4322                  */
4323                 if (wr)
4324                         /* #GP(0) if the destination operand is located in a
4325                          * read-only data segment or any code segment.
4326                          */
4327                         exn = ((s.type & 0xa) == 0 || (s.type & 8));
4328                 else
4329                         /* #GP(0) if the source operand is located in an
4330                          * execute-only code segment
4331                          */
4332                         exn = ((s.type & 0xa) == 8);
4333                 if (exn) {
4334                         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
4335                         return 1;
4336                 }
4337                 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
4338                  */
4339                 exn = (s.unusable != 0);
4340
4341                 /*
4342                  * Protected mode: #GP(0)/#SS(0) if the memory operand is
4343                  * outside the segment limit.  All CPUs that support VMX ignore
4344                  * limit checks for flat segments, i.e. segments with base==0,
4345                  * limit==0xffffffff and of type expand-up data or code.
4346                  */
4347                 if (!(s.base == 0 && s.limit == 0xffffffff &&
4348                      ((s.type & 8) || !(s.type & 4))))
4349                         exn = exn || ((u64)off + len - 1 > s.limit);
4350         }
4351         if (exn) {
4352                 kvm_queue_exception_e(vcpu,
4353                                       seg_reg == VCPU_SREG_SS ?
4354                                                 SS_VECTOR : GP_VECTOR,
4355                                       0);
4356                 return 1;
4357         }
4358
4359         return 0;
4360 }
4361
4362 void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
4363 {
4364         struct vcpu_vmx *vmx;
4365
4366         if (!nested_vmx_allowed(vcpu))
4367                 return;
4368
4369         vmx = to_vmx(vcpu);
4370         if (kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) {
4371                 vmx->nested.msrs.entry_ctls_high |=
4372                                 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4373                 vmx->nested.msrs.exit_ctls_high |=
4374                                 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
4375         } else {
4376                 vmx->nested.msrs.entry_ctls_high &=
4377                                 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4378                 vmx->nested.msrs.exit_ctls_high &=
4379                                 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4380         }
4381 }
4382
4383 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
4384 {
4385         gva_t gva;
4386         struct x86_exception e;
4387
4388         if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
4389                                 vmcs_read32(VMX_INSTRUCTION_INFO), false,
4390                                 sizeof(*vmpointer), &gva))
4391                 return 1;
4392
4393         if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
4394                 kvm_inject_page_fault(vcpu, &e);
4395                 return 1;
4396         }
4397
4398         return 0;
4399 }
4400
4401 /*
4402  * Allocate a shadow VMCS and associate it with the currently loaded
4403  * VMCS, unless such a shadow VMCS already exists. The newly allocated
4404  * VMCS is also VMCLEARed, so that it is ready for use.
4405  */
4406 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
4407 {
4408         struct vcpu_vmx *vmx = to_vmx(vcpu);
4409         struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
4410
4411         /*
4412          * We should allocate a shadow vmcs for vmcs01 only when L1
4413          * executes VMXON and free it when L1 executes VMXOFF.
4414          * As it is invalid to execute VMXON twice, we shouldn't reach
4415          * here when vmcs01 already have an allocated shadow vmcs.
4416          */
4417         WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
4418
4419         if (!loaded_vmcs->shadow_vmcs) {
4420                 loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
4421                 if (loaded_vmcs->shadow_vmcs)
4422                         vmcs_clear(loaded_vmcs->shadow_vmcs);
4423         }
4424         return loaded_vmcs->shadow_vmcs;
4425 }
4426
4427 static int enter_vmx_operation(struct kvm_vcpu *vcpu)
4428 {
4429         struct vcpu_vmx *vmx = to_vmx(vcpu);
4430         int r;
4431
4432         r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
4433         if (r < 0)
4434                 goto out_vmcs02;
4435
4436         vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4437         if (!vmx->nested.cached_vmcs12)
4438                 goto out_cached_vmcs12;
4439
4440         vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4441         if (!vmx->nested.cached_shadow_vmcs12)
4442                 goto out_cached_shadow_vmcs12;
4443
4444         if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
4445                 goto out_shadow_vmcs;
4446
4447         hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
4448                      HRTIMER_MODE_REL_PINNED);
4449         vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
4450
4451         vmx->nested.vpid02 = allocate_vpid();
4452
4453         vmx->nested.vmcs02_initialized = false;
4454         vmx->nested.vmxon = true;
4455
4456         if (pt_mode == PT_MODE_HOST_GUEST) {
4457                 vmx->pt_desc.guest.ctl = 0;
4458                 pt_update_intercept_for_msr(vmx);
4459         }
4460
4461         return 0;
4462
4463 out_shadow_vmcs:
4464         kfree(vmx->nested.cached_shadow_vmcs12);
4465
4466 out_cached_shadow_vmcs12:
4467         kfree(vmx->nested.cached_vmcs12);
4468
4469 out_cached_vmcs12:
4470         free_loaded_vmcs(&vmx->nested.vmcs02);
4471
4472 out_vmcs02:
4473         return -ENOMEM;
4474 }
4475
4476 /*
4477  * Emulate the VMXON instruction.
4478  * Currently, we just remember that VMX is active, and do not save or even
4479  * inspect the argument to VMXON (the so-called "VMXON pointer") because we
4480  * do not currently need to store anything in that guest-allocated memory
4481  * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their
4482  * argument is different from the VMXON pointer (which the spec says they do).
4483  */
4484 static int handle_vmon(struct kvm_vcpu *vcpu)
4485 {
4486         int ret;
4487         gpa_t vmptr;
4488         uint32_t revision;
4489         struct vcpu_vmx *vmx = to_vmx(vcpu);
4490         const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
4491                 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
4492
4493         /*
4494          * The Intel VMX Instruction Reference lists a bunch of bits that are
4495          * prerequisite to running VMXON, most notably cr4.VMXE must be set to
4496          * 1 (see vmx_set_cr4() for when we allow the guest to set this).
4497          * Otherwise, we should fail with #UD.  But most faulting conditions
4498          * have already been checked by hardware, prior to the VM-exit for
4499          * VMXON.  We do test guest cr4.VMXE because processor CR4 always has
4500          * that bit set to 1 in non-root mode.
4501          */
4502         if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
4503                 kvm_queue_exception(vcpu, UD_VECTOR);
4504                 return 1;
4505         }
4506
4507         /* CPL=0 must be checked manually. */
4508         if (vmx_get_cpl(vcpu)) {
4509                 kvm_inject_gp(vcpu, 0);
4510                 return 1;
4511         }
4512
4513         if (vmx->nested.vmxon)
4514                 return nested_vmx_failValid(vcpu,
4515                         VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
4516
4517         if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
4518                         != VMXON_NEEDED_FEATURES) {
4519                 kvm_inject_gp(vcpu, 0);
4520                 return 1;
4521         }
4522
4523         if (nested_vmx_get_vmptr(vcpu, &vmptr))
4524                 return 1;
4525
4526         /*
4527          * SDM 3: 24.11.5
4528          * The first 4 bytes of VMXON region contain the supported
4529          * VMCS revision identifier
4530          *
4531          * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
4532          * which replaces physical address width with 32
4533          */
4534         if (!page_address_valid(vcpu, vmptr))
4535                 return nested_vmx_failInvalid(vcpu);
4536
4537         if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
4538             revision != VMCS12_REVISION)
4539                 return nested_vmx_failInvalid(vcpu);
4540
4541         vmx->nested.vmxon_ptr = vmptr;
4542         ret = enter_vmx_operation(vcpu);
4543         if (ret)
4544                 return ret;
4545
4546         return nested_vmx_succeed(vcpu);
4547 }
4548
4549 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
4550 {
4551         struct vcpu_vmx *vmx = to_vmx(vcpu);
4552
4553         if (vmx->nested.current_vmptr == -1ull)
4554                 return;
4555
4556         copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
4557
4558         if (enable_shadow_vmcs) {
4559                 /* copy to memory all shadowed fields in case
4560                    they were modified */
4561                 copy_shadow_to_vmcs12(vmx);
4562                 vmx_disable_shadow_vmcs(vmx);
4563         }
4564         vmx->nested.posted_intr_nv = -1;
4565
4566         /* Flush VMCS12 to guest memory */
4567         kvm_vcpu_write_guest_page(vcpu,
4568                                   vmx->nested.current_vmptr >> PAGE_SHIFT,
4569                                   vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
4570
4571         kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
4572
4573         vmx->nested.current_vmptr = -1ull;
4574 }
4575
4576 /* Emulate the VMXOFF instruction */
4577 static int handle_vmoff(struct kvm_vcpu *vcpu)
4578 {
4579         if (!nested_vmx_check_permission(vcpu))
4580                 return 1;
4581
4582         free_nested(vcpu);
4583
4584         /* Process a latched INIT during time CPU was in VMX operation */
4585         kvm_make_request(KVM_REQ_EVENT, vcpu);
4586
4587         return nested_vmx_succeed(vcpu);
4588 }
4589
4590 /* Emulate the VMCLEAR instruction */
4591 static int handle_vmclear(struct kvm_vcpu *vcpu)
4592 {
4593         struct vcpu_vmx *vmx = to_vmx(vcpu);
4594         u32 zero = 0;
4595         gpa_t vmptr;
4596         u64 evmcs_gpa;
4597
4598         if (!nested_vmx_check_permission(vcpu))
4599                 return 1;
4600
4601         if (nested_vmx_get_vmptr(vcpu, &vmptr))
4602                 return 1;
4603
4604         if (!page_address_valid(vcpu, vmptr))
4605                 return nested_vmx_failValid(vcpu,
4606                         VMXERR_VMCLEAR_INVALID_ADDRESS);
4607
4608         if (vmptr == vmx->nested.vmxon_ptr)
4609                 return nested_vmx_failValid(vcpu,
4610                         VMXERR_VMCLEAR_VMXON_POINTER);
4611
4612         /*
4613          * When Enlightened VMEntry is enabled on the calling CPU we treat
4614          * memory area pointer by vmptr as Enlightened VMCS (as there's no good
4615          * way to distinguish it from VMCS12) and we must not corrupt it by
4616          * writing to the non-existent 'launch_state' field. The area doesn't
4617          * have to be the currently active EVMCS on the calling CPU and there's
4618          * nothing KVM has to do to transition it from 'active' to 'non-active'
4619          * state. It is possible that the area will stay mapped as
4620          * vmx->nested.hv_evmcs but this shouldn't be a problem.
4621          */
4622         if (likely(!vmx->nested.enlightened_vmcs_enabled ||
4623                    !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
4624                 if (vmptr == vmx->nested.current_vmptr)
4625                         nested_release_vmcs12(vcpu);
4626
4627                 kvm_vcpu_write_guest(vcpu,
4628                                      vmptr + offsetof(struct vmcs12,
4629                                                       launch_state),
4630                                      &zero, sizeof(zero));
4631         }
4632
4633         return nested_vmx_succeed(vcpu);
4634 }
4635
4636 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
4637
4638 /* Emulate the VMLAUNCH instruction */
4639 static int handle_vmlaunch(struct kvm_vcpu *vcpu)
4640 {
4641         return nested_vmx_run(vcpu, true);
4642 }
4643
4644 /* Emulate the VMRESUME instruction */
4645 static int handle_vmresume(struct kvm_vcpu *vcpu)
4646 {
4647
4648         return nested_vmx_run(vcpu, false);
4649 }
4650
4651 static int handle_vmread(struct kvm_vcpu *vcpu)
4652 {
4653         unsigned long field;
4654         u64 field_value;
4655         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4656         u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4657         int len;
4658         gva_t gva = 0;
4659         struct vmcs12 *vmcs12;
4660         struct x86_exception e;
4661         short offset;
4662
4663         if (!nested_vmx_check_permission(vcpu))
4664                 return 1;
4665
4666         if (to_vmx(vcpu)->nested.current_vmptr == -1ull)
4667                 return nested_vmx_failInvalid(vcpu);
4668
4669         if (!is_guest_mode(vcpu))
4670                 vmcs12 = get_vmcs12(vcpu);
4671         else {
4672                 /*
4673                  * When vmcs->vmcs_link_pointer is -1ull, any VMREAD
4674                  * to shadowed-field sets the ALU flags for VMfailInvalid.
4675                  */
4676                 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
4677                         return nested_vmx_failInvalid(vcpu);
4678                 vmcs12 = get_shadow_vmcs12(vcpu);
4679         }
4680
4681         /* Decode instruction info and find the field to read */
4682         field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
4683
4684         offset = vmcs_field_to_offset(field);
4685         if (offset < 0)
4686                 return nested_vmx_failValid(vcpu,
4687                         VMXERR_UNSUPPORTED_VMCS_COMPONENT);
4688
4689         if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
4690                 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4691
4692         /* Read the field, zero-extended to a u64 field_value */
4693         field_value = vmcs12_read_any(vmcs12, field, offset);
4694
4695         /*
4696          * Now copy part of this value to register or memory, as requested.
4697          * Note that the number of bits actually copied is 32 or 64 depending
4698          * on the guest's mode (32 or 64 bit), not on the given field's length.
4699          */
4700         if (vmx_instruction_info & (1u << 10)) {
4701                 kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
4702                         field_value);
4703         } else {
4704                 len = is_64_bit_mode(vcpu) ? 8 : 4;
4705                 if (get_vmx_mem_address(vcpu, exit_qualification,
4706                                 vmx_instruction_info, true, len, &gva))
4707                         return 1;
4708                 /* _system ok, nested_vmx_check_permission has verified cpl=0 */
4709                 if (kvm_write_guest_virt_system(vcpu, gva, &field_value, len, &e))
4710                         kvm_inject_page_fault(vcpu, &e);
4711         }
4712
4713         return nested_vmx_succeed(vcpu);
4714 }
4715
4716 static bool is_shadow_field_rw(unsigned long field)
4717 {
4718         switch (field) {
4719 #define SHADOW_FIELD_RW(x, y) case x:
4720 #include "vmcs_shadow_fields.h"
4721                 return true;
4722         default:
4723                 break;
4724         }
4725         return false;
4726 }
4727
4728 static bool is_shadow_field_ro(unsigned long field)
4729 {
4730         switch (field) {
4731 #define SHADOW_FIELD_RO(x, y) case x:
4732 #include "vmcs_shadow_fields.h"
4733                 return true;
4734         default:
4735                 break;
4736         }
4737         return false;
4738 }
4739
4740 static int handle_vmwrite(struct kvm_vcpu *vcpu)
4741 {
4742         unsigned long field;
4743         int len;
4744         gva_t gva;
4745         struct vcpu_vmx *vmx = to_vmx(vcpu);
4746         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4747         u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4748
4749         /* The value to write might be 32 or 64 bits, depending on L1's long
4750          * mode, and eventually we need to write that into a field of several
4751          * possible lengths. The code below first zero-extends the value to 64
4752          * bit (field_value), and then copies only the appropriate number of
4753          * bits into the vmcs12 field.
4754          */
4755         u64 field_value = 0;
4756         struct x86_exception e;
4757         struct vmcs12 *vmcs12;
4758         short offset;
4759
4760         if (!nested_vmx_check_permission(vcpu))
4761                 return 1;
4762
4763         if (vmx->nested.current_vmptr == -1ull)
4764                 return nested_vmx_failInvalid(vcpu);
4765
4766         if (vmx_instruction_info & (1u << 10))
4767                 field_value = kvm_register_readl(vcpu,
4768                         (((vmx_instruction_info) >> 3) & 0xf));
4769         else {
4770                 len = is_64_bit_mode(vcpu) ? 8 : 4;
4771                 if (get_vmx_mem_address(vcpu, exit_qualification,
4772                                 vmx_instruction_info, false, len, &gva))
4773                         return 1;
4774                 if (kvm_read_guest_virt(vcpu, gva, &field_value, len, &e)) {
4775                         kvm_inject_page_fault(vcpu, &e);
4776                         return 1;
4777                 }
4778         }
4779
4780
4781         field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
4782         /*
4783          * If the vCPU supports "VMWRITE to any supported field in the
4784          * VMCS," then the "read-only" fields are actually read/write.
4785          */
4786         if (vmcs_field_readonly(field) &&
4787             !nested_cpu_has_vmwrite_any_field(vcpu))
4788                 return nested_vmx_failValid(vcpu,
4789                         VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
4790
4791         if (!is_guest_mode(vcpu)) {
4792                 vmcs12 = get_vmcs12(vcpu);
4793
4794                 /*
4795                  * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
4796                  * vmcs12, else we may crush a field or consume a stale value.
4797                  */
4798                 if (!is_shadow_field_rw(field))
4799                         copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4800         } else {
4801                 /*
4802                  * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
4803                  * to shadowed-field sets the ALU flags for VMfailInvalid.
4804                  */
4805                 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
4806                         return nested_vmx_failInvalid(vcpu);
4807                 vmcs12 = get_shadow_vmcs12(vcpu);
4808         }
4809
4810         offset = vmcs_field_to_offset(field);
4811         if (offset < 0)
4812                 return nested_vmx_failValid(vcpu,
4813                         VMXERR_UNSUPPORTED_VMCS_COMPONENT);
4814
4815         /*
4816          * Some Intel CPUs intentionally drop the reserved bits of the AR byte
4817          * fields on VMWRITE.  Emulate this behavior to ensure consistent KVM
4818          * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
4819          * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
4820          * from L1 will return a different value than VMREAD from L2 (L1 sees
4821          * the stripped down value, L2 sees the full value as stored by KVM).
4822          */
4823         if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
4824                 field_value &= 0x1f0ff;
4825
4826         vmcs12_write_any(vmcs12, field, offset, field_value);
4827
4828         /*
4829          * Do not track vmcs12 dirty-state if in guest-mode as we actually
4830          * dirty shadow vmcs12 instead of vmcs12.  Fields that can be updated
4831          * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
4832          * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
4833          */
4834         if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
4835                 /*
4836                  * L1 can read these fields without exiting, ensure the
4837                  * shadow VMCS is up-to-date.
4838                  */
4839                 if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
4840                         preempt_disable();
4841                         vmcs_load(vmx->vmcs01.shadow_vmcs);
4842
4843                         __vmcs_writel(field, field_value);
4844
4845                         vmcs_clear(vmx->vmcs01.shadow_vmcs);
4846                         vmcs_load(vmx->loaded_vmcs->vmcs);
4847                         preempt_enable();
4848                 }
4849                 vmx->nested.dirty_vmcs12 = true;
4850         }
4851
4852         return nested_vmx_succeed(vcpu);
4853 }
4854
4855 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
4856 {
4857         vmx->nested.current_vmptr = vmptr;
4858         if (enable_shadow_vmcs) {
4859                 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
4860                 vmcs_write64(VMCS_LINK_POINTER,
4861                              __pa(vmx->vmcs01.shadow_vmcs));
4862                 vmx->nested.need_vmcs12_to_shadow_sync = true;
4863         }
4864         vmx->nested.dirty_vmcs12 = true;
4865 }
4866
4867 /* Emulate the VMPTRLD instruction */
4868 static int handle_vmptrld(struct kvm_vcpu *vcpu)
4869 {
4870         struct vcpu_vmx *vmx = to_vmx(vcpu);
4871         gpa_t vmptr;
4872
4873         if (!nested_vmx_check_permission(vcpu))
4874                 return 1;
4875
4876         if (nested_vmx_get_vmptr(vcpu, &vmptr))
4877                 return 1;
4878
4879         if (!page_address_valid(vcpu, vmptr))
4880                 return nested_vmx_failValid(vcpu,
4881                         VMXERR_VMPTRLD_INVALID_ADDRESS);
4882
4883         if (vmptr == vmx->nested.vmxon_ptr)
4884                 return nested_vmx_failValid(vcpu,
4885                         VMXERR_VMPTRLD_VMXON_POINTER);
4886
4887         /* Forbid normal VMPTRLD if Enlightened version was used */
4888         if (vmx->nested.hv_evmcs)
4889                 return 1;
4890
4891         if (vmx->nested.current_vmptr != vmptr) {
4892                 struct kvm_host_map map;
4893                 struct vmcs12 *new_vmcs12;
4894
4895                 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) {
4896                         /*
4897                          * Reads from an unbacked page return all 1s,
4898                          * which means that the 32 bits located at the
4899                          * given physical address won't match the required
4900                          * VMCS12_REVISION identifier.
4901                          */
4902                         return nested_vmx_failValid(vcpu,
4903                                 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
4904                 }
4905
4906                 new_vmcs12 = map.hva;
4907
4908                 if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
4909                     (new_vmcs12->hdr.shadow_vmcs &&
4910                      !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
4911                         kvm_vcpu_unmap(vcpu, &map, false);
4912                         return nested_vmx_failValid(vcpu,
4913                                 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
4914                 }
4915
4916                 nested_release_vmcs12(vcpu);
4917
4918                 /*
4919                  * Load VMCS12 from guest memory since it is not already
4920                  * cached.
4921                  */
4922                 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
4923                 kvm_vcpu_unmap(vcpu, &map, false);
4924
4925                 set_current_vmptr(vmx, vmptr);
4926         }
4927
4928         return nested_vmx_succeed(vcpu);
4929 }
4930
4931 /* Emulate the VMPTRST instruction */
4932 static int handle_vmptrst(struct kvm_vcpu *vcpu)
4933 {
4934         unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION);
4935         u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4936         gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
4937         struct x86_exception e;
4938         gva_t gva;
4939
4940         if (!nested_vmx_check_permission(vcpu))
4941                 return 1;
4942
4943         if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
4944                 return 1;
4945
4946         if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
4947                                 true, sizeof(gpa_t), &gva))
4948                 return 1;
4949         /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
4950         if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
4951                                         sizeof(gpa_t), &e)) {
4952                 kvm_inject_page_fault(vcpu, &e);
4953                 return 1;
4954         }
4955         return nested_vmx_succeed(vcpu);
4956 }
4957
4958 /* Emulate the INVEPT instruction */
4959 static int handle_invept(struct kvm_vcpu *vcpu)
4960 {
4961         struct vcpu_vmx *vmx = to_vmx(vcpu);
4962         u32 vmx_instruction_info, types;
4963         unsigned long type;
4964         gva_t gva;
4965         struct x86_exception e;
4966         struct {
4967                 u64 eptp, gpa;
4968         } operand;
4969
4970         if (!(vmx->nested.msrs.secondary_ctls_high &
4971               SECONDARY_EXEC_ENABLE_EPT) ||
4972             !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
4973                 kvm_queue_exception(vcpu, UD_VECTOR);
4974                 return 1;
4975         }
4976
4977         if (!nested_vmx_check_permission(vcpu))
4978                 return 1;
4979
4980         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4981         type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
4982
4983         types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
4984
4985         if (type >= 32 || !(types & (1 << type)))
4986                 return nested_vmx_failValid(vcpu,
4987                                 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
4988
4989         /* According to the Intel VMX instruction reference, the memory
4990          * operand is read even if it isn't needed (e.g., for type==global)
4991          */
4992         if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
4993                         vmx_instruction_info, false, sizeof(operand), &gva))
4994                 return 1;
4995         if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
4996                 kvm_inject_page_fault(vcpu, &e);
4997                 return 1;
4998         }
4999
5000         switch (type) {
5001         case VMX_EPT_EXTENT_GLOBAL:
5002         case VMX_EPT_EXTENT_CONTEXT:
5003         /*
5004          * TODO: Sync the necessary shadow EPT roots here, rather than
5005          * at the next emulated VM-entry.
5006          */
5007                 break;
5008         default:
5009                 BUG_ON(1);
5010                 break;
5011         }
5012
5013         return nested_vmx_succeed(vcpu);
5014 }
5015
5016 static int handle_invvpid(struct kvm_vcpu *vcpu)
5017 {
5018         struct vcpu_vmx *vmx = to_vmx(vcpu);
5019         u32 vmx_instruction_info;
5020         unsigned long type, types;
5021         gva_t gva;
5022         struct x86_exception e;
5023         struct {
5024                 u64 vpid;
5025                 u64 gla;
5026         } operand;
5027         u16 vpid02;
5028
5029         if (!(vmx->nested.msrs.secondary_ctls_high &
5030               SECONDARY_EXEC_ENABLE_VPID) ||
5031                         !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
5032                 kvm_queue_exception(vcpu, UD_VECTOR);
5033                 return 1;
5034         }
5035
5036         if (!nested_vmx_check_permission(vcpu))
5037                 return 1;
5038
5039         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5040         type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
5041
5042         types = (vmx->nested.msrs.vpid_caps &
5043                         VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
5044
5045         if (type >= 32 || !(types & (1 << type)))
5046                 return nested_vmx_failValid(vcpu,
5047                         VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5048
5049         /* according to the intel vmx instruction reference, the memory
5050          * operand is read even if it isn't needed (e.g., for type==global)
5051          */
5052         if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
5053                         vmx_instruction_info, false, sizeof(operand), &gva))
5054                 return 1;
5055         if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
5056                 kvm_inject_page_fault(vcpu, &e);
5057                 return 1;
5058         }
5059         if (operand.vpid >> 16)
5060                 return nested_vmx_failValid(vcpu,
5061                         VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5062
5063         vpid02 = nested_get_vpid02(vcpu);
5064         switch (type) {
5065         case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
5066                 if (!operand.vpid ||
5067                     is_noncanonical_address(operand.gla, vcpu))
5068                         return nested_vmx_failValid(vcpu,
5069                                 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5070                 if (cpu_has_vmx_invvpid_individual_addr()) {
5071                         __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
5072                                 vpid02, operand.gla);
5073                 } else
5074                         __vmx_flush_tlb(vcpu, vpid02, false);
5075                 break;
5076         case VMX_VPID_EXTENT_SINGLE_CONTEXT:
5077         case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
5078                 if (!operand.vpid)
5079                         return nested_vmx_failValid(vcpu,
5080                                 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5081                 __vmx_flush_tlb(vcpu, vpid02, false);
5082                 break;
5083         case VMX_VPID_EXTENT_ALL_CONTEXT:
5084                 __vmx_flush_tlb(vcpu, vpid02, false);
5085                 break;
5086         default:
5087                 WARN_ON_ONCE(1);
5088                 return kvm_skip_emulated_instruction(vcpu);
5089         }
5090
5091         return nested_vmx_succeed(vcpu);
5092 }
5093
5094 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
5095                                      struct vmcs12 *vmcs12)
5096 {
5097         u32 index = kvm_rcx_read(vcpu);
5098         u64 address;
5099         bool accessed_dirty;
5100         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
5101
5102         if (!nested_cpu_has_eptp_switching(vmcs12) ||
5103             !nested_cpu_has_ept(vmcs12))
5104                 return 1;
5105
5106         if (index >= VMFUNC_EPTP_ENTRIES)
5107                 return 1;
5108
5109
5110         if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
5111                                      &address, index * 8, 8))
5112                 return 1;
5113
5114         accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);
5115
5116         /*
5117          * If the (L2) guest does a vmfunc to the currently
5118          * active ept pointer, we don't have to do anything else
5119          */
5120         if (vmcs12->ept_pointer != address) {
5121                 if (!valid_ept_address(vcpu, address))
5122                         return 1;
5123
5124                 kvm_mmu_unload(vcpu);
5125                 mmu->ept_ad = accessed_dirty;
5126                 mmu->mmu_role.base.ad_disabled = !accessed_dirty;
5127                 vmcs12->ept_pointer = address;
5128                 /*
5129                  * TODO: Check what's the correct approach in case
5130                  * mmu reload fails. Currently, we just let the next
5131                  * reload potentially fail
5132                  */
5133                 kvm_mmu_reload(vcpu);
5134         }
5135
5136         return 0;
5137 }
5138
5139 static int handle_vmfunc(struct kvm_vcpu *vcpu)
5140 {
5141         struct vcpu_vmx *vmx = to_vmx(vcpu);
5142         struct vmcs12 *vmcs12;
5143         u32 function = kvm_rax_read(vcpu);
5144
5145         /*
5146          * VMFUNC is only supported for nested guests, but we always enable the
5147          * secondary control for simplicity; for non-nested mode, fake that we
5148          * didn't by injecting #UD.
5149          */
5150         if (!is_guest_mode(vcpu)) {
5151                 kvm_queue_exception(vcpu, UD_VECTOR);
5152                 return 1;
5153         }
5154
5155         vmcs12 = get_vmcs12(vcpu);
5156         if ((vmcs12->vm_function_control & (1 << function)) == 0)
5157                 goto fail;
5158
5159         switch (function) {
5160         case 0:
5161                 if (nested_vmx_eptp_switching(vcpu, vmcs12))
5162                         goto fail;
5163                 break;
5164         default:
5165                 goto fail;
5166         }
5167         return kvm_skip_emulated_instruction(vcpu);
5168
5169 fail:
5170         nested_vmx_vmexit(vcpu, vmx->exit_reason,
5171                           vmcs_read32(VM_EXIT_INTR_INFO),
5172                           vmcs_readl(EXIT_QUALIFICATION));
5173         return 1;
5174 }
5175
5176
5177 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
5178                                        struct vmcs12 *vmcs12)
5179 {
5180         unsigned long exit_qualification;
5181         gpa_t bitmap, last_bitmap;
5182         unsigned int port;
5183         int size;
5184         u8 b;
5185
5186         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
5187                 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
5188
5189         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5190
5191         port = exit_qualification >> 16;
5192         size = (exit_qualification & 7) + 1;
5193
5194         last_bitmap = (gpa_t)-1;
5195         b = -1;
5196
5197         while (size > 0) {
5198                 if (port < 0x8000)
5199                         bitmap = vmcs12->io_bitmap_a;
5200                 else if (port < 0x10000)
5201                         bitmap = vmcs12->io_bitmap_b;
5202                 else
5203                         return true;
5204                 bitmap += (port & 0x7fff) / 8;
5205
5206                 if (last_bitmap != bitmap)
5207                         if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
5208                                 return true;
5209                 if (b & (1 << (port & 7)))
5210                         return true;
5211
5212                 port++;
5213                 size--;
5214                 last_bitmap = bitmap;
5215         }
5216
5217         return false;
5218 }
5219
5220 /*
5221  * Return 1 if we should exit from L2 to L1 to handle an MSR access access,
5222  * rather than handle it ourselves in L0. I.e., check whether L1 expressed
5223  * disinterest in the current event (read or write a specific MSR) by using an
5224  * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
5225  */
5226 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
5227         struct vmcs12 *vmcs12, u32 exit_reason)
5228 {
5229         u32 msr_index = kvm_rcx_read(vcpu);
5230         gpa_t bitmap;
5231
5232         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
5233                 return true;
5234
5235         /*
5236          * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
5237          * for the four combinations of read/write and low/high MSR numbers.
5238          * First we need to figure out which of the four to use:
5239          */
5240         bitmap = vmcs12->msr_bitmap;
5241         if (exit_reason == EXIT_REASON_MSR_WRITE)
5242                 bitmap += 2048;
5243         if (msr_index >= 0xc0000000) {
5244                 msr_index -= 0xc0000000;
5245                 bitmap += 1024;
5246         }
5247
5248         /* Then read the msr_index'th bit from this bitmap: */
5249         if (msr_index < 1024*8) {
5250                 unsigned char b;
5251                 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
5252                         return true;
5253                 return 1 & (b >> (msr_index & 7));
5254         } else
5255                 return true; /* let L1 handle the wrong parameter */
5256 }
5257
5258 /*
5259  * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
5260  * rather than handle it ourselves in L0. I.e., check if L1 wanted to
5261  * intercept (via guest_host_mask etc.) the current event.
5262  */
5263 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
5264         struct vmcs12 *vmcs12)
5265 {
5266         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5267         int cr = exit_qualification & 15;
5268         int reg;
5269         unsigned long val;
5270
5271         switch ((exit_qualification >> 4) & 3) {
5272         case 0: /* mov to cr */
5273                 reg = (exit_qualification >> 8) & 15;
5274                 val = kvm_register_readl(vcpu, reg);
5275                 switch (cr) {
5276                 case 0:
5277                         if (vmcs12->cr0_guest_host_mask &
5278                             (val ^ vmcs12->cr0_read_shadow))
5279                                 return true;
5280                         break;
5281                 case 3:
5282                         if ((vmcs12->cr3_target_count >= 1 &&
5283                                         vmcs12->cr3_target_value0 == val) ||
5284                                 (vmcs12->cr3_target_count >= 2 &&
5285                                         vmcs12->cr3_target_value1 == val) ||
5286                                 (vmcs12->cr3_target_count >= 3 &&
5287                                         vmcs12->cr3_target_value2 == val) ||
5288                                 (vmcs12->cr3_target_count >= 4 &&
5289                                         vmcs12->cr3_target_value3 == val))
5290                                 return false;
5291                         if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
5292                                 return true;
5293                         break;
5294                 case 4:
5295                         if (vmcs12->cr4_guest_host_mask &
5296                             (vmcs12->cr4_read_shadow ^ val))
5297                                 return true;
5298                         break;
5299                 case 8:
5300                         if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
5301                                 return true;
5302                         break;
5303                 }
5304                 break;
5305         case 2: /* clts */
5306                 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
5307                     (vmcs12->cr0_read_shadow & X86_CR0_TS))
5308                         return true;
5309                 break;
5310         case 1: /* mov from cr */
5311                 switch (cr) {
5312                 case 3:
5313                         if (vmcs12->cpu_based_vm_exec_control &
5314                             CPU_BASED_CR3_STORE_EXITING)
5315                                 return true;
5316                         break;
5317                 case 8:
5318                         if (vmcs12->cpu_based_vm_exec_control &
5319                             CPU_BASED_CR8_STORE_EXITING)
5320                                 return true;
5321                         break;
5322                 }
5323                 break;
5324         case 3: /* lmsw */
5325                 /*
5326                  * lmsw can change bits 1..3 of cr0, and only set bit 0 of
5327                  * cr0. Other attempted changes are ignored, with no exit.
5328                  */
5329                 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5330                 if (vmcs12->cr0_guest_host_mask & 0xe &
5331                     (val ^ vmcs12->cr0_read_shadow))
5332                         return true;
5333                 if ((vmcs12->cr0_guest_host_mask & 0x1) &&
5334                     !(vmcs12->cr0_read_shadow & 0x1) &&
5335                     (val & 0x1))
5336                         return true;
5337                 break;
5338         }
5339         return false;
5340 }
5341
5342 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
5343         struct vmcs12 *vmcs12, gpa_t bitmap)
5344 {
5345         u32 vmx_instruction_info;
5346         unsigned long field;
5347         u8 b;
5348
5349         if (!nested_cpu_has_shadow_vmcs(vmcs12))
5350                 return true;
5351
5352         /* Decode instruction info and find the field to access */
5353         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5354         field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
5355
5356         /* Out-of-range fields always cause a VM exit from L2 to L1 */
5357         if (field >> 15)
5358                 return true;
5359
5360         if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
5361                 return true;
5362
5363         return 1 & (b >> (field & 7));
5364 }
5365
5366 /*
5367  * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
5368  * should handle it ourselves in L0 (and then continue L2). Only call this
5369  * when in is_guest_mode (L2).
5370  */
5371 bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
5372 {
5373         u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
5374         struct vcpu_vmx *vmx = to_vmx(vcpu);
5375         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5376
5377         if (vmx->nested.nested_run_pending)
5378                 return false;
5379
5380         if (unlikely(vmx->fail)) {
5381                 trace_kvm_nested_vmenter_failed(
5382                         "hardware VM-instruction error: ",
5383                         vmcs_read32(VM_INSTRUCTION_ERROR));
5384                 return true;
5385         }
5386
5387         /*
5388          * The host physical addresses of some pages of guest memory
5389          * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
5390          * Page). The CPU may write to these pages via their host
5391          * physical address while L2 is running, bypassing any
5392          * address-translation-based dirty tracking (e.g. EPT write
5393          * protection).
5394          *
5395          * Mark them dirty on every exit from L2 to prevent them from
5396          * getting out of sync with dirty tracking.
5397          */
5398         nested_mark_vmcs12_pages_dirty(vcpu);
5399
5400         trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
5401                                 vmcs_readl(EXIT_QUALIFICATION),
5402                                 vmx->idt_vectoring_info,
5403                                 intr_info,
5404                                 vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
5405                                 KVM_ISA_VMX);
5406
5407         switch (exit_reason) {
5408         case EXIT_REASON_EXCEPTION_NMI:
5409                 if (is_nmi(intr_info))
5410                         return false;
5411                 else if (is_page_fault(intr_info))
5412                         return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
5413                 else if (is_debug(intr_info) &&
5414                          vcpu->guest_debug &
5415                          (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
5416                         return false;
5417                 else if (is_breakpoint(intr_info) &&
5418                          vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5419                         return false;
5420                 return vmcs12->exception_bitmap &
5421                                 (1u << (intr_info & INTR_INFO_VECTOR_MASK));
5422         case EXIT_REASON_EXTERNAL_INTERRUPT:
5423                 return false;
5424         case EXIT_REASON_TRIPLE_FAULT:
5425                 return true;
5426         case EXIT_REASON_PENDING_INTERRUPT:
5427                 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
5428         case EXIT_REASON_NMI_WINDOW:
5429                 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
5430         case EXIT_REASON_TASK_SWITCH:
5431                 return true;
5432         case EXIT_REASON_CPUID:
5433                 return true;
5434         case EXIT_REASON_HLT:
5435                 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
5436         case EXIT_REASON_INVD:
5437                 return true;
5438         case EXIT_REASON_INVLPG:
5439                 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5440         case EXIT_REASON_RDPMC:
5441                 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
5442         case EXIT_REASON_RDRAND:
5443                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
5444         case EXIT_REASON_RDSEED:
5445                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
5446         case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
5447                 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
5448         case EXIT_REASON_VMREAD:
5449                 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5450                         vmcs12->vmread_bitmap);
5451         case EXIT_REASON_VMWRITE:
5452                 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5453                         vmcs12->vmwrite_bitmap);
5454         case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
5455         case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
5456         case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
5457         case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
5458         case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
5459                 /*
5460                  * VMX instructions trap unconditionally. This allows L1 to
5461                  * emulate them for its L2 guest, i.e., allows 3-level nesting!
5462                  */
5463                 return true;
5464         case EXIT_REASON_CR_ACCESS:
5465                 return nested_vmx_exit_handled_cr(vcpu, vmcs12);
5466         case EXIT_REASON_DR_ACCESS:
5467                 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
5468         case EXIT_REASON_IO_INSTRUCTION:
5469                 return nested_vmx_exit_handled_io(vcpu, vmcs12);
5470         case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
5471                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
5472         case EXIT_REASON_MSR_READ:
5473         case EXIT_REASON_MSR_WRITE:
5474                 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
5475         case EXIT_REASON_INVALID_STATE:
5476                 return true;
5477         case EXIT_REASON_MWAIT_INSTRUCTION:
5478                 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
5479         case EXIT_REASON_MONITOR_TRAP_FLAG:
5480                 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
5481         case EXIT_REASON_MONITOR_INSTRUCTION:
5482                 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
5483         case EXIT_REASON_PAUSE_INSTRUCTION:
5484                 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
5485                         nested_cpu_has2(vmcs12,
5486                                 SECONDARY_EXEC_PAUSE_LOOP_EXITING);
5487         case EXIT_REASON_MCE_DURING_VMENTRY:
5488                 return false;
5489         case EXIT_REASON_TPR_BELOW_THRESHOLD:
5490                 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
5491         case EXIT_REASON_APIC_ACCESS:
5492         case EXIT_REASON_APIC_WRITE:
5493         case EXIT_REASON_EOI_INDUCED:
5494                 /*
5495                  * The controls for "virtualize APIC accesses," "APIC-
5496                  * register virtualization," and "virtual-interrupt
5497                  * delivery" only come from vmcs12.
5498                  */
5499                 return true;
5500         case EXIT_REASON_EPT_VIOLATION:
5501                 /*
5502                  * L0 always deals with the EPT violation. If nested EPT is
5503                  * used, and the nested mmu code discovers that the address is
5504                  * missing in the guest EPT table (EPT12), the EPT violation
5505                  * will be injected with nested_ept_inject_page_fault()
5506                  */
5507                 return false;
5508         case EXIT_REASON_EPT_MISCONFIG:
5509                 /*
5510                  * L2 never uses directly L1's EPT, but rather L0's own EPT
5511                  * table (shadow on EPT) or a merged EPT table that L0 built
5512                  * (EPT on EPT). So any problems with the structure of the
5513                  * table is L0's fault.
5514                  */
5515                 return false;
5516         case EXIT_REASON_INVPCID:
5517                 return
5518                         nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
5519                         nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5520         case EXIT_REASON_WBINVD:
5521                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
5522         case EXIT_REASON_XSETBV:
5523                 return true;
5524         case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
5525                 /*
5526                  * This should never happen, since it is not possible to
5527                  * set XSS to a non-zero value---neither in L1 nor in L2.
5528                  * If if it were, XSS would have to be checked against
5529                  * the XSS exit bitmap in vmcs12.
5530                  */
5531                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
5532         case EXIT_REASON_PREEMPTION_TIMER:
5533                 return false;
5534         case EXIT_REASON_PML_FULL:
5535                 /* We emulate PML support to L1. */
5536                 return false;
5537         case EXIT_REASON_VMFUNC:
5538                 /* VM functions are emulated through L2->L0 vmexits. */
5539                 return false;
5540         case EXIT_REASON_ENCLS:
5541                 /* SGX is never exposed to L1 */
5542                 return false;
5543         case EXIT_REASON_UMWAIT:
5544         case EXIT_REASON_TPAUSE:
5545                 return nested_cpu_has2(vmcs12,
5546                         SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
5547         default:
5548                 return true;
5549         }
5550 }
5551
5552
5553 static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
5554                                 struct kvm_nested_state __user *user_kvm_nested_state,
5555                                 u32 user_data_size)
5556 {
5557         struct vcpu_vmx *vmx;
5558         struct vmcs12 *vmcs12;
5559         struct kvm_nested_state kvm_state = {
5560                 .flags = 0,
5561                 .format = KVM_STATE_NESTED_FORMAT_VMX,
5562                 .size = sizeof(kvm_state),
5563                 .hdr.vmx.vmxon_pa = -1ull,
5564                 .hdr.vmx.vmcs12_pa = -1ull,
5565         };
5566         struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
5567                 &user_kvm_nested_state->data.vmx[0];
5568
5569         if (!vcpu)
5570                 return kvm_state.size + sizeof(*user_vmx_nested_state);
5571
5572         vmx = to_vmx(vcpu);
5573         vmcs12 = get_vmcs12(vcpu);
5574
5575         if (nested_vmx_allowed(vcpu) &&
5576             (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
5577                 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
5578                 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;
5579
5580                 if (vmx_has_valid_vmcs12(vcpu)) {
5581                         kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
5582
5583                         if (vmx->nested.hv_evmcs)
5584                                 kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
5585
5586                         if (is_guest_mode(vcpu) &&
5587                             nested_cpu_has_shadow_vmcs(vmcs12) &&
5588                             vmcs12->vmcs_link_pointer != -1ull)
5589                                 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
5590                 }
5591
5592                 if (vmx->nested.smm.vmxon)
5593                         kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
5594
5595                 if (vmx->nested.smm.guest_mode)
5596                         kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
5597
5598                 if (is_guest_mode(vcpu)) {
5599                         kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
5600
5601                         if (vmx->nested.nested_run_pending)
5602                                 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
5603                 }
5604         }
5605
5606         if (user_data_size < kvm_state.size)
5607                 goto out;
5608
5609         if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
5610                 return -EFAULT;
5611
5612         if (!vmx_has_valid_vmcs12(vcpu))
5613                 goto out;
5614
5615         /*
5616          * When running L2, the authoritative vmcs12 state is in the
5617          * vmcs02. When running L1, the authoritative vmcs12 state is
5618          * in the shadow or enlightened vmcs linked to vmcs01, unless
5619          * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
5620          * vmcs12 state is in the vmcs12 already.
5621          */
5622         if (is_guest_mode(vcpu)) {
5623                 sync_vmcs02_to_vmcs12(vcpu, vmcs12);
5624                 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
5625         } else if (!vmx->nested.need_vmcs12_to_shadow_sync) {
5626                 if (vmx->nested.hv_evmcs)
5627                         copy_enlightened_to_vmcs12(vmx);
5628                 else if (enable_shadow_vmcs)
5629                         copy_shadow_to_vmcs12(vmx);
5630         }
5631
5632         BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
5633         BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);
5634
5635         /*
5636          * Copy over the full allocated size of vmcs12 rather than just the size
5637          * of the struct.
5638          */
5639         if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
5640                 return -EFAULT;
5641
5642         if (nested_cpu_has_shadow_vmcs(vmcs12) &&
5643             vmcs12->vmcs_link_pointer != -1ull) {
5644                 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
5645                                  get_shadow_vmcs12(vcpu), VMCS12_SIZE))
5646                         return -EFAULT;
5647         }
5648
5649 out:
5650         return kvm_state.size;
5651 }
5652
5653 /*
5654  * Forcibly leave nested mode in order to be able to reset the VCPU later on.
5655  */
5656 void vmx_leave_nested(struct kvm_vcpu *vcpu)
5657 {
5658         if (is_guest_mode(vcpu)) {
5659                 to_vmx(vcpu)->nested.nested_run_pending = 0;
5660                 nested_vmx_vmexit(vcpu, -1, 0, 0);
5661         }
5662         free_nested(vcpu);
5663 }
5664
5665 static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
5666                                 struct kvm_nested_state __user *user_kvm_nested_state,
5667                                 struct kvm_nested_state *kvm_state)
5668 {
5669         struct vcpu_vmx *vmx = to_vmx(vcpu);
5670         struct vmcs12 *vmcs12;
5671         u32 exit_qual;
5672         struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
5673                 &user_kvm_nested_state->data.vmx[0];
5674         int ret;
5675
5676         if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
5677                 return -EINVAL;
5678
5679         if (kvm_state->hdr.vmx.vmxon_pa == -1ull) {
5680                 if (kvm_state->hdr.vmx.smm.flags)
5681                         return -EINVAL;
5682
5683                 if (kvm_state->hdr.vmx.vmcs12_pa != -1ull)
5684                         return -EINVAL;
5685
5686                 /*
5687                  * KVM_STATE_NESTED_EVMCS used to signal that KVM should
5688                  * enable eVMCS capability on vCPU. However, since then
5689                  * code was changed such that flag signals vmcs12 should
5690                  * be copied into eVMCS in guest memory.
5691                  *
5692                  * To preserve backwards compatability, allow user
5693                  * to set this flag even when there is no VMXON region.
5694                  */
5695                 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
5696                         return -EINVAL;
5697         } else {
5698                 if (!nested_vmx_allowed(vcpu))
5699                         return -EINVAL;
5700
5701                 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
5702                         return -EINVAL;
5703         }
5704
5705         if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
5706             (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
5707                 return -EINVAL;
5708
5709         if (kvm_state->hdr.vmx.smm.flags &
5710             ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
5711                 return -EINVAL;
5712
5713         /*
5714          * SMM temporarily disables VMX, so we cannot be in guest mode,
5715          * nor can VMLAUNCH/VMRESUME be pending.  Outside SMM, SMM flags
5716          * must be zero.
5717          */
5718         if (is_smm(vcpu) ?
5719                 (kvm_state->flags &
5720                  (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
5721                 : kvm_state->hdr.vmx.smm.flags)
5722                 return -EINVAL;
5723
5724         if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
5725             !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
5726                 return -EINVAL;
5727
5728         if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
5729                 (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled))
5730                         return -EINVAL;
5731
5732         vmx_leave_nested(vcpu);
5733
5734         if (kvm_state->hdr.vmx.vmxon_pa == -1ull)
5735                 return 0;
5736
5737         vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
5738         ret = enter_vmx_operation(vcpu);
5739         if (ret)
5740                 return ret;
5741
5742         /* Empty 'VMXON' state is permitted */
5743         if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12))
5744                 return 0;
5745
5746         if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) {
5747                 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
5748                     !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
5749                         return -EINVAL;
5750
5751                 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
5752         } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
5753                 /*
5754                  * Sync eVMCS upon entry as we may not have
5755                  * HV_X64_MSR_VP_ASSIST_PAGE set up yet.
5756                  */
5757                 vmx->nested.need_vmcs12_to_shadow_sync = true;
5758         } else {
5759                 return -EINVAL;
5760         }
5761
5762         if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
5763                 vmx->nested.smm.vmxon = true;
5764                 vmx->nested.vmxon = false;
5765
5766                 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
5767                         vmx->nested.smm.guest_mode = true;
5768         }
5769
5770         vmcs12 = get_vmcs12(vcpu);
5771         if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
5772                 return -EFAULT;
5773
5774         if (vmcs12->hdr.revision_id != VMCS12_REVISION)
5775                 return -EINVAL;
5776
5777         if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
5778                 return 0;
5779
5780         vmx->nested.nested_run_pending =
5781                 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
5782
5783         ret = -EINVAL;
5784         if (nested_cpu_has_shadow_vmcs(vmcs12) &&
5785             vmcs12->vmcs_link_pointer != -1ull) {
5786                 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
5787
5788                 if (kvm_state->size <
5789                     sizeof(*kvm_state) +
5790                     sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
5791                         goto error_guest_mode;
5792
5793                 if (copy_from_user(shadow_vmcs12,
5794                                    user_vmx_nested_state->shadow_vmcs12,
5795                                    sizeof(*shadow_vmcs12))) {
5796                         ret = -EFAULT;
5797                         goto error_guest_mode;
5798                 }
5799
5800                 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
5801                     !shadow_vmcs12->hdr.shadow_vmcs)
5802                         goto error_guest_mode;
5803         }
5804
5805         if (nested_vmx_check_controls(vcpu, vmcs12) ||
5806             nested_vmx_check_host_state(vcpu, vmcs12) ||
5807             nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
5808                 goto error_guest_mode;
5809
5810         vmx->nested.dirty_vmcs12 = true;
5811         ret = nested_vmx_enter_non_root_mode(vcpu, false);
5812         if (ret)
5813                 goto error_guest_mode;
5814
5815         return 0;
5816
5817 error_guest_mode:
5818         vmx->nested.nested_run_pending = 0;
5819         return ret;
5820 }
5821
5822 void nested_vmx_set_vmcs_shadowing_bitmap(void)
5823 {
5824         if (enable_shadow_vmcs) {
5825                 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
5826                 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
5827         }
5828 }
5829
5830 /*
5831  * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
5832  * returned for the various VMX controls MSRs when nested VMX is enabled.
5833  * The same values should also be used to verify that vmcs12 control fields are
5834  * valid during nested entry from L1 to L2.
5835  * Each of these control msrs has a low and high 32-bit half: A low bit is on
5836  * if the corresponding bit in the (32-bit) control field *must* be on, and a
5837  * bit in the high half is on if the corresponding bit in the control field
5838  * may be on. See also vmx_control_verify().
5839  */
5840 void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
5841                                 bool apicv)
5842 {
5843         /*
5844          * Note that as a general rule, the high half of the MSRs (bits in
5845          * the control fields which may be 1) should be initialized by the
5846          * intersection of the underlying hardware's MSR (i.e., features which
5847          * can be supported) and the list of features we want to expose -
5848          * because they are known to be properly supported in our code.
5849          * Also, usually, the low half of the MSRs (bits which must be 1) can
5850          * be set to 0, meaning that L1 may turn off any of these bits. The
5851          * reason is that if one of these bits is necessary, it will appear
5852          * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
5853          * fields of vmcs01 and vmcs02, will turn these bits off - and
5854          * nested_vmx_exit_reflected() will not pass related exits to L1.
5855          * These rules have exceptions below.
5856          */
5857
5858         /* pin-based controls */
5859         rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
5860                 msrs->pinbased_ctls_low,
5861                 msrs->pinbased_ctls_high);
5862         msrs->pinbased_ctls_low |=
5863                 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
5864         msrs->pinbased_ctls_high &=
5865                 PIN_BASED_EXT_INTR_MASK |
5866                 PIN_BASED_NMI_EXITING |
5867                 PIN_BASED_VIRTUAL_NMIS |
5868                 (apicv ? PIN_BASED_POSTED_INTR : 0);
5869         msrs->pinbased_ctls_high |=
5870                 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
5871                 PIN_BASED_VMX_PREEMPTION_TIMER;
5872
5873         /* exit controls */
5874         rdmsr(MSR_IA32_VMX_EXIT_CTLS,
5875                 msrs->exit_ctls_low,
5876                 msrs->exit_ctls_high);
5877         msrs->exit_ctls_low =
5878                 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
5879
5880         msrs->exit_ctls_high &=
5881 #ifdef CONFIG_X86_64
5882                 VM_EXIT_HOST_ADDR_SPACE_SIZE |
5883 #endif
5884                 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
5885         msrs->exit_ctls_high |=
5886                 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
5887                 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
5888                 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
5889
5890         /* We support free control of debug control saving. */
5891         msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
5892
5893         /* entry controls */
5894         rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
5895                 msrs->entry_ctls_low,
5896                 msrs->entry_ctls_high);
5897         msrs->entry_ctls_low =
5898                 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
5899         msrs->entry_ctls_high &=
5900 #ifdef CONFIG_X86_64
5901                 VM_ENTRY_IA32E_MODE |
5902 #endif
5903                 VM_ENTRY_LOAD_IA32_PAT;
5904         msrs->entry_ctls_high |=
5905                 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
5906
5907         /* We support free control of debug control loading. */
5908         msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
5909
5910         /* cpu-based controls */
5911         rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
5912                 msrs->procbased_ctls_low,
5913                 msrs->procbased_ctls_high);
5914         msrs->procbased_ctls_low =
5915                 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
5916         msrs->procbased_ctls_high &=
5917                 CPU_BASED_VIRTUAL_INTR_PENDING |
5918                 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
5919                 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
5920                 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
5921                 CPU_BASED_CR3_STORE_EXITING |
5922 #ifdef CONFIG_X86_64
5923                 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
5924 #endif
5925                 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
5926                 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
5927                 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
5928                 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
5929                 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
5930         /*
5931          * We can allow some features even when not supported by the
5932          * hardware. For example, L1 can specify an MSR bitmap - and we
5933          * can use it to avoid exits to L1 - even when L0 runs L2
5934          * without MSR bitmaps.
5935          */
5936         msrs->procbased_ctls_high |=
5937                 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
5938                 CPU_BASED_USE_MSR_BITMAPS;
5939
5940         /* We support free control of CR3 access interception. */
5941         msrs->procbased_ctls_low &=
5942                 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
5943
5944         /*
5945          * secondary cpu-based controls.  Do not include those that
5946          * depend on CPUID bits, they are added later by vmx_cpuid_update.
5947          */
5948         if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
5949                 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
5950                       msrs->secondary_ctls_low,
5951                       msrs->secondary_ctls_high);
5952
5953         msrs->secondary_ctls_low = 0;
5954         msrs->secondary_ctls_high &=
5955                 SECONDARY_EXEC_DESC |
5956                 SECONDARY_EXEC_RDTSCP |
5957                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
5958                 SECONDARY_EXEC_WBINVD_EXITING |
5959                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
5960                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
5961                 SECONDARY_EXEC_RDRAND_EXITING |
5962                 SECONDARY_EXEC_ENABLE_INVPCID |
5963                 SECONDARY_EXEC_RDSEED_EXITING |
5964                 SECONDARY_EXEC_XSAVES;
5965
5966         /*
5967          * We can emulate "VMCS shadowing," even if the hardware
5968          * doesn't support it.
5969          */
5970         msrs->secondary_ctls_high |=
5971                 SECONDARY_EXEC_SHADOW_VMCS;
5972
5973         if (enable_ept) {
5974                 /* nested EPT: emulate EPT also to L1 */
5975                 msrs->secondary_ctls_high |=
5976                         SECONDARY_EXEC_ENABLE_EPT;
5977                 msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
5978                          VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
5979                 if (cpu_has_vmx_ept_execute_only())
5980                         msrs->ept_caps |=
5981                                 VMX_EPT_EXECUTE_ONLY_BIT;
5982                 msrs->ept_caps &= ept_caps;
5983                 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
5984                         VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
5985                         VMX_EPT_1GB_PAGE_BIT;
5986                 if (enable_ept_ad_bits) {
5987                         msrs->secondary_ctls_high |=
5988                                 SECONDARY_EXEC_ENABLE_PML;
5989                         msrs->ept_caps |= VMX_EPT_AD_BIT;
5990                 }
5991         }
5992
5993         if (cpu_has_vmx_vmfunc()) {
5994                 msrs->secondary_ctls_high |=
5995                         SECONDARY_EXEC_ENABLE_VMFUNC;
5996                 /*
5997                  * Advertise EPTP switching unconditionally
5998                  * since we emulate it
5999                  */
6000                 if (enable_ept)
6001                         msrs->vmfunc_controls =
6002                                 VMX_VMFUNC_EPTP_SWITCHING;
6003         }
6004
6005         /*
6006          * Old versions of KVM use the single-context version without
6007          * checking for support, so declare that it is supported even
6008          * though it is treated as global context.  The alternative is
6009          * not failing the single-context invvpid, and it is worse.
6010          */
6011         if (enable_vpid) {
6012                 msrs->secondary_ctls_high |=
6013                         SECONDARY_EXEC_ENABLE_VPID;
6014                 msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
6015                         VMX_VPID_EXTENT_SUPPORTED_MASK;
6016         }
6017
6018         if (enable_unrestricted_guest)
6019                 msrs->secondary_ctls_high |=
6020                         SECONDARY_EXEC_UNRESTRICTED_GUEST;
6021
6022         if (flexpriority_enabled)
6023                 msrs->secondary_ctls_high |=
6024                         SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6025
6026         /* miscellaneous data */
6027         rdmsr(MSR_IA32_VMX_MISC,
6028                 msrs->misc_low,
6029                 msrs->misc_high);
6030         msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
6031         msrs->misc_low |=
6032                 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
6033                 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
6034                 VMX_MISC_ACTIVITY_HLT;
6035         msrs->misc_high = 0;
6036
6037         /*
6038          * This MSR reports some information about VMX support. We
6039          * should return information about the VMX we emulate for the
6040          * guest, and the VMCS structure we give it - not about the
6041          * VMX support of the underlying hardware.
6042          */
6043         msrs->basic =
6044                 VMCS12_REVISION |
6045                 VMX_BASIC_TRUE_CTLS |
6046                 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
6047                 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
6048
6049         if (cpu_has_vmx_basic_inout())
6050                 msrs->basic |= VMX_BASIC_INOUT;
6051
6052         /*
6053          * These MSRs specify bits which the guest must keep fixed on
6054          * while L1 is in VMXON mode (in L1's root mode, or running an L2).
6055          * We picked the standard core2 setting.
6056          */
6057 #define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
6058 #define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
6059         msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
6060         msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
6061
6062         /* These MSRs specify bits which the guest must keep fixed off. */
6063         rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
6064         rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
6065
6066         /* highest index: VMX_PREEMPTION_TIMER_VALUE */
6067         msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
6068 }
6069
6070 void nested_vmx_hardware_unsetup(void)
6071 {
6072         int i;
6073
6074         if (enable_shadow_vmcs) {
6075                 for (i = 0; i < VMX_BITMAP_NR; i++)
6076                         free_page((unsigned long)vmx_bitmap[i]);
6077         }
6078 }
6079
6080 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
6081 {
6082         int i;
6083
6084         if (!cpu_has_vmx_shadow_vmcs())
6085                 enable_shadow_vmcs = 0;
6086         if (enable_shadow_vmcs) {
6087                 for (i = 0; i < VMX_BITMAP_NR; i++) {
6088                         /*
6089                          * The vmx_bitmap is not tied to a VM and so should
6090                          * not be charged to a memcg.
6091                          */
6092                         vmx_bitmap[i] = (unsigned long *)
6093                                 __get_free_page(GFP_KERNEL);
6094                         if (!vmx_bitmap[i]) {
6095                                 nested_vmx_hardware_unsetup();
6096                                 return -ENOMEM;
6097                         }
6098                 }
6099
6100                 init_vmcs_shadow_fields();
6101         }
6102
6103         exit_handlers[EXIT_REASON_VMCLEAR]      = handle_vmclear,
6104         exit_handlers[EXIT_REASON_VMLAUNCH]     = handle_vmlaunch,
6105         exit_handlers[EXIT_REASON_VMPTRLD]      = handle_vmptrld,
6106         exit_handlers[EXIT_REASON_VMPTRST]      = handle_vmptrst,
6107         exit_handlers[EXIT_REASON_VMREAD]       = handle_vmread,
6108         exit_handlers[EXIT_REASON_VMRESUME]     = handle_vmresume,
6109         exit_handlers[EXIT_REASON_VMWRITE]      = handle_vmwrite,
6110         exit_handlers[EXIT_REASON_VMOFF]        = handle_vmoff,
6111         exit_handlers[EXIT_REASON_VMON]         = handle_vmon,
6112         exit_handlers[EXIT_REASON_INVEPT]       = handle_invept,
6113         exit_handlers[EXIT_REASON_INVVPID]      = handle_invvpid,
6114         exit_handlers[EXIT_REASON_VMFUNC]       = handle_vmfunc,
6115
6116         kvm_x86_ops->check_nested_events = vmx_check_nested_events;
6117         kvm_x86_ops->get_nested_state = vmx_get_nested_state;
6118         kvm_x86_ops->set_nested_state = vmx_set_nested_state;
6119         kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages,
6120         kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs;
6121         kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version;
6122
6123         return 0;
6124 }