arch/x86/kvm/vmx/nested.c
// SPDX-License-Identifier: GPL-2.0

#include <linux/frame.h>
#include <linux/percpu.h>

#include <asm/debugreg.h>
#include <asm/mmu_context.h>

#include "cpuid.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
#include "trace.h"
#include "x86.h"

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK          \
        (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |  \
        VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |    \
        VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |    \
        VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

enum {
        VMX_VMREAD_BITMAP,
        VMX_VMWRITE_BITMAP,
        VMX_BITMAP_NR
};
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])

struct shadow_vmcs_field {
        u16     encoding;
        u16     offset;
};
static struct shadow_vmcs_field shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields =
        ARRAY_SIZE(shadow_read_only_fields);

static struct shadow_vmcs_field shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields =
        ARRAY_SIZE(shadow_read_write_fields);

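/*
 * Build the VMREAD/VMWRITE bitmaps and compact the shadow field tables:
 * intercepts are cleared only for fields that are actually shadowed, fields
 * whose underlying feature (PML, preemption timer, APICv) is missing on this
 * CPU are dropped, and on 64-bit hosts the high halves of 64-bit fields are
 * not kept in the tables since they are covered by the full-width access.
 */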
static void init_vmcs_shadow_fields(void)
{
        int i, j;

        memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
        memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

        for (i = j = 0; i < max_shadow_read_only_fields; i++) {
                struct shadow_vmcs_field entry = shadow_read_only_fields[i];
                u16 field = entry.encoding;

                if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
                    (i + 1 == max_shadow_read_only_fields ||
                     shadow_read_only_fields[i + 1].encoding != field + 1))
                        pr_err("Missing field from shadow_read_only_field %x\n",
                               field + 1);

                clear_bit(field, vmx_vmread_bitmap);
                if (field & 1)
#ifdef CONFIG_X86_64
                        continue;
#else
                        entry.offset += sizeof(u32);
#endif
                shadow_read_only_fields[j++] = entry;
        }
        max_shadow_read_only_fields = j;

        for (i = j = 0; i < max_shadow_read_write_fields; i++) {
                struct shadow_vmcs_field entry = shadow_read_write_fields[i];
                u16 field = entry.encoding;

                if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
                    (i + 1 == max_shadow_read_write_fields ||
                     shadow_read_write_fields[i + 1].encoding != field + 1))
                        pr_err("Missing field from shadow_read_write_field %x\n",
                               field + 1);

                WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
                          field <= GUEST_TR_AR_BYTES,
                          "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");

                /*
                 * PML and the preemption timer can be emulated, but the
                 * processor cannot vmwrite to fields that don't exist
                 * on bare metal.
                 */
                switch (field) {
                case GUEST_PML_INDEX:
                        if (!cpu_has_vmx_pml())
                                continue;
                        break;
                case VMX_PREEMPTION_TIMER_VALUE:
                        if (!cpu_has_vmx_preemption_timer())
                                continue;
                        break;
                case GUEST_INTR_STATUS:
                        if (!cpu_has_vmx_apicv())
                                continue;
                        break;
                default:
                        break;
                }

                clear_bit(field, vmx_vmwrite_bitmap);
                clear_bit(field, vmx_vmread_bitmap);
                if (field & 1)
#ifdef CONFIG_X86_64
                        continue;
#else
                        entry.offset += sizeof(u32);
#endif
                shadow_read_write_fields[j++] = entry;
        }
        max_shadow_read_write_fields = j;
}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
        vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
                        & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
                            X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
        return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
        vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
                        & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
                            X86_EFLAGS_SF | X86_EFLAGS_OF))
                        | X86_EFLAGS_CF);
        return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
                                u32 vm_instruction_error)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        /*
         * failValid writes the error number to the current VMCS, which
         * can't be done if there isn't a current VMCS.
         */
        if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
                return nested_vmx_failInvalid(vcpu);

        vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
                        & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
                            X86_EFLAGS_SF | X86_EFLAGS_OF))
                        | X86_EFLAGS_ZF);
        get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
        /*
         * We don't need to force a shadow sync because
         * VM_INSTRUCTION_ERROR is not shadowed
         */
        return kvm_skip_emulated_instruction(vcpu);
}

static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
        /* TODO: don't simply reset the guest here. */
        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
        pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
}

static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
        secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
        vmcs_write64(VMCS_LINK_POINTER, -1ull);
}

static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        if (!vmx->nested.hv_evmcs)
                return;

        kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
        vmx->nested.hv_evmcs_vmptr = -1ull;
        vmx->nested.hv_evmcs = NULL;
}

/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
 */
static void free_nested(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
                return;

        vmx->nested.vmxon = false;
        vmx->nested.smm.vmxon = false;
        free_vpid(vmx->nested.vpid02);
        vmx->nested.posted_intr_nv = -1;
        vmx->nested.current_vmptr = -1ull;
        if (enable_shadow_vmcs) {
                vmx_disable_shadow_vmcs(vmx);
                vmcs_clear(vmx->vmcs01.shadow_vmcs);
                free_vmcs(vmx->vmcs01.shadow_vmcs);
                vmx->vmcs01.shadow_vmcs = NULL;
        }
        kfree(vmx->nested.cached_vmcs12);
        kfree(vmx->nested.cached_shadow_vmcs12);
        /* Unpin physical memory we referred to in the vmcs02 */
        if (vmx->nested.apic_access_page) {
                kvm_release_page_dirty(vmx->nested.apic_access_page);
                vmx->nested.apic_access_page = NULL;
        }
        kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
        kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
        vmx->nested.pi_desc = NULL;

        kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

        nested_release_evmcs(vcpu);

        free_loaded_vmcs(&vmx->nested.vmcs02);
}

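/*
 * Propagate the host state that was established for the previously loaded
 * VMCS into the newly loaded VMCS's cached host state, so the two stay in
 * sync when the guest's state is already loaded on this CPU.
 */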
static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
                                     struct loaded_vmcs *prev)
{
        struct vmcs_host_state *dest, *src;

        if (unlikely(!vmx->guest_state_loaded))
                return;

        src = &prev->host_state;
        dest = &vmx->loaded_vmcs->host_state;

        vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
        dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
        dest->ds_sel = src->ds_sel;
        dest->es_sel = src->es_sel;
#endif
}

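/*
 * Switch the vCPU to a different loaded VMCS (e.g. between vmcs01 and
 * vmcs02), reloading it on the current CPU and keeping the cached host state
 * and the segment cache consistent.
 */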
static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct loaded_vmcs *prev;
        int cpu;

        if (vmx->loaded_vmcs == vmcs)
                return;

        cpu = get_cpu();
        prev = vmx->loaded_vmcs;
        vmx->loaded_vmcs = vmcs;
        vmx_vcpu_load_vmcs(vcpu, cpu);
        vmx_sync_vmcs_host_state(vmx, prev);
        put_cpu();

        vmx_segment_cache_clear(vmx);
}

/*
 * Ensure that the current vmcs of the logical processor is the
 * vmcs01 of the vcpu before calling free_nested().
 */
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
        vcpu_load(vcpu);
        vmx_leave_nested(vcpu);
        vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
        free_nested(vcpu);
        vcpu_put(vcpu);
}

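/*
 * Reflect an EPT fault taken by L2 into L1 as the appropriate nested VM exit:
 * PML-full, EPT misconfiguration, or EPT violation, with the faulting guest
 * physical address reported in vmcs12.
 */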
static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
                struct x86_exception *fault)
{
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 exit_reason;
        unsigned long exit_qualification = vcpu->arch.exit_qualification;

        if (vmx->nested.pml_full) {
                exit_reason = EXIT_REASON_PML_FULL;
                vmx->nested.pml_full = false;
                exit_qualification &= INTR_INFO_UNBLOCK_NMI;
        } else if (fault->error_code & PFERR_RSVD_MASK)
                exit_reason = EXIT_REASON_EPT_MISCONFIG;
        else
                exit_reason = EXIT_REASON_EPT_VIOLATION;

        nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
        vmcs12->guest_physical_address = fault->address;
}

static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
        WARN_ON(mmu_is_nested(vcpu));

        vcpu->arch.mmu = &vcpu->arch.guest_mmu;
        kvm_init_shadow_ept_mmu(vcpu,
                        to_vmx(vcpu)->nested.msrs.ept_caps &
                        VMX_EPT_EXECUTE_ONLY_BIT,
                        nested_ept_ad_enabled(vcpu),
                        nested_ept_get_cr3(vcpu));
        vcpu->arch.mmu->set_cr3           = vmx_set_cr3;
        vcpu->arch.mmu->get_cr3           = nested_ept_get_cr3;
        vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
        vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;

        vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
}

static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
        vcpu->arch.mmu = &vcpu->arch.root_mmu;
        vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
                                            u16 error_code)
{
        bool inequality, bit;

        bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
        inequality =
                (error_code & vmcs12->page_fault_error_code_mask) !=
                 vmcs12->page_fault_error_code_match;
        return inequality ^ bit;
}


/*
 * KVM wants to inject page-faults which it received to the guest. This
 * function checks whether, in a nested guest, we need to inject them into
 * L1 or L2.
 */
static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
{
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        unsigned int nr = vcpu->arch.exception.nr;
        bool has_payload = vcpu->arch.exception.has_payload;
        unsigned long payload = vcpu->arch.exception.payload;

        if (nr == PF_VECTOR) {
                if (vcpu->arch.exception.nested_apf) {
                        *exit_qual = vcpu->arch.apf.nested_apf_token;
                        return 1;
                }
                if (nested_vmx_is_page_fault_vmexit(vmcs12,
                                                    vcpu->arch.exception.error_code)) {
                        *exit_qual = has_payload ? payload : vcpu->arch.cr2;
                        return 1;
                }
        } else if (vmcs12->exception_bitmap & (1u << nr)) {
                if (nr == DB_VECTOR) {
                        if (!has_payload) {
                                payload = vcpu->arch.dr6;
                                payload &= ~(DR6_FIXED_1 | DR6_BT);
                                payload ^= DR6_RTM;
                        }
                        *exit_qual = payload;
                } else
                        *exit_qual = 0;
                return 1;
        }

        return 0;
}


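/*
 * Deliver a page fault that occurred while L2 was running: either reflect it
 * to L1 as an exception VM exit (if L1 wants to intercept this page fault) or
 * inject it directly into L2.
 */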
static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
                struct x86_exception *fault)
{
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

        WARN_ON(!is_guest_mode(vcpu));

        if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
                !to_vmx(vcpu)->nested.nested_run_pending) {
                vmcs12->vm_exit_intr_error_code = fault->error_code;
                nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
                                  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
                                  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
                                  fault->address);
        } else {
                kvm_inject_page_fault(vcpu, fault);
        }
}

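/*
 * A guest physical address used by vmcs12 is valid only if it is page
 * aligned and does not exceed the guest's physical address width.
 */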
static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
{
        return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
}

static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
                                               struct vmcs12 *vmcs12)
{
        if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
                return 0;

        if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) ||
            !page_address_valid(vcpu, vmcs12->io_bitmap_b))
                return -EINVAL;

        return 0;
}

static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
                                                struct vmcs12 *vmcs12)
{
        if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
                return 0;

        if (!page_address_valid(vcpu, vmcs12->msr_bitmap))
                return -EINVAL;

        return 0;
}

static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
                                                struct vmcs12 *vmcs12)
{
        if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
                return 0;

        if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))
                return -EINVAL;

        return 0;
}

/*
 * Check if an MSR write is intercepted by the L01 (vmcs01) MSR bitmap.
 */
static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
{
        unsigned long *msr_bitmap;
        int f = sizeof(unsigned long);

        if (!cpu_has_vmx_msr_bitmap())
                return true;

        msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;

        if (msr <= 0x1fff) {
                return !!test_bit(msr, msr_bitmap + 0x800 / f);
        } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
                msr &= 0x1fff;
                return !!test_bit(msr, msr_bitmap + 0xc00 / f);
        }

        return true;
}

/*
 * If an MSR is allowed by L0, we should also check whether it is allowed by
 * L1; the corresponding bit is cleared only if both L0 and L1 allow it.
 */
static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
                                               unsigned long *msr_bitmap_nested,
                                               u32 msr, int type)
{
        int f = sizeof(unsigned long);

        /*
         * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
         * have the write-low and read-high bitmap offsets the wrong way round.
         * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
         */
        if (msr <= 0x1fff) {
                if (type & MSR_TYPE_R &&
                   !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
                        /* read-low */
                        __clear_bit(msr, msr_bitmap_nested + 0x000 / f);

                if (type & MSR_TYPE_W &&
                   !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
                        /* write-low */
                        __clear_bit(msr, msr_bitmap_nested + 0x800 / f);

        } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
                msr &= 0x1fff;
                if (type & MSR_TYPE_R &&
                   !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
                        /* read-high */
                        __clear_bit(msr, msr_bitmap_nested + 0x400 / f);

                if (type & MSR_TYPE_W &&
                   !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
                        /* write-high */
                        __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);

        }
}

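/*
 * Set both the read and write intercept bits for the entire x2APIC MSR range
 * (0x800 - 0x8ff) in the given MSR bitmap.
 */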
static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
{
        int msr;

        for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
                unsigned word = msr / BITS_PER_LONG;

                msr_bitmap[word] = ~0;
                msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
        }
}

/*
 * Merge L0's and L1's MSR bitmaps; return false to indicate that we do not
 * use the hardware MSR bitmap.
 */
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
                                                 struct vmcs12 *vmcs12)
{
        int msr;
        unsigned long *msr_bitmap_l1;
        unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
        struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;

        /* Nothing to do if the MSR bitmap is not in use.  */
        if (!cpu_has_vmx_msr_bitmap() ||
            !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
                return false;

        if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
                return false;

        msr_bitmap_l1 = (unsigned long *)map->hva;

        /*
         * To keep the control flow simple, pay eight 8-byte writes (sixteen
         * 4-byte writes on 32-bit systems) up front to enable intercepts for
         * the x2APIC MSR range and selectively disable them below.
         */
        enable_x2apic_msr_intercepts(msr_bitmap_l0);

        if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
                if (nested_cpu_has_apic_reg_virt(vmcs12)) {
                        /*
                         * L0 need not intercept reads for MSRs between 0x800
                         * and 0x8ff, it just lets the processor take the value
                         * from the virtual-APIC page; take those 256 bits
                         * directly from the L1 bitmap.
                         */
                        for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
                                unsigned word = msr / BITS_PER_LONG;

                                msr_bitmap_l0[word] = msr_bitmap_l1[word];
                        }
                }

                nested_vmx_disable_intercept_for_msr(
                        msr_bitmap_l1, msr_bitmap_l0,
                        X2APIC_MSR(APIC_TASKPRI),
                        MSR_TYPE_R | MSR_TYPE_W);

                if (nested_cpu_has_vid(vmcs12)) {
                        nested_vmx_disable_intercept_for_msr(
                                msr_bitmap_l1, msr_bitmap_l0,
                                X2APIC_MSR(APIC_EOI),
                                MSR_TYPE_W);
                        nested_vmx_disable_intercept_for_msr(
                                msr_bitmap_l1, msr_bitmap_l0,
                                X2APIC_MSR(APIC_SELF_IPI),
                                MSR_TYPE_W);
                }
        }

        /* KVM unconditionally exposes the FS/GS base MSRs to L1. */
        nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
                                             MSR_FS_BASE, MSR_TYPE_RW);

        nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
                                             MSR_GS_BASE, MSR_TYPE_RW);

        nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
                                             MSR_KERNEL_GS_BASE, MSR_TYPE_RW);

        /*
         * Checking the L0->L1 bitmap is trying to verify two things:
         *
         * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
         *    ensures that we do not accidentally generate an L02 MSR bitmap
         *    from the L12 MSR bitmap that is too permissive.
         * 2. That L1 or its L2s have actually used the MSR. This avoids
         *    unnecessary merging of the bitmap if the MSR is unused. This
         *    works properly because we only update the L01 MSR bitmap lazily.
         *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
         *    updated to reflect this when L1 (or its L2s) actually write to
         *    the MSR.
         */
        if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
                nested_vmx_disable_intercept_for_msr(
                                        msr_bitmap_l1, msr_bitmap_l0,
                                        MSR_IA32_SPEC_CTRL,
                                        MSR_TYPE_R | MSR_TYPE_W);

        if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
                nested_vmx_disable_intercept_for_msr(
                                        msr_bitmap_l1, msr_bitmap_l0,
                                        MSR_IA32_PRED_CMD,
                                        MSR_TYPE_W);

        kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);

        return true;
}

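/*
 * Cache the shadow vmcs12 referenced by vmcs12's VMCS link pointer by copying
 * it from guest memory into KVM's internal copy.
 */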
static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
                                       struct vmcs12 *vmcs12)
{
        struct kvm_host_map map;
        struct vmcs12 *shadow;

        if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
            vmcs12->vmcs_link_pointer == -1ull)
                return;

        shadow = get_shadow_vmcs12(vcpu);

        if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
                return;

        memcpy(shadow, map.hva, VMCS12_SIZE);
        kvm_vcpu_unmap(vcpu, &map, false);
}

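/*
 * Write KVM's cached copy of the shadow vmcs12 back to the guest memory
 * addressed by vmcs12's VMCS link pointer.
 */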
static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
                                              struct vmcs12 *vmcs12)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
            vmcs12->vmcs_link_pointer == -1ull)
                return;

        kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
                        get_shadow_vmcs12(vcpu), VMCS12_SIZE);
}

/*
 * In nested virtualization, check if L1 has set
 * VM_EXIT_ACK_INTR_ON_EXIT
 */
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
{
        return get_vmcs12(vcpu)->vm_exit_controls &
                VM_EXIT_ACK_INTR_ON_EXIT;
}

static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
{
        return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
}

static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
                                          struct vmcs12 *vmcs12)
{
        if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
            !page_address_valid(vcpu, vmcs12->apic_access_addr))
                return -EINVAL;
        else
                return 0;
}

static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
                                           struct vmcs12 *vmcs12)
{
        if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
            !nested_cpu_has_apic_reg_virt(vmcs12) &&
            !nested_cpu_has_vid(vmcs12) &&
            !nested_cpu_has_posted_intr(vmcs12))
                return 0;

        /*
         * If virtualize x2apic mode is enabled,
         * virtualize apic access must be disabled.
         */
        if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
            nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
                return -EINVAL;

        /*
         * If virtual interrupt delivery is enabled,
         * we must exit on external interrupts.
         */
        if (nested_cpu_has_vid(vmcs12) &&
           !nested_exit_on_intr(vcpu))
                return -EINVAL;

        /*
         * Bits 15:8 should be zero in posted_intr_nv; the descriptor address
         * has already been checked in nested_get_vmcs12_pages.
         *
         * Bits 5:0 of posted_intr_desc_addr should be zero.
         */
        if (nested_cpu_has_posted_intr(vmcs12) &&
           (!nested_cpu_has_vid(vmcs12) ||
            !nested_exit_intr_ack_set(vcpu) ||
            (vmcs12->posted_intr_nv & 0xff00) ||
            (vmcs12->posted_intr_desc_addr & 0x3f) ||
            (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu))))
                return -EINVAL;

        /* tpr shadow is needed by all apicv features. */
        if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
                return -EINVAL;

        return 0;
}

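/*
 * Validate one VM-entry/VM-exit MSR switch area: the address must be 16-byte
 * aligned and the whole area must fit within the guest's physical address
 * width.
 */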
static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
                                       u32 count, u64 addr)
{
        int maxphyaddr;

        if (count == 0)
                return 0;
        maxphyaddr = cpuid_maxphyaddr(vcpu);
        if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
            (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr)
                return -EINVAL;

        return 0;
}

static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
                                                     struct vmcs12 *vmcs12)
{
        if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_load_count,
                                        vmcs12->vm_exit_msr_load_addr) ||
            nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_store_count,
                                        vmcs12->vm_exit_msr_store_addr))
                return -EINVAL;

        return 0;
}

static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
                                                      struct vmcs12 *vmcs12)
{
        if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_entry_msr_load_count,
                                        vmcs12->vm_entry_msr_load_addr))
                return -EINVAL;

        return 0;
}

static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
                                         struct vmcs12 *vmcs12)
{
        if (!nested_cpu_has_pml(vmcs12))
                return 0;

        if (!nested_cpu_has_ept(vmcs12) ||
            !page_address_valid(vcpu, vmcs12->pml_address))
                return -EINVAL;

        return 0;
}

static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
                                                        struct vmcs12 *vmcs12)
{
        if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
            !nested_cpu_has_ept(vmcs12))
                return -EINVAL;
        return 0;
}

static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
                                                         struct vmcs12 *vmcs12)
{
        if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
            !nested_cpu_has_ept(vmcs12))
                return -EINVAL;
        return 0;
}

static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
                                                 struct vmcs12 *vmcs12)
{
        if (!nested_cpu_has_shadow_vmcs(vmcs12))
                return 0;

        if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) ||
            !page_address_valid(vcpu, vmcs12->vmwrite_bitmap))
                return -EINVAL;

        return 0;
}

static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
                                       struct vmx_msr_entry *e)
{
        /* x2APIC MSR accesses are not allowed */
        if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)
                return -EINVAL;
        if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
            e->index == MSR_IA32_UCODE_REV)
                return -EINVAL;
        if (e->reserved != 0)
                return -EINVAL;
        return 0;
}

static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
                                     struct vmx_msr_entry *e)
{
        if (e->index == MSR_FS_BASE ||
            e->index == MSR_GS_BASE ||
            e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
            nested_vmx_msr_check_common(vcpu, e))
                return -EINVAL;
        return 0;
}

static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
                                      struct vmx_msr_entry *e)
{
        if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
            nested_vmx_msr_check_common(vcpu, e))
                return -EINVAL;
        return 0;
}

/*
 * Load the guest's/host's MSRs at nested entry/exit.
 * Return 0 on success, or the (1-based) index of the failing entry.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
        u32 i;
        struct vmx_msr_entry e;
        struct msr_data msr;

        msr.host_initiated = false;
        for (i = 0; i < count; i++) {
                if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
                                        &e, sizeof(e))) {
                        pr_debug_ratelimited(
                                "%s cannot read MSR entry (%u, 0x%08llx)\n",
                                __func__, i, gpa + i * sizeof(e));
                        goto fail;
                }
                if (nested_vmx_load_msr_check(vcpu, &e)) {
                        pr_debug_ratelimited(
                                "%s check failed (%u, 0x%x, 0x%x)\n",
                                __func__, i, e.index, e.reserved);
                        goto fail;
                }
                msr.index = e.index;
                msr.data = e.value;
                if (kvm_set_msr(vcpu, &msr)) {
                        pr_debug_ratelimited(
                                "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
                                __func__, i, e.index, e.value);
                        goto fail;
                }
        }
        return 0;
fail:
        return i + 1;
}

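/*
 * Read each MSR listed in the VM-exit MSR-store area and write its current
 * value back into the corresponding entry in guest memory.
 */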
static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
        u32 i;
        struct vmx_msr_entry e;

        for (i = 0; i < count; i++) {
                struct msr_data msr_info;
                if (kvm_vcpu_read_guest(vcpu,
                                        gpa + i * sizeof(e),
                                        &e, 2 * sizeof(u32))) {
                        pr_debug_ratelimited(
                                "%s cannot read MSR entry (%u, 0x%08llx)\n",
                                __func__, i, gpa + i * sizeof(e));
                        return -EINVAL;
                }
                if (nested_vmx_store_msr_check(vcpu, &e)) {
                        pr_debug_ratelimited(
                                "%s check failed (%u, 0x%x, 0x%x)\n",
                                __func__, i, e.index, e.reserved);
                        return -EINVAL;
                }
                msr_info.host_initiated = false;
                msr_info.index = e.index;
                if (kvm_get_msr(vcpu, &msr_info)) {
                        pr_debug_ratelimited(
                                "%s cannot read MSR (%u, 0x%x)\n",
                                __func__, i, e.index);
                        return -EINVAL;
                }
                if (kvm_vcpu_write_guest(vcpu,
                                         gpa + i * sizeof(e) +
                                             offsetof(struct vmx_msr_entry, value),
                                         &msr_info.data, sizeof(msr_info.data))) {
                        pr_debug_ratelimited(
                                "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
                                __func__, i, e.index, msr_info.data);
                        return -EINVAL;
                }
        }
        return 0;
}

static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
        unsigned long invalid_mask;

        invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
        return (val & invalid_mask) == 0;
}

/*
 * Load guest's/host's cr3 at nested entry/exit.  @nested_ept is true if we
 * are emulating VM entry into a guest with EPT enabled.  Returns 0 on success
 * and a negative error code on failure; on failure, the invalid-state exit
 * qualification code is assigned to *entry_failure_code.
 */
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
                               u32 *entry_failure_code)
{
        if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
                if (!nested_cr3_valid(vcpu, cr3)) {
                        *entry_failure_code = ENTRY_FAIL_DEFAULT;
                        return -EINVAL;
                }

                /*
                 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
                 * must not be dereferenced.
                 */
                if (is_pae_paging(vcpu) && !nested_ept) {
                        if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) {
                                *entry_failure_code = ENTRY_FAIL_PDPTE;
                                return -EINVAL;
                        }
                }
        }

        if (!nested_ept)
                kvm_mmu_new_cr3(vcpu, cr3, false);

        vcpu->arch.cr3 = cr3;
        __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);

        kvm_init_mmu(vcpu, false);

        return 0;
}

/*
 * Returns true if KVM is able to configure the CPU to tag TLB entries
 * populated by L2 differently from TLB entries populated by L1.
 *
 * If L1 uses EPT, then TLB entries are tagged with different EPTP.
 *
 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
 * with different VPID (L1 entries are tagged with vmx->vpid
 * while L2 entries are tagged with vmx->nested.vpid02).
 */
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

        return nested_cpu_has_ept(vmcs12) ||
               (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}

static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
}


static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
        return fixed_bits_valid(control, low, high);
}

static inline u64 vmx_control_msr(u32 low, u32 high)
{
        return low | ((u64)high << 32);
}

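/*
 * Return true if, within @mask, every bit set in @subset is also set in
 * @superset.
 */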
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
        superset &= mask;
        subset &= mask;

        return (superset | subset) == superset;
}

static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
        const u64 feature_and_reserved =
                /* feature (except bit 48; see below) */
                BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
                /* reserved */
                BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
        u64 vmx_basic = vmx->nested.msrs.basic;

        if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
                return -EINVAL;

        /*
         * KVM does not emulate a version of VMX that constrains physical
         * addresses of VMX structures (e.g. VMCS) to 32-bits.
         */
        if (data & BIT_ULL(48))
                return -EINVAL;

        if (vmx_basic_vmcs_revision_id(vmx_basic) !=
            vmx_basic_vmcs_revision_id(data))
                return -EINVAL;

        if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
                return -EINVAL;

        vmx->nested.msrs.basic = data;
        return 0;
}

static int
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
        u64 supported;
        u32 *lowp, *highp;

        switch (msr_index) {
        case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
                lowp = &vmx->nested.msrs.pinbased_ctls_low;
                highp = &vmx->nested.msrs.pinbased_ctls_high;
                break;
        case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
                lowp = &vmx->nested.msrs.procbased_ctls_low;
                highp = &vmx->nested.msrs.procbased_ctls_high;
                break;
        case MSR_IA32_VMX_TRUE_EXIT_CTLS:
                lowp = &vmx->nested.msrs.exit_ctls_low;
                highp = &vmx->nested.msrs.exit_ctls_high;
                break;
        case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
                lowp = &vmx->nested.msrs.entry_ctls_low;
                highp = &vmx->nested.msrs.entry_ctls_high;
                break;
        case MSR_IA32_VMX_PROCBASED_CTLS2:
                lowp = &vmx->nested.msrs.secondary_ctls_low;
                highp = &vmx->nested.msrs.secondary_ctls_high;
                break;
        default:
                BUG();
        }

        supported = vmx_control_msr(*lowp, *highp);

        /* Check must-be-1 bits are still 1. */
        if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
                return -EINVAL;

        /* Check must-be-0 bits are still 0. */
        if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
                return -EINVAL;

        *lowp = data;
        *highp = data >> 32;
        return 0;
}

static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
        const u64 feature_and_reserved_bits =
                /* feature */
                BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
                BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
                /* reserved */
                GENMASK_ULL(13, 9) | BIT_ULL(31);
        u64 vmx_misc;

        vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
                                   vmx->nested.msrs.misc_high);

        if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
                return -EINVAL;

        if ((vmx->nested.msrs.pinbased_ctls_high &
             PIN_BASED_VMX_PREEMPTION_TIMER) &&
            vmx_misc_preemption_timer_rate(data) !=
            vmx_misc_preemption_timer_rate(vmx_misc))
                return -EINVAL;

        if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
                return -EINVAL;

        if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
                return -EINVAL;

        if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
                return -EINVAL;

        vmx->nested.msrs.misc_low = data;
        vmx->nested.msrs.misc_high = data >> 32;

        return 0;
}

static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{
        u64 vmx_ept_vpid_cap;

        vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
                                           vmx->nested.msrs.vpid_caps);

        /* Every bit is either reserved or a feature bit. */
        if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
                return -EINVAL;

        vmx->nested.msrs.ept_caps = data;
        vmx->nested.msrs.vpid_caps = data >> 32;
        return 0;
}

static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
        u64 *msr;

        switch (msr_index) {
        case MSR_IA32_VMX_CR0_FIXED0:
                msr = &vmx->nested.msrs.cr0_fixed0;
                break;
        case MSR_IA32_VMX_CR4_FIXED0:
                msr = &vmx->nested.msrs.cr4_fixed0;
                break;
        default:
                BUG();
        }

        /*
         * Bits that are currently set (i.e. bits that "must be 1" during VMX
         * operation) must also be set in the restored value.
         */
        if (!is_bitwise_subset(data, *msr, -1ULL))
                return -EINVAL;

        *msr = data;
        return 0;
}

/*
 * Called when userspace is restoring VMX MSRs.
 *
 * Returns 0 on success, non-0 otherwise.
 */
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        /*
         * Don't allow changes to the VMX capability MSRs while the vCPU
         * is in VMX operation.
         */
        if (vmx->nested.vmxon)
                return -EBUSY;

        switch (msr_index) {
        case MSR_IA32_VMX_BASIC:
                return vmx_restore_vmx_basic(vmx, data);
        case MSR_IA32_VMX_PINBASED_CTLS:
        case MSR_IA32_VMX_PROCBASED_CTLS:
        case MSR_IA32_VMX_EXIT_CTLS:
        case MSR_IA32_VMX_ENTRY_CTLS:
                /*
                 * The "non-true" VMX capability MSRs are generated from the
                 * "true" MSRs, so we do not support restoring them directly.
                 *
                 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
                 * should restore the "true" MSRs with the must-be-1 bits
                 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
                 * DEFAULT SETTINGS".
                 */
                return -EINVAL;
        case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
        case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
        case MSR_IA32_VMX_TRUE_EXIT_CTLS:
        case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
        case MSR_IA32_VMX_PROCBASED_CTLS2:
                return vmx_restore_control_msr(vmx, msr_index, data);
        case MSR_IA32_VMX_MISC:
                return vmx_restore_vmx_misc(vmx, data);
        case MSR_IA32_VMX_CR0_FIXED0:
        case MSR_IA32_VMX_CR4_FIXED0:
                return vmx_restore_fixed0_msr(vmx, msr_index, data);
        case MSR_IA32_VMX_CR0_FIXED1:
        case MSR_IA32_VMX_CR4_FIXED1:
                /*
                 * These MSRs are generated based on the vCPU's CPUID, so we
                 * do not support restoring them directly.
                 */
                return -EINVAL;
        case MSR_IA32_VMX_EPT_VPID_CAP:
                return vmx_restore_vmx_ept_vpid_cap(vmx, data);
        case MSR_IA32_VMX_VMCS_ENUM:
                vmx->nested.msrs.vmcs_enum = data;
                return 0;
        case MSR_IA32_VMX_VMFUNC:
                if (data & ~vmx->nested.msrs.vmfunc_controls)
                        return -EINVAL;
                vmx->nested.msrs.vmfunc_controls = data;
                return 0;
        default:
                /*
                 * The rest of the VMX capability MSRs do not support restore.
                 */
                return -EINVAL;
        }
}

/* Returns 0 on success, non-0 otherwise. */
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
{
        switch (msr_index) {
        case MSR_IA32_VMX_BASIC:
                *pdata = msrs->basic;
                break;
        case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
        case MSR_IA32_VMX_PINBASED_CTLS:
                *pdata = vmx_control_msr(
                        msrs->pinbased_ctls_low,
                        msrs->pinbased_ctls_high);
                if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
                        *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
                break;
        case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
        case MSR_IA32_VMX_PROCBASED_CTLS:
                *pdata = vmx_control_msr(
                        msrs->procbased_ctls_low,
                        msrs->procbased_ctls_high);
                if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
                        *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
                break;
        case MSR_IA32_VMX_TRUE_EXIT_CTLS:
        case MSR_IA32_VMX_EXIT_CTLS:
                *pdata = vmx_control_msr(
                        msrs->exit_ctls_low,
                        msrs->exit_ctls_high);
                if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
                        *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
                break;
        case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
        case MSR_IA32_VMX_ENTRY_CTLS:
                *pdata = vmx_control_msr(
                        msrs->entry_ctls_low,
                        msrs->entry_ctls_high);
                if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
                        *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
                break;
        case MSR_IA32_VMX_MISC:
                *pdata = vmx_control_msr(
                        msrs->misc_low,
                        msrs->misc_high);
                break;
        case MSR_IA32_VMX_CR0_FIXED0:
                *pdata = msrs->cr0_fixed0;
                break;
        case MSR_IA32_VMX_CR0_FIXED1:
                *pdata = msrs->cr0_fixed1;
                break;
        case MSR_IA32_VMX_CR4_FIXED0:
                *pdata = msrs->cr4_fixed0;
                break;
        case MSR_IA32_VMX_CR4_FIXED1:
                *pdata = msrs->cr4_fixed1;
                break;
        case MSR_IA32_VMX_VMCS_ENUM:
                *pdata = msrs->vmcs_enum;
                break;
        case MSR_IA32_VMX_PROCBASED_CTLS2:
                *pdata = vmx_control_msr(
                        msrs->secondary_ctls_low,
                        msrs->secondary_ctls_high);
                break;
        case MSR_IA32_VMX_EPT_VPID_CAP:
                *pdata = msrs->ept_caps |
                        ((u64)msrs->vpid_caps << 32);
                break;
        case MSR_IA32_VMX_VMFUNC:
                *pdata = msrs->vmfunc_controls;
                break;
        default:
                return 1;
        }

        return 0;
}

/*
 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
 * been modified by the L1 guest.  Note, "writable" in this context means
 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
 * VM-exit information fields (which are actually writable if the vCPU is
 * configured to support "VMWRITE to any supported field in the VMCS").
 */
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
{
        struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
        struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
        struct shadow_vmcs_field field;
        unsigned long val;
        int i;

        preempt_disable();

        vmcs_load(shadow_vmcs);

        for (i = 0; i < max_shadow_read_write_fields; i++) {
                field = shadow_read_write_fields[i];
                val = __vmcs_readl(field.encoding);
                vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
        }

        vmcs_clear(shadow_vmcs);
        vmcs_load(vmx->loaded_vmcs->vmcs);

        preempt_enable();
}

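/*
 * Copy all shadowed fields (both read/write and read-only) from the cached
 * vmcs12 into the shadow VMCS so that L1's VMREAD/VMWRITE instructions
 * operate on up-to-date values without causing VM exits.
 */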
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
{
        const struct shadow_vmcs_field *fields[] = {
                shadow_read_write_fields,
                shadow_read_only_fields
        };
        const int max_fields[] = {
                max_shadow_read_write_fields,
                max_shadow_read_only_fields
        };
        struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
        struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
        struct shadow_vmcs_field field;
        unsigned long val;
        int i, q;

        vmcs_load(shadow_vmcs);

        for (q = 0; q < ARRAY_SIZE(fields); q++) {
                for (i = 0; i < max_fields[q]; i++) {
                        field = fields[q][i];
                        val = vmcs12_read_any(vmcs12, field.encoding,
                                              field.offset);
                        __vmcs_writel(field.encoding, val);
                }
        }

        vmcs_clear(shadow_vmcs);
        vmcs_load(vmx->loaded_vmcs->vmcs);
}

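/*
 * Copy state from the Hyper-V enlightened VMCS into the cached vmcs12,
 * skipping field groups that the enlightened VMCS clean-field bits mark as
 * unchanged.
 */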
static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
{
        struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
        struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;

        /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
        vmcs12->tpr_threshold = evmcs->tpr_threshold;
        vmcs12->guest_rip = evmcs->guest_rip;

        if (unlikely(!(evmcs->hv_clean_fields &
                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
                vmcs12->guest_rsp = evmcs->guest_rsp;
                vmcs12->guest_rflags = evmcs->guest_rflags;
                vmcs12->guest_interruptibility_info =
                        evmcs->guest_interruptibility_info;
        }

        if (unlikely(!(evmcs->hv_clean_fields &
                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
                vmcs12->cpu_based_vm_exec_control =
                        evmcs->cpu_based_vm_exec_control;
        }

        if (unlikely(!(evmcs->hv_clean_fields &
                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
                vmcs12->exception_bitmap = evmcs->exception_bitmap;
        }

        if (unlikely(!(evmcs->hv_clean_fields &
                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
                vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
        }

        if (unlikely(!(evmcs->hv_clean_fields &
                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
                vmcs12->vm_entry_intr_info_field =
                        evmcs->vm_entry_intr_info_field;
                vmcs12->vm_entry_exception_error_code =
                        evmcs->vm_entry_exception_error_code;
                vmcs12->vm_entry_instruction_len =
                        evmcs->vm_entry_instruction_len;
        }

        if (unlikely(!(evmcs->hv_clean_fields &
                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
                vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
                vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
                vmcs12->host_cr0 = evmcs->host_cr0;
                vmcs12->host_cr3 = evmcs->host_cr3;
                vmcs12->host_cr4 = evmcs->host_cr4;
                vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
                vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
                vmcs12->host_rip = evmcs->host_rip;
                vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
                vmcs12->host_es_selector = evmcs->host_es_selector;
                vmcs12->host_cs_selector = evmcs->host_cs_selector;
                vmcs12->host_ss_selector = evmcs->host_ss_selector;
                vmcs12->host_ds_selector = evmcs->host_ds_selector;
                vmcs12->host_fs_selector = evmcs->host_fs_selector;
                vmcs12->host_gs_selector = evmcs->host_gs_selector;
                vmcs12->host_tr_selector = evmcs->host_tr_selector;
        }

        if (unlikely(!(evmcs->hv_clean_fields &
                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
                vmcs12->pin_based_vm_exec_control =
                        evmcs->pin_based_vm_exec_control;
                vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
                vmcs12->secondary_vm_exec_control =
                        evmcs->secondary_vm_exec_control;
        }

        if (unlikely(!(evmcs->hv_clean_fields &
                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
                vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
                vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
        }

        if (unlikely(!(evmcs->hv_clean_fields &
                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
                vmcs12->msr_bitmap = evmcs->msr_bitmap;
        }

        if (unlikely(!(evmcs->hv_clean_fields &
                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
                vmcs12->guest_es_base = evmcs->guest_es_base;
                vmcs12->guest_cs_base = evmcs->guest_cs_base;
                vmcs12->guest_ss_base = evmcs->guest_ss_base;
                vmcs12->guest_ds_base = evmcs->guest_ds_base;
                vmcs12->guest_fs_base = evmcs->guest_fs_base;
                vmcs12->guest_gs_base = evmcs->guest_gs_base;
                vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
                vmcs12->guest_tr_base = evmcs->guest_tr_base;
                vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
                vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
                vmcs12->guest_es_limit = evmcs->guest_es_limit;
                vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
                vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
                vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
                vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
                vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
                vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
                vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
                vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
                vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
1496                 vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
1497                 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
1498                 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
1499                 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
1500                 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
1501                 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
1502                 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
1503                 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
1504                 vmcs12->guest_es_selector = evmcs->guest_es_selector;
1505                 vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
1506                 vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
1507                 vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
1508                 vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
1509                 vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
1510                 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
1511                 vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
1512         }
1513
1514         if (unlikely(!(evmcs->hv_clean_fields &
1515                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
1516                 vmcs12->tsc_offset = evmcs->tsc_offset;
1517                 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
1518                 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
1519         }
1520
1521         if (unlikely(!(evmcs->hv_clean_fields &
1522                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
1523                 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
1524                 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
1525                 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
1526                 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
1527                 vmcs12->guest_cr0 = evmcs->guest_cr0;
1528                 vmcs12->guest_cr3 = evmcs->guest_cr3;
1529                 vmcs12->guest_cr4 = evmcs->guest_cr4;
1530                 vmcs12->guest_dr7 = evmcs->guest_dr7;
1531         }
1532
1533         if (unlikely(!(evmcs->hv_clean_fields &
1534                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
1535                 vmcs12->host_fs_base = evmcs->host_fs_base;
1536                 vmcs12->host_gs_base = evmcs->host_gs_base;
1537                 vmcs12->host_tr_base = evmcs->host_tr_base;
1538                 vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
1539                 vmcs12->host_idtr_base = evmcs->host_idtr_base;
1540                 vmcs12->host_rsp = evmcs->host_rsp;
1541         }
1542
1543         if (unlikely(!(evmcs->hv_clean_fields &
1544                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
1545                 vmcs12->ept_pointer = evmcs->ept_pointer;
1546                 vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
1547         }
1548
1549         if (unlikely(!(evmcs->hv_clean_fields &
1550                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
1551                 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
1552                 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
1553                 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
1554                 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
1555                 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
1556                 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
1557                 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
1558                 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
1559                 vmcs12->guest_pending_dbg_exceptions =
1560                         evmcs->guest_pending_dbg_exceptions;
1561                 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
1562                 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
1563                 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
1564                 vmcs12->guest_activity_state = evmcs->guest_activity_state;
1565                 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
1566         }
1567
1568         /*
1569          * Not used?
1570          * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
1571          * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
1572          * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
1573          * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0;
1574          * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1;
1575          * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2;
1576          * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3;
1577          * vmcs12->page_fault_error_code_mask =
1578          *              evmcs->page_fault_error_code_mask;
1579          * vmcs12->page_fault_error_code_match =
1580          *              evmcs->page_fault_error_code_match;
1581          * vmcs12->cr3_target_count = evmcs->cr3_target_count;
1582          * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
1583          * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
1584          * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
1585          */
1586
1587         /*
1588          * Read only fields:
1589          * vmcs12->guest_physical_address = evmcs->guest_physical_address;
1590          * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
1591          * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
1592          * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
1593          * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
1594          * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
1595          * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
1596          * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
1597          * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
1598          * vmcs12->exit_qualification = evmcs->exit_qualification;
1599          * vmcs12->guest_linear_address = evmcs->guest_linear_address;
1600          *
1601          * Not present in struct vmcs12:
1602          * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
1603          * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
1604          * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
1605          * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
1606          */
1607
1608         return 0;
1609 }
1610
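/*
 * Propagate vmcs12 state back into the guest's enlightened VMCS after a
 * nested VM-exit.  Fields that KVM does not modify, or that
 * sync_vmcs02_to_vmcs12() does not read (see the comment below), are left
 * untouched.
 */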
1611 static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
1612 {
1613         struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1614         struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1615
1616         /*
1617          * Should not be changed by KVM:
1618          *
1619          * evmcs->host_es_selector = vmcs12->host_es_selector;
1620          * evmcs->host_cs_selector = vmcs12->host_cs_selector;
1621          * evmcs->host_ss_selector = vmcs12->host_ss_selector;
1622          * evmcs->host_ds_selector = vmcs12->host_ds_selector;
1623          * evmcs->host_fs_selector = vmcs12->host_fs_selector;
1624          * evmcs->host_gs_selector = vmcs12->host_gs_selector;
1625          * evmcs->host_tr_selector = vmcs12->host_tr_selector;
1626          * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
1627          * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
1628          * evmcs->host_cr0 = vmcs12->host_cr0;
1629          * evmcs->host_cr3 = vmcs12->host_cr3;
1630          * evmcs->host_cr4 = vmcs12->host_cr4;
1631          * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
1632          * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
1633          * evmcs->host_rip = vmcs12->host_rip;
1634          * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
1635          * evmcs->host_fs_base = vmcs12->host_fs_base;
1636          * evmcs->host_gs_base = vmcs12->host_gs_base;
1637          * evmcs->host_tr_base = vmcs12->host_tr_base;
1638          * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
1639          * evmcs->host_idtr_base = vmcs12->host_idtr_base;
1640          * evmcs->host_rsp = vmcs12->host_rsp;
1641          * sync_vmcs02_to_vmcs12() doesn't read these:
1642          * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
1643          * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
1644          * evmcs->msr_bitmap = vmcs12->msr_bitmap;
1645          * evmcs->ept_pointer = vmcs12->ept_pointer;
1646          * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
1647          * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
1648          * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
1649          * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
1650          * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0;
1651          * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1;
1652          * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2;
1653          * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3;
1654          * evmcs->tpr_threshold = vmcs12->tpr_threshold;
1655          * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
1656          * evmcs->exception_bitmap = vmcs12->exception_bitmap;
1657          * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
1658          * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
1659          * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
1660          * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
1661          * evmcs->page_fault_error_code_mask =
1662          *              vmcs12->page_fault_error_code_mask;
1663          * evmcs->page_fault_error_code_match =
1664          *              vmcs12->page_fault_error_code_match;
1665          * evmcs->cr3_target_count = vmcs12->cr3_target_count;
1666          * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
1667          * evmcs->tsc_offset = vmcs12->tsc_offset;
1668          * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
1669          * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
1670          * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
1671          * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
1672          * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
1673          * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
1674          * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
1675          * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
1676          *
1677          * Not present in struct vmcs12:
1678          * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
1679          * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
1680          * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
1681          * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
1682          */
1683
1684         evmcs->guest_es_selector = vmcs12->guest_es_selector;
1685         evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
1686         evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
1687         evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
1688         evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
1689         evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
1690         evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
1691         evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
1692
1693         evmcs->guest_es_limit = vmcs12->guest_es_limit;
1694         evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
1695         evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
1696         evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
1697         evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
1698         evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
1699         evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
1700         evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
1701         evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
1702         evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
1703
1704         evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
1705         evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
1706         evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
1707         evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
1708         evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
1709         evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
1710         evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
1711         evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
1712
1713         evmcs->guest_es_base = vmcs12->guest_es_base;
1714         evmcs->guest_cs_base = vmcs12->guest_cs_base;
1715         evmcs->guest_ss_base = vmcs12->guest_ss_base;
1716         evmcs->guest_ds_base = vmcs12->guest_ds_base;
1717         evmcs->guest_fs_base = vmcs12->guest_fs_base;
1718         evmcs->guest_gs_base = vmcs12->guest_gs_base;
1719         evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
1720         evmcs->guest_tr_base = vmcs12->guest_tr_base;
1721         evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
1722         evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
1723
1724         evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
1725         evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
1726
1727         evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
1728         evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
1729         evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
1730         evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
1731
1732         evmcs->guest_pending_dbg_exceptions =
1733                 vmcs12->guest_pending_dbg_exceptions;
1734         evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
1735         evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
1736
1737         evmcs->guest_activity_state = vmcs12->guest_activity_state;
1738         evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
1739
1740         evmcs->guest_cr0 = vmcs12->guest_cr0;
1741         evmcs->guest_cr3 = vmcs12->guest_cr3;
1742         evmcs->guest_cr4 = vmcs12->guest_cr4;
1743         evmcs->guest_dr7 = vmcs12->guest_dr7;
1744
1745         evmcs->guest_physical_address = vmcs12->guest_physical_address;
1746
1747         evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
1748         evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
1749         evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
1750         evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
1751         evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
1752         evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
1753         evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
1754         evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
1755
1756         evmcs->exit_qualification = vmcs12->exit_qualification;
1757
1758         evmcs->guest_linear_address = vmcs12->guest_linear_address;
1759         evmcs->guest_rsp = vmcs12->guest_rsp;
1760         evmcs->guest_rflags = vmcs12->guest_rflags;
1761
1762         evmcs->guest_interruptibility_info =
1763                 vmcs12->guest_interruptibility_info;
1764         evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
1765         evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
1766         evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
1767         evmcs->vm_entry_exception_error_code =
1768                 vmcs12->vm_entry_exception_error_code;
1769         evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
1770
1771         evmcs->guest_rip = vmcs12->guest_rip;
1772
1773         evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
1774
1775         return 0;
1776 }
1777
1778 /*
1779  * This is the equivalent of the nested hypervisor executing the VMPTRLD
1780  * instruction.
1781  */
1782 static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
1783                                                  bool from_launch)
1784 {
1785         struct vcpu_vmx *vmx = to_vmx(vcpu);
1786         bool evmcs_gpa_changed = false;
1787         u64 evmcs_gpa;
1788
1789         if (likely(!vmx->nested.enlightened_vmcs_enabled))
1790                 return 1;
1791
1792         if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
1793                 return 1;
1794
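        /*
         * The guest switched to a different enlightened VMCS: release the
         * old mapping and map the new page before using it.
         */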
1795         if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
1796                 if (!vmx->nested.hv_evmcs)
1797                         vmx->nested.current_vmptr = -1ull;
1798
1799                 nested_release_evmcs(vcpu);
1800
1801                 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
1802                                  &vmx->nested.hv_evmcs_map))
1803                         return 0;
1804
1805                 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
1806
1807                 /*
1808                  * Currently, KVM only supports eVMCS version 1
1809                  * (== KVM_EVMCS_VERSION), so the guest is expected to set the
1810                  * first u32 field of the eVMCS, which specifies the eVMCS
1811                  * VersionNumber, to that value.
1812                  *
1813                  * The guest should learn the eVMCS versions supported by the
1814                  * host by examining CPUID.0x4000000A.EAX[0:15]. The host
1815                  * userspace VMM is expected to set this CPUID leaf according
1816                  * to the value returned in vmcs_version from nested_enable_evmcs().
1817                  *
1818                  * However, it turns out that Microsoft Hyper-V fails to comply
1819                  * with its own invented interface: when Hyper-V uses eVMCS, it
1820                  * just sets the first u32 field of the eVMCS to the revision_id
1821                  * specified in MSR_IA32_VMX_BASIC, instead of the eVMCS version
1822                  * number, which is one of the supported versions specified in
1823                  * CPUID.0x4000000A.EAX[0:15].
1824                  *
1825                  * To work around this Hyper-V bug, accept either a supported
1826                  * eVMCS version or the VMCS12 revision_id as a valid value for
1827                  * the first u32 field of the eVMCS.
1828                  */
1829                 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
1830                     (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
1831                         nested_release_evmcs(vcpu);
1832                         return 0;
1833                 }
1834
1835                 vmx->nested.dirty_vmcs12 = true;
1836                 vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
1837
1838                 evmcs_gpa_changed = true;
1839                 /*
1840                  * Unlike normal vmcs12, enlightened vmcs12 is not fully
1841                  * reloaded from the guest's memory (read-only fields, fields not
1842                  * present in struct hv_enlightened_vmcs, ...). Make sure there
1843                  * are no leftovers.
1844                  */
1845                 if (from_launch) {
1846                         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1847                         memset(vmcs12, 0, sizeof(*vmcs12));
1848                         vmcs12->hdr.revision_id = VMCS12_REVISION;
1849                 }
1850
1851         }
1852
1853         /*
1854          * Clean-fields data can't be used on VMLAUNCH or when switching
1855          * between different L2 guests, as KVM keeps a single vmcs12 per L1.
1856          */
1857         if (from_launch || evmcs_gpa_changed)
1858                 vmx->nested.hv_evmcs->hv_clean_fields &=
1859                         ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
1860
1861         return 1;
1862 }
1863
1864 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
1865 {
1866         struct vcpu_vmx *vmx = to_vmx(vcpu);
1867
1868         /*
1869          * hv_evmcs may end up not being mapped after migration (when
1870          * L2 was running); map it here to make sure vmcs12 changes are
1871          * properly reflected.
1872          */
1873         if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs)
1874                 nested_vmx_handle_enlightened_vmptrld(vcpu, false);
1875
1876         if (vmx->nested.hv_evmcs) {
1877                 copy_vmcs12_to_enlightened(vmx);
1878                 /* All fields are clean */
1879                 vmx->nested.hv_evmcs->hv_clean_fields |=
1880                         HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
1881         } else {
1882                 copy_vmcs12_to_shadow(vmx);
1883         }
1884
1885         vmx->nested.need_vmcs12_to_shadow_sync = false;
1886 }
1887
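/*
 * hrtimer callback emulating expiry of the VMX preemption timer for L2:
 * record the expiry and kick the vCPU so the pending event is processed.
 */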
1888 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
1889 {
1890         struct vcpu_vmx *vmx =
1891                 container_of(timer, struct vcpu_vmx, nested.preemption_timer);
1892
1893         vmx->nested.preemption_timer_expired = true;
1894         kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
1895         kvm_vcpu_kick(&vmx->vcpu);
1896
1897         return HRTIMER_NORESTART;
1898 }
1899
1900 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
1901 {
1902         u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
1903         struct vcpu_vmx *vmx = to_vmx(vcpu);
1904
1905         /*
1906          * A timer value of zero is architecturally guaranteed to cause
1907          * a VMExit prior to executing any instructions in the guest.
1908          */
1909         if (preemption_timeout == 0) {
1910                 vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
1911                 return;
1912         }
1913
1914         if (vcpu->arch.virtual_tsc_khz == 0)
1915                 return;
1916
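        /*
         * Convert the timer value from VMX preemption-timer units
         * (TSC >> rate) to nanoseconds before arming the hrtimer.
         */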
1917         preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
1918         preemption_timeout *= 1000000;
1919         do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
1920         hrtimer_start(&vmx->nested.preemption_timer,
1921                       ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
1922 }
1923
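/*
 * Compute the EFER value L2 will run with: use vmcs12's guest EFER when a
 * nested VM-entry is pending and L1 asked to load EFER on entry, otherwise
 * derive LMA/LME from the VM_ENTRY_IA32E_MODE control on top of the
 * current EFER.
 */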
1924 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
1925 {
1926         if (vmx->nested.nested_run_pending &&
1927             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
1928                 return vmcs12->guest_ia32_efer;
1929         else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
1930                 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
1931         else
1932                 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
1933 }
1934
1935 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
1936 {
1937         /*
1938          * If vmcs02 hasn't been initialized, set the constant vmcs02 state
1939          * according to L0's settings (vmcs12 is irrelevant here).  Host
1940          * fields that come from L0 and are not constant, e.g. HOST_CR3,
1941          * will be set as needed prior to VMLAUNCH/VMRESUME.
1942          */
1943         if (vmx->nested.vmcs02_initialized)
1944                 return;
1945         vmx->nested.vmcs02_initialized = true;
1946
1947         /*
1948          * We don't care what the EPTP value is; we just need to guarantee
1949          * it's valid so we don't get a false positive when doing early
1950          * consistency checks.
1951          */
1952         if (enable_ept && nested_early_check)
1953                 vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0));
1954
1955         /* All VMFUNCs are currently emulated through L0 vmexits.  */
1956         if (cpu_has_vmx_vmfunc())
1957                 vmcs_write64(VM_FUNCTION_CONTROL, 0);
1958
1959         if (cpu_has_vmx_posted_intr())
1960                 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
1961
1962         if (cpu_has_vmx_msr_bitmap())
1963                 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
1964
1965         /*
1966          * The PML address never changes, so it is constant in vmcs02.
1967          * Conceptually we want to copy the PML index from vmcs01 here,
1968          * and then back to vmcs01 on nested vmexit.  But since we flush
1969          * the log and reset GUEST_PML_INDEX on each vmexit, the PML
1970          * index is also effectively constant in vmcs02.
1971          */
1972         if (enable_pml) {
1973                 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
1974                 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
1975         }
1976
1977         if (cpu_has_vmx_encls_vmexit())
1978                 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
1979
1980         /*
1981          * Set the MSR load/store lists to match L0's settings.  Only the
1982          * addresses are constant (for vmcs02), the counts can change based
1983          * on L2's behavior, e.g. switching to/from long mode.
1984          */
1985         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
1986         vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
1987         vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
1988
1989         vmx_set_constant_host_state(vmx);
1990 }
1991
1992 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
1993                                       struct vmcs12 *vmcs12)
1994 {
1995         prepare_vmcs02_constant_state(vmx);
1996
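        /* VMCS shadowing is not exposed to L2, so keep the link pointer invalid. */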
1997         vmcs_write64(VMCS_LINK_POINTER, -1ull);
1998
1999         if (enable_vpid) {
2000                 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
2001                         vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
2002                 else
2003                         vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2004         }
2005 }
2006
2007 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2008 {
2009         u32 exec_control, vmcs12_exec_ctrl;
2010         u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
2011
2012         if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
2013                 prepare_vmcs02_early_rare(vmx, vmcs12);
2014
2015         /*
2016          * PIN CONTROLS
2017          */
2018         exec_control = vmx_pin_based_exec_ctrl(vmx);
2019         exec_control |= (vmcs12->pin_based_vm_exec_control &
2020                          ~PIN_BASED_VMX_PREEMPTION_TIMER);
2021
2022         /* Posted interrupts setting is only taken from vmcs12.  */
2023         if (nested_cpu_has_posted_intr(vmcs12)) {
2024                 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
2025                 vmx->nested.pi_pending = false;
2026         } else {
2027                 exec_control &= ~PIN_BASED_POSTED_INTR;
2028         }
2029         pin_controls_set(vmx, exec_control);
2030
2031         /*
2032          * EXEC CONTROLS
2033          */
2034         exec_control = vmx_exec_control(vmx); /* L0's desires */
2035         exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2036         exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
2037         exec_control &= ~CPU_BASED_TPR_SHADOW;
2038         exec_control |= vmcs12->cpu_based_vm_exec_control;
2039
2040         if (exec_control & CPU_BASED_TPR_SHADOW)
2041                 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
2042 #ifdef CONFIG_X86_64
2043         else
2044                 exec_control |= CPU_BASED_CR8_LOAD_EXITING |
2045                                 CPU_BASED_CR8_STORE_EXITING;
2046 #endif
2047
2048         /*
2049          * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
2050          * for I/O port accesses.
2051          */
2052         exec_control |= CPU_BASED_UNCOND_IO_EXITING;
2053         exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
2054
2055         /*
2056          * This bit will be computed in nested_get_vmcs12_pages, because
2057          * we do not have access to L1's MSR bitmap yet.  For now, keep
2058          * the same bit as before, hoping to avoid multiple VMWRITEs that
2059          * only set/clear this bit.
2060          */
2061         exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
2062         exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;
2063
2064         exec_controls_set(vmx, exec_control);
2065
2066         /*
2067          * SECONDARY EXEC CONTROLS
2068          */
2069         if (cpu_has_secondary_exec_ctrls()) {
2070                 exec_control = vmx->secondary_exec_control;
2071
2072                 /* Take the following fields only from vmcs12 */
2073                 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2074                                   SECONDARY_EXEC_ENABLE_INVPCID |
2075                                   SECONDARY_EXEC_RDTSCP |
2076                                   SECONDARY_EXEC_XSAVES |
2077                                   SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2078                                   SECONDARY_EXEC_APIC_REGISTER_VIRT |
2079                                   SECONDARY_EXEC_ENABLE_VMFUNC);
2080                 if (nested_cpu_has(vmcs12,
2081                                    CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
2082                         vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
2083                                 ~SECONDARY_EXEC_ENABLE_PML;
2084                         exec_control |= vmcs12_exec_ctrl;
2085                 }
2086
2087                 /* VMCS shadowing for L2 is emulated for now */
2088                 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
2089
2090                 /*
2091                  * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
2092                  * will not have to rewrite the controls just for this bit.
2093                  */
2094                 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() &&
2095                     (vmcs12->guest_cr4 & X86_CR4_UMIP))
2096                         exec_control |= SECONDARY_EXEC_DESC;
2097
2098                 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
2099                         vmcs_write16(GUEST_INTR_STATUS,
2100                                 vmcs12->guest_intr_status);
2101
2102                 secondary_exec_controls_set(vmx, exec_control);
2103         }
2104
2105         /*
2106          * ENTRY CONTROLS
2107          *
2108          * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
2109          * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
2110          * on the related bits (if supported by the CPU) in the hope that
2111          * we can avoid VMWrites during vmx_set_efer().
2112          */
2113         exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) &
2114                         ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
2115         if (cpu_has_load_ia32_efer()) {
2116                 if (guest_efer & EFER_LMA)
2117                         exec_control |= VM_ENTRY_IA32E_MODE;
2118                 if (guest_efer != host_efer)
2119                         exec_control |= VM_ENTRY_LOAD_IA32_EFER;
2120         }
2121         vm_entry_controls_set(vmx, exec_control);
2122
2123         /*
2124          * EXIT CONTROLS
2125          *
2126          * L2->L1 exit controls are emulated - the hardware exit is to L0 so
2127          * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
2128          * bits may be modified by vmx_set_efer() in prepare_vmcs02().
2129          */
2130         exec_control = vmx_vmexit_ctrl();
2131         if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
2132                 exec_control |= VM_EXIT_LOAD_IA32_EFER;
2133         vm_exit_controls_set(vmx, exec_control);
2134
2135         /*
2136          * Interrupt/Exception Fields
2137          */
2138         if (vmx->nested.nested_run_pending) {
2139                 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2140                              vmcs12->vm_entry_intr_info_field);
2141                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2142                              vmcs12->vm_entry_exception_error_code);
2143                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2144                              vmcs12->vm_entry_instruction_len);
2145                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2146                              vmcs12->guest_interruptibility_info);
2147                 vmx->loaded_vmcs->nmi_known_unmasked =
2148                         !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
2149         } else {
2150                 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
2151         }
2152 }
2153
2154 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2155 {
2156         struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2157
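        /*
         * Skip writing guest segment and descriptor-table state if the
         * enlightened VMCS marks that group clean, i.e. L1 has not modified
         * it since the last sync.
         */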
2158         if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2159                            HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
2160                 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
2161                 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
2162                 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
2163                 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
2164                 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
2165                 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
2166                 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
2167                 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
2168                 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
2169                 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
2170                 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
2171                 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
2172                 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
2173                 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
2174                 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
2175                 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
2176                 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
2177                 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
2178                 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
2179                 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
2180                 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
2181                 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
2182                 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
2183                 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
2184                 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
2185                 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
2186                 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
2187                 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
2188                 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
2189                 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
2190                 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
2191                 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
2192                 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
2193                 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
2194                 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
2195                 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
2196         }
2197
2198         if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2199                            HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
2200                 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
2201                 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
2202                             vmcs12->guest_pending_dbg_exceptions);
2203                 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
2204                 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
2205
2206                 /*
2207                  * L1 may access L2's PDPTRs, so save them to construct
2208                  * vmcs12.
2209                  */
2210                 if (enable_ept) {
2211                         vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2212                         vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2213                         vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2214                         vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2215                 }
2216
2217                 if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
2218                     (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
2219                         vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
2220         }
2221
2222         if (nested_cpu_has_xsaves(vmcs12))
2223                 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
2224
2225         /*
2226          * Whether page-faults are trapped is determined by a combination of
2227          * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
2228          * If enable_ept, L0 doesn't care about page faults and we should
2229          * set all of these to L1's desires. However, if !enable_ept, L0 does
2230          * care about (at least some) page faults, and because it is not easy
2231          * (if at all possible?) to merge L0 and L1's desires, we simply ask
2232          * to exit on each and every L2 page fault. This is done by setting
2233          * MASK=MATCH=0 and (see below) EB.PF=1.
2234          * Note that below we don't need special code to set EB.PF beyond the
2235          * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
2236          * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
2237          * !enable_ept, EB.PF is 1, so the "or" will always be 1.
2238          */
2239         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
2240                 enable_ept ? vmcs12->page_fault_error_code_mask : 0);
2241         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
2242                 enable_ept ? vmcs12->page_fault_error_code_match : 0);
2243
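        /* EOI-exit bitmaps for L2 are taken directly from vmcs12. */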
2244         if (cpu_has_vmx_apicv()) {
2245                 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
2246                 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
2247                 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
2248                 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
2249         }
2250
2251         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2252         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2253
2254         set_cr4_guest_host_mask(vmx);
2255 }
2256
2257 /*
2258  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
2259  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
2260  * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
2261  * guest in a way that will both be appropriate to L1's requests, and our
2262  * needs. In addition to modifying the active vmcs (which is vmcs02), this
2263  * function also has necessary side effects, such as setting various
2264  * vcpu->arch fields.
2265  * Returns 0 on success and -EINVAL on failure; the invalid-state exit
2266  * qualification code is assigned to *entry_failure_code on failure.
2267  */
2268 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2269                           u32 *entry_failure_code)
2270 {
2271         struct vcpu_vmx *vmx = to_vmx(vcpu);
2272         struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2273         bool load_guest_pdptrs_vmcs12 = false;
2274
2275         if (vmx->nested.dirty_vmcs12 || hv_evmcs) {
2276                 prepare_vmcs02_rare(vmx, vmcs12);
2277                 vmx->nested.dirty_vmcs12 = false;
2278
2279                 load_guest_pdptrs_vmcs12 = !hv_evmcs ||
2280                         !(hv_evmcs->hv_clean_fields &
2281                           HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
2282         }
2283
2284         if (vmx->nested.nested_run_pending &&
2285             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
2286                 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
2287                 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
2288         } else {
2289                 kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
2290                 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
2291         }
2292         if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
2293             !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
2294                 vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
2295         vmx_set_rflags(vcpu, vmcs12->guest_rflags);
2296
2297         /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
2298          * bitwise-or of what L1 wants to trap for L2, and what we want to
2299          * trap. Note that CR0.TS also needs updating - we do this later.
2300          */
2301         update_exception_bitmap(vcpu);
2302         vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
2303         vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2304
2305         if (vmx->nested.nested_run_pending &&
2306             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
2307                 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
2308                 vcpu->arch.pat = vmcs12->guest_ia32_pat;
2309         } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2310                 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
2311         }
2312
2313         vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
2314
2315         if (kvm_has_tsc_control)
2316                 decache_tsc_multiplier(vmx);
2317
2318         if (enable_vpid) {
2319                 /*
2320                  * There is no direct mapping between vpid02 and vpid12: vpid02
2321                  * is per-vCPU in L0 and is reused, while a change of vpid12 is
2322                  * handled with one INVVPID during nested vmentry. vpid12 is
2323                  * allocated by L1 for L2, so it does not influence the global
2324                  * bitmap (used for vpid01 and vpid02 allocation) even if L1
2325                  * spawns a lot of nested vCPUs.
2326                  */
2327                 if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
2328                         if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
2329                                 vmx->nested.last_vpid = vmcs12->virtual_processor_id;
2330                                 __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false);
2331                         }
2332                 } else {
2333                         /*
2334                          * If L1 uses EPT, then L0 needs to execute INVEPT on
2335                          * EPTP02 instead of EPTP01. Therefore, delay the TLB
2336                          * flush until vmcs02->eptp is fully updated by
2337                          * KVM_REQ_LOAD_CR3. Note that this assumes
2338                          * KVM_REQ_TLB_FLUSH is evaluated after
2339                          * KVM_REQ_LOAD_CR3 in vcpu_enter_guest().
2340                          */
2341                         kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2342                 }
2343         }
2344
2345         if (nested_cpu_has_ept(vmcs12))
2346                 nested_ept_init_mmu_context(vcpu);
2347         else if (nested_cpu_has2(vmcs12,
2348                                  SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2349                 vmx_flush_tlb(vcpu, true);
2350
2351         /*
2352          * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
2353          * bits which we consider mandatorily enabled.
2354          * The CR0_READ_SHADOW is what L2 should have expected to read given
2355          * the specification by L1; it's not enough to take
2356          * vmcs12->cr0_read_shadow because our cr0_guest_host_mask may have
2357          * more bits than L1 expected.
2358          */
2359         vmx_set_cr0(vcpu, vmcs12->guest_cr0);
2360         vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2361
2362         vmx_set_cr4(vcpu, vmcs12->guest_cr4);
2363         vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
2364
2365         vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
2366         /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
2367         vmx_set_efer(vcpu, vcpu->arch.efer);
2368
2369         /*
2370          * Guest state is invalid and unrestricted guest is disabled,
2371          * which means L1 attempted VMEntry to L2 with invalid state.
2372          * Fail the VMEntry.
2373          */
2374         if (vmx->emulation_required) {
2375                 *entry_failure_code = ENTRY_FAIL_DEFAULT;
2376                 return -EINVAL;
2377         }
2378
2379         /* Load guest CR3, using either EPT or shadow page tables for L2. */
2380         if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
2381                                 entry_failure_code))
2382                 return -EINVAL;
2383
2384         /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
2385         if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
2386             is_pae_paging(vcpu)) {
2387                 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2388                 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2389                 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2390                 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2391         }
2392
2393         if (!enable_ept)
2394                 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
2395
2396         kvm_rsp_write(vcpu, vmcs12->guest_rsp);
2397         kvm_rip_write(vcpu, vmcs12->guest_rip);
2398         return 0;
2399 }
2400
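/*
 * Consistency checks for the NMI controls: virtual NMIs require NMI exiting,
 * and NMI-window exiting requires virtual NMIs.
 */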
2401 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
2402 {
2403         if (!nested_cpu_has_nmi_exiting(vmcs12) &&
2404             nested_cpu_has_virtual_nmis(vmcs12))
2405                 return -EINVAL;
2406
2407         if (!nested_cpu_has_virtual_nmis(vmcs12) &&
2408             nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING))
2409                 return -EINVAL;
2410
2411         return 0;
2412 }
2413
2414 static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
2415 {
2416         struct vcpu_vmx *vmx = to_vmx(vcpu);
2417         int maxphyaddr = cpuid_maxphyaddr(vcpu);
2418
2419         /* Check for memory type validity */
2420         switch (address & VMX_EPTP_MT_MASK) {
2421         case VMX_EPTP_MT_UC:
2422                 if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))
2423                         return false;
2424                 break;
2425         case VMX_EPTP_MT_WB:
2426                 if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))
2427                         return false;
2428                 break;
2429         default:
2430                 return false;
2431         }
2432
2433         /* Only a 4-level page-walk length is valid */
2434         if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)
2435                 return false;
2436
2437         /* Reserved bits should not be set */
2438         if (address >> maxphyaddr || ((address >> 7) & 0x1f))
2439                 return false;
2440
2441         /* AD, if set, should be supported */
2442         if (address & VMX_EPTP_AD_ENABLE_BIT) {
2443                 if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))
2444                         return false;
2445         }
2446
2447         return true;
2448 }
2449
2450 /*
2451  * Checks related to VM-Execution Control Fields
2452  */
2453 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
2454                                               struct vmcs12 *vmcs12)
2455 {
2456         struct vcpu_vmx *vmx = to_vmx(vcpu);
2457
2458         if (!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
2459                                 vmx->nested.msrs.pinbased_ctls_low,
2460                                 vmx->nested.msrs.pinbased_ctls_high) ||
2461             !vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
2462                                 vmx->nested.msrs.procbased_ctls_low,
2463                                 vmx->nested.msrs.procbased_ctls_high))
2464                 return -EINVAL;
2465
2466         if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
2467             !vmx_control_verify(vmcs12->secondary_vm_exec_control,
2468                                  vmx->nested.msrs.secondary_ctls_low,
2469                                  vmx->nested.msrs.secondary_ctls_high))
2470                 return -EINVAL;
2471
2472         if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu) ||
2473             nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
2474             nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
2475             nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
2476             nested_vmx_check_apic_access_controls(vcpu, vmcs12) ||
2477             nested_vmx_check_apicv_controls(vcpu, vmcs12) ||
2478             nested_vmx_check_nmi_controls(vmcs12) ||
2479             nested_vmx_check_pml_controls(vcpu, vmcs12) ||
2480             nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
2481             nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
2482             nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
2483             (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
2484                 return -EINVAL;
2485
2486         if (!nested_cpu_has_preemption_timer(vmcs12) &&
2487             nested_cpu_has_save_preemption_timer(vmcs12))
2488                 return -EINVAL;
2489
2490         if (nested_cpu_has_ept(vmcs12) &&
2491             !valid_ept_address(vcpu, vmcs12->ept_pointer))
2492                 return -EINVAL;
2493
2494         if (nested_cpu_has_vmfunc(vmcs12)) {
2495                 if (vmcs12->vm_function_control &
2496                     ~vmx->nested.msrs.vmfunc_controls)
2497                         return -EINVAL;
2498
2499                 if (nested_cpu_has_eptp_switching(vmcs12)) {
2500                         if (!nested_cpu_has_ept(vmcs12) ||
2501                             !page_address_valid(vcpu, vmcs12->eptp_list_address))
2502                                 return -EINVAL;
2503                 }
2504         }
2505
2506         return 0;
2507 }
2508
2509 /*
2510  * Checks related to VM-Exit Control Fields
2511  */
2512 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
2513                                          struct vmcs12 *vmcs12)
2514 {
2515         struct vcpu_vmx *vmx = to_vmx(vcpu);
2516
2517         if (!vmx_control_verify(vmcs12->vm_exit_controls,
2518                                 vmx->nested.msrs.exit_ctls_low,
2519                                 vmx->nested.msrs.exit_ctls_high) ||
2520             nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))
2521                 return -EINVAL;
2522
2523         return 0;
2524 }
2525
2526 /*
2527  * Checks related to VM-Entry Control Fields
2528  */
2529 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
2530                                           struct vmcs12 *vmcs12)
2531 {
2532         struct vcpu_vmx *vmx = to_vmx(vcpu);
2533
2534         if (!vmx_control_verify(vmcs12->vm_entry_controls,
2535                                 vmx->nested.msrs.entry_ctls_low,
2536                                 vmx->nested.msrs.entry_ctls_high))
2537                 return -EINVAL;
2538
2539         /*
2540          * From the Intel SDM, volume 3:
2541          * Fields relevant to VM-entry event injection must be set properly.
2542          * These fields are the VM-entry interruption-information field, the
2543          * VM-entry exception error code, and the VM-entry instruction length.
2544          */
2545         if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
2546                 u32 intr_info = vmcs12->vm_entry_intr_info_field;
2547                 u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
2548                 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
2549                 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
2550                 bool should_have_error_code;
2551                 bool urg = nested_cpu_has2(vmcs12,
2552                                            SECONDARY_EXEC_UNRESTRICTED_GUEST);
2553                 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
2554
2555                 /* VM-entry interruption-info field: interruption type */
2556                 if (intr_type == INTR_TYPE_RESERVED ||
2557                     (intr_type == INTR_TYPE_OTHER_EVENT &&
2558                      !nested_cpu_supports_monitor_trap_flag(vcpu)))
2559                         return -EINVAL;
2560
2561                 /* VM-entry interruption-info field: vector */
2562                 if ((intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
2563                     (intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
2564                     (intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
2565                         return -EINVAL;
2566
2567                 /* VM-entry interruption-info field: deliver error code */
2568                 should_have_error_code =
2569                         intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
2570                         x86_exception_has_error_code(vector);
2571                 if (has_error_code != should_have_error_code)
2572                         return -EINVAL;
2573
2574                 /* VM-entry exception error code */
2575                 if (has_error_code &&
2576                     vmcs12->vm_entry_exception_error_code & GENMASK(31, 15))
2577                         return -EINVAL;
2578
2579                 /* VM-entry interruption-info field: reserved bits */
2580                 if (intr_info & INTR_INFO_RESVD_BITS_MASK)
2581                         return -EINVAL;
2582
2583                 /* VM-entry instruction length */
2584                 switch (intr_type) {
2585                 case INTR_TYPE_SOFT_EXCEPTION:
2586                 case INTR_TYPE_SOFT_INTR:
2587                 case INTR_TYPE_PRIV_SW_EXCEPTION:
2588                         if ((vmcs12->vm_entry_instruction_len > 15) ||
2589                             (vmcs12->vm_entry_instruction_len == 0 &&
2590                              !nested_cpu_has_zero_length_injection(vcpu)))
2591                                 return -EINVAL;
2592                 }
2593         }
2594
2595         if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
2596                 return -EINVAL;
2597
2598         return 0;
2599 }
2600
2601 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
2602                                      struct vmcs12 *vmcs12)
2603 {
2604         if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
2605             nested_check_vm_exit_controls(vcpu, vmcs12) ||
2606             nested_check_vm_entry_controls(vcpu, vmcs12))
2607                 return -EINVAL;
2608
2609         return 0;
2610 }
2611
2612 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
2613                                        struct vmcs12 *vmcs12)
2614 {
2615         bool ia32e;
2616
2617         if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) ||
2618             !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
2619             !nested_cr3_valid(vcpu, vmcs12->host_cr3))
2620                 return -EINVAL;
2621
2622         if (is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu) ||
2623             is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))
2624                 return -EINVAL;
2625
2626         if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
2627             !kvm_pat_valid(vmcs12->host_ia32_pat))
2628                 return -EINVAL;
2629
2630         ia32e = (vmcs12->vm_exit_controls &
2631                  VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
2632
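        /*
         * Reminder on the selector layout used below: SEGMENT_RPL_MASK is
         * bits 1:0 and SEGMENT_TI_MASK is bit 2, so e.g. a host CS of 0x10
         * (GDT, RPL 0) is acceptable, while 0x13 (RPL 3) or 0x14 (LDT
         * selector) is not.
         */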
2633         if (vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
2634             vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
2635             vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
2636             vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
2637             vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
2638             vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
2639             vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
2640             vmcs12->host_cs_selector == 0 ||
2641             vmcs12->host_tr_selector == 0 ||
2642             (vmcs12->host_ss_selector == 0 && !ia32e))
2643                 return -EINVAL;
2644
2645 #ifdef CONFIG_X86_64
2646         if (is_noncanonical_address(vmcs12->host_fs_base, vcpu) ||
2647             is_noncanonical_address(vmcs12->host_gs_base, vcpu) ||
2648             is_noncanonical_address(vmcs12->host_gdtr_base, vcpu) ||
2649             is_noncanonical_address(vmcs12->host_idtr_base, vcpu) ||
2650             is_noncanonical_address(vmcs12->host_tr_base, vcpu))
2651                 return -EINVAL;
2652 #endif
2653
2654         /*
2655          * If the load IA32_EFER VM-exit control is 1, bits reserved in the
2656          * IA32_EFER MSR must be 0 in the field for that register. In addition,
2657          * the values of the LMA and LME bits in the field must each be that of
2658          * the host address-space size VM-exit control.
2659          */
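        /*
         * Rough example, assuming the guest CPUID exposes long mode: with
         * the host address-space size control set, host_ia32_efer = 0x500
         * (LME | LMA) satisfies the check below, whereas 0x100 (LME only)
         * fails the LMA comparison.
         */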
2660         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
2661                 if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
2662                     ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
2663                     ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
2664                         return -EINVAL;
2665         }
2666
2667         return 0;
2668 }
2669
2670 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
2671                                           struct vmcs12 *vmcs12)
2672 {
2673         int r = 0;
2674         struct vmcs12 *shadow;
2675         struct kvm_host_map map;
2676
2677         if (vmcs12->vmcs_link_pointer == -1ull)
2678                 return 0;
2679
2680         if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))
2681                 return -EINVAL;
2682
2683         if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
2684                 return -EINVAL;
2685
2686         shadow = map.hva;
2687
2688         if (shadow->hdr.revision_id != VMCS12_REVISION ||
2689             shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
2690                 r = -EINVAL;
2691
2692         kvm_vcpu_unmap(vcpu, &map, false);
2693         return r;
2694 }
2695
2696 /*
2697  * Checks related to Guest Non-register State
2698  */
2699 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
2700 {
2701         if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
2702             vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
2703                 return -EINVAL;
2704
2705         return 0;
2706 }
2707
2708 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
2709                                         struct vmcs12 *vmcs12,
2710                                         u32 *exit_qual)
2711 {
2712         bool ia32e;
2713
2714         *exit_qual = ENTRY_FAIL_DEFAULT;
2715
2716         if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
2717             !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
2718                 return -EINVAL;
2719
2720         if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
2721             !kvm_pat_valid(vmcs12->guest_ia32_pat))
2722                 return -EINVAL;
2723
2724         if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
2725                 *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
2726                 return -EINVAL;
2727         }
2728
2729         /*
2730          * If the load IA32_EFER VM-entry control is 1, the following checks
2731          * are performed on the field for the IA32_EFER MSR:
2732          * - Bits reserved in the IA32_EFER MSR must be 0.
2733          * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
2734          *   the IA-32e mode guest VM-exit control. It must also be identical
2735          *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
2736          *   CR0.PG) is 1.
2737          */
2738         if (to_vmx(vcpu)->nested.nested_run_pending &&
2739             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
2740                 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
2741                 if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
2742                     ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
2743                     ((vmcs12->guest_cr0 & X86_CR0_PG) &&
2744                      ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
2745                         return -EINVAL;
2746         }
2747
2748         if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
2749             (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
2750              (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
2751                 return -EINVAL;
2752
2753         if (nested_check_guest_non_reg_state(vmcs12))
2754                 return -EINVAL;
2755
2756         return 0;
2757 }
2758
2759 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
2760 {
2761         struct vcpu_vmx *vmx = to_vmx(vcpu);
2762         unsigned long cr3, cr4;
2763         bool vm_fail;
2764
2765         if (!nested_early_check)
2766                 return 0;
2767
2768         if (vmx->msr_autoload.host.nr)
2769                 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
2770         if (vmx->msr_autoload.guest.nr)
2771                 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
2772
2773         preempt_disable();
2774
2775         vmx_prepare_switch_to_guest(vcpu);
2776
2777         /*
2778          * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
2779          * which is reserved to '1' by hardware.  GUEST_RFLAGS is guaranteed to
2780          * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
2781          * there is no need to preserve other bits or save/restore the field.
2782          */
2783         vmcs_writel(GUEST_RFLAGS, 0);
2784
2785         cr3 = __get_current_cr3_fast();
2786         if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
2787                 vmcs_writel(HOST_CR3, cr3);
2788                 vmx->loaded_vmcs->host_state.cr3 = cr3;
2789         }
2790
2791         cr4 = cr4_read_shadow();
2792         if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
2793                 vmcs_writel(HOST_CR4, cr4);
2794                 vmx->loaded_vmcs->host_state.cr4 = cr4;
2795         }
2796
2797         asm(
2798                 "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
2799                 "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
2800                 "je 1f \n\t"
2801                 __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
2802                 "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
2803                 "1: \n\t"
2804                 "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
2805
2806                 /* Check if vmlaunch or vmresume is needed */
2807                 "cmpb $0, %c[launched](%[loaded_vmcs])\n\t"
2808
2809                 /*
2810                  * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
2811                  * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
2812                  * Valid.  vmx_vmenter() directly "returns" RFLAGS, and so the
2813                  * result of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
2814                  */
2815                 "call vmx_vmenter\n\t"
2816
2817                 CC_SET(be)
2818               : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail)
2819               : [HOST_RSP]"r"((unsigned long)HOST_RSP),
2820                 [loaded_vmcs]"r"(vmx->loaded_vmcs),
2821                 [launched]"i"(offsetof(struct loaded_vmcs, launched)),
2822                 [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
2823                 [wordsize]"i"(sizeof(ulong))
2824               : "memory"
2825         );
2826
2827         if (vmx->msr_autoload.host.nr)
2828                 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2829         if (vmx->msr_autoload.guest.nr)
2830                 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2831
2832         if (vm_fail) {
2833                 preempt_enable();
2834                 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
2835                              VMXERR_ENTRY_INVALID_CONTROL_FIELD);
2836                 return 1;
2837         }
2838
2839         /*
2840          * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
2841          */
2842         local_irq_enable();
2843         if (hw_breakpoint_active())
2844                 set_debugreg(__this_cpu_read(cpu_dr7), 7);
2845         preempt_enable();
2846
2847         /*
2848          * A non-failing VMEntry means we somehow entered guest mode with
2849          * an illegal RIP, and that's just the tip of the iceberg.  There
2850          * is no telling what memory has been modified or what state has
2851          * been exposed to unknown code.  Hitting this all but guarantees
2852          * a (very critical) hardware issue.
2853          */
2854         WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
2855                 VMX_EXIT_REASONS_FAILED_VMENTRY));
2856
2857         return 0;
2858 }
2859
2860 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
2861                                                  struct vmcs12 *vmcs12);
2862
2863 static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
2864 {
2865         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2866         struct vcpu_vmx *vmx = to_vmx(vcpu);
2867         struct kvm_host_map *map;
2868         struct page *page;
2869         u64 hpa;
2870
2871         if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
2872                 /*
2873                  * Translate L1 physical address to host physical
2874                  * address for vmcs02. Keep the page pinned, so this
2875                  * physical address remains valid. We keep a reference
2876                  * to it so we can release it later.
2877                  */
2878                 if (vmx->nested.apic_access_page) { /* shouldn't happen */
2879                         kvm_release_page_dirty(vmx->nested.apic_access_page);
2880                         vmx->nested.apic_access_page = NULL;
2881                 }
2882                 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
2883                 /*
2884                  * If translation failed, no matter: This feature asks
2885                  * to exit when accessing the given address, and if it
2886                  * can never be accessed, this feature won't do
2887                  * anything anyway.
2888                  */
2889                 if (!is_error_page(page)) {
2890                         vmx->nested.apic_access_page = page;
2891                         hpa = page_to_phys(vmx->nested.apic_access_page);
2892                         vmcs_write64(APIC_ACCESS_ADDR, hpa);
2893                 } else {
2894                         secondary_exec_controls_clearbit(vmx,
2895                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
2896                 }
2897         }
2898
2899         if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
2900                 map = &vmx->nested.virtual_apic_map;
2901
2902                 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
2903                         vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
2904                 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
2905                            nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
2906                            !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
2907                         /*
2908                          * The processor will never use the TPR shadow, simply
2909                          * clear the bit from the execution control.  Such a
2910                          * configuration is useless, but it happens in tests.
2911                          * For any other configuration, failing the vm entry is
2912                          * _not_ what the processor does but it's basically the
2913                          * only possibility we have.
2914                          */
2915                         exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
2916                 } else {
2917                         /*
2918                          * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
2919                          * force VM-Entry to fail.
2920                          */
2921                         vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
2922                 }
2923         }
2924
2925         if (nested_cpu_has_posted_intr(vmcs12)) {
2926                 map = &vmx->nested.pi_desc_map;
2927
2928                 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
2929                         vmx->nested.pi_desc =
2930                                 (struct pi_desc *)(((void *)map->hva) +
2931                                 offset_in_page(vmcs12->posted_intr_desc_addr));
2932                         vmcs_write64(POSTED_INTR_DESC_ADDR,
2933                                      pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
2934                 }
2935         }
2936         if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
2937                 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
2938         else
2939                 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
2940 }
2941
2942 /*
2943  * Intel's VMX Instruction Reference specifies a common set of prerequisites
2944  * for running VMX instructions (except VMXON, whose prerequisites are
2945  * slightly different). It also specifies what exception to inject otherwise.
2946  * Note that many of these exceptions have priority over VM exits, so they
2947  * don't have to be checked again here.
2948  */
2949 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
2950 {
2951         if (!to_vmx(vcpu)->nested.vmxon) {
2952                 kvm_queue_exception(vcpu, UD_VECTOR);
2953                 return 0;
2954         }
2955
2956         if (vmx_get_cpl(vcpu)) {
2957                 kvm_inject_gp(vcpu, 0);
2958                 return 0;
2959         }
2960
2961         return 1;
2962 }
2963
2964 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
2965 {
2966         u8 rvi = vmx_get_rvi();
2967         u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
2968
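        /*
         * Only the priority class (upper nibble) is compared; e.g. a
         * hypothetical RVI of 0x51 against a PPR of 0x40 yields
         * 0x50 > 0x40, i.e. a deliverable interrupt.
         */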
2969         return ((rvi & 0xf0) > (vppr & 0xf0));
2970 }
2971
2972 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
2973                                    struct vmcs12 *vmcs12);
2974
2975 /*
2976  * If from_vmentry is false, this is being called from state restore (either RSM
2977  * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
2978  *
2979  * Returns:
2980  *   0 - success, i.e. proceed with actual VMEnter
2981  *   1 - consistency check VMExit
2982  *  -1 - consistency check VMFail
2983  */
2984 int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
2985 {
2986         struct vcpu_vmx *vmx = to_vmx(vcpu);
2987         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2988         bool evaluate_pending_interrupts;
2989         u32 exit_reason = EXIT_REASON_INVALID_STATE;
2990         u32 exit_qual;
2991
2992         evaluate_pending_interrupts = exec_controls_get(vmx) &
2993                 (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
2994         if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
2995                 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
2996
2997         if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
2998                 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
2999         if (kvm_mpx_supported() &&
3000                 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
3001                 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3002
3003         /*
3004          * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
3005          * nested early checks are disabled.  In the event of a "late" VM-Fail,
3006          * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
3007          * software model to the pre-VMEntry host state.  When EPT is disabled,
3008          * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
3009          * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3.  Stuffing
3010          * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
3011          * the correct value.  Smashing vmcs01.GUEST_CR3 is safe because nested
3012          * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
3013          * guaranteed to be overwritten with a shadow CR3 prior to re-entering
3014          * L1.  Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
3015          * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
3016          * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
3017          * path would need to manually save/restore vmcs01.GUEST_CR3.
3018          */
3019         if (!enable_ept && !nested_early_check)
3020                 vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3021
3022         vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
3023
3024         prepare_vmcs02_early(vmx, vmcs12);
3025
3026         if (from_vmentry) {
3027                 nested_get_vmcs12_pages(vcpu);
3028
3029                 if (nested_vmx_check_vmentry_hw(vcpu)) {
3030                         vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3031                         return -1;
3032                 }
3033
3034                 if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
3035                         goto vmentry_fail_vmexit;
3036         }
3037
3038         enter_guest_mode(vcpu);
3039         if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
3040                 vcpu->arch.tsc_offset += vmcs12->tsc_offset;
3041
3042         if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
3043                 goto vmentry_fail_vmexit_guest_mode;
3044
3045         if (from_vmentry) {
3046                 exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
3047                 exit_qual = nested_vmx_load_msr(vcpu,
3048                                                 vmcs12->vm_entry_msr_load_addr,
3049                                                 vmcs12->vm_entry_msr_load_count);
3050                 if (exit_qual)
3051                         goto vmentry_fail_vmexit_guest_mode;
3052         } else {
3053                 /*
3054                  * The MMU is not initialized to point at the right entities yet and
3055                  * "get pages" would need to read data from the guest (i.e. we will
3056                  * need to perform gpa to hpa translation). Request a call
3057                  * to nested_get_vmcs12_pages before the next VM-entry.  The MSRs
3058                  * have already been set at vmentry time and should not be reset.
3059                  */
3060                 kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
3061         }
3062
3063         /*
3064          * If L1 had a pending IRQ/NMI until it executed
3065          * VMLAUNCH/VMRESUME which wasn't delivered because it was
3066          * disallowed (e.g. interrupts disabled), L0 needs to
3067          * evaluate if this pending event should cause an exit from L2
3068          * to L1 or be delivered directly to L2 (e.g. in case L1 doesn't
3069          * intercept EXTERNAL_INTERRUPT).
3070          *
3071          * Usually this would be handled by the processor noticing an
3072          * IRQ/NMI window request, or checking RVI during evaluation of
3073          * pending virtual interrupts.  However, this setting was done
3074          * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
3075          * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
3076          */
3077         if (unlikely(evaluate_pending_interrupts))
3078                 kvm_make_request(KVM_REQ_EVENT, vcpu);
3079
3080         /*
3081          * Do not start the preemption timer hrtimer until after we know
3082          * we are successful, so that only nested_vmx_vmexit needs to cancel
3083          * the timer.
3084          */
3085         vmx->nested.preemption_timer_expired = false;
3086         if (nested_cpu_has_preemption_timer(vmcs12))
3087                 vmx_start_preemption_timer(vcpu);
3088
3089         /*
3090          * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
3091          * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
3092          * returned as far as L1 is concerned. It will only return (and set
3093          * the success flag) when L2 exits (see nested_vmx_vmexit()).
3094          */
3095         return 0;
3096
3097         /*
3098          * A failed consistency check that leads to a VMExit during L1's
3099          * VMEnter to L2 is a variation of a normal VMexit, as explained in
3100          * 26.7 "VM-entry failures during or after loading guest state".
3101          */
3102 vmentry_fail_vmexit_guest_mode:
3103         if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
3104                 vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
3105         leave_guest_mode(vcpu);
3106
3107 vmentry_fail_vmexit:
3108         vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3109
3110         if (!from_vmentry)
3111                 return 1;
3112
3113         load_vmcs12_host_state(vcpu, vmcs12);
3114         vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
3115         vmcs12->exit_qualification = exit_qual;
3116         if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
3117                 vmx->nested.need_vmcs12_to_shadow_sync = true;
3118         return 1;
3119 }
3120
3121 /*
3122  * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
3123  * for running an L2 nested guest.
3124  */
3125 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
3126 {
3127         struct vmcs12 *vmcs12;
3128         struct vcpu_vmx *vmx = to_vmx(vcpu);
3129         u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
3130         int ret;
3131
3132         if (!nested_vmx_check_permission(vcpu))
3133                 return 1;
3134
3135         if (!nested_vmx_handle_enlightened_vmptrld(vcpu, launch))
3136                 return 1;
3137
3138         if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
3139                 return nested_vmx_failInvalid(vcpu);
3140
3141         vmcs12 = get_vmcs12(vcpu);
3142
3143         /*
3144          * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
3145          * that there *is* a valid VMCS pointer, RFLAGS.CF is set
3146          * rather than RFLAGS.ZF, and no error number is stored to the
3147          * VM-instruction error field.
3148          */
3149         if (vmcs12->hdr.shadow_vmcs)
3150                 return nested_vmx_failInvalid(vcpu);
3151
3152         if (vmx->nested.hv_evmcs) {
3153                 copy_enlightened_to_vmcs12(vmx);
3154                 /* Enlightened VMCS doesn't have launch state */
3155                 vmcs12->launch_state = !launch;
3156         } else if (enable_shadow_vmcs) {
3157                 copy_shadow_to_vmcs12(vmx);
3158         }
3159
3160         /*
3161          * The nested entry process starts with enforcing various prerequisites
3162          * on vmcs12 as required by the Intel SDM, and acting appropriately when
3163          * they fail: As the SDM explains, some conditions should cause the
3164          * instruction to fail, while others will cause the instruction to seem
3165          * to succeed, but return an EXIT_REASON_INVALID_STATE.
3166          * To speed up the normal (success) code path, we should avoid checking
3167          * for misconfigurations which will anyway be caught by the processor
3168          * when using the merged vmcs02.
3169          */
3170         if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
3171                 return nested_vmx_failValid(vcpu,
3172                         VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
3173
3174         if (vmcs12->launch_state == launch)
3175                 return nested_vmx_failValid(vcpu,
3176                         launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
3177                                : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
3178
3179         if (nested_vmx_check_controls(vcpu, vmcs12))
3180                 return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3181
3182         if (nested_vmx_check_host_state(vcpu, vmcs12))
3183                 return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
3184
3185         /*
3186          * We're finally done with prerequisite checking, and can start with
3187          * the nested entry.
3188          */
3189         vmx->nested.nested_run_pending = 1;
3190         ret = nested_vmx_enter_non_root_mode(vcpu, true);
3191         vmx->nested.nested_run_pending = !ret;
3192         if (ret > 0)
3193                 return 1;
3194         else if (ret)
3195                 return nested_vmx_failValid(vcpu,
3196                         VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3197
3198         /* Hide L1D cache contents from the nested guest.  */
3199         vmx->vcpu.arch.l1tf_flush_l1d = true;
3200
3201         /*
3202          * Must happen outside of nested_vmx_enter_non_root_mode() as it will
3203          * also be used as part of restoring nVMX state for
3204          * snapshot restore (migration).
3205          *
3206          * In this flow, it is assumed that vmcs12 cache was
3207          * transferred as part of captured nVMX state and should
3208          * therefore not be read from guest memory (which may not
3209          * exist on destination host yet).
3210          */
3211         nested_cache_shadow_vmcs12(vcpu, vmcs12);
3212
3213         /*
3214          * If we're entering a halted L2 vcpu and the L2 vcpu won't be
3215          * awakened by event injection or by an NMI-window VM-exit or
3216          * by an interrupt-window VM-exit, halt the vcpu.
3217          */
3218         if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
3219             !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
3220             !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) &&
3221             !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) &&
3222               (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
3223                 vmx->nested.nested_run_pending = 0;
3224                 return kvm_vcpu_halt(vcpu);
3225         }
3226         return 1;
3227 }
3228
3229 /*
3230  * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
3231  * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
3232  * This function returns the new value we should put in vmcs12.guest_cr0.
3233  * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
3234  *  1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
3235  *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
3236  *     didn't trap the bit, because if L1 did, so would L0).
3237  *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
3238  *     been modified by L2, and L1 knows it. So just leave the old value of
3239  *     the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
3240  *     isn't relevant, because if L0 traps this bit it can set it to anything.
3241  *  3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
3242  *     changed these bits, and therefore they need to be updated, but L0
3243  *     didn't necessarily allow them to be changed in GUEST_CR0 - and rather
3244  *     put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
3245  */
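/*
 * Hedged numeric sketch of the three cases above: if cr0_guest_owned_bits
 * were just CR0.TS (bit 3) and L1's cr0_guest_host_mask covered CR0.PE
 * (bit 0), then TS would come from vmcs02 GUEST_CR0 (case 1), PE from
 * vmcs12->guest_cr0 (case 2), and all remaining bits from vmcs02's
 * CR0_READ_SHADOW (case 3).
 */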
3246 static inline unsigned long
3247 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3248 {
3249         return
3250         /*1*/   (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
3251         /*2*/   (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
3252         /*3*/   (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
3253                         vcpu->arch.cr0_guest_owned_bits));
3254 }
3255
3256 static inline unsigned long
3257 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3258 {
3259         return
3260         /*1*/   (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
3261         /*2*/   (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
3262         /*3*/   (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
3263                         vcpu->arch.cr4_guest_owned_bits));
3264 }
3265
3266 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
3267                                       struct vmcs12 *vmcs12)
3268 {
3269         u32 idt_vectoring;
3270         unsigned int nr;
3271
3272         if (vcpu->arch.exception.injected) {
3273                 nr = vcpu->arch.exception.nr;
3274                 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3275
3276                 if (kvm_exception_is_soft(nr)) {
3277                         vmcs12->vm_exit_instruction_len =
3278                                 vcpu->arch.event_exit_inst_len;
3279                         idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
3280                 } else
3281                         idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
3282
3283                 if (vcpu->arch.exception.has_error_code) {
3284                         idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
3285                         vmcs12->idt_vectoring_error_code =
3286                                 vcpu->arch.exception.error_code;
3287                 }
3288
3289                 vmcs12->idt_vectoring_info_field = idt_vectoring;
3290         } else if (vcpu->arch.nmi_injected) {
3291                 vmcs12->idt_vectoring_info_field =
3292                         INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
3293         } else if (vcpu->arch.interrupt.injected) {
3294                 nr = vcpu->arch.interrupt.nr;
3295                 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3296
3297                 if (vcpu->arch.interrupt.soft) {
3298                         idt_vectoring |= INTR_TYPE_SOFT_INTR;
3299                         vmcs12->vm_entry_instruction_len =
3300                                 vcpu->arch.event_exit_inst_len;
3301                 } else
3302                         idt_vectoring |= INTR_TYPE_EXT_INTR;
3303
3304                 vmcs12->idt_vectoring_info_field = idt_vectoring;
3305         }
3306 }
3307
3308
3309 static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
3310 {
3311         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3312         gfn_t gfn;
3313
3314         /*
3315          * Don't need to mark the APIC access page dirty; it is never
3316          * written to by the CPU during APIC virtualization.
3317          */
3318
3319         if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3320                 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
3321                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3322         }
3323
3324         if (nested_cpu_has_posted_intr(vmcs12)) {
3325                 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
3326                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3327         }
3328 }
3329
3330 static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
3331 {
3332         struct vcpu_vmx *vmx = to_vmx(vcpu);
3333         int max_irr;
3334         void *vapic_page;
3335         u16 status;
3336
3337         if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
3338                 return;
3339
3340         vmx->nested.pi_pending = false;
3341         if (!pi_test_and_clear_on(vmx->nested.pi_desc))
3342                 return;
3343
3344         max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
3345         if (max_irr != 256) {
3346                 vapic_page = vmx->nested.virtual_apic_map.hva;
3347                 if (!vapic_page)
3348                         return;
3349
3350                 __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
3351                         vapic_page, &max_irr);
3352                 status = vmcs_read16(GUEST_INTR_STATUS);
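                /*
                 * The low byte of GUEST_INTR_STATUS is RVI; it is only ever
                 * raised here, e.g. a hypothetical highest pending PIR
                 * vector of 0x41 bumps an RVI of 0x30 up to 0x41, while a
                 * lower vector leaves RVI untouched.
                 */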
3353                 if ((u8)max_irr > ((u8)status & 0xff)) {
3354                         status &= ~0xff;
3355                         status |= (u8)max_irr;
3356                         vmcs_write16(GUEST_INTR_STATUS, status);
3357                 }
3358         }
3359
3360         nested_mark_vmcs12_pages_dirty(vcpu);
3361 }
3362
3363 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
3364                                                unsigned long exit_qual)
3365 {
3366         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3367         unsigned int nr = vcpu->arch.exception.nr;
3368         u32 intr_info = nr | INTR_INFO_VALID_MASK;
3369
3370         if (vcpu->arch.exception.has_error_code) {
3371                 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
3372                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3373         }
3374
3375         if (kvm_exception_is_soft(nr))
3376                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3377         else
3378                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
3379
3380         if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
3381             vmx_get_nmi_mask(vcpu))
3382                 intr_info |= INTR_INFO_UNBLOCK_NMI;
3383
3384         nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
3385 }
3386
3387 static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
3388 {
3389         struct vcpu_vmx *vmx = to_vmx(vcpu);
3390         unsigned long exit_qual;
3391         bool block_nested_events =
3392             vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
3393
3394         if (vcpu->arch.exception.pending &&
3395                 nested_vmx_check_exception(vcpu, &exit_qual)) {
3396                 if (block_nested_events)
3397                         return -EBUSY;
3398                 nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
3399                 return 0;
3400         }
3401
3402         if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
3403             vmx->nested.preemption_timer_expired) {
3404                 if (block_nested_events)
3405                         return -EBUSY;
3406                 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
3407                 return 0;
3408         }
3409
3410         if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
3411                 if (block_nested_events)
3412                         return -EBUSY;
3413                 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
3414                                   NMI_VECTOR | INTR_TYPE_NMI_INTR |
3415                                   INTR_INFO_VALID_MASK, 0);
3416                 /*
3417                  * The NMI-triggered VM exit counts as injection:
3418                  * clear this one and block further NMIs.
3419                  */
3420                 vcpu->arch.nmi_pending = 0;
3421                 vmx_set_nmi_mask(vcpu, true);
3422                 return 0;
3423         }
3424
3425         if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
3426             nested_exit_on_intr(vcpu)) {
3427                 if (block_nested_events)
3428                         return -EBUSY;
3429                 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
3430                 return 0;
3431         }
3432
3433         vmx_complete_nested_posted_interrupt(vcpu);
3434         return 0;
3435 }
3436
3437 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
3438 {
3439         ktime_t remaining =
3440                 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
3441         u64 value;
3442
3443         if (ktime_to_ns(remaining) <= 0)
3444                 return 0;
3445
3446         value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
3447         do_div(value, 1000000);
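        /*
         * ns * kHz / 10^6 gives TSC cycles, and the shift converts cycles
         * to preemption timer units at the emulated rate (TSC >> 5).  For
         * example, 1,000,000 ns remaining at a hypothetical 2,000,000 kHz
         * TSC is 2,000,000 cycles, i.e. a timer value of 62,500.
         */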
3448         return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
3449 }
3450
3451 static bool is_vmcs12_ext_field(unsigned long field)
3452 {
3453         switch (field) {
3454         case GUEST_ES_SELECTOR:
3455         case GUEST_CS_SELECTOR:
3456         case GUEST_SS_SELECTOR:
3457         case GUEST_DS_SELECTOR:
3458         case GUEST_FS_SELECTOR:
3459         case GUEST_GS_SELECTOR:
3460         case GUEST_LDTR_SELECTOR:
3461         case GUEST_TR_SELECTOR:
3462         case GUEST_ES_LIMIT:
3463         case GUEST_CS_LIMIT:
3464         case GUEST_SS_LIMIT:
3465         case GUEST_DS_LIMIT:
3466         case GUEST_FS_LIMIT:
3467         case GUEST_GS_LIMIT:
3468         case GUEST_LDTR_LIMIT:
3469         case GUEST_TR_LIMIT:
3470         case GUEST_GDTR_LIMIT:
3471         case GUEST_IDTR_LIMIT:
3472         case GUEST_ES_AR_BYTES:
3473         case GUEST_DS_AR_BYTES:
3474         case GUEST_FS_AR_BYTES:
3475         case GUEST_GS_AR_BYTES:
3476         case GUEST_LDTR_AR_BYTES:
3477         case GUEST_TR_AR_BYTES:
3478         case GUEST_ES_BASE:
3479         case GUEST_CS_BASE:
3480         case GUEST_SS_BASE:
3481         case GUEST_DS_BASE:
3482         case GUEST_FS_BASE:
3483         case GUEST_GS_BASE:
3484         case GUEST_LDTR_BASE:
3485         case GUEST_TR_BASE:
3486         case GUEST_GDTR_BASE:
3487         case GUEST_IDTR_BASE:
3488         case GUEST_PENDING_DBG_EXCEPTIONS:
3489         case GUEST_BNDCFGS:
3490                 return true;
3491         default:
3492                 break;
3493         }
3494
3495         return false;
3496 }
3497
3498 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
3499                                        struct vmcs12 *vmcs12)
3500 {
3501         struct vcpu_vmx *vmx = to_vmx(vcpu);
3502
3503         vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
3504         vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
3505         vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
3506         vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
3507         vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
3508         vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
3509         vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
3510         vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
3511         vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
3512         vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
3513         vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
3514         vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
3515         vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
3516         vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
3517         vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
3518         vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
3519         vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
3520         vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
3521         vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
3522         vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
3523         vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
3524         vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
3525         vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
3526         vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
3527         vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
3528         vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
3529         vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
3530         vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
3531         vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
3532         vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
3533         vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
3534         vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
3535         vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
3536         vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
3537         vmcs12->guest_pending_dbg_exceptions =
3538                 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
3539         if (kvm_mpx_supported())
3540                 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3541
3542         vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
3543 }
3544
3545 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
3546                                        struct vmcs12 *vmcs12)
3547 {
3548         struct vcpu_vmx *vmx = to_vmx(vcpu);
3549         int cpu;
3550
3551         if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
3552                 return;
3553
3554
3555         WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
3556
3557         cpu = get_cpu();
3558         vmx->loaded_vmcs = &vmx->nested.vmcs02;
3559         vmx_vcpu_load(&vmx->vcpu, cpu);
3560
3561         sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
3562
3563         vmx->loaded_vmcs = &vmx->vmcs01;
3564         vmx_vcpu_load(&vmx->vcpu, cpu);
3565         put_cpu();
3566 }
3567
3568 /*
3569  * Update the guest state fields of vmcs12 to reflect changes that
3570  * occurred while L2 was running. (The "IA-32e mode guest" bit of the
3571  * VM-entry controls is also updated, since this is really a guest
3572  * state bit.)
3573  */
3574 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3575 {
3576         struct vcpu_vmx *vmx = to_vmx(vcpu);
3577
3578         if (vmx->nested.hv_evmcs)
3579                 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
3580
3581         vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs;
3582
3583         vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
3584         vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
3585
3586         vmcs12->guest_rsp = kvm_rsp_read(vcpu);
3587         vmcs12->guest_rip = kvm_rip_read(vcpu);
3588         vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
3589
3590         vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
3591         vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
3592
3593         vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
3594         vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
3595         vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
3596
3597         vmcs12->guest_interruptibility_info =
3598                 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3599
3600         if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
3601                 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
3602         else
3603                 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
3604
3605         if (nested_cpu_has_preemption_timer(vmcs12) &&
3606             vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
3607                         vmcs12->vmx_preemption_timer_value =
3608                                 vmx_get_preemption_timer_value(vcpu);
3609
3610         /*
3611          * In some cases (usually, nested EPT), L2 is allowed to change its
3612          * own CR3 without exiting. If it has changed it, we must keep it.
3613          * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
3614          * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
3615          *
3616          * Additionally, restore L2's PDPTR to vmcs12.
3617          */
3618         if (enable_ept) {
3619                 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
3620                 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
3621                         vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
3622                         vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
3623                         vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
3624                         vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
3625                 }
3626         }
3627
3628         vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
3629
3630         if (nested_cpu_has_vid(vmcs12))
3631                 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
3632
3633         vmcs12->vm_entry_controls =
3634                 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
3635                 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
3636
3637         if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
3638                 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
3639
3640         if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
3641                 vmcs12->guest_ia32_efer = vcpu->arch.efer;
3642 }
3643
3644 /*
3645  * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
3646  * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
3647  * and this function updates it to reflect the changes to the guest state while
3648  * L2 was running (and perhaps made some exits which were handled directly by L0
3649  * without going back to L1), and to reflect the exit reason.
3650  * Note that we do not have to copy all VMCS fields here, just those that
3651  * could have changed by the L2 guest or the exit - i.e., the guest-state and
3652  * exit-information fields only. Other fields are modified by L1 with VMWRITE,
3653  * which already writes to vmcs12 directly.
3654  */
3655 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
3656                            u32 exit_reason, u32 exit_intr_info,
3657                            unsigned long exit_qualification)
3658 {
3659         /* update exit information fields: */
3660         vmcs12->vm_exit_reason = exit_reason;
3661         vmcs12->exit_qualification = exit_qualification;
3662         vmcs12->vm_exit_intr_info = exit_intr_info;
3663
3664         vmcs12->idt_vectoring_info_field = 0;
3665         vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3666         vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
3667
3668         if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
3669                 vmcs12->launch_state = 1;
3670
3671                 /* vm_entry_intr_info_field is cleared on exit. Emulate this
3672                  * instead of reading the real value. */
3673                 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
3674
3675                 /*
3676                  * Transfer the event that L0 or L1 may have wanted to inject into
3677                  * L2 to IDT_VECTORING_INFO_FIELD.
3678                  */
3679                 vmcs12_save_pending_event(vcpu, vmcs12);
3680
3681                 /*
3682                  * According to spec, there's no need to store the guest's
3683                  * MSRs if the exit is due to a VM-entry failure that occurs
3684                  * during or after loading the guest state. Since this exit
3685                  * does not fall in that category, we need to save the MSRs.
3686                  */
3687                 if (nested_vmx_store_msr(vcpu,
3688                                          vmcs12->vm_exit_msr_store_addr,
3689                                          vmcs12->vm_exit_msr_store_count))
3690                         nested_vmx_abort(vcpu,
3691                                          VMX_ABORT_SAVE_GUEST_MSR_FAIL);
3692         }
3693
3694         /*
3695          * Drop what we picked up for L2 via vmx_complete_interrupts. It is
3696          * preserved above and would only end up incorrectly in L1.
3697          */
3698         vcpu->arch.nmi_injected = false;
3699         kvm_clear_exception_queue(vcpu);
3700         kvm_clear_interrupt_queue(vcpu);
3701 }
3702
3703 /*
3704  * A part of what we need to do when the nested L2 guest exits and we want to
3705  * run its L1 parent, is to reset L1's guest state to the host state specified
3706  * in vmcs12.
3707  * This function is to be called not only on normal nested exit, but also on
3708  * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
3709  * Failures During or After Loading Guest State").
3710  * This function should be called when the active VMCS is L1's (vmcs01).
3711  */
3712 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3713                                    struct vmcs12 *vmcs12)
3714 {
3715         struct kvm_segment seg;
3716         u32 entry_failure_code;
3717
3718         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
3719                 vcpu->arch.efer = vmcs12->host_ia32_efer;
3720         else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
3721                 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
3722         else
3723                 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
3724         vmx_set_efer(vcpu, vcpu->arch.efer);
3725
3726         kvm_rsp_write(vcpu, vmcs12->host_rsp);
3727         kvm_rip_write(vcpu, vmcs12->host_rip);
3728         vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
3729         vmx_set_interrupt_shadow(vcpu, 0);
3730
3731         /*
3732          * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
3733          * actually changed, because vmx_set_cr0 refers to efer set above.
3734          *
3735          * CR0_GUEST_HOST_MASK is already set in the original vmcs01
3736          * (KVM doesn't change it);
3737          */
3738         vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
3739         vmx_set_cr0(vcpu, vmcs12->host_cr0);
3740
3741         /* Same as above - no reason to call set_cr4_guest_host_mask().  */
3742         vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
3743         vmx_set_cr4(vcpu, vmcs12->host_cr4);
3744
3745         nested_ept_uninit_mmu_context(vcpu);
3746
3747         /*
3748          * Only PDPTE load can fail as the value of cr3 was checked on entry and
3749          * couldn't have changed.
3750          */
3751         if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
3752                 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
3753
3754         if (!enable_ept)
3755                 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
3756
3757         /*
3758          * If vmcs01 doesn't use VPID, CPU flushes TLB on every
3759          * VMEntry/VMExit. Thus, no need to flush TLB.
3760          *
3761          * If vmcs12 doesn't use VPID, L1 expects TLB to be
3762          * flushed on every VMEntry/VMExit.
3763          *
3764          * Otherwise, we can preserve TLB entries as long as we are
3765          * able to tag L1 TLB entries differently than L2 TLB entries.
3766          *
3767          * If vmcs12 uses EPT, we need to execute this flush on EPTP01
3768          * and therefore we request the TLB flush to happen only after VMCS EPTP
3769          * has been set by KVM_REQ_LOAD_CR3.
3770          */
3771         if (enable_vpid &&
3772             (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
3773                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3774         }
3775
3776         vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
3777         vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
3778         vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
3779         vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
3780         vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
3781         vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
3782         vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
3783
3784         /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.  */
3785         if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
3786                 vmcs_write64(GUEST_BNDCFGS, 0);
3787
3788         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
3789                 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
3790                 vcpu->arch.pat = vmcs12->host_ia32_pat;
3791         }
3792         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
3793                 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
3794                         vmcs12->host_ia32_perf_global_ctrl);
3795
3796         /* Set L1 segment info according to Intel SDM section 27.5.2,
3797          * "Loading Host Segment and Descriptor-Table Registers". */
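             /*
              * Segment type 11 is an accessed execute/read code segment; type 3,
              * used for the data segments below, is an accessed read/write data
              * segment.
              */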
3798         seg = (struct kvm_segment) {
3799                 .base = 0,
3800                 .limit = 0xFFFFFFFF,
3801                 .selector = vmcs12->host_cs_selector,
3802                 .type = 11,
3803                 .present = 1,
3804                 .s = 1,
3805                 .g = 1
3806         };
3807         if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
3808                 seg.l = 1;
3809         else
3810                 seg.db = 1;
3811         vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
3812         seg = (struct kvm_segment) {
3813                 .base = 0,
3814                 .limit = 0xFFFFFFFF,
3815                 .type = 3,
3816                 .present = 1,
3817                 .s = 1,
3818                 .db = 1,
3819                 .g = 1
3820         };
3821         seg.selector = vmcs12->host_ds_selector;
3822         vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
3823         seg.selector = vmcs12->host_es_selector;
3824         vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
3825         seg.selector = vmcs12->host_ss_selector;
3826         vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
3827         seg.selector = vmcs12->host_fs_selector;
3828         seg.base = vmcs12->host_fs_base;
3829         vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
3830         seg.selector = vmcs12->host_gs_selector;
3831         seg.base = vmcs12->host_gs_base;
3832         vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
3833         seg = (struct kvm_segment) {
3834                 .base = vmcs12->host_tr_base,
3835                 .limit = 0x67,
3836                 .selector = vmcs12->host_tr_selector,
3837                 .type = 11,
3838                 .present = 1
3839         };
3840         vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
3841
3842         kvm_set_dr(vcpu, 7, 0x400);
3843         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
3844
3845         if (cpu_has_vmx_msr_bitmap())
3846                 vmx_update_msr_bitmap(vcpu);
3847
3848         if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
3849                                 vmcs12->vm_exit_msr_load_count))
3850                 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
3851 }
3852
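     /*
      * Best-effort reconstruction of L1's (vmcs01's) guest EFER: prefer the
      * VM-entry EFER load control, then assume EFER was left equal to host_efer
      * when hardware supports that control but vmcs01 doesn't use it, then the
      * MSR autoload list and KVM's shared MSR slots, falling back to host_efer.
      */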
3853 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
3854 {
3855         struct shared_msr_entry *efer_msr;
3856         unsigned int i;
3857
3858         if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
3859                 return vmcs_read64(GUEST_IA32_EFER);
3860
3861         if (cpu_has_load_ia32_efer())
3862                 return host_efer;
3863
3864         for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
3865                 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
3866                         return vmx->msr_autoload.guest.val[i].value;
3867         }
3868
3869         efer_msr = find_msr_entry(vmx, MSR_EFER);
3870         if (efer_msr)
3871                 return efer_msr->data;
3872
3873         return host_efer;
3874 }
3875
3876 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
3877 {
3878         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3879         struct vcpu_vmx *vmx = to_vmx(vcpu);
3880         struct vmx_msr_entry g, h;
3881         struct msr_data msr;
3882         gpa_t gpa;
3883         u32 i, j;
3884
3885         vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
3886
3887         if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
3888                 /*
3889                  * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set,
3890                  * as vmcs01.GUEST_DR7 contains a userspace-defined value
3891                  * and vcpu->arch.dr7 is not squirreled away before the
3892                  * nested VMENTER (not worth adding a variable in nested_vmx).
3893                  */
3894                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
3895                         kvm_set_dr(vcpu, 7, DR7_FIXED_1);
3896                 else
3897                         WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
3898         }
3899
3900         /*
3901          * Note that calling vmx_set_{efer,cr0,cr4} is important as they
3902          * handle a variety of side effects to KVM's software model.
3903          */
3904         vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
3905
3906         vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
3907         vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
3908
3909         vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
3910         vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
3911
3912         nested_ept_uninit_mmu_context(vcpu);
3913         vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3914         __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
3915
3916         /*
3917          * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
3918          * from vmcs01 (if necessary).  The PDPTRs are not loaded on
3919          * VMFail, like everything else we just need to ensure our
3920          * software model is up-to-date.
3921          */
3922         if (enable_ept)
3923                 ept_save_pdptrs(vcpu);
3924
3925         kvm_mmu_reset_context(vcpu);
3926
3927         if (cpu_has_vmx_msr_bitmap())
3928                 vmx_update_msr_bitmap(vcpu);
3929
3930         /*
3931          * This nasty bit of open coding is a compromise between blindly
3932          * loading L1's MSRs using the exit load lists (incorrect emulation
3933          * of VMFail), leaving the nested VM's MSRs in the software model
3934          * (incorrect behavior) and snapshotting the modified MSRs (too
3935          * expensive since the lists are not bounded by hardware).  For each
3936          * MSR that was (prematurely) loaded from the nested VMEntry load
3937          * list, reload it from the exit load list if it exists and differs
3938          * from the guest value.  The intent is to stuff host state as
3939          * silently as possible, not to fully process the exit load list.
3940          */
3941         msr.host_initiated = false;
3942         for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
3943                 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
3944                 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
3945                         pr_debug_ratelimited(
3946                                 "%s read MSR index failed (%u, 0x%08llx)\n",
3947                                 __func__, i, gpa);
3948                         goto vmabort;
3949                 }
3950
3951                 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
3952                         gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
3953                         if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
3954                                 pr_debug_ratelimited(
3955                                         "%s read MSR failed (%u, 0x%08llx)\n",
3956                                         __func__, j, gpa);
3957                                 goto vmabort;
3958                         }
3959                         if (h.index != g.index)
3960                                 continue;
3961                         if (h.value == g.value)
3962                                 break;
3963
3964                         if (nested_vmx_load_msr_check(vcpu, &h)) {
3965                                 pr_debug_ratelimited(
3966                                         "%s check failed (%u, 0x%x, 0x%x)\n",
3967                                         __func__, j, h.index, h.reserved);
3968                                 goto vmabort;
3969                         }
3970
3971                         msr.index = h.index;
3972                         msr.data = h.value;
3973                         if (kvm_set_msr(vcpu, &msr)) {
3974                                 pr_debug_ratelimited(
3975                                         "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
3976                                         __func__, j, h.index, h.value);
3977                                 goto vmabort;
3978                         }
3979                 }
3980         }
3981
3982         return;
3983
3984 vmabort:
3985         nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
3986 }
3987
3988 /*
3989  * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
3990  * and modify vmcs12 to make it see what it would expect to see there if
3991  * L2 were its real guest. Must only be called when in L2 (is_guest_mode()).
3992  */
3993 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
3994                        u32 exit_intr_info, unsigned long exit_qualification)
3995 {
3996         struct vcpu_vmx *vmx = to_vmx(vcpu);
3997         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3998
3999         /* trying to cancel vmlaunch/vmresume is a bug */
4000         WARN_ON_ONCE(vmx->nested.nested_run_pending);
4001
4002         leave_guest_mode(vcpu);
4003
4004         if (nested_cpu_has_preemption_timer(vmcs12))
4005                 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
4006
4007         if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
4008                 vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
4009
4010         if (likely(!vmx->fail)) {
4011                 sync_vmcs02_to_vmcs12(vcpu, vmcs12);
4012
4013                 if (exit_reason != -1)
4014                         prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
4015                                        exit_qualification);
4016
4017                 /*
4018                  * Must happen outside of sync_vmcs02_to_vmcs12() as it will
4019                  * also be used to capture vmcs12 cache as part of
4020                  * capturing nVMX state for snapshot (migration).
4021                  *
4022                  * Otherwise, this flush will dirty guest memory at a
4023                  * point it is already assumed by user-space to be
4024                  * immutable.
4025                  */
4026                 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
4027         } else {
4028                 /*
4029                  * The only expected VM-instruction error is "VM entry with
4030                  * invalid control field(s)." Anything else indicates a
4031                  * problem with L0.  And we should never get here with a
4032                  * VMFail of any type if early consistency checks are enabled.
4033                  */
4034                 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
4035                              VMXERR_ENTRY_INVALID_CONTROL_FIELD);
4036                 WARN_ON_ONCE(nested_early_check);
4037         }
4038
4039         vmx_switch_vmcs(vcpu, &vmx->vmcs01);
4040
4041         /* Update any VMCS fields that might have changed while L2 ran */
4042         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
4043         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
4044         vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
4045
4046         if (kvm_has_tsc_control)
4047                 decache_tsc_multiplier(vmx);
4048
4049         if (vmx->nested.change_vmcs01_virtual_apic_mode) {
4050                 vmx->nested.change_vmcs01_virtual_apic_mode = false;
4051                 vmx_set_virtual_apic_mode(vcpu);
4052         } else if (!nested_cpu_has_ept(vmcs12) &&
4053                    nested_cpu_has2(vmcs12,
4054                                    SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
4055                 vmx_flush_tlb(vcpu, true);
4056         }
4057
4058         /* Unpin physical memory we referred to in vmcs02 */
4059         if (vmx->nested.apic_access_page) {
4060                 kvm_release_page_dirty(vmx->nested.apic_access_page);
4061                 vmx->nested.apic_access_page = NULL;
4062         }
4063         kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
4064         kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
4065         vmx->nested.pi_desc = NULL;
4066
4067         /*
4068          * While running L2, the mmu_notifier reloads the APIC-access page's
4069          * hpa only for the L2 vmcs; reload it for L1 before entering L1.
4070          */
4071         kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
4072
4073         if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs))
4074                 vmx->nested.need_vmcs12_to_shadow_sync = true;
4075
4076         /* in case we halted in L2 */
4077         vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4078
4079         if (likely(!vmx->fail)) {
4080                 /*
4081                  * TODO: SDM says that with acknowledge interrupt on
4082                  * exit, bit 31 of the VM-exit interrupt information
4083                  * (valid interrupt) is always set to 1 on
4084                  * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't
4085                  * need kvm_cpu_has_interrupt().  See the commit
4086                  * message for details.
4087                  */
4088                 if (nested_exit_intr_ack_set(vcpu) &&
4089                     exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
4090                     kvm_cpu_has_interrupt(vcpu)) {
4091                         int irq = kvm_cpu_get_interrupt(vcpu);
4092                         WARN_ON(irq < 0);
4093                         vmcs12->vm_exit_intr_info = irq |
4094                                 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
4095                 }
4096
4097                 if (exit_reason != -1)
4098                         trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
4099                                                        vmcs12->exit_qualification,
4100                                                        vmcs12->idt_vectoring_info_field,
4101                                                        vmcs12->vm_exit_intr_info,
4102                                                        vmcs12->vm_exit_intr_error_code,
4103                                                        KVM_ISA_VMX);
4104
4105                 load_vmcs12_host_state(vcpu, vmcs12);
4106
4107                 return;
4108         }
4109
4110         /*
4111          * After an early L2 VM-entry failure, we're now back
4112          * in L1 which thinks it just finished a VMLAUNCH or
4113          * VMRESUME instruction, so we need to set the failure
4114          * flag and the VM-instruction error field of the VMCS
4115          * accordingly, and skip the emulated instruction.
4116          */
4117         (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
4118
4119         /*
4120          * Restore L1's host state to KVM's software model.  We're here
4121          * because a consistency check was caught by hardware, which
4122          * means some amount of guest state has been propagated to KVM's
4123          * model and needs to be unwound to the host's state.
4124          */
4125         nested_vmx_restore_host_state(vcpu);
4126
4127         vmx->fail = 0;
4128 }
4129
4130 /*
4131  * Decode the memory-address operand of a vmx instruction, as recorded on an
4132  * exit caused by such an instruction (run by a guest hypervisor).
4133  * On success, returns 0. When the operand is invalid, returns 1 and injects
4134  * a #UD, #GP, or #SS exception.
4135  */
4136 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
4137                         u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
4138 {
4139         gva_t off;
4140         bool exn;
4141         struct kvm_segment s;
4142
4143         /*
4144          * According to Vol. 3B, "Information for VM Exits Due to Instruction
4145          * Execution", on an exit, vmx_instruction_info holds most of the
4146          * addressing components of the operand. Only the displacement part
4147          * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
4148          * For how an actual address is calculated from all these components,
4149          * refer to Vol. 1, "Operand Addressing".
4150          */
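             /*
              * For reference, the fields decoded below live at these bits of the
              * instruction-information field:
              *   1:0   scaling,       9:7   address size,   10  Mem/Reg,
              *   17:15 segment reg,   21:18 index reg,       22  index invalid,
              *   26:23 base reg,      27    base invalid.
              */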
4151         int  scaling = vmx_instruction_info & 3;
4152         int  addr_size = (vmx_instruction_info >> 7) & 7;
4153         bool is_reg = vmx_instruction_info & (1u << 10);
4154         int  seg_reg = (vmx_instruction_info >> 15) & 7;
4155         int  index_reg = (vmx_instruction_info >> 18) & 0xf;
4156         bool index_is_valid = !(vmx_instruction_info & (1u << 22));
4157         int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
4158         bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
4159
4160         if (is_reg) {
4161                 kvm_queue_exception(vcpu, UD_VECTOR);
4162                 return 1;
4163         }
4164
4165         /* Addr = segment_base + offset */
4166         /* offset = base + [index * scale] + displacement */
4167         off = exit_qualification; /* holds the displacement */
4168         if (addr_size == 1)
4169                 off = (gva_t)sign_extend64(off, 31);
4170         else if (addr_size == 0)
4171                 off = (gva_t)sign_extend64(off, 15);
4172         if (base_is_valid)
4173                 off += kvm_register_read(vcpu, base_reg);
4174         if (index_is_valid)
4175                 off += kvm_register_read(vcpu, index_reg) << scaling;
4176         vmx_get_segment(vcpu, &s, seg_reg);
4177
4178         /*
4179          * The effective address, i.e. @off, of a memory operand is truncated
4180          * based on the address size of the instruction.  Note that this is
4181          * the *effective address*, i.e. the address prior to accounting for
4182          * the segment's base.
4183          */
4184         if (addr_size == 1) /* 32 bit */
4185                 off &= 0xffffffff;
4186         else if (addr_size == 0) /* 16 bit */
4187                 off &= 0xffff;
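             /*
              * e.g. with a 32-bit address size, base = 0xfffffffc and
              * displacement = 8 wrap around to an effective address of
              * 0x00000004.
              */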
4188
4189         /* Checks for #GP/#SS exceptions. */
4190         exn = false;
4191         if (is_long_mode(vcpu)) {
4192                 /*
4193                  * The virtual/linear address is never truncated in 64-bit
4194                  * mode, e.g. a 32-bit address size can yield a 64-bit virtual
4195                  * address when using FS/GS with a non-zero base.
4196                  */
4197                 *ret = s.base + off;
4198
4199                 /* Long mode: #GP(0)/#SS(0) if the memory address is in a
4200                  * non-canonical form. This is the only check on the memory
4201                  * destination for long mode!
4202                  */
4203                 exn = is_noncanonical_address(*ret, vcpu);
4204         } else {
4205                 /*
4206                  * When not in long mode, the virtual/linear address is
4207                  * unconditionally truncated to 32 bits regardless of the
4208                  * address size.
4209                  */
4210                 *ret = (s.base + off) & 0xffffffff;
4211
4212                 /* Protected mode: apply checks for segment validity in the
4213                  * following order:
4214                  * - segment type check (#GP(0) may be thrown)
4215                  * - usability check (#GP(0)/#SS(0))
4216                  * - limit check (#GP(0)/#SS(0))
4217                  */
4218                 if (wr)
4219                         /* #GP(0) if the destination operand is located in a
4220                          * read-only data segment or any code segment.
4221                          */
4222                         exn = ((s.type & 0xa) == 0 || (s.type & 8));
4223                 else
4224                         /* #GP(0) if the source operand is located in an
4225                          * execute-only code segment
4226                          */
4227                         exn = ((s.type & 0xa) == 8);
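                     /*
                      * In the checks above, type bit 3 distinguishes code from
                      * data segments and bit 1 is the Readable (code) or
                      * Writable (data) flag, hence the 0xa masks.
                      */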
4228                 if (exn) {
4229                         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
4230                         return 1;
4231                 }
4232                 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
4233                  */
4234                 exn = (s.unusable != 0);
4235
4236                 /*
4237                  * Protected mode: #GP(0)/#SS(0) if the memory operand is
4238                  * outside the segment limit.  All CPUs that support VMX ignore
4239                  * limit checks for flat segments, i.e. segments with base==0,
4240                  * limit==0xffffffff and of type expand-up data or code.
4241                  */
4242                 if (!(s.base == 0 && s.limit == 0xffffffff &&
4243                      ((s.type & 8) || !(s.type & 4))))
4244                         exn = exn || ((u64)off + len - 1 > s.limit);
4245         }
4246         if (exn) {
4247                 kvm_queue_exception_e(vcpu,
4248                                       seg_reg == VCPU_SREG_SS ?
4249                                                 SS_VECTOR : GP_VECTOR,
4250                                       0);
4251                 return 1;
4252         }
4253
4254         return 0;
4255 }
4256
4257 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
4258 {
4259         gva_t gva;
4260         struct x86_exception e;
4261
4262         if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
4263                                 vmcs_read32(VMX_INSTRUCTION_INFO), false,
4264                                 sizeof(*vmpointer), &gva))
4265                 return 1;
4266
4267         if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
4268                 kvm_inject_page_fault(vcpu, &e);
4269                 return 1;
4270         }
4271
4272         return 0;
4273 }
4274
4275 /*
4276  * Allocate a shadow VMCS and associate it with the currently loaded
4277  * VMCS, unless such a shadow VMCS already exists. The newly allocated
4278  * VMCS is also VMCLEARed, so that it is ready for use.
4279  */
4280 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
4281 {
4282         struct vcpu_vmx *vmx = to_vmx(vcpu);
4283         struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
4284
4285         /*
4286          * We should allocate a shadow vmcs for vmcs01 only when L1
4287          * executes VMXON and free it when L1 executes VMXOFF.
4288          * As it is invalid to execute VMXON twice, we shouldn't reach
4289          * here when vmcs01 already have an allocated shadow vmcs.
4290          */
4291         WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
4292
4293         if (!loaded_vmcs->shadow_vmcs) {
4294                 loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
4295                 if (loaded_vmcs->shadow_vmcs)
4296                         vmcs_clear(loaded_vmcs->shadow_vmcs);
4297         }
4298         return loaded_vmcs->shadow_vmcs;
4299 }
4300
4301 static int enter_vmx_operation(struct kvm_vcpu *vcpu)
4302 {
4303         struct vcpu_vmx *vmx = to_vmx(vcpu);
4304         int r;
4305
4306         r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
4307         if (r < 0)
4308                 goto out_vmcs02;
4309
4310         vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4311         if (!vmx->nested.cached_vmcs12)
4312                 goto out_cached_vmcs12;
4313
4314         vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4315         if (!vmx->nested.cached_shadow_vmcs12)
4316                 goto out_cached_shadow_vmcs12;
4317
4318         if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
4319                 goto out_shadow_vmcs;
4320
4321         hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
4322                      HRTIMER_MODE_REL_PINNED);
4323         vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
4324
4325         vmx->nested.vpid02 = allocate_vpid();
4326
4327         vmx->nested.vmcs02_initialized = false;
4328         vmx->nested.vmxon = true;
4329
4330         if (pt_mode == PT_MODE_HOST_GUEST) {
4331                 vmx->pt_desc.guest.ctl = 0;
4332                 pt_update_intercept_for_msr(vmx);
4333         }
4334
4335         return 0;
4336
4337 out_shadow_vmcs:
4338         kfree(vmx->nested.cached_shadow_vmcs12);
4339
4340 out_cached_shadow_vmcs12:
4341         kfree(vmx->nested.cached_vmcs12);
4342
4343 out_cached_vmcs12:
4344         free_loaded_vmcs(&vmx->nested.vmcs02);
4345
4346 out_vmcs02:
4347         return -ENOMEM;
4348 }
4349
4350 /*
4351  * Emulate the VMXON instruction.
4352  * Currently, we just remember that VMX is active, and do not save or even
4353  * inspect the argument to VMXON (the so-called "VMXON pointer") because we
4354  * do not currently need to store anything in that guest-allocated memory
4355  * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their
4356  * argument is different from the VMXON pointer (which the spec says they do).
4357  */
4358 static int handle_vmon(struct kvm_vcpu *vcpu)
4359 {
4360         int ret;
4361         gpa_t vmptr;
4362         uint32_t revision;
4363         struct vcpu_vmx *vmx = to_vmx(vcpu);
4364         const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
4365                 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
4366
4367         /*
4368          * The Intel VMX Instruction Reference lists a bunch of bits that are
4369          * prerequisite to running VMXON, most notably cr4.VMXE must be set to
4370          * 1 (see vmx_set_cr4() for when we allow the guest to set this).
4371          * Otherwise, we should fail with #UD.  But most faulting conditions
4372          * have already been checked by hardware, prior to the VM-exit for
4373          * VMXON.  We do test guest cr4.VMXE because processor CR4 always has
4374          * that bit set to 1 in non-root mode.
4375          */
4376         if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
4377                 kvm_queue_exception(vcpu, UD_VECTOR);
4378                 return 1;
4379         }
4380
4381         /* CPL=0 must be checked manually. */
4382         if (vmx_get_cpl(vcpu)) {
4383                 kvm_inject_gp(vcpu, 0);
4384                 return 1;
4385         }
4386
4387         if (vmx->nested.vmxon)
4388                 return nested_vmx_failValid(vcpu,
4389                         VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
4390
4391         if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
4392                         != VMXON_NEEDED_FEATURES) {
4393                 kvm_inject_gp(vcpu, 0);
4394                 return 1;
4395         }
4396
4397         if (nested_vmx_get_vmptr(vcpu, &vmptr))
4398                 return 1;
4399
4400         /*
4401          * SDM 3: 24.11.5
4402          * The first 4 bytes of the VMXON region contain the supported
4403          * VMCS revision identifier.
4404          *
4405          * Note: IA32_VMX_BASIC[48], which would limit the physical address
4406          * width to 32 bits, will never be 1 for the nested case.
4407          */
4408         if (!page_address_valid(vcpu, vmptr))
4409                 return nested_vmx_failInvalid(vcpu);
4410
4411         if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
4412             revision != VMCS12_REVISION)
4413                 return nested_vmx_failInvalid(vcpu);
4414
4415         vmx->nested.vmxon_ptr = vmptr;
4416         ret = enter_vmx_operation(vcpu);
4417         if (ret)
4418                 return ret;
4419
4420         return nested_vmx_succeed(vcpu);
4421 }
4422
4423 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
4424 {
4425         struct vcpu_vmx *vmx = to_vmx(vcpu);
4426
4427         if (vmx->nested.current_vmptr == -1ull)
4428                 return;
4429
4430         copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
4431
4432         if (enable_shadow_vmcs) {
4433                 /* copy to memory all shadowed fields in case
4434                    they were modified */
4435                 copy_shadow_to_vmcs12(vmx);
4436                 vmx->nested.need_vmcs12_to_shadow_sync = false;
4437                 vmx_disable_shadow_vmcs(vmx);
4438         }
4439         vmx->nested.posted_intr_nv = -1;
4440
4441         /* Flush VMCS12 to guest memory */
4442         kvm_vcpu_write_guest_page(vcpu,
4443                                   vmx->nested.current_vmptr >> PAGE_SHIFT,
4444                                   vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
4445
4446         kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
4447
4448         vmx->nested.current_vmptr = -1ull;
4449 }
4450
4451 /* Emulate the VMXOFF instruction */
4452 static int handle_vmoff(struct kvm_vcpu *vcpu)
4453 {
4454         if (!nested_vmx_check_permission(vcpu))
4455                 return 1;
4456         free_nested(vcpu);
4457         return nested_vmx_succeed(vcpu);
4458 }
4459
4460 /* Emulate the VMCLEAR instruction */
4461 static int handle_vmclear(struct kvm_vcpu *vcpu)
4462 {
4463         struct vcpu_vmx *vmx = to_vmx(vcpu);
4464         u32 zero = 0;
4465         gpa_t vmptr;
4466         u64 evmcs_gpa;
4467
4468         if (!nested_vmx_check_permission(vcpu))
4469                 return 1;
4470
4471         if (nested_vmx_get_vmptr(vcpu, &vmptr))
4472                 return 1;
4473
4474         if (!page_address_valid(vcpu, vmptr))
4475                 return nested_vmx_failValid(vcpu,
4476                         VMXERR_VMCLEAR_INVALID_ADDRESS);
4477
4478         if (vmptr == vmx->nested.vmxon_ptr)
4479                 return nested_vmx_failValid(vcpu,
4480                         VMXERR_VMCLEAR_VMXON_POINTER);
4481
4482         /*
4483          * When Enlightened VMEntry is enabled on the calling CPU we treat
4484          * memory area pointed to by vmptr as Enlightened VMCS (as there's no good
4485          * way to distinguish it from VMCS12) and we must not corrupt it by
4486          * writing to the non-existent 'launch_state' field. The area doesn't
4487          * have to be the currently active EVMCS on the calling CPU and there's
4488          * nothing KVM has to do to transition it from 'active' to 'non-active'
4489          * state. It is possible that the area will stay mapped as
4490          * vmx->nested.hv_evmcs but this shouldn't be a problem.
4491          */
4492         if (likely(!vmx->nested.enlightened_vmcs_enabled ||
4493                    !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
4494                 if (vmptr == vmx->nested.current_vmptr)
4495                         nested_release_vmcs12(vcpu);
4496
4497                 kvm_vcpu_write_guest(vcpu,
4498                                      vmptr + offsetof(struct vmcs12,
4499                                                       launch_state),
4500                                      &zero, sizeof(zero));
4501         }
4502
4503         return nested_vmx_succeed(vcpu);
4504 }
4505
4506 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
4507
4508 /* Emulate the VMLAUNCH instruction */
4509 static int handle_vmlaunch(struct kvm_vcpu *vcpu)
4510 {
4511         return nested_vmx_run(vcpu, true);
4512 }
4513
4514 /* Emulate the VMRESUME instruction */
4515 static int handle_vmresume(struct kvm_vcpu *vcpu)
4516 {
4518         return nested_vmx_run(vcpu, false);
4519 }
4520
4521 static int handle_vmread(struct kvm_vcpu *vcpu)
4522 {
4523         unsigned long field;
4524         u64 field_value;
4525         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4526         u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4527         int len;
4528         gva_t gva = 0;
4529         struct vmcs12 *vmcs12;
4530         short offset;
4531
4532         if (!nested_vmx_check_permission(vcpu))
4533                 return 1;
4534
4535         if (to_vmx(vcpu)->nested.current_vmptr == -1ull)
4536                 return nested_vmx_failInvalid(vcpu);
4537
4538         if (!is_guest_mode(vcpu))
4539                 vmcs12 = get_vmcs12(vcpu);
4540         else {
4541                 /*
4542                  * When vmcs12->vmcs_link_pointer is -1ull, any VMREAD
4543                  * of a shadowed field sets the ALU flags for VMfailInvalid.
4544                  */
4545                 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
4546                         return nested_vmx_failInvalid(vcpu);
4547                 vmcs12 = get_shadow_vmcs12(vcpu);
4548         }
4549
4550         /* Decode instruction info and find the field to read */
4551         field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
4552
4553         offset = vmcs_field_to_offset(field);
4554         if (offset < 0)
4555                 return nested_vmx_failValid(vcpu,
4556                         VMXERR_UNSUPPORTED_VMCS_COMPONENT);
4557
4558         if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
4559                 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4560
4561         /* Read the field, zero-extended to a u64 field_value */
4562         field_value = vmcs12_read_any(vmcs12, field, offset);
4563
4564         /*
4565          * Now copy part of this value to register or memory, as requested.
4566          * Note that the number of bits actually copied is 32 or 64 depending
4567          * on the guest's mode (32 or 64 bit), not on the given field's length.
4568          */
4569         if (vmx_instruction_info & (1u << 10)) {
4570                 kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
4571                         field_value);
4572         } else {
4573                 len = is_64_bit_mode(vcpu) ? 8 : 4;
4574                 if (get_vmx_mem_address(vcpu, exit_qualification,
4575                                 vmx_instruction_info, true, len, &gva))
4576                         return 1;
4577                 /* _system ok, nested_vmx_check_permission has verified cpl=0 */
4578                 kvm_write_guest_virt_system(vcpu, gva, &field_value, len, NULL);
4579         }
4580
4581         return nested_vmx_succeed(vcpu);
4582 }
4583
4584 static bool is_shadow_field_rw(unsigned long field)
4585 {
4586         switch (field) {
4587 #define SHADOW_FIELD_RW(x, y) case x:
4588 #include "vmcs_shadow_fields.h"
4589                 return true;
4590         default:
4591                 break;
4592         }
4593         return false;
4594 }
4595
4596 static bool is_shadow_field_ro(unsigned long field)
4597 {
4598         switch (field) {
4599 #define SHADOW_FIELD_RO(x, y) case x:
4600 #include "vmcs_shadow_fields.h"
4601                 return true;
4602         default:
4603                 break;
4604         }
4605         return false;
4606 }
4607
4608 static int handle_vmwrite(struct kvm_vcpu *vcpu)
4609 {
4610         unsigned long field;
4611         int len;
4612         gva_t gva;
4613         struct vcpu_vmx *vmx = to_vmx(vcpu);
4614         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4615         u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4616
4617         /* The value to write might be 32 or 64 bits, depending on L1's long
4618          * mode, and eventually we need to write that into a field of several
4619          * possible lengths. The code below first zero-extends the value to 64
4620          * bit (field_value), and then copies only the appropriate number of
4621          * bits into the vmcs12 field.
4622          */
4623         u64 field_value = 0;
4624         struct x86_exception e;
4625         struct vmcs12 *vmcs12;
4626         short offset;
4627
4628         if (!nested_vmx_check_permission(vcpu))
4629                 return 1;
4630
4631         if (vmx->nested.current_vmptr == -1ull)
4632                 return nested_vmx_failInvalid(vcpu);
4633
4634         if (vmx_instruction_info & (1u << 10))
4635                 field_value = kvm_register_readl(vcpu,
4636                         (((vmx_instruction_info) >> 3) & 0xf));
4637         else {
4638                 len = is_64_bit_mode(vcpu) ? 8 : 4;
4639                 if (get_vmx_mem_address(vcpu, exit_qualification,
4640                                 vmx_instruction_info, false, len, &gva))
4641                         return 1;
4642                 if (kvm_read_guest_virt(vcpu, gva, &field_value, len, &e)) {
4643                         kvm_inject_page_fault(vcpu, &e);
4644                         return 1;
4645                 }
4646         }
4647
4649         field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
4650         /*
4651          * If the vCPU supports "VMWRITE to any supported field in the
4652          * VMCS," then the "read-only" fields are actually read/write.
4653          */
4654         if (vmcs_field_readonly(field) &&
4655             !nested_cpu_has_vmwrite_any_field(vcpu))
4656                 return nested_vmx_failValid(vcpu,
4657                         VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
4658
4659         if (!is_guest_mode(vcpu)) {
4660                 vmcs12 = get_vmcs12(vcpu);
4661
4662                 /*
4663                  * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
4664                  * vmcs12, else we may clobber a field or consume a stale value.
4665                  */
4666                 if (!is_shadow_field_rw(field))
4667                         copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4668         } else {
4669                 /*
4670                  * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
4671                  * to shadowed-field sets the ALU flags for VMfailInvalid.
4672                  */
4673                 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
4674                         return nested_vmx_failInvalid(vcpu);
4675                 vmcs12 = get_shadow_vmcs12(vcpu);
4676         }
4677
4678         offset = vmcs_field_to_offset(field);
4679         if (offset < 0)
4680                 return nested_vmx_failValid(vcpu,
4681                         VMXERR_UNSUPPORTED_VMCS_COMPONENT);
4682
4683         /*
4684          * Some Intel CPUs intentionally drop the reserved bits of the AR byte
4685          * fields on VMWRITE.  Emulate this behavior to ensure consistent KVM
4686          * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
4687          * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
4688          * from L1 will return a different value than VMREAD from L2 (L1 sees
4689          * the stripped down value, L2 sees the full value as stored by KVM).
4690          */
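             /*
              * The 0x1f0ff mask keeps type/S/DPL/P (bits 7:0) and
              * AVL/L/D.B/G/unusable (bits 16:12), dropping the reserved
              * bits 11:8.
              */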
4691         if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
4692                 field_value &= 0x1f0ff;
4693
4694         vmcs12_write_any(vmcs12, field, offset, field_value);
4695
4696         /*
4697          * Do not track vmcs12 dirty-state if in guest-mode as we actually
4698          * dirty shadow vmcs12 instead of vmcs12.  Fields that can be updated
4699          * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
4700          * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
4701          */
4702         if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
4703                 /*
4704                  * L1 can read these fields without exiting, ensure the
4705                  * shadow VMCS is up-to-date.
4706                  */
4707                 if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
4708                         preempt_disable();
4709                         vmcs_load(vmx->vmcs01.shadow_vmcs);
4710
4711                         __vmcs_writel(field, field_value);
4712
4713                         vmcs_clear(vmx->vmcs01.shadow_vmcs);
4714                         vmcs_load(vmx->loaded_vmcs->vmcs);
4715                         preempt_enable();
4716                 }
4717                 vmx->nested.dirty_vmcs12 = true;
4718         }
4719
4720         return nested_vmx_succeed(vcpu);
4721 }
4722
4723 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
4724 {
4725         vmx->nested.current_vmptr = vmptr;
4726         if (enable_shadow_vmcs) {
4727                 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
4728                 vmcs_write64(VMCS_LINK_POINTER,
4729                              __pa(vmx->vmcs01.shadow_vmcs));
4730                 vmx->nested.need_vmcs12_to_shadow_sync = true;
4731         }
4732         vmx->nested.dirty_vmcs12 = true;
4733 }
4734
4735 /* Emulate the VMPTRLD instruction */
4736 static int handle_vmptrld(struct kvm_vcpu *vcpu)
4737 {
4738         struct vcpu_vmx *vmx = to_vmx(vcpu);
4739         gpa_t vmptr;
4740
4741         if (!nested_vmx_check_permission(vcpu))
4742                 return 1;
4743
4744         if (nested_vmx_get_vmptr(vcpu, &vmptr))
4745                 return 1;
4746
4747         if (!page_address_valid(vcpu, vmptr))
4748                 return nested_vmx_failValid(vcpu,
4749                         VMXERR_VMPTRLD_INVALID_ADDRESS);
4750
4751         if (vmptr == vmx->nested.vmxon_ptr)
4752                 return nested_vmx_failValid(vcpu,
4753                         VMXERR_VMPTRLD_VMXON_POINTER);
4754
4755         /* Forbid normal VMPTRLD if Enlightened version was used */
4756         if (vmx->nested.hv_evmcs)
4757                 return 1;
4758
4759         if (vmx->nested.current_vmptr != vmptr) {
4760                 struct kvm_host_map map;
4761                 struct vmcs12 *new_vmcs12;
4762
4763                 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) {
4764                         /*
4765                          * Reads from an unbacked page return all 1s,
4766                          * which means that the 32 bits located at the
4767                          * given physical address won't match the required
4768                          * VMCS12_REVISION identifier.
4769                          */
4770                         return nested_vmx_failValid(vcpu,
4771                                 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
4772                 }
4773
4774                 new_vmcs12 = map.hva;
4775
4776                 if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
4777                     (new_vmcs12->hdr.shadow_vmcs &&
4778                      !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
4779                         kvm_vcpu_unmap(vcpu, &map, false);
4780                         return nested_vmx_failValid(vcpu,
4781                                 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
4782                 }
4783
4784                 nested_release_vmcs12(vcpu);
4785
4786                 /*
4787                  * Load VMCS12 from guest memory since it is not already
4788                  * cached.
4789                  */
4790                 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
4791                 kvm_vcpu_unmap(vcpu, &map, false);
4792
4793                 set_current_vmptr(vmx, vmptr);
4794         }
4795
4796         return nested_vmx_succeed(vcpu);
4797 }
4798
4799 /* Emulate the VMPTRST instruction */
4800 static int handle_vmptrst(struct kvm_vcpu *vcpu)
4801 {
4802         unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION);
4803         u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4804         gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
4805         struct x86_exception e;
4806         gva_t gva;
4807
4808         if (!nested_vmx_check_permission(vcpu))
4809                 return 1;
4810
4811         if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
4812                 return 1;
4813
4814         if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
4815                                 true, sizeof(gpa_t), &gva))
4816                 return 1;
4817         /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
4818         if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
4819                                         sizeof(gpa_t), &e)) {
4820                 kvm_inject_page_fault(vcpu, &e);
4821                 return 1;
4822         }
4823         return nested_vmx_succeed(vcpu);
4824 }
4825
4826 /* Emulate the INVEPT instruction */
4827 static int handle_invept(struct kvm_vcpu *vcpu)
4828 {
4829         struct vcpu_vmx *vmx = to_vmx(vcpu);
4830         u32 vmx_instruction_info, types;
4831         unsigned long type;
4832         gva_t gva;
4833         struct x86_exception e;
4834         struct {
4835                 u64 eptp, gpa;
4836         } operand;
4837
4838         if (!(vmx->nested.msrs.secondary_ctls_high &
4839               SECONDARY_EXEC_ENABLE_EPT) ||
4840             !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
4841                 kvm_queue_exception(vcpu, UD_VECTOR);
4842                 return 1;
4843         }
4844
4845         if (!nested_vmx_check_permission(vcpu))
4846                 return 1;
4847
4848         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4849         type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
4850
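             /*
              * Bits 25 and 26 of IA32_VMX_EPT_VPID_CAP advertise single-context
              * and all-context INVEPT; after the shift they line up with INVEPT
              * types 1 and 2, which is what the "& 6" keeps.
              */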
4851         types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
4852
4853         if (type >= 32 || !(types & (1 << type)))
4854                 return nested_vmx_failValid(vcpu,
4855                                 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
4856
4857         /* According to the Intel VMX instruction reference, the memory
4858          * operand is read even if it isn't needed (e.g., for type==global)
4859          */
4860         if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
4861                         vmx_instruction_info, false, sizeof(operand), &gva))
4862                 return 1;
4863         if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
4864                 kvm_inject_page_fault(vcpu, &e);
4865                 return 1;
4866         }
4867
4868         switch (type) {
4869         case VMX_EPT_EXTENT_GLOBAL:
4870         case VMX_EPT_EXTENT_CONTEXT:
4871                 /*
4872                  * TODO: Sync the necessary shadow EPT roots here, rather than
4873                  * at the next emulated VM-entry.
4874                  */
4875                 break;
4876         default:
4877                 BUG_ON(1);
4878                 break;
4879         }
4880
4881         return nested_vmx_succeed(vcpu);
4882 }
4883
4884 static int handle_invvpid(struct kvm_vcpu *vcpu)
4885 {
4886         struct vcpu_vmx *vmx = to_vmx(vcpu);
4887         u32 vmx_instruction_info;
4888         unsigned long type, types;
4889         gva_t gva;
4890         struct x86_exception e;
4891         struct {
4892                 u64 vpid;
4893                 u64 gla;
4894         } operand;
4895         u16 vpid02;
4896
4897         if (!(vmx->nested.msrs.secondary_ctls_high &
4898               SECONDARY_EXEC_ENABLE_VPID) ||
4899             !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
4900                 kvm_queue_exception(vcpu, UD_VECTOR);
4901                 return 1;
4902         }
4903
4904         if (!nested_vmx_check_permission(vcpu))
4905                 return 1;
4906
4907         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4908         type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
4909
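             /*
              * The INVVPID type-support bits live in bits 40-43 of
              * IA32_VMX_EPT_VPID_CAP (bits 8-11 of vpid_caps); the shift by 8
              * lines them up with INVVPID types 0-3.
              */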
4910         types = (vmx->nested.msrs.vpid_caps &
4911                         VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
4912
4913         if (type >= 32 || !(types & (1 << type)))
4914                 return nested_vmx_failValid(vcpu,
4915                         VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
4916
4917         /* According to the Intel VMX instruction reference, the memory
4918          * operand is read even if it isn't needed (e.g., for type==global)
4919          */
4920         if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
4921                         vmx_instruction_info, false, sizeof(operand), &gva))
4922                 return 1;
4923         if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
4924                 kvm_inject_page_fault(vcpu, &e);
4925                 return 1;
4926         }
4927         if (operand.vpid >> 16)
4928                 return nested_vmx_failValid(vcpu,
4929                         VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
4930
4931         vpid02 = nested_get_vpid02(vcpu);
4932         switch (type) {
4933         case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
4934                 if (!operand.vpid ||
4935                     is_noncanonical_address(operand.gla, vcpu))
4936                         return nested_vmx_failValid(vcpu,
4937                                 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
4938                 if (cpu_has_vmx_invvpid_individual_addr()) {
4939                         __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
4940                                 vpid02, operand.gla);
4941                 } else
4942                         __vmx_flush_tlb(vcpu, vpid02, false);
4943                 break;
4944         case VMX_VPID_EXTENT_SINGLE_CONTEXT:
4945         case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
4946                 if (!operand.vpid)
4947                         return nested_vmx_failValid(vcpu,
4948                                 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
4949                 __vmx_flush_tlb(vcpu, vpid02, false);
4950                 break;
4951         case VMX_VPID_EXTENT_ALL_CONTEXT:
4952                 __vmx_flush_tlb(vcpu, vpid02, false);
4953                 break;
4954         default:
4955                 WARN_ON_ONCE(1);
4956                 return kvm_skip_emulated_instruction(vcpu);
4957         }
4958
4959         return nested_vmx_succeed(vcpu);
4960 }
4961
4962 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
4963                                      struct vmcs12 *vmcs12)
4964 {
4965         u32 index = kvm_rcx_read(vcpu);
4966         u64 address;
4967         bool accessed_dirty;
4968         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
4969
4970         if (!nested_cpu_has_eptp_switching(vmcs12) ||
4971             !nested_cpu_has_ept(vmcs12))
4972                 return 1;
4973
4974         if (index >= VMFUNC_EPTP_ENTRIES)
4975                 return 1;
4976
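             /* The EPTP list is a page of 512 64-bit EPTP entries; read entry 'index'. */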
4978         if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
4979                                      &address, index * 8, 8))
4980                 return 1;
4981
4982         accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);
4983
4984         /*
4985          * If the (L2) guest does a vmfunc to the currently
4986          * active ept pointer, we don't have to do anything else
4987          */
4988         if (vmcs12->ept_pointer != address) {
4989                 if (!valid_ept_address(vcpu, address))
4990                         return 1;
4991
4992                 kvm_mmu_unload(vcpu);
4993                 mmu->ept_ad = accessed_dirty;
4994                 mmu->mmu_role.base.ad_disabled = !accessed_dirty;
4995                 vmcs12->ept_pointer = address;
4996                 /*
4997                  * TODO: Check what's the correct approach in case
4998                  * mmu reload fails. Currently, we just let the next
4999                  * reload potentially fail
5000                  */
5001                 kvm_mmu_reload(vcpu);
5002         }
5003
5004         return 0;
5005 }
5006
5007 static int handle_vmfunc(struct kvm_vcpu *vcpu)
5008 {
5009         struct vcpu_vmx *vmx = to_vmx(vcpu);
5010         struct vmcs12 *vmcs12;
5011         u32 function = kvm_rax_read(vcpu);
5012
5013         /*
5014          * VMFUNC is only supported for nested guests, but we always enable the
5015          * secondary control for simplicity; for non-nested mode, fake that we
5016          * didn't enable it by injecting a #UD.
5017          */
5018         if (!is_guest_mode(vcpu)) {
5019                 kvm_queue_exception(vcpu, UD_VECTOR);
5020                 return 1;
5021         }
5022
5023         vmcs12 = get_vmcs12(vcpu);
5024         if ((vmcs12->vm_function_control & (1 << function)) == 0)
5025                 goto fail;
5026
5027         switch (function) {
5028         case 0:
5029                 if (nested_vmx_eptp_switching(vcpu, vmcs12))
5030                         goto fail;
5031                 break;
5032         default:
5033                 goto fail;
5034         }
5035         return kvm_skip_emulated_instruction(vcpu);
5036
5037 fail:
5038         nested_vmx_vmexit(vcpu, vmx->exit_reason,
5039                           vmcs_read32(VM_EXIT_INTR_INFO),
5040                           vmcs_readl(EXIT_QUALIFICATION));
5041         return 1;
5042 }
5043
5044
5045 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
5046                                        struct vmcs12 *vmcs12)
5047 {
5048         unsigned long exit_qualification;
5049         gpa_t bitmap, last_bitmap;
5050         unsigned int port;
5051         int size;
5052         u8 b;
5053
5054         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
5055                 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
5056
5057         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5058
5059         port = exit_qualification >> 16;
5060         size = (exit_qualification & 7) + 1;
5061
5062         last_bitmap = (gpa_t)-1;
5063         b = -1;
5064
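             /*
              * Walk the access one port at a time: io_bitmap_a covers ports
              * 0x0000-0x7fff and io_bitmap_b covers 0x8000-0xffff, so e.g. a
              * two-byte access at port 0x7fff consults both bitmaps.
              */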
5065         while (size > 0) {
5066                 if (port < 0x8000)
5067                         bitmap = vmcs12->io_bitmap_a;
5068                 else if (port < 0x10000)
5069                         bitmap = vmcs12->io_bitmap_b;
5070                 else
5071                         return true;
5072                 bitmap += (port & 0x7fff) / 8;
5073
5074                 if (last_bitmap != bitmap)
5075                         if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
5076                                 return true;
5077                 if (b & (1 << (port & 7)))
5078                         return true;
5079
5080                 port++;
5081                 size--;
5082                 last_bitmap = bitmap;
5083         }
5084
5085         return false;
5086 }
5087
5088 /*
5089  * Return true if we should exit from L2 to L1 to handle an MSR access,
5090  * rather than handle it ourselves in L0. I.e., check whether L1 expressed
5091  * disinterest in the current event (read or write a specific MSR) by using an
5092  * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
5093  */
5094 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
5095         struct vmcs12 *vmcs12, u32 exit_reason)
5096 {
5097         u32 msr_index = kvm_rcx_read(vcpu);
5098         gpa_t bitmap;
5099
5100         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
5101                 return true;
5102
5103         /*
5104          * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
5105          * for the four combinations of read/write and low/high MSR numbers.
5106          * First we need to figure out which of the four to use:
5107          */
5108         bitmap = vmcs12->msr_bitmap;
5109         if (exit_reason == EXIT_REASON_MSR_WRITE)
5110                 bitmap += 2048;
5111         if (msr_index >= 0xc0000000) {
5112                 msr_index -= 0xc0000000;
5113                 bitmap += 1024;
5114         }
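             /*
              * e.g. a write to MSR_LSTAR (0xc0000082) is tracked in the
              * write-high bitmap at offset 2048 + 1024, at bit (0x82 & 7) = 2
              * of byte 0x82 / 8 = 16 within that bitmap.
              */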
5115
5116         /* Then read the msr_index'th bit from this bitmap: */
5117         if (msr_index < 1024*8) {
5118                 unsigned char b;
5119                 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
5120                         return true;
5121                 return 1 & (b >> (msr_index & 7));
5122         } else
5123                 return true; /* let L1 handle the wrong parameter */
5124 }
5125
5126 /*
5127  * Return true if we should exit from L2 to L1 to handle a CR access exit,
5128  * rather than handle it ourselves in L0. I.e., check if L1 wanted to
5129  * intercept (via guest_host_mask etc.) the current event.
5130  */
5131 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
5132         struct vmcs12 *vmcs12)
5133 {
5134         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5135         int cr = exit_qualification & 15;
5136         int reg;
5137         unsigned long val;
5138
5139         switch ((exit_qualification >> 4) & 3) {
5140         case 0: /* mov to cr */
5141                 reg = (exit_qualification >> 8) & 15;
5142                 val = kvm_register_readl(vcpu, reg);
5143                 switch (cr) {
5144                 case 0:
5145                         if (vmcs12->cr0_guest_host_mask &
5146                             (val ^ vmcs12->cr0_read_shadow))
5147                                 return true;
5148                         break;
5149                 case 3:
5150                         if ((vmcs12->cr3_target_count >= 1 &&
5151                                         vmcs12->cr3_target_value0 == val) ||
5152                                 (vmcs12->cr3_target_count >= 2 &&
5153                                         vmcs12->cr3_target_value1 == val) ||
5154                                 (vmcs12->cr3_target_count >= 3 &&
5155                                         vmcs12->cr3_target_value2 == val) ||
5156                                 (vmcs12->cr3_target_count >= 4 &&
5157                                         vmcs12->cr3_target_value3 == val))
5158                                 return false;
5159                         if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
5160                                 return true;
5161                         break;
5162                 case 4:
5163                         if (vmcs12->cr4_guest_host_mask &
5164                             (vmcs12->cr4_read_shadow ^ val))
5165                                 return true;
5166                         break;
5167                 case 8:
5168                         if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
5169                                 return true;
5170                         break;
5171                 }
5172                 break;
5173         case 2: /* clts */
5174                 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
5175                     (vmcs12->cr0_read_shadow & X86_CR0_TS))
5176                         return true;
5177                 break;
5178         case 1: /* mov from cr */
5179                 switch (cr) {
5180                 case 3:
5181                         if (vmcs12->cpu_based_vm_exec_control &
5182                             CPU_BASED_CR3_STORE_EXITING)
5183                                 return true;
5184                         break;
5185                 case 8:
5186                         if (vmcs12->cpu_based_vm_exec_control &
5187                             CPU_BASED_CR8_STORE_EXITING)
5188                                 return true;
5189                         break;
5190                 }
5191                 break;
5192         case 3: /* lmsw */
5193                 /*
5194                  * lmsw can change bits 1..3 of cr0, and only set bit 0 of
5195                  * cr0. Other attempted changes are ignored, with no exit.
5196                  */
5197                 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5198                 if (vmcs12->cr0_guest_host_mask & 0xe &
5199                     (val ^ vmcs12->cr0_read_shadow))
5200                         return true;
5201                 if ((vmcs12->cr0_guest_host_mask & 0x1) &&
5202                     !(vmcs12->cr0_read_shadow & 0x1) &&
5203                     (val & 0x1))
5204                         return true;
5205                 break;
5206         }
5207         return false;
5208 }
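
/*
 * Illustrative sketch (editor's addition, not part of the build): the cr0 and
 * cr4 cases of the "mov to cr" handling above reduce to one expression: L1
 * gets the exit only when the new value differs from L1's read shadow in a
 * bit position that L1 claimed via the guest/host mask.  Hypothetical helper,
 * shown for cr0.
 */
static inline bool nested_l1_owns_cr0_change(const struct vmcs12 *vmcs12,
                                             unsigned long new_val)
{
        return vmcs12->cr0_guest_host_mask &
               (new_val ^ vmcs12->cr0_read_shadow);
}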
5209
5210 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
5211         struct vmcs12 *vmcs12, gpa_t bitmap)
5212 {
5213         u32 vmx_instruction_info;
5214         unsigned long field;
5215         u8 b;
5216
5217         if (!nested_cpu_has_shadow_vmcs(vmcs12))
5218                 return true;
5219
5220         /* Decode instruction info and find the field to access */
5221         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5222         field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
5223
5224         /* Out-of-range fields always cause a VM exit from L2 to L1 */
5225         if (field >> 15)
5226                 return true;
5227
5228         if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
5229                 return true;
5230
5231         return 1 & (b >> (field & 7));
5232 }
5233
5234 /*
5235  * Return true if we should exit from L2 to L1 to handle an exit, or false if
5236  * we should handle it ourselves in L0 (and then continue L2). Only call this
5237  * when in is_guest_mode (L2).
5238  */
5239 bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
5240 {
5241         u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
5242         struct vcpu_vmx *vmx = to_vmx(vcpu);
5243         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5244
5245         if (vmx->nested.nested_run_pending)
5246                 return false;
5247
5248         if (unlikely(vmx->fail)) {
5249                 pr_info_ratelimited("%s failed vm entry %x\n", __func__,
5250                                     vmcs_read32(VM_INSTRUCTION_ERROR));
5251                 return true;
5252         }
5253
5254         /*
5255          * The host physical addresses of some pages of guest memory
5256          * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
5257          * Page). The CPU may write to these pages via their host
5258          * physical address while L2 is running, bypassing any
5259          * address-translation-based dirty tracking (e.g. EPT write
5260          * protection).
5261          *
5262          * Mark them dirty on every exit from L2 to prevent them from
5263          * getting out of sync with dirty tracking.
5264          */
5265         nested_mark_vmcs12_pages_dirty(vcpu);
5266
5267         trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
5268                                 vmcs_readl(EXIT_QUALIFICATION),
5269                                 vmx->idt_vectoring_info,
5270                                 intr_info,
5271                                 vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
5272                                 KVM_ISA_VMX);
5273
5274         switch (exit_reason) {
5275         case EXIT_REASON_EXCEPTION_NMI:
5276                 if (is_nmi(intr_info))
5277                         return false;
5278                 else if (is_page_fault(intr_info))
5279                         return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
5280                 else if (is_debug(intr_info) &&
5281                          vcpu->guest_debug &
5282                          (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
5283                         return false;
5284                 else if (is_breakpoint(intr_info) &&
5285                          vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5286                         return false;
5287                 return vmcs12->exception_bitmap &
5288                                 (1u << (intr_info & INTR_INFO_VECTOR_MASK));
5289         case EXIT_REASON_EXTERNAL_INTERRUPT:
5290                 return false;
5291         case EXIT_REASON_TRIPLE_FAULT:
5292                 return true;
5293         case EXIT_REASON_PENDING_INTERRUPT:
5294                 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
5295         case EXIT_REASON_NMI_WINDOW:
5296                 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
5297         case EXIT_REASON_TASK_SWITCH:
5298                 return true;
5299         case EXIT_REASON_CPUID:
5300                 return true;
5301         case EXIT_REASON_HLT:
5302                 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
5303         case EXIT_REASON_INVD:
5304                 return true;
5305         case EXIT_REASON_INVLPG:
5306                 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5307         case EXIT_REASON_RDPMC:
5308                 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
5309         case EXIT_REASON_RDRAND:
5310                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
5311         case EXIT_REASON_RDSEED:
5312                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
5313         case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
5314                 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
5315         case EXIT_REASON_VMREAD:
5316                 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5317                         vmcs12->vmread_bitmap);
5318         case EXIT_REASON_VMWRITE:
5319                 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5320                         vmcs12->vmwrite_bitmap);
5321         case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
5322         case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
5323         case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
5324         case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
5325         case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
5326                 /*
5327                  * VMX instructions trap unconditionally. This allows L1 to
5328                  * emulate them for its L2 guest, i.e., allows 3-level nesting!
5329                  */
5330                 return true;
5331         case EXIT_REASON_CR_ACCESS:
5332                 return nested_vmx_exit_handled_cr(vcpu, vmcs12);
5333         case EXIT_REASON_DR_ACCESS:
5334                 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
5335         case EXIT_REASON_IO_INSTRUCTION:
5336                 return nested_vmx_exit_handled_io(vcpu, vmcs12);
5337         case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
5338                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
5339         case EXIT_REASON_MSR_READ:
5340         case EXIT_REASON_MSR_WRITE:
5341                 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
5342         case EXIT_REASON_INVALID_STATE:
5343                 return true;
5344         case EXIT_REASON_MWAIT_INSTRUCTION:
5345                 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
5346         case EXIT_REASON_MONITOR_TRAP_FLAG:
5347                 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
5348         case EXIT_REASON_MONITOR_INSTRUCTION:
5349                 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
5350         case EXIT_REASON_PAUSE_INSTRUCTION:
5351                 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
5352                         nested_cpu_has2(vmcs12,
5353                                 SECONDARY_EXEC_PAUSE_LOOP_EXITING);
5354         case EXIT_REASON_MCE_DURING_VMENTRY:
5355                 return false;
5356         case EXIT_REASON_TPR_BELOW_THRESHOLD:
5357                 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
5358         case EXIT_REASON_APIC_ACCESS:
5359         case EXIT_REASON_APIC_WRITE:
5360         case EXIT_REASON_EOI_INDUCED:
5361                 /*
5362                  * The controls for "virtualize APIC accesses," "APIC-
5363                  * register virtualization," and "virtual-interrupt
5364                  * delivery" only come from vmcs12.
5365                  */
5366                 return true;
5367         case EXIT_REASON_EPT_VIOLATION:
5368                 /*
5369                  * L0 always deals with the EPT violation. If nested EPT is
5370                  * used, and the nested mmu code discovers that the address is
5371                  * missing in the guest EPT table (EPT12), the EPT violation
5372                  * will be injected with nested_ept_inject_page_fault()
5373                  */
5374                 return false;
5375         case EXIT_REASON_EPT_MISCONFIG:
5376                 /*
5377                  * L2 never directly uses L1's EPT, but rather L0's own EPT
5378                  * table (shadow on EPT) or a merged EPT table that L0 built
5379                  * (EPT on EPT). So any problem with the structure of the
5380                  * table is L0's fault.
5381                  */
5382                 return false;
5383         case EXIT_REASON_INVPCID:
5384                 return
5385                         nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
5386                         nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5387         case EXIT_REASON_WBINVD:
5388                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
5389         case EXIT_REASON_XSETBV:
5390                 return true;
5391         case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
5392                 /*
5393                  * This should never happen, since it is not possible to
5394                  * set XSS to a non-zero value, neither in L1 nor in L2.
5395                  * If it were possible, XSS would have to be checked against
5396                  * the XSS exit bitmap in vmcs12.
5397                  */
5398                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
5399         case EXIT_REASON_PREEMPTION_TIMER:
5400                 return false;
5401         case EXIT_REASON_PML_FULL:
5402                 /* We emulate PML support to L1. */
5403                 return false;
5404         case EXIT_REASON_VMFUNC:
5405                 /* VM functions are emulated through L2->L0 vmexits. */
5406                 return false;
5407         case EXIT_REASON_ENCLS:
5408                 /* SGX is never exposed to L1 */
5409                 return false;
5410         default:
5411                 return true;
5412         }
5413 }
5414
5415
5416 static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
5417                                 struct kvm_nested_state __user *user_kvm_nested_state,
5418                                 u32 user_data_size)
5419 {
5420         struct vcpu_vmx *vmx;
5421         struct vmcs12 *vmcs12;
5422         struct kvm_nested_state kvm_state = {
5423                 .flags = 0,
5424                 .format = KVM_STATE_NESTED_FORMAT_VMX,
5425                 .size = sizeof(kvm_state),
5426                 .hdr.vmx.vmxon_pa = -1ull,
5427                 .hdr.vmx.vmcs12_pa = -1ull,
5428         };
5429         struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
5430                 &user_kvm_nested_state->data.vmx[0];
5431
5432         if (!vcpu)
5433                 return kvm_state.size + sizeof(*user_vmx_nested_state);
5434
5435         vmx = to_vmx(vcpu);
5436         vmcs12 = get_vmcs12(vcpu);
5437
5438         if (nested_vmx_allowed(vcpu) &&
5439             (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
5440                 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
5441                 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;
5442
5443                 if (vmx_has_valid_vmcs12(vcpu)) {
5444                         kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
5445
5446                         if (vmx->nested.hv_evmcs)
5447                                 kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
5448
5449                         if (is_guest_mode(vcpu) &&
5450                             nested_cpu_has_shadow_vmcs(vmcs12) &&
5451                             vmcs12->vmcs_link_pointer != -1ull)
5452                                 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
5453                 }
5454
5455                 if (vmx->nested.smm.vmxon)
5456                         kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
5457
5458                 if (vmx->nested.smm.guest_mode)
5459                         kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
5460
5461                 if (is_guest_mode(vcpu)) {
5462                         kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
5463
5464                         if (vmx->nested.nested_run_pending)
5465                                 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
5466                 }
5467         }
5468
5469         if (user_data_size < kvm_state.size)
5470                 goto out;
5471
5472         if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
5473                 return -EFAULT;
5474
5475         if (!vmx_has_valid_vmcs12(vcpu))
5476                 goto out;
5477
5478         /*
5479          * When running L2, the authoritative vmcs12 state is in the
5480          * vmcs02. When running L1, the authoritative vmcs12 state is
5481          * in the shadow or enlightened vmcs linked to vmcs01, unless
5482          * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
5483          * vmcs12 state is in the vmcs12 already.
5484          */
5485         if (is_guest_mode(vcpu)) {
5486                 sync_vmcs02_to_vmcs12(vcpu, vmcs12);
5487                 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
5488         } else if (!vmx->nested.need_vmcs12_to_shadow_sync) {
5489                 if (vmx->nested.hv_evmcs)
5490                         copy_enlightened_to_vmcs12(vmx);
5491                 else if (enable_shadow_vmcs)
5492                         copy_shadow_to_vmcs12(vmx);
5493         }
5494
5495         BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
5496         BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);
5497
5498         /*
5499          * Copy over the full allocated size of vmcs12 rather than just the size
5500          * of the struct.
5501          */
5502         if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
5503                 return -EFAULT;
5504
5505         if (nested_cpu_has_shadow_vmcs(vmcs12) &&
5506             vmcs12->vmcs_link_pointer != -1ull) {
5507                 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
5508                                  get_shadow_vmcs12(vcpu), VMCS12_SIZE))
5509                         return -EFAULT;
5510         }
5511
5512 out:
5513         return kvm_state.size;
5514 }
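
/*
 * Illustrative sketch (editor's addition, not part of the build): the size
 * reported by vmx_get_nested_state() is the kvm_nested_state header, plus one
 * fixed-size vmcs12 region when a valid vmcs12 exists, plus a second region
 * when L2 runs with an active shadow vmcs12 (the copies above use VMCS12_SIZE
 * for each region).  A caller whose buffer is smaller than this gets the
 * required size back with nothing copied.  Hypothetical helper restating that
 * arithmetic.
 */
static inline u32 nested_state_size_needed(bool has_vmcs12, bool has_shadow)
{
        u32 size = sizeof(struct kvm_nested_state);

        if (has_vmcs12)
                size += VMCS12_SIZE;
        if (has_shadow)
                size += VMCS12_SIZE;
        return size;
}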
5515
5516 /*
5517  * Forcibly leave nested mode in order to be able to reset the VCPU later on.
5518  */
5519 void vmx_leave_nested(struct kvm_vcpu *vcpu)
5520 {
5521         if (is_guest_mode(vcpu)) {
5522                 to_vmx(vcpu)->nested.nested_run_pending = 0;
5523                 nested_vmx_vmexit(vcpu, -1, 0, 0);
5524         }
5525         free_nested(vcpu);
5526 }
5527
5528 static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
5529                                 struct kvm_nested_state __user *user_kvm_nested_state,
5530                                 struct kvm_nested_state *kvm_state)
5531 {
5532         struct vcpu_vmx *vmx = to_vmx(vcpu);
5533         struct vmcs12 *vmcs12;
5534         u32 exit_qual;
5535         struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
5536                 &user_kvm_nested_state->data.vmx[0];
5537         int ret;
5538
5539         if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
5540                 return -EINVAL;
5541
5542         if (kvm_state->hdr.vmx.vmxon_pa == -1ull) {
5543                 if (kvm_state->hdr.vmx.smm.flags)
5544                         return -EINVAL;
5545
5546                 if (kvm_state->hdr.vmx.vmcs12_pa != -1ull)
5547                         return -EINVAL;
5548
5549                 /*
5550                  * KVM_STATE_NESTED_EVMCS used to signal that KVM should
5551                  * enable the eVMCS capability on the vCPU. However, the code
5552                  * has since been changed so that the flag signals that vmcs12
5553                  * should be copied into the eVMCS in guest memory.
5554                  *
5555                  * To preserve backwards compatibility, allow userspace
5556                  * to set this flag even when there is no VMXON region.
5557                  */
5558                 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
5559                         return -EINVAL;
5560         } else {
5561                 if (!nested_vmx_allowed(vcpu))
5562                         return -EINVAL;
5563
5564                 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
5565                         return -EINVAL;
5566         }
5567
5568         if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
5569             (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
5570                 return -EINVAL;
5571
5572         if (kvm_state->hdr.vmx.smm.flags &
5573             ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
5574                 return -EINVAL;
5575
5576         /*
5577          * SMM temporarily disables VMX, so we cannot be in guest mode,
5578          * nor can VMLAUNCH/VMRESUME be pending.  Outside SMM, SMM flags
5579          * must be zero.
5580          */
5581         if (is_smm(vcpu) ?
5582                 (kvm_state->flags &
5583                  (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
5584                 : kvm_state->hdr.vmx.smm.flags)
5585                 return -EINVAL;
5586
5587         if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
5588             !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
5589                 return -EINVAL;
5590
5591         if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
5592             (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled))
5593                 return -EINVAL;
5594
5595         vmx_leave_nested(vcpu);
5596
5597         if (kvm_state->hdr.vmx.vmxon_pa == -1ull)
5598                 return 0;
5599
5600         vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
5601         ret = enter_vmx_operation(vcpu);
5602         if (ret)
5603                 return ret;
5604
5605         /* Empty 'VMXON' state is permitted */
5606         if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12))
5607                 return 0;
5608
5609         if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) {
5610                 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
5611                     !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
5612                         return -EINVAL;
5613
5614                 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
5615         } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
5616                 /*
5617                  * Sync eVMCS upon entry as we may not have
5618                  * HV_X64_MSR_VP_ASSIST_PAGE set up yet.
5619                  */
5620                 vmx->nested.need_vmcs12_to_shadow_sync = true;
5621         } else {
5622                 return -EINVAL;
5623         }
5624
5625         if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
5626                 vmx->nested.smm.vmxon = true;
5627                 vmx->nested.vmxon = false;
5628
5629                 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
5630                         vmx->nested.smm.guest_mode = true;
5631         }
5632
5633         vmcs12 = get_vmcs12(vcpu);
5634         if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
5635                 return -EFAULT;
5636
5637         if (vmcs12->hdr.revision_id != VMCS12_REVISION)
5638                 return -EINVAL;
5639
5640         if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
5641                 return 0;
5642
5643         vmx->nested.nested_run_pending =
5644                 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
5645
5646         ret = -EINVAL;
5647         if (nested_cpu_has_shadow_vmcs(vmcs12) &&
5648             vmcs12->vmcs_link_pointer != -1ull) {
5649                 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
5650
5651                 if (kvm_state->size <
5652                     sizeof(*kvm_state) +
5653                     sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
5654                         goto error_guest_mode;
5655
5656                 if (copy_from_user(shadow_vmcs12,
5657                                    user_vmx_nested_state->shadow_vmcs12,
5658                                    sizeof(*shadow_vmcs12))) {
5659                         ret = -EFAULT;
5660                         goto error_guest_mode;
5661                 }
5662
5663                 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
5664                     !shadow_vmcs12->hdr.shadow_vmcs)
5665                         goto error_guest_mode;
5666         }
5667
5668         if (nested_vmx_check_controls(vcpu, vmcs12) ||
5669             nested_vmx_check_host_state(vcpu, vmcs12) ||
5670             nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
5671                 goto error_guest_mode;
5672
5673         vmx->nested.dirty_vmcs12 = true;
5674         ret = nested_vmx_enter_non_root_mode(vcpu, false);
5675         if (ret)
5676                 goto error_guest_mode;
5677
5678         return 0;
5679
5680 error_guest_mode:
5681         vmx->nested.nested_run_pending = 0;
5682         return ret;
5683 }
5684
5685 void nested_vmx_vcpu_setup(void)
5686 {
5687         if (enable_shadow_vmcs) {
5688                 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
5689                 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
5690         }
5691 }
5692
5693 /*
5694  * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
5695  * returned for the various VMX controls MSRs when nested VMX is enabled.
5696  * The same values should also be used to verify that vmcs12 control fields are
5697  * valid during nested entry from L1 to L2.
5698  * Each of these control MSRs has a low and a high 32-bit half: a low bit is on
5699  * if the corresponding bit in the (32-bit) control field *must* be on, and a
5700  * bit in the high half is on if the corresponding bit in the control field
5701  * may be on. See also vmx_control_verify().
5702  */
5703 void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
5704                                 bool apicv)
5705 {
5706         /*
5707          * Note that as a general rule, the high half of the MSRs (bits in
5708          * the control fields which may be 1) should be initialized by the
5709          * intersection of the underlying hardware's MSR (i.e., features which
5710          * can be supported) and the list of features we want to expose -
5711          * because they are known to be properly supported in our code.
5712          * Also, usually, the low half of the MSRs (bits which must be 1) can
5713          * be set to 0, meaning that L1 may turn off any of these bits. The
5714          * reason is that if one of these bits is necessary for L0, it is
5715          * already set in vmcs01, and prepare_vmcs02(), which bitwise-ORs the
5716          * control fields of vmcs01 and vmcs12, keeps it set in vmcs02 - and
5717          * nested_vmx_exit_reflected() will not pass related exits to L1.
5718          * These rules have exceptions below.
5719          */
5720
5721         /* pin-based controls */
5722         rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
5723                 msrs->pinbased_ctls_low,
5724                 msrs->pinbased_ctls_high);
5725         msrs->pinbased_ctls_low |=
5726                 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
5727         msrs->pinbased_ctls_high &=
5728                 PIN_BASED_EXT_INTR_MASK |
5729                 PIN_BASED_NMI_EXITING |
5730                 PIN_BASED_VIRTUAL_NMIS |
5731                 (apicv ? PIN_BASED_POSTED_INTR : 0);
5732         msrs->pinbased_ctls_high |=
5733                 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
5734                 PIN_BASED_VMX_PREEMPTION_TIMER;
5735
5736         /* exit controls */
5737         rdmsr(MSR_IA32_VMX_EXIT_CTLS,
5738                 msrs->exit_ctls_low,
5739                 msrs->exit_ctls_high);
5740         msrs->exit_ctls_low =
5741                 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
5742
5743         msrs->exit_ctls_high &=
5744 #ifdef CONFIG_X86_64
5745                 VM_EXIT_HOST_ADDR_SPACE_SIZE |
5746 #endif
5747                 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
5748         msrs->exit_ctls_high |=
5749                 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
5750                 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
5751                 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
5752
5753         /* We support free control of debug control saving. */
5754         msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
5755
5756         /* entry controls */
5757         rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
5758                 msrs->entry_ctls_low,
5759                 msrs->entry_ctls_high);
5760         msrs->entry_ctls_low =
5761                 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
5762         msrs->entry_ctls_high &=
5763 #ifdef CONFIG_X86_64
5764                 VM_ENTRY_IA32E_MODE |
5765 #endif
5766                 VM_ENTRY_LOAD_IA32_PAT;
5767         msrs->entry_ctls_high |=
5768                 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
5769
5770         /* We support free control of debug control loading. */
5771         msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
5772
5773         /* cpu-based controls */
5774         rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
5775                 msrs->procbased_ctls_low,
5776                 msrs->procbased_ctls_high);
5777         msrs->procbased_ctls_low =
5778                 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
5779         msrs->procbased_ctls_high &=
5780                 CPU_BASED_VIRTUAL_INTR_PENDING |
5781                 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
5782                 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
5783                 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
5784                 CPU_BASED_CR3_STORE_EXITING |
5785 #ifdef CONFIG_X86_64
5786                 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
5787 #endif
5788                 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
5789                 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
5790                 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
5791                 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
5792                 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
5793         /*
5794          * We can allow some features even when not supported by the
5795          * hardware. For example, L1 can specify an MSR bitmap - and we
5796          * can use it to avoid exits to L1 - even when L0 runs L2
5797          * without MSR bitmaps.
5798          */
5799         msrs->procbased_ctls_high |=
5800                 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
5801                 CPU_BASED_USE_MSR_BITMAPS;
5802
5803         /* We support free control of CR3 access interception. */
5804         msrs->procbased_ctls_low &=
5805                 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
5806
5807         /*
5808          * Secondary cpu-based controls.  Do not include those that
5809          * depend on CPUID bits; they are added later by vmx_cpuid_update.
5810          */
5811         if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
5812                 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
5813                       msrs->secondary_ctls_low,
5814                       msrs->secondary_ctls_high);
5815
5816         msrs->secondary_ctls_low = 0;
5817         msrs->secondary_ctls_high &=
5818                 SECONDARY_EXEC_DESC |
5819                 SECONDARY_EXEC_RDTSCP |
5820                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
5821                 SECONDARY_EXEC_WBINVD_EXITING |
5822                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
5823                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
5824                 SECONDARY_EXEC_RDRAND_EXITING |
5825                 SECONDARY_EXEC_ENABLE_INVPCID |
5826                 SECONDARY_EXEC_RDSEED_EXITING |
5827                 SECONDARY_EXEC_XSAVES;
5828
5829         /*
5830          * We can emulate "VMCS shadowing," even if the hardware
5831          * doesn't support it.
5832          */
5833         msrs->secondary_ctls_high |=
5834                 SECONDARY_EXEC_SHADOW_VMCS;
5835
5836         if (enable_ept) {
5837                 /* nested EPT: emulate EPT also to L1 */
5838                 msrs->secondary_ctls_high |=
5839                         SECONDARY_EXEC_ENABLE_EPT;
5840                 msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
5841                          VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
5842                 if (cpu_has_vmx_ept_execute_only())
5843                         msrs->ept_caps |=
5844                                 VMX_EPT_EXECUTE_ONLY_BIT;
5845                 msrs->ept_caps &= ept_caps;
5846                 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
5847                         VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
5848                         VMX_EPT_1GB_PAGE_BIT;
5849                 if (enable_ept_ad_bits) {
5850                         msrs->secondary_ctls_high |=
5851                                 SECONDARY_EXEC_ENABLE_PML;
5852                         msrs->ept_caps |= VMX_EPT_AD_BIT;
5853                 }
5854         }
5855
5856         if (cpu_has_vmx_vmfunc()) {
5857                 msrs->secondary_ctls_high |=
5858                         SECONDARY_EXEC_ENABLE_VMFUNC;
5859                 /*
5860                  * Advertise EPTP switching unconditionally
5861                  * since we emulate it
5862                  */
5863                 if (enable_ept)
5864                         msrs->vmfunc_controls =
5865                                 VMX_VMFUNC_EPTP_SWITCHING;
5866         }
5867
5868         /*
5869          * Old versions of KVM use the single-context version without
5870          * checking for support, so declare that it is supported even
5871          * though it is treated as global context.  The alternative is
5872          * not failing the single-context invvpid, and it is worse.
5873          */
5874         if (enable_vpid) {
5875                 msrs->secondary_ctls_high |=
5876                         SECONDARY_EXEC_ENABLE_VPID;
5877                 msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
5878                         VMX_VPID_EXTENT_SUPPORTED_MASK;
5879         }
5880
5881         if (enable_unrestricted_guest)
5882                 msrs->secondary_ctls_high |=
5883                         SECONDARY_EXEC_UNRESTRICTED_GUEST;
5884
5885         if (flexpriority_enabled)
5886                 msrs->secondary_ctls_high |=
5887                         SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
5888
5889         /* miscellaneous data */
5890         rdmsr(MSR_IA32_VMX_MISC,
5891                 msrs->misc_low,
5892                 msrs->misc_high);
5893         msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
5894         msrs->misc_low |=
5895                 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
5896                 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
5897                 VMX_MISC_ACTIVITY_HLT;
5898         msrs->misc_high = 0;
5899
5900         /*
5901          * This MSR reports some information about VMX support. We
5902          * should return information about the VMX we emulate for the
5903          * guest, and the VMCS structure we give it - not about the
5904          * VMX support of the underlying hardware.
5905          */
5906         msrs->basic =
5907                 VMCS12_REVISION |
5908                 VMX_BASIC_TRUE_CTLS |
5909                 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
5910                 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
5911
5912         if (cpu_has_vmx_basic_inout())
5913                 msrs->basic |= VMX_BASIC_INOUT;
5914
5915         /*
5916          * These MSRs specify bits which the guest must keep fixed on
5917          * while L1 is in VMXON mode (in L1's root mode, or running an L2).
5918          * We picked the standard core2 setting.
5919          */
5920 #define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
5921 #define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
5922         msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
5923         msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
5924
5925         /* Any bit clear in these FIXED1 MSRs must be kept clear (fixed off) by the guest. */
5926         rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
5927         rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
5928
5929         /* highest index: VMX_PREEMPTION_TIMER_VALUE */
5930         msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
5931 }
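
/*
 * Illustrative sketch (editor's addition, not part of the build): the low and
 * high halves described at the top of nested_vmx_setup_ctls_msrs() constrain
 * a vmcs12 control field as follows: every bit set in the low half must be
 * set in the field, and no bit outside the high half may be set.  This
 * hypothetical helper restates that rule; the kernel applies the equivalent
 * check when validating vmcs12 controls (see vmx_control_verify()).
 */
static inline bool nested_ctls_value_allowed(u32 control, u32 low, u32 high)
{
        /* All must-be-1 bits present, nothing beyond the may-be-1 bits. */
        return (control & low) == low && !(control & ~high);
}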
5932
5933 void nested_vmx_hardware_unsetup(void)
5934 {
5935         int i;
5936
5937         if (enable_shadow_vmcs) {
5938                 for (i = 0; i < VMX_BITMAP_NR; i++)
5939                         free_page((unsigned long)vmx_bitmap[i]);
5940         }
5941 }
5942
5943 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
5944 {
5945         int i;
5946
5947         if (!cpu_has_vmx_shadow_vmcs())
5948                 enable_shadow_vmcs = 0;
5949         if (enable_shadow_vmcs) {
5950                 for (i = 0; i < VMX_BITMAP_NR; i++) {
5951                         /*
5952                          * The vmx_bitmap is not tied to a VM and so should
5953                          * not be charged to a memcg.
5954                          */
5955                         vmx_bitmap[i] = (unsigned long *)
5956                                 __get_free_page(GFP_KERNEL);
5957                         if (!vmx_bitmap[i]) {
5958                                 nested_vmx_hardware_unsetup();
5959                                 return -ENOMEM;
5960                         }
5961                 }
5962
5963                 init_vmcs_shadow_fields();
5964         }
5965
5966         exit_handlers[EXIT_REASON_VMCLEAR]      = handle_vmclear;
5967         exit_handlers[EXIT_REASON_VMLAUNCH]     = handle_vmlaunch;
5968         exit_handlers[EXIT_REASON_VMPTRLD]      = handle_vmptrld;
5969         exit_handlers[EXIT_REASON_VMPTRST]      = handle_vmptrst;
5970         exit_handlers[EXIT_REASON_VMREAD]       = handle_vmread;
5971         exit_handlers[EXIT_REASON_VMRESUME]     = handle_vmresume;
5972         exit_handlers[EXIT_REASON_VMWRITE]      = handle_vmwrite;
5973         exit_handlers[EXIT_REASON_VMOFF]        = handle_vmoff;
5974         exit_handlers[EXIT_REASON_VMON]         = handle_vmon;
5975         exit_handlers[EXIT_REASON_INVEPT]       = handle_invept;
5976         exit_handlers[EXIT_REASON_INVVPID]      = handle_invvpid;
5977         exit_handlers[EXIT_REASON_VMFUNC]       = handle_vmfunc;
5978
5979         kvm_x86_ops->check_nested_events = vmx_check_nested_events;
5980         kvm_x86_ops->get_nested_state = vmx_get_nested_state;
5981         kvm_x86_ops->set_nested_state = vmx_set_nested_state;
5982         kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages;
5983         kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs;
5984         kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version;
5985
5986         return 0;
5987 }