KVM: VMX: make MSR bitmaps per-VCPU
arch/x86/kvm/vmx.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9  *
10  * Authors:
11  *   Avi Kivity   <avi@qumranet.com>
12  *   Yaniv Kamay  <yaniv@qumranet.com>
13  *
14  * This work is licensed under the terms of the GNU GPL, version 2.  See
15  * the COPYING file in the top-level directory.
16  *
17  */
18
19 #include "irq.h"
20 #include "mmu.h"
21 #include "cpuid.h"
22 #include "lapic.h"
23
24 #include <linux/kvm_host.h>
25 #include <linux/module.h>
26 #include <linux/kernel.h>
27 #include <linux/mm.h>
28 #include <linux/highmem.h>
29 #include <linux/sched.h>
30 #include <linux/moduleparam.h>
31 #include <linux/mod_devicetable.h>
32 #include <linux/trace_events.h>
33 #include <linux/slab.h>
34 #include <linux/tboot.h>
35 #include <linux/hrtimer.h>
36 #include <linux/frame.h>
37 #include "kvm_cache_regs.h"
38 #include "x86.h"
39
40 #include <asm/cpu.h>
41 #include <asm/io.h>
42 #include <asm/desc.h>
43 #include <asm/vmx.h>
44 #include <asm/virtext.h>
45 #include <asm/mce.h>
46 #include <asm/fpu/internal.h>
47 #include <asm/perf_event.h>
48 #include <asm/debugreg.h>
49 #include <asm/kexec.h>
50 #include <asm/apic.h>
51 #include <asm/irq_remapping.h>
52 #include <asm/mmu_context.h>
53 #include <asm/nospec-branch.h>
54
55 #include "trace.h"
56 #include "pmu.h"
57
58 #define __ex(x) __kvm_handle_fault_on_reboot(x)
59 #define __ex_clear(x, reg) \
60         ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
61
62 MODULE_AUTHOR("Qumranet");
63 MODULE_LICENSE("GPL");
64
65 static const struct x86_cpu_id vmx_cpu_id[] = {
66         X86_FEATURE_MATCH(X86_FEATURE_VMX),
67         {}
68 };
69 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
70
71 static bool __read_mostly enable_vpid = 1;
72 module_param_named(vpid, enable_vpid, bool, 0444);
73
74 static bool __read_mostly enable_vnmi = 1;
75 module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
76
77 static bool __read_mostly flexpriority_enabled = 1;
78 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
79
80 static bool __read_mostly enable_ept = 1;
81 module_param_named(ept, enable_ept, bool, S_IRUGO);
82
83 static bool __read_mostly enable_unrestricted_guest = 1;
84 module_param_named(unrestricted_guest,
85                         enable_unrestricted_guest, bool, S_IRUGO);
86
87 static bool __read_mostly enable_ept_ad_bits = 1;
88 module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
89
90 static bool __read_mostly emulate_invalid_guest_state = true;
91 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
92
93 static bool __read_mostly fasteoi = 1;
94 module_param(fasteoi, bool, S_IRUGO);
95
96 static bool __read_mostly enable_apicv = 1;
97 module_param(enable_apicv, bool, S_IRUGO);
98
99 static bool __read_mostly enable_shadow_vmcs = 1;
100 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
101 /*
102  * If nested=1, nested virtualization is supported, i.e., guests may use
103  * VMX and act as hypervisors for their own guests. If nested=0, guests may
104  * not use VMX instructions.
105  */
106 static bool __read_mostly nested = 0;
107 module_param(nested, bool, S_IRUGO);
108
109 static u64 __read_mostly host_xss;
110
111 static bool __read_mostly enable_pml = 1;
112 module_param_named(pml, enable_pml, bool, S_IRUGO);
113
114 #define MSR_TYPE_R      1
115 #define MSR_TYPE_W      2
116 #define MSR_TYPE_RW     3
117
118 #define MSR_BITMAP_MODE_X2APIC          1
119 #define MSR_BITMAP_MODE_X2APIC_APICV    2
120 #define MSR_BITMAP_MODE_LM              4
121
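/*
 * Illustrative sketch, not part of the original file: a VMX MSR bitmap is a
 * single 4K page split into four 2K-bit regions -- read intercepts for MSRs
 * 0x00000000-0x00001fff at byte offset 0x000, read intercepts for MSRs
 * 0xc0000000-0xc0001fff at 0x400, and the corresponding write intercepts at
 * 0x800 and 0xc00.  A hypothetical helper that clears the intercept bits
 * selected by MSR_TYPE_R/MSR_TYPE_W (so the guest can access the MSR
 * directly) could look like this:
 */
static inline void example_vmx_disable_msr_intercept(unsigned long *msr_bitmap,
						      u32 msr, int type)
{
	int f = sizeof(unsigned long);

	if (msr <= 0x1fff) {
		if (type & MSR_TYPE_R)
			__clear_bit(msr, msr_bitmap + 0x000 / f);
		if (type & MSR_TYPE_W)
			__clear_bit(msr, msr_bitmap + 0x800 / f);
	} else if (msr >= 0xc0000000 && msr <= 0xc0001fff) {
		msr &= 0x1fff;
		if (type & MSR_TYPE_R)
			__clear_bit(msr, msr_bitmap + 0x400 / f);
		if (type & MSR_TYPE_W)
			__clear_bit(msr, msr_bitmap + 0xc00 / f);
	}
}
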
122 #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
123
124 /* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
125 static int __read_mostly cpu_preemption_timer_multi;
126 static bool __read_mostly enable_preemption_timer = 1;
127 #ifdef CONFIG_X86_64
128 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
129 #endif
130
131 #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
132 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
133 #define KVM_VM_CR0_ALWAYS_ON                                            \
134         (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
135 #define KVM_CR4_GUEST_OWNED_BITS                                      \
136         (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \
137          | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
138
139 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
140 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
141
142 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
143
144 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
145
146 /*
147  * Hyper-V requires all of these, so mark them as supported even though
148  * they are just treated the same as all-context.
149  */
150 #define VMX_VPID_EXTENT_SUPPORTED_MASK          \
151         (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |  \
152         VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |    \
153         VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |    \
154         VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
155
156 /*
157  * These 2 parameters are used to configure the controls for Pause-Loop Exiting:
158  * ple_gap:    upper bound on the amount of time between two successive
159  *             executions of PAUSE in a loop. Also indicates whether PLE is enabled.
160  *             According to tests, this time is usually smaller than 128 cycles.
161  * ple_window: upper bound on the amount of time a guest is allowed to execute
162  *             in a PAUSE loop. Tests indicate that most spinlocks are held for
163  *             less than 2^12 cycles.
164  * Time is measured based on a counter that runs at the same rate as the TSC;
165  * refer to SDM volume 3B, sections 21.6.13 & 22.1.3.
166  */
167 #define KVM_VMX_DEFAULT_PLE_GAP           128
168 #define KVM_VMX_DEFAULT_PLE_WINDOW        4096
169 #define KVM_VMX_DEFAULT_PLE_WINDOW_GROW   2
170 #define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0
171 #define KVM_VMX_DEFAULT_PLE_WINDOW_MAX    \
172                 INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW
173
174 static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
175 module_param(ple_gap, int, S_IRUGO);
176
177 static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
178 module_param(ple_window, int, S_IRUGO);
179
180 /* Default doubles per-vcpu window every exit. */
181 static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW;
182 module_param(ple_window_grow, int, S_IRUGO);
183
184 /* Default resets per-vcpu window every exit to ple_window. */
185 static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK;
186 module_param(ple_window_shrink, int, S_IRUGO);
187
188 /* Default is to compute the maximum so we can never overflow. */
189 static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
190 static int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
191 module_param(ple_window_max, int, S_IRUGO);
192
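/*
 * Illustrative sketch, not part of the original file: on a PAUSE-loop exit
 * the per-vcpu window is typically scaled up by ple_window_grow and clamped.
 * KVM_VMX_DEFAULT_PLE_WINDOW_MAX is INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW
 * precisely so that a window at or below ple_window_actual_max can be scaled
 * without overflowing an int.  A hypothetical grow step:
 */
static inline int example_grow_ple_window(int old)
{
	if (ple_window_grow < 1)
		return ple_window;		/* growing disabled */
	if (old > ple_window_actual_max)
		return ple_window_max;		/* scaling would overflow, clamp */
	return old * ple_window_grow;
}
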
193 extern const ulong vmx_return;
194
195 #define NR_AUTOLOAD_MSRS 8
196
197 struct vmcs {
198         u32 revision_id;
199         u32 abort;
200         char data[0];
201 };
202
203 /*
204  * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
205  * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
206  * loaded on this CPU (so we can clear them if the CPU goes down).
207  */
208 struct loaded_vmcs {
209         struct vmcs *vmcs;
210         struct vmcs *shadow_vmcs;
211         int cpu;
212         bool launched;
213         bool nmi_known_unmasked;
214         unsigned long vmcs_host_cr3;    /* May not match real cr3 */
215         unsigned long vmcs_host_cr4;    /* May not match real cr4 */
216         /* Support for vnmi-less CPUs */
217         int soft_vnmi_blocked;
218         ktime_t entry_time;
219         s64 vnmi_blocked_time;
220         unsigned long *msr_bitmap;
221         struct list_head loaded_vmcss_on_cpu_link;
222 };
223
224 struct shared_msr_entry {
225         unsigned index;
226         u64 data;
227         u64 mask;
228 };
229
230 /*
231  * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
232  * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
233  * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
234  * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
235  * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
236  * More than one of these structures may exist, if L1 runs multiple L2 guests.
237  * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
238  * underlying hardware which will be used to run L2.
239  * This structure is packed to ensure that its layout is identical across
240  * machines (necessary for live migration).
241  * If there are changes in this struct, VMCS12_REVISION must be changed.
242  */
243 typedef u64 natural_width;
244 struct __packed vmcs12 {
245         /* According to the Intel spec, a VMCS region must start with the
246          * following two fields. Then follow implementation-specific data.
247          */
248         u32 revision_id;
249         u32 abort;
250
251         u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
252         u32 padding[7]; /* room for future expansion */
253
254         u64 io_bitmap_a;
255         u64 io_bitmap_b;
256         u64 msr_bitmap;
257         u64 vm_exit_msr_store_addr;
258         u64 vm_exit_msr_load_addr;
259         u64 vm_entry_msr_load_addr;
260         u64 tsc_offset;
261         u64 virtual_apic_page_addr;
262         u64 apic_access_addr;
263         u64 posted_intr_desc_addr;
264         u64 vm_function_control;
265         u64 ept_pointer;
266         u64 eoi_exit_bitmap0;
267         u64 eoi_exit_bitmap1;
268         u64 eoi_exit_bitmap2;
269         u64 eoi_exit_bitmap3;
270         u64 eptp_list_address;
271         u64 xss_exit_bitmap;
272         u64 guest_physical_address;
273         u64 vmcs_link_pointer;
274         u64 pml_address;
275         u64 guest_ia32_debugctl;
276         u64 guest_ia32_pat;
277         u64 guest_ia32_efer;
278         u64 guest_ia32_perf_global_ctrl;
279         u64 guest_pdptr0;
280         u64 guest_pdptr1;
281         u64 guest_pdptr2;
282         u64 guest_pdptr3;
283         u64 guest_bndcfgs;
284         u64 host_ia32_pat;
285         u64 host_ia32_efer;
286         u64 host_ia32_perf_global_ctrl;
287         u64 padding64[8]; /* room for future expansion */
288         /*
289          * To allow migration of L1 (complete with its L2 guests) between
290          * machines of different natural widths (32 or 64 bit), we cannot have
291          * unsigned long fields with no explicit size. We use u64 (aliased
292          * natural_width) instead. Luckily, x86 is little-endian.
293          */
294         natural_width cr0_guest_host_mask;
295         natural_width cr4_guest_host_mask;
296         natural_width cr0_read_shadow;
297         natural_width cr4_read_shadow;
298         natural_width cr3_target_value0;
299         natural_width cr3_target_value1;
300         natural_width cr3_target_value2;
301         natural_width cr3_target_value3;
302         natural_width exit_qualification;
303         natural_width guest_linear_address;
304         natural_width guest_cr0;
305         natural_width guest_cr3;
306         natural_width guest_cr4;
307         natural_width guest_es_base;
308         natural_width guest_cs_base;
309         natural_width guest_ss_base;
310         natural_width guest_ds_base;
311         natural_width guest_fs_base;
312         natural_width guest_gs_base;
313         natural_width guest_ldtr_base;
314         natural_width guest_tr_base;
315         natural_width guest_gdtr_base;
316         natural_width guest_idtr_base;
317         natural_width guest_dr7;
318         natural_width guest_rsp;
319         natural_width guest_rip;
320         natural_width guest_rflags;
321         natural_width guest_pending_dbg_exceptions;
322         natural_width guest_sysenter_esp;
323         natural_width guest_sysenter_eip;
324         natural_width host_cr0;
325         natural_width host_cr3;
326         natural_width host_cr4;
327         natural_width host_fs_base;
328         natural_width host_gs_base;
329         natural_width host_tr_base;
330         natural_width host_gdtr_base;
331         natural_width host_idtr_base;
332         natural_width host_ia32_sysenter_esp;
333         natural_width host_ia32_sysenter_eip;
334         natural_width host_rsp;
335         natural_width host_rip;
336         natural_width paddingl[8]; /* room for future expansion */
337         u32 pin_based_vm_exec_control;
338         u32 cpu_based_vm_exec_control;
339         u32 exception_bitmap;
340         u32 page_fault_error_code_mask;
341         u32 page_fault_error_code_match;
342         u32 cr3_target_count;
343         u32 vm_exit_controls;
344         u32 vm_exit_msr_store_count;
345         u32 vm_exit_msr_load_count;
346         u32 vm_entry_controls;
347         u32 vm_entry_msr_load_count;
348         u32 vm_entry_intr_info_field;
349         u32 vm_entry_exception_error_code;
350         u32 vm_entry_instruction_len;
351         u32 tpr_threshold;
352         u32 secondary_vm_exec_control;
353         u32 vm_instruction_error;
354         u32 vm_exit_reason;
355         u32 vm_exit_intr_info;
356         u32 vm_exit_intr_error_code;
357         u32 idt_vectoring_info_field;
358         u32 idt_vectoring_error_code;
359         u32 vm_exit_instruction_len;
360         u32 vmx_instruction_info;
361         u32 guest_es_limit;
362         u32 guest_cs_limit;
363         u32 guest_ss_limit;
364         u32 guest_ds_limit;
365         u32 guest_fs_limit;
366         u32 guest_gs_limit;
367         u32 guest_ldtr_limit;
368         u32 guest_tr_limit;
369         u32 guest_gdtr_limit;
370         u32 guest_idtr_limit;
371         u32 guest_es_ar_bytes;
372         u32 guest_cs_ar_bytes;
373         u32 guest_ss_ar_bytes;
374         u32 guest_ds_ar_bytes;
375         u32 guest_fs_ar_bytes;
376         u32 guest_gs_ar_bytes;
377         u32 guest_ldtr_ar_bytes;
378         u32 guest_tr_ar_bytes;
379         u32 guest_interruptibility_info;
380         u32 guest_activity_state;
381         u32 guest_sysenter_cs;
382         u32 host_ia32_sysenter_cs;
383         u32 vmx_preemption_timer_value;
384         u32 padding32[7]; /* room for future expansion */
385         u16 virtual_processor_id;
386         u16 posted_intr_nv;
387         u16 guest_es_selector;
388         u16 guest_cs_selector;
389         u16 guest_ss_selector;
390         u16 guest_ds_selector;
391         u16 guest_fs_selector;
392         u16 guest_gs_selector;
393         u16 guest_ldtr_selector;
394         u16 guest_tr_selector;
395         u16 guest_intr_status;
396         u16 guest_pml_index;
397         u16 host_es_selector;
398         u16 host_cs_selector;
399         u16 host_ss_selector;
400         u16 host_ds_selector;
401         u16 host_fs_selector;
402         u16 host_gs_selector;
403         u16 host_tr_selector;
404 };
405
406 /*
407  * VMCS12_REVISION is an arbitrary id that should be changed if the content or
408  * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
409  * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
410  */
411 #define VMCS12_REVISION 0x11e57ed0
412
413 /*
414  * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
415  * and any VMCS region. Although only sizeof(struct vmcs12) is used by the
416  * current implementation, 4K is reserved to avoid future complications.
417  */
418 #define VMCS12_SIZE 0x1000
419
420 /*
421  * The nested_vmx structure is part of vcpu_vmx, and holds information we need
422  * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
423  */
424 struct nested_vmx {
425         /* Has the level1 guest done vmxon? */
426         bool vmxon;
427         gpa_t vmxon_ptr;
428         bool pml_full;
429
430         /* The guest-physical address of the current VMCS L1 keeps for L2 */
431         gpa_t current_vmptr;
432         /*
433          * Cache of the guest's VMCS, existing outside of guest memory.
434          * Loaded from guest memory during VMPTRLD. Flushed to guest
435          * memory during VMCLEAR and VMPTRLD.
436          */
437         struct vmcs12 *cached_vmcs12;
438         /*
439          * Indicates whether the shadow vmcs must be updated with the
440          * data held by vmcs12.
441          */
442         bool sync_shadow_vmcs;
443
444         bool change_vmcs01_virtual_x2apic_mode;
445         /* L2 must run next, and mustn't decide to exit to L1. */
446         bool nested_run_pending;
447
448         struct loaded_vmcs vmcs02;
449
450         /*
451          * Guest pages referred to in the vmcs02 with host-physical
452          * pointers, so we must keep them pinned while L2 runs.
453          */
454         struct page *apic_access_page;
455         struct page *virtual_apic_page;
456         struct page *pi_desc_page;
457         struct pi_desc *pi_desc;
458         bool pi_pending;
459         u16 posted_intr_nv;
460
461         struct hrtimer preemption_timer;
462         bool preemption_timer_expired;
463
464         /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
465         u64 vmcs01_debugctl;
466
467         u16 vpid02;
468         u16 last_vpid;
469
470         /*
471          * We only store the "true" versions of the VMX capability MSRs. We
472          * generate the "non-true" versions by setting the must-be-1 bits
473          * according to the SDM.
474          */
475         u32 nested_vmx_procbased_ctls_low;
476         u32 nested_vmx_procbased_ctls_high;
477         u32 nested_vmx_secondary_ctls_low;
478         u32 nested_vmx_secondary_ctls_high;
479         u32 nested_vmx_pinbased_ctls_low;
480         u32 nested_vmx_pinbased_ctls_high;
481         u32 nested_vmx_exit_ctls_low;
482         u32 nested_vmx_exit_ctls_high;
483         u32 nested_vmx_entry_ctls_low;
484         u32 nested_vmx_entry_ctls_high;
485         u32 nested_vmx_misc_low;
486         u32 nested_vmx_misc_high;
487         u32 nested_vmx_ept_caps;
488         u32 nested_vmx_vpid_caps;
489         u64 nested_vmx_basic;
490         u64 nested_vmx_cr0_fixed0;
491         u64 nested_vmx_cr0_fixed1;
492         u64 nested_vmx_cr4_fixed0;
493         u64 nested_vmx_cr4_fixed1;
494         u64 nested_vmx_vmcs_enum;
495         u64 nested_vmx_vmfunc_controls;
496
497         /* SMM related state */
498         struct {
499                 /* in VMX operation on SMM entry? */
500                 bool vmxon;
501                 /* in guest mode on SMM entry? */
502                 bool guest_mode;
503         } smm;
504 };
505
506 #define POSTED_INTR_ON  0
507 #define POSTED_INTR_SN  1
508
509 /* Posted-Interrupt Descriptor */
510 struct pi_desc {
511         u32 pir[8];     /* Posted interrupt requested */
512         union {
513                 struct {
514                                 /* bit 256 - Outstanding Notification */
515                         u16     on      : 1,
516                                 /* bit 257 - Suppress Notification */
517                                 sn      : 1,
518                                 /* bit 271:258 - Reserved */
519                                 rsvd_1  : 14;
520                                 /* bit 279:272 - Notification Vector */
521                         u8      nv;
522                                 /* bit 287:280 - Reserved */
523                         u8      rsvd_2;
524                                 /* bit 319:288 - Notification Destination */
525                         u32     ndst;
526                 };
527                 u64 control;
528         };
529         u32 rsvd[6];
530 } __aligned(64);
531
532 static bool pi_test_and_set_on(struct pi_desc *pi_desc)
533 {
534         return test_and_set_bit(POSTED_INTR_ON,
535                         (unsigned long *)&pi_desc->control);
536 }
537
538 static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
539 {
540         return test_and_clear_bit(POSTED_INTR_ON,
541                         (unsigned long *)&pi_desc->control);
542 }
543
544 static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
545 {
546         return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
547 }
548
549 static inline void pi_clear_sn(struct pi_desc *pi_desc)
550 {
551         clear_bit(POSTED_INTR_SN,
552                   (unsigned long *)&pi_desc->control);
553 }
554
555 static inline void pi_set_sn(struct pi_desc *pi_desc)
556 {
557         set_bit(POSTED_INTR_SN,
558                 (unsigned long *)&pi_desc->control);
559 }
560
561 static inline void pi_clear_on(struct pi_desc *pi_desc)
562 {
563         clear_bit(POSTED_INTR_ON,
564                   (unsigned long *)&pi_desc->control);
565 }
566
567 static inline int pi_test_on(struct pi_desc *pi_desc)
568 {
569         return test_bit(POSTED_INTR_ON,
570                         (unsigned long *)&pi_desc->control);
571 }
572
573 static inline int pi_test_sn(struct pi_desc *pi_desc)
574 {
575         return test_bit(POSTED_INTR_SN,
576                         (unsigned long *)&pi_desc->control);
577 }
578
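/*
 * Illustrative sketch, not part of the original file: to post an interrupt,
 * the sender records the vector in the PIR and then sets the Outstanding
 * Notification bit; a notification (an IPI with the notification vector) is
 * only needed when ON transitions from 0 to 1.  A hypothetical sender:
 */
static inline bool example_post_interrupt(struct pi_desc *pi_desc, int vector)
{
	if (pi_test_and_set_pir(vector, pi_desc))
		return false;			/* vector was already pending */

	/* pi_test_and_set_on() returns the old value of ON */
	return !pi_test_and_set_on(pi_desc);
}
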
579 struct vcpu_vmx {
580         struct kvm_vcpu       vcpu;
581         unsigned long         host_rsp;
582         u8                    fail;
583         u8                    msr_bitmap_mode;
584         u32                   exit_intr_info;
585         u32                   idt_vectoring_info;
586         ulong                 rflags;
587         struct shared_msr_entry *guest_msrs;
588         int                   nmsrs;
589         int                   save_nmsrs;
590         unsigned long         host_idt_base;
591 #ifdef CONFIG_X86_64
592         u64                   msr_host_kernel_gs_base;
593         u64                   msr_guest_kernel_gs_base;
594 #endif
595         u32 vm_entry_controls_shadow;
596         u32 vm_exit_controls_shadow;
597         u32 secondary_exec_control;
598
599         /*
600          * loaded_vmcs points to the VMCS currently used in this vcpu. For a
601          * non-nested (L1) guest, it always points to vmcs01. For a nested
602          * guest (L2), it points to a different VMCS.
603          */
604         struct loaded_vmcs    vmcs01;
605         struct loaded_vmcs   *loaded_vmcs;
606         bool                  __launched; /* temporary, used in vmx_vcpu_run */
607         struct msr_autoload {
608                 unsigned nr;
609                 struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
610                 struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
611         } msr_autoload;
612         struct {
613                 int           loaded;
614                 u16           fs_sel, gs_sel, ldt_sel;
615 #ifdef CONFIG_X86_64
616                 u16           ds_sel, es_sel;
617 #endif
618                 int           gs_ldt_reload_needed;
619                 int           fs_reload_needed;
620                 u64           msr_host_bndcfgs;
621         } host_state;
622         struct {
623                 int vm86_active;
624                 ulong save_rflags;
625                 struct kvm_segment segs[8];
626         } rmode;
627         struct {
628                 u32 bitmask; /* 4 bits per segment (1 bit per field) */
629                 struct kvm_save_segment {
630                         u16 selector;
631                         unsigned long base;
632                         u32 limit;
633                         u32 ar;
634                 } seg[8];
635         } segment_cache;
636         int vpid;
637         bool emulation_required;
638
639         u32 exit_reason;
640
641         /* Posted interrupt descriptor */
642         struct pi_desc pi_desc;
643
644         /* Support for a guest hypervisor (nested VMX) */
645         struct nested_vmx nested;
646
647         /* Dynamic PLE window. */
648         int ple_window;
649         bool ple_window_dirty;
650
651         /* Support for PML */
652 #define PML_ENTITY_NUM          512
653         struct page *pml_pg;
654
655         /* apic deadline value in host tsc */
656         u64 hv_deadline_tsc;
657
658         u64 current_tsc_ratio;
659
660         u32 host_pkru;
661
662         /*
663          * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
664          * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
665          * in msr_ia32_feature_control_valid_bits.
666          */
667         u64 msr_ia32_feature_control;
668         u64 msr_ia32_feature_control_valid_bits;
669 };
670
671 enum segment_cache_field {
672         SEG_FIELD_SEL = 0,
673         SEG_FIELD_BASE = 1,
674         SEG_FIELD_LIMIT = 2,
675         SEG_FIELD_AR = 3,
676
677         SEG_FIELD_NR = 4
678 };
679
680 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
681 {
682         return container_of(vcpu, struct vcpu_vmx, vcpu);
683 }
684
685 static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
686 {
687         return &(to_vmx(vcpu)->pi_desc);
688 }
689
690 #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
691 #define FIELD(number, name)     [number] = VMCS12_OFFSET(name)
692 #define FIELD64(number, name)   [number] = VMCS12_OFFSET(name), \
693                                 [number##_HIGH] = VMCS12_OFFSET(name)+4
694
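/*
 * For example, FIELD64(TSC_OFFSET, tsc_offset) expands to
 *	[TSC_OFFSET]      = offsetof(struct vmcs12, tsc_offset),
 *	[TSC_OFFSET_HIGH] = offsetof(struct vmcs12, tsc_offset) + 4,
 * so the "_HIGH" encoding of a 64-bit field maps to its upper 32 bits.
 */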
695
696 static unsigned long shadow_read_only_fields[] = {
697         /*
698          * We do NOT shadow fields that are modified when L0
699          * traps and emulates any vmx instruction (e.g. VMPTRLD,
700          * VMXON...) executed by L1.
701          * For example, VM_INSTRUCTION_ERROR is read
702          * by L1 if a vmx instruction fails (part of the error path).
703          * Note the code assumes this logic. If for some reason
704          * we start shadowing these fields then we need to
705          * force a shadow sync when L0 emulates vmx instructions
706          * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
707          * by nested_vmx_failValid)
708          */
709         VM_EXIT_REASON,
710         VM_EXIT_INTR_INFO,
711         VM_EXIT_INSTRUCTION_LEN,
712         IDT_VECTORING_INFO_FIELD,
713         IDT_VECTORING_ERROR_CODE,
714         VM_EXIT_INTR_ERROR_CODE,
715         EXIT_QUALIFICATION,
716         GUEST_LINEAR_ADDRESS,
717         GUEST_PHYSICAL_ADDRESS
718 };
719 static int max_shadow_read_only_fields =
720         ARRAY_SIZE(shadow_read_only_fields);
721
722 static unsigned long shadow_read_write_fields[] = {
723         TPR_THRESHOLD,
724         GUEST_RIP,
725         GUEST_RSP,
726         GUEST_CR0,
727         GUEST_CR3,
728         GUEST_CR4,
729         GUEST_INTERRUPTIBILITY_INFO,
730         GUEST_RFLAGS,
731         GUEST_CS_SELECTOR,
732         GUEST_CS_AR_BYTES,
733         GUEST_CS_LIMIT,
734         GUEST_CS_BASE,
735         GUEST_ES_BASE,
736         GUEST_BNDCFGS,
737         CR0_GUEST_HOST_MASK,
738         CR0_READ_SHADOW,
739         CR4_READ_SHADOW,
740         TSC_OFFSET,
741         EXCEPTION_BITMAP,
742         CPU_BASED_VM_EXEC_CONTROL,
743         VM_ENTRY_EXCEPTION_ERROR_CODE,
744         VM_ENTRY_INTR_INFO_FIELD,
745         VM_ENTRY_INSTRUCTION_LEN,
747         HOST_FS_BASE,
748         HOST_GS_BASE,
749         HOST_FS_SELECTOR,
750         HOST_GS_SELECTOR
751 };
752 static int max_shadow_read_write_fields =
753         ARRAY_SIZE(shadow_read_write_fields);
754
755 static const unsigned short vmcs_field_to_offset_table[] = {
756         FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
757         FIELD(POSTED_INTR_NV, posted_intr_nv),
758         FIELD(GUEST_ES_SELECTOR, guest_es_selector),
759         FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
760         FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
761         FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
762         FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
763         FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
764         FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
765         FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
766         FIELD(GUEST_INTR_STATUS, guest_intr_status),
767         FIELD(GUEST_PML_INDEX, guest_pml_index),
768         FIELD(HOST_ES_SELECTOR, host_es_selector),
769         FIELD(HOST_CS_SELECTOR, host_cs_selector),
770         FIELD(HOST_SS_SELECTOR, host_ss_selector),
771         FIELD(HOST_DS_SELECTOR, host_ds_selector),
772         FIELD(HOST_FS_SELECTOR, host_fs_selector),
773         FIELD(HOST_GS_SELECTOR, host_gs_selector),
774         FIELD(HOST_TR_SELECTOR, host_tr_selector),
775         FIELD64(IO_BITMAP_A, io_bitmap_a),
776         FIELD64(IO_BITMAP_B, io_bitmap_b),
777         FIELD64(MSR_BITMAP, msr_bitmap),
778         FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
779         FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
780         FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
781         FIELD64(TSC_OFFSET, tsc_offset),
782         FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
783         FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
784         FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
785         FIELD64(VM_FUNCTION_CONTROL, vm_function_control),
786         FIELD64(EPT_POINTER, ept_pointer),
787         FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
788         FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
789         FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
790         FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
791         FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
792         FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
793         FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
794         FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
795         FIELD64(PML_ADDRESS, pml_address),
796         FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
797         FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
798         FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
799         FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
800         FIELD64(GUEST_PDPTR0, guest_pdptr0),
801         FIELD64(GUEST_PDPTR1, guest_pdptr1),
802         FIELD64(GUEST_PDPTR2, guest_pdptr2),
803         FIELD64(GUEST_PDPTR3, guest_pdptr3),
804         FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
805         FIELD64(HOST_IA32_PAT, host_ia32_pat),
806         FIELD64(HOST_IA32_EFER, host_ia32_efer),
807         FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
808         FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
809         FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
810         FIELD(EXCEPTION_BITMAP, exception_bitmap),
811         FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
812         FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
813         FIELD(CR3_TARGET_COUNT, cr3_target_count),
814         FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
815         FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
816         FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
817         FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
818         FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
819         FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
820         FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
821         FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
822         FIELD(TPR_THRESHOLD, tpr_threshold),
823         FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
824         FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
825         FIELD(VM_EXIT_REASON, vm_exit_reason),
826         FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
827         FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
828         FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
829         FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
830         FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
831         FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
832         FIELD(GUEST_ES_LIMIT, guest_es_limit),
833         FIELD(GUEST_CS_LIMIT, guest_cs_limit),
834         FIELD(GUEST_SS_LIMIT, guest_ss_limit),
835         FIELD(GUEST_DS_LIMIT, guest_ds_limit),
836         FIELD(GUEST_FS_LIMIT, guest_fs_limit),
837         FIELD(GUEST_GS_LIMIT, guest_gs_limit),
838         FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
839         FIELD(GUEST_TR_LIMIT, guest_tr_limit),
840         FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
841         FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
842         FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
843         FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
844         FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
845         FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
846         FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
847         FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
848         FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
849         FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
850         FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
851         FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
852         FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
853         FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
854         FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
855         FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
856         FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
857         FIELD(CR0_READ_SHADOW, cr0_read_shadow),
858         FIELD(CR4_READ_SHADOW, cr4_read_shadow),
859         FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
860         FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
861         FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
862         FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
863         FIELD(EXIT_QUALIFICATION, exit_qualification),
864         FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
865         FIELD(GUEST_CR0, guest_cr0),
866         FIELD(GUEST_CR3, guest_cr3),
867         FIELD(GUEST_CR4, guest_cr4),
868         FIELD(GUEST_ES_BASE, guest_es_base),
869         FIELD(GUEST_CS_BASE, guest_cs_base),
870         FIELD(GUEST_SS_BASE, guest_ss_base),
871         FIELD(GUEST_DS_BASE, guest_ds_base),
872         FIELD(GUEST_FS_BASE, guest_fs_base),
873         FIELD(GUEST_GS_BASE, guest_gs_base),
874         FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
875         FIELD(GUEST_TR_BASE, guest_tr_base),
876         FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
877         FIELD(GUEST_IDTR_BASE, guest_idtr_base),
878         FIELD(GUEST_DR7, guest_dr7),
879         FIELD(GUEST_RSP, guest_rsp),
880         FIELD(GUEST_RIP, guest_rip),
881         FIELD(GUEST_RFLAGS, guest_rflags),
882         FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
883         FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
884         FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
885         FIELD(HOST_CR0, host_cr0),
886         FIELD(HOST_CR3, host_cr3),
887         FIELD(HOST_CR4, host_cr4),
888         FIELD(HOST_FS_BASE, host_fs_base),
889         FIELD(HOST_GS_BASE, host_gs_base),
890         FIELD(HOST_TR_BASE, host_tr_base),
891         FIELD(HOST_GDTR_BASE, host_gdtr_base),
892         FIELD(HOST_IDTR_BASE, host_idtr_base),
893         FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
894         FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
895         FIELD(HOST_RSP, host_rsp),
896         FIELD(HOST_RIP, host_rip),
897 };
898
899 static inline short vmcs_field_to_offset(unsigned long field)
900 {
901         BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
902
903         if (field >= ARRAY_SIZE(vmcs_field_to_offset_table))
904                 return -ENOENT;
905
906         /*
907          * FIXME: Mitigation for CVE-2017-5753.  To be replaced with a
908          * generic mechanism.
909          */
910         asm("lfence");
911
912         if (vmcs_field_to_offset_table[field] == 0)
913                 return -ENOENT;
914
915         return vmcs_field_to_offset_table[field];
916 }
917
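/*
 * Illustrative sketch, not part of the original file: the offset table above
 * lets a vmcs12 field be accessed generically from its VMCS field encoding.
 * A hypothetical accessor for a 32-bit field:
 */
static inline int example_read_vmcs12_u32(struct vmcs12 *vmcs12,
					  unsigned long field, u32 *val)
{
	short offset = vmcs_field_to_offset(field);

	if (offset < 0)
		return offset;
	*val = *(u32 *)((char *)vmcs12 + offset);
	return 0;
}
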
918 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
919 {
920         return to_vmx(vcpu)->nested.cached_vmcs12;
921 }
922
923 static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
924 static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
925 static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
926 static bool vmx_xsaves_supported(void);
927 static void vmx_set_segment(struct kvm_vcpu *vcpu,
928                             struct kvm_segment *var, int seg);
929 static void vmx_get_segment(struct kvm_vcpu *vcpu,
930                             struct kvm_segment *var, int seg);
931 static bool guest_state_valid(struct kvm_vcpu *vcpu);
932 static u32 vmx_segment_access_rights(struct kvm_segment *var);
933 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
934 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
935 static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
936 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
937                                             u16 error_code);
938 static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
939
940 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
941 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
942 /*
943  * We maintain a per-CPU linked list of VMCSs loaded on that CPU. This is needed
944  * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
945  */
946 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
947
948 /*
949  * We maintain a per-CPU linked list of vCPUs, so in wakeup_handler() we
950  * can find which vCPU should be woken up.
951  */
952 static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
953 static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
954
955 enum {
956         VMX_IO_BITMAP_A,
957         VMX_IO_BITMAP_B,
958         VMX_VMREAD_BITMAP,
959         VMX_VMWRITE_BITMAP,
960         VMX_BITMAP_NR
961 };
962
963 static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
964
965 #define vmx_io_bitmap_a                      (vmx_bitmap[VMX_IO_BITMAP_A])
966 #define vmx_io_bitmap_b                      (vmx_bitmap[VMX_IO_BITMAP_B])
967 #define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
968 #define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])
969
970 static bool cpu_has_load_ia32_efer;
971 static bool cpu_has_load_perf_global_ctrl;
972
973 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
974 static DEFINE_SPINLOCK(vmx_vpid_lock);
975
976 static struct vmcs_config {
977         int size;
978         int order;
979         u32 basic_cap;
980         u32 revision_id;
981         u32 pin_based_exec_ctrl;
982         u32 cpu_based_exec_ctrl;
983         u32 cpu_based_2nd_exec_ctrl;
984         u32 vmexit_ctrl;
985         u32 vmentry_ctrl;
986 } vmcs_config;
987
988 static struct vmx_capability {
989         u32 ept;
990         u32 vpid;
991 } vmx_capability;
992
993 #define VMX_SEGMENT_FIELD(seg)                                  \
994         [VCPU_SREG_##seg] = {                                   \
995                 .selector = GUEST_##seg##_SELECTOR,             \
996                 .base = GUEST_##seg##_BASE,                     \
997                 .limit = GUEST_##seg##_LIMIT,                   \
998                 .ar_bytes = GUEST_##seg##_AR_BYTES,             \
999         }
1000
1001 static const struct kvm_vmx_segment_field {
1002         unsigned selector;
1003         unsigned base;
1004         unsigned limit;
1005         unsigned ar_bytes;
1006 } kvm_vmx_segment_fields[] = {
1007         VMX_SEGMENT_FIELD(CS),
1008         VMX_SEGMENT_FIELD(DS),
1009         VMX_SEGMENT_FIELD(ES),
1010         VMX_SEGMENT_FIELD(FS),
1011         VMX_SEGMENT_FIELD(GS),
1012         VMX_SEGMENT_FIELD(SS),
1013         VMX_SEGMENT_FIELD(TR),
1014         VMX_SEGMENT_FIELD(LDTR),
1015 };
1016
1017 static u64 host_efer;
1018
1019 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
1020
1021 /*
1022  * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
1023  * away by decrementing the array size.
1024  */
1025 static const u32 vmx_msr_index[] = {
1026 #ifdef CONFIG_X86_64
1027         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
1028 #endif
1029         MSR_EFER, MSR_TSC_AUX, MSR_STAR,
1030 };
1031
1032 static inline bool is_exception_n(u32 intr_info, u8 vector)
1033 {
1034         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
1035                              INTR_INFO_VALID_MASK)) ==
1036                 (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
1037 }
1038
1039 static inline bool is_debug(u32 intr_info)
1040 {
1041         return is_exception_n(intr_info, DB_VECTOR);
1042 }
1043
1044 static inline bool is_breakpoint(u32 intr_info)
1045 {
1046         return is_exception_n(intr_info, BP_VECTOR);
1047 }
1048
1049 static inline bool is_page_fault(u32 intr_info)
1050 {
1051         return is_exception_n(intr_info, PF_VECTOR);
1052 }
1053
1054 static inline bool is_no_device(u32 intr_info)
1055 {
1056         return is_exception_n(intr_info, NM_VECTOR);
1057 }
1058
1059 static inline bool is_invalid_opcode(u32 intr_info)
1060 {
1061         return is_exception_n(intr_info, UD_VECTOR);
1062 }
1063
1064 static inline bool is_external_interrupt(u32 intr_info)
1065 {
1066         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
1067                 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
1068 }
1069
1070 static inline bool is_machine_check(u32 intr_info)
1071 {
1072         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
1073                              INTR_INFO_VALID_MASK)) ==
1074                 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
1075 }
1076
1077 static inline bool cpu_has_vmx_msr_bitmap(void)
1078 {
1079         return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
1080 }
1081
1082 static inline bool cpu_has_vmx_tpr_shadow(void)
1083 {
1084         return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
1085 }
1086
1087 static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu)
1088 {
1089         return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu);
1090 }
1091
1092 static inline bool cpu_has_secondary_exec_ctrls(void)
1093 {
1094         return vmcs_config.cpu_based_exec_ctrl &
1095                 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
1096 }
1097
1098 static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
1099 {
1100         return vmcs_config.cpu_based_2nd_exec_ctrl &
1101                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1102 }
1103
1104 static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
1105 {
1106         return vmcs_config.cpu_based_2nd_exec_ctrl &
1107                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
1108 }
1109
1110 static inline bool cpu_has_vmx_apic_register_virt(void)
1111 {
1112         return vmcs_config.cpu_based_2nd_exec_ctrl &
1113                 SECONDARY_EXEC_APIC_REGISTER_VIRT;
1114 }
1115
1116 static inline bool cpu_has_vmx_virtual_intr_delivery(void)
1117 {
1118         return vmcs_config.cpu_based_2nd_exec_ctrl &
1119                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
1120 }
1121
1122 /*
1123  * Comment format: document - errata name - stepping - processor name.
1124  * Taken from
1125  * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
1126  */
1127 static u32 vmx_preemption_cpu_tfms[] = {
1128 /* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
1129 0x000206E6,
1130 /* 323056.pdf - AAX65  - C2 - Xeon L3406 */
1131 /* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
1132 /* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
1133 0x00020652,
1134 /* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
1135 0x00020655,
1136 /* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
1137 /* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
1138 /*
1139  * 320767.pdf - AAP86  - B1 -
1140  * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
1141  */
1142 0x000106E5,
1143 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */
1144 0x000106A0,
1145 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */
1146 0x000106A1,
1147 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
1148 0x000106A4,
1149 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
1150 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
1151 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
1152 0x000106A5,
1153 };
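
/*
 * The table entries are CPUID.01H:EAX signatures with the reserved bits
 * already cleared; e.g. 0x000106A5 decodes to family 6, extended model 1
 * (i.e. model 0x1A), stepping 5.
 */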
1154
1155 static inline bool cpu_has_broken_vmx_preemption_timer(void)
1156 {
1157         u32 eax = cpuid_eax(0x00000001), i;
1158
1159         /* Clear the reserved bits */
1160         eax &= ~(0x3U << 14 | 0xfU << 28);
1161         for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
1162                 if (eax == vmx_preemption_cpu_tfms[i])
1163                         return true;
1164
1165         return false;
1166 }
1167
1168 static inline bool cpu_has_vmx_preemption_timer(void)
1169 {
1170         return vmcs_config.pin_based_exec_ctrl &
1171                 PIN_BASED_VMX_PREEMPTION_TIMER;
1172 }
1173
1174 static inline bool cpu_has_vmx_posted_intr(void)
1175 {
1176         return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
1177                 vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
1178 }
1179
1180 static inline bool cpu_has_vmx_apicv(void)
1181 {
1182         return cpu_has_vmx_apic_register_virt() &&
1183                 cpu_has_vmx_virtual_intr_delivery() &&
1184                 cpu_has_vmx_posted_intr();
1185 }
1186
1187 static inline bool cpu_has_vmx_flexpriority(void)
1188 {
1189         return cpu_has_vmx_tpr_shadow() &&
1190                 cpu_has_vmx_virtualize_apic_accesses();
1191 }
1192
1193 static inline bool cpu_has_vmx_ept_execute_only(void)
1194 {
1195         return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
1196 }
1197
1198 static inline bool cpu_has_vmx_ept_2m_page(void)
1199 {
1200         return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
1201 }
1202
1203 static inline bool cpu_has_vmx_ept_1g_page(void)
1204 {
1205         return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
1206 }
1207
1208 static inline bool cpu_has_vmx_ept_4levels(void)
1209 {
1210         return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
1211 }
1212
1213 static inline bool cpu_has_vmx_ept_mt_wb(void)
1214 {
1215         return vmx_capability.ept & VMX_EPTP_WB_BIT;
1216 }
1217
1218 static inline bool cpu_has_vmx_ept_5levels(void)
1219 {
1220         return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT;
1221 }
1222
1223 static inline bool cpu_has_vmx_ept_ad_bits(void)
1224 {
1225         return vmx_capability.ept & VMX_EPT_AD_BIT;
1226 }
1227
1228 static inline bool cpu_has_vmx_invept_context(void)
1229 {
1230         return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
1231 }
1232
1233 static inline bool cpu_has_vmx_invept_global(void)
1234 {
1235         return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
1236 }
1237
1238 static inline bool cpu_has_vmx_invvpid_single(void)
1239 {
1240         return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
1241 }
1242
1243 static inline bool cpu_has_vmx_invvpid_global(void)
1244 {
1245         return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
1246 }
1247
1248 static inline bool cpu_has_vmx_invvpid(void)
1249 {
1250         return vmx_capability.vpid & VMX_VPID_INVVPID_BIT;
1251 }
1252
1253 static inline bool cpu_has_vmx_ept(void)
1254 {
1255         return vmcs_config.cpu_based_2nd_exec_ctrl &
1256                 SECONDARY_EXEC_ENABLE_EPT;
1257 }
1258
1259 static inline bool cpu_has_vmx_unrestricted_guest(void)
1260 {
1261         return vmcs_config.cpu_based_2nd_exec_ctrl &
1262                 SECONDARY_EXEC_UNRESTRICTED_GUEST;
1263 }
1264
1265 static inline bool cpu_has_vmx_ple(void)
1266 {
1267         return vmcs_config.cpu_based_2nd_exec_ctrl &
1268                 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
1269 }
1270
1271 static inline bool cpu_has_vmx_basic_inout(void)
1272 {
1273         return  (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
1274 }
1275
1276 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
1277 {
1278         return flexpriority_enabled && lapic_in_kernel(vcpu);
1279 }
1280
1281 static inline bool cpu_has_vmx_vpid(void)
1282 {
1283         return vmcs_config.cpu_based_2nd_exec_ctrl &
1284                 SECONDARY_EXEC_ENABLE_VPID;
1285 }
1286
1287 static inline bool cpu_has_vmx_rdtscp(void)
1288 {
1289         return vmcs_config.cpu_based_2nd_exec_ctrl &
1290                 SECONDARY_EXEC_RDTSCP;
1291 }
1292
1293 static inline bool cpu_has_vmx_invpcid(void)
1294 {
1295         return vmcs_config.cpu_based_2nd_exec_ctrl &
1296                 SECONDARY_EXEC_ENABLE_INVPCID;
1297 }
1298
1299 static inline bool cpu_has_virtual_nmis(void)
1300 {
1301         return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
1302 }
1303
1304 static inline bool cpu_has_vmx_wbinvd_exit(void)
1305 {
1306         return vmcs_config.cpu_based_2nd_exec_ctrl &
1307                 SECONDARY_EXEC_WBINVD_EXITING;
1308 }
1309
1310 static inline bool cpu_has_vmx_shadow_vmcs(void)
1311 {
1312         u64 vmx_msr;
1313         rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
1314         /* check if the cpu supports writing r/o exit information fields */
1315         if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
1316                 return false;
1317
1318         return vmcs_config.cpu_based_2nd_exec_ctrl &
1319                 SECONDARY_EXEC_SHADOW_VMCS;
1320 }
1321
1322 static inline bool cpu_has_vmx_pml(void)
1323 {
1324         return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
1325 }
1326
1327 static inline bool cpu_has_vmx_tsc_scaling(void)
1328 {
1329         return vmcs_config.cpu_based_2nd_exec_ctrl &
1330                 SECONDARY_EXEC_TSC_SCALING;
1331 }
1332
1333 static inline bool cpu_has_vmx_vmfunc(void)
1334 {
1335         return vmcs_config.cpu_based_2nd_exec_ctrl &
1336                 SECONDARY_EXEC_ENABLE_VMFUNC;
1337 }
1338
1339 static inline bool report_flexpriority(void)
1340 {
1341         return flexpriority_enabled;
1342 }
1343
1344 static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
1345 {
1346         return vmx_misc_cr3_count(to_vmx(vcpu)->nested.nested_vmx_misc_low);
1347 }
1348
1349 static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
1350 {
1351         return vmcs12->cpu_based_vm_exec_control & bit;
1352 }
1353
1354 static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
1355 {
1356         return (vmcs12->cpu_based_vm_exec_control &
1357                         CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
1358                 (vmcs12->secondary_vm_exec_control & bit);
1359 }
1360
1361 static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
1362 {
1363         return vmcs12->pin_based_vm_exec_control &
1364                 PIN_BASED_VMX_PREEMPTION_TIMER;
1365 }
1366
1367 static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
1368 {
1369         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
1370 }
1371
1372 static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
1373 {
1374         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
1375 }
1376
1377 static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12)
1378 {
1379         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML);
1380 }
1381
1382 static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
1383 {
1384         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
1385 }
1386
1387 static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12)
1388 {
1389         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID);
1390 }
1391
1392 static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
1393 {
1394         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
1395 }
1396
1397 static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
1398 {
1399         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
1400 }
1401
1402 static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
1403 {
1404         return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
1405 }
1406
1407 static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
1408 {
1409         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
1410 }
1411
1412 static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
1413 {
1414         return nested_cpu_has_vmfunc(vmcs12) &&
1415                 (vmcs12->vm_function_control &
1416                  VMX_VMFUNC_EPTP_SWITCHING);
1417 }
1418
1419 static inline bool is_nmi(u32 intr_info)
1420 {
1421         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
1422                 == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
1423 }
1424
1425 static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
1426                               u32 exit_intr_info,
1427                               unsigned long exit_qualification);
1428 static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
1429                         struct vmcs12 *vmcs12,
1430                         u32 reason, unsigned long qualification);
1431
1432 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
1433 {
1434         int i;
1435
1436         for (i = 0; i < vmx->nmsrs; ++i)
1437                 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
1438                         return i;
1439         return -1;
1440 }
1441
1442 static inline void __invvpid(int ext, u16 vpid, gva_t gva)
1443 {
1444         struct {
1445                 u64 vpid : 16;
1446                 u64 rsvd : 48;
1447                 u64 gva;
1448         } operand = { vpid, 0, gva };
1449
1450         asm volatile (__ex(ASM_VMX_INVVPID)
1451                         /* CF==1 or ZF==1 --> rc = -1 */
1452                         "; ja 1f ; ud2 ; 1:"
1453                         : : "a"(&operand), "c"(ext) : "cc", "memory");
1454 }
1455
1456 static inline void __invept(int ext, u64 eptp, gpa_t gpa)
1457 {
1458         struct {
1459                 u64 eptp, gpa;
1460         } operand = {eptp, gpa};
1461
1462         asm volatile (__ex(ASM_VMX_INVEPT)
1463                         /* CF==1 or ZF==1 --> rc = -1 */
1464                         "; ja 1f ; ud2 ; 1:\n"
1465                         : : "a" (&operand), "c" (ext) : "cc", "memory");
1466 }
1467
1468 static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
1469 {
1470         int i;
1471
1472         i = __find_msr_index(vmx, msr);
1473         if (i >= 0)
1474                 return &vmx->guest_msrs[i];
1475         return NULL;
1476 }
1477
1478 static void vmcs_clear(struct vmcs *vmcs)
1479 {
1480         u64 phys_addr = __pa(vmcs);
1481         u8 error;
1482
1483         asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
1484                       : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
1485                       : "cc", "memory");
1486         if (error)
1487                 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
1488                        vmcs, phys_addr);
1489 }
1490
1491 static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
1492 {
1493         vmcs_clear(loaded_vmcs->vmcs);
1494         if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
1495                 vmcs_clear(loaded_vmcs->shadow_vmcs);
1496         loaded_vmcs->cpu = -1;
1497         loaded_vmcs->launched = 0;
1498 }
1499
1500 static void vmcs_load(struct vmcs *vmcs)
1501 {
1502         u64 phys_addr = __pa(vmcs);
1503         u8 error;
1504
1505         asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
1506                         : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
1507                         : "cc", "memory");
1508         if (error)
1509                 printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
1510                        vmcs, phys_addr);
1511 }
1512
1513 #ifdef CONFIG_KEXEC_CORE
1514 /*
1515  * This bitmap is used to indicate whether the vmclear
1516  * operation is enabled on each cpu. All cpus are
1517  * disabled by default.
1518  */
1519 static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
1520
1521 static inline void crash_enable_local_vmclear(int cpu)
1522 {
1523         cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
1524 }
1525
1526 static inline void crash_disable_local_vmclear(int cpu)
1527 {
1528         cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
1529 }
1530
1531 static inline int crash_local_vmclear_enabled(int cpu)
1532 {
1533         return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
1534 }
1535
1536 static void crash_vmclear_local_loaded_vmcss(void)
1537 {
1538         int cpu = raw_smp_processor_id();
1539         struct loaded_vmcs *v;
1540
1541         if (!crash_local_vmclear_enabled(cpu))
1542                 return;
1543
1544         list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
1545                             loaded_vmcss_on_cpu_link)
1546                 vmcs_clear(v->vmcs);
1547 }
1548 #else
1549 static inline void crash_enable_local_vmclear(int cpu) { }
1550 static inline void crash_disable_local_vmclear(int cpu) { }
1551 #endif /* CONFIG_KEXEC_CORE */
1552
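/*
 * Runs on the cpu where 'arg' (a struct loaded_vmcs) is loaded, usually
 * via the IPI sent by loaded_vmcs_clear(): unlink it from that cpu's
 * loaded_vmcss_on_cpu list and VMCLEAR it.
 */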
1553 static void __loaded_vmcs_clear(void *arg)
1554 {
1555         struct loaded_vmcs *loaded_vmcs = arg;
1556         int cpu = raw_smp_processor_id();
1557
1558         if (loaded_vmcs->cpu != cpu)
1559                 return; /* vcpu migration can race with cpu offline */
1560         if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
1561                 per_cpu(current_vmcs, cpu) = NULL;
1562         crash_disable_local_vmclear(cpu);
1563         list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
1564
1565         /*
1566          * Ensure that the update to loaded_vmcs->loaded_vmcss_on_cpu_link
1567          * happens before loaded_vmcs->cpu is set to -1 in loaded_vmcs_init.
1568          * Otherwise, another cpu could see cpu == -1 first and then add
1569          * the vmcs to the percpu list before it is deleted here.
1570          */
1571         smp_wmb();
1572
1573         loaded_vmcs_init(loaded_vmcs);
1574         crash_enable_local_vmclear(cpu);
1575 }
1576
1577 static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
1578 {
1579         int cpu = loaded_vmcs->cpu;
1580
1581         if (cpu != -1)
1582                 smp_call_function_single(cpu,
1583                          __loaded_vmcs_clear, loaded_vmcs, 1);
1584 }
1585
1586 static inline void vpid_sync_vcpu_single(int vpid)
1587 {
1588         if (vpid == 0)
1589                 return;
1590
1591         if (cpu_has_vmx_invvpid_single())
1592                 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
1593 }
1594
1595 static inline void vpid_sync_vcpu_global(void)
1596 {
1597         if (cpu_has_vmx_invvpid_global())
1598                 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
1599 }
1600
1601 static inline void vpid_sync_context(int vpid)
1602 {
1603         if (cpu_has_vmx_invvpid_single())
1604                 vpid_sync_vcpu_single(vpid);
1605         else
1606                 vpid_sync_vcpu_global();
1607 }
1608
1609 static inline void ept_sync_global(void)
1610 {
1611         __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
1612 }
1613
1614 static inline void ept_sync_context(u64 eptp)
1615 {
1616         if (cpu_has_vmx_invept_context())
1617                 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
1618         else
1619                 ept_sync_global();
1620 }
1621
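/*
 * Compile-time checks that a VMCS accessor matches the width encoded in
 * the field number: bits 14:13 of the encoding give the field width
 * (0 = 16-bit, 1 = 64-bit, 2 = 32-bit, 3 = natural width) and bit 0
 * selects the high half of a 64-bit field, hence the 0x6000/0x6001
 * masks below.
 */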
1622 static __always_inline void vmcs_check16(unsigned long field)
1623 {
1624         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
1625                          "16-bit accessor invalid for 64-bit field");
1626         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
1627                          "16-bit accessor invalid for 64-bit high field");
1628         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
1629                          "16-bit accessor invalid for 32-bit field");
1630         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
1631                          "16-bit accessor invalid for natural width field");
1632 }
1633
1634 static __always_inline void vmcs_check32(unsigned long field)
1635 {
1636         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
1637                          "32-bit accessor invalid for 16-bit field");
1638         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
1639                          "32-bit accessor invalid for natural width field");
1640 }
1641
1642 static __always_inline void vmcs_check64(unsigned long field)
1643 {
1644         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
1645                          "64-bit accessor invalid for 16-bit field");
1646         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
1647                          "64-bit accessor invalid for 64-bit high field");
1648         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
1649                          "64-bit accessor invalid for 32-bit field");
1650         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
1651                          "64-bit accessor invalid for natural width field");
1652 }
1653
1654 static __always_inline void vmcs_checkl(unsigned long field)
1655 {
1656         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
1657                          "Natural width accessor invalid for 16-bit field");
1658         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
1659                          "Natural width accessor invalid for 64-bit field");
1660         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
1661                          "Natural width accessor invalid for 64-bit high field");
1662         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
1663                          "Natural width accessor invalid for 32-bit field");
1664 }
1665
1666 static __always_inline unsigned long __vmcs_readl(unsigned long field)
1667 {
1668         unsigned long value;
1669
1670         asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
1671                       : "=a"(value) : "d"(field) : "cc");
1672         return value;
1673 }
1674
1675 static __always_inline u16 vmcs_read16(unsigned long field)
1676 {
1677         vmcs_check16(field);
1678         return __vmcs_readl(field);
1679 }
1680
1681 static __always_inline u32 vmcs_read32(unsigned long field)
1682 {
1683         vmcs_check32(field);
1684         return __vmcs_readl(field);
1685 }
1686
1687 static __always_inline u64 vmcs_read64(unsigned long field)
1688 {
1689         vmcs_check64(field);
1690 #ifdef CONFIG_X86_64
1691         return __vmcs_readl(field);
1692 #else
1693         return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32);
1694 #endif
1695 }
1696
1697 static __always_inline unsigned long vmcs_readl(unsigned long field)
1698 {
1699         vmcs_checkl(field);
1700         return __vmcs_readl(field);
1701 }
1702
1703 static noinline void vmwrite_error(unsigned long field, unsigned long value)
1704 {
1705         printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
1706                field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
1707         dump_stack();
1708 }
1709
1710 static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
1711 {
1712         u8 error;
1713
1714         asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
1715                        : "=q"(error) : "a"(value), "d"(field) : "cc");
1716         if (unlikely(error))
1717                 vmwrite_error(field, value);
1718 }
1719
1720 static __always_inline void vmcs_write16(unsigned long field, u16 value)
1721 {
1722         vmcs_check16(field);
1723         __vmcs_writel(field, value);
1724 }
1725
1726 static __always_inline void vmcs_write32(unsigned long field, u32 value)
1727 {
1728         vmcs_check32(field);
1729         __vmcs_writel(field, value);
1730 }
1731
1732 static __always_inline void vmcs_write64(unsigned long field, u64 value)
1733 {
1734         vmcs_check64(field);
1735         __vmcs_writel(field, value);
1736 #ifndef CONFIG_X86_64
1737         asm volatile ("");
1738         __vmcs_writel(field+1, value >> 32);
1739 #endif
1740 }
1741
1742 static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
1743 {
1744         vmcs_checkl(field);
1745         __vmcs_writel(field, value);
1746 }
1747
1748 static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
1749 {
1750         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
1751                          "vmcs_clear_bits does not support 64-bit fields");
1752         __vmcs_writel(field, __vmcs_readl(field) & ~mask);
1753 }
1754
1755 static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
1756 {
1757         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
1758                          "vmcs_set_bits does not support 64-bit fields");
1759         __vmcs_writel(field, __vmcs_readl(field) | mask);
1760 }
1761
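/*
 * VM_ENTRY_CONTROLS and VM_EXIT_CONTROLS are shadowed in struct
 * vcpu_vmx so that the setbit/clearbit helpers can skip the VMWRITE
 * when the value does not actually change.
 */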
1762 static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx)
1763 {
1764         vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS);
1765 }
1766
1767 static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
1768 {
1769         vmcs_write32(VM_ENTRY_CONTROLS, val);
1770         vmx->vm_entry_controls_shadow = val;
1771 }
1772
1773 static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
1774 {
1775         if (vmx->vm_entry_controls_shadow != val)
1776                 vm_entry_controls_init(vmx, val);
1777 }
1778
1779 static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
1780 {
1781         return vmx->vm_entry_controls_shadow;
1782 }
1783
1784
1785 static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
1786 {
1787         vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
1788 }
1789
1790 static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
1791 {
1792         vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
1793 }
1794
1795 static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx)
1796 {
1797         vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS);
1798 }
1799
1800 static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
1801 {
1802         vmcs_write32(VM_EXIT_CONTROLS, val);
1803         vmx->vm_exit_controls_shadow = val;
1804 }
1805
1806 static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
1807 {
1808         if (vmx->vm_exit_controls_shadow != val)
1809                 vm_exit_controls_init(vmx, val);
1810 }
1811
1812 static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
1813 {
1814         return vmx->vm_exit_controls_shadow;
1815 }
1816
1817
1818 static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
1819 {
1820         vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
1821 }
1822
1823 static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
1824 {
1825         vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
1826 }
1827
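/*
 * Guest segment fields read from the VMCS are cached per vcpu.  The
 * cache is valid only while VCPU_EXREG_SEGMENTS is set in regs_avail;
 * vmx_segment_cache_test_set() reports whether a field is already
 * cached and marks it as cached for subsequent reads.
 */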
1828 static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
1829 {
1830         vmx->segment_cache.bitmask = 0;
1831 }
1832
1833 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
1834                                        unsigned field)
1835 {
1836         bool ret;
1837         u32 mask = 1 << (seg * SEG_FIELD_NR + field);
1838
1839         if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
1840                 vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
1841                 vmx->segment_cache.bitmask = 0;
1842         }
1843         ret = vmx->segment_cache.bitmask & mask;
1844         vmx->segment_cache.bitmask |= mask;
1845         return ret;
1846 }
1847
1848 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
1849 {
1850         u16 *p = &vmx->segment_cache.seg[seg].selector;
1851
1852         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
1853                 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
1854         return *p;
1855 }
1856
1857 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
1858 {
1859         ulong *p = &vmx->segment_cache.seg[seg].base;
1860
1861         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
1862                 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
1863         return *p;
1864 }
1865
1866 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
1867 {
1868         u32 *p = &vmx->segment_cache.seg[seg].limit;
1869
1870         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
1871                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
1872         return *p;
1873 }
1874
1875 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
1876 {
1877         u32 *p = &vmx->segment_cache.seg[seg].ar;
1878
1879         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
1880                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
1881         return *p;
1882 }
1883
1884 static void update_exception_bitmap(struct kvm_vcpu *vcpu)
1885 {
1886         u32 eb;
1887
1888         eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
1889              (1u << DB_VECTOR) | (1u << AC_VECTOR);
1890         if ((vcpu->guest_debug &
1891              (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
1892             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
1893                 eb |= 1u << BP_VECTOR;
1894         if (to_vmx(vcpu)->rmode.vm86_active)
1895                 eb = ~0;
1896         if (enable_ept)
1897                 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
1898
1899         /* When we are running a nested L2 guest and L1 specified for it a
1900          * certain exception bitmap, we must trap the same exceptions and pass
1901          * them to L1. When running L2, we will only handle the exceptions
1902          * specified above if L1 did not want them.
1903          */
1904         if (is_guest_mode(vcpu))
1905                 eb |= get_vmcs12(vcpu)->exception_bitmap;
1906
1907         vmcs_write32(EXCEPTION_BITMAP, eb);
1908 }
1909
1910 static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1911                 unsigned long entry, unsigned long exit)
1912 {
1913         vm_entry_controls_clearbit(vmx, entry);
1914         vm_exit_controls_clearbit(vmx, exit);
1915 }
1916
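/*
 * MSRs switched atomically around VM entry/exit: EFER and
 * PERF_GLOBAL_CTRL can use dedicated VM-entry/VM-exit controls when the
 * CPU supports them; all other MSRs go through the autoload lists in
 * vmx->msr_autoload, whose length is mirrored into
 * VM_ENTRY_MSR_LOAD_COUNT and VM_EXIT_MSR_LOAD_COUNT.
 */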
1917 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
1918 {
1919         unsigned i;
1920         struct msr_autoload *m = &vmx->msr_autoload;
1921
1922         switch (msr) {
1923         case MSR_EFER:
1924                 if (cpu_has_load_ia32_efer) {
1925                         clear_atomic_switch_msr_special(vmx,
1926                                         VM_ENTRY_LOAD_IA32_EFER,
1927                                         VM_EXIT_LOAD_IA32_EFER);
1928                         return;
1929                 }
1930                 break;
1931         case MSR_CORE_PERF_GLOBAL_CTRL:
1932                 if (cpu_has_load_perf_global_ctrl) {
1933                         clear_atomic_switch_msr_special(vmx,
1934                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1935                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
1936                         return;
1937                 }
1938                 break;
1939         }
1940
1941         for (i = 0; i < m->nr; ++i)
1942                 if (m->guest[i].index == msr)
1943                         break;
1944
1945         if (i == m->nr)
1946                 return;
1947         --m->nr;
1948         m->guest[i] = m->guest[m->nr];
1949         m->host[i] = m->host[m->nr];
1950         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
1951         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
1952 }
1953
1954 static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1955                 unsigned long entry, unsigned long exit,
1956                 unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
1957                 u64 guest_val, u64 host_val)
1958 {
1959         vmcs_write64(guest_val_vmcs, guest_val);
1960         vmcs_write64(host_val_vmcs, host_val);
1961         vm_entry_controls_setbit(vmx, entry);
1962         vm_exit_controls_setbit(vmx, exit);
1963 }
1964
1965 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
1966                                   u64 guest_val, u64 host_val)
1967 {
1968         unsigned i;
1969         struct msr_autoload *m = &vmx->msr_autoload;
1970
1971         switch (msr) {
1972         case MSR_EFER:
1973                 if (cpu_has_load_ia32_efer) {
1974                         add_atomic_switch_msr_special(vmx,
1975                                         VM_ENTRY_LOAD_IA32_EFER,
1976                                         VM_EXIT_LOAD_IA32_EFER,
1977                                         GUEST_IA32_EFER,
1978                                         HOST_IA32_EFER,
1979                                         guest_val, host_val);
1980                         return;
1981                 }
1982                 break;
1983         case MSR_CORE_PERF_GLOBAL_CTRL:
1984                 if (cpu_has_load_perf_global_ctrl) {
1985                         add_atomic_switch_msr_special(vmx,
1986                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1987                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
1988                                         GUEST_IA32_PERF_GLOBAL_CTRL,
1989                                         HOST_IA32_PERF_GLOBAL_CTRL,
1990                                         guest_val, host_val);
1991                         return;
1992                 }
1993                 break;
1994         case MSR_IA32_PEBS_ENABLE:
1995                 /* PEBS needs a quiescent period after being disabled (to write
1996                  * a record).  Disabling PEBS through VMX MSR swapping doesn't
1997                  * provide that period, so a CPU could write host's record into
1998                  * guest's memory.
1999                  */
2000                 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
2001         }
2002
2003         for (i = 0; i < m->nr; ++i)
2004                 if (m->guest[i].index == msr)
2005                         break;
2006
2007         if (i == NR_AUTOLOAD_MSRS) {
2008                 printk_once(KERN_WARNING "Not enough msr switch entries. "
2009                                 "Can't add msr %x\n", msr);
2010                 return;
2011         } else if (i == m->nr) {
2012                 ++m->nr;
2013                 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
2014                 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
2015         }
2016
2017         m->guest[i].index = msr;
2018         m->guest[i].value = guest_val;
2019         m->host[i].index = msr;
2020         m->host[i].value = host_val;
2021 }
2022
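/*
 * Decide how guest EFER is handled across VM entry/exit: returns false
 * if EFER is switched atomically via the controls/autoload machinery,
 * or true if it should go through the shared-MSR path, in which case
 * the value (with the ignored bits taken from the host) is stored in
 * guest_msrs[efer_offset].
 */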
2023 static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
2024 {
2025         u64 guest_efer = vmx->vcpu.arch.efer;
2026         u64 ignore_bits = 0;
2027
2028         if (!enable_ept) {
2029                 /*
2030                  * NX is needed to handle CR0.WP=1, CR4.SMEP=1.  Testing
2031                  * host CPUID is more efficient than testing guest CPUID
2032                  * or CR4.  Host SMEP is anyway a requirement for guest SMEP.
2033                  */
2034                 if (boot_cpu_has(X86_FEATURE_SMEP))
2035                         guest_efer |= EFER_NX;
2036                 else if (!(guest_efer & EFER_NX))
2037                         ignore_bits |= EFER_NX;
2038         }
2039
2040         /*
2041          * LMA and LME handled by hardware; SCE meaningless outside long mode.
2042          */
2043         ignore_bits |= EFER_SCE;
2044 #ifdef CONFIG_X86_64
2045         ignore_bits |= EFER_LMA | EFER_LME;
2046         /* SCE is meaningful only in long mode on Intel */
2047         if (guest_efer & EFER_LMA)
2048                 ignore_bits &= ~(u64)EFER_SCE;
2049 #endif
2050
2051         clear_atomic_switch_msr(vmx, MSR_EFER);
2052
2053         /*
2054          * On EPT, we can't emulate NX, so we must switch EFER atomically.
2055          * On CPUs that support "load IA32_EFER", always switch EFER
2056          * atomically, since it's faster than switching it manually.
2057          */
2058         if (cpu_has_load_ia32_efer ||
2059             (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
2060                 if (!(guest_efer & EFER_LMA))
2061                         guest_efer &= ~EFER_LME;
2062                 if (guest_efer != host_efer)
2063                         add_atomic_switch_msr(vmx, MSR_EFER,
2064                                               guest_efer, host_efer);
2065                 return false;
2066         } else {
2067                 guest_efer &= ~ignore_bits;
2068                 guest_efer |= host_efer & ignore_bits;
2069
2070                 vmx->guest_msrs[efer_offset].data = guest_efer;
2071                 vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
2072
2073                 return true;
2074         }
2075 }
2076
2077 #ifdef CONFIG_X86_32
2078 /*
2079  * On 32-bit kernels, VM exits still load the FS and GS bases from the
2080  * VMCS rather than the segment table.  KVM uses this helper to figure
2081  * out the current bases to poke them into the VMCS before entry.
2082  */
2083 static unsigned long segment_base(u16 selector)
2084 {
2085         struct desc_struct *table;
2086         unsigned long v;
2087
2088         if (!(selector & ~SEGMENT_RPL_MASK))
2089                 return 0;
2090
2091         table = get_current_gdt_ro();
2092
2093         if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
2094                 u16 ldt_selector = kvm_read_ldt();
2095
2096                 if (!(ldt_selector & ~SEGMENT_RPL_MASK))
2097                         return 0;
2098
2099                 table = (struct desc_struct *)segment_base(ldt_selector);
2100         }
2101         v = get_desc_base(&table[selector >> 3]);
2102         return v;
2103 }
2104 #endif
2105
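/*
 * Save the host segment selectors and MSRs that running the guest can
 * clobber, and switch the shared MSRs to their guest values.  Done at
 * most once between vcpu_load and vcpu_put; __vmx_load_host_state()
 * restores the saved state.
 */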
2106 static void vmx_save_host_state(struct kvm_vcpu *vcpu)
2107 {
2108         struct vcpu_vmx *vmx = to_vmx(vcpu);
2109         int i;
2110
2111         if (vmx->host_state.loaded)
2112                 return;
2113
2114         vmx->host_state.loaded = 1;
2115         /*
2116          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
2117          * allow segment selectors with cpl > 0 or ti == 1.
2118          */
2119         vmx->host_state.ldt_sel = kvm_read_ldt();
2120         vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
2121         savesegment(fs, vmx->host_state.fs_sel);
2122         if (!(vmx->host_state.fs_sel & 7)) {
2123                 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
2124                 vmx->host_state.fs_reload_needed = 0;
2125         } else {
2126                 vmcs_write16(HOST_FS_SELECTOR, 0);
2127                 vmx->host_state.fs_reload_needed = 1;
2128         }
2129         savesegment(gs, vmx->host_state.gs_sel);
2130         if (!(vmx->host_state.gs_sel & 7))
2131                 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
2132         else {
2133                 vmcs_write16(HOST_GS_SELECTOR, 0);
2134                 vmx->host_state.gs_ldt_reload_needed = 1;
2135         }
2136
2137 #ifdef CONFIG_X86_64
2138         savesegment(ds, vmx->host_state.ds_sel);
2139         savesegment(es, vmx->host_state.es_sel);
2140 #endif
2141
2142 #ifdef CONFIG_X86_64
2143         vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
2144         vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
2145 #else
2146         vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
2147         vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
2148 #endif
2149
2150 #ifdef CONFIG_X86_64
2151         rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
2152         if (is_long_mode(&vmx->vcpu))
2153                 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
2154 #endif
2155         if (boot_cpu_has(X86_FEATURE_MPX))
2156                 rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
2157         for (i = 0; i < vmx->save_nmsrs; ++i)
2158                 kvm_set_shared_msr(vmx->guest_msrs[i].index,
2159                                    vmx->guest_msrs[i].data,
2160                                    vmx->guest_msrs[i].mask);
2161 }
2162
2163 static void __vmx_load_host_state(struct vcpu_vmx *vmx)
2164 {
2165         if (!vmx->host_state.loaded)
2166                 return;
2167
2168         ++vmx->vcpu.stat.host_state_reload;
2169         vmx->host_state.loaded = 0;
2170 #ifdef CONFIG_X86_64
2171         if (is_long_mode(&vmx->vcpu))
2172                 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
2173 #endif
2174         if (vmx->host_state.gs_ldt_reload_needed) {
2175                 kvm_load_ldt(vmx->host_state.ldt_sel);
2176 #ifdef CONFIG_X86_64
2177                 load_gs_index(vmx->host_state.gs_sel);
2178 #else
2179                 loadsegment(gs, vmx->host_state.gs_sel);
2180 #endif
2181         }
2182         if (vmx->host_state.fs_reload_needed)
2183                 loadsegment(fs, vmx->host_state.fs_sel);
2184 #ifdef CONFIG_X86_64
2185         if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
2186                 loadsegment(ds, vmx->host_state.ds_sel);
2187                 loadsegment(es, vmx->host_state.es_sel);
2188         }
2189 #endif
2190         invalidate_tss_limit();
2191 #ifdef CONFIG_X86_64
2192         wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
2193 #endif
2194         if (vmx->host_state.msr_host_bndcfgs)
2195                 wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
2196         load_fixmap_gdt(raw_smp_processor_id());
2197 }
2198
2199 static void vmx_load_host_state(struct vcpu_vmx *vmx)
2200 {
2201         preempt_disable();
2202         __vmx_load_host_state(vmx);
2203         preempt_enable();
2204 }
2205
2206 static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
2207 {
2208         struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
2209         struct pi_desc old, new;
2210         unsigned int dest;
2211
2212         /*
2213          * In case of hot-plug or hot-unplug, we may have to undo
2214          * vmx_vcpu_pi_put even if there is no assigned device.  And we
2215          * always keep PI.NDST up to date for simplicity: it makes the
2216          * code easier, and CPU migration is not a fast path.
2217          */
2218         if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
2219                 return;
2220
2221         /*
2222          * First handle the simple case where no cmpxchg is necessary; just
2223          * allow posting non-urgent interrupts.
2224          *
2225          * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
2226          * PI.NDST: pi_post_block will do it for us and the wakeup_handler
2227          * expects the VCPU to be on the blocked_vcpu_list that matches
2228          * PI.NDST.
2229          */
2230         if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
2231             vcpu->cpu == cpu) {
2232                 pi_clear_sn(pi_desc);
2233                 return;
2234         }
2235
2236         /* The full case.  */
2237         do {
2238                 old.control = new.control = pi_desc->control;
2239
2240                 dest = cpu_physical_id(cpu);
2241
2242                 if (x2apic_enabled())
2243                         new.ndst = dest;
2244                 else
2245                         new.ndst = (dest << 8) & 0xFF00;
2246
2247                 new.sn = 0;
2248         } while (cmpxchg64(&pi_desc->control, old.control,
2249                            new.control) != old.control);
2250 }
2251
2252 static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
2253 {
2254         vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
2255         vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
2256 }
2257
2258 /*
2259  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
2260  * vcpu mutex is already taken.
2261  */
2262 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2263 {
2264         struct vcpu_vmx *vmx = to_vmx(vcpu);
2265         bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
2266
2267         if (!already_loaded) {
2268                 loaded_vmcs_clear(vmx->loaded_vmcs);
2269                 local_irq_disable();
2270                 crash_disable_local_vmclear(cpu);
2271
2272                 /*
2273                  * The read of loaded_vmcs->cpu must happen before fetching
2274                  * loaded_vmcs->loaded_vmcss_on_cpu_link.
2275                  * See the comments in __loaded_vmcs_clear().
2276                  */
2277                 smp_rmb();
2278
2279                 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
2280                          &per_cpu(loaded_vmcss_on_cpu, cpu));
2281                 crash_enable_local_vmclear(cpu);
2282                 local_irq_enable();
2283         }
2284
2285         if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
2286                 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
2287                 vmcs_load(vmx->loaded_vmcs->vmcs);
2288         }
2289
2290         if (!already_loaded) {
2291                 void *gdt = get_current_gdt_ro();
2292                 unsigned long sysenter_esp;
2293
2294                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2295
2296                 /*
2297                  * Linux uses per-cpu TSS and GDT, so set these when switching
2298                  * processors.  See 22.2.4.
2299                  */
2300                 vmcs_writel(HOST_TR_BASE,
2301                             (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
2302                 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */
2303
2304                 /*
2305                  * VM exits change the host TR limit to 0x67.  This is
2306                  * okay, since 0x67 covers everything except the IO
2307                  * bitmap, and we have code to handle the IO bitmap
2308                  * being lost after a VM exit.
2309                  */
2310                 BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
2311
2312                 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
2313                 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
2314
2315                 vmx->loaded_vmcs->cpu = cpu;
2316         }
2317
2318         /* Setup TSC multiplier */
2319         if (kvm_has_tsc_control &&
2320             vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
2321                 decache_tsc_multiplier(vmx);
2322
2323         vmx_vcpu_pi_load(vcpu, cpu);
2324         vmx->host_pkru = read_pkru();
2325 }
2326
2327 static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
2328 {
2329         struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
2330
2331         if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
2332                 !irq_remapping_cap(IRQ_POSTING_CAP)  ||
2333                 !kvm_vcpu_apicv_active(vcpu))
2334                 return;
2335
2336         /* Set SN when the vCPU is preempted */
2337         if (vcpu->preempted)
2338                 pi_set_sn(pi_desc);
2339 }
2340
2341 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
2342 {
2343         vmx_vcpu_pi_put(vcpu);
2344
2345         __vmx_load_host_state(to_vmx(vcpu));
2346 }
2347
2348 static bool emulation_required(struct kvm_vcpu *vcpu)
2349 {
2350         return emulate_invalid_guest_state && !guest_state_valid(vcpu);
2351 }
2352
2353 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
2354
2355 /*
2356  * Return the cr0 value that a nested guest would read. This is a combination
2357  * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
2358  * its hypervisor (cr0_read_shadow).
2359  */
2360 static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
2361 {
2362         return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
2363                 (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
2364 }
2365 static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
2366 {
2367         return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
2368                 (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
2369 }
2370
2371 static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
2372 {
2373         unsigned long rflags, save_rflags;
2374
2375         if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
2376                 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
2377                 rflags = vmcs_readl(GUEST_RFLAGS);
2378                 if (to_vmx(vcpu)->rmode.vm86_active) {
2379                         rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
2380                         save_rflags = to_vmx(vcpu)->rmode.save_rflags;
2381                         rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
2382                 }
2383                 to_vmx(vcpu)->rflags = rflags;
2384         }
2385         return to_vmx(vcpu)->rflags;
2386 }
2387
2388 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
2389 {
2390         unsigned long old_rflags = vmx_get_rflags(vcpu);
2391
2392         __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
2393         to_vmx(vcpu)->rflags = rflags;
2394         if (to_vmx(vcpu)->rmode.vm86_active) {
2395                 to_vmx(vcpu)->rmode.save_rflags = rflags;
2396                 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
2397         }
2398         vmcs_writel(GUEST_RFLAGS, rflags);
2399
2400         if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM)
2401                 to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
2402 }
2403
2404 static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
2405 {
2406         u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2407         int ret = 0;
2408
2409         if (interruptibility & GUEST_INTR_STATE_STI)
2410                 ret |= KVM_X86_SHADOW_INT_STI;
2411         if (interruptibility & GUEST_INTR_STATE_MOV_SS)
2412                 ret |= KVM_X86_SHADOW_INT_MOV_SS;
2413
2414         return ret;
2415 }
2416
2417 static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
2418 {
2419         u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2420         u32 interruptibility = interruptibility_old;
2421
2422         interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
2423
2424         if (mask & KVM_X86_SHADOW_INT_MOV_SS)
2425                 interruptibility |= GUEST_INTR_STATE_MOV_SS;
2426         else if (mask & KVM_X86_SHADOW_INT_STI)
2427                 interruptibility |= GUEST_INTR_STATE_STI;
2428
2429         if (interruptibility != interruptibility_old)
2430                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
2431 }
2432
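/*
 * Advance the guest RIP past the instruction that caused the VM exit,
 * using the instruction length recorded by the CPU, and clear any
 * STI/MOV-SS interrupt shadow.
 */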
2433 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
2434 {
2435         unsigned long rip;
2436
2437         rip = kvm_rip_read(vcpu);
2438         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2439         kvm_rip_write(vcpu, rip);
2440
2441         /* skipping an emulated instruction also counts */
2442         vmx_set_interrupt_shadow(vcpu, 0);
2443 }
2444
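/*
 * Reflect an exception that L1 wants to intercept: build the VM-exit
 * interruption info from the queued exception and trigger an
 * EXIT_REASON_EXCEPTION_NMI exit from L2 to L1.
 */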
2445 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
2446                                                unsigned long exit_qual)
2447 {
2448         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2449         unsigned int nr = vcpu->arch.exception.nr;
2450         u32 intr_info = nr | INTR_INFO_VALID_MASK;
2451
2452         if (vcpu->arch.exception.has_error_code) {
2453                 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
2454                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
2455         }
2456
2457         if (kvm_exception_is_soft(nr))
2458                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
2459         else
2460                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
2461
2462         if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
2463             vmx_get_nmi_mask(vcpu))
2464                 intr_info |= INTR_INFO_UNBLOCK_NMI;
2465
2466         nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
2467 }
2468
2469 /*
2470  * KVM wants to inject page-faults which it got to the guest. This function
2471  * checks whether in a nested guest, we need to inject them to L1 or L2.
2472  */
2473 static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
2474 {
2475         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2476         unsigned int nr = vcpu->arch.exception.nr;
2477
2478         if (nr == PF_VECTOR) {
2479                 if (vcpu->arch.exception.nested_apf) {
2480                         *exit_qual = vcpu->arch.apf.nested_apf_token;
2481                         return 1;
2482                 }
2483                 /*
2484                  * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
2485                  * The fix is to add the ancillary datum (CR2 or DR6) to structs
2486                  * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
2487                  * can be written only when inject_pending_event runs.  This should be
2488                  * conditional on a new capability---if the capability is disabled,
2489                  * kvm_multiple_exception would write the ancillary information to
2490                  * CR2 or DR6, for backwards ABI-compatibility.
2491                  */
2492                 if (nested_vmx_is_page_fault_vmexit(vmcs12,
2493                                                     vcpu->arch.exception.error_code)) {
2494                         *exit_qual = vcpu->arch.cr2;
2495                         return 1;
2496                 }
2497         } else {
2498                 if (vmcs12->exception_bitmap & (1u << nr)) {
2499                         if (nr == DB_VECTOR)
2500                                 *exit_qual = vcpu->arch.dr6;
2501                         else
2502                                 *exit_qual = 0;
2503                         return 1;
2504                 }
2505         }
2506
2507         return 0;
2508 }
2509
2510 static void vmx_queue_exception(struct kvm_vcpu *vcpu)
2511 {
2512         struct vcpu_vmx *vmx = to_vmx(vcpu);
2513         unsigned nr = vcpu->arch.exception.nr;
2514         bool has_error_code = vcpu->arch.exception.has_error_code;
2515         u32 error_code = vcpu->arch.exception.error_code;
2516         u32 intr_info = nr | INTR_INFO_VALID_MASK;
2517
2518         if (has_error_code) {
2519                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
2520                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
2521         }
2522
2523         if (vmx->rmode.vm86_active) {
2524                 int inc_eip = 0;
2525                 if (kvm_exception_is_soft(nr))
2526                         inc_eip = vcpu->arch.event_exit_inst_len;
2527                 if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
2528                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2529                 return;
2530         }
2531
2532         if (kvm_exception_is_soft(nr)) {
2533                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2534                              vmx->vcpu.arch.event_exit_inst_len);
2535                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
2536         } else
2537                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
2538
2539         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
2540 }
2541
2542 static bool vmx_rdtscp_supported(void)
2543 {
2544         return cpu_has_vmx_rdtscp();
2545 }
2546
2547 static bool vmx_invpcid_supported(void)
2548 {
2549         return cpu_has_vmx_invpcid() && enable_ept;
2550 }
2551
2552 /*
2553  * Swap MSR entry in host/guest MSR entry array.
2554  */
2555 static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
2556 {
2557         struct shared_msr_entry tmp;
2558
2559         tmp = vmx->guest_msrs[to];
2560         vmx->guest_msrs[to] = vmx->guest_msrs[from];
2561         vmx->guest_msrs[from] = tmp;
2562 }
2563
2564 /*
2565  * Set up the vmcs to automatically save and restore system
2566  * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
2567  * mode, as fiddling with msrs is very expensive.
2568  */
2569 static void setup_msrs(struct vcpu_vmx *vmx)
2570 {
2571         int save_nmsrs, index;
2572
2573         save_nmsrs = 0;
2574 #ifdef CONFIG_X86_64
2575         if (is_long_mode(&vmx->vcpu)) {
2576                 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
2577                 if (index >= 0)
2578                         move_msr_up(vmx, index, save_nmsrs++);
2579                 index = __find_msr_index(vmx, MSR_LSTAR);
2580                 if (index >= 0)
2581                         move_msr_up(vmx, index, save_nmsrs++);
2582                 index = __find_msr_index(vmx, MSR_CSTAR);
2583                 if (index >= 0)
2584                         move_msr_up(vmx, index, save_nmsrs++);
2585                 index = __find_msr_index(vmx, MSR_TSC_AUX);
2586                 if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
2587                         move_msr_up(vmx, index, save_nmsrs++);
2588                 /*
2589                  * MSR_STAR is only needed on long mode guests, and only
2590                  * if efer.sce is enabled.
2591                  */
2592                 index = __find_msr_index(vmx, MSR_STAR);
2593                 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
2594                         move_msr_up(vmx, index, save_nmsrs++);
2595         }
2596 #endif
2597         index = __find_msr_index(vmx, MSR_EFER);
2598         if (index >= 0 && update_transition_efer(vmx, index))
2599                 move_msr_up(vmx, index, save_nmsrs++);
2600
2601         vmx->save_nmsrs = save_nmsrs;
2602
2603         if (cpu_has_vmx_msr_bitmap())
2604                 vmx_update_msr_bitmap(&vmx->vcpu);
2605 }
2606
2607 /*
2608  * reads and returns guest's timestamp counter "register"
2609  * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset
2610  * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3
2611  */
2612 static u64 guest_read_tsc(struct kvm_vcpu *vcpu)
2613 {
2614         u64 host_tsc, tsc_offset;
2615
2616         host_tsc = rdtsc();
2617         tsc_offset = vmcs_read64(TSC_OFFSET);
2618         return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset;
2619 }
2620
2621 /*
2622  * writes 'offset' into guest's timestamp counter offset register
2623  */
2624 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
2625 {
2626         if (is_guest_mode(vcpu)) {
2627                 /*
2628                  * We're here if L1 chose not to trap WRMSR to TSC. According
2629                  * to the spec, this should set L1's TSC; the offset that L1
2630                  * set for L2 remains unchanged, and still needs to be added
2631                  * to the newly set TSC to get L2's TSC.
2632                  */
2633                 struct vmcs12 *vmcs12;
2634                 /* recalculate vmcs02.TSC_OFFSET: */
2635                 vmcs12 = get_vmcs12(vcpu);
2636                 vmcs_write64(TSC_OFFSET, offset +
2637                         (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
2638                          vmcs12->tsc_offset : 0));
2639         } else {
2640                 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
2641                                            vmcs_read64(TSC_OFFSET), offset);
2642                 vmcs_write64(TSC_OFFSET, offset);
2643         }
2644 }
2645
2646 /*
2647  * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
2648  * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
2649  * all guests if the "nested" module option is off, and can also be disabled
2650  * for a single guest by disabling its VMX cpuid bit.
2651  */
2652 static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
2653 {
2654         return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
2655 }
2656
2657 /*
2658  * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
2659  * returned for the various VMX controls MSRs when nested VMX is enabled.
2660  * The same values should also be used to verify that vmcs12 control fields are
2661  * valid during nested entry from L1 to L2.
2662  * Each of these control msrs has a low and high 32-bit half: A low bit is on
2663  * if the corresponding bit in the (32-bit) control field *must* be on, and a
2664  * bit in the high half is on if the corresponding bit in the control field
2665  * may be on. See also vmx_control_verify().
2666  */
2667 static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2668 {
2669         /*
2670          * Note that as a general rule, the high half of the MSRs (bits in
2671          * the control fields which may be 1) should be initialized by the
2672          * intersection of the underlying hardware's MSR (i.e., features which
2673          * can be supported) and the list of features we want to expose -
2674          * because they are known to be properly supported in our code.
2675          * Also, usually, the low half of the MSRs (bits which must be 1) can
2676          * be set to 0, meaning that L1 may turn off any of these bits. The
2677          * reason is that if one of these bits is necessary, it will appear
2678          * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
2679          * fields of vmcs01 and vmcs02, will turn these bits off - and
2680          * nested_vmx_exit_reflected() will not pass related exits to L1.
2681          * These rules have exceptions below.
2682          */
2683
2684         /* pin-based controls */
2685         rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
2686                 vmx->nested.nested_vmx_pinbased_ctls_low,
2687                 vmx->nested.nested_vmx_pinbased_ctls_high);
2688         vmx->nested.nested_vmx_pinbased_ctls_low |=
2689                 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2690         vmx->nested.nested_vmx_pinbased_ctls_high &=
2691                 PIN_BASED_EXT_INTR_MASK |
2692                 PIN_BASED_NMI_EXITING |
2693                 PIN_BASED_VIRTUAL_NMIS;
2694         vmx->nested.nested_vmx_pinbased_ctls_high |=
2695                 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
2696                 PIN_BASED_VMX_PREEMPTION_TIMER;
2697         if (kvm_vcpu_apicv_active(&vmx->vcpu))
2698                 vmx->nested.nested_vmx_pinbased_ctls_high |=
2699                         PIN_BASED_POSTED_INTR;
2700
2701         /* exit controls */
2702         rdmsr(MSR_IA32_VMX_EXIT_CTLS,
2703                 vmx->nested.nested_vmx_exit_ctls_low,
2704                 vmx->nested.nested_vmx_exit_ctls_high);
2705         vmx->nested.nested_vmx_exit_ctls_low =
2706                 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
2707
2708         vmx->nested.nested_vmx_exit_ctls_high &=
2709 #ifdef CONFIG_X86_64
2710                 VM_EXIT_HOST_ADDR_SPACE_SIZE |
2711 #endif
2712                 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
2713         vmx->nested.nested_vmx_exit_ctls_high |=
2714                 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
2715                 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
2716                 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
2717
2718         if (kvm_mpx_supported())
2719                 vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
2720
2721         /* We support free control of debug control saving. */
2722         vmx->nested.nested_vmx_exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
2723
2724         /* entry controls */
2725         rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
2726                 vmx->nested.nested_vmx_entry_ctls_low,
2727                 vmx->nested.nested_vmx_entry_ctls_high);
2728         vmx->nested.nested_vmx_entry_ctls_low =
2729                 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
2730         vmx->nested.nested_vmx_entry_ctls_high &=
2731 #ifdef CONFIG_X86_64
2732                 VM_ENTRY_IA32E_MODE |
2733 #endif
2734                 VM_ENTRY_LOAD_IA32_PAT;
2735         vmx->nested.nested_vmx_entry_ctls_high |=
2736                 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
2737         if (kvm_mpx_supported())
2738                 vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
2739
2740         /* We support free control of debug control loading. */
2741         vmx->nested.nested_vmx_entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
2742
2743         /* cpu-based controls */
2744         rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
2745                 vmx->nested.nested_vmx_procbased_ctls_low,
2746                 vmx->nested.nested_vmx_procbased_ctls_high);
2747         vmx->nested.nested_vmx_procbased_ctls_low =
2748                 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2749         vmx->nested.nested_vmx_procbased_ctls_high &=
2750                 CPU_BASED_VIRTUAL_INTR_PENDING |
2751                 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
2752                 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
2753                 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
2754                 CPU_BASED_CR3_STORE_EXITING |
2755 #ifdef CONFIG_X86_64
2756                 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
2757 #endif
2758                 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
2759                 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
2760                 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
2761                 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
2762                 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2763         /*
2764          * We can allow some features even when not supported by the
2765          * hardware. For example, L1 can specify an MSR bitmap - and we
2766          * can use it to avoid exits to L1 - even when L0 runs L2
2767          * without MSR bitmaps.
2768          */
2769         vmx->nested.nested_vmx_procbased_ctls_high |=
2770                 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
2771                 CPU_BASED_USE_MSR_BITMAPS;
2772
2773         /* We support free control of CR3 access interception. */
2774         vmx->nested.nested_vmx_procbased_ctls_low &=
2775                 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
2776
2777         /*
2778          * secondary cpu-based controls.  Do not include those that
2779          * depend on CPUID bits, they are added later by vmx_cpuid_update.
2780          */
2781         rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
2782                 vmx->nested.nested_vmx_secondary_ctls_low,
2783                 vmx->nested.nested_vmx_secondary_ctls_high);
2784         vmx->nested.nested_vmx_secondary_ctls_low = 0;
2785         vmx->nested.nested_vmx_secondary_ctls_high &=
2786                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2787                 SECONDARY_EXEC_DESC |
2788                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2789                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2790                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2791                 SECONDARY_EXEC_WBINVD_EXITING;
2792
2793         if (enable_ept) {
2794                 /* nested EPT: emulate EPT also to L1 */
2795                 vmx->nested.nested_vmx_secondary_ctls_high |=
2796                         SECONDARY_EXEC_ENABLE_EPT;
2797                 vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
2798                          VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
2799                 if (cpu_has_vmx_ept_execute_only())
2800                         vmx->nested.nested_vmx_ept_caps |=
2801                                 VMX_EPT_EXECUTE_ONLY_BIT;
2802                 vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
2803                 vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
2804                         VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
2805                         VMX_EPT_1GB_PAGE_BIT;
2806                 if (enable_ept_ad_bits) {
2807                         vmx->nested.nested_vmx_secondary_ctls_high |=
2808                                 SECONDARY_EXEC_ENABLE_PML;
2809                         vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
2810                 }
2811         }
2812
2813         if (cpu_has_vmx_vmfunc()) {
2814                 vmx->nested.nested_vmx_secondary_ctls_high |=
2815                         SECONDARY_EXEC_ENABLE_VMFUNC;
2816                 /*
2817                  * Advertise EPTP switching unconditionally
2818                  * since we emulate it
2819                  */
2820                 if (enable_ept)
2821                         vmx->nested.nested_vmx_vmfunc_controls =
2822                                 VMX_VMFUNC_EPTP_SWITCHING;
2823         }
2824
2825         /*
2826          * Old versions of KVM use the single-context version without
2827          * checking for support, so declare that it is supported even
2828          * though it is treated as global context.  The alternative is
2829          * not failing the single-context invvpid, and it is worse.
2830          */
2831         if (enable_vpid) {
2832                 vmx->nested.nested_vmx_secondary_ctls_high |=
2833                         SECONDARY_EXEC_ENABLE_VPID;
2834                 vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
2835                         VMX_VPID_EXTENT_SUPPORTED_MASK;
2836         }
2837
2838         if (enable_unrestricted_guest)
2839                 vmx->nested.nested_vmx_secondary_ctls_high |=
2840                         SECONDARY_EXEC_UNRESTRICTED_GUEST;
2841
2842         /* miscellaneous data */
2843         rdmsr(MSR_IA32_VMX_MISC,
2844                 vmx->nested.nested_vmx_misc_low,
2845                 vmx->nested.nested_vmx_misc_high);
2846         vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
2847         vmx->nested.nested_vmx_misc_low |=
2848                 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
2849                 VMX_MISC_ACTIVITY_HLT;
2850         vmx->nested.nested_vmx_misc_high = 0;
2851
2852         /*
2853          * This MSR reports some information about VMX support. We
2854          * should return information about the VMX we emulate for the
2855          * guest, and the VMCS structure we give it - not about the
2856          * VMX support of the underlying hardware.
2857          */
2858         vmx->nested.nested_vmx_basic =
2859                 VMCS12_REVISION |
2860                 VMX_BASIC_TRUE_CTLS |
2861                 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
2862                 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
2863
2864         if (cpu_has_vmx_basic_inout())
2865                 vmx->nested.nested_vmx_basic |= VMX_BASIC_INOUT;
2866
2867         /*
2868          * These MSRs specify bits which the guest must keep fixed on
2869          * while L1 is in VMXON mode (in L1's root mode, or running an L2).
2870          * We picked the standard core2 setting.
2871          */
2872 #define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
2873 #define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
2874         vmx->nested.nested_vmx_cr0_fixed0 = VMXON_CR0_ALWAYSON;
2875         vmx->nested.nested_vmx_cr4_fixed0 = VMXON_CR4_ALWAYSON;
2876
2877         /* These MSRs specify bits which the guest must keep fixed off. */
2878         rdmsrl(MSR_IA32_VMX_CR0_FIXED1, vmx->nested.nested_vmx_cr0_fixed1);
2879         rdmsrl(MSR_IA32_VMX_CR4_FIXED1, vmx->nested.nested_vmx_cr4_fixed1);
2880
2881         /* highest index: VMX_PREEMPTION_TIMER_VALUE */
2882         vmx->nested.nested_vmx_vmcs_enum = 0x2e;
2883 }
2884
2885 /*
2886  * if fixed0[i] == 1: val[i] must be 1
2887  * if fixed1[i] == 0: val[i] must be 0
2888  */
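/*
 * Illustrative example (not part of the original comment): with
 * fixed0 = 0x1 (bit 0 must be 1) and fixed1 = ~0x2ULL (bit 1 must be 0),
 * val = 0x5 passes the check below while val = 0x2 and val = 0x4 do not.
 */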
2889 static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1)
2890 {
2891         return ((val & fixed1) | fixed0) == val;
2892 }
2893
2894 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
2895 {
2896         return fixed_bits_valid(control, low, high);
2897 }
2898
2899 static inline u64 vmx_control_msr(u32 low, u32 high)
2900 {
2901         return low | ((u64)high << 32);
2902 }
2903
2904 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
2905 {
2906         superset &= mask;
2907         subset &= mask;
2908
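	/*
	 * Illustrative example: is_bitwise_subset(0xf0, 0x30, -1ULL) is true,
	 * while is_bitwise_subset(0xf0, 0x0f, -1ULL) is false, since bits 0..3
	 * of the subset are not present in the superset.
	 */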
2909         return (superset | subset) == superset;
2910 }
2911
2912 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
2913 {
2914         const u64 feature_and_reserved =
2915                 /* feature (except bit 48; see below) */
2916                 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
2917                 /* reserved */
2918                 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
2919         u64 vmx_basic = vmx->nested.nested_vmx_basic;
2920
2921         if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
2922                 return -EINVAL;
2923
2924         /*
2925          * KVM does not emulate a version of VMX that constrains physical
2926          * addresses of VMX structures (e.g. VMCS) to 32-bits.
2927          */
2928         if (data & BIT_ULL(48))
2929                 return -EINVAL;
2930
2931         if (vmx_basic_vmcs_revision_id(vmx_basic) !=
2932             vmx_basic_vmcs_revision_id(data))
2933                 return -EINVAL;
2934
2935         if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
2936                 return -EINVAL;
2937
2938         vmx->nested.nested_vmx_basic = data;
2939         return 0;
2940 }
2941
2942 static int
2943 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
2944 {
2945         u64 supported;
2946         u32 *lowp, *highp;
2947
2948         switch (msr_index) {
2949         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
2950                 lowp = &vmx->nested.nested_vmx_pinbased_ctls_low;
2951                 highp = &vmx->nested.nested_vmx_pinbased_ctls_high;
2952                 break;
2953         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
2954                 lowp = &vmx->nested.nested_vmx_procbased_ctls_low;
2955                 highp = &vmx->nested.nested_vmx_procbased_ctls_high;
2956                 break;
2957         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
2958                 lowp = &vmx->nested.nested_vmx_exit_ctls_low;
2959                 highp = &vmx->nested.nested_vmx_exit_ctls_high;
2960                 break;
2961         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
2962                 lowp = &vmx->nested.nested_vmx_entry_ctls_low;
2963                 highp = &vmx->nested.nested_vmx_entry_ctls_high;
2964                 break;
2965         case MSR_IA32_VMX_PROCBASED_CTLS2:
2966                 lowp = &vmx->nested.nested_vmx_secondary_ctls_low;
2967                 highp = &vmx->nested.nested_vmx_secondary_ctls_high;
2968                 break;
2969         default:
2970                 BUG();
2971         }
2972
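	/*
	 * Per the SDM convention for VMX capability control MSRs: the low
	 * 32 bits are the allowed 0-settings (a 1 there means the control
	 * must be 1) and the high 32 bits are the allowed 1-settings (a 0
	 * there means the control must be 0).
	 */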
2973         supported = vmx_control_msr(*lowp, *highp);
2974
2975         /* Check must-be-1 bits are still 1. */
2976         if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
2977                 return -EINVAL;
2978
2979         /* Check must-be-0 bits are still 0. */
2980         if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
2981                 return -EINVAL;
2982
2983         *lowp = data;
2984         *highp = data >> 32;
2985         return 0;
2986 }
2987
2988 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
2989 {
2990         const u64 feature_and_reserved_bits =
2991                 /* feature */
2992                 BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
2993                 BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
2994                 /* reserved */
2995                 GENMASK_ULL(13, 9) | BIT_ULL(31);
2996         u64 vmx_misc;
2997
2998         vmx_misc = vmx_control_msr(vmx->nested.nested_vmx_misc_low,
2999                                    vmx->nested.nested_vmx_misc_high);
3000
3001         if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
3002                 return -EINVAL;
3003
3004         if ((vmx->nested.nested_vmx_pinbased_ctls_high &
3005              PIN_BASED_VMX_PREEMPTION_TIMER) &&
3006             vmx_misc_preemption_timer_rate(data) !=
3007             vmx_misc_preemption_timer_rate(vmx_misc))
3008                 return -EINVAL;
3009
3010         if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
3011                 return -EINVAL;
3012
3013         if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
3014                 return -EINVAL;
3015
3016         if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
3017                 return -EINVAL;
3018
3019         vmx->nested.nested_vmx_misc_low = data;
3020         vmx->nested.nested_vmx_misc_high = data >> 32;
3021         return 0;
3022 }
3023
3024 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
3025 {
3026         u64 vmx_ept_vpid_cap;
3027
3028         vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.nested_vmx_ept_caps,
3029                                            vmx->nested.nested_vmx_vpid_caps);
3030
3031         /* Every bit is either reserved or a feature bit. */
3032         if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
3033                 return -EINVAL;
3034
3035         vmx->nested.nested_vmx_ept_caps = data;
3036         vmx->nested.nested_vmx_vpid_caps = data >> 32;
3037         return 0;
3038 }
3039
3040 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
3041 {
3042         u64 *msr;
3043
3044         switch (msr_index) {
3045         case MSR_IA32_VMX_CR0_FIXED0:
3046                 msr = &vmx->nested.nested_vmx_cr0_fixed0;
3047                 break;
3048         case MSR_IA32_VMX_CR4_FIXED0:
3049                 msr = &vmx->nested.nested_vmx_cr4_fixed0;
3050                 break;
3051         default:
3052                 BUG();
3053         }
3054
3055         /*
3056          * 1 bits (which indicate bits that "must-be-1" during VMX operation)
3057          * must be 1 in the restored value.
3058          */
3059         if (!is_bitwise_subset(data, *msr, -1ULL))
3060                 return -EINVAL;
3061
3062         *msr = data;
3063         return 0;
3064 }
3065
3066 /*
3067  * Called when userspace is restoring VMX MSRs.
3068  *
3069  * Returns 0 on success, non-0 otherwise.
3070  */
3071 static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
3072 {
3073         struct vcpu_vmx *vmx = to_vmx(vcpu);
3074
3075         switch (msr_index) {
3076         case MSR_IA32_VMX_BASIC:
3077                 return vmx_restore_vmx_basic(vmx, data);
3078         case MSR_IA32_VMX_PINBASED_CTLS:
3079         case MSR_IA32_VMX_PROCBASED_CTLS:
3080         case MSR_IA32_VMX_EXIT_CTLS:
3081         case MSR_IA32_VMX_ENTRY_CTLS:
3082                 /*
3083                  * The "non-true" VMX capability MSRs are generated from the
3084                  * "true" MSRs, so we do not support restoring them directly.
3085                  *
3086                  * If userspace wants to emulate VMX_BASIC[55]=0, userspace
3087                  * should restore the "true" MSRs with the must-be-1 bits
3088                  * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
3089                  * set according to SDM Vol. 3, Appendix A.2, "RESERVED CONTROLS AND
3090                  */
3091                 return -EINVAL;
3092         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
3093         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
3094         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
3095         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
3096         case MSR_IA32_VMX_PROCBASED_CTLS2:
3097                 return vmx_restore_control_msr(vmx, msr_index, data);
3098         case MSR_IA32_VMX_MISC:
3099                 return vmx_restore_vmx_misc(vmx, data);
3100         case MSR_IA32_VMX_CR0_FIXED0:
3101         case MSR_IA32_VMX_CR4_FIXED0:
3102                 return vmx_restore_fixed0_msr(vmx, msr_index, data);
3103         case MSR_IA32_VMX_CR0_FIXED1:
3104         case MSR_IA32_VMX_CR4_FIXED1:
3105                 /*
3106                  * These MSRs are generated based on the vCPU's CPUID, so we
3107                  * do not support restoring them directly.
3108                  */
3109                 return -EINVAL;
3110         case MSR_IA32_VMX_EPT_VPID_CAP:
3111                 return vmx_restore_vmx_ept_vpid_cap(vmx, data);
3112         case MSR_IA32_VMX_VMCS_ENUM:
3113                 vmx->nested.nested_vmx_vmcs_enum = data;
3114                 return 0;
3115         default:
3116                 /*
3117                  * The rest of the VMX capability MSRs do not support restore.
3118                  */
3119                 return -EINVAL;
3120         }
3121 }
3122
3123 /* Returns 0 on success, non-0 otherwise. */
3124 static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
3125 {
3126         struct vcpu_vmx *vmx = to_vmx(vcpu);
3127
3128         switch (msr_index) {
3129         case MSR_IA32_VMX_BASIC:
3130                 *pdata = vmx->nested.nested_vmx_basic;
3131                 break;
3132         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
3133         case MSR_IA32_VMX_PINBASED_CTLS:
3134                 *pdata = vmx_control_msr(
3135                         vmx->nested.nested_vmx_pinbased_ctls_low,
3136                         vmx->nested.nested_vmx_pinbased_ctls_high);
3137                 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
3138                         *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
3139                 break;
3140         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
3141         case MSR_IA32_VMX_PROCBASED_CTLS:
3142                 *pdata = vmx_control_msr(
3143                         vmx->nested.nested_vmx_procbased_ctls_low,
3144                         vmx->nested.nested_vmx_procbased_ctls_high);
3145                 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
3146                         *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
3147                 break;
3148         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
3149         case MSR_IA32_VMX_EXIT_CTLS:
3150                 *pdata = vmx_control_msr(
3151                         vmx->nested.nested_vmx_exit_ctls_low,
3152                         vmx->nested.nested_vmx_exit_ctls_high);
3153                 if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
3154                         *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
3155                 break;
3156         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
3157         case MSR_IA32_VMX_ENTRY_CTLS:
3158                 *pdata = vmx_control_msr(
3159                         vmx->nested.nested_vmx_entry_ctls_low,
3160                         vmx->nested.nested_vmx_entry_ctls_high);
3161                 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
3162                         *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
3163                 break;
3164         case MSR_IA32_VMX_MISC:
3165                 *pdata = vmx_control_msr(
3166                         vmx->nested.nested_vmx_misc_low,
3167                         vmx->nested.nested_vmx_misc_high);
3168                 break;
3169         case MSR_IA32_VMX_CR0_FIXED0:
3170                 *pdata = vmx->nested.nested_vmx_cr0_fixed0;
3171                 break;
3172         case MSR_IA32_VMX_CR0_FIXED1:
3173                 *pdata = vmx->nested.nested_vmx_cr0_fixed1;
3174                 break;
3175         case MSR_IA32_VMX_CR4_FIXED0:
3176                 *pdata = vmx->nested.nested_vmx_cr4_fixed0;
3177                 break;
3178         case MSR_IA32_VMX_CR4_FIXED1:
3179                 *pdata = vmx->nested.nested_vmx_cr4_fixed1;
3180                 break;
3181         case MSR_IA32_VMX_VMCS_ENUM:
3182                 *pdata = vmx->nested.nested_vmx_vmcs_enum;
3183                 break;
3184         case MSR_IA32_VMX_PROCBASED_CTLS2:
3185                 *pdata = vmx_control_msr(
3186                         vmx->nested.nested_vmx_secondary_ctls_low,
3187                         vmx->nested.nested_vmx_secondary_ctls_high);
3188                 break;
3189         case MSR_IA32_VMX_EPT_VPID_CAP:
3190                 *pdata = vmx->nested.nested_vmx_ept_caps |
3191                         ((u64)vmx->nested.nested_vmx_vpid_caps << 32);
3192                 break;
3193         case MSR_IA32_VMX_VMFUNC:
3194                 *pdata = vmx->nested.nested_vmx_vmfunc_controls;
3195                 break;
3196         default:
3197                 return 1;
3198         }
3199
3200         return 0;
3201 }
3202
3203 static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
3204                                                  uint64_t val)
3205 {
3206         uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
3207
3208         return !(val & ~valid_bits);
3209 }
3210
3211 /*
3212  * Reads the MSR specified by msr_info->index into msr_info->data.
3213  * Returns 0 on success, non-0 otherwise.
3214  * Assumes vcpu_load() was already called.
3215  */
3216 static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3217 {
3218         struct shared_msr_entry *msr;
3219
3220         switch (msr_info->index) {
3221 #ifdef CONFIG_X86_64
3222         case MSR_FS_BASE:
3223                 msr_info->data = vmcs_readl(GUEST_FS_BASE);
3224                 break;
3225         case MSR_GS_BASE:
3226                 msr_info->data = vmcs_readl(GUEST_GS_BASE);
3227                 break;
3228         case MSR_KERNEL_GS_BASE:
3229                 vmx_load_host_state(to_vmx(vcpu));
3230                 msr_info->data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
3231                 break;
3232 #endif
3233         case MSR_EFER:
3234                 return kvm_get_msr_common(vcpu, msr_info);
3235         case MSR_IA32_TSC:
3236                 msr_info->data = guest_read_tsc(vcpu);
3237                 break;
3238         case MSR_IA32_SYSENTER_CS:
3239                 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
3240                 break;
3241         case MSR_IA32_SYSENTER_EIP:
3242                 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
3243                 break;
3244         case MSR_IA32_SYSENTER_ESP:
3245                 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
3246                 break;
3247         case MSR_IA32_BNDCFGS:
3248                 if (!kvm_mpx_supported() ||
3249                     (!msr_info->host_initiated &&
3250                      !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
3251                         return 1;
3252                 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
3253                 break;
3254         case MSR_IA32_MCG_EXT_CTL:
3255                 if (!msr_info->host_initiated &&
3256                     !(to_vmx(vcpu)->msr_ia32_feature_control &
3257                       FEATURE_CONTROL_LMCE))
3258                         return 1;
3259                 msr_info->data = vcpu->arch.mcg_ext_ctl;
3260                 break;
3261         case MSR_IA32_FEATURE_CONTROL:
3262                 msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control;
3263                 break;
3264         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
3265                 if (!nested_vmx_allowed(vcpu))
3266                         return 1;
3267                 return vmx_get_vmx_msr(vcpu, msr_info->index, &msr_info->data);
3268         case MSR_IA32_XSS:
3269                 if (!vmx_xsaves_supported())
3270                         return 1;
3271                 msr_info->data = vcpu->arch.ia32_xss;
3272                 break;
3273         case MSR_TSC_AUX:
3274                 if (!msr_info->host_initiated &&
3275                     !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
3276                         return 1;
3277                 /* Otherwise falls through */
3278         default:
3279                 msr = find_msr_entry(to_vmx(vcpu), msr_info->index);
3280                 if (msr) {
3281                         msr_info->data = msr->data;
3282                         break;
3283                 }
3284                 return kvm_get_msr_common(vcpu, msr_info);
3285         }
3286
3287         return 0;
3288 }
3289
3290 static void vmx_leave_nested(struct kvm_vcpu *vcpu);
3291
3292 /*
3293  * Writes an msr value into the appropriate "register".
3294  * Returns 0 on success, non-0 otherwise.
3295  * Assumes vcpu_load() was already called.
3296  */
3297 static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3298 {
3299         struct vcpu_vmx *vmx = to_vmx(vcpu);
3300         struct shared_msr_entry *msr;
3301         int ret = 0;
3302         u32 msr_index = msr_info->index;
3303         u64 data = msr_info->data;
3304
3305         switch (msr_index) {
3306         case MSR_EFER:
3307                 ret = kvm_set_msr_common(vcpu, msr_info);
3308                 break;
3309 #ifdef CONFIG_X86_64
3310         case MSR_FS_BASE:
3311                 vmx_segment_cache_clear(vmx);
3312                 vmcs_writel(GUEST_FS_BASE, data);
3313                 break;
3314         case MSR_GS_BASE:
3315                 vmx_segment_cache_clear(vmx);
3316                 vmcs_writel(GUEST_GS_BASE, data);
3317                 break;
3318         case MSR_KERNEL_GS_BASE:
3319                 vmx_load_host_state(vmx);
3320                 vmx->msr_guest_kernel_gs_base = data;
3321                 break;
3322 #endif
3323         case MSR_IA32_SYSENTER_CS:
3324                 vmcs_write32(GUEST_SYSENTER_CS, data);
3325                 break;
3326         case MSR_IA32_SYSENTER_EIP:
3327                 vmcs_writel(GUEST_SYSENTER_EIP, data);
3328                 break;
3329         case MSR_IA32_SYSENTER_ESP:
3330                 vmcs_writel(GUEST_SYSENTER_ESP, data);
3331                 break;
3332         case MSR_IA32_BNDCFGS:
3333                 if (!kvm_mpx_supported() ||
3334                     (!msr_info->host_initiated &&
3335                      !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
3336                         return 1;
3337                 if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
3338                     (data & MSR_IA32_BNDCFGS_RSVD))
3339                         return 1;
3340                 vmcs_write64(GUEST_BNDCFGS, data);
3341                 break;
3342         case MSR_IA32_TSC:
3343                 kvm_write_tsc(vcpu, msr_info);
3344                 break;
3345         case MSR_IA32_CR_PAT:
3346                 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
3347                         if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
3348                                 return 1;
3349                         vmcs_write64(GUEST_IA32_PAT, data);
3350                         vcpu->arch.pat = data;
3351                         break;
3352                 }
3353                 ret = kvm_set_msr_common(vcpu, msr_info);
3354                 break;
3355         case MSR_IA32_TSC_ADJUST:
3356                 ret = kvm_set_msr_common(vcpu, msr_info);
3357                 break;
3358         case MSR_IA32_MCG_EXT_CTL:
3359                 if ((!msr_info->host_initiated &&
3360                      !(to_vmx(vcpu)->msr_ia32_feature_control &
3361                        FEATURE_CONTROL_LMCE)) ||
3362                     (data & ~MCG_EXT_CTL_LMCE_EN))
3363                         return 1;
3364                 vcpu->arch.mcg_ext_ctl = data;
3365                 break;
3366         case MSR_IA32_FEATURE_CONTROL:
3367                 if (!vmx_feature_control_msr_valid(vcpu, data) ||
3368                     (to_vmx(vcpu)->msr_ia32_feature_control &
3369                      FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
3370                         return 1;
3371                 vmx->msr_ia32_feature_control = data;
3372                 if (msr_info->host_initiated && data == 0)
3373                         vmx_leave_nested(vcpu);
3374                 break;
3375         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
3376                 if (!msr_info->host_initiated)
3377                         return 1; /* they are read-only */
3378                 if (!nested_vmx_allowed(vcpu))
3379                         return 1;
3380                 return vmx_set_vmx_msr(vcpu, msr_index, data);
3381         case MSR_IA32_XSS:
3382                 if (!vmx_xsaves_supported())
3383                         return 1;
3384                 /*
3385                  * As of Skylake, the only bit defined in IA32_XSS is bit 8,
3386                  * and KVM does not support it.
3387                  */
3388                 if (data != 0)
3389                         return 1;
3390                 vcpu->arch.ia32_xss = data;
3391                 if (vcpu->arch.ia32_xss != host_xss)
3392                         add_atomic_switch_msr(vmx, MSR_IA32_XSS,
3393                                 vcpu->arch.ia32_xss, host_xss);
3394                 else
3395                         clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
3396                 break;
3397         case MSR_TSC_AUX:
3398                 if (!msr_info->host_initiated &&
3399                     !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
3400                         return 1;
3401                 /* Check reserved bits: the upper 32 bits must be zero */
3402                 if ((data >> 32) != 0)
3403                         return 1;
3404                 /* Otherwise falls through */
3405         default:
3406                 msr = find_msr_entry(vmx, msr_index);
3407                 if (msr) {
3408                         u64 old_msr_data = msr->data;
3409                         msr->data = data;
3410                         if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
3411                                 preempt_disable();
3412                                 ret = kvm_set_shared_msr(msr->index, msr->data,
3413                                                          msr->mask);
3414                                 preempt_enable();
3415                                 if (ret)
3416                                         msr->data = old_msr_data;
3417                         }
3418                         break;
3419                 }
3420                 ret = kvm_set_msr_common(vcpu, msr_info);
3421         }
3422
3423         return ret;
3424 }
3425
3426 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
3427 {
3428         __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
3429         switch (reg) {
3430         case VCPU_REGS_RSP:
3431                 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
3432                 break;
3433         case VCPU_REGS_RIP:
3434                 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
3435                 break;
3436         case VCPU_EXREG_PDPTR:
3437                 if (enable_ept)
3438                         ept_save_pdptrs(vcpu);
3439                 break;
3440         default:
3441                 break;
3442         }
3443 }
3444
3445 static __init int cpu_has_kvm_support(void)
3446 {
3447         return cpu_has_vmx();
3448 }
3449
3450 static __init int vmx_disabled_by_bios(void)
3451 {
3452         u64 msr;
3453
3454         rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
3455         if (msr & FEATURE_CONTROL_LOCKED) {
3456                 /* launched w/ TXT and VMX disabled */
3457                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
3458                         && tboot_enabled())
3459                         return 1;
3460                 /* launched w/o TXT and VMX only enabled w/ TXT */
3461                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
3462                         && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
3463                         && !tboot_enabled()) {
3464                         printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
3465                                 "activate TXT before enabling KVM\n");
3466                         return 1;
3467                 }
3468                 /* launched w/o TXT and VMX disabled */
3469                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
3470                         && !tboot_enabled())
3471                         return 1;
3472         }
3473
3474         return 0;
3475 }
3476
3477 static void kvm_cpu_vmxon(u64 addr)
3478 {
3479         cr4_set_bits(X86_CR4_VMXE);
3480         intel_pt_handle_vmx(1);
3481
3482         asm volatile (ASM_VMX_VMXON_RAX
3483                         : : "a"(&addr), "m"(addr)
3484                         : "memory", "cc");
3485 }
3486
3487 static int hardware_enable(void)
3488 {
3489         int cpu = raw_smp_processor_id();
3490         u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
3491         u64 old, test_bits;
3492
3493         if (cr4_read_shadow() & X86_CR4_VMXE)
3494                 return -EBUSY;
3495
3496         INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
3497         INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
3498         spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
3499
3500         /*
3501          * Now we can enable the vmclear operation in kdump
3502          * since the loaded_vmcss_on_cpu list on this cpu
3503          * has been initialized.
3504          *
3505          * Though the cpu is not in VMX operation yet, enabling
3506          * vmclear here is safe because the loaded_vmcss_on_cpu
3507          * list is still empty.
3508          */
3509         crash_enable_local_vmclear(cpu);
3510
3511         rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
3512
3513         test_bits = FEATURE_CONTROL_LOCKED;
3514         test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
3515         if (tboot_enabled())
3516                 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
3517
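	/*
	 * VMXON #GPs unless IA32_FEATURE_CONTROL is locked with the relevant
	 * VMXON-enable bit(s) set (per the SDM), so if the BIOS left the MSR
	 * unlocked, set and lock the required bits before calling VMXON.
	 */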
3518         if ((old & test_bits) != test_bits) {
3519                 /* enable and lock */
3520                 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
3521         }
3522         kvm_cpu_vmxon(phys_addr);
3523         if (enable_ept)
3524                 ept_sync_global();
3525
3526         return 0;
3527 }
3528
3529 static void vmclear_local_loaded_vmcss(void)
3530 {
3531         int cpu = raw_smp_processor_id();
3532         struct loaded_vmcs *v, *n;
3533
3534         list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
3535                                  loaded_vmcss_on_cpu_link)
3536                 __loaded_vmcs_clear(v);
3537 }
3538
3539
3540 /*
3541  * Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() tricks.
3542  */
3543 static void kvm_cpu_vmxoff(void)
3544 {
3545         asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
3546
3547         intel_pt_handle_vmx(0);
3548         cr4_clear_bits(X86_CR4_VMXE);
3549 }
3550
3551 static void hardware_disable(void)
3552 {
3553         vmclear_local_loaded_vmcss();
3554         kvm_cpu_vmxoff();
3555 }
3556
3557 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
3558                                       u32 msr, u32 *result)
3559 {
3560         u32 vmx_msr_low, vmx_msr_high;
3561         u32 ctl = ctl_min | ctl_opt;
3562
3563         rdmsr(msr, vmx_msr_low, vmx_msr_high);
3564
3565         ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
3566         ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
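	/*
	 * Worked example with made-up values: if the MSR reads low = 0x16
	 * and high = 0xfff9fffe, then bits 1, 2 and 4 are forced on and
	 * bits 0, 17 and 18 are forced off, regardless of what
	 * ctl_min/ctl_opt requested.
	 */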
3567
3568         /* Ensure the minimum (required) set of control bits is supported. */
3569         if (ctl_min & ~ctl)
3570                 return -EIO;
3571
3572         *result = ctl;
3573         return 0;
3574 }
3575
3576 static __init bool allow_1_setting(u32 msr, u32 ctl)
3577 {
3578         u32 vmx_msr_low, vmx_msr_high;
3579
3580         rdmsr(msr, vmx_msr_low, vmx_msr_high);
3581         return vmx_msr_high & ctl;
3582 }
3583
3584 static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
3585 {
3586         u32 vmx_msr_low, vmx_msr_high;
3587         u32 min, opt, min2, opt2;
3588         u32 _pin_based_exec_control = 0;
3589         u32 _cpu_based_exec_control = 0;
3590         u32 _cpu_based_2nd_exec_control = 0;
3591         u32 _vmexit_control = 0;
3592         u32 _vmentry_control = 0;
3593
3594         min = CPU_BASED_HLT_EXITING |
3595 #ifdef CONFIG_X86_64
3596               CPU_BASED_CR8_LOAD_EXITING |
3597               CPU_BASED_CR8_STORE_EXITING |
3598 #endif
3599               CPU_BASED_CR3_LOAD_EXITING |
3600               CPU_BASED_CR3_STORE_EXITING |
3601               CPU_BASED_USE_IO_BITMAPS |
3602               CPU_BASED_MOV_DR_EXITING |
3603               CPU_BASED_USE_TSC_OFFSETING |
3604               CPU_BASED_INVLPG_EXITING |
3605               CPU_BASED_RDPMC_EXITING;
3606
3607         if (!kvm_mwait_in_guest())
3608                 min |= CPU_BASED_MWAIT_EXITING |
3609                         CPU_BASED_MONITOR_EXITING;
3610
3611         opt = CPU_BASED_TPR_SHADOW |
3612               CPU_BASED_USE_MSR_BITMAPS |
3613               CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
3614         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
3615                                 &_cpu_based_exec_control) < 0)
3616                 return -EIO;
3617 #ifdef CONFIG_X86_64
3618         if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
3619                 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
3620                                            ~CPU_BASED_CR8_STORE_EXITING;
3621 #endif
3622         if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
3623                 min2 = 0;
3624                 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
3625                         SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
3626                         SECONDARY_EXEC_WBINVD_EXITING |
3627                         SECONDARY_EXEC_ENABLE_VPID |
3628                         SECONDARY_EXEC_ENABLE_EPT |
3629                         SECONDARY_EXEC_UNRESTRICTED_GUEST |
3630                         SECONDARY_EXEC_PAUSE_LOOP_EXITING |
3631                         SECONDARY_EXEC_RDTSCP |
3632                         SECONDARY_EXEC_ENABLE_INVPCID |
3633                         SECONDARY_EXEC_APIC_REGISTER_VIRT |
3634                         SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
3635                         SECONDARY_EXEC_SHADOW_VMCS |
3636                         SECONDARY_EXEC_XSAVES |
3637                         SECONDARY_EXEC_RDSEED_EXITING |
3638                         SECONDARY_EXEC_RDRAND_EXITING |
3639                         SECONDARY_EXEC_ENABLE_PML |
3640                         SECONDARY_EXEC_TSC_SCALING |
3641                         SECONDARY_EXEC_ENABLE_VMFUNC;
3642                 if (adjust_vmx_controls(min2, opt2,
3643                                         MSR_IA32_VMX_PROCBASED_CTLS2,
3644                                         &_cpu_based_2nd_exec_control) < 0)
3645                         return -EIO;
3646         }
3647 #ifndef CONFIG_X86_64
3648         if (!(_cpu_based_2nd_exec_control &
3649                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
3650                 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
3651 #endif
3652
3653         if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
3654                 _cpu_based_2nd_exec_control &= ~(
3655                                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
3656                                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
3657                                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
3658
3659         rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
3660                 &vmx_capability.ept, &vmx_capability.vpid);
3661
3662         if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
3663                 /* CR3 accesses and invlpg don't need to cause VM exits
3664                    when EPT is enabled. */
3665                 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
3666                                              CPU_BASED_CR3_STORE_EXITING |
3667                                              CPU_BASED_INVLPG_EXITING);
3668         } else if (vmx_capability.ept) {
3669                 vmx_capability.ept = 0;
3670                 pr_warn_once("EPT capabilities reported even though the "
3671                                 "\"enable EPT\" VM-execution control is not supported\n");
3672         }
3673         if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
3674                 vmx_capability.vpid) {
3675                 vmx_capability.vpid = 0;
3676                 pr_warn_once("VPID capabilities reported even though the "
3677                                 "\"enable VPID\" VM-execution control is not supported\n");
3678         }
3679
3680         min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
3681 #ifdef CONFIG_X86_64
3682         min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
3683 #endif
3684         opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
3685                 VM_EXIT_CLEAR_BNDCFGS;
3686         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
3687                                 &_vmexit_control) < 0)
3688                 return -EIO;
3689
3690         min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
3691         opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
3692                  PIN_BASED_VMX_PREEMPTION_TIMER;
3693         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
3694                                 &_pin_based_exec_control) < 0)
3695                 return -EIO;
3696
3697         if (cpu_has_broken_vmx_preemption_timer())
3698                 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
3699         if (!(_cpu_based_2nd_exec_control &
3700                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
3701                 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
3702
3703         min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
3704         opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
3705         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
3706                                 &_vmentry_control) < 0)
3707                 return -EIO;
3708
3709         rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
3710
3711         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
3712         if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
3713                 return -EIO;
3714
3715 #ifdef CONFIG_X86_64
3716         /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
3717         if (vmx_msr_high & (1u<<16))
3718                 return -EIO;
3719 #endif
3720
3721         /* Require Write-Back (WB) memory type for VMCS accesses. */
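	/*
	 * Bits 53:50 of IA32_VMX_BASIC (bits 21:18 of the high dword read
	 * above) encode that memory type; 6 means write-back.
	 */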
3722         if (((vmx_msr_high >> 18) & 15) != 6)
3723                 return -EIO;
3724
3725         vmcs_conf->size = vmx_msr_high & 0x1fff;
3726         vmcs_conf->order = get_order(vmcs_conf->size);
3727         vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
3728         vmcs_conf->revision_id = vmx_msr_low;
3729
3730         vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
3731         vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
3732         vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
3733         vmcs_conf->vmexit_ctrl         = _vmexit_control;
3734         vmcs_conf->vmentry_ctrl        = _vmentry_control;
3735
3736         cpu_has_load_ia32_efer =
3737                 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
3738                                 VM_ENTRY_LOAD_IA32_EFER)
3739                 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
3740                                    VM_EXIT_LOAD_IA32_EFER);
3741
3742         cpu_has_load_perf_global_ctrl =
3743                 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
3744                                 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
3745                 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
3746                                    VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
3747
3748         /*
3749          * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL
3750          * but due to the errata below it can't be used. The workaround is
3751          * to use the MSR-load mechanism to switch IA32_PERF_GLOBAL_CTRL.
3752          *
3753          * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
3754          *
3755          * AAK155             (model 26)
3756          * AAP115             (model 30)
3757          * AAT100             (model 37)
3758          * BC86,AAY89,BD102   (model 44)
3759          * BA97               (model 46)
3760          *
3761          */
3762         if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) {
3763                 switch (boot_cpu_data.x86_model) {
3764                 case 26:
3765                 case 30:
3766                 case 37:
3767                 case 44:
3768                 case 46:
3769                         cpu_has_load_perf_global_ctrl = false;
3770                         printk_once(KERN_WARNING "kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
3771                                         "does not work properly. Using workaround\n");
3772                         break;
3773                 default:
3774                         break;
3775                 }
3776         }
3777
3778         if (boot_cpu_has(X86_FEATURE_XSAVES))
3779                 rdmsrl(MSR_IA32_XSS, host_xss);
3780
3781         return 0;
3782 }
3783
3784 static struct vmcs *alloc_vmcs_cpu(int cpu)
3785 {
3786         int node = cpu_to_node(cpu);
3787         struct page *pages;
3788         struct vmcs *vmcs;
3789
3790         pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
3791         if (!pages)
3792                 return NULL;
3793         vmcs = page_address(pages);
3794         memset(vmcs, 0, vmcs_config.size);
3795         vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
3796         return vmcs;
3797 }
3798
3799 static void free_vmcs(struct vmcs *vmcs)
3800 {
3801         free_pages((unsigned long)vmcs, vmcs_config.order);
3802 }
3803
3804 /*
3805  * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
3806  */
3807 static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
3808 {
3809         if (!loaded_vmcs->vmcs)
3810                 return;
3811         loaded_vmcs_clear(loaded_vmcs);
3812         free_vmcs(loaded_vmcs->vmcs);
3813         loaded_vmcs->vmcs = NULL;
3814         if (loaded_vmcs->msr_bitmap)
3815                 free_page((unsigned long)loaded_vmcs->msr_bitmap);
3816         WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
3817 }
3818
3819 static struct vmcs *alloc_vmcs(void)
3820 {
3821         return alloc_vmcs_cpu(raw_smp_processor_id());
3822 }
3823
3824 static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
3825 {
3826         loaded_vmcs->vmcs = alloc_vmcs();
3827         if (!loaded_vmcs->vmcs)
3828                 return -ENOMEM;
3829
3830         loaded_vmcs->shadow_vmcs = NULL;
3831         loaded_vmcs_init(loaded_vmcs);
3832
3833         if (cpu_has_vmx_msr_bitmap()) {
3834                 loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
3835                 if (!loaded_vmcs->msr_bitmap)
3836                         goto out_vmcs;
3837                 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
3838         }
3839         return 0;
3840
3841 out_vmcs:
3842         free_loaded_vmcs(loaded_vmcs);
3843         return -ENOMEM;
3844 }
3845
3846 static void free_kvm_area(void)
3847 {
3848         int cpu;
3849
3850         for_each_possible_cpu(cpu) {
3851                 free_vmcs(per_cpu(vmxarea, cpu));
3852                 per_cpu(vmxarea, cpu) = NULL;
3853         }
3854 }
3855
3856 enum vmcs_field_type {
3857         VMCS_FIELD_TYPE_U16 = 0,
3858         VMCS_FIELD_TYPE_U64 = 1,
3859         VMCS_FIELD_TYPE_U32 = 2,
3860         VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
3861 };
3862
3863 static inline int vmcs_field_type(unsigned long field)
3864 {
3865         if (0x1 & field)        /* the *_HIGH fields are all 32 bit */
3866                 return VMCS_FIELD_TYPE_U32;
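	/*
	 * Otherwise bits 14:13 of the encoding give the width, e.g. GUEST_RIP
	 * (0x681e) has bits 14:13 == 3, i.e. natural width (per the SDM
	 * field-encoding layout).
	 */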
3867         return (field >> 13) & 0x3;
3868 }
3869
3870 static inline int vmcs_field_readonly(unsigned long field)
3871 {
3872         return (((field >> 10) & 0x3) == 1);
3873 }
3874
3875 static void init_vmcs_shadow_fields(void)
3876 {
3877         int i, j;
3878
3879         /* No checks for read only fields yet */
3880
3881         for (i = j = 0; i < max_shadow_read_write_fields; i++) {
3882                 switch (shadow_read_write_fields[i]) {
3883                 case GUEST_BNDCFGS:
3884                         if (!kvm_mpx_supported())
3885                                 continue;
3886                         break;
3887                 default:
3888                         break;
3889                 }
3890
3891                 if (j < i)
3892                         shadow_read_write_fields[j] =
3893                                 shadow_read_write_fields[i];
3894                 j++;
3895         }
3896         max_shadow_read_write_fields = j;
3897
3898         /* Shadowed fields that the guest can VMREAD/VMWRITE without a VM exit */
3899         for (i = 0; i < max_shadow_read_write_fields; i++) {
3900                 unsigned long field = shadow_read_write_fields[i];
3901
3902                 clear_bit(field, vmx_vmwrite_bitmap);
3903                 clear_bit(field, vmx_vmread_bitmap);
3904                 if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) {
3905                         clear_bit(field + 1, vmx_vmwrite_bitmap);
3906                         clear_bit(field + 1, vmx_vmread_bitmap);
3907                 }
3908         }
3909         for (i = 0; i < max_shadow_read_only_fields; i++) {
3910                 unsigned long field = shadow_read_only_fields[i];
3911
3912                 clear_bit(field, vmx_vmread_bitmap);
3913                 if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64)
3914                         clear_bit(field + 1, vmx_vmread_bitmap);
3915         }
3916 }
3917
3918 static __init int alloc_kvm_area(void)
3919 {
3920         int cpu;
3921
3922         for_each_possible_cpu(cpu) {
3923                 struct vmcs *vmcs;
3924
3925                 vmcs = alloc_vmcs_cpu(cpu);
3926                 if (!vmcs) {
3927                         free_kvm_area();
3928                         return -ENOMEM;
3929                 }
3930
3931                 per_cpu(vmxarea, cpu) = vmcs;
3932         }
3933         return 0;
3934 }
3935
3936 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
3937                 struct kvm_segment *save)
3938 {
3939         if (!emulate_invalid_guest_state) {
3940                 /*
3941                  * CS and SS RPL should be equal during guest entry according
3942                  * to the VMX spec, but in reality it is not always so. Since the
3943                  * vcpu is in the middle of the transition from real mode to
3944                  * protected mode, it is safe to assume that RPL 0 is a good
3945                  * default value.
3946                  */
3947                 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
3948                         save->selector &= ~SEGMENT_RPL_MASK;
3949                 save->dpl = save->selector & SEGMENT_RPL_MASK;
3950                 save->s = 1;
3951         }
3952         vmx_set_segment(vcpu, save, seg);
3953 }
3954
3955 static void enter_pmode(struct kvm_vcpu *vcpu)
3956 {
3957         unsigned long flags;
3958         struct vcpu_vmx *vmx = to_vmx(vcpu);
3959
3960         /*
3961          * Update the real mode segment cache. It may not be up to date if a
3962          * segment register was written while the vcpu was in guest mode.
3963          */
3964         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3965         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3966         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3967         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3968         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3969         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3970
3971         vmx->rmode.vm86_active = 0;
3972
3973         vmx_segment_cache_clear(vmx);
3974
3975         vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3976
3977         flags = vmcs_readl(GUEST_RFLAGS);
3978         flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
3979         flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
3980         vmcs_writel(GUEST_RFLAGS, flags);
3981
3982         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
3983                         (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
3984
3985         update_exception_bitmap(vcpu);
3986
3987         fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3988         fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3989         fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3990         fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3991         fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3992         fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3993 }
3994
3995 static void fix_rmode_seg(int seg, struct kvm_segment *save)
3996 {
3997         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3998         struct kvm_segment var = *save;
3999
4000         var.dpl = 0x3;
4001         if (seg == VCPU_SREG_CS)
4002                 var.type = 0x3;
4003
4004         if (!emulate_invalid_guest_state) {
4005                 var.selector = var.base >> 4;
4006                 var.base = var.base & 0xffff0;
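		/*
		 * Example: a base of 0x12345 becomes selector 0x1234 with
		 * base 0x12340, so the low nibble of a base that is not
		 * paragraph aligned is lost, hence the warning below.
		 */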
4007                 var.limit = 0xffff;
4008                 var.g = 0;
4009                 var.db = 0;
4010                 var.present = 1;
4011                 var.s = 1;
4012                 var.l = 0;
4013                 var.unusable = 0;
4014                 var.type = 0x3;
4015                 var.avl = 0;
4016                 if (save->base & 0xf)
4017                         printk_once(KERN_WARNING "kvm: segment base is not "
4018                                         "paragraph aligned when entering "
4019                                         "protected mode (seg=%d)\n", seg);
4020         }
4021
4022         vmcs_write16(sf->selector, var.selector);
4023         vmcs_writel(sf->base, var.base);
4024         vmcs_write32(sf->limit, var.limit);
4025         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
4026 }
4027
4028 static void enter_rmode(struct kvm_vcpu *vcpu)
4029 {
4030         unsigned long flags;
4031         struct vcpu_vmx *vmx = to_vmx(vcpu);
4032
4033         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
4034         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
4035         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
4036         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
4037         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
4038         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
4039         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
4040
4041         vmx->rmode.vm86_active = 1;
4042
4043         /*
4044          * Very old userspace does not call KVM_SET_TSS_ADDR before entering
4045          * the vcpu. Warn the user that an update is overdue.
4046          */
4047         if (!vcpu->kvm->arch.tss_addr)
4048                 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR needs to be "
4049                              "called before entering the vcpu\n");
4050
4051         vmx_segment_cache_clear(vmx);
4052
4053         vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr);
4054         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
4055         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
4056
4057         flags = vmcs_readl(GUEST_RFLAGS);
4058         vmx->rmode.save_rflags = flags;
4059
4060         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
4061
4062         vmcs_writel(GUEST_RFLAGS, flags);
4063         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
4064         update_exception_bitmap(vcpu);
4065
4066         fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
4067         fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
4068         fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
4069         fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
4070         fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
4071         fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
4072
4073         kvm_mmu_reset_context(vcpu);
4074 }
4075
4076 static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
4077 {
4078         struct vcpu_vmx *vmx = to_vmx(vcpu);
4079         struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
4080
4081         if (!msr)
4082                 return;
4083
4084         /*
4085          * Force kernel_gs_base reloading before EFER changes, as control
4086          * of this msr depends on is_long_mode().
4087          */
4088         vmx_load_host_state(to_vmx(vcpu));
4089         vcpu->arch.efer = efer;
4090         if (efer & EFER_LMA) {
4091                 vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
4092                 msr->data = efer;
4093         } else {
4094                 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
4095
4096                 msr->data = efer & ~EFER_LME;
4097         }
4098         setup_msrs(vmx);
4099 }
4100
4101 #ifdef CONFIG_X86_64
4102
4103 static void enter_lmode(struct kvm_vcpu *vcpu)
4104 {
4105         u32 guest_tr_ar;
4106
4107         vmx_segment_cache_clear(to_vmx(vcpu));
4108
4109         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
4110         if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
4111                 pr_debug_ratelimited("%s: tss fixup for long mode.\n",
4112                                      __func__);
4113                 vmcs_write32(GUEST_TR_AR_BYTES,
4114                              (guest_tr_ar & ~VMX_AR_TYPE_MASK)
4115                              | VMX_AR_TYPE_BUSY_64_TSS);
4116         }
4117         vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
4118 }
4119
4120 static void exit_lmode(struct kvm_vcpu *vcpu)
4121 {
4122         vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
4123         vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
4124 }
4125
4126 #endif
4127
4128 static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid)
4129 {
4130         if (enable_ept) {
4131                 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
4132                         return;
4133                 ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa));
4134         } else {
4135                 vpid_sync_context(vpid);
4136         }
4137 }
4138
4139 static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
4140 {
4141         __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid);
4142 }
4143
4144 static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu)
4145 {
4146         if (enable_ept)
4147                 vmx_flush_tlb(vcpu);
4148 }
4149
4150 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
4151 {
4152         ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
4153
4154         vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
4155         vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
4156 }
4157
4158 static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
4159 {
4160         if (enable_ept && is_paging(vcpu))
4161                 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
4162         __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
4163 }
4164
4165 static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
4166 {
4167         ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
4168
4169         vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
4170         vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
4171 }
4172
4173 static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
4174 {
4175         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
4176
4177         if (!test_bit(VCPU_EXREG_PDPTR,
4178                       (unsigned long *)&vcpu->arch.regs_dirty))
4179                 return;
4180
4181         if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
4182                 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
4183                 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
4184                 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
4185                 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
4186         }
4187 }
4188
4189 static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
4190 {
4191         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
4192
4193         if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
4194                 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
4195                 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
4196                 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
4197                 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
4198         }
4199
4200         __set_bit(VCPU_EXREG_PDPTR,
4201                   (unsigned long *)&vcpu->arch.regs_avail);
4202         __set_bit(VCPU_EXREG_PDPTR,
4203                   (unsigned long *)&vcpu->arch.regs_dirty);
4204 }
4205
4206 static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
4207 {
4208         u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0;
4209         u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1;
4210         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4211
4212         if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high &
4213                 SECONDARY_EXEC_UNRESTRICTED_GUEST &&
4214             nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
4215                 fixed0 &= ~(X86_CR0_PE | X86_CR0_PG);
4216
4217         return fixed_bits_valid(val, fixed0, fixed1);
4218 }
4219
4220 static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
4221 {
4222         u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0;
4223         u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1;
4224
4225         return fixed_bits_valid(val, fixed0, fixed1);
4226 }
4227
4228 static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
4229 {
4230         u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed0;
4231         u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed1;
4232
4233         return fixed_bits_valid(val, fixed0, fixed1);
4234 }
4235
4236 /* No difference in the restrictions on guest and host CR4 in VMX operation. */
4237 #define nested_guest_cr4_valid  nested_cr4_valid
4238 #define nested_host_cr4_valid   nested_cr4_valid
4239
4240 static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
4241
4242 static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
4243                                         unsigned long cr0,
4244                                         struct kvm_vcpu *vcpu)
4245 {
4246         if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
4247                 vmx_decache_cr3(vcpu);
4248         if (!(cr0 & X86_CR0_PG)) {
4249                 /* From paging/starting to nonpaging */
4250                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
4251                              vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
4252                              (CPU_BASED_CR3_LOAD_EXITING |
4253                               CPU_BASED_CR3_STORE_EXITING));
4254                 vcpu->arch.cr0 = cr0;
4255                 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
4256         } else if (!is_paging(vcpu)) {
4257                 /* From nonpaging to paging */
4258                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
4259                              vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
4260                              ~(CPU_BASED_CR3_LOAD_EXITING |
4261                                CPU_BASED_CR3_STORE_EXITING));
4262                 vcpu->arch.cr0 = cr0;
4263                 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
4264         }
4265
4266         if (!(cr0 & X86_CR0_WP))
4267                 *hw_cr0 &= ~X86_CR0_WP;
4268 }
4269
4270 static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
4271 {
4272         struct vcpu_vmx *vmx = to_vmx(vcpu);
4273         unsigned long hw_cr0;
4274
4275         hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK);
4276         if (enable_unrestricted_guest)
4277                 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
4278         else {
4279                 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
4280
4281                 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
4282                         enter_pmode(vcpu);
4283
4284                 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
4285                         enter_rmode(vcpu);
4286         }
4287
4288 #ifdef CONFIG_X86_64
4289         if (vcpu->arch.efer & EFER_LME) {
4290                 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
4291                         enter_lmode(vcpu);
4292                 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
4293                         exit_lmode(vcpu);
4294         }
4295 #endif
4296
4297         if (enable_ept)
4298                 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
4299
4300         vmcs_writel(CR0_READ_SHADOW, cr0);
4301         vmcs_writel(GUEST_CR0, hw_cr0);
4302         vcpu->arch.cr0 = cr0;
4303
4304         /* depends on vcpu->arch.cr0 to be set to a new value */
4305         vmx->emulation_required = emulation_required(vcpu);
4306 }
4307
4308 static int get_ept_level(struct kvm_vcpu *vcpu)
4309 {
4310         if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
4311                 return 5;
4312         return 4;
4313 }
4314
4315 static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
4316 {
4317         u64 eptp = VMX_EPTP_MT_WB;
4318
4319         eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
4320
4321         if (enable_ept_ad_bits &&
4322             (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
4323                 eptp |= VMX_EPTP_AD_ENABLE_BIT;
4324         eptp |= (root_hpa & PAGE_MASK);
4325
4326         return eptp;
4327 }
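
/*
 * Illustrative worked example of the EPTP encoding built above (a sketch,
 * not part of the original source), assuming the usual field values
 * VMX_EPTP_MT_WB == 6, VMX_EPTP_PWL_4 == (3 << 3) and
 * VMX_EPTP_AD_ENABLE_BIT == (1 << 6):
 *
 *	root_hpa = 0x12345000		(page-aligned EPT root)
 *	eptp     = 0x12345000 | 0x6 | 0x18 | 0x40 = 0x1234505e
 *
 * i.e. bits 2:0 carry the memory type, bits 5:3 carry (page-walk length - 1),
 * bit 6 enables accessed/dirty flags and the upper bits carry the root HPA.
 */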
4328
4329 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
4330 {
4331         unsigned long guest_cr3;
4332         u64 eptp;
4333
4334         guest_cr3 = cr3;
4335         if (enable_ept) {
4336                 eptp = construct_eptp(vcpu, cr3);
4337                 vmcs_write64(EPT_POINTER, eptp);
4338                 if (is_paging(vcpu) || is_guest_mode(vcpu))
4339                         guest_cr3 = kvm_read_cr3(vcpu);
4340                 else
4341                         guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr;
4342                 ept_load_pdptrs(vcpu);
4343         }
4344
4345         vmx_flush_tlb(vcpu);
4346         vmcs_writel(GUEST_CR3, guest_cr3);
4347 }
4348
4349 static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
4350 {
4351         /*
4352          * Pass through host's Machine Check Enable value to hw_cr4, which
4353          * is in force while we are in guest mode.  Do not let guests control
4354          * this bit, even if host CR4.MCE == 0.
4355          */
4356         unsigned long hw_cr4 =
4357                 (cr4_read_shadow() & X86_CR4_MCE) |
4358                 (cr4 & ~X86_CR4_MCE) |
4359                 (to_vmx(vcpu)->rmode.vm86_active ?
4360                  KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
4361
4362         if (cr4 & X86_CR4_VMXE) {
4363                 /*
4364                  * To use VMXON (and later other VMX instructions), a guest
4365                  * must first be able to turn on cr4.VMXE (see handle_vmon()).
4366                  * So basically the check on whether to allow nested VMX
4367                  * is here.
4368                  */
4369                 if (!nested_vmx_allowed(vcpu))
4370                         return 1;
4371         }
4372
4373         if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
4374                 return 1;
4375
4376         vcpu->arch.cr4 = cr4;
4377         if (enable_ept) {
4378                 if (!is_paging(vcpu)) {
4379                         hw_cr4 &= ~X86_CR4_PAE;
4380                         hw_cr4 |= X86_CR4_PSE;
4381                 } else if (!(cr4 & X86_CR4_PAE)) {
4382                         hw_cr4 &= ~X86_CR4_PAE;
4383                 }
4384         }
4385
4386         if (!enable_unrestricted_guest && !is_paging(vcpu))
4387                 /*
4388                  * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
4389                  * hardware.  To emulate this behavior, SMEP/SMAP/PKU needs
4390                  * to be manually disabled when guest switches to non-paging
4391                  * mode.
4392                  *
4393                  * If !enable_unrestricted_guest, the CPU is always running
4394                  * with CR0.PG=1 and CR4 needs to be modified.
4395                  * If enable_unrestricted_guest, the CPU automatically
4396                  * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
4397                  */
4398                 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
4399
4400         vmcs_writel(CR4_READ_SHADOW, cr4);
4401         vmcs_writel(GUEST_CR4, hw_cr4);
4402         return 0;
4403 }
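
/*
 * Illustrative example of the CR4.MCE pass-through above (hypothetical
 * values, not from the original source): if the host runs with CR4.MCE set
 * and a protected-mode guest writes cr4 = X86_CR4_PAE, then
 *
 *	hw_cr4 = X86_CR4_MCE | X86_CR4_PAE | KVM_PMODE_VM_CR4_ALWAYS_ON
 *
 * The guest still reads back its own value through CR4_READ_SHADOW, while
 * the CPU keeps the host's machine-check setting in force.
 */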
4404
4405 static void vmx_get_segment(struct kvm_vcpu *vcpu,
4406                             struct kvm_segment *var, int seg)
4407 {
4408         struct vcpu_vmx *vmx = to_vmx(vcpu);
4409         u32 ar;
4410
4411         if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
4412                 *var = vmx->rmode.segs[seg];
4413                 if (seg == VCPU_SREG_TR
4414                     || var->selector == vmx_read_guest_seg_selector(vmx, seg))
4415                         return;
4416                 var->base = vmx_read_guest_seg_base(vmx, seg);
4417                 var->selector = vmx_read_guest_seg_selector(vmx, seg);
4418                 return;
4419         }
4420         var->base = vmx_read_guest_seg_base(vmx, seg);
4421         var->limit = vmx_read_guest_seg_limit(vmx, seg);
4422         var->selector = vmx_read_guest_seg_selector(vmx, seg);
4423         ar = vmx_read_guest_seg_ar(vmx, seg);
4424         var->unusable = (ar >> 16) & 1;
4425         var->type = ar & 15;
4426         var->s = (ar >> 4) & 1;
4427         var->dpl = (ar >> 5) & 3;
4428         /*
4429          * Some userspaces do not preserve the unusable property. Since a usable
4430          * segment has to be present according to the VMX spec, we can use the
4431          * present property to work around this userspace bug by making an
4432          * unusable segment always non-present. vmx_segment_access_rights()
4433          * already marks a non-present segment as unusable.
4434          */
4435         var->present = !var->unusable;
4436         var->avl = (ar >> 12) & 1;
4437         var->l = (ar >> 13) & 1;
4438         var->db = (ar >> 14) & 1;
4439         var->g = (ar >> 15) & 1;
4440 }
4441
4442 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
4443 {
4444         struct kvm_segment s;
4445
4446         if (to_vmx(vcpu)->rmode.vm86_active) {
4447                 vmx_get_segment(vcpu, &s, seg);
4448                 return s.base;
4449         }
4450         return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
4451 }
4452
4453 static int vmx_get_cpl(struct kvm_vcpu *vcpu)
4454 {
4455         struct vcpu_vmx *vmx = to_vmx(vcpu);
4456
4457         if (unlikely(vmx->rmode.vm86_active))
4458                 return 0;
4459         else {
4460                 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
4461                 return VMX_AR_DPL(ar);
4462         }
4463 }
4464
4465 static u32 vmx_segment_access_rights(struct kvm_segment *var)
4466 {
4467         u32 ar;
4468
4469         if (var->unusable || !var->present)
4470                 ar = 1 << 16;
4471         else {
4472                 ar = var->type & 15;
4473                 ar |= (var->s & 1) << 4;
4474                 ar |= (var->dpl & 3) << 5;
4475                 ar |= (var->present & 1) << 7;
4476                 ar |= (var->avl & 1) << 12;
4477                 ar |= (var->l & 1) << 13;
4478                 ar |= (var->db & 1) << 14;
4479                 ar |= (var->g & 1) << 15;
4480         }
4481
4482         return ar;
4483 }
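
/*
 * Illustrative example of the AR packing above (not from the original
 * source): a flat 32-bit code segment with type = 0xb (execute/read,
 * accessed), s = 1, dpl = 0, present = 1, avl = 0, l = 0, db = 1, g = 1
 * packs as
 *
 *	ar = 0xb | (1 << 4) | (1 << 7) | (1 << 14) | (1 << 15) = 0xc09b
 *
 * which is the value written to the guest segment's AR bytes by
 * vmx_set_segment().
 */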
4484
4485 static void vmx_set_segment(struct kvm_vcpu *vcpu,
4486                             struct kvm_segment *var, int seg)
4487 {
4488         struct vcpu_vmx *vmx = to_vmx(vcpu);
4489         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
4490
4491         vmx_segment_cache_clear(vmx);
4492
4493         if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
4494                 vmx->rmode.segs[seg] = *var;
4495                 if (seg == VCPU_SREG_TR)
4496                         vmcs_write16(sf->selector, var->selector);
4497                 else if (var->s)
4498                         fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
4499                 goto out;
4500         }
4501
4502         vmcs_writel(sf->base, var->base);
4503         vmcs_write32(sf->limit, var->limit);
4504         vmcs_write16(sf->selector, var->selector);
4505
4506         /*
4507          *   Fix the "Accessed" bit in the AR field of segment registers for
4508          * older qemu binaries.
4509          *   The IA-32 architecture specifies that, at processor reset, the
4510          * "Accessed" bit in the AR field of segment registers is 1, while qemu
4511          * sets it to 0 in its userland code. This causes an invalid-guest-state
4512          * vmexit when "unrestricted guest" mode is turned on.
4513          *   A fix for this setup issue in cpu_reset is being pushed into the
4514          * qemu tree. Newer qemu binaries with that fix will not need this
4515          * kvm hack.
4516          */
4517         if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
4518                 var->type |= 0x1; /* Accessed */
4519
4520         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
4521
4522 out:
4523         vmx->emulation_required = emulation_required(vcpu);
4524 }
4525
4526 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
4527 {
4528         u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
4529
4530         *db = (ar >> 14) & 1;
4531         *l = (ar >> 13) & 1;
4532 }
4533
4534 static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
4535 {
4536         dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
4537         dt->address = vmcs_readl(GUEST_IDTR_BASE);
4538 }
4539
4540 static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
4541 {
4542         vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
4543         vmcs_writel(GUEST_IDTR_BASE, dt->address);
4544 }
4545
4546 static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
4547 {
4548         dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
4549         dt->address = vmcs_readl(GUEST_GDTR_BASE);
4550 }
4551
4552 static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
4553 {
4554         vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
4555         vmcs_writel(GUEST_GDTR_BASE, dt->address);
4556 }
4557
4558 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
4559 {
4560         struct kvm_segment var;
4561         u32 ar;
4562
4563         vmx_get_segment(vcpu, &var, seg);
4564         var.dpl = 0x3;
4565         if (seg == VCPU_SREG_CS)
4566                 var.type = 0x3;
4567         ar = vmx_segment_access_rights(&var);
4568
4569         if (var.base != (var.selector << 4))
4570                 return false;
4571         if (var.limit != 0xffff)
4572                 return false;
4573         if (ar != 0xf3)
4574                 return false;
4575
4576         return true;
4577 }
4578
4579 static bool code_segment_valid(struct kvm_vcpu *vcpu)
4580 {
4581         struct kvm_segment cs;
4582         unsigned int cs_rpl;
4583
4584         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
4585         cs_rpl = cs.selector & SEGMENT_RPL_MASK;
4586
4587         if (cs.unusable)
4588                 return false;
4589         if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
4590                 return false;
4591         if (!cs.s)
4592                 return false;
4593         if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
4594                 if (cs.dpl > cs_rpl)
4595                         return false;
4596         } else {
4597                 if (cs.dpl != cs_rpl)
4598                         return false;
4599         }
4600         if (!cs.present)
4601                 return false;
4602
4603         /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
4604         return true;
4605 }
4606
4607 static bool stack_segment_valid(struct kvm_vcpu *vcpu)
4608 {
4609         struct kvm_segment ss;
4610         unsigned int ss_rpl;
4611
4612         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
4613         ss_rpl = ss.selector & SEGMENT_RPL_MASK;
4614
4615         if (ss.unusable)
4616                 return true;
4617         if (ss.type != 3 && ss.type != 7)
4618                 return false;
4619         if (!ss.s)
4620                 return false;
4621         if (ss.dpl != ss_rpl) /* DPL != RPL */
4622                 return false;
4623         if (!ss.present)
4624                 return false;
4625
4626         return true;
4627 }
4628
4629 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
4630 {
4631         struct kvm_segment var;
4632         unsigned int rpl;
4633
4634         vmx_get_segment(vcpu, &var, seg);
4635         rpl = var.selector & SEGMENT_RPL_MASK;
4636
4637         if (var.unusable)
4638                 return true;
4639         if (!var.s)
4640                 return false;
4641         if (!var.present)
4642                 return false;
4643         if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
4644                 if (var.dpl < rpl) /* DPL < RPL */
4645                         return false;
4646         }
4647
4648         /* TODO: Add other members to kvm_segment_field to allow checking for other access
4649          * rights flags
4650          */
4651         return true;
4652 }
4653
4654 static bool tr_valid(struct kvm_vcpu *vcpu)
4655 {
4656         struct kvm_segment tr;
4657
4658         vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
4659
4660         if (tr.unusable)
4661                 return false;
4662         if (tr.selector & SEGMENT_TI_MASK)      /* TI = 1 */
4663                 return false;
4664         if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
4665                 return false;
4666         if (!tr.present)
4667                 return false;
4668
4669         return true;
4670 }
4671
4672 static bool ldtr_valid(struct kvm_vcpu *vcpu)
4673 {
4674         struct kvm_segment ldtr;
4675
4676         vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
4677
4678         if (ldtr.unusable)
4679                 return true;
4680         if (ldtr.selector & SEGMENT_TI_MASK)    /* TI = 1 */
4681                 return false;
4682         if (ldtr.type != 2)
4683                 return false;
4684         if (!ldtr.present)
4685                 return false;
4686
4687         return true;
4688 }
4689
4690 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
4691 {
4692         struct kvm_segment cs, ss;
4693
4694         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
4695         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
4696
4697         return ((cs.selector & SEGMENT_RPL_MASK) ==
4698                  (ss.selector & SEGMENT_RPL_MASK));
4699 }
4700
4701 /*
4702  * Check if guest state is valid. Returns true if valid, false if
4703  * not.
4704  * We assume that registers are always usable.
4705  */
4706 static bool guest_state_valid(struct kvm_vcpu *vcpu)
4707 {
4708         if (enable_unrestricted_guest)
4709                 return true;
4710
4711         /* real mode guest state checks */
4712         if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
4713                 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
4714                         return false;
4715                 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
4716                         return false;
4717                 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
4718                         return false;
4719                 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
4720                         return false;
4721                 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
4722                         return false;
4723                 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
4724                         return false;
4725         } else {
4726         /* protected mode guest state checks */
4727                 if (!cs_ss_rpl_check(vcpu))
4728                         return false;
4729                 if (!code_segment_valid(vcpu))
4730                         return false;
4731                 if (!stack_segment_valid(vcpu))
4732                         return false;
4733                 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
4734                         return false;
4735                 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
4736                         return false;
4737                 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
4738                         return false;
4739                 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
4740                         return false;
4741                 if (!tr_valid(vcpu))
4742                         return false;
4743                 if (!ldtr_valid(vcpu))
4744                         return false;
4745         }
4746         /* TODO:
4747          * - Add checks on RIP
4748          * - Add checks on RFLAGS
4749          */
4750
4751         return true;
4752 }
4753
4754 static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
4755 {
4756         return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
4757 }
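
/*
 * Example for the check above (illustrative, hypothetical numbers): with
 * cpuid_maxphyaddr() == 36, gpa 0xfedc0000 is accepted (page aligned and
 * below 1 << 36), while gpa 0x1000000000 (bit 36 set) or 0xfedc0123 (not
 * page aligned) is rejected.
 */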
4758
4759 static int init_rmode_tss(struct kvm *kvm)
4760 {
4761         gfn_t fn;
4762         u16 data = 0;
4763         int idx, r;
4764
4765         idx = srcu_read_lock(&kvm->srcu);
4766         fn = kvm->arch.tss_addr >> PAGE_SHIFT;
4767         r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
4768         if (r < 0)
4769                 goto out;
4770         data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
4771         r = kvm_write_guest_page(kvm, fn++, &data,
4772                         TSS_IOPB_BASE_OFFSET, sizeof(u16));
4773         if (r < 0)
4774                 goto out;
4775         r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
4776         if (r < 0)
4777                 goto out;
4778         r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
4779         if (r < 0)
4780                 goto out;
4781         data = ~0;
4782         r = kvm_write_guest_page(kvm, fn, &data,
4783                                  RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
4784                                  sizeof(u8));
4785 out:
4786         srcu_read_unlock(&kvm->srcu, idx);
4787         return r;
4788 }
4789
4790 static int init_rmode_identity_map(struct kvm *kvm)
4791 {
4792         int i, idx, r = 0;
4793         kvm_pfn_t identity_map_pfn;
4794         u32 tmp;
4795
4796         /* Protect kvm->arch.ept_identity_pagetable_done. */
4797         mutex_lock(&kvm->slots_lock);
4798
4799         if (likely(kvm->arch.ept_identity_pagetable_done))
4800                 goto out2;
4801
4802         if (!kvm->arch.ept_identity_map_addr)
4803                 kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
4804         identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
4805
4806         r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
4807                                     kvm->arch.ept_identity_map_addr, PAGE_SIZE);
4808         if (r < 0)
4809                 goto out2;
4810
4811         idx = srcu_read_lock(&kvm->srcu);
4812         r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
4813         if (r < 0)
4814                 goto out;
4815         /* Set up identity-mapping pagetable for EPT in real mode */
4816         for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
4817                 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
4818                         _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
4819                 r = kvm_write_guest_page(kvm, identity_map_pfn,
4820                                 &tmp, i * sizeof(tmp), sizeof(tmp));
4821                 if (r < 0)
4822                         goto out;
4823         }
4824         kvm->arch.ept_identity_pagetable_done = true;
4825
4826 out:
4827         srcu_read_unlock(&kvm->srcu, idx);
4828
4829 out2:
4830         mutex_unlock(&kvm->slots_lock);
4831         return r;
4832 }
4833
4834 static void seg_setup(int seg)
4835 {
4836         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
4837         unsigned int ar;
4838
4839         vmcs_write16(sf->selector, 0);
4840         vmcs_writel(sf->base, 0);
4841         vmcs_write32(sf->limit, 0xffff);
4842         ar = 0x93;
4843         if (seg == VCPU_SREG_CS)
4844                 ar |= 0x08; /* code segment */
4845
4846         vmcs_write32(sf->ar_bytes, ar);
4847 }
4848
4849 static int alloc_apic_access_page(struct kvm *kvm)
4850 {
4851         struct page *page;
4852         int r = 0;
4853
4854         mutex_lock(&kvm->slots_lock);
4855         if (kvm->arch.apic_access_page_done)
4856                 goto out;
4857         r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
4858                                     APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
4859         if (r)
4860                 goto out;
4861
4862         page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
4863         if (is_error_page(page)) {
4864                 r = -EFAULT;
4865                 goto out;
4866         }
4867
4868         /*
4869          * Do not pin the page in memory, so that memory hot-unplug
4870          * is able to migrate it.
4871          */
4872         put_page(page);
4873         kvm->arch.apic_access_page_done = true;
4874 out:
4875         mutex_unlock(&kvm->slots_lock);
4876         return r;
4877 }
4878
4879 static int allocate_vpid(void)
4880 {
4881         int vpid;
4882
4883         if (!enable_vpid)
4884                 return 0;
4885         spin_lock(&vmx_vpid_lock);
4886         vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
4887         if (vpid < VMX_NR_VPIDS)
4888                 __set_bit(vpid, vmx_vpid_bitmap);
4889         else
4890                 vpid = 0;
4891         spin_unlock(&vmx_vpid_lock);
4892         return vpid;
4893 }
4894
4895 static void free_vpid(int vpid)
4896 {
4897         if (!enable_vpid || vpid == 0)
4898                 return;
4899         spin_lock(&vmx_vpid_lock);
4900         __clear_bit(vpid, vmx_vpid_bitmap);
4901         spin_unlock(&vmx_vpid_lock);
4902 }
4903
4904 static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
4905                                                           u32 msr, int type)
4906 {
4907         int f = sizeof(unsigned long);
4908
4909         if (!cpu_has_vmx_msr_bitmap())
4910                 return;
4911
4912         /*
4913          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
4914          * have the write-low and read-high bitmap offsets the wrong way round.
4915          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
4916          */
4917         if (msr <= 0x1fff) {
4918                 if (type & MSR_TYPE_R)
4919                         /* read-low */
4920                         __clear_bit(msr, msr_bitmap + 0x000 / f);
4921
4922                 if (type & MSR_TYPE_W)
4923                         /* write-low */
4924                         __clear_bit(msr, msr_bitmap + 0x800 / f);
4925
4926         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
4927                 msr &= 0x1fff;
4928                 if (type & MSR_TYPE_R)
4929                         /* read-high */
4930                         __clear_bit(msr, msr_bitmap + 0x400 / f);
4931
4932                 if (type & MSR_TYPE_W)
4933                         /* write-high */
4934                         __clear_bit(msr, msr_bitmap + 0xc00 / f);
4935
4936         }
4937 }
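
/*
 * Illustrative example of the bitmap layout handled above (a sketch, not
 * part of the original source): for MSR_IA32_SYSENTER_CS (0x174) the read
 * intercept is bit 0x174 of the "read-low" region at byte offset 0x000 and
 * the write intercept is bit 0x174 of the "write-low" region at 0x800; for
 * MSR_FS_BASE (0xc0000100) the index is masked down to 0x100 and the
 * intercepts live in the "read-high" (0x400) and "write-high" (0xc00)
 * regions instead.
 */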
4938
4939 static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
4940                                                          u32 msr, int type)
4941 {
4942         int f = sizeof(unsigned long);
4943
4944         if (!cpu_has_vmx_msr_bitmap())
4945                 return;
4946
4947         /*
4948          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
4949          * have the write-low and read-high bitmap offsets the wrong way round.
4950          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
4951          */
4952         if (msr <= 0x1fff) {
4953                 if (type & MSR_TYPE_R)
4954                         /* read-low */
4955                         __set_bit(msr, msr_bitmap + 0x000 / f);
4956
4957                 if (type & MSR_TYPE_W)
4958                         /* write-low */
4959                         __set_bit(msr, msr_bitmap + 0x800 / f);
4960
4961         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
4962                 msr &= 0x1fff;
4963                 if (type & MSR_TYPE_R)
4964                         /* read-high */
4965                         __set_bit(msr, msr_bitmap + 0x400 / f);
4966
4967                 if (type & MSR_TYPE_W)
4968                         /* write-high */
4969                         __set_bit(msr, msr_bitmap + 0xc00 / f);
4970
4971         }
4972 }
4973
4974 static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
4975                                                       u32 msr, int type, bool value)
4976 {
4977         if (value)
4978                 vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
4979         else
4980                 vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
4981 }
4982
4983 /*
4984  * If an MSR is allowed by L0, we should check whether it is also allowed
4985  * by L1. The corresponding bit is cleared only if both L0 and L1 allow it.
4986  */
4987 static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
4988                                                unsigned long *msr_bitmap_nested,
4989                                                u32 msr, int type)
4990 {
4991         int f = sizeof(unsigned long);
4992
4993         if (!cpu_has_vmx_msr_bitmap()) {
4994                 WARN_ON(1);
4995                 return;
4996         }
4997
4998         /*
4999          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
5000          * have the write-low and read-high bitmap offsets the wrong way round.
5001          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
5002          */
5003         if (msr <= 0x1fff) {
5004                 if (type & MSR_TYPE_R &&
5005                    !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
5006                         /* read-low */
5007                         __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
5008
5009                 if (type & MSR_TYPE_W &&
5010                    !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
5011                         /* write-low */
5012                         __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
5013
5014         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
5015                 msr &= 0x1fff;
5016                 if (type & MSR_TYPE_R &&
5017                    !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
5018                         /* read-high */
5019                         __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
5020
5021                 if (type & MSR_TYPE_W &&
5022                    !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
5023                         /* write-high */
5024                         __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
5025
5026         }
5027 }
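
/*
 * Illustrative example (not from the original source): the caller only
 * invokes this for MSRs that L0 itself is willing to pass through.  If L1's
 * bitmap also has, say, the read-low bit for MSR_IA32_SYSENTER_CS (0x174)
 * clear, that bit is cleared in msr_bitmap_nested and L2 reads the MSR
 * without exiting; if L1 intercepts it, the bit stays set and the access
 * exits to L0, which can then forward the exit to L1.
 */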
5028
5029 static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
5030 {
5031         u8 mode = 0;
5032
5033         if (cpu_has_secondary_exec_ctrls() &&
5034             (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
5035              SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
5036                 mode |= MSR_BITMAP_MODE_X2APIC;
5037                 if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
5038                         mode |= MSR_BITMAP_MODE_X2APIC_APICV;
5039         }
5040
5041         if (is_long_mode(vcpu))
5042                 mode |= MSR_BITMAP_MODE_LM;
5043
5044         return mode;
5045 }
5046
5047 #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
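
/*
 * Example (illustrative, assuming APIC_BASE_MSR == 0x800): APIC_TASKPRI is
 * MMIO offset 0x80, so X2APIC_MSR(APIC_TASKPRI) == 0x800 + (0x80 >> 4) ==
 * 0x808, the x2APIC TPR MSR manipulated below.
 */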
5048
5049 static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
5050                                          u8 mode)
5051 {
5052         int msr;
5053
5054         for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
5055                 unsigned word = msr / BITS_PER_LONG;
5056                 msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
5057                 msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
5058         }
5059
5060         if (mode & MSR_BITMAP_MODE_X2APIC) {
5061                 /*
5062                  * TPR reads and writes can be virtualized even if virtual interrupt
5063                  * delivery is not in use.
5064                  */
5065                 vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
5066                 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
5067                         vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
5068                         vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
5069                         vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
5070                 }
5071         }
5072 }
5073
5074 static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
5075 {
5076         struct vcpu_vmx *vmx = to_vmx(vcpu);
5077         unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
5078         u8 mode = vmx_msr_bitmap_mode(vcpu);
5079         u8 changed = mode ^ vmx->msr_bitmap_mode;
5080
5081         if (!changed)
5082                 return;
5083
5084         vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW,
5085                                   !(mode & MSR_BITMAP_MODE_LM));
5086
5087         if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
5088                 vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
5089
5090         vmx->msr_bitmap_mode = mode;
5091 }
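
/*
 * Summary note (illustrative, not from the original source): the bitmap
 * updated here is vmx->vmcs01.msr_bitmap, i.e. each vCPU has its own copy,
 * so toggling MSR_KERNEL_GS_BASE interception when one vCPU changes
 * long-mode state does not affect the filtering applied to other vCPUs.
 */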
5092
5093 static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
5094 {
5095         return enable_apicv;
5096 }
5097
5098 static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
5099 {
5100         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5101         gfn_t gfn;
5102
5103         /*
5104          * Don't need to mark the APIC access page dirty; it is never
5105          * written to by the CPU during APIC virtualization.
5106          */
5107
5108         if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
5109                 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
5110                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
5111         }
5112
5113         if (nested_cpu_has_posted_intr(vmcs12)) {
5114                 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
5115                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
5116         }
5117 }
5118
5119
5120 static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
5121 {
5122         struct vcpu_vmx *vmx = to_vmx(vcpu);
5123         int max_irr;
5124         void *vapic_page;
5125         u16 status;
5126
5127         if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
5128                 return;
5129
5130         vmx->nested.pi_pending = false;
5131         if (!pi_test_and_clear_on(vmx->nested.pi_desc))
5132                 return;
5133
5134         max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
5135         if (max_irr != 256) {
5136                 vapic_page = kmap(vmx->nested.virtual_apic_page);
5137                 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
5138                 kunmap(vmx->nested.virtual_apic_page);
5139
5140                 status = vmcs_read16(GUEST_INTR_STATUS);
5141                 if ((u8)max_irr > ((u8)status & 0xff)) {
5142                         status &= ~0xff;
5143                         status |= (u8)max_irr;
5144                         vmcs_write16(GUEST_INTR_STATUS, status);
5145                 }
5146         }
5147
5148         nested_mark_vmcs12_pages_dirty(vcpu);
5149 }
5150
5151 static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
5152                                                      bool nested)
5153 {
5154 #ifdef CONFIG_SMP
5155         int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
5156
5157         if (vcpu->mode == IN_GUEST_MODE) {
5158                 /*
5159                  * The vector of interrupt to be delivered to vcpu had
5160                  * been set in PIR before this function.
5161                  *
5162                  * Following cases will be reached in this block, and
5163                  * we always send a notification event in all cases as
5164                  * explained below.
5165                  *
5166                  * Case 1: vcpu keeps in non-root mode. Sending a
5167                  * notification event posts the interrupt to vcpu.
5168                  *
5169                  * Case 2: vcpu exits to root mode and is still
5170                  * runnable. PIR will be synced to vIRR before the
5171                  * next vcpu entry. Sending a notification event in
5172                  * this case has no effect, as the vcpu is no longer
5173                  * running in non-root mode.
5174                  *
5175                  * Case 3: vcpu exits to root mode and is blocked.
5176                  * vcpu_block() has already synced PIR to vIRR and
5177                  * never blocks vcpu if vIRR is not cleared. Therefore,
5178                  * a blocked vcpu here does not wait for any requested
5179                  * interrupts in PIR, and sending a notification event
5180                  * which has no effect is safe here.
5181                  */
5182
5183                 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
5184                 return true;
5185         }
5186 #endif
5187         return false;
5188 }
5189
5190 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
5191                                                 int vector)
5192 {
5193         struct vcpu_vmx *vmx = to_vmx(vcpu);
5194
5195         if (is_guest_mode(vcpu) &&
5196             vector == vmx->nested.posted_intr_nv) {
5197                 /* the PIR and ON have been set by L1. */
5198                 kvm_vcpu_trigger_posted_interrupt(vcpu, true);
5199                 /*
5200                  * If a posted intr is not recognized by hardware,
5201                  * we will complete the delivery on the next vmentry.
5202                  */
5203                 vmx->nested.pi_pending = true;
5204                 kvm_make_request(KVM_REQ_EVENT, vcpu);
5205                 return 0;
5206         }
5207         return -1;
5208 }
5209 /*
5210  * Send an interrupt to a vcpu via the posted-interrupt mechanism.
5211  * 1. If the target vcpu is running (non-root mode), send a posted-interrupt
5212  * notification to the vcpu and hardware will sync PIR to vIRR atomically.
5213  * 2. If the target vcpu isn't running (root mode), kick it to pick up the
5214  * interrupt from PIR on the next vmentry.
5215  */
5216 static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
5217 {
5218         struct vcpu_vmx *vmx = to_vmx(vcpu);
5219         int r;
5220
5221         r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
5222         if (!r)
5223                 return;
5224
5225         if (pi_test_and_set_pir(vector, &vmx->pi_desc))
5226                 return;
5227
5228         /* If a previous notification has sent the IPI, nothing to do.  */
5229         if (pi_test_and_set_on(&vmx->pi_desc))
5230                 return;
5231
5232         if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
5233                 kvm_vcpu_kick(vcpu);
5234 }
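
/*
 * Illustrative walk-through (hypothetical vector, not from the original
 * source): to deliver vector 0x31 to a vCPU currently in non-root mode, the
 * bit for 0x31 is set in the PIR, ON is set, and a POSTED_INTR_VECTOR IPI
 * is sent so the CPU merges PIR into vIRR without a VM exit.  If the vCPU
 * is not in guest mode, the IPI is skipped and kvm_vcpu_kick() wakes it so
 * the pending interrupt is picked up on the next vmentry.
 */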
5235
5236 /*
5237  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
5238  * will not change in the lifetime of the guest.
5239  * Note that host-state that does change is set elsewhere. E.g., host-state
5240  * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
5241  */
5242 static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
5243 {
5244         u32 low32, high32;
5245         unsigned long tmpl;
5246         struct desc_ptr dt;
5247         unsigned long cr0, cr3, cr4;
5248
5249         cr0 = read_cr0();
5250         WARN_ON(cr0 & X86_CR0_TS);
5251         vmcs_writel(HOST_CR0, cr0);  /* 22.2.3 */
5252
5253         /*
5254          * Save the most likely value for this task's CR3 in the VMCS.
5255          * We can't use __get_current_cr3_fast() because we're not atomic.
5256          */
5257         cr3 = __read_cr3();
5258         vmcs_writel(HOST_CR3, cr3);             /* 22.2.3  FIXME: shadow tables */
5259         vmx->loaded_vmcs->vmcs_host_cr3 = cr3;
5260
5261         /* Save the most likely value for this task's CR4 in the VMCS. */
5262         cr4 = cr4_read_shadow();
5263         vmcs_writel(HOST_CR4, cr4);                     /* 22.2.3, 22.2.5 */
5264         vmx->loaded_vmcs->vmcs_host_cr4 = cr4;
5265
5266         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
5267 #ifdef CONFIG_X86_64
5268         /*
5269          * Load null selectors, so we can avoid reloading them in
5270          * __vmx_load_host_state(), in case userspace uses the null selectors
5271          * too (the expected case).
5272          */
5273         vmcs_write16(HOST_DS_SELECTOR, 0);
5274         vmcs_write16(HOST_ES_SELECTOR, 0);
5275 #else
5276         vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
5277         vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
5278 #endif
5279         vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
5280         vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
5281
5282         store_idt(&dt);
5283         vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
5284         vmx->host_idt_base = dt.address;
5285
5286         vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */
5287
5288         rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
5289         vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
5290         rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
5291         vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
5292
5293         if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
5294                 rdmsr(MSR_IA32_CR_PAT, low32, high32);
5295                 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
5296         }
5297 }
5298
5299 static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
5300 {
5301         vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
5302         if (enable_ept)
5303                 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
5304         if (is_guest_mode(&vmx->vcpu))
5305                 vmx->vcpu.arch.cr4_guest_owned_bits &=
5306                         ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
5307         vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
5308 }
5309
5310 static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
5311 {
5312         u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
5313
5314         if (!kvm_vcpu_apicv_active(&vmx->vcpu))
5315                 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
5316
5317         if (!enable_vnmi)
5318                 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
5319
5320         /* Enable the preemption timer dynamically */
5321         pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
5322         return pin_based_exec_ctrl;
5323 }
5324
5325 static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
5326 {
5327         struct vcpu_vmx *vmx = to_vmx(vcpu);
5328
5329         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
5330         if (cpu_has_secondary_exec_ctrls()) {
5331                 if (kvm_vcpu_apicv_active(vcpu))
5332                         vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
5333                                       SECONDARY_EXEC_APIC_REGISTER_VIRT |
5334                                       SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
5335                 else
5336                         vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
5337                                         SECONDARY_EXEC_APIC_REGISTER_VIRT |
5338                                         SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
5339         }
5340
5341         if (cpu_has_vmx_msr_bitmap())
5342                 vmx_update_msr_bitmap(vcpu);
5343 }
5344
5345 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
5346 {
5347         u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
5348
5349         if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
5350                 exec_control &= ~CPU_BASED_MOV_DR_EXITING;
5351
5352         if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
5353                 exec_control &= ~CPU_BASED_TPR_SHADOW;
5354 #ifdef CONFIG_X86_64
5355                 exec_control |= CPU_BASED_CR8_STORE_EXITING |
5356                                 CPU_BASED_CR8_LOAD_EXITING;
5357 #endif
5358         }
5359         if (!enable_ept)
5360                 exec_control |= CPU_BASED_CR3_STORE_EXITING |
5361                                 CPU_BASED_CR3_LOAD_EXITING  |
5362                                 CPU_BASED_INVLPG_EXITING;
5363         return exec_control;
5364 }
5365
5366 static bool vmx_rdrand_supported(void)
5367 {
5368         return vmcs_config.cpu_based_2nd_exec_ctrl &
5369                 SECONDARY_EXEC_RDRAND_EXITING;
5370 }
5371
5372 static bool vmx_rdseed_supported(void)
5373 {
5374         return vmcs_config.cpu_based_2nd_exec_ctrl &
5375                 SECONDARY_EXEC_RDSEED_EXITING;
5376 }
5377
5378 static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
5379 {
5380         struct kvm_vcpu *vcpu = &vmx->vcpu;
5381
5382         u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
5383         if (!cpu_need_virtualize_apic_accesses(vcpu))
5384                 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
5385         if (vmx->vpid == 0)
5386                 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
5387         if (!enable_ept) {
5388                 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
5389                 enable_unrestricted_guest = 0;
5390                 /* Enabling INVPCID for non-EPT guests may cause a performance regression. */
5391                 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
5392         }
5393         if (!enable_unrestricted_guest)
5394                 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
5395         if (!ple_gap)
5396                 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
5397         if (!kvm_vcpu_apicv_active(vcpu))
5398                 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
5399                                   SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
5400         exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
5401         /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
5402            (handle_vmptrld).
5403            We can NOT enable shadow_vmcs here because we don't yet have
5404            a current VMCS12.
5405         */
5406         exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
5407
5408         if (!enable_pml)
5409                 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
5410
5411         if (vmx_xsaves_supported()) {
5412                 /* Exposing XSAVES only when XSAVE is exposed */
5413                 bool xsaves_enabled =
5414                         guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
5415                         guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
5416
5417                 if (!xsaves_enabled)
5418                         exec_control &= ~SECONDARY_EXEC_XSAVES;
5419
5420                 if (nested) {
5421                         if (xsaves_enabled)
5422                                 vmx->nested.nested_vmx_secondary_ctls_high |=
5423                                         SECONDARY_EXEC_XSAVES;
5424                         else
5425                                 vmx->nested.nested_vmx_secondary_ctls_high &=
5426                                         ~SECONDARY_EXEC_XSAVES;
5427                 }
5428         }
5429
5430         if (vmx_rdtscp_supported()) {
5431                 bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
5432                 if (!rdtscp_enabled)
5433                         exec_control &= ~SECONDARY_EXEC_RDTSCP;
5434
5435                 if (nested) {
5436                         if (rdtscp_enabled)
5437                                 vmx->nested.nested_vmx_secondary_ctls_high |=
5438                                         SECONDARY_EXEC_RDTSCP;
5439                         else
5440                                 vmx->nested.nested_vmx_secondary_ctls_high &=
5441                                         ~SECONDARY_EXEC_RDTSCP;
5442                 }
5443         }
5444
5445         if (vmx_invpcid_supported()) {
5446                 /* Exposing INVPCID only when PCID is exposed */
5447                 bool invpcid_enabled =
5448                         guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
5449                         guest_cpuid_has(vcpu, X86_FEATURE_PCID);
5450
5451                 if (!invpcid_enabled) {
5452                         exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
5453                         guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
5454                 }
5455
5456                 if (nested) {
5457                         if (invpcid_enabled)
5458                                 vmx->nested.nested_vmx_secondary_ctls_high |=
5459                                         SECONDARY_EXEC_ENABLE_INVPCID;
5460                         else
5461                                 vmx->nested.nested_vmx_secondary_ctls_high &=
5462                                         ~SECONDARY_EXEC_ENABLE_INVPCID;
5463                 }
5464         }
5465
5466         if (vmx_rdrand_supported()) {
5467                 bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
5468                 if (rdrand_enabled)
5469                         exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
5470
5471                 if (nested) {
5472                         if (rdrand_enabled)
5473                                 vmx->nested.nested_vmx_secondary_ctls_high |=
5474                                         SECONDARY_EXEC_RDRAND_EXITING;
5475                         else
5476                                 vmx->nested.nested_vmx_secondary_ctls_high &=
5477                                         ~SECONDARY_EXEC_RDRAND_EXITING;
5478                 }
5479         }
5480
5481         if (vmx_rdseed_supported()) {
5482                 bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
5483                 if (rdseed_enabled)
5484                         exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
5485
5486                 if (nested) {
5487                         if (rdseed_enabled)
5488                                 vmx->nested.nested_vmx_secondary_ctls_high |=
5489                                         SECONDARY_EXEC_RDSEED_EXITING;
5490                         else
5491                                 vmx->nested.nested_vmx_secondary_ctls_high &=
5492                                         ~SECONDARY_EXEC_RDSEED_EXITING;
5493                 }
5494         }
5495
5496         vmx->secondary_exec_control = exec_control;
5497 }
5498
5499 static void ept_set_mmio_spte_mask(void)
5500 {
5501         /*
5502          * EPT Misconfigurations can be generated if the value of bits 2:0
5503          * of an EPT paging-structure entry is 110b (write/execute).
5504          */
5505         kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
5506                                    VMX_EPT_MISCONFIG_WX_VALUE);
5507 }
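
/*
 * Illustrative example (not from the original source): an EPT entry whose
 * low three bits are 110b is writable and executable but not readable,
 * which the architecture treats as an EPT misconfiguration.  Installing
 * exactly this pattern (VMX_EPT_MISCONFIG_WX_VALUE) in MMIO sptes makes
 * guest accesses to emulated MMIO take the fast EPT_MISCONFIG exit rather
 * than an ordinary EPT violation.
 */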
5508
5509 #define VMX_XSS_EXIT_BITMAP 0
5510 /*
5511  * Sets up the vmcs for emulated real mode.
5512  */
5513 static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
5514 {
5515 #ifdef CONFIG_X86_64
5516         unsigned long a;
5517 #endif
5518         int i;
5519
5520         /* I/O */
5521         vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
5522         vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
5523
5524         if (enable_shadow_vmcs) {
5525                 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
5526                 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
5527         }
5528         if (cpu_has_vmx_msr_bitmap())
5529                 vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
5530
5531         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
5532
5533         /* Control */
5534         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
5535         vmx->hv_deadline_tsc = -1;
5536
5537         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
5538
5539         if (cpu_has_secondary_exec_ctrls()) {
5540                 vmx_compute_secondary_exec_control(vmx);
5541                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
5542                              vmx->secondary_exec_control);
5543         }
5544
5545         if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
5546                 vmcs_write64(EOI_EXIT_BITMAP0, 0);
5547                 vmcs_write64(EOI_EXIT_BITMAP1, 0);
5548                 vmcs_write64(EOI_EXIT_BITMAP2, 0);
5549                 vmcs_write64(EOI_EXIT_BITMAP3, 0);
5550
5551                 vmcs_write16(GUEST_INTR_STATUS, 0);
5552
5553                 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
5554                 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
5555         }
5556
5557         if (ple_gap) {
5558                 vmcs_write32(PLE_GAP, ple_gap);
5559                 vmx->ple_window = ple_window;
5560                 vmx->ple_window_dirty = true;
5561         }
5562
5563         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
5564         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
5565         vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
5566
5567         vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
5568         vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
5569         vmx_set_constant_host_state(vmx);
5570 #ifdef CONFIG_X86_64
5571         rdmsrl(MSR_FS_BASE, a);
5572         vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
5573         rdmsrl(MSR_GS_BASE, a);
5574         vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
5575 #else
5576         vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
5577         vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
5578 #endif
5579
5580         if (cpu_has_vmx_vmfunc())
5581                 vmcs_write64(VM_FUNCTION_CONTROL, 0);
5582
5583         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
5584         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
5585         vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
5586         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
5587         vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
5588
5589         if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
5590                 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
5591
5592         for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
5593                 u32 index = vmx_msr_index[i];
5594                 u32 data_low, data_high;
5595                 int j = vmx->nmsrs;
5596
5597                 if (rdmsr_safe(index, &data_low, &data_high) < 0)
5598                         continue;
5599                 if (wrmsr_safe(index, data_low, data_high) < 0)
5600                         continue;
5601                 vmx->guest_msrs[j].index = i;
5602                 vmx->guest_msrs[j].data = 0;
5603                 vmx->guest_msrs[j].mask = -1ull;
5604                 ++vmx->nmsrs;
5605         }
5606
5607
5608         vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
5609
5610         /* 22.2.1, 20.8.1 */
5611         vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);
5612
5613         vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
5614         vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
5615
5616         set_cr4_guest_host_mask(vmx);
5617
5618         if (vmx_xsaves_supported())
5619                 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
5620
5621         if (enable_pml) {
5622                 ASSERT(vmx->pml_pg);
5623                 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
5624                 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
5625         }
5626 }
5627
5628 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
5629 {
5630         struct vcpu_vmx *vmx = to_vmx(vcpu);
5631         struct msr_data apic_base_msr;
5632         u64 cr0;
5633
5634         vmx->rmode.vm86_active = 0;
5635
5636         vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
5637         kvm_set_cr8(vcpu, 0);
5638
5639         if (!init_event) {
5640                 apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
5641                                      MSR_IA32_APICBASE_ENABLE;
5642                 if (kvm_vcpu_is_reset_bsp(vcpu))
5643                         apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
5644                 apic_base_msr.host_initiated = true;
5645                 kvm_set_apic_base(vcpu, &apic_base_msr);
5646         }
5647
5648         vmx_segment_cache_clear(vmx);
5649
5650         seg_setup(VCPU_SREG_CS);
5651         vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
5652         vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
5653
5654         seg_setup(VCPU_SREG_DS);
5655         seg_setup(VCPU_SREG_ES);
5656         seg_setup(VCPU_SREG_FS);
5657         seg_setup(VCPU_SREG_GS);
5658         seg_setup(VCPU_SREG_SS);
5659
5660         vmcs_write16(GUEST_TR_SELECTOR, 0);
5661         vmcs_writel(GUEST_TR_BASE, 0);
5662         vmcs_write32(GUEST_TR_LIMIT, 0xffff);
5663         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
5664
5665         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
5666         vmcs_writel(GUEST_LDTR_BASE, 0);
5667         vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
5668         vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
5669
5670         if (!init_event) {
5671                 vmcs_write32(GUEST_SYSENTER_CS, 0);
5672                 vmcs_writel(GUEST_SYSENTER_ESP, 0);
5673                 vmcs_writel(GUEST_SYSENTER_EIP, 0);
5674                 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
5675         }
5676
5677         kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
5678         kvm_rip_write(vcpu, 0xfff0);
5679
5680         vmcs_writel(GUEST_GDTR_BASE, 0);
5681         vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
5682
5683         vmcs_writel(GUEST_IDTR_BASE, 0);
5684         vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
5685
5686         vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
5687         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
5688         vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
5689         if (kvm_mpx_supported())
5690                 vmcs_write64(GUEST_BNDCFGS, 0);
5691
5692         setup_msrs(vmx);
5693
5694         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
5695
5696         if (cpu_has_vmx_tpr_shadow() && !init_event) {
5697                 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
5698                 if (cpu_need_tpr_shadow(vcpu))
5699                         vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
5700                                      __pa(vcpu->arch.apic->regs));
5701                 vmcs_write32(TPR_THRESHOLD, 0);
5702         }
5703
5704         kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
5705
5706         if (vmx->vpid != 0)
5707                 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
5708
5709         cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
5710         vmx->vcpu.arch.cr0 = cr0;
5711         vmx_set_cr0(vcpu, cr0); /* enter rmode */
5712         vmx_set_cr4(vcpu, 0);
5713         vmx_set_efer(vcpu, 0);
5714
5715         update_exception_bitmap(vcpu);
5716
5717         vpid_sync_context(vmx->vpid);
5718 }
5719
5720 /*
5721  * In nested virtualization, check if L1 asked to exit on external interrupts.
5722  * For most existing hypervisors, this will always return true.
5723  */
5724 static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
5725 {
5726         return get_vmcs12(vcpu)->pin_based_vm_exec_control &
5727                 PIN_BASED_EXT_INTR_MASK;
5728 }
5729
5730 /*
5731  * In nested virtualization, check if L1 has set
5732  * VM_EXIT_ACK_INTR_ON_EXIT
5733  */
5734 static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
5735 {
5736         return get_vmcs12(vcpu)->vm_exit_controls &
5737                 VM_EXIT_ACK_INTR_ON_EXIT;
5738 }
5739
5740 static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
5741 {
5742         return get_vmcs12(vcpu)->pin_based_vm_exec_control &
5743                 PIN_BASED_NMI_EXITING;
5744 }
5745
5746 static void enable_irq_window(struct kvm_vcpu *vcpu)
5747 {
5748         vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
5749                       CPU_BASED_VIRTUAL_INTR_PENDING);
5750 }
5751
5752 static void enable_nmi_window(struct kvm_vcpu *vcpu)
5753 {
5754         if (!enable_vnmi ||
5755             vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
5756                 enable_irq_window(vcpu);
5757                 return;
5758         }
5759
5760         vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
5761                       CPU_BASED_VIRTUAL_NMI_PENDING);
5762 }
5763
5764 static void vmx_inject_irq(struct kvm_vcpu *vcpu)
5765 {
5766         struct vcpu_vmx *vmx = to_vmx(vcpu);
5767         uint32_t intr;
5768         int irq = vcpu->arch.interrupt.nr;
5769
5770         trace_kvm_inj_virq(irq);
5771
5772         ++vcpu->stat.irq_injections;
5773         if (vmx->rmode.vm86_active) {
5774                 int inc_eip = 0;
5775                 if (vcpu->arch.interrupt.soft)
5776                         inc_eip = vcpu->arch.event_exit_inst_len;
5777                 if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
5778                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5779                 return;
5780         }
5781         intr = irq | INTR_INFO_VALID_MASK;
5782         if (vcpu->arch.interrupt.soft) {
5783                 intr |= INTR_TYPE_SOFT_INTR;
5784                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
5785                              vmx->vcpu.arch.event_exit_inst_len);
5786         } else
5787                 intr |= INTR_TYPE_EXT_INTR;
5788         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
5789 }
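
/*
 * Illustrative sketch (editor addition, not part of the original driver):
 * the VM-entry interruption-information field written by vmx_inject_irq()
 * above is simply the vector in the low bits, an interruption type, and
 * the valid bit.  The helper name is hypothetical and only restates the
 * composition done inline in the function.
 */
static inline u32 vmx_sketch_irq_intr_info(int irq, bool soft)
{
	u32 intr = irq | INTR_INFO_VALID_MASK;

	intr |= soft ? INTR_TYPE_SOFT_INTR : INTR_TYPE_EXT_INTR;
	return intr;
}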
5790
5791 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
5792 {
5793         struct vcpu_vmx *vmx = to_vmx(vcpu);
5794
5795         if (!enable_vnmi) {
5796                 /*
5797                  * Tracking the NMI-blocked state in software is built upon
5798                  * finding the next open IRQ window. This, in turn, depends on
5799                  * well-behaving guests: They have to keep IRQs disabled at
5800                  * least as long as the NMI handler runs. Otherwise we may
5801                  * cause NMI nesting, maybe breaking the guest. But as this is
5802                  * highly unlikely, we can live with the residual risk.
5803                  */
5804                 vmx->loaded_vmcs->soft_vnmi_blocked = 1;
5805                 vmx->loaded_vmcs->vnmi_blocked_time = 0;
5806         }
5807
5808         ++vcpu->stat.nmi_injections;
5809         vmx->loaded_vmcs->nmi_known_unmasked = false;
5810
5811         if (vmx->rmode.vm86_active) {
5812                 if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
5813                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5814                 return;
5815         }
5816
5817         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
5818                         INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
5819 }
5820
5821 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
5822 {
5823         struct vcpu_vmx *vmx = to_vmx(vcpu);
5824         bool masked;
5825
5826         if (!enable_vnmi)
5827                 return vmx->loaded_vmcs->soft_vnmi_blocked;
5828         if (vmx->loaded_vmcs->nmi_known_unmasked)
5829                 return false;
5830         masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
5831         vmx->loaded_vmcs->nmi_known_unmasked = !masked;
5832         return masked;
5833 }
5834
5835 static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
5836 {
5837         struct vcpu_vmx *vmx = to_vmx(vcpu);
5838
5839         if (!enable_vnmi) {
5840                 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
5841                         vmx->loaded_vmcs->soft_vnmi_blocked = masked;
5842                         vmx->loaded_vmcs->vnmi_blocked_time = 0;
5843                 }
5844         } else {
5845                 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
5846                 if (masked)
5847                         vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
5848                                       GUEST_INTR_STATE_NMI);
5849                 else
5850                         vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
5851                                         GUEST_INTR_STATE_NMI);
5852         }
5853 }
5854
5855 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
5856 {
5857         if (to_vmx(vcpu)->nested.nested_run_pending)
5858                 return 0;
5859
5860         if (!enable_vnmi &&
5861             to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
5862                 return 0;
5863
5864         return  !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5865                   (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
5866                    | GUEST_INTR_STATE_NMI));
5867 }
5868
5869 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
5870 {
5871         return (!to_vmx(vcpu)->nested.nested_run_pending &&
5872                 vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
5873                 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5874                         (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
5875 }
5876
5877 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
5878 {
5879         int ret;
5880
5881         ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
5882                                     PAGE_SIZE * 3);
5883         if (ret)
5884                 return ret;
5885         kvm->arch.tss_addr = addr;
5886         return init_rmode_tss(kvm);
5887 }
5888
5889 static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
5890 {
5891         switch (vec) {
5892         case BP_VECTOR:
5893                 /*
5894                  * Update instruction length as we may reinject the exception
5895                  * from user space while in guest debugging mode.
5896                  */
5897                 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
5898                         vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
5899                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5900                         return false;
5901                 /* fall through */
5902         case DB_VECTOR:
5903                 if (vcpu->guest_debug &
5904                         (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
5905                         return false;
5906                 /* fall through */
5907         case DE_VECTOR:
5908         case OF_VECTOR:
5909         case BR_VECTOR:
5910         case UD_VECTOR:
5911         case DF_VECTOR:
5912         case SS_VECTOR:
5913         case GP_VECTOR:
5914         case MF_VECTOR:
5915                 return true;
5916         break;
5917         }
5918         return false;
5919 }
5920
5921 static int handle_rmode_exception(struct kvm_vcpu *vcpu,
5922                                   int vec, u32 err_code)
5923 {
5924         /*
5925          * An instruction with the address-size override prefix (opcode 0x67)
5926          * causes a #SS fault with error code 0 in VM86 mode.
5927          */
5928         if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
5929                 if (emulate_instruction(vcpu, 0) == EMULATE_DONE) {
5930                         if (vcpu->arch.halt_request) {
5931                                 vcpu->arch.halt_request = 0;
5932                                 return kvm_vcpu_halt(vcpu);
5933                         }
5934                         return 1;
5935                 }
5936                 return 0;
5937         }
5938
5939         /*
5940          * Forward all other exceptions that are valid in real mode.
5941          * FIXME: Breaks guest debugging in real mode, needs to be fixed with
5942          *        the required debugging infrastructure rework.
5943          */
5944         kvm_queue_exception(vcpu, vec);
5945         return 1;
5946 }
5947
5948 /*
5949  * Trigger a machine check on the host. We assume all the MSRs are already set
5950  * up by the CPU and that we still run on the same CPU the MCE occurred on.
5951  * We pass a fake environment to the machine check handler because we want
5952  * the guest to always be treated like user space, no matter what context
5953  * it used internally.
5954  */
5955 static void kvm_machine_check(void)
5956 {
5957 #if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
5958         struct pt_regs regs = {
5959                 .cs = 3, /* Fake ring 3 no matter what the guest ran on */
5960                 .flags = X86_EFLAGS_IF,
5961         };
5962
5963         do_machine_check(&regs, 0);
5964 #endif
5965 }
5966
5967 static int handle_machine_check(struct kvm_vcpu *vcpu)
5968 {
5969         /* already handled by vcpu_run */
5970         return 1;
5971 }
5972
5973 static int handle_exception(struct kvm_vcpu *vcpu)
5974 {
5975         struct vcpu_vmx *vmx = to_vmx(vcpu);
5976         struct kvm_run *kvm_run = vcpu->run;
5977         u32 intr_info, ex_no, error_code;
5978         unsigned long cr2, rip, dr6;
5979         u32 vect_info;
5980         enum emulation_result er;
5981
5982         vect_info = vmx->idt_vectoring_info;
5983         intr_info = vmx->exit_intr_info;
5984
5985         if (is_machine_check(intr_info))
5986                 return handle_machine_check(vcpu);
5987
5988         if (is_nmi(intr_info))
5989                 return 1;  /* already handled by vmx_vcpu_run() */
5990
5991         if (is_invalid_opcode(intr_info)) {
5992                 er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
5993                 if (er == EMULATE_USER_EXIT)
5994                         return 0;
5995                 if (er != EMULATE_DONE)
5996                         kvm_queue_exception(vcpu, UD_VECTOR);
5997                 return 1;
5998         }
5999
6000         error_code = 0;
6001         if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
6002                 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
6003
6004         /*
6005          * A #PF with PFEC.RSVD = 1 indicates the guest is accessing
6006          * MMIO; it is better to report an internal error.
6007          * See the comments in vmx_handle_exit.
6008          */
6009         if ((vect_info & VECTORING_INFO_VALID_MASK) &&
6010             !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
6011                 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6012                 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
6013                 vcpu->run->internal.ndata = 3;
6014                 vcpu->run->internal.data[0] = vect_info;
6015                 vcpu->run->internal.data[1] = intr_info;
6016                 vcpu->run->internal.data[2] = error_code;
6017                 return 0;
6018         }
6019
6020         if (is_page_fault(intr_info)) {
6021                 cr2 = vmcs_readl(EXIT_QUALIFICATION);
6022                 /* EPT won't cause page fault directly */
6023                 WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
6024                 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
6025         }
6026
6027         ex_no = intr_info & INTR_INFO_VECTOR_MASK;
6028
6029         if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
6030                 return handle_rmode_exception(vcpu, ex_no, error_code);
6031
6032         switch (ex_no) {
6033         case AC_VECTOR:
6034                 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
6035                 return 1;
6036         case DB_VECTOR:
6037                 dr6 = vmcs_readl(EXIT_QUALIFICATION);
6038                 if (!(vcpu->guest_debug &
6039                       (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
6040                         vcpu->arch.dr6 &= ~15;
6041                         vcpu->arch.dr6 |= dr6 | DR6_RTM;
6042                         if (!(dr6 & ~DR6_RESERVED)) /* icebp */
6043                                 skip_emulated_instruction(vcpu);
6044
6045                         kvm_queue_exception(vcpu, DB_VECTOR);
6046                         return 1;
6047                 }
6048                 kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
6049                 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
6050                 /* fall through */
6051         case BP_VECTOR:
6052                 /*
6053                  * Update instruction length as we may reinject #BP from
6054                  * user space while in guest debugging mode. Reading it for
6055                  * #DB as well causes no harm; it is not used in that case.
6056                  */
6057                 vmx->vcpu.arch.event_exit_inst_len =
6058                         vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
6059                 kvm_run->exit_reason = KVM_EXIT_DEBUG;
6060                 rip = kvm_rip_read(vcpu);
6061                 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
6062                 kvm_run->debug.arch.exception = ex_no;
6063                 break;
6064         default:
6065                 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
6066                 kvm_run->ex.exception = ex_no;
6067                 kvm_run->ex.error_code = error_code;
6068                 break;
6069         }
6070         return 0;
6071 }
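
/*
 * Illustrative sketch (editor addition, not part of the original driver):
 * the exception exit information decoded in handle_exception() above.
 * The vector sits in the low bits (INTR_INFO_VECTOR_MASK) and
 * INTR_INFO_DELIVER_CODE_MASK indicates whether a hardware error code
 * accompanies the exception.  The helper names are hypothetical.
 */
static inline u32 vmx_sketch_exception_vector(u32 intr_info)
{
	return intr_info & INTR_INFO_VECTOR_MASK;
}

static inline bool vmx_sketch_exception_has_error_code(u32 intr_info)
{
	return (intr_info & INTR_INFO_DELIVER_CODE_MASK) != 0;
}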
6072
6073 static int handle_external_interrupt(struct kvm_vcpu *vcpu)
6074 {
6075         ++vcpu->stat.irq_exits;
6076         return 1;
6077 }
6078
6079 static int handle_triple_fault(struct kvm_vcpu *vcpu)
6080 {
6081         vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
6082         vcpu->mmio_needed = 0;
6083         return 0;
6084 }
6085
6086 static int handle_io(struct kvm_vcpu *vcpu)
6087 {
6088         unsigned long exit_qualification;
6089         int size, in, string, ret;
6090         unsigned port;
6091
6092         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6093         string = (exit_qualification & 16) != 0;
6094         in = (exit_qualification & 8) != 0;
6095
6096         ++vcpu->stat.io_exits;
6097
6098         if (string || in)
6099                 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
6100
6101         port = exit_qualification >> 16;
6102         size = (exit_qualification & 7) + 1;
6103
6104         ret = kvm_skip_emulated_instruction(vcpu);
6105
6106         /*
6107          * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered
6108          * KVM_EXIT_DEBUG here.
6109          */
6110         return kvm_fast_pio_out(vcpu, size, port) && ret;
6111 }
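
/*
 * Illustrative sketch (editor addition, not part of the original driver):
 * the I/O exit qualification layout decoded in handle_io() above --
 * bits 2:0 hold size-1, bit 3 the direction (1 = IN), bit 4 the string
 * flag and bits 31:16 the port number.  The struct and helper names are
 * hypothetical.
 */
struct vmx_sketch_io_exit {
	int size;
	bool in;
	bool string;
	unsigned port;
};

static inline struct vmx_sketch_io_exit
vmx_sketch_decode_io(unsigned long exit_qualification)
{
	struct vmx_sketch_io_exit io = {
		.size	= (exit_qualification & 7) + 1,
		.in	= (exit_qualification & 8) != 0,
		.string	= (exit_qualification & 16) != 0,
		.port	= exit_qualification >> 16,
	};

	return io;
}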
6112
6113 static void
6114 vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
6115 {
6116         /*
6117          * Patch in the VMCALL instruction:
6118          */
6119         hypercall[0] = 0x0f;
6120         hypercall[1] = 0x01;
6121         hypercall[2] = 0xc1;
6122 }
6123
6124 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */
6125 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
6126 {
6127         if (is_guest_mode(vcpu)) {
6128                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6129                 unsigned long orig_val = val;
6130
6131                 /*
6132                  * We get here when L2 changed cr0 in a way that did not change
6133                  * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
6134                  * but did change L0 shadowed bits. So we first calculate the
6135                  * effective cr0 value that L1 would like to write into the
6136                  * hardware. It consists of the L2-owned bits from the new
6137                  * value combined with the L1-owned bits from L1's guest_cr0.
6138                  */
6139                 val = (val & ~vmcs12->cr0_guest_host_mask) |
6140                         (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
6141
6142                 if (!nested_guest_cr0_valid(vcpu, val))
6143                         return 1;
6144
6145                 if (kvm_set_cr0(vcpu, val))
6146                         return 1;
6147                 vmcs_writel(CR0_READ_SHADOW, orig_val);
6148                 return 0;
6149         } else {
6150                 if (to_vmx(vcpu)->nested.vmxon &&
6151                     !nested_host_cr0_valid(vcpu, val))
6152                         return 1;
6153
6154                 return kvm_set_cr0(vcpu, val);
6155         }
6156 }
6157
6158 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
6159 {
6160         if (is_guest_mode(vcpu)) {
6161                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6162                 unsigned long orig_val = val;
6163
6164                 /* analogously to handle_set_cr0 */
6165                 val = (val & ~vmcs12->cr4_guest_host_mask) |
6166                         (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
6167                 if (kvm_set_cr4(vcpu, val))
6168                         return 1;
6169                 vmcs_writel(CR4_READ_SHADOW, orig_val);
6170                 return 0;
6171         } else
6172                 return kvm_set_cr4(vcpu, val);
6173 }
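
/*
 * Illustrative sketch (editor addition, not part of the original driver):
 * the bit-combining used by handle_set_cr0()/handle_set_cr4() above.
 * Bits set in the guest/host mask are owned by L1 and taken from L1's
 * guest_cr0/guest_cr4; the remaining bits come from the value L2 just
 * wrote.  The helper name is hypothetical.
 */
static inline unsigned long vmx_sketch_combine_cr(unsigned long l2_val,
						  unsigned long l1_guest_val,
						  unsigned long guest_host_mask)
{
	return (l2_val & ~guest_host_mask) | (l1_guest_val & guest_host_mask);
}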
6174
6175 static int handle_cr(struct kvm_vcpu *vcpu)
6176 {
6177         unsigned long exit_qualification, val;
6178         int cr;
6179         int reg;
6180         int err;
6181         int ret;
6182
6183         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6184         cr = exit_qualification & 15;
6185         reg = (exit_qualification >> 8) & 15;
6186         switch ((exit_qualification >> 4) & 3) {
6187         case 0: /* mov to cr */
6188                 val = kvm_register_readl(vcpu, reg);
6189                 trace_kvm_cr_write(cr, val);
6190                 switch (cr) {
6191                 case 0:
6192                         err = handle_set_cr0(vcpu, val);
6193                         return kvm_complete_insn_gp(vcpu, err);
6194                 case 3:
6195                         err = kvm_set_cr3(vcpu, val);
6196                         return kvm_complete_insn_gp(vcpu, err);
6197                 case 4:
6198                         err = handle_set_cr4(vcpu, val);
6199                         return kvm_complete_insn_gp(vcpu, err);
6200                 case 8: {
6201                                 u8 cr8_prev = kvm_get_cr8(vcpu);
6202                                 u8 cr8 = (u8)val;
6203                                 err = kvm_set_cr8(vcpu, cr8);
6204                                 ret = kvm_complete_insn_gp(vcpu, err);
6205                                 if (lapic_in_kernel(vcpu))
6206                                         return ret;
6207                                 if (cr8_prev <= cr8)
6208                                         return ret;
6209                                 /*
6210                                  * TODO: we might be squashing a
6211                                  * KVM_GUESTDBG_SINGLESTEP-triggered
6212                                  * KVM_EXIT_DEBUG here.
6213                                  */
6214                                 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
6215                                 return 0;
6216                         }
6217                 }
6218                 break;
6219         case 2: /* clts */
6220                 WARN_ONCE(1, "Guest should always own CR0.TS");
6221                 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
6222                 trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
6223                 return kvm_skip_emulated_instruction(vcpu);
6224         case 1: /* mov from cr */
6225                 switch (cr) {
6226                 case 3:
6227                         val = kvm_read_cr3(vcpu);
6228                         kvm_register_write(vcpu, reg, val);
6229                         trace_kvm_cr_read(cr, val);
6230                         return kvm_skip_emulated_instruction(vcpu);
6231                 case 8:
6232                         val = kvm_get_cr8(vcpu);
6233                         kvm_register_write(vcpu, reg, val);
6234                         trace_kvm_cr_read(cr, val);
6235                         return kvm_skip_emulated_instruction(vcpu);
6236                 }
6237                 break;
6238         case 3: /* lmsw */
6239                 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
6240                 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
6241                 kvm_lmsw(vcpu, val);
6242
6243                 return kvm_skip_emulated_instruction(vcpu);
6244         default:
6245                 break;
6246         }
6247         vcpu->run->exit_reason = 0;
6248         vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
6249                (int)(exit_qualification >> 4) & 3, cr);
6250         return 0;
6251 }
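
/*
 * Illustrative sketch (editor addition, not part of the original driver):
 * how handle_cr() above picks the CR exit qualification apart -- bits 3:0
 * select the control register, bits 5:4 the access type (0 = MOV to CR,
 * 1 = MOV from CR, 2 = CLTS, 3 = LMSW) and bits 11:8 the general-purpose
 * register.  The helper name is hypothetical.
 */
static inline void vmx_sketch_decode_cr_exit(unsigned long exit_qualification,
					     int *cr, int *access_type, int *reg)
{
	*cr = exit_qualification & 15;
	*access_type = (exit_qualification >> 4) & 3;
	*reg = (exit_qualification >> 8) & 15;
}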
6252
6253 static int handle_dr(struct kvm_vcpu *vcpu)
6254 {
6255         unsigned long exit_qualification;
6256         int dr, dr7, reg;
6257
6258         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6259         dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
6260
6261         /* First, if DR does not exist, trigger UD */
6262         if (!kvm_require_dr(vcpu, dr))
6263                 return 1;
6264
6265         /* Do not handle if the CPL > 0, will trigger GP on re-entry */
6266         if (!kvm_require_cpl(vcpu, 0))
6267                 return 1;
6268         dr7 = vmcs_readl(GUEST_DR7);
6269         if (dr7 & DR7_GD) {
6270                 /*
6271                  * As the vm-exit takes precedence over the debug trap, we
6272                  * need to emulate the latter, either for the host or the
6273                  * guest debugging itself.
6274                  */
6275                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
6276                         vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
6277                         vcpu->run->debug.arch.dr7 = dr7;
6278                         vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
6279                         vcpu->run->debug.arch.exception = DB_VECTOR;
6280                         vcpu->run->exit_reason = KVM_EXIT_DEBUG;
6281                         return 0;
6282                 } else {
6283                         vcpu->arch.dr6 &= ~15;
6284                         vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
6285                         kvm_queue_exception(vcpu, DB_VECTOR);
6286                         return 1;
6287                 }
6288         }
6289
6290         if (vcpu->guest_debug == 0) {
6291                 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
6292                                 CPU_BASED_MOV_DR_EXITING);
6293
6294                 /*
6295                  * No more DR vmexits; force a reload of the debug registers
6296                  * and reenter on this instruction.  The next vmexit will
6297                  * retrieve the full state of the debug registers.
6298                  */
6299                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
6300                 return 1;
6301         }
6302
6303         reg = DEBUG_REG_ACCESS_REG(exit_qualification);
6304         if (exit_qualification & TYPE_MOV_FROM_DR) {
6305                 unsigned long val;
6306
6307                 if (kvm_get_dr(vcpu, dr, &val))
6308                         return 1;
6309                 kvm_register_write(vcpu, reg, val);
6310         } else
6311                 if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
6312                         return 1;
6313
6314         return kvm_skip_emulated_instruction(vcpu);
6315 }
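
/*
 * Illustrative sketch (editor addition, not part of the original driver):
 * the MOV DR exit qualification fields used by handle_dr() above.
 * DEBUG_REG_ACCESS_NUM selects the debug register, TYPE_MOV_FROM_DR
 * distinguishes reads from writes and DEBUG_REG_ACCESS_REG() selects the
 * general-purpose register.  The helper name is hypothetical.
 */
static inline void vmx_sketch_decode_dr_exit(unsigned long exit_qualification,
					     int *dr, int *gpr, bool *is_read)
{
	*dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
	*gpr = DEBUG_REG_ACCESS_REG(exit_qualification);
	*is_read = (exit_qualification & TYPE_MOV_FROM_DR) != 0;
}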
6316
6317 static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
6318 {
6319         return vcpu->arch.dr6;
6320 }
6321
6322 static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
6323 {
6324 }
6325
6326 static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
6327 {
6328         get_debugreg(vcpu->arch.db[0], 0);
6329         get_debugreg(vcpu->arch.db[1], 1);
6330         get_debugreg(vcpu->arch.db[2], 2);
6331         get_debugreg(vcpu->arch.db[3], 3);
6332         get_debugreg(vcpu->arch.dr6, 6);
6333         vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
6334
6335         vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
6336         vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);
6337 }
6338
6339 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
6340 {
6341         vmcs_writel(GUEST_DR7, val);
6342 }
6343
6344 static int handle_cpuid(struct kvm_vcpu *vcpu)
6345 {
6346         return kvm_emulate_cpuid(vcpu);
6347 }
6348
6349 static int handle_rdmsr(struct kvm_vcpu *vcpu)
6350 {
6351         u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
6352         struct msr_data msr_info;
6353
6354         msr_info.index = ecx;
6355         msr_info.host_initiated = false;
6356         if (vmx_get_msr(vcpu, &msr_info)) {
6357                 trace_kvm_msr_read_ex(ecx);
6358                 kvm_inject_gp(vcpu, 0);
6359                 return 1;
6360         }
6361
6362         trace_kvm_msr_read(ecx, msr_info.data);
6363
6364         /* FIXME: handling of bits 32:63 of rax, rdx */
6365         vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u;
6366         vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u;
6367         return kvm_skip_emulated_instruction(vcpu);
6368 }
6369
6370 static int handle_wrmsr(struct kvm_vcpu *vcpu)
6371 {
6372         struct msr_data msr;
6373         u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
6374         u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
6375                 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
6376
6377         msr.data = data;
6378         msr.index = ecx;
6379         msr.host_initiated = false;
6380         if (kvm_set_msr(vcpu, &msr) != 0) {
6381                 trace_kvm_msr_write_ex(ecx, data);
6382                 kvm_inject_gp(vcpu, 0);
6383                 return 1;
6384         }
6385
6386         trace_kvm_msr_write(ecx, data);
6387         return kvm_skip_emulated_instruction(vcpu);
6388 }
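
/*
 * Illustrative sketch (editor addition, not part of the original driver):
 * RDMSR/WRMSR move a 64-bit value through EDX:EAX, which is what
 * handle_rdmsr()/handle_wrmsr() above open-code.  The helper names are
 * hypothetical.
 */
static inline u64 vmx_sketch_edx_eax_to_u64(u32 eax, u32 edx)
{
	return ((u64)edx << 32) | eax;
}

static inline void vmx_sketch_u64_to_edx_eax(u64 data, u32 *eax, u32 *edx)
{
	*eax = data & -1u;
	*edx = (data >> 32) & -1u;
}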
6389
6390 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
6391 {
6392         kvm_apic_update_ppr(vcpu);
6393         return 1;
6394 }
6395
6396 static int handle_interrupt_window(struct kvm_vcpu *vcpu)
6397 {
6398         vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
6399                         CPU_BASED_VIRTUAL_INTR_PENDING);
6400
6401         kvm_make_request(KVM_REQ_EVENT, vcpu);
6402
6403         ++vcpu->stat.irq_window_exits;
6404         return 1;
6405 }
6406
6407 static int handle_halt(struct kvm_vcpu *vcpu)
6408 {
6409         return kvm_emulate_halt(vcpu);
6410 }
6411
6412 static int handle_vmcall(struct kvm_vcpu *vcpu)
6413 {
6414         return kvm_emulate_hypercall(vcpu);
6415 }
6416
6417 static int handle_invd(struct kvm_vcpu *vcpu)
6418 {
6419         return emulate_instruction(vcpu, 0) == EMULATE_DONE;
6420 }
6421
6422 static int handle_invlpg(struct kvm_vcpu *vcpu)
6423 {
6424         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6425
6426         kvm_mmu_invlpg(vcpu, exit_qualification);
6427         return kvm_skip_emulated_instruction(vcpu);
6428 }
6429
6430 static int handle_rdpmc(struct kvm_vcpu *vcpu)
6431 {
6432         int err;
6433
6434         err = kvm_rdpmc(vcpu);
6435         return kvm_complete_insn_gp(vcpu, err);
6436 }
6437
6438 static int handle_wbinvd(struct kvm_vcpu *vcpu)
6439 {
6440         return kvm_emulate_wbinvd(vcpu);
6441 }
6442
6443 static int handle_xsetbv(struct kvm_vcpu *vcpu)
6444 {
6445         u64 new_bv = kvm_read_edx_eax(vcpu);
6446         u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
6447
6448         if (kvm_set_xcr(vcpu, index, new_bv) == 0)
6449                 return kvm_skip_emulated_instruction(vcpu);
6450         return 1;
6451 }
6452
6453 static int handle_xsaves(struct kvm_vcpu *vcpu)
6454 {
6455         kvm_skip_emulated_instruction(vcpu);
6456         WARN(1, "this should never happen\n");
6457         return 1;
6458 }
6459
6460 static int handle_xrstors(struct kvm_vcpu *vcpu)
6461 {
6462         kvm_skip_emulated_instruction(vcpu);
6463         WARN(1, "this should never happen\n");
6464         return 1;
6465 }
6466
6467 static int handle_apic_access(struct kvm_vcpu *vcpu)
6468 {
6469         if (likely(fasteoi)) {
6470                 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6471                 int access_type, offset;
6472
6473                 access_type = exit_qualification & APIC_ACCESS_TYPE;
6474                 offset = exit_qualification & APIC_ACCESS_OFFSET;
6475                 /*
6476                  * A sane guest uses MOV to write EOI, and the written value
6477                  * does not matter. So take a short-circuit here and avoid
6478                  * heavy instruction emulation.
6479                  */
6480                 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
6481                     (offset == APIC_EOI)) {
6482                         kvm_lapic_set_eoi(vcpu);
6483                         return kvm_skip_emulated_instruction(vcpu);
6484                 }
6485         }
6486         return emulate_instruction(vcpu, 0) == EMULATE_DONE;
6487 }
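
/*
 * Illustrative sketch (editor addition, not part of the original driver):
 * the fast-path test in handle_apic_access() above.  A linear write to
 * the APIC_EOI offset of the APIC-access page can be completed as an EOI
 * without going through the emulator.  The helper name is hypothetical.
 */
static inline bool vmx_sketch_is_fast_eoi(unsigned long exit_qualification)
{
	int access_type = exit_qualification & APIC_ACCESS_TYPE;
	int offset = exit_qualification & APIC_ACCESS_OFFSET;

	return access_type == TYPE_LINEAR_APIC_INST_WRITE && offset == APIC_EOI;
}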
6488
6489 static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
6490 {
6491         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6492         int vector = exit_qualification & 0xff;
6493
6494         /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
6495         kvm_apic_set_eoi_accelerated(vcpu, vector);
6496         return 1;
6497 }
6498
6499 static int handle_apic_write(struct kvm_vcpu *vcpu)
6500 {
6501         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6502         u32 offset = exit_qualification & 0xfff;
6503
6504         /* APIC-write VM exit is trap-like and thus no need to adjust IP */
6505         kvm_apic_write_nodecode(vcpu, offset);
6506         return 1;
6507 }
6508
6509 static int handle_task_switch(struct kvm_vcpu *vcpu)
6510 {
6511         struct vcpu_vmx *vmx = to_vmx(vcpu);
6512         unsigned long exit_qualification;
6513         bool has_error_code = false;
6514         u32 error_code = 0;
6515         u16 tss_selector;
6516         int reason, type, idt_v, idt_index;
6517
6518         idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
6519         idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
6520         type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
6521
6522         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6523
6524         reason = (u32)exit_qualification >> 30;
6525         if (reason == TASK_SWITCH_GATE && idt_v) {
6526                 switch (type) {
6527                 case INTR_TYPE_NMI_INTR:
6528                         vcpu->arch.nmi_injected = false;
6529                         vmx_set_nmi_mask(vcpu, true);
6530                         break;
6531                 case INTR_TYPE_EXT_INTR:
6532                 case INTR_TYPE_SOFT_INTR:
6533                         kvm_clear_interrupt_queue(vcpu);
6534                         break;
6535                 case INTR_TYPE_HARD_EXCEPTION:
6536                         if (vmx->idt_vectoring_info &
6537                             VECTORING_INFO_DELIVER_CODE_MASK) {
6538                                 has_error_code = true;
6539                                 error_code =
6540                                         vmcs_read32(IDT_VECTORING_ERROR_CODE);
6541                         }
6542                         /* fall through */
6543                 case INTR_TYPE_SOFT_EXCEPTION:
6544                         kvm_clear_exception_queue(vcpu);
6545                         break;
6546                 default:
6547                         break;
6548                 }
6549         }
6550         tss_selector = exit_qualification;
6551
6552         if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
6553                        type != INTR_TYPE_EXT_INTR &&
6554                        type != INTR_TYPE_NMI_INTR))
6555                 skip_emulated_instruction(vcpu);
6556
6557         if (kvm_task_switch(vcpu, tss_selector,
6558                             type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
6559                             has_error_code, error_code) == EMULATE_FAIL) {
6560                 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6561                 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
6562                 vcpu->run->internal.ndata = 0;
6563                 return 0;
6564         }
6565
6566         /*
6567          * TODO: What about debug traps on tss switch?
6568          *       Are we supposed to inject them and update dr6?
6569          */
6570
6571         return 1;
6572 }
6573
6574 static int handle_ept_violation(struct kvm_vcpu *vcpu)
6575 {
6576         unsigned long exit_qualification;
6577         gpa_t gpa;
6578         u64 error_code;
6579
6580         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6581
6582         /*
6583          * If the EPT violation happened while executing IRET from an NMI,
6584          * the "blocked by NMI" bit has to be set before the next VM entry.
6585          * There are errata that may cause this bit to not be set:
6586          * AAK134, BY25.
6587          */
6588         if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
6589                         enable_vnmi &&
6590                         (exit_qualification & INTR_INFO_UNBLOCK_NMI))
6591                 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
6592
6593         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
6594         trace_kvm_page_fault(gpa, exit_qualification);
6595
6596         /* Is it a read fault? */
6597         error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
6598                      ? PFERR_USER_MASK : 0;
6599         /* Is it a write fault? */
6600         error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
6601                       ? PFERR_WRITE_MASK : 0;
6602         /* Is it a fetch fault? */
6603         error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
6604                       ? PFERR_FETCH_MASK : 0;
6605         /* ept page table entry is present? */
6606         error_code |= (exit_qualification &
6607                        (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
6608                         EPT_VIOLATION_EXECUTABLE))
6609                       ? PFERR_PRESENT_MASK : 0;
6610
6611         error_code |= (exit_qualification & 0x100) != 0 ?
6612                PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
6613
6614         vcpu->arch.exit_qualification = exit_qualification;
6615         return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
6616 }
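
/*
 * Illustrative sketch (editor addition, not part of the original driver):
 * only the access-type part of the error code construction done in
 * handle_ept_violation() above.  For example, a guest write to a mapped
 * page yields PFERR_WRITE_MASK (plus the present and final/page-walk
 * bits derived from the other qualification fields).  The helper name is
 * hypothetical.
 */
static inline u64 vmx_sketch_ept_access_error_code(unsigned long exit_qualification)
{
	u64 error_code = 0;

	if (exit_qualification & EPT_VIOLATION_ACC_READ)
		error_code |= PFERR_USER_MASK;
	if (exit_qualification & EPT_VIOLATION_ACC_WRITE)
		error_code |= PFERR_WRITE_MASK;
	if (exit_qualification & EPT_VIOLATION_ACC_INSTR)
		error_code |= PFERR_FETCH_MASK;

	return error_code;
}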
6617
6618 static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
6619 {
6620         int ret;
6621         gpa_t gpa;
6622
6623         /*
6624          * A nested guest cannot optimize MMIO vmexits, because we have an
6625          * nGPA here instead of the required GPA.
6626          */
6627         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
6628         if (!is_guest_mode(vcpu) &&
6629             !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
6630                 trace_kvm_fast_mmio(gpa);
6631                 return kvm_skip_emulated_instruction(vcpu);
6632         }
6633
6634         ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
6635         if (ret >= 0)
6636                 return ret;
6637
6638         /* It is the real ept misconfig */
6639         WARN_ON(1);
6640
6641         vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
6642         vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
6643
6644         return 0;
6645 }
6646
6647 static int handle_nmi_window(struct kvm_vcpu *vcpu)
6648 {
6649         WARN_ON_ONCE(!enable_vnmi);
6650         vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
6651                         CPU_BASED_VIRTUAL_NMI_PENDING);
6652         ++vcpu->stat.nmi_window_exits;
6653         kvm_make_request(KVM_REQ_EVENT, vcpu);
6654
6655         return 1;
6656 }
6657
6658 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
6659 {
6660         struct vcpu_vmx *vmx = to_vmx(vcpu);
6661         enum emulation_result err = EMULATE_DONE;
6662         int ret = 1;
6663         u32 cpu_exec_ctrl;
6664         bool intr_window_requested;
6665         unsigned count = 130;
6666
6667         cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
6668         intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
6669
6670         while (vmx->emulation_required && count-- != 0) {
6671                 if (intr_window_requested && vmx_interrupt_allowed(vcpu))
6672                         return handle_interrupt_window(&vmx->vcpu);
6673
6674                 if (kvm_test_request(KVM_REQ_EVENT, vcpu))
6675                         return 1;
6676
6677                 err = emulate_instruction(vcpu, 0);
6678
6679                 if (err == EMULATE_USER_EXIT) {
6680                         ++vcpu->stat.mmio_exits;
6681                         ret = 0;
6682                         goto out;
6683                 }
6684
6685                 if (err != EMULATE_DONE) {
6686                         vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6687                         vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
6688                         vcpu->run->internal.ndata = 0;
6689                         return 0;
6690                 }
6691
6692                 if (vcpu->arch.halt_request) {
6693                         vcpu->arch.halt_request = 0;
6694                         ret = kvm_vcpu_halt(vcpu);
6695                         goto out;
6696                 }
6697
6698                 if (signal_pending(current))
6699                         goto out;
6700                 if (need_resched())
6701                         schedule();
6702         }
6703
6704 out:
6705         return ret;
6706 }
6707
6708 static int __grow_ple_window(int val)
6709 {
6710         if (ple_window_grow < 1)
6711                 return ple_window;
6712
6713         val = min(val, ple_window_actual_max);
6714
6715         if (ple_window_grow < ple_window)
6716                 val *= ple_window_grow;
6717         else
6718                 val += ple_window_grow;
6719
6720         return val;
6721 }
6722
6723 static int __shrink_ple_window(int val, int modifier, int minimum)
6724 {
6725         if (modifier < 1)
6726                 return ple_window;
6727
6728         if (modifier < ple_window)
6729                 val /= modifier;
6730         else
6731                 val -= modifier;
6732
6733         return max(val, minimum);
6734 }
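
/*
 * Illustrative sketch (editor addition, not part of the original driver):
 * the grow/shrink helpers above treat a modifier smaller than the base
 * window as multiplicative and a larger one as additive.  With a
 * multiplicative grow modifier of 2, for instance, each PAUSE-loop exit
 * doubles the window until ple_window_actual_max clamps it.  The helper
 * name is hypothetical.
 */
static inline int vmx_sketch_double_ple_window(int val, int actual_max)
{
	/* Multiplicative growth, clamped as in __grow_ple_window(). */
	return min(val, actual_max) * 2;
}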
6735
6736 static void grow_ple_window(struct kvm_vcpu *vcpu)
6737 {
6738         struct vcpu_vmx *vmx = to_vmx(vcpu);
6739         int old = vmx->ple_window;
6740
6741         vmx->ple_window = __grow_ple_window(old);
6742
6743         if (vmx->ple_window != old)
6744                 vmx->ple_window_dirty = true;
6745
6746         trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old);
6747 }
6748
6749 static void shrink_ple_window(struct kvm_vcpu *vcpu)
6750 {
6751         struct vcpu_vmx *vmx = to_vmx(vcpu);
6752         int old = vmx->ple_window;
6753
6754         vmx->ple_window = __shrink_ple_window(old,
6755                                               ple_window_shrink, ple_window);
6756
6757         if (vmx->ple_window != old)
6758                 vmx->ple_window_dirty = true;
6759
6760         trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old);
6761 }
6762
6763 /*
6764  * ple_window_actual_max is computed to be one grow_ple_window() below
6765  * ple_window_max. (See __grow_ple_window for the reason.)
6766  * This prevents overflows, because ple_window_max is int.
6767  * ple_window_max is effectively rounded down to a multiple of ple_window_grow
6768  * by this process.
6769  * ple_window_max is also prevented from setting vmx->ple_window below ple_window.
6770  */
6771 static void update_ple_window_actual_max(void)
6772 {
6773         ple_window_actual_max =
6774                         __shrink_ple_window(max(ple_window_max, ple_window),
6775                                             ple_window_grow, INT_MIN);
6776 }
6777
6778 /*
6779  * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
6780  */
6781 static void wakeup_handler(void)
6782 {
6783         struct kvm_vcpu *vcpu;
6784         int cpu = smp_processor_id();
6785
6786         spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
6787         list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
6788                         blocked_vcpu_list) {
6789                 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
6790
6791                 if (pi_test_on(pi_desc) == 1)
6792                         kvm_vcpu_kick(vcpu);
6793         }
6794         spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
6795 }
6796
6797 void vmx_enable_tdp(void)
6798 {
6799         kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
6800                 enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
6801                 enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
6802                 0ull, VMX_EPT_EXECUTABLE_MASK,
6803                 cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
6804                 VMX_EPT_RWX_MASK, 0ull);
6805
6806         ept_set_mmio_spte_mask();
6807         kvm_enable_tdp();
6808 }
6809
6810 static __init int hardware_setup(void)
6811 {
6812         int r = -ENOMEM, i;
6813
6814         rdmsrl_safe(MSR_EFER, &host_efer);
6815
6816         for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
6817                 kvm_define_shared_msr(i, vmx_msr_index[i]);
6818
6819         for (i = 0; i < VMX_BITMAP_NR; i++) {
6820                 vmx_bitmap[i] = (unsigned long *)__get_free_page(GFP_KERNEL);
6821                 if (!vmx_bitmap[i])
6822                         goto out;
6823         }
6824
6825         memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
6826         memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
6827
6828         memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
6829
6830         memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
6831
6832         if (setup_vmcs_config(&vmcs_config) < 0) {
6833                 r = -EIO;
6834                 goto out;
6835         }
6836
6837         if (boot_cpu_has(X86_FEATURE_NX))
6838                 kvm_enable_efer_bits(EFER_NX);
6839
6840         if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
6841                 !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
6842                 enable_vpid = 0;
6843
6844         if (!cpu_has_vmx_shadow_vmcs())
6845                 enable_shadow_vmcs = 0;
6846         if (enable_shadow_vmcs)
6847                 init_vmcs_shadow_fields();
6848
6849         if (!cpu_has_vmx_ept() ||
6850             !cpu_has_vmx_ept_4levels() ||
6851             !cpu_has_vmx_ept_mt_wb() ||
6852             !cpu_has_vmx_invept_global())
6853                 enable_ept = 0;
6854
6855         if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
6856                 enable_ept_ad_bits = 0;
6857
6858         if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
6859                 enable_unrestricted_guest = 0;
6860
6861         if (!cpu_has_vmx_flexpriority())
6862                 flexpriority_enabled = 0;
6863
6864         if (!cpu_has_virtual_nmis())
6865                 enable_vnmi = 0;
6866
6867         /*
6868          * set_apic_access_page_addr() is used to reload apic access
6869          * page upon invalidation.  No need to do anything if not
6870          * using the APIC_ACCESS_ADDR VMCS field.
6871          */
6872         if (!flexpriority_enabled)
6873                 kvm_x86_ops->set_apic_access_page_addr = NULL;
6874
6875         if (!cpu_has_vmx_tpr_shadow())
6876                 kvm_x86_ops->update_cr8_intercept = NULL;
6877
6878         if (enable_ept && !cpu_has_vmx_ept_2m_page())
6879                 kvm_disable_largepages();
6880
6881         if (!cpu_has_vmx_ple()) {
6882                 ple_gap = 0;
6883                 ple_window = 0;
6884                 ple_window_grow = 0;
6885                 ple_window_max = 0;
6886                 ple_window_shrink = 0;
6887         }
6888
6889         if (!cpu_has_vmx_apicv()) {
6890                 enable_apicv = 0;
6891                 kvm_x86_ops->sync_pir_to_irr = NULL;
6892         }
6893
6894         if (cpu_has_vmx_tsc_scaling()) {
6895                 kvm_has_tsc_control = true;
6896                 kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
6897                 kvm_tsc_scaling_ratio_frac_bits = 48;
6898         }
6899
6900         set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
6901
6902         if (enable_ept)
6903                 vmx_enable_tdp();
6904         else
6905                 kvm_disable_tdp();
6906
6907         update_ple_window_actual_max();
6908
6909         /*
6910          * Only enable PML when the hardware supports the PML feature and both
6911          * the EPT and EPT A/D bit features are enabled -- PML depends on them.
6912          */
6913         if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
6914                 enable_pml = 0;
6915
6916         if (!enable_pml) {
6917                 kvm_x86_ops->slot_enable_log_dirty = NULL;
6918                 kvm_x86_ops->slot_disable_log_dirty = NULL;
6919                 kvm_x86_ops->flush_log_dirty = NULL;
6920                 kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
6921         }
6922
6923         if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
6924                 u64 vmx_msr;
6925
6926                 rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
6927                 cpu_preemption_timer_multi =
6928                          vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
6929         } else {
6930                 kvm_x86_ops->set_hv_timer = NULL;
6931                 kvm_x86_ops->cancel_hv_timer = NULL;
6932         }
6933
6934         kvm_set_posted_intr_wakeup_handler(wakeup_handler);
6935
6936         kvm_mce_cap_supported |= MCG_LMCE_P;
6937
6938         return alloc_kvm_area();
6939
6940 out:
6941         for (i = 0; i < VMX_BITMAP_NR; i++)
6942                 free_page((unsigned long)vmx_bitmap[i]);
6943
6944         return r;
6945 }
6946
6947 static __exit void hardware_unsetup(void)
6948 {
6949         int i;
6950
6951         for (i = 0; i < VMX_BITMAP_NR; i++)
6952                 free_page((unsigned long)vmx_bitmap[i]);
6953
6954         free_kvm_area();
6955 }
6956
6957 /*
6958  * Indicate a busy-waiting vcpu spinning in a spinlock. We do not enable PAUSE
6959  * exiting, so we only get here on CPUs with PAUSE-Loop-Exiting.
6960  */
6961 static int handle_pause(struct kvm_vcpu *vcpu)
6962 {
6963         if (ple_gap)
6964                 grow_ple_window(vcpu);
6965
6966         /*
6967          * The Intel SDM vol3, ch 25.1.3 says: the "PAUSE-loop exiting"
6968          * VM-execution control is ignored if CPL > 0. OTOH, KVM
6969          * never sets PAUSE_EXITING and only sets PLE if supported,
6970          * so the vcpu must be at CPL 0 if it gets a PAUSE exit.
6971          */
6972         kvm_vcpu_on_spin(vcpu, true);
6973         return kvm_skip_emulated_instruction(vcpu);
6974 }
6975
6976 static int handle_nop(struct kvm_vcpu *vcpu)
6977 {
6978         return kvm_skip_emulated_instruction(vcpu);
6979 }
6980
6981 static int handle_mwait(struct kvm_vcpu *vcpu)
6982 {
6983         printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
6984         return handle_nop(vcpu);
6985 }
6986
6987 static int handle_invalid_op(struct kvm_vcpu *vcpu)
6988 {
6989         kvm_queue_exception(vcpu, UD_VECTOR);
6990         return 1;
6991 }
6992
6993 static int handle_monitor_trap(struct kvm_vcpu *vcpu)
6994 {
6995         return 1;
6996 }
6997
6998 static int handle_monitor(struct kvm_vcpu *vcpu)
6999 {
7000         printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
7001         return handle_nop(vcpu);
7002 }
7003
7004 /*
7005  * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
7006  * set the success or error code of an emulated VMX instruction, as specified
7007  * by Vol 2B, VMX Instruction Reference, "Conventions".
7008  */
7009 static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
7010 {
7011         vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
7012                         & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
7013                             X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
7014 }
7015
7016 static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
7017 {
7018         vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
7019                         & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
7020                             X86_EFLAGS_SF | X86_EFLAGS_OF))
7021                         | X86_EFLAGS_CF);
7022 }
7023
7024 static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
7025                                         u32 vm_instruction_error)
7026 {
7027         if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
7028                 /*
7029                  * failValid writes the error number to the current VMCS, which
7030                  * can't be done if there isn't a current VMCS.
7031                  */
7032                 nested_vmx_failInvalid(vcpu);
7033                 return;
7034         }
7035         vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
7036                         & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
7037                             X86_EFLAGS_SF | X86_EFLAGS_OF))
7038                         | X86_EFLAGS_ZF);
7039         get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
7040         /*
7041          * We don't need to force a shadow sync because
7042          * VM_INSTRUCTION_ERROR is not shadowed
7043          */
7044 }
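
/*
 * Illustrative sketch (editor addition, not part of the original driver):
 * the arithmetic-flag convention implemented above.  VMsucceed clears
 * CF/PF/AF/ZF/SF/OF, VMfailInvalid sets only CF, and VMfailValid sets
 * only ZF (and stores the error number in the current VMCS).  The helper
 * name is hypothetical.
 */
static inline unsigned long vmx_sketch_vmx_result_rflags(unsigned long rflags,
							 bool fail_invalid,
							 bool fail_valid)
{
	rflags &= ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
		    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF);
	if (fail_invalid)
		rflags |= X86_EFLAGS_CF;
	else if (fail_valid)
		rflags |= X86_EFLAGS_ZF;

	return rflags;
}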
7045
7046 static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
7047 {
7048         /* TODO: not to reset guest simply here. */
7049         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
7050         pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
7051 }
7052
7053 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
7054 {
7055         struct vcpu_vmx *vmx =
7056                 container_of(timer, struct vcpu_vmx, nested.preemption_timer);
7057
7058         vmx->nested.preemption_timer_expired = true;
7059         kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
7060         kvm_vcpu_kick(&vmx->vcpu);
7061
7062         return HRTIMER_NORESTART;
7063 }
7064
7065 /*
7066  * Decode the memory-address operand of a vmx instruction, as recorded on an
7067  * exit caused by such an instruction (run by a guest hypervisor).
7068  * On success, returns 0. When the operand is invalid, returns 1 and queues
7069  * a #UD or #GP exception.
7070  */
7071 static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
7072                                  unsigned long exit_qualification,
7073                                  u32 vmx_instruction_info, bool wr, gva_t *ret)
7074 {
7075         gva_t off;
7076         bool exn;
7077         struct kvm_segment s;
7078
7079         /*
7080          * According to Vol. 3B, "Information for VM Exits Due to Instruction
7081          * Execution", on an exit, vmx_instruction_info holds most of the
7082          * addressing components of the operand. Only the displacement part
7083          * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
7084          * For how an actual address is calculated from all these components,
7085          * refer to Vol. 1, "Operand Addressing".
7086          */
7087         int  scaling = vmx_instruction_info & 3;
7088         int  addr_size = (vmx_instruction_info >> 7) & 7;
7089         bool is_reg = vmx_instruction_info & (1u << 10);
7090         int  seg_reg = (vmx_instruction_info >> 15) & 7;
7091         int  index_reg = (vmx_instruction_info >> 18) & 0xf;
7092         bool index_is_valid = !(vmx_instruction_info & (1u << 22));
7093         int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
7094         bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
7095
7096         if (is_reg) {
7097                 kvm_queue_exception(vcpu, UD_VECTOR);
7098                 return 1;
7099         }
7100
7101         /* Addr = segment_base + offset */
7102         /* offset = base + [index * scale] + displacement */
7103         off = exit_qualification; /* holds the displacement */
7104         if (base_is_valid)
7105                 off += kvm_register_read(vcpu, base_reg);
7106         if (index_is_valid)
7107                 off += kvm_register_read(vcpu, index_reg)<<scaling;
7108         vmx_get_segment(vcpu, &s, seg_reg);
7109         *ret = s.base + off;
7110
7111         if (addr_size == 1) /* 32 bit */
7112                 *ret &= 0xffffffff;
7113
7114         /* Checks for #GP/#SS exceptions. */
7115         exn = false;
7116         if (is_long_mode(vcpu)) {
7117                 /* Long mode: #GP(0)/#SS(0) if the memory address is in a
7118                  * non-canonical form. This is the only check on the memory
7119                  * destination for long mode!
7120                  */
7121                 exn = is_noncanonical_address(*ret, vcpu);
7122         } else if (is_protmode(vcpu)) {
7123                 /* Protected mode: apply checks for segment validity in the
7124                  * following order:
7125                  * - segment type check (#GP(0) may be thrown)
7126                  * - usability check (#GP(0)/#SS(0))
7127                  * - limit check (#GP(0)/#SS(0))
7128                  */
7129                 if (wr)
7130                         /* #GP(0) if the destination operand is located in a
7131                          * read-only data segment or any code segment.
7132                          */
7133                         exn = ((s.type & 0xa) == 0 || (s.type & 8));
7134                 else
7135                         /* #GP(0) if the source operand is located in an
7136                          * execute-only code segment
7137                          */
7138                         exn = ((s.type & 0xa) == 8);
7139                 if (exn) {
7140                         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
7141                         return 1;
7142                 }
7143                 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
7144                  */
7145                 exn = (s.unusable != 0);
7146                 /* Protected mode: #GP(0)/#SS(0) if the memory
7147                  * operand is outside the segment limit.
7148                  */
7149                 exn = exn || (off + sizeof(u64) > s.limit);
7150         }
7151         if (exn) {
7152                 kvm_queue_exception_e(vcpu,
7153                                       seg_reg == VCPU_SREG_SS ?
7154                                                 SS_VECTOR : GP_VECTOR,
7155                                       0);
7156                 return 1;
7157         }
7158
7159         return 0;
7160 }
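
/*
 * Illustrative sketch (editor addition, not part of the original driver):
 * the VMX-instruction-information layout decoded at the top of
 * get_vmx_mem_address() above -- bits 1:0 scaling, bits 9:7 address size,
 * bit 10 register operand, bits 17:15 segment register, bits 21:18 index
 * register (bit 22 = index invalid), bits 26:23 base register (bit 27 =
 * base invalid).  The helper name is hypothetical.
 */
static inline void vmx_sketch_decode_vmx_insn_info(u32 info, int *scaling,
						   int *addr_size, int *seg_reg)
{
	*scaling = info & 3;
	*addr_size = (info >> 7) & 7;
	*seg_reg = (info >> 15) & 7;
}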
7161
7162 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
7163 {
7164         gva_t gva;
7165         struct x86_exception e;
7166
7167         if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
7168                         vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
7169                 return 1;
7170
7171         if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, vmpointer,
7172                                 sizeof(*vmpointer), &e)) {
7173                 kvm_inject_page_fault(vcpu, &e);
7174                 return 1;
7175         }
7176
7177         return 0;
7178 }
7179
7180 static int enter_vmx_operation(struct kvm_vcpu *vcpu)
7181 {
7182         struct vcpu_vmx *vmx = to_vmx(vcpu);
7183         struct vmcs *shadow_vmcs;
7184         int r;
7185
7186         r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
7187         if (r < 0)
7188                 goto out_vmcs02;
7189
7190         vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
7191         if (!vmx->nested.cached_vmcs12)
7192                 goto out_cached_vmcs12;
7193
7194         if (enable_shadow_vmcs) {
7195                 shadow_vmcs = alloc_vmcs();
7196                 if (!shadow_vmcs)
7197                         goto out_shadow_vmcs;
7198                 /* mark vmcs as shadow */
7199                 shadow_vmcs->revision_id |= (1u << 31);
7200                 /* init shadow vmcs */
7201                 vmcs_clear(shadow_vmcs);
7202                 vmx->vmcs01.shadow_vmcs = shadow_vmcs;
7203         }
7204
7205         hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
7206                      HRTIMER_MODE_REL_PINNED);
7207         vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
7208
7209         vmx->nested.vmxon = true;
7210         return 0;
7211
7212 out_shadow_vmcs:
7213         kfree(vmx->nested.cached_vmcs12);
7214
7215 out_cached_vmcs12:
7216         free_loaded_vmcs(&vmx->nested.vmcs02);
7217
7218 out_vmcs02:
7219         return -ENOMEM;
7220 }
7221
7222 /*
7223  * Emulate the VMXON instruction.
7224  * Currently, we just remember that VMX is active, and do not save or even
7225  * inspect the argument to VMXON (the so-called "VMXON pointer") because we
7226  * do not currently need to store anything in that guest-allocated memory
7227  * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their
7228  * argument is different from the VMXON pointer (which the spec says they do).
7229  */
7230 static int handle_vmon(struct kvm_vcpu *vcpu)
7231 {
7232         int ret;
7233         gpa_t vmptr;
7234         struct page *page;
7235         struct vcpu_vmx *vmx = to_vmx(vcpu);
7236         const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
7237                 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
7238
7239         /*
7240          * The Intel VMX Instruction Reference lists a bunch of bits that are
7241          * prerequisite to running VMXON, most notably cr4.VMXE must be set to
7242          * 1 (see vmx_set_cr4() for when we allow the guest to set this).
7243          * Otherwise, we should fail with #UD.  But most faulting conditions
7244          * have already been checked by hardware, prior to the VM-exit for
7245          * VMXON.  We do test guest cr4.VMXE because processor CR4 always has
7246          * that bit set to 1 in non-root mode.
7247          */
7248         if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
7249                 kvm_queue_exception(vcpu, UD_VECTOR);
7250                 return 1;
7251         }
7252
7253         if (vmx->nested.vmxon) {
7254                 nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
7255                 return kvm_skip_emulated_instruction(vcpu);
7256         }
7257
7258         if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
7259                         != VMXON_NEEDED_FEATURES) {
7260                 kvm_inject_gp(vcpu, 0);
7261                 return 1;
7262         }
7263
7264         if (nested_vmx_get_vmptr(vcpu, &vmptr))
7265                 return 1;
7266
7267         /*
7268          * SDM 3: 24.11.5
7269          * The first 4 bytes of VMXON region contain the supported
7270          * VMCS revision identifier
7271          *
7272          * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
7273          * if it were, the physical address width would be limited to 32 bits.
7274          */
7275         if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
7276                 nested_vmx_failInvalid(vcpu);
7277                 return kvm_skip_emulated_instruction(vcpu);
7278         }
7279
7280         page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
7281         if (is_error_page(page)) {
7282                 nested_vmx_failInvalid(vcpu);
7283                 return kvm_skip_emulated_instruction(vcpu);
7284         }
7285         if (*(u32 *)kmap(page) != VMCS12_REVISION) {
7286                 kunmap(page);
7287                 kvm_release_page_clean(page);
7288                 nested_vmx_failInvalid(vcpu);
7289                 return kvm_skip_emulated_instruction(vcpu);
7290         }
7291         kunmap(page);
7292         kvm_release_page_clean(page);
7293
7294         vmx->nested.vmxon_ptr = vmptr;
7295         ret = enter_vmx_operation(vcpu);
7296         if (ret)
7297                 return ret;
7298
7299         nested_vmx_succeed(vcpu);
7300         return kvm_skip_emulated_instruction(vcpu);
7301 }
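/*
 * Editor's sketch (not part of the original source): the guest-physical
 * pointer check applied by handle_vmon() above and by handle_vmclear() and
 * handle_vmptrld() below: the pointer must be page aligned and must fit
 * within the guest's physical-address width.  The helper name is
 * hypothetical; the kernel open-codes this test at each call site.
 */
static inline bool example_nested_vmptr_is_valid(struct kvm_vcpu *vcpu,
						 gpa_t vmptr)
{
	return PAGE_ALIGNED(vmptr) && !(vmptr >> cpuid_maxphyaddr(vcpu));
}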
7302
7303 /*
7304  * Intel's VMX Instruction Reference specifies a common set of prerequisites
7305  * for running VMX instructions (except VMXON, whose prerequisites are
7306  * slightly different). It also specifies what exception to inject otherwise.
7307  * Note that many of these exceptions have priority over VM exits, so they
7308  * don't have to be checked again here.
7309  */
7310 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
7311 {
7312         if (!to_vmx(vcpu)->nested.vmxon) {
7313                 kvm_queue_exception(vcpu, UD_VECTOR);
7314                 return 0;
7315         }
7316         return 1;
7317 }
7318
7319 static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
7320 {
7321         vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
7322         vmcs_write64(VMCS_LINK_POINTER, -1ull);
7323 }
7324
7325 static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
7326 {
7327         if (vmx->nested.current_vmptr == -1ull)
7328                 return;
7329
7330         if (enable_shadow_vmcs) {
7331                 /* copy to memory all shadowed fields in case
7332                    they were modified */
7333                 copy_shadow_to_vmcs12(vmx);
7334                 vmx->nested.sync_shadow_vmcs = false;
7335                 vmx_disable_shadow_vmcs(vmx);
7336         }
7337         vmx->nested.posted_intr_nv = -1;
7338
7339         /* Flush VMCS12 to guest memory */
7340         kvm_vcpu_write_guest_page(&vmx->vcpu,
7341                                   vmx->nested.current_vmptr >> PAGE_SHIFT,
7342                                   vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
7343
7344         vmx->nested.current_vmptr = -1ull;
7345 }
7346
7347 /*
7348  * Free whatever needs to be freed from vmx->nested when L1 goes down, or
7349  * just stops using VMX.
7350  */
7351 static void free_nested(struct vcpu_vmx *vmx)
7352 {
7353         if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
7354                 return;
7355
7356         vmx->nested.vmxon = false;
7357         vmx->nested.smm.vmxon = false;
7358         free_vpid(vmx->nested.vpid02);
7359         vmx->nested.posted_intr_nv = -1;
7360         vmx->nested.current_vmptr = -1ull;
7361         if (enable_shadow_vmcs) {
7362                 vmx_disable_shadow_vmcs(vmx);
7363                 vmcs_clear(vmx->vmcs01.shadow_vmcs);
7364                 free_vmcs(vmx->vmcs01.shadow_vmcs);
7365                 vmx->vmcs01.shadow_vmcs = NULL;
7366         }
7367         kfree(vmx->nested.cached_vmcs12);
7368         /* Unpin physical memory we referred to in the vmcs02 */
7369         if (vmx->nested.apic_access_page) {
7370                 kvm_release_page_dirty(vmx->nested.apic_access_page);
7371                 vmx->nested.apic_access_page = NULL;
7372         }
7373         if (vmx->nested.virtual_apic_page) {
7374                 kvm_release_page_dirty(vmx->nested.virtual_apic_page);
7375                 vmx->nested.virtual_apic_page = NULL;
7376         }
7377         if (vmx->nested.pi_desc_page) {
7378                 kunmap(vmx->nested.pi_desc_page);
7379                 kvm_release_page_dirty(vmx->nested.pi_desc_page);
7380                 vmx->nested.pi_desc_page = NULL;
7381                 vmx->nested.pi_desc = NULL;
7382         }
7383
7384         free_loaded_vmcs(&vmx->nested.vmcs02);
7385 }
7386
7387 /* Emulate the VMXOFF instruction */
7388 static int handle_vmoff(struct kvm_vcpu *vcpu)
7389 {
7390         if (!nested_vmx_check_permission(vcpu))
7391                 return 1;
7392         free_nested(to_vmx(vcpu));
7393         nested_vmx_succeed(vcpu);
7394         return kvm_skip_emulated_instruction(vcpu);
7395 }
7396
7397 /* Emulate the VMCLEAR instruction */
7398 static int handle_vmclear(struct kvm_vcpu *vcpu)
7399 {
7400         struct vcpu_vmx *vmx = to_vmx(vcpu);
7401         u32 zero = 0;
7402         gpa_t vmptr;
7403
7404         if (!nested_vmx_check_permission(vcpu))
7405                 return 1;
7406
7407         if (nested_vmx_get_vmptr(vcpu, &vmptr))
7408                 return 1;
7409
7410         if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
7411                 nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
7412                 return kvm_skip_emulated_instruction(vcpu);
7413         }
7414
7415         if (vmptr == vmx->nested.vmxon_ptr) {
7416                 nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
7417                 return kvm_skip_emulated_instruction(vcpu);
7418         }
7419
7420         if (vmptr == vmx->nested.current_vmptr)
7421                 nested_release_vmcs12(vmx);
7422
7423         kvm_vcpu_write_guest(vcpu,
7424                         vmptr + offsetof(struct vmcs12, launch_state),
7425                         &zero, sizeof(zero));
7426
7427         nested_vmx_succeed(vcpu);
7428         return kvm_skip_emulated_instruction(vcpu);
7429 }
7430
7431 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
7432
7433 /* Emulate the VMLAUNCH instruction */
7434 static int handle_vmlaunch(struct kvm_vcpu *vcpu)
7435 {
7436         return nested_vmx_run(vcpu, true);
7437 }
7438
7439 /* Emulate the VMRESUME instruction */
7440 static int handle_vmresume(struct kvm_vcpu *vcpu)
7441 {
7442
7443         return nested_vmx_run(vcpu, false);
7444 }
7445
7446 /*
7447  * Read a vmcs12 field. Since these can have varying lengths and we return
7448  * one type, we chose the biggest type (u64) and zero-extend the return value
7449  * to that size. Note that the caller, handle_vmread, might need to use only
7450  * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
7451  * 64-bit fields are to be returned).
7452  */
7453 static inline int vmcs12_read_any(struct kvm_vcpu *vcpu,
7454                                   unsigned long field, u64 *ret)
7455 {
7456         short offset = vmcs_field_to_offset(field);
7457         char *p;
7458
7459         if (offset < 0)
7460                 return offset;
7461
7462         p = ((char *)(get_vmcs12(vcpu))) + offset;
7463
7464         switch (vmcs_field_type(field)) {
7465         case VMCS_FIELD_TYPE_NATURAL_WIDTH:
7466                 *ret = *((natural_width *)p);
7467                 return 0;
7468         case VMCS_FIELD_TYPE_U16:
7469                 *ret = *((u16 *)p);
7470                 return 0;
7471         case VMCS_FIELD_TYPE_U32:
7472                 *ret = *((u32 *)p);
7473                 return 0;
7474         case VMCS_FIELD_TYPE_U64:
7475                 *ret = *((u64 *)p);
7476                 return 0;
7477         default:
7478                 WARN_ON(1);
7479                 return -ENOENT;
7480         }
7481 }
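/*
 * Editor's illustration (not part of the original source): a possible caller
 * of vmcs12_read_any().  Whatever the field's declared width, the value comes
 * back zero-extended in a u64.  GUEST_RIP is merely an example of a
 * natural-width field, and the helper name is hypothetical.
 */
static inline u64 example_vmcs12_guest_rip(struct kvm_vcpu *vcpu)
{
	u64 val = 0;

	if (vmcs12_read_any(vcpu, GUEST_RIP, &val) < 0)
		return 0;	/* unknown or unsupported field encoding */

	return val;
}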
7482
7483
7484 static inline int vmcs12_write_any(struct kvm_vcpu *vcpu,
7485                                    unsigned long field, u64 field_value){
7486         short offset = vmcs_field_to_offset(field);
7487         char *p = ((char *) get_vmcs12(vcpu)) + offset;
7488         if (offset < 0)
7489                 return offset;
7490
7491         switch (vmcs_field_type(field)) {
7492         case VMCS_FIELD_TYPE_U16:
7493                 *(u16 *)p = field_value;
7494                 return 0;
7495         case VMCS_FIELD_TYPE_U32:
7496                 *(u32 *)p = field_value;
7497                 return 0;
7498         case VMCS_FIELD_TYPE_U64:
7499                 *(u64 *)p = field_value;
7500                 return 0;
7501         case VMCS_FIELD_TYPE_NATURAL_WIDTH:
7502                 *(natural_width *)p = field_value;
7503                 return 0;
7504         default:
7505                 WARN_ON(1);
7506                 return -ENOENT;
7507         }
7508
7509 }
7510
7511 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
7512 {
7513         int i;
7514         unsigned long field;
7515         u64 field_value;
7516         struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
7517         const unsigned long *fields = shadow_read_write_fields;
7518         const int num_fields = max_shadow_read_write_fields;
7519
7520         preempt_disable();
7521
7522         vmcs_load(shadow_vmcs);
7523
7524         for (i = 0; i < num_fields; i++) {
7525                 field = fields[i];
7526                 switch (vmcs_field_type(field)) {
7527                 case VMCS_FIELD_TYPE_U16:
7528                         field_value = vmcs_read16(field);
7529                         break;
7530                 case VMCS_FIELD_TYPE_U32:
7531                         field_value = vmcs_read32(field);
7532                         break;
7533                 case VMCS_FIELD_TYPE_U64:
7534                         field_value = vmcs_read64(field);
7535                         break;
7536                 case VMCS_FIELD_TYPE_NATURAL_WIDTH:
7537                         field_value = vmcs_readl(field);
7538                         break;
7539                 default:
7540                         WARN_ON(1);
7541                         continue;
7542                 }
7543                 vmcs12_write_any(&vmx->vcpu, field, field_value);
7544         }
7545
7546         vmcs_clear(shadow_vmcs);
7547         vmcs_load(vmx->loaded_vmcs->vmcs);
7548
7549         preempt_enable();
7550 }
7551
7552 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
7553 {
7554         const unsigned long *fields[] = {
7555                 shadow_read_write_fields,
7556                 shadow_read_only_fields
7557         };
7558         const int max_fields[] = {
7559                 max_shadow_read_write_fields,
7560                 max_shadow_read_only_fields
7561         };
7562         int i, q;
7563         unsigned long field;
7564         u64 field_value = 0;
7565         struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
7566
7567         vmcs_load(shadow_vmcs);
7568
7569         for (q = 0; q < ARRAY_SIZE(fields); q++) {
7570                 for (i = 0; i < max_fields[q]; i++) {
7571                         field = fields[q][i];
7572                         vmcs12_read_any(&vmx->vcpu, field, &field_value);
7573
7574                         switch (vmcs_field_type(field)) {
7575                         case VMCS_FIELD_TYPE_U16:
7576                                 vmcs_write16(field, (u16)field_value);
7577                                 break;
7578                         case VMCS_FIELD_TYPE_U32:
7579                                 vmcs_write32(field, (u32)field_value);
7580                                 break;
7581                         case VMCS_FIELD_TYPE_U64:
7582                                 vmcs_write64(field, (u64)field_value);
7583                                 break;
7584                         case VMCS_FIELD_TYPE_NATURAL_WIDTH:
7585                                 vmcs_writel(field, (long)field_value);
7586                                 break;
7587                         default:
7588                                 WARN_ON(1);
7589                                 break;
7590                         }
7591                 }
7592         }
7593
7594         vmcs_clear(shadow_vmcs);
7595         vmcs_load(vmx->loaded_vmcs->vmcs);
7596 }
7597
7598 /*
7599  * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
7600  * used before) all generate the same failure when it is missing.
7601  */
7602 static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
7603 {
7604         struct vcpu_vmx *vmx = to_vmx(vcpu);
7605         if (vmx->nested.current_vmptr == -1ull) {
7606                 nested_vmx_failInvalid(vcpu);
7607                 return 0;
7608         }
7609         return 1;
7610 }
7611
7612 static int handle_vmread(struct kvm_vcpu *vcpu)
7613 {
7614         unsigned long field;
7615         u64 field_value;
7616         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7617         u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
7618         gva_t gva = 0;
7619
7620         if (!nested_vmx_check_permission(vcpu))
7621                 return 1;
7622
7623         if (!nested_vmx_check_vmcs12(vcpu))
7624                 return kvm_skip_emulated_instruction(vcpu);
7625
7626         /* Decode instruction info and find the field to read */
7627         field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
7628         /* Read the field, zero-extended to a u64 field_value */
7629         if (vmcs12_read_any(vcpu, field, &field_value) < 0) {
7630                 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
7631                 return kvm_skip_emulated_instruction(vcpu);
7632         }
7633         /*
7634          * Now copy part of this value to register or memory, as requested.
7635          * Note that the number of bits actually copied is 32 or 64 depending
7636          * on the guest's mode (32 or 64 bit), not on the given field's length.
7637          */
7638         if (vmx_instruction_info & (1u << 10)) {
7639                 kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
7640                         field_value);
7641         } else {
7642                 if (get_vmx_mem_address(vcpu, exit_qualification,
7643                                 vmx_instruction_info, true, &gva))
7644                         return 1;
7645                 /* _system ok, as hardware has verified cpl=0 */
7646                 kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
7647                              &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL);
7648         }
7649
7650         nested_vmx_succeed(vcpu);
7651         return kvm_skip_emulated_instruction(vcpu);
7652 }
7653
7654
7655 static int handle_vmwrite(struct kvm_vcpu *vcpu)
7656 {
7657         unsigned long field;
7658         gva_t gva;
7659         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7660         u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
7661         /* The value to write might be 32 or 64 bits, depending on L1's long
7662          * mode, and eventually we need to write that into a field of several
7663          * possible lengths. The code below first zero-extends the value to 64
7664          * bit (field_value), and then copies only the appropriate number of
7665          * bits into the vmcs12 field.
7666          */
7667         u64 field_value = 0;
7668         struct x86_exception e;
7669
7670         if (!nested_vmx_check_permission(vcpu))
7671                 return 1;
7672
7673         if (!nested_vmx_check_vmcs12(vcpu))
7674                 return kvm_skip_emulated_instruction(vcpu);
7675
7676         if (vmx_instruction_info & (1u << 10))
7677                 field_value = kvm_register_readl(vcpu,
7678                         (((vmx_instruction_info) >> 3) & 0xf));
7679         else {
7680                 if (get_vmx_mem_address(vcpu, exit_qualification,
7681                                 vmx_instruction_info, false, &gva))
7682                         return 1;
7683                 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
7684                            &field_value, (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
7685                         kvm_inject_page_fault(vcpu, &e);
7686                         return 1;
7687                 }
7688         }
7689
7690
7691         field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
7692         if (vmcs_field_readonly(field)) {
7693                 nested_vmx_failValid(vcpu,
7694                         VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
7695                 return kvm_skip_emulated_instruction(vcpu);
7696         }
7697
7698         if (vmcs12_write_any(vcpu, field, field_value) < 0) {
7699                 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
7700                 return kvm_skip_emulated_instruction(vcpu);
7701         }
7702
7703         nested_vmx_succeed(vcpu);
7704         return kvm_skip_emulated_instruction(vcpu);
7705 }
7706
7707 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
7708 {
7709         vmx->nested.current_vmptr = vmptr;
7710         if (enable_shadow_vmcs) {
7711                 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
7712                               SECONDARY_EXEC_SHADOW_VMCS);
7713                 vmcs_write64(VMCS_LINK_POINTER,
7714                              __pa(vmx->vmcs01.shadow_vmcs));
7715                 vmx->nested.sync_shadow_vmcs = true;
7716         }
7717 }
7718
7719 /* Emulate the VMPTRLD instruction */
7720 static int handle_vmptrld(struct kvm_vcpu *vcpu)
7721 {
7722         struct vcpu_vmx *vmx = to_vmx(vcpu);
7723         gpa_t vmptr;
7724
7725         if (!nested_vmx_check_permission(vcpu))
7726                 return 1;
7727
7728         if (nested_vmx_get_vmptr(vcpu, &vmptr))
7729                 return 1;
7730
7731         if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
7732                 nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
7733                 return kvm_skip_emulated_instruction(vcpu);
7734         }
7735
7736         if (vmptr == vmx->nested.vmxon_ptr) {
7737                 nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
7738                 return kvm_skip_emulated_instruction(vcpu);
7739         }
7740
7741         if (vmx->nested.current_vmptr != vmptr) {
7742                 struct vmcs12 *new_vmcs12;
7743                 struct page *page;
7744                 page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
7745                 if (is_error_page(page)) {
7746                         nested_vmx_failInvalid(vcpu);
7747                         return kvm_skip_emulated_instruction(vcpu);
7748                 }
7749                 new_vmcs12 = kmap(page);
7750                 if (new_vmcs12->revision_id != VMCS12_REVISION) {
7751                         kunmap(page);
7752                         kvm_release_page_clean(page);
7753                         nested_vmx_failValid(vcpu,
7754                                 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
7755                         return kvm_skip_emulated_instruction(vcpu);
7756                 }
7757
7758                 nested_release_vmcs12(vmx);
7759                 /*
7760                  * Load VMCS12 from guest memory since it is not already
7761                  * cached.
7762                  */
7763                 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
7764                 kunmap(page);
7765                 kvm_release_page_clean(page);
7766
7767                 set_current_vmptr(vmx, vmptr);
7768         }
7769
7770         nested_vmx_succeed(vcpu);
7771         return kvm_skip_emulated_instruction(vcpu);
7772 }
7773
7774 /* Emulate the VMPTRST instruction */
7775 static int handle_vmptrst(struct kvm_vcpu *vcpu)
7776 {
7777         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7778         u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
7779         gva_t vmcs_gva;
7780         struct x86_exception e;
7781
7782         if (!nested_vmx_check_permission(vcpu))
7783                 return 1;
7784
7785         if (get_vmx_mem_address(vcpu, exit_qualification,
7786                         vmx_instruction_info, true, &vmcs_gva))
7787                 return 1;
7788         /* ok to use *_system, as hardware has verified cpl=0 */
7789         if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva,
7790                                  (void *)&to_vmx(vcpu)->nested.current_vmptr,
7791                                  sizeof(u64), &e)) {
7792                 kvm_inject_page_fault(vcpu, &e);
7793                 return 1;
7794         }
7795         nested_vmx_succeed(vcpu);
7796         return kvm_skip_emulated_instruction(vcpu);
7797 }
7798
7799 /* Emulate the INVEPT instruction */
7800 static int handle_invept(struct kvm_vcpu *vcpu)
7801 {
7802         struct vcpu_vmx *vmx = to_vmx(vcpu);
7803         u32 vmx_instruction_info, types;
7804         unsigned long type;
7805         gva_t gva;
7806         struct x86_exception e;
7807         struct {
7808                 u64 eptp, gpa;
7809         } operand;
7810
7811         if (!(vmx->nested.nested_vmx_secondary_ctls_high &
7812               SECONDARY_EXEC_ENABLE_EPT) ||
7813             !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
7814                 kvm_queue_exception(vcpu, UD_VECTOR);
7815                 return 1;
7816         }
7817
7818         if (!nested_vmx_check_permission(vcpu))
7819                 return 1;
7820
7821         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
7822         type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
7823
7824         types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
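        /*
         * Editor's note (assumption, not in the original source): bits 25 and
         * 26 of the IA32_VMX_EPT_VPID_CAP MSR advertise single-context and
         * all-context INVEPT; shifting by VMX_EPT_EXTENT_SHIFT and masking
         * with 6 keeps exactly the bits that correspond to the type values 1
         * and 2 tested below.
         */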
7825
7826         if (type >= 32 || !(types & (1 << type))) {
7827                 nested_vmx_failValid(vcpu,
7828                                 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
7829                 return kvm_skip_emulated_instruction(vcpu);
7830         }
7831
7832         /* According to the Intel VMX instruction reference, the memory
7833          * operand is read even if it isn't needed (e.g., for type==global)
7834          */
7835         if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
7836                         vmx_instruction_info, false, &gva))
7837                 return 1;
7838         if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
7839                                 sizeof(operand), &e)) {
7840                 kvm_inject_page_fault(vcpu, &e);
7841                 return 1;
7842         }
7843
7844         switch (type) {
7845         case VMX_EPT_EXTENT_GLOBAL:
7846         /*
7847          * TODO: track mappings and invalidate
7848          * single context requests appropriately
7849          */
7850         case VMX_EPT_EXTENT_CONTEXT:
7851                 kvm_mmu_sync_roots(vcpu);
7852                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
7853                 nested_vmx_succeed(vcpu);
7854                 break;
7855         default:
7856                 BUG_ON(1);
7857                 break;
7858         }
7859
7860         return kvm_skip_emulated_instruction(vcpu);
7861 }
7862
7863 static int handle_invvpid(struct kvm_vcpu *vcpu)
7864 {
7865         struct vcpu_vmx *vmx = to_vmx(vcpu);
7866         u32 vmx_instruction_info;
7867         unsigned long type, types;
7868         gva_t gva;
7869         struct x86_exception e;
7870         struct {
7871                 u64 vpid;
7872                 u64 gla;
7873         } operand;
7874
7875         if (!(vmx->nested.nested_vmx_secondary_ctls_high &
7876               SECONDARY_EXEC_ENABLE_VPID) ||
7877                         !(vmx->nested.nested_vmx_vpid_caps & VMX_VPID_INVVPID_BIT)) {
7878                 kvm_queue_exception(vcpu, UD_VECTOR);
7879                 return 1;
7880         }
7881
7882         if (!nested_vmx_check_permission(vcpu))
7883                 return 1;
7884
7885         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
7886         type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
7887
7888         types = (vmx->nested.nested_vmx_vpid_caps &
7889                         VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
7890
7891         if (type >= 32 || !(types & (1 << type))) {
7892                 nested_vmx_failValid(vcpu,
7893                         VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
7894                 return kvm_skip_emulated_instruction(vcpu);
7895         }
7896
7897         /* According to the Intel VMX instruction reference, the memory
7898          * operand is read even if it isn't needed (e.g., for type==global)
7899          */
7900         if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
7901                         vmx_instruction_info, false, &gva))
7902                 return 1;
7903         if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
7904                                 sizeof(operand), &e)) {
7905                 kvm_inject_page_fault(vcpu, &e);
7906                 return 1;
7907         }
7908         if (operand.vpid >> 16) {
7909                 nested_vmx_failValid(vcpu,
7910                         VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
7911                 return kvm_skip_emulated_instruction(vcpu);
7912         }
7913
7914         switch (type) {
7915         case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
7916                 if (is_noncanonical_address(operand.gla, vcpu)) {
7917                         nested_vmx_failValid(vcpu,
7918                                 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
7919                         return kvm_skip_emulated_instruction(vcpu);
7920                 }
7921                 /* fall through */
7922         case VMX_VPID_EXTENT_SINGLE_CONTEXT:
7923         case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
7924                 if (!operand.vpid) {
7925                         nested_vmx_failValid(vcpu,
7926                                 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
7927                         return kvm_skip_emulated_instruction(vcpu);
7928                 }
7929                 break;
7930         case VMX_VPID_EXTENT_ALL_CONTEXT:
7931                 break;
7932         default:
7933                 WARN_ON_ONCE(1);
7934                 return kvm_skip_emulated_instruction(vcpu);
7935         }
7936
7937         __vmx_flush_tlb(vcpu, vmx->nested.vpid02);
7938         nested_vmx_succeed(vcpu);
7939
7940         return kvm_skip_emulated_instruction(vcpu);
7941 }
7942
7943 static int handle_pml_full(struct kvm_vcpu *vcpu)
7944 {
7945         unsigned long exit_qualification;
7946
7947         trace_kvm_pml_full(vcpu->vcpu_id);
7948
7949         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7950
7951         /*
7952          * PML buffer FULL happened while executing iret from NMI,
7953          * "blocked by NMI" bit has to be set before next VM entry.
7954          */
7955         if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
7956                         enable_vnmi &&
7957                         (exit_qualification & INTR_INFO_UNBLOCK_NMI))
7958                 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
7959                                 GUEST_INTR_STATE_NMI);
7960
7961         /*
7962          * PML buffer already flushed at beginning of VMEXIT. Nothing to do
7963          * here, and there's no userspace involvement needed for PML.
7964          */
7965         return 1;
7966 }
7967
7968 static int handle_preemption_timer(struct kvm_vcpu *vcpu)
7969 {
7970         kvm_lapic_expired_hv_timer(vcpu);
7971         return 1;
7972 }
7973
7974 static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
7975 {
7976         struct vcpu_vmx *vmx = to_vmx(vcpu);
7977         int maxphyaddr = cpuid_maxphyaddr(vcpu);
7978
7979         /* Check for memory type validity */
7980         switch (address & VMX_EPTP_MT_MASK) {
7981         case VMX_EPTP_MT_UC:
7982                 if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_UC_BIT))
7983                         return false;
7984                 break;
7985         case VMX_EPTP_MT_WB:
7986                 if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_WB_BIT))
7987                         return false;
7988                 break;
7989         default:
7990                 return false;
7991         }
7992
7993         /* only 4 levels page-walk length are valid */
7994         if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)
7995                 return false;
7996
7997         /* Reserved bits should not be set */
7998         if (address >> maxphyaddr || ((address >> 7) & 0x1f))
7999                 return false;
8000
8001         /* AD, if set, should be supported */
8002         if (address & VMX_EPTP_AD_ENABLE_BIT) {
8003                 if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPT_AD_BIT))
8004                         return false;
8005         }
8006
8007         return true;
8008 }
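/*
 * Editor's sketch (not part of the original source): building an EPTP value
 * that passes the checks in valid_ept_address() above: write-back memory
 * type, a 4-level page walk, optional accessed/dirty tracking, and the root
 * table address in the upper bits.  The helper name is hypothetical.
 */
static inline u64 example_make_eptp(u64 root_hpa, bool ad_enabled)
{
	u64 eptp = VMX_EPTP_MT_WB | VMX_EPTP_PWL_4;

	if (ad_enabled)
		eptp |= VMX_EPTP_AD_ENABLE_BIT;

	return eptp | (root_hpa & PAGE_MASK);
}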
8009
8010 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
8011                                      struct vmcs12 *vmcs12)
8012 {
8013         u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
8014         u64 address;
8015         bool accessed_dirty;
8016         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
8017
8018         if (!nested_cpu_has_eptp_switching(vmcs12) ||
8019             !nested_cpu_has_ept(vmcs12))
8020                 return 1;
8021
8022         if (index >= VMFUNC_EPTP_ENTRIES)
8023                 return 1;
8024
8025
8026         if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
8027                                      &address, index * 8, 8))
8028                 return 1;
8029
8030         accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);
8031
8032         /*
8033          * If the (L2) guest does a vmfunc to the currently
8034          * active ept pointer, we don't have to do anything else
8035          */
8036         if (vmcs12->ept_pointer != address) {
8037                 if (!valid_ept_address(vcpu, address))
8038                         return 1;
8039
8040                 kvm_mmu_unload(vcpu);
8041                 mmu->ept_ad = accessed_dirty;
8042                 mmu->base_role.ad_disabled = !accessed_dirty;
8043                 vmcs12->ept_pointer = address;
8044                 /*
8045                  * TODO: Check what's the correct approach in case
8046                  * mmu reload fails. Currently, we just let the next
8047                  * reload potentially fail
8048                  */
8049                 kvm_mmu_reload(vcpu);
8050         }
8051
8052         return 0;
8053 }
8054
8055 static int handle_vmfunc(struct kvm_vcpu *vcpu)
8056 {
8057         struct vcpu_vmx *vmx = to_vmx(vcpu);
8058         struct vmcs12 *vmcs12;
8059         u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
8060
8061         /*
8062          * VMFUNC is only supported for nested guests, but we always enable the
8063          * secondary control for simplicity; for non-nested mode, fake that we
8064          * didn't enable it by injecting #UD.
8065          */
8066         if (!is_guest_mode(vcpu)) {
8067                 kvm_queue_exception(vcpu, UD_VECTOR);
8068                 return 1;
8069         }
8070
8071         vmcs12 = get_vmcs12(vcpu);
8072         if ((vmcs12->vm_function_control & (1 << function)) == 0)
8073                 goto fail;
8074
8075         switch (function) {
8076         case 0:
8077                 if (nested_vmx_eptp_switching(vcpu, vmcs12))
8078                         goto fail;
8079                 break;
8080         default:
8081                 goto fail;
8082         }
8083         return kvm_skip_emulated_instruction(vcpu);
8084
8085 fail:
8086         nested_vmx_vmexit(vcpu, vmx->exit_reason,
8087                           vmcs_read32(VM_EXIT_INTR_INFO),
8088                           vmcs_readl(EXIT_QUALIFICATION));
8089         return 1;
8090 }
8091
8092 /*
8093  * The exit handlers return 1 if the exit was handled fully and guest execution
8094  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
8095  * to be done to userspace and return 0.
8096  */
8097 static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
8098         [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
8099         [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
8100         [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
8101         [EXIT_REASON_NMI_WINDOW]              = handle_nmi_window,
8102         [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
8103         [EXIT_REASON_CR_ACCESS]               = handle_cr,
8104         [EXIT_REASON_DR_ACCESS]               = handle_dr,
8105         [EXIT_REASON_CPUID]                   = handle_cpuid,
8106         [EXIT_REASON_MSR_READ]                = handle_rdmsr,
8107         [EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
8108         [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
8109         [EXIT_REASON_HLT]                     = handle_halt,
8110         [EXIT_REASON_INVD]                    = handle_invd,
8111         [EXIT_REASON_INVLPG]                  = handle_invlpg,
8112         [EXIT_REASON_RDPMC]                   = handle_rdpmc,
8113         [EXIT_REASON_VMCALL]                  = handle_vmcall,
8114         [EXIT_REASON_VMCLEAR]                 = handle_vmclear,
8115         [EXIT_REASON_VMLAUNCH]                = handle_vmlaunch,
8116         [EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
8117         [EXIT_REASON_VMPTRST]                 = handle_vmptrst,
8118         [EXIT_REASON_VMREAD]                  = handle_vmread,
8119         [EXIT_REASON_VMRESUME]                = handle_vmresume,
8120         [EXIT_REASON_VMWRITE]                 = handle_vmwrite,
8121         [EXIT_REASON_VMOFF]                   = handle_vmoff,
8122         [EXIT_REASON_VMON]                    = handle_vmon,
8123         [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
8124         [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
8125         [EXIT_REASON_APIC_WRITE]              = handle_apic_write,
8126         [EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
8127         [EXIT_REASON_WBINVD]                  = handle_wbinvd,
8128         [EXIT_REASON_XSETBV]                  = handle_xsetbv,
8129         [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
8130         [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
8131         [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
8132         [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
8133         [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
8134         [EXIT_REASON_MWAIT_INSTRUCTION]       = handle_mwait,
8135         [EXIT_REASON_MONITOR_TRAP_FLAG]       = handle_monitor_trap,
8136         [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_monitor,
8137         [EXIT_REASON_INVEPT]                  = handle_invept,
8138         [EXIT_REASON_INVVPID]                 = handle_invvpid,
8139         [EXIT_REASON_RDRAND]                  = handle_invalid_op,
8140         [EXIT_REASON_RDSEED]                  = handle_invalid_op,
8141         [EXIT_REASON_XSAVES]                  = handle_xsaves,
8142         [EXIT_REASON_XRSTORS]                 = handle_xrstors,
8143         [EXIT_REASON_PML_FULL]                = handle_pml_full,
8144         [EXIT_REASON_VMFUNC]                  = handle_vmfunc,
8145         [EXIT_REASON_PREEMPTION_TIMER]        = handle_preemption_timer,
8146 };
8147
8148 static const int kvm_vmx_max_exit_handlers =
8149         ARRAY_SIZE(kvm_vmx_exit_handlers);
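/*
 * Editor's sketch (assumption, not part of the original source): the table
 * above is consumed by the exit dispatch in vmx_handle_exit(), which lies
 * outside this excerpt, roughly along these lines.  The helper name is
 * hypothetical.
 */
static inline int example_dispatch_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
{
	if (exit_reason < kvm_vmx_max_exit_handlers &&
	    kvm_vmx_exit_handlers[exit_reason])
		return kvm_vmx_exit_handlers[exit_reason](vcpu);

	/* Unknown exit reason: defer the decision to userspace. */
	return 0;
}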
8150
8151 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
8152                                        struct vmcs12 *vmcs12)
8153 {
8154         unsigned long exit_qualification;
8155         gpa_t bitmap, last_bitmap;
8156         unsigned int port;
8157         int size;
8158         u8 b;
8159
8160         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
8161                 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
8162
8163         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
8164
8165         port = exit_qualification >> 16;
8166         size = (exit_qualification & 7) + 1;
8167
8168         last_bitmap = (gpa_t)-1;
8169         b = -1;
8170
8171         while (size > 0) {
8172                 if (port < 0x8000)
8173                         bitmap = vmcs12->io_bitmap_a;
8174                 else if (port < 0x10000)
8175                         bitmap = vmcs12->io_bitmap_b;
8176                 else
8177                         return true;
8178                 bitmap += (port & 0x7fff) / 8;
8179
8180                 if (last_bitmap != bitmap)
8181                         if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
8182                                 return true;
8183                 if (b & (1 << (port & 7)))
8184                         return true;
8185
8186                 port++;
8187                 size--;
8188                 last_bitmap = bitmap;
8189         }
8190
8191         return false;
8192 }
8193
8194 /*
8195  * Return 1 if we should exit from L2 to L1 to handle an MSR access,
8196  * rather than handle it ourselves in L0. I.e., check whether L1 expressed
8197  * disinterest in the current event (read or write a specific MSR) by using an
8198  * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
8199  */
8200 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
8201         struct vmcs12 *vmcs12, u32 exit_reason)
8202 {
8203         u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
8204         gpa_t bitmap;
8205
8206         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
8207                 return true;
8208
8209         /*
8210          * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
8211          * for the four combinations of read/write and low/high MSR numbers.
8212          * First we need to figure out which of the four to use:
8213          */
8214         bitmap = vmcs12->msr_bitmap;
8215         if (exit_reason == EXIT_REASON_MSR_WRITE)
8216                 bitmap += 2048;
8217         if (msr_index >= 0xc0000000) {
8218                 msr_index -= 0xc0000000;
8219                 bitmap += 1024;
8220         }
8221
8222         /* Then read the msr_index'th bit from this bitmap: */
8223         if (msr_index < 1024*8) {
8224                 unsigned char b;
8225                 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
8226                         return true;
8227                 return 1 & (b >> (msr_index & 7));
8228         } else
8229                 return true; /* let L1 handle the wrong parameter */
8230 }
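/*
 * Editor's sketch (not part of the original source): the byte and bit
 * position of a given MSR inside a VMX MSR-bitmap page, following the layout
 * described in the comment above.  'write' selects the write bitmaps at
 * offset 2048; MSRs 0xc0000000..0xc0001fff use the second 1024-byte bitmap
 * of each pair.  The helper name is hypothetical.
 */
static inline void example_msr_bitmap_pos(u32 msr, bool write,
					  u32 *byte_ofs, u8 *bit)
{
	u32 ofs = write ? 2048 : 0;

	if (msr >= 0xc0000000) {
		msr -= 0xc0000000;
		ofs += 1024;
	}

	*byte_ofs = ofs + msr / 8;
	*bit = msr & 7;
}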
8231
8232 /*
8233  * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
8234  * rather than handle it ourselves in L0. I.e., check if L1 wanted to
8235  * intercept (via guest_host_mask etc.) the current event.
8236  */
8237 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
8238         struct vmcs12 *vmcs12)
8239 {
8240         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
8241         int cr = exit_qualification & 15;
8242         int reg;
8243         unsigned long val;
8244
8245         switch ((exit_qualification >> 4) & 3) {
8246         case 0: /* mov to cr */
8247                 reg = (exit_qualification >> 8) & 15;
8248                 val = kvm_register_readl(vcpu, reg);
8249                 switch (cr) {
8250                 case 0:
8251                         if (vmcs12->cr0_guest_host_mask &
8252                             (val ^ vmcs12->cr0_read_shadow))
8253                                 return true;
8254                         break;
8255                 case 3:
8256                         if ((vmcs12->cr3_target_count >= 1 &&
8257                                         vmcs12->cr3_target_value0 == val) ||
8258                                 (vmcs12->cr3_target_count >= 2 &&
8259                                         vmcs12->cr3_target_value1 == val) ||
8260                                 (vmcs12->cr3_target_count >= 3 &&
8261                                         vmcs12->cr3_target_value2 == val) ||
8262                                 (vmcs12->cr3_target_count >= 4 &&
8263                                         vmcs12->cr3_target_value3 == val))
8264                                 return false;
8265                         if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
8266                                 return true;
8267                         break;
8268                 case 4:
8269                         if (vmcs12->cr4_guest_host_mask &
8270                             (vmcs12->cr4_read_shadow ^ val))
8271                                 return true;
8272                         break;
8273                 case 8:
8274                         if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
8275                                 return true;
8276                         break;
8277                 }
8278                 break;
8279         case 2: /* clts */
8280                 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
8281                     (vmcs12->cr0_read_shadow & X86_CR0_TS))
8282                         return true;
8283                 break;
8284         case 1: /* mov from cr */
8285                 switch (cr) {
8286                 case 3:
8287                         if (vmcs12->cpu_based_vm_exec_control &
8288                             CPU_BASED_CR3_STORE_EXITING)
8289                                 return true;
8290                         break;
8291                 case 8:
8292                         if (vmcs12->cpu_based_vm_exec_control &
8293                             CPU_BASED_CR8_STORE_EXITING)
8294                                 return true;
8295                         break;
8296                 }
8297                 break;
8298         case 3: /* lmsw */
8299                 /*
8300                  * lmsw can change bits 1..3 of cr0, and only set bit 0 of
8301                  * cr0. Other attempted changes are ignored, with no exit.
8302                  */
8303                 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
8304                 if (vmcs12->cr0_guest_host_mask & 0xe &
8305                     (val ^ vmcs12->cr0_read_shadow))
8306                         return true;
8307                 if ((vmcs12->cr0_guest_host_mask & 0x1) &&
8308                     !(vmcs12->cr0_read_shadow & 0x1) &&
8309                     (val & 0x1))
8310                         return true;
8311                 break;
8312         }
8313         return false;
8314 }
8315
8316 /*
8317  * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
8318  * should handle it ourselves in L0 (and then continue L2). Only call this
8319  * when in is_guest_mode (L2).
8320  */
8321 static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
8322 {
8323         u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
8324         struct vcpu_vmx *vmx = to_vmx(vcpu);
8325         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
8326
8327         if (vmx->nested.nested_run_pending)
8328                 return false;
8329
8330         if (unlikely(vmx->fail)) {
8331                 pr_info_ratelimited("%s failed vm entry %x\n", __func__,
8332                                     vmcs_read32(VM_INSTRUCTION_ERROR));
8333                 return true;
8334         }
8335
8336         /*
8337          * The host physical addresses of some pages of guest memory
8338          * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
8339          * Page). The CPU may write to these pages via their host
8340          * physical address while L2 is running, bypassing any
8341          * address-translation-based dirty tracking (e.g. EPT write
8342          * protection).
8343          *
8344          * Mark them dirty on every exit from L2 to prevent them from
8345          * getting out of sync with dirty tracking.
8346          */
8347         nested_mark_vmcs12_pages_dirty(vcpu);
8348
8349         trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
8350                                 vmcs_readl(EXIT_QUALIFICATION),
8351                                 vmx->idt_vectoring_info,
8352                                 intr_info,
8353                                 vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
8354                                 KVM_ISA_VMX);
8355
8356         switch (exit_reason) {
8357         case EXIT_REASON_EXCEPTION_NMI:
8358                 if (is_nmi(intr_info))
8359                         return false;
8360                 else if (is_page_fault(intr_info))
8361                         return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
8362                 else if (is_no_device(intr_info) &&
8363                          !(vmcs12->guest_cr0 & X86_CR0_TS))
8364                         return false;
8365                 else if (is_debug(intr_info) &&
8366                          vcpu->guest_debug &
8367                          (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
8368                         return false;
8369                 else if (is_breakpoint(intr_info) &&
8370                          vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
8371                         return false;
8372                 return vmcs12->exception_bitmap &
8373                                 (1u << (intr_info & INTR_INFO_VECTOR_MASK));
8374         case EXIT_REASON_EXTERNAL_INTERRUPT:
8375                 return false;
8376         case EXIT_REASON_TRIPLE_FAULT:
8377                 return true;
8378         case EXIT_REASON_PENDING_INTERRUPT:
8379                 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
8380         case EXIT_REASON_NMI_WINDOW:
8381                 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
8382         case EXIT_REASON_TASK_SWITCH:
8383                 return true;
8384         case EXIT_REASON_CPUID:
8385                 return true;
8386         case EXIT_REASON_HLT:
8387                 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
8388         case EXIT_REASON_INVD:
8389                 return true;
8390         case EXIT_REASON_INVLPG:
8391                 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
8392         case EXIT_REASON_RDPMC:
8393                 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
8394         case EXIT_REASON_RDRAND:
8395                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
8396         case EXIT_REASON_RDSEED:
8397                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
8398         case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
8399                 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
8400         case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
8401         case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
8402         case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
8403         case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
8404         case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
8405         case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
8406                 /*
8407                  * VMX instructions trap unconditionally. This allows L1 to
8408                  * emulate them for its L2 guest, i.e., allows 3-level nesting!
8409                  */
8410                 return true;
8411         case EXIT_REASON_CR_ACCESS:
8412                 return nested_vmx_exit_handled_cr(vcpu, vmcs12);
8413         case EXIT_REASON_DR_ACCESS:
8414                 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
8415         case EXIT_REASON_IO_INSTRUCTION:
8416                 return nested_vmx_exit_handled_io(vcpu, vmcs12);
8417         case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
8418                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
8419         case EXIT_REASON_MSR_READ:
8420         case EXIT_REASON_MSR_WRITE:
8421                 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
8422         case EXIT_REASON_INVALID_STATE:
8423                 return true;
8424         case EXIT_REASON_MWAIT_INSTRUCTION:
8425                 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
8426         case EXIT_REASON_MONITOR_TRAP_FLAG:
8427                 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
8428         case EXIT_REASON_MONITOR_INSTRUCTION:
8429                 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
8430         case EXIT_REASON_PAUSE_INSTRUCTION:
8431                 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
8432                         nested_cpu_has2(vmcs12,
8433                                 SECONDARY_EXEC_PAUSE_LOOP_EXITING);
8434         case EXIT_REASON_MCE_DURING_VMENTRY:
8435                 return false;
8436         case EXIT_REASON_TPR_BELOW_THRESHOLD:
8437                 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
8438         case EXIT_REASON_APIC_ACCESS:
8439                 return nested_cpu_has2(vmcs12,
8440                         SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
8441         case EXIT_REASON_APIC_WRITE:
8442         case EXIT_REASON_EOI_INDUCED:
8443                 /* apic_write and eoi_induced should exit unconditionally. */
8444                 return true;
8445         case EXIT_REASON_EPT_VIOLATION:
8446                 /*
8447                  * L0 always deals with the EPT violation. If nested EPT is
8448                  * used, and the nested mmu code discovers that the address is
8449                  * missing in the guest EPT table (EPT12), the EPT violation
8450                  * will be injected with nested_ept_inject_page_fault()
8451                  */
8452                 return false;
8453         case EXIT_REASON_EPT_MISCONFIG:
8454                 /*
8455                  * L2 never uses directly L1's EPT, but rather L0's own EPT
8456                  * L2 never uses L1's EPT directly, but rather L0's own EPT
8457                  * (EPT on EPT). So any problems with the structure of the
8458                  * table is L0's fault.
8459                  * table are L0's fault.
8460                 return false;
8461         case EXIT_REASON_INVPCID:
8462                 return
8463                         nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
8464                         nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
8465         case EXIT_REASON_WBINVD:
8466                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
8467         case EXIT_REASON_XSETBV:
8468                 return true;
8469         case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
8470                 /*
8471                  * This should never happen, since it is not possible to
8472                  * set XSS to a non-zero value---neither in L1 nor in L2.
8473                  * If it were, XSS would have to be checked against
8474                  * the XSS exit bitmap in vmcs12.
8475                  */
8476                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
8477         case EXIT_REASON_PREEMPTION_TIMER:
8478                 return false;
8479         case EXIT_REASON_PML_FULL:
8480                 /* We emulate PML support to L1. */
8481                 return false;
8482         case EXIT_REASON_VMFUNC:
8483                 /* VM functions are emulated through L2->L0 vmexits. */
8484                 return false;
8485         default:
8486                 return true;
8487         }
8488 }
8489
8490 static int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
8491 {
8492         u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
8493
8494         /*
8495          * At this point, the exit interruption info in exit_intr_info
8496          * is only valid for EXCEPTION_NMI exits.  For EXTERNAL_INTERRUPT
8497          * we need to query the in-kernel LAPIC.
8498          */
8499         WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
8500         if ((exit_intr_info &
8501              (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
8502             (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
8503                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
8504                 vmcs12->vm_exit_intr_error_code =
8505                         vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
8506         }
8507
8508         nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info,
8509                           vmcs_readl(EXIT_QUALIFICATION));
8510         return 1;
8511 }
8512
8513 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
8514 {
8515         *info1 = vmcs_readl(EXIT_QUALIFICATION);
8516         *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
8517 }
8518
8519 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
8520 {
8521         if (vmx->pml_pg) {
8522                 __free_page(vmx->pml_pg);
8523                 vmx->pml_pg = NULL;
8524         }
8525 }
8526
8527 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
8528 {
8529         struct vcpu_vmx *vmx = to_vmx(vcpu);
8530         u64 *pml_buf;
8531         u16 pml_idx;
8532
8533         pml_idx = vmcs_read16(GUEST_PML_INDEX);
8534
8535         /* Do nothing if PML buffer is empty */
8536         if (pml_idx == (PML_ENTITY_NUM - 1))
8537                 return;
8538
8539         /* PML index always points to next available PML buffer entity */
8540         if (pml_idx >= PML_ENTITY_NUM)
8541                 pml_idx = 0;
8542         else
8543                 pml_idx++;
8544
8545         pml_buf = page_address(vmx->pml_pg);
8546         for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
8547                 u64 gpa;
8548
8549                 gpa = pml_buf[pml_idx];
8550                 WARN_ON(gpa & (PAGE_SIZE - 1));
8551                 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
8552         }
8553
8554         /* reset PML index */
8555         vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
8556 }
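/*
 * Editor's note (illustration, not part of the original source): assuming
 * PML_ENTITY_NUM == 512, the hardware starts GUEST_PML_INDEX at 511 and
 * decrements it after logging each GPA.  If the index reads back as 508,
 * entries 509..511 hold valid GPAs, so the loop above starts at
 * pml_idx + 1 == 509.  A completely full buffer leaves the index at 0xffff,
 * which the "pml_idx >= PML_ENTITY_NUM" check folds back to entry 0.
 */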
8557
8558 /*
8559  * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
8560  * Called before reporting dirty_bitmap to userspace.
8561  */
8562 static void kvm_flush_pml_buffers(struct kvm *kvm)
8563 {
8564         int i;
8565         struct kvm_vcpu *vcpu;
8566         /*
8567          * We only need to kick each vcpu out of guest mode here, as the PML
8568          * buffer is flushed at the beginning of every VMEXIT, so only vcpus
8569          * currently running in guest mode can have unflushed GPAs in their
8570          * PML buffers.
8571          */
8572         kvm_for_each_vcpu(i, vcpu, kvm)
8573                 kvm_vcpu_kick(vcpu);
8574 }
8575
8576 static void vmx_dump_sel(char *name, uint32_t sel)
8577 {
8578         pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
8579                name, vmcs_read16(sel),
8580                vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
8581                vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
8582                vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
8583 }
8584
8585 static void vmx_dump_dtsel(char *name, uint32_t limit)
8586 {
8587         pr_err("%s                           limit=0x%08x, base=0x%016lx\n",
8588                name, vmcs_read32(limit),
8589                vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
8590 }
8591
8592 static void dump_vmcs(void)
8593 {
8594         u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
8595         u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
8596         u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
8597         u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
8598         u32 secondary_exec_control = 0;
8599         unsigned long cr4 = vmcs_readl(GUEST_CR4);
8600         u64 efer = vmcs_read64(GUEST_IA32_EFER);
8601         int i, n;
8602
8603         if (cpu_has_secondary_exec_ctrls())
8604                 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
8605
8606         pr_err("*** Guest State ***\n");
8607         pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
8608                vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
8609                vmcs_readl(CR0_GUEST_HOST_MASK));
8610         pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
8611                cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
8612         pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
8613         if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
8614             (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
8615         {
8616                 pr_err("PDPTR0 = 0x%016llx  PDPTR1 = 0x%016llx\n",
8617                        vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
8618                 pr_err("PDPTR2 = 0x%016llx  PDPTR3 = 0x%016llx\n",
8619                        vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
8620         }
8621         pr_err("RSP = 0x%016lx  RIP = 0x%016lx\n",
8622                vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
8623         pr_err("RFLAGS=0x%08lx         DR7 = 0x%016lx\n",
8624                vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
8625         pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
8626                vmcs_readl(GUEST_SYSENTER_ESP),
8627                vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
8628         vmx_dump_sel("CS:  ", GUEST_CS_SELECTOR);
8629         vmx_dump_sel("DS:  ", GUEST_DS_SELECTOR);
8630         vmx_dump_sel("SS:  ", GUEST_SS_SELECTOR);
8631         vmx_dump_sel("ES:  ", GUEST_ES_SELECTOR);
8632         vmx_dump_sel("FS:  ", GUEST_FS_SELECTOR);
8633         vmx_dump_sel("GS:  ", GUEST_GS_SELECTOR);
8634         vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
8635         vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
8636         vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
8637         vmx_dump_sel("TR:  ", GUEST_TR_SELECTOR);
8638         if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
8639             (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
8640                 pr_err("EFER =     0x%016llx  PAT = 0x%016llx\n",
8641                        efer, vmcs_read64(GUEST_IA32_PAT));
8642         pr_err("DebugCtl = 0x%016llx  DebugExceptions = 0x%016lx\n",
8643                vmcs_read64(GUEST_IA32_DEBUGCTL),
8644                vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
8645         if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
8646                 pr_err("PerfGlobCtl = 0x%016llx\n",
8647                        vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
8648         if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
8649                 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
8650         pr_err("Interruptibility = %08x  ActivityState = %08x\n",
8651                vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
8652                vmcs_read32(GUEST_ACTIVITY_STATE));
8653         if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
8654                 pr_err("InterruptStatus = %04x\n",
8655                        vmcs_read16(GUEST_INTR_STATUS));
8656
8657         pr_err("*** Host State ***\n");
8658         pr_err("RIP = 0x%016lx  RSP = 0x%016lx\n",
8659                vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
8660         pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
8661                vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
8662                vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
8663                vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
8664                vmcs_read16(HOST_TR_SELECTOR));
8665         pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
8666                vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
8667                vmcs_readl(HOST_TR_BASE));
8668         pr_err("GDTBase=%016lx IDTBase=%016lx\n",
8669                vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
8670         pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
8671                vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
8672                vmcs_readl(HOST_CR4));
8673         pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
8674                vmcs_readl(HOST_IA32_SYSENTER_ESP),
8675                vmcs_read32(HOST_IA32_SYSENTER_CS),
8676                vmcs_readl(HOST_IA32_SYSENTER_EIP));
8677         if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
8678                 pr_err("EFER = 0x%016llx  PAT = 0x%016llx\n",
8679                        vmcs_read64(HOST_IA32_EFER),
8680                        vmcs_read64(HOST_IA32_PAT));
8681         if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
8682                 pr_err("PerfGlobCtl = 0x%016llx\n",
8683                        vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
8684
8685         pr_err("*** Control State ***\n");
8686         pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
8687                pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
8688         pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
8689         pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
8690                vmcs_read32(EXCEPTION_BITMAP),
8691                vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
8692                vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
8693         pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
8694                vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
8695                vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
8696                vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
8697         pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
8698                vmcs_read32(VM_EXIT_INTR_INFO),
8699                vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
8700                vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
8701         pr_err("        reason=%08x qualification=%016lx\n",
8702                vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
8703         pr_err("IDTVectoring: info=%08x errcode=%08x\n",
8704                vmcs_read32(IDT_VECTORING_INFO_FIELD),
8705                vmcs_read32(IDT_VECTORING_ERROR_CODE));
8706         pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
8707         if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
8708                 pr_err("TSC Multiplier = 0x%016llx\n",
8709                        vmcs_read64(TSC_MULTIPLIER));
8710         if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
8711                 pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
8712         if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
8713                 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
8714         if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
8715                 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
8716         n = vmcs_read32(CR3_TARGET_COUNT);
8717         for (i = 0; i + 1 < n; i += 4)
8718                 pr_err("CR3 target%u=%016lx target%u=%016lx\n",
8719                        i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
8720                        i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
8721         if (i < n)
8722                 pr_err("CR3 target%u=%016lx\n",
8723                        i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
8724         if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
8725                 pr_err("PLE Gap=%08x Window=%08x\n",
8726                        vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
8727         if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
8728                 pr_err("Virtual processor ID = 0x%04x\n",
8729                        vmcs_read16(VIRTUAL_PROCESSOR_ID));
8730 }
8731
8732 /*
8733  * The guest has exited.  See if we can fix it or if we need userspace
8734  * assistance.
8735  */
8736 static int vmx_handle_exit(struct kvm_vcpu *vcpu)
8737 {
8738         struct vcpu_vmx *vmx = to_vmx(vcpu);
8739         u32 exit_reason = vmx->exit_reason;
8740         u32 vectoring_info = vmx->idt_vectoring_info;
8741
8742         trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
8743
8744         /*
8745          * Flush the logged GPAs out of the PML buffer so that dirty_bitmap
8746          * stays up to date.  Another benefit: in kvm_vm_ioctl_get_dirty_log,
8747          * before querying dirty_bitmap, we only need to kick all vcpus out
8748          * of guest mode, because once a vcpu is back in root mode its PML
8749          * buffer must already have been flushed.
8750          */
8751         if (enable_pml)
8752                 vmx_flush_pml_buffer(vcpu);
8753
8754         /* If guest state is invalid, start emulating */
8755         if (vmx->emulation_required)
8756                 return handle_invalid_guest_state(vcpu);
8757
8758         if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
8759                 return nested_vmx_reflect_vmexit(vcpu, exit_reason);
8760
8761         if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
8762                 dump_vmcs();
8763                 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
8764                 vcpu->run->fail_entry.hardware_entry_failure_reason
8765                         = exit_reason;
8766                 return 0;
8767         }
8768
8769         if (unlikely(vmx->fail)) {
8770                 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
8771                 vcpu->run->fail_entry.hardware_entry_failure_reason
8772                         = vmcs_read32(VM_INSTRUCTION_ERROR);
8773                 return 0;
8774         }
8775
8776         /*
8777          * Note:
8778          * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by an
8779          * event delivery, since that indicates the guest is accessing MMIO.
8780          * The vm-exit would be triggered again after returning to the guest,
8781          * causing an infinite loop.
8782          */
8783         if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
8784                         (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
8785                         exit_reason != EXIT_REASON_EPT_VIOLATION &&
8786                         exit_reason != EXIT_REASON_PML_FULL &&
8787                         exit_reason != EXIT_REASON_TASK_SWITCH)) {
8788                 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
8789                 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
8790                 vcpu->run->internal.ndata = 3;
8791                 vcpu->run->internal.data[0] = vectoring_info;
8792                 vcpu->run->internal.data[1] = exit_reason;
8793                 vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
8794                 if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
8795                         vcpu->run->internal.ndata++;
8796                         vcpu->run->internal.data[3] =
8797                                 vmcs_read64(GUEST_PHYSICAL_ADDRESS);
8798                 }
8799                 return 0;
8800         }
8801
8802         if (unlikely(!enable_vnmi &&
8803                      vmx->loaded_vmcs->soft_vnmi_blocked)) {
8804                 if (vmx_interrupt_allowed(vcpu)) {
8805                         vmx->loaded_vmcs->soft_vnmi_blocked = 0;
8806                 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
8807                            vcpu->arch.nmi_pending) {
8808                         /*
8809                          * This CPU doesn't help us find the end of an
8810                          * NMI-blocked window if the guest runs with IRQs
8811                          * disabled. So we pull the trigger after 1 s of
8812                          * futile waiting, but inform the user about this.
8813                          */
8814                         printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
8815                                "state on VCPU %d after 1 s timeout\n",
8816                                __func__, vcpu->vcpu_id);
8817                         vmx->loaded_vmcs->soft_vnmi_blocked = 0;
8818                 }
8819         }
8820
8821         if (exit_reason < kvm_vmx_max_exit_handlers
8822             && kvm_vmx_exit_handlers[exit_reason])
8823                 return kvm_vmx_exit_handlers[exit_reason](vcpu);
8824         else {
8825                 vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
8826                                 exit_reason);
8827                 kvm_queue_exception(vcpu, UD_VECTOR);
8828                 return 1;
8829         }
8830 }
8831
8832 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
8833 {
8834         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
8835
8836         if (is_guest_mode(vcpu) &&
8837                 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
8838                 return;
8839
8840         if (irr == -1 || tpr < irr) {
8841                 vmcs_write32(TPR_THRESHOLD, 0);
8842                 return;
8843         }
8844
8845         vmcs_write32(TPR_THRESHOLD, irr);
8846 }
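/*
 * Editor's note: this callback receives the current TPR and the priority of
 * the highest pending interrupt (irr), or -1 if none is pending.  If nothing
 * is pending, or the pending interrupt outranks the TPR (tpr < irr), it can
 * be delivered right away, so the threshold stays 0 and no extra exits are
 * taken.  Otherwise the threshold is set to irr, so that a guest write
 * lowering the TPR below that priority causes a TPR-below-threshold exit and
 * gives KVM a chance to inject the interrupt.
 */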
8847
8848 static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
8849 {
8850         u32 sec_exec_control;
8851
8852         /* Postpone execution until vmcs01 is the current VMCS. */
8853         if (is_guest_mode(vcpu)) {
8854                 to_vmx(vcpu)->nested.change_vmcs01_virtual_x2apic_mode = true;
8855                 return;
8856         }
8857
8858         if (!cpu_has_vmx_virtualize_x2apic_mode())
8859                 return;
8860
8861         if (!cpu_need_tpr_shadow(vcpu))
8862                 return;
8863
8864         sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
8865
8866         if (set) {
8867                 sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
8868                 sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
8869         } else {
8870                 sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
8871                 sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
8872                 vmx_flush_tlb_ept_only(vcpu);
8873         }
8874         vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
8875
8876         vmx_update_msr_bitmap(vcpu);
8877 }
8878
8879 static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
8880 {
8881         struct vcpu_vmx *vmx = to_vmx(vcpu);
8882
8883         /*
8884          * Currently we do not handle the nested case where L2 has an
8885          * APIC access page of its own; that page is still pinned.
8886          * Hence, we skip the case where the VCPU is in guest mode _and_
8887          * L1 prepared an APIC access page for L2.
8888          *
8889          * For the case where L1 and L2 share the same APIC access page
8890          * (flexpriority=Y but SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES clear
8891          * in the vmcs12), this function will only update either the vmcs01
8892          * or the vmcs02.  If the former, the vmcs02 will be updated by
8893          * prepare_vmcs02.  If the latter, the vmcs01 will be updated in
8894          * the next L2->L1 exit.
8895          */
8896         if (!is_guest_mode(vcpu) ||
8897             !nested_cpu_has2(get_vmcs12(&vmx->vcpu),
8898                              SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
8899                 vmcs_write64(APIC_ACCESS_ADDR, hpa);
8900                 vmx_flush_tlb_ept_only(vcpu);
8901         }
8902 }
8903
8904 static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
8905 {
8906         u16 status;
8907         u8 old;
8908
8909         if (max_isr == -1)
8910                 max_isr = 0;
8911
8912         status = vmcs_read16(GUEST_INTR_STATUS);
8913         old = status >> 8;
8914         if (max_isr != old) {
8915                 status &= 0xff;
8916                 status |= max_isr << 8;
8917                 vmcs_write16(GUEST_INTR_STATUS, status);
8918         }
8919 }
8920
8921 static void vmx_set_rvi(int vector)
8922 {
8923         u16 status;
8924         u8 old;
8925
8926         if (vector == -1)
8927                 vector = 0;
8928
8929         status = vmcs_read16(GUEST_INTR_STATUS);
8930         old = (u8)status & 0xff;
8931         if ((u8)vector != old) {
8932                 status &= ~0xff;
8933                 status |= (u8)vector;
8934                 vmcs_write16(GUEST_INTR_STATUS, status);
8935         }
8936 }
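/*
 * Editor's note: GUEST_INTR_STATUS packs two 8-bit fields: the low byte is
 * RVI (requesting virtual interrupt), written by vmx_set_rvi() above, and
 * the high byte is SVI (servicing virtual interrupt), written by
 * vmx_hwapic_isr_update().  Hardware uses RVI/SVI to evaluate pending
 * virtual interrupts when virtual interrupt delivery is enabled.
 */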
8937
8938 static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
8939 {
8940         if (!is_guest_mode(vcpu)) {
8941                 vmx_set_rvi(max_irr);
8942                 return;
8943         }
8944
8945         if (max_irr == -1)
8946                 return;
8947
8948         /*
8949          * In guest mode.  If a vmexit is needed, vmx_check_nested_events
8950          * handles it.
8951          */
8952         if (nested_exit_on_intr(vcpu))
8953                 return;
8954
8955         /*
8956          * Else, fall back to pre-APICv interrupt injection since L2
8957          * is run without virtual interrupt delivery.
8958          */
8959         if (!kvm_event_needs_reinjection(vcpu) &&
8960             vmx_interrupt_allowed(vcpu)) {
8961                 kvm_queue_interrupt(vcpu, max_irr, false);
8962                 vmx_inject_irq(vcpu);
8963         }
8964 }
8965
8966 static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
8967 {
8968         struct vcpu_vmx *vmx = to_vmx(vcpu);
8969         int max_irr;
8970
8971         WARN_ON(!vcpu->arch.apicv_active);
8972         if (pi_test_on(&vmx->pi_desc)) {
8973                 pi_clear_on(&vmx->pi_desc);
8974                 /*
8975                  * IOMMU can write to PIR.ON, so the barrier matters even on UP.
8976                  * But on x86 this is just a compiler barrier anyway.
8977                  */
8978                 smp_mb__after_atomic();
8979                 max_irr = kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
8980         } else {
8981                 max_irr = kvm_lapic_find_highest_irr(vcpu);
8982         }
8983         vmx_hwapic_irr_update(vcpu, max_irr);
8984         return max_irr;
8985 }
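/*
 * Editor's note: the posted-interrupt descriptor's ON bit is cleared before
 * PIR is read, so a vector posted concurrently (by another CPU or by the
 * IOMMU) either lands in the PIR words that kvm_apic_update_irr() is about
 * to merge into the vIRR, or sets ON again and is picked up on the next
 * sync.
 */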
8986
8987 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
8988 {
8989         if (!kvm_vcpu_apicv_active(vcpu))
8990                 return;
8991
8992         vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
8993         vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
8994         vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
8995         vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
8996 }
8997
8998 static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
8999 {
9000         struct vcpu_vmx *vmx = to_vmx(vcpu);
9001
9002         pi_clear_on(&vmx->pi_desc);
9003         memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
9004 }
9005
9006 static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
9007 {
9008         u32 exit_intr_info = 0;
9009         u16 basic_exit_reason = (u16)vmx->exit_reason;
9010
9011         if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
9012               || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI))
9013                 return;
9014
9015         if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
9016                 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
9017         vmx->exit_intr_info = exit_intr_info;
9018
9019         /* If the exit was due to a page fault, check for an async page fault. */
9020         if (is_page_fault(exit_intr_info))
9021                 vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
9022
9023         /* Handle machine checks before interrupts are enabled */
9024         if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY ||
9025             is_machine_check(exit_intr_info))
9026                 kvm_machine_check();
9027
9028         /* We need to handle NMIs before interrupts are enabled */
9029         if (is_nmi(exit_intr_info)) {
9030                 kvm_before_handle_nmi(&vmx->vcpu);
9031                 asm("int $2");
9032                 kvm_after_handle_nmi(&vmx->vcpu);
9033         }
9034 }
9035
9036 static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
9037 {
9038         u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
9039
9040         if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
9041                         == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
9042                 unsigned int vector;
9043                 unsigned long entry;
9044                 gate_desc *desc;
9045                 struct vcpu_vmx *vmx = to_vmx(vcpu);
9046 #ifdef CONFIG_X86_64
9047                 unsigned long tmp;
9048 #endif
9049
9050                 vector =  exit_intr_info & INTR_INFO_VECTOR_MASK;
9051                 desc = (gate_desc *)vmx->host_idt_base + vector;
9052                 entry = gate_offset(desc);
9053                 asm volatile(
9054 #ifdef CONFIG_X86_64
9055                         "mov %%" _ASM_SP ", %[sp]\n\t"
9056                         "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
9057                         "push $%c[ss]\n\t"
9058                         "push %[sp]\n\t"
9059 #endif
9060                         "pushf\n\t"
9061                         __ASM_SIZE(push) " $%c[cs]\n\t"
9062                         "call *%[entry]\n\t"
9063                         :
9064 #ifdef CONFIG_X86_64
9065                         [sp]"=&r"(tmp),
9066 #endif
9067                         ASM_CALL_CONSTRAINT
9068                         :
9069                         [entry]"r"(entry),
9070                         [ss]"i"(__KERNEL_DS),
9071                         [cs]"i"(__KERNEL_CS)
9072                         );
9073         }
9074 }
9075 STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
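/*
 * Editor's note: when the "acknowledge interrupt on exit" VM-exit control is
 * in use (which is when the interrupt info above is valid), the external
 * interrupt has already been acked by the CPU during the VM exit and will
 * not fire again once IRQs are re-enabled.  The asm block above therefore
 * dispatches the handler manually: it builds the stack frame an interrupt
 * gate would have produced (SS:RSP on 64-bit, then RFLAGS and CS, with the
 * return address supplied by the call) and calls the host IDT entry for the
 * acked vector.
 */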
9076
9077 static bool vmx_has_high_real_mode_segbase(void)
9078 {
9079         return enable_unrestricted_guest || emulate_invalid_guest_state;
9080 }
9081
9082 static bool vmx_mpx_supported(void)
9083 {
9084         return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
9085                 (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
9086 }
9087
9088 static bool vmx_xsaves_supported(void)
9089 {
9090         return vmcs_config.cpu_based_2nd_exec_ctrl &
9091                 SECONDARY_EXEC_XSAVES;
9092 }
9093
9094 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
9095 {
9096         u32 exit_intr_info;
9097         bool unblock_nmi;
9098         u8 vector;
9099         bool idtv_info_valid;
9100
9101         idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
9102
9103         if (enable_vnmi) {
9104                 if (vmx->loaded_vmcs->nmi_known_unmasked)
9105                         return;
9106                 /*
9107                  * Can't use vmx->exit_intr_info since we're not sure what
9108                  * the exit reason is.
9109                  */
9110                 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
9111                 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
9112                 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
9113                 /*
9114                  * SDM 3: 27.7.1.2 (September 2008)
9115                  * Re-set bit "block by NMI" before VM entry if vmexit caused by
9116                  * a guest IRET fault.
9117                  * SDM 3: 23.2.2 (September 2008)
9118                  * Bit 12 is undefined in any of the following cases:
9119                  *  If the VM exit sets the valid bit in the IDT-vectoring
9120                  *   information field.
9121                  *  If the VM exit is due to a double fault.
9122                  */
9123                 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
9124                     vector != DF_VECTOR && !idtv_info_valid)
9125                         vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
9126                                       GUEST_INTR_STATE_NMI);
9127                 else
9128                         vmx->loaded_vmcs->nmi_known_unmasked =
9129                                 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
9130                                   & GUEST_INTR_STATE_NMI);
9131         } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
9132                 vmx->loaded_vmcs->vnmi_blocked_time +=
9133                         ktime_to_ns(ktime_sub(ktime_get(),
9134                                               vmx->loaded_vmcs->entry_time));
9135 }
9136
9137 static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
9138                                       u32 idt_vectoring_info,
9139                                       int instr_len_field,
9140                                       int error_code_field)
9141 {
9142         u8 vector;
9143         int type;
9144         bool idtv_info_valid;
9145
9146         idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
9147
9148         vcpu->arch.nmi_injected = false;
9149         kvm_clear_exception_queue(vcpu);
9150         kvm_clear_interrupt_queue(vcpu);
9151
9152         if (!idtv_info_valid)
9153                 return;
9154
9155         kvm_make_request(KVM_REQ_EVENT, vcpu);
9156
9157         vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
9158         type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
9159
9160         switch (type) {
9161         case INTR_TYPE_NMI_INTR:
9162                 vcpu->arch.nmi_injected = true;
9163                 /*
9164                  * SDM 3: 27.7.1.2 (September 2008)
9165                  * Clear bit "block by NMI" before VM entry if a NMI
9166                  * delivery faulted.
9167                  */
9168                 vmx_set_nmi_mask(vcpu, false);
9169                 break;
9170         case INTR_TYPE_SOFT_EXCEPTION:
9171                 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
9172                 /* fall through */
9173         case INTR_TYPE_HARD_EXCEPTION:
9174                 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
9175                         u32 err = vmcs_read32(error_code_field);
9176                         kvm_requeue_exception_e(vcpu, vector, err);
9177                 } else
9178                         kvm_requeue_exception(vcpu, vector);
9179                 break;
9180         case INTR_TYPE_SOFT_INTR:
9181                 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
9182                 /* fall through */
9183         case INTR_TYPE_EXT_INTR:
9184                 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
9185                 break;
9186         default:
9187                 break;
9188         }
9189 }
9190
9191 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
9192 {
9193         __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
9194                                   VM_EXIT_INSTRUCTION_LEN,
9195                                   IDT_VECTORING_ERROR_CODE);
9196 }
9197
9198 static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
9199 {
9200         __vmx_complete_interrupts(vcpu,
9201                                   vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
9202                                   VM_ENTRY_INSTRUCTION_LEN,
9203                                   VM_ENTRY_EXCEPTION_ERROR_CODE);
9204
9205         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
9206 }
9207
9208 static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
9209 {
9210         int i, nr_msrs;
9211         struct perf_guest_switch_msr *msrs;
9212
9213         msrs = perf_guest_get_msrs(&nr_msrs);
9214
9215         if (!msrs)
9216                 return;
9217
9218         for (i = 0; i < nr_msrs; i++)
9219                 if (msrs[i].host == msrs[i].guest)
9220                         clear_atomic_switch_msr(vmx, msrs[i].msr);
9221                 else
9222                         add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
9223                                         msrs[i].host);
9224 }
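/*
 * Editor's note: perf_guest_get_msrs() reports the perf-related MSRs (e.g.
 * the global control MSR) together with the values they should hold on the
 * host and in the guest.  MSRs whose two values happen to be identical are
 * removed from the VMCS atomic switch lists to keep VM entry/exit cheap;
 * only the ones that differ are switched automatically by hardware.
 */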
9225
9226 static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
9227 {
9228         struct vcpu_vmx *vmx = to_vmx(vcpu);
9229         u64 tscl;
9230         u32 delta_tsc;
9231
9232         if (vmx->hv_deadline_tsc == -1)
9233                 return;
9234
9235         tscl = rdtsc();
9236         if (vmx->hv_deadline_tsc > tscl)
9237                 /* guaranteed to fit in 32 bits because it was checked in set_hv_timer */
9238                 delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
9239                         cpu_preemption_timer_multi);
9240         else
9241                 delta_tsc = 0;
9242
9243         vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
9244 }
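/*
 * Editor's note (worked example, values assumed for illustration): the VMX
 * preemption timer counts down once every 2^N TSC cycles, where N is the
 * rate reported by the CPU (cpu_preemption_timer_multi).  With N == 5 and a
 * deadline 64000 TSC cycles away, the code above programs
 * VMX_PREEMPTION_TIMER_VALUE with 64000 >> 5 == 2000.
 */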
9245
9246 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
9247 {
9248         struct vcpu_vmx *vmx = to_vmx(vcpu);
9249         unsigned long debugctlmsr, cr3, cr4;
9250
9251         /* Record the guest's net vcpu time for enforced NMI injections. */
9252         if (unlikely(!enable_vnmi &&
9253                      vmx->loaded_vmcs->soft_vnmi_blocked))
9254                 vmx->loaded_vmcs->entry_time = ktime_get();
9255
9256         /* Don't enter VMX if guest state is invalid; let the exit handler
9257            start emulation until we arrive back at a valid state */
9258         if (vmx->emulation_required)
9259                 return;
9260
9261         if (vmx->ple_window_dirty) {
9262                 vmx->ple_window_dirty = false;
9263                 vmcs_write32(PLE_WINDOW, vmx->ple_window);
9264         }
9265
9266         if (vmx->nested.sync_shadow_vmcs) {
9267                 copy_vmcs12_to_shadow(vmx);
9268                 vmx->nested.sync_shadow_vmcs = false;
9269         }
9270
9271         if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
9272                 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
9273         if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
9274                 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
9275
9276         cr3 = __get_current_cr3_fast();
9277         if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) {
9278                 vmcs_writel(HOST_CR3, cr3);
9279                 vmx->loaded_vmcs->vmcs_host_cr3 = cr3;
9280         }
9281
9282         cr4 = cr4_read_shadow();
9283         if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) {
9284                 vmcs_writel(HOST_CR4, cr4);
9285                 vmx->loaded_vmcs->vmcs_host_cr4 = cr4;
9286         }
9287
9288         /* When single-stepping over STI and MOV SS, we must clear the
9289          * corresponding interruptibility bits in the guest state. Otherwise
9290          * vmentry fails as it then expects bit 14 (BS) in pending debug
9291          * exceptions being set, but that's not correct for the guest debugging
9292          * case. */
9293         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
9294                 vmx_set_interrupt_shadow(vcpu, 0);
9295
9296         if (static_cpu_has(X86_FEATURE_PKU) &&
9297             kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
9298             vcpu->arch.pkru != vmx->host_pkru)
9299                 __write_pkru(vcpu->arch.pkru);
9300
9301         atomic_switch_perf_msrs(vmx);
9302         debugctlmsr = get_debugctlmsr();
9303
9304         vmx_arm_hv_timer(vcpu);
9305
9306         vmx->__launched = vmx->loaded_vmcs->launched;
9307         asm(
9308                 /* Store host registers */
9309                 "push %%" _ASM_DX "; push %%" _ASM_BP ";"
9310                 "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
9311                 "push %%" _ASM_CX " \n\t"
9312                 "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
9313                 "je 1f \n\t"
9314                 "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
9315                 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
9316                 "1: \n\t"
9317                 /* Reload cr2 if changed */
9318                 "mov %c[cr2](%0), %%" _ASM_AX " \n\t"
9319                 "mov %%cr2, %%" _ASM_DX " \n\t"
9320                 "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
9321                 "je 2f \n\t"
9322                 "mov %%" _ASM_AX", %%cr2 \n\t"
9323                 "2: \n\t"
9324                 /* Check if vmlaunch or vmresume is needed */
9325                 "cmpl $0, %c[launched](%0) \n\t"
9326                 /* Load guest registers.  Don't clobber flags. */
9327                 "mov %c[rax](%0), %%" _ASM_AX " \n\t"
9328                 "mov %c[rbx](%0), %%" _ASM_BX " \n\t"
9329                 "mov %c[rdx](%0), %%" _ASM_DX " \n\t"
9330                 "mov %c[rsi](%0), %%" _ASM_SI " \n\t"
9331                 "mov %c[rdi](%0), %%" _ASM_DI " \n\t"
9332                 "mov %c[rbp](%0), %%" _ASM_BP " \n\t"
9333 #ifdef CONFIG_X86_64
9334                 "mov %c[r8](%0),  %%r8  \n\t"
9335                 "mov %c[r9](%0),  %%r9  \n\t"
9336                 "mov %c[r10](%0), %%r10 \n\t"
9337                 "mov %c[r11](%0), %%r11 \n\t"
9338                 "mov %c[r12](%0), %%r12 \n\t"
9339                 "mov %c[r13](%0), %%r13 \n\t"
9340                 "mov %c[r14](%0), %%r14 \n\t"
9341                 "mov %c[r15](%0), %%r15 \n\t"
9342 #endif
9343                 "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */
9344
9345                 /* Enter guest mode */
9346                 "jne 1f \n\t"
9347                 __ex(ASM_VMX_VMLAUNCH) "\n\t"
9348                 "jmp 2f \n\t"
9349                 "1: " __ex(ASM_VMX_VMRESUME) "\n\t"
9350                 "2: "
9351                 /* Save guest registers, load host registers, keep flags */
9352                 "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
9353                 "pop %0 \n\t"
9354                 "setbe %c[fail](%0)\n\t"
9355                 "mov %%" _ASM_AX ", %c[rax](%0) \n\t"
9356                 "mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
9357                 __ASM_SIZE(pop) " %c[rcx](%0) \n\t"
9358                 "mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
9359                 "mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
9360                 "mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
9361                 "mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
9362 #ifdef CONFIG_X86_64
9363                 "mov %%r8,  %c[r8](%0) \n\t"
9364                 "mov %%r9,  %c[r9](%0) \n\t"
9365                 "mov %%r10, %c[r10](%0) \n\t"
9366                 "mov %%r11, %c[r11](%0) \n\t"
9367                 "mov %%r12, %c[r12](%0) \n\t"
9368                 "mov %%r13, %c[r13](%0) \n\t"
9369                 "mov %%r14, %c[r14](%0) \n\t"
9370                 "mov %%r15, %c[r15](%0) \n\t"
9371                 "xor %%r8d,  %%r8d \n\t"
9372                 "xor %%r9d,  %%r9d \n\t"
9373                 "xor %%r10d, %%r10d \n\t"
9374                 "xor %%r11d, %%r11d \n\t"
9375                 "xor %%r12d, %%r12d \n\t"
9376                 "xor %%r13d, %%r13d \n\t"
9377                 "xor %%r14d, %%r14d \n\t"
9378                 "xor %%r15d, %%r15d \n\t"
9379 #endif
9380                 "mov %%cr2, %%" _ASM_AX "   \n\t"
9381                 "mov %%" _ASM_AX ", %c[cr2](%0) \n\t"
9382
9383                 "xor %%eax, %%eax \n\t"
9384                 "xor %%ebx, %%ebx \n\t"
9385                 "xor %%esi, %%esi \n\t"
9386                 "xor %%edi, %%edi \n\t"
9387                 "pop  %%" _ASM_BP "; pop  %%" _ASM_DX " \n\t"
9388                 ".pushsection .rodata \n\t"
9389                 ".global vmx_return \n\t"
9390                 "vmx_return: " _ASM_PTR " 2b \n\t"
9391                 ".popsection"
9392               : : "c"(vmx), "d"((unsigned long)HOST_RSP),
9393                 [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
9394                 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
9395                 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
9396                 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
9397                 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
9398                 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
9399                 [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
9400                 [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
9401                 [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
9402                 [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
9403 #ifdef CONFIG_X86_64
9404                 [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
9405                 [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
9406                 [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
9407                 [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
9408                 [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
9409                 [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
9410                 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
9411                 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
9412 #endif
9413                 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
9414                 [wordsize]"i"(sizeof(ulong))
9415               : "cc", "memory"
9416 #ifdef CONFIG_X86_64
9417                 , "rax", "rbx", "rdi", "rsi"
9418                 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
9419 #else
9420                 , "eax", "ebx", "edi", "esi"
9421 #endif
9422               );
9423
9424         /* Eliminate branch target predictions from guest mode */
9425         vmexit_fill_RSB();
9426
9427         /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
9428         if (debugctlmsr)
9429                 update_debugctlmsr(debugctlmsr);
9430
9431 #ifndef CONFIG_X86_64
9432         /*
9433          * The sysexit path does not restore ds/es, so we must set them to
9434          * a reasonable value ourselves.
9435          *
9436          * We can't defer this to vmx_load_host_state() since that function
9437          * may be executed in interrupt context, which saves and restores segments
9438          * around it, nullifying its effect.
9439          */
9440         loadsegment(ds, __USER_DS);
9441         loadsegment(es, __USER_DS);
9442 #endif
9443
9444         vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
9445                                   | (1 << VCPU_EXREG_RFLAGS)
9446                                   | (1 << VCPU_EXREG_PDPTR)
9447                                   | (1 << VCPU_EXREG_SEGMENTS)
9448                                   | (1 << VCPU_EXREG_CR3));
9449         vcpu->arch.regs_dirty = 0;
9450
9451         /*
9452          * Eager FPU is enabled when PKEY is supported, and CR4 has already
9453          * been switched back to the host value, so it is safe to read the
9454          * guest PKRU from the current XSAVE state.
9455          */
9456         if (static_cpu_has(X86_FEATURE_PKU) &&
9457             kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
9458                 vcpu->arch.pkru = __read_pkru();
9459                 if (vcpu->arch.pkru != vmx->host_pkru)
9460                         __write_pkru(vmx->host_pkru);
9461         }
9462
9463         /*
9464          * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
9465          * we did not inject a still-pending event to L1 now because of
9466          * nested_run_pending, we need to re-enable this bit.
9467          */
9468         if (vmx->nested.nested_run_pending)
9469                 kvm_make_request(KVM_REQ_EVENT, vcpu);
9470
9471         vmx->nested.nested_run_pending = 0;
9472         vmx->idt_vectoring_info = 0;
9473
9474         vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
9475         if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
9476                 return;
9477
9478         vmx->loaded_vmcs->launched = 1;
9479         vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
9480
9481         vmx_complete_atomic_exit(vmx);
9482         vmx_recover_nmi_blocking(vmx);
9483         vmx_complete_interrupts(vmx);
9484 }
9485 STACK_FRAME_NON_STANDARD(vmx_vcpu_run);
9486
9487 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
9488 {
9489         struct vcpu_vmx *vmx = to_vmx(vcpu);
9490         int cpu;
9491
9492         if (vmx->loaded_vmcs == vmcs)
9493                 return;
9494
9495         cpu = get_cpu();
9496         vmx->loaded_vmcs = vmcs;
9497         vmx_vcpu_put(vcpu);
9498         vmx_vcpu_load(vcpu, cpu);
9499         put_cpu();
9500 }
9501
9502 /*
9503  * Ensure that the current vmcs of the logical processor is the
9504  * vmcs01 of the vcpu before calling free_nested().
9505  */
9506 static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
9507 {
9508        struct vcpu_vmx *vmx = to_vmx(vcpu);
9509        int r;
9510
9511        r = vcpu_load(vcpu);
9512        BUG_ON(r);
9513        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
9514        free_nested(vmx);
9515        vcpu_put(vcpu);
9516 }
9517
9518 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
9519 {
9520         struct vcpu_vmx *vmx = to_vmx(vcpu);
9521
9522         if (enable_pml)
9523                 vmx_destroy_pml_buffer(vmx);
9524         free_vpid(vmx->vpid);
9525         leave_guest_mode(vcpu);
9526         vmx_free_vcpu_nested(vcpu);
9527         free_loaded_vmcs(vmx->loaded_vmcs);
9528         kfree(vmx->guest_msrs);
9529         kvm_vcpu_uninit(vcpu);
9530         kmem_cache_free(kvm_vcpu_cache, vmx);
9531 }
9532
9533 static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
9534 {
9535         int err;
9536         struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
9537         unsigned long *msr_bitmap;
9538         int cpu;
9539
9540         if (!vmx)
9541                 return ERR_PTR(-ENOMEM);
9542
9543         vmx->vpid = allocate_vpid();
9544
9545         err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
9546         if (err)
9547                 goto free_vcpu;
9548
9549         err = -ENOMEM;
9550
9551         /*
9552          * If PML is turned on, failure to enable PML simply results in failure
9553          * to create the vcpu, therefore we can simplify the PML logic (by
9554          * avoiding having to deal with cases such as PML being enabled on only
9555          * some of the guest's vcpus, etc.).
9556          */
9557         if (enable_pml) {
9558                 vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
9559                 if (!vmx->pml_pg)
9560                         goto uninit_vcpu;
9561         }
9562
9563         vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
9564         BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
9565                      > PAGE_SIZE);
9566
9567         if (!vmx->guest_msrs)
9568                 goto free_pml;
9569
9570         err = alloc_loaded_vmcs(&vmx->vmcs01);
9571         if (err < 0)
9572                 goto free_msrs;
9573
9574         msr_bitmap = vmx->vmcs01.msr_bitmap;
9575         vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
9576         vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
9577         vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
9578         vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
9579         vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
9580         vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
9581         vmx->msr_bitmap_mode = 0;
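        /*
         * Editor's note: the MSRs opened up above are safe to pass through
         * because their guest values either live in VMCS fields or are
         * switched by KVM around VM entry/exit anyway, so the guest reading
         * or writing them directly never exposes host state.  All other MSRs
         * still trap according to the bitmap.
         */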
9582
9583         vmx->loaded_vmcs = &vmx->vmcs01;
9584         cpu = get_cpu();
9585         vmx_vcpu_load(&vmx->vcpu, cpu);
9586         vmx->vcpu.cpu = cpu;
9587         vmx_vcpu_setup(vmx);
9588         vmx_vcpu_put(&vmx->vcpu);
9589         put_cpu();
9590         if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
9591                 err = alloc_apic_access_page(kvm);
9592                 if (err)
9593                         goto free_vmcs;
9594         }
9595
9596         if (enable_ept) {
9597                 err = init_rmode_identity_map(kvm);
9598                 if (err)
9599                         goto free_vmcs;
9600         }
9601
9602         if (nested) {
9603                 nested_vmx_setup_ctls_msrs(vmx);
9604                 vmx->nested.vpid02 = allocate_vpid();
9605         }
9606
9607         vmx->nested.posted_intr_nv = -1;
9608         vmx->nested.current_vmptr = -1ull;
9609
9610         vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
9611
9612         /*
9613          * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
9614          * or POSTED_INTR_WAKEUP_VECTOR.
9615          */
9616         vmx->pi_desc.nv = POSTED_INTR_VECTOR;
9617         vmx->pi_desc.sn = 1;
9618
9619         return &vmx->vcpu;
9620
9621 free_vmcs:
9622         free_vpid(vmx->nested.vpid02);
9623         free_loaded_vmcs(vmx->loaded_vmcs);
9624 free_msrs:
9625         kfree(vmx->guest_msrs);
9626 free_pml:
9627         vmx_destroy_pml_buffer(vmx);
9628 uninit_vcpu:
9629         kvm_vcpu_uninit(&vmx->vcpu);
9630 free_vcpu:
9631         free_vpid(vmx->vpid);
9632         kmem_cache_free(kvm_vcpu_cache, vmx);
9633         return ERR_PTR(err);
9634 }
9635
9636 static void __init vmx_check_processor_compat(void *rtn)
9637 {
9638         struct vmcs_config vmcs_conf;
9639
9640         *(int *)rtn = 0;
9641         if (setup_vmcs_config(&vmcs_conf) < 0)
9642                 *(int *)rtn = -EIO;
9643         if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
9644                 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
9645                                 smp_processor_id());
9646                 *(int *)rtn = -EIO;
9647         }
9648 }
9649
9650 static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
9651 {
9652         u8 cache;
9653         u64 ipat = 0;
9654
9655         /* For VT-d and EPT combination
9656          * 1. MMIO: always map as UC
9657          * 2. EPT with VT-d:
9658          *   a. VT-d without snooping control feature: can't guarantee the
9659          *      result, try to trust guest.
9660          *   b. VT-d with snooping control feature: snooping control feature of
9661          *      the VT-d engine guarantees cache correctness.  Just set it
9662          *      to WB to stay consistent with the host, i.e. the same as item 3.
9663          * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
9664          *    consistent with host MTRR
9665          */
9666         if (is_mmio) {
9667                 cache = MTRR_TYPE_UNCACHABLE;
9668                 goto exit;
9669         }
9670
9671         if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
9672                 ipat = VMX_EPT_IPAT_BIT;
9673                 cache = MTRR_TYPE_WRBACK;
9674                 goto exit;
9675         }
9676
9677         if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
9678                 ipat = VMX_EPT_IPAT_BIT;
9679                 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
9680                         cache = MTRR_TYPE_WRBACK;
9681                 else
9682                         cache = MTRR_TYPE_UNCACHABLE;
9683                 goto exit;
9684         }
9685
9686         cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
9687
9688 exit:
9689         return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
9690 }
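/*
 * Editor's note (worked example): the return value packs the memory type
 * into EPTE bits 5:3 plus the "ignore PAT" bit.  An MMIO gfn yields
 * MTRR_TYPE_UNCACHABLE (0) with ipat clear, i.e. 0; a WB mapping with ipat
 * set yields (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT
 * == (6 << 3) | (1 << 6) == 0x70.
 */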
9691
9692 static int vmx_get_lpage_level(void)
9693 {
9694         if (enable_ept && !cpu_has_vmx_ept_1g_page())
9695                 return PT_DIRECTORY_LEVEL;
9696         else
9697                 /* Both shadow paging and EPT support 1GB pages */
9698                 return PT_PDPE_LEVEL;
9699 }
9700
9701 static void vmcs_set_secondary_exec_control(u32 new_ctl)
9702 {
9703         /*
9704          * These bits in the secondary execution controls field
9705          * are dynamic, the others are mostly based on the hypervisor
9706          * architecture and the guest's CPUID.  Do not touch the
9707          * dynamic bits.
9708          */
9709         u32 mask =
9710                 SECONDARY_EXEC_SHADOW_VMCS |
9711                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
9712                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
9713
9714         u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
9715
9716         vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
9717                      (new_ctl & ~mask) | (cur_ctl & mask));
9718 }
9719
9720 /*
9721  * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
9722  * (indicating "allowed-1") if they are supported in the guest's CPUID.
9723  */
9724 static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
9725 {
9726         struct vcpu_vmx *vmx = to_vmx(vcpu);
9727         struct kvm_cpuid_entry2 *entry;
9728
9729         vmx->nested.nested_vmx_cr0_fixed1 = 0xffffffff;
9730         vmx->nested.nested_vmx_cr4_fixed1 = X86_CR4_PCE;
9731
9732 #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do {            \
9733         if (entry && (entry->_reg & (_cpuid_mask)))                     \
9734                 vmx->nested.nested_vmx_cr4_fixed1 |= (_cr4_mask);       \
9735 } while (0)
9736
9737         entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
9738         cr4_fixed1_update(X86_CR4_VME,        edx, bit(X86_FEATURE_VME));
9739         cr4_fixed1_update(X86_CR4_PVI,        edx, bit(X86_FEATURE_VME));
9740         cr4_fixed1_update(X86_CR4_TSD,        edx, bit(X86_FEATURE_TSC));
9741         cr4_fixed1_update(X86_CR4_DE,         edx, bit(X86_FEATURE_DE));
9742         cr4_fixed1_update(X86_CR4_PSE,        edx, bit(X86_FEATURE_PSE));
9743         cr4_fixed1_update(X86_CR4_PAE,        edx, bit(X86_FEATURE_PAE));
9744         cr4_fixed1_update(X86_CR4_MCE,        edx, bit(X86_FEATURE_MCE));
9745         cr4_fixed1_update(X86_CR4_PGE,        edx, bit(X86_FEATURE_PGE));
9746         cr4_fixed1_update(X86_CR4_OSFXSR,     edx, bit(X86_FEATURE_FXSR));
9747         cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM));
9748         cr4_fixed1_update(X86_CR4_VMXE,       ecx, bit(X86_FEATURE_VMX));
9749         cr4_fixed1_update(X86_CR4_SMXE,       ecx, bit(X86_FEATURE_SMX));
9750         cr4_fixed1_update(X86_CR4_PCIDE,      ecx, bit(X86_FEATURE_PCID));
9751         cr4_fixed1_update(X86_CR4_OSXSAVE,    ecx, bit(X86_FEATURE_XSAVE));
9752
9753         entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
9754         cr4_fixed1_update(X86_CR4_FSGSBASE,   ebx, bit(X86_FEATURE_FSGSBASE));
9755         cr4_fixed1_update(X86_CR4_SMEP,       ebx, bit(X86_FEATURE_SMEP));
9756         cr4_fixed1_update(X86_CR4_SMAP,       ebx, bit(X86_FEATURE_SMAP));
9757         cr4_fixed1_update(X86_CR4_PKE,        ecx, bit(X86_FEATURE_PKU));
9758         cr4_fixed1_update(X86_CR4_UMIP,       ecx, bit(X86_FEATURE_UMIP));
9759
9760 #undef cr4_fixed1_update
9761 }
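/*
 * Editor's note: a guest may only set a CR4 bit while in VMX operation if
 * the corresponding bit is 1 in CR4_FIXED1.  For example, if the guest's
 * CPUID does not advertise X86_FEATURE_SMEP, X86_CR4_SMEP stays clear in
 * nested_vmx_cr4_fixed1 above, and a nested hypervisor trying to run with
 * CR4.SMEP set will fail the CR0/CR4 validity checks.
 */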
9762
9763 static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
9764 {
9765         struct vcpu_vmx *vmx = to_vmx(vcpu);
9766
9767         if (cpu_has_secondary_exec_ctrls()) {
9768                 vmx_compute_secondary_exec_control(vmx);
9769                 vmcs_set_secondary_exec_control(vmx->secondary_exec_control);
9770         }
9771
9772         if (nested_vmx_allowed(vcpu))
9773                 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
9774                         FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
9775         else
9776                 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
9777                         ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
9778
9779         if (nested_vmx_allowed(vcpu))
9780                 nested_vmx_cr_fixed1_bits_update(vcpu);
9781 }
9782
9783 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
9784 {
9785         if (func == 1 && nested)
9786                 entry->ecx |= bit(X86_FEATURE_VMX);
9787 }
9788
9789 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
9790                 struct x86_exception *fault)
9791 {
9792         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
9793         struct vcpu_vmx *vmx = to_vmx(vcpu);
9794         u32 exit_reason;
9795         unsigned long exit_qualification = vcpu->arch.exit_qualification;
9796
9797         if (vmx->nested.pml_full) {
9798                 exit_reason = EXIT_REASON_PML_FULL;
9799                 vmx->nested.pml_full = false;
9800                 exit_qualification &= INTR_INFO_UNBLOCK_NMI;
9801         } else if (fault->error_code & PFERR_RSVD_MASK)
9802                 exit_reason = EXIT_REASON_EPT_MISCONFIG;
9803         else
9804                 exit_reason = EXIT_REASON_EPT_VIOLATION;
9805
9806         nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
9807         vmcs12->guest_physical_address = fault->address;
9808 }
9809
9810 static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
9811 {
9812         return nested_ept_get_cr3(vcpu) & VMX_EPTP_AD_ENABLE_BIT;
9813 }
9814
9815 /* Callbacks for nested_ept_init_mmu_context: */
9816
9817 static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
9818 {
9819         /* return the page table to be shadowed - in our case, EPT12 */
9820         return get_vmcs12(vcpu)->ept_pointer;
9821 }
9822
9823 static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
9824 {
9825         WARN_ON(mmu_is_nested(vcpu));
9826         if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu)))
9827                 return 1;
9828
9829         kvm_mmu_unload(vcpu);
9830         kvm_init_shadow_ept_mmu(vcpu,
9831                         to_vmx(vcpu)->nested.nested_vmx_ept_caps &
9832                         VMX_EPT_EXECUTE_ONLY_BIT,
9833                         nested_ept_ad_enabled(vcpu));
9834         vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
9835         vcpu->arch.mmu.get_cr3           = nested_ept_get_cr3;
9836         vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
9837
9838         vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
9839         return 0;
9840 }
9841
9842 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
9843 {
9844         vcpu->arch.walk_mmu = &vcpu->arch.mmu;
9845 }
9846
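/*
 * Decide whether a page fault with the given error code should cause a
 * vmexit to L1, based on vmcs12's exception bitmap, PFEC_MASK and
 * PFEC_MATCH settings.
 */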
9847 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
9848                                             u16 error_code)
9849 {
9850         bool inequality, bit;
9851
9852         bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
9853         inequality =
9854                 (error_code & vmcs12->page_fault_error_code_mask) !=
9855                  vmcs12->page_fault_error_code_match;
9856         return inequality ^ bit;
9857 }
9858
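/*
 * Deliver a page fault hit while running L2 either as a nested vmexit
 * to L1 (if L1 wants to intercept this error code) or by injecting it
 * directly into L2.
 */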
9859 static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
9860                 struct x86_exception *fault)
9861 {
9862         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
9863
9864         WARN_ON(!is_guest_mode(vcpu));
9865
9866         if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
9867                 !to_vmx(vcpu)->nested.nested_run_pending) {
9868                 vmcs12->vm_exit_intr_error_code = fault->error_code;
9869                 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
9870                                   PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
9871                                   INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
9872                                   fault->address);
9873         } else {
9874                 kvm_inject_page_fault(vcpu, fault);
9875         }
9876 }
9877
9878 static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
9879                                                struct vmcs12 *vmcs12);
9880
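/*
 * Map and pin the guest pages that vmcs12 refers to (APIC-access page,
 * virtual-APIC page, posted-interrupt descriptor) and point vmcs02 at
 * their host physical addresses; also merge the MSR bitmaps if possible.
 */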
9881 static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
9882                                         struct vmcs12 *vmcs12)
9883 {
9884         struct vcpu_vmx *vmx = to_vmx(vcpu);
9885         struct page *page;
9886         u64 hpa;
9887
9888         if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
9889                 /*
9890                  * Translate L1 physical address to host physical
9891                  * address for vmcs02. Keep the page pinned, so this
9892                  * physical address remains valid. We keep a reference
9893                  * to it so we can release it later.
9894                  */
9895                 if (vmx->nested.apic_access_page) { /* shouldn't happen */
9896                         kvm_release_page_dirty(vmx->nested.apic_access_page);
9897                         vmx->nested.apic_access_page = NULL;
9898                 }
9899                 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
9900                 /*
9901                  * If translation failed, it does not matter: this feature asks

9902                  * to exit when accessing the given address, and if it
9903                  * can never be accessed, this feature won't do
9904                  * anything anyway.
9905                  */
9906                 if (!is_error_page(page)) {
9907                         vmx->nested.apic_access_page = page;
9908                         hpa = page_to_phys(vmx->nested.apic_access_page);
9909                         vmcs_write64(APIC_ACCESS_ADDR, hpa);
9910                 } else {
9911                         vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
9912                                         SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
9913                 }
9914         } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
9915                    cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
9916                 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
9917                               SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
9918                 kvm_vcpu_reload_apic_access_page(vcpu);
9919         }
9920
9921         if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
9922                 if (vmx->nested.virtual_apic_page) { /* shouldn't happen */
9923                         kvm_release_page_dirty(vmx->nested.virtual_apic_page);
9924                         vmx->nested.virtual_apic_page = NULL;
9925                 }
9926                 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr);
9927
9928                 /*
9929                  * If translation failed, VM entry will fail because
9930                  * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull.
9931                  * Failing the vm entry is _not_ what the processor
9932                  * does but it's basically the only possibility we
9933                  * have.  We could still enter the guest if CR8 load
9934                  * exits are enabled, CR8 store exits are enabled, and
9935                  * virtualize APIC access is disabled; in this case
9936                  * the processor would never use the TPR shadow and we
9937                  * could simply clear the bit from the execution
9938                  * control.  But such a configuration is useless, so
9939                  * let's keep the code simple.
9940                  */
9941                 if (!is_error_page(page)) {
9942                         vmx->nested.virtual_apic_page = page;
9943                         hpa = page_to_phys(vmx->nested.virtual_apic_page);
9944                         vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
9945                 }
9946         }
9947
9948         if (nested_cpu_has_posted_intr(vmcs12)) {
9949                 if (vmx->nested.pi_desc_page) { /* shouldn't happen */
9950                         kunmap(vmx->nested.pi_desc_page);
9951                         kvm_release_page_dirty(vmx->nested.pi_desc_page);
9952                         vmx->nested.pi_desc_page = NULL;
9953                 }
9954                 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr);
9955                 if (is_error_page(page))
9956                         return;
9957                 vmx->nested.pi_desc_page = page;
9958                 vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page);
9959                 vmx->nested.pi_desc =
9960                         (struct pi_desc *)((void *)vmx->nested.pi_desc +
9961                         (unsigned long)(vmcs12->posted_intr_desc_addr &
9962                         (PAGE_SIZE - 1)));
9963                 vmcs_write64(POSTED_INTR_DESC_ADDR,
9964                         page_to_phys(vmx->nested.pi_desc_page) +
9965                         (unsigned long)(vmcs12->posted_intr_desc_addr &
9966                         (PAGE_SIZE - 1)));
9967         }
9968         if (cpu_has_vmx_msr_bitmap() &&
9969             nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS) &&
9970             nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
9971                 ;
9972         else
9973                 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
9974                                 CPU_BASED_USE_MSR_BITMAPS);
9975 }
9976
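/*
 * Emulate the VMX preemption timer for L2 with an hrtimer, converting
 * the vmcs12 timer value from TSC units (scaled by the emulated timer
 * rate) into nanoseconds.
 */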
9977 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
9978 {
9979         u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
9980         struct vcpu_vmx *vmx = to_vmx(vcpu);
9981
9982         if (vcpu->arch.virtual_tsc_khz == 0)
9983                 return;
9984
9985         /* Make sure short timeouts reliably trigger an immediate vmexit.
9986          * hrtimer_start does not guarantee this. */
9987         if (preemption_timeout <= 1) {
9988                 vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
9989                 return;
9990         }
9991
9992         preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
9993         preemption_timeout *= 1000000;
9994         do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
9995         hrtimer_start(&vmx->nested.preemption_timer,
9996                       ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
9997 }
9998
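/*
 * Check that the I/O bitmap addresses in vmcs12 are page-aligned and
 * within the guest's physical address width.
 */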
9999 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
10000                                                struct vmcs12 *vmcs12)
10001 {
10002         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
10003                 return 0;
10004
10005         if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) ||
10006             !page_address_valid(vcpu, vmcs12->io_bitmap_b))
10007                 return -EINVAL;
10008
10009         return 0;
10010 }
10011
10012 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
10013                                                 struct vmcs12 *vmcs12)
10014 {
10015         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
10016                 return 0;
10017
10018         if (!page_address_valid(vcpu, vmcs12->msr_bitmap))
10019                 return -EINVAL;
10020
10021         return 0;
10022 }
10023
10024 static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
10025                                                 struct vmcs12 *vmcs12)
10026 {
10027         if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
10028                 return 0;
10029
10030         if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))
10031                 return -EINVAL;
10032
10033         return 0;
10034 }
10035
10036 /*
10037  * Merge L0's and L1's MSR bitmaps; return false to indicate that
10038  * we do not want to use the hardware MSR bitmap.
10039  */
10040 static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
10041                                                struct vmcs12 *vmcs12)
10042 {
10043         int msr;
10044         struct page *page;
10045         unsigned long *msr_bitmap_l1;
10046         unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
10047
10048         /* This shortcut is ok because we support only x2APIC MSRs so far. */
10049         if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
10050                 return false;
10051
10052         page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
10053         if (is_error_page(page))
10054                 return false;
10055         msr_bitmap_l1 = (unsigned long *)kmap(page);
10056
10057         memset(msr_bitmap_l0, 0xff, PAGE_SIZE);
10058
10059         if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
10060                 if (nested_cpu_has_apic_reg_virt(vmcs12))
10061                         for (msr = 0x800; msr <= 0x8ff; msr++)
10062                                 nested_vmx_disable_intercept_for_msr(
10063                                         msr_bitmap_l1, msr_bitmap_l0,
10064                                         msr, MSR_TYPE_R);
10065
10066                 nested_vmx_disable_intercept_for_msr(
10067                                 msr_bitmap_l1, msr_bitmap_l0,
10068                                 APIC_BASE_MSR + (APIC_TASKPRI >> 4),
10069                                 MSR_TYPE_R | MSR_TYPE_W);
10070
10071                 if (nested_cpu_has_vid(vmcs12)) {
10072                         nested_vmx_disable_intercept_for_msr(
10073                                 msr_bitmap_l1, msr_bitmap_l0,
10074                                 APIC_BASE_MSR + (APIC_EOI >> 4),
10075                                 MSR_TYPE_W);
10076                         nested_vmx_disable_intercept_for_msr(
10077                                 msr_bitmap_l1, msr_bitmap_l0,
10078                                 APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
10079                                 MSR_TYPE_W);
10080                 }
10081         }
10082         kunmap(page);
10083         kvm_release_page_clean(page);
10084
10085         return true;
10086 }
10087
10088 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
10089                                            struct vmcs12 *vmcs12)
10090 {
10091         if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
10092             !nested_cpu_has_apic_reg_virt(vmcs12) &&
10093             !nested_cpu_has_vid(vmcs12) &&
10094             !nested_cpu_has_posted_intr(vmcs12))
10095                 return 0;
10096
10097         /*
10098          * If virtualize x2apic mode is enabled,
10099          * virtualize apic access must be disabled.
10100          */
10101         if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
10102             nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
10103                 return -EINVAL;
10104
10105         /*
10106          * If virtual interrupt delivery is enabled,
10107          * we must exit on external interrupts.
10108          */
10109         if (nested_cpu_has_vid(vmcs12) &&
10110            !nested_exit_on_intr(vcpu))
10111                 return -EINVAL;
10112
10113         /*
10114          * Bits 15:8 must be zero in posted_intr_nv;
10115          * the descriptor address has already been checked
10116          * in nested_get_vmcs12_pages.
10117          */
10118         if (nested_cpu_has_posted_intr(vmcs12) &&
10119            (!nested_cpu_has_vid(vmcs12) ||
10120             !nested_exit_intr_ack_set(vcpu) ||
10121             vmcs12->posted_intr_nv & 0xff00))
10122                 return -EINVAL;
10123
10124         /* tpr shadow is needed by all apicv features. */
10125         if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
10126                 return -EINVAL;
10127
10128         return 0;
10129 }
10130
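/*
 * Validate one MSR switch area: the address must be 16-byte aligned and
 * the whole list of vmx_msr_entry structures must fit within the
 * guest's physical address width.
 */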
10131 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
10132                                        unsigned long count_field,
10133                                        unsigned long addr_field)
10134 {
10135         int maxphyaddr;
10136         u64 count, addr;
10137
10138         if (vmcs12_read_any(vcpu, count_field, &count) ||
10139             vmcs12_read_any(vcpu, addr_field, &addr)) {
10140                 WARN_ON(1);
10141                 return -EINVAL;
10142         }
10143         if (count == 0)
10144                 return 0;
10145         maxphyaddr = cpuid_maxphyaddr(vcpu);
10146         if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
10147             (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
10148                 pr_debug_ratelimited(
10149                         "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
10150                         addr_field, maxphyaddr, count, addr);
10151                 return -EINVAL;
10152         }
10153         return 0;
10154 }
10155
10156 static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
10157                                                 struct vmcs12 *vmcs12)
10158 {
10159         if (vmcs12->vm_exit_msr_load_count == 0 &&
10160             vmcs12->vm_exit_msr_store_count == 0 &&
10161             vmcs12->vm_entry_msr_load_count == 0)
10162                 return 0; /* Fast path */
10163         if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
10164                                         VM_EXIT_MSR_LOAD_ADDR) ||
10165             nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
10166                                         VM_EXIT_MSR_STORE_ADDR) ||
10167             nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
10168                                         VM_ENTRY_MSR_LOAD_ADDR))
10169                 return -EINVAL;
10170         return 0;
10171 }
10172
10173 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
10174                                          struct vmcs12 *vmcs12)
10175 {
10176         u64 address = vmcs12->pml_address;
10177         int maxphyaddr = cpuid_maxphyaddr(vcpu);
10178
10179         if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML)) {
10180                 if (!nested_cpu_has_ept(vmcs12) ||
10181                     !IS_ALIGNED(address, 4096)  ||
10182                     address >> maxphyaddr)
10183                         return -EINVAL;
10184         }
10185
10186         return 0;
10187 }
10188
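/*
 * Checks common to the VM-entry load and VM-exit store MSR lists:
 * reject x2APIC MSRs while x2APIC is enabled, microcode MSRs, and
 * entries with a non-zero reserved field.
 */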
10189 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
10190                                        struct vmx_msr_entry *e)
10191 {
10192         /* x2APIC MSR accesses are not allowed */
10193         if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)
10194                 return -EINVAL;
10195         if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
10196             e->index == MSR_IA32_UCODE_REV)
10197                 return -EINVAL;
10198         if (e->reserved != 0)
10199                 return -EINVAL;
10200         return 0;
10201 }
10202
10203 static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
10204                                      struct vmx_msr_entry *e)
10205 {
10206         if (e->index == MSR_FS_BASE ||
10207             e->index == MSR_GS_BASE ||
10208             e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
10209             nested_vmx_msr_check_common(vcpu, e))
10210                 return -EINVAL;
10211         return 0;
10212 }
10213
10214 static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
10215                                       struct vmx_msr_entry *e)
10216 {
10217         if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
10218             nested_vmx_msr_check_common(vcpu, e))
10219                 return -EINVAL;
10220         return 0;
10221 }
10222
10223 /*
10224  * Load the guest's/host's MSRs at nested entry/exit.
10225  * Return 0 on success, or the 1-based index of the failing entry.
10226  */
10227 static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
10228 {
10229         u32 i;
10230         struct vmx_msr_entry e;
10231         struct msr_data msr;
10232
10233         msr.host_initiated = false;
10234         for (i = 0; i < count; i++) {
10235                 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
10236                                         &e, sizeof(e))) {
10237                         pr_debug_ratelimited(
10238                                 "%s cannot read MSR entry (%u, 0x%08llx)\n",
10239                                 __func__, i, gpa + i * sizeof(e));
10240                         goto fail;
10241                 }
10242                 if (nested_vmx_load_msr_check(vcpu, &e)) {
10243                         pr_debug_ratelimited(
10244                                 "%s check failed (%u, 0x%x, 0x%x)\n",
10245                                 __func__, i, e.index, e.reserved);
10246                         goto fail;
10247                 }
10248                 msr.index = e.index;
10249                 msr.data = e.value;
10250                 if (kvm_set_msr(vcpu, &msr)) {
10251                         pr_debug_ratelimited(
10252                                 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
10253                                 __func__, i, e.index, e.value);
10254                         goto fail;
10255                 }
10256         }
10257         return 0;
10258 fail:
10259         return i + 1;
10260 }
10261
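/*
 * Read each MSR named in the VM-exit MSR-store area and write its
 * current value back into the corresponding guest memory entry.
 */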
10262 static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
10263 {
10264         u32 i;
10265         struct vmx_msr_entry e;
10266
10267         for (i = 0; i < count; i++) {
10268                 struct msr_data msr_info;
10269                 if (kvm_vcpu_read_guest(vcpu,
10270                                         gpa + i * sizeof(e),
10271                                         &e, 2 * sizeof(u32))) {
10272                         pr_debug_ratelimited(
10273                                 "%s cannot read MSR entry (%u, 0x%08llx)\n",
10274                                 __func__, i, gpa + i * sizeof(e));
10275                         return -EINVAL;
10276                 }
10277                 if (nested_vmx_store_msr_check(vcpu, &e)) {
10278                         pr_debug_ratelimited(
10279                                 "%s check failed (%u, 0x%x, 0x%x)\n",
10280                                 __func__, i, e.index, e.reserved);
10281                         return -EINVAL;
10282                 }
10283                 msr_info.host_initiated = false;
10284                 msr_info.index = e.index;
10285                 if (kvm_get_msr(vcpu, &msr_info)) {
10286                         pr_debug_ratelimited(
10287                                 "%s cannot read MSR (%u, 0x%x)\n",
10288                                 __func__, i, e.index);
10289                         return -EINVAL;
10290                 }
10291                 if (kvm_vcpu_write_guest(vcpu,
10292                                          gpa + i * sizeof(e) +
10293                                              offsetof(struct vmx_msr_entry, value),
10294                                          &msr_info.data, sizeof(msr_info.data))) {
10295                         pr_debug_ratelimited(
10296                                 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
10297                                 __func__, i, e.index, msr_info.data);
10298                         return -EINVAL;
10299                 }
10300         }
10301         return 0;
10302 }
10303
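/* A CR3 value is valid iff no bits above the guest's MAXPHYADDR are set. */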
10304 static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
10305 {
10306         unsigned long invalid_mask;
10307
10308         invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
10309         return (val & invalid_mask) == 0;
10310 }
10311
10312 /*
10313  * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are
10314  * emulating VM entry into a guest with EPT enabled.
10315  * Returns 0 on success, 1 on failure. Invalid state exit qualification code
10316  * is assigned to entry_failure_code on failure.
10317  */
10318 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
10319                                u32 *entry_failure_code)
10320 {
10321         if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
10322                 if (!nested_cr3_valid(vcpu, cr3)) {
10323                         *entry_failure_code = ENTRY_FAIL_DEFAULT;
10324                         return 1;
10325                 }
10326
10327                 /*
10328                  * If PAE paging and EPT are both on, CR3 is not used by the CPU and
10329                  * must not be dereferenced.
10330                  */
10331                 if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) &&
10332                     !nested_ept) {
10333                         if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) {
10334                                 *entry_failure_code = ENTRY_FAIL_PDPTE;
10335                                 return 1;
10336                         }
10337                 }
10338
10339                 vcpu->arch.cr3 = cr3;
10340                 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
10341         }
10342
10343         kvm_mmu_reset_context(vcpu);
10344         return 0;
10345 }
10346
10347 /*
10348  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
10349  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
10350  * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
10351  * guest in a way that will both be appropriate to L1's requests, and our
10352  * needs. In addition to modifying the active vmcs (which is vmcs02), this
10353  * function also has additional necessary side-effects, like setting various
10354  * vcpu->arch fields.
10355  * Returns 0 on success, 1 on failure. Invalid state exit qualification code
10356  * is assigned to entry_failure_code on failure.
10357  */
10358 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10359                           bool from_vmentry, u32 *entry_failure_code)
10360 {
10361         struct vcpu_vmx *vmx = to_vmx(vcpu);
10362         u32 exec_control, vmcs12_exec_ctrl;
10363
10364         vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
10365         vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
10366         vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
10367         vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
10368         vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
10369         vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
10370         vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
10371         vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
10372         vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
10373         vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
10374         vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
10375         vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
10376         vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
10377         vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
10378         vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
10379         vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
10380         vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
10381         vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
10382         vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
10383         vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
10384         vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
10385         vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
10386         vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
10387         vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
10388         vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
10389         vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
10390         vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
10391         vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
10392         vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
10393         vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
10394         vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
10395         vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
10396         vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
10397         vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
10398         vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
10399         vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
10400
10401         if (from_vmentry &&
10402             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
10403                 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
10404                 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
10405         } else {
10406                 kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
10407                 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
10408         }
10409         if (from_vmentry) {
10410                 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
10411                              vmcs12->vm_entry_intr_info_field);
10412                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
10413                              vmcs12->vm_entry_exception_error_code);
10414                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
10415                              vmcs12->vm_entry_instruction_len);
10416                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
10417                              vmcs12->guest_interruptibility_info);
10418                 vmx->loaded_vmcs->nmi_known_unmasked =
10419                         !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
10420         } else {
10421                 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
10422         }
10423         vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
10424         vmx_set_rflags(vcpu, vmcs12->guest_rflags);
10425         vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
10426                 vmcs12->guest_pending_dbg_exceptions);
10427         vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
10428         vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
10429
10430         if (nested_cpu_has_xsaves(vmcs12))
10431                 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
10432         vmcs_write64(VMCS_LINK_POINTER, -1ull);
10433
10434         exec_control = vmcs12->pin_based_vm_exec_control;
10435
10436         /* Preemption timer setting is only taken from vmcs01.  */
10437         exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
10438         exec_control |= vmcs_config.pin_based_exec_ctrl;
10439         if (vmx->hv_deadline_tsc == -1)
10440                 exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
10441
10442         /* Posted interrupts setting is only taken from vmcs12.  */
10443         if (nested_cpu_has_posted_intr(vmcs12)) {
10444                 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
10445                 vmx->nested.pi_pending = false;
10446                 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
10447         } else {
10448                 exec_control &= ~PIN_BASED_POSTED_INTR;
10449         }
10450
10451         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
10452
10453         vmx->nested.preemption_timer_expired = false;
10454         if (nested_cpu_has_preemption_timer(vmcs12))
10455                 vmx_start_preemption_timer(vcpu);
10456
10457         /*
10458          * Whether page-faults are trapped is determined by a combination of
10459          * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
10460          * If enable_ept, L0 doesn't care about page faults and we should
10461          * set all of these to L1's desires. However, if !enable_ept, L0 does
10462          * care about (at least some) page faults, and because it is not easy
10463          * (if at all possible?) to merge L0 and L1's desires, we simply ask
10464          * to exit on each and every L2 page fault. This is done by setting
10465          * MASK=MATCH=0 and (see below) EB.PF=1.
10466          * Note that below we don't need special code to set EB.PF beyond the
10467          * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
10468          * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
10469          * !enable_ept, EB.PF is 1, so the "or" will always be 1.
10470          */
10471         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
10472                 enable_ept ? vmcs12->page_fault_error_code_mask : 0);
10473         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
10474                 enable_ept ? vmcs12->page_fault_error_code_match : 0);
10475
10476         if (cpu_has_secondary_exec_ctrls()) {
10477                 exec_control = vmx->secondary_exec_control;
10478
10479                 /* Take the following fields only from vmcs12 */
10480                 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
10481                                   SECONDARY_EXEC_ENABLE_INVPCID |
10482                                   SECONDARY_EXEC_RDTSCP |
10483                                   SECONDARY_EXEC_XSAVES |
10484                                   SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
10485                                   SECONDARY_EXEC_APIC_REGISTER_VIRT |
10486                                   SECONDARY_EXEC_ENABLE_VMFUNC);
10487                 if (nested_cpu_has(vmcs12,
10488                                    CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
10489                         vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
10490                                 ~SECONDARY_EXEC_ENABLE_PML;
10491                         exec_control |= vmcs12_exec_ctrl;
10492                 }
10493
10494                 /* All VMFUNCs are currently emulated through L0 vmexits.  */
10495                 if (exec_control & SECONDARY_EXEC_ENABLE_VMFUNC)
10496                         vmcs_write64(VM_FUNCTION_CONTROL, 0);
10497
10498                 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
10499                         vmcs_write64(EOI_EXIT_BITMAP0,
10500                                 vmcs12->eoi_exit_bitmap0);
10501                         vmcs_write64(EOI_EXIT_BITMAP1,
10502                                 vmcs12->eoi_exit_bitmap1);
10503                         vmcs_write64(EOI_EXIT_BITMAP2,
10504                                 vmcs12->eoi_exit_bitmap2);
10505                         vmcs_write64(EOI_EXIT_BITMAP3,
10506                                 vmcs12->eoi_exit_bitmap3);
10507                         vmcs_write16(GUEST_INTR_STATUS,
10508                                 vmcs12->guest_intr_status);
10509                 }
10510
10511                 /*
10512                  * Write an illegal value to APIC_ACCESS_ADDR. Later,
10513                  * nested_get_vmcs12_pages will either fix it up or
10514                  * remove the VM execution control.
10515                  */
10516                 if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
10517                         vmcs_write64(APIC_ACCESS_ADDR, -1ull);
10518
10519                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
10520         }
10521
10522
10523         /*
10524          * Set host-state according to L0's settings (vmcs12 is irrelevant here)
10525          * Some constant fields are set here by vmx_set_constant_host_state().
10526          * Other fields are different per CPU, and will be set later when
10527          * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
10528          */
10529         vmx_set_constant_host_state(vmx);
10530
10531         /*
10532          * Set the MSR load/store lists to match L0's settings.
10533          */
10534         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
10535         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
10536         vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
10537         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
10538         vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
10539
10540         /*
10541          * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
10542          * entry, but only if the current (host) sp changed from the value
10543          * we wrote last (vmx->host_rsp). This cache is no longer relevant
10544          * if we switch vmcs, and rather than hold a separate cache per vmcs,
10545          * here we just force the write to happen on entry.
10546          */
10547         vmx->host_rsp = 0;
10548
10549         exec_control = vmx_exec_control(vmx); /* L0's desires */
10550         exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
10551         exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
10552         exec_control &= ~CPU_BASED_TPR_SHADOW;
10553         exec_control |= vmcs12->cpu_based_vm_exec_control;
10554
10555         /*
10556          * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
10557          * nested_get_vmcs12_pages can't fix it up, the illegal value
10558          * will result in a VM entry failure.
10559          */
10560         if (exec_control & CPU_BASED_TPR_SHADOW) {
10561                 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
10562                 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
10563         } else {
10564 #ifdef CONFIG_X86_64
10565                 exec_control |= CPU_BASED_CR8_LOAD_EXITING |
10566                                 CPU_BASED_CR8_STORE_EXITING;
10567 #endif
10568         }
10569
10570         /*
10571          * Merging of the I/O bitmaps is not currently supported.
10572          * Rather, exit on every I/O access.
10573          */
10574         exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
10575         exec_control |= CPU_BASED_UNCOND_IO_EXITING;
10576
10577         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
10578
10579         /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
10580          * bitwise-or of what L1 wants to trap for L2, and what we want to
10581          * trap. Note that CR0.TS also needs updating - we do this later.
10582          */
10583         update_exception_bitmap(vcpu);
10584         vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
10585         vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
10586
10587         /* L2->L1 exit controls are emulated - the hardware exit is to L0 so
10588          * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
10589          * bits are further modified by vmx_set_efer() below.
10590          */
10591         vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
10592
10593         /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
10594          * emulated by vmx_set_efer(), below.
10595          */
10596         vm_entry_controls_init(vmx,
10597                 (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
10598                         ~VM_ENTRY_IA32E_MODE) |
10599                 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
10600
10601         if (from_vmentry &&
10602             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
10603                 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
10604                 vcpu->arch.pat = vmcs12->guest_ia32_pat;
10605         } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
10606                 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
10607         }
10608
10609         set_cr4_guest_host_mask(vmx);
10610
10611         if (from_vmentry &&
10612             vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
10613                 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
10614
10615         if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
10616                 vmcs_write64(TSC_OFFSET,
10617                         vcpu->arch.tsc_offset + vmcs12->tsc_offset);
10618         else
10619                 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
10620         if (kvm_has_tsc_control)
10621                 decache_tsc_multiplier(vmx);
10622
10623         if (cpu_has_vmx_msr_bitmap())
10624                 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
10625
10626         if (enable_vpid) {
10627                 /*
10628                  * There is no direct mapping between vpid02 and vpid12; the
10629                  * vpid02 is per-vCPU for L0 and reused while the value of
10630                  * vpid12 is changed with one invvpid during nested vmentry.
10631                  * The vpid12 is allocated by L1 for L2, so it will not
10632                  * influence the global bitmap (for vpid01 and vpid02 allocation)
10633                  * even if we spawn a lot of nested vCPUs.
10634                  */
10635                 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
10636                         vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
10637                         if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
10638                                 vmx->nested.last_vpid = vmcs12->virtual_processor_id;
10639                                 __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
10640                         }
10641                 } else {
10642                         vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
10643                         vmx_flush_tlb(vcpu);
10644                 }
10645
10646         }
10647
10648         if (enable_pml) {
10649                 /*
10650                  * Conceptually we want to copy the PML address and index from
10651                  * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
10652                  * since we always flush the log on each vmexit, this happens
10653                  * to be equivalent to simply resetting the fields in vmcs02.
10654                  */
10655                 ASSERT(vmx->pml_pg);
10656                 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
10657                 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
10658         }
10659
10660         if (nested_cpu_has_ept(vmcs12)) {
10661                 if (nested_ept_init_mmu_context(vcpu)) {
10662                         *entry_failure_code = ENTRY_FAIL_DEFAULT;
10663                         return 1;
10664                 }
10665         } else if (nested_cpu_has2(vmcs12,
10666                                    SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
10667                 vmx_flush_tlb_ept_only(vcpu);
10668         }
10669
10670         /*
10671          * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
10672          * bits which we consider mandatory enabled.
10673          * The CR0_READ_SHADOW is what L2 should have expected to read given
10674          * the specifications by L1; it's not enough to take
10675          * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we may
10676          * have more bits than L1 expected.
10677          */
10678         vmx_set_cr0(vcpu, vmcs12->guest_cr0);
10679         vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
10680
10681         vmx_set_cr4(vcpu, vmcs12->guest_cr4);
10682         vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
10683
10684         if (from_vmentry &&
10685             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
10686                 vcpu->arch.efer = vmcs12->guest_ia32_efer;
10687         else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
10688                 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
10689         else
10690                 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
10691         /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
10692         vmx_set_efer(vcpu, vcpu->arch.efer);
10693
10694         /* Guest page tables are shadowed by either EPT or shadow page tables. */
10695         if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
10696                                 entry_failure_code))
10697                 return 1;
10698
10699         if (!enable_ept)
10700                 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
10701
10702         /*
10703          * L1 may access the L2's PDPTR, so save them to construct vmcs12
10704          */
10705         if (enable_ept) {
10706                 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
10707                 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
10708                 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
10709                 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
10710         }
10711
10712         kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
10713         kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
10714         return 0;
10715 }
10716
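/*
 * Check the vmcs12 control fields that, per the SDM, make VMLAUNCH or
 * VMRESUME fail with VMfailValid rather than proceed to VM entry.
 */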
10717 static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
10718 {
10719         struct vcpu_vmx *vmx = to_vmx(vcpu);
10720
10721         if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
10722             vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
10723                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10724
10725         if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12))
10726                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10727
10728         if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
10729                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10730
10731         if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12))
10732                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10733
10734         if (nested_vmx_check_apicv_controls(vcpu, vmcs12))
10735                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10736
10737         if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12))
10738                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10739
10740         if (nested_vmx_check_pml_controls(vcpu, vmcs12))
10741                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10742
10743         if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
10744                                 vmx->nested.nested_vmx_procbased_ctls_low,
10745                                 vmx->nested.nested_vmx_procbased_ctls_high) ||
10746             (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
10747              !vmx_control_verify(vmcs12->secondary_vm_exec_control,
10748                                  vmx->nested.nested_vmx_secondary_ctls_low,
10749                                  vmx->nested.nested_vmx_secondary_ctls_high)) ||
10750             !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
10751                                 vmx->nested.nested_vmx_pinbased_ctls_low,
10752                                 vmx->nested.nested_vmx_pinbased_ctls_high) ||
10753             !vmx_control_verify(vmcs12->vm_exit_controls,
10754                                 vmx->nested.nested_vmx_exit_ctls_low,
10755                                 vmx->nested.nested_vmx_exit_ctls_high) ||
10756             !vmx_control_verify(vmcs12->vm_entry_controls,
10757                                 vmx->nested.nested_vmx_entry_ctls_low,
10758                                 vmx->nested.nested_vmx_entry_ctls_high))
10759                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10760
10761         if (nested_cpu_has_vmfunc(vmcs12)) {
10762                 if (vmcs12->vm_function_control &
10763                     ~vmx->nested.nested_vmx_vmfunc_controls)
10764                         return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10765
10766                 if (nested_cpu_has_eptp_switching(vmcs12)) {
10767                         if (!nested_cpu_has_ept(vmcs12) ||
10768                             !page_address_valid(vcpu, vmcs12->eptp_list_address))
10769                                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10770                 }
10771         }
10772
10773         if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu))
10774                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10775
10776         if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) ||
10777             !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
10778             !nested_cr3_valid(vcpu, vmcs12->host_cr3))
10779                 return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
10780
10781         return 0;
10782 }
10783
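/*
 * Check guest-state requirements that, if violated, cause the nested VM
 * entry to fail with EXIT_REASON_INVALID_STATE; the exit qualification
 * to report is returned in *exit_qual.
 */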
10784 static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10785                                   u32 *exit_qual)
10786 {
10787         bool ia32e;
10788
10789         *exit_qual = ENTRY_FAIL_DEFAULT;
10790
10791         if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
10792             !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
10793                 return 1;
10794
10795         if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS) &&
10796             vmcs12->vmcs_link_pointer != -1ull) {
10797                 *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
10798                 return 1;
10799         }
10800
10801         /*
10802          * If the load IA32_EFER VM-entry control is 1, the following checks
10803          * are performed on the field for the IA32_EFER MSR:
10804          * - Bits reserved in the IA32_EFER MSR must be 0.
10805          * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
10806          *   the IA-32e mode guest VM-exit control. It must also be identical
10807          *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
10808          *   CR0.PG) is 1.
10809          */
10810         if (to_vmx(vcpu)->nested.nested_run_pending &&
10811             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
10812                 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
10813                 if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
10814                     ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
10815                     ((vmcs12->guest_cr0 & X86_CR0_PG) &&
10816                      ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
10817                         return 1;
10818         }
10819
10820         /*
10821          * If the load IA32_EFER VM-exit control is 1, bits reserved in the
10822          * IA32_EFER MSR must be 0 in the field for that register. In addition,
10823          * the values of the LMA and LME bits in the field must each be that of
10824          * the host address-space size VM-exit control.
10825          */
10826         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
10827                 ia32e = (vmcs12->vm_exit_controls &
10828                          VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
10829                 if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
10830                     ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
10831                     ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
10832                         return 1;
10833         }
10834
10835         if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
10836                 (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
10837                 (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
10838                         return 1;
10839
10840         return 0;
10841 }
10842
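/*
 * Transition the vCPU into VMX non-root mode on behalf of L1: switch to
 * vmcs02, prepare it from vmcs12, map the vmcs12 pages and process the
 * VM-entry MSR-load list, rolling back to vmcs01 on failure.
 */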
10843 static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
10844 {
10845         struct vcpu_vmx *vmx = to_vmx(vcpu);
10846         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
10847         u32 msr_entry_idx;
10848         u32 exit_qual;
10849
10850         enter_guest_mode(vcpu);
10851
10852         if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
10853                 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
10854
10855         vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
10856         vmx_segment_cache_clear(vmx);
10857
10858         if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
10859                 leave_guest_mode(vcpu);
10860                 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
10861                 nested_vmx_entry_failure(vcpu, vmcs12,
10862                                          EXIT_REASON_INVALID_STATE, exit_qual);
10863                 return 1;
10864         }
10865
10866         nested_get_vmcs12_pages(vcpu, vmcs12);
10867
10868         msr_entry_idx = nested_vmx_load_msr(vcpu,
10869                                             vmcs12->vm_entry_msr_load_addr,
10870                                             vmcs12->vm_entry_msr_load_count);
10871         if (msr_entry_idx) {
10872                 leave_guest_mode(vcpu);
10873                 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
10874                 nested_vmx_entry_failure(vcpu, vmcs12,
10875                                 EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
10876                 return 1;
10877         }
10878
10879         /*
10880          * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
10881          * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
10882          * returned as far as L1 is concerned. It will only return (and set
10883          * the success flag) when L2 exits (see nested_vmx_vmexit()).
10884          */
10885         return 0;
10886 }
10887
10888 /*
10889  * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
10890  * for running an L2 nested guest.
10891  */
10892 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
10893 {
10894         struct vmcs12 *vmcs12;
10895         struct vcpu_vmx *vmx = to_vmx(vcpu);
10896         u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
10897         u32 exit_qual;
10898         int ret;
10899
10900         if (!nested_vmx_check_permission(vcpu))
10901                 return 1;
10902
10903         if (!nested_vmx_check_vmcs12(vcpu))
10904                 goto out;
10905
10906         vmcs12 = get_vmcs12(vcpu);
10907
10908         if (enable_shadow_vmcs)
10909                 copy_shadow_to_vmcs12(vmx);
10910
10911         /*
10912          * The nested entry process starts with enforcing various prerequisites
10913          * on vmcs12 as required by the Intel SDM, and acting appropriately when
10914          * they fail: as the SDM explains, some conditions should cause the
10915          * instruction to fail, while others will cause the instruction to seem
10916          * to succeed, but return an EXIT_REASON_INVALID_STATE.
10917          * To speed up the normal (success) code path, we should avoid checking
10918          * for misconfigurations which will anyway be caught by the processor
10919          * when using the merged vmcs02.
10920          */
10921         if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) {
10922                 nested_vmx_failValid(vcpu,
10923                                      VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
10924                 goto out;
10925         }
10926
10927         if (vmcs12->launch_state == launch) {
10928                 nested_vmx_failValid(vcpu,
10929                         launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
10930                                : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
10931                 goto out;
10932         }
10933
10934         ret = check_vmentry_prereqs(vcpu, vmcs12);
10935         if (ret) {
10936                 nested_vmx_failValid(vcpu, ret);
10937                 goto out;
10938         }
10939
10940         /*
10941          * After this point, the trap flag no longer triggers a singlestep trap
10942          * on the vm entry instructions; don't call kvm_skip_emulated_instruction.
10943          * This is not 100% correct; for performance reasons, we delegate most
10944          * of the checks on host state to the processor.  If those fail,
10945          * the singlestep trap is missed.
10946          */
10947         skip_emulated_instruction(vcpu);
10948
10949         ret = check_vmentry_postreqs(vcpu, vmcs12, &exit_qual);
10950         if (ret) {
10951                 nested_vmx_entry_failure(vcpu, vmcs12,
10952                                          EXIT_REASON_INVALID_STATE, exit_qual);
10953                 return 1;
10954         }
10955
10956         /*
10957          * We're finally done with prerequisite checking, and can start with
10958          * the nested entry.
10959          */
10960
10961         ret = enter_vmx_non_root_mode(vcpu, true);
10962         if (ret)
10963                 return ret;
10964
10965         if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
10966                 return kvm_vcpu_halt(vcpu);
10967
10968         vmx->nested.nested_run_pending = 1;
10969
10970         return 1;
10971
10972 out:
10973         return kvm_skip_emulated_instruction(vcpu);
10974 }
10975
10976 /*
10977  * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
10978  * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
10979  * This function returns the new value we should put in vmcs12.guest_cr0.
10980  * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
10981  *  1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
10982  *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
10983  *     didn't trap the bit, because if L1 did, so would L0).
10984  *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
10985  *     been modified by L2, and L1 knows it. So just leave the old value of
10986  *     the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
10987  *     isn't relevant, because if L0 traps this bit it can set it to anything.
10988  *  3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
10989  *     changed these bits, and therefore they need to be updated, but L0
10990  *     didn't necessarily allow them to be changed in GUEST_CR0 - and rather
10991  *     put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
10992  */
10993 static inline unsigned long
10994 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
10995 {
10996         return
10997         /*1*/   (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
10998         /*2*/   (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
10999         /*3*/   (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
11000                         vcpu->arch.cr0_guest_owned_bits));
11001 }
11002
11003 static inline unsigned long
11004 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
11005 {
11006         return
11007         /*1*/   (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
11008         /*2*/   (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
11009         /*3*/   (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
11010                         vcpu->arch.cr4_guest_owned_bits));
11011 }
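/*
 * Editor's note, not part of vmx.c: a minimal worked example of the
 * three-way merge performed by vmcs12_guest_cr0()/vmcs12_guest_cr4()
 * above.  The helper name and the mask/register values below are purely
 * hypothetical and only illustrate which source supplies each bit.
 */
#if 0	/* illustrative sketch only -- never compiled */
static unsigned long merge_by_masks(unsigned long hw_val,	/* vmcs02 GUEST_CR0  */
				    unsigned long vmcs12_val,	/* vmcs12->guest_cr0 */
				    unsigned long read_shadow,	/* CR0_READ_SHADOW   */
				    unsigned long l1_mask,	/* cr0_guest_host_mask  */
				    unsigned long l0_owned)	/* cr0_guest_owned_bits */
{
	return (hw_val & l0_owned) |			/* case 1: guest-owned bits   */
	       (vmcs12_val & l1_mask) |			/* case 2: bits L1 traps      */
	       (read_shadow & ~(l1_mask | l0_owned));	/* case 3: bits only L0 traps */
}

/*
 * E.g. with l0_owned = 0x8 (TS), l1_mask = 0x1 (PE), hw_val = 0x3b,
 * vmcs12_val = 0x31 and read_shadow = 0x11:
 *   case 1 -> 0x08, case 2 -> 0x01, case 3 -> 0x10, result = 0x19.
 */
#endif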
11012
11013 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
11014                                        struct vmcs12 *vmcs12)
11015 {
11016         u32 idt_vectoring;
11017         unsigned int nr;
11018
11019         if (vcpu->arch.exception.injected) {
11020                 nr = vcpu->arch.exception.nr;
11021                 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
11022
11023                 if (kvm_exception_is_soft(nr)) {
11024                         vmcs12->vm_exit_instruction_len =
11025                                 vcpu->arch.event_exit_inst_len;
11026                         idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
11027                 } else
11028                         idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
11029
11030                 if (vcpu->arch.exception.has_error_code) {
11031                         idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
11032                         vmcs12->idt_vectoring_error_code =
11033                                 vcpu->arch.exception.error_code;
11034                 }
11035
11036                 vmcs12->idt_vectoring_info_field = idt_vectoring;
11037         } else if (vcpu->arch.nmi_injected) {
11038                 vmcs12->idt_vectoring_info_field =
11039                         INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
11040         } else if (vcpu->arch.interrupt.pending) {
11041                 nr = vcpu->arch.interrupt.nr;
11042                 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
11043
11044                 if (vcpu->arch.interrupt.soft) {
11045                         idt_vectoring |= INTR_TYPE_SOFT_INTR;
11046                         vmcs12->vm_entry_instruction_len =
11047                                 vcpu->arch.event_exit_inst_len;
11048                 } else
11049                         idt_vectoring |= INTR_TYPE_EXT_INTR;
11050
11051                 vmcs12->idt_vectoring_info_field = idt_vectoring;
11052         }
11053 }
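/*
 * Editor's note, not part of vmx.c: a worked example of the IDT-vectoring
 * information word built above, following the interruption-information
 * format (vector in bits 7:0, type in bits 10:8, deliver-error-code in
 * bit 11, valid in bit 31).  For a hardware #PF (vector 14) with an
 * error code:
 *
 *	idt_vectoring = 14				 (0x0000000e)
 *		      | INTR_TYPE_HARD_EXCEPTION	 (0x00000300)
 *		      | VECTORING_INFO_DELIVER_CODE_MASK (0x00000800)
 *		      | VECTORING_INFO_VALID_MASK	 (0x80000000)
 *		      = 0x80000b0e
 *
 * with vmcs12->idt_vectoring_error_code carrying the page-fault error code.
 */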
11054
11055 static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
11056 {
11057         struct vcpu_vmx *vmx = to_vmx(vcpu);
11058         unsigned long exit_qual;
11059         bool block_nested_events =
11060             vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
11061
11062         if (vcpu->arch.exception.pending &&
11063                 nested_vmx_check_exception(vcpu, &exit_qual)) {
11064                 if (block_nested_events)
11065                         return -EBUSY;
11066                 nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
11067                 vcpu->arch.exception.pending = false;
11068                 return 0;
11069         }
11070
11071         if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
11072             vmx->nested.preemption_timer_expired) {
11073                 if (block_nested_events)
11074                         return -EBUSY;
11075                 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
11076                 return 0;
11077         }
11078
11079         if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
11080                 if (block_nested_events)
11081                         return -EBUSY;
11082                 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
11083                                   NMI_VECTOR | INTR_TYPE_NMI_INTR |
11084                                   INTR_INFO_VALID_MASK, 0);
11085                 /*
11086                  * The NMI-triggered VM exit counts as injection:
11087                  * clear this one and block further NMIs.
11088                  */
11089                 vcpu->arch.nmi_pending = 0;
11090                 vmx_set_nmi_mask(vcpu, true);
11091                 return 0;
11092         }
11093
11094         if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
11095             nested_exit_on_intr(vcpu)) {
11096                 if (block_nested_events)
11097                         return -EBUSY;
11098                 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
11099                 return 0;
11100         }
11101
11102         vmx_complete_nested_posted_interrupt(vcpu);
11103         return 0;
11104 }
11105
11106 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
11107 {
11108         ktime_t remaining =
11109                 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
11110         u64 value;
11111
11112         if (ktime_to_ns(remaining) <= 0)
11113                 return 0;
11114
11115         value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
11116         do_div(value, 1000000);
11117         return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
11118 }
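/*
 * Editor's note, not part of vmx.c: worked arithmetic for the conversion
 * above.  The remaining hrtimer nanoseconds are first scaled to guest TSC
 * cycles (ns * virtual_tsc_khz / 1000000) and then shifted right by
 * VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE.  Assuming a rate shift of 5
 * (one preemption-timer tick per 32 TSC cycles), a 2 GHz guest TSC
 * (virtual_tsc_khz = 2000000) with 1 ms (1000000 ns) remaining gives:
 *
 *	value = 1000000 * 2000000 / 1000000 = 2000000 guest TSC cycles
 *	value >> 5 = 62500 preemption-timer units
 */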
11119
11120 /*
11121  * Update the guest state fields of vmcs12 to reflect changes that
11122  * occurred while L2 was running. (The "IA-32e mode guest" bit of the
11123  * VM-entry controls is also updated, since this is really a guest
11124  * state bit.)
11125  */
11126 static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
11127 {
11128         vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
11129         vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
11130
11131         vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
11132         vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
11133         vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
11134
11135         vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
11136         vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
11137         vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
11138         vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
11139         vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
11140         vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
11141         vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
11142         vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
11143         vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
11144         vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
11145         vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
11146         vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
11147         vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
11148         vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
11149         vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
11150         vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
11151         vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
11152         vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
11153         vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
11154         vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
11155         vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
11156         vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
11157         vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
11158         vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
11159         vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
11160         vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
11161         vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
11162         vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
11163         vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
11164         vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
11165         vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
11166         vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
11167         vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
11168         vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
11169         vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
11170         vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
11171
11172         vmcs12->guest_interruptibility_info =
11173                 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
11174         vmcs12->guest_pending_dbg_exceptions =
11175                 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
11176         if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
11177                 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
11178         else
11179                 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
11180
11181         if (nested_cpu_has_preemption_timer(vmcs12)) {
11182                 if (vmcs12->vm_exit_controls &
11183                     VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
11184                         vmcs12->vmx_preemption_timer_value =
11185                                 vmx_get_preemption_timer_value(vcpu);
11186                 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
11187         }
11188
11189         /*
11190          * In some cases (usually, nested EPT), L2 is allowed to change its
11191          * own CR3 without exiting. If it has changed it, we must keep it.
11192          * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
11193          * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
11194          *
11195          * Additionally, save L2's PDPTRs into vmcs12.
11196          */
11197         if (enable_ept) {
11198                 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
11199                 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
11200                 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
11201                 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
11202                 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
11203         }
11204
11205         vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
11206
11207         if (nested_cpu_has_vid(vmcs12))
11208                 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
11209
11210         vmcs12->vm_entry_controls =
11211                 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
11212                 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
11213
11214         if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) {
11215                 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
11216                 vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
11217         }
11218
11219         /* TODO: These cannot have changed unless we have MSR bitmaps and
11220          * the relevant bit asks not to trap the change */
11221         if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
11222                 vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
11223         if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
11224                 vmcs12->guest_ia32_efer = vcpu->arch.efer;
11225         vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
11226         vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
11227         vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
11228         if (kvm_mpx_supported())
11229                 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
11230 }
11231
11232 /*
11233  * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
11234  * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
11235  * and this function updates it to reflect the changes to the guest state while
11236  * L2 was running (and perhaps made some exits which were handled directly by L0
11237  * without going back to L1), and to reflect the exit reason.
11238  * Note that we do not have to copy all VMCS fields here, just those that
11239  * could have been changed by the L2 guest or the exit - i.e., the guest-state and
11240  * exit-information fields only. Other fields are modified by L1 with VMWRITE,
11241  * which already writes to vmcs12 directly.
11242  */
11243 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
11244                            u32 exit_reason, u32 exit_intr_info,
11245                            unsigned long exit_qualification)
11246 {
11247         /* update guest state fields: */
11248         sync_vmcs12(vcpu, vmcs12);
11249
11250         /* update exit information fields: */
11251
11252         vmcs12->vm_exit_reason = exit_reason;
11253         vmcs12->exit_qualification = exit_qualification;
11254         vmcs12->vm_exit_intr_info = exit_intr_info;
11255
11256         vmcs12->idt_vectoring_info_field = 0;
11257         vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
11258         vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
11259
11260         if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
11261                 vmcs12->launch_state = 1;
11262
11263                 /* vm_entry_intr_info_field is cleared on exit. Emulate this
11264                  * instead of reading the real value. */
11265                 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
11266
11267                 /*
11268                  * Transfer the event that L0 or L1 may have wanted to inject into
11269                  * L2 to IDT_VECTORING_INFO_FIELD.
11270                  */
11271                 vmcs12_save_pending_event(vcpu, vmcs12);
11272         }
11273
11274         /*
11275          * Drop what we picked up for L2 via vmx_complete_interrupts. It is
11276          * preserved above and would only end up incorrectly in L1.
11277          */
11278         vcpu->arch.nmi_injected = false;
11279         kvm_clear_exception_queue(vcpu);
11280         kvm_clear_interrupt_queue(vcpu);
11281 }
11282
11283 static void load_vmcs12_mmu_host_state(struct kvm_vcpu *vcpu,
11284                         struct vmcs12 *vmcs12)
11285 {
11286         u32 entry_failure_code;
11287
11288         nested_ept_uninit_mmu_context(vcpu);
11289
11290         /*
11291          * Only PDPTE load can fail as the value of cr3 was checked on entry and
11292          * couldn't have changed.
11293          */
11294         if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
11295                 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
11296
11297         if (!enable_ept)
11298                 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
11299 }
11300
11301 /*
11302  * Part of what we need to do when the nested L2 guest exits and we want to
11303  * run its L1 parent is to reset L1's guest state to the host state specified
11304  * in vmcs12.
11305  * This function is to be called not only on normal nested exit, but also on
11306  * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
11307  * Failures During or After Loading Guest State").
11308  * This function should be called when the active VMCS is L1's (vmcs01).
11309  */
11310 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
11311                                    struct vmcs12 *vmcs12)
11312 {
11313         struct kvm_segment seg;
11314
11315         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
11316                 vcpu->arch.efer = vmcs12->host_ia32_efer;
11317         else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
11318                 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
11319         else
11320                 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
11321         vmx_set_efer(vcpu, vcpu->arch.efer);
11322
11323         kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
11324         kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
11325         vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
11326         /*
11327          * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
11328          * actually changed, because vmx_set_cr0 refers to the EFER value set above.
11329          *
11330          * CR0_GUEST_HOST_MASK is already set in the original vmcs01
11331          * (KVM doesn't change it).
11332          */
11333         vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
11334         vmx_set_cr0(vcpu, vmcs12->host_cr0);
11335
11336         /* Same as above - no reason to call set_cr4_guest_host_mask().  */
11337         vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
11338         vmx_set_cr4(vcpu, vmcs12->host_cr4);
11339
11340         load_vmcs12_mmu_host_state(vcpu, vmcs12);
11341
11342         if (enable_vpid) {
11343                 /*
11344                  * Trivially support vpid by letting L2s share their parent
11345                  * L1's vpid. TODO: move to a more elaborate solution, giving
11346                  * each L2 its own vpid and exposing the vpid feature to L1.
11347                  */
11348                 vmx_flush_tlb(vcpu);
11349         }
11350         /* Restore posted intr vector. */
11351         if (nested_cpu_has_posted_intr(vmcs12))
11352                 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
11353
11354         vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
11355         vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
11356         vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
11357         vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
11358         vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
11359         vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
11360         vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
11361
11362         /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.  */
11363         if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
11364                 vmcs_write64(GUEST_BNDCFGS, 0);
11365
11366         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
11367                 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
11368                 vcpu->arch.pat = vmcs12->host_ia32_pat;
11369         }
11370         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
11371                 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
11372                         vmcs12->host_ia32_perf_global_ctrl);
11373
11374         /* Set L1 segment info according to Intel SDM
11375             27.5.2 Loading Host Segment and Descriptor-Table Registers */
11376         seg = (struct kvm_segment) {
11377                 .base = 0,
11378                 .limit = 0xFFFFFFFF,
11379                 .selector = vmcs12->host_cs_selector,
11380                 .type = 11,
11381                 .present = 1,
11382                 .s = 1,
11383                 .g = 1
11384         };
11385         if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
11386                 seg.l = 1;
11387         else
11388                 seg.db = 1;
11389         vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
11390         seg = (struct kvm_segment) {
11391                 .base = 0,
11392                 .limit = 0xFFFFFFFF,
11393                 .type = 3,
11394                 .present = 1,
11395                 .s = 1,
11396                 .db = 1,
11397                 .g = 1
11398         };
11399         seg.selector = vmcs12->host_ds_selector;
11400         vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
11401         seg.selector = vmcs12->host_es_selector;
11402         vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
11403         seg.selector = vmcs12->host_ss_selector;
11404         vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
11405         seg.selector = vmcs12->host_fs_selector;
11406         seg.base = vmcs12->host_fs_base;
11407         vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
11408         seg.selector = vmcs12->host_gs_selector;
11409         seg.base = vmcs12->host_gs_base;
11410         vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
11411         seg = (struct kvm_segment) {
11412                 .base = vmcs12->host_tr_base,
11413                 .limit = 0x67,
11414                 .selector = vmcs12->host_tr_selector,
11415                 .type = 11,
11416                 .present = 1
11417         };
11418         vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
11419
11420         kvm_set_dr(vcpu, 7, 0x400);
11421         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
11422
11423         if (cpu_has_vmx_msr_bitmap())
11424                 vmx_update_msr_bitmap(vcpu);
11425
11426         if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
11427                                 vmcs12->vm_exit_msr_load_count))
11428                 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
11429 }
11430
11431 /*
11432  * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
11433  * and modify vmcs12 to make it see what it would expect to see there if
11434  * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
11435  */
11436 static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
11437                               u32 exit_intr_info,
11438                               unsigned long exit_qualification)
11439 {
11440         struct vcpu_vmx *vmx = to_vmx(vcpu);
11441         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
11442
11443         /* trying to cancel vmlaunch/vmresume is a bug */
11444         WARN_ON_ONCE(vmx->nested.nested_run_pending);
11445
11446         /*
11447          * The only expected VM-instruction error is "VM entry with
11448          * invalid control field(s)." Anything else indicates a
11449          * problem with L0.
11450          */
11451         WARN_ON_ONCE(vmx->fail && (vmcs_read32(VM_INSTRUCTION_ERROR) !=
11452                                    VMXERR_ENTRY_INVALID_CONTROL_FIELD));
11453
11454         leave_guest_mode(vcpu);
11455
11456         if (likely(!vmx->fail)) {
11457                 if (exit_reason == -1)
11458                         sync_vmcs12(vcpu, vmcs12);
11459                 else
11460                         prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
11461                                        exit_qualification);
11462
11463                 if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
11464                                          vmcs12->vm_exit_msr_store_count))
11465                         nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
11466         }
11467
11468         vmx_switch_vmcs(vcpu, &vmx->vmcs01);
11469         vm_entry_controls_reset_shadow(vmx);
11470         vm_exit_controls_reset_shadow(vmx);
11471         vmx_segment_cache_clear(vmx);
11472
11473         /* Update any VMCS fields that might have changed while L2 ran */
11474         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
11475         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
11476         vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
11477         if (vmx->hv_deadline_tsc == -1)
11478                 vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
11479                                 PIN_BASED_VMX_PREEMPTION_TIMER);
11480         else
11481                 vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
11482                               PIN_BASED_VMX_PREEMPTION_TIMER);
11483         if (kvm_has_tsc_control)
11484                 decache_tsc_multiplier(vmx);
11485
11486         if (vmx->nested.change_vmcs01_virtual_x2apic_mode) {
11487                 vmx->nested.change_vmcs01_virtual_x2apic_mode = false;
11488                 vmx_set_virtual_x2apic_mode(vcpu,
11489                                 vcpu->arch.apic_base & X2APIC_ENABLE);
11490         } else if (!nested_cpu_has_ept(vmcs12) &&
11491                    nested_cpu_has2(vmcs12,
11492                                    SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
11493                 vmx_flush_tlb_ept_only(vcpu);
11494         }
11495
11496         /* This is needed for the same reason as it was needed in prepare_vmcs02 */
11497         vmx->host_rsp = 0;
11498
11499         /* Unpin physical memory we referred to in vmcs02 */
11500         if (vmx->nested.apic_access_page) {
11501                 kvm_release_page_dirty(vmx->nested.apic_access_page);
11502                 vmx->nested.apic_access_page = NULL;
11503         }
11504         if (vmx->nested.virtual_apic_page) {
11505                 kvm_release_page_dirty(vmx->nested.virtual_apic_page);
11506                 vmx->nested.virtual_apic_page = NULL;
11507         }
11508         if (vmx->nested.pi_desc_page) {
11509                 kunmap(vmx->nested.pi_desc_page);
11510                 kvm_release_page_dirty(vmx->nested.pi_desc_page);
11511                 vmx->nested.pi_desc_page = NULL;
11512                 vmx->nested.pi_desc = NULL;
11513         }
11514
11515         /*
11516          * While L2 was running, an mmu_notifier may have forced a reload of the
11517          * page's hpa for the L2 vmcs; reload it for L1 before entering L1.
11518          */
11519         kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
11520
11521         if (enable_shadow_vmcs && exit_reason != -1)
11522                 vmx->nested.sync_shadow_vmcs = true;
11523
11524         /* in case we halted in L2 */
11525         vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
11526
11527         if (likely(!vmx->fail)) {
11528                 /*
11529                  * TODO: SDM says that with acknowledge interrupt on
11530                  * exit, bit 31 of the VM-exit interrupt information
11531                  * (valid interrupt) is always set to 1 on
11532                  * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't
11533                  * need kvm_cpu_has_interrupt().  See the commit
11534                  * message for details.
11535                  */
11536                 if (nested_exit_intr_ack_set(vcpu) &&
11537                     exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
11538                     kvm_cpu_has_interrupt(vcpu)) {
11539                         int irq = kvm_cpu_get_interrupt(vcpu);
11540                         WARN_ON(irq < 0);
11541                         vmcs12->vm_exit_intr_info = irq |
11542                                 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
11543                 }
11544
11545                 if (exit_reason != -1)
11546                         trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
11547                                                        vmcs12->exit_qualification,
11548                                                        vmcs12->idt_vectoring_info_field,
11549                                                        vmcs12->vm_exit_intr_info,
11550                                                        vmcs12->vm_exit_intr_error_code,
11551                                                        KVM_ISA_VMX);
11552
11553                 load_vmcs12_host_state(vcpu, vmcs12);
11554
11555                 return;
11556         }
11557
11558         /*
11559          * After an early L2 VM-entry failure, we're now back
11560          * in L1 which thinks it just finished a VMLAUNCH or
11561          * VMRESUME instruction, so we need to set the failure
11562          * flag and the VM-instruction error field of the VMCS
11563          * accordingly.
11564          */
11565         nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
11566
11567         load_vmcs12_mmu_host_state(vcpu, vmcs12);
11568
11569         /*
11570          * The emulated instruction was already skipped in
11571          * nested_vmx_run, but the updated RIP was never
11572          * written back to the vmcs01.
11573          */
11574         skip_emulated_instruction(vcpu);
11575         vmx->fail = 0;
11576 }
11577
11578 /*
11579  * Forcibly leave nested mode in order to be able to reset the VCPU later on.
11580  */
11581 static void vmx_leave_nested(struct kvm_vcpu *vcpu)
11582 {
11583         if (is_guest_mode(vcpu)) {
11584                 to_vmx(vcpu)->nested.nested_run_pending = 0;
11585                 nested_vmx_vmexit(vcpu, -1, 0, 0);
11586         }
11587         free_nested(to_vmx(vcpu));
11588 }
11589
11590 /*
11591  * L1's failure to enter L2 is a subset of a normal exit, as explained in
11592  * 23.7 "VM-entry failures during or after loading guest state" (this also
11593  * lists the acceptable exit-reason and exit-qualification parameters).
11594  * It should only be called before L2 has actually succeeded in running, and when
11595  * vmcs01 is current (it doesn't leave_guest_mode() or switch VMCSs).
11596  */
11597 static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
11598                         struct vmcs12 *vmcs12,
11599                         u32 reason, unsigned long qualification)
11600 {
11601         load_vmcs12_host_state(vcpu, vmcs12);
11602         vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
11603         vmcs12->exit_qualification = qualification;
11604         nested_vmx_succeed(vcpu);
11605         if (enable_shadow_vmcs)
11606                 to_vmx(vcpu)->nested.sync_shadow_vmcs = true;
11607 }
11608
11609 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
11610                                struct x86_instruction_info *info,
11611                                enum x86_intercept_stage stage)
11612 {
11613         return X86EMUL_CONTINUE;
11614 }
11615
11616 #ifdef CONFIG_X86_64
11617 /* (a << shift) / divisor; returns 1 on overflow, otherwise 0 */
11618 static inline int u64_shl_div_u64(u64 a, unsigned int shift,
11619                                   u64 divisor, u64 *result)
11620 {
11621         u64 low = a << shift, high = a >> (64 - shift);
11622
11623         /* To avoid the overflow on divq */
11624         if (high >= divisor)
11625                 return 1;
11626
11627         /* low holds the result; high holds the remainder, which is discarded */
11628         asm("divq %2\n\t" : "=a" (low), "=d" (high) :
11629             "rm" (divisor), "0" (low), "1" (high));
11630         *result = low;
11631
11632         return 0;
11633 }
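/*
 * Editor's note, not part of vmx.c: a portable sketch of what the inline
 * assembly above computes, using the compiler's 128-bit arithmetic (a
 * GCC/Clang extension on x86_64).  The "high >= divisor" test in the real
 * helper is exactly the "quotient does not fit in 64 bits" check below.
 */
#if 0	/* illustrative sketch only -- never compiled */
static int u64_shl_div_u64_sketch(u64 a, unsigned int shift,
				  u64 divisor, u64 *result)
{
	unsigned __int128 q;

	if (!divisor)
		return 1;	/* divq would fault; treat as overflow here */

	q = ((unsigned __int128)a << shift) / divisor;
	if (q >> 64)
		return 1;	/* quotient does not fit in 64 bits */

	*result = (u64)q;
	return 0;
}
#endif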
11634
11635 static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
11636 {
11637         struct vcpu_vmx *vmx = to_vmx(vcpu);
11638         u64 tscl = rdtsc();
11639         u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
11640         u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
11641
11642         /* Convert to host delta tsc if tsc scaling is enabled */
11643         if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
11644                         u64_shl_div_u64(delta_tsc,
11645                                 kvm_tsc_scaling_ratio_frac_bits,
11646                                 vcpu->arch.tsc_scaling_ratio,
11647                                 &delta_tsc))
11648                 return -ERANGE;
11649
11650         /*
11651          * If the delta tsc can't fit in 32 bits after the
11652          * cpu_preemption_timer_multi shift, we can't use the preemption timer.
11653          * It's possible that it fits on later vmentries, but checking
11654          * on every vmentry is costly so we just use an hrtimer.
11655          */
11656         if (delta_tsc >> (cpu_preemption_timer_multi + 32))
11657                 return -ERANGE;
11658
11659         vmx->hv_deadline_tsc = tscl + delta_tsc;
11660         vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
11661                         PIN_BASED_VMX_PREEMPTION_TIMER);
11662
11663         return delta_tsc == 0;
11664 }
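/*
 * Editor's note, not part of vmx.c: the range check above reflects the
 * hardware preemption timer being a 32-bit counter that ticks once every
 * 2^cpu_preemption_timer_multi TSC cycles.  Assuming a ratio field of 5,
 * the largest programmable deadline is 2^(32 + 5) ~= 1.37e11 TSC cycles,
 * i.e. roughly 45 seconds on a 3 GHz TSC; anything larger falls back to
 * the hrtimer path via -ERANGE.
 */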
11665
11666 static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
11667 {
11668         struct vcpu_vmx *vmx = to_vmx(vcpu);
11669         vmx->hv_deadline_tsc = -1;
11670         vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
11671                         PIN_BASED_VMX_PREEMPTION_TIMER);
11672 }
11673 #endif
11674
11675 static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
11676 {
11677         if (ple_gap)
11678                 shrink_ple_window(vcpu);
11679 }
11680
11681 static void vmx_slot_enable_log_dirty(struct kvm *kvm,
11682                                      struct kvm_memory_slot *slot)
11683 {
11684         kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
11685         kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
11686 }
11687
11688 static void vmx_slot_disable_log_dirty(struct kvm *kvm,
11689                                        struct kvm_memory_slot *slot)
11690 {
11691         kvm_mmu_slot_set_dirty(kvm, slot);
11692 }
11693
11694 static void vmx_flush_log_dirty(struct kvm *kvm)
11695 {
11696         kvm_flush_pml_buffers(kvm);
11697 }
11698
11699 static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
11700 {
11701         struct vmcs12 *vmcs12;
11702         struct vcpu_vmx *vmx = to_vmx(vcpu);
11703         gpa_t gpa;
11704         struct page *page = NULL;
11705         u64 *pml_address;
11706
11707         if (is_guest_mode(vcpu)) {
11708                 WARN_ON_ONCE(vmx->nested.pml_full);
11709
11710                 /*
11711                  * Check if PML is enabled for the nested guest.
11712                  * Whether eptp bit 6 is set is already checked
11713                  * as part of A/D emulation.
11714                  */
11715                 vmcs12 = get_vmcs12(vcpu);
11716                 if (!nested_cpu_has_pml(vmcs12))
11717                         return 0;
11718
11719                 if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
11720                         vmx->nested.pml_full = true;
11721                         return 1;
11722                 }
11723
11724                 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
11725
11726                 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address);
11727                 if (is_error_page(page))
11728                         return 0;
11729
11730                 pml_address = kmap(page);
11731                 pml_address[vmcs12->guest_pml_index--] = gpa;
11732                 kunmap(page);
11733                 kvm_release_page_clean(page);
11734         }
11735
11736         return 0;
11737 }
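/*
 * Editor's note, not part of vmx.c: the guest-visible layout assumed by
 * the emulation above.  vmcs12->pml_address points to one 4 KiB page of
 * PML_ENTITY_NUM (512) u64 guest-physical addresses; guest_pml_index
 * counts down from 511.  Once the u16 index decrements past entry 0 it
 * wraps above PML_ENTITY_NUM, which is what makes the code above report
 * a PML-full condition to L1 instead of logging another entry.  The
 * struct below is only an illustration of that layout.
 */
#if 0	/* illustrative sketch only -- never compiled */
struct nested_pml_page {
	u64 gpa[512];	/* PML_ENTITY_NUM entries, filled from the end */
};
#endif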
11738
11739 static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
11740                                            struct kvm_memory_slot *memslot,
11741                                            gfn_t offset, unsigned long mask)
11742 {
11743         kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
11744 }
11745
11746 static void __pi_post_block(struct kvm_vcpu *vcpu)
11747 {
11748         struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
11749         struct pi_desc old, new;
11750         unsigned int dest;
11751
11752         do {
11753                 old.control = new.control = pi_desc->control;
11754                 WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
11755                      "Wakeup handler not enabled while the VCPU is blocked\n");
11756
11757                 dest = cpu_physical_id(vcpu->cpu);
11758
11759                 if (x2apic_enabled())
11760                         new.ndst = dest;
11761                 else
11762                         new.ndst = (dest << 8) & 0xFF00;
11763
11764                 /* set 'NV' to 'notification vector' */
11765                 new.nv = POSTED_INTR_VECTOR;
11766         } while (cmpxchg64(&pi_desc->control, old.control,
11767                            new.control) != old.control);
11768
11769         if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
11770                 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
11771                 list_del(&vcpu->blocked_vcpu_list);
11772                 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
11773                 vcpu->pre_pcpu = -1;
11774         }
11775 }
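/*
 * Editor's note, not part of vmx.c: the cmpxchg64() loop above retries
 * until the 64-bit control word of the posted-interrupt descriptor
 * (holding ON/SN/NV/NDST) is updated without racing a concurrent writer.
 * The destination encoding differs by APIC mode, as sketched below with
 * a hypothetical helper.
 */
#if 0	/* illustrative sketch only -- never compiled */
static u32 pi_encode_ndst(u32 apic_id, bool x2apic)
{
	/* x2APIC: full 32-bit APIC ID; xAPIC: APIC ID in bits 15:8. */
	return x2apic ? apic_id : (apic_id << 8) & 0xFF00;
}
/* e.g. apic_id == 3 -> 0x00000003 in x2APIC mode, 0x00000300 in xAPIC mode */
#endif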
11776
11777 /*
11778  * This routine does the following things for a vCPU which is going
11779  * to be blocked if VT-d PI is enabled:
11780  * - Store the vCPU on the wakeup list, so that when an interrupt arrives
11781  *   we can find the right vCPU to wake up.
11782  * - Change the Posted-Interrupt descriptor as below:
11783  *      'NDST' <-- vcpu->pre_pcpu
11784  *      'NV' <-- POSTED_INTR_WAKEUP_VECTOR
11785  * - If 'ON' is set during this process (meaning at least one interrupt
11786  *   has been posted for this vCPU), we cannot block it; in that case
11787  *   return 1, otherwise return 0.
11788  *
11789  */
11790 static int pi_pre_block(struct kvm_vcpu *vcpu)
11791 {
11792         unsigned int dest;
11793         struct pi_desc old, new;
11794         struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
11795
11796         if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
11797                 !irq_remapping_cap(IRQ_POSTING_CAP)  ||
11798                 !kvm_vcpu_apicv_active(vcpu))
11799                 return 0;
11800
11801         WARN_ON(irqs_disabled());
11802         local_irq_disable();
11803         if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
11804                 vcpu->pre_pcpu = vcpu->cpu;
11805                 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
11806                 list_add_tail(&vcpu->blocked_vcpu_list,
11807                               &per_cpu(blocked_vcpu_on_cpu,
11808                                        vcpu->pre_pcpu));
11809                 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
11810         }
11811
11812         do {
11813                 old.control = new.control = pi_desc->control;
11814
11815                 WARN((pi_desc->sn == 1),
11816                      "Warning: SN field of posted-interrupts "
11817                      "is set before blocking\n");
11818
11819                 /*
11820                  * Since the vCPU can be preempted during this process,
11821                  * vcpu->cpu may differ from pre_pcpu.  We therefore set
11822                  * pre_pcpu as the destination of the wakeup notification
11823                  * event, so that the wakeup handler can find the right
11824                  * vCPU to wake up if an interrupt arrives while the vCPU
11825                  * is blocked.
11826                  */
11827                 dest = cpu_physical_id(vcpu->pre_pcpu);
11828
11829                 if (x2apic_enabled())
11830                         new.ndst = dest;
11831                 else
11832                         new.ndst = (dest << 8) & 0xFF00;
11833
11834                 /* set 'NV' to 'wakeup vector' */
11835                 new.nv = POSTED_INTR_WAKEUP_VECTOR;
11836         } while (cmpxchg64(&pi_desc->control, old.control,
11837                            new.control) != old.control);
11838
11839         /* We should not block the vCPU if an interrupt is posted for it.  */
11840         if (pi_test_on(pi_desc) == 1)
11841                 __pi_post_block(vcpu);
11842
11843         local_irq_enable();
11844         return (vcpu->pre_pcpu == -1);
11845 }
11846
11847 static int vmx_pre_block(struct kvm_vcpu *vcpu)
11848 {
11849         if (pi_pre_block(vcpu))
11850                 return 1;
11851
11852         if (kvm_lapic_hv_timer_in_use(vcpu))
11853                 kvm_lapic_switch_to_sw_timer(vcpu);
11854
11855         return 0;
11856 }
11857
11858 static void pi_post_block(struct kvm_vcpu *vcpu)
11859 {
11860         if (vcpu->pre_pcpu == -1)
11861                 return;
11862
11863         WARN_ON(irqs_disabled());
11864         local_irq_disable();
11865         __pi_post_block(vcpu);
11866         local_irq_enable();
11867 }
11868
11869 static void vmx_post_block(struct kvm_vcpu *vcpu)
11870 {
11871         if (kvm_x86_ops->set_hv_timer)
11872                 kvm_lapic_switch_to_hv_timer(vcpu);
11873
11874         pi_post_block(vcpu);
11875 }
11876
11877 /*
11878  * vmx_update_pi_irte - set IRTE for Posted-Interrupts
11879  *
11880  * @kvm: kvm
11881  * @host_irq: host irq of the interrupt
11882  * @guest_irq: gsi of the interrupt
11883  * @set: set or unset PI
11884  * returns 0 on success, < 0 on failure
11885  */
11886 static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
11887                               uint32_t guest_irq, bool set)
11888 {
11889         struct kvm_kernel_irq_routing_entry *e;
11890         struct kvm_irq_routing_table *irq_rt;
11891         struct kvm_lapic_irq irq;
11892         struct kvm_vcpu *vcpu;
11893         struct vcpu_data vcpu_info;
11894         int idx, ret = 0;
11895
11896         if (!kvm_arch_has_assigned_device(kvm) ||
11897                 !irq_remapping_cap(IRQ_POSTING_CAP) ||
11898                 !kvm_vcpu_apicv_active(kvm->vcpus[0]))
11899                 return 0;
11900
11901         idx = srcu_read_lock(&kvm->irq_srcu);
11902         irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
11903         if (guest_irq >= irq_rt->nr_rt_entries ||
11904             hlist_empty(&irq_rt->map[guest_irq])) {
11905                 pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
11906                              guest_irq, irq_rt->nr_rt_entries);
11907                 goto out;
11908         }
11909
11910         hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
11911                 if (e->type != KVM_IRQ_ROUTING_MSI)
11912                         continue;
11913                 /*
11914                  * VT-d PI cannot post multicast/broadcast interrupts to
11915                  * a vCPU, so we keep using interrupt remapping for this
11916                  * kind of interrupt.
11917                  *
11918                  * For lowest-priority interrupts, we only support those
11919                  * with a single CPU as the destination, e.g. the user
11920                  * configures the interrupt via /proc/irq or uses
11921                  * irqbalance to make it single-CPU.
11922                  *
11923                  * Full lowest-priority interrupt support will be added later.
11924                  */
11925
11926                 kvm_set_msi_irq(kvm, e, &irq);
11927                 if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
11928                         /*
11929                          * Make sure the IRTE is in remapped mode if
11930                          * we don't handle it in posted mode.
11931                          */
11932                         ret = irq_set_vcpu_affinity(host_irq, NULL);
11933                         if (ret < 0) {
11934                                 printk(KERN_INFO
11935                                    "failed to go back to remapped mode, irq: %u\n",
11936                                    host_irq);
11937                                 goto out;
11938                         }
11939
11940                         continue;
11941                 }
11942
11943                 vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
11944                 vcpu_info.vector = irq.vector;
11945
11946                 trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi,
11947                                 vcpu_info.vector, vcpu_info.pi_desc_addr, set);
11948
11949                 if (set)
11950                         ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
11951                 else
11952                         ret = irq_set_vcpu_affinity(host_irq, NULL);
11953
11954                 if (ret < 0) {
11955                         printk(KERN_INFO "%s: failed to update PI IRTE\n",
11956                                         __func__);
11957                         goto out;
11958                 }
11959         }
11960
11961         ret = 0;
11962 out:
11963         srcu_read_unlock(&kvm->irq_srcu, idx);
11964         return ret;
11965 }
11966
11967 static void vmx_setup_mce(struct kvm_vcpu *vcpu)
11968 {
11969         if (vcpu->arch.mcg_cap & MCG_LMCE_P)
11970                 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
11971                         FEATURE_CONTROL_LMCE;
11972         else
11973                 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
11974                         ~FEATURE_CONTROL_LMCE;
11975 }
11976
11977 static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
11978 {
11979         /* we need a nested vmexit to enter SMM, postpone if run is pending */
11980         if (to_vmx(vcpu)->nested.nested_run_pending)
11981                 return 0;
11982         return 1;
11983 }
11984
11985 static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
11986 {
11987         struct vcpu_vmx *vmx = to_vmx(vcpu);
11988
11989         vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
11990         if (vmx->nested.smm.guest_mode)
11991                 nested_vmx_vmexit(vcpu, -1, 0, 0);
11992
11993         vmx->nested.smm.vmxon = vmx->nested.vmxon;
11994         vmx->nested.vmxon = false;
11995         return 0;
11996 }
11997
11998 static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
11999 {
12000         struct vcpu_vmx *vmx = to_vmx(vcpu);
12001         int ret;
12002
12003         if (vmx->nested.smm.vmxon) {
12004                 vmx->nested.vmxon = true;
12005                 vmx->nested.smm.vmxon = false;
12006         }
12007
12008         if (vmx->nested.smm.guest_mode) {
12009                 vcpu->arch.hflags &= ~HF_SMM_MASK;
12010                 ret = enter_vmx_non_root_mode(vcpu, false);
12011                 vcpu->arch.hflags |= HF_SMM_MASK;
12012                 if (ret)
12013                         return ret;
12014
12015                 vmx->nested.smm.guest_mode = false;
12016         }
12017         return 0;
12018 }
12019
12020 static int enable_smi_window(struct kvm_vcpu *vcpu)
12021 {
12022         return 0;
12023 }
12024
12025 static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
12026         .cpu_has_kvm_support = cpu_has_kvm_support,
12027         .disabled_by_bios = vmx_disabled_by_bios,
12028         .hardware_setup = hardware_setup,
12029         .hardware_unsetup = hardware_unsetup,
12030         .check_processor_compatibility = vmx_check_processor_compat,
12031         .hardware_enable = hardware_enable,
12032         .hardware_disable = hardware_disable,
12033         .cpu_has_accelerated_tpr = report_flexpriority,
12034         .cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase,
12035
12036         .vcpu_create = vmx_create_vcpu,
12037         .vcpu_free = vmx_free_vcpu,
12038         .vcpu_reset = vmx_vcpu_reset,
12039
12040         .prepare_guest_switch = vmx_save_host_state,
12041         .vcpu_load = vmx_vcpu_load,
12042         .vcpu_put = vmx_vcpu_put,
12043
12044         .update_bp_intercept = update_exception_bitmap,
12045         .get_msr = vmx_get_msr,
12046         .set_msr = vmx_set_msr,
12047         .get_segment_base = vmx_get_segment_base,
12048         .get_segment = vmx_get_segment,
12049         .set_segment = vmx_set_segment,
12050         .get_cpl = vmx_get_cpl,
12051         .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
12052         .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
12053         .decache_cr3 = vmx_decache_cr3,
12054         .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
12055         .set_cr0 = vmx_set_cr0,
12056         .set_cr3 = vmx_set_cr3,
12057         .set_cr4 = vmx_set_cr4,
12058         .set_efer = vmx_set_efer,
12059         .get_idt = vmx_get_idt,
12060         .set_idt = vmx_set_idt,
12061         .get_gdt = vmx_get_gdt,
12062         .set_gdt = vmx_set_gdt,
12063         .get_dr6 = vmx_get_dr6,
12064         .set_dr6 = vmx_set_dr6,
12065         .set_dr7 = vmx_set_dr7,
12066         .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
12067         .cache_reg = vmx_cache_reg,
12068         .get_rflags = vmx_get_rflags,
12069         .set_rflags = vmx_set_rflags,
12070
12071         .tlb_flush = vmx_flush_tlb,
12072
12073         .run = vmx_vcpu_run,
12074         .handle_exit = vmx_handle_exit,
12075         .skip_emulated_instruction = skip_emulated_instruction,
12076         .set_interrupt_shadow = vmx_set_interrupt_shadow,
12077         .get_interrupt_shadow = vmx_get_interrupt_shadow,
12078         .patch_hypercall = vmx_patch_hypercall,
12079         .set_irq = vmx_inject_irq,
12080         .set_nmi = vmx_inject_nmi,
12081         .queue_exception = vmx_queue_exception,
12082         .cancel_injection = vmx_cancel_injection,
12083         .interrupt_allowed = vmx_interrupt_allowed,
12084         .nmi_allowed = vmx_nmi_allowed,
12085         .get_nmi_mask = vmx_get_nmi_mask,
12086         .set_nmi_mask = vmx_set_nmi_mask,
12087         .enable_nmi_window = enable_nmi_window,
12088         .enable_irq_window = enable_irq_window,
12089         .update_cr8_intercept = update_cr8_intercept,
12090         .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
12091         .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
12092         .get_enable_apicv = vmx_get_enable_apicv,
12093         .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
12094         .load_eoi_exitmap = vmx_load_eoi_exitmap,
12095         .apicv_post_state_restore = vmx_apicv_post_state_restore,
12096         .hwapic_irr_update = vmx_hwapic_irr_update,
12097         .hwapic_isr_update = vmx_hwapic_isr_update,
12098         .sync_pir_to_irr = vmx_sync_pir_to_irr,
12099         .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
12100
12101         .set_tss_addr = vmx_set_tss_addr,
12102         .get_tdp_level = get_ept_level,
12103         .get_mt_mask = vmx_get_mt_mask,
12104
12105         .get_exit_info = vmx_get_exit_info,
12106
12107         .get_lpage_level = vmx_get_lpage_level,
12108
12109         .cpuid_update = vmx_cpuid_update,
12110
12111         .rdtscp_supported = vmx_rdtscp_supported,
12112         .invpcid_supported = vmx_invpcid_supported,
12113
12114         .set_supported_cpuid = vmx_set_supported_cpuid,
12115
12116         .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
12117
12118         .write_tsc_offset = vmx_write_tsc_offset,
12119
12120         .set_tdp_cr3 = vmx_set_cr3,
12121
12122         .check_intercept = vmx_check_intercept,
12123         .handle_external_intr = vmx_handle_external_intr,
12124         .mpx_supported = vmx_mpx_supported,
12125         .xsaves_supported = vmx_xsaves_supported,
12126
12127         .check_nested_events = vmx_check_nested_events,
12128
12129         .sched_in = vmx_sched_in,
12130
12131         .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
12132         .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
12133         .flush_log_dirty = vmx_flush_log_dirty,
12134         .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
12135         .write_log_dirty = vmx_write_pml_buffer,
12136
12137         .pre_block = vmx_pre_block,
12138         .post_block = vmx_post_block,
12139
12140         .pmu_ops = &intel_pmu_ops,
12141
12142         .update_pi_irte = vmx_update_pi_irte,
12143
12144 #ifdef CONFIG_X86_64
12145         .set_hv_timer = vmx_set_hv_timer,
12146         .cancel_hv_timer = vmx_cancel_hv_timer,
12147 #endif
12148
12149         .setup_mce = vmx_setup_mce,
12150
12151         .smi_allowed = vmx_smi_allowed,
12152         .pre_enter_smm = vmx_pre_enter_smm,
12153         .pre_leave_smm = vmx_pre_leave_smm,
12154         .enable_smi_window = enable_smi_window,
12155 };
12156
12157 static int __init vmx_init(void)
12158 {
12159         int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
12160                      __alignof__(struct vcpu_vmx), THIS_MODULE);
12161         if (r)
12162                 return r;
12163
12164 #ifdef CONFIG_KEXEC_CORE
12165         rcu_assign_pointer(crash_vmclear_loaded_vmcss,
12166                            crash_vmclear_local_loaded_vmcss);
12167 #endif
12168
12169         return 0;
12170 }
12171
12172 static void __exit vmx_exit(void)
12173 {
12174 #ifdef CONFIG_KEXEC_CORE
12175         RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
12176         synchronize_rcu();
12177 #endif
12178
12179         kvm_exit();
12180 }
12181
12182 module_init(vmx_init)
12183 module_exit(vmx_exit)