linux-block.git: arch/x86/kvm/vmx/vmx.c (commit d2d6e1b6c7882779c657adc062c83ae049445bc9)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Kernel-based Virtual Machine driver for Linux
4  *
5  * This module enables machines with Intel VT-x extensions to run virtual
6  * machines without emulation or binary translation.
7  *
8  * Copyright (C) 2006 Qumranet, Inc.
9  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10  *
11  * Authors:
12  *   Avi Kivity   <avi@qumranet.com>
13  *   Yaniv Kamay  <yaniv@qumranet.com>
14  */
15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16
17 #include <linux/highmem.h>
18 #include <linux/hrtimer.h>
19 #include <linux/kernel.h>
20 #include <linux/kvm_host.h>
21 #include <linux/module.h>
22 #include <linux/moduleparam.h>
23 #include <linux/mod_devicetable.h>
24 #include <linux/mm.h>
25 #include <linux/objtool.h>
26 #include <linux/sched.h>
27 #include <linux/sched/smt.h>
28 #include <linux/slab.h>
29 #include <linux/tboot.h>
30 #include <linux/trace_events.h>
31 #include <linux/entry-kvm.h>
32
33 #include <asm/apic.h>
34 #include <asm/asm.h>
35 #include <asm/cpu.h>
36 #include <asm/cpu_device_id.h>
37 #include <asm/debugreg.h>
38 #include <asm/desc.h>
39 #include <asm/fpu/api.h>
40 #include <asm/fpu/xstate.h>
41 #include <asm/idtentry.h>
42 #include <asm/io.h>
43 #include <asm/irq_remapping.h>
44 #include <asm/kexec.h>
45 #include <asm/perf_event.h>
46 #include <asm/mmu_context.h>
47 #include <asm/mshyperv.h>
48 #include <asm/mwait.h>
49 #include <asm/spec-ctrl.h>
50 #include <asm/virtext.h>
51 #include <asm/vmx.h>
52
53 #include "capabilities.h"
54 #include "cpuid.h"
55 #include "hyperv.h"
56 #include "kvm_onhyperv.h"
57 #include "irq.h"
58 #include "kvm_cache_regs.h"
59 #include "lapic.h"
60 #include "mmu.h"
61 #include "nested.h"
62 #include "pmu.h"
63 #include "sgx.h"
64 #include "trace.h"
65 #include "vmcs.h"
66 #include "vmcs12.h"
67 #include "vmx.h"
68 #include "x86.h"
69 #include "smm.h"
70
71 MODULE_AUTHOR("Qumranet");
72 MODULE_LICENSE("GPL");
73
74 #ifdef MODULE
75 static const struct x86_cpu_id vmx_cpu_id[] = {
76         X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
77         {}
78 };
79 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
80 #endif
81
82 bool __read_mostly enable_vpid = 1;
83 module_param_named(vpid, enable_vpid, bool, 0444);
84
85 static bool __read_mostly enable_vnmi = 1;
86 module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
87
88 bool __read_mostly flexpriority_enabled = 1;
89 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
90
91 bool __read_mostly enable_ept = 1;
92 module_param_named(ept, enable_ept, bool, S_IRUGO);
93
94 bool __read_mostly enable_unrestricted_guest = 1;
95 module_param_named(unrestricted_guest,
96                         enable_unrestricted_guest, bool, S_IRUGO);
97
98 bool __read_mostly enable_ept_ad_bits = 1;
99 module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
100
101 static bool __read_mostly emulate_invalid_guest_state = true;
102 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
103
104 static bool __read_mostly fasteoi = 1;
105 module_param(fasteoi, bool, S_IRUGO);
106
107 module_param(enable_apicv, bool, S_IRUGO);
108
109 bool __read_mostly enable_ipiv = true;
110 module_param(enable_ipiv, bool, 0444);
111
112 /*
113  * If nested=1, nested virtualization is supported, i.e., guests may use
114  * VMX and be a hypervisor for their own guests. If nested=0, guests may not
115  * use VMX instructions.
116  */
117 static bool __read_mostly nested = 1;
118 module_param(nested, bool, S_IRUGO);
119
120 bool __read_mostly enable_pml = 1;
121 module_param_named(pml, enable_pml, bool, S_IRUGO);
122
123 static bool __read_mostly error_on_inconsistent_vmcs_config = true;
124 module_param(error_on_inconsistent_vmcs_config, bool, 0444);
125
126 static bool __read_mostly dump_invalid_vmcs = 0;
127 module_param(dump_invalid_vmcs, bool, 0644);
128
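/*
 * These knobs are module parameters of kvm_intel: they are normally set at
 * load time (e.g. "modprobe kvm_intel ept=0 nested=1") or, for a built-in
 * module, on the kernel command line (e.g. "kvm-intel.ept=0").
 */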
129 #define MSR_BITMAP_MODE_X2APIC          1
130 #define MSR_BITMAP_MODE_X2APIC_APICV    2
131
132 #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
133
134 /* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
135 static int __read_mostly cpu_preemption_timer_multi;
136 static bool __read_mostly enable_preemption_timer = 1;
137 #ifdef CONFIG_X86_64
138 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
139 #endif
140
141 extern bool __read_mostly allow_smaller_maxphyaddr;
142 module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
143
144 #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
145 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
146 #define KVM_VM_CR0_ALWAYS_ON                            \
147         (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
148
149 #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
150 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
151 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
152
153 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
154
155 #define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
156         RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
157         RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
158         RTIT_STATUS_BYTECNT))
159
160 /*
161  * List of MSRs that can be directly passed to the guest.
162  * In addition to these, x2apic and PT MSRs are handled specially.
163  */
164 static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
165         MSR_IA32_SPEC_CTRL,
166         MSR_IA32_PRED_CMD,
167         MSR_IA32_TSC,
168 #ifdef CONFIG_X86_64
169         MSR_FS_BASE,
170         MSR_GS_BASE,
171         MSR_KERNEL_GS_BASE,
172         MSR_IA32_XFD,
173         MSR_IA32_XFD_ERR,
174 #endif
175         MSR_IA32_SYSENTER_CS,
176         MSR_IA32_SYSENTER_ESP,
177         MSR_IA32_SYSENTER_EIP,
178         MSR_CORE_C1_RES,
179         MSR_CORE_C3_RESIDENCY,
180         MSR_CORE_C6_RESIDENCY,
181         MSR_CORE_C7_RESIDENCY,
182 };
183
184 /*
185  * These two parameters configure the controls for Pause-Loop Exiting:
186  * ple_gap:    upper bound on the amount of time between two successive
187  *             executions of PAUSE in a loop; also indicates whether PLE is
188  *             enabled. Tests show this time is usually less than 128 cycles.
189  * ple_window: upper bound on the amount of time a guest is allowed to execute
190  *             in a PAUSE loop. Tests indicate that most spinlocks are held for
191  *             less than 2^12 cycles.
192  * Time is measured on a counter that runs at the same rate as the TSC;
193  * refer to SDM volume 3b, sections 21.6.13 & 22.1.3.
194  */
195 static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
196 module_param(ple_gap, uint, 0444);
197
198 static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
199 module_param(ple_window, uint, 0444);
200
201 /* Default doubles per-vcpu window every exit. */
202 static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
203 module_param(ple_window_grow, uint, 0444);
204
205 /* Default resets per-vcpu window every exit to ple_window. */
206 static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
207 module_param(ple_window_shrink, uint, 0444);
208
209 /* Default is to compute the maximum so we can never overflow. */
210 static unsigned int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
211 module_param(ple_window_max, uint, 0444);
212
213 /* Default is SYSTEM mode (0); set to 1 for host-guest mode. */
214 int __read_mostly pt_mode = PT_MODE_SYSTEM;
215 module_param(pt_mode, int, S_IRUGO);
216
217 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
218 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
219 static DEFINE_MUTEX(vmx_l1d_flush_mutex);
220
221 /* Storage for pre module init parameter parsing */
222 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
223
224 static const struct {
225         const char *option;
226         bool for_parse;
227 } vmentry_l1d_param[] = {
228         [VMENTER_L1D_FLUSH_AUTO]         = {"auto", true},
229         [VMENTER_L1D_FLUSH_NEVER]        = {"never", true},
230         [VMENTER_L1D_FLUSH_COND]         = {"cond", true},
231         [VMENTER_L1D_FLUSH_ALWAYS]       = {"always", true},
232         [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
233         [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
234 };
235
236 #define L1D_CACHE_ORDER 4
237 static void *vmx_l1d_flush_pages;
238
239 /* Control for disabling CPU Fill buffer clear */
240 static bool __read_mostly vmx_fb_clear_ctrl_available;
241
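/*
 * Resolve the effective L1D flush mode for the L1TF mitigation: honour the
 * explicit module parameter or derive the mode from the global
 * l1tf_mitigation setting, allocate the software flush pages if the CPU
 * lacks X86_FEATURE_FLUSH_L1D, and flip the static keys that gate the
 * flush on VM-entry.
 */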
242 static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
243 {
244         struct page *page;
245         unsigned int i;
246
247         if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
248                 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
249                 return 0;
250         }
251
252         if (!enable_ept) {
253                 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
254                 return 0;
255         }
256
257         if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
258                 u64 msr;
259
260                 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
261                 if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
262                         l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
263                         return 0;
264                 }
265         }
266
267         /* If set to auto, use the default l1tf mitigation method */
268         if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
269                 switch (l1tf_mitigation) {
270                 case L1TF_MITIGATION_OFF:
271                         l1tf = VMENTER_L1D_FLUSH_NEVER;
272                         break;
273                 case L1TF_MITIGATION_FLUSH_NOWARN:
274                 case L1TF_MITIGATION_FLUSH:
275                 case L1TF_MITIGATION_FLUSH_NOSMT:
276                         l1tf = VMENTER_L1D_FLUSH_COND;
277                         break;
278                 case L1TF_MITIGATION_FULL:
279                 case L1TF_MITIGATION_FULL_FORCE:
280                         l1tf = VMENTER_L1D_FLUSH_ALWAYS;
281                         break;
282                 }
283         } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
284                 l1tf = VMENTER_L1D_FLUSH_ALWAYS;
285         }
286
287         if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
288             !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
289                 /*
290                  * This allocation for vmx_l1d_flush_pages is not tied to a VM
291                  * lifetime and so should not be charged to a memcg.
292                  */
293                 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
294                 if (!page)
295                         return -ENOMEM;
296                 vmx_l1d_flush_pages = page_address(page);
297
298                 /*
299                  * Initialize each page with a different pattern in
300                  * order to protect against KSM in the nested
301                  * virtualization case.
302                  */
303                 for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
304                         memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
305                                PAGE_SIZE);
306                 }
307         }
308
309         l1tf_vmx_mitigation = l1tf;
310
311         if (l1tf != VMENTER_L1D_FLUSH_NEVER)
312                 static_branch_enable(&vmx_l1d_should_flush);
313         else
314                 static_branch_disable(&vmx_l1d_should_flush);
315
316         if (l1tf == VMENTER_L1D_FLUSH_COND)
317                 static_branch_enable(&vmx_l1d_flush_cond);
318         else
319                 static_branch_disable(&vmx_l1d_flush_cond);
320         return 0;
321 }
322
323 static int vmentry_l1d_flush_parse(const char *s)
324 {
325         unsigned int i;
326
327         if (s) {
328                 for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
329                         if (vmentry_l1d_param[i].for_parse &&
330                             sysfs_streq(s, vmentry_l1d_param[i].option))
331                                 return i;
332                 }
333         }
334         return -EINVAL;
335 }
336
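/*
 * "set" handler for the vmentry_l1d_flush module parameter.  Before
 * vmx_init() has run the value is only recorded; afterwards the new mode
 * is applied immediately under vmx_l1d_flush_mutex.
 */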
337 static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
338 {
339         int l1tf, ret;
340
341         l1tf = vmentry_l1d_flush_parse(s);
342         if (l1tf < 0)
343                 return l1tf;
344
345         if (!boot_cpu_has(X86_BUG_L1TF))
346                 return 0;
347
348         /*
349          * Has vmx_init() run already? If not, this is the pre-init
350          * parameter parsing. In that case just store the value and let
351          * vmx_init() do the proper setup after enable_ept has been
352          * established.
353          */
354         if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
355                 vmentry_l1d_flush_param = l1tf;
356                 return 0;
357         }
358
359         mutex_lock(&vmx_l1d_flush_mutex);
360         ret = vmx_setup_l1d_flush(l1tf);
361         mutex_unlock(&vmx_l1d_flush_mutex);
362         return ret;
363 }
364
365 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
366 {
367         if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
368                 return sprintf(s, "???\n");
369
370         return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
371 }
372
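/*
 * FB_CLEAR_DIS in MSR_IA32_MCU_OPT_CTRL can only be used when the CPU
 * advertises the control in ARCH_CAPABILITIES and the host itself is not
 * affected by MDS or TAA.
 */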
373 static void vmx_setup_fb_clear_ctrl(void)
374 {
375         u64 msr;
376
377         if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) &&
378             !boot_cpu_has_bug(X86_BUG_MDS) &&
379             !boot_cpu_has_bug(X86_BUG_TAA)) {
380                 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
381                 if (msr & ARCH_CAP_FB_CLEAR_CTRL)
382                         vmx_fb_clear_ctrl_available = true;
383         }
384 }
385
386 static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
387 {
388         u64 msr;
389
390         if (!vmx->disable_fb_clear)
391                 return;
392
393         msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL);
394         msr |= FB_CLEAR_DIS;
395         native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
396         /* Cache the MSR value to avoid reading it later */
397         vmx->msr_ia32_mcu_opt_ctrl = msr;
398 }
399
400 static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
401 {
402         if (!vmx->disable_fb_clear)
403                 return;
404
405         vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
406         native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
407 }
408
409 static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
410 {
411         vmx->disable_fb_clear = vmx_fb_clear_ctrl_available;
412
413         /*
414          * If the guest will not execute VERW, there is no need to set FB_CLEAR_DIS
415          * at VM-Entry. Skip the MSR read/write when the guest has no reason to
416          * execute VERW.
417          */
418         if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
419            ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
420             (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
421             (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
422             (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
423             (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
424                 vmx->disable_fb_clear = false;
425 }
426
427 static const struct kernel_param_ops vmentry_l1d_flush_ops = {
428         .set = vmentry_l1d_flush_set,
429         .get = vmentry_l1d_flush_get,
430 };
431 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
432
433 static u32 vmx_segment_access_rights(struct kvm_segment *var);
434
435 void vmx_vmexit(void);
436
437 #define vmx_insn_failed(fmt...)         \
438 do {                                    \
439         WARN_ONCE(1, fmt);              \
440         pr_warn_ratelimited(fmt);       \
441 } while (0)
442
443 void vmread_error(unsigned long field, bool fault)
444 {
445         if (fault)
446                 kvm_spurious_fault();
447         else
448                 vmx_insn_failed("vmread failed: field=%lx\n", field);
449 }
450
451 noinline void vmwrite_error(unsigned long field, unsigned long value)
452 {
453         vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n",
454                         field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
455 }
456
457 noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
458 {
459         vmx_insn_failed("vmclear failed: %p/%llx err=%u\n",
460                         vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
461 }
462
463 noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
464 {
465         vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n",
466                         vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
467 }
468
469 noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
470 {
471         vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
472                         ext, vpid, gva);
473 }
474
475 noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
476 {
477         vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
478                         ext, eptp, gpa);
479 }
480
481 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
482 DEFINE_PER_CPU(struct vmcs *, current_vmcs);
483 /*
484  * We maintain a per-CPU linked list of the VMCSs loaded on that CPU. This is needed
485  * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
486  */
487 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
488
489 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
490 static DEFINE_SPINLOCK(vmx_vpid_lock);
491
492 struct vmcs_config vmcs_config __ro_after_init;
493 struct vmx_capability vmx_capability __ro_after_init;
494
495 #define VMX_SEGMENT_FIELD(seg)                                  \
496         [VCPU_SREG_##seg] = {                                   \
497                 .selector = GUEST_##seg##_SELECTOR,             \
498                 .base = GUEST_##seg##_BASE,                     \
499                 .limit = GUEST_##seg##_LIMIT,                   \
500                 .ar_bytes = GUEST_##seg##_AR_BYTES,             \
501         }
502
503 static const struct kvm_vmx_segment_field {
504         unsigned selector;
505         unsigned base;
506         unsigned limit;
507         unsigned ar_bytes;
508 } kvm_vmx_segment_fields[] = {
509         VMX_SEGMENT_FIELD(CS),
510         VMX_SEGMENT_FIELD(DS),
511         VMX_SEGMENT_FIELD(ES),
512         VMX_SEGMENT_FIELD(FS),
513         VMX_SEGMENT_FIELD(GS),
514         VMX_SEGMENT_FIELD(SS),
515         VMX_SEGMENT_FIELD(TR),
516         VMX_SEGMENT_FIELD(LDTR),
517 };
518
519 static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
520 {
521         vmx->segment_cache.bitmask = 0;
522 }
523
524 static unsigned long host_idt_base;
525
526 #if IS_ENABLED(CONFIG_HYPERV)
527 static struct kvm_x86_ops vmx_x86_ops __initdata;
528
529 static bool __read_mostly enlightened_vmcs = true;
530 module_param(enlightened_vmcs, bool, 0444);
531
532 static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
533 {
534         struct hv_enlightened_vmcs *evmcs;
535         struct hv_partition_assist_pg **p_hv_pa_pg =
536                         &to_kvm_hv(vcpu->kvm)->hv_pa_pg;
537         /*
538          * Synthetic VM-Exit is not enabled in the current code, so all
539          * eVMCSes in a single VM share the same assist page.
540          */
541         if (!*p_hv_pa_pg)
542                 *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
543
544         if (!*p_hv_pa_pg)
545                 return -ENOMEM;
546
547         evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
548
549         evmcs->partition_assist_page =
550                 __pa(*p_hv_pa_pg);
551         evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
552         evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
553
554         return 0;
555 }
556
557 static __init void hv_init_evmcs(void)
558 {
559         int cpu;
560
561         if (!enlightened_vmcs)
562                 return;
563
564         /*
565          * Enlightened VMCS usage must be recommended by Hyper-V, and the host needs
566          * to support eVMCS v1 or above.
567          */
568         if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
569             (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
570              KVM_EVMCS_VERSION) {
571
572                 /* Check that we have assist pages on all online CPUs */
573                 for_each_online_cpu(cpu) {
574                         if (!hv_get_vp_assist_page(cpu)) {
575                                 enlightened_vmcs = false;
576                                 break;
577                         }
578                 }
579
580                 if (enlightened_vmcs) {
581                         pr_info("Using Hyper-V Enlightened VMCS\n");
582                         static_branch_enable(&enable_evmcs);
583                 }
584
585                 if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
586                         vmx_x86_ops.enable_l2_tlb_flush
587                                 = hv_enable_l2_tlb_flush;
588
589         } else {
590                 enlightened_vmcs = false;
591         }
592 }
593
594 static void hv_reset_evmcs(void)
595 {
596         struct hv_vp_assist_page *vp_ap;
597
598         if (!static_branch_unlikely(&enable_evmcs))
599                 return;
600
601         /*
602          * KVM should enable eVMCS if and only if all CPUs have a VP assist
603          * page, and should reject CPU onlining if eVMCS is enabled but the
604          * CPU doesn't have a VP assist page allocated.
605          */
606         vp_ap = hv_get_vp_assist_page(smp_processor_id());
607         if (WARN_ON_ONCE(!vp_ap))
608                 return;
609
610         /*
611          * Reset everything to support using non-enlightened VMCS access later
612          * (e.g. when we reload the module with enlightened_vmcs=0)
613          */
614         vp_ap->nested_control.features.directhypercall = 0;
615         vp_ap->current_nested_vmcs = 0;
616         vp_ap->enlighten_vmentry = 0;
617 }
618
619 #else /* IS_ENABLED(CONFIG_HYPERV) */
620 static void hv_init_evmcs(void) {}
621 static void hv_reset_evmcs(void) {}
622 #endif /* IS_ENABLED(CONFIG_HYPERV) */
623
624 /*
625  * Comment format: document - errata name - stepping - processor name.
626  * Taken from
627  * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
628  */
629 static u32 vmx_preemption_cpu_tfms[] = {
630 /* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
631 0x000206E6,
632 /* 323056.pdf - AAX65  - C2 - Xeon L3406 */
633 /* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
634 /* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
635 0x00020652,
636 /* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
637 0x00020655,
638 /* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
639 /* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
640 /*
641  * 320767.pdf - AAP86  - B1 -
642  * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
643  */
644 0x000106E5,
645 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */
646 0x000106A0,
647 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */
648 0x000106A1,
649 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
650 0x000106A4,
651  /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
652  /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
653  /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
654 0x000106A5,
655  /* Xeon E3-1220 V2 */
656 0x000306A8,
657 };
658
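/*
 * Check the current CPU's family/model/stepping signature against the
 * erratum list above; such parts have a broken VMX preemption timer.
 */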
659 static inline bool cpu_has_broken_vmx_preemption_timer(void)
660 {
661         u32 eax = cpuid_eax(0x00000001), i;
662
663         /* Clear the reserved bits */
664         eax &= ~(0x3U << 14 | 0xfU << 28);
665         for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
666                 if (eax == vmx_preemption_cpu_tfms[i])
667                         return true;
668
669         return false;
670 }
671
672 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
673 {
674         return flexpriority_enabled && lapic_in_kernel(vcpu);
675 }
676
677 static int possible_passthrough_msr_slot(u32 msr)
678 {
679         u32 i;
680
681         for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++)
682                 if (vmx_possible_passthrough_msrs[i] == msr)
683                         return i;
684
685         return -ENOENT;
686 }
687
688 static bool is_valid_passthrough_msr(u32 msr)
689 {
690         bool r;
691
692         switch (msr) {
693         case 0x800 ... 0x8ff:
694                 /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
695                 return true;
696         case MSR_IA32_RTIT_STATUS:
697         case MSR_IA32_RTIT_OUTPUT_BASE:
698         case MSR_IA32_RTIT_OUTPUT_MASK:
699         case MSR_IA32_RTIT_CR3_MATCH:
700         case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
701                 /* PT MSRs. These are handled in pt_update_intercept_for_msr() */
702         case MSR_LBR_SELECT:
703         case MSR_LBR_TOS:
704         case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31:
705         case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31:
706         case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:
707         case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
708         case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
709                 /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
710                 return true;
711         }
712
713         r = possible_passthrough_msr_slot(msr) != -ENOENT;
714
715         WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
716
717         return r;
718 }
719
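/*
 * Return the guest user-return MSR entry for @msr, or NULL if KVM doesn't
 * track it as a user-return MSR.
 */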
720 struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
721 {
722         int i;
723
724         i = kvm_find_user_return_msr(msr);
725         if (i >= 0)
726                 return &vmx->guest_uret_msrs[i];
727         return NULL;
728 }
729
730 static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
731                                   struct vmx_uret_msr *msr, u64 data)
732 {
733         unsigned int slot = msr - vmx->guest_uret_msrs;
734         int ret = 0;
735
736         if (msr->load_into_hardware) {
737                 preempt_disable();
738                 ret = kvm_set_user_return_msr(slot, data, msr->mask);
739                 preempt_enable();
740         }
741         if (!ret)
742                 msr->data = data;
743         return ret;
744 }
745
746 #ifdef CONFIG_KEXEC_CORE
747 static void crash_vmclear_local_loaded_vmcss(void)
748 {
749         int cpu = raw_smp_processor_id();
750         struct loaded_vmcs *v;
751
752         list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
753                             loaded_vmcss_on_cpu_link)
754                 vmcs_clear(v->vmcs);
755 }
756 #endif /* CONFIG_KEXEC_CORE */
757
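/*
 * Runs on the CPU that last loaded the VMCS (via IPI from
 * loaded_vmcs_clear()): VMCLEAR it and unlink it from that CPU's
 * loaded_vmcss_on_cpu list.
 */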
758 static void __loaded_vmcs_clear(void *arg)
759 {
760         struct loaded_vmcs *loaded_vmcs = arg;
761         int cpu = raw_smp_processor_id();
762
763         if (loaded_vmcs->cpu != cpu)
764                 return; /* vcpu migration can race with cpu offline */
765         if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
766                 per_cpu(current_vmcs, cpu) = NULL;
767
768         vmcs_clear(loaded_vmcs->vmcs);
769         if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
770                 vmcs_clear(loaded_vmcs->shadow_vmcs);
771
772         list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
773
774         /*
775          * Ensure all writes to loaded_vmcs, including deleting it from its
776          * current percpu list, complete before setting loaded_vmcs->cpu to
777          * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
778          * and add loaded_vmcs to its percpu list before it's deleted from this
779          * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
780          */
781         smp_wmb();
782
783         loaded_vmcs->cpu = -1;
784         loaded_vmcs->launched = 0;
785 }
786
787 void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
788 {
789         int cpu = loaded_vmcs->cpu;
790
791         if (cpu != -1)
792                 smp_call_function_single(cpu,
793                          __loaded_vmcs_clear, loaded_vmcs, 1);
794 }
795
796 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
797                                        unsigned field)
798 {
799         bool ret;
800         u32 mask = 1 << (seg * SEG_FIELD_NR + field);
801
802         if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
803                 kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
804                 vmx->segment_cache.bitmask = 0;
805         }
806         ret = vmx->segment_cache.bitmask & mask;
807         vmx->segment_cache.bitmask |= mask;
808         return ret;
809 }
810
811 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
812 {
813         u16 *p = &vmx->segment_cache.seg[seg].selector;
814
815         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
816                 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
817         return *p;
818 }
819
820 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
821 {
822         ulong *p = &vmx->segment_cache.seg[seg].base;
823
824         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
825                 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
826         return *p;
827 }
828
829 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
830 {
831         u32 *p = &vmx->segment_cache.seg[seg].limit;
832
833         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
834                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
835         return *p;
836 }
837
838 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
839 {
840         u32 *p = &vmx->segment_cache.seg[seg].ar;
841
842         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
843                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
844         return *p;
845 }
846
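/*
 * Recompute which exceptions KVM must intercept for this vCPU, based on
 * guest debug settings, real-mode emulation, #PF interception needs, the
 * VMware backdoor and, when running L2, the exception bitmap requested by L1.
 */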
847 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
848 {
849         u32 eb;
850
851         eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
852              (1u << DB_VECTOR) | (1u << AC_VECTOR);
853         /*
854          * Guest access to VMware backdoor ports could legitimately
855          * trigger #GP because of the TSS I/O permission bitmap.
856          * We intercept those #GPs and allow access to the ports anyway,
857          * as VMware does.
858          */
859         if (enable_vmware_backdoor)
860                 eb |= (1u << GP_VECTOR);
861         if ((vcpu->guest_debug &
862              (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
863             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
864                 eb |= 1u << BP_VECTOR;
865         if (to_vmx(vcpu)->rmode.vm86_active)
866                 eb = ~0;
867         if (!vmx_need_pf_intercept(vcpu))
868                 eb &= ~(1u << PF_VECTOR);
869
870         /* When we are running a nested L2 guest and L1 specified for it a
871          * certain exception bitmap, we must trap the same exceptions and pass
872          * them to L1. When running L2, we will only handle the exceptions
873          * specified above if L1 did not want them.
874          */
875         if (is_guest_mode(vcpu))
876                 eb |= get_vmcs12(vcpu)->exception_bitmap;
877         else {
878                 int mask = 0, match = 0;
879
880                 if (enable_ept && (eb & (1u << PF_VECTOR))) {
881                         /*
882                          * If EPT is enabled, #PF is currently only intercepted
883                          * if MAXPHYADDR is smaller on the guest than on the
884                          * host.  In that case we only care about present,
885                          * non-reserved faults.  For vmcs02, however, PFEC_MASK
886                          * and PFEC_MATCH are set in prepare_vmcs02_rare.
887                          */
888                         mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK;
889                         match = PFERR_PRESENT_MASK;
890                 }
891                 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
892                 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
893         }
894
895         /*
896          * Disabling xfd interception indicates that dynamic xfeatures
897          * might be used in the guest. Always trap #NM in this case
898          * to save guest xfd_err in a timely fashion.
899          */
900         if (vcpu->arch.xfd_no_write_intercept)
901                 eb |= (1u << NM_VECTOR);
902
903         vmcs_write32(EXCEPTION_BITMAP, eb);
904 }
905
906 /*
907  * Check whether writes to the given MSR are intercepted by the currently loaded MSR bitmap.
908  */
909 static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
910 {
911         if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
912                 return true;
913
914         return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr);
915 }
916
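/* Compute the VMX_RUN_* flags consumed by the assembly VM-entry path. */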
917 unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
918 {
919         unsigned int flags = 0;
920
921         if (vmx->loaded_vmcs->launched)
922                 flags |= VMX_RUN_VMRESUME;
923
924         /*
925          * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free
926          * to change it directly without causing a vmexit.  In that case read
927          * it after vmexit and store it in vmx->spec_ctrl.
928          */
929         if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
930                 flags |= VMX_RUN_SAVE_SPEC_CTRL;
931
932         return flags;
933 }
934
935 static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
936                 unsigned long entry, unsigned long exit)
937 {
938         vm_entry_controls_clearbit(vmx, entry);
939         vm_exit_controls_clearbit(vmx, exit);
940 }
941
942 int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
943 {
944         unsigned int i;
945
946         for (i = 0; i < m->nr; ++i) {
947                 if (m->val[i].index == msr)
948                         return i;
949         }
950         return -ENOENT;
951 }
952
953 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
954 {
955         int i;
956         struct msr_autoload *m = &vmx->msr_autoload;
957
958         switch (msr) {
959         case MSR_EFER:
960                 if (cpu_has_load_ia32_efer()) {
961                         clear_atomic_switch_msr_special(vmx,
962                                         VM_ENTRY_LOAD_IA32_EFER,
963                                         VM_EXIT_LOAD_IA32_EFER);
964                         return;
965                 }
966                 break;
967         case MSR_CORE_PERF_GLOBAL_CTRL:
968                 if (cpu_has_load_perf_global_ctrl()) {
969                         clear_atomic_switch_msr_special(vmx,
970                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
971                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
972                         return;
973                 }
974                 break;
975         }
976         i = vmx_find_loadstore_msr_slot(&m->guest, msr);
977         if (i < 0)
978                 goto skip_guest;
979         --m->guest.nr;
980         m->guest.val[i] = m->guest.val[m->guest.nr];
981         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
982
983 skip_guest:
984         i = vmx_find_loadstore_msr_slot(&m->host, msr);
985         if (i < 0)
986                 return;
987
988         --m->host.nr;
989         m->host.val[i] = m->host.val[m->host.nr];
990         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
991 }
992
993 static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
994                 unsigned long entry, unsigned long exit,
995                 unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
996                 u64 guest_val, u64 host_val)
997 {
998         vmcs_write64(guest_val_vmcs, guest_val);
999         if (host_val_vmcs != HOST_IA32_EFER)
1000                 vmcs_write64(host_val_vmcs, host_val);
1001         vm_entry_controls_setbit(vmx, entry);
1002         vm_exit_controls_setbit(vmx, exit);
1003 }
1004
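/*
 * Add or update @msr in the VM-entry (and, unless @entry_only, VM-exit)
 * MSR autoload lists.  EFER and PERF_GLOBAL_CTRL use the dedicated VMCS
 * controls instead when the CPU supports them.
 */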
1005 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
1006                                   u64 guest_val, u64 host_val, bool entry_only)
1007 {
1008         int i, j = 0;
1009         struct msr_autoload *m = &vmx->msr_autoload;
1010
1011         switch (msr) {
1012         case MSR_EFER:
1013                 if (cpu_has_load_ia32_efer()) {
1014                         add_atomic_switch_msr_special(vmx,
1015                                         VM_ENTRY_LOAD_IA32_EFER,
1016                                         VM_EXIT_LOAD_IA32_EFER,
1017                                         GUEST_IA32_EFER,
1018                                         HOST_IA32_EFER,
1019                                         guest_val, host_val);
1020                         return;
1021                 }
1022                 break;
1023         case MSR_CORE_PERF_GLOBAL_CTRL:
1024                 if (cpu_has_load_perf_global_ctrl()) {
1025                         add_atomic_switch_msr_special(vmx,
1026                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1027                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
1028                                         GUEST_IA32_PERF_GLOBAL_CTRL,
1029                                         HOST_IA32_PERF_GLOBAL_CTRL,
1030                                         guest_val, host_val);
1031                         return;
1032                 }
1033                 break;
1034         case MSR_IA32_PEBS_ENABLE:
1035                 /* PEBS needs a quiescent period after being disabled (to write
1036                  * a record).  Disabling PEBS through VMX MSR swapping doesn't
1037                  * provide that period, so a CPU could write the host's record into
1038                  * guest memory.
1039                  */
1040                 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
1041         }
1042
1043         i = vmx_find_loadstore_msr_slot(&m->guest, msr);
1044         if (!entry_only)
1045                 j = vmx_find_loadstore_msr_slot(&m->host, msr);
1046
1047         if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
1048             (j < 0 &&  m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
1049                 printk_once(KERN_WARNING "Not enough msr switch entries. "
1050                                 "Can't add msr %x\n", msr);
1051                 return;
1052         }
1053         if (i < 0) {
1054                 i = m->guest.nr++;
1055                 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
1056         }
1057         m->guest.val[i].index = msr;
1058         m->guest.val[i].value = guest_val;
1059
1060         if (entry_only)
1061                 return;
1062
1063         if (j < 0) {
1064                 j = m->host.nr++;
1065                 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
1066         }
1067         m->host.val[j].index = msr;
1068         m->host.val[j].value = host_val;
1069 }
1070
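/*
 * Decide how EFER is switched across VM-entry/VM-exit: either atomically
 * via the dedicated controls/autoload lists, or lazily through the
 * user-return MSR mechanism.  Returns true if the user-return slot for
 * EFER should be loaded into hardware.
 */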
1071 static bool update_transition_efer(struct vcpu_vmx *vmx)
1072 {
1073         u64 guest_efer = vmx->vcpu.arch.efer;
1074         u64 ignore_bits = 0;
1075         int i;
1076
1077         /* Shadow paging assumes NX to be available.  */
1078         if (!enable_ept)
1079                 guest_efer |= EFER_NX;
1080
1081         /*
1082          * LMA and LME handled by hardware; SCE meaningless outside long mode.
1083          */
1084         ignore_bits |= EFER_SCE;
1085 #ifdef CONFIG_X86_64
1086         ignore_bits |= EFER_LMA | EFER_LME;
1087         /* SCE is meaningful only in long mode on Intel */
1088         if (guest_efer & EFER_LMA)
1089                 ignore_bits &= ~(u64)EFER_SCE;
1090 #endif
1091
1092         /*
1093          * On EPT, we can't emulate NX, so we must switch EFER atomically.
1094          * On CPUs that support "load IA32_EFER", always switch EFER
1095          * atomically, since it's faster than switching it manually.
1096          */
1097         if (cpu_has_load_ia32_efer() ||
1098             (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
1099                 if (!(guest_efer & EFER_LMA))
1100                         guest_efer &= ~EFER_LME;
1101                 if (guest_efer != host_efer)
1102                         add_atomic_switch_msr(vmx, MSR_EFER,
1103                                               guest_efer, host_efer, false);
1104                 else
1105                         clear_atomic_switch_msr(vmx, MSR_EFER);
1106                 return false;
1107         }
1108
1109         i = kvm_find_user_return_msr(MSR_EFER);
1110         if (i < 0)
1111                 return false;
1112
1113         clear_atomic_switch_msr(vmx, MSR_EFER);
1114
1115         guest_efer &= ~ignore_bits;
1116         guest_efer |= host_efer & ignore_bits;
1117
1118         vmx->guest_uret_msrs[i].data = guest_efer;
1119         vmx->guest_uret_msrs[i].mask = ~ignore_bits;
1120
1121         return true;
1122 }
1123
1124 #ifdef CONFIG_X86_32
1125 /*
1126  * On 32-bit kernels, VM exits still load the FS and GS bases from the
1127  * VMCS rather than the segment table.  KVM uses this helper to figure
1128  * out the current bases to poke them into the VMCS before entry.
1129  */
1130 static unsigned long segment_base(u16 selector)
1131 {
1132         struct desc_struct *table;
1133         unsigned long v;
1134
1135         if (!(selector & ~SEGMENT_RPL_MASK))
1136                 return 0;
1137
1138         table = get_current_gdt_ro();
1139
1140         if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
1141                 u16 ldt_selector = kvm_read_ldt();
1142
1143                 if (!(ldt_selector & ~SEGMENT_RPL_MASK))
1144                         return 0;
1145
1146                 table = (struct desc_struct *)segment_base(ldt_selector);
1147         }
1148         v = get_desc_base(&table[selector >> 3]);
1149         return v;
1150 }
1151 #endif
1152
1153 static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
1154 {
1155         return vmx_pt_mode_is_host_guest() &&
1156                !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
1157 }
1158
1159 static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
1160 {
1161         /* The base must be 128-byte aligned and a legal physical address. */
1162         return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
1163 }
1164
1165 static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
1166 {
1167         u32 i;
1168
1169         wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
1170         wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1171         wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1172         wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1173         for (i = 0; i < addr_range; i++) {
1174                 wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1175                 wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1176         }
1177 }
1178
1179 static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
1180 {
1181         u32 i;
1182
1183         rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
1184         rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1185         rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1186         rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1187         for (i = 0; i < addr_range; i++) {
1188                 rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1189                 rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1190         }
1191 }
1192
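/*
 * In host-guest PT mode, stash the host's RTIT_CTL and, if the guest has
 * tracing enabled, swap the remaining PT MSRs from host to guest values
 * before VM-entry.
 */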
1193 static void pt_guest_enter(struct vcpu_vmx *vmx)
1194 {
1195         if (vmx_pt_mode_is_system())
1196                 return;
1197
1198         /*
1199          * GUEST_IA32_RTIT_CTL is already set in the VMCS.
1200          * Save host state before VM entry.
1201          */
1202         rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1203         if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1204                 wrmsrl(MSR_IA32_RTIT_CTL, 0);
1205                 pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
1206                 pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
1207         }
1208 }
1209
1210 static void pt_guest_exit(struct vcpu_vmx *vmx)
1211 {
1212         if (vmx_pt_mode_is_system())
1213                 return;
1214
1215         if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1216                 pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
1217                 pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
1218         }
1219
1220         /*
1221          * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
1222          * i.e. RTIT_CTL is always cleared on VM-Exit.  Restore it if necessary.
1223          */
1224         if (vmx->pt_desc.host.ctl)
1225                 wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1226 }
1227
1228 void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
1229                         unsigned long fs_base, unsigned long gs_base)
1230 {
1231         if (unlikely(fs_sel != host->fs_sel)) {
1232                 if (!(fs_sel & 7))
1233                         vmcs_write16(HOST_FS_SELECTOR, fs_sel);
1234                 else
1235                         vmcs_write16(HOST_FS_SELECTOR, 0);
1236                 host->fs_sel = fs_sel;
1237         }
1238         if (unlikely(gs_sel != host->gs_sel)) {
1239                 if (!(gs_sel & 7))
1240                         vmcs_write16(HOST_GS_SELECTOR, gs_sel);
1241                 else
1242                         vmcs_write16(HOST_GS_SELECTOR, 0);
1243                 host->gs_sel = gs_sel;
1244         }
1245         if (unlikely(fs_base != host->fs_base)) {
1246                 vmcs_writel(HOST_FS_BASE, fs_base);
1247                 host->fs_base = fs_base;
1248         }
1249         if (unlikely(gs_base != host->gs_base)) {
1250                 vmcs_writel(HOST_GS_BASE, gs_base);
1251                 host->gs_base = gs_base;
1252         }
1253 }
1254
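/*
 * Lazily load guest MSR/segment state ahead of VM-entry: write the
 * user-return MSRs if needed, sync shadow VMCS state for nested, and save
 * the host's FS/GS selectors and bases into the VMCS host-state area the
 * first time around (subsequent calls are a nop while guest state stays
 * loaded).
 */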
1255 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1256 {
1257         struct vcpu_vmx *vmx = to_vmx(vcpu);
1258         struct vmcs_host_state *host_state;
1259 #ifdef CONFIG_X86_64
1260         int cpu = raw_smp_processor_id();
1261 #endif
1262         unsigned long fs_base, gs_base;
1263         u16 fs_sel, gs_sel;
1264         int i;
1265
1266         vmx->req_immediate_exit = false;
1267
1268         /*
1269          * Note that guest MSRs to be saved/restored can also be changed
1270          * when guest state is loaded. This happens when the guest transitions
1271          * to/from long mode by setting MSR_EFER.LMA.
1272          */
1273         if (!vmx->guest_uret_msrs_loaded) {
1274                 vmx->guest_uret_msrs_loaded = true;
1275                 for (i = 0; i < kvm_nr_uret_msrs; ++i) {
1276                         if (!vmx->guest_uret_msrs[i].load_into_hardware)
1277                                 continue;
1278
1279                         kvm_set_user_return_msr(i,
1280                                                 vmx->guest_uret_msrs[i].data,
1281                                                 vmx->guest_uret_msrs[i].mask);
1282                 }
1283         }
1284
1285         if (vmx->nested.need_vmcs12_to_shadow_sync)
1286                 nested_sync_vmcs12_to_shadow(vcpu);
1287
1288         if (vmx->guest_state_loaded)
1289                 return;
1290
1291         host_state = &vmx->loaded_vmcs->host_state;
1292
1293         /*
1294          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
1295          * allow segment selectors with cpl > 0 or ti == 1.
1296          */
1297         host_state->ldt_sel = kvm_read_ldt();
1298
1299 #ifdef CONFIG_X86_64
1300         savesegment(ds, host_state->ds_sel);
1301         savesegment(es, host_state->es_sel);
1302
1303         gs_base = cpu_kernelmode_gs_base(cpu);
1304         if (likely(is_64bit_mm(current->mm))) {
1305                 current_save_fsgs();
1306                 fs_sel = current->thread.fsindex;
1307                 gs_sel = current->thread.gsindex;
1308                 fs_base = current->thread.fsbase;
1309                 vmx->msr_host_kernel_gs_base = current->thread.gsbase;
1310         } else {
1311                 savesegment(fs, fs_sel);
1312                 savesegment(gs, gs_sel);
1313                 fs_base = read_msr(MSR_FS_BASE);
1314                 vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
1315         }
1316
1317         wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1318 #else
1319         savesegment(fs, fs_sel);
1320         savesegment(gs, gs_sel);
1321         fs_base = segment_base(fs_sel);
1322         gs_base = segment_base(gs_sel);
1323 #endif
1324
1325         vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
1326         vmx->guest_state_loaded = true;
1327 }
1328
1329 static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
1330 {
1331         struct vmcs_host_state *host_state;
1332
1333         if (!vmx->guest_state_loaded)
1334                 return;
1335
1336         host_state = &vmx->loaded_vmcs->host_state;
1337
1338         ++vmx->vcpu.stat.host_state_reload;
1339
1340 #ifdef CONFIG_X86_64
1341         rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1342 #endif
1343         if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
1344                 kvm_load_ldt(host_state->ldt_sel);
1345 #ifdef CONFIG_X86_64
1346                 load_gs_index(host_state->gs_sel);
1347 #else
1348                 loadsegment(gs, host_state->gs_sel);
1349 #endif
1350         }
1351         if (host_state->fs_sel & 7)
1352                 loadsegment(fs, host_state->fs_sel);
1353 #ifdef CONFIG_X86_64
1354         if (unlikely(host_state->ds_sel | host_state->es_sel)) {
1355                 loadsegment(ds, host_state->ds_sel);
1356                 loadsegment(es, host_state->es_sel);
1357         }
1358 #endif
1359         invalidate_tss_limit();
1360 #ifdef CONFIG_X86_64
1361         wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
1362 #endif
1363         load_fixmap_gdt(raw_smp_processor_id());
1364         vmx->guest_state_loaded = false;
1365         vmx->guest_uret_msrs_loaded = false;
1366 }
1367
1368 #ifdef CONFIG_X86_64
1369 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
1370 {
1371         preempt_disable();
1372         if (vmx->guest_state_loaded)
1373                 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1374         preempt_enable();
1375         return vmx->msr_guest_kernel_gs_base;
1376 }
1377
1378 static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
1379 {
1380         preempt_disable();
1381         if (vmx->guest_state_loaded)
1382                 wrmsrl(MSR_KERNEL_GS_BASE, data);
1383         preempt_enable();
1384         vmx->msr_guest_kernel_gs_base = data;
1385 }
1386 #endif
1387
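/*
 * Make the vCPU's VMCS current on @cpu: clear it on the CPU it was last
 * loaded on if necessary, link it into this CPU's loaded-VMCS list, load
 * it with VMPTRLD, and refresh the per-CPU host state (TR, GDT,
 * SYSENTER_ESP) when the VMCS moves to a new pCPU.
 */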
1388 void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
1389                         struct loaded_vmcs *buddy)
1390 {
1391         struct vcpu_vmx *vmx = to_vmx(vcpu);
1392         bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
1393         struct vmcs *prev;
1394
1395         if (!already_loaded) {
1396                 loaded_vmcs_clear(vmx->loaded_vmcs);
1397                 local_irq_disable();
1398
1399                 /*
1400                  * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
1401                  * this cpu's percpu list, otherwise it may not yet be deleted
1402                  * from its previous cpu's percpu list.  Pairs with the
1403                  * smp_wmb() in __loaded_vmcs_clear().
1404                  */
1405                 smp_rmb();
1406
1407                 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
1408                          &per_cpu(loaded_vmcss_on_cpu, cpu));
1409                 local_irq_enable();
1410         }
1411
1412         prev = per_cpu(current_vmcs, cpu);
1413         if (prev != vmx->loaded_vmcs->vmcs) {
1414                 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
1415                 vmcs_load(vmx->loaded_vmcs->vmcs);
1416
1417                 /*
1418                  * No indirect branch prediction barrier needed when switching
1419                  * the active VMCS within a vCPU, unless IBRS is advertised to
1420                  * the vCPU.  To minimize the number of IBPBs executed, KVM
1421                  * performs IBPB on nested VM-Exit (a single nested transition
1422                  * may switch the active VMCS multiple times).
1423                  */
1424                 if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
1425                         indirect_branch_prediction_barrier();
1426         }
1427
1428         if (!already_loaded) {
1429                 void *gdt = get_current_gdt_ro();
1430
1431                 /*
1432                  * Flush all EPTP/VPID contexts, the new pCPU may have stale
1433                  * TLB entries from its previous association with the vCPU.
1434                  */
1435                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1436
1437                 /*
1438                  * Linux uses per-cpu TSS and GDT, so set these when switching
1439                  * processors.  See 22.2.4.
1440                  */
1441                 vmcs_writel(HOST_TR_BASE,
1442                             (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
1443                 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */
1444
1445                 if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
1446                         /* 22.2.3 */
1447                         vmcs_writel(HOST_IA32_SYSENTER_ESP,
1448                                     (unsigned long)(cpu_entry_stack(cpu) + 1));
1449                 }
1450
1451                 vmx->loaded_vmcs->cpu = cpu;
1452         }
1453 }
1454
1455 /*
1456  * Switches to the specified vcpu, until a matching vcpu_put(), but assumes
1457  * the vcpu mutex is already taken.
1458  */
1459 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1460 {
1461         struct vcpu_vmx *vmx = to_vmx(vcpu);
1462
1463         vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
1464
1465         vmx_vcpu_pi_load(vcpu, cpu);
1466
1467         vmx->host_debugctlmsr = get_debugctlmsr();
1468 }
1469
1470 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
1471 {
1472         vmx_vcpu_pi_put(vcpu);
1473
1474         vmx_prepare_switch_to_host(to_vmx(vcpu));
1475 }
1476
1477 bool vmx_emulation_required(struct kvm_vcpu *vcpu)
1478 {
1479         return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
1480 }
1481
1482 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
1483 {
1484         struct vcpu_vmx *vmx = to_vmx(vcpu);
1485         unsigned long rflags, save_rflags;
1486
1487         if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
1488                 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
1489                 rflags = vmcs_readl(GUEST_RFLAGS);
1490                 if (vmx->rmode.vm86_active) {
1491                         rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1492                         save_rflags = vmx->rmode.save_rflags;
1493                         rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1494                 }
1495                 vmx->rflags = rflags;
1496         }
1497         return vmx->rflags;
1498 }
1499
1500 void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1501 {
1502         struct vcpu_vmx *vmx = to_vmx(vcpu);
1503         unsigned long old_rflags;
1504
1505         if (is_unrestricted_guest(vcpu)) {
1506                 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
1507                 vmx->rflags = rflags;
1508                 vmcs_writel(GUEST_RFLAGS, rflags);
1509                 return;
1510         }
1511
1512         old_rflags = vmx_get_rflags(vcpu);
1513         vmx->rflags = rflags;
1514         if (vmx->rmode.vm86_active) {
1515                 vmx->rmode.save_rflags = rflags;
1516                 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1517         }
1518         vmcs_writel(GUEST_RFLAGS, rflags);
1519
1520         if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
1521                 vmx->emulation_required = vmx_emulation_required(vcpu);
1522 }
1523
1524 static bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
1525 {
1526         return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
1527 }
1528
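/*
 * Translate the VMCS interruptibility-state blocking bits (blocking by STI
 * and blocking by MOV SS) into KVM's generic interrupt-shadow flags.
 */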
1529 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
1530 {
1531         u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1532         int ret = 0;
1533
1534         if (interruptibility & GUEST_INTR_STATE_STI)
1535                 ret |= KVM_X86_SHADOW_INT_STI;
1536         if (interruptibility & GUEST_INTR_STATE_MOV_SS)
1537                 ret |= KVM_X86_SHADOW_INT_MOV_SS;
1538
1539         return ret;
1540 }
1541
1542 void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
1543 {
1544         u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1545         u32 interruptibility = interruptibility_old;
1546
1547         interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
1548
1549         if (mask & KVM_X86_SHADOW_INT_MOV_SS)
1550                 interruptibility |= GUEST_INTR_STATE_MOV_SS;
1551         else if (mask & KVM_X86_SHADOW_INT_STI)
1552                 interruptibility |= GUEST_INTR_STATE_STI;
1553
1554         if (interruptibility != interruptibility_old)
1555                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
1556 }
1557
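/*
 * Validate a guest write to IA32_RTIT_CTL against the Intel PT capabilities
 * exposed to the vCPU.  Returns 1 if the write would #GP on hardware, 0 if
 * the value is acceptable.
 */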
1558 static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
1559 {
1560         struct vcpu_vmx *vmx = to_vmx(vcpu);
1561         unsigned long value;
1562
1563         /*
1564          * Any MSR write that attempts to change bits marked reserved will
1565          * cause a #GP fault.
1566          */
1567         if (data & vmx->pt_desc.ctl_bitmask)
1568                 return 1;
1569
1570         /*
1571          * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
1572          * result in a #GP unless the same write also clears TraceEn.
1573          */
1574         if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
1575                 ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
1576                 return 1;
1577
1578         /*
1579          * A WRMSR to IA32_RTIT_CTL that sets TraceEn but clears both ToPA
1580          * and FabricEn causes a #GP if
1581          * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0.
1582          */
1583         if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
1584                 !(data & RTIT_CTL_FABRIC_EN) &&
1585                 !intel_pt_validate_cap(vmx->pt_desc.caps,
1586                                         PT_CAP_single_range_output))
1587                 return 1;
1588
1589         /*
1590          * Check the MTCFreq, CycThresh and PSBFreq encodings; any MSR write
1591          * that uses an encoding marked reserved will cause a #GP fault.
1592          */
1593         value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
1594         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
1595                         !test_bit((data & RTIT_CTL_MTC_RANGE) >>
1596                         RTIT_CTL_MTC_RANGE_OFFSET, &value))
1597                 return 1;
1598         value = intel_pt_validate_cap(vmx->pt_desc.caps,
1599                                                 PT_CAP_cycle_thresholds);
1600         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1601                         !test_bit((data & RTIT_CTL_CYC_THRESH) >>
1602                         RTIT_CTL_CYC_THRESH_OFFSET, &value))
1603                 return 1;
1604         value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
1605         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1606                         !test_bit((data & RTIT_CTL_PSB_FREQ) >>
1607                         RTIT_CTL_PSB_FREQ_OFFSET, &value))
1608                 return 1;
1609
1610         /*
1611          * Programming ADDRx_CFG with a reserved encoding (>2), or for an
1612          * address range the CPU doesn't advertise, will cause a #GP fault.
1613          */
1614         value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
1615         if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2))
1616                 return 1;
1617         value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
1618         if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2))
1619                 return 1;
1620         value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
1621         if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2))
1622                 return 1;
1623         value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
1624         if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2))
1625                 return 1;
1626
1627         return 0;
1628 }
1629
1630 static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
1631                                         void *insn, int insn_len)
1632 {
1633         /*
1634          * Emulation of instructions in SGX enclaves is impossible as RIP does
1635          * not point at the failing instruction, and even if it did, the code
1636          * stream is inaccessible.  Inject #UD instead of exiting to userspace
1637          * so that guest userspace can't DoS the guest simply by triggering
1638          * emulation (enclaves are CPL3 only).
1639          */
1640         if (to_vmx(vcpu)->exit_reason.enclave_mode) {
1641                 kvm_queue_exception(vcpu, UD_VECTOR);
1642                 return false;
1643         }
1644         return true;
1645 }
1646
1647 static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
1648 {
1649         union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
1650         unsigned long rip, orig_rip;
1651         u32 instr_len;
1652
1653         /*
1654          * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
1655          * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
1656          * set when EPT misconfig occurs.  In practice, real hardware updates
1657          * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
1658          * (namely Hyper-V) don't set it due to it being undefined behavior,
1659          * i.e. we end up advancing IP with some random value.
1660          */
1661         if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
1662             exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
1663                 instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1664
1665                 /*
1666                  * Emulating an enclave's instructions isn't supported as KVM
1667                  * cannot access the enclave's memory or its true RIP, e.g. the
1668                  * vmcs.GUEST_RIP points at the exit point of the enclave, not
1669                  * the RIP that actually triggered the VM-Exit.  But, because
1670                  * most instructions that cause VM-Exit will #UD in an enclave,
1671                  * most instruction-based VM-Exits simply do not occur.
1672                  *
1673                  * There are a few exceptions, notably the debug instructions
1674                  * INT1ICEBRK and INT3, as they are allowed in debug enclaves
1675                  * and generate #DB/#BP as expected, which KVM might intercept.
1676                  * But again, the CPU does the dirty work and saves an instr
1677                  * length of zero so VMMs don't shoot themselves in the foot.
1678                  * WARN if KVM tries to skip a non-zero length instruction on
1679                  * a VM-Exit from an enclave.
1680                  */
1681                 if (!instr_len)
1682                         goto rip_updated;
1683
1684                 WARN_ONCE(exit_reason.enclave_mode,
1685                           "skipping instruction after SGX enclave VM-Exit");
1686
1687                 orig_rip = kvm_rip_read(vcpu);
1688                 rip = orig_rip + instr_len;
1689 #ifdef CONFIG_X86_64
1690                 /*
1691                  * We need to mask out the high 32 bits of RIP if not in 64-bit
1692                  * mode, but just finding out that we are in 64-bit mode is
1693                  * quite expensive.  Only do it if there was a carry.
1694                  */
1695                 if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
1696                         rip = (u32)rip;
1697 #endif
1698                 kvm_rip_write(vcpu, rip);
1699         } else {
1700                 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
1701                         return 0;
1702         }
1703
1704 rip_updated:
1705         /* skipping an emulated instruction also counts */
1706         vmx_set_interrupt_shadow(vcpu, 0);
1707
1708         return 1;
1709 }
1710
1711 /*
1712  * Recognizes a pending MTF VM-exit and records the nested state for later
1713  * delivery.
1714  */
1715 static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
1716 {
1717         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1718         struct vcpu_vmx *vmx = to_vmx(vcpu);
1719
1720         if (!is_guest_mode(vcpu))
1721                 return;
1722
1723         /*
1724          * Per the SDM, MTF takes priority over debug-trap exceptions besides
1725          * TSS T-bit traps and ICEBP (INT1).  KVM doesn't emulate T-bit traps
1726          * or ICEBP (in the emulator proper), and skipping of ICEBP after an
1727          * intercepted #DB deliberately avoids single-step #DB and MTF updates
1728          * as ICEBP is higher priority than both.  As instruction emulation is
1729          * completed at this point (i.e. KVM is at the instruction boundary),
1730          * any #DB exception pending delivery must be a debug-trap of lower
1731          * priority than MTF.  Record the pending MTF state to be delivered in
1732          * vmx_check_nested_events().
1733          */
1734         if (nested_cpu_has_mtf(vmcs12) &&
1735             (!vcpu->arch.exception.pending ||
1736              vcpu->arch.exception.vector == DB_VECTOR) &&
1737             (!vcpu->arch.exception_vmexit.pending ||
1738              vcpu->arch.exception_vmexit.vector == DB_VECTOR)) {
1739                 vmx->nested.mtf_pending = true;
1740                 kvm_make_request(KVM_REQ_EVENT, vcpu);
1741         } else {
1742                 vmx->nested.mtf_pending = false;
1743         }
1744 }
1745
1746 static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
1747 {
1748         vmx_update_emulated_instruction(vcpu);
1749         return skip_emulated_instruction(vcpu);
1750 }
1751
1752 static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1753 {
1754         /*
1755          * Ensure that we clear the HLT state in the VMCS.  We don't need to
1756          * explicitly skip the instruction because if the HLT state is set,
1757          * then the instruction is already executing and RIP has already been
1758          * advanced.
1759          */
1760         if (kvm_hlt_in_guest(vcpu->kvm) &&
1761                         vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1762                 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1763 }
1764
1765 static void vmx_inject_exception(struct kvm_vcpu *vcpu)
1766 {
1767         struct kvm_queued_exception *ex = &vcpu->arch.exception;
1768         u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
1769         struct vcpu_vmx *vmx = to_vmx(vcpu);
1770
1771         kvm_deliver_exception_payload(vcpu, ex);
1772
1773         if (ex->has_error_code) {
1774                 /*
1775                  * Despite the error code being architecturally defined as 32
1776                  * bits, and the VMCS field being 32 bits, Intel CPUs and thus
1777                  * VMX don't actually support setting bits 31:16.  Hardware
1778                  * will (should) never provide a bogus error code, but AMD CPUs
1779                  * do generate error codes with bits 31:16 set, and so KVM's
1780                  * ABI lets userspace shove in arbitrary 32-bit values.  Drop
1781                  * the upper bits to avoid VM-Fail; losing information that
1782                  * doesn't really exist is preferable to killing the VM.
1783                  */
1784                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code);
1785                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
1786         }
1787
1788         if (vmx->rmode.vm86_active) {
1789                 int inc_eip = 0;
1790                 if (kvm_exception_is_soft(ex->vector))
1791                         inc_eip = vcpu->arch.event_exit_inst_len;
1792                 kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip);
1793                 return;
1794         }
1795
1796         WARN_ON_ONCE(vmx->emulation_required);
1797
1798         if (kvm_exception_is_soft(ex->vector)) {
1799                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
1800                              vmx->vcpu.arch.event_exit_inst_len);
1801                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
1802         } else
1803                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
1804
1805         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1806
1807         vmx_clear_hlt(vcpu);
1808 }
1809
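/*
 * Flag whether the given user return MSR should be loaded into hardware on
 * the next VM-Enter; does nothing if the MSR isn't in the vCPU's user return
 * MSR list.
 */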
1810 static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
1811                                bool load_into_hardware)
1812 {
1813         struct vmx_uret_msr *uret_msr;
1814
1815         uret_msr = vmx_find_uret_msr(vmx, msr);
1816         if (!uret_msr)
1817                 return;
1818
1819         uret_msr->load_into_hardware = load_into_hardware;
1820 }
1821
1822 /*
1823  * Configure user return MSRs to automatically save, load, and restore MSRs
1824  * that need to be shoved into hardware when running the guest.  Note, omitting
1825  * an MSR here does _NOT_ mean it's not emulated, only that it will not be
1826  * loaded into hardware when running the guest.
1827  */
1828 static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
1829 {
1830 #ifdef CONFIG_X86_64
1831         bool load_syscall_msrs;
1832
1833         /*
1834          * The SYSCALL MSRs are only needed on long mode guests, and only
1835          * when EFER.SCE is set.
1836          */
1837         load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
1838                             (vmx->vcpu.arch.efer & EFER_SCE);
1839
1840         vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
1841         vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
1842         vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
1843 #endif
1844         vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
1845
1846         vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
1847                            guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
1848                            guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));
1849
1850         /*
1851          * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
1852          * kernel and old userspace.  If those guests run on a tsx=off host, do
1853          * allow guests to use TSX_CTRL, but don't change the value in hardware
1854          * so that TSX remains always disabled.
1855          */
1856         vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
1857
1858         /*
1859          * The set of MSRs to load may have changed, reload MSRs before the
1860          * next VM-Enter.
1861          */
1862         vmx->guest_uret_msrs_loaded = false;
1863 }
1864
1865 u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
1866 {
1867         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1868
1869         if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING))
1870                 return vmcs12->tsc_offset;
1871
1872         return 0;
1873 }
1874
1875 u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
1876 {
1877         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1878
1879         if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) &&
1880             nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
1881                 return vmcs12->tsc_multiplier;
1882
1883         return kvm_caps.default_tsc_scaling_ratio;
1884 }
1885
1886 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1887 {
1888         vmcs_write64(TSC_OFFSET, offset);
1889 }
1890
1891 static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
1892 {
1893         vmcs_write64(TSC_MULTIPLIER, multiplier);
1894 }
1895
1896 /*
1897  * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
1898  * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
1899  * all guests if the "nested" module option is off, and can also be disabled
1900  * for a single guest by disabling its VMX cpuid bit.
1901  */
1902 bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
1903 {
1904         return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
1905 }
1906
1907 /*
1908  * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of
1909  * guest CPUID.  Note, KVM allows userspace to set "VMX in SMX" to maintain
1910  * backwards compatibility even though KVM doesn't support emulating SMX.  And
1911  * because userspace set "VMX in SMX", the guest must also be allowed to set it,
1912  * e.g. if the MSR is left unlocked and the guest does a RMW operation.
1913  */
1914 #define KVM_SUPPORTED_FEATURE_CONTROL  (FEAT_CTL_LOCKED                  | \
1915                                         FEAT_CTL_VMX_ENABLED_INSIDE_SMX  | \
1916                                         FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \
1917                                         FEAT_CTL_SGX_LC_ENABLED          | \
1918                                         FEAT_CTL_SGX_ENABLED             | \
1919                                         FEAT_CTL_LMCE_ENABLED)
1920
1921 static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
1922                                                     struct msr_data *msr)
1923 {
1924         uint64_t valid_bits;
1925
1926         /*
1927          * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are
1928          * exposed to the guest.
1929          */
1930         WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits &
1931                      ~KVM_SUPPORTED_FEATURE_CONTROL);
1932
1933         if (!msr->host_initiated &&
1934             (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED))
1935                 return false;
1936
1937         if (msr->host_initiated)
1938                 valid_bits = KVM_SUPPORTED_FEATURE_CONTROL;
1939         else
1940                 valid_bits = vmx->msr_ia32_feature_control_valid_bits;
1941
1942         return !(msr->data & ~valid_bits);
1943 }
1944
1945 static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
1946 {
1947         switch (msr->index) {
1948         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
1949                 if (!nested)
1950                         return 1;
1951                 return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
1952         default:
1953                 return KVM_MSR_RET_INVALID;
1954         }
1955 }
1956
1957 /*
1958  * Reads an msr value (of 'msr_info->index') into 'msr_info->data'.
1959  * Returns 0 on success, non-0 otherwise.
1960  * Assumes vcpu_load() was already called.
1961  */
1962 static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1963 {
1964         struct vcpu_vmx *vmx = to_vmx(vcpu);
1965         struct vmx_uret_msr *msr;
1966         u32 index;
1967
1968         switch (msr_info->index) {
1969 #ifdef CONFIG_X86_64
1970         case MSR_FS_BASE:
1971                 msr_info->data = vmcs_readl(GUEST_FS_BASE);
1972                 break;
1973         case MSR_GS_BASE:
1974                 msr_info->data = vmcs_readl(GUEST_GS_BASE);
1975                 break;
1976         case MSR_KERNEL_GS_BASE:
1977                 msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
1978                 break;
1979 #endif
1980         case MSR_EFER:
1981                 return kvm_get_msr_common(vcpu, msr_info);
1982         case MSR_IA32_TSX_CTRL:
1983                 if (!msr_info->host_initiated &&
1984                     !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
1985                         return 1;
1986                 goto find_uret_msr;
1987         case MSR_IA32_UMWAIT_CONTROL:
1988                 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
1989                         return 1;
1990
1991                 msr_info->data = vmx->msr_ia32_umwait_control;
1992                 break;
1993         case MSR_IA32_SPEC_CTRL:
1994                 if (!msr_info->host_initiated &&
1995                     !guest_has_spec_ctrl_msr(vcpu))
1996                         return 1;
1997
1998                 msr_info->data = to_vmx(vcpu)->spec_ctrl;
1999                 break;
2000         case MSR_IA32_SYSENTER_CS:
2001                 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
2002                 break;
2003         case MSR_IA32_SYSENTER_EIP:
2004                 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
2005                 break;
2006         case MSR_IA32_SYSENTER_ESP:
2007                 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
2008                 break;
2009         case MSR_IA32_BNDCFGS:
2010                 if (!kvm_mpx_supported() ||
2011                     (!msr_info->host_initiated &&
2012                      !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
2013                         return 1;
2014                 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
2015                 break;
2016         case MSR_IA32_MCG_EXT_CTL:
2017                 if (!msr_info->host_initiated &&
2018                     !(vmx->msr_ia32_feature_control &
2019                       FEAT_CTL_LMCE_ENABLED))
2020                         return 1;
2021                 msr_info->data = vcpu->arch.mcg_ext_ctl;
2022                 break;
2023         case MSR_IA32_FEAT_CTL:
2024                 msr_info->data = vmx->msr_ia32_feature_control;
2025                 break;
2026         case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
2027                 if (!msr_info->host_initiated &&
2028                     !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
2029                         return 1;
2030                 msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
2031                         [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
2032                 break;
2033         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2034                 if (!nested_vmx_allowed(vcpu))
2035                         return 1;
2036                 if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
2037                                     &msr_info->data))
2038                         return 1;
2039                 /*
2040                  * Enlightened VMCS v1 doesn't have certain VMCS fields, but
2041                  * instead of just ignoring the missing features, different
2042                  * Hyper-V versions either try to use them and fail, or do
2043                  * some sanity checking and refuse to boot. Filter all
2044                  * unsupported features out.
2045                  */
2046                 if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu))
2047                         nested_evmcs_filter_control_msr(vcpu, msr_info->index,
2048                                                         &msr_info->data);
2049                 break;
2050         case MSR_IA32_RTIT_CTL:
2051                 if (!vmx_pt_mode_is_host_guest())
2052                         return 1;
2053                 msr_info->data = vmx->pt_desc.guest.ctl;
2054                 break;
2055         case MSR_IA32_RTIT_STATUS:
2056                 if (!vmx_pt_mode_is_host_guest())
2057                         return 1;
2058                 msr_info->data = vmx->pt_desc.guest.status;
2059                 break;
2060         case MSR_IA32_RTIT_CR3_MATCH:
2061                 if (!vmx_pt_mode_is_host_guest() ||
2062                         !intel_pt_validate_cap(vmx->pt_desc.caps,
2063                                                 PT_CAP_cr3_filtering))
2064                         return 1;
2065                 msr_info->data = vmx->pt_desc.guest.cr3_match;
2066                 break;
2067         case MSR_IA32_RTIT_OUTPUT_BASE:
2068                 if (!vmx_pt_mode_is_host_guest() ||
2069                         (!intel_pt_validate_cap(vmx->pt_desc.caps,
2070                                         PT_CAP_topa_output) &&
2071                          !intel_pt_validate_cap(vmx->pt_desc.caps,
2072                                         PT_CAP_single_range_output)))
2073                         return 1;
2074                 msr_info->data = vmx->pt_desc.guest.output_base;
2075                 break;
2076         case MSR_IA32_RTIT_OUTPUT_MASK:
2077                 if (!vmx_pt_mode_is_host_guest() ||
2078                         (!intel_pt_validate_cap(vmx->pt_desc.caps,
2079                                         PT_CAP_topa_output) &&
2080                          !intel_pt_validate_cap(vmx->pt_desc.caps,
2081                                         PT_CAP_single_range_output)))
2082                         return 1;
2083                 msr_info->data = vmx->pt_desc.guest.output_mask;
2084                 break;
2085         case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2086                 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2087                 if (!vmx_pt_mode_is_host_guest() ||
2088                     (index >= 2 * vmx->pt_desc.num_address_ranges))
2089                         return 1;
2090                 if (index % 2)
2091                         msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
2092                 else
2093                         msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
2094                 break;
2095         case MSR_IA32_DEBUGCTLMSR:
2096                 msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
2097                 break;
2098         default:
2099         find_uret_msr:
2100                 msr = vmx_find_uret_msr(vmx, msr_info->index);
2101                 if (msr) {
2102                         msr_info->data = msr->data;
2103                         break;
2104                 }
2105                 return kvm_get_msr_common(vcpu, msr_info);
2106         }
2107
2108         return 0;
2109 }
2110
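/*
 * SYSENTER_EIP/ESP are natural-width fields; if the guest doesn't have
 * long-mode support, truncate the written address to 32 bits, as a 32-bit
 * CPU would.
 */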
2111 static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
2112                                                     u64 data)
2113 {
2114 #ifdef CONFIG_X86_64
2115         if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
2116                 return (u32)data;
2117 #endif
2118         return (unsigned long)data;
2119 }
2120
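/*
 * Compute the DEBUGCTL bits that are currently writable: host-initiated
 * writes are checked only against hardware support, guest writes also
 * require the relevant feature to be exposed via CPUID/PMU configuration.
 */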
2121 static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
2122 {
2123         u64 debugctl = 0;
2124
2125         if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
2126             (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
2127                 debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
2128
2129         if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) &&
2130             (host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
2131                 debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
2132
2133         return debugctl;
2134 }
2135
2136 /*
2137  * Writes msr value into the appropriate "register".
2138  * Returns 0 on success, non-0 otherwise.
2139  * Assumes vcpu_load() was already called.
2140  */
2141 static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2142 {
2143         struct vcpu_vmx *vmx = to_vmx(vcpu);
2144         struct vmx_uret_msr *msr;
2145         int ret = 0;
2146         u32 msr_index = msr_info->index;
2147         u64 data = msr_info->data;
2148         u32 index;
2149
2150         switch (msr_index) {
2151         case MSR_EFER:
2152                 ret = kvm_set_msr_common(vcpu, msr_info);
2153                 break;
2154 #ifdef CONFIG_X86_64
2155         case MSR_FS_BASE:
2156                 vmx_segment_cache_clear(vmx);
2157                 vmcs_writel(GUEST_FS_BASE, data);
2158                 break;
2159         case MSR_GS_BASE:
2160                 vmx_segment_cache_clear(vmx);
2161                 vmcs_writel(GUEST_GS_BASE, data);
2162                 break;
2163         case MSR_KERNEL_GS_BASE:
2164                 vmx_write_guest_kernel_gs_base(vmx, data);
2165                 break;
2166         case MSR_IA32_XFD:
2167                 ret = kvm_set_msr_common(vcpu, msr_info);
2168                 /*
2169                  * Always intercepting WRMSR could incur non-negligible
2170                  * overhead given XFD might be changed frequently on guest
2171                  * context switches. Disable write interception upon the
2172                  * first write with a non-zero value (indicating potential
2173                  * usage of dynamic xfeatures). Also update the exception
2174                  * bitmap to trap #NM for proper virtualization of guest
2175                  * xfd_err.
2176                  */
2177                 if (!ret && data) {
2178                         vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
2179                                                       MSR_TYPE_RW);
2180                         vcpu->arch.xfd_no_write_intercept = true;
2181                         vmx_update_exception_bitmap(vcpu);
2182                 }
2183                 break;
2184 #endif
2185         case MSR_IA32_SYSENTER_CS:
2186                 if (is_guest_mode(vcpu))
2187                         get_vmcs12(vcpu)->guest_sysenter_cs = data;
2188                 vmcs_write32(GUEST_SYSENTER_CS, data);
2189                 break;
2190         case MSR_IA32_SYSENTER_EIP:
2191                 if (is_guest_mode(vcpu)) {
2192                         data = nested_vmx_truncate_sysenter_addr(vcpu, data);
2193                         get_vmcs12(vcpu)->guest_sysenter_eip = data;
2194                 }
2195                 vmcs_writel(GUEST_SYSENTER_EIP, data);
2196                 break;
2197         case MSR_IA32_SYSENTER_ESP:
2198                 if (is_guest_mode(vcpu)) {
2199                         data = nested_vmx_truncate_sysenter_addr(vcpu, data);
2200                         get_vmcs12(vcpu)->guest_sysenter_esp = data;
2201                 }
2202                 vmcs_writel(GUEST_SYSENTER_ESP, data);
2203                 break;
2204         case MSR_IA32_DEBUGCTLMSR: {
2205                 u64 invalid;
2206
2207                 invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
2208                 if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
2209                         kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
2210                         data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
2211                         invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
2212                 }
2213
2214                 if (invalid)
2215                         return 1;
2216
2217                 if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
2218                                                 VM_EXIT_SAVE_DEBUG_CONTROLS)
2219                         get_vmcs12(vcpu)->guest_ia32_debugctl = data;
2220
2221                 vmcs_write64(GUEST_IA32_DEBUGCTL, data);
2222                 if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
2223                     (data & DEBUGCTLMSR_LBR))
2224                         intel_pmu_create_guest_lbr_event(vcpu);
2225                 return 0;
2226         }
2227         case MSR_IA32_BNDCFGS:
2228                 if (!kvm_mpx_supported() ||
2229                     (!msr_info->host_initiated &&
2230                      !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
2231                         return 1;
2232                 if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
2233                     (data & MSR_IA32_BNDCFGS_RSVD))
2234                         return 1;
2235
2236                 if (is_guest_mode(vcpu) &&
2237                     ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) ||
2238                      (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS)))
2239                         get_vmcs12(vcpu)->guest_bndcfgs = data;
2240
2241                 vmcs_write64(GUEST_BNDCFGS, data);
2242                 break;
2243         case MSR_IA32_UMWAIT_CONTROL:
2244                 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
2245                         return 1;
2246
2247                 /* Reserved bit 1 and the upper 32 bits [63:32] must be zero. */
2248                 if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
2249                         return 1;
2250
2251                 vmx->msr_ia32_umwait_control = data;
2252                 break;
2253         case MSR_IA32_SPEC_CTRL:
2254                 if (!msr_info->host_initiated &&
2255                     !guest_has_spec_ctrl_msr(vcpu))
2256                         return 1;
2257
2258                 if (kvm_spec_ctrl_test_value(data))
2259                         return 1;
2260
2261                 vmx->spec_ctrl = data;
2262                 if (!data)
2263                         break;
2264
2265                 /*
2266                  * For non-nested:
2267                  * When it's written (to non-zero) for the first time, pass
2268                  * it through.
2269                  *
2270                  * For nested:
2271                  * The handling of the MSR bitmap for L2 guests is done in
2272                  * nested_vmx_prepare_msr_bitmap. We should not touch the
2273                  * vmcs02.msr_bitmap here since it gets completely overwritten
2274                  * in the merging. We update the vmcs01 here for L1 as well
2275                  * since it will end up touching the MSR anyway now.
2276                  */
2277                 vmx_disable_intercept_for_msr(vcpu,
2278                                               MSR_IA32_SPEC_CTRL,
2279                                               MSR_TYPE_RW);
2280                 break;
2281         case MSR_IA32_TSX_CTRL:
2282                 if (!msr_info->host_initiated &&
2283                     !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
2284                         return 1;
2285                 if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
2286                         return 1;
2287                 goto find_uret_msr;
2288         case MSR_IA32_PRED_CMD:
2289                 if (!msr_info->host_initiated &&
2290                     !guest_has_pred_cmd_msr(vcpu))
2291                         return 1;
2292
2293                 if (data & ~PRED_CMD_IBPB)
2294                         return 1;
2295                 if (!boot_cpu_has(X86_FEATURE_IBPB))
2296                         return 1;
2297                 if (!data)
2298                         break;
2299
2300                 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
2301
2302                 /*
2303                  * For non-nested:
2304                  * When it's written (to non-zero) for the first time, pass
2305                  * it through.
2306                  *
2307                  * For nested:
2308                  * The handling of the MSR bitmap for L2 guests is done in
2309                  * nested_vmx_prepare_msr_bitmap. We should not touch the
2310                  * vmcs02.msr_bitmap here since it gets completely overwritten
2311                  * in the merging.
2312                  */
2313                 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W);
2314                 break;
2315         case MSR_IA32_CR_PAT:
2316                 if (!kvm_pat_valid(data))
2317                         return 1;
2318
2319                 if (is_guest_mode(vcpu) &&
2320                     get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
2321                         get_vmcs12(vcpu)->guest_ia32_pat = data;
2322
2323                 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2324                         vmcs_write64(GUEST_IA32_PAT, data);
2325                         vcpu->arch.pat = data;
2326                         break;
2327                 }
2328                 ret = kvm_set_msr_common(vcpu, msr_info);
2329                 break;
2330         case MSR_IA32_MCG_EXT_CTL:
2331                 if ((!msr_info->host_initiated &&
2332                      !(to_vmx(vcpu)->msr_ia32_feature_control &
2333                        FEAT_CTL_LMCE_ENABLED)) ||
2334                     (data & ~MCG_EXT_CTL_LMCE_EN))
2335                         return 1;
2336                 vcpu->arch.mcg_ext_ctl = data;
2337                 break;
2338         case MSR_IA32_FEAT_CTL:
2339                 if (!is_vmx_feature_control_msr_valid(vmx, msr_info))
2340                         return 1;
2341
2342                 vmx->msr_ia32_feature_control = data;
2343                 if (msr_info->host_initiated && data == 0)
2344                         vmx_leave_nested(vcpu);
2345
2346                 /* SGX may be enabled/disabled by guest's firmware */
2347                 vmx_write_encls_bitmap(vcpu, NULL);
2348                 break;
2349         case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
2350                 /*
2351                  * On real hardware, the LE hash MSRs are writable before
2352                  * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
2353                  * at which point SGX related bits in IA32_FEATURE_CONTROL
2354                  * become writable.
2355                  *
2356                  * KVM does not emulate SGX activation for simplicity, so
2357                  * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
2358                  * is unlocked.  This is technically not architectural
2359                  * behavior, but it's close enough.
2360                  */
2361                 if (!msr_info->host_initiated &&
2362                     (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
2363                     ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
2364                     !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
2365                         return 1;
2366                 vmx->msr_ia32_sgxlepubkeyhash
2367                         [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
2368                 break;
2369         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2370                 if (!msr_info->host_initiated)
2371                         return 1; /* they are read-only */
2372                 if (!nested_vmx_allowed(vcpu))
2373                         return 1;
2374                 return vmx_set_vmx_msr(vcpu, msr_index, data);
2375         case MSR_IA32_RTIT_CTL:
2376                 if (!vmx_pt_mode_is_host_guest() ||
2377                         vmx_rtit_ctl_check(vcpu, data) ||
2378                         vmx->nested.vmxon)
2379                         return 1;
2380                 vmcs_write64(GUEST_IA32_RTIT_CTL, data);
2381                 vmx->pt_desc.guest.ctl = data;
2382                 pt_update_intercept_for_msr(vcpu);
2383                 break;
2384         case MSR_IA32_RTIT_STATUS:
2385                 if (!pt_can_write_msr(vmx))
2386                         return 1;
2387                 if (data & MSR_IA32_RTIT_STATUS_MASK)
2388                         return 1;
2389                 vmx->pt_desc.guest.status = data;
2390                 break;
2391         case MSR_IA32_RTIT_CR3_MATCH:
2392                 if (!pt_can_write_msr(vmx))
2393                         return 1;
2394                 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2395                                            PT_CAP_cr3_filtering))
2396                         return 1;
2397                 vmx->pt_desc.guest.cr3_match = data;
2398                 break;
2399         case MSR_IA32_RTIT_OUTPUT_BASE:
2400                 if (!pt_can_write_msr(vmx))
2401                         return 1;
2402                 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2403                                            PT_CAP_topa_output) &&
2404                     !intel_pt_validate_cap(vmx->pt_desc.caps,
2405                                            PT_CAP_single_range_output))
2406                         return 1;
2407                 if (!pt_output_base_valid(vcpu, data))
2408                         return 1;
2409                 vmx->pt_desc.guest.output_base = data;
2410                 break;
2411         case MSR_IA32_RTIT_OUTPUT_MASK:
2412                 if (!pt_can_write_msr(vmx))
2413                         return 1;
2414                 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2415                                            PT_CAP_topa_output) &&
2416                     !intel_pt_validate_cap(vmx->pt_desc.caps,
2417                                            PT_CAP_single_range_output))
2418                         return 1;
2419                 vmx->pt_desc.guest.output_mask = data;
2420                 break;
2421         case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2422                 if (!pt_can_write_msr(vmx))
2423                         return 1;
2424                 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2425                 if (index >= 2 * vmx->pt_desc.num_address_ranges)
2426                         return 1;
2427                 if (is_noncanonical_address(data, vcpu))
2428                         return 1;
2429                 if (index % 2)
2430                         vmx->pt_desc.guest.addr_b[index / 2] = data;
2431                 else
2432                         vmx->pt_desc.guest.addr_a[index / 2] = data;
2433                 break;
2434         case MSR_IA32_PERF_CAPABILITIES:
2435                 if (data && !vcpu_to_pmu(vcpu)->version)
2436                         return 1;
2437                 if (data & PMU_CAP_LBR_FMT) {
2438                         if ((data & PMU_CAP_LBR_FMT) !=
2439                             (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT))
2440                                 return 1;
2441                         if (!cpuid_model_is_consistent(vcpu))
2442                                 return 1;
2443                 }
2444                 if (data & PERF_CAP_PEBS_FORMAT) {
2445                         if ((data & PERF_CAP_PEBS_MASK) !=
2446                             (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
2447                                 return 1;
2448                         if (!guest_cpuid_has(vcpu, X86_FEATURE_DS))
2449                                 return 1;
2450                         if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64))
2451                                 return 1;
2452                         if (!cpuid_model_is_consistent(vcpu))
2453                                 return 1;
2454                 }
2455                 ret = kvm_set_msr_common(vcpu, msr_info);
2456                 break;
2457
2458         default:
2459         find_uret_msr:
2460                 msr = vmx_find_uret_msr(vmx, msr_index);
2461                 if (msr)
2462                         ret = vmx_set_guest_uret_msr(vmx, msr, data);
2463                 else
2464                         ret = kvm_set_msr_common(vcpu, msr_info);
2465         }
2466
2467         /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
2468         if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
2469                 vmx_update_fb_clear_dis(vcpu, vmx);
2470
2471         return ret;
2472 }
2473
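/*
 * Import the requested register from the VMCS into KVM's software cache and
 * mark it available.  For CR0/CR4, only the guest-owned bits are refreshed
 * from hardware; the remaining bits are already tracked by KVM.
 */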
2474 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
2475 {
2476         unsigned long guest_owned_bits;
2477
2478         kvm_register_mark_available(vcpu, reg);
2479
2480         switch (reg) {
2481         case VCPU_REGS_RSP:
2482                 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
2483                 break;
2484         case VCPU_REGS_RIP:
2485                 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
2486                 break;
2487         case VCPU_EXREG_PDPTR:
2488                 if (enable_ept)
2489                         ept_save_pdptrs(vcpu);
2490                 break;
2491         case VCPU_EXREG_CR0:
2492                 guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
2493
2494                 vcpu->arch.cr0 &= ~guest_owned_bits;
2495                 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
2496                 break;
2497         case VCPU_EXREG_CR3:
2498                 /*
2499                  * When intercepting CR3 loads, e.g. for shadow paging, KVM's
2500                  * CR3 is loaded into hardware, not the guest's CR3.
2501                  */
2502                 if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING))
2503                         vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
2504                 break;
2505         case VCPU_EXREG_CR4:
2506                 guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
2507
2508                 vcpu->arch.cr4 &= ~guest_owned_bits;
2509                 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
2510                 break;
2511         default:
2512                 KVM_BUG_ON(1, vcpu->kvm);
2513                 break;
2514         }
2515 }
2516
2517 /*
2518  * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
2519  * directly instead of going through cpu_has(), to ensure KVM is trapping
2520  * ENCLS whenever it's supported in hardware.  It does not matter whether
2521  * the host OS supports or has enabled SGX.
2522  */
2523 static bool cpu_has_sgx(void)
2524 {
2525         return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
2526 }
2527
2528 /*
2529  * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
2530  * can't be used due to errata where VM Exit may incorrectly clear
2531  * IA32_PERF_GLOBAL_CTRL[34:32]. Work around the errata by using the
2532  * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
2533  */
2534 static bool cpu_has_perf_global_ctrl_bug(void)
2535 {
2536         if (boot_cpu_data.x86 == 0x6) {
2537                 switch (boot_cpu_data.x86_model) {
2538                 case INTEL_FAM6_NEHALEM_EP:     /* AAK155 */
2539                 case INTEL_FAM6_NEHALEM:        /* AAP115 */
2540                 case INTEL_FAM6_WESTMERE:       /* AAT100 */
2541                 case INTEL_FAM6_WESTMERE_EP:    /* BC86,AAY89,BD102 */
2542                 case INTEL_FAM6_NEHALEM_EX:     /* BA97 */
2543                         return true;
2544                 default:
2545                         break;
2546                 }
2547         }
2548
2549         return false;
2550 }
2551
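/*
 * Adjust the desired controls against the VMX capability MSR: the high word
 * reports which bits are allowed to be 1, the low word which bits must be 1.
 * Fail if a required (ctl_min) control isn't supported by hardware.
 */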
2552 static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
2553 {
2554         u32 vmx_msr_low, vmx_msr_high;
2555         u32 ctl = ctl_min | ctl_opt;
2556
2557         rdmsr(msr, vmx_msr_low, vmx_msr_high);
2558
2559         ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
2560         ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
2561
2562         /* Ensure minimum (required) set of control bits are supported. */
2563         if (ctl_min & ~ctl)
2564                 return -EIO;
2565
2566         *result = ctl;
2567         return 0;
2568 }
2569
2570 static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
2571 {
2572         u64 allowed;
2573
2574         rdmsrl(msr, allowed);
2575
2576         return  ctl_opt & allowed;
2577 }
2578
2579 static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
2580                              struct vmx_capability *vmx_cap)
2581 {
2582         u32 vmx_msr_low, vmx_msr_high;
2583         u32 _pin_based_exec_control = 0;
2584         u32 _cpu_based_exec_control = 0;
2585         u32 _cpu_based_2nd_exec_control = 0;
2586         u64 _cpu_based_3rd_exec_control = 0;
2587         u32 _vmexit_control = 0;
2588         u32 _vmentry_control = 0;
2589         u64 misc_msr;
2590         int i;
2591
2592         /*
2593          * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory.
2594          * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always
2595          * intercepts writes to PAT and EFER, i.e. never enables those controls.
2596          */
2597         struct {
2598                 u32 entry_control;
2599                 u32 exit_control;
2600         } const vmcs_entry_exit_pairs[] = {
2601                 { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,  VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL },
2602                 { VM_ENTRY_LOAD_IA32_PAT,               VM_EXIT_LOAD_IA32_PAT },
2603                 { VM_ENTRY_LOAD_IA32_EFER,              VM_EXIT_LOAD_IA32_EFER },
2604                 { VM_ENTRY_LOAD_BNDCFGS,                VM_EXIT_CLEAR_BNDCFGS },
2605                 { VM_ENTRY_LOAD_IA32_RTIT_CTL,          VM_EXIT_CLEAR_IA32_RTIT_CTL },
2606         };
2607
2608         memset(vmcs_conf, 0, sizeof(*vmcs_conf));
2609
2610         if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL,
2611                                 KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL,
2612                                 MSR_IA32_VMX_PROCBASED_CTLS,
2613                                 &_cpu_based_exec_control))
2614                 return -EIO;
2615         if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
2616                 if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL,
2617                                         KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL,
2618                                         MSR_IA32_VMX_PROCBASED_CTLS2,
2619                                         &_cpu_based_2nd_exec_control))
2620                         return -EIO;
2621         }
2622 #ifndef CONFIG_X86_64
2623         if (!(_cpu_based_2nd_exec_control &
2624                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2625                 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
2626 #endif
2627
2628         if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2629                 _cpu_based_2nd_exec_control &= ~(
2630                                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2631                                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2632                                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
2633
2634         rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
2635                 &vmx_cap->ept, &vmx_cap->vpid);
2636
2637         if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
2638             vmx_cap->ept) {
2639                 pr_warn_once("EPT CAP should not exist if not support "
2640                                 "1-setting enable EPT VM-execution control\n");
2641
2642                 if (error_on_inconsistent_vmcs_config)
2643                         return -EIO;
2644
2645                 vmx_cap->ept = 0;
2646         }
2647         if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
2648             vmx_cap->vpid) {
2649                 pr_warn_once("VPID CAP should not exist if not support "
2650                                 "1-setting enable VPID VM-execution control\n");
2651
2652                 if (error_on_inconsistent_vmcs_config)
2653                         return -EIO;
2654
2655                 vmx_cap->vpid = 0;
2656         }
2657
2658         if (!cpu_has_sgx())
2659                 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING;
2660
2661         if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
2662                 _cpu_based_3rd_exec_control =
2663                         adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL,
2664                                               MSR_IA32_VMX_PROCBASED_CTLS3);
2665
2666         if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS,
2667                                 KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS,
2668                                 MSR_IA32_VMX_EXIT_CTLS,
2669                                 &_vmexit_control))
2670                 return -EIO;
2671
2672         if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL,
2673                                 KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL,
2674                                 MSR_IA32_VMX_PINBASED_CTLS,
2675                                 &_pin_based_exec_control))
2676                 return -EIO;
2677
2678         if (cpu_has_broken_vmx_preemption_timer())
2679                 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
2680         if (!(_cpu_based_2nd_exec_control &
2681                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
2682                 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
2683
2684         if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS,
2685                                 KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS,
2686                                 MSR_IA32_VMX_ENTRY_CTLS,
2687                                 &_vmentry_control))
2688                 return -EIO;
2689
2690         for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) {
2691                 u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control;
2692                 u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control;
2693
2694                 if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl))
2695                         continue;
2696
2697                 pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n",
2698                              _vmentry_control & n_ctrl, _vmexit_control & x_ctrl);
2699
2700                 if (error_on_inconsistent_vmcs_config)
2701                         return -EIO;
2702
2703                 _vmentry_control &= ~n_ctrl;
2704                 _vmexit_control &= ~x_ctrl;
2705         }
2706
2707         rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
2708
2709         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
2710         if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
2711                 return -EIO;
2712
2713 #ifdef CONFIG_X86_64
2714         /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
2715         if (vmx_msr_high & (1u<<16))
2716                 return -EIO;
2717 #endif
2718
2719         /* Require Write-Back (WB) memory type for VMCS accesses. */
2720         if (((vmx_msr_high >> 18) & 15) != 6)
2721                 return -EIO;
2722
2723         rdmsrl(MSR_IA32_VMX_MISC, misc_msr);
2724
2725         vmcs_conf->size = vmx_msr_high & 0x1fff;
2726         vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
2727
2728         vmcs_conf->revision_id = vmx_msr_low;
2729
2730         vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
2731         vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
2732         vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
2733         vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control;
2734         vmcs_conf->vmexit_ctrl         = _vmexit_control;
2735         vmcs_conf->vmentry_ctrl        = _vmentry_control;
2736         vmcs_conf->misc = misc_msr;
2737
2738 #if IS_ENABLED(CONFIG_HYPERV)
2739         if (enlightened_vmcs)
2740                 evmcs_sanitize_exec_ctrls(vmcs_conf);
2741 #endif
2742
2743         return 0;
2744 }
2745
2746 static bool kvm_is_vmx_supported(void)
2747 {
2748         int cpu = raw_smp_processor_id();
2749
2750         if (!cpu_has_vmx()) {
2751                 pr_err("VMX not supported by CPU %d\n", cpu);
2752                 return false;
2753         }
2754
2755         if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
2756             !this_cpu_has(X86_FEATURE_VMX)) {
2757                 pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu);
2758                 return false;
2759         }
2760
2761         return true;
2762 }
2763
2764 static int vmx_check_processor_compat(void)
2765 {
2766         int cpu = raw_smp_processor_id();
2767         struct vmcs_config vmcs_conf;
2768         struct vmx_capability vmx_cap;
2769
2770         if (!kvm_is_vmx_supported())
2771                 return -EIO;
2772
2773         if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
2774                 pr_err("Failed to setup VMCS config on CPU %d\n", cpu);
2775                 return -EIO;
2776         }
2777         if (nested)
2778                 nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
2779         if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) {
2780                 pr_err("Inconsistent VMCS config on CPU %d\n", cpu);
2781                 return -EIO;
2782         }
2783         return 0;
2784 }
2785
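/*
 * Set CR4.VMXE and execute VMXON with the physical address of this CPU's
 * VMXON region. A faulting VMXON is caught via the exception table, warned
 * about once (dumping MSR_IA32_FEAT_CTL for diagnosis), and CR4.VMXE is
 * rolled back so the CPU is left in a consistent state.
 */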
2786 static int kvm_cpu_vmxon(u64 vmxon_pointer)
2787 {
2788         u64 msr;
2789
2790         cr4_set_bits(X86_CR4_VMXE);
2791
2792         asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
2793                           _ASM_EXTABLE(1b, %l[fault])
2794                           : : [vmxon_pointer] "m"(vmxon_pointer)
2795                           : : fault);
2796         return 0;
2797
2798 fault:
2799         WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
2800                   rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
2801         cr4_clear_bits(X86_CR4_VMXE);
2802
2803         return -EFAULT;
2804 }
2805
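/*
 * Per-CPU hardware enable: bail if CR4.VMXE is already set (VMX is in use
 * by something else), verify the Hyper-V VP assist page when eVMCS is in
 * use, notify the Intel PT driver, execute VMXON, and finally do a global
 * EPT sync to purge any stale guest-physical mappings.
 */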
2806 static int vmx_hardware_enable(void)
2807 {
2808         int cpu = raw_smp_processor_id();
2809         u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
2810         int r;
2811
2812         if (cr4_read_shadow() & X86_CR4_VMXE)
2813                 return -EBUSY;
2814
2815         /*
2816          * This can happen if we hot-added a CPU but failed to allocate
2817          * VP assist page for it.
2818          */
2819         if (static_branch_unlikely(&enable_evmcs) &&
2820             !hv_get_vp_assist_page(cpu))
2821                 return -EFAULT;
2822
2823         intel_pt_handle_vmx(1);
2824
2825         r = kvm_cpu_vmxon(phys_addr);
2826         if (r) {
2827                 intel_pt_handle_vmx(0);
2828                 return r;
2829         }
2830
2831         if (enable_ept)
2832                 ept_sync_global();
2833
2834         return 0;
2835 }
2836
2837 static void vmclear_local_loaded_vmcss(void)
2838 {
2839         int cpu = raw_smp_processor_id();
2840         struct loaded_vmcs *v, *n;
2841
2842         list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
2843                                  loaded_vmcss_on_cpu_link)
2844                 __loaded_vmcs_clear(v);
2845 }
2846
2847 static void vmx_hardware_disable(void)
2848 {
2849         vmclear_local_loaded_vmcss();
2850
2851         if (cpu_vmxoff())
2852                 kvm_spurious_fault();
2853
2854         hv_reset_evmcs();
2855
2856         intel_pt_handle_vmx(0);
2857 }
2858
2859 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
2860 {
2861         int node = cpu_to_node(cpu);
2862         struct page *pages;
2863         struct vmcs *vmcs;
2864
2865         pages = __alloc_pages_node(node, flags, 0);
2866         if (!pages)
2867                 return NULL;
2868         vmcs = page_address(pages);
2869         memset(vmcs, 0, vmcs_config.size);
2870
2871         /* KVM supports Enlightened VMCS v1 only */
2872         if (static_branch_unlikely(&enable_evmcs))
2873                 vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
2874         else
2875                 vmcs->hdr.revision_id = vmcs_config.revision_id;
2876
2877         if (shadow)
2878                 vmcs->hdr.shadow_vmcs = 1;
2879         return vmcs;
2880 }
2881
2882 void free_vmcs(struct vmcs *vmcs)
2883 {
2884         free_page((unsigned long)vmcs);
2885 }
2886
2887 /*
2888  * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
2889  */
2890 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2891 {
2892         if (!loaded_vmcs->vmcs)
2893                 return;
2894         loaded_vmcs_clear(loaded_vmcs);
2895         free_vmcs(loaded_vmcs->vmcs);
2896         loaded_vmcs->vmcs = NULL;
2897         if (loaded_vmcs->msr_bitmap)
2898                 free_page((unsigned long)loaded_vmcs->msr_bitmap);
2899         WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
2900 }
2901
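/*
 * Allocate the VMCS and MSR bitmap backing a loaded_vmcs and reset its
 * bookkeeping (host state cache, controls shadows, last CPU, etc.). The
 * MSR bitmap starts out all-ones, i.e. every MSR access is intercepted
 * until pass-through holes are poked later.
 */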
2902 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2903 {
2904         loaded_vmcs->vmcs = alloc_vmcs(false);
2905         if (!loaded_vmcs->vmcs)
2906                 return -ENOMEM;
2907
2908         vmcs_clear(loaded_vmcs->vmcs);
2909
2910         loaded_vmcs->shadow_vmcs = NULL;
2911         loaded_vmcs->hv_timer_soft_disabled = false;
2912         loaded_vmcs->cpu = -1;
2913         loaded_vmcs->launched = 0;
2914
2915         if (cpu_has_vmx_msr_bitmap()) {
2916                 loaded_vmcs->msr_bitmap = (unsigned long *)
2917                                 __get_free_page(GFP_KERNEL_ACCOUNT);
2918                 if (!loaded_vmcs->msr_bitmap)
2919                         goto out_vmcs;
2920                 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
2921         }
2922
2923         memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
2924         memset(&loaded_vmcs->controls_shadow, 0,
2925                 sizeof(struct vmcs_controls_shadow));
2926
2927         return 0;
2928
2929 out_vmcs:
2930         free_loaded_vmcs(loaded_vmcs);
2931         return -ENOMEM;
2932 }
2933
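/*
 * The per-CPU "vmxarea" pages are the VMXON regions: one page per possible
 * CPU, allocated at init time and passed to VMXON in vmx_hardware_enable().
 */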
2934 static void free_kvm_area(void)
2935 {
2936         int cpu;
2937
2938         for_each_possible_cpu(cpu) {
2939                 free_vmcs(per_cpu(vmxarea, cpu));
2940                 per_cpu(vmxarea, cpu) = NULL;
2941         }
2942 }
2943
2944 static __init int alloc_kvm_area(void)
2945 {
2946         int cpu;
2947
2948         for_each_possible_cpu(cpu) {
2949                 struct vmcs *vmcs;
2950
2951                 vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
2952                 if (!vmcs) {
2953                         free_kvm_area();
2954                         return -ENOMEM;
2955                 }
2956
2957                 /*
2958                  * When eVMCS is enabled, alloc_vmcs_cpu() sets
2959                  * vmcs->revision_id to KVM_EVMCS_VERSION instead of the
2960                  * revision_id reported by MSR_IA32_VMX_BASIC.
2961                  *
2962                  * However, even though it's not explicitly documented by
2963                  * the TLFS, the VMXON region passed as the VMXON argument
2964                  * should still be marked with the revision_id reported by
2965                  * the physical CPU.
2966                  */
2967                 if (static_branch_unlikely(&enable_evmcs))
2968                         vmcs->hdr.revision_id = vmcs_config.revision_id;
2969
2970                 per_cpu(vmxarea, cpu) = vmcs;
2971         }
2972         return 0;
2973 }
2974
2975 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
2976                 struct kvm_segment *save)
2977 {
2978         if (!emulate_invalid_guest_state) {
2979                 /*
2980                  * CS and SS RPL should be equal during guest entry according
2981                  * to VMX spec, but in reality it is not always so. Since vcpu
2982                  * is in the middle of the transition from real mode to
2983                  * protected mode it is safe to assume that RPL 0 is a good
2984                  * default value.
2985                  */
2986                 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
2987                         save->selector &= ~SEGMENT_RPL_MASK;
2988                 save->dpl = save->selector & SEGMENT_RPL_MASK;
2989                 save->s = 1;
2990         }
2991         __vmx_set_segment(vcpu, save, seg);
2992 }
2993
2994 static void enter_pmode(struct kvm_vcpu *vcpu)
2995 {
2996         unsigned long flags;
2997         struct vcpu_vmx *vmx = to_vmx(vcpu);
2998
2999         /*
3000          * Update the real mode segment cache. It may be out of date if a segment
3001          * register was written while the vCPU was in guest mode.
3002          */
3003         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3004         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3005         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3006         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3007         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3008         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3009
3010         vmx->rmode.vm86_active = 0;
3011
3012         __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3013
3014         flags = vmcs_readl(GUEST_RFLAGS);
3015         flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
3016         flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
3017         vmcs_writel(GUEST_RFLAGS, flags);
3018
3019         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
3020                         (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
3021
3022         vmx_update_exception_bitmap(vcpu);
3023
3024         fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3025         fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3026         fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3027         fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3028         fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3029         fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3030 }
3031
3032 static void fix_rmode_seg(int seg, struct kvm_segment *save)
3033 {
3034         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3035         struct kvm_segment var = *save;
3036
3037         var.dpl = 0x3;
3038         if (seg == VCPU_SREG_CS)
3039                 var.type = 0x3;
3040
3041         if (!emulate_invalid_guest_state) {
3042                 var.selector = var.base >> 4;
3043                 var.base = var.base & 0xffff0;
3044                 var.limit = 0xffff;
3045                 var.g = 0;
3046                 var.db = 0;
3047                 var.present = 1;
3048                 var.s = 1;
3049                 var.l = 0;
3050                 var.unusable = 0;
3051                 var.type = 0x3;
3052                 var.avl = 0;
3053                 if (save->base & 0xf)
3054                         pr_warn_once("segment base is not paragraph aligned "
3055                                      "when entering protected mode (seg=%d)", seg);
3056         }
3057
3058         vmcs_write16(sf->selector, var.selector);
3059         vmcs_writel(sf->base, var.base);
3060         vmcs_write32(sf->limit, var.limit);
3061         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
3062 }
3063
3064 static void enter_rmode(struct kvm_vcpu *vcpu)
3065 {
3066         unsigned long flags;
3067         struct vcpu_vmx *vmx = to_vmx(vcpu);
3068         struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
3069
3070         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3071         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3072         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3073         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3074         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3075         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3076         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3077
3078         vmx->rmode.vm86_active = 1;
3079
3080         /*
3081          * Very old userspace does not call KVM_SET_TSS_ADDR before running
3082          * the vCPU. Warn the user that an update is overdue.
3083          */
3084         if (!kvm_vmx->tss_addr)
3085                 pr_warn_once("KVM_SET_TSS_ADDR needs to be called before running vCPU\n");
3086
3087         vmx_segment_cache_clear(vmx);
3088
3089         vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
3090         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
3091         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
3092
3093         flags = vmcs_readl(GUEST_RFLAGS);
3094         vmx->rmode.save_rflags = flags;
3095
3096         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
3097
3098         vmcs_writel(GUEST_RFLAGS, flags);
3099         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
3100         vmx_update_exception_bitmap(vcpu);
3101
3102         fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3103         fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3104         fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3105         fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3106         fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3107         fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3108 }
3109
3110 int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
3111 {
3112         struct vcpu_vmx *vmx = to_vmx(vcpu);
3113
3114         /* Nothing to do if hardware doesn't support EFER. */
3115         if (!vmx_find_uret_msr(vmx, MSR_EFER))
3116                 return 0;
3117
3118         vcpu->arch.efer = efer;
3119 #ifdef CONFIG_X86_64
3120         if (efer & EFER_LMA)
3121                 vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE);
3122         else
3123                 vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE);
3124 #else
3125         if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm))
3126                 return 1;
3127 #endif
3128
3129         vmx_setup_uret_msrs(vmx);
3130         return 0;
3131 }
3132
3133 #ifdef CONFIG_X86_64
3134
3135 static void enter_lmode(struct kvm_vcpu *vcpu)
3136 {
3137         u32 guest_tr_ar;
3138
3139         vmx_segment_cache_clear(to_vmx(vcpu));
3140
3141         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
3142         if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
3143                 pr_debug_ratelimited("%s: tss fixup for long mode.\n",
3144                                      __func__);
3145                 vmcs_write32(GUEST_TR_AR_BYTES,
3146                              (guest_tr_ar & ~VMX_AR_TYPE_MASK)
3147                              | VMX_AR_TYPE_BUSY_64_TSS);
3148         }
3149         vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
3150 }
3151
3152 static void exit_lmode(struct kvm_vcpu *vcpu)
3153 {
3154         vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
3155 }
3156
3157 #endif
3158
3159 static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
3160 {
3161         struct vcpu_vmx *vmx = to_vmx(vcpu);
3162
3163         /*
3164          * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
3165          * the CPU is not required to invalidate guest-physical mappings on
3166          * VM-Entry, even if VPID is disabled.  Guest-physical mappings are
3167          * associated with the root EPT structure and not any particular VPID
3168          * (INVVPID also isn't required to invalidate guest-physical mappings).
3169          */
3170         if (enable_ept) {
3171                 ept_sync_global();
3172         } else if (enable_vpid) {
3173                 if (cpu_has_vmx_invvpid_global()) {
3174                         vpid_sync_vcpu_global();
3175                 } else {
3176                         vpid_sync_vcpu_single(vmx->vpid);
3177                         vpid_sync_vcpu_single(vmx->nested.vpid02);
3178                 }
3179         }
3180 }
3181
3182 static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
3183 {
3184         if (is_guest_mode(vcpu))
3185                 return nested_get_vpid02(vcpu);
3186         return to_vmx(vcpu)->vpid;
3187 }
3188
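/*
 * Flush translations for the current root only: a single-context INVEPT on
 * the active EPTP when EPT is enabled, otherwise an INVVPID on the VPID
 * that is currently in use (vpid02 when running a nested guest).
 */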
3189 static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
3190 {
3191         struct kvm_mmu *mmu = vcpu->arch.mmu;
3192         u64 root_hpa = mmu->root.hpa;
3193
3194         /* No flush required if the current context is invalid. */
3195         if (!VALID_PAGE(root_hpa))
3196                 return;
3197
3198         if (enable_ept)
3199                 ept_sync_context(construct_eptp(vcpu, root_hpa,
3200                                                 mmu->root_role.level));
3201         else
3202                 vpid_sync_context(vmx_get_current_vpid(vcpu));
3203 }
3204
3205 static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
3206 {
3207         /*
3208          * vpid_sync_vcpu_addr() is a nop if vpid==0; see the comment in
3209          * vmx_flush_tlb_guest() for an explanation of why this is ok.
3210          */
3211         vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
3212 }
3213
3214 static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
3215 {
3216         /*
3217          * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
3218          * vpid couldn't be allocated for this vCPU.  VM-Enter and VM-Exit are
3219          * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
3220          * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
3221          * i.e. no explicit INVVPID is necessary.
3222          */
3223         vpid_sync_context(vmx_get_current_vpid(vcpu));
3224 }
3225
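/*
 * Propagate the four PAE PDPTEs cached in the walk MMU into the VMCS, but
 * only if KVM has marked them dirty and the guest is using PAE paging.
 */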
3226 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
3227 {
3228         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3229
3230         if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
3231                 return;
3232
3233         if (is_pae_paging(vcpu)) {
3234                 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
3235                 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
3236                 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
3237                 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
3238         }
3239 }
3240
3241 void ept_save_pdptrs(struct kvm_vcpu *vcpu)
3242 {
3243         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3244
3245         if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
3246                 return;
3247
3248         mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
3249         mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
3250         mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
3251         mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
3252
3253         kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR);
3254 }
3255
3256 #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
3257                           CPU_BASED_CR3_STORE_EXITING)
3258
3259 void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3260 {
3261         struct vcpu_vmx *vmx = to_vmx(vcpu);
3262         unsigned long hw_cr0, old_cr0_pg;
3263         u32 tmp;
3264
3265         old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);
3266
3267         hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
3268         if (is_unrestricted_guest(vcpu))
3269                 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
3270         else {
3271                 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
3272                 if (!enable_ept)
3273                         hw_cr0 |= X86_CR0_WP;
3274
3275                 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
3276                         enter_pmode(vcpu);
3277
3278                 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
3279                         enter_rmode(vcpu);
3280         }
3281
3282         vmcs_writel(CR0_READ_SHADOW, cr0);
3283         vmcs_writel(GUEST_CR0, hw_cr0);
3284         vcpu->arch.cr0 = cr0;
3285         kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
3286
3287 #ifdef CONFIG_X86_64
3288         if (vcpu->arch.efer & EFER_LME) {
3289                 if (!old_cr0_pg && (cr0 & X86_CR0_PG))
3290                         enter_lmode(vcpu);
3291                 else if (old_cr0_pg && !(cr0 & X86_CR0_PG))
3292                         exit_lmode(vcpu);
3293         }
3294 #endif
3295
3296         if (enable_ept && !is_unrestricted_guest(vcpu)) {
3297                 /*
3298                  * Ensure KVM has an up-to-date snapshot of the guest's CR3.  If
3299                  * the below code _enables_ CR3 exiting, vmx_cache_reg() will
3300                  * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
3301                  * KVM's CR3 is installed.
3302                  */
3303                 if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
3304                         vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
3305
3306                 /*
3307                  * When running with EPT but not unrestricted guest, KVM must
3308                  * intercept CR3 accesses when paging is _disabled_.  This is
3309                  * necessary because restricted guests can't actually run with
3310                  * paging disabled, and so KVM stuffs its own CR3 in order to
3311                  * run the guest with identity mapped page tables.
3312                  *
3313                  * Do _NOT_ check the old CR0.PG, e.g. to optimize away the
3314                  * update, it may be stale with respect to CR3 interception,
3315                  * e.g. after nested VM-Enter.
3316                  *
3317                  * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
3318                  * stores to forward them to L1, even if KVM does not need to
3319                  * intercept them to preserve its identity mapped page tables.
3320                  */
3321                 if (!(cr0 & X86_CR0_PG)) {
3322                         exec_controls_setbit(vmx, CR3_EXITING_BITS);
3323                 } else if (!is_guest_mode(vcpu)) {
3324                         exec_controls_clearbit(vmx, CR3_EXITING_BITS);
3325                 } else {
3326                         tmp = exec_controls_get(vmx);
3327                         tmp &= ~CR3_EXITING_BITS;
3328                         tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
3329                         exec_controls_set(vmx, tmp);
3330                 }
3331
3332                 /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
3333                 if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
3334                         vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
3335
3336                 /*
3337                  * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but
3338                  * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
3339                  */
3340                 if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
3341                         kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
3342         }
3343
3344         /* depends on vcpu->arch.cr0 to be set to a new value */
3345         vmx->emulation_required = vmx_emulation_required(vcpu);
3346 }
3347
3348 static int vmx_get_max_tdp_level(void)
3349 {
3350         if (cpu_has_vmx_ept_5levels())
3351                 return 5;
3352         return 4;
3353 }
3354
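/*
 * Build an EPT pointer for the given root: write-back memory type, a 4- or
 * 5-level page walk depending on root_level, and A/D bits enabled when
 * supported (and, for a nested guest, permitted by L1).
 */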
3355 u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
3356 {
3357         u64 eptp = VMX_EPTP_MT_WB;
3358
3359         eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
3360
3361         if (enable_ept_ad_bits &&
3362             (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
3363                 eptp |= VMX_EPTP_AD_ENABLE_BIT;
3364         eptp |= root_hpa;
3365
3366         return eptp;
3367 }
3368
3369 static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
3370                              int root_level)
3371 {
3372         struct kvm *kvm = vcpu->kvm;
3373         bool update_guest_cr3 = true;
3374         unsigned long guest_cr3;
3375         u64 eptp;
3376
3377         if (enable_ept) {
3378                 eptp = construct_eptp(vcpu, root_hpa, root_level);
3379                 vmcs_write64(EPT_POINTER, eptp);
3380
3381                 hv_track_root_tdp(vcpu, root_hpa);
3382
3383                 if (!enable_unrestricted_guest && !is_paging(vcpu))
3384                         guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
3385                 else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3))
3386                         guest_cr3 = vcpu->arch.cr3;
3387                 else /* vmcs.GUEST_CR3 is already up-to-date. */
3388                         update_guest_cr3 = false;
3389                 vmx_ept_load_pdptrs(vcpu);
3390         } else {
3391                 guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu);
3392         }
3393
3394         if (update_guest_cr3)
3395                 vmcs_writel(GUEST_CR3, guest_cr3);
3396 }
3397
3398
3399 static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3400 {
3401         /*
3402          * We operate under the default treatment of SMM, so VMX cannot be
3403          * enabled under SMM.  Note, whether or not VMXE is allowed at all,
3404          * i.e. is a reserved bit, is handled by common x86 code.
3405          */
3406         if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
3407                 return false;
3408
3409         if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
3410                 return false;
3411
3412         return true;
3413 }
3414
3415 void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3416 {
3417         unsigned long old_cr4 = vcpu->arch.cr4;
3418         struct vcpu_vmx *vmx = to_vmx(vcpu);
3419         /*
3420          * Pass through host's Machine Check Enable value to hw_cr4, which
3421          * is in force while we are in guest mode.  Do not let guests control
3422          * this bit, even if host CR4.MCE == 0.
3423          */
3424         unsigned long hw_cr4;
3425
3426         hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
3427         if (is_unrestricted_guest(vcpu))
3428                 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
3429         else if (vmx->rmode.vm86_active)
3430                 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
3431         else
3432                 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
3433
3434         if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
3435                 if (cr4 & X86_CR4_UMIP) {
3436                         secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
3437                         hw_cr4 &= ~X86_CR4_UMIP;
3438                 } else if (!is_guest_mode(vcpu) ||
3439                         !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
3440                         secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
3441                 }
3442         }
3443
3444         vcpu->arch.cr4 = cr4;
3445         kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
3446
3447         if (!is_unrestricted_guest(vcpu)) {
3448                 if (enable_ept) {
3449                         if (!is_paging(vcpu)) {
3450                                 hw_cr4 &= ~X86_CR4_PAE;
3451                                 hw_cr4 |= X86_CR4_PSE;
3452                         } else if (!(cr4 & X86_CR4_PAE)) {
3453                                 hw_cr4 &= ~X86_CR4_PAE;
3454                         }
3455                 }
3456
3457                 /*
3458                  * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
3459                  * hardware.  To emulate this behavior, SMEP/SMAP/PKU needs
3460                  * to be manually disabled when guest switches to non-paging
3461                  * mode.
3462                  *
3463                  * If !enable_unrestricted_guest, the CPU is always running
3464                  * with CR0.PG=1 and CR4 needs to be modified.
3465                  * If enable_unrestricted_guest, the CPU automatically
3466                  * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
3467                  */
3468                 if (!is_paging(vcpu))
3469                         hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
3470         }
3471
3472         vmcs_writel(CR4_READ_SHADOW, cr4);
3473         vmcs_writel(GUEST_CR4, hw_cr4);
3474
3475         if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
3476                 kvm_update_cpuid_runtime(vcpu);
3477 }
3478
3479 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3480 {
3481         struct vcpu_vmx *vmx = to_vmx(vcpu);
3482         u32 ar;
3483
3484         if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3485                 *var = vmx->rmode.segs[seg];
3486                 if (seg == VCPU_SREG_TR
3487                     || var->selector == vmx_read_guest_seg_selector(vmx, seg))
3488                         return;
3489                 var->base = vmx_read_guest_seg_base(vmx, seg);
3490                 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3491                 return;
3492         }
3493         var->base = vmx_read_guest_seg_base(vmx, seg);
3494         var->limit = vmx_read_guest_seg_limit(vmx, seg);
3495         var->selector = vmx_read_guest_seg_selector(vmx, seg);
3496         ar = vmx_read_guest_seg_ar(vmx, seg);
3497         var->unusable = (ar >> 16) & 1;
3498         var->type = ar & 15;
3499         var->s = (ar >> 4) & 1;
3500         var->dpl = (ar >> 5) & 3;
3501         /*
3502          * Some userspaces do not preserve the unusable property. Since a usable
3503          * segment has to be present according to the VMX spec, we can use the
3504          * present property to work around that userspace bug by making an
3505          * unusable segment always non-present. vmx_segment_access_rights()
3506          * already marks a non-present segment as unusable.
3507          */
3508         var->present = !var->unusable;
3509         var->avl = (ar >> 12) & 1;
3510         var->l = (ar >> 13) & 1;
3511         var->db = (ar >> 14) & 1;
3512         var->g = (ar >> 15) & 1;
3513 }
3514
3515 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
3516 {
3517         struct kvm_segment s;
3518
3519         if (to_vmx(vcpu)->rmode.vm86_active) {
3520                 vmx_get_segment(vcpu, &s, seg);
3521                 return s.base;
3522         }
3523         return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
3524 }
3525
3526 int vmx_get_cpl(struct kvm_vcpu *vcpu)
3527 {
3528         struct vcpu_vmx *vmx = to_vmx(vcpu);
3529
3530         if (unlikely(vmx->rmode.vm86_active))
3531                 return 0;
3532         else {
3533                 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
3534                 return VMX_AR_DPL(ar);
3535         }
3536 }
3537
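/*
 * Pack a struct kvm_segment into the VMCS access-rights (AR bytes) format.
 * Bit 16, the "unusable" bit, is also set for non-present segments to match
 * the convention used by vmx_get_segment().
 */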
3538 static u32 vmx_segment_access_rights(struct kvm_segment *var)
3539 {
3540         u32 ar;
3541
3542         ar = var->type & 15;
3543         ar |= (var->s & 1) << 4;
3544         ar |= (var->dpl & 3) << 5;
3545         ar |= (var->present & 1) << 7;
3546         ar |= (var->avl & 1) << 12;
3547         ar |= (var->l & 1) << 13;
3548         ar |= (var->db & 1) << 14;
3549         ar |= (var->g & 1) << 15;
3550         ar |= (var->unusable || !var->present) << 16;
3551
3552         return ar;
3553 }
3554
3555 void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3556 {
3557         struct vcpu_vmx *vmx = to_vmx(vcpu);
3558         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3559
3560         vmx_segment_cache_clear(vmx);
3561
3562         if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3563                 vmx->rmode.segs[seg] = *var;
3564                 if (seg == VCPU_SREG_TR)
3565                         vmcs_write16(sf->selector, var->selector);
3566                 else if (var->s)
3567                         fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3568                 return;
3569         }
3570
3571         vmcs_writel(sf->base, var->base);
3572         vmcs_write32(sf->limit, var->limit);
3573         vmcs_write16(sf->selector, var->selector);
3574
3575         /*
3576          * Fix the "Accessed" bit in the AR field of segment registers for
3577          * older qemu binaries.
3578          * The IA-32 architecture specifies that at the time of processor reset
3579          * the "Accessed" bit in the AR field of segment registers is 1, but
3580          * qemu sets it to 0 in its userland code. This causes an invalid
3581          * guest state vmexit when "unrestricted guest" mode is turned on.
3582          * A fix for this setup issue in cpu_reset has been pushed to the
3583          * qemu tree. Newer qemu binaries with that fix do not need this
3584          * KVM hack.
3585          */
3586         if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
3587                 var->type |= 0x1; /* Accessed */
3588
3589         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
3590 }
3591
3592 static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3593 {
3594         __vmx_set_segment(vcpu, var, seg);
3595
3596         to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
3597 }
3598
3599 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3600 {
3601         u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
3602
3603         *db = (ar >> 14) & 1;
3604         *l = (ar >> 13) & 1;
3605 }
3606
3607 static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3608 {
3609         dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
3610         dt->address = vmcs_readl(GUEST_IDTR_BASE);
3611 }
3612
3613 static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3614 {
3615         vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
3616         vmcs_writel(GUEST_IDTR_BASE, dt->address);
3617 }
3618
3619 static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3620 {
3621         dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
3622         dt->address = vmcs_readl(GUEST_GDTR_BASE);
3623 }
3624
3625 static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3626 {
3627         vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
3628         vmcs_writel(GUEST_GDTR_BASE, dt->address);
3629 }
3630
3631 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3632 {
3633         struct kvm_segment var;
3634         u32 ar;
3635
3636         vmx_get_segment(vcpu, &var, seg);
3637         var.dpl = 0x3;
3638         if (seg == VCPU_SREG_CS)
3639                 var.type = 0x3;
3640         ar = vmx_segment_access_rights(&var);
3641
3642         if (var.base != (var.selector << 4))
3643                 return false;
3644         if (var.limit != 0xffff)
3645                 return false;
3646         if (ar != 0xf3)
3647                 return false;
3648
3649         return true;
3650 }
3651
3652 static bool code_segment_valid(struct kvm_vcpu *vcpu)
3653 {
3654         struct kvm_segment cs;
3655         unsigned int cs_rpl;
3656
3657         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3658         cs_rpl = cs.selector & SEGMENT_RPL_MASK;
3659
3660         if (cs.unusable)
3661                 return false;
3662         if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
3663                 return false;
3664         if (!cs.s)
3665                 return false;
3666         if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
3667                 if (cs.dpl > cs_rpl)
3668                         return false;
3669         } else {
3670                 if (cs.dpl != cs_rpl)
3671                         return false;
3672         }
3673         if (!cs.present)
3674                 return false;
3675
3676         /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
3677         return true;
3678 }
3679
3680 static bool stack_segment_valid(struct kvm_vcpu *vcpu)
3681 {
3682         struct kvm_segment ss;
3683         unsigned int ss_rpl;
3684
3685         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3686         ss_rpl = ss.selector & SEGMENT_RPL_MASK;
3687
3688         if (ss.unusable)
3689                 return true;
3690         if (ss.type != 3 && ss.type != 7)
3691                 return false;
3692         if (!ss.s)
3693                 return false;
3694         if (ss.dpl != ss_rpl) /* DPL != RPL */
3695                 return false;
3696         if (!ss.present)
3697                 return false;
3698
3699         return true;
3700 }
3701
3702 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
3703 {
3704         struct kvm_segment var;
3705         unsigned int rpl;
3706
3707         vmx_get_segment(vcpu, &var, seg);
3708         rpl = var.selector & SEGMENT_RPL_MASK;
3709
3710         if (var.unusable)
3711                 return true;
3712         if (!var.s)
3713                 return false;
3714         if (!var.present)
3715                 return false;
3716         if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
3717                 if (var.dpl < rpl) /* DPL < RPL */
3718                         return false;
3719         }
3720
3721         /* TODO: Add other members to kvm_segment_field to allow checking for other access
3722          * rights flags
3723          */
3724         return true;
3725 }
3726
3727 static bool tr_valid(struct kvm_vcpu *vcpu)
3728 {
3729         struct kvm_segment tr;
3730
3731         vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
3732
3733         if (tr.unusable)
3734                 return false;
3735         if (tr.selector & SEGMENT_TI_MASK)      /* TI = 1 */
3736                 return false;
3737         if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
3738                 return false;
3739         if (!tr.present)
3740                 return false;
3741
3742         return true;
3743 }
3744
3745 static bool ldtr_valid(struct kvm_vcpu *vcpu)
3746 {
3747         struct kvm_segment ldtr;
3748
3749         vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
3750
3751         if (ldtr.unusable)
3752                 return true;
3753         if (ldtr.selector & SEGMENT_TI_MASK)    /* TI = 1 */
3754                 return false;
3755         if (ldtr.type != 2)
3756                 return false;
3757         if (!ldtr.present)
3758                 return false;
3759
3760         return true;
3761 }
3762
3763 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
3764 {
3765         struct kvm_segment cs, ss;
3766
3767         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3768         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3769
3770         return ((cs.selector & SEGMENT_RPL_MASK) ==
3771                  (ss.selector & SEGMENT_RPL_MASK));
3772 }
3773
3774 /*
3775  * Check if guest state is valid. Returns true if valid, false if
3776  * not.
3777  * We assume that registers are always usable.
3778  */
3779 bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
3780 {
3781         /* real mode guest state checks */
3782         if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
3783                 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
3784                         return false;
3785                 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
3786                         return false;
3787                 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
3788                         return false;
3789                 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
3790                         return false;
3791                 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
3792                         return false;
3793                 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
3794                         return false;
3795         } else {
3796         /* protected mode guest state checks */
3797                 if (!cs_ss_rpl_check(vcpu))
3798                         return false;
3799                 if (!code_segment_valid(vcpu))
3800                         return false;
3801                 if (!stack_segment_valid(vcpu))
3802                         return false;
3803                 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
3804                         return false;
3805                 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
3806                         return false;
3807                 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
3808                         return false;
3809                 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
3810                         return false;
3811                 if (!tr_valid(vcpu))
3812                         return false;
3813                 if (!ldtr_valid(vcpu))
3814                         return false;
3815         }
3816         /* TODO:
3817          * - Add checks on RIP
3818          * - Add checks on RFLAGS
3819          */
3820
3821         return true;
3822 }
3823
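/*
 * Populate the TSS used for real-mode emulation: zero the three TSS pages,
 * point the I/O bitmap base just past the TSS proper, and write the 0xff
 * terminator byte at the very end of the I/O bitmap.
 */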
3824 static int init_rmode_tss(struct kvm *kvm, void __user *ua)
3825 {
3826         const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3827         u16 data;
3828         int i;
3829
3830         for (i = 0; i < 3; i++) {
3831                 if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE))
3832                         return -EFAULT;
3833         }
3834
3835         data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
3836         if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16)))
3837                 return -EFAULT;
3838
3839         data = ~0;
3840         if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8)))
3841                 return -EFAULT;
3842
3843         return 0;
3844 }
3845
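/*
 * Build the single-page identity-map page table (1024 4 MiB PSE entries
 * covering the low 4 GiB) that KVM loads as the guest's CR3 when running
 * an unpaged guest with EPT but without unrestricted guest support.
 */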
3846 static int init_rmode_identity_map(struct kvm *kvm)
3847 {
3848         struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
3849         int i, r = 0;
3850         void __user *uaddr;
3851         u32 tmp;
3852
3853         /* Protect kvm_vmx->ept_identity_pagetable_done. */
3854         mutex_lock(&kvm->slots_lock);
3855
3856         if (likely(kvm_vmx->ept_identity_pagetable_done))
3857                 goto out;
3858
3859         if (!kvm_vmx->ept_identity_map_addr)
3860                 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
3861
3862         uaddr = __x86_set_memory_region(kvm,
3863                                         IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
3864                                         kvm_vmx->ept_identity_map_addr,
3865                                         PAGE_SIZE);
3866         if (IS_ERR(uaddr)) {
3867                 r = PTR_ERR(uaddr);
3868                 goto out;
3869         }
3870
3871         /* Set up identity-mapping pagetable for EPT in real mode */
3872         for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) {
3873                 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
3874                         _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
3875                 if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) {
3876                         r = -EFAULT;
3877                         goto out;
3878                 }
3879         }
3880         kvm_vmx->ept_identity_pagetable_done = true;
3881
3882 out:
3883         mutex_unlock(&kvm->slots_lock);
3884         return r;
3885 }
3886
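/*
 * Reset a guest segment register to its architectural power-on state:
 * selector and base 0, 64 KiB limit, and a present, accessed read/write
 * data segment (execute/read code segment for CS).
 */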
3887 static void seg_setup(int seg)
3888 {
3889         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3890         unsigned int ar;
3891
3892         vmcs_write16(sf->selector, 0);
3893         vmcs_writel(sf->base, 0);
3894         vmcs_write32(sf->limit, 0xffff);
3895         ar = 0x93;
3896         if (seg == VCPU_SREG_CS)
3897                 ar |= 0x08; /* code segment */
3898
3899         vmcs_write32(sf->ar_bytes, ar);
3900 }
3901
3902 int allocate_vpid(void)
3903 {
3904         int vpid;
3905
3906         if (!enable_vpid)
3907                 return 0;
3908         spin_lock(&vmx_vpid_lock);
3909         vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
3910         if (vpid < VMX_NR_VPIDS)
3911                 __set_bit(vpid, vmx_vpid_bitmap);
3912         else
3913                 vpid = 0;
3914         spin_unlock(&vmx_vpid_lock);
3915         return vpid;
3916 }
3917
3918 void free_vpid(int vpid)
3919 {
3920         if (!enable_vpid || vpid == 0)
3921                 return;
3922         spin_lock(&vmx_vpid_lock);
3923         __clear_bit(vpid, vmx_vpid_bitmap);
3924         spin_unlock(&vmx_vpid_lock);
3925 }
3926
3927 static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
3928 {
3929         /*
3930          * When KVM is a nested hypervisor on top of Hyper-V and uses the
3931          * 'Enlightened MSR Bitmap' feature, L0 needs to know that the MSR
3932          * bitmap has changed.
3933          */
3934         if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs)) {
3935                 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
3936
3937                 if (evmcs->hv_enlightenments_control.msr_bitmap)
3938                         evmcs->hv_clean_fields &=
3939                                 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
3940         }
3941
3942         vmx->nested.force_msr_bitmap_recalc = true;
3943 }
3944
3945 void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
3946 {
3947         struct vcpu_vmx *vmx = to_vmx(vcpu);
3948         unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
3949
3950         if (!cpu_has_vmx_msr_bitmap())
3951                 return;
3952
3953         vmx_msr_bitmap_l01_changed(vmx);
3954
3955         /*
3956          * Mark the desired intercept state in the shadow bitmap; this is needed
3957          * for resync when the MSR filters change.
3958          */
3959         if (is_valid_passthrough_msr(msr)) {
3960                 int idx = possible_passthrough_msr_slot(msr);
3961
3962                 if (idx != -ENOENT) {
3963                         if (type & MSR_TYPE_R)
3964                                 clear_bit(idx, vmx->shadow_msr_intercept.read);
3965                         if (type & MSR_TYPE_W)
3966                                 clear_bit(idx, vmx->shadow_msr_intercept.write);
3967                 }
3968         }
3969
3970         if ((type & MSR_TYPE_R) &&
3971             !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
3972                 vmx_set_msr_bitmap_read(msr_bitmap, msr);
3973                 type &= ~MSR_TYPE_R;
3974         }
3975
3976         if ((type & MSR_TYPE_W) &&
3977             !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) {
3978                 vmx_set_msr_bitmap_write(msr_bitmap, msr);
3979                 type &= ~MSR_TYPE_W;
3980         }
3981
3982         if (type & MSR_TYPE_R)
3983                 vmx_clear_msr_bitmap_read(msr_bitmap, msr);
3984
3985         if (type & MSR_TYPE_W)
3986                 vmx_clear_msr_bitmap_write(msr_bitmap, msr);
3987 }
3988
3989 void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
3990 {
3991         struct vcpu_vmx *vmx = to_vmx(vcpu);
3992         unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
3993
3994         if (!cpu_has_vmx_msr_bitmap())
3995                 return;
3996
3997         vmx_msr_bitmap_l01_changed(vmx);
3998
3999         /*
4000          * Mark the desired intercept state in the shadow bitmap; this is needed
4001          * for resync when the MSR filter changes.
4002          */
4003         if (is_valid_passthrough_msr(msr)) {
4004                 int idx = possible_passthrough_msr_slot(msr);
4005
4006                 if (idx != -ENOENT) {
4007                         if (type & MSR_TYPE_R)
4008                                 set_bit(idx, vmx->shadow_msr_intercept.read);
4009                         if (type & MSR_TYPE_W)
4010                                 set_bit(idx, vmx->shadow_msr_intercept.write);
4011                 }
4012         }
4013
4014         if (type & MSR_TYPE_R)
4015                 vmx_set_msr_bitmap_read(msr_bitmap, msr);
4016
4017         if (type & MSR_TYPE_W)
4018                 vmx_set_msr_bitmap_write(msr_bitmap, msr);
4019 }
4020
4021 static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
4022 {
4023         /*
4024          * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves
4025          * of the MSR bitmap.  KVM emulates APIC registers up through 0x3f0,
4026          * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits.
4027          */
4028         const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG;
4029         const int write_idx = read_idx + (0x800 / sizeof(u64));
4030         struct vcpu_vmx *vmx = to_vmx(vcpu);
4031         u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap;
4032         u8 mode;
4033
4034         if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu)))
4035                 return;
4036
4037         if (cpu_has_secondary_exec_ctrls() &&
4038             (secondary_exec_controls_get(vmx) &
4039              SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
4040                 mode = MSR_BITMAP_MODE_X2APIC;
4041                 if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
4042                         mode |= MSR_BITMAP_MODE_X2APIC_APICV;
4043         } else {
4044                 mode = 0;
4045         }
4046
4047         if (mode == vmx->x2apic_msr_bitmap_mode)
4048                 return;
4049
4050         vmx->x2apic_msr_bitmap_mode = mode;
4051
4052         /*
4053          * Reset the bitmap for MSRs 0x800 - 0x83f.  Leave AMD's uber-extended
4054          * registers (0x840 and above) intercepted, KVM doesn't support them.
4055          * Intercept all writes by default and poke holes as needed.  Pass
4056          * through reads for all valid registers by default in x2APIC+APICv
4057          * mode, only the current timer count needs on-demand emulation by KVM.
4058          */
4059         if (mode & MSR_BITMAP_MODE_X2APIC_APICV)
4060                 msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic);
4061         else
4062                 msr_bitmap[read_idx] = ~0ull;
4063         msr_bitmap[write_idx] = ~0ull;
4064
4065         /*
4066          * TPR reads and writes can be virtualized even if virtual interrupt
4067          * delivery is not in use.
4068          */
4069         vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW,
4070                                   !(mode & MSR_BITMAP_MODE_X2APIC));
4071
4072         if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
4073                 vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
4074                 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
4075                 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
4076                 if (enable_ipiv)
4077                         vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW);
4078         }
4079 }
4080
4081 void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
4082 {
4083         struct vcpu_vmx *vmx = to_vmx(vcpu);
4084         bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
4085         u32 i;
4086
4087         vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag);
4088         vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
4089         vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
4090         vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
4091         for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) {
4092                 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
4093                 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
4094         }
4095 }
4096
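/*
 * With the vCPU in guest mode and virtual interrupt delivery enabled for
 * L2, report whether the virtual APIC has an interrupt pending, i.e.
 * whether RVI outranks the PPR in the L1-provided virtual APIC page.
 */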
4097 static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
4098 {
4099         struct vcpu_vmx *vmx = to_vmx(vcpu);
4100         void *vapic_page;
4101         u32 vppr;
4102         int rvi;
4103
4104         if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
4105                 !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
4106                 WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
4107                 return false;
4108
4109         rvi = vmx_get_rvi();
4110
4111         vapic_page = vmx->nested.virtual_apic_map.hva;
4112         vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
4113
4114         return ((rvi & 0xf0) > (vppr & 0xf0));
4115 }
4116
4117 static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
4118 {
4119         struct vcpu_vmx *vmx = to_vmx(vcpu);
4120         u32 i;
4121
4122         /*
4123          * Redo intercept permissions for MSRs that KVM is passing through to
4124          * the guest.  Disabling interception will check the new MSR filter and
4125          * ensure that KVM enables interception if userspace wants to filter
4126          * the MSR.  MSRs that KVM is already intercepting don't need to be
4127          * refreshed since KVM is going to intercept them regardless of what
4128          * userspace wants.
4129          */
4130         for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
4131                 u32 msr = vmx_possible_passthrough_msrs[i];
4132
4133                 if (!test_bit(i, vmx->shadow_msr_intercept.read))
4134                         vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R);
4135
4136                 if (!test_bit(i, vmx->shadow_msr_intercept.write))
4137                         vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W);
4138         }
4139
4140         /* PT MSRs can be passed through iff PT is exposed to the guest. */
4141         if (vmx_pt_mode_is_host_guest())
4142                 pt_update_intercept_for_msr(vcpu);
4143 }
4144
4145 static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
4146                                                      int pi_vec)
4147 {
4148 #ifdef CONFIG_SMP
4149         if (vcpu->mode == IN_GUEST_MODE) {
4150                 /*
4151                  * The vector of the virtual interrupt has already been set in the PIR.
4152                  * Send a notification event to deliver the virtual interrupt
4153                  * unless the vCPU is the currently running vCPU, i.e. the
4154                  * event is being sent from a fastpath VM-Exit handler, in
4155                  * which case the PIR will be synced to the vIRR before
4156                  * re-entering the guest.
4157                  *
4158                  * When the target is not the running vCPU, the following
4159                  * possibilities emerge:
4160                  *
4161                  * Case 1: vCPU stays in non-root mode. Sending a notification
4162                  * event posts the interrupt to the vCPU.
4163                  *
4164                  * Case 2: vCPU exits to root mode and is still runnable. The
4165                  * PIR will be synced to the vIRR before re-entering the guest.
4166                  * Sending a notification event is ok as the host IRQ handler
4167                  * will ignore the spurious event.
4168                  *
4169                  * Case 3: vCPU exits to root mode and is blocked. vcpu_block()
4170                  * has already synced PIR to vIRR and never blocks the vCPU if
4171                  * the vIRR is not empty. Therefore, a blocked vCPU here does
4172                  * not wait for any requested interrupts in PIR, and sending a
4173                  * notification event also results in a benign, spurious event.
4174                  */
4175
4176                 if (vcpu != kvm_get_running_vcpu())
4177                         apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
4178                 return;
4179         }
4180 #endif
4181         /*
4182          * The vCPU isn't in the guest; wake the vCPU in case it is blocking,
4183          * otherwise do nothing as KVM will grab the highest priority pending
4184          * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
4185          */
4186         kvm_vcpu_wake_up(vcpu);
4187 }
4188
4189 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
4190                                                 int vector)
4191 {
4192         struct vcpu_vmx *vmx = to_vmx(vcpu);
4193
4194         if (is_guest_mode(vcpu) &&
4195             vector == vmx->nested.posted_intr_nv) {
4196                 /*
4197                  * If the posted interrupt is not recognized by hardware,
4198                  * it will be delivered on the next vmentry.
4199                  */
4200                 vmx->nested.pi_pending = true;
4201                 kvm_make_request(KVM_REQ_EVENT, vcpu);
4202
4203                 /*
4204                  * This pairs with the smp_mb_*() after setting vcpu->mode in
4205                  * vcpu_enter_guest() to guarantee the vCPU sees the event
4206                  * request if triggering a posted interrupt "fails" because
4207                  * vcpu->mode != IN_GUEST_MODE.  The extra barrier is needed as
4208                  * the smp_wmb() in kvm_make_request() only ensures everything
4209                  * done before making the request is visible when the request
4210                  * is visible, it doesn't ensure ordering between the store to
4211                  * vcpu->requests and the load from vcpu->mode.
4212                  */
4213                 smp_mb__after_atomic();
4214
4215                 /* the PIR and ON have been set by L1. */
4216                 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
4217                 return 0;
4218         }
4219         return -1;
4220 }
4221 /*
4222  * Send an interrupt to a vCPU via posted interrupts.
4223  * 1. If the target vCPU is running (non-root mode), send a posted-interrupt
4224  * notification and hardware will sync the PIR to the vIRR atomically.
4225  * 2. If the target vCPU isn't running (root mode), kick it to pick up the
4226  * interrupt from the PIR on the next VM-Entry.
4227  */
4228 static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
4229 {
4230         struct vcpu_vmx *vmx = to_vmx(vcpu);
4231         int r;
4232
4233         r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
4234         if (!r)
4235                 return 0;
4236
4237         /* Note, this is called iff the local APIC is in-kernel. */
4238         if (!vcpu->arch.apic->apicv_active)
4239                 return -1;
4240
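        /* Nothing more to do if the vector is already pending in the PIR. */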
4241         if (pi_test_and_set_pir(vector, &vmx->pi_desc))
4242                 return 0;
4243
4244         /* If a previous notification has sent the IPI, nothing to do.  */
4245         if (pi_test_and_set_on(&vmx->pi_desc))
4246                 return 0;
4247
4248         /*
4249          * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*()
4250          * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is
4251          * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
4252          * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
4253          */
4254         kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
4255         return 0;
4256 }
4257
4258 static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
4259                                   int trig_mode, int vector)
4260 {
4261         struct kvm_vcpu *vcpu = apic->vcpu;
4262
4263         if (vmx_deliver_posted_interrupt(vcpu, vector)) {
4264                 kvm_lapic_set_irr(vector, apic);
4265                 kvm_make_request(KVM_REQ_EVENT, vcpu);
4266                 kvm_vcpu_kick(vcpu);
4267         } else {
4268                 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
4269                                            trig_mode, vector);
4270         }
4271 }
4272
4273 /*
4274  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
4275  * will not change in the lifetime of the guest.
4276  * Note that host-state that does change is set elsewhere. E.g., host-state
4277  * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
4278  */
4279 void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
4280 {
4281         u32 low32, high32;
4282         unsigned long tmpl;
4283         unsigned long cr0, cr3, cr4;
4284
4285         cr0 = read_cr0();
4286         WARN_ON(cr0 & X86_CR0_TS);
4287         vmcs_writel(HOST_CR0, cr0);  /* 22.2.3 */
4288
4289         /*
4290          * Save the most likely value for this task's CR3 in the VMCS.
4291          * We can't use __get_current_cr3_fast() because we're not atomic.
4292          */
4293         cr3 = __read_cr3();
4294         vmcs_writel(HOST_CR3, cr3);             /* 22.2.3  FIXME: shadow tables */
4295         vmx->loaded_vmcs->host_state.cr3 = cr3;
4296
4297         /* Save the most likely value for this task's CR4 in the VMCS. */
4298         cr4 = cr4_read_shadow();
4299         vmcs_writel(HOST_CR4, cr4);                     /* 22.2.3, 22.2.5 */
4300         vmx->loaded_vmcs->host_state.cr4 = cr4;
4301
4302         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
4303 #ifdef CONFIG_X86_64
4304         /*
4305          * Load null selectors, so we can avoid reloading them in
4306          * vmx_prepare_switch_to_host(), in case userspace uses
4307          * the null selectors too (the expected case).
4308          */
4309         vmcs_write16(HOST_DS_SELECTOR, 0);
4310         vmcs_write16(HOST_ES_SELECTOR, 0);
4311 #else
4312         vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
4313         vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
4314 #endif
4315         vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
4316         vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
4317
4318         vmcs_writel(HOST_IDTR_BASE, host_idt_base);   /* 22.2.4 */
4319
4320         vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */
4321
4322         rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
4323         vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
4324
4325         /*
4326          * SYSENTER is used for 32-bit system calls on either 32-bit or
4327          * 64-bit kernels.  It is always zero if neither is allowed, otherwise
4328          * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may
4329          * have already done so!).
4330          */
4331         if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
4332                 vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
4333
4334         rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
4335         vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
4336
4337         if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
4338                 rdmsr(MSR_IA32_CR_PAT, low32, high32);
4339                 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
4340         }
4341
4342         if (cpu_has_load_ia32_efer())
4343                 vmcs_write64(HOST_IA32_EFER, host_efer);
4344 }
4345
4346 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
4347 {
4348         struct kvm_vcpu *vcpu = &vmx->vcpu;
4349
4350         vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &
4351                                           ~vcpu->arch.cr4_guest_rsvd_bits;
4352         if (!enable_ept) {
4353                 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS;
4354                 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS;
4355         }
4356         if (is_guest_mode(&vmx->vcpu))
4357                 vcpu->arch.cr4_guest_owned_bits &=
4358                         ~get_vmcs12(vcpu)->cr4_guest_host_mask;
4359         vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);
4360 }
4361
4362 static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
4363 {
4364         u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
4365
4366         if (!kvm_vcpu_apicv_active(&vmx->vcpu))
4367                 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
4368
4369         if (!enable_vnmi)
4370                 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
4371
4372         if (!enable_preemption_timer)
4373                 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
4374
4375         return pin_based_exec_ctrl;
4376 }
4377
4378 static u32 vmx_vmentry_ctrl(void)
4379 {
4380         u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
4381
4382         if (vmx_pt_mode_is_system())
4383                 vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
4384                                   VM_ENTRY_LOAD_IA32_RTIT_CTL);
4385         /*
4386          * IA32e mode and the loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically.
4387          */
4388         vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
4389                           VM_ENTRY_LOAD_IA32_EFER |
4390                           VM_ENTRY_IA32E_MODE);
4391
4392         if (cpu_has_perf_global_ctrl_bug())
4393                 vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4394
4395         return vmentry_ctrl;
4396 }
4397
4398 static u32 vmx_vmexit_ctrl(void)
4399 {
4400         u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
4401
4402         /*
4403          * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for
4404          * nested virtualization and thus allowed to be set in vmcs12.
4405          */
4406         vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER |
4407                          VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
4408
4409         if (vmx_pt_mode_is_system())
4410                 vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
4411                                  VM_EXIT_CLEAR_IA32_RTIT_CTL);
4412
4413         if (cpu_has_perf_global_ctrl_bug())
4414                 vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
4415
4416         /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
4417         return vmexit_ctrl &
4418                 ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
4419 }
4420
4421 static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
4422 {
4423         struct vcpu_vmx *vmx = to_vmx(vcpu);
4424
4425         if (is_guest_mode(vcpu)) {
4426                 vmx->nested.update_vmcs01_apicv_status = true;
4427                 return;
4428         }
4429
4430         pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
4431
4432         if (kvm_vcpu_apicv_active(vcpu)) {
4433                 secondary_exec_controls_setbit(vmx,
4434                                                SECONDARY_EXEC_APIC_REGISTER_VIRT |
4435                                                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4436                 if (enable_ipiv)
4437                         tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT);
4438         } else {
4439                 secondary_exec_controls_clearbit(vmx,
4440                                                  SECONDARY_EXEC_APIC_REGISTER_VIRT |
4441                                                  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4442                 if (enable_ipiv)
4443                         tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT);
4444         }
4445
4446         vmx_update_msr_bitmap_x2apic(vcpu);
4447 }
4448
4449 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
4450 {
4451         u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
4452
4453         /*
4454          * Not used by KVM, but fully supported for nesting, i.e. allowed in
4455          * vmcs12 and propagated to vmcs02 when set in vmcs12.
4456          */
4457         exec_control &= ~(CPU_BASED_RDTSC_EXITING |
4458                           CPU_BASED_USE_IO_BITMAPS |
4459                           CPU_BASED_MONITOR_TRAP_FLAG |
4460                           CPU_BASED_PAUSE_EXITING);
4461
4462         /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */
4463         exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING |
4464                           CPU_BASED_NMI_WINDOW_EXITING);
4465
4466         if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
4467                 exec_control &= ~CPU_BASED_MOV_DR_EXITING;
4468
4469         if (!cpu_need_tpr_shadow(&vmx->vcpu))
4470                 exec_control &= ~CPU_BASED_TPR_SHADOW;
4471
4472 #ifdef CONFIG_X86_64
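        /*
         * With a TPR shadow, CR8 accesses don't need to exit; without one,
         * intercept both CR8 loads and stores.
         */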
4473         if (exec_control & CPU_BASED_TPR_SHADOW)
4474                 exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING |
4475                                   CPU_BASED_CR8_STORE_EXITING);
4476         else
4477                 exec_control |= CPU_BASED_CR8_STORE_EXITING |
4478                                 CPU_BASED_CR8_LOAD_EXITING;
4479 #endif
4480         /* No need to intercept CR3 access or INVLPG when using EPT. */
4481         if (enable_ept)
4482                 exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
4483                                   CPU_BASED_CR3_STORE_EXITING |
4484                                   CPU_BASED_INVLPG_EXITING);
4485         if (kvm_mwait_in_guest(vmx->vcpu.kvm))
4486                 exec_control &= ~(CPU_BASED_MWAIT_EXITING |
4487                                 CPU_BASED_MONITOR_EXITING);
4488         if (kvm_hlt_in_guest(vmx->vcpu.kvm))
4489                 exec_control &= ~CPU_BASED_HLT_EXITING;
4490         return exec_control;
4491 }
4492
4493 static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx)
4494 {
4495         u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl;
4496
4497         /*
4498          * IPI virtualization relies on APICv. Disable IPI virtualization if
4499          * APICv is inhibited.
4500          */
4501         if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu))
4502                 exec_control &= ~TERTIARY_EXEC_IPI_VIRT;
4503
4504         return exec_control;
4505 }
4506
4507 /*
4508  * Adjust a single secondary execution control bit to intercept/allow an
4509  * instruction in the guest.  This is usually done based on whether or not a
4510  * feature has been exposed to the guest in order to correctly emulate faults.
4511  */
4512 static inline void
4513 vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
4514                                   u32 control, bool enabled, bool exiting)
4515 {
4516         /*
4517          * If the control is for an opt-in feature, clear the control if the
4518          * feature is not exposed to the guest, i.e. not enabled.  If the
4519          * control is opt-out, i.e. an exiting control, clear the control if
4520          * the feature _is_ exposed to the guest, i.e. exiting/interception is
4521          * disabled for the associated instruction.  Note, the caller is
4522          * responsible for presetting exec_control to set all supported bits.
4523          */
4524         if (enabled == exiting)
4525                 *exec_control &= ~control;
4526
4527         /*
4528          * Update the nested MSR settings so that a nested VMM can/can't set
4529          * controls for features that are/aren't exposed to the guest.
4530          */
4531         if (nested) {
4532                 /*
4533                  * All features that can be added to or removed from VMX MSRs must
4534                  * be supported in the first place for nested virtualization.
4535                  */
4536                 if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control)))
4537                         enabled = false;
4538
4539                 if (enabled)
4540                         vmx->nested.msrs.secondary_ctls_high |= control;
4541                 else
4542                         vmx->nested.msrs.secondary_ctls_high &= ~control;
4543         }
4544 }
4545
4546 /*
4547  * Wrapper macro for the common case of adjusting a secondary execution control
4548  * based on a single guest CPUID bit, with a dedicated feature bit.  This also
4549  * verifies that the control is actually supported by KVM and hardware.
4550  */
4551 #define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
4552 ({                                                                       \
4553         bool __enabled;                                                  \
4554                                                                          \
4555         if (cpu_has_vmx_##name()) {                                      \
4556                 __enabled = guest_cpuid_has(&(vmx)->vcpu,                \
4557                                             X86_FEATURE_##feat_name);    \
4558                 vmx_adjust_secondary_exec_control(vmx, exec_control,     \
4559                         SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \
4560         }                                                                \
4561 })
4562
4563 /* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
4564 #define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
4565         vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)
4566
4567 #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
4568         vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
4569
4570 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
4571 {
4572         struct kvm_vcpu *vcpu = &vmx->vcpu;
4573
4574         u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
4575
4576         if (vmx_pt_mode_is_system())
4577                 exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
4578         if (!cpu_need_virtualize_apic_accesses(vcpu))
4579                 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
4580         if (vmx->vpid == 0)
4581                 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
4582         if (!enable_ept) {
4583                 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
4584                 enable_unrestricted_guest = 0;
4585         }
4586         if (!enable_unrestricted_guest)
4587                 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
4588         if (kvm_pause_in_guest(vmx->vcpu.kvm))
4589                 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
4590         if (!kvm_vcpu_apicv_active(vcpu))
4591                 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
4592                                   SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4593         exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
4594
4595         /*
4596          * KVM doesn't support VMFUNC for L1, but the control is set in KVM's
4597          * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2.
4598          */
4599         exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
4600
4601         /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
4602          * in vmx_set_cr4.  */
4603         exec_control &= ~SECONDARY_EXEC_DESC;
4604
4605         /*
4606          * SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
4607          * (handle_vmptrld).  We can NOT enable shadow_vmcs here because
4608          * we don't yet have a current VMCS12.
4609          */
4610         exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
4611
4612         /*
4613          * PML is enabled/disabled when dirty logging of memslots changes, but
4614          * it needs to be set here when dirty logging is already active, e.g.
4615          * if this vCPU was created after dirty logging was enabled.
4616          */
4617         if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
4618                 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
4619
4620         if (cpu_has_vmx_xsaves()) {
4621                 /* Exposing XSAVES only when XSAVE is exposed */
4622                 bool xsaves_enabled =
4623                         boot_cpu_has(X86_FEATURE_XSAVE) &&
4624                         guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
4625                         guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
4626
4627                 vcpu->arch.xsaves_enabled = xsaves_enabled;
4628
4629                 vmx_adjust_secondary_exec_control(vmx, &exec_control,
4630                                                   SECONDARY_EXEC_XSAVES,
4631                                                   xsaves_enabled, false);
4632         }
4633
4634         /*
4635          * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
4636          * feature is exposed to the guest.  This creates a virtualization hole
4637          * if both are supported in hardware but only one is exposed to the
4638          * guest, but letting the guest execute RDTSCP or RDPID when either one
4639          * is advertised is preferable to emulating the advertised instruction
4640          * in KVM on #UD, and obviously better than incorrectly injecting #UD.
4641          */
4642         if (cpu_has_vmx_rdtscp()) {
4643                 bool rdpid_or_rdtscp_enabled =
4644                         guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
4645                         guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
4646
4647                 vmx_adjust_secondary_exec_control(vmx, &exec_control,
4648                                                   SECONDARY_EXEC_ENABLE_RDTSCP,
4649                                                   rdpid_or_rdtscp_enabled, false);
4650         }
4651         vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
4652
4653         vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
4654         vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);
4655
4656         vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
4657                                     ENABLE_USR_WAIT_PAUSE, false);
4658
4659         if (!vcpu->kvm->arch.bus_lock_detection_enabled)
4660                 exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
4661
4662         if (!kvm_notify_vmexit_enabled(vcpu->kvm))
4663                 exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;
4664
4665         return exec_control;
4666 }
4667
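/*
 * Page order needed for the IPI virtualization PID-pointer table, which has
 * one 64-bit entry per possible vCPU ID.
 */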
4668 static inline int vmx_get_pid_table_order(struct kvm *kvm)
4669 {
4670         return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table));
4671 }
4672
4673 static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
4674 {
4675         struct page *pages;
4676         struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
4677
4678         if (!irqchip_in_kernel(kvm) || !enable_ipiv)
4679                 return 0;
4680
4681         if (kvm_vmx->pid_table)
4682                 return 0;
4683
4684         pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm));
4685         if (!pages)
4686                 return -ENOMEM;
4687
4688         kvm_vmx->pid_table = (void *)page_address(pages);
4689         return 0;
4690 }
4691
4692 static int vmx_vcpu_precreate(struct kvm *kvm)
4693 {
4694         return vmx_alloc_ipiv_pid_table(kvm);
4695 }
4696
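/*
 * A zero XSS-exiting bitmap means XSAVES/XRSTORS never cause a VM-Exit based
 * on the guest's IA32_XSS value.
 */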
4697 #define VMX_XSS_EXIT_BITMAP 0
4698
4699 static void init_vmcs(struct vcpu_vmx *vmx)
4700 {
4701         struct kvm *kvm = vmx->vcpu.kvm;
4702         struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
4703
4704         if (nested)
4705                 nested_vmx_set_vmcs_shadowing_bitmap();
4706
4707         if (cpu_has_vmx_msr_bitmap())
4708                 vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
4709
4710         vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */
4711
4712         /* Control */
4713         pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
4714
4715         exec_controls_set(vmx, vmx_exec_control(vmx));
4716
4717         if (cpu_has_secondary_exec_ctrls())
4718                 secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
4719
4720         if (cpu_has_tertiary_exec_ctrls())
4721                 tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
4722
4723         if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
4724                 vmcs_write64(EOI_EXIT_BITMAP0, 0);
4725                 vmcs_write64(EOI_EXIT_BITMAP1, 0);
4726                 vmcs_write64(EOI_EXIT_BITMAP2, 0);
4727                 vmcs_write64(EOI_EXIT_BITMAP3, 0);
4728
4729                 vmcs_write16(GUEST_INTR_STATUS, 0);
4730
4731                 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
4732                 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
4733         }
4734
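        /* Point hardware at the per-VM PID-pointer table for IPI virtualization. */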
4735         if (vmx_can_use_ipiv(&vmx->vcpu)) {
4736                 vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table));
4737                 vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1);
4738         }
4739
4740         if (!kvm_pause_in_guest(kvm)) {
4741                 vmcs_write32(PLE_GAP, ple_gap);
4742                 vmx->ple_window = ple_window;
4743                 vmx->ple_window_dirty = true;
4744         }
4745
4746         if (kvm_notify_vmexit_enabled(kvm))
4747                 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
4748
4749         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
4750         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
4751         vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
4752
4753         vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
4754         vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
4755         vmx_set_constant_host_state(vmx);
4756         vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
4757         vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
4758
4759         if (cpu_has_vmx_vmfunc())
4760                 vmcs_write64(VM_FUNCTION_CONTROL, 0);
4761
4762         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
4763         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
4764         vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
4765         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
4766         vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
4767
4768         if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
4769                 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
4770
4771         vm_exit_controls_set(vmx, vmx_vmexit_ctrl());
4772
4773         /* 22.2.1, 20.8.1 */
4774         vm_entry_controls_set(vmx, vmx_vmentry_ctrl());
4775
4776         vmx->vcpu.arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
4777         vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);
4778
4779         set_cr4_guest_host_mask(vmx);
4780
4781         if (vmx->vpid != 0)
4782                 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
4783
4784         if (cpu_has_vmx_xsaves())
4785                 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
4786
4787         if (enable_pml) {
4788                 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
4789                 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
4790         }
4791
4792         vmx_write_encls_bitmap(&vmx->vcpu, NULL);
4793
4794         if (vmx_pt_mode_is_host_guest()) {
4795                 memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
4796                 /* Bits[6:0] are forced to 1; writes are ignored. */
4797                 vmx->pt_desc.guest.output_mask = 0x7F;
4798                 vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
4799         }
4800
4801         vmcs_write32(GUEST_SYSENTER_CS, 0);
4802         vmcs_writel(GUEST_SYSENTER_ESP, 0);
4803         vmcs_writel(GUEST_SYSENTER_EIP, 0);
4804         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4805
4806         if (cpu_has_vmx_tpr_shadow()) {
4807                 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
4808                 if (cpu_need_tpr_shadow(&vmx->vcpu))
4809                         vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
4810                                      __pa(vmx->vcpu.arch.apic->regs));
4811                 vmcs_write32(TPR_THRESHOLD, 0);
4812         }
4813
4814         vmx_setup_uret_msrs(vmx);
4815 }
4816
4817 static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4818 {
4819         struct vcpu_vmx *vmx = to_vmx(vcpu);
4820
4821         init_vmcs(vmx);
4822
4823         if (nested)
4824                 memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
4825
4826         vcpu_setup_sgx_lepubkeyhash(vcpu);
4827
4828         vmx->nested.posted_intr_nv = -1;
4829         vmx->nested.vmxon_ptr = INVALID_GPA;
4830         vmx->nested.current_vmptr = INVALID_GPA;
4831         vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
4832
4833         vcpu->arch.microcode_version = 0x100000000ULL;
4834         vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
4835
4836         /*
4837          * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
4838          * or POSTED_INTR_WAKEUP_VECTOR.
4839          */
4840         vmx->pi_desc.nv = POSTED_INTR_VECTOR;
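        /* Suppress posted-interrupt notifications (SN=1) until the vCPU is loaded. */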
4841         vmx->pi_desc.sn = 1;
4842 }
4843
4844 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
4845 {
4846         struct vcpu_vmx *vmx = to_vmx(vcpu);
4847
4848         if (!init_event)
4849                 __vmx_vcpu_reset(vcpu);
4850
4851         vmx->rmode.vm86_active = 0;
4852         vmx->spec_ctrl = 0;
4853
4854         vmx->msr_ia32_umwait_control = 0;
4855
4856         vmx->hv_deadline_tsc = -1;
4857         kvm_set_cr8(vcpu, 0);
4858
4859         vmx_segment_cache_clear(vmx);
4860         kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);
4861
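        /*
         * Architectural reset state: CS = 0xF000 with base 0xFFFF0000, so that
         * CS:RIP (RIP = 0xFFF0, set by common x86 code) hits the reset vector
         * at 0xFFFFFFF0.
         */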
4862         seg_setup(VCPU_SREG_CS);
4863         vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
4864         vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
4865
4866         seg_setup(VCPU_SREG_DS);
4867         seg_setup(VCPU_SREG_ES);
4868         seg_setup(VCPU_SREG_FS);
4869         seg_setup(VCPU_SREG_GS);
4870         seg_setup(VCPU_SREG_SS);
4871
4872         vmcs_write16(GUEST_TR_SELECTOR, 0);
4873         vmcs_writel(GUEST_TR_BASE, 0);
4874         vmcs_write32(GUEST_TR_LIMIT, 0xffff);
4875         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
4876
4877         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
4878         vmcs_writel(GUEST_LDTR_BASE, 0);
4879         vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
4880         vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
4881
4882         vmcs_writel(GUEST_GDTR_BASE, 0);
4883         vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
4884
4885         vmcs_writel(GUEST_IDTR_BASE, 0);
4886         vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
4887
4888         vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
4889         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
4890         vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
4891         if (kvm_mpx_supported())
4892                 vmcs_write64(GUEST_BNDCFGS, 0);
4893
4894         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
4895
4896         kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
4897
4898         vpid_sync_context(vmx->vpid);
4899
4900         vmx_update_fb_clear_dis(vcpu, vmx);
4901 }
4902
4903 static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
4904 {
4905         exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
4906 }
4907
4908 static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
4909 {
4910         if (!enable_vnmi ||
4911             vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
4912                 vmx_enable_irq_window(vcpu);
4913                 return;
4914         }
4915
4916         exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
4917 }
4918
4919 static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
4920 {
4921         struct vcpu_vmx *vmx = to_vmx(vcpu);
4922         uint32_t intr;
4923         int irq = vcpu->arch.interrupt.nr;
4924
4925         trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected);
4926
4927         ++vcpu->stat.irq_injections;
4928         if (vmx->rmode.vm86_active) {
4929                 int inc_eip = 0;
4930                 if (vcpu->arch.interrupt.soft)
4931                         inc_eip = vcpu->arch.event_exit_inst_len;
4932                 kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
4933                 return;
4934         }
4935         intr = irq | INTR_INFO_VALID_MASK;
4936         if (vcpu->arch.interrupt.soft) {
4937                 intr |= INTR_TYPE_SOFT_INTR;
4938                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
4939                              vmx->vcpu.arch.event_exit_inst_len);
4940         } else
4941                 intr |= INTR_TYPE_EXT_INTR;
4942         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
4943
4944         vmx_clear_hlt(vcpu);
4945 }
4946
4947 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
4948 {
4949         struct vcpu_vmx *vmx = to_vmx(vcpu);
4950
4951         if (!enable_vnmi) {
4952                 /*
4953                  * Tracking the NMI-blocked state in software is built upon
4954                  * finding the next open IRQ window. This, in turn, depends on
4955                  * well-behaving guests: They have to keep IRQs disabled at
4956                  * least as long as the NMI handler runs. Otherwise we may
4957                  * cause NMI nesting, maybe breaking the guest. But as this is
4958                  * highly unlikely, we can live with the residual risk.
4959                  */
4960                 vmx->loaded_vmcs->soft_vnmi_blocked = 1;
4961                 vmx->loaded_vmcs->vnmi_blocked_time = 0;
4962         }
4963
4964         ++vcpu->stat.nmi_injections;
4965         vmx->loaded_vmcs->nmi_known_unmasked = false;
4966
4967         if (vmx->rmode.vm86_active) {
4968                 kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
4969                 return;
4970         }
4971
4972         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
4973                         INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
4974
4975         vmx_clear_hlt(vcpu);
4976 }
4977
4978 bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
4979 {
4980         struct vcpu_vmx *vmx = to_vmx(vcpu);
4981         bool masked;
4982
4983         if (!enable_vnmi)
4984                 return vmx->loaded_vmcs->soft_vnmi_blocked;
4985         if (vmx->loaded_vmcs->nmi_known_unmasked)
4986                 return false;
4987         masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
4988         vmx->loaded_vmcs->nmi_known_unmasked = !masked;
4989         return masked;
4990 }
4991
4992 void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
4993 {
4994         struct vcpu_vmx *vmx = to_vmx(vcpu);
4995
4996         if (!enable_vnmi) {
4997                 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
4998                         vmx->loaded_vmcs->soft_vnmi_blocked = masked;
4999                         vmx->loaded_vmcs->vnmi_blocked_time = 0;
5000                 }
5001         } else {
5002                 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
5003                 if (masked)
5004                         vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
5005                                       GUEST_INTR_STATE_NMI);
5006                 else
5007                         vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
5008                                         GUEST_INTR_STATE_NMI);
5009         }
5010 }
5011
5012 bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
5013 {
5014         if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
5015                 return false;
5016
5017         if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
5018                 return true;
5019
5020         return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5021                 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
5022                  GUEST_INTR_STATE_NMI));
5023 }
5024
5025 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
5026 {
5027         if (to_vmx(vcpu)->nested.nested_run_pending)
5028                 return -EBUSY;
5029
5030         /* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
5031         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
5032                 return -EBUSY;
5033
5034         return !vmx_nmi_blocked(vcpu);
5035 }
5036
5037 bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
5038 {
5039         if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
5040                 return false;
5041
5042         return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
5043                (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5044                 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
5045 }
5046
5047 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
5048 {
5049         if (to_vmx(vcpu)->nested.nested_run_pending)
5050                 return -EBUSY;
5051
5052         /*
5053          * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
5054          * e.g. if the IRQ arrived asynchronously after checking nested events.
5055          */
5056         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
5057                 return -EBUSY;
5058
5059         return !vmx_interrupt_blocked(vcpu);
5060 }
5061
5062 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
5063 {
5064         void __user *ret;
5065
5066         if (enable_unrestricted_guest)
5067                 return 0;
5068
5069         mutex_lock(&kvm->slots_lock);
5070         ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
5071                                       PAGE_SIZE * 3);
5072         mutex_unlock(&kvm->slots_lock);
5073
5074         if (IS_ERR(ret))
5075                 return PTR_ERR(ret);
5076
5077         to_kvm_vmx(kvm)->tss_addr = addr;
5078
5079         return init_rmode_tss(kvm, ret);
5080 }
5081
5082 static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
5083 {
5084         to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
5085         return 0;
5086 }
5087
5088 static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
5089 {
5090         switch (vec) {
5091         case BP_VECTOR:
5092                 /*
5093                  * Update instruction length as we may reinject the exception
5094                  * from user space while in guest debugging mode.
5095                  */
5096                 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
5097                         vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
5098                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5099                         return false;
5100                 fallthrough;
5101         case DB_VECTOR:
5102                 return !(vcpu->guest_debug &
5103                         (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP));
5104         case DE_VECTOR:
5105         case OF_VECTOR:
5106         case BR_VECTOR:
5107         case UD_VECTOR:
5108         case DF_VECTOR:
5109         case SS_VECTOR:
5110         case GP_VECTOR:
5111         case MF_VECTOR:
5112                 return true;
5113         }
5114         return false;
5115 }
5116
5117 static int handle_rmode_exception(struct kvm_vcpu *vcpu,
5118                                   int vec, u32 err_code)
5119 {
5120         /*
5121          * An instruction with the address-size override prefix (opcode 0x67)
5122          * causes a #SS fault with a zero error code in VM86 mode.
5123          */
5124         if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
5125                 if (kvm_emulate_instruction(vcpu, 0)) {
5126                         if (vcpu->arch.halt_request) {
5127                                 vcpu->arch.halt_request = 0;
5128                                 return kvm_emulate_halt_noskip(vcpu);
5129                         }
5130                         return 1;
5131                 }
5132                 return 0;
5133         }
5134
5135         /*
5136          * Forward all other exceptions that are valid in real mode.
5137          * FIXME: Breaks guest debugging in real mode, needs to be fixed with
5138          *        the required debugging infrastructure rework.
5139          */
5140         kvm_queue_exception(vcpu, vec);
5141         return 1;
5142 }
5143
5144 static int handle_machine_check(struct kvm_vcpu *vcpu)
5145 {
5146         /* handled by vmx_vcpu_run() */
5147         return 1;
5148 }
5149
5150 /*
5151  * If the host has split lock detection disabled, then #AC is
5152  * unconditionally injected into the guest, which is the pre split lock
5153  * unconditionally injected into the guest, which is the pre-split-lock-detection
5154  * behaviour.
5155  * If the host has split lock detection enabled then #AC is
5156  * only injected into the guest when:
5157  *  - Guest CPL == 3 (user mode)
5158  *  - Guest has #AC detection enabled in CR0
5159  *  - Guest EFLAGS has AC bit set
5160  */
5161 bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
5162 {
5163         if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
5164                 return true;
5165
5166         return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) &&
5167                (kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
5168 }
5169
5170 static int handle_exception_nmi(struct kvm_vcpu *vcpu)
5171 {
5172         struct vcpu_vmx *vmx = to_vmx(vcpu);
5173         struct kvm_run *kvm_run = vcpu->run;
5174         u32 intr_info, ex_no, error_code;
5175         unsigned long cr2, dr6;
5176         u32 vect_info;
5177
5178         vect_info = vmx->idt_vectoring_info;
5179         intr_info = vmx_get_intr_info(vcpu);
5180
5181         /*
5182          * Machine checks are handled by handle_exception_irqoff(), or by
5183          * vmx_vcpu_run() if a #MC occurs on VM-Entry.  NMIs are handled by
5184          * vmx_vcpu_enter_exit().
5185          */
5186         if (is_machine_check(intr_info) || is_nmi(intr_info))
5187                 return 1;
5188
5189         /*
5190          * Queue the exception here instead of in handle_nm_fault_irqoff().
5191          * This ensures the nested_vmx check is not skipped so vmexit can
5192          * be reflected to L1 (when it intercepts #NM) before reaching this
5193          * point.
5194          */
5195         if (is_nm_fault(intr_info)) {
5196                 kvm_queue_exception(vcpu, NM_VECTOR);
5197                 return 1;
5198         }
5199
5200         if (is_invalid_opcode(intr_info))
5201                 return handle_ud(vcpu);
5202
5203         error_code = 0;
5204         if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
5205                 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
5206
5207         if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
5208                 WARN_ON_ONCE(!enable_vmware_backdoor);
5209
5210                 /*
5211                  * VMware backdoor emulation on #GP interception only handles
5212                  * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
5213                  * error code on #GP.
5214                  */
5215                 if (error_code) {
5216                         kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
5217                         return 1;
5218                 }
5219                 return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
5220         }
5221
5222         /*
5223          * A #PF with PFEC.RSVD = 1 indicates the guest is accessing
5224          * MMIO; it is better to report an internal error.
5225          * See the comments in vmx_handle_exit.
5226          */
5227         if ((vect_info & VECTORING_INFO_VALID_MASK) &&
5228             !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
5229                 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5230                 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
5231                 vcpu->run->internal.ndata = 4;
5232                 vcpu->run->internal.data[0] = vect_info;
5233                 vcpu->run->internal.data[1] = intr_info;
5234                 vcpu->run->internal.data[2] = error_code;
5235                 vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
5236                 return 0;
5237         }
5238
5239         if (is_page_fault(intr_info)) {
5240                 cr2 = vmx_get_exit_qual(vcpu);
5241                 if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
5242                         /*
5243                          * EPT will cause page fault only if we need to
5244                          * detect illegal GPAs.
5245                          */
5246                         WARN_ON_ONCE(!allow_smaller_maxphyaddr);
5247                         kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
5248                         return 1;
5249                 } else
5250                         return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
5251         }
5252
5253         ex_no = intr_info & INTR_INFO_VECTOR_MASK;
5254
5255         if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
5256                 return handle_rmode_exception(vcpu, ex_no, error_code);
5257
5258         switch (ex_no) {
5259         case DB_VECTOR:
5260                 dr6 = vmx_get_exit_qual(vcpu);
5261                 if (!(vcpu->guest_debug &
5262                       (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
5263                         /*
5264                          * If the #DB was due to ICEBP, a.k.a. INT1, skip the
5265                          * instruction.  ICEBP generates a trap-like #DB, but
5266                          * despite its interception control being tied to #DB,
5267                          * is an instruction intercept, i.e. the VM-Exit occurs
5268                          * on the ICEBP itself.  Use the inner "skip" helper to
5269                          * avoid single-step #DB and MTF updates, as ICEBP is
5270                          * higher priority.  Note, skipping ICEBP still clears
5271                          * STI and MOVSS blocking.
5272                          *
5273                          * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
5274                          * if single-step is enabled in RFLAGS and STI or MOVSS
5275                          * blocking is active, as the CPU doesn't set the bit
5276                          * on VM-Exit due to #DB interception.  VM-Entry has a
5277                          * consistency check that a single-step #DB is pending
5278                          * in this scenario as the previous instruction cannot
5279                          * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
5280                          * don't modify RFLAGS), therefore the one instruction
5281                          * delay when activating single-step breakpoints must
5282                          * have already expired.  Note, the CPU sets/clears BS
5283                          * as appropriate for all other VM-Exits types.
5284                          */
5285                         if (is_icebp(intr_info))
5286                                 WARN_ON(!skip_emulated_instruction(vcpu));
5287                         else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
5288                                  (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5289                                   (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
5290                                 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
5291                                             vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
5292
5293                         kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
5294                         return 1;
5295                 }
5296                 kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
5297                 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
5298                 fallthrough;
5299         case BP_VECTOR:
5300                 /*
5301                  * Update instruction length as we may reinject #BP from
5302                  * user space while in guest debugging mode. Reading it for
5303                  * #DB as well causes no harm; it is not used in that case.
5304                  */
5305                 vmx->vcpu.arch.event_exit_inst_len =
5306                         vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
5307                 kvm_run->exit_reason = KVM_EXIT_DEBUG;
5308                 kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
5309                 kvm_run->debug.arch.exception = ex_no;
5310                 break;
5311         case AC_VECTOR:
5312                 if (vmx_guest_inject_ac(vcpu)) {
5313                         kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
5314                         return 1;
5315                 }
5316
5317                 /*
5318                  * Handle split lock. Depending on detection mode this will
5319                  * either warn and disable split lock detection for this
5320                  * task or force SIGBUS on it.
5321                  */
5322                 if (handle_guest_split_lock(kvm_rip_read(vcpu)))
5323                         return 1;
5324                 fallthrough;
5325         default:
5326                 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
5327                 kvm_run->ex.exception = ex_no;
5328                 kvm_run->ex.error_code = error_code;
5329                 break;
5330         }
5331         return 0;
5332 }
5333
5334 static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
5335 {
5336         ++vcpu->stat.irq_exits;
5337         return 1;
5338 }
5339
5340 static int handle_triple_fault(struct kvm_vcpu *vcpu)
5341 {
5342         vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
5343         vcpu->mmio_needed = 0;
5344         return 0;
5345 }
5346
5347 static int handle_io(struct kvm_vcpu *vcpu)
5348 {
5349         unsigned long exit_qualification;
5350         int size, in, string;
5351         unsigned port;
5352
5353         exit_qualification = vmx_get_exit_qual(vcpu);
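        /*
         * Decode the exit qualification: bits 2:0 hold the access size minus
         * one, bit 3 the direction (1 = IN), bit 4 the string-instruction
         * flag, and bits 31:16 the port number.
         */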
5354         string = (exit_qualification & 16) != 0;
5355
5356         ++vcpu->stat.io_exits;
5357
5358         if (string)
5359                 return kvm_emulate_instruction(vcpu, 0);
5360
5361         port = exit_qualification >> 16;
5362         size = (exit_qualification & 7) + 1;
5363         in = (exit_qualification & 8) != 0;
5364
5365         return kvm_fast_pio(vcpu, size, port, in);
5366 }
5367
5368 static void
5369 vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
5370 {
5371         /*
5372          * Patch in the VMCALL instruction:
5373          */
5374         hypercall[0] = 0x0f;
5375         hypercall[1] = 0x01;
5376         hypercall[2] = 0xc1;
5377 }
5378
5379 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */
5380 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
5381 {
5382         if (is_guest_mode(vcpu)) {
5383                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5384                 unsigned long orig_val = val;
5385
5386                 /*
5387                  * We get here when L2 changed cr0 in a way that did not change
5388                  * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
5389                  * but did change L0 shadowed bits. So we first calculate the
5390                  * effective cr0 value that L1 would like to write into the
5391                  * hardware. It consists of the L2-owned bits from the new
5392                  * value combined with the L1-owned bits from L1's guest_cr0.
5393                  */
5394                 val = (val & ~vmcs12->cr0_guest_host_mask) |
5395                         (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
5396
5397                 if (!nested_guest_cr0_valid(vcpu, val))
5398                         return 1;
5399
5400                 if (kvm_set_cr0(vcpu, val))
5401                         return 1;
5402                 vmcs_writel(CR0_READ_SHADOW, orig_val);
5403                 return 0;
5404         } else {
5405                 if (to_vmx(vcpu)->nested.vmxon &&
5406                     !nested_host_cr0_valid(vcpu, val))
5407                         return 1;
5408
5409                 return kvm_set_cr0(vcpu, val);
5410         }
5411 }
5412
5413 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
5414 {
5415         if (is_guest_mode(vcpu)) {
5416                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5417                 unsigned long orig_val = val;
5418
5419                 /* analogously to handle_set_cr0 */
5420                 val = (val & ~vmcs12->cr4_guest_host_mask) |
5421                         (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
5422                 if (kvm_set_cr4(vcpu, val))
5423                         return 1;
5424                 vmcs_writel(CR4_READ_SHADOW, orig_val);
5425                 return 0;
5426         } else
5427                 return kvm_set_cr4(vcpu, val);
5428 }
5429
5430 static int handle_desc(struct kvm_vcpu *vcpu)
5431 {
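        /*
         * Descriptor-table exiting (SECONDARY_EXEC_DESC) is enabled only to
         * emulate UMIP, on writes to CR4.UMIP in vmx_set_cr4().
         */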
5432         WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
5433         return kvm_emulate_instruction(vcpu, 0);
5434 }
5435
5436 static int handle_cr(struct kvm_vcpu *vcpu)
5437 {
5438         unsigned long exit_qualification, val;
5439         int cr;
5440         int reg;
5441         int err;
5442         int ret;
5443
5444         exit_qualification = vmx_get_exit_qual(vcpu);
5445         cr = exit_qualification & 15;
5446         reg = (exit_qualification >> 8) & 15;
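        /*
         * Bits 5:4 of the exit qualification encode the access type:
         * 0 = MOV to CR, 1 = MOV from CR, 2 = CLTS, 3 = LMSW.
         */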
5447         switch ((exit_qualification >> 4) & 3) {
5448         case 0: /* mov to cr */
5449                 val = kvm_register_read(vcpu, reg);
5450                 trace_kvm_cr_write(cr, val);
5451                 switch (cr) {
5452                 case 0:
5453                         err = handle_set_cr0(vcpu, val);
5454                         return kvm_complete_insn_gp(vcpu, err);
5455                 case 3:
5456                         WARN_ON_ONCE(enable_unrestricted_guest);
5457
5458                         err = kvm_set_cr3(vcpu, val);
5459                         return kvm_complete_insn_gp(vcpu, err);
5460                 case 4:
5461                         err = handle_set_cr4(vcpu, val);
5462                         return kvm_complete_insn_gp(vcpu, err);
5463                 case 8: {
5464                                 u8 cr8_prev = kvm_get_cr8(vcpu);
5465                                 u8 cr8 = (u8)val;
5466                                 err = kvm_set_cr8(vcpu, cr8);
5467                                 ret = kvm_complete_insn_gp(vcpu, err);
5468                                 if (lapic_in_kernel(vcpu))
5469                                         return ret;
5470                                 if (cr8_prev <= cr8)
5471                                         return ret;
5472                                 /*
5473                                  * TODO: we might be squashing a
5474                                  * KVM_GUESTDBG_SINGLESTEP-triggered
5475                                  * KVM_EXIT_DEBUG here.
5476                                  */
5477                                 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
5478                                 return 0;
5479                         }
5480                 }
5481                 break;
5482         case 2: /* clts */
5483                 KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS");
5484                 return -EIO;
5485         case 1: /*mov from cr*/
5486                 switch (cr) {
5487                 case 3:
5488                         WARN_ON_ONCE(enable_unrestricted_guest);
5489
5490                         val = kvm_read_cr3(vcpu);
5491                         kvm_register_write(vcpu, reg, val);
5492                         trace_kvm_cr_read(cr, val);
5493                         return kvm_skip_emulated_instruction(vcpu);
5494                 case 8:
5495                         val = kvm_get_cr8(vcpu);
5496                         kvm_register_write(vcpu, reg, val);
5497                         trace_kvm_cr_read(cr, val);
5498                         return kvm_skip_emulated_instruction(vcpu);
5499                 }
5500                 break;
5501         case 3: /* lmsw */
5502                 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5503                 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
5504                 kvm_lmsw(vcpu, val);
5505
5506                 return kvm_skip_emulated_instruction(vcpu);
5507         default:
5508                 break;
5509         }
5510         vcpu->run->exit_reason = 0;
5511         vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
5512                (int)(exit_qualification >> 4) & 3, cr);
5513         return 0;
5514 }
5515
5516 static int handle_dr(struct kvm_vcpu *vcpu)
5517 {
5518         unsigned long exit_qualification;
5519         int dr, dr7, reg;
5520         int err = 1;
5521
5522         exit_qualification = vmx_get_exit_qual(vcpu);
5523         dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
5524
5525         /* First, if DR does not exist, trigger UD */
5526         if (!kvm_require_dr(vcpu, dr))
5527                 return 1;
5528
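        /*
         * MOV DR is privileged: with err still set to 1, CPL > 0 results in
         * kvm_complete_insn_gp() injecting #GP(0).
         */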
5529         if (vmx_get_cpl(vcpu) > 0)
5530                 goto out;
5531
5532         dr7 = vmcs_readl(GUEST_DR7);
5533         if (dr7 & DR7_GD) {
5534                 /*
5535                  * As the vm-exit takes precedence over the debug trap, we
5536                  * need to emulate the latter, either for the host or the
5537                  * guest debugging itself.
5538                  */
5539                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
5540                         vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW;
5541                         vcpu->run->debug.arch.dr7 = dr7;
5542                         vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
5543                         vcpu->run->debug.arch.exception = DB_VECTOR;
5544                         vcpu->run->exit_reason = KVM_EXIT_DEBUG;
5545                         return 0;
5546                 } else {
5547                         kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD);
5548                         return 1;
5549                 }
5550         }
5551
5552         if (vcpu->guest_debug == 0) {
5553                 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
5554
5555                 /*
5556                  * No more DR vmexits; force a reload of the debug registers
5557                  * and reenter on this instruction.  The next vmexit will
5558                  * retrieve the full state of the debug registers.
5559                  */
5560                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
5561                 return 1;
5562         }
5563
5564         reg = DEBUG_REG_ACCESS_REG(exit_qualification);
5565         if (exit_qualification & TYPE_MOV_FROM_DR) {
5566                 unsigned long val;
5567
5568                 kvm_get_dr(vcpu, dr, &val);
5569                 kvm_register_write(vcpu, reg, val);
5570                 err = 0;
5571         } else {
5572                 err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
5573         }
5574
5575 out:
5576         return kvm_complete_insn_gp(vcpu, err);
5577 }
5578
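/*
 * Added note (not in the original source): this is the second half of the
 * MOV-DR fast path set up in handle_dr().  When guest_debug is clear,
 * handle_dr() drops CPU_BASED_MOV_DR_EXITING and sets KVM_DEBUGREG_WONT_EXIT
 * so the guest can touch its debug registers without exiting; on the next
 * VM-exit this function reads the now-dirty hardware DR0-DR3 and DR6 back
 * into vcpu->arch and re-enables MOV-DR exiting.
 */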
5579 static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
5580 {
5581         get_debugreg(vcpu->arch.db[0], 0);
5582         get_debugreg(vcpu->arch.db[1], 1);
5583         get_debugreg(vcpu->arch.db[2], 2);
5584         get_debugreg(vcpu->arch.db[3], 3);
5585         get_debugreg(vcpu->arch.dr6, 6);
5586         vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
5587
5588         vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
5589         exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
5590
5591         /*
5592          * exc_debug expects dr6 to be cleared after it runs; make sure it
5593          * doesn't see a stale dr6 from the guest.
5594          */
5595         set_debugreg(DR6_RESERVED, 6);
5596 }
5597
5598 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
5599 {
5600         vmcs_writel(GUEST_DR7, val);
5601 }
5602
5603 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
5604 {
5605         kvm_apic_update_ppr(vcpu);
5606         return 1;
5607 }
5608
5609 static int handle_interrupt_window(struct kvm_vcpu *vcpu)
5610 {
5611         exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
5612
5613         kvm_make_request(KVM_REQ_EVENT, vcpu);
5614
5615         ++vcpu->stat.irq_window_exits;
5616         return 1;
5617 }
5618
5619 static int handle_invlpg(struct kvm_vcpu *vcpu)
5620 {
5621         unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5622
5623         kvm_mmu_invlpg(vcpu, exit_qualification);
5624         return kvm_skip_emulated_instruction(vcpu);
5625 }
5626
5627 static int handle_apic_access(struct kvm_vcpu *vcpu)
5628 {
5629         if (likely(fasteoi)) {
5630                 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5631                 int access_type, offset;
5632
5633                 access_type = exit_qualification & APIC_ACCESS_TYPE;
5634                 offset = exit_qualification & APIC_ACCESS_OFFSET;
5635                 /*
5636                  * A sane guest uses MOV to write EOI, and the written value
5637                  * is ignored, so short-circuit here and avoid heavy
5638                  * instruction emulation.
5639                  */
5640                 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
5641                     (offset == APIC_EOI)) {
5642                         kvm_lapic_set_eoi(vcpu);
5643                         return kvm_skip_emulated_instruction(vcpu);
5644                 }
5645         }
5646         return kvm_emulate_instruction(vcpu, 0);
5647 }
5648
5649 static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
5650 {
5651         unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5652         int vector = exit_qualification & 0xff;
5653
5654         /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
5655         kvm_apic_set_eoi_accelerated(vcpu, vector);
5656         return 1;
5657 }
5658
5659 static int handle_apic_write(struct kvm_vcpu *vcpu)
5660 {
5661         unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5662
5663         /*
5664          * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and
5665          * hardware has done any necessary aliasing, offset adjustments, etc...
5666          * for the access.  I.e. the correct value has already been written to
5667          * the vAPIC page for the correct 16-byte chunk.  KVM needs only to
5668          * retrieve the register value and emulate the access.
5669          */
5670         u32 offset = exit_qualification & 0xff0;
5671
5672         kvm_apic_write_nodecode(vcpu, offset);
5673         return 1;
5674 }
5675
5676 static int handle_task_switch(struct kvm_vcpu *vcpu)
5677 {
5678         struct vcpu_vmx *vmx = to_vmx(vcpu);
5679         unsigned long exit_qualification;
5680         bool has_error_code = false;
5681         u32 error_code = 0;
5682         u16 tss_selector;
5683         int reason, type, idt_v, idt_index;
5684
5685         idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
5686         idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
5687         type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
5688
5689         exit_qualification = vmx_get_exit_qual(vcpu);
5690
5691         reason = (u32)exit_qualification >> 30;
5692         if (reason == TASK_SWITCH_GATE && idt_v) {
5693                 switch (type) {
5694                 case INTR_TYPE_NMI_INTR:
5695                         vcpu->arch.nmi_injected = false;
5696                         vmx_set_nmi_mask(vcpu, true);
5697                         break;
5698                 case INTR_TYPE_EXT_INTR:
5699                 case INTR_TYPE_SOFT_INTR:
5700                         kvm_clear_interrupt_queue(vcpu);
5701                         break;
5702                 case INTR_TYPE_HARD_EXCEPTION:
5703                         if (vmx->idt_vectoring_info &
5704                             VECTORING_INFO_DELIVER_CODE_MASK) {
5705                                 has_error_code = true;
5706                                 error_code =
5707                                         vmcs_read32(IDT_VECTORING_ERROR_CODE);
5708                         }
5709                         fallthrough;
5710                 case INTR_TYPE_SOFT_EXCEPTION:
5711                         kvm_clear_exception_queue(vcpu);
5712                         break;
5713                 default:
5714                         break;
5715                 }
5716         }
5717         tss_selector = exit_qualification;
5718
5719         if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
5720                        type != INTR_TYPE_EXT_INTR &&
5721                        type != INTR_TYPE_NMI_INTR))
5722                 WARN_ON(!skip_emulated_instruction(vcpu));
5723
5724         /*
5725          * TODO: What about debug traps on tss switch?
5726          *       Are we supposed to inject them and update dr6?
5727          */
5728         return kvm_task_switch(vcpu, tss_selector,
5729                                type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
5730                                reason, has_error_code, error_code);
5731 }
5732
5733 static int handle_ept_violation(struct kvm_vcpu *vcpu)
5734 {
5735         unsigned long exit_qualification;
5736         gpa_t gpa;
5737         u64 error_code;
5738
5739         exit_qualification = vmx_get_exit_qual(vcpu);
5740
5741         /*
5742          * EPT violation happened while executing iret from NMI,
5743          * "blocked by NMI" bit has to be set before next VM entry.
5744          * There are errata that may cause this bit to not be set:
5745          * AAK134, BY25.
5746          */
5747         if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5748                         enable_vnmi &&
5749                         (exit_qualification & INTR_INFO_UNBLOCK_NMI))
5750                 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
5751
5752         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5753         trace_kvm_page_fault(vcpu, gpa, exit_qualification);
5754
5755         /* Is it a read fault? */
5756         error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
5757                      ? PFERR_USER_MASK : 0;
5758         /* Is it a write fault? */
5759         error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
5760                       ? PFERR_WRITE_MASK : 0;
5761         /* Is it a fetch fault? */
5762         error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
5763                       ? PFERR_FETCH_MASK : 0;
5764         /* ept page table entry is present? */
5765         error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
5766                       ? PFERR_PRESENT_MASK : 0;
5767
5768         error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
5769                PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
5770
5771         vcpu->arch.exit_qualification = exit_qualification;
5772
5773         /*
5774          * Check that the GPA doesn't exceed physical memory limits, as that is
5775          * a guest page fault.  We have to emulate the instruction here, because
5776          * if the illegal address is that of a paging structure, then
5777          * EPT_VIOLATION_ACC_WRITE bit is set.  Alternatively, if supported we
5778          * would also use advanced VM-exit information for EPT violations to
5779          * reconstruct the page fault error code.
5780          */
5781         if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa)))
5782                 return kvm_emulate_instruction(vcpu, 0);
5783
5784         return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
5785 }
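/*
 * Illustrative example, not in the original source, of the error-code
 * assembly above: a guest write to a GPA whose final translation exists but
 * is read-only arrives with EPT_VIOLATION_ACC_WRITE, at least one
 * EPT_VIOLATION_RWX_MASK permission bit and EPT_VIOLATION_GVA_TRANSLATED
 * set, and is therefore forwarded to the MMU as
 * PFERR_WRITE_MASK | PFERR_PRESENT_MASK | PFERR_GUEST_FINAL_MASK.  The same
 * access taken while walking the guest page tables (GVA_TRANSLATED clear)
 * would carry PFERR_GUEST_PAGE_MASK instead.
 */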
5786
5787 static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
5788 {
5789         gpa_t gpa;
5790
5791         if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
5792                 return 1;
5793
5794         /*
5795          * A nested guest cannot optimize MMIO vmexits, because we have an
5796          * nGPA here instead of the required GPA.
5797          */
5798         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5799         if (!is_guest_mode(vcpu) &&
5800             !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
5801                 trace_kvm_fast_mmio(gpa);
5802                 return kvm_skip_emulated_instruction(vcpu);
5803         }
5804
5805         return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
5806 }
5807
5808 static int handle_nmi_window(struct kvm_vcpu *vcpu)
5809 {
5810         if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))
5811                 return -EIO;
5812
5813         exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
5814         ++vcpu->stat.nmi_window_exits;
5815         kvm_make_request(KVM_REQ_EVENT, vcpu);
5816
5817         return 1;
5818 }
5819
5820 static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
5821 {
5822         struct vcpu_vmx *vmx = to_vmx(vcpu);
5823
5824         return vmx->emulation_required && !vmx->rmode.vm86_active &&
5825                (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected);
5826 }
5827
5828 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
5829 {
5830         struct vcpu_vmx *vmx = to_vmx(vcpu);
5831         bool intr_window_requested;
5832         unsigned count = 130;
5833
5834         intr_window_requested = exec_controls_get(vmx) &
5835                                 CPU_BASED_INTR_WINDOW_EXITING;
5836
5837         while (vmx->emulation_required && count-- != 0) {
5838                 if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
5839                         return handle_interrupt_window(&vmx->vcpu);
5840
5841                 if (kvm_test_request(KVM_REQ_EVENT, vcpu))
5842                         return 1;
5843
5844                 if (!kvm_emulate_instruction(vcpu, 0))
5845                         return 0;
5846
5847                 if (vmx_emulation_required_with_pending_exception(vcpu)) {
5848                         kvm_prepare_emulation_failure_exit(vcpu);
5849                         return 0;
5850                 }
5851
5852                 if (vcpu->arch.halt_request) {
5853                         vcpu->arch.halt_request = 0;
5854                         return kvm_emulate_halt_noskip(vcpu);
5855                 }
5856
5857                 /*
5858                  * Note, return 1 and not 0, vcpu_run() will invoke
5859                  * xfer_to_guest_mode() which will create a proper return
5860                  * code.
5861                  */
5862                 if (__xfer_to_guest_mode_work_pending())
5863                         return 1;
5864         }
5865
5866         return 1;
5867 }
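/*
 * Added note (not in the original source): the cap of 130 iterations in
 * handle_invalid_guest_state() above bounds how long a single exit spends
 * emulating invalid guest state before returning to the main run loop;
 * pending events, an open interrupt window, a halt request or work flagged
 * for xfer_to_guest_mode() all break out of the loop earlier.
 */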
5868
5869 static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
5870 {
5871         if (vmx_emulation_required_with_pending_exception(vcpu)) {
5872                 kvm_prepare_emulation_failure_exit(vcpu);
5873                 return 0;
5874         }
5875
5876         return 1;
5877 }
5878
5879 static void grow_ple_window(struct kvm_vcpu *vcpu)
5880 {
5881         struct vcpu_vmx *vmx = to_vmx(vcpu);
5882         unsigned int old = vmx->ple_window;
5883
5884         vmx->ple_window = __grow_ple_window(old, ple_window,
5885                                             ple_window_grow,
5886                                             ple_window_max);
5887
5888         if (vmx->ple_window != old) {
5889                 vmx->ple_window_dirty = true;
5890                 trace_kvm_ple_window_update(vcpu->vcpu_id,
5891                                             vmx->ple_window, old);
5892         }
5893 }
5894
5895 static void shrink_ple_window(struct kvm_vcpu *vcpu)
5896 {
5897         struct vcpu_vmx *vmx = to_vmx(vcpu);
5898         unsigned int old = vmx->ple_window;
5899
5900         vmx->ple_window = __shrink_ple_window(old, ple_window,
5901                                               ple_window_shrink,
5902                                               ple_window);
5903
5904         if (vmx->ple_window != old) {
5905                 vmx->ple_window_dirty = true;
5906                 trace_kvm_ple_window_update(vcpu->vcpu_id,
5907                                             vmx->ple_window, old);
5908         }
5909 }
5910
5911 /*
5912  * Indicate a busy-waiting vcpu in a spinlock. We do not enable PAUSE
5913  * exiting, so we only get here on a CPU with PAUSE-Loop-Exiting.
5914  */
5915 static int handle_pause(struct kvm_vcpu *vcpu)
5916 {
5917         if (!kvm_pause_in_guest(vcpu->kvm))
5918                 grow_ple_window(vcpu);
5919
5920         /*
5921          * Intel SDM vol. 3, ch. 25.1.3 says: the "PAUSE-loop exiting"
5922          * VM-execution control is ignored if CPL > 0. OTOH, KVM
5923          * never sets PAUSE_EXITING and only sets PLE if supported,
5924          * so the vCPU must be at CPL 0 if it gets a PAUSE exit.
5925          */
5926         kvm_vcpu_on_spin(vcpu, true);
5927         return kvm_skip_emulated_instruction(vcpu);
5928 }
5929
5930 static int handle_monitor_trap(struct kvm_vcpu *vcpu)
5931 {
5932         return 1;
5933 }
5934
5935 static int handle_invpcid(struct kvm_vcpu *vcpu)
5936 {
5937         u32 vmx_instruction_info;
5938         unsigned long type;
5939         gva_t gva;
5940         struct {
5941                 u64 pcid;
5942                 u64 gla;
5943         } operand;
5944         int gpr_index;
5945
5946         if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
5947                 kvm_queue_exception(vcpu, UD_VECTOR);
5948                 return 1;
5949         }
5950
5951         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5952         gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
5953         type = kvm_register_read(vcpu, gpr_index);
5954
5955         /* According to the Intel instruction reference, the memory operand
5956          * is read even if it isn't needed (e.g., for type==all)
5957          */
5958         if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
5959                                 vmx_instruction_info, false,
5960                                 sizeof(operand), &gva))
5961                 return 1;
5962
5963         return kvm_handle_invpcid(vcpu, type, gva);
5964 }
5965
5966 static int handle_pml_full(struct kvm_vcpu *vcpu)
5967 {
5968         unsigned long exit_qualification;
5969
5970         trace_kvm_pml_full(vcpu->vcpu_id);
5971
5972         exit_qualification = vmx_get_exit_qual(vcpu);
5973
5974         /*
5975          * PML buffer FULL happened while executing iret from NMI,
5976          * "blocked by NMI" bit has to be set before next VM entry.
5977          */
5978         if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5979                         enable_vnmi &&
5980                         (exit_qualification & INTR_INFO_UNBLOCK_NMI))
5981                 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
5982                                 GUEST_INTR_STATE_NMI);
5983
5984         /*
5985          * PML buffer already flushed at beginning of VMEXIT. Nothing to do
5986          * here, and there's no userspace involvement needed for PML.
5987          */
5988         return 1;
5989 }
5990
5991 static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
5992 {
5993         struct vcpu_vmx *vmx = to_vmx(vcpu);
5994
5995         if (!vmx->req_immediate_exit &&
5996             !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
5997                 kvm_lapic_expired_hv_timer(vcpu);
5998                 return EXIT_FASTPATH_REENTER_GUEST;
5999         }
6000
6001         return EXIT_FASTPATH_NONE;
6002 }
6003
6004 static int handle_preemption_timer(struct kvm_vcpu *vcpu)
6005 {
6006         handle_fastpath_preemption_timer(vcpu);
6007         return 1;
6008 }
6009
6010 /*
6011  * When nested=0, all VMX instruction VM Exits filter here.  The handlers
6012  * are overwritten by nested_vmx_setup() when nested=1.
6013  */
6014 static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
6015 {
6016         kvm_queue_exception(vcpu, UD_VECTOR);
6017         return 1;
6018 }
6019
6020 #ifndef CONFIG_X86_SGX_KVM
6021 static int handle_encls(struct kvm_vcpu *vcpu)
6022 {
6023         /*
6024          * SGX virtualization is disabled.  There is no software enable bit for
6025          * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
6026          * the guest from executing ENCLS (when SGX is supported by hardware).
6027          */
6028         kvm_queue_exception(vcpu, UD_VECTOR);
6029         return 1;
6030 }
6031 #endif /* CONFIG_X86_SGX_KVM */
6032
6033 static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
6034 {
6035         /*
6036          * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK
6037          * VM-Exits. Unconditionally set the flag here and leave the handling to
6038          * vmx_handle_exit().
6039          */
6040         to_vmx(vcpu)->exit_reason.bus_lock_detected = true;
6041         return 1;
6042 }
6043
6044 static int handle_notify(struct kvm_vcpu *vcpu)
6045 {
6046         unsigned long exit_qual = vmx_get_exit_qual(vcpu);
6047         bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID;
6048
6049         ++vcpu->stat.notify_window_exits;
6050
6051         /*
6052          * Notify VM exit happened while executing iret from NMI,
6053          * "blocked by NMI" bit has to be set before next VM entry.
6054          */
6055         if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI))
6056                 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6057                               GUEST_INTR_STATE_NMI);
6058
6059         if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER ||
6060             context_invalid) {
6061                 vcpu->run->exit_reason = KVM_EXIT_NOTIFY;
6062                 vcpu->run->notify.flags = context_invalid ?
6063                                           KVM_NOTIFY_CONTEXT_INVALID : 0;
6064                 return 0;
6065         }
6066
6067         return 1;
6068 }
6069
6070 /*
6071  * The exit handlers return 1 if the exit was handled fully and guest execution
6072  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
6073  * to be done to userspace and return 0.
6074  */
6075 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
6076         [EXIT_REASON_EXCEPTION_NMI]           = handle_exception_nmi,
6077         [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
6078         [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
6079         [EXIT_REASON_NMI_WINDOW]              = handle_nmi_window,
6080         [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
6081         [EXIT_REASON_CR_ACCESS]               = handle_cr,
6082         [EXIT_REASON_DR_ACCESS]               = handle_dr,
6083         [EXIT_REASON_CPUID]                   = kvm_emulate_cpuid,
6084         [EXIT_REASON_MSR_READ]                = kvm_emulate_rdmsr,
6085         [EXIT_REASON_MSR_WRITE]               = kvm_emulate_wrmsr,
6086         [EXIT_REASON_INTERRUPT_WINDOW]        = handle_interrupt_window,
6087         [EXIT_REASON_HLT]                     = kvm_emulate_halt,
6088         [EXIT_REASON_INVD]                    = kvm_emulate_invd,
6089         [EXIT_REASON_INVLPG]                  = handle_invlpg,
6090         [EXIT_REASON_RDPMC]                   = kvm_emulate_rdpmc,
6091         [EXIT_REASON_VMCALL]                  = kvm_emulate_hypercall,
6092         [EXIT_REASON_VMCLEAR]                 = handle_vmx_instruction,
6093         [EXIT_REASON_VMLAUNCH]                = handle_vmx_instruction,
6094         [EXIT_REASON_VMPTRLD]                 = handle_vmx_instruction,
6095         [EXIT_REASON_VMPTRST]                 = handle_vmx_instruction,
6096         [EXIT_REASON_VMREAD]                  = handle_vmx_instruction,
6097         [EXIT_REASON_VMRESUME]                = handle_vmx_instruction,
6098         [EXIT_REASON_VMWRITE]                 = handle_vmx_instruction,
6099         [EXIT_REASON_VMOFF]                   = handle_vmx_instruction,
6100         [EXIT_REASON_VMON]                    = handle_vmx_instruction,
6101         [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
6102         [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
6103         [EXIT_REASON_APIC_WRITE]              = handle_apic_write,
6104         [EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
6105         [EXIT_REASON_WBINVD]                  = kvm_emulate_wbinvd,
6106         [EXIT_REASON_XSETBV]                  = kvm_emulate_xsetbv,
6107         [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
6108         [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
6109         [EXIT_REASON_GDTR_IDTR]               = handle_desc,
6110         [EXIT_REASON_LDTR_TR]                 = handle_desc,
6111         [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
6112         [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
6113         [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
6114         [EXIT_REASON_MWAIT_INSTRUCTION]       = kvm_emulate_mwait,
6115         [EXIT_REASON_MONITOR_TRAP_FLAG]       = handle_monitor_trap,
6116         [EXIT_REASON_MONITOR_INSTRUCTION]     = kvm_emulate_monitor,
6117         [EXIT_REASON_INVEPT]                  = handle_vmx_instruction,
6118         [EXIT_REASON_INVVPID]                 = handle_vmx_instruction,
6119         [EXIT_REASON_RDRAND]                  = kvm_handle_invalid_op,
6120         [EXIT_REASON_RDSEED]                  = kvm_handle_invalid_op,
6121         [EXIT_REASON_PML_FULL]                = handle_pml_full,
6122         [EXIT_REASON_INVPCID]                 = handle_invpcid,
6123         [EXIT_REASON_VMFUNC]                  = handle_vmx_instruction,
6124         [EXIT_REASON_PREEMPTION_TIMER]        = handle_preemption_timer,
6125         [EXIT_REASON_ENCLS]                   = handle_encls,
6126         [EXIT_REASON_BUS_LOCK]                = handle_bus_lock_vmexit,
6127         [EXIT_REASON_NOTIFY]                  = handle_notify,
6128 };
6129
6130 static const int kvm_vmx_max_exit_handlers =
6131         ARRAY_SIZE(kvm_vmx_exit_handlers);
6132
6133 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
6134                               u64 *info1, u64 *info2,
6135                               u32 *intr_info, u32 *error_code)
6136 {
6137         struct vcpu_vmx *vmx = to_vmx(vcpu);
6138
6139         *reason = vmx->exit_reason.full;
6140         *info1 = vmx_get_exit_qual(vcpu);
6141         if (!(vmx->exit_reason.failed_vmentry)) {
6142                 *info2 = vmx->idt_vectoring_info;
6143                 *intr_info = vmx_get_intr_info(vcpu);
6144                 if (is_exception_with_error_code(*intr_info))
6145                         *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
6146                 else
6147                         *error_code = 0;
6148         } else {
6149                 *info2 = 0;
6150                 *intr_info = 0;
6151                 *error_code = 0;
6152         }
6153 }
6154
6155 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
6156 {
6157         if (vmx->pml_pg) {
6158                 __free_page(vmx->pml_pg);
6159                 vmx->pml_pg = NULL;
6160         }
6161 }
6162
6163 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
6164 {
6165         struct vcpu_vmx *vmx = to_vmx(vcpu);
6166         u64 *pml_buf;
6167         u16 pml_idx;
6168
6169         pml_idx = vmcs_read16(GUEST_PML_INDEX);
6170
6171         /* Do nothing if PML buffer is empty */
6172         if (pml_idx == (PML_ENTITY_NUM - 1))
6173                 return;
6174
6175         /* PML index always points to next available PML buffer entity */
6176         if (pml_idx >= PML_ENTITY_NUM)
6177                 pml_idx = 0;
6178         else
6179                 pml_idx++;
6180
6181         pml_buf = page_address(vmx->pml_pg);
6182         for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
6183                 u64 gpa;
6184
6185                 gpa = pml_buf[pml_idx];
6186                 WARN_ON(gpa & (PAGE_SIZE - 1));
6187                 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
6188         }
6189
6190         /* reset PML index */
6191         vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
6192 }
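/*
 * Added note (not in the original source): the CPU logs GPAs by decrementing
 * GUEST_PML_INDEX, starting from PML_ENTITY_NUM - 1 for an empty buffer, so
 * after a VM-exit the valid entries are pml_buf[pml_idx + 1] through
 * pml_buf[PML_ENTITY_NUM - 1].  The "pml_idx >= PML_ENTITY_NUM" check above
 * covers the index having wrapped below zero on a completely full buffer, in
 * which case every entry is walked.
 */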
6193
6194 static void vmx_dump_sel(char *name, uint32_t sel)
6195 {
6196         pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
6197                name, vmcs_read16(sel),
6198                vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
6199                vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
6200                vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
6201 }
6202
6203 static void vmx_dump_dtsel(char *name, uint32_t limit)
6204 {
6205         pr_err("%s                           limit=0x%08x, base=0x%016lx\n",
6206                name, vmcs_read32(limit),
6207                vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
6208 }
6209
6210 static void vmx_dump_msrs(char *name, struct vmx_msrs *m)
6211 {
6212         unsigned int i;
6213         struct vmx_msr_entry *e;
6214
6215         pr_err("MSR %s:\n", name);
6216         for (i = 0, e = m->val; i < m->nr; ++i, ++e)
6217                 pr_err("  %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);
6218 }
6219
6220 void dump_vmcs(struct kvm_vcpu *vcpu)
6221 {
6222         struct vcpu_vmx *vmx = to_vmx(vcpu);
6223         u32 vmentry_ctl, vmexit_ctl;
6224         u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
6225         u64 tertiary_exec_control;
6226         unsigned long cr4;
6227         int efer_slot;
6228
6229         if (!dump_invalid_vmcs) {
6230                 pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
6231                 return;
6232         }
6233
6234         vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
6235         vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
6236         cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
6237         pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
6238         cr4 = vmcs_readl(GUEST_CR4);
6239
6240         if (cpu_has_secondary_exec_ctrls())
6241                 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
6242         else
6243                 secondary_exec_control = 0;
6244
6245         if (cpu_has_tertiary_exec_ctrls())
6246                 tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL);
6247         else
6248                 tertiary_exec_control = 0;
6249
6250         pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
6251                vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
6252         pr_err("*** Guest State ***\n");
6253         pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
6254                vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
6255                vmcs_readl(CR0_GUEST_HOST_MASK));
6256         pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
6257                cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
6258         pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
6259         if (cpu_has_vmx_ept()) {
6260                 pr_err("PDPTR0 = 0x%016llx  PDPTR1 = 0x%016llx\n",
6261                        vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
6262                 pr_err("PDPTR2 = 0x%016llx  PDPTR3 = 0x%016llx\n",
6263                        vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
6264         }
6265         pr_err("RSP = 0x%016lx  RIP = 0x%016lx\n",
6266                vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
6267         pr_err("RFLAGS=0x%08lx         DR7 = 0x%016lx\n",
6268                vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
6269         pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
6270                vmcs_readl(GUEST_SYSENTER_ESP),
6271                vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
6272         vmx_dump_sel("CS:  ", GUEST_CS_SELECTOR);
6273         vmx_dump_sel("DS:  ", GUEST_DS_SELECTOR);
6274         vmx_dump_sel("SS:  ", GUEST_SS_SELECTOR);
6275         vmx_dump_sel("ES:  ", GUEST_ES_SELECTOR);
6276         vmx_dump_sel("FS:  ", GUEST_FS_SELECTOR);
6277         vmx_dump_sel("GS:  ", GUEST_GS_SELECTOR);
6278         vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
6279         vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
6280         vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
6281         vmx_dump_sel("TR:  ", GUEST_TR_SELECTOR);
6282         efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
6283         if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
6284                 pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
6285         else if (efer_slot >= 0)
6286                 pr_err("EFER= 0x%016llx (autoload)\n",
6287                        vmx->msr_autoload.guest.val[efer_slot].value);
6288         else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
6289                 pr_err("EFER= 0x%016llx (effective)\n",
6290                        vcpu->arch.efer | (EFER_LMA | EFER_LME));
6291         else
6292                 pr_err("EFER= 0x%016llx (effective)\n",
6293                        vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
6294         if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
6295                 pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
6296         pr_err("DebugCtl = 0x%016llx  DebugExceptions = 0x%016lx\n",
6297                vmcs_read64(GUEST_IA32_DEBUGCTL),
6298                vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
6299         if (cpu_has_load_perf_global_ctrl() &&
6300             vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
6301                 pr_err("PerfGlobCtl = 0x%016llx\n",
6302                        vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
6303         if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
6304                 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
6305         pr_err("Interruptibility = %08x  ActivityState = %08x\n",
6306                vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
6307                vmcs_read32(GUEST_ACTIVITY_STATE));
6308         if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
6309                 pr_err("InterruptStatus = %04x\n",
6310                        vmcs_read16(GUEST_INTR_STATUS));
6311         if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
6312                 vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
6313         if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
6314                 vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);
6315
6316         pr_err("*** Host State ***\n");
6317         pr_err("RIP = 0x%016lx  RSP = 0x%016lx\n",
6318                vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
6319         pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
6320                vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
6321                vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
6322                vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
6323                vmcs_read16(HOST_TR_SELECTOR));
6324         pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
6325                vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
6326                vmcs_readl(HOST_TR_BASE));
6327         pr_err("GDTBase=%016lx IDTBase=%016lx\n",
6328                vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
6329         pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
6330                vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
6331                vmcs_readl(HOST_CR4));
6332         pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
6333                vmcs_readl(HOST_IA32_SYSENTER_ESP),
6334                vmcs_read32(HOST_IA32_SYSENTER_CS),
6335                vmcs_readl(HOST_IA32_SYSENTER_EIP));
6336         if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
6337                 pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
6338         if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
6339                 pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
6340         if (cpu_has_load_perf_global_ctrl() &&
6341             vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
6342                 pr_err("PerfGlobCtl = 0x%016llx\n",
6343                        vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
6344         if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
6345                 vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
6346
6347         pr_err("*** Control State ***\n");
6348         pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
6349                cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control);
6350         pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n",
6351                pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl);
6352         pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
6353                vmcs_read32(EXCEPTION_BITMAP),
6354                vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
6355                vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
6356         pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
6357                vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
6358                vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
6359                vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
6360         pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
6361                vmcs_read32(VM_EXIT_INTR_INFO),
6362                vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
6363                vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
6364         pr_err("        reason=%08x qualification=%016lx\n",
6365                vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
6366         pr_err("IDTVectoring: info=%08x errcode=%08x\n",
6367                vmcs_read32(IDT_VECTORING_INFO_FIELD),
6368                vmcs_read32(IDT_VECTORING_ERROR_CODE));
6369         pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
6370         if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
6371                 pr_err("TSC Multiplier = 0x%016llx\n",
6372                        vmcs_read64(TSC_MULTIPLIER));
6373         if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
6374                 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
6375                         u16 status = vmcs_read16(GUEST_INTR_STATUS);
6376                         pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
6377                 }
6378                 pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
6379                 if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
6380                         pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
6381                 pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
6382         }
6383         if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
6384                 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
6385         if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
6386                 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
6387         if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
6388                 pr_err("PLE Gap=%08x Window=%08x\n",
6389                        vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
6390         if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
6391                 pr_err("Virtual processor ID = 0x%04x\n",
6392                        vmcs_read16(VIRTUAL_PROCESSOR_ID));
6393 }
6394
6395 /*
6396  * The guest has exited.  See if we can fix it or if we need userspace
6397  * assistance.
6398  */
6399 static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
6400 {
6401         struct vcpu_vmx *vmx = to_vmx(vcpu);
6402         union vmx_exit_reason exit_reason = vmx->exit_reason;
6403         u32 vectoring_info = vmx->idt_vectoring_info;
6404         u16 exit_handler_index;
6405
6406         /*
6407          * Flush the PML buffer of logged GPAs so that dirty_bitmap stays
6408          * up to date.  Another benefit is that kvm_vm_ioctl_get_dirty_log
6409          * only needs to kick all vCPUs out of guest mode before querying
6410          * dirty_bitmap, since once a vCPU is back in root mode its PML
6411          * buffer has already been flushed.  Note, PML is never enabled in
6412          * hardware while running L2.
6413          */
6414         if (enable_pml && !is_guest_mode(vcpu))
6415                 vmx_flush_pml_buffer(vcpu);
6416
6417         /*
6418          * KVM should never reach this point with a pending nested VM-Enter.
6419          * More specifically, short-circuiting VM-Entry to emulate L2 due to
6420          * invalid guest state should never happen as that means KVM knowingly
6421          * allowed a nested VM-Enter with an invalid vmcs12.  More below.
6422          */
6423         if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
6424                 return -EIO;
6425
6426         if (is_guest_mode(vcpu)) {
6427                 /*
6428                  * PML is never enabled when running L2, bail immediately if a
6429                  * PML full exit occurs as something is horribly wrong.
6430                  */
6431                 if (exit_reason.basic == EXIT_REASON_PML_FULL)
6432                         goto unexpected_vmexit;
6433
6434                 /*
6435                  * The host physical addresses of some pages of guest memory
6436                  * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
6437                  * Page). The CPU may write to these pages via their host
6438                  * physical address while L2 is running, bypassing any
6439                  * address-translation-based dirty tracking (e.g. EPT write
6440                  * protection).
6441                  *
6442                  * Mark them dirty on every exit from L2 to prevent them from
6443                  * getting out of sync with dirty tracking.
6444                  */
6445                 nested_mark_vmcs12_pages_dirty(vcpu);
6446
6447                 /*
6448                  * Synthesize a triple fault if L2 state is invalid.  In normal
6449                  * operation, nested VM-Enter rejects any attempt to enter L2
6450                  * with invalid state.  However, those checks are skipped if
6451                  * state is being stuffed via RSM or KVM_SET_NESTED_STATE.  If
6452                  * L2 state is invalid, it means either L1 modified SMRAM state
6453                  * or userspace provided bad state.  Synthesize TRIPLE_FAULT as
6454                  * doing so is architecturally allowed in the RSM case, and is
6455                  * the least awful solution for the userspace case without
6456                  * risking false positives.
6457                  */
6458                 if (vmx->emulation_required) {
6459                         nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
6460                         return 1;
6461                 }
6462
6463                 if (nested_vmx_reflect_vmexit(vcpu))
6464                         return 1;
6465         }
6466
6467         /* If guest state is invalid, start emulating.  L2 is handled above. */
6468         if (vmx->emulation_required)
6469                 return handle_invalid_guest_state(vcpu);
6470
6471         if (exit_reason.failed_vmentry) {
6472                 dump_vmcs(vcpu);
6473                 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6474                 vcpu->run->fail_entry.hardware_entry_failure_reason
6475                         = exit_reason.full;
6476                 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6477                 return 0;
6478         }
6479
6480         if (unlikely(vmx->fail)) {
6481                 dump_vmcs(vcpu);
6482                 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6483                 vcpu->run->fail_entry.hardware_entry_failure_reason
6484                         = vmcs_read32(VM_INSTRUCTION_ERROR);
6485                 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6486                 return 0;
6487         }
6488
6489         /*
6490          * Note:
6491          * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by
6492          * an event delivery, since that indicates the guest is accessing
6493          * MMIO.  The VM-exit would be triggered again after returning to
6494          * the guest, causing an infinite loop.
6495          */
6496         if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
6497             (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
6498              exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
6499              exit_reason.basic != EXIT_REASON_PML_FULL &&
6500              exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
6501              exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
6502              exit_reason.basic != EXIT_REASON_NOTIFY)) {
6503                 int ndata = 3;
6504
6505                 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6506                 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
6507                 vcpu->run->internal.data[0] = vectoring_info;
6508                 vcpu->run->internal.data[1] = exit_reason.full;
6509                 vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
6510                 if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
6511                         vcpu->run->internal.data[ndata++] =
6512                                 vmcs_read64(GUEST_PHYSICAL_ADDRESS);
6513                 }
6514                 vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
6515                 vcpu->run->internal.ndata = ndata;
6516                 return 0;
6517         }
6518
6519         if (unlikely(!enable_vnmi &&
6520                      vmx->loaded_vmcs->soft_vnmi_blocked)) {
6521                 if (!vmx_interrupt_blocked(vcpu)) {
6522                         vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6523                 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
6524                            vcpu->arch.nmi_pending) {
6525                         /*
6526                          * This CPU gives us no way to find the end of an
6527                          * NMI-blocked window if the guest runs with IRQs
6528                          * disabled. So we pull the trigger after 1 s of
6529                          * futile waiting, but inform the user about this.
6530                          */
6531                         printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
6532                                "state on VCPU %d after 1 s timeout\n",
6533                                __func__, vcpu->vcpu_id);
6534                         vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6535                 }
6536         }
6537
6538         if (exit_fastpath != EXIT_FASTPATH_NONE)
6539                 return 1;
6540
6541         if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
6542                 goto unexpected_vmexit;
6543 #ifdef CONFIG_RETPOLINE
6544         if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
6545                 return kvm_emulate_wrmsr(vcpu);
6546         else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
6547                 return handle_preemption_timer(vcpu);
6548         else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
6549                 return handle_interrupt_window(vcpu);
6550         else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
6551                 return handle_external_interrupt(vcpu);
6552         else if (exit_reason.basic == EXIT_REASON_HLT)
6553                 return kvm_emulate_halt(vcpu);
6554         else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
6555                 return handle_ept_misconfig(vcpu);
6556 #endif
6557
6558         exit_handler_index = array_index_nospec((u16)exit_reason.basic,
6559                                                 kvm_vmx_max_exit_handlers);
6560         if (!kvm_vmx_exit_handlers[exit_handler_index])
6561                 goto unexpected_vmexit;
6562
6563         return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
6564
6565 unexpected_vmexit:
6566         vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
6567                     exit_reason.full);
6568         dump_vmcs(vcpu);
6569         vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6570         vcpu->run->internal.suberror =
6571                         KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
6572         vcpu->run->internal.ndata = 2;
6573         vcpu->run->internal.data[0] = exit_reason.full;
6574         vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
6575         return 0;
6576 }
6577
6578 static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
6579 {
6580         int ret = __vmx_handle_exit(vcpu, exit_fastpath);
6581
6582         /*
6583          * Exit to user space when a bus lock is detected, to inform
6584          * userspace that there is a bus lock in the guest.
6585          */
6586         if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
6587                 if (ret > 0)
6588                         vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
6589
6590                 vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
6591                 return 0;
6592         }
6593         return ret;
6594 }
6595
6596 /*
6597  * Software based L1D cache flush which is used when microcode providing
6598  * the cache control MSR is not loaded.
6599  *
6600  * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
6601  * flush it is required to read in 64 KiB because the replacement algorithm
6602  * is not exactly LRU. This could be sized at runtime via topology
6603  * information but as all relevant affected CPUs have 32KiB L1D cache size
6604  * there is no point in doing so.
6605  */
6606 static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
6607 {
6608         int size = PAGE_SIZE << L1D_CACHE_ORDER;
6609
6610         /*
6611          * This code is only executed when the flush mode is 'cond' or
6612          * 'always'
6613          */
6614         if (static_branch_likely(&vmx_l1d_flush_cond)) {
6615                 bool flush_l1d;
6616
6617                 /*
6618                  * Clear the per-vcpu flush bit, it gets set again
6619                  * either from vcpu_run() or from one of the unsafe
6620                  * VMEXIT handlers.
6621                  */
6622                 flush_l1d = vcpu->arch.l1tf_flush_l1d;
6623                 vcpu->arch.l1tf_flush_l1d = false;
6624
6625                 /*
6626                  * Clear the per-cpu flush bit, it gets set again from
6627                  * the interrupt handlers.
6628                  */
6629                 flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
6630                 kvm_clear_cpu_l1tf_flush_l1d();
6631
6632                 if (!flush_l1d)
6633                         return;
6634         }
6635
6636         vcpu->stat.l1d_flush++;
6637
6638         if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
6639                 native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
6640                 return;
6641         }
6642
6643         asm volatile(
6644                 /* First ensure the pages are in the TLB */
6645                 "xorl   %%eax, %%eax\n"
6646                 ".Lpopulate_tlb:\n\t"
6647                 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6648                 "addl   $4096, %%eax\n\t"
6649                 "cmpl   %%eax, %[size]\n\t"
6650                 "jne    .Lpopulate_tlb\n\t"
6651                 "xorl   %%eax, %%eax\n\t"
6652                 "cpuid\n\t"
6653                 /* Now fill the cache */
6654                 "xorl   %%eax, %%eax\n"
6655                 ".Lfill_cache:\n"
6656                 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6657                 "addl   $64, %%eax\n\t"
6658                 "cmpl   %%eax, %[size]\n\t"
6659                 "jne    .Lfill_cache\n\t"
6660                 "lfence\n"
6661                 :: [flush_pages] "r" (vmx_l1d_flush_pages),
6662                     [size] "r" (size)
6663                 : "eax", "ebx", "ecx", "edx");
6664 }
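/*
 * Illustrative sketch, not part of the original file, of what the asm
 * sequence above does, written as plain C.  The function name is
 * hypothetical; the real code additionally serializes with CPUID and ends
 * with LFENCE.  It would be invoked as
 * l1d_flush_sw_sketch(vmx_l1d_flush_pages, PAGE_SIZE << L1D_CACHE_ORDER).
 */
static inline void l1d_flush_sw_sketch(const char *flush_pages, int size)
{
	int i;

	/* First touch one byte per 4 KiB page so the buffer is in the TLB. */
	for (i = 0; i < size; i += 4096)
		(void)*(volatile const char *)(flush_pages + i);

	/* Then read one byte per 64-byte line to displace the L1D contents. */
	for (i = 0; i < size; i += 64)
		(void)*(volatile const char *)(flush_pages + i);
}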
6665
6666 static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
6667 {
6668         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6669         int tpr_threshold;
6670
6671         if (is_guest_mode(vcpu) &&
6672                 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
6673                 return;
6674
6675         tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
6676         if (is_guest_mode(vcpu))
6677                 to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
6678         else
6679                 vmcs_write32(TPR_THRESHOLD, tpr_threshold);
6680 }
6681
6682 void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
6683 {
6684         struct vcpu_vmx *vmx = to_vmx(vcpu);
6685         u32 sec_exec_control;
6686
6687         if (!lapic_in_kernel(vcpu))
6688                 return;
6689
6690         if (!flexpriority_enabled &&
6691             !cpu_has_vmx_virtualize_x2apic_mode())
6692                 return;
6693
6694         /* Postpone execution until vmcs01 is the current VMCS. */
6695         if (is_guest_mode(vcpu)) {
6696                 vmx->nested.change_vmcs01_virtual_apic_mode = true;
6697                 return;
6698         }
6699
6700         sec_exec_control = secondary_exec_controls_get(vmx);
6701         sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
6702                               SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
6703
6704         switch (kvm_get_apic_mode(vcpu)) {
6705         case LAPIC_MODE_INVALID:
6706                 WARN_ONCE(true, "Invalid local APIC state");
6707                 break;
6708         case LAPIC_MODE_DISABLED:
6709                 break;
6710         case LAPIC_MODE_XAPIC:
6711                 if (flexpriority_enabled) {
6712                         sec_exec_control |=
6713                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6714                         kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
6715
6716                         /*
6717                          * Flush the TLB, reloading the APIC access page will
6718                          * only do so if its physical address has changed, but
6719                          * the guest may have inserted a non-APIC mapping into
6720                          * the TLB while the APIC access page was disabled.
6721                          */
6722                         kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
6723                 }
6724                 break;
6725         case LAPIC_MODE_X2APIC:
6726                 if (cpu_has_vmx_virtualize_x2apic_mode())
6727                         sec_exec_control |=
6728                                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
6729                 break;
6730         }
6731         secondary_exec_controls_set(vmx, sec_exec_control);
6732
6733         vmx_update_msr_bitmap_x2apic(vcpu);
6734 }
6735
6736 static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
6737 {
6738         struct page *page;
6739
6740         /* Defer reload until vmcs01 is the current VMCS. */
6741         if (is_guest_mode(vcpu)) {
6742                 to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
6743                 return;
6744         }
6745
6746         if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
6747             SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
6748                 return;
6749
6750         page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
6751         if (is_error_page(page))
6752                 return;
6753
6754         vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
6755         vmx_flush_tlb_current(vcpu);
6756
6757         /*
6758          * Do not pin the APIC access page in memory; the MMU notifier
6759          * will call us again if it is migrated or swapped out.
6760          */
6761         put_page(page);
6762 }
6763
6764 static void vmx_hwapic_isr_update(int max_isr)
6765 {
6766         u16 status;
6767         u8 old;
6768
6769         if (max_isr == -1)
6770                 max_isr = 0;
6771
6772         status = vmcs_read16(GUEST_INTR_STATUS);
6773         old = status >> 8;
6774         if (max_isr != old) {
6775                 status &= 0xff;
6776                 status |= max_isr << 8;
6777                 vmcs_write16(GUEST_INTR_STATUS, status);
6778         }
6779 }
6780
6781 static void vmx_set_rvi(int vector)
6782 {
6783         u16 status;
6784         u8 old;
6785
6786         if (vector == -1)
6787                 vector = 0;
6788
6789         status = vmcs_read16(GUEST_INTR_STATUS);
6790         old = (u8)status & 0xff;
6791         if ((u8)vector != old) {
6792                 status &= ~0xff;
6793                 status |= (u8)vector;
6794                 vmcs_write16(GUEST_INTR_STATUS, status);
6795         }
6796 }
6797
6798 static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
6799 {
6800         /*
6801          * When running L2, updating RVI is only relevant if vmcs12 has
6802          * virtual-interrupt-delivery enabled.  However, that can only
6803          * be enabled if L1 also intercepts external interrupts, in
6804          * which case we should not update vmcs02's RVI but instead
6805          * intercept the interrupt.  Therefore, do nothing when running
6806          * L2.
6807          */
6808         if (!is_guest_mode(vcpu))
6809                 vmx_set_rvi(max_irr);
6810 }
6811
6812 static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
6813 {
6814         struct vcpu_vmx *vmx = to_vmx(vcpu);
6815         int max_irr;
6816         bool got_posted_interrupt;
6817
6818         if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
6819                 return -EIO;
6820
6821         if (pi_test_on(&vmx->pi_desc)) {
6822                 pi_clear_on(&vmx->pi_desc);
6823                 /*
6824                  * IOMMU can write to PID.ON, so the barrier matters even on UP.
6825                  * But on x86 this is just a compiler barrier anyway.
6826                  */
6827                 smp_mb__after_atomic();
6828                 got_posted_interrupt =
6829                         kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
6830         } else {
6831                 max_irr = kvm_lapic_find_highest_irr(vcpu);
6832                 got_posted_interrupt = false;
6833         }
6834
6835         /*
6836          * Newly recognized interrupts are injected via either virtual interrupt
6837          * delivery (RVI) or KVM_REQ_EVENT.  Virtual interrupt delivery is
6838          * disabled in two cases:
6839          *
6840          * 1) If L2 is running and the vCPU has a new pending interrupt.  If L1
6841          * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
6842          * VM-Exit to L1.  If L1 doesn't want to exit, the interrupt is injected
6843          * into L2, but KVM doesn't use virtual interrupt delivery to inject
6844          * interrupts into L2, and so KVM_REQ_EVENT is again needed.
6845          *
6846          * 2) If APICv is disabled for this vCPU, assigned devices may still
6847          * attempt to post interrupts.  The posted interrupt vector will cause
6848          * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
6849          */
6850         if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
6851                 vmx_set_rvi(max_irr);
6852         else if (got_posted_interrupt)
6853                 kvm_make_request(KVM_REQ_EVENT, vcpu);
6854
6855         return max_irr;
6856 }
6857
6858 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
6859 {
6860         if (!kvm_vcpu_apicv_active(vcpu))
6861                 return;
6862
6863         vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
6864         vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
6865         vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
6866         vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
6867 }
6868
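     /* Reset posted-interrupt state after vAPIC state is restored by userspace. */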
6869 static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
6870 {
6871         struct vcpu_vmx *vmx = to_vmx(vcpu);
6872
6873         pi_clear_on(&vmx->pi_desc);
6874         memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
6875 }
6876
6877 void vmx_do_interrupt_irqoff(unsigned long entry);
6878 void vmx_do_nmi_irqoff(void);
6879
6880 static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
6881 {
6882         /*
6883          * Save xfd_err to guest_fpu before interrupts are enabled, so that
6884          * the MSR value is not clobbered by host activity before the guest
6885          * has a chance to consume it.
6886          *
6887          * Do not blindly read xfd_err here, since this exception might
6888          * be caused by L1 interception on a platform which doesn't
6889          * support xfd at all.
6890          *
6891          * Do it conditionally upon guest_fpu::xfd; xfd_err matters
6892          * only when xfd contains a non-zero value.
6893          *
6894          * Queuing the exception is done in vmx_handle_exit.  See comment there.
6895          */
6896         if (vcpu->arch.guest_fpu.fpstate->xfd)
6897                 rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
6898 }
6899
6900 static void handle_exception_irqoff(struct vcpu_vmx *vmx)
6901 {
6902         u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
6903
6904         /* if exit due to PF check for async PF */
6905         if (is_page_fault(intr_info))
6906                 vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
6907         /* if exit due to NM, handle before interrupts are enabled */
6908         else if (is_nm_fault(intr_info))
6909                 handle_nm_fault_irqoff(&vmx->vcpu);
6910         /* Handle machine checks before interrupts are enabled */
6911         else if (is_machine_check(intr_info))
6912                 kvm_machine_check();
6913 }
6914
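     /*
      * Dispatch the host IRQ that triggered this VM-Exit by invoking the
      * corresponding entry in the host IDT, with IRQs still disabled.
      */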
6915 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
6916 {
6917         u32 intr_info = vmx_get_intr_info(vcpu);
6918         unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
6919         gate_desc *desc = (gate_desc *)host_idt_base + vector;
6920
6921         if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
6922             "unexpected VM-Exit interrupt info: 0x%x", intr_info))
6923                 return;
6924
6925         kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
6926         vmx_do_interrupt_irqoff(gate_offset(desc));
6927         kvm_after_interrupt(vcpu);
6928
6929         vcpu->arch.at_instruction_boundary = true;
6930 }
6931
6932 static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
6933 {
6934         struct vcpu_vmx *vmx = to_vmx(vcpu);
6935
6936         if (vmx->emulation_required)
6937                 return;
6938
6939         if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
6940                 handle_external_interrupt_irqoff(vcpu);
6941         else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
6942                 handle_exception_irqoff(vmx);
6943 }
6944
6945 /*
6946  * The kvm parameter can be NULL (module initialization, or invocation before
6947  * VM creation). Be sure to check the kvm parameter before using it.
6948  */
6949 static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
6950 {
6951         switch (index) {
6952         case MSR_IA32_SMBASE:
6953                 if (!IS_ENABLED(CONFIG_KVM_SMM))
6954                         return false;
6955                 /*
6956                  * We cannot do SMM unless we can run the guest in big
6957                  * real mode.
6958                  */
6959                 return enable_unrestricted_guest || emulate_invalid_guest_state;
6960         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
6961                 return nested;
6962         case MSR_AMD64_VIRT_SPEC_CTRL:
6963         case MSR_AMD64_TSC_RATIO:
6964                 /* This is AMD only.  */
6965                 return false;
6966         default:
6967                 return true;
6968         }
6969 }
6970
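     /*
      * Reconstruct the guest's NMI-blocking state after a VM-Exit, either
      * from the exit interruption info (virtual NMIs) or by accounting the
      * time spent with soft-blocked NMIs (no virtual NMI support).
      */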
6971 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
6972 {
6973         u32 exit_intr_info;
6974         bool unblock_nmi;
6975         u8 vector;
6976         bool idtv_info_valid;
6977
6978         idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
6979
6980         if (enable_vnmi) {
6981                 if (vmx->loaded_vmcs->nmi_known_unmasked)
6982                         return;
6983
6984                 exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
6985                 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
6986                 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
6987                 /*
6988                  * SDM 3: 27.7.1.2 (September 2008)
6989                  * Re-set bit "block by NMI" before VM entry if the VM-Exit was
6990                  * caused by a guest IRET fault.
6991                  * SDM 3: 23.2.2 (September 2008)
6992                  * Bit 12 is undefined in any of the following cases:
6993                  *  If the VM exit sets the valid bit in the IDT-vectoring
6994                  *   information field.
6995                  *  If the VM exit is due to a double fault.
6996                  */
6997                 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
6998                     vector != DF_VECTOR && !idtv_info_valid)
6999                         vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
7000                                       GUEST_INTR_STATE_NMI);
7001                 else
7002                         vmx->loaded_vmcs->nmi_known_unmasked =
7003                                 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
7004                                   & GUEST_INTR_STATE_NMI);
7005         } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
7006                 vmx->loaded_vmcs->vnmi_blocked_time +=
7007                         ktime_to_ns(ktime_sub(ktime_get(),
7008                                               vmx->loaded_vmcs->entry_time));
7009 }
7010
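     /*
      * Rebuild KVM's event-injection state from IDT-vectoring (or VM-Entry)
      * info describing an event whose delivery was interrupted, so that the
      * event can be re-injected on the next VM-Entry.
      */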
7011 static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
7012                                       u32 idt_vectoring_info,
7013                                       int instr_len_field,
7014                                       int error_code_field)
7015 {
7016         u8 vector;
7017         int type;
7018         bool idtv_info_valid;
7019
7020         idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
7021
7022         vcpu->arch.nmi_injected = false;
7023         kvm_clear_exception_queue(vcpu);
7024         kvm_clear_interrupt_queue(vcpu);
7025
7026         if (!idtv_info_valid)
7027                 return;
7028
7029         kvm_make_request(KVM_REQ_EVENT, vcpu);
7030
7031         vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
7032         type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
7033
7034         switch (type) {
7035         case INTR_TYPE_NMI_INTR:
7036                 vcpu->arch.nmi_injected = true;
7037                 /*
7038                  * SDM 3: 27.7.1.2 (September 2008)
7039                  * Clear bit "block by NMI" before VM entry if an NMI
7040                  * delivery faulted.
7041                  */
7042                 vmx_set_nmi_mask(vcpu, false);
7043                 break;
7044         case INTR_TYPE_SOFT_EXCEPTION:
7045                 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
7046                 fallthrough;
7047         case INTR_TYPE_HARD_EXCEPTION:
7048                 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
7049                         u32 err = vmcs_read32(error_code_field);
7050                         kvm_requeue_exception_e(vcpu, vector, err);
7051                 } else
7052                         kvm_requeue_exception(vcpu, vector);
7053                 break;
7054         case INTR_TYPE_SOFT_INTR:
7055                 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
7056                 fallthrough;
7057         case INTR_TYPE_EXT_INTR:
7058                 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
7059                 break;
7060         default:
7061                 break;
7062         }
7063 }
7064
7065 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
7066 {
7067         __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
7068                                   VM_EXIT_INSTRUCTION_LEN,
7069                                   IDT_VECTORING_ERROR_CODE);
7070 }
7071
7072 static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
7073 {
7074         __vmx_complete_interrupts(vcpu,
7075                                   vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
7076                                   VM_ENTRY_INSTRUCTION_LEN,
7077                                   VM_ENTRY_EXCEPTION_ERROR_CODE);
7078
7079         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
7080 }
7081
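     /*
      * Use the VMCS MSR load/store lists to atomically switch perf MSRs
      * whose guest and host values differ across VM-Entry and VM-Exit.
      */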
7082 static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
7083 {
7084         int i, nr_msrs;
7085         struct perf_guest_switch_msr *msrs;
7086         struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu);
7087
7088         pmu->host_cross_mapped_mask = 0;
7089         if (pmu->pebs_enable & pmu->global_ctrl)
7090                 intel_pmu_cross_mapped_check(pmu);
7091
7092         /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */
7093         msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu);
7094         if (!msrs)
7095                 return;
7096
7097         for (i = 0; i < nr_msrs; i++)
7098                 if (msrs[i].host == msrs[i].guest)
7099                         clear_atomic_switch_msr(vmx, msrs[i].msr);
7100                 else
7101                         add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
7102                                         msrs[i].host, false);
7103 }
7104
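     /*
      * Program the VMX preemption timer: zero it to force an immediate exit,
      * arm it with the TSC delta to the deadline, or softly disable it.
      */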
7105 static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
7106 {
7107         struct vcpu_vmx *vmx = to_vmx(vcpu);
7108         u64 tscl;
7109         u32 delta_tsc;
7110
7111         if (vmx->req_immediate_exit) {
7112                 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
7113                 vmx->loaded_vmcs->hv_timer_soft_disabled = false;
7114         } else if (vmx->hv_deadline_tsc != -1) {
7115                 tscl = rdtsc();
7116                 if (vmx->hv_deadline_tsc > tscl)
7117                         /* set_hv_timer ensures the delta fits in 32 bits */
7118                         delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
7119                                 cpu_preemption_timer_multi);
7120                 else
7121                         delta_tsc = 0;
7122
7123                 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
7124                 vmx->loaded_vmcs->hv_timer_soft_disabled = false;
7125         } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
7126                 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
7127                 vmx->loaded_vmcs->hv_timer_soft_disabled = true;
7128         }
7129 }
7130
7131 void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
7132 {
7133         if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
7134                 vmx->loaded_vmcs->host_state.rsp = host_rsp;
7135                 vmcs_writel(HOST_RSP, host_rsp);
7136         }
7137 }
7138
7139 void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
7140                                         unsigned int flags)
7141 {
7142         u64 hostval = this_cpu_read(x86_spec_ctrl_current);
7143
7144         if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
7145                 return;
7146
7147         if (flags & VMX_RUN_SAVE_SPEC_CTRL)
7148                 vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL);
7149
7150         /*
7151          * If the guest/host SPEC_CTRL values differ, restore the host value.
7152          *
7153          * For legacy IBRS, the IBRS bit always needs to be written after
7154          * transitioning from a less privileged predictor mode, regardless of
7155          * whether the guest/host values differ.
7156          */
7157         if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
7158             vmx->spec_ctrl != hostval)
7159                 native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval);
7160
7161         barrier_nospec();
7162 }
7163
7164 static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
7165 {
7166         switch (to_vmx(vcpu)->exit_reason.basic) {
7167         case EXIT_REASON_MSR_WRITE:
7168                 return handle_fastpath_set_msr_irqoff(vcpu);
7169         case EXIT_REASON_PREEMPTION_TIMER:
7170                 return handle_fastpath_preemption_timer(vcpu);
7171         default:
7172                 return EXIT_FASTPATH_NONE;
7173         }
7174 }
7175
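     /*
      * noinstr section around the actual VM-Enter/VM-Exit: apply CPU buffer
      * mitigations, switch CR2, run the guest, and handle NMI VM-Exits while
      * IRQs and instrumentation are still disabled.
      */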
7176 static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
7177                                         unsigned int flags)
7178 {
7179         struct vcpu_vmx *vmx = to_vmx(vcpu);
7180
7181         guest_state_enter_irqoff();
7182
7183         /* L1D Flush includes CPU buffer clear to mitigate MDS */
7184         if (static_branch_unlikely(&vmx_l1d_should_flush))
7185                 vmx_l1d_flush(vcpu);
7186         else if (static_branch_unlikely(&mds_user_clear))
7187                 mds_clear_cpu_buffers();
7188         else if (static_branch_unlikely(&mmio_stale_data_clear) &&
7189                  kvm_arch_has_assigned_device(vcpu->kvm))
7190                 mds_clear_cpu_buffers();
7191
7192         vmx_disable_fb_clear(vmx);
7193
7194         if (vcpu->arch.cr2 != native_read_cr2())
7195                 native_write_cr2(vcpu->arch.cr2);
7196
7197         vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
7198                                    flags);
7199
7200         vcpu->arch.cr2 = native_read_cr2();
7201
7202         vmx_enable_fb_clear(vmx);
7203
7204         if (unlikely(vmx->fail))
7205                 vmx->exit_reason.full = 0xdead;
7206         else
7207                 vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
7208
7209         if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
7210             is_nmi(vmx_get_intr_info(vcpu))) {
7211                 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
7212                 vmx_do_nmi_irqoff();
7213                 kvm_after_interrupt(vcpu);
7214         }
7215
7216         guest_state_exit_irqoff();
7217 }
7218
7219 static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
7220 {
7221         struct vcpu_vmx *vmx = to_vmx(vcpu);
7222         unsigned long cr3, cr4;
7223
7224         /* Record the guest's net vcpu time for enforced NMI injections. */
7225         if (unlikely(!enable_vnmi &&
7226                      vmx->loaded_vmcs->soft_vnmi_blocked))
7227                 vmx->loaded_vmcs->entry_time = ktime_get();
7228
7229         /*
7230          * Don't enter VMX if guest state is invalid, let the exit handler
7231          * start emulation until we arrive back to a valid state.  Synthesize a
7232          * consistency check VM-Exit due to invalid guest state and bail.
7233          */
7234         if (unlikely(vmx->emulation_required)) {
7235                 vmx->fail = 0;
7236
7237                 vmx->exit_reason.full = EXIT_REASON_INVALID_STATE;
7238                 vmx->exit_reason.failed_vmentry = 1;
7239                 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
7240                 vmx->exit_qualification = ENTRY_FAIL_DEFAULT;
7241                 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
7242                 vmx->exit_intr_info = 0;
7243                 return EXIT_FASTPATH_NONE;
7244         }
7245
7246         trace_kvm_entry(vcpu);
7247
7248         if (vmx->ple_window_dirty) {
7249                 vmx->ple_window_dirty = false;
7250                 vmcs_write32(PLE_WINDOW, vmx->ple_window);
7251         }
7252
7253         /*
7254          * We did this in prepare_switch_to_guest, because it needs to
7255          * be within srcu_read_lock.
7256          */
7257         WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);
7258
7259         if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
7260                 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
7261         if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
7262                 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
7263         vcpu->arch.regs_dirty = 0;
7264
7265         /*
7266          * Refresh vmcs.HOST_CR3 if necessary.  This must be done immediately
7267          * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
7268          * it switches back to the current->mm, which can occur in KVM context
7269          * when switching to a temporary mm to patch kernel code, e.g. if KVM
7270          * toggles a static key while handling a VM-Exit.
7271          */
7272         cr3 = __get_current_cr3_fast();
7273         if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
7274                 vmcs_writel(HOST_CR3, cr3);
7275                 vmx->loaded_vmcs->host_state.cr3 = cr3;
7276         }
7277
7278         cr4 = cr4_read_shadow();
7279         if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
7280                 vmcs_writel(HOST_CR4, cr4);
7281                 vmx->loaded_vmcs->host_state.cr4 = cr4;
7282         }
7283
7284         /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
7285         if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
7286                 set_debugreg(vcpu->arch.dr6, 6);
7287
7288         /* When single-stepping over STI and MOV SS, we must clear the
7289          * corresponding interruptibility bits in the guest state.  Otherwise
7290          * VM-Entry fails, as it then expects bit 14 (BS) to be set in the
7291          * pending debug exceptions field, which is not correct for the guest
7292          * debugging case. */
7293         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
7294                 vmx_set_interrupt_shadow(vcpu, 0);
7295
7296         kvm_load_guest_xsave_state(vcpu);
7297
7298         pt_guest_enter(vmx);
7299
7300         atomic_switch_perf_msrs(vmx);
7301         if (intel_pmu_lbr_is_enabled(vcpu))
7302                 vmx_passthrough_lbr_msrs(vcpu);
7303
7304         if (enable_preemption_timer)
7305                 vmx_update_hv_timer(vcpu);
7306
7307         kvm_wait_lapic_expire(vcpu);
7308
7309         /* The actual VMENTER/EXIT is in the .noinstr.text section. */
7310         vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx));
7311
7312         /* All fields are clean at this point */
7313         if (static_branch_unlikely(&enable_evmcs)) {
7314                 current_evmcs->hv_clean_fields |=
7315                         HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
7316
7317                 current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
7318         }
7319
7320         /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
7321         if (vmx->host_debugctlmsr)
7322                 update_debugctlmsr(vmx->host_debugctlmsr);
7323
7324 #ifndef CONFIG_X86_64
7325         /*
7326          * The sysexit path does not restore ds/es, so we must set them to
7327          * a reasonable value ourselves.
7328          *
7329          * We can't defer this to vmx_prepare_switch_to_host() since that
7330          * function may be executed in interrupt context, which saves and
7331          * restores segments around it, nullifying its effect.
7332          */
7333         loadsegment(ds, __USER_DS);
7334         loadsegment(es, __USER_DS);
7335 #endif
7336
7337         vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
7338
7339         pt_guest_exit(vmx);
7340
7341         kvm_load_host_xsave_state(vcpu);
7342
7343         if (is_guest_mode(vcpu)) {
7344                 /*
7345                  * Track VMLAUNCH/VMRESUME that have made it past guest state
7346                  * checking.
7347                  */
7348                 if (vmx->nested.nested_run_pending &&
7349                     !vmx->exit_reason.failed_vmentry)
7350                         ++vcpu->stat.nested_run;
7351
7352                 vmx->nested.nested_run_pending = 0;
7353         }
7354
7355         vmx->idt_vectoring_info = 0;
7356
7357         if (unlikely(vmx->fail))
7358                 return EXIT_FASTPATH_NONE;
7359
7360         if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
7361                 kvm_machine_check();
7362
7363         if (likely(!vmx->exit_reason.failed_vmentry))
7364                 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
7365
7366         trace_kvm_exit(vcpu, KVM_ISA_VMX);
7367
7368         if (unlikely(vmx->exit_reason.failed_vmentry))
7369                 return EXIT_FASTPATH_NONE;
7370
7371         vmx->loaded_vmcs->launched = 1;
7372
7373         vmx_recover_nmi_blocking(vmx);
7374         vmx_complete_interrupts(vmx);
7375
7376         if (is_guest_mode(vcpu))
7377                 return EXIT_FASTPATH_NONE;
7378
7379         return vmx_exit_handlers_fastpath(vcpu);
7380 }
7381
7382 static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
7383 {
7384         struct vcpu_vmx *vmx = to_vmx(vcpu);
7385
7386         if (enable_pml)
7387                 vmx_destroy_pml_buffer(vmx);
7388         free_vpid(vmx->vpid);
7389         nested_vmx_free_vcpu(vcpu);
7390         free_loaded_vmcs(vmx->loaded_vmcs);
7391 }
7392
7393 static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
7394 {
7395         struct vmx_uret_msr *tsx_ctrl;
7396         struct vcpu_vmx *vmx;
7397         int i, err;
7398
7399         BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
7400         vmx = to_vmx(vcpu);
7401
7402         INIT_LIST_HEAD(&vmx->pi_wakeup_list);
7403
7404         err = -ENOMEM;
7405
7406         vmx->vpid = allocate_vpid();
7407
7408         /*
7409          * If PML is turned on, a failure to enable PML simply results in
7410          * failure to create the vCPU.  This keeps the PML logic simple, e.g.
7411          * KVM never has to deal with PML being enabled for only a subset of
7412          * the guest's vCPUs.
7413          */
7414         if (enable_pml) {
7415                 vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
7416                 if (!vmx->pml_pg)
7417                         goto free_vpid;
7418         }
7419
7420         for (i = 0; i < kvm_nr_uret_msrs; ++i)
7421                 vmx->guest_uret_msrs[i].mask = -1ull;
7422         if (boot_cpu_has(X86_FEATURE_RTM)) {
7423                 /*
7424                  * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
7425                  * Keep the host value unchanged to avoid changing CPUID bits
7426                  * under the host kernel's feet.
7427                  */
7428                 tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
7429                 if (tsx_ctrl)
7430                         tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
7431         }
7432
7433         err = alloc_loaded_vmcs(&vmx->vmcs01);
7434         if (err < 0)
7435                 goto free_pml;
7436
7437         /*
7438          * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
7439          * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
7440          * feature only for vmcs01; KVM currently isn't equipped to realize any
7441          * performance benefits from enabling it for vmcs02.
7442          */
7443         if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) &&
7444             (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
7445                 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
7446
7447                 evmcs->hv_enlightenments_control.msr_bitmap = 1;
7448         }
7449
7450         /* The MSR bitmap starts with all ones */
7451         bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
7452         bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
7453
7454         vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
7455 #ifdef CONFIG_X86_64
7456         vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
7457         vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
7458         vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
7459 #endif
7460         vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
7461         vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
7462         vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
7463         if (kvm_cstate_in_guest(vcpu->kvm)) {
7464                 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
7465                 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
7466                 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
7467                 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
7468         }
7469
7470         vmx->loaded_vmcs = &vmx->vmcs01;
7471
7472         if (cpu_need_virtualize_apic_accesses(vcpu)) {
7473                 err = kvm_alloc_apic_access_page(vcpu->kvm);
7474                 if (err)
7475                         goto free_vmcs;
7476         }
7477
7478         if (enable_ept && !enable_unrestricted_guest) {
7479                 err = init_rmode_identity_map(vcpu->kvm);
7480                 if (err)
7481                         goto free_vmcs;
7482         }
7483
7484         if (vmx_can_use_ipiv(vcpu))
7485                 WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
7486                            __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID);
7487
7488         return 0;
7489
7490 free_vmcs:
7491         free_loaded_vmcs(vmx->loaded_vmcs);
7492 free_pml:
7493         vmx_destroy_pml_buffer(vmx);
7494 free_vpid:
7495         free_vpid(vmx->vpid);
7496         return err;
7497 }
7498
7499 #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7500 #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7501
7502 static int vmx_vm_init(struct kvm *kvm)
7503 {
7504         if (!ple_gap)
7505                 kvm->arch.pause_in_guest = true;
7506
7507         if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
7508                 switch (l1tf_mitigation) {
7509                 case L1TF_MITIGATION_OFF:
7510                 case L1TF_MITIGATION_FLUSH_NOWARN:
7511                         /* 'I explicitly don't care' is set */
7512                         break;
7513                 case L1TF_MITIGATION_FLUSH:
7514                 case L1TF_MITIGATION_FLUSH_NOSMT:
7515                 case L1TF_MITIGATION_FULL:
7516                         /*
7517                          * Warn upon starting the first VM in a potentially
7518                          * insecure environment.
7519                          */
7520                         if (sched_smt_active())
7521                                 pr_warn_once(L1TF_MSG_SMT);
7522                         if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
7523                                 pr_warn_once(L1TF_MSG_L1D);
7524                         break;
7525                 case L1TF_MITIGATION_FULL_FORCE:
7526                         /* Flush is enforced */
7527                         break;
7528                 }
7529         }
7530         return 0;
7531 }
7532
7533 static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
7534 {
7535         u8 cache;
7536
7537         /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
7538          * memory aliases with conflicting memory types and sometimes MCEs.
7539          * We have to be careful about what is honored and when.
7540          *
7541          * For MMIO, guest CD/MTRR are ignored.  The EPT memory type is set to
7542          * UC.  The effective memory type is UC or WC depending on guest PAT.
7543          * This was historically the source of MCEs and we want to be
7544          * conservative.
7545          *
7546          * When there is no need to deal with noncoherent DMA (e.g., no VT-d
7547          * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored.  The
7548          * EPT memory type is set to WB.  The effective memory type is forced
7549          * WB.
7550          *
7551          * Otherwise, we trust the guest.  Guest CD/MTRR/PAT are all honored.  The
7552          * EPT memory type is used to emulate guest CD/MTRR.
7553          */
7554
7555         if (is_mmio)
7556                 return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
7557
7558         if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
7559                 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
7560
7561         if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
7562                 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
7563                         cache = MTRR_TYPE_WRBACK;
7564                 else
7565                         cache = MTRR_TYPE_UNCACHABLE;
7566
7567                 return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
7568         }
7569
7570         return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
7571 }
7572
7573 static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
7574 {
7575         /*
7576          * These bits in the secondary execution controls field
7577          * are dynamic; the others are mostly based on the hypervisor
7578          * architecture and the guest's CPUID.  Do not touch the
7579          * dynamic bits.
7580          */
7581         u32 mask =
7582                 SECONDARY_EXEC_SHADOW_VMCS |
7583                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
7584                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
7585                 SECONDARY_EXEC_DESC;
7586
7587         u32 cur_ctl = secondary_exec_controls_get(vmx);
7588
7589         secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
7590 }
7591
7592 /*
7593  * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
7594  * (indicating "allowed-1") if they are supported in the guest's CPUID.
7595  */
7596 static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
7597 {
7598         struct vcpu_vmx *vmx = to_vmx(vcpu);
7599         struct kvm_cpuid_entry2 *entry;
7600
7601         vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
7602         vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
7603
7604 #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do {            \
7605         if (entry && (entry->_reg & (_cpuid_mask)))                     \
7606                 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask);     \
7607 } while (0)
7608
7609         entry = kvm_find_cpuid_entry(vcpu, 0x1);
7610         cr4_fixed1_update(X86_CR4_VME,        edx, feature_bit(VME));
7611         cr4_fixed1_update(X86_CR4_PVI,        edx, feature_bit(VME));
7612         cr4_fixed1_update(X86_CR4_TSD,        edx, feature_bit(TSC));
7613         cr4_fixed1_update(X86_CR4_DE,         edx, feature_bit(DE));
7614         cr4_fixed1_update(X86_CR4_PSE,        edx, feature_bit(PSE));
7615         cr4_fixed1_update(X86_CR4_PAE,        edx, feature_bit(PAE));
7616         cr4_fixed1_update(X86_CR4_MCE,        edx, feature_bit(MCE));
7617         cr4_fixed1_update(X86_CR4_PGE,        edx, feature_bit(PGE));
7618         cr4_fixed1_update(X86_CR4_OSFXSR,     edx, feature_bit(FXSR));
7619         cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
7620         cr4_fixed1_update(X86_CR4_VMXE,       ecx, feature_bit(VMX));
7621         cr4_fixed1_update(X86_CR4_SMXE,       ecx, feature_bit(SMX));
7622         cr4_fixed1_update(X86_CR4_PCIDE,      ecx, feature_bit(PCID));
7623         cr4_fixed1_update(X86_CR4_OSXSAVE,    ecx, feature_bit(XSAVE));
7624
7625         entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0);
7626         cr4_fixed1_update(X86_CR4_FSGSBASE,   ebx, feature_bit(FSGSBASE));
7627         cr4_fixed1_update(X86_CR4_SMEP,       ebx, feature_bit(SMEP));
7628         cr4_fixed1_update(X86_CR4_SMAP,       ebx, feature_bit(SMAP));
7629         cr4_fixed1_update(X86_CR4_PKE,        ecx, feature_bit(PKU));
7630         cr4_fixed1_update(X86_CR4_UMIP,       ecx, feature_bit(UMIP));
7631         cr4_fixed1_update(X86_CR4_LA57,       ecx, feature_bit(LA57));
7632
7633 #undef cr4_fixed1_update
7634 }
7635
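     /*
      * Cache the guest's Intel PT CPUID leaves and compute ctl_bitmask, the
      * set of RTIT_CTL bits the guest is *not* allowed to set.
      */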
7636 static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
7637 {
7638         struct vcpu_vmx *vmx = to_vmx(vcpu);
7639         struct kvm_cpuid_entry2 *best = NULL;
7640         int i;
7641
7642         for (i = 0; i < PT_CPUID_LEAVES; i++) {
7643                 best = kvm_find_cpuid_entry_index(vcpu, 0x14, i);
7644                 if (!best)
7645                         return;
7646                 vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
7647                 vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
7648                 vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
7649                 vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
7650         }
7651
7652         /* Get the number of configurable Address Ranges for filtering */
7653         vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
7654                                                 PT_CAP_num_address_ranges);
7655
7656         /* Initialize the bitmask, then clear the bits that have no dependencies */
7657         vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
7658                         RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
7659                         RTIT_CTL_BRANCH_EN);
7660
7661         /*
7662          * If CPUID.(EAX=14H,ECX=0):EBX[0]=1, CR3Filter can be set; otherwise
7663          * setting it will inject a #GP.
7664          */
7665         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
7666                 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
7667
7668         /*
7669          * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
7670          * PSBFreq can be set
7671          */
7672         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
7673                 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
7674                                 RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
7675
7676         /*
7677          * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
7678          */
7679         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
7680                 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
7681                                               RTIT_CTL_MTC_RANGE);
7682
7683         /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
7684         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
7685                 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
7686                                                         RTIT_CTL_PTW_EN);
7687
7688         /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
7689         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
7690                 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
7691
7692         /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
7693         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
7694                 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
7695
7696         /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
7697         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
7698                 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
7699
7700         /* Unmask the address range configuration area */
7701         for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
7702                 vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
7703 }
7704
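     /*
      * Refresh CPUID-dependent VMX state (secondary exec controls, user-return
      * MSRs, nested CR fixed bits, Intel PT config, MSR intercepts, etc.)
      * after userspace updates the guest's CPUID.
      */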
7705 static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
7706 {
7707         struct vcpu_vmx *vmx = to_vmx(vcpu);
7708
7709         /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
7710         vcpu->arch.xsaves_enabled = false;
7711
7712         vmx_setup_uret_msrs(vmx);
7713
7714         if (cpu_has_secondary_exec_ctrls())
7715                 vmcs_set_secondary_exec_control(vmx,
7716                                                 vmx_secondary_exec_control(vmx));
7717
7718         if (nested_vmx_allowed(vcpu))
7719                 vmx->msr_ia32_feature_control_valid_bits |=
7720                         FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7721                         FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
7722         else
7723                 vmx->msr_ia32_feature_control_valid_bits &=
7724                         ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7725                           FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
7726
7727         if (nested_vmx_allowed(vcpu))
7728                 nested_vmx_cr_fixed1_bits_update(vcpu);
7729
7730         if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
7731                         guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
7732                 update_intel_pt_cfg(vcpu);
7733
7734         if (boot_cpu_has(X86_FEATURE_RTM)) {
7735                 struct vmx_uret_msr *msr;
7736                 msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
7737                 if (msr) {
7738                         bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
7739                         vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
7740                 }
7741         }
7742
7743         if (kvm_cpu_cap_has(X86_FEATURE_XFD))
7744                 vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
7745                                           !guest_cpuid_has(vcpu, X86_FEATURE_XFD));
7746
7747
7748         set_cr4_guest_host_mask(vmx);
7749
7750         vmx_write_encls_bitmap(vcpu, NULL);
7751         if (guest_cpuid_has(vcpu, X86_FEATURE_SGX))
7752                 vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
7753         else
7754                 vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
7755
7756         if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
7757                 vmx->msr_ia32_feature_control_valid_bits |=
7758                         FEAT_CTL_SGX_LC_ENABLED;
7759         else
7760                 vmx->msr_ia32_feature_control_valid_bits &=
7761                         ~FEAT_CTL_SGX_LC_ENABLED;
7762
7763         /* Refresh #PF interception to account for MAXPHYADDR changes. */
7764         vmx_update_exception_bitmap(vcpu);
7765 }
7766
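     /*
      * Compute the PERF_CAPABILITIES bits KVM can expose to the guest, based
      * on the host's capabilities and vPMU support (LBR format, PEBS).
      */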
7767 static u64 vmx_get_perf_capabilities(void)
7768 {
7769         u64 perf_cap = PMU_CAP_FW_WRITES;
7770         struct x86_pmu_lbr lbr;
7771         u64 host_perf_cap = 0;
7772
7773         if (!enable_pmu)
7774                 return 0;
7775
7776         if (boot_cpu_has(X86_FEATURE_PDCM))
7777                 rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
7778
7779         x86_perf_get_lbr(&lbr);
7780         if (lbr.nr)
7781                 perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
7782
7783         if (vmx_pebs_supported()) {
7784                 perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
7785                 if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4)
7786                         perf_cap &= ~PERF_CAP_PEBS_BASELINE;
7787         }
7788
7789         return perf_cap;
7790 }
7791
7792 static __init void vmx_set_cpu_caps(void)
7793 {
7794         kvm_set_cpu_caps();
7795
7796         /* CPUID 0x1 */
7797         if (nested)
7798                 kvm_cpu_cap_set(X86_FEATURE_VMX);
7799
7800         /* CPUID 0x7 */
7801         if (kvm_mpx_supported())
7802                 kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
7803         if (!cpu_has_vmx_invpcid())
7804                 kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
7805         if (vmx_pt_mode_is_host_guest())
7806                 kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
7807         if (vmx_pebs_supported()) {
7808                 kvm_cpu_cap_check_and_set(X86_FEATURE_DS);
7809                 kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64);
7810         }
7811
7812         if (!enable_pmu)
7813                 kvm_cpu_cap_clear(X86_FEATURE_PDCM);
7814         kvm_caps.supported_perf_cap = vmx_get_perf_capabilities();
7815
7816         if (!enable_sgx) {
7817                 kvm_cpu_cap_clear(X86_FEATURE_SGX);
7818                 kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
7819                 kvm_cpu_cap_clear(X86_FEATURE_SGX1);
7820                 kvm_cpu_cap_clear(X86_FEATURE_SGX2);
7821         }
7822
7823         if (vmx_umip_emulated())
7824                 kvm_cpu_cap_set(X86_FEATURE_UMIP);
7825
7826         /* CPUID 0xD.1 */
7827         kvm_caps.supported_xss = 0;
7828         if (!cpu_has_vmx_xsaves())
7829                 kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
7830
7831         /* CPUID 0x80000001 and 0x7 (RDPID) */
7832         if (!cpu_has_vmx_rdtscp()) {
7833                 kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
7834                 kvm_cpu_cap_clear(X86_FEATURE_RDPID);
7835         }
7836
7837         if (cpu_has_vmx_waitpkg())
7838                 kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
7839 }
7840
7841 static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
7842 {
7843         to_vmx(vcpu)->req_immediate_exit = true;
7844 }
7845
7846 static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
7847                                   struct x86_instruction_info *info)
7848 {
7849         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7850         unsigned short port;
7851         bool intercept;
7852         int size;
7853
7854         if (info->intercept == x86_intercept_in ||
7855             info->intercept == x86_intercept_ins) {
7856                 port = info->src_val;
7857                 size = info->dst_bytes;
7858         } else {
7859                 port = info->dst_val;
7860                 size = info->src_bytes;
7861         }
7862
7863         /*
7864          * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
7865          * VM-exits depend on the 'unconditional IO exiting' VM-execution
7866          * control.
7867          *
7868          * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
7869          */
7870         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
7871                 intercept = nested_cpu_has(vmcs12,
7872                                            CPU_BASED_UNCOND_IO_EXITING);
7873         else
7874                 intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
7875
7876         /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
7877         return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
7878 }
7879
7880 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
7881                                struct x86_instruction_info *info,
7882                                enum x86_intercept_stage stage,
7883                                struct x86_exception *exception)
7884 {
7885         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7886
7887         switch (info->intercept) {
7888         /*
7889          * RDPID causes #UD if disabled through secondary execution controls.
7890          * Because it is marked as EmulateOnUD, we need to intercept it here.
7891          * Note, RDPID is hidden behind ENABLE_RDTSCP.
7892          */
7893         case x86_intercept_rdpid:
7894                 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
7895                         exception->vector = UD_VECTOR;
7896                         exception->error_code_valid = false;
7897                         return X86EMUL_PROPAGATE_FAULT;
7898                 }
7899                 break;
7900
7901         case x86_intercept_in:
7902         case x86_intercept_ins:
7903         case x86_intercept_out:
7904         case x86_intercept_outs:
7905                 return vmx_check_intercept_io(vcpu, info);
7906
7907         case x86_intercept_lgdt:
7908         case x86_intercept_lidt:
7909         case x86_intercept_lldt:
7910         case x86_intercept_ltr:
7911         case x86_intercept_sgdt:
7912         case x86_intercept_sidt:
7913         case x86_intercept_sldt:
7914         case x86_intercept_str:
7915                 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
7916                         return X86EMUL_CONTINUE;
7917
7918                 /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
7919                 break;
7920
7921         /* TODO: check more intercepts... */
7922         default:
7923                 break;
7924         }
7925
7926         return X86EMUL_UNHANDLEABLE;
7927 }
7928
7929 #ifdef CONFIG_X86_64
7930 /* (a << shift) / divisor; returns 1 on overflow, otherwise 0 */
7931 static inline int u64_shl_div_u64(u64 a, unsigned int shift,
7932                                   u64 divisor, u64 *result)
7933 {
7934         u64 low = a << shift, high = a >> (64 - shift);
7935
7936         /* To avoid the overflow on divq */
7937         if (high >= divisor)
7938                 return 1;
7939
7940         /* Low holds the result, high holds the remainder, which is discarded */
7941         asm("divq %2\n\t" : "=a" (low), "=d" (high) :
7942             "rm" (divisor), "0" (low), "1" (high));
7943         *result = low;
7944
7945         return 0;
7946 }
7947
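     /*
      * Arm the VMX preemption timer for the guest's TSC deadline, scaling the
      * delta to host TSC units if needed.  Returns -ERANGE if the delta can't
      * be programmed; *expired is set if the deadline has already passed.
      */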
7948 static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
7949                             bool *expired)
7950 {
7951         struct vcpu_vmx *vmx;
7952         u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
7953         struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
7954
7955         vmx = to_vmx(vcpu);
7956         tscl = rdtsc();
7957         guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
7958         delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
7959         lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
7960                                                     ktimer->timer_advance_ns);
7961
7962         if (delta_tsc > lapic_timer_advance_cycles)
7963                 delta_tsc -= lapic_timer_advance_cycles;
7964         else
7965                 delta_tsc = 0;
7966
7967         /* Convert to host delta tsc if tsc scaling is enabled */
7968         if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio &&
7969             delta_tsc && u64_shl_div_u64(delta_tsc,
7970                                 kvm_caps.tsc_scaling_ratio_frac_bits,
7971                                 vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
7972                 return -ERANGE;
7973
7974         /*
7975          * If the delta TSC can't fit in 32 bits after the multiplier shift,
7976          * we can't use the preemption timer.
7977          * It's possible that it fits on later VM-Entries, but checking
7978          * on every VM-Entry is costly, so we just use an hrtimer.
7979          */
7980         if (delta_tsc >> (cpu_preemption_timer_multi + 32))
7981                 return -ERANGE;
7982
7983         vmx->hv_deadline_tsc = tscl + delta_tsc;
7984         *expired = !delta_tsc;
7985         return 0;
7986 }
7987
7988 static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
7989 {
7990         to_vmx(vcpu)->hv_deadline_tsc = -1;
7991 }
7992 #endif
7993
7994 static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
7995 {
7996         if (!kvm_pause_in_guest(vcpu->kvm))
7997                 shrink_ple_window(vcpu);
7998 }
7999
8000 void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
8001 {
8002         struct vcpu_vmx *vmx = to_vmx(vcpu);
8003
8004         if (WARN_ON_ONCE(!enable_pml))
8005                 return;
8006
8007         if (is_guest_mode(vcpu)) {
8008                 vmx->nested.update_vmcs01_cpu_dirty_logging = true;
8009                 return;
8010         }
8011
8012         /*
8013          * Note, nr_memslots_dirty_logging can be changed concurrently with this
8014          * code, but in that case another update request will be made and so
8015          * the guest will never run with a stale PML value.
8016          */
8017         if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
8018                 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
8019         else
8020                 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
8021 }
8022
8023 static void vmx_setup_mce(struct kvm_vcpu *vcpu)
8024 {
8025         if (vcpu->arch.mcg_cap & MCG_LMCE_P)
8026                 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
8027                         FEAT_CTL_LMCE_ENABLED;
8028         else
8029                 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
8030                         ~FEAT_CTL_LMCE_ENABLED;
8031 }
8032
8033 #ifdef CONFIG_KVM_SMM
8034 static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
8035 {
8036         /* we need a nested vmexit to enter SMM, postpone if run is pending */
8037         if (to_vmx(vcpu)->nested.nested_run_pending)
8038                 return -EBUSY;
8039         return !is_smm(vcpu);
8040 }
8041
8042 static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
8043 {
8044         struct vcpu_vmx *vmx = to_vmx(vcpu);
8045
8046         /*
8047          * TODO: Implement custom flows for forcing the vCPU out/in of L2 on
8048          * SMI and RSM.  Using the common VM-Exit + VM-Enter routines is wrong,
8049          * as SMI and RSM only modify state that is saved and restored via SMRAM.
8050          * E.g. most MSRs are left untouched, but many are modified by VM-Exit
8051          * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM.
8052          */
8053         vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
8054         if (vmx->nested.smm.guest_mode)
8055                 nested_vmx_vmexit(vcpu, -1, 0, 0);
8056
8057         vmx->nested.smm.vmxon = vmx->nested.vmxon;
8058         vmx->nested.vmxon = false;
8059         vmx_clear_hlt(vcpu);
8060         return 0;
8061 }
8062
8063 static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
8064 {
8065         struct vcpu_vmx *vmx = to_vmx(vcpu);
8066         int ret;
8067
8068         if (vmx->nested.smm.vmxon) {
8069                 vmx->nested.vmxon = true;
8070                 vmx->nested.smm.vmxon = false;
8071         }
8072
8073         if (vmx->nested.smm.guest_mode) {
8074                 ret = nested_vmx_enter_non_root_mode(vcpu, false);
8075                 if (ret)
8076                         return ret;
8077
8078                 vmx->nested.nested_run_pending = 1;
8079                 vmx->nested.smm.guest_mode = false;
8080         }
8081         return 0;
8082 }
8083
8084 static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
8085 {
8086         /* RSM will cause a vmexit anyway.  */
8087 }
8088 #endif
8089
8090 static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
8091 {
8092         return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
8093 }
8094
8095 static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
8096 {
8097         if (is_guest_mode(vcpu)) {
8098                 struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
8099
8100                 if (hrtimer_try_to_cancel(timer) == 1)
8101                         hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
8102         }
8103 }
8104
8105 static void vmx_hardware_unsetup(void)
8106 {
8107         kvm_set_posted_intr_wakeup_handler(NULL);
8108
8109         if (nested)
8110                 nested_vmx_hardware_unsetup();
8111
8112         free_kvm_area();
8113 }
8114
8115 #define VMX_REQUIRED_APICV_INHIBITS                     \
8116 (                                                       \
8117         BIT(APICV_INHIBIT_REASON_DISABLE)|              \
8118         BIT(APICV_INHIBIT_REASON_ABSENT) |              \
8119         BIT(APICV_INHIBIT_REASON_HYPERV) |              \
8120         BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |            \
8121         BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
8122         BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |    \
8123         BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED)    \
8124 )
8125
8126 static void vmx_vm_destroy(struct kvm *kvm)
8127 {
8128         struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
8129
8130         free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
8131 }
8132
8133 static struct kvm_x86_ops vmx_x86_ops __initdata = {
8134         .name = KBUILD_MODNAME,
8135
8136         .check_processor_compatibility = vmx_check_processor_compat,
8137
8138         .hardware_unsetup = vmx_hardware_unsetup,
8139
8140         .hardware_enable = vmx_hardware_enable,
8141         .hardware_disable = vmx_hardware_disable,
8142         .has_emulated_msr = vmx_has_emulated_msr,
8143
8144         .vm_size = sizeof(struct kvm_vmx),
8145         .vm_init = vmx_vm_init,
8146         .vm_destroy = vmx_vm_destroy,
8147
8148         .vcpu_precreate = vmx_vcpu_precreate,
8149         .vcpu_create = vmx_vcpu_create,
8150         .vcpu_free = vmx_vcpu_free,
8151         .vcpu_reset = vmx_vcpu_reset,
8152
8153         .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
8154         .vcpu_load = vmx_vcpu_load,
8155         .vcpu_put = vmx_vcpu_put,
8156
8157         .update_exception_bitmap = vmx_update_exception_bitmap,
8158         .get_msr_feature = vmx_get_msr_feature,
8159         .get_msr = vmx_get_msr,
8160         .set_msr = vmx_set_msr,
8161         .get_segment_base = vmx_get_segment_base,
8162         .get_segment = vmx_get_segment,
8163         .set_segment = vmx_set_segment,
8164         .get_cpl = vmx_get_cpl,
8165         .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
8166         .set_cr0 = vmx_set_cr0,
8167         .is_valid_cr4 = vmx_is_valid_cr4,
8168         .set_cr4 = vmx_set_cr4,
8169         .set_efer = vmx_set_efer,
8170         .get_idt = vmx_get_idt,
8171         .set_idt = vmx_set_idt,
8172         .get_gdt = vmx_get_gdt,
8173         .set_gdt = vmx_set_gdt,
8174         .set_dr7 = vmx_set_dr7,
8175         .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
8176         .cache_reg = vmx_cache_reg,
8177         .get_rflags = vmx_get_rflags,
8178         .set_rflags = vmx_set_rflags,
8179         .get_if_flag = vmx_get_if_flag,
8180
8181         .flush_tlb_all = vmx_flush_tlb_all,
8182         .flush_tlb_current = vmx_flush_tlb_current,
8183         .flush_tlb_gva = vmx_flush_tlb_gva,
8184         .flush_tlb_guest = vmx_flush_tlb_guest,
8185
8186         .vcpu_pre_run = vmx_vcpu_pre_run,
8187         .vcpu_run = vmx_vcpu_run,
8188         .handle_exit = vmx_handle_exit,
8189         .skip_emulated_instruction = vmx_skip_emulated_instruction,
8190         .update_emulated_instruction = vmx_update_emulated_instruction,
8191         .set_interrupt_shadow = vmx_set_interrupt_shadow,
8192         .get_interrupt_shadow = vmx_get_interrupt_shadow,
8193         .patch_hypercall = vmx_patch_hypercall,
8194         .inject_irq = vmx_inject_irq,
8195         .inject_nmi = vmx_inject_nmi,
8196         .inject_exception = vmx_inject_exception,
8197         .cancel_injection = vmx_cancel_injection,
8198         .interrupt_allowed = vmx_interrupt_allowed,
8199         .nmi_allowed = vmx_nmi_allowed,
8200         .get_nmi_mask = vmx_get_nmi_mask,
8201         .set_nmi_mask = vmx_set_nmi_mask,
8202         .enable_nmi_window = vmx_enable_nmi_window,
8203         .enable_irq_window = vmx_enable_irq_window,
8204         .update_cr8_intercept = vmx_update_cr8_intercept,
8205         .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
8206         .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
8207         .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
8208         .load_eoi_exitmap = vmx_load_eoi_exitmap,
8209         .apicv_post_state_restore = vmx_apicv_post_state_restore,
8210         .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
8211         .hwapic_irr_update = vmx_hwapic_irr_update,
8212         .hwapic_isr_update = vmx_hwapic_isr_update,
8213         .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
8214         .sync_pir_to_irr = vmx_sync_pir_to_irr,
8215         .deliver_interrupt = vmx_deliver_interrupt,
8216         .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
8217
8218         .set_tss_addr = vmx_set_tss_addr,
8219         .set_identity_map_addr = vmx_set_identity_map_addr,
8220         .get_mt_mask = vmx_get_mt_mask,
8221
8222         .get_exit_info = vmx_get_exit_info,
8223
8224         .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
8225
8226         .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
8227
8228         .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
8229         .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
8230         .write_tsc_offset = vmx_write_tsc_offset,
8231         .write_tsc_multiplier = vmx_write_tsc_multiplier,
8232
8233         .load_mmu_pgd = vmx_load_mmu_pgd,
8234
8235         .check_intercept = vmx_check_intercept,
8236         .handle_exit_irqoff = vmx_handle_exit_irqoff,
8237
8238         .request_immediate_exit = vmx_request_immediate_exit,
8239
8240         .sched_in = vmx_sched_in,
8241
8242         .cpu_dirty_log_size = PML_ENTITY_NUM,
8243         .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
8244
8245         .nested_ops = &vmx_nested_ops,
8246
8247         .pi_update_irte = vmx_pi_update_irte,
8248         .pi_start_assignment = vmx_pi_start_assignment,
8249
8250 #ifdef CONFIG_X86_64
8251         .set_hv_timer = vmx_set_hv_timer,
8252         .cancel_hv_timer = vmx_cancel_hv_timer,
8253 #endif
8254
8255         .setup_mce = vmx_setup_mce,
8256
8257 #ifdef CONFIG_KVM_SMM
8258         .smi_allowed = vmx_smi_allowed,
8259         .enter_smm = vmx_enter_smm,
8260         .leave_smm = vmx_leave_smm,
8261         .enable_smi_window = vmx_enable_smi_window,
8262 #endif
8263
8264         .can_emulate_instruction = vmx_can_emulate_instruction,
8265         .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
8266         .migrate_timers = vmx_migrate_timers,
8267
8268         .msr_filter_changed = vmx_msr_filter_changed,
8269         .complete_emulated_msr = kvm_complete_insn_gp,
8270
8271         .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
8272 };
8273
8274 static unsigned int vmx_handle_intel_pt_intr(void)
8275 {
8276         struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
8277
8278         /* '0' on failure so that the !PT case can use a RET0 static call. */
8279         if (!vcpu || !kvm_handling_nmi_from_guest(vcpu))
8280                 return 0;
8281
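             /*
              * A PT PMI (ToPA buffer full) arrived while KVM was handling an
              * NMI from the guest: flag the overflow in the guest's
              * GLOBAL_STATUS and request a PMI so that it gets injected into
              * the guest.
              */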
8282         kvm_make_request(KVM_REQ_PMI, vcpu);
8283         __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
8284                   (unsigned long *)&vcpu->arch.pmu.global_status);
8285         return 1;
8286 }
8287
8288 static __init void vmx_setup_user_return_msrs(void)
8289 {
8290
8291         /*
8292          * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
8293          * will emulate SYSCALL in legacy mode if the vendor string in guest
8294          * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!".  To
8295          * support this emulation, MSR_STAR is included in the list for i386,
8296          * but is never loaded into hardware.  MSR_CSTAR is also never loaded
8297          * into hardware and is here purely for emulation purposes.
8298          */
8299         const u32 vmx_uret_msrs_list[] = {
8300         #ifdef CONFIG_X86_64
8301                 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
8302         #endif
8303                 MSR_EFER, MSR_TSC_AUX, MSR_STAR,
8304                 MSR_IA32_TSX_CTRL,
8305         };
8306         int i;
8307
8308         BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
8309
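             /*
              * Only the MSR slots are registered here; the values themselves
              * are programmed later via kvm_set_user_return_msr() and are
              * restored to the host's values on return to userspace.
              */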
8310         for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
8311                 kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
8312 }
8313
8314 static void __init vmx_setup_me_spte_mask(void)
8315 {
8316         u64 me_mask = 0;
8317
8318         /*
8319          * Use kvm_get_shadow_phys_bits(), which returns shadow_phys_bits,
8320          * so that shadow_phys_bits itself isn't exposed outside the MMU.
8321          *
8322          * On pre-MKTME systems, boot_cpu_data.x86_phys_bits equals
8323          * shadow_phys_bits.  On MKTME and/or TDX capable systems,
8324          * boot_cpu_data.x86_phys_bits is the usable physical address width
8325          * w/o the KeyID bits, while shadow_phys_bits equals the MAXPHYADDR
8326          * reported by CPUID.  The bits in between are the KeyID bits.
8327          */
8328         if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits())
8329                 me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
8330                         kvm_get_shadow_phys_bits() - 1);
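             /*
              * Illustration with hypothetical values: a part reporting
              * MAXPHYADDR = 52 via CPUID but x86_phys_bits = 46 uses bits
              * 46..51 as KeyID bits, so me_mask = rsvd_bits(46, 51) marks
              * exactly those bits.
              */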
8331         /*
8332          * Unlike SME, the host kernel doesn't support setting up any MKTME
8333          * KeyID on Intel platforms, so no memory encryption bits should be
8334          * included in the SPTE.
8335          */
8336         kvm_mmu_set_me_spte_mask(0, me_mask);
8337 }
8338
8339 static struct kvm_x86_init_ops vmx_init_ops __initdata;
8340
8341 static __init int hardware_setup(void)
8342 {
8343         unsigned long host_bndcfgs;
8344         struct desc_ptr dt;
8345         int r;
8346
8347         store_idt(&dt);
8348         host_idt_base = dt.address;
8349
8350         vmx_setup_user_return_msrs();
8351
8352         if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
8353                 return -EIO;
8354
8355         if (cpu_has_perf_global_ctrl_bug())
8356                 pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
8357                              "does not work properly. Using workaround\n");
8358
8359         if (boot_cpu_has(X86_FEATURE_NX))
8360                 kvm_enable_efer_bits(EFER_NX);
8361
8362         if (boot_cpu_has(X86_FEATURE_MPX)) {
8363                 rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
8364                 WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
8365         }
8366
8367         if (!cpu_has_vmx_mpx())
8368                 kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
8369                                              XFEATURE_MASK_BNDCSR);
8370
8371         if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
8372             !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
8373                 enable_vpid = 0;
8374
8375         if (!cpu_has_vmx_ept() ||
8376             !cpu_has_vmx_ept_4levels() ||
8377             !cpu_has_vmx_ept_mt_wb() ||
8378             !cpu_has_vmx_invept_global())
8379                 enable_ept = 0;
8380
8381         /* NX support is required for shadow paging. */
8382         if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
8383                 pr_err_ratelimited("NX (Execute Disable) not supported\n");
8384                 return -EOPNOTSUPP;
8385         }
8386
8387         if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
8388                 enable_ept_ad_bits = 0;
8389
8390         if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
8391                 enable_unrestricted_guest = 0;
8392
8393         if (!cpu_has_vmx_flexpriority())
8394                 flexpriority_enabled = 0;
8395
8396         if (!cpu_has_virtual_nmis())
8397                 enable_vnmi = 0;
8398
8399 #ifdef CONFIG_X86_SGX_KVM
8400         if (!cpu_has_vmx_encls_vmexit())
8401                 enable_sgx = false;
8402 #endif
8403
8404         /*
8405          * set_apic_access_page_addr() is used to reload the APIC-access
8406          * page upon invalidation.  No need to do anything if the
8407          * APIC_ACCESS_ADDR VMCS field isn't being used.
8408          */
8409         if (!flexpriority_enabled)
8410                 vmx_x86_ops.set_apic_access_page_addr = NULL;
8411
8412         if (!cpu_has_vmx_tpr_shadow())
8413                 vmx_x86_ops.update_cr8_intercept = NULL;
8414
8415 #if IS_ENABLED(CONFIG_HYPERV)
8416         if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
8417             && enable_ept) {
8418                 vmx_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
8419                 vmx_x86_ops.tlb_remote_flush_with_range =
8420                                 hv_remote_flush_tlb_with_range;
8421         }
8422 #endif
8423
8424         if (!cpu_has_vmx_ple()) {
8425                 ple_gap = 0;
8426                 ple_window = 0;
8427                 ple_window_grow = 0;
8428                 ple_window_max = 0;
8429                 ple_window_shrink = 0;
8430         }
8431
8432         if (!cpu_has_vmx_apicv())
8433                 enable_apicv = 0;
8434         if (!enable_apicv)
8435                 vmx_x86_ops.sync_pir_to_irr = NULL;
8436
8437         if (!enable_apicv || !cpu_has_vmx_ipiv())
8438                 enable_ipiv = false;
8439
8440         if (cpu_has_vmx_tsc_scaling())
8441                 kvm_caps.has_tsc_control = true;
8442
8443         kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
8444         kvm_caps.tsc_scaling_ratio_frac_bits = 48;
8445         kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
8446         kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit();
8447
8448         set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
8449
8450         if (enable_ept)
8451                 kvm_mmu_set_ept_masks(enable_ept_ad_bits,
8452                                       cpu_has_vmx_ept_execute_only());
8453
8454         /*
8455          * Set up shadow_me_value/shadow_me_mask so that the MKTME KeyID
8456          * bits are included in shadow_zero_check.
8457          */
8458         vmx_setup_me_spte_mask();
8459
8460         kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
8461                           ept_caps_to_lpage_level(vmx_capability.ept));
8462
8463         /*
8464          * Only enable PML when the hardware supports it and both EPT and
8465          * the EPT A/D bits are enabled -- PML depends on them to work.
8466          */
8467         if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
8468                 enable_pml = 0;
8469
8470         if (!enable_pml)
8471                 vmx_x86_ops.cpu_dirty_log_size = 0;
8472
8473         if (!cpu_has_vmx_preemption_timer())
8474                 enable_preemption_timer = false;
8475
8476         if (enable_preemption_timer) {
8477                 u64 use_timer_freq = 5000ULL * 1000 * 1000;
8478
8479                 cpu_preemption_timer_multi =
8480                         vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
8481
8482                 if (tsc_khz)
8483                         use_timer_freq = (u64)tsc_khz * 1000;
8484                 use_timer_freq >>= cpu_preemption_timer_multi;
8485
8486                 /*
8487                  * KVM "disables" the preemption timer by setting it to its max
8488                  * value.  Don't use the timer if it might cause spurious exits
8489                  * at a rate faster than 0.1 Hz (of uninterrupted guest time).
8490                  */
8491                 if (use_timer_freq > 0xffffffffu / 10)
8492                         enable_preemption_timer = false;
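                     /*
                      * Worked example with hypothetical numbers: a 2.1 GHz TSC
                      * and a VMX_MISC rate of 5 yield use_timer_freq =
                      * 2.1e9 >> 5, i.e. ~65.6 MHz, so the "disabled" value of
                      * 0xffffffff expires after ~65 seconds, comfortably above
                      * the 10 second (0.1 Hz) floor, and the timer stays on.
                      */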
8493         }
8494
8495         if (!enable_preemption_timer) {
8496                 vmx_x86_ops.set_hv_timer = NULL;
8497                 vmx_x86_ops.cancel_hv_timer = NULL;
8498                 vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
8499         }
8500
8501         kvm_caps.supported_mce_cap |= MCG_LMCE_P;
8502         kvm_caps.supported_mce_cap |= MCG_CMCI_P;
8503
8504         if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
8505                 return -EINVAL;
8506         if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt())
8507                 pt_mode = PT_MODE_SYSTEM;
8508         if (pt_mode == PT_MODE_HOST_GUEST)
8509                 vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
8510         else
8511                 vmx_init_ops.handle_intel_pt_intr = NULL;
8512
8513         setup_default_sgx_lepubkeyhash();
8514
8515         if (nested) {
8516                 nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept);
8517
8518                 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
8519                 if (r)
8520                         return r;
8521         }
8522
8523         vmx_set_cpu_caps();
8524
8525         r = alloc_kvm_area();
8526         if (r && nested)
8527                 nested_vmx_hardware_unsetup();
8528
8529         kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
8530
8531         return r;
8532 }
8533
8534 static struct kvm_x86_init_ops vmx_init_ops __initdata = {
8535         .hardware_setup = hardware_setup,
8536         .handle_intel_pt_intr = NULL,
8537
8538         .runtime_ops = &vmx_x86_ops,
8539         .pmu_ops = &intel_pmu_ops,
8540 };
8541
8542 static void vmx_cleanup_l1d_flush(void)
8543 {
8544         if (vmx_l1d_flush_pages) {
8545                 free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
8546                 vmx_l1d_flush_pages = NULL;
8547         }
8548         /* Restore state so sysfs ignores VMX */
8549         l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
8550 }
8551
8552 static void __vmx_exit(void)
8553 {
8554         allow_smaller_maxphyaddr = false;
8555
8556 #ifdef CONFIG_KEXEC_CORE
8557         RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
8558         synchronize_rcu();
8559 #endif
8560         vmx_cleanup_l1d_flush();
8561 }
8562
8563 static void vmx_exit(void)
8564 {
8565         kvm_exit();
8566         kvm_x86_vendor_exit();
8567
8568         __vmx_exit();
8569 }
8570 module_exit(vmx_exit);
8571
8572 static int __init vmx_init(void)
8573 {
8574         int r, cpu;
8575
8576         if (!kvm_is_vmx_supported())
8577                 return -EOPNOTSUPP;
8578
8579         /*
8580          * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing
8581          * to unwind if a later step fails.
8582          */
8583         hv_init_evmcs();
8584
8585         r = kvm_x86_vendor_init(&vmx_init_ops);
8586         if (r)
8587                 return r;
8588
8589         /*
8590          * Must be called after common x86 init so enable_ept is properly
8591          * set up.  Pass in the mitigation value that was stashed by the
8592          * pre-module-init parameter parser; if no parameter was given, it
8593          * contains 'auto', which is turned into the default 'cond'
8594          * mitigation mode.
8595          */
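             /*
              * For reference, the mode can also be chosen explicitly at load
              * time via the vmentry_l1d_flush module parameter, e.g.
              * "kvm-intel.vmentry_l1d_flush=always" on the kernel command
              * line.
              */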
8596         r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
8597         if (r)
8598                 goto err_l1d_flush;
8599
8600         vmx_setup_fb_clear_ctrl();
8601
8602         for_each_possible_cpu(cpu) {
8603                 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
8604
8605                 pi_init_cpu(cpu);
8606         }
8607
8608 #ifdef CONFIG_KEXEC_CORE
8609         rcu_assign_pointer(crash_vmclear_loaded_vmcss,
8610                            crash_vmclear_local_loaded_vmcss);
8611 #endif
8612         vmx_check_vmcs12_offsets();
8613
8614         /*
8615          * Shadow paging doesn't have a (further) performance penalty
8616          * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR, so enable it by
8617          * default.
8618          */
8619         if (!enable_ept)
8620                 allow_smaller_maxphyaddr = true;
8621
8622         /*
8623          * Common KVM initialization _must_ come last; after this, /dev/kvm
8624          * is exposed to userspace!
8625          */
8626         r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx),
8627                      THIS_MODULE);
8628         if (r)
8629                 goto err_kvm_init;
8630
8631         return 0;
8632
8633 err_kvm_init:
8634         __vmx_exit();
8635 err_l1d_flush:
8636         kvm_x86_vendor_exit();
8637         return r;
8638 }
8639 module_init(vmx_init);