1 // SPDX-License-Identifier: GPL-2.0-only
3 * Kernel-based Virtual Machine driver for Linux
5 * This module enables machines with Intel VT-x extensions to run virtual
6 * machines without emulation or binary translation.
8 * Copyright (C) 2006 Qumranet, Inc.
9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
12 * Avi Kivity <avi@qumranet.com>
13 * Yaniv Kamay <yaniv@qumranet.com>
15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
17 #include <linux/highmem.h>
18 #include <linux/hrtimer.h>
19 #include <linux/kernel.h>
20 #include <linux/kvm_host.h>
21 #include <linux/module.h>
22 #include <linux/moduleparam.h>
23 #include <linux/mod_devicetable.h>
25 #include <linux/objtool.h>
26 #include <linux/sched.h>
27 #include <linux/sched/smt.h>
28 #include <linux/slab.h>
29 #include <linux/tboot.h>
30 #include <linux/trace_events.h>
31 #include <linux/entry-kvm.h>
36 #include <asm/cpu_device_id.h>
37 #include <asm/debugreg.h>
39 #include <asm/fpu/api.h>
40 #include <asm/fpu/xstate.h>
41 #include <asm/idtentry.h>
43 #include <asm/irq_remapping.h>
44 #include <asm/kexec.h>
45 #include <asm/perf_event.h>
46 #include <asm/mmu_context.h>
47 #include <asm/mshyperv.h>
48 #include <asm/mwait.h>
49 #include <asm/spec-ctrl.h>
50 #include <asm/virtext.h>
53 #include "capabilities.h"
56 #include "kvm_onhyperv.h"
58 #include "kvm_cache_regs.h"
71 MODULE_AUTHOR("Qumranet");
72 MODULE_LICENSE("GPL");
75 static const struct x86_cpu_id vmx_cpu_id[] = {
76 X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
79 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
82 bool __read_mostly enable_vpid = 1;
83 module_param_named(vpid, enable_vpid, bool, 0444);
85 static bool __read_mostly enable_vnmi = 1;
86 module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
88 bool __read_mostly flexpriority_enabled = 1;
89 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
91 bool __read_mostly enable_ept = 1;
92 module_param_named(ept, enable_ept, bool, S_IRUGO);
94 bool __read_mostly enable_unrestricted_guest = 1;
95 module_param_named(unrestricted_guest,
96 enable_unrestricted_guest, bool, S_IRUGO);
98 bool __read_mostly enable_ept_ad_bits = 1;
99 module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
101 static bool __read_mostly emulate_invalid_guest_state = true;
102 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
104 static bool __read_mostly fasteoi = 1;
105 module_param(fasteoi, bool, S_IRUGO);
107 module_param(enable_apicv, bool, S_IRUGO);
109 bool __read_mostly enable_ipiv = true;
110 module_param(enable_ipiv, bool, 0444);
113 * If nested=1, nested virtualization is supported, i.e., guests may use
114 * VMX and be a hypervisor for its own guests. If nested=0, guests may not
115 * use VMX instructions.
117 static bool __read_mostly nested = 1;
118 module_param(nested, bool, S_IRUGO);
120 bool __read_mostly enable_pml = 1;
121 module_param_named(pml, enable_pml, bool, S_IRUGO);
123 static bool __read_mostly error_on_inconsistent_vmcs_config = true;
124 module_param(error_on_inconsistent_vmcs_config, bool, 0444);
126 static bool __read_mostly dump_invalid_vmcs = 0;
127 module_param(dump_invalid_vmcs, bool, 0644);
129 #define MSR_BITMAP_MODE_X2APIC 1
130 #define MSR_BITMAP_MODE_X2APIC_APICV 2
132 #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
134 /* Guest_tsc -> host_tsc conversion requires 64-bit division. */
135 static int __read_mostly cpu_preemption_timer_multi;
136 static bool __read_mostly enable_preemption_timer = 1;
138 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
141 extern bool __read_mostly allow_smaller_maxphyaddr;
142 module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
144 #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
145 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
146 #define KVM_VM_CR0_ALWAYS_ON \
147 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
149 #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
150 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
151 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
153 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
155 #define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
156 RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
157 RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
158 RTIT_STATUS_BYTECNT))
161 * List of MSRs that can be directly passed to the guest.
162 * In addition to these, x2apic and PT MSRs are handled specially.
164 static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
175 MSR_IA32_SYSENTER_CS,
176 MSR_IA32_SYSENTER_ESP,
177 MSR_IA32_SYSENTER_EIP,
179 MSR_CORE_C3_RESIDENCY,
180 MSR_CORE_C6_RESIDENCY,
181 MSR_CORE_C7_RESIDENCY,
185 * These 2 parameters are used to configure the controls for Pause-Loop Exiting:
186 * ple_gap: upper bound on the amount of time between two successive
187 * executions of PAUSE in a loop. Also indicates whether PLE is enabled.
188 * According to testing, this time is usually smaller than 128 cycles.
189 * ple_window: upper bound on the amount of time a guest is allowed to execute
190 * in a PAUSE loop. Tests indicate that most spinlocks are held for
191 * less than 2^12 cycles
192 * Time is measured based on a counter that runs at the same rate as the TSC,
193 * refer to SDM volume 3b sections 21.6.13 & 22.1.3.
195 static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
196 module_param(ple_gap, uint, 0444);
198 static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
199 module_param(ple_window, uint, 0444);
201 /* Default doubles per-vcpu window every exit. */
202 static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
203 module_param(ple_window_grow, uint, 0444);
205 /* Default resets per-vcpu window every exit to ple_window. */
206 static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
207 module_param(ple_window_shrink, uint, 0444);
209 /* Default is to compute the maximum so we can never overflow. */
210 static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
211 module_param(ple_window_max, uint, 0444);
213 /* Default is SYSTEM mode, 1 for host-guest mode */
214 int __read_mostly pt_mode = PT_MODE_SYSTEM;
215 module_param(pt_mode, int, S_IRUGO);
217 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
218 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
219 static DEFINE_MUTEX(vmx_l1d_flush_mutex);
221 /* Storage for pre module init parameter parsing */
222 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
224 static const struct {
227 } vmentry_l1d_param[] = {
228 [VMENTER_L1D_FLUSH_AUTO] = {"auto", true},
229 [VMENTER_L1D_FLUSH_NEVER] = {"never", true},
230 [VMENTER_L1D_FLUSH_COND] = {"cond", true},
231 [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true},
232 [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
233 [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
236 #define L1D_CACHE_ORDER 4
237 static void *vmx_l1d_flush_pages;
239 /* Control for disabling CPU Fill buffer clear */
240 static bool __read_mostly vmx_fb_clear_ctrl_available;
242 static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
247 if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
248 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
253 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
257 if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
260 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
261 if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
262 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
267 /* If set to auto use the default l1tf mitigation method */
268 if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
269 switch (l1tf_mitigation) {
270 case L1TF_MITIGATION_OFF:
271 l1tf = VMENTER_L1D_FLUSH_NEVER;
273 case L1TF_MITIGATION_FLUSH_NOWARN:
274 case L1TF_MITIGATION_FLUSH:
275 case L1TF_MITIGATION_FLUSH_NOSMT:
276 l1tf = VMENTER_L1D_FLUSH_COND;
278 case L1TF_MITIGATION_FULL:
279 case L1TF_MITIGATION_FULL_FORCE:
280 l1tf = VMENTER_L1D_FLUSH_ALWAYS;
283 } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
284 l1tf = VMENTER_L1D_FLUSH_ALWAYS;
287 if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
288 !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
290 * This allocation for vmx_l1d_flush_pages is not tied to a VM
291 * lifetime and so should not be charged to a memcg.
293 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
296 vmx_l1d_flush_pages = page_address(page);
299 * Initialize each page with a different pattern in
300 * order to protect against KSM in the nested
301 * virtualization case.
303 for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
304 memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
309 l1tf_vmx_mitigation = l1tf;
311 if (l1tf != VMENTER_L1D_FLUSH_NEVER)
312 static_branch_enable(&vmx_l1d_should_flush);
314 static_branch_disable(&vmx_l1d_should_flush);
316 if (l1tf == VMENTER_L1D_FLUSH_COND)
317 static_branch_enable(&vmx_l1d_flush_cond);
319 static_branch_disable(&vmx_l1d_flush_cond);
323 static int vmentry_l1d_flush_parse(const char *s)
328 for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
329 if (vmentry_l1d_param[i].for_parse &&
330 sysfs_streq(s, vmentry_l1d_param[i].option))
337 static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
341 l1tf = vmentry_l1d_flush_parse(s);
345 if (!boot_cpu_has(X86_BUG_L1TF))
349 * Has vmx_init() run already? If not then this is the pre init
350 * parameter parsing. In that case just store the value and let
351 * vmx_init() do the proper setup after enable_ept has been
354 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
355 vmentry_l1d_flush_param = l1tf;
359 mutex_lock(&vmx_l1d_flush_mutex);
360 ret = vmx_setup_l1d_flush(l1tf);
361 mutex_unlock(&vmx_l1d_flush_mutex);
365 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
367 if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
368 return sprintf(s, "???\n");
370 return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
373 static void vmx_setup_fb_clear_ctrl(void)
377 if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) &&
378 !boot_cpu_has_bug(X86_BUG_MDS) &&
379 !boot_cpu_has_bug(X86_BUG_TAA)) {
380 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
381 if (msr & ARCH_CAP_FB_CLEAR_CTRL)
382 vmx_fb_clear_ctrl_available = true;
386 static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
390 if (!vmx->disable_fb_clear)
393 msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL);
395 native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
396 /* Cache the MSR value to avoid reading it later */
397 vmx->msr_ia32_mcu_opt_ctrl = msr;
400 static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
402 if (!vmx->disable_fb_clear)
405 vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
406 native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
409 static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
411 vmx->disable_fb_clear = vmx_fb_clear_ctrl_available;
414 * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS
415 * at VMEntry. Skip the MSR read/write when a guest has no use case to
418 if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
419 ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
420 (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
421 (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
422 (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
423 (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
424 vmx->disable_fb_clear = false;
427 static const struct kernel_param_ops vmentry_l1d_flush_ops = {
428 .set = vmentry_l1d_flush_set,
429 .get = vmentry_l1d_flush_get,
431 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
433 static u32 vmx_segment_access_rights(struct kvm_segment *var);
435 void vmx_vmexit(void);
437 #define vmx_insn_failed(fmt...) \
440 pr_warn_ratelimited(fmt); \
443 void vmread_error(unsigned long field, bool fault)
446 kvm_spurious_fault();
448 vmx_insn_failed("vmread failed: field=%lx\n", field);
451 noinline void vmwrite_error(unsigned long field, unsigned long value)
453 vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n",
454 field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
457 noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
459 vmx_insn_failed("vmclear failed: %p/%llx err=%u\n",
460 vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
463 noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
465 vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n",
466 vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
469 noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
471 vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
475 noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
477 vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
481 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
482 DEFINE_PER_CPU(struct vmcs *, current_vmcs);
484 * We maintain a per-CPU linked list of VMCSs loaded on that CPU. This is needed
485 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
487 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
489 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
490 static DEFINE_SPINLOCK(vmx_vpid_lock);
492 struct vmcs_config vmcs_config __ro_after_init;
493 struct vmx_capability vmx_capability __ro_after_init;
495 #define VMX_SEGMENT_FIELD(seg) \
496 [VCPU_SREG_##seg] = { \
497 .selector = GUEST_##seg##_SELECTOR, \
498 .base = GUEST_##seg##_BASE, \
499 .limit = GUEST_##seg##_LIMIT, \
500 .ar_bytes = GUEST_##seg##_AR_BYTES, \
503 static const struct kvm_vmx_segment_field {
508 } kvm_vmx_segment_fields[] = {
509 VMX_SEGMENT_FIELD(CS),
510 VMX_SEGMENT_FIELD(DS),
511 VMX_SEGMENT_FIELD(ES),
512 VMX_SEGMENT_FIELD(FS),
513 VMX_SEGMENT_FIELD(GS),
514 VMX_SEGMENT_FIELD(SS),
515 VMX_SEGMENT_FIELD(TR),
516 VMX_SEGMENT_FIELD(LDTR),
519 static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
521 vmx->segment_cache.bitmask = 0;
524 static unsigned long host_idt_base;
526 #if IS_ENABLED(CONFIG_HYPERV)
527 static struct kvm_x86_ops vmx_x86_ops __initdata;
529 static bool __read_mostly enlightened_vmcs = true;
530 module_param(enlightened_vmcs, bool, 0444);
532 static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
534 struct hv_enlightened_vmcs *evmcs;
535 struct hv_partition_assist_pg **p_hv_pa_pg =
536 &to_kvm_hv(vcpu->kvm)->hv_pa_pg;
538 * Synthetic VM-Exit is not enabled in current code and so all
539 * eVMCSs in a single VM share the same assist page.
542 *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
547 evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
549 evmcs->partition_assist_page =
551 evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
552 evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
557 static __init void hv_init_evmcs(void)
561 if (!enlightened_vmcs)
565 * Enlightened VMCS usage should be recommended and the host needs
566 * to support eVMCS v1 or above.
568 if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
569 (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
572 /* Check that we have assist pages on all online CPUs */
573 for_each_online_cpu(cpu) {
574 if (!hv_get_vp_assist_page(cpu)) {
575 enlightened_vmcs = false;
580 if (enlightened_vmcs) {
581 pr_info("Using Hyper-V Enlightened VMCS\n");
582 static_branch_enable(&enable_evmcs);
585 if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
586 vmx_x86_ops.enable_l2_tlb_flush
587 = hv_enable_l2_tlb_flush;
590 enlightened_vmcs = false;
594 static void hv_reset_evmcs(void)
596 struct hv_vp_assist_page *vp_ap;
598 if (!static_branch_unlikely(&enable_evmcs))
602 * KVM should enable eVMCS if and only if all CPUs have a VP assist
603 * page, and should reject CPU onlining if eVMCS is enabled but the CPU
604 * doesn't have a VP assist page allocated.
606 vp_ap = hv_get_vp_assist_page(smp_processor_id());
607 if (WARN_ON_ONCE(!vp_ap))
611 * Reset everything to support using non-enlightened VMCS access later
612 * (e.g. when we reload the module with enlightened_vmcs=0)
614 vp_ap->nested_control.features.directhypercall = 0;
615 vp_ap->current_nested_vmcs = 0;
616 vp_ap->enlighten_vmentry = 0;
619 #else /* IS_ENABLED(CONFIG_HYPERV) */
620 static void hv_init_evmcs(void) {}
621 static void hv_reset_evmcs(void) {}
622 #endif /* IS_ENABLED(CONFIG_HYPERV) */
625 * Comment's format: document - errata name - stepping - processor name.
627 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
629 static u32 vmx_preemption_cpu_tfms[] = {
630 /* 323344.pdf - BA86 - D0 - Xeon 7500 Series */
632 /* 323056.pdf - AAX65 - C2 - Xeon L3406 */
633 /* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
634 /* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
636 /* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
638 /* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */
639 /* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */
641 * 320767.pdf - AAP86 - B1 -
642 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
645 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */
647 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */
649 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
651 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
652 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
653 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
655 /* Xeon E3-1220 V2 */
659 static inline bool cpu_has_broken_vmx_preemption_timer(void)
661 u32 eax = cpuid_eax(0x00000001), i;
663 /* Clear the reserved bits */
664 eax &= ~(0x3U << 14 | 0xfU << 28);
665 for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
666 if (eax == vmx_preemption_cpu_tfms[i])
672 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
674 return flexpriority_enabled && lapic_in_kernel(vcpu);
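/*
 * Return the index of @msr in vmx_possible_passthrough_msrs[], or -ENOENT if
 * the MSR is not eligible for passthrough (see is_valid_passthrough_msr()).
 */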
677 static int possible_passthrough_msr_slot(u32 msr)
681 for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++)
682 if (vmx_possible_passthrough_msrs[i] == msr)
688 static bool is_valid_passthrough_msr(u32 msr)
693 case 0x800 ... 0x8ff:
694 /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
696 case MSR_IA32_RTIT_STATUS:
697 case MSR_IA32_RTIT_OUTPUT_BASE:
698 case MSR_IA32_RTIT_OUTPUT_MASK:
699 case MSR_IA32_RTIT_CR3_MATCH:
700 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
701 /* PT MSRs. These are handled in pt_update_intercept_for_msr() */
704 case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31:
705 case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31:
706 case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:
707 case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
708 case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
709 /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
713 r = possible_passthrough_msr_slot(msr) != -ENOENT;
715 WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
720 struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
724 i = kvm_find_user_return_msr(msr);
726 return &vmx->guest_uret_msrs[i];
730 static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
731 struct vmx_uret_msr *msr, u64 data)
733 unsigned int slot = msr - vmx->guest_uret_msrs;
736 if (msr->load_into_hardware) {
738 ret = kvm_set_user_return_msr(slot, data, msr->mask);
746 #ifdef CONFIG_KEXEC_CORE
747 static void crash_vmclear_local_loaded_vmcss(void)
749 int cpu = raw_smp_processor_id();
750 struct loaded_vmcs *v;
752 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
753 loaded_vmcss_on_cpu_link)
756 #endif /* CONFIG_KEXEC_CORE */
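/*
 * Runs on the CPU that owns @arg (a struct loaded_vmcs), either locally or
 * via IPI from loaded_vmcs_clear(), to VMCLEAR the VMCS and remove it from
 * that CPU's loaded_vmcss_on_cpu list.
 */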
758 static void __loaded_vmcs_clear(void *arg)
760 struct loaded_vmcs *loaded_vmcs = arg;
761 int cpu = raw_smp_processor_id();
763 if (loaded_vmcs->cpu != cpu)
764 return; /* vcpu migration can race with cpu offline */
765 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
766 per_cpu(current_vmcs, cpu) = NULL;
768 vmcs_clear(loaded_vmcs->vmcs);
769 if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
770 vmcs_clear(loaded_vmcs->shadow_vmcs);
772 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
775 * Ensure all writes to loaded_vmcs, including deleting it from its
776 * current percpu list, complete before setting loaded_vmcs->cpu to
777 * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
778 * and add loaded_vmcs to its percpu list before it's deleted from this
779 * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
783 loaded_vmcs->cpu = -1;
784 loaded_vmcs->launched = 0;
787 void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
789 int cpu = loaded_vmcs->cpu;
792 smp_call_function_single(cpu,
793 __loaded_vmcs_clear, loaded_vmcs, 1);
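/*
 * Test-and-set the validity bit for a cached segment field; returns a
 * non-zero value if the field was already cached and can be used as-is.
 */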
796 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
800 u32 mask = 1 << (seg * SEG_FIELD_NR + field);
802 if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
803 kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
804 vmx->segment_cache.bitmask = 0;
806 ret = vmx->segment_cache.bitmask & mask;
807 vmx->segment_cache.bitmask |= mask;
811 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
813 u16 *p = &vmx->segment_cache.seg[seg].selector;
815 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
816 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
820 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
822 ulong *p = &vmx->segment_cache.seg[seg].base;
824 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
825 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
829 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
831 u32 *p = &vmx->segment_cache.seg[seg].limit;
833 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
834 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
838 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
840 u32 *p = &vmx->segment_cache.seg[seg].ar;
842 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
843 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
847 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
851 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
852 (1u << DB_VECTOR) | (1u << AC_VECTOR);
854 * Guest access to VMware backdoor ports could legitimately
855 * trigger #GP because of TSS I/O permission bitmap.
856 * We intercept those #GP and allow access to them anyway
859 if (enable_vmware_backdoor)
860 eb |= (1u << GP_VECTOR);
861 if ((vcpu->guest_debug &
862 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
863 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
864 eb |= 1u << BP_VECTOR;
865 if (to_vmx(vcpu)->rmode.vm86_active)
867 if (!vmx_need_pf_intercept(vcpu))
868 eb &= ~(1u << PF_VECTOR);
870 /* When we are running a nested L2 guest and L1 specified for it a
871 * certain exception bitmap, we must trap the same exceptions and pass
872 * them to L1. When running L2, we will only handle the exceptions
873 * specified above if L1 did not want them.
875 if (is_guest_mode(vcpu))
876 eb |= get_vmcs12(vcpu)->exception_bitmap;
878 int mask = 0, match = 0;
880 if (enable_ept && (eb & (1u << PF_VECTOR))) {
882 * If EPT is enabled, #PF is currently only intercepted
883 * if MAXPHYADDR is smaller on the guest than on the
884 * host. In that case we only care about present,
885 * non-reserved faults. For vmcs02, however, PFEC_MASK
886 * and PFEC_MATCH are set in prepare_vmcs02_rare.
888 mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK;
889 match = PFERR_PRESENT_MASK;
891 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
892 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
896 * Disabling xfd interception indicates that dynamic xfeatures
897 * might be used in the guest. Always trap #NM in this case
898 * to save guest xfd_err timely.
900 if (vcpu->arch.xfd_no_write_intercept)
901 eb |= (1u << NM_VECTOR);
903 vmcs_write32(EXCEPTION_BITMAP, eb);
907 * Check if writes to the MSR are intercepted for the currently loaded MSR bitmap.
909 static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
911 if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
914 return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr);
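/* Compute the VMX_RUN_* flags consumed by the low-level VM-Enter path. */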
917 unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
919 unsigned int flags = 0;
921 if (vmx->loaded_vmcs->launched)
922 flags |= VMX_RUN_VMRESUME;
925 * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free
926 * to change it directly without causing a vmexit. In that case read
927 * it after vmexit and store it in vmx->spec_ctrl.
929 if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
930 flags |= VMX_RUN_SAVE_SPEC_CTRL;
935 static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
936 unsigned long entry, unsigned long exit)
938 vm_entry_controls_clearbit(vmx, entry);
939 vm_exit_controls_clearbit(vmx, exit);
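/*
 * Find @msr in an atomic VM-Entry/VM-Exit MSR load/store array; returns its
 * slot index, or a negative value if the MSR is not present.
 */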
942 int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
946 for (i = 0; i < m->nr; ++i) {
947 if (m->val[i].index == msr)
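/*
 * Drop @msr from the VM-Entry/VM-Exit autoload lists, or clear the dedicated
 * VM-Entry/VM-Exit controls for MSRs (EFER, PERF_GLOBAL_CTRL) that have them.
 */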
953 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
956 struct msr_autoload *m = &vmx->msr_autoload;
960 if (cpu_has_load_ia32_efer()) {
961 clear_atomic_switch_msr_special(vmx,
962 VM_ENTRY_LOAD_IA32_EFER,
963 VM_EXIT_LOAD_IA32_EFER);
967 case MSR_CORE_PERF_GLOBAL_CTRL:
968 if (cpu_has_load_perf_global_ctrl()) {
969 clear_atomic_switch_msr_special(vmx,
970 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
971 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
976 i = vmx_find_loadstore_msr_slot(&m->guest, msr);
980 m->guest.val[i] = m->guest.val[m->guest.nr];
981 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
984 i = vmx_find_loadstore_msr_slot(&m->host, msr);
989 m->host.val[i] = m->host.val[m->host.nr];
990 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
993 static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
994 unsigned long entry, unsigned long exit,
995 unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
996 u64 guest_val, u64 host_val)
998 vmcs_write64(guest_val_vmcs, guest_val);
999 if (host_val_vmcs != HOST_IA32_EFER)
1000 vmcs_write64(host_val_vmcs, host_val);
1001 vm_entry_controls_setbit(vmx, entry);
1002 vm_exit_controls_setbit(vmx, exit);
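/*
 * Add @msr to the VM-Entry (and, unless @entry_only, VM-Exit) autoload lists,
 * or program the dedicated VMCS fields for MSRs that have them.
 */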
1005 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
1006 u64 guest_val, u64 host_val, bool entry_only)
1009 struct msr_autoload *m = &vmx->msr_autoload;
1013 if (cpu_has_load_ia32_efer()) {
1014 add_atomic_switch_msr_special(vmx,
1015 VM_ENTRY_LOAD_IA32_EFER,
1016 VM_EXIT_LOAD_IA32_EFER,
1019 guest_val, host_val);
1023 case MSR_CORE_PERF_GLOBAL_CTRL:
1024 if (cpu_has_load_perf_global_ctrl()) {
1025 add_atomic_switch_msr_special(vmx,
1026 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1027 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
1028 GUEST_IA32_PERF_GLOBAL_CTRL,
1029 HOST_IA32_PERF_GLOBAL_CTRL,
1030 guest_val, host_val);
1034 case MSR_IA32_PEBS_ENABLE:
1035 /* PEBS needs a quiescent period after being disabled (to write
1036 * a record). Disabling PEBS through VMX MSR swapping doesn't
1037 * provide that period, so a CPU could write host's record into
1040 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
1043 i = vmx_find_loadstore_msr_slot(&m->guest, msr);
1045 j = vmx_find_loadstore_msr_slot(&m->host, msr);
1047 if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
1048 (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
1049 printk_once(KERN_WARNING "Not enough msr switch entries. "
1050 "Can't add msr %x\n", msr);
1055 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
1057 m->guest.val[i].index = msr;
1058 m->guest.val[i].value = guest_val;
1065 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
1067 m->host.val[j].index = msr;
1068 m->host.val[j].value = host_val;
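/*
 * Decide how guest EFER is switched: atomically at VM-Entry/VM-Exit when the
 * CPU supports "load IA32_EFER" or the NX bit differs from the host's under
 * EPT, otherwise via the lazily-restored user-return MSR mechanism. Returns
 * true iff EFER should be loaded into hardware as a user-return MSR.
 */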
1071 static bool update_transition_efer(struct vcpu_vmx *vmx)
1073 u64 guest_efer = vmx->vcpu.arch.efer;
1074 u64 ignore_bits = 0;
1077 /* Shadow paging assumes NX to be available. */
1079 guest_efer |= EFER_NX;
1082 * LMA and LME handled by hardware; SCE meaningless outside long mode.
1084 ignore_bits |= EFER_SCE;
1085 #ifdef CONFIG_X86_64
1086 ignore_bits |= EFER_LMA | EFER_LME;
1087 /* SCE is meaningful only in long mode on Intel */
1088 if (guest_efer & EFER_LMA)
1089 ignore_bits &= ~(u64)EFER_SCE;
1093 * On EPT, we can't emulate NX, so we must switch EFER atomically.
1094 * On CPUs that support "load IA32_EFER", always switch EFER
1095 * atomically, since it's faster than switching it manually.
1097 if (cpu_has_load_ia32_efer() ||
1098 (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
1099 if (!(guest_efer & EFER_LMA))
1100 guest_efer &= ~EFER_LME;
1101 if (guest_efer != host_efer)
1102 add_atomic_switch_msr(vmx, MSR_EFER,
1103 guest_efer, host_efer, false);
1105 clear_atomic_switch_msr(vmx, MSR_EFER);
1109 i = kvm_find_user_return_msr(MSR_EFER);
1113 clear_atomic_switch_msr(vmx, MSR_EFER);
1115 guest_efer &= ~ignore_bits;
1116 guest_efer |= host_efer & ignore_bits;
1118 vmx->guest_uret_msrs[i].data = guest_efer;
1119 vmx->guest_uret_msrs[i].mask = ~ignore_bits;
1124 #ifdef CONFIG_X86_32
1126 * On 32-bit kernels, VM exits still load the FS and GS bases from the
1127 * VMCS rather than the segment table. KVM uses this helper to figure
1128 * out the current bases to poke them into the VMCS before entry.
1130 static unsigned long segment_base(u16 selector)
1132 struct desc_struct *table;
1135 if (!(selector & ~SEGMENT_RPL_MASK))
1138 table = get_current_gdt_ro();
1140 if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
1141 u16 ldt_selector = kvm_read_ldt();
1143 if (!(ldt_selector & ~SEGMENT_RPL_MASK))
1146 table = (struct desc_struct *)segment_base(ldt_selector);
1148 v = get_desc_base(&table[selector >> 3]);
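/*
 * Guest PT MSRs may only be written while KVM virtualizes PT in host-guest
 * mode and the guest's trace is disabled (RTIT_CTL.TraceEn clear).
 */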
1153 static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
1155 return vmx_pt_mode_is_host_guest() &&
1156 !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
1159 static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
1161 /* The base must be 128-byte aligned and a legal physical address. */
1162 return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
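/*
 * pt_load_msr()/pt_save_msr() below copy a PT context (status, output
 * base/mask, CR3 match and address-range MSRs) to/from the hardware MSRs.
 */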
1165 static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
1169 wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
1170 wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1171 wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1172 wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1173 for (i = 0; i < addr_range; i++) {
1174 wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1175 wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1179 static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
1183 rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
1184 rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1185 rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1186 rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1187 for (i = 0; i < addr_range; i++) {
1188 rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1189 rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
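/*
 * Swap the host's and guest's PT contexts around VM-Entry/VM-Exit when PT is
 * virtualized in host-guest mode; RTIT_CTL itself is switched by the VMCS.
 */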
1193 static void pt_guest_enter(struct vcpu_vmx *vmx)
1195 if (vmx_pt_mode_is_system())
1199 * GUEST_IA32_RTIT_CTL is already set in the VMCS.
1200 * Save host state before VM entry.
1202 rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1203 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1204 wrmsrl(MSR_IA32_RTIT_CTL, 0);
1205 pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
1206 pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
1210 static void pt_guest_exit(struct vcpu_vmx *vmx)
1212 if (vmx_pt_mode_is_system())
1215 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1216 pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
1217 pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
1221 * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
1222 * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary.
1224 if (vmx->pt_desc.host.ctl)
1225 wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1228 void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
1229 unsigned long fs_base, unsigned long gs_base)
1231 if (unlikely(fs_sel != host->fs_sel)) {
1233 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
1235 vmcs_write16(HOST_FS_SELECTOR, 0);
1236 host->fs_sel = fs_sel;
1238 if (unlikely(gs_sel != host->gs_sel)) {
1240 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
1242 vmcs_write16(HOST_GS_SELECTOR, 0);
1243 host->gs_sel = gs_sel;
1245 if (unlikely(fs_base != host->fs_base)) {
1246 vmcs_writel(HOST_FS_BASE, fs_base);
1247 host->fs_base = fs_base;
1249 if (unlikely(gs_base != host->gs_base)) {
1250 vmcs_writel(HOST_GS_BASE, gs_base);
1251 host->gs_base = gs_base;
1255 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1257 struct vcpu_vmx *vmx = to_vmx(vcpu);
1258 struct vmcs_host_state *host_state;
1259 #ifdef CONFIG_X86_64
1260 int cpu = raw_smp_processor_id();
1262 unsigned long fs_base, gs_base;
1266 vmx->req_immediate_exit = false;
1269 * Note that guest MSRs to be saved/restored can also be changed
1270 * when guest state is loaded. This happens when guest transitions
1271 * to/from long-mode by setting MSR_EFER.LMA.
1273 if (!vmx->guest_uret_msrs_loaded) {
1274 vmx->guest_uret_msrs_loaded = true;
1275 for (i = 0; i < kvm_nr_uret_msrs; ++i) {
1276 if (!vmx->guest_uret_msrs[i].load_into_hardware)
1279 kvm_set_user_return_msr(i,
1280 vmx->guest_uret_msrs[i].data,
1281 vmx->guest_uret_msrs[i].mask);
1285 if (vmx->nested.need_vmcs12_to_shadow_sync)
1286 nested_sync_vmcs12_to_shadow(vcpu);
1288 if (vmx->guest_state_loaded)
1291 host_state = &vmx->loaded_vmcs->host_state;
1294 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
1295 * allow segment selectors with cpl > 0 or ti == 1.
1297 host_state->ldt_sel = kvm_read_ldt();
1299 #ifdef CONFIG_X86_64
1300 savesegment(ds, host_state->ds_sel);
1301 savesegment(es, host_state->es_sel);
1303 gs_base = cpu_kernelmode_gs_base(cpu);
1304 if (likely(is_64bit_mm(current->mm))) {
1305 current_save_fsgs();
1306 fs_sel = current->thread.fsindex;
1307 gs_sel = current->thread.gsindex;
1308 fs_base = current->thread.fsbase;
1309 vmx->msr_host_kernel_gs_base = current->thread.gsbase;
1311 savesegment(fs, fs_sel);
1312 savesegment(gs, gs_sel);
1313 fs_base = read_msr(MSR_FS_BASE);
1314 vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
1317 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1319 savesegment(fs, fs_sel);
1320 savesegment(gs, gs_sel);
1321 fs_base = segment_base(fs_sel);
1322 gs_base = segment_base(gs_sel);
1325 vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
1326 vmx->guest_state_loaded = true;
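/*
 * Undo vmx_prepare_switch_to_guest(): restore the host's segment selectors,
 * bases and KERNEL_GS_BASE that were deferred while guest state was loaded.
 */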
1329 static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
1331 struct vmcs_host_state *host_state;
1333 if (!vmx->guest_state_loaded)
1336 host_state = &vmx->loaded_vmcs->host_state;
1338 ++vmx->vcpu.stat.host_state_reload;
1340 #ifdef CONFIG_X86_64
1341 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1343 if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
1344 kvm_load_ldt(host_state->ldt_sel);
1345 #ifdef CONFIG_X86_64
1346 load_gs_index(host_state->gs_sel);
1348 loadsegment(gs, host_state->gs_sel);
1351 if (host_state->fs_sel & 7)
1352 loadsegment(fs, host_state->fs_sel);
1353 #ifdef CONFIG_X86_64
1354 if (unlikely(host_state->ds_sel | host_state->es_sel)) {
1355 loadsegment(ds, host_state->ds_sel);
1356 loadsegment(es, host_state->es_sel);
1359 invalidate_tss_limit();
1360 #ifdef CONFIG_X86_64
1361 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
1363 load_fixmap_gdt(raw_smp_processor_id());
1364 vmx->guest_state_loaded = false;
1365 vmx->guest_uret_msrs_loaded = false;
1368 #ifdef CONFIG_X86_64
1369 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
1372 if (vmx->guest_state_loaded)
1373 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1375 return vmx->msr_guest_kernel_gs_base;
1378 static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
1381 if (vmx->guest_state_loaded)
1382 wrmsrl(MSR_KERNEL_GS_BASE, data);
1384 vmx->msr_guest_kernel_gs_base = data;
1388 void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
1389 struct loaded_vmcs *buddy)
1391 struct vcpu_vmx *vmx = to_vmx(vcpu);
1392 bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
1395 if (!already_loaded) {
1396 loaded_vmcs_clear(vmx->loaded_vmcs);
1397 local_irq_disable();
1400 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
1401 * this cpu's percpu list, otherwise it may not yet be deleted
1402 * from its previous cpu's percpu list. Pairs with the
1403 * smp_wmb() in __loaded_vmcs_clear().
1407 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
1408 &per_cpu(loaded_vmcss_on_cpu, cpu));
1412 prev = per_cpu(current_vmcs, cpu);
1413 if (prev != vmx->loaded_vmcs->vmcs) {
1414 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
1415 vmcs_load(vmx->loaded_vmcs->vmcs);
1418 * No indirect branch prediction barrier needed when switching
1419 * the active VMCS within a vCPU, unless IBRS is advertised to
1420 * the vCPU. To minimize the number of IBPBs executed, KVM
1421 * performs IBPB on nested VM-Exit (a single nested transition
1422 * may switch the active VMCS multiple times).
1424 if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
1425 indirect_branch_prediction_barrier();
1428 if (!already_loaded) {
1429 void *gdt = get_current_gdt_ro();
1432 * Flush all EPTP/VPID contexts, the new pCPU may have stale
1433 * TLB entries from its previous association with the vCPU.
1435 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1438 * Linux uses per-cpu TSS and GDT, so set these when switching
1439 * processors. See 22.2.4.
1441 vmcs_writel(HOST_TR_BASE,
1442 (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
1443 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
1445 if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
1447 vmcs_writel(HOST_IA32_SYSENTER_ESP,
1448 (unsigned long)(cpu_entry_stack(cpu) + 1));
1451 vmx->loaded_vmcs->cpu = cpu;
1456 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
1457 * vcpu mutex is already taken.
1459 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1461 struct vcpu_vmx *vmx = to_vmx(vcpu);
1463 vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
1465 vmx_vcpu_pi_load(vcpu, cpu);
1467 vmx->host_debugctlmsr = get_debugctlmsr();
1470 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
1472 vmx_vcpu_pi_put(vcpu);
1474 vmx_prepare_switch_to_host(to_vmx(vcpu));
1477 bool vmx_emulation_required(struct kvm_vcpu *vcpu)
1479 return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
1482 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
1484 struct vcpu_vmx *vmx = to_vmx(vcpu);
1485 unsigned long rflags, save_rflags;
1487 if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
1488 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
1489 rflags = vmcs_readl(GUEST_RFLAGS);
1490 if (vmx->rmode.vm86_active) {
1491 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1492 save_rflags = vmx->rmode.save_rflags;
1493 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1495 vmx->rflags = rflags;
1500 void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1502 struct vcpu_vmx *vmx = to_vmx(vcpu);
1503 unsigned long old_rflags;
1505 if (is_unrestricted_guest(vcpu)) {
1506 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
1507 vmx->rflags = rflags;
1508 vmcs_writel(GUEST_RFLAGS, rflags);
1512 old_rflags = vmx_get_rflags(vcpu);
1513 vmx->rflags = rflags;
1514 if (vmx->rmode.vm86_active) {
1515 vmx->rmode.save_rflags = rflags;
1516 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1518 vmcs_writel(GUEST_RFLAGS, rflags);
1520 if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
1521 vmx->emulation_required = vmx_emulation_required(vcpu);
1524 static bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
1526 return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
1529 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
1531 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1534 if (interruptibility & GUEST_INTR_STATE_STI)
1535 ret |= KVM_X86_SHADOW_INT_STI;
1536 if (interruptibility & GUEST_INTR_STATE_MOV_SS)
1537 ret |= KVM_X86_SHADOW_INT_MOV_SS;
1542 void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
1544 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1545 u32 interruptibility = interruptibility_old;
1547 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
1549 if (mask & KVM_X86_SHADOW_INT_MOV_SS)
1550 interruptibility |= GUEST_INTR_STATE_MOV_SS;
1551 else if (mask & KVM_X86_SHADOW_INT_STI)
1552 interruptibility |= GUEST_INTR_STATE_STI;
1554 if ((interruptibility != interruptibility_old))
1555 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
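/*
 * Validate a guest WRMSR to IA32_RTIT_CTL against the PT capabilities exposed
 * to the guest; values that would #GP on hardware are rejected.
 */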
1558 static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
1560 struct vcpu_vmx *vmx = to_vmx(vcpu);
1561 unsigned long value;
1564 * Any MSR write that attempts to change bits marked reserved will
1567 if (data & vmx->pt_desc.ctl_bitmask)
1571 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
1572 * result in a #GP unless the same write also clears TraceEn.
1574 if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
1575 ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
1579 * A WRMSR to IA32_RTIT_CTL that sets TraceEn but clears ToPA
1580 * and FabricEn causes a #GP if
1581 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0.
1583 if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
1584 !(data & RTIT_CTL_FABRIC_EN) &&
1585 !intel_pt_validate_cap(vmx->pt_desc.caps,
1586 PT_CAP_single_range_output))
1590 * MTCFreq, CycThresh and PSBFreq encoding checks: any MSR write that
1591 * utilizes encodings marked reserved will cause a #GP fault.
1593 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
1594 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
1595 !test_bit((data & RTIT_CTL_MTC_RANGE) >>
1596 RTIT_CTL_MTC_RANGE_OFFSET, &value))
1598 value = intel_pt_validate_cap(vmx->pt_desc.caps,
1599 PT_CAP_cycle_thresholds);
1600 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1601 !test_bit((data & RTIT_CTL_CYC_THRESH) >>
1602 RTIT_CTL_CYC_THRESH_OFFSET, &value))
1604 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
1605 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1606 !test_bit((data & RTIT_CTL_PSB_FREQ) >>
1607 RTIT_CTL_PSB_FREQ_OFFSET, &value))
1611 * If ADDRx_CFG is reserved or the encoding is >2, the write will
1612 * cause a #GP fault.
1614 value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
1615 if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2))
1617 value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
1618 if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2))
1620 value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
1621 if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2))
1623 value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
1624 if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2))
1630 static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
1631 void *insn, int insn_len)
1634 * Emulation of instructions in SGX enclaves is impossible as RIP does
1635 * not point at the failing instruction, and even if it did, the code
1636 * stream is inaccessible. Inject #UD instead of exiting to userspace
1637 * so that guest userspace can't DoS the guest simply by triggering
1638 * emulation (enclaves are CPL3 only).
1640 if (to_vmx(vcpu)->exit_reason.enclave_mode) {
1641 kvm_queue_exception(vcpu, UD_VECTOR);
1647 static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
1649 union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
1650 unsigned long rip, orig_rip;
1654 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
1655 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
1656 * set when EPT misconfig occurs. In practice, real hardware updates
1657 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
1658 * (namely Hyper-V) don't set it due to it being undefined behavior,
1659 * i.e. we end up advancing IP with some random value.
1661 if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
1662 exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
1663 instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1666 * Emulating an enclave's instructions isn't supported as KVM
1667 * cannot access the enclave's memory or its true RIP, e.g. the
1668 * vmcs.GUEST_RIP points at the exit point of the enclave, not
1669 * the RIP that actually triggered the VM-Exit. But, because
1670 * most instructions that cause VM-Exit will #UD in an enclave,
1671 * most instruction-based VM-Exits simply do not occur.
1673 * There are a few exceptions, notably the debug instructions
1674 * INT1ICEBRK and INT3, as they are allowed in debug enclaves
1675 * and generate #DB/#BP as expected, which KVM might intercept.
1676 * But again, the CPU does the dirty work and saves an instr
1677 * length of zero so VMMs don't shoot themselves in the foot.
1678 * WARN if KVM tries to skip a non-zero length instruction on
1679 * a VM-Exit from an enclave.
1684 WARN_ONCE(exit_reason.enclave_mode,
1685 "skipping instruction after SGX enclave VM-Exit");
1687 orig_rip = kvm_rip_read(vcpu);
1688 rip = orig_rip + instr_len;
1689 #ifdef CONFIG_X86_64
1691 * We need to mask out the high 32 bits of RIP if not in 64-bit
1692 * mode, but just finding out that we are in 64-bit mode is
1693 * quite expensive. Only do it if there was a carry.
1695 if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
1698 kvm_rip_write(vcpu, rip);
1700 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
1705 /* skipping an emulated instruction also counts */
1706 vmx_set_interrupt_shadow(vcpu, 0);
1712 * Recognizes a pending MTF VM-exit and records the nested state for later
1715 static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
1717 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1718 struct vcpu_vmx *vmx = to_vmx(vcpu);
1720 if (!is_guest_mode(vcpu))
1724 * Per the SDM, MTF takes priority over debug-trap exceptions besides
1725 * TSS T-bit traps and ICEBP (INT1). KVM doesn't emulate T-bit traps
1726 * or ICEBP (in the emulator proper), and skipping of ICEBP after an
1727 * intercepted #DB deliberately avoids single-step #DB and MTF updates
1728 * as ICEBP is higher priority than both. As instruction emulation is
1729 * completed at this point (i.e. KVM is at the instruction boundary),
1730 * any #DB exception pending delivery must be a debug-trap of lower
1731 * priority than MTF. Record the pending MTF state to be delivered in
1732 * vmx_check_nested_events().
1734 if (nested_cpu_has_mtf(vmcs12) &&
1735 (!vcpu->arch.exception.pending ||
1736 vcpu->arch.exception.vector == DB_VECTOR) &&
1737 (!vcpu->arch.exception_vmexit.pending ||
1738 vcpu->arch.exception_vmexit.vector == DB_VECTOR)) {
1739 vmx->nested.mtf_pending = true;
1740 kvm_make_request(KVM_REQ_EVENT, vcpu);
1742 vmx->nested.mtf_pending = false;
1746 static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
1748 vmx_update_emulated_instruction(vcpu);
1749 return skip_emulated_instruction(vcpu);
1752 static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1755 * Ensure that we clear the HLT state in the VMCS. We don't need to
1756 * explicitly skip the instruction because if the HLT state is set,
1757 * then the instruction is already executing and RIP has already been
1760 if (kvm_hlt_in_guest(vcpu->kvm) &&
1761 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1762 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1765 static void vmx_inject_exception(struct kvm_vcpu *vcpu)
1767 struct kvm_queued_exception *ex = &vcpu->arch.exception;
1768 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
1769 struct vcpu_vmx *vmx = to_vmx(vcpu);
1771 kvm_deliver_exception_payload(vcpu, ex);
1773 if (ex->has_error_code) {
1775 * Despite the error code being architecturally defined as 32
1776 * bits, and the VMCS field being 32 bits, Intel CPUs and thus
1777 * VMX don't actually support setting bits 31:16. Hardware
1778 * will (should) never provide a bogus error code, but AMD CPUs
1779 * do generate error codes with bits 31:16 set, and so KVM's
1780 * ABI lets userspace shove in arbitrary 32-bit values. Drop
1781 * the upper bits to avoid VM-Fail, losing information that
1782 * doesn't really exist is preferable to killing the VM.
1784 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code);
1785 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
1788 if (vmx->rmode.vm86_active) {
1790 if (kvm_exception_is_soft(ex->vector))
1791 inc_eip = vcpu->arch.event_exit_inst_len;
1792 kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip);
1796 WARN_ON_ONCE(vmx->emulation_required);
1798 if (kvm_exception_is_soft(ex->vector)) {
1799 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
1800 vmx->vcpu.arch.event_exit_inst_len);
1801 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
1803 intr_info |= INTR_TYPE_HARD_EXCEPTION;
1805 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1807 vmx_clear_hlt(vcpu);
1810 static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
1811 bool load_into_hardware)
1813 struct vmx_uret_msr *uret_msr;
1815 uret_msr = vmx_find_uret_msr(vmx, msr);
1819 uret_msr->load_into_hardware = load_into_hardware;
1823 * Configure user return MSRs to automatically save, load, and restore MSRs
1824 * that need to be shoved into hardware when running the guest. Note, omitting
1825 * an MSR here does _NOT_ mean it's not emulated, only that it will not be
1826 * loaded into hardware when running the guest.
1828 static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
1830 #ifdef CONFIG_X86_64
1831 bool load_syscall_msrs;
1834 * The SYSCALL MSRs are only needed on long mode guests, and only
1835 * when EFER.SCE is set.
1837 load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
1838 (vmx->vcpu.arch.efer & EFER_SCE);
1840 vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
1841 vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
1842 vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
1844 vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
1846 vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
1847 guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
1848 guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));
1851 * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
1852 * kernel and old userspace. If those guests run on a tsx=off host, do
1853 * allow guests to use TSX_CTRL, but don't change the value in hardware
1854 * so that TSX remains always disabled.
1856 vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
1859 * The set of MSRs to load may have changed, reload MSRs before the
1862 vmx->guest_uret_msrs_loaded = false;
1865 u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
1867 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1869 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING))
1870 return vmcs12->tsc_offset;
1875 u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
1877 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1879 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) &&
1880 nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
1881 return vmcs12->tsc_multiplier;
1883 return kvm_caps.default_tsc_scaling_ratio;
1886 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1888 vmcs_write64(TSC_OFFSET, offset);
1891 static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
1893 vmcs_write64(TSC_MULTIPLIER, multiplier);
1897 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
1898 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
1899 * all guests if the "nested" module option is off, and can also be disabled
1900 * for a single guest by disabling its VMX cpuid bit.
1902 bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
1904 return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
1908 * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of
1909 * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain
1910 * backwards compatibility even though KVM doesn't support emulating SMX. And
1911 * because userspace set "VMX in SMX", the guest must also be allowed to set it,
1912 * e.g. if the MSR is left unlocked and the guest does a RMW operation.
1914 #define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \
1915 FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \
1916 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \
1917 FEAT_CTL_SGX_LC_ENABLED | \
1918 FEAT_CTL_SGX_ENABLED | \
1919 FEAT_CTL_LMCE_ENABLED)
1921 static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
1922 struct msr_data *msr)
1924 uint64_t valid_bits;
1927 * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are
1928 * exposed to the guest.
1930 WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits &
1931 ~KVM_SUPPORTED_FEATURE_CONTROL);
1933 if (!msr->host_initiated &&
1934 (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED))
1937 if (msr->host_initiated)
1938 valid_bits = KVM_SUPPORTED_FEATURE_CONTROL;
1940 valid_bits = vmx->msr_ia32_feature_control_valid_bits;
1942 return !(msr->data & ~valid_bits);
1945 static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
1947 switch (msr->index) {
1948 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
1951 return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
1953 return KVM_MSR_RET_INVALID;
1958 * Reads an msr value (of 'msr_info->index') into 'msr_info->data'.
1959 * Returns 0 on success, non-0 otherwise.
1960 * Assumes vcpu_load() was already called.
1962 static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1964 struct vcpu_vmx *vmx = to_vmx(vcpu);
1965 struct vmx_uret_msr *msr;
1968 switch (msr_info->index) {
1969 #ifdef CONFIG_X86_64
1971 msr_info->data = vmcs_readl(GUEST_FS_BASE);
1974 msr_info->data = vmcs_readl(GUEST_GS_BASE);
1976 case MSR_KERNEL_GS_BASE:
1977 msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
1981 return kvm_get_msr_common(vcpu, msr_info);
1982 case MSR_IA32_TSX_CTRL:
1983 if (!msr_info->host_initiated &&
1984 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
1987 case MSR_IA32_UMWAIT_CONTROL:
1988 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
1991 msr_info->data = vmx->msr_ia32_umwait_control;
1993 case MSR_IA32_SPEC_CTRL:
1994 if (!msr_info->host_initiated &&
1995 !guest_has_spec_ctrl_msr(vcpu))
1998 msr_info->data = to_vmx(vcpu)->spec_ctrl;
2000 case MSR_IA32_SYSENTER_CS:
2001 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
2003 case MSR_IA32_SYSENTER_EIP:
2004 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
2006 case MSR_IA32_SYSENTER_ESP:
2007 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
2009 case MSR_IA32_BNDCFGS:
2010 if (!kvm_mpx_supported() ||
2011 (!msr_info->host_initiated &&
2012 !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
2014 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
2016 case MSR_IA32_MCG_EXT_CTL:
2017 if (!msr_info->host_initiated &&
2018 !(vmx->msr_ia32_feature_control &
2019 FEAT_CTL_LMCE_ENABLED))
2021 msr_info->data = vcpu->arch.mcg_ext_ctl;
2023 case MSR_IA32_FEAT_CTL:
2024 msr_info->data = vmx->msr_ia32_feature_control;
2026 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
2027 if (!msr_info->host_initiated &&
2028 !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
2030 msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
2031 [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
2033 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2034 if (!nested_vmx_allowed(vcpu))
2036 if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
2040 * Enlightened VMCS v1 doesn't have certain VMCS fields but
2041 * instead of just ignoring the features, different Hyper-V
2042 * versions are either trying to use them and fail or do some
2043 * sanity checking and refuse to boot. Filter all unsupported
2046 if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu))
2047 nested_evmcs_filter_control_msr(vcpu, msr_info->index,
2050 case MSR_IA32_RTIT_CTL:
2051 if (!vmx_pt_mode_is_host_guest())
2053 msr_info->data = vmx->pt_desc.guest.ctl;
2055 case MSR_IA32_RTIT_STATUS:
2056 if (!vmx_pt_mode_is_host_guest())
2058 msr_info->data = vmx->pt_desc.guest.status;
2060 case MSR_IA32_RTIT_CR3_MATCH:
2061 if (!vmx_pt_mode_is_host_guest() ||
2062 !intel_pt_validate_cap(vmx->pt_desc.caps,
2063 PT_CAP_cr3_filtering))
2065 msr_info->data = vmx->pt_desc.guest.cr3_match;
2067 case MSR_IA32_RTIT_OUTPUT_BASE:
2068 if (!vmx_pt_mode_is_host_guest() ||
2069 (!intel_pt_validate_cap(vmx->pt_desc.caps,
2070 PT_CAP_topa_output) &&
2071 !intel_pt_validate_cap(vmx->pt_desc.caps,
2072 PT_CAP_single_range_output)))
2074 msr_info->data = vmx->pt_desc.guest.output_base;
2076 case MSR_IA32_RTIT_OUTPUT_MASK:
2077 if (!vmx_pt_mode_is_host_guest() ||
2078 (!intel_pt_validate_cap(vmx->pt_desc.caps,
2079 PT_CAP_topa_output) &&
2080 !intel_pt_validate_cap(vmx->pt_desc.caps,
2081 PT_CAP_single_range_output)))
2083 msr_info->data = vmx->pt_desc.guest.output_mask;
2085 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2086 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2087 if (!vmx_pt_mode_is_host_guest() ||
2088 (index >= 2 * vmx->pt_desc.num_address_ranges))
2091 msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
2093 msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
2095 case MSR_IA32_DEBUGCTLMSR:
2096 msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
2100 msr = vmx_find_uret_msr(vmx, msr_info->index);
2102 msr_info->data = msr->data;
2105 return kvm_get_msr_common(vcpu, msr_info);
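/*
 * Editorial sketch (not from the original source): userspace reaches this
 * handler through the KVM_GET_MSRS vCPU ioctl. Roughly, assuming vcpu_fd is
 * an open vCPU file descriptor and with error handling omitted:
 *
 *	struct {
 *		struct kvm_msrs hdr;
 *		struct kvm_msr_entry entries[1];
 *	} req = {
 *		.hdr.nmsrs = 1,
 *		.entries[0].index = MSR_IA32_SYSENTER_CS,
 *	};
 *
 *	if (ioctl(vcpu_fd, KVM_GET_MSRS, &req) == 1)
 *		printf("SYSENTER_CS = 0x%llx\n",
 *		       (unsigned long long)req.entries[0].data);
 *
 * Each requested index is looked up with host_initiated set, which is why
 * several cases above skip the guest CPUID checks for host-initiated reads.
 */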
2111 static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
2114 #ifdef CONFIG_X86_64
2115 if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
2118 return (unsigned long)data;
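/*
 * Illustrative example of the truncation above (editorial note): if the guest
 * CPUID does not advertise long mode, a 64-bit SYSENTER_EIP/ESP value such as
 * 0xffffffff00401000 written by L1 is reduced to its low 32 bits, 0x00401000,
 * matching what a 32-bit-only CPU would architecturally hold in those fields.
 */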
2121 static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
2125 if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
2126 (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
2127 debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
2129 if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) &&
2130 (host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
2131 debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
2137 * Writes msr value into the appropriate "register".
2138 * Returns 0 on success, non-0 otherwise.
2139 * Assumes vcpu_load() was already called.
2141 static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2143 struct vcpu_vmx *vmx = to_vmx(vcpu);
2144 struct vmx_uret_msr *msr;
2146 u32 msr_index = msr_info->index;
2147 u64 data = msr_info->data;
2150 switch (msr_index) {
2152 ret = kvm_set_msr_common(vcpu, msr_info);
2154 #ifdef CONFIG_X86_64
2156 vmx_segment_cache_clear(vmx);
2157 vmcs_writel(GUEST_FS_BASE, data);
2160 vmx_segment_cache_clear(vmx);
2161 vmcs_writel(GUEST_GS_BASE, data);
2163 case MSR_KERNEL_GS_BASE:
2164 vmx_write_guest_kernel_gs_base(vmx, data);
2167 ret = kvm_set_msr_common(vcpu, msr_info);
2169 * Always intercepting WRMSR could incur non-negligible
2170 * overhead given that xfd may be changed frequently on
2171 * guest context switches. Disable write interception
2172 * upon the first write with a non-zero value (indicating
2173 * potential use of dynamic xfeatures). Also update the
2174 * exception bitmap to trap #NM for proper virtualization of guest xfd_err.
2178 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
2180 vcpu->arch.xfd_no_write_intercept = true;
2181 vmx_update_exception_bitmap(vcpu);
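/*
 * Editorial note on the XFD flow above (illustrative, not from the original
 * source): the first time the guest writes a non-zero XFD value, e.g.
 * (1 << 18) to disarm the AMX tile-data state component, the WRMSR intercept
 * is dropped so subsequent XFD updates on guest context switches run at
 * hardware speed, while #NM is trapped instead so KVM can virtualize the
 * guest's view of XFD_ERR.
 */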
2185 case MSR_IA32_SYSENTER_CS:
2186 if (is_guest_mode(vcpu))
2187 get_vmcs12(vcpu)->guest_sysenter_cs = data;
2188 vmcs_write32(GUEST_SYSENTER_CS, data);
2190 case MSR_IA32_SYSENTER_EIP:
2191 if (is_guest_mode(vcpu)) {
2192 data = nested_vmx_truncate_sysenter_addr(vcpu, data);
2193 get_vmcs12(vcpu)->guest_sysenter_eip = data;
2195 vmcs_writel(GUEST_SYSENTER_EIP, data);
2197 case MSR_IA32_SYSENTER_ESP:
2198 if (is_guest_mode(vcpu)) {
2199 data = nested_vmx_truncate_sysenter_addr(vcpu, data);
2200 get_vmcs12(vcpu)->guest_sysenter_esp = data;
2202 vmcs_writel(GUEST_SYSENTER_ESP, data);
2204 case MSR_IA32_DEBUGCTLMSR: {
2207 invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
2208 if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
2209 if (report_ignored_msrs)
2210 vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n",
2212 data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
2213 invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
2219 if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
2220 VM_EXIT_SAVE_DEBUG_CONTROLS)
2221 get_vmcs12(vcpu)->guest_ia32_debugctl = data;
2223 vmcs_write64(GUEST_IA32_DEBUGCTL, data);
2224 if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
2225 (data & DEBUGCTLMSR_LBR))
2226 intel_pmu_create_guest_lbr_event(vcpu);
2229 case MSR_IA32_BNDCFGS:
2230 if (!kvm_mpx_supported() ||
2231 (!msr_info->host_initiated &&
2232 !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
2234 if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
2235 (data & MSR_IA32_BNDCFGS_RSVD))
2238 if (is_guest_mode(vcpu) &&
2239 ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) ||
2240 (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS)))
2241 get_vmcs12(vcpu)->guest_bndcfgs = data;
2243 vmcs_write64(GUEST_BNDCFGS, data);
2245 case MSR_IA32_UMWAIT_CONTROL:
2246 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
2249 /* Reserved bit 1 and the upper bits [63:32] must be zero. */
2250 if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
2253 vmx->msr_ia32_umwait_control = data;
2255 case MSR_IA32_SPEC_CTRL:
2256 if (!msr_info->host_initiated &&
2257 !guest_has_spec_ctrl_msr(vcpu))
2260 if (kvm_spec_ctrl_test_value(data))
2263 vmx->spec_ctrl = data;
2269 * When it's written (to non-zero) for the first time, pass it through.
2273 * The handling of the MSR bitmap for L2 guests is done in
2274 * nested_vmx_prepare_msr_bitmap. We should not touch the
2275 * vmcs02.msr_bitmap here since it gets completely overwritten
2276 * in the merging. We update the vmcs01 here for L1 as well
2277 * since it will end up touching the MSR anyway now.
2279 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
2283 case MSR_IA32_TSX_CTRL:
2284 if (!msr_info->host_initiated &&
2285 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
2287 if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
2290 case MSR_IA32_PRED_CMD:
2291 if (!msr_info->host_initiated &&
2292 !guest_has_pred_cmd_msr(vcpu))
2295 if (data & ~PRED_CMD_IBPB)
2297 if (!boot_cpu_has(X86_FEATURE_IBPB))
2302 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
2306 * When it's written (to non-zero) for the first time, pass it through.
2310 * The handling of the MSR bitmap for L2 guests is done in
2311 * nested_vmx_prepare_msr_bitmap. We should not touch the
2312 * vmcs02.msr_bitmap here since it gets completely overwritten in the merging.
2315 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W);
2317 case MSR_IA32_CR_PAT:
2318 if (!kvm_pat_valid(data))
2321 if (is_guest_mode(vcpu) &&
2322 get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
2323 get_vmcs12(vcpu)->guest_ia32_pat = data;
2325 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2326 vmcs_write64(GUEST_IA32_PAT, data);
2327 vcpu->arch.pat = data;
2330 ret = kvm_set_msr_common(vcpu, msr_info);
2332 case MSR_IA32_MCG_EXT_CTL:
2333 if ((!msr_info->host_initiated &&
2334 !(to_vmx(vcpu)->msr_ia32_feature_control &
2335 FEAT_CTL_LMCE_ENABLED)) ||
2336 (data & ~MCG_EXT_CTL_LMCE_EN))
2338 vcpu->arch.mcg_ext_ctl = data;
2340 case MSR_IA32_FEAT_CTL:
2341 if (!is_vmx_feature_control_msr_valid(vmx, msr_info))
2344 vmx->msr_ia32_feature_control = data;
2345 if (msr_info->host_initiated && data == 0)
2346 vmx_leave_nested(vcpu);
2348 /* SGX may be enabled/disabled by guest's firmware */
2349 vmx_write_encls_bitmap(vcpu, NULL);
2351 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
2353 * On real hardware, the LE hash MSRs are writable before
2354 * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
2355 * at which point SGX related bits in IA32_FEATURE_CONTROL become writable.
2358 * KVM does not emulate SGX activation for simplicity, so
2359 * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
2360 * is unlocked. This is technically not architectural
2361 * behavior, but it's close enough.
2363 if (!msr_info->host_initiated &&
2364 (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
2365 ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
2366 !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
2368 vmx->msr_ia32_sgxlepubkeyhash
2369 [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
2371 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2372 if (!msr_info->host_initiated)
2373 return 1; /* they are read-only */
2374 if (!nested_vmx_allowed(vcpu))
2376 return vmx_set_vmx_msr(vcpu, msr_index, data);
2377 case MSR_IA32_RTIT_CTL:
2378 if (!vmx_pt_mode_is_host_guest() ||
2379 vmx_rtit_ctl_check(vcpu, data) ||
2382 vmcs_write64(GUEST_IA32_RTIT_CTL, data);
2383 vmx->pt_desc.guest.ctl = data;
2384 pt_update_intercept_for_msr(vcpu);
2386 case MSR_IA32_RTIT_STATUS:
2387 if (!pt_can_write_msr(vmx))
2389 if (data & MSR_IA32_RTIT_STATUS_MASK)
2391 vmx->pt_desc.guest.status = data;
2393 case MSR_IA32_RTIT_CR3_MATCH:
2394 if (!pt_can_write_msr(vmx))
2396 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2397 PT_CAP_cr3_filtering))
2399 vmx->pt_desc.guest.cr3_match = data;
2401 case MSR_IA32_RTIT_OUTPUT_BASE:
2402 if (!pt_can_write_msr(vmx))
2404 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2405 PT_CAP_topa_output) &&
2406 !intel_pt_validate_cap(vmx->pt_desc.caps,
2407 PT_CAP_single_range_output))
2409 if (!pt_output_base_valid(vcpu, data))
2411 vmx->pt_desc.guest.output_base = data;
2413 case MSR_IA32_RTIT_OUTPUT_MASK:
2414 if (!pt_can_write_msr(vmx))
2416 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2417 PT_CAP_topa_output) &&
2418 !intel_pt_validate_cap(vmx->pt_desc.caps,
2419 PT_CAP_single_range_output))
2421 vmx->pt_desc.guest.output_mask = data;
2423 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2424 if (!pt_can_write_msr(vmx))
2426 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2427 if (index >= 2 * vmx->pt_desc.num_address_ranges)
2429 if (is_noncanonical_address(data, vcpu))
2432 vmx->pt_desc.guest.addr_b[index / 2] = data;
2434 vmx->pt_desc.guest.addr_a[index / 2] = data;
2436 case MSR_IA32_PERF_CAPABILITIES:
2437 if (data && !vcpu_to_pmu(vcpu)->version)
2439 if (data & PMU_CAP_LBR_FMT) {
2440 if ((data & PMU_CAP_LBR_FMT) !=
2441 (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT))
2443 if (!cpuid_model_is_consistent(vcpu))
2446 if (data & PERF_CAP_PEBS_FORMAT) {
2447 if ((data & PERF_CAP_PEBS_MASK) !=
2448 (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
2450 if (!guest_cpuid_has(vcpu, X86_FEATURE_DS))
2452 if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64))
2454 if (!cpuid_model_is_consistent(vcpu))
2457 ret = kvm_set_msr_common(vcpu, msr_info);
2462 msr = vmx_find_uret_msr(vmx, msr_index);
2464 ret = vmx_set_guest_uret_msr(vmx, msr, data);
2466 ret = kvm_set_msr_common(vcpu, msr_info);
2469 /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
2470 if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
2471 vmx_update_fb_clear_dis(vcpu, vmx);
2476 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
2478 unsigned long guest_owned_bits;
2480 kvm_register_mark_available(vcpu, reg);
2484 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
2487 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
2489 case VCPU_EXREG_PDPTR:
2491 ept_save_pdptrs(vcpu);
2493 case VCPU_EXREG_CR0:
2494 guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
2496 vcpu->arch.cr0 &= ~guest_owned_bits;
2497 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
2499 case VCPU_EXREG_CR3:
2501 * When intercepting CR3 loads, e.g. for shadowing paging, KVM's
2502 * CR3 is loaded into hardware, not the guest's CR3.
2504 if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING))
2505 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
2507 case VCPU_EXREG_CR4:
2508 guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
2510 vcpu->arch.cr4 &= ~guest_owned_bits;
2511 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
2514 KVM_BUG_ON(1, vcpu->kvm);
2520 * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
2521 * directly instead of going through cpu_has(), to ensure KVM is trapping
2522 * ENCLS whenever it's supported in hardware. It does not matter whether
2523 * the host OS supports or has enabled SGX.
2525 static bool cpu_has_sgx(void)
2527 return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
2531 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
2532 * can't be used due to errata where VM Exit may incorrectly clear
2533 * IA32_PERF_GLOBAL_CTRL[34:32]. Work around the errata by using the
2534 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
2536 static bool cpu_has_perf_global_ctrl_bug(void)
2538 if (boot_cpu_data.x86 == 0x6) {
2539 switch (boot_cpu_data.x86_model) {
2540 case INTEL_FAM6_NEHALEM_EP: /* AAK155 */
2541 case INTEL_FAM6_NEHALEM: /* AAP115 */
2542 case INTEL_FAM6_WESTMERE: /* AAT100 */
2543 case INTEL_FAM6_WESTMERE_EP: /* BC86,AAY89,BD102 */
2544 case INTEL_FAM6_NEHALEM_EX: /* BA97 */
2554 static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
2556 u32 vmx_msr_low, vmx_msr_high;
2557 u32 ctl = ctl_min | ctl_opt;
2559 rdmsr(msr, vmx_msr_low, vmx_msr_high);
2561 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
2562 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
2564 /* Ensure minimum (required) set of control bits are supported. */
2572 static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
2576 rdmsrl(msr, allowed);
2578 return ctl_opt & allowed;
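/*
 * Worked example for adjust_vmx_controls() above (illustrative numbers): if
 * the capability MSR reads back low = 0x16 (allowed-0 settings, i.e. bits
 * that must be 1) and high = 0xfffffffe (allowed-1 settings, i.e. bits that
 * may be 1), a requested ctl_min | ctl_opt of 0x8016 becomes
 *
 *	(0x8016 & 0xfffffffe) | 0x16 = 0x8016
 *
 * and the adjustment succeeds because no ctl_min bit was cleared. Had the
 * high word been 0xffff7ffe with ctl_min containing bit 15 (0x8000), the
 * required bit would have been stripped and setup_vmcs_config() would bail.
 */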
2581 static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
2582 struct vmx_capability *vmx_cap)
2584 u32 vmx_msr_low, vmx_msr_high;
2585 u32 _pin_based_exec_control = 0;
2586 u32 _cpu_based_exec_control = 0;
2587 u32 _cpu_based_2nd_exec_control = 0;
2588 u64 _cpu_based_3rd_exec_control = 0;
2589 u32 _vmexit_control = 0;
2590 u32 _vmentry_control = 0;
2595 * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory.
2596 * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always
2597 * intercepts writes to PAT and EFER, i.e. never enables those controls.
2602 } const vmcs_entry_exit_pairs[] = {
2603 { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL },
2604 { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT },
2605 { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER },
2606 { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS },
2607 { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL },
2610 memset(vmcs_conf, 0, sizeof(*vmcs_conf));
2612 if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL,
2613 KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL,
2614 MSR_IA32_VMX_PROCBASED_CTLS,
2615 &_cpu_based_exec_control))
2617 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
2618 if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL,
2619 KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL,
2620 MSR_IA32_VMX_PROCBASED_CTLS2,
2621 &_cpu_based_2nd_exec_control))
2624 #ifndef CONFIG_X86_64
2625 if (!(_cpu_based_2nd_exec_control &
2626 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2627 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
2630 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2631 _cpu_based_2nd_exec_control &= ~(
2632 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2633 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2634 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
2636 rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
2637 &vmx_cap->ept, &vmx_cap->vpid);
2639 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
2641 pr_warn_once("EPT CAP should not exist if not support "
2642 "1-setting enable EPT VM-execution control\n");
2644 if (error_on_inconsistent_vmcs_config)
2649 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
2651 pr_warn_once("VPID CAP should not exist if not support "
2652 "1-setting enable VPID VM-execution control\n");
2654 if (error_on_inconsistent_vmcs_config)
2661 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING;
2663 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
2664 _cpu_based_3rd_exec_control =
2665 adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL,
2666 MSR_IA32_VMX_PROCBASED_CTLS3);
2668 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS,
2669 KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS,
2670 MSR_IA32_VMX_EXIT_CTLS,
2674 if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL,
2675 KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL,
2676 MSR_IA32_VMX_PINBASED_CTLS,
2677 &_pin_based_exec_control))
2680 if (cpu_has_broken_vmx_preemption_timer())
2681 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
2682 if (!(_cpu_based_2nd_exec_control &
2683 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
2684 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
2686 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS,
2687 KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS,
2688 MSR_IA32_VMX_ENTRY_CTLS,
2692 for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) {
2693 u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control;
2694 u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control;
2696 if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl))
2699 pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n",
2700 _vmentry_control & n_ctrl, _vmexit_control & x_ctrl);
2702 if (error_on_inconsistent_vmcs_config)
2705 _vmentry_control &= ~n_ctrl;
2706 _vmexit_control &= ~x_ctrl;
2709 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
2711 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
2712 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
2715 #ifdef CONFIG_X86_64
2716 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
2717 if (vmx_msr_high & (1u<<16))
2721 /* Require Write-Back (WB) memory type for VMCS accesses. */
2722 if (((vmx_msr_high >> 18) & 15) != 6)
2725 rdmsrl(MSR_IA32_VMX_MISC, misc_msr);
2727 vmcs_conf->size = vmx_msr_high & 0x1fff;
2728 vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
2730 vmcs_conf->revision_id = vmx_msr_low;
2732 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
2733 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
2734 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
2735 vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control;
2736 vmcs_conf->vmexit_ctrl = _vmexit_control;
2737 vmcs_conf->vmentry_ctrl = _vmentry_control;
2738 vmcs_conf->misc = misc_msr;
2740 #if IS_ENABLED(CONFIG_HYPERV)
2741 if (enlightened_vmcs)
2742 evmcs_sanitize_exec_ctrls(vmcs_conf);
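/*
 * Editorial note decoding the magic numbers above (paraphrasing the SDM):
 * vmx_msr_low is bits 31:0 of MSR_IA32_VMX_BASIC and holds the VMCS revision
 * identifier; vmx_msr_high is bits 63:32, so (vmx_msr_high & 0x1fff) extracts
 * the VMCS region size from bits 44:32, (1u << 16) tests bit 48 (the 32-bit
 * physical-address limitation, always 0 on 64-bit CPUs), and
 * ((vmx_msr_high >> 18) & 15) pulls the VMCS access memory type from bits
 * 53:50, where 6 means write-back.
 */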
2748 static bool kvm_is_vmx_supported(void)
2750 int cpu = raw_smp_processor_id();
2752 if (!cpu_has_vmx()) {
2753 pr_err("VMX not supported by CPU %d\n", cpu);
2757 if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
2758 !this_cpu_has(X86_FEATURE_VMX)) {
2759 pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu);
2766 static int vmx_check_processor_compat(void)
2768 int cpu = raw_smp_processor_id();
2769 struct vmcs_config vmcs_conf;
2770 struct vmx_capability vmx_cap;
2772 if (!kvm_is_vmx_supported())
2775 if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
2776 pr_err("Failed to setup VMCS config on CPU %d\n", cpu);
2780 nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
2781 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) {
2782 pr_err("Inconsistent VMCS config on CPU %d\n", cpu);
2788 static int kvm_cpu_vmxon(u64 vmxon_pointer)
2792 cr4_set_bits(X86_CR4_VMXE);
2794 asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
2795 _ASM_EXTABLE(1b, %l[fault])
2796 : : [vmxon_pointer] "m"(vmxon_pointer)
2801 WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
2802 rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
2803 cr4_clear_bits(X86_CR4_VMXE);
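/*
 * Editorial note: VMXON requires CR4.VMXE to already be set and takes the
 * physical address of a 4 KiB-aligned VMXON region whose first dword holds
 * the VMCS revision identifier from MSR_IA32_VMX_BASIC (see alloc_kvm_area()
 * below). A #UD or #GP raised by VMXON is caught via the exception table
 * entry and routed to the fault label, which warns, clears CR4.VMXE again
 * and reports the failure instead of oopsing.
 */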
2808 static int vmx_hardware_enable(void)
2810 int cpu = raw_smp_processor_id();
2811 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
2814 if (cr4_read_shadow() & X86_CR4_VMXE)
2818 * This can happen if we hot-added a CPU but failed to allocate
2819 * VP assist page for it.
2821 if (static_branch_unlikely(&enable_evmcs) &&
2822 !hv_get_vp_assist_page(cpu))
2825 intel_pt_handle_vmx(1);
2827 r = kvm_cpu_vmxon(phys_addr);
2829 intel_pt_handle_vmx(0);
2839 static void vmclear_local_loaded_vmcss(void)
2841 int cpu = raw_smp_processor_id();
2842 struct loaded_vmcs *v, *n;
2844 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
2845 loaded_vmcss_on_cpu_link)
2846 __loaded_vmcs_clear(v);
2849 static void vmx_hardware_disable(void)
2851 vmclear_local_loaded_vmcss();
2854 kvm_spurious_fault();
2858 intel_pt_handle_vmx(0);
2861 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
2863 int node = cpu_to_node(cpu);
2867 pages = __alloc_pages_node(node, flags, 0);
2870 vmcs = page_address(pages);
2871 memset(vmcs, 0, vmcs_config.size);
2873 /* KVM supports Enlightened VMCS v1 only */
2874 if (static_branch_unlikely(&enable_evmcs))
2875 vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
2877 vmcs->hdr.revision_id = vmcs_config.revision_id;
2880 vmcs->hdr.shadow_vmcs = 1;
2884 void free_vmcs(struct vmcs *vmcs)
2886 free_page((unsigned long)vmcs);
2890 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
2892 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2894 if (!loaded_vmcs->vmcs)
2896 loaded_vmcs_clear(loaded_vmcs);
2897 free_vmcs(loaded_vmcs->vmcs);
2898 loaded_vmcs->vmcs = NULL;
2899 if (loaded_vmcs->msr_bitmap)
2900 free_page((unsigned long)loaded_vmcs->msr_bitmap);
2901 WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
2904 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2906 loaded_vmcs->vmcs = alloc_vmcs(false);
2907 if (!loaded_vmcs->vmcs)
2910 vmcs_clear(loaded_vmcs->vmcs);
2912 loaded_vmcs->shadow_vmcs = NULL;
2913 loaded_vmcs->hv_timer_soft_disabled = false;
2914 loaded_vmcs->cpu = -1;
2915 loaded_vmcs->launched = 0;
2917 if (cpu_has_vmx_msr_bitmap()) {
2918 loaded_vmcs->msr_bitmap = (unsigned long *)
2919 __get_free_page(GFP_KERNEL_ACCOUNT);
2920 if (!loaded_vmcs->msr_bitmap)
2922 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
2925 memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
2926 memset(&loaded_vmcs->controls_shadow, 0,
2927 sizeof(struct vmcs_controls_shadow));
2932 free_loaded_vmcs(loaded_vmcs);
2936 static void free_kvm_area(void)
2940 for_each_possible_cpu(cpu) {
2941 free_vmcs(per_cpu(vmxarea, cpu));
2942 per_cpu(vmxarea, cpu) = NULL;
2946 static __init int alloc_kvm_area(void)
2950 for_each_possible_cpu(cpu) {
2953 vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
2960 * When eVMCS is enabled, alloc_vmcs_cpu() sets
2961 * vmcs->revision_id to KVM_EVMCS_VERSION instead of
2962 * revision_id reported by MSR_IA32_VMX_BASIC.
2964 * However, even though not explicitly documented by
2965 * TLFS, VMXArea passed as VMXON argument should
2966 * still be marked with revision_id reported by the physical CPU.
2969 if (static_branch_unlikely(&enable_evmcs))
2970 vmcs->hdr.revision_id = vmcs_config.revision_id;
2972 per_cpu(vmxarea, cpu) = vmcs;
2977 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
2978 struct kvm_segment *save)
2980 if (!emulate_invalid_guest_state) {
2982 * CS and SS RPL should be equal during guest entry according
2983 * to VMX spec, but in reality it is not always so. Since vcpu
2984 * is in the middle of the transition from real mode to
2985 * protected mode it is safe to assume that RPL 0 is a good default value.
2988 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
2989 save->selector &= ~SEGMENT_RPL_MASK;
2990 save->dpl = save->selector & SEGMENT_RPL_MASK;
2993 __vmx_set_segment(vcpu, save, seg);
2996 static void enter_pmode(struct kvm_vcpu *vcpu)
2998 unsigned long flags;
2999 struct vcpu_vmx *vmx = to_vmx(vcpu);
3002 * Update the real mode segment cache. It may be out of date if a segment
3003 * register was written while the vcpu was in guest mode.
3005 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3006 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3007 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3008 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3009 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3010 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3012 vmx->rmode.vm86_active = 0;
3014 __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3016 flags = vmcs_readl(GUEST_RFLAGS);
3017 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
3018 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
3019 vmcs_writel(GUEST_RFLAGS, flags);
3021 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
3022 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
3024 vmx_update_exception_bitmap(vcpu);
3026 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3027 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3028 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3029 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3030 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3031 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3034 static void fix_rmode_seg(int seg, struct kvm_segment *save)
3036 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3037 struct kvm_segment var = *save;
3040 if (seg == VCPU_SREG_CS)
3043 if (!emulate_invalid_guest_state) {
3044 var.selector = var.base >> 4;
3045 var.base = var.base & 0xffff0;
3055 if (save->base & 0xf)
3056 pr_warn_once("segment base is not paragraph aligned "
3057 "when entering protected mode (seg=%d)", seg);
3060 vmcs_write16(sf->selector, var.selector);
3061 vmcs_writel(sf->base, var.base);
3062 vmcs_write32(sf->limit, var.limit);
3063 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
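/*
 * Worked example for the conversion above (illustrative): a cached
 * protected-mode segment with base 0x12345 becomes selector 0x1234
 * (base >> 4) and base 0x12340 (base & 0xffff0), the closest
 * paragraph-aligned base the selector can express; bases that are not
 * 16-byte aligned trigger the pr_warn_once() above because real mode cannot
 * represent them exactly.
 */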
3066 static void enter_rmode(struct kvm_vcpu *vcpu)
3068 unsigned long flags;
3069 struct vcpu_vmx *vmx = to_vmx(vcpu);
3070 struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
3072 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3073 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3074 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3075 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3076 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3077 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3078 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3080 vmx->rmode.vm86_active = 1;
3083 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
3084 * vcpu. Warn the user that an update is overdue.
3086 if (!kvm_vmx->tss_addr)
3087 pr_warn_once("KVM_SET_TSS_ADDR needs to be called before running vCPU\n");
3089 vmx_segment_cache_clear(vmx);
3091 vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
3092 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
3093 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
3095 flags = vmcs_readl(GUEST_RFLAGS);
3096 vmx->rmode.save_rflags = flags;
3098 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
3100 vmcs_writel(GUEST_RFLAGS, flags);
3101 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
3102 vmx_update_exception_bitmap(vcpu);
3104 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3105 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3106 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3107 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3108 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3109 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3112 int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
3114 struct vcpu_vmx *vmx = to_vmx(vcpu);
3116 /* Nothing to do if hardware doesn't support EFER. */
3117 if (!vmx_find_uret_msr(vmx, MSR_EFER))
3120 vcpu->arch.efer = efer;
3121 #ifdef CONFIG_X86_64
3122 if (efer & EFER_LMA)
3123 vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE);
3125 vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE);
3127 if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm))
3131 vmx_setup_uret_msrs(vmx);
3135 #ifdef CONFIG_X86_64
3137 static void enter_lmode(struct kvm_vcpu *vcpu)
3141 vmx_segment_cache_clear(to_vmx(vcpu));
3143 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
3144 if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
3145 pr_debug_ratelimited("%s: tss fixup for long mode. \n",
3147 vmcs_write32(GUEST_TR_AR_BYTES,
3148 (guest_tr_ar & ~VMX_AR_TYPE_MASK)
3149 | VMX_AR_TYPE_BUSY_64_TSS);
3151 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
3154 static void exit_lmode(struct kvm_vcpu *vcpu)
3156 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
3161 static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
3163 struct vcpu_vmx *vmx = to_vmx(vcpu);
3166 * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
3167 * the CPU is not required to invalidate guest-physical mappings on
3168 * VM-Entry, even if VPID is disabled. Guest-physical mappings are
3169 * associated with the root EPT structure and not any particular VPID
3170 * (INVVPID also isn't required to invalidate guest-physical mappings).
3174 } else if (enable_vpid) {
3175 if (cpu_has_vmx_invvpid_global()) {
3176 vpid_sync_vcpu_global();
3178 vpid_sync_vcpu_single(vmx->vpid);
3179 vpid_sync_vcpu_single(vmx->nested.vpid02);
3184 static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
3186 if (is_guest_mode(vcpu))
3187 return nested_get_vpid02(vcpu);
3188 return to_vmx(vcpu)->vpid;
3191 static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
3193 struct kvm_mmu *mmu = vcpu->arch.mmu;
3194 u64 root_hpa = mmu->root.hpa;
3196 /* No flush required if the current context is invalid. */
3197 if (!VALID_PAGE(root_hpa))
3201 ept_sync_context(construct_eptp(vcpu, root_hpa,
3202 mmu->root_role.level));
3204 vpid_sync_context(vmx_get_current_vpid(vcpu));
3207 static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
3210 * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
3211 * vmx_flush_tlb_guest() for an explanation of why this is ok.
3213 vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
3216 static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
3219 * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
3220 * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are
3221 * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
3222 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
3223 * i.e. no explicit INVVPID is necessary.
3225 vpid_sync_context(vmx_get_current_vpid(vcpu));
3228 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
3230 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3232 if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
3235 if (is_pae_paging(vcpu)) {
3236 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
3237 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
3238 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
3239 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
3243 void ept_save_pdptrs(struct kvm_vcpu *vcpu)
3245 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3247 if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
3250 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
3251 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
3252 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
3253 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
3255 kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR);
3258 #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
3259 CPU_BASED_CR3_STORE_EXITING)
3261 void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3263 struct vcpu_vmx *vmx = to_vmx(vcpu);
3264 unsigned long hw_cr0, old_cr0_pg;
3267 old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);
3269 hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
3270 if (is_unrestricted_guest(vcpu))
3271 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
3273 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
3275 hw_cr0 |= X86_CR0_WP;
3277 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
3280 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
3284 vmcs_writel(CR0_READ_SHADOW, cr0);
3285 vmcs_writel(GUEST_CR0, hw_cr0);
3286 vcpu->arch.cr0 = cr0;
3287 kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
3289 #ifdef CONFIG_X86_64
3290 if (vcpu->arch.efer & EFER_LME) {
3291 if (!old_cr0_pg && (cr0 & X86_CR0_PG))
3293 else if (old_cr0_pg && !(cr0 & X86_CR0_PG))
3298 if (enable_ept && !is_unrestricted_guest(vcpu)) {
3300 * Ensure KVM has an up-to-date snapshot of the guest's CR3. If
3301 * the below code _enables_ CR3 exiting, vmx_cache_reg() will
3302 * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
3303 * KVM's CR3 is installed.
3305 if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
3306 vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
3309 * When running with EPT but not unrestricted guest, KVM must
3310 * intercept CR3 accesses when paging is _disabled_. This is
3311 * necessary because restricted guests can't actually run with
3312 * paging disabled, and so KVM stuffs its own CR3 in order to
3313 * run the guest with identity mapped page tables.
3315 * Do _NOT_ check the old CR0.PG, e.g. to optimize away the
3316 * update, it may be stale with respect to CR3 interception,
3317 * e.g. after nested VM-Enter.
3319 * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
3320 * stores to forward them to L1, even if KVM does not need to
3321 * intercept them to preserve its identity mapped page tables.
3323 if (!(cr0 & X86_CR0_PG)) {
3324 exec_controls_setbit(vmx, CR3_EXITING_BITS);
3325 } else if (!is_guest_mode(vcpu)) {
3326 exec_controls_clearbit(vmx, CR3_EXITING_BITS);
3328 tmp = exec_controls_get(vmx);
3329 tmp &= ~CR3_EXITING_BITS;
3330 tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
3331 exec_controls_set(vmx, tmp);
3334 /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
3335 if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
3336 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
3339 * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but
3340 * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
3342 if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
3343 kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
3346 /* depends on vcpu->arch.cr0 to be set to a new value */
3347 vmx->emulation_required = vmx_emulation_required(vcpu);
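/*
 * Editorial note on the CR0 split above: GUEST_CR0 holds hw_cr0, the value
 * hardware actually runs with, while CR0_READ_SHADOW (and vcpu->arch.cr0)
 * holds what the guest believes it wrote. For example, a non-unrestricted
 * guest that clears CR0.PG keeps running with paging enabled in hardware,
 * yet a MOV from CR0 returns PG=0 because PG is host owned and host-owned
 * bits are read from the shadow.
 */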
3350 static int vmx_get_max_tdp_level(void)
3352 if (cpu_has_vmx_ept_5levels())
3357 u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
3359 u64 eptp = VMX_EPTP_MT_WB;
3361 eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
3363 if (enable_ept_ad_bits &&
3364 (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
3365 eptp |= VMX_EPTP_AD_ENABLE_BIT;
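/*
 * Worked example (illustrative): with a 4-level EPT root at 0x1234000 and
 * A/D bits enabled, the EPTP built above is
 *
 *	0x1234000 | VMX_EPTP_MT_WB (6) | VMX_EPTP_PWL_4 (3 << 3) |
 *	VMX_EPTP_AD_ENABLE_BIT (1 << 6) = 0x123405e
 *
 * i.e. bits 2:0 encode the write-back memory type, bits 5:3 encode the
 * page-walk length minus one, bit 6 enables accessed/dirty tracking, and the
 * upper bits carry the physical address of the root table.
 */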
3371 static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
3374 struct kvm *kvm = vcpu->kvm;
3375 bool update_guest_cr3 = true;
3376 unsigned long guest_cr3;
3380 eptp = construct_eptp(vcpu, root_hpa, root_level);
3381 vmcs_write64(EPT_POINTER, eptp);
3383 hv_track_root_tdp(vcpu, root_hpa);
3385 if (!enable_unrestricted_guest && !is_paging(vcpu))
3386 guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
3387 else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3))
3388 guest_cr3 = vcpu->arch.cr3;
3389 else /* vmcs.GUEST_CR3 is already up-to-date. */
3390 update_guest_cr3 = false;
3391 vmx_ept_load_pdptrs(vcpu);
3393 guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu);
3396 if (update_guest_cr3)
3397 vmcs_writel(GUEST_CR3, guest_cr3);
3401 static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3404 * We operate under the default treatment of SMM, so VMX cannot be
3405 * enabled under SMM. Note, whether or not VMXE is allowed at all,
3406 * i.e. is a reserved bit, is handled by common x86 code.
3408 if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
3411 if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
3417 void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3419 unsigned long old_cr4 = vcpu->arch.cr4;
3420 struct vcpu_vmx *vmx = to_vmx(vcpu);
3422 * Pass through host's Machine Check Enable value to hw_cr4, which
3423 * is in force while we are in guest mode. Do not let guests control
3424 * this bit, even if host CR4.MCE == 0.
3426 unsigned long hw_cr4;
3428 hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
3429 if (is_unrestricted_guest(vcpu))
3430 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
3431 else if (vmx->rmode.vm86_active)
3432 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
3434 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
3436 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
3437 if (cr4 & X86_CR4_UMIP) {
3438 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
3439 hw_cr4 &= ~X86_CR4_UMIP;
3440 } else if (!is_guest_mode(vcpu) ||
3441 !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
3442 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
3446 vcpu->arch.cr4 = cr4;
3447 kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
3449 if (!is_unrestricted_guest(vcpu)) {
3451 if (!is_paging(vcpu)) {
3452 hw_cr4 &= ~X86_CR4_PAE;
3453 hw_cr4 |= X86_CR4_PSE;
3454 } else if (!(cr4 & X86_CR4_PAE)) {
3455 hw_cr4 &= ~X86_CR4_PAE;
3460 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
3461 * hardware. To emulate this behavior, SMEP/SMAP/PKU needs
3462 * to be manually disabled when guest switches to non-paging mode.
3465 * If !enable_unrestricted_guest, the CPU is always running
3466 * with CR0.PG=1 and CR4 needs to be modified.
3467 * If enable_unrestricted_guest, the CPU automatically
3468 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
3470 if (!is_paging(vcpu))
3471 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
3474 vmcs_writel(CR4_READ_SHADOW, cr4);
3475 vmcs_writel(GUEST_CR4, hw_cr4);
3477 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
3478 kvm_update_cpuid_runtime(vcpu);
3481 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3483 struct vcpu_vmx *vmx = to_vmx(vcpu);
3486 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3487 *var = vmx->rmode.segs[seg];
3488 if (seg == VCPU_SREG_TR
3489 || var->selector == vmx_read_guest_seg_selector(vmx, seg))
3491 var->base = vmx_read_guest_seg_base(vmx, seg);
3492 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3495 var->base = vmx_read_guest_seg_base(vmx, seg);
3496 var->limit = vmx_read_guest_seg_limit(vmx, seg);
3497 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3498 ar = vmx_read_guest_seg_ar(vmx, seg);
3499 var->unusable = (ar >> 16) & 1;
3500 var->type = ar & 15;
3501 var->s = (ar >> 4) & 1;
3502 var->dpl = (ar >> 5) & 3;
3504 * Some userspaces do not preserve the unusable property. Since a usable
3505 * segment has to be present according to the VMX spec, we can use the
3506 * present property to work around the userspace bug by making an unusable
3507 * segment always nonpresent. vmx_segment_access_rights() already marks a
3508 * nonpresent segment as unusable.
3510 var->present = !var->unusable;
3511 var->avl = (ar >> 12) & 1;
3512 var->l = (ar >> 13) & 1;
3513 var->db = (ar >> 14) & 1;
3514 var->g = (ar >> 15) & 1;
3517 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
3519 struct kvm_segment s;
3521 if (to_vmx(vcpu)->rmode.vm86_active) {
3522 vmx_get_segment(vcpu, &s, seg);
3525 return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
3528 int vmx_get_cpl(struct kvm_vcpu *vcpu)
3530 struct vcpu_vmx *vmx = to_vmx(vcpu);
3532 if (unlikely(vmx->rmode.vm86_active))
3535 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
3536 return VMX_AR_DPL(ar);
3540 static u32 vmx_segment_access_rights(struct kvm_segment *var)
3544 if (var->unusable || !var->present)
3547 ar = var->type & 15;
3548 ar |= (var->s & 1) << 4;
3549 ar |= (var->dpl & 3) << 5;
3550 ar |= (var->present & 1) << 7;
3551 ar |= (var->avl & 1) << 12;
3552 ar |= (var->l & 1) << 13;
3553 ar |= (var->db & 1) << 14;
3554 ar |= (var->g & 1) << 15;
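/*
 * Worked example (illustrative): a flat 32-bit code segment with type 0xb,
 * s=1, dpl=0, present=1, avl=0, l=0, db=1 and g=1 packs to
 *
 *	0xb | (1 << 4) | (1 << 7) | (1 << 14) | (1 << 15) = 0xc09b
 *
 * the familiar VMX access-rights value for such a segment; an unusable or
 * non-present segment takes the early return above and is encoded as
 * unusable instead.
 */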
3560 void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3562 struct vcpu_vmx *vmx = to_vmx(vcpu);
3563 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3565 vmx_segment_cache_clear(vmx);
3567 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3568 vmx->rmode.segs[seg] = *var;
3569 if (seg == VCPU_SREG_TR)
3570 vmcs_write16(sf->selector, var->selector);
3572 fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3576 vmcs_writel(sf->base, var->base);
3577 vmcs_write32(sf->limit, var->limit);
3578 vmcs_write16(sf->selector, var->selector);
3581 * Fix the "Accessed" bit in AR field of segment registers for older qemu binaries.
3583 * IA32 arch specifies that at the time of processor reset the
3584 * "Accessed" bit in the AR field of segment registers is 1. And qemu
3585 * is setting it to 0 in the userland code. This causes invalid guest
3586 * state vmexit when "unrestricted guest" mode is turned on.
3587 * Fix for this setup issue in cpu_reset is being pushed in the qemu
3588 * tree. Newer qemu binaries with that qemu fix would not need this kvm hack.
3591 if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
3592 var->type |= 0x1; /* Accessed */
3594 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
3597 static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3599 __vmx_set_segment(vcpu, var, seg);
3601 to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
3604 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3606 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
3608 *db = (ar >> 14) & 1;
3609 *l = (ar >> 13) & 1;
3612 static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3614 dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
3615 dt->address = vmcs_readl(GUEST_IDTR_BASE);
3618 static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3620 vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
3621 vmcs_writel(GUEST_IDTR_BASE, dt->address);
3624 static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3626 dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
3627 dt->address = vmcs_readl(GUEST_GDTR_BASE);
3630 static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3632 vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
3633 vmcs_writel(GUEST_GDTR_BASE, dt->address);
3636 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3638 struct kvm_segment var;
3641 vmx_get_segment(vcpu, &var, seg);
3643 if (seg == VCPU_SREG_CS)
3645 ar = vmx_segment_access_rights(&var);
3647 if (var.base != (var.selector << 4))
3649 if (var.limit != 0xffff)
3657 static bool code_segment_valid(struct kvm_vcpu *vcpu)
3659 struct kvm_segment cs;
3660 unsigned int cs_rpl;
3662 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3663 cs_rpl = cs.selector & SEGMENT_RPL_MASK;
3667 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
3671 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
3672 if (cs.dpl > cs_rpl)
3675 if (cs.dpl != cs_rpl)
3681 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
3685 static bool stack_segment_valid(struct kvm_vcpu *vcpu)
3687 struct kvm_segment ss;
3688 unsigned int ss_rpl;
3690 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3691 ss_rpl = ss.selector & SEGMENT_RPL_MASK;
3695 if (ss.type != 3 && ss.type != 7)
3699 if (ss.dpl != ss_rpl) /* DPL != RPL */
3707 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
3709 struct kvm_segment var;
3712 vmx_get_segment(vcpu, &var, seg);
3713 rpl = var.selector & SEGMENT_RPL_MASK;
3721 if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
3722 if (var.dpl < rpl) /* DPL < RPL */
3726 /* TODO: Add other members to kvm_segment_field to allow checking for other access rights flags. */
3732 static bool tr_valid(struct kvm_vcpu *vcpu)
3734 struct kvm_segment tr;
3736 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
3740 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */
3742 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
3750 static bool ldtr_valid(struct kvm_vcpu *vcpu)
3752 struct kvm_segment ldtr;
3754 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
3758 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */
3768 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
3770 struct kvm_segment cs, ss;
3772 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3773 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3775 return ((cs.selector & SEGMENT_RPL_MASK) ==
3776 (ss.selector & SEGMENT_RPL_MASK));
3780 * Check if guest state is valid. Returns true if valid, false if not.
3782 * We assume that registers are always usable.
3784 bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
3786 /* real mode guest state checks */
3787 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
3788 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
3790 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
3792 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
3794 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
3796 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
3798 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
3801 /* protected mode guest state checks */
3802 if (!cs_ss_rpl_check(vcpu))
3804 if (!code_segment_valid(vcpu))
3806 if (!stack_segment_valid(vcpu))
3808 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
3810 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
3812 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
3814 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
3816 if (!tr_valid(vcpu))
3818 if (!ldtr_valid(vcpu))
3822 * - Add checks on RIP
3823 * - Add checks on RFLAGS
3829 static int init_rmode_tss(struct kvm *kvm, void __user *ua)
3831 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3835 for (i = 0; i < 3; i++) {
3836 if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE))
3840 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
3841 if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16)))
3845 if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8)))
3851 static int init_rmode_identity_map(struct kvm *kvm)
3853 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
3858 /* Protect kvm_vmx->ept_identity_pagetable_done. */
3859 mutex_lock(&kvm->slots_lock);
3861 if (likely(kvm_vmx->ept_identity_pagetable_done))
3864 if (!kvm_vmx->ept_identity_map_addr)
3865 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
3867 uaddr = __x86_set_memory_region(kvm,
3868 IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
3869 kvm_vmx->ept_identity_map_addr,
3871 if (IS_ERR(uaddr)) {
3876 /* Set up identity-mapping pagetable for EPT in real mode */
3877 for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) {
3878 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
3879 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
3880 if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) {
3885 kvm_vmx->ept_identity_pagetable_done = true;
3888 mutex_unlock(&kvm->slots_lock);
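/*
 * Worked example for the loop above (editorial, illustrative): each entry i
 * is a 4 MiB PSE page-directory entry mapping guest physical addresses
 * [i << 22, (i + 1) << 22) onto themselves, so entry 1 is
 * (1 << 22) | PRESENT | RW | USER | ACCESSED | DIRTY | PSE and covers
 * GPA 0x400000-0x7fffff. A page of these entries identity-maps the low guest
 * physical address space (4 GiB with 4-byte, non-PAE entries), which is what
 * the guest runs on while its own paging is still disabled.
 */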
3892 static void seg_setup(int seg)
3894 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3897 vmcs_write16(sf->selector, 0);
3898 vmcs_writel(sf->base, 0);
3899 vmcs_write32(sf->limit, 0xffff);
3901 if (seg == VCPU_SREG_CS)
3902 ar |= 0x08; /* code segment */
3904 vmcs_write32(sf->ar_bytes, ar);
3907 int allocate_vpid(void)
3913 spin_lock(&vmx_vpid_lock);
3914 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
3915 if (vpid < VMX_NR_VPIDS)
3916 __set_bit(vpid, vmx_vpid_bitmap);
3919 spin_unlock(&vmx_vpid_lock);
3923 void free_vpid(int vpid)
3925 if (!enable_vpid || vpid == 0)
3927 spin_lock(&vmx_vpid_lock);
3928 __clear_bit(vpid, vmx_vpid_bitmap);
3929 spin_unlock(&vmx_vpid_lock);
3932 static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
3935 * When KVM is a nested hypervisor on top of Hyper-V and uses
3936 * the 'Enlightened MSR Bitmap' feature, L0 needs to know that the MSR
3937 * bitmap has changed.
3939 if (static_branch_unlikely(&enable_evmcs))
3940 evmcs_touch_msr_bitmap();
3942 vmx->nested.force_msr_bitmap_recalc = true;
3945 void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
3947 struct vcpu_vmx *vmx = to_vmx(vcpu);
3948 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
3950 if (!cpu_has_vmx_msr_bitmap())
3953 vmx_msr_bitmap_l01_changed(vmx);
3956 * Mark the desired intercept state in the shadow bitmap; this is needed
3957 * for resync when the MSR filters change.
3959 if (is_valid_passthrough_msr(msr)) {
3960 int idx = possible_passthrough_msr_slot(msr);
3962 if (idx != -ENOENT) {
3963 if (type & MSR_TYPE_R)
3964 clear_bit(idx, vmx->shadow_msr_intercept.read);
3965 if (type & MSR_TYPE_W)
3966 clear_bit(idx, vmx->shadow_msr_intercept.write);
3970 if ((type & MSR_TYPE_R) &&
3971 !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
3972 vmx_set_msr_bitmap_read(msr_bitmap, msr);
3973 type &= ~MSR_TYPE_R;
3976 if ((type & MSR_TYPE_W) &&
3977 !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) {
3978 vmx_set_msr_bitmap_write(msr_bitmap, msr);
3979 type &= ~MSR_TYPE_W;
3982 if (type & MSR_TYPE_R)
3983 vmx_clear_msr_bitmap_read(msr_bitmap, msr);
3985 if (type & MSR_TYPE_W)
3986 vmx_clear_msr_bitmap_write(msr_bitmap, msr);
3989 void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
3991 struct vcpu_vmx *vmx = to_vmx(vcpu);
3992 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
3994 if (!cpu_has_vmx_msr_bitmap())
3997 vmx_msr_bitmap_l01_changed(vmx);
4000 * Mark the desired intercept state in the shadow bitmap; this is needed
4001 * for resync when the MSR filter changes.
4003 if (is_valid_passthrough_msr(msr)) {
4004 int idx = possible_passthrough_msr_slot(msr);
4006 if (idx != -ENOENT) {
4007 if (type & MSR_TYPE_R)
4008 set_bit(idx, vmx->shadow_msr_intercept.read);
4009 if (type & MSR_TYPE_W)
4010 set_bit(idx, vmx->shadow_msr_intercept.write);
4014 if (type & MSR_TYPE_R)
4015 vmx_set_msr_bitmap_read(msr_bitmap, msr);
4017 if (type & MSR_TYPE_W)
4018 vmx_set_msr_bitmap_write(msr_bitmap, msr);
4021 static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode)
4023 unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
4024 unsigned long read_intercept;
4027 read_intercept = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
4029 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
4030 unsigned int read_idx = msr / BITS_PER_LONG;
4031 unsigned int write_idx = read_idx + (0x800 / sizeof(long));
4033 msr_bitmap[read_idx] = read_intercept;
4034 msr_bitmap[write_idx] = ~0ul;
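/*
 * Editorial note on the layout used above: the 4 KiB MSR bitmap is split into
 * four 1 KiB quarters; reads of the "low" MSRs 0x00000000-0x00001fff are
 * controlled by the first quarter and writes by the third, hence
 * write_idx = read_idx + 0x800 / sizeof(long). The x2APIC MSRs 0x800-0x8ff
 * fall in that low range, so the loop toggles read interception for the whole
 * block in BITS_PER_LONG-sized chunks while leaving every write intercepted
 * by default.
 */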
4038 static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
4040 struct vcpu_vmx *vmx = to_vmx(vcpu);
4043 if (!cpu_has_vmx_msr_bitmap())
4046 if (cpu_has_secondary_exec_ctrls() &&
4047 (secondary_exec_controls_get(vmx) &
4048 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
4049 mode = MSR_BITMAP_MODE_X2APIC;
4050 if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
4051 mode |= MSR_BITMAP_MODE_X2APIC_APICV;
4056 if (mode == vmx->x2apic_msr_bitmap_mode)
4059 vmx->x2apic_msr_bitmap_mode = mode;
4061 vmx_reset_x2apic_msrs(vcpu, mode);
4064 * TPR reads and writes can be virtualized even if virtual interrupt
4065 * delivery is not in use.
4067 vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW,
4068 !(mode & MSR_BITMAP_MODE_X2APIC));
4070 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
4071 vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
4072 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
4073 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
4075 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW);
4079 void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
4081 struct vcpu_vmx *vmx = to_vmx(vcpu);
4082 bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
4085 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag);
4086 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
4087 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
4088 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
4089 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) {
4090 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
4091 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
4095 static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
4097 struct vcpu_vmx *vmx = to_vmx(vcpu);
4102 if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
4103 !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
4104 WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
4107 rvi = vmx_get_rvi();
4109 vapic_page = vmx->nested.virtual_apic_map.hva;
4110 vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
4112 return ((rvi & 0xf0) > (vppr & 0xf0));
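/*
 * Illustrative example for the comparison above (editorial): RVI, the low
 * byte of the guest interrupt status, and the virtual PPR are compared by
 * priority class, i.e. the upper nibble of the vector. With RVI = 0x51 and
 * VPPR = 0x40, class 5 > class 4 and a virtual interrupt is deliverable to
 * L2, so the function reports a pending interrupt; with VPPR = 0x50 or
 * higher it would not.
 */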
4115 static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
4117 struct vcpu_vmx *vmx = to_vmx(vcpu);
4121 * Redo intercept permissions for MSRs that KVM is passing through to
4122 * the guest. Disabling interception will check the new MSR filter and
4123 * ensure that KVM enables interception if userspace wants to filter
4124 * the MSR. MSRs that KVM is already intercepting don't need to be
4125 * refreshed since KVM is going to intercept them regardless of what userspace wants.
4128 for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
4129 u32 msr = vmx_possible_passthrough_msrs[i];
4131 if (!test_bit(i, vmx->shadow_msr_intercept.read))
4132 vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R);
4134 if (!test_bit(i, vmx->shadow_msr_intercept.write))
4135 vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W);
4138 /* PT MSRs can be passed through iff PT is exposed to the guest. */
4139 if (vmx_pt_mode_is_host_guest())
4140 pt_update_intercept_for_msr(vcpu);
4143 static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
4147 if (vcpu->mode == IN_GUEST_MODE) {
4149 * The vector of the virtual interrupt has already been set in the PIR.
4150 * Send a notification event to deliver the virtual interrupt
4151 * unless the vCPU is the currently running vCPU, i.e. the
4152 * event is being sent from a fastpath VM-Exit handler, in
4153 * which case the PIR will be synced to the vIRR before
4154 * re-entering the guest.
4156 * When the target is not the running vCPU, the following
4157 * possibilities emerge:
4159 * Case 1: vCPU stays in non-root mode. Sending a notification
4160 * event posts the interrupt to the vCPU.
4162 * Case 2: vCPU exits to root mode and is still runnable. The
4163 * PIR will be synced to the vIRR before re-entering the guest.
4164 * Sending a notification event is ok as the host IRQ handler
4165 * will ignore the spurious event.
4167 * Case 3: vCPU exits to root mode and is blocked. vcpu_block()
4168 * has already synced PIR to vIRR and never blocks the vCPU if
4169 * the vIRR is not empty. Therefore, a blocked vCPU here does
4170 * not wait for any requested interrupts in PIR, and sending a
4171 * notification event also results in a benign, spurious event.
4174 if (vcpu != kvm_get_running_vcpu())
4175 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
4180 * The vCPU isn't in the guest; wake the vCPU in case it is blocking,
4181 * otherwise do nothing as KVM will grab the highest priority pending
4182 * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
4184 kvm_vcpu_wake_up(vcpu);
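/*
 * Editorial summary of the posted-interrupt plumbing used by the senders
 * below: each vCPU has a posted-interrupt descriptor containing a 256-bit
 * PIR (one bit per vector) and an ON ("outstanding notification") flag. A
 * sender sets the vector's PIR bit, sets ON, and fires the notification IPI;
 * if the target is executing in non-root mode the CPU merges the PIR into
 * the vIRR and delivers the interrupt without a VM-Exit, otherwise the
 * software paths above and below make sure the PIR is synced before the
 * next VM-Enter.
 */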
4187 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
4190 struct vcpu_vmx *vmx = to_vmx(vcpu);
4192 if (is_guest_mode(vcpu) &&
4193 vector == vmx->nested.posted_intr_nv) {
4195 * If a posted intr is not recognized by hardware,
4196 * we will deliver it on the next vmentry.
4198 vmx->nested.pi_pending = true;
4199 kvm_make_request(KVM_REQ_EVENT, vcpu);
4202 * This pairs with the smp_mb_*() after setting vcpu->mode in
4203 * vcpu_enter_guest() to guarantee the vCPU sees the event
4204 * request if triggering a posted interrupt "fails" because
4205 * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as
4206 * the smp_wmb() in kvm_make_request() only ensures everything
4207 * done before making the request is visible when the request
4208 * is visible, it doesn't ensure ordering between the store to
4209 * vcpu->requests and the load from vcpu->mode.
4211 smp_mb__after_atomic();
4213 /* the PIR and ON have been set by L1. */
4214 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
4220 * Send an interrupt to the vcpu via a posted interrupt.
4221 * 1. If the target vcpu is running (non-root mode), send a posted interrupt
4222 * notification and hardware will sync the PIR to the vIRR atomically.
4223 * 2. If the target vcpu isn't running (root mode), kick it so it picks up the
4224 * interrupt from the PIR on the next vmentry.
4226 static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
4228 struct vcpu_vmx *vmx = to_vmx(vcpu);
4231 r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
4235 /* Note, this is called iff the local APIC is in-kernel. */
4236 if (!vcpu->arch.apic->apicv_active)
4239 if (pi_test_and_set_pir(vector, &vmx->pi_desc))
4242 /* If a previous notification has sent the IPI, nothing to do. */
4243 if (pi_test_and_set_on(&vmx->pi_desc))
4247 * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*()
4248 * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is
4249 * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
4250 * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
4252 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
4256 static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
4257 int trig_mode, int vector)
4259 struct kvm_vcpu *vcpu = apic->vcpu;
4261 if (vmx_deliver_posted_interrupt(vcpu, vector)) {
4262 kvm_lapic_set_irr(vector, apic);
4263 kvm_make_request(KVM_REQ_EVENT, vcpu);
4264 kvm_vcpu_kick(vcpu);
4266 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
4272 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
4273 * will not change in the lifetime of the guest.
4274 * Note that host-state that does change is set elsewhere. E.g., host-state
4275 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
4277 void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
4281 unsigned long cr0, cr3, cr4;
4284 WARN_ON(cr0 & X86_CR0_TS);
4285 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
4288 * Save the most likely value for this task's CR3 in the VMCS.
4289 * We can't use __get_current_cr3_fast() because we're not atomic.
4292 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
4293 vmx->loaded_vmcs->host_state.cr3 = cr3;
4295 /* Save the most likely value for this task's CR4 in the VMCS. */
4296 cr4 = cr4_read_shadow();
4297 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
4298 vmx->loaded_vmcs->host_state.cr4 = cr4;
4300 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
4301 #ifdef CONFIG_X86_64
4303 * Load null selectors, so we can avoid reloading them in
4304 * vmx_prepare_switch_to_host(), in case userspace uses
4305 * the null selectors too (the expected case).
4307 vmcs_write16(HOST_DS_SELECTOR, 0);
4308 vmcs_write16(HOST_ES_SELECTOR, 0);
4310 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
4311 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
4313 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
4314 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
4316 vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */
4318 vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */
4320 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
4321 vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
4324 * SYSENTER is used for 32-bit system calls on either 32-bit or
4325 * 64-bit kernels. It is always zero if neither is allowed, otherwise
4326 * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may
4327 * have already done so!).
4329 if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
4330 vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
4332 rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
4333 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
4335 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
4336 rdmsr(MSR_IA32_CR_PAT, low32, high32);
4337 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
4340 if (cpu_has_load_ia32_efer())
4341 vmcs_write64(HOST_IA32_EFER, host_efer);
4344 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
4346 struct kvm_vcpu *vcpu = &vmx->vcpu;
4348 vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &
4349 ~vcpu->arch.cr4_guest_rsvd_bits;
4351 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS;
4352 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS;
4354 if (is_guest_mode(&vmx->vcpu))
4355 vcpu->arch.cr4_guest_owned_bits &=
4356 ~get_vmcs12(vcpu)->cr4_guest_host_mask;
4357 vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);
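/*
 * Note on the mask written above (descriptive aside, not new logic): a
 * '0' bit in CR4_GUEST_HOST_MASK means the guest owns that CR4 bit and
 * can toggle it without a VM-Exit; a '1' bit means the bit is host-owned,
 * guest writes that would change it are intercepted, and guest reads of
 * it are served from CR4_READ_SHADOW.
 */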
4360 static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
4362 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
4364 if (!kvm_vcpu_apicv_active(&vmx->vcpu))
4365 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
4368 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
4370 if (!enable_preemption_timer)
4371 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
4373 return pin_based_exec_ctrl;
4376 static u32 vmx_vmentry_ctrl(void)
4378 u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
4380 if (vmx_pt_mode_is_system())
4381 vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
4382 VM_ENTRY_LOAD_IA32_RTIT_CTL);
4384 * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically.
4386 vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
4387 VM_ENTRY_LOAD_IA32_EFER |
4388 VM_ENTRY_IA32E_MODE);
4390 if (cpu_has_perf_global_ctrl_bug())
4391 vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4393 return vmentry_ctrl;
4396 static u32 vmx_vmexit_ctrl(void)
4398 u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
4401 * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for
4402 * nested virtualization and thus allowed to be set in vmcs12.
4404 vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER |
4405 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
4407 if (vmx_pt_mode_is_system())
4408 vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
4409 VM_EXIT_CLEAR_IA32_RTIT_CTL);
4411 if (cpu_has_perf_global_ctrl_bug())
4412 vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
4414 /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
4415 return vmexit_ctrl &
4416 ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
4419 static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
4421 struct vcpu_vmx *vmx = to_vmx(vcpu);
4423 if (is_guest_mode(vcpu)) {
4424 vmx->nested.update_vmcs01_apicv_status = true;
4428 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
4430 if (kvm_vcpu_apicv_active(vcpu)) {
4431 secondary_exec_controls_setbit(vmx,
4432 SECONDARY_EXEC_APIC_REGISTER_VIRT |
4433 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4435 tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT);
4437 secondary_exec_controls_clearbit(vmx,
4438 SECONDARY_EXEC_APIC_REGISTER_VIRT |
4439 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4441 tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT);
4444 vmx_update_msr_bitmap_x2apic(vcpu);
4447 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
4449 u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
4452 * Not used by KVM, but fully supported for nesting, i.e. are allowed in
4453 * vmcs12 and propagated to vmcs02 when set in vmcs12.
4455 exec_control &= ~(CPU_BASED_RDTSC_EXITING |
4456 CPU_BASED_USE_IO_BITMAPS |
4457 CPU_BASED_MONITOR_TRAP_FLAG |
4458 CPU_BASED_PAUSE_EXITING);
4460 /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */
4461 exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING |
4462 CPU_BASED_NMI_WINDOW_EXITING);
4464 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
4465 exec_control &= ~CPU_BASED_MOV_DR_EXITING;
4467 if (!cpu_need_tpr_shadow(&vmx->vcpu))
4468 exec_control &= ~CPU_BASED_TPR_SHADOW;
4470 #ifdef CONFIG_X86_64
4471 if (exec_control & CPU_BASED_TPR_SHADOW)
4472 exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING |
4473 CPU_BASED_CR8_STORE_EXITING);
4475 exec_control |= CPU_BASED_CR8_STORE_EXITING |
4476 CPU_BASED_CR8_LOAD_EXITING;
4478 /* No need to intercept CR3 access or INVLPG when using EPT. */
4480 exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
4481 CPU_BASED_CR3_STORE_EXITING |
4482 CPU_BASED_INVLPG_EXITING);
4483 if (kvm_mwait_in_guest(vmx->vcpu.kvm))
4484 exec_control &= ~(CPU_BASED_MWAIT_EXITING |
4485 CPU_BASED_MONITOR_EXITING);
4486 if (kvm_hlt_in_guest(vmx->vcpu.kvm))
4487 exec_control &= ~CPU_BASED_HLT_EXITING;
4488 return exec_control;
4491 static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx)
4493 u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl;
4496 * IPI virtualization relies on APICv. Disable IPI virtualization if
4497 * APICv is inhibited.
4499 if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu))
4500 exec_control &= ~TERTIARY_EXEC_IPI_VIRT;
4502 return exec_control;
4506 * Adjust a single secondary execution control bit to intercept/allow an
4507 * instruction in the guest. This is usually done based on whether or not a
4508 * feature has been exposed to the guest in order to correctly emulate faults.
4511 vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
4512 u32 control, bool enabled, bool exiting)
4515 * If the control is for an opt-in feature, clear the control if the
4516 * feature is not exposed to the guest, i.e. not enabled. If the
4517 * control is opt-out, i.e. an exiting control, clear the control if
4518 * the feature _is_ exposed to the guest, i.e. exiting/interception is
4519 * disabled for the associated instruction. Note, the caller is
4520 * responsible for presetting exec_control to set all supported bits.
4522 if (enabled == exiting)
4523 *exec_control &= ~control;
4526 * Update the nested MSR settings so that a nested VMM can/can't set
4527 * controls for features that are/aren't exposed to the guest.
4531 * All features that can be added to or removed from the VMX MSRs must
4532 * be supported in the first place for nested virtualization.
4534 if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control)))
4538 vmx->nested.msrs.secondary_ctls_high |= control;
4540 vmx->nested.msrs.secondary_ctls_high &= ~control;
4545 * Wrapper macro for the common case of adjusting a secondary execution control
4546 * based on a single guest CPUID bit, with a dedicated feature bit. This also
4547 * verifies that the control is actually supported by KVM and hardware.
4549 #define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
4553 if (cpu_has_vmx_##name()) { \
4554 __enabled = guest_cpuid_has(&(vmx)->vcpu, \
4555 X86_FEATURE_##feat_name); \
4556 vmx_adjust_secondary_exec_control(vmx, exec_control, \
4557 SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \
4561 /* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
4562 #define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
4563 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)
4565 #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
4566 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
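/*
 * Illustrative expansion of the wrappers above, derived solely from the
 * macro definitions in this file:
 *
 *   vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID)
 * clears SECONDARY_EXEC_ENABLE_INVPCID unless the guest's CPUID
 * advertises X86_FEATURE_INVPCID (an opt-in control), whereas
 *
 *   vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND)
 * clears SECONDARY_EXEC_RDRAND_EXITING when X86_FEATURE_RDRAND _is_
 * exposed, i.e. interception is disabled for a supported instruction.
 */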
4568 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
4570 struct kvm_vcpu *vcpu = &vmx->vcpu;
4572 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
4574 if (vmx_pt_mode_is_system())
4575 exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
4576 if (!cpu_need_virtualize_apic_accesses(vcpu))
4577 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
4579 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
4581 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
4582 enable_unrestricted_guest = 0;
4584 if (!enable_unrestricted_guest)
4585 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
4586 if (kvm_pause_in_guest(vmx->vcpu.kvm))
4587 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
4588 if (!kvm_vcpu_apicv_active(vcpu))
4589 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
4590 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4591 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
4593 /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
4594 * in vmx_set_cr4. */
4595 exec_control &= ~SECONDARY_EXEC_DESC;
4597 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
4599 We can NOT enable shadow_vmcs here because we don't yet have a current VMCS12.
4602 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
4605 * PML is enabled/disabled when dirty logging of memslots changes, but
4606 * it needs to be set here when dirty logging is already active, e.g.
4607 * if this vCPU was created after dirty logging was enabled.
4609 if (!vcpu->kvm->arch.cpu_dirty_logging_count)
4610 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
4612 if (cpu_has_vmx_xsaves()) {
4613 /* Exposing XSAVES only when XSAVE is exposed */
4614 bool xsaves_enabled =
4615 boot_cpu_has(X86_FEATURE_XSAVE) &&
4616 guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
4617 guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
4619 vcpu->arch.xsaves_enabled = xsaves_enabled;
4621 vmx_adjust_secondary_exec_control(vmx, &exec_control,
4622 SECONDARY_EXEC_XSAVES,
4623 xsaves_enabled, false);
4627 * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
4628 * feature is exposed to the guest. This creates a virtualization hole
4629 * if both are supported in hardware but only one is exposed to the
4630 * guest, but letting the guest execute RDTSCP or RDPID when either one
4631 * is advertised is preferable to emulating the advertised instruction
4632 * in KVM on #UD, and obviously better than incorrectly injecting #UD.
4634 if (cpu_has_vmx_rdtscp()) {
4635 bool rdpid_or_rdtscp_enabled =
4636 guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
4637 guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
4639 vmx_adjust_secondary_exec_control(vmx, &exec_control,
4640 SECONDARY_EXEC_ENABLE_RDTSCP,
4641 rdpid_or_rdtscp_enabled, false);
4643 vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
4645 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
4646 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);
4648 vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
4649 ENABLE_USR_WAIT_PAUSE, false);
4651 if (!vcpu->kvm->arch.bus_lock_detection_enabled)
4652 exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
4654 if (!kvm_notify_vmexit_enabled(vcpu->kvm))
4655 exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;
4657 return exec_control;
4660 static inline int vmx_get_pid_table_order(struct kvm *kvm)
4662 return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table));
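/*
 * Worked example (illustrative, assuming 8-byte PID-pointer table
 * entries): with max_vcpu_ids == 1024 the table needs 8 KiB, so
 * get_order() returns 1, i.e. two contiguous 4 KiB pages.
 */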
4665 static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
4668 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
4670 if (!irqchip_in_kernel(kvm) || !enable_ipiv)
4673 if (kvm_vmx->pid_table)
4676 pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm));
4680 kvm_vmx->pid_table = (void *)page_address(pages);
4684 static int vmx_vcpu_precreate(struct kvm *kvm)
4686 return vmx_alloc_ipiv_pid_table(kvm);
4689 #define VMX_XSS_EXIT_BITMAP 0
4691 static void init_vmcs(struct vcpu_vmx *vmx)
4693 struct kvm *kvm = vmx->vcpu.kvm;
4694 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
4697 nested_vmx_set_vmcs_shadowing_bitmap();
4699 if (cpu_has_vmx_msr_bitmap())
4700 vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
4702 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */
4705 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
4707 exec_controls_set(vmx, vmx_exec_control(vmx));
4709 if (cpu_has_secondary_exec_ctrls())
4710 secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
4712 if (cpu_has_tertiary_exec_ctrls())
4713 tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
4715 if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
4716 vmcs_write64(EOI_EXIT_BITMAP0, 0);
4717 vmcs_write64(EOI_EXIT_BITMAP1, 0);
4718 vmcs_write64(EOI_EXIT_BITMAP2, 0);
4719 vmcs_write64(EOI_EXIT_BITMAP3, 0);
4721 vmcs_write16(GUEST_INTR_STATUS, 0);
4723 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
4724 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
4727 if (vmx_can_use_ipiv(&vmx->vcpu)) {
4728 vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table));
4729 vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1);
4732 if (!kvm_pause_in_guest(kvm)) {
4733 vmcs_write32(PLE_GAP, ple_gap);
4734 vmx->ple_window = ple_window;
4735 vmx->ple_window_dirty = true;
4738 if (kvm_notify_vmexit_enabled(kvm))
4739 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
4741 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
4742 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
4743 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
4745 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
4746 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
4747 vmx_set_constant_host_state(vmx);
4748 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
4749 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
4751 if (cpu_has_vmx_vmfunc())
4752 vmcs_write64(VM_FUNCTION_CONTROL, 0);
4754 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
4755 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
4756 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
4757 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
4758 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
4760 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
4761 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
4763 vm_exit_controls_set(vmx, vmx_vmexit_ctrl());
4765 /* 22.2.1, 20.8.1 */
4766 vm_entry_controls_set(vmx, vmx_vmentry_ctrl());
4768 vmx->vcpu.arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
4769 vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);
4771 set_cr4_guest_host_mask(vmx);
4774 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
4776 if (cpu_has_vmx_xsaves())
4777 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
4780 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
4781 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
4784 vmx_write_encls_bitmap(&vmx->vcpu, NULL);
4786 if (vmx_pt_mode_is_host_guest()) {
4787 memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
4788 /* Bits 6:0 are forced to 1; writes are ignored. */
4789 vmx->pt_desc.guest.output_mask = 0x7F;
4790 vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
4793 vmcs_write32(GUEST_SYSENTER_CS, 0);
4794 vmcs_writel(GUEST_SYSENTER_ESP, 0);
4795 vmcs_writel(GUEST_SYSENTER_EIP, 0);
4796 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4798 if (cpu_has_vmx_tpr_shadow()) {
4799 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
4800 if (cpu_need_tpr_shadow(&vmx->vcpu))
4801 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
4802 __pa(vmx->vcpu.arch.apic->regs));
4803 vmcs_write32(TPR_THRESHOLD, 0);
4806 vmx_setup_uret_msrs(vmx);
4809 static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4811 struct vcpu_vmx *vmx = to_vmx(vcpu);
4816 memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
4818 vcpu_setup_sgx_lepubkeyhash(vcpu);
4820 vmx->nested.posted_intr_nv = -1;
4821 vmx->nested.vmxon_ptr = INVALID_GPA;
4822 vmx->nested.current_vmptr = INVALID_GPA;
4823 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
4825 vcpu->arch.microcode_version = 0x100000000ULL;
4826 vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
4829 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
4830 * or POSTED_INTR_WAKEUP_VECTOR.
4832 vmx->pi_desc.nv = POSTED_INTR_VECTOR;
4833 vmx->pi_desc.sn = 1;
4836 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
4838 struct vcpu_vmx *vmx = to_vmx(vcpu);
4841 __vmx_vcpu_reset(vcpu);
4843 vmx->rmode.vm86_active = 0;
4846 vmx->msr_ia32_umwait_control = 0;
4848 vmx->hv_deadline_tsc = -1;
4849 kvm_set_cr8(vcpu, 0);
4851 vmx_segment_cache_clear(vmx);
4852 kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);
4854 seg_setup(VCPU_SREG_CS);
4855 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
4856 vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
4858 seg_setup(VCPU_SREG_DS);
4859 seg_setup(VCPU_SREG_ES);
4860 seg_setup(VCPU_SREG_FS);
4861 seg_setup(VCPU_SREG_GS);
4862 seg_setup(VCPU_SREG_SS);
4864 vmcs_write16(GUEST_TR_SELECTOR, 0);
4865 vmcs_writel(GUEST_TR_BASE, 0);
4866 vmcs_write32(GUEST_TR_LIMIT, 0xffff);
4867 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
4869 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
4870 vmcs_writel(GUEST_LDTR_BASE, 0);
4871 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
4872 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
4874 vmcs_writel(GUEST_GDTR_BASE, 0);
4875 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
4877 vmcs_writel(GUEST_IDTR_BASE, 0);
4878 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
4880 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
4881 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
4882 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
4883 if (kvm_mpx_supported())
4884 vmcs_write64(GUEST_BNDCFGS, 0);
4886 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
4888 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
4890 vpid_sync_context(vmx->vpid);
4892 vmx_update_fb_clear_dis(vcpu, vmx);
4895 static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
4897 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
4900 static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
4903 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
4904 vmx_enable_irq_window(vcpu);
4908 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
4911 static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
4913 struct vcpu_vmx *vmx = to_vmx(vcpu);
4915 int irq = vcpu->arch.interrupt.nr;
4917 trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected);
4919 ++vcpu->stat.irq_injections;
4920 if (vmx->rmode.vm86_active) {
4922 if (vcpu->arch.interrupt.soft)
4923 inc_eip = vcpu->arch.event_exit_inst_len;
4924 kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
4927 intr = irq | INTR_INFO_VALID_MASK;
4928 if (vcpu->arch.interrupt.soft) {
4929 intr |= INTR_TYPE_SOFT_INTR;
4930 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
4931 vmx->vcpu.arch.event_exit_inst_len);
4933 intr |= INTR_TYPE_EXT_INTR;
4934 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
4936 vmx_clear_hlt(vcpu);
4939 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
4941 struct vcpu_vmx *vmx = to_vmx(vcpu);
4945 * Tracking the NMI-blocked state in software is built upon
4946 * finding the next open IRQ window. This, in turn, depends on
4947 * well-behaving guests: They have to keep IRQs disabled at
4948 * least as long as the NMI handler runs. Otherwise we may
4949 * cause NMI nesting, maybe breaking the guest. But as this is
4950 * highly unlikely, we can live with the residual risk.
4952 vmx->loaded_vmcs->soft_vnmi_blocked = 1;
4953 vmx->loaded_vmcs->vnmi_blocked_time = 0;
4956 ++vcpu->stat.nmi_injections;
4957 vmx->loaded_vmcs->nmi_known_unmasked = false;
4959 if (vmx->rmode.vm86_active) {
4960 kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
4964 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
4965 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
4967 vmx_clear_hlt(vcpu);
4970 bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
4972 struct vcpu_vmx *vmx = to_vmx(vcpu);
4976 return vmx->loaded_vmcs->soft_vnmi_blocked;
4977 if (vmx->loaded_vmcs->nmi_known_unmasked)
4979 masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
4980 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
4984 void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
4986 struct vcpu_vmx *vmx = to_vmx(vcpu);
4989 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
4990 vmx->loaded_vmcs->soft_vnmi_blocked = masked;
4991 vmx->loaded_vmcs->vnmi_blocked_time = 0;
4994 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
4996 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
4997 GUEST_INTR_STATE_NMI);
4999 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
5000 GUEST_INTR_STATE_NMI);
5004 bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
5006 if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
5009 if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
5012 return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5013 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
5014 GUEST_INTR_STATE_NMI));
5017 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
5019 if (to_vmx(vcpu)->nested.nested_run_pending)
5022 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
5023 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
5026 return !vmx_nmi_blocked(vcpu);
5029 bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
5031 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
5034 return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
5035 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5036 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
5039 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
5041 if (to_vmx(vcpu)->nested.nested_run_pending)
5045 * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
5046 * e.g. if the IRQ arrived asynchronously after checking nested events.
5048 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
5051 return !vmx_interrupt_blocked(vcpu);
5054 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
5058 if (enable_unrestricted_guest)
5061 mutex_lock(&kvm->slots_lock);
5062 ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
5064 mutex_unlock(&kvm->slots_lock);
5067 return PTR_ERR(ret);
5069 to_kvm_vmx(kvm)->tss_addr = addr;
5071 return init_rmode_tss(kvm, ret);
5074 static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
5076 to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
5080 static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
5085 * Update instruction length as we may reinject the exception
5086 * from user space while in guest debugging mode.
5088 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
5089 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
5090 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5094 return !(vcpu->guest_debug &
5095 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP));
5109 static int handle_rmode_exception(struct kvm_vcpu *vcpu,
5110 int vec, u32 err_code)
5113 * An instruction with the address-size override prefix (0x67) can
5114 * cause a #GP or #SS fault with a zero error code in VM86 mode.
5116 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
5117 if (kvm_emulate_instruction(vcpu, 0)) {
5118 if (vcpu->arch.halt_request) {
5119 vcpu->arch.halt_request = 0;
5120 return kvm_emulate_halt_noskip(vcpu);
5128 * Forward all other exceptions that are valid in real mode.
5129 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
5130 * the required debugging infrastructure rework.
5132 kvm_queue_exception(vcpu, vec);
5136 static int handle_machine_check(struct kvm_vcpu *vcpu)
5138 /* handled by vmx_vcpu_run() */
5143 * If the host has split lock detection disabled, then #AC is
5144 * unconditionally injected into the guest, which is the pre split lock
5145 * detection behaviour.
5147 * If the host has split lock detection enabled then #AC is
5148 * only injected into the guest when:
5149 * - Guest CPL == 3 (user mode)
5150 * - Guest has #AC detection enabled in CR0
5151 * - Guest EFLAGS has AC bit set
5153 bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
5155 if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
5158 return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) &&
5159 (kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
5162 static int handle_exception_nmi(struct kvm_vcpu *vcpu)
5164 struct vcpu_vmx *vmx = to_vmx(vcpu);
5165 struct kvm_run *kvm_run = vcpu->run;
5166 u32 intr_info, ex_no, error_code;
5167 unsigned long cr2, dr6;
5170 vect_info = vmx->idt_vectoring_info;
5171 intr_info = vmx_get_intr_info(vcpu);
5173 if (is_machine_check(intr_info) || is_nmi(intr_info))
5174 return 1; /* handled by handle_exception_nmi_irqoff() */
5177 * Queue the exception here instead of in handle_nm_fault_irqoff().
5178 * This ensures the nested_vmx check is not skipped so vmexit can
5179 * be reflected to L1 (when it intercepts #NM) before reaching this
5182 if (is_nm_fault(intr_info)) {
5183 kvm_queue_exception(vcpu, NM_VECTOR);
5187 if (is_invalid_opcode(intr_info))
5188 return handle_ud(vcpu);
5191 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
5192 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
5194 if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
5195 WARN_ON_ONCE(!enable_vmware_backdoor);
5198 * VMware backdoor emulation on #GP interception only handles
5199 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
5200 * error code on #GP.
5203 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
5206 return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
5210 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing
5211 * MMIO, it is better to report an internal error.
5212 * See the comments in vmx_handle_exit.
5214 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
5215 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
5216 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5217 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
5218 vcpu->run->internal.ndata = 4;
5219 vcpu->run->internal.data[0] = vect_info;
5220 vcpu->run->internal.data[1] = intr_info;
5221 vcpu->run->internal.data[2] = error_code;
5222 vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
5226 if (is_page_fault(intr_info)) {
5227 cr2 = vmx_get_exit_qual(vcpu);
5228 if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
5230 * EPT will cause page fault only if we need to
5231 * detect illegal GPAs.
5233 WARN_ON_ONCE(!allow_smaller_maxphyaddr);
5234 kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
5237 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
5240 ex_no = intr_info & INTR_INFO_VECTOR_MASK;
5242 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
5243 return handle_rmode_exception(vcpu, ex_no, error_code);
5247 dr6 = vmx_get_exit_qual(vcpu);
5248 if (!(vcpu->guest_debug &
5249 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
5251 * If the #DB was due to ICEBP, a.k.a. INT1, skip the
5252 * instruction. ICEBP generates a trap-like #DB, but
5253 * despite its interception control being tied to #DB,
5254 * is an instruction intercept, i.e. the VM-Exit occurs
5255 * on the ICEBP itself. Use the inner "skip" helper to
5256 * avoid single-step #DB and MTF updates, as ICEBP is
5257 * higher priority. Note, skipping ICEBP still clears
5258 * STI and MOVSS blocking.
5260 * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
5261 * if single-step is enabled in RFLAGS and STI or MOVSS
5262 * blocking is active, as the CPU doesn't set the bit
5263 * on VM-Exit due to #DB interception. VM-Entry has a
5264 * consistency check that a single-step #DB is pending
5265 * in this scenario as the previous instruction cannot
5266 * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
5267 * don't modify RFLAGS), therefore the one instruction
5268 * delay when activating single-step breakpoints must
5269 * have already expired. Note, the CPU sets/clears BS
5270 * as appropriate for all other VM-Exits types.
5272 if (is_icebp(intr_info))
5273 WARN_ON(!skip_emulated_instruction(vcpu));
5274 else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
5275 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5276 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
5277 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
5278 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
5280 kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
5283 kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
5284 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
5288 * Update instruction length as we may reinject #BP from
5289 * user space while in guest debugging mode. Reading it for
5290 * #DB as well causes no harm; it is not used in that case.
5292 vmx->vcpu.arch.event_exit_inst_len =
5293 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
5294 kvm_run->exit_reason = KVM_EXIT_DEBUG;
5295 kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
5296 kvm_run->debug.arch.exception = ex_no;
5299 if (vmx_guest_inject_ac(vcpu)) {
5300 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
5305 * Handle split lock. Depending on detection mode this will
5306 * either warn and disable split lock detection for this
5307 * task or force SIGBUS on it.
5309 if (handle_guest_split_lock(kvm_rip_read(vcpu)))
5313 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
5314 kvm_run->ex.exception = ex_no;
5315 kvm_run->ex.error_code = error_code;
5321 static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
5323 ++vcpu->stat.irq_exits;
5327 static int handle_triple_fault(struct kvm_vcpu *vcpu)
5329 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
5330 vcpu->mmio_needed = 0;
5334 static int handle_io(struct kvm_vcpu *vcpu)
5336 unsigned long exit_qualification;
5337 int size, in, string;
5340 exit_qualification = vmx_get_exit_qual(vcpu);
5341 string = (exit_qualification & 16) != 0;
5343 ++vcpu->stat.io_exits;
5346 return kvm_emulate_instruction(vcpu, 0);
5348 port = exit_qualification >> 16;
5349 size = (exit_qualification & 7) + 1;
5350 in = (exit_qualification & 8) != 0;
5352 return kvm_fast_pio(vcpu, size, port, in);
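/*
 * Illustrative decode of the exit qualification used above: a guest
 * "out %al, $0x3f8" yields a qualification of 0x03f80000 -- size field
 * (bits 2:0) = 0 => 1 byte, direction bit 3 = 0 => OUT, string bit 4
 * = 0, port (bits 31:16) = 0x3f8 -- so the non-string access is handled
 * via the fast PIO path without full emulation.
 */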
5356 vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
5359 * Patch in the VMCALL instruction:
5361 hypercall[0] = 0x0f;
5362 hypercall[1] = 0x01;
5363 hypercall[2] = 0xc1;
5366 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */
5367 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
5369 if (is_guest_mode(vcpu)) {
5370 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5371 unsigned long orig_val = val;
5374 * We get here when L2 changed cr0 in a way that did not change
5375 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
5376 * but did change L0 shadowed bits. So we first calculate the
5377 * effective cr0 value that L1 would like to write into the
5378 * hardware. It consists of the L2-owned bits from the new
5379 * value combined with the L1-owned bits from L1's guest_cr0.
5381 val = (val & ~vmcs12->cr0_guest_host_mask) |
5382 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
5384 if (!nested_guest_cr0_valid(vcpu, val))
5387 if (kvm_set_cr0(vcpu, val))
5389 vmcs_writel(CR0_READ_SHADOW, orig_val);
5392 if (to_vmx(vcpu)->nested.vmxon &&
5393 !nested_host_cr0_valid(vcpu, val))
5396 return kvm_set_cr0(vcpu, val);
5400 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
5402 if (is_guest_mode(vcpu)) {
5403 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5404 unsigned long orig_val = val;
5406 /* analogously to handle_set_cr0 */
5407 val = (val & ~vmcs12->cr4_guest_host_mask) |
5408 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
5409 if (kvm_set_cr4(vcpu, val))
5411 vmcs_writel(CR4_READ_SHADOW, orig_val);
5414 return kvm_set_cr4(vcpu, val);
5417 static int handle_desc(struct kvm_vcpu *vcpu)
5419 WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
5420 return kvm_emulate_instruction(vcpu, 0);
5423 static int handle_cr(struct kvm_vcpu *vcpu)
5425 unsigned long exit_qualification, val;
5431 exit_qualification = vmx_get_exit_qual(vcpu);
5432 cr = exit_qualification & 15;
5433 reg = (exit_qualification >> 8) & 15;
5434 switch ((exit_qualification >> 4) & 3) {
5435 case 0: /* mov to cr */
5436 val = kvm_register_read(vcpu, reg);
5437 trace_kvm_cr_write(cr, val);
5440 err = handle_set_cr0(vcpu, val);
5441 return kvm_complete_insn_gp(vcpu, err);
5443 WARN_ON_ONCE(enable_unrestricted_guest);
5445 err = kvm_set_cr3(vcpu, val);
5446 return kvm_complete_insn_gp(vcpu, err);
5448 err = handle_set_cr4(vcpu, val);
5449 return kvm_complete_insn_gp(vcpu, err);
5451 u8 cr8_prev = kvm_get_cr8(vcpu);
5453 err = kvm_set_cr8(vcpu, cr8);
5454 ret = kvm_complete_insn_gp(vcpu, err);
5455 if (lapic_in_kernel(vcpu))
5457 if (cr8_prev <= cr8)
5460 * TODO: we might be squashing a
5461 * KVM_GUESTDBG_SINGLESTEP-triggered
5462 * KVM_EXIT_DEBUG here.
5464 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
5470 KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS");
5472 case 1: /*mov from cr*/
5475 WARN_ON_ONCE(enable_unrestricted_guest);
5477 val = kvm_read_cr3(vcpu);
5478 kvm_register_write(vcpu, reg, val);
5479 trace_kvm_cr_read(cr, val);
5480 return kvm_skip_emulated_instruction(vcpu);
5482 val = kvm_get_cr8(vcpu);
5483 kvm_register_write(vcpu, reg, val);
5484 trace_kvm_cr_read(cr, val);
5485 return kvm_skip_emulated_instruction(vcpu);
5489 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5490 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
5491 kvm_lmsw(vcpu, val);
5493 return kvm_skip_emulated_instruction(vcpu);
5497 vcpu->run->exit_reason = 0;
5498 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
5499 (int)(exit_qualification >> 4) & 3, cr);
5503 static int handle_dr(struct kvm_vcpu *vcpu)
5505 unsigned long exit_qualification;
5509 exit_qualification = vmx_get_exit_qual(vcpu);
5510 dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
5512 /* First, if DR does not exist, trigger UD */
5513 if (!kvm_require_dr(vcpu, dr))
5516 if (vmx_get_cpl(vcpu) > 0)
5519 dr7 = vmcs_readl(GUEST_DR7);
5522 * As the vm-exit takes precedence over the debug trap, we
5523 * need to emulate the latter, either for the host or the
5524 * guest debugging itself.
5526 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
5527 vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW;
5528 vcpu->run->debug.arch.dr7 = dr7;
5529 vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
5530 vcpu->run->debug.arch.exception = DB_VECTOR;
5531 vcpu->run->exit_reason = KVM_EXIT_DEBUG;
5534 kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD);
5539 if (vcpu->guest_debug == 0) {
5540 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
5543 * No more DR vmexits; force a reload of the debug registers
5544 * and reenter on this instruction. The next vmexit will
5545 * retrieve the full state of the debug registers.
5547 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
5551 reg = DEBUG_REG_ACCESS_REG(exit_qualification);
5552 if (exit_qualification & TYPE_MOV_FROM_DR) {
5555 kvm_get_dr(vcpu, dr, &val);
5556 kvm_register_write(vcpu, reg, val);
5559 err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
5563 return kvm_complete_insn_gp(vcpu, err);
5566 static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
5568 get_debugreg(vcpu->arch.db[0], 0);
5569 get_debugreg(vcpu->arch.db[1], 1);
5570 get_debugreg(vcpu->arch.db[2], 2);
5571 get_debugreg(vcpu->arch.db[3], 3);
5572 get_debugreg(vcpu->arch.dr6, 6);
5573 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
5575 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
5576 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
5579 * exc_debug expects dr6 to be cleared after it runs; avoid letting it
5580 * see a stale dr6 from the guest.
5582 set_debugreg(DR6_RESERVED, 6);
5585 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
5587 vmcs_writel(GUEST_DR7, val);
5590 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
5592 kvm_apic_update_ppr(vcpu);
5596 static int handle_interrupt_window(struct kvm_vcpu *vcpu)
5598 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
5600 kvm_make_request(KVM_REQ_EVENT, vcpu);
5602 ++vcpu->stat.irq_window_exits;
5606 static int handle_invlpg(struct kvm_vcpu *vcpu)
5608 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5610 kvm_mmu_invlpg(vcpu, exit_qualification);
5611 return kvm_skip_emulated_instruction(vcpu);
5614 static int handle_apic_access(struct kvm_vcpu *vcpu)
5616 if (likely(fasteoi)) {
5617 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5618 int access_type, offset;
5620 access_type = exit_qualification & APIC_ACCESS_TYPE;
5621 offset = exit_qualification & APIC_ACCESS_OFFSET;
5623 * A sane guest uses MOV to write EOI, and the written value
5624 * doesn't matter. So short-circuit that case here and avoid
5625 * heavy instruction emulation.
5627 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
5628 (offset == APIC_EOI)) {
5629 kvm_lapic_set_eoi(vcpu);
5630 return kvm_skip_emulated_instruction(vcpu);
5633 return kvm_emulate_instruction(vcpu, 0);
5636 static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
5638 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5639 int vector = exit_qualification & 0xff;
5641 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
5642 kvm_apic_set_eoi_accelerated(vcpu, vector);
5646 static int handle_apic_write(struct kvm_vcpu *vcpu)
5648 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5651 * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and
5652 * hardware has done any necessary aliasing, offset adjustments, etc...
5653 * for the access. I.e. the correct value has already been written to
5654 * the vAPIC page for the correct 16-byte chunk. KVM needs only to
5655 * retrieve the register value and emulate the access.
5657 u32 offset = exit_qualification & 0xff0;
5659 kvm_apic_write_nodecode(vcpu, offset);
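/*
 * For example (illustrative): a guest write to the TPR at APIC offset
 * 0x80 arrives here with (exit_qualification & 0xff0) == 0x080; the
 * value itself is fetched from the vAPIC page rather than decoded from
 * the guest instruction.
 */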
5663 static int handle_task_switch(struct kvm_vcpu *vcpu)
5665 struct vcpu_vmx *vmx = to_vmx(vcpu);
5666 unsigned long exit_qualification;
5667 bool has_error_code = false;
5670 int reason, type, idt_v, idt_index;
5672 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
5673 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
5674 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
5676 exit_qualification = vmx_get_exit_qual(vcpu);
5678 reason = (u32)exit_qualification >> 30;
5679 if (reason == TASK_SWITCH_GATE && idt_v) {
5681 case INTR_TYPE_NMI_INTR:
5682 vcpu->arch.nmi_injected = false;
5683 vmx_set_nmi_mask(vcpu, true);
5685 case INTR_TYPE_EXT_INTR:
5686 case INTR_TYPE_SOFT_INTR:
5687 kvm_clear_interrupt_queue(vcpu);
5689 case INTR_TYPE_HARD_EXCEPTION:
5690 if (vmx->idt_vectoring_info &
5691 VECTORING_INFO_DELIVER_CODE_MASK) {
5692 has_error_code = true;
5694 vmcs_read32(IDT_VECTORING_ERROR_CODE);
5697 case INTR_TYPE_SOFT_EXCEPTION:
5698 kvm_clear_exception_queue(vcpu);
5704 tss_selector = exit_qualification;
5706 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
5707 type != INTR_TYPE_EXT_INTR &&
5708 type != INTR_TYPE_NMI_INTR))
5709 WARN_ON(!skip_emulated_instruction(vcpu));
5712 * TODO: What about debug traps on tss switch?
5713 * Are we supposed to inject them and update dr6?
5715 return kvm_task_switch(vcpu, tss_selector,
5716 type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
5717 reason, has_error_code, error_code);
5720 static int handle_ept_violation(struct kvm_vcpu *vcpu)
5722 unsigned long exit_qualification;
5726 exit_qualification = vmx_get_exit_qual(vcpu);
5729 * If the EPT violation happened while executing IRET from an NMI, the
5730 * "blocked by NMI" bit has to be set before the next VM entry.
5731 * There are errata that may cause this bit to not be set:
5734 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5736 (exit_qualification & INTR_INFO_UNBLOCK_NMI))
5737 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
5739 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5740 trace_kvm_page_fault(vcpu, gpa, exit_qualification);
5742 /* Is it a read fault? */
5743 error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
5744 ? PFERR_USER_MASK : 0;
5745 /* Is it a write fault? */
5746 error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
5747 ? PFERR_WRITE_MASK : 0;
5748 /* Is it a fetch fault? */
5749 error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
5750 ? PFERR_FETCH_MASK : 0;
5751 /* Is the EPT page-table entry present? */
5752 error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
5753 ? PFERR_PRESENT_MASK : 0;
5755 error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
5756 PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
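/*
 * Worked example of the mapping above (illustrative): a guest write to a
 * translated GVA whose final EPT entry is present but read-only sets
 * EPT_VIOLATION_ACC_WRITE, at least one EPT_VIOLATION_RWX_MASK bit, and
 * EPT_VIOLATION_GVA_TRANSLATED in the qualification, which becomes
 * PFERR_WRITE_MASK | PFERR_PRESENT_MASK | PFERR_GUEST_FINAL_MASK here.
 */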
5758 vcpu->arch.exit_qualification = exit_qualification;
5761 * Check that the GPA doesn't exceed physical memory limits, as that is
5762 * a guest page fault. We have to emulate the instruction here, because
5763 * if the illegal address is that of a paging structure, then
5764 * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we
5765 * would also use advanced VM-exit information for EPT violations to
5766 * reconstruct the page fault error code.
5768 if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa)))
5769 return kvm_emulate_instruction(vcpu, 0);
5771 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
5774 static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
5778 if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
5782 * A nested guest cannot optimize MMIO vmexits, because we have an
5783 * nGPA here instead of the required GPA.
5785 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5786 if (!is_guest_mode(vcpu) &&
5787 !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
5788 trace_kvm_fast_mmio(gpa);
5789 return kvm_skip_emulated_instruction(vcpu);
5792 return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
5795 static int handle_nmi_window(struct kvm_vcpu *vcpu)
5797 if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))
5800 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
5801 ++vcpu->stat.nmi_window_exits;
5802 kvm_make_request(KVM_REQ_EVENT, vcpu);
5807 static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
5809 struct vcpu_vmx *vmx = to_vmx(vcpu);
5811 return vmx->emulation_required && !vmx->rmode.vm86_active &&
5812 (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected);
5815 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
5817 struct vcpu_vmx *vmx = to_vmx(vcpu);
5818 bool intr_window_requested;
5819 unsigned count = 130;
5821 intr_window_requested = exec_controls_get(vmx) &
5822 CPU_BASED_INTR_WINDOW_EXITING;
5824 while (vmx->emulation_required && count-- != 0) {
5825 if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
5826 return handle_interrupt_window(&vmx->vcpu);
5828 if (kvm_test_request(KVM_REQ_EVENT, vcpu))
5831 if (!kvm_emulate_instruction(vcpu, 0))
5834 if (vmx_emulation_required_with_pending_exception(vcpu)) {
5835 kvm_prepare_emulation_failure_exit(vcpu);
5839 if (vcpu->arch.halt_request) {
5840 vcpu->arch.halt_request = 0;
5841 return kvm_emulate_halt_noskip(vcpu);
5845 * Note, return 1 and not 0, vcpu_run() will invoke
5846 * xfer_to_guest_mode() which will create a proper return
5849 if (__xfer_to_guest_mode_work_pending())
5856 static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
5858 if (vmx_emulation_required_with_pending_exception(vcpu)) {
5859 kvm_prepare_emulation_failure_exit(vcpu);
5866 static void grow_ple_window(struct kvm_vcpu *vcpu)
5868 struct vcpu_vmx *vmx = to_vmx(vcpu);
5869 unsigned int old = vmx->ple_window;
5871 vmx->ple_window = __grow_ple_window(old, ple_window,
5875 if (vmx->ple_window != old) {
5876 vmx->ple_window_dirty = true;
5877 trace_kvm_ple_window_update(vcpu->vcpu_id,
5878 vmx->ple_window, old);
5882 static void shrink_ple_window(struct kvm_vcpu *vcpu)
5884 struct vcpu_vmx *vmx = to_vmx(vcpu);
5885 unsigned int old = vmx->ple_window;
5887 vmx->ple_window = __shrink_ple_window(old, ple_window,
5891 if (vmx->ple_window != old) {
5892 vmx->ple_window_dirty = true;
5893 trace_kvm_ple_window_update(vcpu->vcpu_id,
5894 vmx->ple_window, old);
5899 * Indicate a vCPU busy-waiting on a spinlock. KVM does not enable PAUSE
5900 * exiting, so we only get here on CPUs with PAUSE-loop exiting (PLE).
5902 static int handle_pause(struct kvm_vcpu *vcpu)
5904 if (!kvm_pause_in_guest(vcpu->kvm))
5905 grow_ple_window(vcpu);
5908 * Intel SDM Vol. 3, Section 25.1.3 says the "PAUSE-loop exiting"
5909 * VM-execution control is ignored if CPL > 0. OTOH, KVM
5910 * never sets PAUSE_EXITING and only sets PLE if supported,
5911 * so the vCPU must be at CPL=0 if it gets a PAUSE exit.
5913 kvm_vcpu_on_spin(vcpu, true);
5914 return kvm_skip_emulated_instruction(vcpu);
5917 static int handle_monitor_trap(struct kvm_vcpu *vcpu)
5922 static int handle_invpcid(struct kvm_vcpu *vcpu)
5924 u32 vmx_instruction_info;
5933 if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
5934 kvm_queue_exception(vcpu, UD_VECTOR);
5938 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5939 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
5940 type = kvm_register_read(vcpu, gpr_index);
5942 /* According to the Intel instruction reference, the memory operand
5943 * is read even if it isn't needed (e.g., for type==all)
5945 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
5946 vmx_instruction_info, false,
5947 sizeof(operand), &gva))
5950 return kvm_handle_invpcid(vcpu, type, gva);
5953 static int handle_pml_full(struct kvm_vcpu *vcpu)
5955 unsigned long exit_qualification;
5957 trace_kvm_pml_full(vcpu->vcpu_id);
5959 exit_qualification = vmx_get_exit_qual(vcpu);
5962 * If the PML buffer became full while executing IRET from an NMI, the
5963 * "blocked by NMI" bit has to be set before the next VM entry.
5965 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5967 (exit_qualification & INTR_INFO_UNBLOCK_NMI))
5968 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
5969 GUEST_INTR_STATE_NMI);
5972 * PML buffer already flushed at beginning of VMEXIT. Nothing to do
5973 * here, and there's no userspace involvement needed for PML.
5978 static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
5980 struct vcpu_vmx *vmx = to_vmx(vcpu);
5982 if (!vmx->req_immediate_exit &&
5983 !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
5984 kvm_lapic_expired_hv_timer(vcpu);
5985 return EXIT_FASTPATH_REENTER_GUEST;
5988 return EXIT_FASTPATH_NONE;
5991 static int handle_preemption_timer(struct kvm_vcpu *vcpu)
5993 handle_fastpath_preemption_timer(vcpu);
5998 * When nested=0, all VMX instruction VM-Exits are funneled here. The
5999 * handlers are overwritten by nested_vmx_setup() when nested=1.
6001 static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
6003 kvm_queue_exception(vcpu, UD_VECTOR);
6007 #ifndef CONFIG_X86_SGX_KVM
6008 static int handle_encls(struct kvm_vcpu *vcpu)
6011 * SGX virtualization is disabled. There is no software enable bit for
6012 * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
6013 * the guest from executing ENCLS (when SGX is supported by hardware).
6015 kvm_queue_exception(vcpu, UD_VECTOR);
6018 #endif /* CONFIG_X86_SGX_KVM */
6020 static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
6023 * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK
6024 * VM-Exits. Unconditionally set the flag here and leave the handling to
6025 * vmx_handle_exit().
6027 to_vmx(vcpu)->exit_reason.bus_lock_detected = true;
6031 static int handle_notify(struct kvm_vcpu *vcpu)
6033 unsigned long exit_qual = vmx_get_exit_qual(vcpu);
6034 bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID;
6036 ++vcpu->stat.notify_window_exits;
6039 * If the Notify VM exit happened while executing IRET from an NMI, the
6040 * "blocked by NMI" bit has to be set before the next VM entry.
6042 if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI))
6043 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6044 GUEST_INTR_STATE_NMI);
6046 if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER ||
6048 vcpu->run->exit_reason = KVM_EXIT_NOTIFY;
6049 vcpu->run->notify.flags = context_invalid ?
6050 KVM_NOTIFY_CONTEXT_INVALID : 0;
6058 * The exit handlers return 1 if the exit was handled fully and guest execution
6059 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
6060 * to be done to userspace and return 0.
6062 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
6063 [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi,
6064 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
6065 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
6066 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
6067 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
6068 [EXIT_REASON_CR_ACCESS] = handle_cr,
6069 [EXIT_REASON_DR_ACCESS] = handle_dr,
6070 [EXIT_REASON_CPUID] = kvm_emulate_cpuid,
6071 [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr,
6072 [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr,
6073 [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window,
6074 [EXIT_REASON_HLT] = kvm_emulate_halt,
6075 [EXIT_REASON_INVD] = kvm_emulate_invd,
6076 [EXIT_REASON_INVLPG] = handle_invlpg,
6077 [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc,
6078 [EXIT_REASON_VMCALL] = kvm_emulate_hypercall,
6079 [EXIT_REASON_VMCLEAR] = handle_vmx_instruction,
6080 [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction,
6081 [EXIT_REASON_VMPTRLD] = handle_vmx_instruction,
6082 [EXIT_REASON_VMPTRST] = handle_vmx_instruction,
6083 [EXIT_REASON_VMREAD] = handle_vmx_instruction,
6084 [EXIT_REASON_VMRESUME] = handle_vmx_instruction,
6085 [EXIT_REASON_VMWRITE] = handle_vmx_instruction,
6086 [EXIT_REASON_VMOFF] = handle_vmx_instruction,
6087 [EXIT_REASON_VMON] = handle_vmx_instruction,
6088 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
6089 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
6090 [EXIT_REASON_APIC_WRITE] = handle_apic_write,
6091 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
6092 [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd,
6093 [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv,
6094 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
6095 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
6096 [EXIT_REASON_GDTR_IDTR] = handle_desc,
6097 [EXIT_REASON_LDTR_TR] = handle_desc,
6098 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
6099 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
6100 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
6101 [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait,
6102 [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,
6103 [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor,
6104 [EXIT_REASON_INVEPT] = handle_vmx_instruction,
6105 [EXIT_REASON_INVVPID] = handle_vmx_instruction,
6106 [EXIT_REASON_RDRAND] = kvm_handle_invalid_op,
6107 [EXIT_REASON_RDSEED] = kvm_handle_invalid_op,
6108 [EXIT_REASON_PML_FULL] = handle_pml_full,
6109 [EXIT_REASON_INVPCID] = handle_invpcid,
6110 [EXIT_REASON_VMFUNC] = handle_vmx_instruction,
6111 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
6112 [EXIT_REASON_ENCLS] = handle_encls,
6113 [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit,
6114 [EXIT_REASON_NOTIFY] = handle_notify,
6117 static const int kvm_vmx_max_exit_handlers =
6118 ARRAY_SIZE(kvm_vmx_exit_handlers);
6120 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
6121 u64 *info1, u64 *info2,
6122 u32 *intr_info, u32 *error_code)
6124 struct vcpu_vmx *vmx = to_vmx(vcpu);
6126 *reason = vmx->exit_reason.full;
6127 *info1 = vmx_get_exit_qual(vcpu);
6128 if (!(vmx->exit_reason.failed_vmentry)) {
6129 *info2 = vmx->idt_vectoring_info;
6130 *intr_info = vmx_get_intr_info(vcpu);
6131 if (is_exception_with_error_code(*intr_info))
6132 *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
6142 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
6145 __free_page(vmx->pml_pg);
6150 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
6152 struct vcpu_vmx *vmx = to_vmx(vcpu);
6156 pml_idx = vmcs_read16(GUEST_PML_INDEX);
6158 /* Do nothing if PML buffer is empty */
6159 if (pml_idx == (PML_ENTITY_NUM - 1))
6162 /* PML index always points to next available PML buffer entity */
6163 if (pml_idx >= PML_ENTITY_NUM)
6168 pml_buf = page_address(vmx->pml_pg);
6169 for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
6172 gpa = pml_buf[pml_idx];
6173 WARN_ON(gpa & (PAGE_SIZE - 1));
6174 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
6177 /* reset PML index */
6178 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
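/*
 * Descriptive note on the index handling above (illustrative): hardware
 * logs dirty GPAs into the PML page while decrementing GUEST_PML_INDEX
 * from PML_ENTITY_NUM - 1 toward 0, so an index of PML_ENTITY_NUM - 1
 * means the buffer is empty, and resetting it to that value after the
 * flush re-arms logging with an empty buffer.
 */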
6181 static void vmx_dump_sel(char *name, uint32_t sel)
6183 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
6184 name, vmcs_read16(sel),
6185 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
6186 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
6187 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
6190 static void vmx_dump_dtsel(char *name, uint32_t limit)
6192 pr_err("%s limit=0x%08x, base=0x%016lx\n",
6193 name, vmcs_read32(limit),
6194 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
6197 static void vmx_dump_msrs(char *name, struct vmx_msrs *m)
6200 struct vmx_msr_entry *e;
6202 pr_err("MSR %s:\n", name);
6203 for (i = 0, e = m->val; i < m->nr; ++i, ++e)
6204 pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);
6207 void dump_vmcs(struct kvm_vcpu *vcpu)
6209 struct vcpu_vmx *vmx = to_vmx(vcpu);
6210 u32 vmentry_ctl, vmexit_ctl;
6211 u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
6212 u64 tertiary_exec_control;
6216 if (!dump_invalid_vmcs) {
6217 pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
6221 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
6222 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
6223 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
6224 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
6225 cr4 = vmcs_readl(GUEST_CR4);
6227 if (cpu_has_secondary_exec_ctrls())
6228 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
6230 secondary_exec_control = 0;
6232 if (cpu_has_tertiary_exec_ctrls())
6233 tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL);
6235 tertiary_exec_control = 0;
6237 pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
6238 vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
6239 pr_err("*** Guest State ***\n");
6240 pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
6241 vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
6242 vmcs_readl(CR0_GUEST_HOST_MASK));
6243 pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
6244 cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
6245 pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
6246 if (cpu_has_vmx_ept()) {
6247 pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
6248 vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
6249 pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
6250 vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
6252 pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
6253 vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
6254 pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
6255 vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
6256 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
6257 vmcs_readl(GUEST_SYSENTER_ESP),
6258 vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
6259 vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);
6260 vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);
6261 vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);
6262 vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);
6263 vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);
6264 vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);
6265 vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
6266 vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
6267 vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
6268 vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
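/*
 * The guest's EFER can live in one of three places: the VMCS field (when
 * the "load EFER" entry control is set), the MSR autoload area, or, as a
 * fallback, an effective value derived from vcpu->arch.efer and the
 * IA-32e mode entry control.
 */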
6269 efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
6270 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
6271 pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
6272 else if (efer_slot >= 0)
6273 pr_err("EFER= 0x%016llx (autoload)\n",
6274 vmx->msr_autoload.guest.val[efer_slot].value);
6275 else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
6276 pr_err("EFER= 0x%016llx (effective)\n",
6277 vcpu->arch.efer | (EFER_LMA | EFER_LME));
6279 pr_err("EFER= 0x%016llx (effective)\n",
6280 vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
6281 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
6282 pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
6283 pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
6284 vmcs_read64(GUEST_IA32_DEBUGCTL),
6285 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
6286 if (cpu_has_load_perf_global_ctrl() &&
6287 vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
6288 pr_err("PerfGlobCtl = 0x%016llx\n",
6289 vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
6290 if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
6291 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
6292 pr_err("Interruptibility = %08x ActivityState = %08x\n",
6293 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
6294 vmcs_read32(GUEST_ACTIVITY_STATE));
6295 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
6296 pr_err("InterruptStatus = %04x\n",
6297 vmcs_read16(GUEST_INTR_STATUS));
6298 if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
6299 vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
6300 if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
6301 vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);
6303 pr_err("*** Host State ***\n");
6304 pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
6305 vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
6306 pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
6307 vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
6308 vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
6309 vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
6310 vmcs_read16(HOST_TR_SELECTOR));
6311 pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
6312 vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
6313 vmcs_readl(HOST_TR_BASE));
6314 pr_err("GDTBase=%016lx IDTBase=%016lx\n",
6315 vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
6316 pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
6317 vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
6318 vmcs_readl(HOST_CR4));
6319 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
6320 vmcs_readl(HOST_IA32_SYSENTER_ESP),
6321 vmcs_read32(HOST_IA32_SYSENTER_CS),
6322 vmcs_readl(HOST_IA32_SYSENTER_EIP));
6323 if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
6324 pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
6325 if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
6326 pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
6327 if (cpu_has_load_perf_global_ctrl() &&
6328 vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
6329 pr_err("PerfGlobCtl = 0x%016llx\n",
6330 vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
6331 if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
6332 vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
6334 pr_err("*** Control State ***\n");
6335 pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
6336 cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control);
6337 pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n",
6338 pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl);
6339 pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
6340 vmcs_read32(EXCEPTION_BITMAP),
6341 vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
6342 vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
6343 pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
6344 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
6345 vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
6346 vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
6347 pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
6348 vmcs_read32(VM_EXIT_INTR_INFO),
6349 vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
6350 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
6351 pr_err(" reason=%08x qualification=%016lx\n",
6352 vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
6353 pr_err("IDTVectoring: info=%08x errcode=%08x\n",
6354 vmcs_read32(IDT_VECTORING_INFO_FIELD),
6355 vmcs_read32(IDT_VECTORING_ERROR_CODE));
6356 pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
6357 if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
6358 pr_err("TSC Multiplier = 0x%016llx\n",
6359 vmcs_read64(TSC_MULTIPLIER));
6360 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
6361 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
6362 u16 status = vmcs_read16(GUEST_INTR_STATUS);
6363 pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
6365 pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
6366 if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
6367 pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
6368 pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
6370 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
6371 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
6372 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
6373 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
6374 if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
6375 pr_err("PLE Gap=%08x Window=%08x\n",
6376 vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
6377 if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
6378 pr_err("Virtual processor ID = 0x%04x\n",
6379 vmcs_read16(VIRTUAL_PROCESSOR_ID));
6383 * The guest has exited. See if we can fix it or if we need userspace assistance.
6386 static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
6388 struct vcpu_vmx *vmx = to_vmx(vcpu);
6389 union vmx_exit_reason exit_reason = vmx->exit_reason;
6390 u32 vectoring_info = vmx->idt_vectoring_info;
6391 u16 exit_handler_index;
6394 * Flush the PML buffer of logged GPAs so that dirty_bitmap stays up to
6395 * date. As a bonus, in kvm_vm_ioctl_get_dirty_log we only need to kick
6396 * vCPUs out of guest mode before querying dirty_bitmap: once a vCPU is
6397 * back in root mode, its PML buffer must already have been flushed.
6398 * Note, PML is never enabled in hardware while running L2.
6401 if (enable_pml && !is_guest_mode(vcpu))
6402 vmx_flush_pml_buffer(vcpu);
6405 * KVM should never reach this point with a pending nested VM-Enter.
6406 * More specifically, short-circuiting VM-Entry to emulate L2 due to
6407 * invalid guest state should never happen as that means KVM knowingly
6408 * allowed a nested VM-Enter with an invalid vmcs12. More below.
6410 if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
6413 if (is_guest_mode(vcpu)) {
6415 * PML is never enabled when running L2, bail immediately if a
6416 * PML full exit occurs as something is horribly wrong.
6418 if (exit_reason.basic == EXIT_REASON_PML_FULL)
6419 goto unexpected_vmexit;
6422 * The host physical addresses of some pages of guest memory
6423 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
6424 * Page). The CPU may write to these pages via their host
6425 * physical address while L2 is running, bypassing any
6426 * address-translation-based dirty tracking (e.g. EPT write protection).
6429 * Mark them dirty on every exit from L2 to prevent them from
6430 * getting out of sync with dirty tracking.
6432 nested_mark_vmcs12_pages_dirty(vcpu);
6435 * Synthesize a triple fault if L2 state is invalid. In normal
6436 * operation, nested VM-Enter rejects any attempt to enter L2
6437 * with invalid state. However, those checks are skipped if
6438 * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If
6439 * L2 state is invalid, it means either L1 modified SMRAM state
6440 * or userspace provided bad state. Synthesize TRIPLE_FAULT as
6441 * doing so is architecturally allowed in the RSM case, and is
6442 * the least awful solution for the userspace case without
6443 * risking false positives.
6445 if (vmx->emulation_required) {
6446 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
6450 if (nested_vmx_reflect_vmexit(vcpu))
6454 /* If guest state is invalid, start emulating. L2 is handled above. */
6455 if (vmx->emulation_required)
6456 return handle_invalid_guest_state(vcpu);
6458 if (exit_reason.failed_vmentry) {
6460 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6461 vcpu->run->fail_entry.hardware_entry_failure_reason
6463 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6467 if (unlikely(vmx->fail)) {
6469 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6470 vcpu->run->fail_entry.hardware_entry_failure_reason
6471 = vmcs_read32(VM_INSTRUCTION_ERROR);
6472 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6478 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by
6479 * event delivery, since that indicates the guest is accessing MMIO.
6480 * The VM-exit would simply be triggered again after returning to the
6481 * guest, causing an infinite loop.
6483 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
6484 (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
6485 exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
6486 exit_reason.basic != EXIT_REASON_PML_FULL &&
6487 exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
6488 exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
6489 exit_reason.basic != EXIT_REASON_NOTIFY)) {
6492 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6493 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
6494 vcpu->run->internal.data[0] = vectoring_info;
6495 vcpu->run->internal.data[1] = exit_reason.full;
6496 vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
6497 if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
6498 vcpu->run->internal.data[ndata++] =
6499 vmcs_read64(GUEST_PHYSICAL_ADDRESS);
6501 vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
6502 vcpu->run->internal.ndata = ndata;
6506 if (unlikely(!enable_vnmi &&
6507 vmx->loaded_vmcs->soft_vnmi_blocked)) {
6508 if (!vmx_interrupt_blocked(vcpu)) {
6509 vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6510 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
6511 vcpu->arch.nmi_pending) {
6513 * This CPU can't help us find the end of an
6514 * NMI-blocked window if the guest runs with IRQs
6515 * disabled. So we pull the trigger after 1 s of
6516 * futile waiting, but inform the user about it.
6518 printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
6519 "state on VCPU %d after 1 s timeout\n",
6520 __func__, vcpu->vcpu_id);
6521 vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6525 if (exit_fastpath != EXIT_FASTPATH_NONE)
6528 if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
6529 goto unexpected_vmexit;
6530 #ifdef CONFIG_RETPOLINE
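/*
 * When retpolines are in use, open-code the hottest exit reasons to avoid
 * the cost of an indirect call through kvm_vmx_exit_handlers.
 */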
6531 if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
6532 return kvm_emulate_wrmsr(vcpu);
6533 else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
6534 return handle_preemption_timer(vcpu);
6535 else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
6536 return handle_interrupt_window(vcpu);
6537 else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
6538 return handle_external_interrupt(vcpu);
6539 else if (exit_reason.basic == EXIT_REASON_HLT)
6540 return kvm_emulate_halt(vcpu);
6541 else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
6542 return handle_ept_misconfig(vcpu);
6545 exit_handler_index = array_index_nospec((u16)exit_reason.basic,
6546 kvm_vmx_max_exit_handlers);
6547 if (!kvm_vmx_exit_handlers[exit_handler_index])
6548 goto unexpected_vmexit;
6550 return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
6553 vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
6556 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6557 vcpu->run->internal.suberror =
6558 KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
6559 vcpu->run->internal.ndata = 2;
6560 vcpu->run->internal.data[0] = exit_reason.full;
6561 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
6565 static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
6567 int ret = __vmx_handle_exit(vcpu, exit_fastpath);
6570 * Exit to user space when a bus lock is detected, to inform userspace
6571 * that a bus lock occurred in the guest.
6573 if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
6575 vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
6577 vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
6584 * Software-based L1D cache flush, used when the microcode providing
6585 * the cache control MSR is not loaded.
6587 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but
6588 * flushing it requires reading 64 KiB because the replacement algorithm
6589 * is not exactly LRU. This could be sized at runtime via topology
6590 * information, but as all relevant affected CPUs have a 32 KiB L1D cache
6591 * there is no point in doing so.
6593 static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
6595 int size = PAGE_SIZE << L1D_CACHE_ORDER;
6598 * This code is only executed when the flush mode is 'cond' or 'always'.
6601 if (static_branch_likely(&vmx_l1d_flush_cond)) {
6605 * Clear the per-vcpu flush bit; it gets set again
6606 * either from vcpu_run() or from one of the unsafe VMEXIT handlers.
6609 flush_l1d = vcpu->arch.l1tf_flush_l1d;
6610 vcpu->arch.l1tf_flush_l1d = false;
6613 * Clear the per-cpu flush bit; it gets set again from
6614 * the interrupt handlers.
6616 flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
6617 kvm_clear_cpu_l1tf_flush_l1d();
6623 vcpu->stat.l1d_flush++;
6625 if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
6626 native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
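/*
 * Software fallback: walk the dedicated flush pages twice, first at page
 * granularity to pre-populate the TLB, then at cache-line (64 byte)
 * granularity so the reads displace the entire L1D contents.
 */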
6631 /* First ensure the pages are in the TLB */
6632 "xorl %%eax, %%eax\n"
6633 ".Lpopulate_tlb:\n\t"
6634 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6635 "addl $4096, %%eax\n\t"
6636 "cmpl %%eax, %[size]\n\t"
6637 "jne .Lpopulate_tlb\n\t"
6638 "xorl %%eax, %%eax\n\t"
6640 /* Now fill the cache */
6641 "xorl %%eax, %%eax\n"
6643 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6644 "addl $64, %%eax\n\t"
6645 "cmpl %%eax, %[size]\n\t"
6646 "jne .Lfill_cache\n\t"
6648 :: [flush_pages] "r" (vmx_l1d_flush_pages),
6650 : "eax", "ebx", "ecx", "edx");
6653 static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
6655 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6658 if (is_guest_mode(vcpu) &&
6659 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
6662 tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
6663 if (is_guest_mode(vcpu))
6664 to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
6666 vmcs_write32(TPR_THRESHOLD, tpr_threshold);
6669 void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
6671 struct vcpu_vmx *vmx = to_vmx(vcpu);
6672 u32 sec_exec_control;
6674 if (!lapic_in_kernel(vcpu))
6677 if (!flexpriority_enabled &&
6678 !cpu_has_vmx_virtualize_x2apic_mode())
6681 /* Postpone execution until vmcs01 is the current VMCS. */
6682 if (is_guest_mode(vcpu)) {
6683 vmx->nested.change_vmcs01_virtual_apic_mode = true;
6687 sec_exec_control = secondary_exec_controls_get(vmx);
6688 sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
6689 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
6691 switch (kvm_get_apic_mode(vcpu)) {
6692 case LAPIC_MODE_INVALID:
6693 WARN_ONCE(true, "Invalid local APIC state");
6695 case LAPIC_MODE_DISABLED:
6697 case LAPIC_MODE_XAPIC:
6698 if (flexpriority_enabled) {
6700 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6701 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
6704 * Flush the TLB, reloading the APIC access page will
6705 * only do so if its physical address has changed, but
6706 * the guest may have inserted a non-APIC mapping into
6707 * the TLB while the APIC access page was disabled.
6709 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
6712 case LAPIC_MODE_X2APIC:
6713 if (cpu_has_vmx_virtualize_x2apic_mode())
6715 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
6718 secondary_exec_controls_set(vmx, sec_exec_control);
6720 vmx_update_msr_bitmap_x2apic(vcpu);
6723 static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
6727 /* Defer reload until vmcs01 is the current VMCS. */
6728 if (is_guest_mode(vcpu)) {
6729 to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
6733 if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
6734 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
6737 page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
6738 if (is_error_page(page))
6741 vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
6742 vmx_flush_tlb_current(vcpu);
6745 * Do not pin the APIC access page in memory; the MMU notifier
6746 * will call us again if it is migrated or swapped out.
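/*
 * GUEST_INTR_STATUS packs SVI (the highest in-service vector) in the high
 * byte and RVI (the highest pending-request vector) in the low byte; the
 * helpers below update one half while preserving the other.
 */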
6751 static void vmx_hwapic_isr_update(int max_isr)
6759 status = vmcs_read16(GUEST_INTR_STATUS);
6761 if (max_isr != old) {
6763 status |= max_isr << 8;
6764 vmcs_write16(GUEST_INTR_STATUS, status);
6768 static void vmx_set_rvi(int vector)
6776 status = vmcs_read16(GUEST_INTR_STATUS);
6777 old = (u8)status & 0xff;
6778 if ((u8)vector != old) {
6780 status |= (u8)vector;
6781 vmcs_write16(GUEST_INTR_STATUS, status);
6785 static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
6788 * When running L2, updating RVI is only relevant when
6789 * vmcs12 virtual-interrupt-delivery is enabled.
6790 * However, it can be enabled only when L1 also
6791 * intercepts external interrupts, and in that case
6792 * we should not update vmcs02's RVI but instead intercept
6793 * the interrupt. Therefore, do nothing when running L2.
6795 if (!is_guest_mode(vcpu))
6796 vmx_set_rvi(max_irr);
6799 static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
6801 struct vcpu_vmx *vmx = to_vmx(vcpu);
6803 bool got_posted_interrupt;
6805 if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
6808 if (pi_test_on(&vmx->pi_desc)) {
6809 pi_clear_on(&vmx->pi_desc);
6811 * IOMMU can write to PID.ON, so the barrier matters even on UP.
6812 * But on x86 this is just a compiler barrier anyway.
6814 smp_mb__after_atomic();
6815 got_posted_interrupt =
6816 kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
6818 max_irr = kvm_lapic_find_highest_irr(vcpu);
6819 got_posted_interrupt = false;
6823 * Newly recognized interrupts are injected via either virtual interrupt
6824 * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is
6825 * disabled in two cases:
6827 * 1) If L2 is running and the vCPU has a new pending interrupt. If L1
6828 * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
6829 * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected
6830 * into L2, but KVM doesn't use virtual interrupt delivery to inject
6831 * interrupts into L2, and so KVM_REQ_EVENT is again needed.
6833 * 2) If APICv is disabled for this vCPU, assigned devices may still
6834 * attempt to post interrupts. The posted interrupt vector will cause
6835 * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
6837 if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
6838 vmx_set_rvi(max_irr);
6839 else if (got_posted_interrupt)
6840 kvm_make_request(KVM_REQ_EVENT, vcpu);
6845 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
6847 if (!kvm_vcpu_apicv_active(vcpu))
6850 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
6851 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
6852 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
6853 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
6856 static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
6858 struct vcpu_vmx *vmx = to_vmx(vcpu);
6860 pi_clear_on(&vmx->pi_desc);
6861 memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
6864 void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
6866 static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
6867 unsigned long entry)
6869 bool is_nmi = entry == (unsigned long)asm_exc_nmi_kvm_vmx;
6871 kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ);
6872 vmx_do_interrupt_nmi_irqoff(entry);
6873 kvm_after_interrupt(vcpu);
6876 static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
6879 * Save xfd_err to guest_fpu before interrupts are enabled, so the
6880 * MSR value is not clobbered by host activity before the guest
6881 * has a chance to consume it.
6883 * Do not blindly read xfd_err here, since this exception might
6884 * be caused by L1 interception on a platform which doesn't
6885 * support xfd at all.
6887 * Do it conditionally upon guest_fpu::xfd. xfd_err matters
6888 * only when xfd contains a non-zero value.
6890 * Queuing exception is done in vmx_handle_exit. See comment there.
6892 if (vcpu->arch.guest_fpu.fpstate->xfd)
6893 rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
6896 static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
6898 const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_kvm_vmx;
6899 u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
6901 /* if exit due to PF check for async PF */
6902 if (is_page_fault(intr_info))
6903 vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
6904 /* if exit due to NM, handle before interrupts are enabled */
6905 else if (is_nm_fault(intr_info))
6906 handle_nm_fault_irqoff(&vmx->vcpu);
6907 /* Handle machine checks before interrupts are enabled */
6908 else if (is_machine_check(intr_info))
6909 kvm_machine_check();
6910 /* We need to handle NMIs before interrupts are enabled */
6911 else if (is_nmi(intr_info))
6912 handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
6915 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
6917 u32 intr_info = vmx_get_intr_info(vcpu);
6918 unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
6919 gate_desc *desc = (gate_desc *)host_idt_base + vector;
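	/*
	 * With the "acknowledge interrupt on exit" VM-Exit control, the
	 * interrupt is not delivered through the host IDT automatically;
	 * dispatch it manually to the host handler for the exiting vector.
	 */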
6921 if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
6922 "unexpected VM-Exit interrupt info: 0x%x", intr_info))
6925 handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
6926 vcpu->arch.at_instruction_boundary = true;
6929 static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
6931 struct vcpu_vmx *vmx = to_vmx(vcpu);
6933 if (vmx->emulation_required)
6936 if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
6937 handle_external_interrupt_irqoff(vcpu);
6938 else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
6939 handle_exception_nmi_irqoff(vmx);
6943 * The kvm parameter can be NULL (module initialization, or invocation before
6944 * VM creation). Be sure to check the kvm parameter before using it.
6946 static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
6949 case MSR_IA32_SMBASE:
6950 if (!IS_ENABLED(CONFIG_KVM_SMM))
6953 * We cannot do SMM unless we can run the guest in big real mode.
6956 return enable_unrestricted_guest || emulate_invalid_guest_state;
6957 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
6959 case MSR_AMD64_VIRT_SPEC_CTRL:
6960 case MSR_AMD64_TSC_RATIO:
6961 /* This is AMD only. */
6968 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
6973 bool idtv_info_valid;
6975 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
6978 if (vmx->loaded_vmcs->nmi_known_unmasked)
6981 exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
6982 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
6983 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
6985 * SDM 3: 27.7.1.2 (September 2008)
6986 * Re-set bit "block by NMI" before VM entry if vmexit caused by
6987 * a guest IRET fault.
6988 * SDM 3: 23.2.2 (September 2008)
6989 * Bit 12 is undefined in any of the following cases:
6990 * If the VM exit sets the valid bit in the IDT-vectoring
6991 * information field.
6992 * If the VM exit is due to a double fault.
6994 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
6995 vector != DF_VECTOR && !idtv_info_valid)
6996 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6997 GUEST_INTR_STATE_NMI);
6999 vmx->loaded_vmcs->nmi_known_unmasked =
7000 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
7001 & GUEST_INTR_STATE_NMI);
7002 } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
7003 vmx->loaded_vmcs->vnmi_blocked_time +=
7004 ktime_to_ns(ktime_sub(ktime_get(),
7005 vmx->loaded_vmcs->entry_time));
7008 static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
7009 u32 idt_vectoring_info,
7010 int instr_len_field,
7011 int error_code_field)
7015 bool idtv_info_valid;
7017 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
7019 vcpu->arch.nmi_injected = false;
7020 kvm_clear_exception_queue(vcpu);
7021 kvm_clear_interrupt_queue(vcpu);
7023 if (!idtv_info_valid)
7026 kvm_make_request(KVM_REQ_EVENT, vcpu);
7028 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
7029 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
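	/*
	 * Re-queue whatever event was being delivered when the VM-Exit
	 * occurred so it is re-injected on the next VM-Enter; software events
	 * also need the instruction length so injection can replay them.
	 */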
7032 case INTR_TYPE_NMI_INTR:
7033 vcpu->arch.nmi_injected = true;
7035 * SDM 3: 27.7.1.2 (September 2008)
7036 * Clear bit "block by NMI" before VM entry if an NMI delivery faulted.
7039 vmx_set_nmi_mask(vcpu, false);
7041 case INTR_TYPE_SOFT_EXCEPTION:
7042 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
7044 case INTR_TYPE_HARD_EXCEPTION:
7045 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
7046 u32 err = vmcs_read32(error_code_field);
7047 kvm_requeue_exception_e(vcpu, vector, err);
7049 kvm_requeue_exception(vcpu, vector);
7051 case INTR_TYPE_SOFT_INTR:
7052 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
7054 case INTR_TYPE_EXT_INTR:
7055 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
7062 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
7064 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
7065 VM_EXIT_INSTRUCTION_LEN,
7066 IDT_VECTORING_ERROR_CODE);
7069 static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
7071 __vmx_complete_interrupts(vcpu,
7072 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
7073 VM_ENTRY_INSTRUCTION_LEN,
7074 VM_ENTRY_EXCEPTION_ERROR_CODE);
7076 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
7079 static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
7082 struct perf_guest_switch_msr *msrs;
7083 struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu);
7085 pmu->host_cross_mapped_mask = 0;
7086 if (pmu->pebs_enable & pmu->global_ctrl)
7087 intel_pmu_cross_mapped_check(pmu);
7089 /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */
7090 msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu);
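	/*
	 * MSRs whose guest and host values already match need no atomic
	 * switching and are dropped from the VM-Enter/VM-Exit lists; the rest
	 * are (re)added so the CPU swaps them around VM transitions.
	 */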
7094 for (i = 0; i < nr_msrs; i++)
7095 if (msrs[i].host == msrs[i].guest)
7096 clear_atomic_switch_msr(vmx, msrs[i].msr);
7098 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
7099 msrs[i].host, false);
7102 static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
7104 struct vcpu_vmx *vmx = to_vmx(vcpu);
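	/*
	 * Program the VMX preemption timer: an immediate-exit request arms it
	 * with zero so it fires at VM-Enter, a pending hv_deadline_tsc is
	 * converted to a host TSC delta (scaled down by the timer rate), and
	 * otherwise the timer is "soft disabled" by writing -1.
	 */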
7108 if (vmx->req_immediate_exit) {
7109 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
7110 vmx->loaded_vmcs->hv_timer_soft_disabled = false;
7111 } else if (vmx->hv_deadline_tsc != -1) {
7113 if (vmx->hv_deadline_tsc > tscl)
7114 /* set_hv_timer ensures the delta fits in 32-bits */
7115 delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
7116 cpu_preemption_timer_multi);
7120 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
7121 vmx->loaded_vmcs->hv_timer_soft_disabled = false;
7122 } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
7123 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
7124 vmx->loaded_vmcs->hv_timer_soft_disabled = true;
7128 void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
7130 if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
7131 vmx->loaded_vmcs->host_state.rsp = host_rsp;
7132 vmcs_writel(HOST_RSP, host_rsp);
7136 void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
7139 u64 hostval = this_cpu_read(x86_spec_ctrl_current);
7141 if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
7144 if (flags & VMX_RUN_SAVE_SPEC_CTRL)
7145 vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL);
7148 * If the guest/host SPEC_CTRL values differ, restore the host value.
7150 * For legacy IBRS, the IBRS bit always needs to be written after
7151 * transitioning from a less privileged predictor mode, regardless of
7152 * whether the guest/host values differ.
7154 if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
7155 vmx->spec_ctrl != hostval)
7156 native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval);
7161 static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
7163 switch (to_vmx(vcpu)->exit_reason.basic) {
7164 case EXIT_REASON_MSR_WRITE:
7165 return handle_fastpath_set_msr_irqoff(vcpu);
7166 case EXIT_REASON_PREEMPTION_TIMER:
7167 return handle_fastpath_preemption_timer(vcpu);
7169 return EXIT_FASTPATH_NONE;
7173 static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
7176 struct vcpu_vmx *vmx = to_vmx(vcpu);
7178 guest_state_enter_irqoff();
7180 /* L1D Flush includes CPU buffer clear to mitigate MDS */
7181 if (static_branch_unlikely(&vmx_l1d_should_flush))
7182 vmx_l1d_flush(vcpu);
7183 else if (static_branch_unlikely(&mds_user_clear))
7184 mds_clear_cpu_buffers();
7185 else if (static_branch_unlikely(&mmio_stale_data_clear) &&
7186 kvm_arch_has_assigned_device(vcpu->kvm))
7187 mds_clear_cpu_buffers();
7189 vmx_disable_fb_clear(vmx);
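	/*
	 * CR2 is not context switched by VMX; restore the guest's value before
	 * VM-Enter and stash it again after VM-Exit, skipping the expensive CR
	 * write when the values already match.
	 */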
7191 if (vcpu->arch.cr2 != native_read_cr2())
7192 native_write_cr2(vcpu->arch.cr2);
7194 vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
7197 vcpu->arch.cr2 = native_read_cr2();
7199 vmx_enable_fb_clear(vmx);
7201 guest_state_exit_irqoff();
7204 static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
7206 struct vcpu_vmx *vmx = to_vmx(vcpu);
7207 unsigned long cr3, cr4;
7209 /* Record the guest's net vcpu time for enforced NMI injections. */
7210 if (unlikely(!enable_vnmi &&
7211 vmx->loaded_vmcs->soft_vnmi_blocked))
7212 vmx->loaded_vmcs->entry_time = ktime_get();
7215 * Don't enter VMX if guest state is invalid, let the exit handler
7216 * start emulation until we arrive back to a valid state. Synthesize a
7217 * consistency check VM-Exit due to invalid guest state and bail.
7219 if (unlikely(vmx->emulation_required)) {
7222 vmx->exit_reason.full = EXIT_REASON_INVALID_STATE;
7223 vmx->exit_reason.failed_vmentry = 1;
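		/*
		 * Mark the exit-info registers as available so the exit
		 * handlers consume the synthesized values below instead of
		 * VMREADing stale fields from a VMCS that was never entered.
		 */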
7224 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
7225 vmx->exit_qualification = ENTRY_FAIL_DEFAULT;
7226 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
7227 vmx->exit_intr_info = 0;
7228 return EXIT_FASTPATH_NONE;
7231 trace_kvm_entry(vcpu);
7233 if (vmx->ple_window_dirty) {
7234 vmx->ple_window_dirty = false;
7235 vmcs_write32(PLE_WINDOW, vmx->ple_window);
7239 * We did this in prepare_switch_to_guest, because it needs to
7240 * be within srcu_read_lock.
7242 WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);
7244 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
7245 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
7246 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
7247 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
7248 vcpu->arch.regs_dirty = 0;
7251 * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately
7252 * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
7253 * it switches back to the current->mm, which can occur in KVM context
7254 * when switching to a temporary mm to patch kernel code, e.g. if KVM
7255 * toggles a static key while handling a VM-Exit.
7257 cr3 = __get_current_cr3_fast();
7258 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
7259 vmcs_writel(HOST_CR3, cr3);
7260 vmx->loaded_vmcs->host_state.cr3 = cr3;
7263 cr4 = cr4_read_shadow();
7264 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
7265 vmcs_writel(HOST_CR4, cr4);
7266 vmx->loaded_vmcs->host_state.cr4 = cr4;
7269 /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
7270 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
7271 set_debugreg(vcpu->arch.dr6, 6);
7273 /* When single-stepping over STI and MOV SS, we must clear the
7274 * corresponding interruptibility bits in the guest state. Otherwise
7275 * vmentry fails as it then expects bit 14 (BS) to be set in the pending
7276 * debug exceptions field, but that's not correct for the guest debugging case.
7278 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
7279 vmx_set_interrupt_shadow(vcpu, 0);
7281 kvm_load_guest_xsave_state(vcpu);
7283 pt_guest_enter(vmx);
7285 atomic_switch_perf_msrs(vmx);
7286 if (intel_pmu_lbr_is_enabled(vcpu))
7287 vmx_passthrough_lbr_msrs(vcpu);
7289 if (enable_preemption_timer)
7290 vmx_update_hv_timer(vcpu);
7292 kvm_wait_lapic_expire(vcpu);
7294 /* The actual VMENTER/EXIT is in the .noinstr.text section. */
7295 vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx));
7297 /* All fields are clean at this point */
7298 if (static_branch_unlikely(&enable_evmcs)) {
7299 current_evmcs->hv_clean_fields |=
7300 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
7302 current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
7305 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
7306 if (vmx->host_debugctlmsr)
7307 update_debugctlmsr(vmx->host_debugctlmsr);
7309 #ifndef CONFIG_X86_64
7311 * The sysexit path does not restore ds/es, so we must set them to
7312 * a reasonable value ourselves.
7314 * We can't defer this to vmx_prepare_switch_to_host() since that
7315 * function may be executed in interrupt context, which saves and
7316 * restores segments around it, nullifying its effect.
7318 loadsegment(ds, __USER_DS);
7319 loadsegment(es, __USER_DS);
7322 vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
7326 kvm_load_host_xsave_state(vcpu);
7328 if (is_guest_mode(vcpu)) {
7330 * Track VMLAUNCH/VMRESUME that have made it past guest state checking.
7333 if (vmx->nested.nested_run_pending &&
7334 !vmx->exit_reason.failed_vmentry)
7335 ++vcpu->stat.nested_run;
7337 vmx->nested.nested_run_pending = 0;
7340 vmx->idt_vectoring_info = 0;
7342 if (unlikely(vmx->fail)) {
7343 vmx->exit_reason.full = 0xdead;
7344 return EXIT_FASTPATH_NONE;
7347 vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
7348 if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
7349 kvm_machine_check();
7351 if (likely(!vmx->exit_reason.failed_vmentry))
7352 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
7354 trace_kvm_exit(vcpu, KVM_ISA_VMX);
7356 if (unlikely(vmx->exit_reason.failed_vmentry))
7357 return EXIT_FASTPATH_NONE;
7359 vmx->loaded_vmcs->launched = 1;
7361 vmx_recover_nmi_blocking(vmx);
7362 vmx_complete_interrupts(vmx);
7364 if (is_guest_mode(vcpu))
7365 return EXIT_FASTPATH_NONE;
7367 return vmx_exit_handlers_fastpath(vcpu);
7370 static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
7372 struct vcpu_vmx *vmx = to_vmx(vcpu);
7375 vmx_destroy_pml_buffer(vmx);
7376 free_vpid(vmx->vpid);
7377 nested_vmx_free_vcpu(vcpu);
7378 free_loaded_vmcs(vmx->loaded_vmcs);
7381 static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
7383 struct vmx_uret_msr *tsx_ctrl;
7384 struct vcpu_vmx *vmx;
7387 BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
7390 INIT_LIST_HEAD(&vmx->pi_wakeup_list);
7394 vmx->vpid = allocate_vpid();
7397 * If PML is turned on, failure to enable PML simply results in failure
7398 * to create the vcpu, which lets us keep the PML logic simple (e.g. no
7399 * need to deal with cases such as PML being enabled for only some of
7400 * the guest's vcpus).
7403 vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
7408 for (i = 0; i < kvm_nr_uret_msrs; ++i)
7409 vmx->guest_uret_msrs[i].mask = -1ull;
7410 if (boot_cpu_has(X86_FEATURE_RTM)) {
7412 * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
7413 * Keep the host value unchanged to avoid changing CPUID bits
7414 * under the host kernel's feet.
7416 tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
7418 tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
7421 err = alloc_loaded_vmcs(&vmx->vmcs01);
7426 * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
7427 * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
7428 * feature only for vmcs01; KVM currently isn't equipped to realize any
7429 * performance benefits from enabling it for vmcs02.
7431 if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) &&
7432 (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
7433 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
7435 evmcs->hv_enlightenments_control.msr_bitmap = 1;
7438 /* The MSR bitmap starts with all ones */
7439 bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
7440 bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
7442 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
7443 #ifdef CONFIG_X86_64
7444 vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
7445 vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
7446 vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
7448 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
7449 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
7450 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
7451 if (kvm_cstate_in_guest(vcpu->kvm)) {
7452 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
7453 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
7454 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
7455 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
7458 vmx->loaded_vmcs = &vmx->vmcs01;
7460 if (cpu_need_virtualize_apic_accesses(vcpu)) {
7461 err = kvm_alloc_apic_access_page(vcpu->kvm);
7466 if (enable_ept && !enable_unrestricted_guest) {
7467 err = init_rmode_identity_map(vcpu->kvm);
7472 if (vmx_can_use_ipiv(vcpu))
7473 WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
7474 __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID);
7479 free_loaded_vmcs(vmx->loaded_vmcs);
7481 vmx_destroy_pml_buffer(vmx);
7483 free_vpid(vmx->vpid);
7487 #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7488 #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7490 static int vmx_vm_init(struct kvm *kvm)
7493 kvm->arch.pause_in_guest = true;
7495 if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
7496 switch (l1tf_mitigation) {
7497 case L1TF_MITIGATION_OFF:
7498 case L1TF_MITIGATION_FLUSH_NOWARN:
7499 /* 'I explicitly don't care' is set */
7501 case L1TF_MITIGATION_FLUSH:
7502 case L1TF_MITIGATION_FLUSH_NOSMT:
7503 case L1TF_MITIGATION_FULL:
7505 * Warn upon starting the first VM in a potentially
7506 * insecure environment.
7508 if (sched_smt_active())
7509 pr_warn_once(L1TF_MSG_SMT);
7510 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
7511 pr_warn_once(L1TF_MSG_L1D);
7513 case L1TF_MITIGATION_FULL_FORCE:
7514 /* Flush is enforced */
7521 static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
7525 /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
7526 * memory aliases with conflicting memory types and sometimes MCEs.
7527 * We have to be careful as to what is honored and when.
7529 * For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to
7530 * UC. The effective memory type is UC or WC depending on guest PAT.
7531 * This was historically the source of MCEs and we want to be conservative.
7534 * When there is no need to deal with noncoherent DMA (e.g., no VT-d
7535 * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The
7536 * EPT memory type is set to WB. The effective memory type is forced WB.
7539 * Otherwise, we trust the guest. Guest CD/MTRR/PAT are all honored. The
7540 * EPT memory type is used to emulate guest CD/MTRR.
7544 return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
7546 if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
7547 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
7549 if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
7550 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
7551 cache = MTRR_TYPE_WRBACK;
7553 cache = MTRR_TYPE_UNCACHABLE;
7555 return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
7558 return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
7561 static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
7564 * These bits in the secondary execution controls field
7565 * are dynamic; the others are mostly based on the hypervisor
7566 * architecture and the guest's CPUID. Do not touch the dynamic bits.
7570 SECONDARY_EXEC_SHADOW_VMCS |
7571 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
7572 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
7573 SECONDARY_EXEC_DESC;
7575 u32 cur_ctl = secondary_exec_controls_get(vmx);
7577 secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
7581 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
7582 * (indicating "allowed-1") if they are supported in the guest's CPUID.
7584 static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
7586 struct vcpu_vmx *vmx = to_vmx(vcpu);
7587 struct kvm_cpuid_entry2 *entry;
7589 vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
7590 vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
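	/*
	 * CR0's allowed-1 bits do not depend on CPUID, so report all bits as
	 * allowed-1. CR4 starts with only PCE, which isn't tied to a CPUID
	 * feature bit, and gains bits below as the matching features are
	 * exposed in the guest's CPUID.
	 */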
7592 #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \
7593 if (entry && (entry->_reg & (_cpuid_mask))) \
7594 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \
7597 entry = kvm_find_cpuid_entry(vcpu, 0x1);
7598 cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME));
7599 cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME));
7600 cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC));
7601 cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE));
7602 cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE));
7603 cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE));
7604 cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE));
7605 cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE));
7606 cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR));
7607 cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
7608 cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX));
7609 cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX));
7610 cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID));
7611 cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE));
7613 entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0);
7614 cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE));
7615 cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP));
7616 cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP));
7617 cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU));
7618 cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP));
7619 cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57));
7621 #undef cr4_fixed1_update
7624 static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
7626 struct vcpu_vmx *vmx = to_vmx(vcpu);
7627 struct kvm_cpuid_entry2 *best = NULL;
7630 for (i = 0; i < PT_CPUID_LEAVES; i++) {
7631 best = kvm_find_cpuid_entry_index(vcpu, 0x14, i);
7634 vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
7635 vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
7636 vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
7637 vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
7640 /* Get the number of configurable Address Ranges for filtering */
7641 vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
7642 PT_CAP_num_address_ranges);
7644 /* Initialize the RTIT_CTL bitmask; bits with no CPUID dependency are cleared (always allowed). */
7645 vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
7646 RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
7647 RTIT_CTL_BRANCH_EN);
7650 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1, CR3Filter can be set; otherwise
7651 * setting it will inject a #GP.
7653 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
7654 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
7657 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
7658 * PSBFreq can be set
7660 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
7661 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
7662 RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
7665 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
7667 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
7668 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
7669 RTIT_CTL_MTC_RANGE);
7671 /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
7672 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
7673 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
7676 /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
7677 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
7678 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
7680 /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
7681 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
7682 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
7684 /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
7685 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
7686 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
7688 /* Unmask the address range configuration area. */
7689 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
7690 vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
7693 static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
7695 struct vcpu_vmx *vmx = to_vmx(vcpu);
7697 /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
7698 vcpu->arch.xsaves_enabled = false;
7700 vmx_setup_uret_msrs(vmx);
7702 if (cpu_has_secondary_exec_ctrls())
7703 vmcs_set_secondary_exec_control(vmx,
7704 vmx_secondary_exec_control(vmx));
7706 if (nested_vmx_allowed(vcpu))
7707 vmx->msr_ia32_feature_control_valid_bits |=
7708 FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7709 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
7711 vmx->msr_ia32_feature_control_valid_bits &=
7712 ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7713 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
7715 if (nested_vmx_allowed(vcpu))
7716 nested_vmx_cr_fixed1_bits_update(vcpu);
7718 if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
7719 guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
7720 update_intel_pt_cfg(vcpu);
7722 if (boot_cpu_has(X86_FEATURE_RTM)) {
7723 struct vmx_uret_msr *msr;
7724 msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
7726 bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
7727 vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
7731 if (kvm_cpu_cap_has(X86_FEATURE_XFD))
7732 vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
7733 !guest_cpuid_has(vcpu, X86_FEATURE_XFD));
7736 set_cr4_guest_host_mask(vmx);
7738 vmx_write_encls_bitmap(vcpu, NULL);
7739 if (guest_cpuid_has(vcpu, X86_FEATURE_SGX))
7740 vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
7742 vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
7744 if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
7745 vmx->msr_ia32_feature_control_valid_bits |=
7746 FEAT_CTL_SGX_LC_ENABLED;
7748 vmx->msr_ia32_feature_control_valid_bits &=
7749 ~FEAT_CTL_SGX_LC_ENABLED;
7751 /* Refresh #PF interception to account for MAXPHYADDR changes. */
7752 vmx_update_exception_bitmap(vcpu);
7755 static u64 vmx_get_perf_capabilities(void)
7757 u64 perf_cap = PMU_CAP_FW_WRITES;
7758 struct x86_pmu_lbr lbr;
7759 u64 host_perf_cap = 0;
7764 if (boot_cpu_has(X86_FEATURE_PDCM))
7765 rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
7767 x86_perf_get_lbr(&lbr);
7769 perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
7771 if (vmx_pebs_supported()) {
7772 perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
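		/*
		 * PEBS Baseline requires adaptive records, i.e. PEBS format 4
		 * or later; don't advertise it on older formats.
		 */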
7773 if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4)
7774 perf_cap &= ~PERF_CAP_PEBS_BASELINE;
7780 static __init void vmx_set_cpu_caps(void)
7786 kvm_cpu_cap_set(X86_FEATURE_VMX);
7789 if (kvm_mpx_supported())
7790 kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
7791 if (!cpu_has_vmx_invpcid())
7792 kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
7793 if (vmx_pt_mode_is_host_guest())
7794 kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
7795 if (vmx_pebs_supported()) {
7796 kvm_cpu_cap_check_and_set(X86_FEATURE_DS);
7797 kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64);
7801 kvm_cpu_cap_clear(X86_FEATURE_PDCM);
7802 kvm_caps.supported_perf_cap = vmx_get_perf_capabilities();
7805 kvm_cpu_cap_clear(X86_FEATURE_SGX);
7806 kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
7807 kvm_cpu_cap_clear(X86_FEATURE_SGX1);
7808 kvm_cpu_cap_clear(X86_FEATURE_SGX2);
7811 if (vmx_umip_emulated())
7812 kvm_cpu_cap_set(X86_FEATURE_UMIP);
7815 kvm_caps.supported_xss = 0;
7816 if (!cpu_has_vmx_xsaves())
7817 kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
7819 /* CPUID 0x80000001 and 0x7 (RDPID) */
7820 if (!cpu_has_vmx_rdtscp()) {
7821 kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
7822 kvm_cpu_cap_clear(X86_FEATURE_RDPID);
7825 if (cpu_has_vmx_waitpkg())
7826 kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
7829 static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
7831 to_vmx(vcpu)->req_immediate_exit = true;
7834 static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
7835 struct x86_instruction_info *info)
7837 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7838 unsigned short port;
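	/*
	 * For IN/INS the port number is the source operand and the access
	 * size comes from the destination; for OUT/OUTS it's the reverse.
	 */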
7842 if (info->intercept == x86_intercept_in ||
7843 info->intercept == x86_intercept_ins) {
7844 port = info->src_val;
7845 size = info->dst_bytes;
7847 port = info->dst_val;
7848 size = info->src_bytes;
7852 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
7853 * VM-exits depend on the 'unconditional IO exiting' VM-execution control.
7856 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
7858 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
7859 intercept = nested_cpu_has(vmcs12,
7860 CPU_BASED_UNCOND_IO_EXITING);
7862 intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
7864 /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
7865 return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
7868 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
7869 struct x86_instruction_info *info,
7870 enum x86_intercept_stage stage,
7871 struct x86_exception *exception)
7873 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7875 switch (info->intercept) {
7877 * RDPID causes #UD if disabled through secondary execution controls.
7878 * Because it is marked as EmulateOnUD, we need to intercept it here.
7879 * Note, RDPID is hidden behind ENABLE_RDTSCP.
7881 case x86_intercept_rdpid:
7882 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
7883 exception->vector = UD_VECTOR;
7884 exception->error_code_valid = false;
7885 return X86EMUL_PROPAGATE_FAULT;
7889 case x86_intercept_in:
7890 case x86_intercept_ins:
7891 case x86_intercept_out:
7892 case x86_intercept_outs:
7893 return vmx_check_intercept_io(vcpu, info);
7895 case x86_intercept_lgdt:
7896 case x86_intercept_lidt:
7897 case x86_intercept_lldt:
7898 case x86_intercept_ltr:
7899 case x86_intercept_sgdt:
7900 case x86_intercept_sidt:
7901 case x86_intercept_sldt:
7902 case x86_intercept_str:
7903 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
7904 return X86EMUL_CONTINUE;
7906 /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
7909 /* TODO: check more intercepts... */
7914 return X86EMUL_UNHANDLEABLE;
7917 #ifdef CONFIG_X86_64
7918 /* Compute (a << shift) / divisor; returns 1 on overflow, otherwise 0. */
7919 static inline int u64_shl_div_u64(u64 a, unsigned int shift,
7920 u64 divisor, u64 *result)
7922 u64 low = a << shift, high = a >> (64 - shift);
7924 /* To avoid the overflow on divq */
7925 if (high >= divisor)
7929 /* Low holds the result; high holds the remainder, which is discarded. */
7929 asm("divq %2\n\t" : "=a" (low), "=d" (high) :
7930 "rm" (divisor), "0" (low), "1" (high));
7936 static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
7939 struct vcpu_vmx *vmx;
7940 u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
7941 struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
7945 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
7946 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
7947 lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
7948 ktimer->timer_advance_ns);
7950 if (delta_tsc > lapic_timer_advance_cycles)
7951 delta_tsc -= lapic_timer_advance_cycles;
7955 /* Convert to host delta tsc if tsc scaling is enabled */
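	/*
	 * The L1 TSC scaling ratio is (guest_freq / host_freq) shifted left by
	 * tsc_scaling_ratio_frac_bits, so the host delta is
	 * (delta_tsc << frac_bits) / ratio; u64_shl_div_u64() performs that
	 * wide division and fails if the quotient overflows 64 bits.
	 */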
7956 if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio &&
7957 delta_tsc && u64_shl_div_u64(delta_tsc,
7958 kvm_caps.tsc_scaling_ratio_frac_bits,
7959 vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
7963 * If the delta TSC doesn't fit in 32 bits after being shifted right by
7964 * the preemption timer rate, we can't use the preemption timer.
7965 * It's possible that it would fit on later vmentries, but checking
7966 * on every vmentry is costly, so we just fall back to an hrtimer.
7968 if (delta_tsc >> (cpu_preemption_timer_multi + 32))
7971 vmx->hv_deadline_tsc = tscl + delta_tsc;
7972 *expired = !delta_tsc;
7976 static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
7978 to_vmx(vcpu)->hv_deadline_tsc = -1;
7982 static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
7984 if (!kvm_pause_in_guest(vcpu->kvm))
7985 shrink_ple_window(vcpu);
7988 void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
7990 struct vcpu_vmx *vmx = to_vmx(vcpu);
7992 if (is_guest_mode(vcpu)) {
7993 vmx->nested.update_vmcs01_cpu_dirty_logging = true;
7998 * Note, cpu_dirty_logging_count can be changed concurrently with this
7999 * code, but in that case another update request will be made and so
8000 * the guest will never run with a stale PML value.
8002 if (vcpu->kvm->arch.cpu_dirty_logging_count)
8003 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
8005 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
8008 static void vmx_setup_mce(struct kvm_vcpu *vcpu)
8010 if (vcpu->arch.mcg_cap & MCG_LMCE_P)
8011 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
8012 FEAT_CTL_LMCE_ENABLED;
8014 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
8015 ~FEAT_CTL_LMCE_ENABLED;
8018 #ifdef CONFIG_KVM_SMM
8019 static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
8021 /* we need a nested vmexit to enter SMM, postpone if run is pending */
8022 if (to_vmx(vcpu)->nested.nested_run_pending)
8024 return !is_smm(vcpu);
8027 static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
8029 struct vcpu_vmx *vmx = to_vmx(vcpu);
8032 * TODO: Implement custom flows for forcing the vCPU out/in of L2 on
8033 * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong, as
8034 * SMI and RSM only modify state that is saved and restored via SMRAM.
8035 * E.g. most MSRs are left untouched, but many are modified by VM-Exit
8036 * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM.
8038 vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
8039 if (vmx->nested.smm.guest_mode)
8040 nested_vmx_vmexit(vcpu, -1, 0, 0);
8042 vmx->nested.smm.vmxon = vmx->nested.vmxon;
8043 vmx->nested.vmxon = false;
8044 vmx_clear_hlt(vcpu);
8048 static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
8050 struct vcpu_vmx *vmx = to_vmx(vcpu);
8053 if (vmx->nested.smm.vmxon) {
8054 vmx->nested.vmxon = true;
8055 vmx->nested.smm.vmxon = false;
8058 if (vmx->nested.smm.guest_mode) {
8059 ret = nested_vmx_enter_non_root_mode(vcpu, false);
8063 vmx->nested.nested_run_pending = 1;
8064 vmx->nested.smm.guest_mode = false;
8069 static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
8071 /* RSM will cause a vmexit anyway. */
8075 static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
8077 return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
8080 static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
8082 if (is_guest_mode(vcpu)) {
8083 struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
8085 if (hrtimer_try_to_cancel(timer) == 1)
8086 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
8090 static void vmx_hardware_unsetup(void)
8092 kvm_set_posted_intr_wakeup_handler(NULL);
8095 nested_vmx_hardware_unsetup();
8100 #define VMX_REQUIRED_APICV_INHIBITS \
8102 BIT(APICV_INHIBIT_REASON_DISABLE)| \
8103 BIT(APICV_INHIBIT_REASON_ABSENT) | \
8104 BIT(APICV_INHIBIT_REASON_HYPERV) | \
8105 BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
8106 BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
8107 BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
8108 BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) \
8111 static void vmx_vm_destroy(struct kvm *kvm)
8113 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
8115 free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
8118 static struct kvm_x86_ops vmx_x86_ops __initdata = {
8119 .name = KBUILD_MODNAME,
8121 .check_processor_compatibility = vmx_check_processor_compat,
8123 .hardware_unsetup = vmx_hardware_unsetup,
8125 .hardware_enable = vmx_hardware_enable,
8126 .hardware_disable = vmx_hardware_disable,
8127 .has_emulated_msr = vmx_has_emulated_msr,
8129 .vm_size = sizeof(struct kvm_vmx),
8130 .vm_init = vmx_vm_init,
8131 .vm_destroy = vmx_vm_destroy,
8133 .vcpu_precreate = vmx_vcpu_precreate,
8134 .vcpu_create = vmx_vcpu_create,
8135 .vcpu_free = vmx_vcpu_free,
8136 .vcpu_reset = vmx_vcpu_reset,
8138 .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
8139 .vcpu_load = vmx_vcpu_load,
8140 .vcpu_put = vmx_vcpu_put,
8142 .update_exception_bitmap = vmx_update_exception_bitmap,
8143 .get_msr_feature = vmx_get_msr_feature,
8144 .get_msr = vmx_get_msr,
8145 .set_msr = vmx_set_msr,
8146 .get_segment_base = vmx_get_segment_base,
8147 .get_segment = vmx_get_segment,
8148 .set_segment = vmx_set_segment,
8149 .get_cpl = vmx_get_cpl,
8150 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
8151 .set_cr0 = vmx_set_cr0,
8152 .is_valid_cr4 = vmx_is_valid_cr4,
8153 .set_cr4 = vmx_set_cr4,
8154 .set_efer = vmx_set_efer,
8155 .get_idt = vmx_get_idt,
8156 .set_idt = vmx_set_idt,
8157 .get_gdt = vmx_get_gdt,
8158 .set_gdt = vmx_set_gdt,
8159 .set_dr7 = vmx_set_dr7,
8160 .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
8161 .cache_reg = vmx_cache_reg,
8162 .get_rflags = vmx_get_rflags,
8163 .set_rflags = vmx_set_rflags,
8164 .get_if_flag = vmx_get_if_flag,
8166 .flush_tlb_all = vmx_flush_tlb_all,
8167 .flush_tlb_current = vmx_flush_tlb_current,
8168 .flush_tlb_gva = vmx_flush_tlb_gva,
8169 .flush_tlb_guest = vmx_flush_tlb_guest,
8171 .vcpu_pre_run = vmx_vcpu_pre_run,
8172 .vcpu_run = vmx_vcpu_run,
8173 .handle_exit = vmx_handle_exit,
8174 .skip_emulated_instruction = vmx_skip_emulated_instruction,
8175 .update_emulated_instruction = vmx_update_emulated_instruction,
8176 .set_interrupt_shadow = vmx_set_interrupt_shadow,
8177 .get_interrupt_shadow = vmx_get_interrupt_shadow,
8178 .patch_hypercall = vmx_patch_hypercall,
8179 .inject_irq = vmx_inject_irq,
8180 .inject_nmi = vmx_inject_nmi,
8181 .inject_exception = vmx_inject_exception,
8182 .cancel_injection = vmx_cancel_injection,
8183 .interrupt_allowed = vmx_interrupt_allowed,
8184 .nmi_allowed = vmx_nmi_allowed,
8185 .get_nmi_mask = vmx_get_nmi_mask,
8186 .set_nmi_mask = vmx_set_nmi_mask,
8187 .enable_nmi_window = vmx_enable_nmi_window,
8188 .enable_irq_window = vmx_enable_irq_window,
8189 .update_cr8_intercept = vmx_update_cr8_intercept,
8190 .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
8191 .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
8192 .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
8193 .load_eoi_exitmap = vmx_load_eoi_exitmap,
8194 .apicv_post_state_restore = vmx_apicv_post_state_restore,
8195 .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
8196 .hwapic_irr_update = vmx_hwapic_irr_update,
8197 .hwapic_isr_update = vmx_hwapic_isr_update,
8198 .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
8199 .sync_pir_to_irr = vmx_sync_pir_to_irr,
8200 .deliver_interrupt = vmx_deliver_interrupt,
8201 .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
8203 .set_tss_addr = vmx_set_tss_addr,
8204 .set_identity_map_addr = vmx_set_identity_map_addr,
8205 .get_mt_mask = vmx_get_mt_mask,
8207 .get_exit_info = vmx_get_exit_info,
8209 .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
8211 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
8213 .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
8214 .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
8215 .write_tsc_offset = vmx_write_tsc_offset,
8216 .write_tsc_multiplier = vmx_write_tsc_multiplier,
8218 .load_mmu_pgd = vmx_load_mmu_pgd,
8220 .check_intercept = vmx_check_intercept,
8221 .handle_exit_irqoff = vmx_handle_exit_irqoff,
8223 .request_immediate_exit = vmx_request_immediate_exit,
8225 .sched_in = vmx_sched_in,
8227 .cpu_dirty_log_size = PML_ENTITY_NUM,
8228 .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
8230 .nested_ops = &vmx_nested_ops,
8232 .pi_update_irte = vmx_pi_update_irte,
8233 .pi_start_assignment = vmx_pi_start_assignment,
#ifdef CONFIG_X86_64
	.set_hv_timer = vmx_set_hv_timer,
	.cancel_hv_timer = vmx_cancel_hv_timer,
#endif
8240 .setup_mce = vmx_setup_mce,
#ifdef CONFIG_KVM_SMM
	.smi_allowed = vmx_smi_allowed,
	.enter_smm = vmx_enter_smm,
	.leave_smm = vmx_leave_smm,
	.enable_smi_window = vmx_enable_smi_window,
#endif
8249 .can_emulate_instruction = vmx_can_emulate_instruction,
8250 .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
8251 .migrate_timers = vmx_migrate_timers,
8253 .msr_filter_changed = vmx_msr_filter_changed,
8254 .complete_emulated_msr = kvm_complete_insn_gp,
	.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
};
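/*
 * PMI handler installed as a perf callback when Intel PT is exposed to the
 * guest (PT "host/guest" mode): forward Processor Trace ToPA PMIs that fire
 * while the guest is running to the guest's PMU.
 */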
static unsigned int vmx_handle_intel_pt_intr(void)
{
	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();

	/* '0' on failure so that the !PT case can use a RET0 static call. */
	if (!vcpu || !kvm_handling_nmi_from_guest(vcpu))
		return 0;

	kvm_make_request(KVM_REQ_PMI, vcpu);
	__set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
		  (unsigned long *)&vcpu->arch.pmu.global_status);
	return 1;
}
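/*
 * Register the MSRs that KVM may load with guest values and that must be
 * restored to host values before returning to userspace, via the common
 * user-return MSR machinery.
 */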
static __init void vmx_setup_user_return_msrs(void)
{
	/*
	 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
	 * will emulate SYSCALL in legacy mode if the vendor string in guest
	 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
	 * support this emulation, MSR_STAR is included in the list for i386,
	 * but is never loaded into hardware.  MSR_CSTAR is also never loaded
	 * into hardware and is here purely for emulation purposes.
	 */
	const u32 vmx_uret_msrs_list[] = {
	#ifdef CONFIG_X86_64
		MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
	#endif
		MSR_EFER, MSR_TSC_AUX, MSR_STAR,
		MSR_IA32_TSX_CTRL,
	};
	int i;

	BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);

	for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
		kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
}
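/*
 * Treat MKTME KeyID bits as reserved in SPTEs.  For example (illustrative
 * numbers only): on a part with MAXPHYADDR = 46 and 6 KeyID bits,
 * boot_cpu_data.x86_phys_bits is 40, so bits 45:40 are KeyID bits and must
 * never be set in a shadow/EPT PTE; the actual split comes from CPUID/MSRs.
 */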
static void __init vmx_setup_me_spte_mask(void)
{
	u64 me_mask = 0;

	/*
	 * kvm_get_shadow_phys_bits() returns shadow_phys_bits.  Use
	 * the former to avoid exposing shadow_phys_bits.
	 *
	 * On pre-MKTME systems, boot_cpu_data.x86_phys_bits equals
	 * shadow_phys_bits.  On MKTME and/or TDX capable systems,
	 * boot_cpu_data.x86_phys_bits holds the actual physical address
	 * width without the KeyID bits, and shadow_phys_bits equals the
	 * MAXPHYADDR reported by CPUID.  The bits in between are KeyID bits.
	 */
	if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits())
		me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
			kvm_get_shadow_phys_bits() - 1);
	/*
	 * Unlike SME, the host kernel doesn't support setting up any
	 * MKTME KeyID on Intel platforms.  No memory encryption
	 * bits should be included in the SPTE.
	 */
	kvm_mmu_set_me_spte_mask(0, me_mask);
}
8324 static struct kvm_x86_init_ops vmx_init_ops __initdata;
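/*
 * One-time setup at module load: read the VMX capabilities from the VMCS
 * config MSRs, clamp the module parameters to what the CPU actually
 * supports, and fill in kvm_caps before any CPU is put into VMX operation.
 */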
static __init int hardware_setup(void)
{
	unsigned long host_bndcfgs;
	struct desc_ptr dt;
	int r;

	store_idt(&dt);
	host_idt_base = dt.address;

	vmx_setup_user_return_msrs();

	if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
		return -EIO;
8340 if (cpu_has_perf_global_ctrl_bug())
8341 pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
8342 "does not work properly. Using workaround\n");
8344 if (boot_cpu_has(X86_FEATURE_NX))
8345 kvm_enable_efer_bits(EFER_NX);
	if (boot_cpu_has(X86_FEATURE_MPX)) {
		rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
		WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
	}
8352 if (!cpu_has_vmx_mpx())
8353 kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
8354 XFEATURE_MASK_BNDCSR);
	if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
	    !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
		enable_vpid = 0;
	if (!cpu_has_vmx_ept() ||
	    !cpu_has_vmx_ept_4levels() ||
	    !cpu_has_vmx_ept_mt_wb() ||
	    !cpu_has_vmx_invept_global())
		enable_ept = 0;

	/* NX support is required for shadow paging. */
	if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
		pr_err_ratelimited("NX (Execute Disable) not supported\n");
		return -EOPNOTSUPP;
	}
8372 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
8373 enable_ept_ad_bits = 0;
8375 if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
8376 enable_unrestricted_guest = 0;
8378 if (!cpu_has_vmx_flexpriority())
8379 flexpriority_enabled = 0;
	if (!cpu_has_virtual_nmis())
		enable_vnmi = 0;

#ifdef CONFIG_X86_SGX_KVM
	if (!cpu_has_vmx_encls_vmexit())
		enable_sgx = false;
#endif
	/*
	 * set_apic_access_page_addr() is used to reload apic access
	 * page upon invalidation.  No need to do anything if not
	 * using the APIC_ACCESS_ADDR VMCS field.
	 */
	if (!flexpriority_enabled)
		vmx_x86_ops.set_apic_access_page_addr = NULL;
8397 if (!cpu_has_vmx_tpr_shadow())
8398 vmx_x86_ops.update_cr8_intercept = NULL;
#if IS_ENABLED(CONFIG_HYPERV)
	if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
	    && enable_ept) {
		vmx_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
		vmx_x86_ops.tlb_remote_flush_with_range =
				hv_remote_flush_tlb_with_range;
	}
#endif
	if (!cpu_has_vmx_ple()) {
		ple_gap = 0;
		ple_window = 0;
		ple_window_grow = 0;
		ple_window_max = 0;
		ple_window_shrink = 0;
	}
	if (!cpu_has_vmx_apicv())
		enable_apicv = 0;
	if (!enable_apicv)
		vmx_x86_ops.sync_pir_to_irr = NULL;
8422 if (!enable_apicv || !cpu_has_vmx_ipiv())
8423 enable_ipiv = false;
8425 if (cpu_has_vmx_tsc_scaling())
8426 kvm_caps.has_tsc_control = true;
8428 kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
8429 kvm_caps.tsc_scaling_ratio_frac_bits = 48;
8430 kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
8431 kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit();
	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */

	if (enable_ept)
		kvm_mmu_set_ept_masks(enable_ept_ad_bits,
				      cpu_has_vmx_ept_execute_only());
	/*
	 * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID
	 * bits to shadow_zero_check.
	 */
	vmx_setup_me_spte_mask();
8445 kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
8446 ept_caps_to_lpage_level(vmx_capability.ept));
	/*
	 * Only enable PML when hardware supports PML feature, and both EPT
	 * and EPT A/D bit features are enabled -- PML depends on them to work.
	 */
	if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
		enable_pml = 0;

	if (!enable_pml)
		vmx_x86_ops.cpu_dirty_log_size = 0;
8458 if (!cpu_has_vmx_preemption_timer())
8459 enable_preemption_timer = false;
	if (enable_preemption_timer) {
		u64 use_timer_freq = 5000ULL * 1000 * 1000;

		cpu_preemption_timer_multi =
			vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;

		if (tsc_khz)
			use_timer_freq = (u64)tsc_khz * 1000;
		use_timer_freq >>= cpu_preemption_timer_multi;

		/*
		 * KVM "disables" the preemption timer by setting it to its max
		 * value.  Don't use the timer if it might cause spurious exits
		 * at a rate faster than 0.1 Hz (of uninterrupted guest time).
		 */
		if (use_timer_freq > 0xffffffffu / 10)
			enable_preemption_timer = false;
	}
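	/*
	 * For example (illustrative numbers only): with a 2 GHz TSC and a
	 * preemption timer rate of 2^5 TSC cycles per tick, use_timer_freq is
	 * 2e9 >> 5 = 62.5 MHz, well under the ~429 MHz (0xffffffff / 10)
	 * cutoff above, so the timer stays enabled.  If the timer is unusable,
	 * fall back to __kvm_request_immediate_exit below.
	 */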
	if (!enable_preemption_timer) {
		vmx_x86_ops.set_hv_timer = NULL;
		vmx_x86_ops.cancel_hv_timer = NULL;
		vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
	}
8486 kvm_caps.supported_mce_cap |= MCG_LMCE_P;
8487 kvm_caps.supported_mce_cap |= MCG_CMCI_P;
	if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
		return -EINVAL;
	if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt())
		pt_mode = PT_MODE_SYSTEM;
	if (pt_mode == PT_MODE_HOST_GUEST)
		vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
	else
		vmx_init_ops.handle_intel_pt_intr = NULL;
	setup_default_sgx_lepubkeyhash();

	if (nested) {
		nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept);
		r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
		if (r)
			return r;
	}

	vmx_set_cpu_caps();

	r = alloc_kvm_area();
	if (r && nested)
		nested_vmx_hardware_unsetup();

	kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
	return r;
}
static struct kvm_x86_init_ops vmx_init_ops __initdata = {
	.hardware_setup = hardware_setup,
	.handle_intel_pt_intr = NULL,

	.runtime_ops = &vmx_x86_ops,
	.pmu_ops = &intel_pmu_ops,
};
static void vmx_cleanup_l1d_flush(void)
{
	if (vmx_l1d_flush_pages) {
		free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
		vmx_l1d_flush_pages = NULL;
	}
	/* Restore state so sysfs ignores VMX */
	l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
}
static void __vmx_exit(void)
{
	allow_smaller_maxphyaddr = false;

#ifdef CONFIG_KEXEC_CORE
	RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
	synchronize_rcu();
#endif
	vmx_cleanup_l1d_flush();
}
static void vmx_exit(void)
{
	kvm_exit();
	kvm_x86_vendor_exit();

	__vmx_exit();
}
8555 module_exit(vmx_exit);
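/*
 * Module init: verify VMX support, do vendor-level setup (which runs
 * hardware_setup()), configure the L1TF mitigation and per-CPU state, and
 * only then call kvm_init() to expose /dev/kvm to userspace.
 */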
static int __init vmx_init(void)
{
	int r, cpu;

	if (!kvm_is_vmx_supported())
		return -EOPNOTSUPP;

	/*
	 * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing
	 * to unwind if a later step fails.
	 */
	hv_init_evmcs();

	r = kvm_x86_vendor_init(&vmx_init_ops);
	if (r)
		return r;

	/*
	 * Must be called after common x86 init so enable_ept is properly set
	 * up.  Hand in the parameter mitigation value, which was stored by
	 * the pre-module-init parser.  If no parameter was given, it will
	 * contain 'auto', which will be turned into the default 'cond'
	 * mitigation mode.
	 */
	r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
	if (r)
		goto err_l1d_flush;

	vmx_setup_fb_clear_ctrl();
	for_each_possible_cpu(cpu) {
		INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));

		pi_init_cpu(cpu);
	}

#ifdef CONFIG_KEXEC_CORE
	rcu_assign_pointer(crash_vmclear_loaded_vmcss,
			   crash_vmclear_local_loaded_vmcss);
#endif
	vmx_check_vmcs12_offsets();
	/*
	 * Shadow paging doesn't have a (further) performance penalty
	 * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it
	 * by default
	 */
	if (!enable_ept)
		allow_smaller_maxphyaddr = true;
	/*
	 * Common KVM initialization _must_ come last, after this, /dev/kvm is
	 * exposed to userspace!
	 */
	r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx),
		     THIS_MODULE);
	if (r)
		goto err_kvm_init;

	return 0;

err_kvm_init:
	__vmx_exit();
err_l1d_flush:
	kvm_x86_vendor_exit();
	return r;
}
8624 module_init(vmx_init);