arch/x86/kvm/svm/svm.c

   1 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   2
   3 #include <linux/kvm_host.h>
   4
   5 #include "irq.h"
   6 #include "mmu.h"
   7 #include "kvm_cache_regs.h"
   8 #include "x86.h"
   9 #include "smm.h"
  10 #include "cpuid.h"
  11 #include "pmu.h"
  12
  13 #include <linux/module.h>
  14 #include <linux/mod_devicetable.h>
  15 #include <linux/kernel.h>
  16 #include <linux/vmalloc.h>
  17 #include <linux/highmem.h>
  18 #include <linux/amd-iommu.h>
  19 #include <linux/sched.h>
  20 #include <linux/trace_events.h>
  21 #include <linux/slab.h>
  22 #include <linux/hashtable.h>
  23 #include <linux/objtool.h>
  24 #include <linux/psp-sev.h>
  25 #include <linux/file.h>
  26 #include <linux/pagemap.h>
  27 #include <linux/swap.h>
  28 #include <linux/rwsem.h>
  29 #include <linux/cc_platform.h>
  30
  31 #include <asm/apic.h>
  32 #include <asm/perf_event.h>
  33 #include <asm/tlbflush.h>
  34 #include <asm/desc.h>
  35 #include <asm/debugreg.h>
  36 #include <asm/kvm_para.h>
  37 #include <asm/irq_remapping.h>
  38 #include <asm/spec-ctrl.h>
  39 #include <asm/cpu_device_id.h>
  40 #include <asm/traps.h>
  41 #include <asm/fpu/api.h>
  42
  43 #include <asm/virtext.h>
  44 #include "trace.h"
  45
  46 #include "svm.h"
  47 #include "svm_ops.h"
  48
  49 #include "kvm_onhyperv.h"
  50 #include "svm_onhyperv.h"
  51
  52 MODULE_AUTHOR("Qumranet");
  53 MODULE_LICENSE("GPL");
  54
  55 #ifdef MODULE
  56 static const struct x86_cpu_id svm_cpu_id[] = {
  57         X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
  58         {}
  59 };
  60 MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
  61 #endif
  62
  63 #define SEG_TYPE_LDT 2
  64 #define SEG_TYPE_BUSY_TSS16 3
  65
  66 static bool erratum_383_found __read_mostly;
  67
  68 u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
  69
  70 /*
  71  * Set osvw_len to higher value when updated Revision Guides
  72  * are published and we know what the new status bits are
  73  */
  74 static uint64_t osvw_len = 4, osvw_status;
  75
  76 static DEFINE_PER_CPU(u64, current_tsc_ratio);
  77
  78 #define X2APIC_MSR(x)   (APIC_BASE_MSR + (x >> 4))
  79
  80 static const struct svm_direct_access_msrs {
  81         u32 index;   /* Index of the MSR */
  82         bool always; /* True if intercept is initially cleared */
  83 } direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
  84         { .index = MSR_STAR,                            .always = true  },
  85         { .index = MSR_IA32_SYSENTER_CS,                .always = true  },
  86         { .index = MSR_IA32_SYSENTER_EIP,               .always = false },
  87         { .index = MSR_IA32_SYSENTER_ESP,               .always = false },
  88 #ifdef CONFIG_X86_64
  89         { .index = MSR_GS_BASE,                         .always = true  },
  90         { .index = MSR_FS_BASE,                         .always = true  },
  91         { .index = MSR_KERNEL_GS_BASE,                  .always = true  },
  92         { .index = MSR_LSTAR,                           .always = true  },
  93         { .index = MSR_CSTAR,                           .always = true  },
  94         { .index = MSR_SYSCALL_MASK,                    .always = true  },
  95 #endif
  96         { .index = MSR_IA32_SPEC_CTRL,                  .always = false },
  97         { .index = MSR_IA32_PRED_CMD,                   .always = false },
  98         { .index = MSR_IA32_LASTBRANCHFROMIP,           .always = false },
  99         { .index = MSR_IA32_LASTBRANCHTOIP,             .always = false },
 100         { .index = MSR_IA32_LASTINTFROMIP,              .always = false },
 101         { .index = MSR_IA32_LASTINTTOIP,                .always = false },
 102         { .index = MSR_EFER,                            .always = false },
 103         { .index = MSR_IA32_CR_PAT,                     .always = false },
 104         { .index = MSR_AMD64_SEV_ES_GHCB,               .always = true  },
 105         { .index = MSR_TSC_AUX,                         .always = false },
 106         { .index = X2APIC_MSR(APIC_ID),                 .always = false },
 107         { .index = X2APIC_MSR(APIC_LVR),                .always = false },
 108         { .index = X2APIC_MSR(APIC_TASKPRI),            .always = false },
 109         { .index = X2APIC_MSR(APIC_ARBPRI),             .always = false },
 110         { .index = X2APIC_MSR(APIC_PROCPRI),            .always = false },
 111         { .index = X2APIC_MSR(APIC_EOI),                .always = false },
 112         { .index = X2APIC_MSR(APIC_RRR),                .always = false },
 113         { .index = X2APIC_MSR(APIC_LDR),                .always = false },
 114         { .index = X2APIC_MSR(APIC_DFR),                .always = false },
 115         { .index = X2APIC_MSR(APIC_SPIV),               .always = false },
 116         { .index = X2APIC_MSR(APIC_ISR),                .always = false },
 117         { .index = X2APIC_MSR(APIC_TMR),                .always = false },
 118         { .index = X2APIC_MSR(APIC_IRR),                .always = false },
 119         { .index = X2APIC_MSR(APIC_ESR),                .always = false },
 120         { .index = X2APIC_MSR(APIC_ICR),                .always = false },
 121         { .index = X2APIC_MSR(APIC_ICR2),               .always = false },
 122
 123         /*
 124          * Note:
 125          * AMD does not virtualize APIC TSC-deadline timer mode, but it is
 126          * emulated by KVM. When setting APIC LVTT (0x832) register bit 18,
 127          * the AVIC hardware would generate GP fault. Therefore, always
 128          * intercept the MSR 0x832, and do not setup direct_access_msr.
 129          */
 130         { .index = X2APIC_MSR(APIC_LVTTHMR),            .always = false },
 131         { .index = X2APIC_MSR(APIC_LVTPC),              .always = false },
 132         { .index = X2APIC_MSR(APIC_LVT0),               .always = false },
 133         { .index = X2APIC_MSR(APIC_LVT1),               .always = false },
 134         { .index = X2APIC_MSR(APIC_LVTERR),             .always = false },
 135         { .index = X2APIC_MSR(APIC_TMICT),              .always = false },
 136         { .index = X2APIC_MSR(APIC_TMCCT),              .always = false },
 137         { .index = X2APIC_MSR(APIC_TDCR),               .always = false },
 138         { .index = MSR_INVALID,                         .always = false },
 139 };
 140
 141 /*
 142  * These 2 parameters are used to config the controls for Pause-Loop Exiting:
 143  * pause_filter_count: On processors that support Pause filtering(indicated
 144  *      by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
 145  *      count value. On VMRUN this value is loaded into an internal counter.
 146  *      Each time a pause instruction is executed, this counter is decremented
 147  *      until it reaches zero at which time a #VMEXIT is generated if pause
 148  *      intercept is enabled. Refer to  AMD APM Vol 2 Section 15.14.4 Pause
 149  *      Intercept Filtering for more details.
 150  *      This also indicate if ple logic enabled.
 151  *
 152  * pause_filter_thresh: In addition, some processor families support advanced
 153  *      pause filtering (indicated by CPUID Fn8000_000A_EDX) upper bound on
 154  *      the amount of time a guest is allowed to execute in a pause loop.
 155  *      In this mode, a 16-bit pause filter threshold field is added in the
 156  *      VMCB. The threshold value is a cycle count that is used to reset the
 157  *      pause counter. As with simple pause filtering, VMRUN loads the pause
 158  *      count value from VMCB into an internal counter. Then, on each pause
 159  *      instruction the hardware checks the elapsed number of cycles since
 160  *      the most recent pause instruction against the pause filter threshold.
 161  *      If the elapsed cycle count is greater than the pause filter threshold,
 162  *      then the internal pause count is reloaded from the VMCB and execution
 163  *      continues. If the elapsed cycle count is less than the pause filter
 164  *      threshold, then the internal pause count is decremented. If the count
 165  *      value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
 166  *      triggered. If advanced pause filtering is supported and pause filter
 167  *      threshold field is set to zero, the filter will operate in the simpler,
 168  *      count only mode.
 169  */
 170
 171 static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
 172 module_param(pause_filter_thresh, ushort, 0444);
 173
 174 static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
 175 module_param(pause_filter_count, ushort, 0444);
 176
 177 /* Default doubles per-vcpu window every exit. */
 178 static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
 179 module_param(pause_filter_count_grow, ushort, 0444);
 180
 181 /* Default resets per-vcpu window every exit to pause_filter_count. */
 182 static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
 183 module_param(pause_filter_count_shrink, ushort, 0444);
 184
 185 /* Default is to compute the maximum so we can never overflow. */
 186 static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
 187 module_param(pause_filter_count_max, ushort, 0444);
 188
 189 /*
 190  * Use nested page tables by default.  Note, NPT may get forced off by
 191  * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
 192  */
 193 bool npt_enabled = true;
 194 module_param_named(npt, npt_enabled, bool, 0444);
 195
 196 /* allow nested virtualization in KVM/SVM */
 197 static int nested = true;
 198 module_param(nested, int, S_IRUGO);
 199
 200 /* enable/disable Next RIP Save */
 201 static int nrips = true;
 202 module_param(nrips, int, 0444);
 203
 204 /* enable/disable Virtual VMLOAD VMSAVE */
 205 static int vls = true;
 206 module_param(vls, int, 0444);
 207
 208 /* enable/disable Virtual GIF */
 209 int vgif = true;
 210 module_param(vgif, int, 0444);
 211
 212 /* enable/disable LBR virtualization */
 213 static int lbrv = true;
 214 module_param(lbrv, int, 0444);
 215
 216 static int tsc_scaling = true;
 217 module_param(tsc_scaling, int, 0444);
 218
 219 /*
 220  * enable / disable AVIC.  Because the defaults differ for APICv
 221  * support between VMX and SVM we cannot use module_param_named.
 222  */
 223 static bool avic;
 224 module_param(avic, bool, 0444);
 225
 226 bool __read_mostly dump_invalid_vmcb;
 227 module_param(dump_invalid_vmcb, bool, 0644);
 228
 229
 230 bool intercept_smi = true;
 231 module_param(intercept_smi, bool, 0444);
 232
 233
 234 static bool svm_gp_erratum_intercept = true;
 235
 236 static u8 rsm_ins_bytes[] = "\x0f\xaa";
 237
 238 static unsigned long iopm_base;
 239
 240 struct kvm_ldttss_desc {
 241         u16 limit0;
 242         u16 base0;
 243         unsigned base1:8, type:5, dpl:2, p:1;
 244         unsigned limit1:4, zero0:3, g:1, base2:8;
 245         u32 base3;
 246         u32 zero1;
 247 } __attribute__((packed));
 248
 249 DEFINE_PER_CPU(struct svm_cpu_data, svm_data);
 250
 251 /*
 252  * Only MSR_TSC_AUX is switched via the user return hook.  EFER is switched via
 253  * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
 254  *
 255  * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
 256  * defer the restoration of TSC_AUX until the CPU returns to userspace.
 257  */
 258 static int tsc_aux_uret_slot __read_mostly = -1;
 259
 260 static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
 261
 262 #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
 263 #define MSRS_RANGE_SIZE 2048
 264 #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
 265
 266 u32 svm_msrpm_offset(u32 msr)
 267 {
 268         u32 offset;
 269         int i;
 270
 271         for (i = 0; i < NUM_MSR_MAPS; i++) {
 272                 if (msr < msrpm_ranges[i] ||
 273                     msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
 274                         continue;
 275
 276                 offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
 277                 offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
 278
 279                 /* Now we have the u8 offset - but need the u32 offset */
 280                 return offset / 4;
 281         }
 282
 283         /* MSR not in any range */
 284         return MSR_INVALID;
 285 }
 286
 287 static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);
 288
 289 static int get_npt_level(void)
 290 {
 291 #ifdef CONFIG_X86_64
 292         return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
 293 #else
 294         return PT32E_ROOT_LEVEL;
 295 #endif
 296 }
 297
 298 int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 299 {
 300         struct vcpu_svm *svm = to_svm(vcpu);
 301         u64 old_efer = vcpu->arch.efer;
 302         vcpu->arch.efer = efer;
 303
 304         if (!npt_enabled) {
 305                 /* Shadow paging assumes NX to be available.  */
 306                 efer |= EFER_NX;
 307
 308                 if (!(efer & EFER_LMA))
 309                         efer &= ~EFER_LME;
 310         }
 311
 312         if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
 313                 if (!(efer & EFER_SVME)) {
 314                         svm_leave_nested(vcpu);
 315                         svm_set_gif(svm, true);
 316                         /* #GP intercept is still needed for vmware backdoor */
 317                         if (!enable_vmware_backdoor)
 318                                 clr_exception_intercept(svm, GP_VECTOR);
 319
 320                         /*
 321                          * Free the nested guest state, unless we are in SMM.
 322                          * In this case we will return to the nested guest
 323                          * as soon as we leave SMM.
 324                          */
 325                         if (!is_smm(vcpu))
 326                                 svm_free_nested(svm);
 327
 328                 } else {
 329                         int ret = svm_allocate_nested(svm);
 330
 331                         if (ret) {
 332                                 vcpu->arch.efer = old_efer;
 333                                 return ret;
 334                         }
 335
 336                         /*
 337                          * Never intercept #GP for SEV guests, KVM can't
 338                          * decrypt guest memory to workaround the erratum.
 339                          */
 340                         if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
 341                                 set_exception_intercept(svm, GP_VECTOR);
 342                 }
 343         }
 344
 345         svm->vmcb->save.efer = efer | EFER_SVME;
 346         vmcb_mark_dirty(svm->vmcb, VMCB_CR);
 347         return 0;
 348 }
 349
 350 static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
 351 {
 352         struct vcpu_svm *svm = to_svm(vcpu);
 353         u32 ret = 0;
 354
 355         if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
 356                 ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
 357         return ret;
 358 }
 359
 360 static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 361 {
 362         struct vcpu_svm *svm = to_svm(vcpu);
 363
 364         if (mask == 0)
 365                 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
 366         else
 367                 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
 368
 369 }
 370
 371 static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
 372                                            bool commit_side_effects)
 373 {
 374         struct vcpu_svm *svm = to_svm(vcpu);
 375         unsigned long old_rflags;
 376
 377         /*
 378          * SEV-ES does not expose the next RIP. The RIP update is controlled by
 379          * the type of exit and the #VC handler in the guest.
 380          */
 381         if (sev_es_guest(vcpu->kvm))
 382                 goto done;
 383
 384         if (nrips && svm->vmcb->control.next_rip != 0) {
 385                 WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
 386                 svm->next_rip = svm->vmcb->control.next_rip;
 387         }
 388
 389         if (!svm->next_rip) {
 390                 if (unlikely(!commit_side_effects))
 391                         old_rflags = svm->vmcb->save.rflags;
 392
 393                 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
 394                         return 0;
 395
 396                 if (unlikely(!commit_side_effects))
 397                         svm->vmcb->save.rflags = old_rflags;
 398         } else {
 399                 kvm_rip_write(vcpu, svm->next_rip);
 400         }
 401
 402 done:
 403         if (likely(commit_side_effects))
 404                 svm_set_interrupt_shadow(vcpu, 0);
 405
 406         return 1;
 407 }
 408
 409 static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
 410 {
 411         return __svm_skip_emulated_instruction(vcpu, true);
 412 }
 413
 414 static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
 415 {
 416         unsigned long rip, old_rip = kvm_rip_read(vcpu);
 417         struct vcpu_svm *svm = to_svm(vcpu);
 418
 419         /*
 420          * Due to architectural shortcomings, the CPU doesn't always provide
 421          * NextRIP, e.g. if KVM intercepted an exception that occurred while
 422          * the CPU was vectoring an INTO/INT3 in the guest.  Temporarily skip
 423          * the instruction even if NextRIP is supported to acquire the next
 424          * RIP so that it can be shoved into the NextRIP field, otherwise
 425          * hardware will fail to advance guest RIP during event injection.
 426          * Drop the exception/interrupt if emulation fails and effectively
 427          * retry the instruction, it's the least awful option.  If NRIPS is
 428          * in use, the skip must not commit any side effects such as clearing
 429          * the interrupt shadow or RFLAGS.RF.
 430          */
 431         if (!__svm_skip_emulated_instruction(vcpu, !nrips))
 432                 return -EIO;
 433
 434         rip = kvm_rip_read(vcpu);
 435
 436         /*
 437          * Save the injection information, even when using next_rip, as the
 438          * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
 439          * doesn't complete due to a VM-Exit occurring while the CPU is
 440          * vectoring the event.   Decoding the instruction isn't guaranteed to
 441          * work as there may be no backing instruction, e.g. if the event is
 442          * being injected by L1 for L2, or if the guest is patching INT3 into
 443          * a different instruction.
 444          */
 445         svm->soft_int_injected = true;
 446         svm->soft_int_csbase = svm->vmcb->save.cs.base;
 447         svm->soft_int_old_rip = old_rip;
 448         svm->soft_int_next_rip = rip;
 449
 450         if (nrips)
 451                 kvm_rip_write(vcpu, old_rip);
 452
 453         if (static_cpu_has(X86_FEATURE_NRIPS))
 454                 svm->vmcb->control.next_rip = rip;
 455
 456         return 0;
 457 }
 458
 459 static void svm_inject_exception(struct kvm_vcpu *vcpu)
 460 {
 461         struct kvm_queued_exception *ex = &vcpu->arch.exception;
 462         struct vcpu_svm *svm = to_svm(vcpu);
 463
 464         kvm_deliver_exception_payload(vcpu, ex);
 465
 466         if (kvm_exception_is_soft(ex->vector) &&
 467             svm_update_soft_interrupt_rip(vcpu))
 468                 return;
 469
 470         svm->vmcb->control.event_inj = ex->vector
 471                 | SVM_EVTINJ_VALID
 472                 | (ex->has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
 473                 | SVM_EVTINJ_TYPE_EXEPT;
 474         svm->vmcb->control.event_inj_err = ex->error_code;
 475 }
 476
 477 static void svm_init_erratum_383(void)
 478 {
 479         u32 low, high;
 480         int err;
 481         u64 val;
 482
 483         if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
 484                 return;
 485
 486         /* Use _safe variants to not break nested virtualization */
 487         val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
 488         if (err)
 489                 return;
 490
 491         val |= (1ULL << 47);
 492
 493         low  = lower_32_bits(val);
 494         high = upper_32_bits(val);
 495
 496         native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
 497
 498         erratum_383_found = true;
 499 }
 500
 501 static void svm_init_osvw(struct kvm_vcpu *vcpu)
 502 {
 503         /*
 504          * Guests should see errata 400 and 415 as fixed (assuming that
 505          * HLT and IO instructions are intercepted).
 506          */
 507         vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
 508         vcpu->arch.osvw.status = osvw_status & ~(6ULL);
 509
 510         /*
 511          * By increasing VCPU's osvw.length to 3 we are telling the guest that
 512          * all osvw.status bits inside that length, including bit 0 (which is
 513          * reserved for erratum 298), are valid. However, if host processor's
 514          * osvw_len is 0 then osvw_status[0] carries no information. We need to
 515          * be conservative here and therefore we tell the guest that erratum 298
 516          * is present (because we really don't know).
 517          */
 518         if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
 519                 vcpu->arch.osvw.status |= 1;
 520 }
 521
 522 static bool kvm_is_svm_supported(void)
 523 {
 524         int cpu = raw_smp_processor_id();
 525         const char *msg;
 526         u64 vm_cr;
 527
 528         if (!cpu_has_svm(&msg)) {
 529                 pr_err("SVM not supported by CPU %d, %s\n", cpu, msg);
 530                 return false;
 531         }
 532
 533         if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
 534                 pr_info("KVM is unsupported when running as an SEV guest\n");
 535                 return false;
 536         }
 537
 538         rdmsrl(MSR_VM_CR, vm_cr);
 539         if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) {
 540                 pr_err("SVM disabled (by BIOS) in MSR_VM_CR on CPU %d\n", cpu);
 541                 return false;
 542         }
 543
 544         return true;
 545 }
 546
 547 static int svm_check_processor_compat(void)
 548 {
 549         if (!kvm_is_svm_supported())
 550                 return -EIO;
 551
 552         return 0;
 553 }
 554
 555 void __svm_write_tsc_multiplier(u64 multiplier)
 556 {
 557         preempt_disable();
 558
 559         if (multiplier == __this_cpu_read(current_tsc_ratio))
 560                 goto out;
 561
 562         wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
 563         __this_cpu_write(current_tsc_ratio, multiplier);
 564 out:
 565         preempt_enable();
 566 }
 567
 568 static void svm_hardware_disable(void)
 569 {
 570         /* Make sure we clean up behind us */
 571         if (tsc_scaling)
 572                 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
 573
 574         cpu_svm_disable();
 575
 576         amd_pmu_disable_virt();
 577 }
 578
 579 static int svm_hardware_enable(void)
 580 {
 581
 582         struct svm_cpu_data *sd;
 583         uint64_t efer;
 584         struct desc_struct *gdt;
 585         int me = raw_smp_processor_id();
 586
 587         rdmsrl(MSR_EFER, efer);
 588         if (efer & EFER_SVME)
 589                 return -EBUSY;
 590
 591         sd = per_cpu_ptr(&svm_data, me);
 592         sd->asid_generation = 1;
 593         sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
 594         sd->next_asid = sd->max_asid + 1;
 595         sd->min_asid = max_sev_asid + 1;
 596
 597         gdt = get_current_gdt_rw();
 598         sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
 599
 600         wrmsrl(MSR_EFER, efer | EFER_SVME);
 601
 602         wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa);
 603
 604         if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
 605                 /*
 606                  * Set the default value, even if we don't use TSC scaling
 607                  * to avoid having stale value in the msr
 608                  */
 609                 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
 610         }
 611
 612
 613         /*
 614          * Get OSVW bits.
 615          *
 616          * Note that it is possible to have a system with mixed processor
 617          * revisions and therefore different OSVW bits. If bits are not the same
 618          * on different processors then choose the worst case (i.e. if erratum
 619          * is present on one processor and not on another then assume that the
 620          * erratum is present everywhere).
 621          */
 622         if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
 623                 uint64_t len, status = 0;
 624                 int err;
 625
 626                 len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
 627                 if (!err)
 628                         status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
 629                                                       &err);
 630
 631                 if (err)
 632                         osvw_status = osvw_len = 0;
 633                 else {
 634                         if (len < osvw_len)
 635                                 osvw_len = len;
 636                         osvw_status |= status;
 637                         osvw_status &= (1ULL << osvw_len) - 1;
 638                 }
 639         } else
 640                 osvw_status = osvw_len = 0;
 641
 642         svm_init_erratum_383();
 643
 644         amd_pmu_enable_virt();
 645
 646         return 0;
 647 }
 648
 649 static void svm_cpu_uninit(int cpu)
 650 {
 651         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
 652
 653         if (!sd->save_area)
 654                 return;
 655
 656         kfree(sd->sev_vmcbs);
 657         __free_page(sd->save_area);
 658         sd->save_area_pa = 0;
 659         sd->save_area = NULL;
 660 }
 661
 662 static int svm_cpu_init(int cpu)
 663 {
 664         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
 665         int ret = -ENOMEM;
 666
 667         memset(sd, 0, sizeof(struct svm_cpu_data));
 668         sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
 669         if (!sd->save_area)
 670                 return ret;
 671
 672         ret = sev_cpu_init(sd);
 673         if (ret)
 674                 goto free_save_area;
 675
 676         sd->save_area_pa = __sme_page_pa(sd->save_area);
 677         return 0;
 678
 679 free_save_area:
 680         __free_page(sd->save_area);
 681         sd->save_area = NULL;
 682         return ret;
 683
 684 }
 685
 686 static int direct_access_msr_slot(u32 msr)
 687 {
 688         u32 i;
 689
 690         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
 691                 if (direct_access_msrs[i].index == msr)
 692                         return i;
 693
 694         return -ENOENT;
 695 }
 696
 697 static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
 698                                      int write)
 699 {
 700         struct vcpu_svm *svm = to_svm(vcpu);
 701         int slot = direct_access_msr_slot(msr);
 702
 703         if (slot == -ENOENT)
 704                 return;
 705
 706         /* Set the shadow bitmaps to the desired intercept states */
 707         if (read)
 708                 set_bit(slot, svm->shadow_msr_intercept.read);
 709         else
 710                 clear_bit(slot, svm->shadow_msr_intercept.read);
 711
 712         if (write)
 713                 set_bit(slot, svm->shadow_msr_intercept.write);
 714         else
 715                 clear_bit(slot, svm->shadow_msr_intercept.write);
 716 }
 717
 718 static bool valid_msr_intercept(u32 index)
 719 {
 720         return direct_access_msr_slot(index) != -ENOENT;
 721 }
 722
 723 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
 724 {
 725         u8 bit_write;
 726         unsigned long tmp;
 727         u32 offset;
 728         u32 *msrpm;
 729
 730         /*
 731          * For non-nested case:
 732          * If the L01 MSR bitmap does not intercept the MSR, then we need to
 733          * save it.
 734          *
 735          * For nested case:
 736          * If the L02 MSR bitmap does not intercept the MSR, then we need to
 737          * save it.
 738          */
 739         msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
 740                                       to_svm(vcpu)->msrpm;
 741
 742         offset    = svm_msrpm_offset(msr);
 743         bit_write = 2 * (msr & 0x0f) + 1;
 744         tmp       = msrpm[offset];
 745
 746         BUG_ON(offset == MSR_INVALID);
 747
 748         return !!test_bit(bit_write,  &tmp);
 749 }
 750
 751 static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
 752                                         u32 msr, int read, int write)
 753 {
 754         struct vcpu_svm *svm = to_svm(vcpu);
 755         u8 bit_read, bit_write;
 756         unsigned long tmp;
 757         u32 offset;
 758
 759         /*
 760          * If this warning triggers extend the direct_access_msrs list at the
 761          * beginning of the file
 762          */
 763         WARN_ON(!valid_msr_intercept(msr));
 764
 765         /* Enforce non allowed MSRs to trap */
 766         if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
 767                 read = 0;
 768
 769         if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
 770                 write = 0;
 771
 772         offset    = svm_msrpm_offset(msr);
 773         bit_read  = 2 * (msr & 0x0f);
 774         bit_write = 2 * (msr & 0x0f) + 1;
 775         tmp       = msrpm[offset];
 776
 777         BUG_ON(offset == MSR_INVALID);
 778
 779         read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
 780         write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
 781
 782         msrpm[offset] = tmp;
 783
 784         svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
 785         svm->nested.force_msr_bitmap_recalc = true;
 786 }
 787
 788 void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
 789                           int read, int write)
 790 {
 791         set_shadow_msr_intercept(vcpu, msr, read, write);
 792         set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
 793 }
 794
 795 u32 *svm_vcpu_alloc_msrpm(void)
 796 {
 797         unsigned int order = get_order(MSRPM_SIZE);
 798         struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
 799         u32 *msrpm;
 800
 801         if (!pages)
 802                 return NULL;
 803
 804         msrpm = page_address(pages);
 805         memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
 806
 807         return msrpm;
 808 }
 809
 810 void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
 811 {
 812         int i;
 813
 814         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 815                 if (!direct_access_msrs[i].always)
 816                         continue;
 817                 set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
 818         }
 819 }
 820
 821 void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
 822 {
 823         int i;
 824
 825         if (intercept == svm->x2avic_msrs_intercepted)
 826                 return;
 827
 828         if (!x2avic_enabled ||
 829             !apic_x2apic_mode(svm->vcpu.arch.apic))
 830                 return;
 831
 832         for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) {
 833                 int index = direct_access_msrs[i].index;
 834
 835                 if ((index < APIC_BASE_MSR) ||
 836                     (index > APIC_BASE_MSR + 0xff))
 837                         continue;
 838                 set_msr_interception(&svm->vcpu, svm->msrpm, index,
 839                                      !intercept, !intercept);
 840         }
 841
 842         svm->x2avic_msrs_intercepted = intercept;
 843 }
 844
 845 void svm_vcpu_free_msrpm(u32 *msrpm)
 846 {
 847         __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
 848 }
 849
 850 static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
 851 {
 852         struct vcpu_svm *svm = to_svm(vcpu);
 853         u32 i;
 854
 855         /*
 856          * Set intercept permissions for all direct access MSRs again. They
 857          * will automatically get filtered through the MSR filter, so we are
 858          * back in sync after this.
 859          */
 860         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 861                 u32 msr = direct_access_msrs[i].index;
 862                 u32 read = test_bit(i, svm->shadow_msr_intercept.read);
 863                 u32 write = test_bit(i, svm->shadow_msr_intercept.write);
 864
 865                 set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
 866         }
 867 }
 868
 869 static void add_msr_offset(u32 offset)
 870 {
 871         int i;
 872
 873         for (i = 0; i < MSRPM_OFFSETS; ++i) {
 874
 875                 /* Offset already in list? */
 876                 if (msrpm_offsets[i] == offset)
 877                         return;
 878
 879                 /* Slot used by another offset? */
 880                 if (msrpm_offsets[i] != MSR_INVALID)
 881                         continue;
 882
 883                 /* Add offset to list */
 884                 msrpm_offsets[i] = offset;
 885
 886                 return;
 887         }
 888
 889         /*
 890          * If this BUG triggers the msrpm_offsets table has an overflow. Just
 891          * increase MSRPM_OFFSETS in this case.
 892          */
 893         BUG();
 894 }
 895
 896 static void init_msrpm_offsets(void)
 897 {
 898         int i;
 899
 900         memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
 901
 902         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 903                 u32 offset;
 904
 905                 offset = svm_msrpm_offset(direct_access_msrs[i].index);
 906                 BUG_ON(offset == MSR_INVALID);
 907
 908                 add_msr_offset(offset);
 909         }
 910 }
 911
 912 void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
 913 {
 914         to_vmcb->save.dbgctl            = from_vmcb->save.dbgctl;
 915         to_vmcb->save.br_from           = from_vmcb->save.br_from;
 916         to_vmcb->save.br_to             = from_vmcb->save.br_to;
 917         to_vmcb->save.last_excp_from    = from_vmcb->save.last_excp_from;
 918         to_vmcb->save.last_excp_to      = from_vmcb->save.last_excp_to;
 919
 920         vmcb_mark_dirty(to_vmcb, VMCB_LBR);
 921 }
 922
 923 static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
 924 {
 925         struct vcpu_svm *svm = to_svm(vcpu);
 926
 927         svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
 928         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
 929         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
 930         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
 931         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
 932
 933         /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
 934         if (is_guest_mode(vcpu))
 935                 svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
 936 }
 937
 938 static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
 939 {
 940         struct vcpu_svm *svm = to_svm(vcpu);
 941
 942         svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
 943         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
 944         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
 945         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
 946         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
 947
 948         /*
 949          * Move the LBR msrs back to the vmcb01 to avoid copying them
 950          * on nested guest entries.
 951          */
 952         if (is_guest_mode(vcpu))
 953                 svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
 954 }
 955
 956 static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index)
 957 {
 958         /*
 959          * If the LBR virtualization is disabled, the LBR msrs are always
 960          * kept in the vmcb01 to avoid copying them on nested guest entries.
 961          *
 962          * If nested, and the LBR virtualization is enabled/disabled, the msrs
 963          * are moved between the vmcb01 and vmcb02 as needed.
 964          */
 965         struct vmcb *vmcb =
 966                 (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ?
 967                         svm->vmcb : svm->vmcb01.ptr;
 968
 969         switch (index) {
 970         case MSR_IA32_DEBUGCTLMSR:
 971                 return vmcb->save.dbgctl;
 972         case MSR_IA32_LASTBRANCHFROMIP:
 973                 return vmcb->save.br_from;
 974         case MSR_IA32_LASTBRANCHTOIP:
 975                 return vmcb->save.br_to;
 976         case MSR_IA32_LASTINTFROMIP:
 977                 return vmcb->save.last_excp_from;
 978         case MSR_IA32_LASTINTTOIP:
 979                 return vmcb->save.last_excp_to;
 980         default:
 981                 KVM_BUG(false, svm->vcpu.kvm,
 982                         "%s: Unknown MSR 0x%x", __func__, index);
 983                 return 0;
 984         }
 985 }
 986
 987 void svm_update_lbrv(struct kvm_vcpu *vcpu)
 988 {
 989         struct vcpu_svm *svm = to_svm(vcpu);
 990
 991         bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) &
 992                                            DEBUGCTLMSR_LBR;
 993
 994         bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext &
 995                                       LBR_CTL_ENABLE_MASK);
 996
 997         if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled))
 998                 if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))
 999                         enable_lbrv = true;
1000
1001         if (enable_lbrv == current_enable_lbrv)
1002                 return;
1003
1004         if (enable_lbrv)
1005                 svm_enable_lbrv(vcpu);
1006         else
1007                 svm_disable_lbrv(vcpu);
1008 }
1009
1010 void disable_nmi_singlestep(struct vcpu_svm *svm)
1011 {
1012         svm->nmi_singlestep = false;
1013
1014         if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
1015                 /* Clear our flags if they were not set by the guest */
1016                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1017                         svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
1018                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1019                         svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
1020         }
1021 }
1022
1023 static void grow_ple_window(struct kvm_vcpu *vcpu)
1024 {
1025         struct vcpu_svm *svm = to_svm(vcpu);
1026         struct vmcb_control_area *control = &svm->vmcb->control;
1027         int old = control->pause_filter_count;
1028
1029         if (kvm_pause_in_guest(vcpu->kvm))
1030                 return;
1031
1032         control->pause_filter_count = __grow_ple_window(old,
1033                                                         pause_filter_count,
1034                                                         pause_filter_count_grow,
1035                                                         pause_filter_count_max);
1036
1037         if (control->pause_filter_count != old) {
1038                 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1039                 trace_kvm_ple_window_update(vcpu->vcpu_id,
1040                                             control->pause_filter_count, old);
1041         }
1042 }
1043
1044 static void shrink_ple_window(struct kvm_vcpu *vcpu)
1045 {
1046         struct vcpu_svm *svm = to_svm(vcpu);
1047         struct vmcb_control_area *control = &svm->vmcb->control;
1048         int old = control->pause_filter_count;
1049
1050         if (kvm_pause_in_guest(vcpu->kvm))
1051                 return;
1052
1053         control->pause_filter_count =
1054                                 __shrink_ple_window(old,
1055                                                     pause_filter_count,
1056                                                     pause_filter_count_shrink,
1057                                                     pause_filter_count);
1058         if (control->pause_filter_count != old) {
1059                 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1060                 trace_kvm_ple_window_update(vcpu->vcpu_id,
1061                                             control->pause_filter_count, old);
1062         }
1063 }
1064
1065 static void svm_hardware_unsetup(void)
1066 {
1067         int cpu;
1068
1069         sev_hardware_unsetup();
1070
1071         for_each_possible_cpu(cpu)
1072                 svm_cpu_uninit(cpu);
1073
1074         __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
1075         get_order(IOPM_SIZE));
1076         iopm_base = 0;
1077 }
1078
1079 static void init_seg(struct vmcb_seg *seg)
1080 {
1081         seg->selector = 0;
1082         seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
1083                       SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
1084         seg->limit = 0xffff;
1085         seg->base = 0;
1086 }
1087
1088 static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
1089 {
1090         seg->selector = 0;
1091         seg->attrib = SVM_SELECTOR_P_MASK | type;
1092         seg->limit = 0xffff;
1093         seg->base = 0;
1094 }
1095
1096 static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
1097 {
1098         struct vcpu_svm *svm = to_svm(vcpu);
1099
1100         return svm->nested.ctl.tsc_offset;
1101 }
1102
1103 static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
1104 {
1105         struct vcpu_svm *svm = to_svm(vcpu);
1106
1107         return svm->tsc_ratio_msr;
1108 }
1109
1110 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1111 {
1112         struct vcpu_svm *svm = to_svm(vcpu);
1113
1114         svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
1115         svm->vmcb->control.tsc_offset = offset;
1116         vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1117 }
1118
1119 static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
1120 {
1121         __svm_write_tsc_multiplier(multiplier);
1122 }
1123
1124
1125 /* Evaluate instruction intercepts that depend on guest CPUID features. */
1126 static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
1127                                               struct vcpu_svm *svm)
1128 {
1129         /*
1130          * Intercept INVPCID if shadow paging is enabled to sync/free shadow
1131          * roots, or if INVPCID is disabled in the guest to inject #UD.
1132          */
1133         if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
1134                 if (!npt_enabled ||
1135                     !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
1136                         svm_set_intercept(svm, INTERCEPT_INVPCID);
1137                 else
1138                         svm_clr_intercept(svm, INTERCEPT_INVPCID);
1139         }
1140
1141         if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
1142                 if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
1143                         svm_clr_intercept(svm, INTERCEPT_RDTSCP);
1144                 else
1145                         svm_set_intercept(svm, INTERCEPT_RDTSCP);
1146         }
1147 }
1148
1149 static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
1150 {
1151         struct vcpu_svm *svm = to_svm(vcpu);
1152
1153         if (guest_cpuid_is_intel(vcpu)) {
1154                 /*
1155                  * We must intercept SYSENTER_EIP and SYSENTER_ESP
1156                  * accesses because the processor only stores 32 bits.
1157                  * For the same reason we cannot use virtual VMLOAD/VMSAVE.
1158                  */
1159                 svm_set_intercept(svm, INTERCEPT_VMLOAD);
1160                 svm_set_intercept(svm, INTERCEPT_VMSAVE);
1161                 svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1162
1163                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
1164                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
1165
1166                 svm->v_vmload_vmsave_enabled = false;
1167         } else {
1168                 /*
1169                  * If hardware supports Virtual VMLOAD VMSAVE then enable it
1170                  * in VMCB and clear intercepts to avoid #VMEXIT.
1171                  */
1172                 if (vls) {
1173                         svm_clr_intercept(svm, INTERCEPT_VMLOAD);
1174                         svm_clr_intercept(svm, INTERCEPT_VMSAVE);
1175                         svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1176                 }
1177                 /* No need to intercept these MSRs */
1178                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
1179                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
1180         }
1181 }
1182
1183 static void init_vmcb(struct kvm_vcpu *vcpu)
1184 {
1185         struct vcpu_svm *svm = to_svm(vcpu);
1186         struct vmcb *vmcb = svm->vmcb01.ptr;
1187         struct vmcb_control_area *control = &vmcb->control;
1188         struct vmcb_save_area *save = &vmcb->save;
1189
1190         svm_set_intercept(svm, INTERCEPT_CR0_READ);
1191         svm_set_intercept(svm, INTERCEPT_CR3_READ);
1192         svm_set_intercept(svm, INTERCEPT_CR4_READ);
1193         svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1194         svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
1195         svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
1196         if (!kvm_vcpu_apicv_active(vcpu))
1197                 svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
1198
1199         set_dr_intercepts(svm);
1200
1201         set_exception_intercept(svm, PF_VECTOR);
1202         set_exception_intercept(svm, UD_VECTOR);
1203         set_exception_intercept(svm, MC_VECTOR);
1204         set_exception_intercept(svm, AC_VECTOR);
1205         set_exception_intercept(svm, DB_VECTOR);
1206         /*
1207          * Guest access to VMware backdoor ports could legitimately
1208          * trigger #GP because of TSS I/O permission bitmap.
1209          * We intercept those #GP and allow access to them anyway
1210          * as VMware does.  Don't intercept #GP for SEV guests as KVM can't
1211          * decrypt guest memory to decode the faulting instruction.
1212          */
1213         if (enable_vmware_backdoor && !sev_guest(vcpu->kvm))
1214                 set_exception_intercept(svm, GP_VECTOR);
1215
1216         svm_set_intercept(svm, INTERCEPT_INTR);
1217         svm_set_intercept(svm, INTERCEPT_NMI);
1218
1219         if (intercept_smi)
1220                 svm_set_intercept(svm, INTERCEPT_SMI);
1221
1222         svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1223         svm_set_intercept(svm, INTERCEPT_RDPMC);
1224         svm_set_intercept(svm, INTERCEPT_CPUID);
1225         svm_set_intercept(svm, INTERCEPT_INVD);
1226         svm_set_intercept(svm, INTERCEPT_INVLPG);
1227         svm_set_intercept(svm, INTERCEPT_INVLPGA);
1228         svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
1229         svm_set_intercept(svm, INTERCEPT_MSR_PROT);
1230         svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
1231         svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
1232         svm_set_intercept(svm, INTERCEPT_VMRUN);
1233         svm_set_intercept(svm, INTERCEPT_VMMCALL);
1234         svm_set_intercept(svm, INTERCEPT_VMLOAD);
1235         svm_set_intercept(svm, INTERCEPT_VMSAVE);
1236         svm_set_intercept(svm, INTERCEPT_STGI);
1237         svm_set_intercept(svm, INTERCEPT_CLGI);
1238         svm_set_intercept(svm, INTERCEPT_SKINIT);
1239         svm_set_intercept(svm, INTERCEPT_WBINVD);
1240         svm_set_intercept(svm, INTERCEPT_XSETBV);
1241         svm_set_intercept(svm, INTERCEPT_RDPRU);
1242         svm_set_intercept(svm, INTERCEPT_RSM);
1243
1244         if (!kvm_mwait_in_guest(vcpu->kvm)) {
1245                 svm_set_intercept(svm, INTERCEPT_MONITOR);
1246                 svm_set_intercept(svm, INTERCEPT_MWAIT);
1247         }
1248
1249         if (!kvm_hlt_in_guest(vcpu->kvm))
1250                 svm_set_intercept(svm, INTERCEPT_HLT);
1251
1252         control->iopm_base_pa = __sme_set(iopm_base);
1253         control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
1254         control->int_ctl = V_INTR_MASKING_MASK;
1255
1256         init_seg(&save->es);
1257         init_seg(&save->ss);
1258         init_seg(&save->ds);
1259         init_seg(&save->fs);
1260         init_seg(&save->gs);
1261
1262         save->cs.selector = 0xf000;
1263         save->cs.base = 0xffff0000;
1264         /* Executable/Readable Code Segment */
1265         save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1266                 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1267         save->cs.limit = 0xffff;
1268
1269         save->gdtr.base = 0;
1270         save->gdtr.limit = 0xffff;
1271         save->idtr.base = 0;
1272         save->idtr.limit = 0xffff;
1273
1274         init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1275         init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1276
1277         if (npt_enabled) {
1278                 /* Setup VMCB for Nested Paging */
1279                 control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
1280                 svm_clr_intercept(svm, INTERCEPT_INVLPG);
1281                 clr_exception_intercept(svm, PF_VECTOR);
1282                 svm_clr_intercept(svm, INTERCEPT_CR3_READ);
1283                 svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
1284                 save->g_pat = vcpu->arch.pat;
1285                 save->cr3 = 0;
1286         }
1287         svm->current_vmcb->asid_generation = 0;
1288         svm->asid = 0;
1289
1290         svm->nested.vmcb12_gpa = INVALID_GPA;
1291         svm->nested.last_vmcb12_gpa = INVALID_GPA;
1292
1293         if (!kvm_pause_in_guest(vcpu->kvm)) {
1294                 control->pause_filter_count = pause_filter_count;
1295                 if (pause_filter_thresh)
1296                         control->pause_filter_thresh = pause_filter_thresh;
1297                 svm_set_intercept(svm, INTERCEPT_PAUSE);
1298         } else {
1299                 svm_clr_intercept(svm, INTERCEPT_PAUSE);
1300         }
1301
1302         svm_recalc_instruction_intercepts(vcpu, svm);
1303
1304         /*
1305          * If the host supports V_SPEC_CTRL then disable the interception
1306          * of MSR_IA32_SPEC_CTRL.
1307          */
1308         if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
1309                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
1310
1311         if (kvm_vcpu_apicv_active(vcpu))
1312                 avic_init_vmcb(svm, vmcb);
1313
1314         if (vgif) {
1315                 svm_clr_intercept(svm, INTERCEPT_STGI);
1316                 svm_clr_intercept(svm, INTERCEPT_CLGI);
1317                 svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
1318         }
1319
1320         if (sev_guest(vcpu->kvm))
1321                 sev_init_vmcb(svm);
1322
1323         svm_hv_init_vmcb(vmcb);
1324         init_vmcb_after_set_cpuid(vcpu);
1325
1326         vmcb_mark_all_dirty(vmcb);
1327
1328         enable_gif(svm);
1329 }
1330
1331 static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
1332 {
1333         struct vcpu_svm *svm = to_svm(vcpu);
1334
1335         svm_vcpu_init_msrpm(vcpu, svm->msrpm);
1336
1337         svm_init_osvw(vcpu);
1338         vcpu->arch.microcode_version = 0x01000065;
1339         svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;
1340
1341         svm->nmi_masked = false;
1342         svm->awaiting_iret_completion = false;
1343
1344         if (sev_es_guest(vcpu->kvm))
1345                 sev_es_vcpu_reset(svm);
1346 }
1347
1348 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1349 {
1350         struct vcpu_svm *svm = to_svm(vcpu);
1351
1352         svm->spec_ctrl = 0;
1353         svm->virt_spec_ctrl = 0;
1354
1355         init_vmcb(vcpu);
1356
1357         if (!init_event)
1358                 __svm_vcpu_reset(vcpu);
1359 }
1360
1361 void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
1362 {
1363         svm->current_vmcb = target_vmcb;
1364         svm->vmcb = target_vmcb->ptr;
1365 }
1366
1367 static int svm_vcpu_create(struct kvm_vcpu *vcpu)
1368 {
1369         struct vcpu_svm *svm;
1370         struct page *vmcb01_page;
1371         struct page *vmsa_page = NULL;
1372         int err;
1373
1374         BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
1375         svm = to_svm(vcpu);
1376
1377         err = -ENOMEM;
1378         vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1379         if (!vmcb01_page)
1380                 goto out;
1381
1382         if (sev_es_guest(vcpu->kvm)) {
1383                 /*
1384                  * SEV-ES guests require a separate VMSA page used to contain
1385                  * the encrypted register state of the guest.
1386                  */
1387                 vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1388                 if (!vmsa_page)
1389                         goto error_free_vmcb_page;
1390
1391                 /*
1392                  * SEV-ES guests maintain an encrypted version of their FPU
1393                  * state which is restored and saved on VMRUN and VMEXIT.
1394                  * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
1395                  * do xsave/xrstor on it.
1396                  */
1397                 fpstate_set_confidential(&vcpu->arch.guest_fpu);
1398         }
1399
1400         err = avic_init_vcpu(svm);
1401         if (err)
1402                 goto error_free_vmsa_page;
1403
1404         svm->msrpm = svm_vcpu_alloc_msrpm();
1405         if (!svm->msrpm) {
1406                 err = -ENOMEM;
1407                 goto error_free_vmsa_page;
1408         }
1409
1410         svm->x2avic_msrs_intercepted = true;
1411
1412         svm->vmcb01.ptr = page_address(vmcb01_page);
1413         svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
1414         svm_switch_vmcb(svm, &svm->vmcb01);
1415
1416         if (vmsa_page)
1417                 svm->sev_es.vmsa = page_address(vmsa_page);
1418
1419         svm->guest_state_loaded = false;
1420
1421         return 0;
1422
1423 error_free_vmsa_page:
1424         if (vmsa_page)
1425                 __free_page(vmsa_page);
1426 error_free_vmcb_page:
1427         __free_page(vmcb01_page);
1428 out:
1429         return err;
1430 }
1431
1432 static void svm_clear_current_vmcb(struct vmcb *vmcb)
1433 {
1434         int i;
1435
1436         for_each_online_cpu(i)
1437                 cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL);
1438 }
1439
1440 static void svm_vcpu_free(struct kvm_vcpu *vcpu)
1441 {
1442         struct vcpu_svm *svm = to_svm(vcpu);
1443
1444         /*
1445          * The vmcb page can be recycled, causing a false negative in
1446          * svm_vcpu_load(). So, ensure that no logical CPU has this
1447          * vmcb page recorded as its current vmcb.
1448          */
1449         svm_clear_current_vmcb(svm->vmcb);
1450
1451         svm_leave_nested(vcpu);
1452         svm_free_nested(svm);
1453
1454         sev_free_vcpu(vcpu);
1455
1456         __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
1457         __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
1458 }
1459
1460 static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1461 {
1462         struct vcpu_svm *svm = to_svm(vcpu);
1463         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
1464
1465         if (sev_es_guest(vcpu->kvm))
1466                 sev_es_unmap_ghcb(svm);
1467
1468         if (svm->guest_state_loaded)
1469                 return;
1470
1471         /*
1472          * Save additional host state that will be restored on VMEXIT (sev-es)
1473          * or subsequent vmload of host save area.
1474          */
1475         vmsave(sd->save_area_pa);
1476         if (sev_es_guest(vcpu->kvm)) {
1477                 struct sev_es_save_area *hostsa;
1478                 hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
1479
1480                 sev_es_prepare_switch_to_guest(hostsa);
1481         }
1482
1483         if (tsc_scaling)
1484                 __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
1485
1486         if (likely(tsc_aux_uret_slot >= 0))
1487                 kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
1488
1489         svm->guest_state_loaded = true;
1490 }
1491
1492 static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
1493 {
1494         to_svm(vcpu)->guest_state_loaded = false;
1495 }
1496
1497 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1498 {
1499         struct vcpu_svm *svm = to_svm(vcpu);
1500         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
1501
1502         if (sd->current_vmcb != svm->vmcb) {
1503                 sd->current_vmcb = svm->vmcb;
1504                 indirect_branch_prediction_barrier();
1505         }
1506         if (kvm_vcpu_apicv_active(vcpu))
1507                 avic_vcpu_load(vcpu, cpu);
1508 }
1509
1510 static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1511 {
1512         if (kvm_vcpu_apicv_active(vcpu))
1513                 avic_vcpu_put(vcpu);
1514
1515         svm_prepare_host_switch(vcpu);
1516
1517         ++vcpu->stat.host_state_reload;
1518 }
1519
1520 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1521 {
1522         struct vcpu_svm *svm = to_svm(vcpu);
1523         unsigned long rflags = svm->vmcb->save.rflags;
1524
1525         if (svm->nmi_singlestep) {
1526                 /* Hide our flags if they were not set by the guest */
1527                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1528                         rflags &= ~X86_EFLAGS_TF;
1529                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1530                         rflags &= ~X86_EFLAGS_RF;
1531         }
1532         return rflags;
1533 }
1534
1535 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1536 {
1537         if (to_svm(vcpu)->nmi_singlestep)
1538                 rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
1539
1540        /*
1541         * Any change of EFLAGS.VM is accompanied by a reload of SS
1542         * (caused by either a task switch or an inter-privilege IRET),
1543         * so we do not need to update the CPL here.
1544         */
1545         to_svm(vcpu)->vmcb->save.rflags = rflags;
1546 }
1547
1548 static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
1549 {
1550         struct vmcb *vmcb = to_svm(vcpu)->vmcb;
1551
1552         return sev_es_guest(vcpu->kvm)
1553                 ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
1554                 : kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
1555 }
1556
1557 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1558 {
1559         kvm_register_mark_available(vcpu, reg);
1560
1561         switch (reg) {
1562         case VCPU_EXREG_PDPTR:
1563                 /*
1564                  * When !npt_enabled, mmu->pdptrs[] is already available since
1565                  * it is always updated per SDM when moving to CRs.
1566                  */
1567                 if (npt_enabled)
1568                         load_pdptrs(vcpu, kvm_read_cr3(vcpu));
1569                 break;
1570         default:
1571                 KVM_BUG_ON(1, vcpu->kvm);
1572         }
1573 }
1574
1575 static void svm_set_vintr(struct vcpu_svm *svm)
1576 {
1577         struct vmcb_control_area *control;
1578
1579         /*
1580          * The following fields are ignored when AVIC is enabled
1581          */
1582         WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));
1583
1584         svm_set_intercept(svm, INTERCEPT_VINTR);
1585
1586         /*
1587          * This is just a dummy VINTR to actually cause a vmexit to happen.
1588          * Actual injection of virtual interrupts happens through EVENTINJ.
1589          */
1590         control = &svm->vmcb->control;
1591         control->int_vector = 0x0;
1592         control->int_ctl &= ~V_INTR_PRIO_MASK;
1593         control->int_ctl |= V_IRQ_MASK |
1594                 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
1595         vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1596 }
1597
1598 static void svm_clear_vintr(struct vcpu_svm *svm)
1599 {
1600         svm_clr_intercept(svm, INTERCEPT_VINTR);
1601
1602         /* Drop int_ctl fields related to VINTR injection.  */
1603         svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1604         if (is_guest_mode(&svm->vcpu)) {
1605                 svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1606
1607                 WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
1608                         (svm->nested.ctl.int_ctl & V_TPR_MASK));
1609
1610                 svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
1611                         V_IRQ_INJECTION_BITS_MASK;
1612
1613                 svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
1614         }
1615
1616         vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1617 }
1618
1619 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1620 {
1621         struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1622         struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
1623
1624         switch (seg) {
1625         case VCPU_SREG_CS: return &save->cs;
1626         case VCPU_SREG_DS: return &save->ds;
1627         case VCPU_SREG_ES: return &save->es;
1628         case VCPU_SREG_FS: return &save01->fs;
1629         case VCPU_SREG_GS: return &save01->gs;
1630         case VCPU_SREG_SS: return &save->ss;
1631         case VCPU_SREG_TR: return &save01->tr;
1632         case VCPU_SREG_LDTR: return &save01->ldtr;
1633         }
1634         BUG();
1635         return NULL;
1636 }
1637
1638 static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1639 {
1640         struct vmcb_seg *s = svm_seg(vcpu, seg);
1641
1642         return s->base;
1643 }
1644
1645 static void svm_get_segment(struct kvm_vcpu *vcpu,
1646                             struct kvm_segment *var, int seg)
1647 {
1648         struct vmcb_seg *s = svm_seg(vcpu, seg);
1649
1650         var->base = s->base;
1651         var->limit = s->limit;
1652         var->selector = s->selector;
1653         var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1654         var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
1655         var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1656         var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
1657         var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
1658         var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
1659         var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
1660
1661         /*
1662          * AMD CPUs circa 2014 track the G bit for all segments except CS.
1663          * However, the SVM spec states that the G bit is not observed by the
1664          * CPU, and some VMware virtual CPUs drop the G bit for all segments.
1665          * So let's synthesize a legal G bit for all segments, this helps
1666          * running KVM nested. It also helps cross-vendor migration, because
1667          * Intel's vmentry has a check on the 'G' bit.
1668          */
1669         var->g = s->limit > 0xfffff;
1670
1671         /*
1672          * AMD's VMCB does not have an explicit unusable field, so emulate it
1673          * for cross vendor migration purposes by "not present"
1674          */
1675         var->unusable = !var->present;
1676
1677         switch (seg) {
1678         case VCPU_SREG_TR:
1679                 /*
1680                  * Work around a bug where the busy flag in the tr selector
1681                  * isn't exposed
1682                  */
1683                 var->type |= 0x2;
1684                 break;
1685         case VCPU_SREG_DS:
1686         case VCPU_SREG_ES:
1687         case VCPU_SREG_FS:
1688         case VCPU_SREG_GS:
1689                 /*
1690                  * The accessed bit must always be set in the segment
1691                  * descriptor cache, although it can be cleared in the
1692                  * descriptor, the cached bit always remains at 1. Since
1693                  * Intel has a check on this, set it here to support
1694                  * cross-vendor migration.
1695                  */
1696                 if (!var->unusable)
1697                         var->type |= 0x1;
1698                 break;
1699         case VCPU_SREG_SS:
1700                 /*
1701                  * On AMD CPUs sometimes the DB bit in the segment
1702                  * descriptor is left as 1, although the whole segment has
1703                  * been made unusable. Clear it here to pass an Intel VMX
1704                  * entry check when cross vendor migrating.
1705                  */
1706                 if (var->unusable)
1707                         var->db = 0;
1708                 /* This is symmetric with svm_set_segment() */
1709                 var->dpl = to_svm(vcpu)->vmcb->save.cpl;
1710                 break;
1711         }
1712 }
1713
1714 static int svm_get_cpl(struct kvm_vcpu *vcpu)
1715 {
1716         struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1717
1718         return save->cpl;
1719 }
1720
1721 static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1722 {
1723         struct kvm_segment cs;
1724
1725         svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
1726         *db = cs.db;
1727         *l = cs.l;
1728 }
1729
1730 static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1731 {
1732         struct vcpu_svm *svm = to_svm(vcpu);
1733
1734         dt->size = svm->vmcb->save.idtr.limit;
1735         dt->address = svm->vmcb->save.idtr.base;
1736 }
1737
1738 static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1739 {
1740         struct vcpu_svm *svm = to_svm(vcpu);
1741
1742         svm->vmcb->save.idtr.limit = dt->size;
1743         svm->vmcb->save.idtr.base = dt->address ;
1744         vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1745 }
1746
1747 static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1748 {
1749         struct vcpu_svm *svm = to_svm(vcpu);
1750
1751         dt->size = svm->vmcb->save.gdtr.limit;
1752         dt->address = svm->vmcb->save.gdtr.base;
1753 }
1754
1755 static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1756 {
1757         struct vcpu_svm *svm = to_svm(vcpu);
1758
1759         svm->vmcb->save.gdtr.limit = dt->size;
1760         svm->vmcb->save.gdtr.base = dt->address ;
1761         vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1762 }
1763
1764 static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1765 {
1766         struct vcpu_svm *svm = to_svm(vcpu);
1767
1768         /*
1769          * For guests that don't set guest_state_protected, the cr3 update is
1770          * handled via kvm_mmu_load() while entering the guest. For guests
1771          * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
1772          * VMCB save area now, since the save area will become the initial
1773          * contents of the VMSA, and future VMCB save area updates won't be
1774          * seen.
1775          */
1776         if (sev_es_guest(vcpu->kvm)) {
1777                 svm->vmcb->save.cr3 = cr3;
1778                 vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1779         }
1780 }
1781
1782 void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1783 {
1784         struct vcpu_svm *svm = to_svm(vcpu);
1785         u64 hcr0 = cr0;
1786         bool old_paging = is_paging(vcpu);
1787
1788 #ifdef CONFIG_X86_64
1789         if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
1790                 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
1791                         vcpu->arch.efer |= EFER_LMA;
1792                         svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
1793                 }
1794
1795                 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
1796                         vcpu->arch.efer &= ~EFER_LMA;
1797                         svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
1798                 }
1799         }
1800 #endif
1801         vcpu->arch.cr0 = cr0;
1802
1803         if (!npt_enabled) {
1804                 hcr0 |= X86_CR0_PG | X86_CR0_WP;
1805                 if (old_paging != is_paging(vcpu))
1806                         svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
1807         }
1808
1809         /*
1810          * re-enable caching here because the QEMU bios
1811          * does not do it - this results in some delay at
1812          * reboot
1813          */
1814         if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
1815                 hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1816
1817         svm->vmcb->save.cr0 = hcr0;
1818         vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1819
1820         /*
1821          * SEV-ES guests must always keep the CR intercepts cleared. CR
1822          * tracking is done using the CR write traps.
1823          */
1824         if (sev_es_guest(vcpu->kvm))
1825                 return;
1826
1827         if (hcr0 == cr0) {
1828                 /* Selective CR0 write remains on.  */
1829                 svm_clr_intercept(svm, INTERCEPT_CR0_READ);
1830                 svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
1831         } else {
1832                 svm_set_intercept(svm, INTERCEPT_CR0_READ);
1833                 svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1834         }
1835 }
1836
1837 static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1838 {
1839         return true;
1840 }
1841
1842 void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1843 {
1844         unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
1845         unsigned long old_cr4 = vcpu->arch.cr4;
1846
1847         if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1848                 svm_flush_tlb_current(vcpu);
1849
1850         vcpu->arch.cr4 = cr4;
1851         if (!npt_enabled) {
1852                 cr4 |= X86_CR4_PAE;
1853
1854                 if (!is_paging(vcpu))
1855                         cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
1856         }
1857         cr4 |= host_cr4_mce;
1858         to_svm(vcpu)->vmcb->save.cr4 = cr4;
1859         vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1860
1861         if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
1862                 kvm_update_cpuid_runtime(vcpu);
1863 }
1864
1865 static void svm_set_segment(struct kvm_vcpu *vcpu,
1866                             struct kvm_segment *var, int seg)
1867 {
1868         struct vcpu_svm *svm = to_svm(vcpu);
1869         struct vmcb_seg *s = svm_seg(vcpu, seg);
1870
1871         s->base = var->base;
1872         s->limit = var->limit;
1873         s->selector = var->selector;
1874         s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
1875         s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
1876         s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
1877         s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
1878         s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
1879         s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
1880         s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
1881         s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
1882
1883         /*
1884          * This is always accurate, except if SYSRET returned to a segment
1885          * with SS.DPL != 3.  Intel does not have this quirk, and always
1886          * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
1887          * would entail passing the CPL to userspace and back.
1888          */
1889         if (seg == VCPU_SREG_SS)
1890                 /* This is symmetric with svm_get_segment() */
1891                 svm->vmcb->save.cpl = (var->dpl & 3);
1892
1893         vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
1894 }
1895
1896 static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
1897 {
1898         struct vcpu_svm *svm = to_svm(vcpu);
1899
1900         clr_exception_intercept(svm, BP_VECTOR);
1901
1902         if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1903                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1904                         set_exception_intercept(svm, BP_VECTOR);
1905         }
1906 }
1907
1908 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1909 {
1910         if (sd->next_asid > sd->max_asid) {
1911                 ++sd->asid_generation;
1912                 sd->next_asid = sd->min_asid;
1913                 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1914                 vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
1915         }
1916
1917         svm->current_vmcb->asid_generation = sd->asid_generation;
1918         svm->asid = sd->next_asid++;
1919 }
1920
1921 static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
1922 {
1923         struct vmcb *vmcb = svm->vmcb;
1924
1925         if (svm->vcpu.arch.guest_state_protected)
1926                 return;
1927
1928         if (unlikely(value != vmcb->save.dr6)) {
1929                 vmcb->save.dr6 = value;
1930                 vmcb_mark_dirty(vmcb, VMCB_DR);
1931         }
1932 }
1933
1934 static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
1935 {
1936         struct vcpu_svm *svm = to_svm(vcpu);
1937
1938         if (vcpu->arch.guest_state_protected)
1939                 return;
1940
1941         get_debugreg(vcpu->arch.db[0], 0);
1942         get_debugreg(vcpu->arch.db[1], 1);
1943         get_debugreg(vcpu->arch.db[2], 2);
1944         get_debugreg(vcpu->arch.db[3], 3);
1945         /*
1946          * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
1947          * because db_interception might need it.  We can do it before vmentry.
1948          */
1949         vcpu->arch.dr6 = svm->vmcb->save.dr6;
1950         vcpu->arch.dr7 = svm->vmcb->save.dr7;
1951         vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
1952         set_dr_intercepts(svm);
1953 }
1954
1955 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1956 {
1957         struct vcpu_svm *svm = to_svm(vcpu);
1958
1959         if (vcpu->arch.guest_state_protected)
1960                 return;
1961
1962         svm->vmcb->save.dr7 = value;
1963         vmcb_mark_dirty(svm->vmcb, VMCB_DR);
1964 }
1965
1966 static int pf_interception(struct kvm_vcpu *vcpu)
1967 {
1968         struct vcpu_svm *svm = to_svm(vcpu);
1969
1970         u64 fault_address = svm->vmcb->control.exit_info_2;
1971         u64 error_code = svm->vmcb->control.exit_info_1;
1972
1973         return kvm_handle_page_fault(vcpu, error_code, fault_address,
1974                         static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
1975                         svm->vmcb->control.insn_bytes : NULL,
1976                         svm->vmcb->control.insn_len);
1977 }
1978
1979 static int npf_interception(struct kvm_vcpu *vcpu)
1980 {
1981         struct vcpu_svm *svm = to_svm(vcpu);
1982
1983         u64 fault_address = svm->vmcb->control.exit_info_2;
1984         u64 error_code = svm->vmcb->control.exit_info_1;
1985
1986         trace_kvm_page_fault(vcpu, fault_address, error_code);
1987         return kvm_mmu_page_fault(vcpu, fault_address, error_code,
1988                         static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
1989                         svm->vmcb->control.insn_bytes : NULL,
1990                         svm->vmcb->control.insn_len);
1991 }
1992
1993 static int db_interception(struct kvm_vcpu *vcpu)
1994 {
1995         struct kvm_run *kvm_run = vcpu->run;
1996         struct vcpu_svm *svm = to_svm(vcpu);
1997
1998         if (!(vcpu->guest_debug &
1999               (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
2000                 !svm->nmi_singlestep) {
2001                 u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
2002                 kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
2003                 return 1;
2004         }
2005
2006         if (svm->nmi_singlestep) {
2007                 disable_nmi_singlestep(svm);
2008                 /* Make sure we check for pending NMIs upon entry */
2009                 kvm_make_request(KVM_REQ_EVENT, vcpu);
2010         }
2011
2012         if (vcpu->guest_debug &
2013             (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
2014                 kvm_run->exit_reason = KVM_EXIT_DEBUG;
2015                 kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
2016                 kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
2017                 kvm_run->debug.arch.pc =
2018                         svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2019                 kvm_run->debug.arch.exception = DB_VECTOR;
2020                 return 0;
2021         }
2022
2023         return 1;
2024 }
2025
2026 static int bp_interception(struct kvm_vcpu *vcpu)
2027 {
2028         struct vcpu_svm *svm = to_svm(vcpu);
2029         struct kvm_run *kvm_run = vcpu->run;
2030
2031         kvm_run->exit_reason = KVM_EXIT_DEBUG;
2032         kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2033         kvm_run->debug.arch.exception = BP_VECTOR;
2034         return 0;
2035 }
2036
2037 static int ud_interception(struct kvm_vcpu *vcpu)
2038 {
2039         return handle_ud(vcpu);
2040 }
2041
2042 static int ac_interception(struct kvm_vcpu *vcpu)
2043 {
2044         kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
2045         return 1;
2046 }
2047
2048 static bool is_erratum_383(void)
2049 {
2050         int err, i;
2051         u64 value;
2052
2053         if (!erratum_383_found)
2054                 return false;
2055
2056         value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
2057         if (err)
2058                 return false;
2059
2060         /* Bit 62 may or may not be set for this mce */
2061         value &= ~(1ULL << 62);
2062
2063         if (value != 0xb600000000010015ULL)
2064                 return false;
2065
2066         /* Clear MCi_STATUS registers */
2067         for (i = 0; i < 6; ++i)
2068                 native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
2069
2070         value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
2071         if (!err) {
2072                 u32 low, high;
2073
2074                 value &= ~(1ULL << 2);
2075                 low    = lower_32_bits(value);
2076                 high   = upper_32_bits(value);
2077
2078                 native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
2079         }
2080
2081         /* Flush tlb to evict multi-match entries */
2082         __flush_tlb_all();
2083
2084         return true;
2085 }
2086
2087 static void svm_handle_mce(struct kvm_vcpu *vcpu)
2088 {
2089         if (is_erratum_383()) {
2090                 /*
2091                  * Erratum 383 triggered. Guest state is corrupt so kill the
2092                  * guest.
2093                  */
2094                 pr_err("Guest triggered AMD Erratum 383\n");
2095
2096                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2097
2098                 return;
2099         }
2100
2101         /*
2102          * On an #MC intercept the MCE handler is not called automatically in
2103          * the host. So do it by hand here.
2104          */
2105         kvm_machine_check();
2106 }
2107
2108 static int mc_interception(struct kvm_vcpu *vcpu)
2109 {
2110         return 1;
2111 }
2112
2113 static int shutdown_interception(struct kvm_vcpu *vcpu)
2114 {
2115         struct kvm_run *kvm_run = vcpu->run;
2116         struct vcpu_svm *svm = to_svm(vcpu);
2117
2118         /*
2119          * The VM save area has already been encrypted so it
2120          * cannot be reinitialized - just terminate.
2121          */
2122         if (sev_es_guest(vcpu->kvm))
2123                 return -EINVAL;
2124
2125         /*
2126          * VMCB is undefined after a SHUTDOWN intercept.  INIT the vCPU to put
2127          * the VMCB in a known good state.  Unfortuately, KVM doesn't have
2128          * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
2129          * userspace.  At a platform view, INIT is acceptable behavior as
2130          * there exist bare metal platforms that automatically INIT the CPU
2131          * in response to shutdown.
2132          */
2133         clear_page(svm->vmcb);
2134         kvm_vcpu_reset(vcpu, true);
2135
2136         kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2137         return 0;
2138 }
2139
2140 static int io_interception(struct kvm_vcpu *vcpu)
2141 {
2142         struct vcpu_svm *svm = to_svm(vcpu);
2143         u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
2144         int size, in, string;
2145         unsigned port;
2146
2147         ++vcpu->stat.io_exits;
2148         string = (io_info & SVM_IOIO_STR_MASK) != 0;
2149         in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
2150         port = io_info >> 16;
2151         size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
2152
2153         if (string) {
2154                 if (sev_es_guest(vcpu->kvm))
2155                         return sev_es_string_io(svm, size, port, in);
2156                 else
2157                         return kvm_emulate_instruction(vcpu, 0);
2158         }
2159
2160         svm->next_rip = svm->vmcb->control.exit_info_2;
2161
2162         return kvm_fast_pio(vcpu, size, port, in);
2163 }
2164
2165 static int nmi_interception(struct kvm_vcpu *vcpu)
2166 {
2167         return 1;
2168 }
2169
2170 static int smi_interception(struct kvm_vcpu *vcpu)
2171 {
2172         return 1;
2173 }
2174
2175 static int intr_interception(struct kvm_vcpu *vcpu)
2176 {
2177         ++vcpu->stat.irq_exits;
2178         return 1;
2179 }
2180
2181 static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
2182 {
2183         struct vcpu_svm *svm = to_svm(vcpu);
2184         struct vmcb *vmcb12;
2185         struct kvm_host_map map;
2186         int ret;
2187
2188         if (nested_svm_check_permissions(vcpu))
2189                 return 1;
2190
2191         ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
2192         if (ret) {
2193                 if (ret == -EINVAL)
2194                         kvm_inject_gp(vcpu, 0);
2195                 return 1;
2196         }
2197
2198         vmcb12 = map.hva;
2199
2200         ret = kvm_skip_emulated_instruction(vcpu);
2201
2202         if (vmload) {
2203                 svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
2204                 svm->sysenter_eip_hi = 0;
2205                 svm->sysenter_esp_hi = 0;
2206         } else {
2207                 svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
2208         }
2209
2210         kvm_vcpu_unmap(vcpu, &map, true);
2211
2212         return ret;
2213 }
2214
2215 static int vmload_interception(struct kvm_vcpu *vcpu)
2216 {
2217         return vmload_vmsave_interception(vcpu, true);
2218 }
2219
2220 static int vmsave_interception(struct kvm_vcpu *vcpu)
2221 {
2222         return vmload_vmsave_interception(vcpu, false);
2223 }
2224
2225 static int vmrun_interception(struct kvm_vcpu *vcpu)
2226 {
2227         if (nested_svm_check_permissions(vcpu))
2228                 return 1;
2229
2230         return nested_svm_vmrun(vcpu);
2231 }
2232
2233 enum {
2234         NONE_SVM_INSTR,
2235         SVM_INSTR_VMRUN,
2236         SVM_INSTR_VMLOAD,
2237         SVM_INSTR_VMSAVE,
2238 };
2239
2240 /* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
2241 static int svm_instr_opcode(struct kvm_vcpu *vcpu)
2242 {
2243         struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
2244
2245         if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
2246                 return NONE_SVM_INSTR;
2247
2248         switch (ctxt->modrm) {
2249         case 0xd8: /* VMRUN */
2250                 return SVM_INSTR_VMRUN;
2251         case 0xda: /* VMLOAD */
2252                 return SVM_INSTR_VMLOAD;
2253         case 0xdb: /* VMSAVE */
2254                 return SVM_INSTR_VMSAVE;
2255         default:
2256                 break;
2257         }
2258
2259         return NONE_SVM_INSTR;
2260 }
2261
2262 static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
2263 {
2264         const int guest_mode_exit_codes[] = {
2265                 [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
2266                 [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
2267                 [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
2268         };
2269         int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
2270                 [SVM_INSTR_VMRUN] = vmrun_interception,
2271                 [SVM_INSTR_VMLOAD] = vmload_interception,
2272                 [SVM_INSTR_VMSAVE] = vmsave_interception,
2273         };
2274         struct vcpu_svm *svm = to_svm(vcpu);
2275         int ret;
2276
2277         if (is_guest_mode(vcpu)) {
2278                 /* Returns '1' or -errno on failure, '0' on success. */
2279                 ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
2280                 if (ret)
2281                         return ret;
2282                 return 1;
2283         }
2284         return svm_instr_handlers[opcode](vcpu);
2285 }
2286
2287 /*
2288  * #GP handling code. Note that #GP can be triggered under the following two
2289  * cases:
2290  *   1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
2291  *      some AMD CPUs when EAX of these instructions are in the reserved memory
2292  *      regions (e.g. SMM memory on host).
2293  *   2) VMware backdoor
2294  */
2295 static int gp_interception(struct kvm_vcpu *vcpu)
2296 {
2297         struct vcpu_svm *svm = to_svm(vcpu);
2298         u32 error_code = svm->vmcb->control.exit_info_1;
2299         int opcode;
2300
2301         /* Both #GP cases have zero error_code */
2302         if (error_code)
2303                 goto reinject;
2304
2305         /* Decode the instruction for usage later */
2306         if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
2307                 goto reinject;
2308
2309         opcode = svm_instr_opcode(vcpu);
2310
2311         if (opcode == NONE_SVM_INSTR) {
2312                 if (!enable_vmware_backdoor)
2313                         goto reinject;
2314
2315                 /*
2316                  * VMware backdoor emulation on #GP interception only handles
2317                  * IN{S}, OUT{S}, and RDPMC.
2318                  */
2319                 if (!is_guest_mode(vcpu))
2320                         return kvm_emulate_instruction(vcpu,
2321                                 EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
2322         } else {
2323                 /* All SVM instructions expect page aligned RAX */
2324                 if (svm->vmcb->save.rax & ~PAGE_MASK)
2325                         goto reinject;
2326
2327                 return emulate_svm_instr(vcpu, opcode);
2328         }
2329
2330 reinject:
2331         kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
2332         return 1;
2333 }
2334
2335 void svm_set_gif(struct vcpu_svm *svm, bool value)
2336 {
2337         if (value) {
2338                 /*
2339                  * If VGIF is enabled, the STGI intercept is only added to
2340                  * detect the opening of the SMI/NMI window; remove it now.
2341                  * Likewise, clear the VINTR intercept, we will set it
2342                  * again while processing KVM_REQ_EVENT if needed.
2343                  */
2344                 if (vgif)
2345                         svm_clr_intercept(svm, INTERCEPT_STGI);
2346                 if (svm_is_intercept(svm, INTERCEPT_VINTR))
2347                         svm_clear_vintr(svm);
2348
2349                 enable_gif(svm);
2350                 if (svm->vcpu.arch.smi_pending ||
2351                     svm->vcpu.arch.nmi_pending ||
2352                     kvm_cpu_has_injectable_intr(&svm->vcpu) ||
2353                     kvm_apic_has_pending_init_or_sipi(&svm->vcpu))
2354                         kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2355         } else {
2356                 disable_gif(svm);
2357
2358                 /*
2359                  * After a CLGI no interrupts should come.  But if vGIF is
2360                  * in use, we still rely on the VINTR intercept (rather than
2361                  * STGI) to detect an open interrupt window.
2362                 */
2363                 if (!vgif)
2364                         svm_clear_vintr(svm);
2365         }
2366 }
2367
2368 static int stgi_interception(struct kvm_vcpu *vcpu)
2369 {
2370         int ret;
2371
2372         if (nested_svm_check_permissions(vcpu))
2373                 return 1;
2374
2375         ret = kvm_skip_emulated_instruction(vcpu);
2376         svm_set_gif(to_svm(vcpu), true);
2377         return ret;
2378 }
2379
2380 static int clgi_interception(struct kvm_vcpu *vcpu)
2381 {
2382         int ret;
2383
2384         if (nested_svm_check_permissions(vcpu))
2385                 return 1;
2386
2387         ret = kvm_skip_emulated_instruction(vcpu);
2388         svm_set_gif(to_svm(vcpu), false);
2389         return ret;
2390 }
2391
2392 static int invlpga_interception(struct kvm_vcpu *vcpu)
2393 {
2394         gva_t gva = kvm_rax_read(vcpu);
2395         u32 asid = kvm_rcx_read(vcpu);
2396
2397         /* FIXME: Handle an address size prefix. */
2398         if (!is_long_mode(vcpu))
2399                 gva = (u32)gva;
2400
2401         trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);
2402
2403         /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
2404         kvm_mmu_invlpg(vcpu, gva);
2405
2406         return kvm_skip_emulated_instruction(vcpu);
2407 }
2408
2409 static int skinit_interception(struct kvm_vcpu *vcpu)
2410 {
2411         trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
2412
2413         kvm_queue_exception(vcpu, UD_VECTOR);
2414         return 1;
2415 }
2416
2417 static int task_switch_interception(struct kvm_vcpu *vcpu)
2418 {
2419         struct vcpu_svm *svm = to_svm(vcpu);
2420         u16 tss_selector;
2421         int reason;
2422         int int_type = svm->vmcb->control.exit_int_info &
2423                 SVM_EXITINTINFO_TYPE_MASK;
2424         int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
2425         uint32_t type =
2426                 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2427         uint32_t idt_v =
2428                 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
2429         bool has_error_code = false;
2430         u32 error_code = 0;
2431
2432         tss_selector = (u16)svm->vmcb->control.exit_info_1;
2433
2434         if (svm->vmcb->control.exit_info_2 &
2435             (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
2436                 reason = TASK_SWITCH_IRET;
2437         else if (svm->vmcb->control.exit_info_2 &
2438                  (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
2439                 reason = TASK_SWITCH_JMP;
2440         else if (idt_v)
2441                 reason = TASK_SWITCH_GATE;
2442         else
2443                 reason = TASK_SWITCH_CALL;
2444
2445         if (reason == TASK_SWITCH_GATE) {
2446                 switch (type) {
2447                 case SVM_EXITINTINFO_TYPE_NMI:
2448                         vcpu->arch.nmi_injected = false;
2449                         break;
2450                 case SVM_EXITINTINFO_TYPE_EXEPT:
2451                         if (svm->vmcb->control.exit_info_2 &
2452                             (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2453                                 has_error_code = true;
2454                                 error_code =
2455                                         (u32)svm->vmcb->control.exit_info_2;
2456                         }
2457                         kvm_clear_exception_queue(vcpu);
2458                         break;
2459                 case SVM_EXITINTINFO_TYPE_INTR:
2460                 case SVM_EXITINTINFO_TYPE_SOFT:
2461                         kvm_clear_interrupt_queue(vcpu);
2462                         break;
2463                 default:
2464                         break;
2465                 }
2466         }
2467
2468         if (reason != TASK_SWITCH_GATE ||
2469             int_type == SVM_EXITINTINFO_TYPE_SOFT ||
2470             (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
2471              (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
2472                 if (!svm_skip_emulated_instruction(vcpu))
2473                         return 0;
2474         }
2475
2476         if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
2477                 int_vec = -1;
2478
2479         return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
2480                                has_error_code, error_code);
2481 }
2482
2483 static int iret_interception(struct kvm_vcpu *vcpu)
2484 {
2485         struct vcpu_svm *svm = to_svm(vcpu);
2486
2487         ++vcpu->stat.nmi_window_exits;
2488         svm->awaiting_iret_completion = true;
2489         if (!sev_es_guest(vcpu->kvm)) {
2490                 svm_clr_intercept(svm, INTERCEPT_IRET);
2491                 svm->nmi_iret_rip = kvm_rip_read(vcpu);
2492         }
2493         kvm_make_request(KVM_REQ_EVENT, vcpu);
2494         return 1;
2495 }
2496
2497 static int invlpg_interception(struct kvm_vcpu *vcpu)
2498 {
2499         if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2500                 return kvm_emulate_instruction(vcpu, 0);
2501
2502         kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
2503         return kvm_skip_emulated_instruction(vcpu);
2504 }
2505
2506 static int emulate_on_interception(struct kvm_vcpu *vcpu)
2507 {
2508         return kvm_emulate_instruction(vcpu, 0);
2509 }
2510
2511 static int rsm_interception(struct kvm_vcpu *vcpu)
2512 {
2513         return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
2514 }
2515
2516 static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
2517                                             unsigned long val)
2518 {
2519         struct vcpu_svm *svm = to_svm(vcpu);
2520         unsigned long cr0 = vcpu->arch.cr0;
2521         bool ret = false;
2522
2523         if (!is_guest_mode(vcpu) ||
2524             (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
2525                 return false;
2526
2527         cr0 &= ~SVM_CR0_SELECTIVE_MASK;
2528         val &= ~SVM_CR0_SELECTIVE_MASK;
2529
2530         if (cr0 ^ val) {
2531                 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
2532                 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
2533         }
2534
2535         return ret;
2536 }
2537
2538 #define CR_VALID (1ULL << 63)
2539
2540 static int cr_interception(struct kvm_vcpu *vcpu)
2541 {
2542         struct vcpu_svm *svm = to_svm(vcpu);
2543         int reg, cr;
2544         unsigned long val;
2545         int err;
2546
2547         if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2548                 return emulate_on_interception(vcpu);
2549
2550         if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
2551                 return emulate_on_interception(vcpu);
2552
2553         reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2554         if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
2555                 cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
2556         else
2557                 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2558
2559         err = 0;
2560         if (cr >= 16) { /* mov to cr */
2561                 cr -= 16;
2562                 val = kvm_register_read(vcpu, reg);
2563                 trace_kvm_cr_write(cr, val);
2564                 switch (cr) {
2565                 case 0:
2566                         if (!check_selective_cr0_intercepted(vcpu, val))
2567                                 err = kvm_set_cr0(vcpu, val);
2568                         else
2569                                 return 1;
2570
2571                         break;
2572                 case 3:
2573                         err = kvm_set_cr3(vcpu, val);
2574                         break;
2575                 case 4:
2576                         err = kvm_set_cr4(vcpu, val);
2577                         break;
2578                 case 8:
2579                         err = kvm_set_cr8(vcpu, val);
2580                         break;
2581                 default:
2582                         WARN(1, "unhandled write to CR%d", cr);
2583                         kvm_queue_exception(vcpu, UD_VECTOR);
2584                         return 1;
2585                 }
2586         } else { /* mov from cr */
2587                 switch (cr) {
2588                 case 0:
2589                         val = kvm_read_cr0(vcpu);
2590                         break;
2591                 case 2:
2592                         val = vcpu->arch.cr2;
2593                         break;
2594                 case 3:
2595                         val = kvm_read_cr3(vcpu);
2596                         break;
2597                 case 4:
2598                         val = kvm_read_cr4(vcpu);
2599                         break;
2600                 case 8:
2601                         val = kvm_get_cr8(vcpu);
2602                         break;
2603                 default:
2604                         WARN(1, "unhandled read from CR%d", cr);
2605                         kvm_queue_exception(vcpu, UD_VECTOR);
2606                         return 1;
2607                 }
2608                 kvm_register_write(vcpu, reg, val);
2609                 trace_kvm_cr_read(cr, val);
2610         }
2611         return kvm_complete_insn_gp(vcpu, err);
2612 }
2613
2614 static int cr_trap(struct kvm_vcpu *vcpu)
2615 {
2616         struct vcpu_svm *svm = to_svm(vcpu);
2617         unsigned long old_value, new_value;
2618         unsigned int cr;
2619         int ret = 0;
2620
2621         new_value = (unsigned long)svm->vmcb->control.exit_info_1;
2622
2623         cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
2624         switch (cr) {
2625         case 0:
2626                 old_value = kvm_read_cr0(vcpu);
2627                 svm_set_cr0(vcpu, new_value);
2628
2629                 kvm_post_set_cr0(vcpu, old_value, new_value);
2630                 break;
2631         case 4:
2632                 old_value = kvm_read_cr4(vcpu);
2633                 svm_set_cr4(vcpu, new_value);
2634
2635                 kvm_post_set_cr4(vcpu, old_value, new_value);
2636                 break;
2637         case 8:
2638                 ret = kvm_set_cr8(vcpu, new_value);
2639                 break;
2640         default:
2641                 WARN(1, "unhandled CR%d write trap", cr);
2642                 kvm_queue_exception(vcpu, UD_VECTOR);
2643                 return 1;
2644         }
2645
2646         return kvm_complete_insn_gp(vcpu, ret);
2647 }
2648
2649 static int dr_interception(struct kvm_vcpu *vcpu)
2650 {
2651         struct vcpu_svm *svm = to_svm(vcpu);
2652         int reg, dr;
2653         unsigned long val;
2654         int err = 0;
2655
2656         if (vcpu->guest_debug == 0) {
2657                 /*
2658                  * No more DR vmexits; force a reload of the debug registers
2659                  * and reenter on this instruction.  The next vmexit will
2660                  * retrieve the full state of the debug registers.
2661                  */
2662                 clr_dr_intercepts(svm);
2663                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
2664                 return 1;
2665         }
2666
2667         if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
2668                 return emulate_on_interception(vcpu);
2669
2670         reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2671         dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
2672         if (dr >= 16) { /* mov to DRn  */
2673                 dr -= 16;
2674                 val = kvm_register_read(vcpu, reg);
2675                 err = kvm_set_dr(vcpu, dr, val);
2676         } else {
2677                 kvm_get_dr(vcpu, dr, &val);
2678                 kvm_register_write(vcpu, reg, val);
2679         }
2680
2681         return kvm_complete_insn_gp(vcpu, err);
2682 }
2683
2684 static int cr8_write_interception(struct kvm_vcpu *vcpu)
2685 {
2686         int r;
2687
2688         u8 cr8_prev = kvm_get_cr8(vcpu);
2689         /* instruction emulation calls kvm_set_cr8() */
2690         r = cr_interception(vcpu);
2691         if (lapic_in_kernel(vcpu))
2692                 return r;
2693         if (cr8_prev <= kvm_get_cr8(vcpu))
2694                 return r;
2695         vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
2696         return 0;
2697 }
2698
2699 static int efer_trap(struct kvm_vcpu *vcpu)
2700 {
2701         struct msr_data msr_info;
2702         int ret;
2703
2704         /*
2705          * Clear the EFER_SVME bit from EFER. The SVM code always sets this
2706          * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
2707          * whether the guest has X86_FEATURE_SVM - this avoids a failure if
2708          * the guest doesn't have X86_FEATURE_SVM.
2709          */
2710         msr_info.host_initiated = false;
2711         msr_info.index = MSR_EFER;
2712         msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
2713         ret = kvm_set_msr_common(vcpu, &msr_info);
2714
2715         return kvm_complete_insn_gp(vcpu, ret);
2716 }
2717
2718 static int svm_get_msr_feature(struct kvm_msr_entry *msr)
2719 {
2720         msr->data = 0;
2721
2722         switch (msr->index) {
2723         case MSR_AMD64_DE_CFG:
2724                 if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
2725                         msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
2726                 break;
2727         default:
2728                 return KVM_MSR_RET_INVALID;
2729         }
2730
2731         return 0;
2732 }
2733
2734 static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2735 {
2736         struct vcpu_svm *svm = to_svm(vcpu);
2737
2738         switch (msr_info->index) {
2739         case MSR_AMD64_TSC_RATIO:
2740                 if (!msr_info->host_initiated && !svm->tsc_scaling_enabled)
2741                         return 1;
2742                 msr_info->data = svm->tsc_ratio_msr;
2743                 break;
2744         case MSR_STAR:
2745                 msr_info->data = svm->vmcb01.ptr->save.star;
2746                 break;
2747 #ifdef CONFIG_X86_64
2748         case MSR_LSTAR:
2749                 msr_info->data = svm->vmcb01.ptr->save.lstar;
2750                 break;
2751         case MSR_CSTAR:
2752                 msr_info->data = svm->vmcb01.ptr->save.cstar;
2753                 break;
2754         case MSR_KERNEL_GS_BASE:
2755                 msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
2756                 break;
2757         case MSR_SYSCALL_MASK:
2758                 msr_info->data = svm->vmcb01.ptr->save.sfmask;
2759                 break;
2760 #endif
2761         case MSR_IA32_SYSENTER_CS:
2762                 msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
2763                 break;
2764         case MSR_IA32_SYSENTER_EIP:
2765                 msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
2766                 if (guest_cpuid_is_intel(vcpu))
2767                         msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
2768                 break;
2769         case MSR_IA32_SYSENTER_ESP:
2770                 msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
2771                 if (guest_cpuid_is_intel(vcpu))
2772                         msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
2773                 break;
2774         case MSR_TSC_AUX:
2775                 msr_info->data = svm->tsc_aux;
2776                 break;
2777         case MSR_IA32_DEBUGCTLMSR:
2778         case MSR_IA32_LASTBRANCHFROMIP:
2779         case MSR_IA32_LASTBRANCHTOIP:
2780         case MSR_IA32_LASTINTFROMIP:
2781         case MSR_IA32_LASTINTTOIP:
2782                 msr_info->data = svm_get_lbr_msr(svm, msr_info->index);
2783                 break;
2784         case MSR_VM_HSAVE_PA:
2785                 msr_info->data = svm->nested.hsave_msr;
2786                 break;
2787         case MSR_VM_CR:
2788                 msr_info->data = svm->nested.vm_cr_msr;
2789                 break;
2790         case MSR_IA32_SPEC_CTRL:
2791                 if (!msr_info->host_initiated &&
2792                     !guest_has_spec_ctrl_msr(vcpu))
2793                         return 1;
2794
2795                 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2796                         msr_info->data = svm->vmcb->save.spec_ctrl;
2797                 else
2798                         msr_info->data = svm->spec_ctrl;
2799                 break;
2800         case MSR_AMD64_VIRT_SPEC_CTRL:
2801                 if (!msr_info->host_initiated &&
2802                     !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2803                         return 1;
2804
2805                 msr_info->data = svm->virt_spec_ctrl;
2806                 break;
2807         case MSR_F15H_IC_CFG: {
2808
2809                 int family, model;
2810
2811                 family = guest_cpuid_family(vcpu);
2812                 model  = guest_cpuid_model(vcpu);
2813
2814                 if (family < 0 || model < 0)
2815                         return kvm_get_msr_common(vcpu, msr_info);
2816
2817                 msr_info->data = 0;
2818
2819                 if (family == 0x15 &&
2820                     (model >= 0x2 && model < 0x20))
2821                         msr_info->data = 0x1E;
2822                 }
2823                 break;
2824         case MSR_AMD64_DE_CFG:
2825                 msr_info->data = svm->msr_decfg;
2826                 break;
2827         default:
2828                 return kvm_get_msr_common(vcpu, msr_info);
2829         }
2830         return 0;
2831 }
2832
2833 static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
2834 {
2835         struct vcpu_svm *svm = to_svm(vcpu);
2836         if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
2837                 return kvm_complete_insn_gp(vcpu, err);
2838
2839         ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1);
2840         ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
2841                                 X86_TRAP_GP |
2842                                 SVM_EVTINJ_TYPE_EXEPT |
2843                                 SVM_EVTINJ_VALID);
2844         return 1;
2845 }
2846
2847 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
2848 {
2849         struct vcpu_svm *svm = to_svm(vcpu);
2850         int svm_dis, chg_mask;
2851
2852         if (data & ~SVM_VM_CR_VALID_MASK)
2853                 return 1;
2854
2855         chg_mask = SVM_VM_CR_VALID_MASK;
2856
2857         if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
2858                 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
2859
2860         svm->nested.vm_cr_msr &= ~chg_mask;
2861         svm->nested.vm_cr_msr |= (data & chg_mask);
2862
2863         svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
2864
2865         /* check for svm_disable while efer.svme is set */
2866         if (svm_dis && (vcpu->arch.efer & EFER_SVME))
2867                 return 1;
2868
2869         return 0;
2870 }
2871
2872 static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2873 {
2874         struct vcpu_svm *svm = to_svm(vcpu);
2875         int r;
2876
2877         u32 ecx = msr->index;
2878         u64 data = msr->data;
2879         switch (ecx) {
2880         case MSR_AMD64_TSC_RATIO:
2881
2882                 if (!svm->tsc_scaling_enabled) {
2883
2884                         if (!msr->host_initiated)
2885                                 return 1;
2886                         /*
2887                          * In case TSC scaling is not enabled, always
2888                          * leave this MSR at the default value.
2889                          *
2890                          * Due to bug in qemu 6.2.0, it would try to set
2891                          * this msr to 0 if tsc scaling is not enabled.
2892                          * Ignore this value as well.
2893                          */
2894                         if (data != 0 && data != svm->tsc_ratio_msr)
2895                                 return 1;
2896                         break;
2897                 }
2898
2899                 if (data & SVM_TSC_RATIO_RSVD)
2900                         return 1;
2901
2902                 svm->tsc_ratio_msr = data;
2903
2904                 if (svm->tsc_scaling_enabled && is_guest_mode(vcpu))
2905                         nested_svm_update_tsc_ratio_msr(vcpu);
2906
2907                 break;
2908         case MSR_IA32_CR_PAT:
2909                 if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
2910                         return 1;
2911                 vcpu->arch.pat = data;
2912                 svm->vmcb01.ptr->save.g_pat = data;
2913                 if (is_guest_mode(vcpu))
2914                         nested_vmcb02_compute_g_pat(svm);
2915                 vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
2916                 break;
2917         case MSR_IA32_SPEC_CTRL:
2918                 if (!msr->host_initiated &&
2919                     !guest_has_spec_ctrl_msr(vcpu))
2920                         return 1;
2921
2922                 if (kvm_spec_ctrl_test_value(data))
2923                         return 1;
2924
2925                 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2926                         svm->vmcb->save.spec_ctrl = data;
2927                 else
2928                         svm->spec_ctrl = data;
2929                 if (!data)
2930                         break;
2931
2932                 /*
2933                  * For non-nested:
2934                  * When it's written (to non-zero) for the first time, pass
2935                  * it through.
2936                  *
2937                  * For nested:
2938                  * The handling of the MSR bitmap for L2 guests is done in
2939                  * nested_svm_vmrun_msrpm.
2940                  * We update the L1 MSR bit as well since it will end up
2941                  * touching the MSR anyway now.
2942                  */
2943                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
2944                 break;
2945         case MSR_IA32_PRED_CMD:
2946                 if (!msr->host_initiated &&
2947                     !guest_has_pred_cmd_msr(vcpu))
2948                         return 1;
2949
2950                 if (data & ~PRED_CMD_IBPB)
2951                         return 1;
2952                 if (!boot_cpu_has(X86_FEATURE_IBPB))
2953                         return 1;
2954                 if (!data)
2955                         break;
2956
2957                 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
2958                 break;
2959         case MSR_AMD64_VIRT_SPEC_CTRL:
2960                 if (!msr->host_initiated &&
2961                     !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2962                         return 1;
2963
2964                 if (data & ~SPEC_CTRL_SSBD)
2965                         return 1;
2966
2967                 svm->virt_spec_ctrl = data;
2968                 break;
2969         case MSR_STAR:
2970                 svm->vmcb01.ptr->save.star = data;
2971                 break;
2972 #ifdef CONFIG_X86_64
2973         case MSR_LSTAR:
2974                 svm->vmcb01.ptr->save.lstar = data;
2975                 break;
2976         case MSR_CSTAR:
2977                 svm->vmcb01.ptr->save.cstar = data;
2978                 break;
2979         case MSR_KERNEL_GS_BASE:
2980                 svm->vmcb01.ptr->save.kernel_gs_base = data;
2981                 break;
2982         case MSR_SYSCALL_MASK:
2983                 svm->vmcb01.ptr->save.sfmask = data;
2984                 break;
2985 #endif
2986         case MSR_IA32_SYSENTER_CS:
2987                 svm->vmcb01.ptr->save.sysenter_cs = data;
2988                 break;
2989         case MSR_IA32_SYSENTER_EIP:
2990                 svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
2991                 /*
2992                  * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs
2993                  * when we spoof an Intel vendor ID (for cross vendor migration).
2994                  * In this case we use this intercept to track the high
2995                  * 32 bit part of these msrs to support Intel's
2996                  * implementation of SYSENTER/SYSEXIT.
2997                  */
2998                 svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
2999                 break;
3000         case MSR_IA32_SYSENTER_ESP:
3001                 svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
3002                 svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
3003                 break;
3004         case MSR_TSC_AUX:
3005                 /*
3006                  * TSC_AUX is usually changed only during boot and never read
3007                  * directly.  Intercept TSC_AUX instead of exposing it to the
3008                  * guest via direct_access_msrs, and switch it via user return.
3009                  */
3010                 preempt_disable();
3011                 r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
3012                 preempt_enable();
3013                 if (r)
3014                         return 1;
3015
3016                 svm->tsc_aux = data;
3017                 break;
3018         case MSR_IA32_DEBUGCTLMSR:
3019                 if (!lbrv) {
3020                         kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
3021                         break;
3022                 }
3023                 if (data & DEBUGCTL_RESERVED_BITS)
3024                         return 1;
3025
3026                 if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK)
3027                         svm->vmcb->save.dbgctl = data;
3028                 else
3029                         svm->vmcb01.ptr->save.dbgctl = data;
3030
3031                 svm_update_lbrv(vcpu);
3032
3033                 break;
3034         case MSR_VM_HSAVE_PA:
3035                 /*
3036                  * Old kernels did not validate the value written to
3037                  * MSR_VM_HSAVE_PA.  Allow KVM_SET_MSR to set an invalid
3038                  * value to allow live migrating buggy or malicious guests
3039                  * originating from those kernels.
3040                  */
3041                 if (!msr->host_initiated && !page_address_valid(vcpu, data))
3042                         return 1;
3043
3044                 svm->nested.hsave_msr = data & PAGE_MASK;
3045                 break;
3046         case MSR_VM_CR:
3047                 return svm_set_vm_cr(vcpu, data);
3048         case MSR_VM_IGNNE:
3049                 kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
3050                 break;
3051         case MSR_AMD64_DE_CFG: {
3052                 struct kvm_msr_entry msr_entry;
3053
3054                 msr_entry.index = msr->index;
3055                 if (svm_get_msr_feature(&msr_entry))
3056                         return 1;
3057
3058                 /* Check the supported bits */
3059                 if (data & ~msr_entry.data)
3060                         return 1;
3061
3062                 /* Don't allow the guest to change a bit, #GP */
3063                 if (!msr->host_initiated && (data ^ msr_entry.data))
3064                         return 1;
3065
3066                 svm->msr_decfg = data;
3067                 break;
3068         }
3069         default:
3070                 return kvm_set_msr_common(vcpu, msr);
3071         }
3072         return 0;
3073 }
3074
3075 static int msr_interception(struct kvm_vcpu *vcpu)
3076 {
3077         if (to_svm(vcpu)->vmcb->control.exit_info_1)
3078                 return kvm_emulate_wrmsr(vcpu);
3079         else
3080                 return kvm_emulate_rdmsr(vcpu);
3081 }
3082
3083 static int interrupt_window_interception(struct kvm_vcpu *vcpu)
3084 {
3085         kvm_make_request(KVM_REQ_EVENT, vcpu);
3086         svm_clear_vintr(to_svm(vcpu));
3087
3088         /*
3089          * If not running nested, for AVIC, the only reason to end up here is ExtINTs.
3090          * In this case AVIC was temporarily disabled for
3091          * requesting the IRQ window and we have to re-enable it.
3092          *
3093          * If running nested, still remove the VM wide AVIC inhibit to
3094          * support case in which the interrupt window was requested when the
3095          * vCPU was not running nested.
3096
3097          * All vCPUs which run still run nested, will remain to have their
3098          * AVIC still inhibited due to per-cpu AVIC inhibition.
3099          */
3100         kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3101
3102         ++vcpu->stat.irq_window_exits;
3103         return 1;
3104 }
3105
3106 static int pause_interception(struct kvm_vcpu *vcpu)
3107 {
3108         bool in_kernel;
3109         /*
3110          * CPL is not made available for an SEV-ES guest, therefore
3111          * vcpu->arch.preempted_in_kernel can never be true.  Just
3112          * set in_kernel to false as well.
3113          */
3114         in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
3115
3116         grow_ple_window(vcpu);
3117
3118         kvm_vcpu_on_spin(vcpu, in_kernel);
3119         return kvm_skip_emulated_instruction(vcpu);
3120 }
3121
3122 static int invpcid_interception(struct kvm_vcpu *vcpu)
3123 {
3124         struct vcpu_svm *svm = to_svm(vcpu);
3125         unsigned long type;
3126         gva_t gva;
3127
3128         if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
3129                 kvm_queue_exception(vcpu, UD_VECTOR);
3130                 return 1;
3131         }
3132
3133         /*
3134          * For an INVPCID intercept:
3135          * EXITINFO1 provides the linear address of the memory operand.
3136          * EXITINFO2 provides the contents of the register operand.
3137          */
3138         type = svm->vmcb->control.exit_info_2;
3139         gva = svm->vmcb->control.exit_info_1;
3140
3141         return kvm_handle_invpcid(vcpu, type, gva);
3142 }
3143
3144 static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3145         [SVM_EXIT_READ_CR0]                     = cr_interception,
3146         [SVM_EXIT_READ_CR3]                     = cr_interception,
3147         [SVM_EXIT_READ_CR4]                     = cr_interception,
3148         [SVM_EXIT_READ_CR8]                     = cr_interception,
3149         [SVM_EXIT_CR0_SEL_WRITE]                = cr_interception,
3150         [SVM_EXIT_WRITE_CR0]                    = cr_interception,
3151         [SVM_EXIT_WRITE_CR3]                    = cr_interception,
3152         [SVM_EXIT_WRITE_CR4]                    = cr_interception,
3153         [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
3154         [SVM_EXIT_READ_DR0]                     = dr_interception,
3155         [SVM_EXIT_READ_DR1]                     = dr_interception,
3156         [SVM_EXIT_READ_DR2]                     = dr_interception,
3157         [SVM_EXIT_READ_DR3]                     = dr_interception,
3158         [SVM_EXIT_READ_DR4]                     = dr_interception,
3159         [SVM_EXIT_READ_DR5]                     = dr_interception,
3160         [SVM_EXIT_READ_DR6]                     = dr_interception,
3161         [SVM_EXIT_READ_DR7]                     = dr_interception,
3162         [SVM_EXIT_WRITE_DR0]                    = dr_interception,
3163         [SVM_EXIT_WRITE_DR1]                    = dr_interception,
3164         [SVM_EXIT_WRITE_DR2]                    = dr_interception,
3165         [SVM_EXIT_WRITE_DR3]                    = dr_interception,
3166         [SVM_EXIT_WRITE_DR4]                    = dr_interception,
3167         [SVM_EXIT_WRITE_DR5]                    = dr_interception,
3168         [SVM_EXIT_WRITE_DR6]                    = dr_interception,
3169         [SVM_EXIT_WRITE_DR7]                    = dr_interception,
3170         [SVM_EXIT_EXCP_BASE + DB_VECTOR]        = db_interception,
3171         [SVM_EXIT_EXCP_BASE + BP_VECTOR]        = bp_interception,
3172         [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
3173         [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
3174         [SVM_EXIT_EXCP_BASE + MC_VECTOR]        = mc_interception,
3175         [SVM_EXIT_EXCP_BASE + AC_VECTOR]        = ac_interception,
3176         [SVM_EXIT_EXCP_BASE + GP_VECTOR]        = gp_interception,
3177         [SVM_EXIT_INTR]                         = intr_interception,
3178         [SVM_EXIT_NMI]                          = nmi_interception,
3179         [SVM_EXIT_SMI]                          = smi_interception,
3180         [SVM_EXIT_VINTR]                        = interrupt_window_interception,
3181         [SVM_EXIT_RDPMC]                        = kvm_emulate_rdpmc,
3182         [SVM_EXIT_CPUID]                        = kvm_emulate_cpuid,
3183         [SVM_EXIT_IRET]                         = iret_interception,
3184         [SVM_EXIT_INVD]                         = kvm_emulate_invd,
3185         [SVM_EXIT_PAUSE]                        = pause_interception,
3186         [SVM_EXIT_HLT]                          = kvm_emulate_halt,
3187         [SVM_EXIT_INVLPG]                       = invlpg_interception,
3188         [SVM_EXIT_INVLPGA]                      = invlpga_interception,
3189         [SVM_EXIT_IOIO]                         = io_interception,
3190         [SVM_EXIT_MSR]                          = msr_interception,
3191         [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
3192         [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
3193         [SVM_EXIT_VMRUN]                        = vmrun_interception,
3194         [SVM_EXIT_VMMCALL]                      = kvm_emulate_hypercall,
3195         [SVM_EXIT_VMLOAD]                       = vmload_interception,
3196         [SVM_EXIT_VMSAVE]                       = vmsave_interception,
3197         [SVM_EXIT_STGI]                         = stgi_interception,
3198         [SVM_EXIT_CLGI]                         = clgi_interception,
3199         [SVM_EXIT_SKINIT]                       = skinit_interception,
3200         [SVM_EXIT_RDTSCP]                       = kvm_handle_invalid_op,
3201         [SVM_EXIT_WBINVD]                       = kvm_emulate_wbinvd,
3202         [SVM_EXIT_MONITOR]                      = kvm_emulate_monitor,
3203         [SVM_EXIT_MWAIT]                        = kvm_emulate_mwait,
3204         [SVM_EXIT_XSETBV]                       = kvm_emulate_xsetbv,
3205         [SVM_EXIT_RDPRU]                        = kvm_handle_invalid_op,
3206         [SVM_EXIT_EFER_WRITE_TRAP]              = efer_trap,
3207         [SVM_EXIT_CR0_WRITE_TRAP]               = cr_trap,
3208         [SVM_EXIT_CR4_WRITE_TRAP]               = cr_trap,
3209         [SVM_EXIT_CR8_WRITE_TRAP]               = cr_trap,
3210         [SVM_EXIT_INVPCID]                      = invpcid_interception,
3211         [SVM_EXIT_NPF]                          = npf_interception,
3212         [SVM_EXIT_RSM]                          = rsm_interception,
3213         [SVM_EXIT_AVIC_INCOMPLETE_IPI]          = avic_incomplete_ipi_interception,
3214         [SVM_EXIT_AVIC_UNACCELERATED_ACCESS]    = avic_unaccelerated_access_interception,
3215         [SVM_EXIT_VMGEXIT]                      = sev_handle_vmgexit,
3216 };
3217
3218 static void dump_vmcb(struct kvm_vcpu *vcpu)
3219 {
3220         struct vcpu_svm *svm = to_svm(vcpu);
3221         struct vmcb_control_area *control = &svm->vmcb->control;
3222         struct vmcb_save_area *save = &svm->vmcb->save;
3223         struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
3224
3225         if (!dump_invalid_vmcb) {
3226                 pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
3227                 return;
3228         }
3229
3230         pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
3231                svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
3232         pr_err("VMCB Control Area:\n");
3233         pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
3234         pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
3235         pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
3236         pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
3237         pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
3238         pr_err("%-20s%08x %08x\n", "intercepts:",
3239               control->intercepts[INTERCEPT_WORD3],
3240                control->intercepts[INTERCEPT_WORD4]);
3241         pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
3242         pr_err("%-20s%d\n", "pause filter threshold:",
3243                control->pause_filter_thresh);
3244         pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
3245         pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
3246         pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
3247         pr_err("%-20s%d\n", "asid:", control->asid);
3248         pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
3249         pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
3250         pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
3251         pr_err("%-20s%08x\n", "int_state:", control->int_state);
3252         pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
3253         pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
3254         pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
3255         pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
3256         pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
3257         pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
3258         pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
3259         pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
3260         pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
3261         pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
3262         pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
3263         pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
3264         pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
3265         pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
3266         pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
3267         pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
3268         pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
3269         pr_err("VMCB State Save Area:\n");
3270         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3271                "es:",
3272                save->es.selector, save->es.attrib,
3273                save->es.limit, save->es.base);
3274         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3275                "cs:",
3276                save->cs.selector, save->cs.attrib,
3277                save->cs.limit, save->cs.base);
3278         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3279                "ss:",
3280                save->ss.selector, save->ss.attrib,
3281                save->ss.limit, save->ss.base);
3282         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3283                "ds:",
3284                save->ds.selector, save->ds.attrib,
3285                save->ds.limit, save->ds.base);
3286         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3287                "fs:",
3288                save01->fs.selector, save01->fs.attrib,
3289                save01->fs.limit, save01->fs.base);
3290         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3291                "gs:",
3292                save01->gs.selector, save01->gs.attrib,
3293                save01->gs.limit, save01->gs.base);
3294         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3295                "gdtr:",
3296                save->gdtr.selector, save->gdtr.attrib,
3297                save->gdtr.limit, save->gdtr.base);
3298         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3299                "ldtr:",
3300                save01->ldtr.selector, save01->ldtr.attrib,
3301                save01->ldtr.limit, save01->ldtr.base);
3302         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3303                "idtr:",
3304                save->idtr.selector, save->idtr.attrib,
3305                save->idtr.limit, save->idtr.base);
3306         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3307                "tr:",
3308                save01->tr.selector, save01->tr.attrib,
3309                save01->tr.limit, save01->tr.base);
3310         pr_err("vmpl: %d   cpl:  %d               efer:          %016llx\n",
3311                save->vmpl, save->cpl, save->efer);
3312         pr_err("%-15s %016llx %-13s %016llx\n",
3313                "cr0:", save->cr0, "cr2:", save->cr2);
3314         pr_err("%-15s %016llx %-13s %016llx\n",
3315                "cr3:", save->cr3, "cr4:", save->cr4);
3316         pr_err("%-15s %016llx %-13s %016llx\n",
3317                "dr6:", save->dr6, "dr7:", save->dr7);
3318         pr_err("%-15s %016llx %-13s %016llx\n",
3319                "rip:", save->rip, "rflags:", save->rflags);
3320         pr_err("%-15s %016llx %-13s %016llx\n",
3321                "rsp:", save->rsp, "rax:", save->rax);
3322         pr_err("%-15s %016llx %-13s %016llx\n",
3323                "star:", save01->star, "lstar:", save01->lstar);
3324         pr_err("%-15s %016llx %-13s %016llx\n",
3325                "cstar:", save01->cstar, "sfmask:", save01->sfmask);
3326         pr_err("%-15s %016llx %-13s %016llx\n",
3327                "kernel_gs_base:", save01->kernel_gs_base,
3328                "sysenter_cs:", save01->sysenter_cs);
3329         pr_err("%-15s %016llx %-13s %016llx\n",
3330                "sysenter_esp:", save01->sysenter_esp,
3331                "sysenter_eip:", save01->sysenter_eip);
3332         pr_err("%-15s %016llx %-13s %016llx\n",
3333                "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
3334         pr_err("%-15s %016llx %-13s %016llx\n",
3335                "br_from:", save->br_from, "br_to:", save->br_to);
3336         pr_err("%-15s %016llx %-13s %016llx\n",
3337                "excp_from:", save->last_excp_from,
3338                "excp_to:", save->last_excp_to);
3339 }
3340
3341 static bool svm_check_exit_valid(u64 exit_code)
3342 {
3343         return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
3344                 svm_exit_handlers[exit_code]);
3345 }
3346
3347 static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
3348 {
3349         vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
3350         dump_vmcb(vcpu);
3351         vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3352         vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
3353         vcpu->run->internal.ndata = 2;
3354         vcpu->run->internal.data[0] = exit_code;
3355         vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
3356         return 0;
3357 }
3358
3359 int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
3360 {
3361         if (!svm_check_exit_valid(exit_code))
3362                 return svm_handle_invalid_exit(vcpu, exit_code);
3363
3364 #ifdef CONFIG_RETPOLINE
3365         if (exit_code == SVM_EXIT_MSR)
3366                 return msr_interception(vcpu);
3367         else if (exit_code == SVM_EXIT_VINTR)
3368                 return interrupt_window_interception(vcpu);
3369         else if (exit_code == SVM_EXIT_INTR)
3370                 return intr_interception(vcpu);
3371         else if (exit_code == SVM_EXIT_HLT)
3372                 return kvm_emulate_halt(vcpu);
3373         else if (exit_code == SVM_EXIT_NPF)
3374                 return npf_interception(vcpu);
3375 #endif
3376         return svm_exit_handlers[exit_code](vcpu);
3377 }
3378
3379 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
3380                               u64 *info1, u64 *info2,
3381                               u32 *intr_info, u32 *error_code)
3382 {
3383         struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3384
3385         *reason = control->exit_code;
3386         *info1 = control->exit_info_1;
3387         *info2 = control->exit_info_2;
3388         *intr_info = control->exit_int_info;
3389         if ((*intr_info & SVM_EXITINTINFO_VALID) &&
3390             (*intr_info & SVM_EXITINTINFO_VALID_ERR))
3391                 *error_code = control->exit_int_info_err;
3392         else
3393                 *error_code = 0;
3394 }
3395
3396 static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
3397 {
3398         struct vcpu_svm *svm = to_svm(vcpu);
3399         struct kvm_run *kvm_run = vcpu->run;
3400         u32 exit_code = svm->vmcb->control.exit_code;
3401
3402         trace_kvm_exit(vcpu, KVM_ISA_SVM);
3403
3404         /* SEV-ES guests must use the CR write traps to track CR registers. */
3405         if (!sev_es_guest(vcpu->kvm)) {
3406                 if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
3407                         vcpu->arch.cr0 = svm->vmcb->save.cr0;
3408                 if (npt_enabled)
3409                         vcpu->arch.cr3 = svm->vmcb->save.cr3;
3410         }
3411
3412         if (is_guest_mode(vcpu)) {
3413                 int vmexit;
3414
3415                 trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
3416
3417                 vmexit = nested_svm_exit_special(svm);
3418
3419                 if (vmexit == NESTED_EXIT_CONTINUE)
3420                         vmexit = nested_svm_exit_handled(svm);
3421
3422                 if (vmexit == NESTED_EXIT_DONE)
3423                         return 1;
3424         }
3425
3426         if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
3427                 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3428                 kvm_run->fail_entry.hardware_entry_failure_reason
3429                         = svm->vmcb->control.exit_code;
3430                 kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
3431                 dump_vmcb(vcpu);
3432                 return 0;
3433         }
3434
3435         if (exit_fastpath != EXIT_FASTPATH_NONE)
3436                 return 1;
3437
3438         return svm_invoke_exit_handler(vcpu, exit_code);
3439 }
3440
3441 static void reload_tss(struct kvm_vcpu *vcpu)
3442 {
3443         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
3444
3445         sd->tss_desc->type = 9; /* available 32/64-bit TSS */
3446         load_TR_desc();
3447 }
3448
3449 static void pre_svm_run(struct kvm_vcpu *vcpu)
3450 {
3451         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
3452         struct vcpu_svm *svm = to_svm(vcpu);
3453
3454         /*
3455          * If the previous vmrun of the vmcb occurred on a different physical
3456          * cpu, then mark the vmcb dirty and assign a new asid.  Hardware's
3457          * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
3458          */
3459         if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
3460                 svm->current_vmcb->asid_generation = 0;
3461                 vmcb_mark_all_dirty(svm->vmcb);
3462                 svm->current_vmcb->cpu = vcpu->cpu;
3463         }
3464
3465         if (sev_guest(vcpu->kvm))
3466                 return pre_sev_run(svm, vcpu->cpu);
3467
3468         /* FIXME: handle wraparound of asid_generation */
3469         if (svm->current_vmcb->asid_generation != sd->asid_generation)
3470                 new_asid(svm, sd);
3471 }
3472
3473 static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3474 {
3475         struct vcpu_svm *svm = to_svm(vcpu);
3476
3477         svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
3478
3479         if (svm->nmi_l1_to_l2)
3480                 return;
3481
3482         svm->nmi_masked = true;
3483         if (!sev_es_guest(vcpu->kvm))
3484                 svm_set_intercept(svm, INTERCEPT_IRET);
3485         ++vcpu->stat.nmi_injections;
3486 }
3487
3488 static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
3489 {
3490         struct vcpu_svm *svm = to_svm(vcpu);
3491         u32 type;
3492
3493         if (vcpu->arch.interrupt.soft) {
3494                 if (svm_update_soft_interrupt_rip(vcpu))
3495                         return;
3496
3497                 type = SVM_EVTINJ_TYPE_SOFT;
3498         } else {
3499                 type = SVM_EVTINJ_TYPE_INTR;
3500         }
3501
3502         trace_kvm_inj_virq(vcpu->arch.interrupt.nr,
3503                            vcpu->arch.interrupt.soft, reinjected);
3504         ++vcpu->stat.irq_injections;
3505
3506         svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
3507                                        SVM_EVTINJ_VALID | type;
3508 }
3509
3510 void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
3511                                      int trig_mode, int vector)
3512 {
3513         /*
3514          * apic->apicv_active must be read after vcpu->mode.
3515          * Pairs with smp_store_release in vcpu_enter_guest.
3516          */
3517         bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);
3518
3519         /* Note, this is called iff the local APIC is in-kernel. */
3520         if (!READ_ONCE(vcpu->arch.apic->apicv_active)) {
3521                 /* Process the interrupt via kvm_check_and_inject_events(). */
3522                 kvm_make_request(KVM_REQ_EVENT, vcpu);
3523                 kvm_vcpu_kick(vcpu);
3524                 return;
3525         }
3526
3527         trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
3528         if (in_guest_mode) {
3529                 /*
3530                  * Signal the doorbell to tell hardware to inject the IRQ.  If
3531                  * the vCPU exits the guest before the doorbell chimes, hardware
3532                  * will automatically process AVIC interrupts at the next VMRUN.
3533                  */
3534                 avic_ring_doorbell(vcpu);
3535         } else {
3536                 /*
3537                  * Wake the vCPU if it was blocking.  KVM will then detect the
3538                  * pending IRQ when checking if the vCPU has a wake event.
3539                  */
3540                 kvm_vcpu_wake_up(vcpu);
3541         }
3542 }
3543
3544 static void svm_deliver_interrupt(struct kvm_lapic *apic,  int delivery_mode,
3545                                   int trig_mode, int vector)
3546 {
3547         kvm_lapic_set_irr(vector, apic);
3548
3549         /*
3550          * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
3551          * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
3552          * the read of guest_mode.  This guarantees that either VMRUN will see
3553          * and process the new vIRR entry, or that svm_complete_interrupt_delivery
3554          * will signal the doorbell if the CPU has already entered the guest.
3555          */
3556         smp_mb__after_atomic();
3557         svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector);
3558 }
3559
3560 static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3561 {
3562         struct vcpu_svm *svm = to_svm(vcpu);
3563
3564         /*
3565          * SEV-ES guests must always keep the CR intercepts cleared. CR
3566          * tracking is done using the CR write traps.
3567          */
3568         if (sev_es_guest(vcpu->kvm))
3569                 return;
3570
3571         if (nested_svm_virtualize_tpr(vcpu))
3572                 return;
3573
3574         svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
3575
3576         if (irr == -1)
3577                 return;
3578
3579         if (tpr >= irr)
3580                 svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
3581 }
3582
3583 bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
3584 {
3585         struct vcpu_svm *svm = to_svm(vcpu);
3586         struct vmcb *vmcb = svm->vmcb;
3587
3588         if (!gif_set(svm))
3589                 return true;
3590
3591         if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3592                 return false;
3593
3594         return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
3595                svm->nmi_masked;
3596 }
3597
3598 static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3599 {
3600         struct vcpu_svm *svm = to_svm(vcpu);
3601         if (svm->nested.nested_run_pending)
3602                 return -EBUSY;
3603
3604         if (svm_nmi_blocked(vcpu))
3605                 return 0;
3606
3607         /* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
3608         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3609                 return -EBUSY;
3610         return 1;
3611 }
3612
3613 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
3614 {
3615         return to_svm(vcpu)->nmi_masked;
3616 }
3617
3618 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3619 {
3620         struct vcpu_svm *svm = to_svm(vcpu);
3621
3622         if (masked) {
3623                 svm->nmi_masked = true;
3624                 if (!sev_es_guest(vcpu->kvm))
3625                         svm_set_intercept(svm, INTERCEPT_IRET);
3626         } else {
3627                 svm->nmi_masked = false;
3628                 if (!sev_es_guest(vcpu->kvm))
3629                         svm_clr_intercept(svm, INTERCEPT_IRET);
3630         }
3631 }
3632
3633 bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
3634 {
3635         struct vcpu_svm *svm = to_svm(vcpu);
3636         struct vmcb *vmcb = svm->vmcb;
3637
3638         if (!gif_set(svm))
3639                 return true;
3640
3641         if (is_guest_mode(vcpu)) {
3642                 /* As long as interrupts are being delivered...  */
3643                 if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
3644                     ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
3645                     : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
3646                         return true;
3647
3648                 /* ... vmexits aren't blocked by the interrupt shadow  */
3649                 if (nested_exit_on_intr(svm))
3650                         return false;
3651         } else {
3652                 if (!svm_get_if_flag(vcpu))
3653                         return true;
3654         }
3655
3656         return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
3657 }
3658
3659 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3660 {
3661         struct vcpu_svm *svm = to_svm(vcpu);
3662
3663         if (svm->nested.nested_run_pending)
3664                 return -EBUSY;
3665
3666         if (svm_interrupt_blocked(vcpu))
3667                 return 0;
3668
3669         /*
3670          * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
3671          * e.g. if the IRQ arrived asynchronously after checking nested events.
3672          */
3673         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
3674                 return -EBUSY;
3675
3676         return 1;
3677 }
3678
3679 static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
3680 {
3681         struct vcpu_svm *svm = to_svm(vcpu);
3682
3683         /*
3684          * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
3685          * 1, because that's a separate STGI/VMRUN intercept.  The next time we
3686          * get that intercept, this function will be called again though and
3687          * we'll get the vintr intercept. However, if the vGIF feature is
3688          * enabled, the STGI interception will not occur. Enable the irq
3689          * window under the assumption that the hardware will set the GIF.
3690          */
3691         if (vgif || gif_set(svm)) {
3692                 /*
3693                  * IRQ window is not needed when AVIC is enabled,
3694                  * unless we have pending ExtINT since it cannot be injected
3695                  * via AVIC. In such case, KVM needs to temporarily disable AVIC,
3696                  * and fallback to injecting IRQ via V_IRQ.
3697                  *
3698                  * If running nested, AVIC is already locally inhibited
3699                  * on this vCPU, therefore there is no need to request
3700                  * the VM wide AVIC inhibition.
3701                  */
3702                 if (!is_guest_mode(vcpu))
3703                         kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3704
3705                 svm_set_vintr(svm);
3706         }
3707 }
3708
3709 static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
3710 {
3711         struct vcpu_svm *svm = to_svm(vcpu);
3712
3713         if (svm->nmi_masked && !svm->awaiting_iret_completion)
3714                 return; /* IRET will cause a vm exit */
3715
3716         if (!gif_set(svm)) {
3717                 if (vgif)
3718                         svm_set_intercept(svm, INTERCEPT_STGI);
3719                 return; /* STGI will cause a vm exit */
3720         }
3721
3722         /*
3723          * Something prevents NMI from been injected. Single step over possible
3724          * problem (IRET or exception injection or interrupt shadow)
3725          */
3726         svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
3727         svm->nmi_singlestep = true;
3728         svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
3729 }
3730
3731 static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
3732 {
3733         struct vcpu_svm *svm = to_svm(vcpu);
3734
3735         /*
3736          * Unlike VMX, SVM doesn't provide a way to flush only NPT TLB entries.
3737          * A TLB flush for the current ASID flushes both "host" and "guest" TLB
3738          * entries, and thus is a superset of Hyper-V's fine grained flushing.
3739          */
3740         kvm_hv_vcpu_purge_flush_tlb(vcpu);
3741
3742         /*
3743          * Flush only the current ASID even if the TLB flush was invoked via
3744          * kvm_flush_remote_tlbs().  Although flushing remote TLBs requires all
3745          * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
3746          * unconditionally does a TLB flush on both nested VM-Enter and nested
3747          * VM-Exit (via kvm_mmu_reset_context()).
3748          */
3749         if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3750                 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3751         else
3752                 svm->current_vmcb->asid_generation--;
3753 }
3754
3755 static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
3756 {
3757         struct vcpu_svm *svm = to_svm(vcpu);
3758
3759         invlpga(gva, svm->vmcb->control.asid);
3760 }
3761
3762 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3763 {
3764         struct vcpu_svm *svm = to_svm(vcpu);
3765
3766         if (nested_svm_virtualize_tpr(vcpu))
3767                 return;
3768
3769         if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
3770                 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3771                 kvm_set_cr8(vcpu, cr8);
3772         }
3773 }
3774
3775 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3776 {
3777         struct vcpu_svm *svm = to_svm(vcpu);
3778         u64 cr8;
3779
3780         if (nested_svm_virtualize_tpr(vcpu) ||
3781             kvm_vcpu_apicv_active(vcpu))
3782                 return;
3783
3784         cr8 = kvm_get_cr8(vcpu);
3785         svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
3786         svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
3787 }
3788
3789 static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector,
3790                                         int type)
3791 {
3792         bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT);
3793         bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT);
3794         struct vcpu_svm *svm = to_svm(vcpu);
3795
3796         /*
3797          * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's
3798          * associated with the original soft exception/interrupt.  next_rip is
3799          * cleared on all exits that can occur while vectoring an event, so KVM
3800          * needs to manually set next_rip for re-injection.  Unlike the !nrips
3801          * case below, this needs to be done if and only if KVM is re-injecting
3802          * the same event, i.e. if the event is a soft exception/interrupt,
3803          * otherwise next_rip is unused on VMRUN.
3804          */
3805         if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) &&
3806             kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase))
3807                 svm->vmcb->control.next_rip = svm->soft_int_next_rip;
3808         /*
3809          * If NRIPS isn't enabled, KVM must manually advance RIP prior to
3810          * injecting the soft exception/interrupt.  That advancement needs to
3811          * be unwound if vectoring didn't complete.  Note, the new event may
3812          * not be the injected event, e.g. if KVM injected an INTn, the INTn
3813          * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
3814          * be the reported vectored event, but RIP still needs to be unwound.
3815          */
3816         else if (!nrips && (is_soft || is_exception) &&
3817                  kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase))
3818                 kvm_rip_write(vcpu, svm->soft_int_old_rip);
3819 }
3820
3821 static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
3822 {
3823         struct vcpu_svm *svm = to_svm(vcpu);
3824         u8 vector;
3825         int type;
3826         u32 exitintinfo = svm->vmcb->control.exit_int_info;
3827         bool nmi_l1_to_l2 = svm->nmi_l1_to_l2;
3828         bool soft_int_injected = svm->soft_int_injected;
3829
3830         svm->nmi_l1_to_l2 = false;
3831         svm->soft_int_injected = false;
3832
3833         /*
3834          * If we've made progress since setting HF_IRET_MASK, we've
3835          * executed an IRET and can allow NMI injection.
3836          */
3837         if (svm->awaiting_iret_completion &&
3838             (sev_es_guest(vcpu->kvm) ||
3839              kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
3840                 svm->awaiting_iret_completion = false;
3841                 svm->nmi_masked = false;
3842                 kvm_make_request(KVM_REQ_EVENT, vcpu);
3843         }
3844
3845         vcpu->arch.nmi_injected = false;
3846         kvm_clear_exception_queue(vcpu);
3847         kvm_clear_interrupt_queue(vcpu);
3848
3849         if (!(exitintinfo & SVM_EXITINTINFO_VALID))
3850                 return;
3851
3852         kvm_make_request(KVM_REQ_EVENT, vcpu);
3853
3854         vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
3855         type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
3856
3857         if (soft_int_injected)
3858                 svm_complete_soft_interrupt(vcpu, vector, type);
3859
3860         switch (type) {
3861         case SVM_EXITINTINFO_TYPE_NMI:
3862                 vcpu->arch.nmi_injected = true;
3863                 svm->nmi_l1_to_l2 = nmi_l1_to_l2;
3864                 break;
3865         case SVM_EXITINTINFO_TYPE_EXEPT:
3866                 /*
3867                  * Never re-inject a #VC exception.
3868                  */
3869                 if (vector == X86_TRAP_VC)
3870                         break;
3871
3872                 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
3873                         u32 err = svm->vmcb->control.exit_int_info_err;
3874                         kvm_requeue_exception_e(vcpu, vector, err);
3875
3876                 } else
3877                         kvm_requeue_exception(vcpu, vector);
3878                 break;
3879         case SVM_EXITINTINFO_TYPE_INTR:
3880                 kvm_queue_interrupt(vcpu, vector, false);
3881                 break;
3882         case SVM_EXITINTINFO_TYPE_SOFT:
3883                 kvm_queue_interrupt(vcpu, vector, true);
3884                 break;
3885         default:
3886                 break;
3887         }
3888
3889 }
3890
3891 static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3892 {
3893         struct vcpu_svm *svm = to_svm(vcpu);
3894         struct vmcb_control_area *control = &svm->vmcb->control;
3895
3896         control->exit_int_info = control->event_inj;
3897         control->exit_int_info_err = control->event_inj_err;
3898         control->event_inj = 0;
3899         svm_complete_interrupts(vcpu);
3900 }
3901
3902 static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
3903 {
3904         return 1;
3905 }
3906
3907 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
3908 {
3909         struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3910
3911         /*
3912          * Note, the next RIP must be provided as SRCU isn't held, i.e. KVM
3913          * can't read guest memory (dereference memslots) to decode the WRMSR.
3914          */
3915         if (control->exit_code == SVM_EXIT_MSR && control->exit_info_1 &&
3916             nrips && control->next_rip)
3917                 return handle_fastpath_set_msr_irqoff(vcpu);
3918
3919         return EXIT_FASTPATH_NONE;
3920 }
3921
3922 static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
3923 {
3924         struct vcpu_svm *svm = to_svm(vcpu);
3925
3926         guest_state_enter_irqoff();
3927
3928         if (sev_es_guest(vcpu->kvm))
3929                 __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
3930         else
3931                 __svm_vcpu_run(svm, spec_ctrl_intercepted);
3932
3933         guest_state_exit_irqoff();
3934 }
3935
3936 static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
3937 {
3938         struct vcpu_svm *svm = to_svm(vcpu);
3939         bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
3940
3941         trace_kvm_entry(vcpu);
3942
3943         svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
3944         svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
3945         svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
3946
3947         /*
3948          * Disable singlestep if we're injecting an interrupt/exception.
3949          * We don't want our modified rflags to be pushed on the stack where
3950          * we might not be able to easily reset them if we disabled NMI
3951          * singlestep later.
3952          */
3953         if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
3954                 /*
3955                  * Event injection happens before external interrupts cause a
3956                  * vmexit and interrupts are disabled here, so smp_send_reschedule
3957                  * is enough to force an immediate vmexit.
3958                  */
3959                 disable_nmi_singlestep(svm);
3960                 smp_send_reschedule(vcpu->cpu);
3961         }
3962
3963         pre_svm_run(vcpu);
3964
3965         sync_lapic_to_cr8(vcpu);
3966
3967         if (unlikely(svm->asid != svm->vmcb->control.asid)) {
3968                 svm->vmcb->control.asid = svm->asid;
3969                 vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
3970         }
3971         svm->vmcb->save.cr2 = vcpu->arch.cr2;
3972
3973         svm_hv_update_vp_id(svm->vmcb, vcpu);
3974
3975         /*
3976          * Run with all-zero DR6 unless needed, so that we can get the exact cause
3977          * of a #DB.
3978          */
3979         if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
3980                 svm_set_dr6(svm, vcpu->arch.dr6);
3981         else
3982                 svm_set_dr6(svm, DR6_ACTIVE_LOW);
3983
3984         clgi();
3985         kvm_load_guest_xsave_state(vcpu);
3986
3987         kvm_wait_lapic_expire(vcpu);
3988
3989         /*
3990          * If this vCPU has touched SPEC_CTRL, restore the guest's value if
3991          * it's non-zero. Since vmentry is serialising on affected CPUs, there
3992          * is no need to worry about the conditional branch over the wrmsr
3993          * being speculatively taken.
3994          */
3995         if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
3996                 x86_spec_ctrl_set_guest(svm->virt_spec_ctrl);
3997
3998         svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);
3999
4000         if (!sev_es_guest(vcpu->kvm))
4001                 reload_tss(vcpu);
4002
4003         if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
4004                 x86_spec_ctrl_restore_host(svm->virt_spec_ctrl);
4005
4006         if (!sev_es_guest(vcpu->kvm)) {
4007                 vcpu->arch.cr2 = svm->vmcb->save.cr2;
4008                 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
4009                 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
4010                 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
4011         }
4012         vcpu->arch.regs_dirty = 0;
4013
4014         if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4015                 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
4016
4017         kvm_load_host_xsave_state(vcpu);
4018         stgi();
4019
4020         /* Any pending NMI will happen here */
4021
4022         if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4023                 kvm_after_interrupt(vcpu);
4024
4025         sync_cr8_to_lapic(vcpu);
4026
4027         svm->next_rip = 0;
4028         if (is_guest_mode(vcpu)) {
4029                 nested_sync_control_from_vmcb02(svm);
4030
4031                 /* Track VMRUNs that have made past consistency checking */
4032                 if (svm->nested.nested_run_pending &&
4033                     svm->vmcb->control.exit_code != SVM_EXIT_ERR)
4034                         ++vcpu->stat.nested_run;
4035
4036                 svm->nested.nested_run_pending = 0;
4037         }
4038
4039         svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
4040         vmcb_mark_all_clean(svm->vmcb);
4041
4042         /* if exit due to PF check for async PF */
4043         if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
4044                 vcpu->arch.apf.host_apf_flags =
4045                         kvm_read_and_reset_apf_flags();
4046
4047         vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
4048
4049         /*
4050          * We need to handle MC intercepts here before the vcpu has a chance to
4051          * change the physical cpu
4052          */
4053         if (unlikely(svm->vmcb->control.exit_code ==
4054                      SVM_EXIT_EXCP_BASE + MC_VECTOR))
4055                 svm_handle_mce(vcpu);
4056
4057         svm_complete_interrupts(vcpu);
4058
4059         if (is_guest_mode(vcpu))
4060                 return EXIT_FASTPATH_NONE;
4061
4062         return svm_exit_handlers_fastpath(vcpu);
4063 }
4064
4065 static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
4066                              int root_level)
4067 {
4068         struct vcpu_svm *svm = to_svm(vcpu);
4069         unsigned long cr3;
4070
4071         if (npt_enabled) {
4072                 svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
4073                 vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
4074
4075                 hv_track_root_tdp(vcpu, root_hpa);
4076
4077                 cr3 = vcpu->arch.cr3;
4078         } else if (root_level >= PT64_ROOT_4LEVEL) {
4079                 cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
4080         } else {
4081                 /* PCID in the guest should be impossible with a 32-bit MMU. */
4082                 WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
4083                 cr3 = root_hpa;
4084         }
4085
4086         svm->vmcb->save.cr3 = cr3;
4087         vmcb_mark_dirty(svm->vmcb, VMCB_CR);
4088 }
4089
4090 static void
4091 svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4092 {
4093         /*
4094          * Patch in the VMMCALL instruction:
4095          */
4096         hypercall[0] = 0x0f;
4097         hypercall[1] = 0x01;
4098         hypercall[2] = 0xd9;
4099 }
4100
4101 /*
4102  * The kvm parameter can be NULL (module initialization, or invocation before
4103  * VM creation). Be sure to check the kvm parameter before using it.
4104  */
4105 static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
4106 {
4107         switch (index) {
4108         case MSR_IA32_MCG_EXT_CTL:
4109         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
4110                 return false;
4111         case MSR_IA32_SMBASE:
4112                 if (!IS_ENABLED(CONFIG_KVM_SMM))
4113                         return false;
4114                 /* SEV-ES guests do not support SMM, so report false */
4115                 if (kvm && sev_es_guest(kvm))
4116                         return false;
4117                 break;
4118         default:
4119                 break;
4120         }
4121
4122         return true;
4123 }
4124
4125 static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
4126 {
4127         struct vcpu_svm *svm = to_svm(vcpu);
4128         struct kvm_cpuid_entry2 *best;
4129
4130         vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
4131                                     boot_cpu_has(X86_FEATURE_XSAVE) &&
4132                                     boot_cpu_has(X86_FEATURE_XSAVES);
4133
4134         /* Update nrips enabled cache */
4135         svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
4136                              guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
4137
4138         svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
4139         svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
4140
4141         svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
4142
4143         svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) &&
4144                         guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
4145
4146         svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) &&
4147                         guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
4148
4149         svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
4150
4151         svm_recalc_instruction_intercepts(vcpu, svm);
4152
4153         if (boot_cpu_has(X86_FEATURE_IBPB))
4154                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0,
4155                                      !!guest_has_pred_cmd_msr(vcpu));
4156
4157         /* For sev guests, the memory encryption bit is not reserved in CR3.  */
4158         if (sev_guest(vcpu->kvm)) {
4159                 best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
4160                 if (best)
4161                         vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
4162         }
4163
4164         init_vmcb_after_set_cpuid(vcpu);
4165 }
4166
4167 static bool svm_has_wbinvd_exit(void)
4168 {
4169         return true;
4170 }
4171
4172 #define PRE_EX(exit)  { .exit_code = (exit), \
4173                         .stage = X86_ICPT_PRE_EXCEPT, }
4174 #define POST_EX(exit) { .exit_code = (exit), \
4175                         .stage = X86_ICPT_POST_EXCEPT, }
4176 #define POST_MEM(exit) { .exit_code = (exit), \
4177                         .stage = X86_ICPT_POST_MEMACCESS, }
4178
4179 static const struct __x86_intercept {
4180         u32 exit_code;
4181         enum x86_intercept_stage stage;
4182 } x86_intercept_map[] = {
4183         [x86_intercept_cr_read]         = POST_EX(SVM_EXIT_READ_CR0),
4184         [x86_intercept_cr_write]        = POST_EX(SVM_EXIT_WRITE_CR0),
4185         [x86_intercept_clts]            = POST_EX(SVM_EXIT_WRITE_CR0),
4186         [x86_intercept_lmsw]            = POST_EX(SVM_EXIT_WRITE_CR0),
4187         [x86_intercept_smsw]            = POST_EX(SVM_EXIT_READ_CR0),
4188         [x86_intercept_dr_read]         = POST_EX(SVM_EXIT_READ_DR0),
4189         [x86_intercept_dr_write]        = POST_EX(SVM_EXIT_WRITE_DR0),
4190         [x86_intercept_sldt]            = POST_EX(SVM_EXIT_LDTR_READ),
4191         [x86_intercept_str]             = POST_EX(SVM_EXIT_TR_READ),
4192         [x86_intercept_lldt]            = POST_EX(SVM_EXIT_LDTR_WRITE),
4193         [x86_intercept_ltr]             = POST_EX(SVM_EXIT_TR_WRITE),
4194         [x86_intercept_sgdt]            = POST_EX(SVM_EXIT_GDTR_READ),
4195         [x86_intercept_sidt]            = POST_EX(SVM_EXIT_IDTR_READ),
4196         [x86_intercept_lgdt]            = POST_EX(SVM_EXIT_GDTR_WRITE),
4197         [x86_intercept_lidt]            = POST_EX(SVM_EXIT_IDTR_WRITE),
4198         [x86_intercept_vmrun]           = POST_EX(SVM_EXIT_VMRUN),
4199         [x86_intercept_vmmcall]         = POST_EX(SVM_EXIT_VMMCALL),
4200         [x86_intercept_vmload]          = POST_EX(SVM_EXIT_VMLOAD),
4201         [x86_intercept_vmsave]          = POST_EX(SVM_EXIT_VMSAVE),
4202         [x86_intercept_stgi]            = POST_EX(SVM_EXIT_STGI),
4203         [x86_intercept_clgi]            = POST_EX(SVM_EXIT_CLGI),
4204         [x86_intercept_skinit]          = POST_EX(SVM_EXIT_SKINIT),
4205         [x86_intercept_invlpga]         = POST_EX(SVM_EXIT_INVLPGA),
4206         [x86_intercept_rdtscp]          = POST_EX(SVM_EXIT_RDTSCP),
4207         [x86_intercept_monitor]         = POST_MEM(SVM_EXIT_MONITOR),
4208         [x86_intercept_mwait]           = POST_EX(SVM_EXIT_MWAIT),
4209         [x86_intercept_invlpg]          = POST_EX(SVM_EXIT_INVLPG),
4210         [x86_intercept_invd]            = POST_EX(SVM_EXIT_INVD),
4211         [x86_intercept_wbinvd]          = POST_EX(SVM_EXIT_WBINVD),
4212         [x86_intercept_wrmsr]           = POST_EX(SVM_EXIT_MSR),
4213         [x86_intercept_rdtsc]           = POST_EX(SVM_EXIT_RDTSC),
4214         [x86_intercept_rdmsr]           = POST_EX(SVM_EXIT_MSR),
4215         [x86_intercept_rdpmc]           = POST_EX(SVM_EXIT_RDPMC),
4216         [x86_intercept_cpuid]           = PRE_EX(SVM_EXIT_CPUID),
4217         [x86_intercept_rsm]             = PRE_EX(SVM_EXIT_RSM),
4218         [x86_intercept_pause]           = PRE_EX(SVM_EXIT_PAUSE),
4219         [x86_intercept_pushf]           = PRE_EX(SVM_EXIT_PUSHF),
4220         [x86_intercept_popf]            = PRE_EX(SVM_EXIT_POPF),
4221         [x86_intercept_intn]            = PRE_EX(SVM_EXIT_SWINT),
4222         [x86_intercept_iret]            = PRE_EX(SVM_EXIT_IRET),
4223         [x86_intercept_icebp]           = PRE_EX(SVM_EXIT_ICEBP),
4224         [x86_intercept_hlt]             = POST_EX(SVM_EXIT_HLT),
4225         [x86_intercept_in]              = POST_EX(SVM_EXIT_IOIO),
4226         [x86_intercept_ins]             = POST_EX(SVM_EXIT_IOIO),
4227         [x86_intercept_out]             = POST_EX(SVM_EXIT_IOIO),
4228         [x86_intercept_outs]            = POST_EX(SVM_EXIT_IOIO),
4229         [x86_intercept_xsetbv]          = PRE_EX(SVM_EXIT_XSETBV),
4230 };
4231
4232 #undef PRE_EX
4233 #undef POST_EX
4234 #undef POST_MEM
4235
4236 static int svm_check_intercept(struct kvm_vcpu *vcpu,
4237                                struct x86_instruction_info *info,
4238                                enum x86_intercept_stage stage,
4239                                struct x86_exception *exception)
4240 {
4241         struct vcpu_svm *svm = to_svm(vcpu);
4242         int vmexit, ret = X86EMUL_CONTINUE;
4243         struct __x86_intercept icpt_info;
4244         struct vmcb *vmcb = svm->vmcb;
4245
4246         if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
4247                 goto out;
4248
4249         icpt_info = x86_intercept_map[info->intercept];
4250
4251         if (stage != icpt_info.stage)
4252                 goto out;
4253
4254         switch (icpt_info.exit_code) {
4255         case SVM_EXIT_READ_CR0:
4256                 if (info->intercept == x86_intercept_cr_read)
4257                         icpt_info.exit_code += info->modrm_reg;
4258                 break;
4259         case SVM_EXIT_WRITE_CR0: {
4260                 unsigned long cr0, val;
4261
4262                 if (info->intercept == x86_intercept_cr_write)
4263                         icpt_info.exit_code += info->modrm_reg;
4264
4265                 if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
4266                     info->intercept == x86_intercept_clts)
4267                         break;
4268
4269                 if (!(vmcb12_is_intercept(&svm->nested.ctl,
4270                                         INTERCEPT_SELECTIVE_CR0)))
4271                         break;
4272
4273                 cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
4274                 val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
4275
4276                 if (info->intercept == x86_intercept_lmsw) {
4277                         cr0 &= 0xfUL;
4278                         val &= 0xfUL;
4279                         /* lmsw can't clear PE - catch this here */
4280                         if (cr0 & X86_CR0_PE)
4281                                 val |= X86_CR0_PE;
4282                 }
4283
4284                 if (cr0 ^ val)
4285                         icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
4286
4287                 break;
4288         }
4289         case SVM_EXIT_READ_DR0:
4290         case SVM_EXIT_WRITE_DR0:
4291                 icpt_info.exit_code += info->modrm_reg;
4292                 break;
4293         case SVM_EXIT_MSR:
4294                 if (info->intercept == x86_intercept_wrmsr)
4295                         vmcb->control.exit_info_1 = 1;
4296                 else
4297                         vmcb->control.exit_info_1 = 0;
4298                 break;
4299         case SVM_EXIT_PAUSE:
4300                 /*
4301                  * We get this for NOP only, but pause
4302                  * is rep not, check this here
4303                  */
4304                 if (info->rep_prefix != REPE_PREFIX)
4305                         goto out;
4306                 break;
4307         case SVM_EXIT_IOIO: {
4308                 u64 exit_info;
4309                 u32 bytes;
4310
4311                 if (info->intercept == x86_intercept_in ||
4312                     info->intercept == x86_intercept_ins) {
4313                         exit_info = ((info->src_val & 0xffff) << 16) |
4314                                 SVM_IOIO_TYPE_MASK;
4315                         bytes = info->dst_bytes;
4316                 } else {
4317                         exit_info = (info->dst_val & 0xffff) << 16;
4318                         bytes = info->src_bytes;
4319                 }
4320
4321                 if (info->intercept == x86_intercept_outs ||
4322                     info->intercept == x86_intercept_ins)
4323                         exit_info |= SVM_IOIO_STR_MASK;
4324
4325                 if (info->rep_prefix)
4326                         exit_info |= SVM_IOIO_REP_MASK;
4327
4328                 bytes = min(bytes, 4u);
4329
4330                 exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
4331
4332                 exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
4333
4334                 vmcb->control.exit_info_1 = exit_info;
4335                 vmcb->control.exit_info_2 = info->next_rip;
4336
4337                 break;
4338         }
4339         default:
4340                 break;
4341         }
4342
4343         /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
4344         if (static_cpu_has(X86_FEATURE_NRIPS))
4345                 vmcb->control.next_rip  = info->next_rip;
4346         vmcb->control.exit_code = icpt_info.exit_code;
4347         vmexit = nested_svm_exit_handled(svm);
4348
4349         ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
4350                                            : X86EMUL_CONTINUE;
4351
4352 out:
4353         return ret;
4354 }
4355
4356 static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
4357 {
4358         if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
4359                 vcpu->arch.at_instruction_boundary = true;
4360 }
4361
4362 static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
4363 {
4364         if (!kvm_pause_in_guest(vcpu->kvm))
4365                 shrink_ple_window(vcpu);
4366 }
4367
4368 static void svm_setup_mce(struct kvm_vcpu *vcpu)
4369 {
4370         /* [63:9] are reserved. */
4371         vcpu->arch.mcg_cap &= 0x1ff;
4372 }
4373
4374 #ifdef CONFIG_KVM_SMM
4375 bool svm_smi_blocked(struct kvm_vcpu *vcpu)
4376 {
4377         struct vcpu_svm *svm = to_svm(vcpu);
4378
4379         /* Per APM Vol.2 15.22.2 "Response to SMI" */
4380         if (!gif_set(svm))
4381                 return true;
4382
4383         return is_smm(vcpu);
4384 }
4385
4386 static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
4387 {
4388         struct vcpu_svm *svm = to_svm(vcpu);
4389         if (svm->nested.nested_run_pending)
4390                 return -EBUSY;
4391
4392         if (svm_smi_blocked(vcpu))
4393                 return 0;
4394
4395         /* An SMI must not be injected into L2 if it's supposed to VM-Exit.  */
4396         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
4397                 return -EBUSY;
4398
4399         return 1;
4400 }
4401
4402 static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
4403 {
4404         struct vcpu_svm *svm = to_svm(vcpu);
4405         struct kvm_host_map map_save;
4406         int ret;
4407
4408         if (!is_guest_mode(vcpu))
4409                 return 0;
4410
4411         /*
4412          * 32-bit SMRAM format doesn't preserve EFER and SVM state.  Userspace is
4413          * responsible for ensuring nested SVM and SMIs are mutually exclusive.
4414          */
4415
4416         if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
4417                 return 1;
4418
4419         smram->smram64.svm_guest_flag = 1;
4420         smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa;
4421
4422         svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4423         svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4424         svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
4425
4426         ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
4427         if (ret)
4428                 return ret;
4429
4430         /*
4431          * KVM uses VMCB01 to store L1 host state while L2 runs but
4432          * VMCB01 is going to be used during SMM and thus the state will
4433          * be lost. Temporary save non-VMLOAD/VMSAVE state to the host save
4434          * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the
4435          * format of the area is identical to guest save area offsetted
4436          * by 0x400 (matches the offset of 'struct vmcb_save_area'
4437          * within 'struct vmcb'). Note: HSAVE area may also be used by
4438          * L1 hypervisor to save additional host context (e.g. KVM does
4439          * that, see svm_prepare_switch_to_guest()) which must be
4440          * preserved.
4441          */
4442         if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
4443                 return 1;
4444
4445         BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
4446
4447         svm_copy_vmrun_state(map_save.hva + 0x400,
4448                              &svm->vmcb01.ptr->save);
4449
4450         kvm_vcpu_unmap(vcpu, &map_save, true);
4451         return 0;
4452 }
4453
4454 static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
4455 {
4456         struct vcpu_svm *svm = to_svm(vcpu);
4457         struct kvm_host_map map, map_save;
4458         struct vmcb *vmcb12;
4459         int ret;
4460
4461         const struct kvm_smram_state_64 *smram64 = &smram->smram64;
4462
4463         if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
4464                 return 0;
4465
4466         /* Non-zero if SMI arrived while vCPU was in guest mode. */
4467         if (!smram64->svm_guest_flag)
4468                 return 0;
4469
4470         if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
4471                 return 1;
4472
4473         if (!(smram64->efer & EFER_SVME))
4474                 return 1;
4475
4476         if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map))
4477                 return 1;
4478
4479         ret = 1;
4480         if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
4481                 goto unmap_map;
4482
4483         if (svm_allocate_nested(svm))
4484                 goto unmap_save;
4485
4486         /*
4487          * Restore L1 host state from L1 HSAVE area as VMCB01 was
4488          * used during SMM (see svm_enter_smm())
4489          */
4490
4491         svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);
4492
4493         /*
4494          * Enter the nested guest now
4495          */
4496
4497         vmcb_mark_all_dirty(svm->vmcb01.ptr);
4498
4499         vmcb12 = map.hva;
4500         nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
4501         nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
4502         ret = enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, vmcb12, false);
4503
4504         if (ret)
4505                 goto unmap_save;
4506
4507         svm->nested.nested_run_pending = 1;
4508
4509 unmap_save:
4510         kvm_vcpu_unmap(vcpu, &map_save, true);
4511 unmap_map:
4512         kvm_vcpu_unmap(vcpu, &map, true);
4513         return ret;
4514 }
4515
4516 static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
4517 {
4518         struct vcpu_svm *svm = to_svm(vcpu);
4519
4520         if (!gif_set(svm)) {
4521                 if (vgif)
4522                         svm_set_intercept(svm, INTERCEPT_STGI);
4523                 /* STGI will cause a vm exit */
4524         } else {
4525                 /* We must be in SMM; RSM will cause a vmexit anyway.  */
4526         }
4527 }
4528 #endif
4529
4530 static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
4531                                         void *insn, int insn_len)
4532 {
4533         bool smep, smap, is_user;
4534         unsigned long cr4;
4535         u64 error_code;
4536
4537         /* Emulation is always possible when KVM has access to all guest state. */
4538         if (!sev_guest(vcpu->kvm))
4539                 return true;
4540
4541         /* #UD and #GP should never be intercepted for SEV guests. */
4542         WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
4543                                   EMULTYPE_TRAP_UD_FORCED |
4544                                   EMULTYPE_VMWARE_GP));
4545
4546         /*
4547          * Emulation is impossible for SEV-ES guests as KVM doesn't have access
4548          * to guest register state.
4549          */
4550         if (sev_es_guest(vcpu->kvm))
4551                 return false;
4552
4553         /*
4554          * Emulation is possible if the instruction is already decoded, e.g.
4555          * when completing I/O after returning from userspace.
4556          */
4557         if (emul_type & EMULTYPE_NO_DECODE)
4558                 return true;
4559
4560         /*
4561          * Emulation is possible for SEV guests if and only if a prefilled
4562          * buffer containing the bytes of the intercepted instruction is
4563          * available. SEV guest memory is encrypted with a guest specific key
4564          * and cannot be decrypted by KVM, i.e. KVM would read cyphertext and
4565          * decode garbage.
4566          *
4567          * Inject #UD if KVM reached this point without an instruction buffer.
4568          * In practice, this path should never be hit by a well-behaved guest,
4569          * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path
4570          * is still theoretically reachable, e.g. via unaccelerated fault-like
4571          * AVIC access, and needs to be handled by KVM to avoid putting the
4572          * guest into an infinite loop.   Injecting #UD is somewhat arbitrary,
4573          * but its the least awful option given lack of insight into the guest.
4574          */
4575         if (unlikely(!insn)) {
4576                 kvm_queue_exception(vcpu, UD_VECTOR);
4577                 return false;
4578         }
4579
4580         /*
4581          * Emulate for SEV guests if the insn buffer is not empty.  The buffer
4582          * will be empty if the DecodeAssist microcode cannot fetch bytes for
4583          * the faulting instruction because the code fetch itself faulted, e.g.
4584          * the guest attempted to fetch from emulated MMIO or a guest page
4585          * table used to translate CS:RIP resides in emulated MMIO.
4586          */
4587         if (likely(insn_len))
4588                 return true;
4589
4590         /*
4591          * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
4592          *
4593          * Errata:
4594          * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
4595          * possible that CPU microcode implementing DecodeAssist will fail to
4596          * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
4597          * be '0'.  This happens because microcode reads CS:RIP using a _data_
4598          * loap uop with CPL=0 privileges.  If the load hits a SMAP #PF, ucode
4599          * gives up and does not fill the instruction bytes buffer.
4600          *
4601          * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
4602          * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
4603          * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
4604          * GuestIntrBytes field of the VMCB.
4605          *
4606          * This does _not_ mean that the erratum has been encountered, as the
4607          * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
4608          * #PF, e.g. if the guest attempt to execute from emulated MMIO and
4609          * encountered a reserved/not-present #PF.
4610          *
4611          * To hit the erratum, the following conditions must be true:
4612          *    1. CR4.SMAP=1 (obviously).
4613          *    2. CR4.SMEP=0 || CPL=3.  If SMEP=1 and CPL<3, the erratum cannot
4614          *       have been hit as the guest would have encountered a SMEP
4615          *       violation #PF, not a #NPF.
4616          *    3. The #NPF is not due to a code fetch, in which case failure to
4617          *       retrieve the instruction bytes is legitimate (see abvoe).
4618          *
4619          * In addition, don't apply the erratum workaround if the #NPF occurred
4620          * while translating guest page tables (see below).
4621          */
4622         error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
4623         if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
4624                 goto resume_guest;
4625
4626         cr4 = kvm_read_cr4(vcpu);
4627         smep = cr4 & X86_CR4_SMEP;
4628         smap = cr4 & X86_CR4_SMAP;
4629         is_user = svm_get_cpl(vcpu) == 3;
4630         if (smap && (!smep || is_user)) {
4631                 pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n");
4632
4633                 /*
4634                  * If the fault occurred in userspace, arbitrarily inject #GP
4635                  * to avoid killing the guest and to hopefully avoid confusing
4636                  * the guest kernel too much, e.g. injecting #PF would not be
4637                  * coherent with respect to the guest's page tables.  Request
4638                  * triple fault if the fault occurred in the kernel as there's
4639                  * no fault that KVM can inject without confusing the guest.
4640                  * In practice, the triple fault is moot as no sane SEV kernel
4641                  * will execute from user memory while also running with SMAP=1.
4642                  */
4643                 if (is_user)
4644                         kvm_inject_gp(vcpu, 0);
4645                 else
4646                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4647         }
4648
4649 resume_guest:
4650         /*
4651          * If the erratum was not hit, simply resume the guest and let it fault
4652          * again.  While awful, e.g. the vCPU may get stuck in an infinite loop
4653          * if the fault is at CPL=0, it's the lesser of all evils.  Exiting to
4654          * userspace will kill the guest, and letting the emulator read garbage
4655          * will yield random behavior and potentially corrupt the guest.
4656          *
4657          * Simply resuming the guest is technically not a violation of the SEV
4658          * architecture.  AMD's APM states that all code fetches and page table
4659          * accesses for SEV guest are encrypted, regardless of the C-Bit.  The
4660          * APM also states that encrypted accesses to MMIO are "ignored", but
4661          * doesn't explicitly define "ignored", i.e. doing nothing and letting
4662          * the guest spin is technically "ignoring" the access.
4663          */
4664         return false;
4665 }
4666
4667 static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
4668 {
4669         struct vcpu_svm *svm = to_svm(vcpu);
4670
4671         return !gif_set(svm);
4672 }
4673
4674 static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
4675 {
4676         if (!sev_es_guest(vcpu->kvm))
4677                 return kvm_vcpu_deliver_sipi_vector(vcpu, vector);
4678
4679         sev_vcpu_deliver_sipi_vector(vcpu, vector);
4680 }
4681
4682 static void svm_vm_destroy(struct kvm *kvm)
4683 {
4684         avic_vm_destroy(kvm);
4685         sev_vm_destroy(kvm);
4686 }
4687
4688 static int svm_vm_init(struct kvm *kvm)
4689 {
4690         if (!pause_filter_count || !pause_filter_thresh)
4691                 kvm->arch.pause_in_guest = true;
4692
4693         if (enable_apicv) {
4694                 int ret = avic_vm_init(kvm);
4695                 if (ret)
4696                         return ret;
4697         }
4698
4699         return 0;
4700 }
4701
4702 static struct kvm_x86_ops svm_x86_ops __initdata = {
4703         .name = KBUILD_MODNAME,
4704
4705         .check_processor_compatibility = svm_check_processor_compat,
4706
4707         .hardware_unsetup = svm_hardware_unsetup,
4708         .hardware_enable = svm_hardware_enable,
4709         .hardware_disable = svm_hardware_disable,
4710         .has_emulated_msr = svm_has_emulated_msr,
4711
4712         .vcpu_create = svm_vcpu_create,
4713         .vcpu_free = svm_vcpu_free,
4714         .vcpu_reset = svm_vcpu_reset,
4715
4716         .vm_size = sizeof(struct kvm_svm),
4717         .vm_init = svm_vm_init,
4718         .vm_destroy = svm_vm_destroy,
4719
4720         .prepare_switch_to_guest = svm_prepare_switch_to_guest,
4721         .vcpu_load = svm_vcpu_load,
4722         .vcpu_put = svm_vcpu_put,
4723         .vcpu_blocking = avic_vcpu_blocking,
4724         .vcpu_unblocking = avic_vcpu_unblocking,
4725
4726         .update_exception_bitmap = svm_update_exception_bitmap,
4727         .get_msr_feature = svm_get_msr_feature,
4728         .get_msr = svm_get_msr,
4729         .set_msr = svm_set_msr,
4730         .get_segment_base = svm_get_segment_base,
4731         .get_segment = svm_get_segment,
4732         .set_segment = svm_set_segment,
4733         .get_cpl = svm_get_cpl,
4734         .get_cs_db_l_bits = svm_get_cs_db_l_bits,
4735         .set_cr0 = svm_set_cr0,
4736         .post_set_cr3 = sev_post_set_cr3,
4737         .is_valid_cr4 = svm_is_valid_cr4,
4738         .set_cr4 = svm_set_cr4,
4739         .set_efer = svm_set_efer,
4740         .get_idt = svm_get_idt,
4741         .set_idt = svm_set_idt,
4742         .get_gdt = svm_get_gdt,
4743         .set_gdt = svm_set_gdt,
4744         .set_dr7 = svm_set_dr7,
4745         .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
4746         .cache_reg = svm_cache_reg,
4747         .get_rflags = svm_get_rflags,
4748         .set_rflags = svm_set_rflags,
4749         .get_if_flag = svm_get_if_flag,
4750
4751         .flush_tlb_all = svm_flush_tlb_current,
4752         .flush_tlb_current = svm_flush_tlb_current,
4753         .flush_tlb_gva = svm_flush_tlb_gva,
4754         .flush_tlb_guest = svm_flush_tlb_current,
4755
4756         .vcpu_pre_run = svm_vcpu_pre_run,
4757         .vcpu_run = svm_vcpu_run,
4758         .handle_exit = svm_handle_exit,
4759         .skip_emulated_instruction = svm_skip_emulated_instruction,
4760         .update_emulated_instruction = NULL,
4761         .set_interrupt_shadow = svm_set_interrupt_shadow,
4762         .get_interrupt_shadow = svm_get_interrupt_shadow,
4763         .patch_hypercall = svm_patch_hypercall,
4764         .inject_irq = svm_inject_irq,
4765         .inject_nmi = svm_inject_nmi,
4766         .inject_exception = svm_inject_exception,
4767         .cancel_injection = svm_cancel_injection,
4768         .interrupt_allowed = svm_interrupt_allowed,
4769         .nmi_allowed = svm_nmi_allowed,
4770         .get_nmi_mask = svm_get_nmi_mask,
4771         .set_nmi_mask = svm_set_nmi_mask,
4772         .enable_nmi_window = svm_enable_nmi_window,
4773         .enable_irq_window = svm_enable_irq_window,
4774         .update_cr8_intercept = svm_update_cr8_intercept,
4775         .set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
4776         .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
4777         .apicv_post_state_restore = avic_apicv_post_state_restore,
4778         .required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS,
4779
4780         .get_exit_info = svm_get_exit_info,
4781
4782         .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
4783
4784         .has_wbinvd_exit = svm_has_wbinvd_exit,
4785
4786         .get_l2_tsc_offset = svm_get_l2_tsc_offset,
4787         .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
4788         .write_tsc_offset = svm_write_tsc_offset,
4789         .write_tsc_multiplier = svm_write_tsc_multiplier,
4790
4791         .load_mmu_pgd = svm_load_mmu_pgd,
4792
4793         .check_intercept = svm_check_intercept,
4794         .handle_exit_irqoff = svm_handle_exit_irqoff,
4795
4796         .request_immediate_exit = __kvm_request_immediate_exit,
4797
4798         .sched_in = svm_sched_in,
4799
4800         .nested_ops = &svm_nested_ops,
4801
4802         .deliver_interrupt = svm_deliver_interrupt,
4803         .pi_update_irte = avic_pi_update_irte,
4804         .setup_mce = svm_setup_mce,
4805
4806 #ifdef CONFIG_KVM_SMM
4807         .smi_allowed = svm_smi_allowed,
4808         .enter_smm = svm_enter_smm,
4809         .leave_smm = svm_leave_smm,
4810         .enable_smi_window = svm_enable_smi_window,
4811 #endif
4812
4813         .mem_enc_ioctl = sev_mem_enc_ioctl,
4814         .mem_enc_register_region = sev_mem_enc_register_region,
4815         .mem_enc_unregister_region = sev_mem_enc_unregister_region,
4816         .guest_memory_reclaimed = sev_guest_memory_reclaimed,
4817
4818         .vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
4819         .vm_move_enc_context_from = sev_vm_move_enc_context_from,
4820
4821         .can_emulate_instruction = svm_can_emulate_instruction,
4822
4823         .apic_init_signal_blocked = svm_apic_init_signal_blocked,
4824
4825         .msr_filter_changed = svm_msr_filter_changed,
4826         .complete_emulated_msr = svm_complete_emulated_msr,
4827
4828         .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
4829         .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
4830 };
4831
4832 /*
4833  * The default MMIO mask is a single bit (excluding the present bit),
4834  * which could conflict with the memory encryption bit. Check for
4835  * memory encryption support and override the default MMIO mask if
4836  * memory encryption is enabled.
4837  */
4838 static __init void svm_adjust_mmio_mask(void)
4839 {
4840         unsigned int enc_bit, mask_bit;
4841         u64 msr, mask;
4842
4843         /* If there is no memory encryption support, use existing mask */
4844         if (cpuid_eax(0x80000000) < 0x8000001f)
4845                 return;
4846
4847         /* If memory encryption is not enabled, use existing mask */
4848         rdmsrl(MSR_AMD64_SYSCFG, msr);
4849         if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
4850                 return;
4851
4852         enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
4853         mask_bit = boot_cpu_data.x86_phys_bits;
4854
4855         /* Increment the mask bit if it is the same as the encryption bit */
4856         if (enc_bit == mask_bit)
4857                 mask_bit++;
4858
4859         /*
4860          * If the mask bit location is below 52, then some bits above the
4861          * physical addressing limit will always be reserved, so use the
4862          * rsvd_bits() function to generate the mask. This mask, along with
4863          * the present bit, will be used to generate a page fault with
4864          * PFER.RSV = 1.
4865          *
4866          * If the mask bit location is 52 (or above), then clear the mask.
4867          */
4868         mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
4869
4870         kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
4871 }
4872
4873 static __init void svm_set_cpu_caps(void)
4874 {
4875         kvm_set_cpu_caps();
4876
4877         kvm_caps.supported_perf_cap = 0;
4878         kvm_caps.supported_xss = 0;
4879
4880         /* CPUID 0x80000001 and 0x8000000A (SVM features) */
4881         if (nested) {
4882                 kvm_cpu_cap_set(X86_FEATURE_SVM);
4883                 kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);
4884
4885                 if (nrips)
4886                         kvm_cpu_cap_set(X86_FEATURE_NRIPS);
4887
4888                 if (npt_enabled)
4889                         kvm_cpu_cap_set(X86_FEATURE_NPT);
4890
4891                 if (tsc_scaling)
4892                         kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
4893
4894                 if (vls)
4895                         kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
4896                 if (lbrv)
4897                         kvm_cpu_cap_set(X86_FEATURE_LBRV);
4898
4899                 if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
4900                         kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
4901
4902                 if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
4903                         kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
4904
4905                 if (vgif)
4906                         kvm_cpu_cap_set(X86_FEATURE_VGIF);
4907
4908                 /* Nested VM can receive #VMEXIT instead of triggering #GP */
4909                 kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
4910         }
4911
4912         /* CPUID 0x80000008 */
4913         if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
4914             boot_cpu_has(X86_FEATURE_AMD_SSBD))
4915                 kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
4916
4917         /* AMD PMU PERFCTR_CORE CPUID */
4918         if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
4919                 kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
4920
4921         /* CPUID 0x8000001F (SME/SEV features) */
4922         sev_set_cpu_caps();
4923 }
4924
4925 static __init int svm_hardware_setup(void)
4926 {
4927         int cpu;
4928         struct page *iopm_pages;
4929         void *iopm_va;
4930         int r;
4931         unsigned int order = get_order(IOPM_SIZE);
4932
4933         /*
4934          * NX is required for shadow paging and for NPT if the NX huge pages
4935          * mitigation is enabled.
4936          */
4937         if (!boot_cpu_has(X86_FEATURE_NX)) {
4938                 pr_err_ratelimited("NX (Execute Disable) not supported\n");
4939                 return -EOPNOTSUPP;
4940         }
4941         kvm_enable_efer_bits(EFER_NX);
4942
4943         iopm_pages = alloc_pages(GFP_KERNEL, order);
4944
4945         if (!iopm_pages)
4946                 return -ENOMEM;
4947
4948         iopm_va = page_address(iopm_pages);
4949         memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
4950         iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
4951
4952         init_msrpm_offsets();
4953
4954         kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
4955                                      XFEATURE_MASK_BNDCSR);
4956
4957         if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
4958                 kvm_enable_efer_bits(EFER_FFXSR);
4959
4960         if (tsc_scaling) {
4961                 if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
4962                         tsc_scaling = false;
4963                 } else {
4964                         pr_info("TSC scaling supported\n");
4965                         kvm_caps.has_tsc_control = true;
4966                 }
4967         }
4968         kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
4969         kvm_caps.tsc_scaling_ratio_frac_bits = 32;
4970
4971         tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
4972
4973         if (boot_cpu_has(X86_FEATURE_AUTOIBRS))
4974                 kvm_enable_efer_bits(EFER_AUTOIBRS);
4975
4976         /* Check for pause filtering support */
4977         if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
4978                 pause_filter_count = 0;
4979                 pause_filter_thresh = 0;
4980         } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
4981                 pause_filter_thresh = 0;
4982         }
4983
4984         if (nested) {
4985                 pr_info("Nested Virtualization enabled\n");
4986                 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
4987         }
4988
4989         /*
4990          * KVM's MMU doesn't support using 2-level paging for itself, and thus
4991          * NPT isn't supported if the host is using 2-level paging since host
4992          * CR4 is unchanged on VMRUN.
4993          */
4994         if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
4995                 npt_enabled = false;
4996
4997         if (!boot_cpu_has(X86_FEATURE_NPT))
4998                 npt_enabled = false;
4999
5000         /* Force VM NPT level equal to the host's paging level */
5001         kvm_configure_mmu(npt_enabled, get_npt_level(),
5002                           get_npt_level(), PG_LEVEL_1G);
5003         pr_info("Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
5004
5005         /* Setup shadow_me_value and shadow_me_mask */
5006         kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
5007
5008         svm_adjust_mmio_mask();
5009
5010         /*
5011          * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
5012          * may be modified by svm_adjust_mmio_mask()).
5013          */
5014         sev_hardware_setup();
5015
5016         svm_hv_hardware_setup();
5017
5018         for_each_possible_cpu(cpu) {
5019                 r = svm_cpu_init(cpu);
5020                 if (r)
5021                         goto err;
5022         }
5023
5024         if (nrips) {
5025                 if (!boot_cpu_has(X86_FEATURE_NRIPS))
5026                         nrips = false;
5027         }
5028
5029         enable_apicv = avic = avic && avic_hardware_setup();
5030
5031         if (!enable_apicv) {
5032                 svm_x86_ops.vcpu_blocking = NULL;
5033                 svm_x86_ops.vcpu_unblocking = NULL;
5034                 svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
5035         } else if (!x2avic_enabled) {
5036                 svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
5037         }
5038
5039         if (vls) {
5040                 if (!npt_enabled ||
5041                     !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
5042                     !IS_ENABLED(CONFIG_X86_64)) {
5043                         vls = false;
5044                 } else {
5045                         pr_info("Virtual VMLOAD VMSAVE supported\n");
5046                 }
5047         }
5048
5049         if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
5050                 svm_gp_erratum_intercept = false;
5051
5052         if (vgif) {
5053                 if (!boot_cpu_has(X86_FEATURE_VGIF))
5054                         vgif = false;
5055                 else
5056                         pr_info("Virtual GIF supported\n");
5057         }
5058
5059         if (lbrv) {
5060                 if (!boot_cpu_has(X86_FEATURE_LBRV))
5061                         lbrv = false;
5062                 else
5063                         pr_info("LBR virtualization supported\n");
5064         }
5065
5066         if (!enable_pmu)
5067                 pr_info("PMU virtualization is disabled\n");
5068
5069         svm_set_cpu_caps();
5070
5071         /*
5072          * It seems that on AMD processors PTE's accessed bit is
5073          * being set by the CPU hardware before the NPF vmexit.
5074          * This is not expected behaviour and our tests fail because
5075          * of it.
5076          * A workaround here is to disable support for
5077          * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
5078          * In this case userspace can know if there is support using
5079          * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
5080          * it
5081          * If future AMD CPU models change the behaviour described above,
5082          * this variable can be changed accordingly
5083          */
5084         allow_smaller_maxphyaddr = !npt_enabled;
5085
5086         return 0;
5087
5088 err:
5089         svm_hardware_unsetup();
5090         return r;
5091 }
5092
5093
5094 static struct kvm_x86_init_ops svm_init_ops __initdata = {
5095         .hardware_setup = svm_hardware_setup,
5096
5097         .runtime_ops = &svm_x86_ops,
5098         .pmu_ops = &amd_pmu_ops,
5099 };
5100
5101 static int __init svm_init(void)
5102 {
5103         int r;
5104
5105         __unused_size_checks();
5106
5107         if (!kvm_is_svm_supported())
5108                 return -EOPNOTSUPP;
5109
5110         r = kvm_x86_vendor_init(&svm_init_ops);
5111         if (r)
5112                 return r;
5113
5114         /*
5115          * Common KVM initialization _must_ come last, after this, /dev/kvm is
5116          * exposed to userspace!
5117          */
5118         r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm),
5119                      THIS_MODULE);
5120         if (r)
5121                 goto err_kvm_init;
5122
5123         return 0;
5124
5125 err_kvm_init:
5126         kvm_x86_vendor_exit();
5127         return r;
5128 }
5129
5130 static void __exit svm_exit(void)
5131 {
5132         kvm_exit();
5133         kvm_x86_vendor_exit();
5134 }
5135
5136 module_init(svm_init)
5137 module_exit(svm_exit)