Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
[linux-block.git] / arch/x86/kvm/x86.c
index b0c47b41c264982c993a098e738fa2ef8f9e6add..4bd5f8a751de91ffeb666e1be9c5db8ae3b65f36 100644
@@ -173,8 +173,13 @@ bool __read_mostly enable_vmware_backdoor = false;
 module_param(enable_vmware_backdoor, bool, S_IRUGO);
 EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
 
-static bool __read_mostly force_emulation_prefix = false;
-module_param(force_emulation_prefix, bool, S_IRUGO);
+/*
+ * Flags to manipulate forced emulation behavior (any non-zero value will
+ * enable forced emulation).
+ */
+#define KVM_FEP_CLEAR_RFLAGS_RF        BIT(1)
+static int __read_mostly force_emulation_prefix;
+module_param(force_emulation_prefix, int, 0644);
 
 int __read_mostly pi_inject_timer = -1;
 module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
@@ -528,6 +533,7 @@ static int exception_class(int vector)
 #define EXCPT_TRAP             1
 #define EXCPT_ABORT            2
 #define EXCPT_INTERRUPT                3
+#define EXCPT_DB               4
 
 static int exception_type(int vector)
 {
@@ -538,8 +544,14 @@ static int exception_type(int vector)
 
        mask = 1 << vector;
 
-       /* #DB is trap, as instruction watchpoints are handled elsewhere */
-       if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
+       /*
+        * #DBs can be trap-like or fault-like; the caller must check other CPU
+        * state, e.g. DR6, to determine whether a #DB is a trap or fault.
+        */
+       if (mask & (1 << DB_VECTOR))
+               return EXCPT_DB;
+
+       if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR)))
                return EXCPT_TRAP;
 
        if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
@@ -549,16 +561,13 @@ static int exception_type(int vector)
        return EXCPT_FAULT;
 }
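
As a reading aid, here is a minimal user-space sketch of the classification implemented above, including the new EXCPT_DB class. The vector numbers are the architectural x86 ones and the EXCPT_* values mirror the #defines earlier in this hunk; none of this is kernel code, and the helper name is made up for the illustration.

#include <assert.h>
#include <stdio.h>

#define EXCPT_FAULT	0
#define EXCPT_TRAP	1
#define EXCPT_ABORT	2
#define EXCPT_DB	4

enum { DB_VEC = 1, BP_VEC = 3, OF_VEC = 4, DF_VEC = 8, PF_VEC = 14, MC_VEC = 18 };

/* Mirrors the classification in exception_type(); sketch only. */
static int sketch_exception_type(int vector)
{
	unsigned int mask = 1u << vector;

	if (mask & (1u << DB_VEC))			/* trap- or fault-like, caller decides */
		return EXCPT_DB;
	if (mask & ((1u << BP_VEC) | (1u << OF_VEC)))	/* trap-like */
		return EXCPT_TRAP;
	if (mask & ((1u << DF_VEC) | (1u << MC_VEC)))	/* aborts */
		return EXCPT_ABORT;
	return EXCPT_FAULT;				/* everything else */
}

int main(void)
{
	assert(sketch_exception_type(DB_VEC) == EXCPT_DB);
	assert(sketch_exception_type(BP_VEC) == EXCPT_TRAP);
	assert(sketch_exception_type(DF_VEC) == EXCPT_ABORT);
	assert(sketch_exception_type(PF_VEC) == EXCPT_FAULT);
	printf("exception_type sketch OK\n");
	return 0;
}
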
 
-void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
+void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
+                                  struct kvm_queued_exception *ex)
 {
-       unsigned nr = vcpu->arch.exception.nr;
-       bool has_payload = vcpu->arch.exception.has_payload;
-       unsigned long payload = vcpu->arch.exception.payload;
-
-       if (!has_payload)
+       if (!ex->has_payload)
                return;
 
-       switch (nr) {
+       switch (ex->vector) {
        case DB_VECTOR:
                /*
                 * "Certain debug exceptions may clear bit 0-3.  The
@@ -583,8 +592,8 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
                 * So they need to be flipped for DR6.
                 */
                vcpu->arch.dr6 |= DR6_ACTIVE_LOW;
-               vcpu->arch.dr6 |= payload;
-               vcpu->arch.dr6 ^= payload & DR6_ACTIVE_LOW;
+               vcpu->arch.dr6 |= ex->payload;
+               vcpu->arch.dr6 ^= ex->payload & DR6_ACTIVE_LOW;
 
                /*
                 * The #DB payload is defined as compatible with the 'pending
@@ -595,15 +604,30 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
                vcpu->arch.dr6 &= ~BIT(12);
                break;
        case PF_VECTOR:
-               vcpu->arch.cr2 = payload;
+               vcpu->arch.cr2 = ex->payload;
                break;
        }
 
-       vcpu->arch.exception.has_payload = false;
-       vcpu->arch.exception.payload = 0;
+       ex->has_payload = false;
+       ex->payload = 0;
 }
 EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
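
The DR6 merge above is easy to misread, so here is a stand-alone worked example of the same arithmetic. DR6_ACTIVE_LOW is re-declared with the kernel's 0xffff0ff0 value purely for illustration, and the helper and its name are hypothetical rather than KVM code.

#include <assert.h>
#include <stdio.h>

#define DR6_ACTIVE_LOW	0xffff0ff0ul	/* DR6 bits that read as 1 when "inactive" */

/*
 * The payload uses the VMX "pending debug exceptions" format, where every
 * bit is positive polarity; active-low DR6 bits therefore get flipped.
 */
static unsigned long merge_db_payload(unsigned long dr6, unsigned long payload)
{
	dr6 |= DR6_ACTIVE_LOW;			/* "no debug exception" base value */
	dr6 |= payload;				/* set the positive-polarity bits */
	dr6 ^= payload & DR6_ACTIVE_LOW;	/* flip the active-low ones */
	dr6 &= ~(1ul << 12);			/* bit 12 exists in the PDE format, not in DR6 */
	return dr6;
}

int main(void)
{
	/* B1 hit: bit 1 is active-high, so it is simply set in DR6. */
	assert(merge_db_payload(0, 1ul << 1) == (DR6_ACTIVE_LOW | (1ul << 1)));

	/* #DB in an RTM region: bit 16 is active-low, so it ends up cleared. */
	assert(merge_db_payload(0, 1ul << 16) == (DR6_ACTIVE_LOW & ~(1ul << 16)));

	printf("DR6 payload merge OK\n");
	return 0;
}
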
 
+static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vector,
+                                      bool has_error_code, u32 error_code,
+                                      bool has_payload, unsigned long payload)
+{
+       struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
+
+       ex->vector = vector;
+       ex->injected = false;
+       ex->pending = true;
+       ex->has_error_code = has_error_code;
+       ex->error_code = error_code;
+       ex->has_payload = has_payload;
+       ex->payload = payload;
+}
+
 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
                unsigned nr, bool has_error, u32 error_code,
                bool has_payload, unsigned long payload, bool reinject)
@@ -613,18 +637,31 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 
        kvm_make_request(KVM_REQ_EVENT, vcpu);
 
+       /*
+        * If the exception is destined for L2 and isn't being reinjected,
+        * morph it to a VM-Exit if L1 wants to intercept the exception.  A
+        * previously injected exception is not checked because it was checked
+        * when it was originally queued, and re-checking is incorrect if _L1_
+        * injected the exception, in which case it's exempt from interception.
+        */
+       if (!reinject && is_guest_mode(vcpu) &&
+           kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) {
+               kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code,
+                                          has_payload, payload);
+               return;
+       }
+
        if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
        queue:
                if (reinject) {
                        /*
-                        * On vmentry, vcpu->arch.exception.pending is only
-                        * true if an event injection was blocked by
-                        * nested_run_pending.  In that case, however,
-                        * vcpu_enter_guest requests an immediate exit,
-                        * and the guest shouldn't proceed far enough to
-                        * need reinjection.
+                        * On VM-Entry, an exception can be pending if and only
+                        * if event injection was blocked by nested_run_pending.
+                        * In that case, however, vcpu_enter_guest() requests an
+                        * immediate exit, and the guest shouldn't proceed far
+                        * enough to need reinjection.
                         */
-                       WARN_ON_ONCE(vcpu->arch.exception.pending);
+                       WARN_ON_ONCE(kvm_is_exception_pending(vcpu));
                        vcpu->arch.exception.injected = true;
                        if (WARN_ON_ONCE(has_payload)) {
                                /*
@@ -639,17 +676,18 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
                        vcpu->arch.exception.injected = false;
                }
                vcpu->arch.exception.has_error_code = has_error;
-               vcpu->arch.exception.nr = nr;
+               vcpu->arch.exception.vector = nr;
                vcpu->arch.exception.error_code = error_code;
                vcpu->arch.exception.has_payload = has_payload;
                vcpu->arch.exception.payload = payload;
                if (!is_guest_mode(vcpu))
-                       kvm_deliver_exception_payload(vcpu);
+                       kvm_deliver_exception_payload(vcpu,
+                                                     &vcpu->arch.exception);
                return;
        }
 
        /* to check exception */
-       prev_nr = vcpu->arch.exception.nr;
+       prev_nr = vcpu->arch.exception.vector;
        if (prev_nr == DF_VECTOR) {
                /* triple fault -> shutdown */
                kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
@@ -657,25 +695,22 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
        }
        class1 = exception_class(prev_nr);
        class2 = exception_class(nr);
-       if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
-               || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
+       if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) ||
+           (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
                /*
-                * Generate double fault per SDM Table 5-5.  Set
-                * exception.pending = true so that the double fault
-                * can trigger a nested vmexit.
+                * Synthesize #DF.  Clear the previously injected or pending
+                * exception so as not to incorrectly trigger shutdown.
                 */
-               vcpu->arch.exception.pending = true;
                vcpu->arch.exception.injected = false;
-               vcpu->arch.exception.has_error_code = true;
-               vcpu->arch.exception.nr = DF_VECTOR;
-               vcpu->arch.exception.error_code = 0;
-               vcpu->arch.exception.has_payload = false;
-               vcpu->arch.exception.payload = 0;
-       } else
+               vcpu->arch.exception.pending = false;
+
+               kvm_queue_exception_e(vcpu, DF_VECTOR, 0);
+       } else {
                /* replace previous exception with a new one in a hope
                   that instruction re-execution will regenerate lost
                   exception */
                goto queue;
+       }
 }
 
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
@@ -729,20 +764,22 @@ static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err)
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 {
        ++vcpu->stat.pf_guest;
-       vcpu->arch.exception.nested_apf =
-               is_guest_mode(vcpu) && fault->async_page_fault;
-       if (vcpu->arch.exception.nested_apf) {
-               vcpu->arch.apf.nested_apf_token = fault->address;
-               kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
-       } else {
+
+       /*
+        * Async #PF in L2 is always forwarded to L1 as a VM-Exit regardless of
+        * whether or not L1 wants to intercept "regular" #PF.
+        */
+       if (is_guest_mode(vcpu) && fault->async_page_fault)
+               kvm_queue_exception_vmexit(vcpu, PF_VECTOR,
+                                          true, fault->error_code,
+                                          true, fault->address);
+       else
                kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
                                        fault->address);
-       }
 }
 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
 
-/* Returns true if the page fault was immediately morphed into a VM-Exit. */
-bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
                                    struct x86_exception *fault)
 {
        struct kvm_mmu *fault_mmu;
@@ -760,26 +797,7 @@ bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
                kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
                                       fault_mmu->root.hpa);
 
-       /*
-        * A workaround for KVM's bad exception handling.  If KVM injected an
-        * exception into L2, and L2 encountered a #PF while vectoring the
-        * injected exception, manually check to see if L1 wants to intercept
-        * #PF, otherwise queuing the #PF will lead to #DF or a lost exception.
-        * In all other cases, defer the check to nested_ops->check_events(),
-        * which will correctly handle priority (this does not).  Note, other
-        * exceptions, e.g. #GP, are theoretically affected, #PF is simply the
-        * most problematic, e.g. when L0 and L1 are both intercepting #PF for
-        * shadow paging.
-        *
-        * TODO: Rewrite exception handling to track injected and pending
-        *       (VM-Exit) exceptions separately.
-        */
-       if (unlikely(vcpu->arch.exception.injected && is_guest_mode(vcpu)) &&
-           kvm_x86_ops.nested_ops->handle_page_fault_workaround(vcpu, fault))
-               return true;
-
        fault_mmu->inject_page_fault(vcpu, fault);
-       return false;
 }
 EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
 
@@ -4841,7 +4859,7 @@ static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
        return (kvm_arch_interrupt_allowed(vcpu) &&
                kvm_cpu_accept_dm_intr(vcpu) &&
                !kvm_event_needs_reinjection(vcpu) &&
-               !vcpu->arch.exception.pending);
+               !kvm_is_exception_pending(vcpu));
 }
 
 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
@@ -5016,25 +5034,38 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
                                               struct kvm_vcpu_events *events)
 {
+       struct kvm_queued_exception *ex;
+
        process_nmi(vcpu);
 
        if (kvm_check_request(KVM_REQ_SMI, vcpu))
                process_smi(vcpu);
 
        /*
-        * In guest mode, payload delivery should be deferred,
-        * so that the L1 hypervisor can intercept #PF before
-        * CR2 is modified (or intercept #DB before DR6 is
-        * modified under nVMX). Unless the per-VM capability,
-        * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of
-        * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we
-        * opportunistically defer the exception payload, deliver it if the
-        * capability hasn't been requested before processing a
-        * KVM_GET_VCPU_EVENTS.
+        * KVM's ABI only allows for one exception to be migrated.  Luckily,
+        * the only time there can be two queued exceptions is if there's a
+        * non-exiting _injected_ exception, and a pending exiting exception.
+        * In that case, ignore the VM-Exiting exception as it's an extension
+        * of the injected exception.
+        */
+       if (vcpu->arch.exception_vmexit.pending &&
+           !vcpu->arch.exception.pending &&
+           !vcpu->arch.exception.injected)
+               ex = &vcpu->arch.exception_vmexit;
+       else
+               ex = &vcpu->arch.exception;
+
+       /*
+        * In guest mode, payload delivery should be deferred if the exception
+        * will be intercepted by L1, e.g. KVM should not modify CR2 if L1
+        * intercepts #PF, ditto for DR6 and #DBs.  If the per-VM capability,
+        * KVM_CAP_EXCEPTION_PAYLOAD, is not set, userspace may or may not
+        * propagate the payload and so it cannot be safely deferred.  Deliver
+        * the payload if the capability hasn't been requested.
         */
        if (!vcpu->kvm->arch.exception_payload_enabled &&
-           vcpu->arch.exception.pending && vcpu->arch.exception.has_payload)
-               kvm_deliver_exception_payload(vcpu);
+           ex->pending && ex->has_payload)
+               kvm_deliver_exception_payload(vcpu, ex);
 
        /*
         * The API doesn't provide the instruction length for software
@@ -5042,26 +5073,25 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
         * isn't advanced, we should expect to encounter the exception
         * again.
         */
-       if (kvm_exception_is_soft(vcpu->arch.exception.nr)) {
+       if (kvm_exception_is_soft(ex->vector)) {
                events->exception.injected = 0;
                events->exception.pending = 0;
        } else {
-               events->exception.injected = vcpu->arch.exception.injected;
-               events->exception.pending = vcpu->arch.exception.pending;
+               events->exception.injected = ex->injected;
+               events->exception.pending = ex->pending;
                /*
                 * For ABI compatibility, deliberately conflate
                 * pending and injected exceptions when
                 * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
                 */
                if (!vcpu->kvm->arch.exception_payload_enabled)
-                       events->exception.injected |=
-                               vcpu->arch.exception.pending;
+                       events->exception.injected |= ex->pending;
        }
-       events->exception.nr = vcpu->arch.exception.nr;
-       events->exception.has_error_code = vcpu->arch.exception.has_error_code;
-       events->exception.error_code = vcpu->arch.exception.error_code;
-       events->exception_has_payload = vcpu->arch.exception.has_payload;
-       events->exception_payload = vcpu->arch.exception.payload;
+       events->exception.nr = ex->vector;
+       events->exception.has_error_code = ex->has_error_code;
+       events->exception.error_code = ex->error_code;
+       events->exception_has_payload = ex->has_payload;
+       events->exception_payload = ex->payload;
 
        events->interrupt.injected =
                vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
@@ -5131,9 +5161,22 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                return -EINVAL;
 
        process_nmi(vcpu);
+
+       /*
+        * Flag that userspace is stuffing an exception; the next KVM_RUN will
+        * morph the exception to a VM-Exit if appropriate.  Do this only for
+        * pending exceptions; already-injected exceptions are not subject to
+        * interception.  Note, userspace that conflates pending and injected
+        * is hosed, and will incorrectly convert an injected exception into a
+        * pending exception, which in turn may cause a spurious VM-Exit.
+        */
+       vcpu->arch.exception_from_userspace = events->exception.pending;
+
+       vcpu->arch.exception_vmexit.pending = false;
+
        vcpu->arch.exception.injected = events->exception.injected;
        vcpu->arch.exception.pending = events->exception.pending;
-       vcpu->arch.exception.nr = events->exception.nr;
+       vcpu->arch.exception.vector = events->exception.nr;
        vcpu->arch.exception.has_error_code = events->exception.has_error_code;
        vcpu->arch.exception.error_code = events->exception.error_code;
        vcpu->arch.exception.has_payload = events->exception_has_payload;
@@ -7257,6 +7300,7 @@ static int kvm_can_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
 int handle_ud(struct kvm_vcpu *vcpu)
 {
        static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
+       int fep_flags = READ_ONCE(force_emulation_prefix);
        int emul_type = EMULTYPE_TRAP_UD;
        char sig[5]; /* ud2; .ascii "kvm" */
        struct x86_exception e;
@@ -7264,10 +7308,12 @@ int handle_ud(struct kvm_vcpu *vcpu)
        if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0)))
                return 1;
 
-       if (force_emulation_prefix &&
+       if (fep_flags &&
            kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
                                sig, sizeof(sig), &e) == 0 &&
            memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) {
+               if (fep_flags & KVM_FEP_CLEAR_RFLAGS_RF)
+                       kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) & ~X86_EFLAGS_RF);
                kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
                emul_type = EMULTYPE_TRAP_UD_FORCED;
        }
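
For context, a stand-alone sketch of how the reworked force_emulation_prefix value is interpreted in handle_ud() above: any non-zero value enables forced emulation, and bit 1 (KVM_FEP_CLEAR_RFLAGS_RF) additionally clears RFLAGS.RF before emulating the instruction that follows the prefix. The helper below and its names are hypothetical and only illustrate that decision.

#include <stdbool.h>
#include <stdio.h>

#define KVM_FEP_CLEAR_RFLAGS_RF	(1u << 1)

struct fep_decision {
	bool emulate;		/* treat the #UD as a forced-emulation request */
	bool clear_rf;		/* clear RFLAGS.RF (bit 16) before emulating */
};

static struct fep_decision fep_decide(unsigned int fep_flags, bool prefix_matched)
{
	struct fep_decision d = { false, false };

	if (fep_flags && prefix_matched) {
		d.emulate = true;
		d.clear_rf = !!(fep_flags & KVM_FEP_CLEAR_RFLAGS_RF);
	}
	return d;
}

int main(void)
{
	struct fep_decision d = fep_decide(KVM_FEP_CLEAR_RFLAGS_RF, true);

	printf("emulate=%d clear_rf=%d\n", d.emulate, d.clear_rf);
	return 0;
}
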
@@ -7933,14 +7979,20 @@ static int emulator_get_msr_with_filter(struct x86_emulate_ctxt *ctxt,
        int r;
 
        r = kvm_get_msr_with_filter(vcpu, msr_index, pdata);
+       if (r < 0)
+               return X86EMUL_UNHANDLEABLE;
 
-       if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
-                                   complete_emulated_rdmsr, r)) {
-               /* Bounce to user space */
-               return X86EMUL_IO_NEEDED;
+       if (r) {
+               if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
+                                      complete_emulated_rdmsr, r))
+                       return X86EMUL_IO_NEEDED;
+
+               trace_kvm_msr_read_ex(msr_index);
+               return X86EMUL_PROPAGATE_FAULT;
        }
 
-       return r;
+       trace_kvm_msr_read(msr_index, *pdata);
+       return X86EMUL_CONTINUE;
 }
 
 static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt,
@@ -7950,14 +8002,20 @@ static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt,
        int r;
 
        r = kvm_set_msr_with_filter(vcpu, msr_index, data);
+       if (r < 0)
+               return X86EMUL_UNHANDLEABLE;
 
-       if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
-                                   complete_emulated_msr_access, r)) {
-               /* Bounce to user space */
-               return X86EMUL_IO_NEEDED;
+       if (r) {
+               if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
+                                      complete_emulated_msr_access, r))
+                       return X86EMUL_IO_NEEDED;
+
+               trace_kvm_msr_write_ex(msr_index, data);
+               return X86EMUL_PROPAGATE_FAULT;
        }
 
-       return r;
+       trace_kvm_msr_write(msr_index, data);
+       return X86EMUL_CONTINUE;
 }
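
The two emulator MSR helpers above now share a three-way convention: a negative return from the inner kvm_*_msr_with_filter() call means KVM itself failed, a positive return means the access was rejected (either bounced to userspace or turned into a guest #GP), and zero means success. Below is a stand-alone sketch of that mapping; the enum names and values are placeholders, not taken from the kernel's emulator header.

#include <stdbool.h>
#include <stdio.h>

/* Placeholder values for the sketch; the real codes live in the emulator header. */
enum emul_result {
	SKETCH_CONTINUE,	/* success: trace the access and keep emulating */
	SKETCH_UNHANDLEABLE,	/* r < 0: internal KVM error */
	SKETCH_IO_NEEDED,	/* r > 0, deferred to userspace via KVM_EXIT_X86_*MSR */
	SKETCH_PROPAGATE_FAULT,	/* r > 0, not deferred: the guest gets #GP */
};

static enum emul_result msr_result_to_emul(int r, bool bounced_to_userspace)
{
	if (r < 0)
		return SKETCH_UNHANDLEABLE;
	if (r > 0)
		return bounced_to_userspace ? SKETCH_IO_NEEDED : SKETCH_PROPAGATE_FAULT;
	return SKETCH_CONTINUE;
}

int main(void)
{
	printf("%d %d %d %d\n",
	       msr_result_to_emul(-1, false),
	       msr_result_to_emul(1, true),
	       msr_result_to_emul(1, false),
	       msr_result_to_emul(0, false));
	return 0;
}
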
 
 static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
@@ -8161,18 +8219,17 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
        }
 }
 
-static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
+static void inject_emulated_exception(struct kvm_vcpu *vcpu)
 {
        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
-       if (ctxt->exception.vector == PF_VECTOR)
-               return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
 
-       if (ctxt->exception.error_code_valid)
+       if (ctxt->exception.vector == PF_VECTOR)
+               kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
+       else if (ctxt->exception.error_code_valid)
                kvm_queue_exception_e(vcpu, ctxt->exception.vector,
                                      ctxt->exception.error_code);
        else
                kvm_queue_exception(vcpu, ctxt->exception.vector);
-       return false;
 }
 
 static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
@@ -8548,8 +8605,46 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
 
-static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int *r)
+static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu)
 {
+       u32 shadow;
+
+       if (kvm_get_rflags(vcpu) & X86_EFLAGS_RF)
+               return true;
+
+       /*
+        * Intel CPUs inhibit code #DBs when MOV/POP SS blocking is active,
+        * but AMD CPUs do not.  MOV/POP SS blocking is rare; check that first
+        * to avoid the relatively expensive CPUID lookup.
+        */
+       shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+       return (shadow & KVM_X86_SHADOW_INT_MOV_SS) &&
+              guest_cpuid_is_intel(vcpu);
+}
+
+static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu,
+                                          int emulation_type, int *r)
+{
+       WARN_ON_ONCE(emulation_type & EMULTYPE_NO_DECODE);
+
+       /*
+        * Do not check for code breakpoints if hardware has already done the
+        * checks, as inferred from the emulation type.  On NO_DECODE and SKIP,
+        * the instruction has passed all exception checks, and all intercepted
+        * exceptions that trigger emulation have lower priority than code
+        * breakpoints, i.e. the fact that the intercepted exception occurred
+        * means any code breakpoints have already been serviced.
+        *
+        * Note, KVM needs to check for code #DBs on EMULTYPE_TRAP_UD_FORCED as
+        * hardware has checked the RIP of the magic prefix, but not the RIP of
+        * the instruction being emulated.  The intent of forced emulation is
+        * to behave as if KVM intercepted the instruction without an exception
+        * and without a prefix.
+        */
+       if (emulation_type & (EMULTYPE_NO_DECODE | EMULTYPE_SKIP |
+                             EMULTYPE_TRAP_UD | EMULTYPE_VMWARE_GP | EMULTYPE_PF))
+               return false;
+
        if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
            (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
                struct kvm_run *kvm_run = vcpu->run;
@@ -8569,7 +8664,7 @@ static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int *r)
        }
 
        if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
-           !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) {
+           !kvm_is_code_breakpoint_inhibited(vcpu)) {
                unsigned long eip = kvm_get_linear_rip(vcpu);
                u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
                                           vcpu->arch.dr7,
@@ -8671,8 +8766,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                 * are fault-like and are higher priority than any faults on
                 * the code fetch itself.
                 */
-               if (!(emulation_type & EMULTYPE_SKIP) &&
-                   kvm_vcpu_check_code_breakpoint(vcpu, &r))
+               if (kvm_vcpu_check_code_breakpoint(vcpu, emulation_type, &r))
                        return r;
 
                r = x86_decode_emulated_instruction(vcpu, emulation_type,
@@ -8770,8 +8864,7 @@ restart:
 
        if (ctxt->have_exception) {
                r = 1;
-               if (inject_emulated_exception(vcpu))
-                       return r;
+               inject_emulated_exception(vcpu);
        } else if (vcpu->arch.pio.count) {
                if (!vcpu->arch.pio.in) {
                        /* FIXME: return into emulator if single-stepping.  */
@@ -8801,6 +8894,12 @@ writeback:
                unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
                toggle_interruptibility(vcpu, ctxt->interruptibility);
                vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+
+               /*
+                * Note, EXCPT_DB is assumed to be fault-like as the emulator
+                * only supports code breakpoints and general detect #DB, both
+                * of which are fault-like.
+                */
                if (!ctxt->have_exception ||
                    exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
                        kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
@@ -9662,74 +9761,155 @@ int kvm_check_nested_events(struct kvm_vcpu *vcpu)
 
 static void kvm_inject_exception(struct kvm_vcpu *vcpu)
 {
-       trace_kvm_inj_exception(vcpu->arch.exception.nr,
+       trace_kvm_inj_exception(vcpu->arch.exception.vector,
                                vcpu->arch.exception.has_error_code,
                                vcpu->arch.exception.error_code,
                                vcpu->arch.exception.injected);
 
        if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
                vcpu->arch.exception.error_code = false;
-       static_call(kvm_x86_queue_exception)(vcpu);
+       static_call(kvm_x86_inject_exception)(vcpu);
 }
 
-static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
+/*
+ * Check for any event (interrupt or exception) that is ready to be injected,
+ * and if there is at least one event, inject the event with the highest
+ * priority.  This handles both "pending" events, i.e. events that have never
+ * been injected into the guest, and "injected" events, i.e. events that were
+ * injected as part of a previous VM-Enter, but weren't successfully delivered
+ * and need to be re-injected.
+ *
+ * Note, this is not guaranteed to be invoked on a guest instruction boundary,
+ * i.e. doesn't guarantee that there's an event window in the guest.  KVM must
+ * be able to inject exceptions in the "middle" of an instruction, and so must
+ * also be able to re-inject NMIs and IRQs in the middle of an instruction.
+ * I.e. for exceptions and re-injected events, NOT invoking this on instruction
+ * boundaries is necessary and correct.
+ *
+ * For simplicity, KVM uses a single path to inject all events (except events
+ * that are injected directly from L1 to L2) and doesn't explicitly track
+ * instruction boundaries for asynchronous events.  However, because VM-Exits
+ * that can occur during instruction execution typically result in KVM skipping
+ * the instruction or injecting an exception, e.g. instruction and exception
+ * intercepts, and because pending exceptions have higher priority than pending
+ * interrupts, KVM still honors instruction boundaries in most scenarios.
+ *
+ * But, if a VM-Exit occurs during instruction execution, and KVM does NOT skip
+ * the instruction or inject an exception, then KVM can incorrectly inject a new
+ * asynchronous event if the event became pending after the CPU fetched the
+ * instruction (in the guest).  E.g. if a page fault (#PF, #NPF, EPT violation)
+ * occurs and is resolved by KVM, a coincident NMI, SMI, IRQ, etc... can be
+ * injected on the restarted instruction instead of being deferred until the
+ * instruction completes.
+ *
+ * In practice, this virtualization hole is unlikely to be observed by the
+ * guest, and even less likely to cause functional problems.  To detect the
+ * hole, the guest would have to trigger an event on a side effect of an early
+ * phase of instruction execution, e.g. on the instruction fetch from memory.
+ * And for it to be a functional problem, the guest would need to depend on the
+ * ordering between that side effect, the instruction completing, _and_ the
+ * delivery of the asynchronous event.
+ */
+static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
+                                      bool *req_immediate_exit)
 {
+       bool can_inject;
        int r;
-       bool can_inject = true;
 
-       /* try to reinject previous events if any */
+       /*
+        * Process nested events first, as nested VM-Exit supersedes event
+        * re-injection.  If there's an event queued for re-injection, it will
+        * be saved into the appropriate vmc{b,s}12 fields on nested VM-Exit.
+        */
+       if (is_guest_mode(vcpu))
+               r = kvm_check_nested_events(vcpu);
+       else
+               r = 0;
 
-       if (vcpu->arch.exception.injected) {
-               kvm_inject_exception(vcpu);
-               can_inject = false;
-       }
        /*
-        * Do not inject an NMI or interrupt if there is a pending
-        * exception.  Exceptions and interrupts are recognized at
-        * instruction boundaries, i.e. the start of an instruction.
-        * Trap-like exceptions, e.g. #DB, have higher priority than
-        * NMIs and interrupts, i.e. traps are recognized before an
-        * NMI/interrupt that's pending on the same instruction.
-        * Fault-like exceptions, e.g. #GP and #PF, are the lowest
-        * priority, but are only generated (pended) during instruction
-        * execution, i.e. a pending fault-like exception means the
-        * fault occurred on the *previous* instruction and must be
-        * serviced prior to recognizing any new events in order to
-        * fully complete the previous instruction.
+        * Re-inject exceptions and events *especially* if immediate entry+exit
+        * to/from L2 is needed, as any event that has already been injected
+        * into L2 needs to complete its lifecycle before injecting a new event.
+        *
+        * Don't re-inject an NMI or interrupt if there is a pending exception.
+        * This collision arises if an exception occurred while vectoring the
+        * injected event, KVM intercepted said exception, and KVM ultimately
+        * determined the fault belongs to the guest and queues the exception
+        * for injection back into the guest.
+        *
+        * "Injected" interrupts can also collide with pending exceptions if
+        * userspace ignores the "ready for injection" flag and blindly queues
+        * an interrupt.  In that case, prioritizing the exception is correct,
+        * as the exception "occurred" before the exit to userspace.  Trap-like
+        * exceptions, e.g. most #DBs, have higher priority than interrupts.
+        * And while fault-like exceptions, e.g. #GP and #PF, are the lowest
+        * priority, they're only generated (pended) during instruction
+        * execution, and interrupts are recognized at instruction boundaries.
+        * Thus a pending fault-like exception means the fault occurred on the
+        * *previous* instruction and must be serviced prior to recognizing any
+        * new events in order to fully complete the previous instruction.
         */
-       else if (!vcpu->arch.exception.pending) {
-               if (vcpu->arch.nmi_injected) {
-                       static_call(kvm_x86_inject_nmi)(vcpu);
-                       can_inject = false;
-               } else if (vcpu->arch.interrupt.injected) {
-                       static_call(kvm_x86_inject_irq)(vcpu, true);
-                       can_inject = false;
-               }
-       }
+       if (vcpu->arch.exception.injected)
+               kvm_inject_exception(vcpu);
+       else if (kvm_is_exception_pending(vcpu))
+               ; /* see above */
+       else if (vcpu->arch.nmi_injected)
+               static_call(kvm_x86_inject_nmi)(vcpu);
+       else if (vcpu->arch.interrupt.injected)
+               static_call(kvm_x86_inject_irq)(vcpu, true);
 
+       /*
+        * Exceptions that morph to VM-Exits are handled above, and pending
+        * exceptions on top of injected exceptions that do not VM-Exit should
+        * either morph to #DF or, sadly, override the injected exception.
+        */
        WARN_ON_ONCE(vcpu->arch.exception.injected &&
                     vcpu->arch.exception.pending);
 
        /*
-        * Call check_nested_events() even if we reinjected a previous event
-        * in order for caller to determine if it should require immediate-exit
-        * from L2 to L1 due to pending L1 events which require exit
-        * from L2 to L1.
+        * Bail if immediate entry+exit to/from the guest is needed to complete
+        * nested VM-Enter or event re-injection so that a different pending
+        * event can be serviced (or if KVM needs to exit to userspace).
+        *
+        * Otherwise, continue processing events even if VM-Exit occurred.  The
+        * VM-Exit will have cleared exceptions that were meant for L2, but
+        * there may now be events that can be injected into L1.
         */
-       if (is_guest_mode(vcpu)) {
-               r = kvm_check_nested_events(vcpu);
-               if (r < 0)
-                       goto out;
-       }
+       if (r < 0)
+               goto out;
+
+       /*
+        * A pending exception VM-Exit should either result in nested VM-Exit
+        * or force an immediate re-entry and exit to/from L2, and exception
+        * VM-Exits cannot be injected (flag should _never_ be set).
+        */
+       WARN_ON_ONCE(vcpu->arch.exception_vmexit.injected ||
+                    vcpu->arch.exception_vmexit.pending);
+
+       /*
+        * New events, other than exceptions, cannot be injected if KVM needs
+        * to re-inject a previous event.  See above comments on re-injecting
+        * for why pending exceptions get priority.
+        */
+       can_inject = !kvm_event_needs_reinjection(vcpu);
 
-       /* try to inject new event if pending */
        if (vcpu->arch.exception.pending) {
-               if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
+               /*
+                * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS
+                * value pushed on the stack.  Trap-like exceptions and all #DBs
+                * leave RF as-is (KVM follows Intel's behavior in this regard;
+                * AMD states that code breakpoint #DBs explicitly clear RF).
+                *
+                * Note, most versions of Intel's SDM and AMD's APM incorrectly
+                * describe the behavior of General Detect #DBs, which are
+                * fault-like.  They do _not_ set RF, a la code breakpoints.
+                */
+               if (exception_type(vcpu->arch.exception.vector) == EXCPT_FAULT)
                        __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
                                             X86_EFLAGS_RF);
 
-               if (vcpu->arch.exception.nr == DB_VECTOR) {
-                       kvm_deliver_exception_payload(vcpu);
+               if (vcpu->arch.exception.vector == DB_VECTOR) {
+                       kvm_deliver_exception_payload(vcpu, &vcpu->arch.exception);
                        if (vcpu->arch.dr7 & DR7_GD) {
                                vcpu->arch.dr7 &= ~DR7_GD;
                                kvm_update_dr7(vcpu);
@@ -9801,11 +9981,11 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
        }
 
        if (is_guest_mode(vcpu) &&
-           kvm_x86_ops.nested_ops->hv_timer_pending &&
-           kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
+           kvm_x86_ops.nested_ops->has_events &&
+           kvm_x86_ops.nested_ops->has_events(vcpu))
                *req_immediate_exit = true;
 
-       WARN_ON(vcpu->arch.exception.pending);
+       WARN_ON(kvm_is_exception_pending(vcpu));
        return 0;
 
 out:
@@ -10110,7 +10290,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
         * When APICv gets disabled, we may still have injected interrupts
         * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was
         * still active when the interrupt got accepted. Make sure
-        * inject_pending_event() is called to check for that.
+        * kvm_check_and_inject_events() is called to check for that.
         */
        if (!apic->apicv_active)
                kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -10407,7 +10587,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        goto out;
                }
 
-               r = inject_pending_event(vcpu, &req_immediate_exit);
+               r = kvm_check_and_inject_events(vcpu, &req_immediate_exit);
                if (r < 0) {
                        r = 0;
                        goto out;
@@ -10646,10 +10826,26 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu)
                if (hv_timer)
                        kvm_lapic_switch_to_hv_timer(vcpu);
 
-               if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
+               /*
+                * If the vCPU is not runnable, a signal or another host event
+                * of some kind is pending; service it without changing the
+                * vCPU's activity state.
+                */
+               if (!kvm_arch_vcpu_runnable(vcpu))
                        return 1;
        }
 
+       /*
+        * Evaluate nested events before exiting the halted state.  This allows
+        * the halt state to be recorded properly in the VMCS12's activity
+        * state field (AMD does not have a similar field and a VM-Exit always
+        * causes a spurious wakeup from HLT).
+        */
+       if (is_guest_mode(vcpu)) {
+               if (kvm_check_nested_events(vcpu) < 0)
+                       return 0;
+       }
+
        if (kvm_apic_accept_events(vcpu) < 0)
                return 0;
        switch(vcpu->arch.mp_state) {
@@ -10673,9 +10869,6 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu)
 
 static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
 {
-       if (is_guest_mode(vcpu))
-               kvm_check_nested_events(vcpu);
-
        return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
                !vcpu->arch.apf.halted);
 }
@@ -10824,6 +11017,7 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 {
+       struct kvm_queued_exception *ex = &vcpu->arch.exception;
        struct kvm_run *kvm_run = vcpu->run;
        int r;
 
@@ -10852,7 +11046,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
                        r = 0;
                        goto out;
                }
-               kvm_clear_request(KVM_REQ_UNHALT, vcpu);
                r = -EAGAIN;
                if (signal_pending(current)) {
                        r = -EINTR;
@@ -10882,6 +11075,21 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
                }
        }
 
+       /*
+        * If userspace set a pending exception and L2 is active, convert it to
+        * a pending VM-Exit if L1 wants to intercept the exception.
+        */
+       if (vcpu->arch.exception_from_userspace && is_guest_mode(vcpu) &&
+           kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, ex->vector,
+                                                       ex->error_code)) {
+               kvm_queue_exception_vmexit(vcpu, ex->vector,
+                                          ex->has_error_code, ex->error_code,
+                                          ex->has_payload, ex->payload);
+               ex->injected = false;
+               ex->pending = false;
+       }
+       vcpu->arch.exception_from_userspace = false;
+
        if (unlikely(vcpu->arch.complete_userspace_io)) {
                int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
                vcpu->arch.complete_userspace_io = NULL;
@@ -10988,6 +11196,7 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
 
        vcpu->arch.exception.pending = false;
+       vcpu->arch.exception_vmexit.pending = false;
 
        kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
@@ -11125,11 +11334,12 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
        }
 
        /*
-        * KVM_MP_STATE_INIT_RECEIVED means the processor is in
-        * INIT state; latched init should be reported using
-        * KVM_SET_VCPU_EVENTS, so reject it here.
+        * Pending INITs are reported using KVM_SET_VCPU_EVENTS; disallow
+        * forcing the guest into INIT/SIPI if those events are supposed to be
+        * blocked.  KVM prioritizes SMI over INIT, so reject INIT/SIPI state
+        * if an SMI is pending as well.
         */
-       if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) &&
+       if ((!kvm_apic_init_sipi_allowed(vcpu) || vcpu->arch.smi_pending) &&
            (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
             mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
                goto out;
@@ -11368,7 +11578,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 
        if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
                r = -EBUSY;
-               if (vcpu->arch.exception.pending)
+               if (kvm_is_exception_pending(vcpu))
                        goto out;
                if (dbg->control & KVM_GUESTDBG_INJECT_DB)
                        kvm_queue_exception(vcpu, DB_VECTOR);
@@ -11750,8 +11960,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
                struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
 
                /*
-                * To avoid have the INIT path from kvm_apic_has_events() that be
-                * called with loaded FPU and does not let userspace fix the state.
+                * All paths that lead to INIT are required to load the guest's
+                * FPU state (because most paths are buried in KVM_RUN).
                 */
                if (init_event)
                        kvm_put_guest_fpu(vcpu);
@@ -12080,6 +12290,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        if (ret)
                goto out_page_track;
 
+       ret = static_call(kvm_x86_vm_init)(kvm);
+       if (ret)
+               goto out_uninit_mmu;
+
        INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
        INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
        atomic_set(&kvm->arch.noncoherent_dma_count, 0);
@@ -12115,8 +12329,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        kvm_hv_init_vm(kvm);
        kvm_xen_init_vm(kvm);
 
-       return static_call(kvm_x86_vm_init)(kvm);
+       return 0;
 
+out_uninit_mmu:
+       kvm_mmu_uninit_vm(kvm);
 out_page_track:
        kvm_page_track_cleanup(kvm);
 out:
@@ -12589,13 +12805,14 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
        if (!list_empty_careful(&vcpu->async_pf.done))
                return true;
 
-       if (kvm_apic_has_events(vcpu))
+       if (kvm_apic_has_pending_init_or_sipi(vcpu) &&
+           kvm_apic_init_sipi_allowed(vcpu))
                return true;
 
        if (vcpu->arch.pv.pv_unhalted)
                return true;
 
-       if (vcpu->arch.exception.pending)
+       if (kvm_is_exception_pending(vcpu))
                return true;
 
        if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
@@ -12617,16 +12834,13 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
                return true;
 
        if (is_guest_mode(vcpu) &&
-           kvm_x86_ops.nested_ops->hv_timer_pending &&
-           kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
+           kvm_x86_ops.nested_ops->has_events &&
+           kvm_x86_ops.nested_ops->has_events(vcpu))
                return true;
 
        if (kvm_xen_has_pending_events(vcpu))
                return true;
 
-       if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu))
-               return true;
-
        return false;
 }
 
@@ -12850,7 +13064,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
 {
        if (unlikely(!lapic_in_kernel(vcpu) ||
                     kvm_event_needs_reinjection(vcpu) ||
-                    vcpu->arch.exception.pending))
+                    kvm_is_exception_pending(vcpu)))
                return false;
 
        if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
@@ -13401,7 +13615,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);