Merge tag 'kvmarm-fixes-5.17-2' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Paolo Bonzini <pbonzini@redhat.com>
Sat, 5 Feb 2022 05:58:25 +0000 (00:58 -0500)
committer Paolo Bonzini <pbonzini@redhat.com>
Sat, 5 Feb 2022 05:58:25 +0000 (00:58 -0500)
KVM/arm64 fixes for 5.17, take #2

- A couple of fixes when handling an exception while a SError has been
  delivered

- Workaround for Cortex-A510's single-step erratum

14 files changed:
arch/arm64/kvm/arm.c
arch/mips/kvm/mips.c
arch/riscv/kvm/vcpu.c
arch/riscv/kvm/vcpu_sbi_base.c
arch/x86/include/asm/kvm-x86-ops.h
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/cpuid.c
arch/x86/kvm/lapic.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
include/linux/kvm_host.h
include/uapi/linux/kvm.h

index a4a0063df456ca0f963f62ece504e9991edcf5da..ecc5958e27fe2b3fc69b9b1121a626495cb13c46 100644 (file)
@@ -797,6 +797,24 @@ static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret)
                        xfer_to_guest_mode_work_pending();
 }
 
+/*
+ * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while
+ * the vCPU is running.
+ *
+ * This must be noinstr as instrumentation may make use of RCU, and this is not
+ * safe during the EQS.
+ */
+static int noinstr kvm_arm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
+{
+       int ret;
+
+       guest_state_enter_irqoff();
+       ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu);
+       guest_state_exit_irqoff();
+
+       return ret;
+}
+
 /**
  * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
  * @vcpu:      The VCPU pointer
@@ -881,9 +899,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
                 * Enter the guest
                 */
                trace_kvm_entry(*vcpu_pc(vcpu));
-               guest_enter_irqoff();
+               guest_timing_enter_irqoff();
 
-               ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu);
+               ret = kvm_arm_vcpu_enter_exit(vcpu);
 
                vcpu->mode = OUTSIDE_GUEST_MODE;
                vcpu->stat.exits++;
@@ -918,26 +936,23 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
                kvm_arch_vcpu_ctxsync_fp(vcpu);
 
                /*
-                * We may have taken a host interrupt in HYP mode (ie
-                * while executing the guest). This interrupt is still
-                * pending, as we haven't serviced it yet!
+                * We must ensure that any pending interrupts are taken before
+                * we exit guest timing so that timer ticks are accounted as
+                * guest time. Transiently unmask interrupts so that any
+                * pending interrupts are taken.
                 *
-                * We're now back in SVC mode, with interrupts
-                * disabled.  Enabling the interrupts now will have
-                * the effect of taking the interrupt again, in SVC
-                * mode this time.
+                * Per ARM DDI 0487G.b section D1.13.4, an ISB (or other
+                * context synchronization event) is necessary to ensure that
+                * pending interrupts are taken.
                 */
                local_irq_enable();
+               isb();
+               local_irq_disable();
+
+               guest_timing_exit_irqoff();
+
+               local_irq_enable();
 
-               /*
-                * We do local_irq_enable() before calling guest_exit() so
-                * that if a timer interrupt hits while running the guest we
-                * account that tick as being spent in the guest.  We enable
-                * preemption after calling guest_exit() so that if we get
-                * preempted we make sure ticks after that is not counted as
-                * guest time.
-                */
-               guest_exit();
                trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
 
                /* Exit types that need handling before we can be preempted */
index e59cb6246f76314031dff911a0ff8714b7326177..a25e0b73ee7042c761b3c02b2cefde9b885c4e50 100644 (file)
@@ -414,6 +414,24 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
        return -ENOIOCTLCMD;
 }
 
+/*
+ * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while
+ * the vCPU is running.
+ *
+ * This must be noinstr as instrumentation may make use of RCU, and this is not
+ * safe during the EQS.
+ */
+static int noinstr kvm_mips_vcpu_enter_exit(struct kvm_vcpu *vcpu)
+{
+       int ret;
+
+       guest_state_enter_irqoff();
+       ret = kvm_mips_callbacks->vcpu_run(vcpu);
+       guest_state_exit_irqoff();
+
+       return ret;
+}
+
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 {
        int r = -EINTR;
@@ -434,7 +452,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
        lose_fpu(1);
 
        local_irq_disable();
-       guest_enter_irqoff();
+       guest_timing_enter_irqoff();
        trace_kvm_enter(vcpu);
 
        /*
@@ -445,10 +463,23 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
         */
        smp_store_mb(vcpu->mode, IN_GUEST_MODE);
 
-       r = kvm_mips_callbacks->vcpu_run(vcpu);
+       r = kvm_mips_vcpu_enter_exit(vcpu);
+
+       /*
+        * We must ensure that any pending interrupts are taken before
+        * we exit guest timing so that timer ticks are accounted as
+        * guest time. Transiently unmask interrupts so that any
+        * pending interrupts are taken.
+        *
+        * TODO: is there a barrier which ensures that pending interrupts are
+        * recognised? Currently this just hopes that the CPU takes any pending
+        * interrupts between the enable and disable.
+        */
+       local_irq_enable();
+       local_irq_disable();
 
        trace_kvm_out(vcpu);
-       guest_exit_irqoff();
+       guest_timing_exit_irqoff();
        local_irq_enable();
 
 out:
@@ -1168,7 +1199,7 @@ static void kvm_mips_set_c0_status(void)
 /*
  * Return value is in the form (errcode<<2 | RESUME_FLAG_HOST | RESUME_FLAG_NV)
  */
-int kvm_mips_handle_exit(struct kvm_vcpu *vcpu)
+static int __kvm_mips_handle_exit(struct kvm_vcpu *vcpu)
 {
        struct kvm_run *run = vcpu->run;
        u32 cause = vcpu->arch.host_cp0_cause;
@@ -1357,6 +1388,17 @@ int kvm_mips_handle_exit(struct kvm_vcpu *vcpu)
        return ret;
 }
 
+int noinstr kvm_mips_handle_exit(struct kvm_vcpu *vcpu)
+{
+       int ret;
+
+       guest_state_exit_irqoff();
+       ret = __kvm_mips_handle_exit(vcpu);
+       guest_state_enter_irqoff();
+
+       return ret;
+}
+
 /* Enable FPU for guest and restore context */
 void kvm_own_fpu(struct kvm_vcpu *vcpu)
 {
index 0c5239e05721541785ba7b52de3c02b4fe80a99e..624166004e36c637fb5fbd8966f3412f72a84294 100644 (file)
@@ -90,6 +90,7 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
 int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 {
        struct kvm_cpu_context *cntx;
+       struct kvm_vcpu_csr *reset_csr = &vcpu->arch.guest_reset_csr;
 
        /* Mark this VCPU never ran */
        vcpu->arch.ran_atleast_once = false;
@@ -106,6 +107,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
        cntx->hstatus |= HSTATUS_SPVP;
        cntx->hstatus |= HSTATUS_SPV;
 
+       /* By default, make CY, TM, and IR counters accessible in VU mode */
+       reset_csr->scounteren = 0x7;
+
        /* Setup VCPU timer */
        kvm_riscv_vcpu_timer_init(vcpu);
 
@@ -699,6 +703,20 @@ static void kvm_riscv_update_hvip(struct kvm_vcpu *vcpu)
        csr_write(CSR_HVIP, csr->hvip);
 }
 
+/*
+ * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while
+ * the vCPU is running.
+ *
+ * This must be noinstr as instrumentation may make use of RCU, and this is not
+ * safe during the EQS.
+ */
+static void noinstr kvm_riscv_vcpu_enter_exit(struct kvm_vcpu *vcpu)
+{
+       guest_state_enter_irqoff();
+       __kvm_riscv_switch_to(&vcpu->arch);
+       guest_state_exit_irqoff();
+}
+
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 {
        int ret;
@@ -790,9 +808,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
                        continue;
                }
 
-               guest_enter_irqoff();
+               guest_timing_enter_irqoff();
 
-               __kvm_riscv_switch_to(&vcpu->arch);
+               kvm_riscv_vcpu_enter_exit(vcpu);
 
                vcpu->mode = OUTSIDE_GUEST_MODE;
                vcpu->stat.exits++;
@@ -812,25 +830,21 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
                kvm_riscv_vcpu_sync_interrupts(vcpu);
 
                /*
-                * We may have taken a host interrupt in VS/VU-mode (i.e.
-                * while executing the guest). This interrupt is still
-                * pending, as we haven't serviced it yet!
+                * We must ensure that any pending interrupts are taken before
+                * we exit guest timing so that timer ticks are accounted as
+                * guest time. Transiently unmask interrupts so that any
+                * pending interrupts are taken.
                 *
-                * We're now back in HS-mode with interrupts disabled
-                * so enabling the interrupts now will have the effect
-                * of taking the interrupt again, in HS-mode this time.
+                * There's no barrier which ensures that pending interrupts are
+                * recognised, so we just hope that the CPU takes any pending
+                * interrupts between the enable and disable.
                 */
                local_irq_enable();
+               local_irq_disable();
 
-               /*
-                * We do local_irq_enable() before calling guest_exit() so
-                * that if a timer interrupt hits while running the guest
-                * we account that tick as being spent in the guest. We
-                * enable preemption after calling guest_exit() so that if
-                * we get preempted we make sure ticks after that is not
-                * counted as guest time.
-                */
-               guest_exit();
+               guest_timing_exit_irqoff();
+
+               local_irq_enable();
 
                preempt_enable();
 
index 4ecf377f483b866bfa4c09144fda662efd6de5f4..48f431091cdbc197f739de2a221b53ccc254ccbe 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/kvm_host.h>
+#include <linux/version.h>
 #include <asm/csr.h>
 #include <asm/sbi.h>
 #include <asm/kvm_vcpu_timer.h>
@@ -32,7 +33,7 @@ static int kvm_sbi_ext_base_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
                *out_val = KVM_SBI_IMPID;
                break;
        case SBI_EXT_BASE_GET_IMP_VERSION:
-               *out_val = 0;
+               *out_val = LINUX_VERSION_CODE;
                break;
        case SBI_EXT_BASE_PROBE_EXT:
                if ((cp->a0 >= SBI_EXT_EXPERIMENTAL_START &&
index 631d5040b31ed3d700d0cbeb2bcee973ca5877a4..d39e0de06be2304e35bdfdd39243d64dc24076d6 100644 (file)
@@ -82,7 +82,7 @@ KVM_X86_OP_NULL(guest_apic_has_interrupt)
 KVM_X86_OP(load_eoi_exitmap)
 KVM_X86_OP(set_virtual_apic_mode)
 KVM_X86_OP_NULL(set_apic_access_page_addr)
-KVM_X86_OP(deliver_posted_interrupt)
+KVM_X86_OP(deliver_interrupt)
 KVM_X86_OP_NULL(sync_pir_to_irr)
 KVM_X86_OP(set_tss_addr)
 KVM_X86_OP(set_identity_map_addr)
index 6e7c545bc7ee158f8ca63af11d016b438d8c10bf..6dcccb304775411a8f38738526c79a5aa670c820 100644 (file)
@@ -1410,7 +1410,8 @@ struct kvm_x86_ops {
        void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
        void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
        void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu);
-       int (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
+       void (*deliver_interrupt)(struct kvm_lapic *apic, int delivery_mode,
+                                 int trig_mode, int vector);
        int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
        int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
        int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
index 28be02adc669ccd06f559cb7a6c2d230f094c4c0..494d4d3518597f28676ad2cf36f2737234e01572 100644 (file)
@@ -554,12 +554,13 @@ void kvm_set_cpu_caps(void)
        );
 
        kvm_cpu_cap_mask(CPUID_7_0_EBX,
-               F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
-               F(BMI2) | F(ERMS) | F(INVPCID) | F(RTM) | 0 /*MPX*/ | F(RDSEED) |
-               F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
-               F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
-               F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | 0 /*INTEL_PT*/
-       );
+               F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) |
+               F(FDP_EXCPTN_ONLY) | F(SMEP) | F(BMI2) | F(ERMS) | F(INVPCID) |
+               F(RTM) | F(ZERO_FCS_FDS) | 0 /*MPX*/ | F(AVX512F) |
+               F(AVX512DQ) | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) |
+               F(CLFLUSHOPT) | F(CLWB) | 0 /*INTEL_PT*/ | F(AVX512PF) |
+               F(AVX512ER) | F(AVX512CD) | F(SHA_NI) | F(AVX512BW) |
+               F(AVX512VL));
 
        kvm_cpu_cap_mask(CPUID_7_ECX,
                F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
index 4662469240bc4202d11254d63d3520ae8a0968d4..d7e6fde82d254408fe3b208eb5631d1c718c71a7 100644 (file)
@@ -1096,14 +1096,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
                                                       apic->regs + APIC_TMR);
                }
 
-               if (static_call(kvm_x86_deliver_posted_interrupt)(vcpu, vector)) {
-                       kvm_lapic_set_irr(vector, apic);
-                       kvm_make_request(KVM_REQ_EVENT, vcpu);
-                       kvm_vcpu_kick(vcpu);
-               } else {
-                       trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
-                                                  trig_mode, vector);
-               }
+               static_call(kvm_x86_deliver_interrupt)(apic, delivery_mode,
+                                                      trig_mode, vector);
                break;
 
        case APIC_DM_REMRD:
index 6d97629655e3d03439c0c030aeb3cbfe8061644a..a290efb272ad1641f36c33b8b6bbca22bdf89a10 100644 (file)
@@ -3291,6 +3291,21 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)
                SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
 }
 
+static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+                                 int trig_mode, int vector)
+{
+       struct kvm_vcpu *vcpu = apic->vcpu;
+
+       if (svm_deliver_avic_intr(vcpu, vector)) {
+               kvm_lapic_set_irr(vector, apic);
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
+               kvm_vcpu_kick(vcpu);
+       } else {
+               trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
+                                          trig_mode, vector);
+       }
+}
+
 static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -3615,7 +3630,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
        struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long vmcb_pa = svm->current_vmcb->pa;
 
-       kvm_guest_enter_irqoff();
+       guest_state_enter_irqoff();
 
        if (sev_es_guest(vcpu->kvm)) {
                __svm_sev_es_vcpu_run(vmcb_pa);
@@ -3635,7 +3650,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
                vmload(__sme_page_pa(sd->save_area));
        }
 
-       kvm_guest_exit_irqoff();
+       guest_state_exit_irqoff();
 }
 
 static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
@@ -4545,7 +4560,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
        .pmu_ops = &amd_pmu_ops,
        .nested_ops = &svm_nested_ops,
 
-       .deliver_posted_interrupt = svm_deliver_avic_intr,
+       .deliver_interrupt = svm_deliver_interrupt,
        .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt,
        .update_pi_irte = svm_update_pi_irte,
        .setup_mce = svm_setup_mce,
index aca3ae2a02f34dd1a30c078803a98878a856f865..6c27bd0c89e1e613782c85637634e6215928ac24 100644 (file)
@@ -4041,6 +4041,21 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
        return 0;
 }
 
+static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+                                 int trig_mode, int vector)
+{
+       struct kvm_vcpu *vcpu = apic->vcpu;
+
+       if (vmx_deliver_posted_interrupt(vcpu, vector)) {
+               kvm_lapic_set_irr(vector, apic);
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
+               kvm_vcpu_kick(vcpu);
+       } else {
+               trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
+                                          trig_mode, vector);
+       }
+}
+
 /*
  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
  * will not change in the lifetime of the guest.
@@ -6754,7 +6769,7 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
 static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
                                        struct vcpu_vmx *vmx)
 {
-       kvm_guest_enter_irqoff();
+       guest_state_enter_irqoff();
 
        /* L1D Flush includes CPU buffer clear to mitigate MDS */
        if (static_branch_unlikely(&vmx_l1d_should_flush))
@@ -6770,7 +6785,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
 
        vcpu->arch.cr2 = native_read_cr2();
 
-       kvm_guest_exit_irqoff();
+       guest_state_exit_irqoff();
 }
 
 static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
@@ -7768,7 +7783,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
        .hwapic_isr_update = vmx_hwapic_isr_update,
        .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
        .sync_pir_to_irr = vmx_sync_pir_to_irr,
-       .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
+       .deliver_interrupt = vmx_deliver_interrupt,
        .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
 
        .set_tss_addr = vmx_set_tss_addr,
index 74b53a16f38a72062a123dd6ce04abd84a8c7264..7131d735b1ef3fb888beb9144795b0f312923a05 100644 (file)
@@ -90,6 +90,8 @@
 u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
 EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
 
+#define  ERR_PTR_USR(e)  ((void __user *)ERR_PTR(e))
+
 #define emul_to_vcpu(ctxt) \
        ((struct kvm_vcpu *)(ctxt)->vcpu)
 
@@ -4340,7 +4342,7 @@ static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr)
        void __user *uaddr = (void __user*)(unsigned long)attr->addr;
 
        if ((u64)(unsigned long)uaddr != attr->addr)
-               return ERR_PTR(-EFAULT);
+               return ERR_PTR_USR(-EFAULT);
        return uaddr;
 }
 
@@ -10041,6 +10043,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                set_debugreg(0, 7);
        }
 
+       guest_timing_enter_irqoff();
+
        for (;;) {
                /*
                 * Assert that vCPU vs. VM APICv state is consistent.  An APICv
@@ -10125,7 +10129,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
         * of accounting via context tracking, but the loss of accuracy is
         * acceptable for all known use cases.
         */
-       vtime_account_guest_exit();
+       guest_timing_exit_irqoff();
 
        if (lapic_in_kernel(vcpu)) {
                s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
@@ -11639,8 +11643,6 @@ void kvm_arch_sync_events(struct kvm *kvm)
        kvm_free_pit(kvm);
 }
 
-#define  ERR_PTR_USR(e)  ((void __user *)ERR_PTR(e))
-
 /**
  * __x86_set_memory_region: Setup KVM internal memory slot
  *
index 635b75f9e14540aff2aceb6e4b1c5c3221c444ab..767ec7f9951608f984b4ac69a1c3205ce0a93ebc 100644 (file)
 
 void kvm_spurious_fault(void);
 
-static __always_inline void kvm_guest_enter_irqoff(void)
-{
-       /*
-        * VMENTER enables interrupts (host state), but the kernel state is
-        * interrupts disabled when this is invoked. Also tell RCU about
-        * it. This is the same logic as for exit_to_user_mode().
-        *
-        * This ensures that e.g. latency analysis on the host observes
-        * guest mode as interrupt enabled.
-        *
-        * guest_enter_irqoff() informs context tracking about the
-        * transition to guest mode and if enabled adjusts RCU state
-        * accordingly.
-        */
-       instrumentation_begin();
-       trace_hardirqs_on_prepare();
-       lockdep_hardirqs_on_prepare(CALLER_ADDR0);
-       instrumentation_end();
-
-       guest_enter_irqoff();
-       lockdep_hardirqs_on(CALLER_ADDR0);
-}
-
-static __always_inline void kvm_guest_exit_irqoff(void)
-{
-       /*
-        * VMEXIT disables interrupts (host state), but tracing and lockdep
-        * have them in state 'on' as recorded before entering guest mode.
-        * Same as enter_from_user_mode().
-        *
-        * context_tracking_guest_exit() restores host context and reinstates
-        * RCU if enabled and required.
-        *
-        * This needs to be done immediately after VM-Exit, before any code
-        * that might contain tracepoints or call out to the greater world,
-        * e.g. before x86_spec_ctrl_restore_host().
-        */
-       lockdep_hardirqs_off(CALLER_ADDR0);
-       context_tracking_guest_exit();
-
-       instrumentation_begin();
-       trace_hardirqs_off_finish();
-       instrumentation_end();
-}
-
 #define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check)                \
 ({                                                                     \
        bool failed = (consistency_check);                              \
index 06912d6b39d051013b731ae04b3b26c9f68a5efb..f11039944c08ffd7d5bfe6675f95c8b9f2d3dd86 100644 (file)
@@ -29,7 +29,9 @@
 #include <linux/refcount.h>
 #include <linux/nospec.h>
 #include <linux/notifier.h>
+#include <linux/ftrace.h>
 #include <linux/hashtable.h>
+#include <linux/instrumentation.h>
 #include <linux/interval_tree.h>
 #include <linux/rbtree.h>
 #include <linux/xarray.h>
@@ -368,8 +370,11 @@ struct kvm_vcpu {
        u64 last_used_slot_gen;
 };
 
-/* must be called with irqs disabled */
-static __always_inline void guest_enter_irqoff(void)
+/*
+ * Start accounting time towards a guest.
+ * Must be called before entering guest context.
+ */
+static __always_inline void guest_timing_enter_irqoff(void)
 {
        /*
         * This is running in ioctl context so its safe to assume that it's the
@@ -378,7 +383,18 @@ static __always_inline void guest_enter_irqoff(void)
        instrumentation_begin();
        vtime_account_guest_enter();
        instrumentation_end();
+}
 
+/*
+ * Enter guest context and enter an RCU extended quiescent state.
+ *
+ * Between guest_context_enter_irqoff() and guest_context_exit_irqoff() it is
+ * unsafe to use any code which may directly or indirectly use RCU, tracing
+ * (including IRQ flag tracing), or lockdep. All code in this period must be
+ * non-instrumentable.
+ */
+static __always_inline void guest_context_enter_irqoff(void)
+{
        /*
         * KVM does not hold any references to rcu protected data when it
         * switches CPU into a guest mode. In fact switching to a guest mode
@@ -394,16 +410,79 @@ static __always_inline void guest_enter_irqoff(void)
        }
 }
 
-static __always_inline void guest_exit_irqoff(void)
+/*
+ * Deprecated. Architectures should move to guest_timing_enter_irqoff() and
+ * guest_state_enter_irqoff().
+ */
+static __always_inline void guest_enter_irqoff(void)
+{
+       guest_timing_enter_irqoff();
+       guest_context_enter_irqoff();
+}
+
+/**
+ * guest_state_enter_irqoff - Fixup state when entering a guest
+ *
+ * Entry to a guest will enable interrupts, but the kernel state is interrupts
+ * disabled when this is invoked. Also tell RCU about it.
+ *
+ * 1) Trace interrupts on state
+ * 2) Invoke context tracking if enabled to adjust RCU state
+ * 3) Tell lockdep that interrupts are enabled
+ *
+ * Invoked from architecture specific code before entering a guest.
+ * Must be called with interrupts disabled and the caller must be
+ * non-instrumentable.
+ * The caller has to invoke guest_timing_enter_irqoff() before this.
+ *
+ * Note: this is analogous to exit_to_user_mode().
+ */
+static __always_inline void guest_state_enter_irqoff(void)
+{
+       instrumentation_begin();
+       trace_hardirqs_on_prepare();
+       lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+       instrumentation_end();
+
+       guest_context_enter_irqoff();
+       lockdep_hardirqs_on(CALLER_ADDR0);
+}
+
+/*
+ * Exit guest context and exit an RCU extended quiescent state.
+ *
+ * Between guest_context_enter_irqoff() and guest_context_exit_irqoff() it is
+ * unsafe to use any code which may directly or indirectly use RCU, tracing
+ * (including IRQ flag tracing), or lockdep. All code in this period must be
+ * non-instrumentable.
+ */
+static __always_inline void guest_context_exit_irqoff(void)
 {
        context_tracking_guest_exit();
+}
 
+/*
+ * Stop accounting time towards a guest.
+ * Must be called after exiting guest context.
+ */
+static __always_inline void guest_timing_exit_irqoff(void)
+{
        instrumentation_begin();
        /* Flush the guest cputime we spent on the guest */
        vtime_account_guest_exit();
        instrumentation_end();
 }
 
+/*
+ * Deprecated. Architectures should move to guest_state_exit_irqoff() and
+ * guest_timing_exit_irqoff().
+ */
+static __always_inline void guest_exit_irqoff(void)
+{
+       guest_context_exit_irqoff();
+       guest_timing_exit_irqoff();
+}
+
 static inline void guest_exit(void)
 {
        unsigned long flags;
@@ -413,6 +492,33 @@ static inline void guest_exit(void)
        local_irq_restore(flags);
 }
 
+/**
+ * guest_state_exit_irqoff - Establish state when returning from guest mode
+ *
+ * Entry from a guest disables interrupts, but guest mode is traced as
+ * interrupts enabled. Also with NO_HZ_FULL RCU might be idle.
+ *
+ * 1) Tell lockdep that interrupts are disabled
+ * 2) Invoke context tracking if enabled to reactivate RCU
+ * 3) Trace interrupts off state
+ *
+ * Invoked from architecture specific code after exiting a guest.
+ * Must be invoked with interrupts disabled and the caller must be
+ * non-instrumentable.
+ * The caller has to invoke guest_timing_exit_irqoff() after this.
+ *
+ * Note: this is analogous to enter_from_user_mode().
+ */
+static __always_inline void guest_state_exit_irqoff(void)
+{
+       lockdep_hardirqs_off(CALLER_ADDR0);
+       guest_context_exit_irqoff();
+
+       instrumentation_begin();
+       trace_hardirqs_off_finish();
+       instrumentation_end();
+}
+
 static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
 {
        /*
index b46bcdb0cab1a3695f552434d34aff3f70b4c700..5191b57e156220bd7fc6d85940c301e16cc1645a 100644 (file)
@@ -1624,9 +1624,6 @@ struct kvm_enc_region {
 #define KVM_S390_NORMAL_RESET  _IO(KVMIO,   0xc3)
 #define KVM_S390_CLEAR_RESET   _IO(KVMIO,   0xc4)
 
-/* Available with KVM_CAP_XSAVE2 */
-#define KVM_GET_XSAVE2           _IOR(KVMIO,  0xcf, struct kvm_xsave)
-
 struct kvm_s390_pv_sec_parm {
        __u64 origin;
        __u64 length;
@@ -2048,4 +2045,7 @@ struct kvm_stats_desc {
 
 #define KVM_GET_STATS_FD  _IO(KVMIO,  0xce)
 
+/* Available with KVM_CAP_XSAVE2 */
+#define KVM_GET_XSAVE2           _IOR(KVMIO,  0xcf, struct kvm_xsave)
+
 #endif /* __LINUX_KVM_H */