Merge branch 'kvm-tsx-ctrl' into HEAD

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fc54e3905fe33e9be6d5edad6f0629464063badb..c814c226d02ebc5cc3ff843e2cbb377ec22edb3f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -176,6 +176,8 @@ struct kvm_shared_msrs {
 static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
 static struct kvm_shared_msrs __percpu *shared_msrs;
 
+static u64 __read_mostly host_xss;
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "pf_fixed", VCPU_STAT(pf_fixed) },
        { "pf_guest", VCPU_STAT(pf_guest) },
@@ -710,10 +712,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
        ret = 1;
 
        memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
-       __set_bit(VCPU_EXREG_PDPTR,
-                 (unsigned long *)&vcpu->arch.regs_avail);
-       __set_bit(VCPU_EXREG_PDPTR,
-                 (unsigned long *)&vcpu->arch.regs_dirty);
+       kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+
 out:
 
        return ret;
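
Note: the kvm_register_mark_*/kvm_register_is_* helpers used in this and the following hunks are not defined in x86.c. A minimal sketch of what they presumably look like (in arch/x86/kvm/kvm_cache_regs.h), inferred from the open-coded bit operations they replace here; treat the exact signatures as an assumption:

/*
 * Sketch only: assumed shape of the register-tracking helpers.
 */
static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu, int reg)
{
	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
}

static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu, int reg)
{
	/* A dirty register is by definition also available. */
	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
}

static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu, int reg)
{
	return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
}
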
@@ -723,7 +723,6 @@ EXPORT_SYMBOL_GPL(load_pdptrs);
 bool pdptrs_changed(struct kvm_vcpu *vcpu)
 {
        u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
-       bool changed = true;
        int offset;
        gfn_t gfn;
        int r;
@@ -731,8 +730,7 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu)
        if (!is_pae_paging(vcpu))
                return false;
 
-       if (!test_bit(VCPU_EXREG_PDPTR,
-                     (unsigned long *)&vcpu->arch.regs_avail))
+       if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
                return true;
 
        gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
@@ -740,11 +738,9 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu)
        r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
                                       PFERR_USER_MASK | PFERR_WRITE_MASK);
        if (r < 0)
-               goto out;
-       changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
-out:
+               return true;
 
-       return changed;
+       return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
 }
 EXPORT_SYMBOL_GPL(pdptrs_changed);
 
@@ -813,27 +809,34 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 }
 EXPORT_SYMBOL_GPL(kvm_lmsw);
 
-void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
+void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
 {
-       if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
-                       !vcpu->guest_xcr0_loaded) {
-               /* kvm_set_xcr() also depends on this */
+       if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
+
                if (vcpu->arch.xcr0 != host_xcr0)
                        xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
-               vcpu->guest_xcr0_loaded = 1;
+
+               if (vcpu->arch.xsaves_enabled &&
+                   vcpu->arch.ia32_xss != host_xss)
+                       wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
        }
 }
-EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0);
+EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
 
-void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
+void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
 {
-       if (vcpu->guest_xcr0_loaded) {
+       if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
+
                if (vcpu->arch.xcr0 != host_xcr0)
                        xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
-               vcpu->guest_xcr0_loaded = 0;
+
+               if (vcpu->arch.xsaves_enabled &&
+                   vcpu->arch.ia32_xss != host_xss)
+                       wrmsrl(MSR_IA32_XSS, host_xss);
        }
+
 }
-EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0);
+EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
 
 static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 {
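
The renamed helpers above are meant to bracket guest entry and exit in the vendor modules, now covering IA32_XSS as well as XCR0. A rough sketch of the assumed call pattern; the real call sites are in vmx.c/svm.c and are not part of this diff:

/*
 * Sketch only: assumed usage from a vendor module's run path.
 */
static void example_vcpu_run(struct kvm_vcpu *vcpu)
{
	kvm_load_guest_xsave_state(vcpu);	/* guest XCR0 and, if XSAVES, IA32_XSS */

	/* ... enter the guest ... */

	kvm_load_host_xsave_state(vcpu);	/* restore host XCR0/IA32_XSS */
}
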
@@ -985,7 +988,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 
        kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
        vcpu->arch.cr3 = cr3;
-       __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+       kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
 
        return 0;
 }
@@ -1547,20 +1550,25 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 }
 
 #ifdef CONFIG_X86_64
+struct pvclock_clock {
+       int vclock_mode;
+       u64 cycle_last;
+       u64 mask;
+       u32 mult;
+       u32 shift;
+};
+
 struct pvclock_gtod_data {
        seqcount_t      seq;
 
-       struct { /* extract of a clocksource struct */
-               int vclock_mode;
-               u64     cycle_last;
-               u64     mask;
-               u32     mult;
-               u32     shift;
-       } clock;
+       struct pvclock_clock clock; /* extract of a clocksource struct */
+       struct pvclock_clock raw_clock; /* extract of a clocksource struct */
 
+       u64             boot_ns_raw;
        u64             boot_ns;
        u64             nsec_base;
        u64             wall_time_sec;
+       u64             monotonic_raw_nsec;
 };
 
 static struct pvclock_gtod_data pvclock_gtod_data;
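
The five fields extracted into struct pvclock_clock are exactly what the standard clocksource cycles-to-nanoseconds conversion needs. A sketch of that conversion with a helper name of our own (the same arithmetic appears in vgettsc()/do_monotonic_raw() further down):

/*
 * Sketch only: conventional clocksource conversion over a
 * pvclock_clock snapshot.
 */
static u64 example_cycles_to_ns(const struct pvclock_clock *clock,
				u64 cycles, u64 base_ns)
{
	u64 delta = (cycles - clock->cycle_last) & clock->mask;

	return base_ns + ((delta * clock->mult) >> clock->shift);
}
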
@@ -1568,9 +1576,10 @@ static struct pvclock_gtod_data pvclock_gtod_data;
 static void update_pvclock_gtod(struct timekeeper *tk)
 {
        struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
-       u64 boot_ns;
+       u64 boot_ns, boot_ns_raw;
 
        boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
+       boot_ns_raw = ktime_to_ns(ktime_add(tk->tkr_raw.base, tk->offs_boot));
 
        write_seqcount_begin(&vdata->seq);
 
@@ -1581,11 +1590,20 @@ static void update_pvclock_gtod(struct timekeeper *tk)
        vdata->clock.mult               = tk->tkr_mono.mult;
        vdata->clock.shift              = tk->tkr_mono.shift;
 
+       vdata->raw_clock.vclock_mode    = tk->tkr_raw.clock->archdata.vclock_mode;
+       vdata->raw_clock.cycle_last     = tk->tkr_raw.cycle_last;
+       vdata->raw_clock.mask           = tk->tkr_raw.mask;
+       vdata->raw_clock.mult           = tk->tkr_raw.mult;
+       vdata->raw_clock.shift          = tk->tkr_raw.shift;
+
        vdata->boot_ns                  = boot_ns;
        vdata->nsec_base                = tk->tkr_mono.xtime_nsec;
 
        vdata->wall_time_sec            = tk->xtime_sec;
 
+       vdata->boot_ns_raw              = boot_ns_raw;
+       vdata->monotonic_raw_nsec       = tk->tkr_raw.xtime_nsec;
+
        write_seqcount_end(&vdata->seq);
 }
 #endif
@@ -2009,21 +2027,21 @@ static u64 read_tsc(void)
        return last;
 }
 
-static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
+static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
+                         int *mode)
 {
        long v;
-       struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
        u64 tsc_pg_val;
 
-       switch (gtod->clock.vclock_mode) {
+       switch (clock->vclock_mode) {
        case VCLOCK_HVCLOCK:
                tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
                                                  tsc_timestamp);
                if (tsc_pg_val != U64_MAX) {
                        /* TSC page valid */
                        *mode = VCLOCK_HVCLOCK;
-                       v = (tsc_pg_val - gtod->clock.cycle_last) &
-                               gtod->clock.mask;
+                       v = (tsc_pg_val - clock->cycle_last) &
+                               clock->mask;
                } else {
                        /* TSC page invalid */
                        *mode = VCLOCK_NONE;
@@ -2032,8 +2050,8 @@ static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
        case VCLOCK_TSC:
                *mode = VCLOCK_TSC;
                *tsc_timestamp = read_tsc();
-               v = (*tsc_timestamp - gtod->clock.cycle_last) &
-                       gtod->clock.mask;
+               v = (*tsc_timestamp - clock->cycle_last) &
+                       clock->mask;
                break;
        default:
                *mode = VCLOCK_NONE;
@@ -2042,10 +2060,10 @@ static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
        if (*mode == VCLOCK_NONE)
                *tsc_timestamp = v = 0;
 
-       return v * gtod->clock.mult;
+       return v * clock->mult;
 }
 
-static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
+static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
 {
        struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
        unsigned long seq;
@@ -2054,10 +2072,10 @@ static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
 
        do {
                seq = read_seqcount_begin(&gtod->seq);
-               ns = gtod->nsec_base;
-               ns += vgettsc(tsc_timestamp, &mode);
+               ns = gtod->monotonic_raw_nsec;
+               ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
                ns >>= gtod->clock.shift;
-               ns += gtod->boot_ns;
+               ns += gtod->boot_ns_raw;
        } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
        *t = ns;
 
@@ -2075,7 +2093,7 @@ static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
                seq = read_seqcount_begin(&gtod->seq);
                ts->tv_sec = gtod->wall_time_sec;
                ns = gtod->nsec_base;
-               ns += vgettsc(tsc_timestamp, &mode);
+               ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
                ns >>= gtod->clock.shift;
        } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
 
@@ -2092,7 +2110,7 @@ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
        if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
                return false;
 
-       return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
+       return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
                                                      tsc_timestamp));
 }
 
@@ -2715,6 +2733,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_TSC:
                kvm_write_tsc(vcpu, msr_info);
                break;
+       case MSR_IA32_XSS:
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
+                       return 1;
+               /*
+                * We do support PT if kvm_x86_ops->pt_supported(), but we do
+                * not support IA32_XSS[bit 8]. Guests will have to use
+                * RDMSR/WRMSR rather than XSAVES/XRSTORS to save/restore PT
+                * MSRs.
+                */
+               if (data != 0)
+                       return 1;
+               vcpu->arch.ia32_xss = data;
+               break;
        case MSR_SMI_COUNT:
                if (!msr_info->host_initiated)
                        return 1;
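
From userspace, the newly handled MSR is reachable through the ordinary KVM_SET_MSRS/KVM_GET_MSRS ioctls. A hedged sketch, assuming an already-open vCPU file descriptor; with this patch only a value of 0 is accepted, so the ioctl reports 0 MSRs written for anything else:

/* Sketch only: host-initiated write of IA32_XSS via KVM_SET_MSRS. */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int example_set_xss(int vcpu_fd, __u64 value)
{
	struct {
		struct kvm_msrs hdr;
		struct kvm_msr_entry entry;
	} m = {
		.hdr.nmsrs   = 1,
		.entry.index = 0xda0,	/* MSR_IA32_XSS */
		.entry.data  = value,
	};

	return ioctl(vcpu_fd, KVM_SET_MSRS, &m) == 1 ? 0 : -1;
}
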
@@ -3042,6 +3074,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
                return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
                                   msr_info->host_initiated);
+       case MSR_IA32_XSS:
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
+                       return 1;
+               msr_info->data = vcpu->arch.ia32_xss;
+               break;
        case MSR_K7_CLK_CTL:
                /*
                 * Provide expected ramp-up count for K7. All other
@@ -3819,12 +3857,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                                vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
                        else
                                vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
-                       if (lapic_in_kernel(vcpu)) {
-                               if (events->smi.latched_init)
-                                       set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
-                               else
-                                       clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
-                       }
+               }
+
+               if (lapic_in_kernel(vcpu)) {
+                       if (events->smi.latched_init)
+                               set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+                       else
+                               clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
                }
        }
 
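
For context, the latched-INIT state touched here is driven from userspace through KVM_GET/SET_VCPU_EVENTS. A hedged userspace sketch, again assuming an open vCPU fd:

/* Sketch only: clearing a latched INIT from userspace. */
struct kvm_vcpu_events ev;

ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &ev);
ev.flags |= KVM_VCPUEVENT_VALID_SMM;	/* the smi block is only consumed with this flag */
ev.smi.latched_init = 0;
ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &ev);
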
@@ -4940,9 +4979,6 @@ set_identity_unlock:
                if (!irqchip_kernel(kvm))
                        goto set_irqchip_out;
                r = kvm_vm_ioctl_set_irqchip(kvm, chip);
-               if (r)
-                       goto set_irqchip_out;
-               r = 0;
        set_irqchip_out:
                kfree(chip);
                break;
@@ -6130,7 +6166,7 @@ static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
 static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
                              u32 pmc)
 {
-       return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc);
+       return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc);
 }
 
 static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
@@ -7860,6 +7896,20 @@ static void process_smi(struct kvm_vcpu *vcpu)
        kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 
+void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
+                                      unsigned long *vcpu_bitmap)
+{
+       cpumask_var_t cpus;
+       bool called;
+
+       zalloc_cpumask_var(&cpus, GFP_ATOMIC);
+
+       called = kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC,
+                                            vcpu_bitmap, cpus);
+
+       free_cpumask_var(cpus);
+}
+
 void kvm_make_scan_ioapic_request(struct kvm *kvm)
 {
        kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
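
A hedged sketch of how a caller might use the new masked variant; the intended user is the in-kernel IOAPIC emulation, which wants to rescan only the affected vCPUs instead of kicking every vCPU in the VM:

/* Sketch only: assumed caller pattern for the masked scan request. */
static void example_rescan_some_vcpus(struct kvm *kvm)
{
	DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);

	bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
	__set_bit(0, vcpu_bitmap);	/* e.g. only vCPU 0 needs a rescan */

	kvm_make_scan_ioapic_request_mask(kvm, vcpu_bitmap);
}
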
@@ -7937,7 +7987,6 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
         */
        put_page(page);
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
 
 void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
 {
@@ -8696,8 +8745,12 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
            mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
                goto out;
 
-       /* INITs are latched while in SMM */
-       if ((is_smm(vcpu) || vcpu->arch.smi_pending) &&
+       /*
+        * KVM_MP_STATE_INIT_RECEIVED means the processor is in
+        * INIT state; latched init should be reported using
+        * KVM_SET_VCPU_EVENTS, so reject it here.
+        */
+       if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) &&
            (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
             mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
                goto out;
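
kvm_vcpu_latch_init() is not defined in this file. A hedged sketch of what it presumably checks, inferred from the is_smm() test it replaces plus a vendor callback for other INIT-blocked states; the callback name is an assumption:

/* Sketch only: assumed shape of the helper (it lives in x86.h). */
static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu)
{
	return is_smm(vcpu) || kvm_x86_ops->apic_init_signal_blocked(vcpu);
}
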
@@ -8789,7 +8842,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
        vcpu->arch.cr2 = sregs->cr2;
        mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
        vcpu->arch.cr3 = sregs->cr3;
-       __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+       kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
 
        kvm_set_cr8(vcpu, sregs->cr8);
 
@@ -9316,6 +9369,9 @@ int kvm_arch_hardware_setup(void)
                kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
        }
 
+       if (boot_cpu_has(X86_FEATURE_XSAVES))
+               rdmsrl(MSR_IA32_XSS, host_xss);
+
        kvm_init_msr_list();
        return 0;
 }
@@ -9369,7 +9425,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
                goto fail_free_pio_data;
 
        if (irqchip_in_kernel(vcpu->kvm)) {
-               vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
+               vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu->kvm);
                r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
                if (r < 0)
                        goto fail_mmu_destroy;
@@ -9438,7 +9494,13 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 
 void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
 {
+       struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
        vcpu->arch.l1tf_flush_l1d = true;
+       if (pmu->version && unlikely(pmu->event_count)) {
+               pmu->need_cleanup = true;
+               kvm_make_request(KVM_REQ_PMU, vcpu);
+       }
        kvm_x86_ops->sched_in(vcpu, cpu);
 }
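
The KVM_REQ_PMU request raised here is serviced on the next guest entry. A hedged sketch of the expected consumer in pmu.c, which is not part of this diff; kvm_pmu_cleanup() is assumed from the flag name:

/* Sketch only: assumed consumer of the need_cleanup flag set above. */
void example_pmu_handle_event(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	/* ... reprogram counters as usual ... */

	if (unlikely(pmu->need_cleanup))
		kvm_pmu_cleanup(vcpu);	/* assumed helper that releases stale perf events */
}
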