Merge branch 'core-objtool-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e1535225bc1ddef9cf90a978dd7513150cef58f2..75173efccac552c48472f95fc2729d410628c139 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -596,6 +596,8 @@ struct vcpu_vmx {
        /* Support for PML */
 #define PML_ENTITY_NUM         512
        struct page *pml_pg;
+
+       u64 current_tsc_ratio;
 };
 
 enum segment_cache_field {
@@ -861,7 +863,6 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
-static bool vmx_mpx_supported(void);
 static bool vmx_xsaves_supported(void);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
@@ -961,25 +962,36 @@ static const u32 vmx_msr_index[] = {
        MSR_EFER, MSR_TSC_AUX, MSR_STAR,
 };
 
-static inline bool is_page_fault(u32 intr_info)
+static inline bool is_exception_n(u32 intr_info, u8 vector)
 {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
                             INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
+               (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
+}
+
+static inline bool is_debug(u32 intr_info)
+{
+       return is_exception_n(intr_info, DB_VECTOR);
+}
+
+static inline bool is_breakpoint(u32 intr_info)
+{
+       return is_exception_n(intr_info, BP_VECTOR);
+}
+
+static inline bool is_page_fault(u32 intr_info)
+{
+       return is_exception_n(intr_info, PF_VECTOR);
 }
 
 static inline bool is_no_device(u32 intr_info)
 {
-       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
-                            INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
+       return is_exception_n(intr_info, NM_VECTOR);
 }
 
 static inline bool is_invalid_opcode(u32 intr_info)
 {
-       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
-                            INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
+       return is_exception_n(intr_info, UD_VECTOR);
 }
 
 static inline bool is_external_interrupt(u32 intr_info)
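For reference, a minimal, self-contained sketch of the check that is_exception_n() factors out of the old open-coded predicates. It assumes the standard VMX interruption-information layout (bits 7:0 vector, bits 10:8 type, bit 31 valid); the constants, helper, and main() below are illustrative stand-ins, not the kernel's own definitions.

	#include <stdint.h>
	#include <stdio.h>

	#define VECTOR_MASK		0x000000ffu	/* bits 7:0  - exception vector  */
	#define TYPE_MASK		0x00000700u	/* bits 10:8 - interruption type */
	#define TYPE_HARD_EXCEPTION	0x00000300u	/* type 3: hardware exception    */
	#define VALID_MASK		0x80000000u	/* bit 31    - info field valid  */
	#define PF_VECTOR		14

	static int is_exception_n(uint32_t intr_info, uint8_t vector)
	{
		return (intr_info & (TYPE_MASK | VECTOR_MASK | VALID_MASK)) ==
		       (TYPE_HARD_EXCEPTION | vector | VALID_MASK);
	}

	int main(void)
	{
		uint32_t page_fault = VALID_MASK | TYPE_HARD_EXCEPTION | PF_VECTOR;

		printf("%d\n", is_exception_n(page_fault, PF_VECTOR));	/* 1 */
		printf("%d\n", is_exception_n(page_fault, 3));		/* 0: vector mismatch */
		return 0;
	}

With the helper in place, each per-vector predicate (is_debug, is_breakpoint, is_page_fault, is_no_device, is_invalid_opcode) reduces to a one-line wrapper, as the hunk above shows.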
@@ -1811,6 +1823,13 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
                        return;
                }
                break;
+       case MSR_IA32_PEBS_ENABLE:
+               /* PEBS needs a quiescent period after being disabled (to write
+                * a record).  Disabling PEBS through VMX MSR swapping doesn't
+                * provide that period, so a CPU could write host's record into
+                * guest's memory.
+                */
+               wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
        }
 
        for (i = 0; i < m->nr; ++i)
@@ -1848,26 +1867,31 @@ static void reload_tss(void)
 
 static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
 {
-       u64 guest_efer;
-       u64 ignore_bits;
+       u64 guest_efer = vmx->vcpu.arch.efer;
+       u64 ignore_bits = 0;
 
-       guest_efer = vmx->vcpu.arch.efer;
+       if (!enable_ept) {
+               /*
+                * NX is needed to handle CR0.WP=1, CR4.SMEP=1.  Testing
+                * host CPUID is more efficient than testing guest CPUID
+                * or CR4.  Host SMEP is anyway a requirement for guest SMEP.
+                */
+               if (boot_cpu_has(X86_FEATURE_SMEP))
+                       guest_efer |= EFER_NX;
+               else if (!(guest_efer & EFER_NX))
+                       ignore_bits |= EFER_NX;
+       }
 
        /*
-        * NX is emulated; LMA and LME handled by hardware; SCE meaningless
-        * outside long mode
+        * LMA and LME handled by hardware; SCE meaningless outside long mode.
         */
-       ignore_bits = EFER_NX | EFER_SCE;
+       ignore_bits |= EFER_SCE;
 #ifdef CONFIG_X86_64
        ignore_bits |= EFER_LMA | EFER_LME;
        /* SCE is meaningful only in long mode on Intel */
        if (guest_efer & EFER_LMA)
                ignore_bits &= ~(u64)EFER_SCE;
 #endif
-       guest_efer &= ~ignore_bits;
-       guest_efer |= host_efer & ignore_bits;
-       vmx->guest_msrs[efer_offset].data = guest_efer;
-       vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
 
        clear_atomic_switch_msr(vmx, MSR_EFER);
 
@@ -1878,16 +1902,21 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
         */
        if (cpu_has_load_ia32_efer ||
            (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
-               guest_efer = vmx->vcpu.arch.efer;
                if (!(guest_efer & EFER_LMA))
                        guest_efer &= ~EFER_LME;
                if (guest_efer != host_efer)
                        add_atomic_switch_msr(vmx, MSR_EFER,
                                              guest_efer, host_efer);
                return false;
-       }
+       } else {
+               guest_efer &= ~ignore_bits;
+               guest_efer |= host_efer & ignore_bits;
 
-       return true;
+               vmx->guest_msrs[efer_offset].data = guest_efer;
+               vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
+
+               return true;
+       }
 }
 
 static unsigned long segment_base(u16 selector)
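The fallback (guest_msrs) path above merges guest and host EFER so that any bit set in ignore_bits always takes the host value while every other bit comes from the guest. A tiny self-contained sketch of that merge, with hypothetical sample values (EFER.SCE is bit 0, EFER.NX is bit 11):

	#include <stdint.h>
	#include <stdio.h>

	static uint64_t merge_efer(uint64_t guest, uint64_t host, uint64_t ignore)
	{
		/* bits in "ignore" come from the host, the rest from the guest */
		return (guest & ~ignore) | (host & ignore);
	}

	int main(void)
	{
		uint64_t guest  = 1ull << 11;		/* guest wants NX, SCE clear     */
		uint64_t host   = (1ull << 11) | 1ull;	/* host runs with NX and SCE set */
		uint64_t ignore = 1ull;			/* SCE ignored outside long mode */

		/* prints 0x801: NX as the guest asked, SCE forced to the host value */
		printf("%#llx\n", (unsigned long long)merge_efer(guest, host, ignore));
		return 0;
	}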
@@ -2127,14 +2156,16 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
                vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
 
-               /* Setup TSC multiplier */
-               if (cpu_has_vmx_tsc_scaling())
-                       vmcs_write64(TSC_MULTIPLIER,
-                                    vcpu->arch.tsc_scaling_ratio);
-
                vmx->loaded_vmcs->cpu = cpu;
        }
 
+       /* Setup TSC multiplier */
+       if (kvm_has_tsc_control &&
+           vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) {
+               vmx->current_tsc_ratio = vcpu->arch.tsc_scaling_ratio;
+               vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
+       }
+
        vmx_vcpu_pi_load(vcpu, cpu);
 }
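A hypothetical user-space sketch of the caching pattern this hunk introduces: current_tsc_ratio mirrors what the VMCS already holds, so the comparatively expensive VMWRITE is issued only when the scaling ratio actually changes, not on every vcpu_load. The struct and vmwrite_count below stand in for the real vCPU state and vmcs_write64().

	#include <stdint.h>
	#include <stdio.h>

	static unsigned int vmwrite_count;	/* stands in for vmcs_write64(TSC_MULTIPLIER, ...) */

	struct vcpu {
		uint64_t tsc_scaling_ratio;	/* what the vCPU should run with */
		uint64_t current_tsc_ratio;	/* what the VMCS currently holds */
	};

	static void load_tsc_multiplier(struct vcpu *v)
	{
		if (v->current_tsc_ratio == v->tsc_scaling_ratio)
			return;			/* unchanged: skip the VMWRITE */
		v->current_tsc_ratio = v->tsc_scaling_ratio;
		vmwrite_count++;
	}

	int main(void)
	{
		struct vcpu v = { .tsc_scaling_ratio = 1ull << 48 /* e.g. ratio 1.0 */ };

		load_tsc_multiplier(&v);	/* first load: writes the multiplier     */
		load_tsc_multiplier(&v);	/* later loads: ratio unchanged, skipped */
		printf("%u\n", vmwrite_count);	/* prints 1 */
		return 0;
	}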
 
@@ -2584,7 +2615,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
                VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
                VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
 
-       if (vmx_mpx_supported())
+       if (kvm_mpx_supported())
                vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
 
        /* We support free control of debug control saving. */
@@ -2605,7 +2636,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
                VM_ENTRY_LOAD_IA32_PAT;
        vmx->nested.nested_vmx_entry_ctls_high |=
                (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
-       if (vmx_mpx_supported())
+       if (kvm_mpx_supported())
                vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
 
        /* We support free control of debug control loading. */
@@ -2849,7 +2880,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
                break;
        case MSR_IA32_BNDCFGS:
-               if (!vmx_mpx_supported())
+               if (!kvm_mpx_supported())
                        return 1;
                msr_info->data = vmcs_read64(GUEST_BNDCFGS);
                break;
@@ -2926,7 +2957,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                vmcs_writel(GUEST_SYSENTER_ESP, data);
                break;
        case MSR_IA32_BNDCFGS:
-               if (!vmx_mpx_supported())
+               if (!kvm_mpx_supported())
                        return 1;
                vmcs_write64(GUEST_BNDCFGS, data);
                break;
@@ -3399,7 +3430,7 @@ static void init_vmcs_shadow_fields(void)
        for (i = j = 0; i < max_shadow_read_write_fields; i++) {
                switch (shadow_read_write_fields[i]) {
                case GUEST_BNDCFGS:
-                       if (!vmx_mpx_supported())
+                       if (!kvm_mpx_supported())
                                continue;
                        break;
                default:
@@ -5608,11 +5639,8 @@ static int handle_dr(struct kvm_vcpu *vcpu)
        }
 
        if (vcpu->guest_debug == 0) {
-               u32 cpu_based_vm_exec_control;
-
-               cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-               cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING;
-               vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+               vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+                               CPU_BASED_MOV_DR_EXITING);
 
                /*
                 * No more DR vmexits; force a reload of the debug registers
@@ -5649,8 +5677,6 @@ static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
 
 static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
 {
-       u32 cpu_based_vm_exec_control;
-
        get_debugreg(vcpu->arch.db[0], 0);
        get_debugreg(vcpu->arch.db[1], 1);
        get_debugreg(vcpu->arch.db[2], 2);
@@ -5659,10 +5685,7 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
        vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
 
        vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
-
-       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-       cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING;
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+       vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);
 }
 
 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
@@ -5747,8 +5770,7 @@ static int handle_halt(struct kvm_vcpu *vcpu)
 
 static int handle_vmcall(struct kvm_vcpu *vcpu)
 {
-       kvm_emulate_hypercall(vcpu);
-       return 1;
+       return kvm_emulate_hypercall(vcpu);
 }
 
 static int handle_invd(struct kvm_vcpu *vcpu)
@@ -6435,8 +6457,8 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
 
        if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
                /* Recycle the least recently used VMCS. */
-               item = list_entry(vmx->nested.vmcs02_pool.prev,
-                       struct vmcs02_list, list);
+               item = list_last_entry(&vmx->nested.vmcs02_pool,
+                                      struct vmcs02_list, list);
                item->vmptr = vmx->nested.current_vmptr;
                list_move(&item->list, &vmx->nested.vmcs02_pool);
                return &item->vmcs02;
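list_last_entry(head, type, member) expands to list_entry((head)->prev, type, member), so the change is behavior-preserving and simply states the intent (take the tail of the list, i.e. the least recently used vmcs02) more directly. A minimal user-space sketch, with the list helpers re-implemented only for the demo:

	#include <stddef.h>
	#include <stdio.h>

	struct list_head { struct list_head *next, *prev; };

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))
	#define list_entry(ptr, type, member)		container_of(ptr, type, member)
	#define list_last_entry(head, type, member)	list_entry((head)->prev, type, member)

	struct item { int id; struct list_head list; };

	int main(void)
	{
		struct list_head pool;
		struct item a = { .id = 1 }, b = { .id = 2 };

		/* hand-built pool -> a -> b -> pool (what list_add_tail() would do) */
		pool.next = &a.list;   a.list.prev = &pool;
		a.list.next = &b.list; b.list.prev = &a.list;
		b.list.next = &pool;   pool.prev = &b.list;

		/* the tail of the list is the least recently used entry */
		printf("%d\n", list_last_entry(&pool, struct item, list)->id);	/* 2 */
		return 0;
	}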
@@ -7752,6 +7774,13 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                else if (is_no_device(intr_info) &&
                         !(vmcs12->guest_cr0 & X86_CR0_TS))
                        return false;
+               else if (is_debug(intr_info) &&
+                        vcpu->guest_debug &
+                        (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+                       return false;
+               else if (is_breakpoint(intr_info) &&
+                        vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+                       return false;
                return vmcs12->exception_bitmap &
                                (1u << (intr_info & INTR_INFO_VECTOR_MASK));
        case EXIT_REASON_EXTERNAL_INTERRUPT:
@@ -10258,7 +10287,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
        vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
        vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
-       if (vmx_mpx_supported())
+       if (kvm_mpx_supported())
                vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
        if (nested_cpu_has_xsaves(vmcs12))
                vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
@@ -10766,13 +10795,26 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
                 */
 
                kvm_set_msi_irq(e, &irq);
-               if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu))
+               if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+                       /*
+                        * Make sure the IRTE is in remapped mode if
+                        * we don't handle it in posted mode.
+                        */
+                       ret = irq_set_vcpu_affinity(host_irq, NULL);
+                       if (ret < 0) {
+                               printk(KERN_INFO
+                                  "failed to fall back to remapped mode, irq: %u\n",
+                                  host_irq);
+                               goto out;
+                       }
+
                        continue;
+               }
 
                vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
                vcpu_info.vector = irq.vector;
 
-               trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi,
+               trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi,
                                vcpu_info.vector, vcpu_info.pi_desc_addr, set);
 
                if (set)