Ben Widawsky <bwidawsk@kernel.org> <ben@bwidawsk.net>
Ben Widawsky <bwidawsk@kernel.org> <ben.widawsky@intel.com>
Ben Widawsky <bwidawsk@kernel.org> <benjamin.widawsky@intel.com>
+Bjorn Andersson <andersson@kernel.org> <bjorn@kryo.se>
+Bjorn Andersson <andersson@kernel.org> <bjorn.andersson@linaro.org>
+Bjorn Andersson <andersson@kernel.org> <bjorn.andersson@sonymobile.com>
Björn Steinbrink <B.Steinbrink@gmx.de>
Björn Töpel <bjorn@kernel.org> <bjorn.topel@gmail.com>
Björn Töpel <bjorn@kernel.org> <bjorn.topel@intel.com>
Christian Marangi <ansuelsmth@gmail.com>
Christophe Ricard <christophe.ricard@gmail.com>
Christoph Hellwig <hch@lst.de>
-Colin Ian King <colin.king@intel.com> <colin.king@canonical.com>
-Colin Ian King <colin.king@intel.com> <colin.i.king@gmail.com>
+Colin Ian King <colin.i.king@gmail.com> <colin.king@canonical.com>
Corey Minyard <minyard@acm.org>
Damian Hobson-Garcia <dhobsong@igel.co.jp>
Daniel Borkmann <daniel@iogearbox.net> <danborkmann@googlemail.com>
Greg Kroah-Hartman <greg@kroah.com>
Greg Kurz <groug@kaod.org> <gkurz@linux.vnet.ibm.com>
Gregory CLEMENT <gregory.clement@bootlin.com> <gregory.clement@free-electrons.com>
+Guilherme G. Piccoli <kernel@gpiccoli.net> <gpiccoli@linux.vnet.ibm.com>
+Guilherme G. Piccoli <kernel@gpiccoli.net> <gpiccoli@canonical.com>
Guo Ren <guoren@kernel.org> <guoren@linux.alibaba.com>
Guo Ren <guoren@kernel.org> <ren_guo@c-sky.com>
Gustavo Padovan <gustavo@las.ic.unicamp.br>
Li Yang <leoyang.li@nxp.com> <leoli@freescale.com>
Li Yang <leoyang.li@nxp.com> <leo@zh-kernel.org>
Lorenzo Pieralisi <lpieralisi@kernel.org> <lorenzo.pieralisi@arm.com>
+Luca Ceresoli <luca.ceresoli@bootlin.com> <luca@lucaceresoli.net>
Lukasz Luba <lukasz.luba@arm.com> <l.luba@partner.samsung.com>
Maciej W. Rozycki <macro@mips.com> <macro@imgtec.com>
Maciej W. Rozycki <macro@orcam.me.uk> <macro@linux-mips.org>
Mythri P K <mythripk@ti.com>
Nadia Yvette Chambers <nyc@holomorphy.com> William Lee Irwin III <wli@holomorphy.com>
Nathan Chancellor <nathan@kernel.org> <natechancellor@gmail.com>
+Neil Armstrong <neil.armstrong@linaro.org> <narmstrong@baylibre.com>
Nguyen Anh Quynh <aquynh@gmail.com>
Nicholas Piggin <npiggin@gmail.com> <npiggen@suse.de>
Nicholas Piggin <npiggin@gmail.com> <npiggin@kernel.dk>
Oleksij Rempel <linux@rempel-privat.de> <fixed-term.Oleksij.Rempel@de.bosch.com>
Oleksij Rempel <linux@rempel-privat.de> <o.rempel@pengutronix.de>
Oleksij Rempel <linux@rempel-privat.de> <ore@pengutronix.de>
+ Oliver Upton <oliver.upton@linux.dev> <oupton@google.com>
Pali Rohár <pali@kernel.org> <pali.rohar@gmail.com>
Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Patrick Mochel <mochel@digitalimplant.org>
kvm_vcpu_halt(vcpu);
vcpu_clear_flag(vcpu, IN_WFIT);
- kvm_clear_request(KVM_REQ_UNHALT, vcpu);
preempt_disable();
vgic_v4_load(vcpu);
* at, which would end badly once inaccessible.
*/
kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start);
- kmemleak_free_part(__va(hyp_mem_base), hyp_mem_size);
+ kmemleak_free_part_phys(hyp_mem_base, hyp_mem_size);
return pkvm_drop_host_privileges();
}
goto out;
}
- if (kvm_s390_pci_interp_allowed()) {
+ if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) {
rc = kvm_s390_pci_init();
if (rc) {
pr_err("Unable to allocate AIFT for PCI\n");
void kvm_arch_exit(void)
{
kvm_s390_gib_destroy();
- if (kvm_s390_pci_interp_allowed())
+ if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM))
kvm_s390_pci_exit();
debug_unregister(kvm_s390_dbf);
debug_unregister(kvm_s390_dbf_uv);
goto retry;
}
- /* nothing to do, just clear the request */
- kvm_clear_request(KVM_REQ_UNHALT, vcpu);
/* we left the vsie handler, nothing to do, just clear the request */
kvm_clear_request(KVM_REQ_VSIE_RESTART, vcpu);
u32 enlightenments_eax; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EAX */
u32 enlightenments_ebx; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EBX */
u32 syndbg_cap_eax; /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */
+ u32 nested_eax; /* HYPERV_CPUID_NESTED_FEATURES.EAX */
+ u32 nested_ebx; /* HYPERV_CPUID_NESTED_FEATURES.EBX */
} cpuid_cache;
};
struct timer_list poll_timer;
};
+ struct kvm_queued_exception {
+ bool pending;
+ bool injected;
+ bool has_error_code;
+ u8 vector;
+ u32 error_code;
+ unsigned long payload;
+ bool has_payload;
+ };
+
struct kvm_vcpu_arch {
/*
* rip and regs accesses must go through
struct fpu_guest guest_fpu;
u64 xcr0;
+ u64 guest_supported_xcr0;
struct kvm_pio_request pio;
void *pio_data;
u8 event_exit_inst_len;
- struct kvm_queued_exception {
- bool pending;
- bool injected;
- bool has_error_code;
- u8 nr;
- u32 error_code;
- unsigned long payload;
- bool has_payload;
- u8 nested_apf;
- } exception;
+ bool exception_from_userspace;
+
+ /* Exceptions to be injected to the guest. */
+ struct kvm_queued_exception exception;
+ /* Exception VM-Exits to be synthesized to L1. */
+ struct kvm_queued_exception exception_vmexit;
struct kvm_queued_interrupt {
bool injected;
u32 id;
bool send_user_only;
u32 host_apf_flags;
- unsigned long nested_apf_token;
bool delivery_as_pf_vmexit;
bool pageready_pending;
} apf;
unsigned char *hypercall_addr);
void (*inject_irq)(struct kvm_vcpu *vcpu, bool reinjected);
void (*inject_nmi)(struct kvm_vcpu *vcpu);
- void (*queue_exception)(struct kvm_vcpu *vcpu);
+ void (*inject_exception)(struct kvm_vcpu *vcpu);
void (*cancel_injection)(struct kvm_vcpu *vcpu);
int (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
int (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
struct kvm_x86_nested_ops {
void (*leave_nested)(struct kvm_vcpu *vcpu);
+ bool (*is_exception_vmexit)(struct kvm_vcpu *vcpu, u8 vector,
+ u32 error_code);
int (*check_events)(struct kvm_vcpu *vcpu);
- bool (*handle_page_fault_workaround)(struct kvm_vcpu *vcpu,
- struct x86_exception *fault);
- bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
+ bool (*has_events)(struct kvm_vcpu *vcpu);
void (*triple_fault)(struct kvm_vcpu *vcpu);
int (*get_state)(struct kvm_vcpu *vcpu,
struct kvm_nested_state __user *user_kvm_nested_state,
void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
- bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+ void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
struct x86_exception *fault);
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr);
}
EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime);
+ static bool kvm_cpuid_has_hyperv(struct kvm_cpuid_entry2 *entries, int nent)
+ {
+ struct kvm_cpuid_entry2 *entry;
+
+ entry = cpuid_entry2_find(entries, nent, HYPERV_CPUID_INTERFACE,
+ KVM_CPUID_INDEX_NOT_SIGNIFICANT);
+ return entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX;
+ }
+
static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
struct kvm_cpuid_entry2 *best;
- u64 guest_supported_xcr0;
best = kvm_find_cpuid_entry(vcpu, 1);
if (best && apic) {
kvm_apic_set_version(vcpu);
}
- guest_supported_xcr0 =
+ vcpu->arch.guest_supported_xcr0 =
cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
- vcpu->arch.guest_fpu.fpstate->user_xfeatures = guest_supported_xcr0;
+ /*
+ * FP+SSE can always be saved/restored via KVM_{G,S}ET_XSAVE, even if
+ * XSAVE/XCRO are not exposed to the guest, and even if XSAVE isn't
+ * supported by the host.
+ */
+ vcpu->arch.guest_fpu.fpstate->user_xfeatures = vcpu->arch.guest_supported_xcr0 |
+ XFEATURE_MASK_FPSSE;
kvm_update_pv_runtime(vcpu);
vcpu->arch.cr4_guest_rsvd_bits =
__cr4_reserved_bits(guest_cpuid_has, vcpu);
- kvm_hv_set_cpuid(vcpu);
+ kvm_hv_set_cpuid(vcpu, kvm_cpuid_has_hyperv(vcpu->arch.cpuid_entries,
+ vcpu->arch.cpuid_nent));
/* Invoke the vendor callback only after the above state is updated. */
static_call(kvm_x86_vcpu_after_set_cpuid)(vcpu);
return 0;
}
+ if (kvm_cpuid_has_hyperv(e2, nent)) {
+ r = kvm_hv_vcpu_init(vcpu);
+ if (r)
+ return r;
+ }
+
r = kvm_check_cpuid(vcpu, e2, nent);
if (r)
return r;
entry->edx = 0;
}
break;
- case 9:
- break;
case 0xa: { /* Architectural Performance Monitoring */
union cpuid10_eax eax;
union cpuid10_edx edx;
/*
* XXX: inoutclob user must know where the argument is being expanded.
- * Relying on CONFIG_CC_HAS_ASM_GOTO would allow us to remove _fault.
+ * Using asm goto would allow us to remove _fault.
*/
#define asm_safe(insn, inoutclob...) \
({ \
static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
struct operand *op)
{
- unsigned reg = ctxt->modrm_reg;
+ unsigned int reg;
- if (!(ctxt->d & ModRM))
+ if (ctxt->d & ModRM)
+ reg = ctxt->modrm_reg;
+ else
reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3);
if (ctxt->d & Sse) {
if (rc != X86EMUL_CONTINUE)
return rc;
- if (ctxt->modrm_reg == VCPU_SREG_SS)
+ if (seg == VCPU_SREG_SS)
ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
if (ctxt->op_bytes > 2)
rsp_increment(ctxt, ctxt->op_bytes - 2);
| ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
r = ctxt->ops->set_msr_with_filter(ctxt, msr_index, msr_data);
- if (r == X86EMUL_IO_NEEDED)
- return r;
-
- if (r > 0)
+ if (r == X86EMUL_PROPAGATE_FAULT)
return emulate_gp(ctxt, 0);
- return r < 0 ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
+ return r;
}
static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
r = ctxt->ops->get_msr_with_filter(ctxt, msr_index, &msr_data);
- if (r == X86EMUL_IO_NEEDED)
- return r;
-
- if (r)
+ if (r == X86EMUL_PROPAGATE_FAULT)
return emulate_gp(ctxt, 0);
- *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data;
- *reg_write(ctxt, VCPU_REGS_RDX) = msr_data >> 32;
- return X86EMUL_CONTINUE;
+ if (r == X86EMUL_CONTINUE) {
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data;
+ *reg_write(ctxt, VCPU_REGS_RDX) = msr_data >> 32;
+ }
+ return r;
}
static int em_store_sreg(struct x86_emulate_ctxt *ctxt, int segment)
{
u32 eax, ecx, edx;
+ if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSXSAVE))
+ return emulate_ud(ctxt);
+
eax = reg_read(ctxt, VCPU_REGS_RAX);
edx = reg_read(ctxt, VCPU_REGS_RDX);
ecx = reg_read(ctxt, VCPU_REGS_RCX);
ctxt->ops->get_dr(ctxt, 7, &dr7);
- /* Check if DR7.Global_Enable is set */
- return dr7 & (1 << 13);
+ return dr7 & DR7_GD;
}
static int check_dr_read(struct x86_emulate_ctxt *ctxt)
rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
rmap_count = pte_list_add(cache, spte, rmap_head);
+ if (rmap_count > kvm->stat.max_mmu_rmap_size)
+ kvm->stat.max_mmu_rmap_size = rmap_count;
if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
kvm_zap_all_rmap_sptes(kvm, rmap_head);
kvm_flush_remote_tlbs_with_address(
percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}
+ static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+ {
+ kvm_mod_used_mmu_pages(kvm, +1);
+ kvm_account_pgtable_pages((void *)sp->spt, +1);
+ }
+
+ static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+ {
+ kvm_mod_used_mmu_pages(kvm, -1);
+ kvm_account_pgtable_pages((void *)sp->spt, -1);
+ }
+
static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
{
MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
*/
sp->mmu_valid_gen = kvm->arch.mmu_valid_gen;
list_add(&sp->link, &kvm->arch.active_mmu_pages);
- kvm_mod_used_mmu_pages(kvm, +1);
+ kvm_account_mmu_page(kvm, sp);
sp->gfn = gfn;
sp->role = role;
list_add(&sp->link, invalid_list);
else
list_move(&sp->link, invalid_list);
- kvm_mod_used_mmu_pages(kvm, -1);
+ kvm_unaccount_mmu_page(kvm, sp);
} else {
/*
* Remove the active root from the active page list, the root
vcpu->arch.l1tf_flush_l1d = true;
if (!flags) {
- trace_kvm_page_fault(fault_address, error_code);
+ trace_kvm_page_fault(vcpu, fault_address, error_code);
if (kvm_event_needs_reinjection(vcpu))
kvm_mmu_unprotect_page_virt(vcpu, fault_address);
ret = register_shrinker(&mmu_shrinker, "x86-mmu");
if (ret)
- goto out;
+ goto out_shrinker;
return 0;
+ out_shrinker:
+ percpu_counter_destroy(&kvm_total_used_mmu_pages);
out:
mmu_destroy_caches();
return ret;
return inequality ^ bit;
}
-
- /*
- * KVM wants to inject page-faults which it got to the guest. This function
- * checks whether in a nested guest, we need to inject them to L1 or L2.
- */
- static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
+ static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
+ u32 error_code)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- unsigned int nr = vcpu->arch.exception.nr;
- bool has_payload = vcpu->arch.exception.has_payload;
- unsigned long payload = vcpu->arch.exception.payload;
-
- if (nr == PF_VECTOR) {
- if (vcpu->arch.exception.nested_apf) {
- *exit_qual = vcpu->arch.apf.nested_apf_token;
- return 1;
- }
- if (nested_vmx_is_page_fault_vmexit(vmcs12,
- vcpu->arch.exception.error_code)) {
- *exit_qual = has_payload ? payload : vcpu->arch.cr2;
- return 1;
- }
- } else if (vmcs12->exception_bitmap & (1u << nr)) {
- if (nr == DB_VECTOR) {
- if (!has_payload) {
- payload = vcpu->arch.dr6;
- payload &= ~DR6_BT;
- payload ^= DR6_ACTIVE_LOW;
- }
- *exit_qual = payload;
- } else
- *exit_qual = 0;
- return 1;
- }
- return 0;
- }
-
- static bool nested_vmx_handle_page_fault_workaround(struct kvm_vcpu *vcpu,
- struct x86_exception *fault)
- {
- struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-
- WARN_ON(!is_guest_mode(vcpu));
+ /*
+ * Drop bits 31:16 of the error code when performing the #PF mask+match
+ * check. All VMCS fields involved are 32 bits, but Intel CPUs never
+ * set bits 31:16 and VMX disallows setting bits 31:16 in the injected
+ * error code. Including the to-be-dropped bits in the check might
+ * result in an "impossible" or missed exit from L1's perspective.
+ */
+ if (vector == PF_VECTOR)
+ return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code);
- if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
- !WARN_ON_ONCE(to_vmx(vcpu)->nested.nested_run_pending)) {
- vmcs12->vm_exit_intr_error_code = fault->error_code;
- nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
- PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
- INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
- fault->address);
- return true;
- }
- return false;
+ return (vmcs12->exception_bitmap & (1u << vector));
}
static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
vmcs12->guest_rflags = evmcs->guest_rflags;
vmcs12->guest_interruptibility_info =
evmcs->guest_interruptibility_info;
+ /*
+ * Not present in struct vmcs12:
+ * vmcs12->guest_ssp = evmcs->guest_ssp;
+ */
}
if (unlikely(!(hv_clean_fields &
vmcs12->host_fs_selector = evmcs->host_fs_selector;
vmcs12->host_gs_selector = evmcs->host_gs_selector;
vmcs12->host_tr_selector = evmcs->host_tr_selector;
+ vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl;
+ /*
+ * Not present in struct vmcs12:
+ * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet;
+ * vmcs12->host_ssp = evmcs->host_ssp;
+ * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr;
+ */
}
if (unlikely(!(hv_clean_fields &
vmcs12->tsc_offset = evmcs->tsc_offset;
vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
+ vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap;
+ vmcs12->tsc_multiplier = evmcs->tsc_multiplier;
}
if (unlikely(!(hv_clean_fields &
vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
vmcs12->guest_activity_state = evmcs->guest_activity_state;
vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
+ vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl;
+ /*
+ * Not present in struct vmcs12:
+ * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet;
+ * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl;
+ * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr;
+ */
}
/*
* evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
* evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
* evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
+ * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl;
+ * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl;
+ * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap;
+ * evmcs->tsc_multiplier = vmcs12->tsc_multiplier;
*
* Not present in struct vmcs12:
* evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
* evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
* evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
* evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
+ * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet;
+ * evmcs->host_ssp = vmcs12->host_ssp;
+ * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr;
+ * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet;
+ * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl;
+ * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr;
+ * evmcs->guest_ssp = vmcs12->guest_ssp;
*/
evmcs->guest_es_selector = vmcs12->guest_es_selector;
bool evmcs_gpa_changed = false;
u64 evmcs_gpa;
- if (likely(!vmx->nested.enlightened_vmcs_enabled))
+ if (likely(!guest_cpuid_has_evmcs(vcpu)))
return EVMPTRLD_DISABLED;
if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) {
* are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
* on the related bits (if supported by the CPU) in the hope that
* we can avoid VMWrites during vmx_set_efer().
+ *
+ * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is
+ * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to
+ * do the same for L2.
*/
exec_control = __vm_entry_controls_get(vmcs01);
- exec_control |= vmcs12->vm_entry_controls;
+ exec_control |= (vmcs12->vm_entry_controls &
+ ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
if (cpu_has_load_ia32_efer()) {
if (guest_efer & EFER_LMA)
* bits which we consider mandatory enabled.
* The CR0_READ_SHADOW is what L2 should have expected to read given
* the specifications by L1; It's not enough to take
- * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we
+ * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we
* have more bits than L1 expected.
*/
vmx_set_cr0(vcpu, vmcs12->guest_cr0);
nested_check_vm_entry_controls(vcpu, vmcs12))
return -EINVAL;
- if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled)
+ if (guest_cpuid_has_evmcs(vcpu))
return nested_evmcs_check_controls(vmcs12);
return 0;
* L2 was running), map it here to make sure vmcs12 changes are
* properly reflected.
*/
- if (vmx->nested.enlightened_vmcs_enabled &&
+ if (guest_cpuid_has_evmcs(vcpu) &&
vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
enum nested_evmptrld_status evmptrld_status =
nested_vmx_handle_enlightened_vmptrld(vcpu, false);
};
u32 failed_index;
+ trace_kvm_nested_vmenter(kvm_rip_read(vcpu),
+ vmx->nested.current_vmptr,
+ vmcs12->guest_rip,
+ vmcs12->guest_intr_status,
+ vmcs12->vm_entry_intr_info_field,
+ vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT,
+ vmcs12->ept_pointer,
+ vmcs12->guest_cr3,
+ KVM_ISA_VMX);
+
kvm_service_local_tlb_flush_requests(vcpu);
evaluate_pending_interrupts = exec_controls_get(vmx) &
(CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
+ if (!evaluate_pending_interrupts)
+ evaluate_pending_interrupts |= kvm_apic_has_pending_init_or_sipi(vcpu);
if (!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
}
/*
- * If L1 had a pending IRQ/NMI until it executed
- * VMLAUNCH/VMRESUME which wasn't delivered because it was
- * disallowed (e.g. interrupts disabled), L0 needs to
- * evaluate if this pending event should cause an exit from L2
- * to L1 or delivered directly to L2 (e.g. In case L1 don't
- * intercept EXTERNAL_INTERRUPT).
- *
- * Usually this would be handled by the processor noticing an
- * IRQ/NMI window request, or checking RVI during evaluation of
- * pending virtual interrupts. However, this setting was done
- * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
- * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
+ * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI
+ * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can
+ * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit
+ * unconditionally.
*/
if (unlikely(evaluate_pending_interrupts))
kvm_make_request(KVM_REQ_EVENT, vcpu);
is_double_fault(exit_intr_info))) {
vmcs12->idt_vectoring_info_field = 0;
} else if (vcpu->arch.exception.injected) {
- nr = vcpu->arch.exception.nr;
+ nr = vcpu->arch.exception.vector;
idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
if (kvm_exception_is_soft(nr)) {
return -ENXIO;
}
- static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
- unsigned long exit_qual)
+ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu)
{
+ struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
+ u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- unsigned int nr = vcpu->arch.exception.nr;
- u32 intr_info = nr | INTR_INFO_VALID_MASK;
+ unsigned long exit_qual;
+
+ if (ex->has_payload) {
+ exit_qual = ex->payload;
+ } else if (ex->vector == PF_VECTOR) {
+ exit_qual = vcpu->arch.cr2;
+ } else if (ex->vector == DB_VECTOR) {
+ exit_qual = vcpu->arch.dr6;
+ exit_qual &= ~DR6_BT;
+ exit_qual ^= DR6_ACTIVE_LOW;
+ } else {
+ exit_qual = 0;
+ }
- if (vcpu->arch.exception.has_error_code) {
- vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
+ if (ex->has_error_code) {
+ /*
+ * Intel CPUs do not generate error codes with bits 31:16 set,
+ * and more importantly VMX disallows setting bits 31:16 in the
+ * injected error code for VM-Entry. Drop the bits to mimic
+ * hardware and avoid inducing failure on nested VM-Entry if L1
+ * chooses to inject the exception back to L2. AMD CPUs _do_
+ * generate "full" 32-bit error codes, so KVM allows userspace
+ * to inject exception error codes with bits 31:16 set.
+ */
+ vmcs12->vm_exit_intr_error_code = (u16)ex->error_code;
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
}
- if (kvm_exception_is_soft(nr))
+ if (kvm_exception_is_soft(ex->vector))
intr_info |= INTR_TYPE_SOFT_EXCEPTION;
else
intr_info |= INTR_TYPE_HARD_EXCEPTION;
}
/*
- * Returns true if a debug trap is pending delivery.
+ * Returns true if a debug trap is (likely) pending delivery. Infer the class
+ * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6).
+ * Using the payload is flawed because code breakpoints (fault-like) and data
+ * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e.
+ * this will return false positives if a to-be-injected code breakpoint #DB is
+ * pending (from KVM's perspective, but not "pending" across an instruction
+ * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it
+ * too is trap-like.
*
- * In KVM, debug traps bear an exception payload. As such, the class of a #DB
- * exception may be inferred from the presence of an exception payload.
+ * KVM "works" despite these flaws as ICEBP isn't currently supported by the
+ * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the
+ * #DB has already happened), and MTF isn't marked pending on code breakpoints
+ * from the emulator (because such #DBs are fault-like and thus don't trigger
+ * actions that fire on instruction retire).
+ */
+ static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex)
+ {
+ if (!ex->pending || ex->vector != DB_VECTOR)
+ return 0;
+
+ /* General Detect #DBs are always fault-like. */
+ return ex->payload & ~DR6_BD;
+ }
+
+ /*
+ * Returns true if there's a pending #DB exception that is lower priority than
+ * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by
+ * KVM, but could theoretically be injected by userspace. Note, this code is
+ * imperfect, see above.
*/
- static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu)
+ static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex)
{
- return vcpu->arch.exception.pending &&
- vcpu->arch.exception.nr == DB_VECTOR &&
- vcpu->arch.exception.payload;
+ return vmx_get_pending_dbg_trap(ex) & ~DR6_BT;
}
/*
*/
static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
{
- if (vmx_pending_dbg_trap(vcpu))
- vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
- vcpu->arch.exception.payload);
+ unsigned long pending_dbg;
+
+ pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception);
+ if (pending_dbg)
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg);
}
static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
to_vmx(vcpu)->nested.preemption_timer_expired;
}
+ static bool vmx_has_nested_events(struct kvm_vcpu *vcpu)
+ {
+ return nested_vmx_preemption_timer_pending(vcpu) ||
+ to_vmx(vcpu)->nested.mtf_pending;
+ }
+
+ /*
+ * Per the Intel SDM's table "Priority Among Concurrent Events", with minor
+ * edits to fill in missing examples, e.g. #DB due to split-lock accesses,
+ * and less minor edits to splice in the priority of VMX Non-Root specific
+ * events, e.g. MTF and NMI/INTR-window exiting.
+ *
+ * 1 Hardware Reset and Machine Checks
+ * - RESET
+ * - Machine Check
+ *
+ * 2 Trap on Task Switch
+ * - T flag in TSS is set (on task switch)
+ *
+ * 3 External Hardware Interventions
+ * - FLUSH
+ * - STOPCLK
+ * - SMI
+ * - INIT
+ *
+ * 3.5 Monitor Trap Flag (MTF) VM-exit[1]
+ *
+ * 4 Traps on Previous Instruction
+ * - Breakpoints
+ * - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O
+ * breakpoint, or #DB due to a split-lock access)
+ *
+ * 4.3 VMX-preemption timer expired VM-exit
+ *
+ * 4.6 NMI-window exiting VM-exit[2]
+ *
+ * 5 Nonmaskable Interrupts (NMI)
+ *
+ * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery
+ *
+ * 6 Maskable Hardware Interrupts
+ *
+ * 7 Code Breakpoint Fault
+ *
+ * 8 Faults from Fetching Next Instruction
+ * - Code-Segment Limit Violation
+ * - Code Page Fault
+ * - Control protection exception (missing ENDBRANCH at target of indirect
+ * call or jump)
+ *
+ * 9 Faults from Decoding Next Instruction
+ * - Instruction length > 15 bytes
+ * - Invalid Opcode
+ * - Coprocessor Not Available
+ *
+ *10 Faults on Executing Instruction
+ * - Overflow
+ * - Bound error
+ * - Invalid TSS
+ * - Segment Not Present
+ * - Stack fault
+ * - General Protection
+ * - Data Page Fault
+ * - Alignment Check
+ * - x86 FPU Floating-point exception
+ * - SIMD floating-point exception
+ * - Virtualization exception
+ * - Control protection exception
+ *
+ * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs),
+ * INIT signals, and higher priority events take priority over MTF VM exits.
+ * MTF VM exits take priority over debug-trap exceptions and lower priority
+ * events.
+ *
+ * [2] Debug-trap exceptions and higher priority events take priority over VM exits
+ * caused by the VMX-preemption timer. VM exits caused by the VMX-preemption
+ * timer take priority over VM exits caused by the "NMI-window exiting"
+ * VM-execution control and lower priority events.
+ *
+ * [3] Debug-trap exceptions and higher priority events take priority over VM exits
+ * caused by "NMI-window exiting". VM exits caused by this control take
+ * priority over non-maskable interrupts (NMIs) and lower priority events.
+ *
+ * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to
+ * the 1-setting of the "interrupt-window exiting" VM-execution control. Thus,
+ * non-maskable interrupts (NMIs) and higher priority events take priority over
+ * delivery of a virtual interrupt; delivery of a virtual interrupt takes
+ * priority over external interrupts and lower priority events.
+ */
static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long exit_qual;
- bool block_nested_events =
- vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
- bool mtf_pending = vmx->nested.mtf_pending;
struct kvm_lapic *apic = vcpu->arch.apic;
-
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
/*
- * Clear the MTF state. If a higher priority VM-exit is delivered first,
- * this state is discarded.
+ * Only a pending nested run blocks a pending exception. If there is a
+ * previously injected event, the pending exception occurred while said
+ * event was being delivered and thus needs to be handled.
*/
- if (!block_nested_events)
- vmx->nested.mtf_pending = false;
+ bool block_nested_exceptions = vmx->nested.nested_run_pending;
+ /*
+ * New events (not exceptions) are only recognized at instruction
+ * boundaries. If an event needs reinjection, then KVM is handling a
+ * VM-Exit that occurred _during_ instruction execution; new events are
+ * blocked until the instruction completes.
+ */
+ bool block_nested_events = block_nested_exceptions ||
+ kvm_event_needs_reinjection(vcpu);
if (lapic_in_kernel(vcpu) &&
test_bit(KVM_APIC_INIT, &apic->pending_events)) {
clear_bit(KVM_APIC_INIT, &apic->pending_events);
if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
+
+ /* MTF is discarded if the vCPU is in WFS. */
+ vmx->nested.mtf_pending = false;
return 0;
}
return -EBUSY;
clear_bit(KVM_APIC_SIPI, &apic->pending_events);
- if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
+ if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0,
apic->sipi_vector & 0xFFUL);
- return 0;
+ return 0;
+ }
+ /* Fallthrough, the SIPI is completely ignored. */
}
/*
- * Process any exceptions that are not debug traps before MTF.
+ * Process exceptions that are higher priority than Monitor Trap Flag:
+ * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but
+ * could theoretically come in from userspace), and ICEBP (INT1).
*
- * Note that only a pending nested run can block a pending exception.
- * Otherwise an injected NMI/interrupt should either be
- * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO,
- * while delivering the pending exception.
+ * TODO: SMIs have higher priority than MTF and trap-like #DBs (except
+ * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF
+ * across SMI/RSM as it should; that needs to be addressed in order to
+ * prioritize SMI over MTF and trap-like #DBs.
*/
-
- if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) {
- if (vmx->nested.nested_run_pending)
+ if (vcpu->arch.exception_vmexit.pending &&
+ !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) {
+ if (block_nested_exceptions)
return -EBUSY;
- if (!nested_vmx_check_exception(vcpu, &exit_qual))
- goto no_vmexit;
- nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+
+ nested_vmx_inject_exception_vmexit(vcpu);
return 0;
}
- if (mtf_pending) {
+ if (vcpu->arch.exception.pending &&
+ !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) {
+ if (block_nested_exceptions)
+ return -EBUSY;
+ goto no_vmexit;
+ }
+
+ if (vmx->nested.mtf_pending) {
if (block_nested_events)
return -EBUSY;
nested_vmx_update_pending_dbg(vcpu);
return 0;
}
- if (vcpu->arch.exception.pending) {
- if (vmx->nested.nested_run_pending)
+ if (vcpu->arch.exception_vmexit.pending) {
+ if (block_nested_exceptions)
return -EBUSY;
- if (!nested_vmx_check_exception(vcpu, &exit_qual))
- goto no_vmexit;
- nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+
+ nested_vmx_inject_exception_vmexit(vcpu);
return 0;
}
+ if (vcpu->arch.exception.pending) {
+ if (block_nested_exceptions)
+ return -EBUSY;
+ goto no_vmexit;
+ }
+
if (nested_vmx_preemption_timer_pending(vcpu)) {
if (block_nested_events)
return -EBUSY;
nested_vmx_abort(vcpu,
VMX_ABORT_SAVE_GUEST_MSR_FAIL);
}
-
- /*
- * Drop what we picked up for L2 via vmx_complete_interrupts. It is
- * preserved above and would only end up incorrectly in L1.
- */
- vcpu->arch.nmi_injected = false;
- kvm_clear_exception_queue(vcpu);
- kvm_clear_interrupt_queue(vcpu);
}
/*
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ /* Pending MTF traps are discarded on VM-Exit. */
+ vmx->nested.mtf_pending = false;
+
/* trying to cancel vmlaunch/vmresume is a bug */
WARN_ON_ONCE(vmx->nested.nested_run_pending);
WARN_ON_ONCE(nested_early_check);
}
+ /*
+ * Drop events/exceptions that were queued for re-injection to L2
+ * (picked up via vmx_complete_interrupts()), as well as exceptions
+ * that were pending for L2. Note, this must NOT be hoisted above
+ * prepare_vmcs12(), events/exceptions queued for re-injection need to
+ * be captured in vmcs12 (see vmcs12_save_pending_event()).
+ */
+ vcpu->arch.nmi_injected = false;
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
+
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
/* Update any VMCS fields that might have changed while L2 ran */
free_nested(vcpu);
- /* Process a latched INIT during time CPU was in VMX operation */
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+ if (kvm_apic_has_pending_init_or_sipi(vcpu))
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
return nested_vmx_succeed(vcpu);
}
* state. It is possible that the area will stay mapped as
* vmx->nested.hv_evmcs but this shouldn't be a problem.
*/
- if (likely(!vmx->nested.enlightened_vmcs_enabled ||
+ if (likely(!guest_cpuid_has_evmcs(vcpu) ||
!nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
if (vmptr == vmx->nested.current_vmptr)
nested_release_vmcs12(vcpu);
if (ret)
goto error_guest_mode;
+ if (vmx->nested.mtf_pending)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
return 0;
error_guest_mode:
* bit in the high half is on if the corresponding bit in the control field
* may be on. See also vmx_control_verify().
*/
- void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
+ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
{
+ struct nested_vmx_msrs *msrs = &vmcs_conf->nested;
+
/*
* Note that as a general rule, the high half of the MSRs (bits in
* the control fields which may be 1) should be initialized by the
*/
/* pin-based controls */
- rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
- msrs->pinbased_ctls_low,
- msrs->pinbased_ctls_high);
- msrs->pinbased_ctls_low |=
+ msrs->pinbased_ctls_low =
PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+
+ msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl;
msrs->pinbased_ctls_high &=
PIN_BASED_EXT_INTR_MASK |
PIN_BASED_NMI_EXITING |
PIN_BASED_VMX_PREEMPTION_TIMER;
/* exit controls */
- rdmsr(MSR_IA32_VMX_EXIT_CTLS,
- msrs->exit_ctls_low,
- msrs->exit_ctls_high);
msrs->exit_ctls_low =
VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+ msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl;
msrs->exit_ctls_high &=
#ifdef CONFIG_X86_64
VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
- VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+ VM_EXIT_CLEAR_BNDCFGS;
msrs->exit_ctls_high |=
VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
- VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT |
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
/* We support free control of debug control saving. */
msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
/* entry controls */
- rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
- msrs->entry_ctls_low,
- msrs->entry_ctls_high);
msrs->entry_ctls_low =
VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
+
+ msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl;
msrs->entry_ctls_high &=
#ifdef CONFIG_X86_64
VM_ENTRY_IA32E_MODE |
#endif
- VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS |
- VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
msrs->entry_ctls_high |=
- (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
+ (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER |
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
/* We support free control of debug control loading. */
msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
/* cpu-based controls */
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
- msrs->procbased_ctls_low,
- msrs->procbased_ctls_high);
msrs->procbased_ctls_low =
CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+
+ msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl;
msrs->procbased_ctls_high &=
CPU_BASED_INTR_WINDOW_EXITING |
CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
* depend on CPUID bits, they are added later by
* vmx_vcpu_after_set_cpuid.
*/
- if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
- msrs->secondary_ctls_low,
- msrs->secondary_ctls_high);
-
msrs->secondary_ctls_low = 0;
+
+ msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
msrs->secondary_ctls_high &=
SECONDARY_EXEC_DESC |
SECONDARY_EXEC_ENABLE_RDTSCP |
msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
/* miscellaneous data */
- rdmsr(MSR_IA32_VMX_MISC,
- msrs->misc_low,
- msrs->misc_high);
- msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
+ msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
msrs->misc_low |=
MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
struct kvm_x86_nested_ops vmx_nested_ops = {
.leave_nested = vmx_leave_nested,
+ .is_exception_vmexit = nested_vmx_is_exception_vmexit,
.check_events = vmx_check_nested_events,
- .handle_page_fault_workaround = nested_vmx_handle_page_fault_workaround,
- .hv_timer_pending = nested_vmx_preemption_timer_pending,
+ .has_events = vmx_has_nested_events,
.triple_fault = nested_vmx_triple_fault,
.get_state = vmx_get_nested_state,
.set_state = vmx_set_nested_state,
module_param(enable_vmware_backdoor, bool, S_IRUGO);
EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
- static bool __read_mostly force_emulation_prefix = false;
- module_param(force_emulation_prefix, bool, S_IRUGO);
+ /*
+ * Flags to manipulate forced emulation behavior (any non-zero value will
+ * enable forced emulation).
+ */
+ #define KVM_FEP_CLEAR_RFLAGS_RF BIT(1)
+ static int __read_mostly force_emulation_prefix;
+ module_param(force_emulation_prefix, int, 0644);
int __read_mostly pi_inject_timer = -1;
module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
#define EXCPT_TRAP 1
#define EXCPT_ABORT 2
#define EXCPT_INTERRUPT 3
+ #define EXCPT_DB 4
static int exception_type(int vector)
{
mask = 1 << vector;
- /* #DB is trap, as instruction watchpoints are handled elsewhere */
- if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
+ /*
+ * #DBs can be trap-like or fault-like, the caller must check other CPU
+ * state, e.g. DR6, to determine whether a #DB is a trap or fault.
+ */
+ if (mask & (1 << DB_VECTOR))
+ return EXCPT_DB;
+
+ if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR)))
return EXCPT_TRAP;
if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
return EXCPT_FAULT;
}
- void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
+ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
+ struct kvm_queued_exception *ex)
{
- unsigned nr = vcpu->arch.exception.nr;
- bool has_payload = vcpu->arch.exception.has_payload;
- unsigned long payload = vcpu->arch.exception.payload;
-
- if (!has_payload)
+ if (!ex->has_payload)
return;
- switch (nr) {
+ switch (ex->vector) {
case DB_VECTOR:
/*
* "Certain debug exceptions may clear bit 0-3. The
* So they need to be flipped for DR6.
*/
vcpu->arch.dr6 |= DR6_ACTIVE_LOW;
- vcpu->arch.dr6 |= payload;
- vcpu->arch.dr6 ^= payload & DR6_ACTIVE_LOW;
+ vcpu->arch.dr6 |= ex->payload;
+ vcpu->arch.dr6 ^= ex->payload & DR6_ACTIVE_LOW;
/*
* The #DB payload is defined as compatible with the 'pending
vcpu->arch.dr6 &= ~BIT(12);
break;
case PF_VECTOR:
- vcpu->arch.cr2 = payload;
+ vcpu->arch.cr2 = ex->payload;
break;
}
- vcpu->arch.exception.has_payload = false;
- vcpu->arch.exception.payload = 0;
+ ex->has_payload = false;
+ ex->payload = 0;
}
EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
+ static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vector,
+ bool has_error_code, u32 error_code,
+ bool has_payload, unsigned long payload)
+ {
+ struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
+
+ ex->vector = vector;
+ ex->injected = false;
+ ex->pending = true;
+ ex->has_error_code = has_error_code;
+ ex->error_code = error_code;
+ ex->has_payload = has_payload;
+ ex->payload = payload;
+ }
+
static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
unsigned nr, bool has_error, u32 error_code,
bool has_payload, unsigned long payload, bool reinject)
kvm_make_request(KVM_REQ_EVENT, vcpu);
+ /*
+ * If the exception is destined for L2 and isn't being reinjected,
+ * morph it to a VM-Exit if L1 wants to intercept the exception. A
+ * previously injected exception is not checked because it was checked
+ * when it was original queued, and re-checking is incorrect if _L1_
+ * injected the exception, in which case it's exempt from interception.
+ */
+ if (!reinject && is_guest_mode(vcpu) &&
+ kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) {
+ kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code,
+ has_payload, payload);
+ return;
+ }
+
if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
queue:
if (reinject) {
/*
- * On vmentry, vcpu->arch.exception.pending is only
- * true if an event injection was blocked by
- * nested_run_pending. In that case, however,
- * vcpu_enter_guest requests an immediate exit,
- * and the guest shouldn't proceed far enough to
- * need reinjection.
+ * On VM-Entry, an exception can be pending if and only
+ * if event injection was blocked by nested_run_pending.
+ * In that case, however, vcpu_enter_guest() requests an
+ * immediate exit, and the guest shouldn't proceed far
+ * enough to need reinjection.
*/
- WARN_ON_ONCE(vcpu->arch.exception.pending);
+ WARN_ON_ONCE(kvm_is_exception_pending(vcpu));
vcpu->arch.exception.injected = true;
if (WARN_ON_ONCE(has_payload)) {
/*
vcpu->arch.exception.injected = false;
}
vcpu->arch.exception.has_error_code = has_error;
- vcpu->arch.exception.nr = nr;
+ vcpu->arch.exception.vector = nr;
vcpu->arch.exception.error_code = error_code;
vcpu->arch.exception.has_payload = has_payload;
vcpu->arch.exception.payload = payload;
if (!is_guest_mode(vcpu))
- kvm_deliver_exception_payload(vcpu);
+ kvm_deliver_exception_payload(vcpu,
+ &vcpu->arch.exception);
return;
}
/* to check exception */
- prev_nr = vcpu->arch.exception.nr;
+ prev_nr = vcpu->arch.exception.vector;
if (prev_nr == DF_VECTOR) {
/* triple fault -> shutdown */
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
}
class1 = exception_class(prev_nr);
class2 = exception_class(nr);
- if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
- || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
+ if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) ||
+ (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
/*
- * Generate double fault per SDM Table 5-5. Set
- * exception.pending = true so that the double fault
- * can trigger a nested vmexit.
+ * Synthesize #DF. Clear the previously injected or pending
+ * exception so as not to incorrectly trigger shutdown.
*/
- vcpu->arch.exception.pending = true;
vcpu->arch.exception.injected = false;
- vcpu->arch.exception.has_error_code = true;
- vcpu->arch.exception.nr = DF_VECTOR;
- vcpu->arch.exception.error_code = 0;
- vcpu->arch.exception.has_payload = false;
- vcpu->arch.exception.payload = 0;
- } else
+ vcpu->arch.exception.pending = false;
+
+ kvm_queue_exception_e(vcpu, DF_VECTOR, 0);
+ } else {
/* replace previous exception with a new one in a hope
that instruction re-execution will regenerate lost
exception */
goto queue;
+ }
}
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
++vcpu->stat.pf_guest;
- vcpu->arch.exception.nested_apf =
- is_guest_mode(vcpu) && fault->async_page_fault;
- if (vcpu->arch.exception.nested_apf) {
- vcpu->arch.apf.nested_apf_token = fault->address;
- kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
- } else {
+
+ /*
+ * Async #PF in L2 is always forwarded to L1 as a VM-Exit regardless of
+ * whether or not L1 wants to intercept "regular" #PF.
+ */
+ if (is_guest_mode(vcpu) && fault->async_page_fault)
+ kvm_queue_exception_vmexit(vcpu, PF_VECTOR,
+ true, fault->error_code,
+ true, fault->address);
+ else
kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
fault->address);
- }
}
EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
- /* Returns true if the page fault was immediately morphed into a VM-Exit. */
- bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+ void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
struct x86_exception *fault)
{
struct kvm_mmu *fault_mmu;
kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
fault_mmu->root.hpa);
- /*
- * A workaround for KVM's bad exception handling. If KVM injected an
- * exception into L2, and L2 encountered a #PF while vectoring the
- * injected exception, manually check to see if L1 wants to intercept
- * #PF, otherwise queuing the #PF will lead to #DF or a lost exception.
- * In all other cases, defer the check to nested_ops->check_events(),
- * which will correctly handle priority (this does not). Note, other
- * exceptions, e.g. #GP, are theoretically affected, #PF is simply the
- * most problematic, e.g. when L0 and L1 are both intercepting #PF for
- * shadow paging.
- *
- * TODO: Rewrite exception handling to track injected and pending
- * (VM-Exit) exceptions separately.
- */
- if (unlikely(vcpu->arch.exception.injected && is_guest_mode(vcpu)) &&
- kvm_x86_ops.nested_ops->handle_page_fault_workaround(vcpu, fault))
- return true;
-
fault_mmu->inject_page_fault(vcpu, fault);
- return false;
}
EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
}
EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
-static inline u64 kvm_guest_supported_xcr0(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.guest_fpu.fpstate->user_xfeatures;
-}
-
#ifdef CONFIG_X86_64
static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu)
{
- return kvm_guest_supported_xcr0(vcpu) & XFEATURE_MASK_USER_DYNAMIC;
+ return vcpu->arch.guest_supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC;
}
#endif
* saving. However, xcr0 bit 0 is always set, even if the
* emulated CPU does not support XSAVE (see kvm_vcpu_reset()).
*/
- valid_bits = kvm_guest_supported_xcr0(vcpu) | XFEATURE_MASK_FP;
+ valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
if (xcr0 & ~valid_bits)
return 1;
int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
{
+ /* Note, #UD due to CR4.OSXSAVE=0 has priority over the intercept. */
if (static_call(kvm_x86_get_cpl)(vcpu) != 0 ||
__kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) {
kvm_inject_gp(vcpu, 0);
static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
static unsigned int num_msr_based_features;
+/*
+ * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
+ * does not yet virtualize. These include:
+ * 10 - MISC_PACKAGE_CTRLS
+ * 11 - ENERGY_FILTERING_CTL
+ * 12 - DOITM
+ * 18 - FB_CLEAR_CTRL
+ * 21 - XAPIC_DISABLE_STATUS
+ * 23 - OVERCLOCKING_STATUS
+ */
+
+#define KVM_SUPPORTED_ARCH_CAP \
+ (ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \
+ ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
+ ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
+ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
+ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO)
+
static u64 kvm_get_arch_capabilities(void)
{
u64 data = 0;
- if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
+ data &= KVM_SUPPORTED_ARCH_CAP;
+ }
/*
* If nx_huge_pages is enabled, KVM's shadow paging will ensure that
*/
}
- /* Guests don't need to know "Fill buffer clear control" exists */
- data &= ~ARCH_CAP_FB_CLEAR_CTRL;
-
return data;
}
return (kvm_arch_interrupt_allowed(vcpu) &&
kvm_cpu_accept_dm_intr(vcpu) &&
!kvm_event_needs_reinjection(vcpu) &&
- !vcpu->arch.exception.pending);
+ !kvm_is_exception_pending(vcpu));
}
static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
+ struct kvm_queued_exception *ex;
+
process_nmi(vcpu);
if (kvm_check_request(KVM_REQ_SMI, vcpu))
process_smi(vcpu);
/*
- * In guest mode, payload delivery should be deferred,
- * so that the L1 hypervisor can intercept #PF before
- * CR2 is modified (or intercept #DB before DR6 is
- * modified under nVMX). Unless the per-VM capability,
- * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of
- * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we
- * opportunistically defer the exception payload, deliver it if the
- * capability hasn't been requested before processing a
- * KVM_GET_VCPU_EVENTS.
+ * KVM's ABI only allows for one exception to be migrated. Luckily,
+ * the only time there can be two queued exceptions is if there's a
+ * non-exiting _injected_ exception, and a pending exiting exception.
+ * In that case, ignore the VM-Exiting exception as it's an extension
+ * of the injected exception.
+ */
+ if (vcpu->arch.exception_vmexit.pending &&
+ !vcpu->arch.exception.pending &&
+ !vcpu->arch.exception.injected)
+ ex = &vcpu->arch.exception_vmexit;
+ else
+ ex = &vcpu->arch.exception;
+
+ /*
+ * In guest mode, payload delivery should be deferred if the exception
+ * will be intercepted by L1, e.g. KVM should not modifying CR2 if L1
+ * intercepts #PF, ditto for DR6 and #DBs. If the per-VM capability,
+ * KVM_CAP_EXCEPTION_PAYLOAD, is not set, userspace may or may not
+ * propagate the payload and so it cannot be safely deferred. Deliver
+ * the payload if the capability hasn't been requested.
*/
if (!vcpu->kvm->arch.exception_payload_enabled &&
- vcpu->arch.exception.pending && vcpu->arch.exception.has_payload)
- kvm_deliver_exception_payload(vcpu);
+ ex->pending && ex->has_payload)
+ kvm_deliver_exception_payload(vcpu, ex);
/*
* The API doesn't provide the instruction length for software
* isn't advanced, we should expect to encounter the exception
* again.
*/
- if (kvm_exception_is_soft(vcpu->arch.exception.nr)) {
+ if (kvm_exception_is_soft(ex->vector)) {
events->exception.injected = 0;
events->exception.pending = 0;
} else {
- events->exception.injected = vcpu->arch.exception.injected;
- events->exception.pending = vcpu->arch.exception.pending;
+ events->exception.injected = ex->injected;
+ events->exception.pending = ex->pending;
/*
* For ABI compatibility, deliberately conflate
* pending and injected exceptions when
* KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
*/
if (!vcpu->kvm->arch.exception_payload_enabled)
- events->exception.injected |=
- vcpu->arch.exception.pending;
+ events->exception.injected |= ex->pending;
}
- events->exception.nr = vcpu->arch.exception.nr;
- events->exception.has_error_code = vcpu->arch.exception.has_error_code;
- events->exception.error_code = vcpu->arch.exception.error_code;
- events->exception_has_payload = vcpu->arch.exception.has_payload;
- events->exception_payload = vcpu->arch.exception.payload;
+ events->exception.nr = ex->vector;
+ events->exception.has_error_code = ex->has_error_code;
+ events->exception.error_code = ex->error_code;
+ events->exception_has_payload = ex->has_payload;
+ events->exception_payload = ex->payload;
events->interrupt.injected =
vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
return -EINVAL;
process_nmi(vcpu);
+
+ /*
+ * Flag that userspace is stuffing an exception, the next KVM_RUN will
+ * morph the exception to a VM-Exit if appropriate. Do this only for
+ * pending exceptions, already-injected exceptions are not subject to
+ * intercpetion. Note, userspace that conflates pending and injected
+ * is hosed, and will incorrectly convert an injected exception into a
+ * pending exception, which in turn may cause a spurious VM-Exit.
+ */
+ vcpu->arch.exception_from_userspace = events->exception.pending;
+
+ vcpu->arch.exception_vmexit.pending = false;
+
vcpu->arch.exception.injected = events->exception.injected;
vcpu->arch.exception.pending = events->exception.pending;
- vcpu->arch.exception.nr = events->exception.nr;
+ vcpu->arch.exception.vector = events->exception.nr;
vcpu->arch.exception.has_error_code = events->exception.has_error_code;
vcpu->arch.exception.error_code = events->exception.error_code;
vcpu->arch.exception.has_payload = events->exception_has_payload;
int handle_ud(struct kvm_vcpu *vcpu)
{
static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
+ int fep_flags = READ_ONCE(force_emulation_prefix);
int emul_type = EMULTYPE_TRAP_UD;
char sig[5]; /* ud2; .ascii "kvm" */
struct x86_exception e;
if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0)))
return 1;
- if (force_emulation_prefix &&
+ if (fep_flags &&
kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
sig, sizeof(sig), &e) == 0 &&
memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) {
+ if (fep_flags & KVM_FEP_CLEAR_RFLAGS_RF)
+ kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) & ~X86_EFLAGS_RF);
kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
emul_type = EMULTYPE_TRAP_UD_FORCED;
}
int r;
r = kvm_get_msr_with_filter(vcpu, msr_index, pdata);
+ if (r < 0)
+ return X86EMUL_UNHANDLEABLE;
- if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
- complete_emulated_rdmsr, r)) {
- /* Bounce to user space */
- return X86EMUL_IO_NEEDED;
+ if (r) {
+ if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
+ complete_emulated_rdmsr, r))
+ return X86EMUL_IO_NEEDED;
+
+ trace_kvm_msr_read_ex(msr_index);
+ return X86EMUL_PROPAGATE_FAULT;
}
- return r;
+ trace_kvm_msr_read(msr_index, *pdata);
+ return X86EMUL_CONTINUE;
}
static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt,
int r;
r = kvm_set_msr_with_filter(vcpu, msr_index, data);
+ if (r < 0)
+ return X86EMUL_UNHANDLEABLE;
- if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
- complete_emulated_msr_access, r)) {
- /* Bounce to user space */
- return X86EMUL_IO_NEEDED;
+ if (r) {
+ if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
+ complete_emulated_msr_access, r))
+ return X86EMUL_IO_NEEDED;
+
+ trace_kvm_msr_write_ex(msr_index, data);
+ return X86EMUL_PROPAGATE_FAULT;
}
- return r;
+ trace_kvm_msr_write(msr_index, data);
+ return X86EMUL_CONTINUE;
}
static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
}
}
- static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
+ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
{
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
- if (ctxt->exception.vector == PF_VECTOR)
- return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
- if (ctxt->exception.error_code_valid)
+ if (ctxt->exception.vector == PF_VECTOR)
+ kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
+ else if (ctxt->exception.error_code_valid)
kvm_queue_exception_e(vcpu, ctxt->exception.vector,
ctxt->exception.error_code);
else
kvm_queue_exception(vcpu, ctxt->exception.vector);
- return false;
}
static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
- static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int *r)
+ static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu)
{
+ u32 shadow;
+
+ if (kvm_get_rflags(vcpu) & X86_EFLAGS_RF)
+ return true;
+
+ /*
+ * Intel CPUs inhibit code #DBs when MOV/POP SS blocking is active,
+ * but AMD CPUs do not. MOV/POP SS blocking is rare, check that first
+ * to avoid the relatively expensive CPUID lookup.
+ */
+ shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+ return (shadow & KVM_X86_SHADOW_INT_MOV_SS) &&
+ guest_cpuid_is_intel(vcpu);
+ }
+
+ static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu,
+ int emulation_type, int *r)
+ {
+ WARN_ON_ONCE(emulation_type & EMULTYPE_NO_DECODE);
+
+ /*
+ * Do not check for code breakpoints if hardware has already done the
+ * checks, as inferred from the emulation type. On NO_DECODE and SKIP,
+ * the instruction has passed all exception checks, and all intercepted
+ * exceptions that trigger emulation have lower priority than code
+ * breakpoints, i.e. the fact that the intercepted exception occurred
+ * means any code breakpoints have already been serviced.
+ *
+ * Note, KVM needs to check for code #DBs on EMULTYPE_TRAP_UD_FORCED as
+ * hardware has checked the RIP of the magic prefix, but not the RIP of
+ * the instruction being emulated. The intent of forced emulation is
+ * to behave as if KVM intercepted the instruction without an exception
+ * and without a prefix.
+ */
+ if (emulation_type & (EMULTYPE_NO_DECODE | EMULTYPE_SKIP |
+ EMULTYPE_TRAP_UD | EMULTYPE_VMWARE_GP | EMULTYPE_PF))
+ return false;
+
if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
(vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
struct kvm_run *kvm_run = vcpu->run;
}
if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
- !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) {
+ !kvm_is_code_breakpoint_inhibited(vcpu)) {
unsigned long eip = kvm_get_linear_rip(vcpu);
u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
vcpu->arch.dr7,
* are fault-like and are higher priority than any faults on
* the code fetch itself.
*/
- if (!(emulation_type & EMULTYPE_SKIP) &&
- kvm_vcpu_check_code_breakpoint(vcpu, &r))
+ if (kvm_vcpu_check_code_breakpoint(vcpu, emulation_type, &r))
return r;
r = x86_decode_emulated_instruction(vcpu, emulation_type,
if (ctxt->have_exception) {
r = 1;
- if (inject_emulated_exception(vcpu))
- return r;
+ inject_emulated_exception(vcpu);
} else if (vcpu->arch.pio.count) {
if (!vcpu->arch.pio.in) {
/* FIXME: return into emulator if single-stepping. */
unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+
+ /*
+ * Note, EXCPT_DB is assumed to be fault-like as the emulator
+ * only supports code breakpoints and general detect #DB, both
+ * of which are fault-like.
+ */
if (!ctxt->have_exception ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
static void kvm_inject_exception(struct kvm_vcpu *vcpu)
{
- trace_kvm_inj_exception(vcpu->arch.exception.nr,
+ trace_kvm_inj_exception(vcpu->arch.exception.vector,
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code,
vcpu->arch.exception.injected);
if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
vcpu->arch.exception.error_code = false;
- static_call(kvm_x86_queue_exception)(vcpu);
+ static_call(kvm_x86_inject_exception)(vcpu);
}
- static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
+ /*
+ * Check for any event (interrupt or exception) that is ready to be injected,
+ * and if there is at least one event, inject the event with the highest
+ * priority. This handles both "pending" events, i.e. events that have never
+ * been injected into the guest, and "injected" events, i.e. events that were
+ * injected as part of a previous VM-Enter, but weren't successfully delivered
+ * and need to be re-injected.
+ *
+ * Note, this is not guaranteed to be invoked on a guest instruction boundary,
+ * i.e. doesn't guarantee that there's an event window in the guest. KVM must
+ * be able to inject exceptions in the "middle" of an instruction, and so must
+ * also be able to re-inject NMIs and IRQs in the middle of an instruction.
+ * I.e. for exceptions and re-injected events, NOT invoking this on instruction
+ * boundaries is necessary and correct.
+ *
+ * For simplicity, KVM uses a single path to inject all events (except events
+ * that are injected directly from L1 to L2) and doesn't explicitly track
+ * instruction boundaries for asynchronous events. However, because VM-Exits
+ * that can occur during instruction execution typically result in KVM skipping
+ * the instruction or injecting an exception, e.g. instruction and exception
+ * intercepts, and because pending exceptions have higher priority than pending
+ * interrupts, KVM still honors instruction boundaries in most scenarios.
+ *
+ * But, if a VM-Exit occurs during instruction execution, and KVM does NOT skip
+ * the instruction or inject an exception, then KVM can incorrecty inject a new
+ * asynchrounous event if the event became pending after the CPU fetched the
+ * instruction (in the guest). E.g. if a page fault (#PF, #NPF, EPT violation)
+ * occurs and is resolved by KVM, a coincident NMI, SMI, IRQ, etc... can be
+ * injected on the restarted instruction instead of being deferred until the
+ * instruction completes.
+ *
+ * In practice, this virtualization hole is unlikely to be observed by the
+ * guest, and even less likely to cause functional problems. To detect the
+ * hole, the guest would have to trigger an event on a side effect of an early
+ * phase of instruction execution, e.g. on the instruction fetch from memory.
+ * And for it to be a functional problem, the guest would need to depend on the
+ * ordering between that side effect, the instruction completing, _and_ the
+ * delivery of the asynchronous event.
+ */
+ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
+ bool *req_immediate_exit)
{
+ bool can_inject;
int r;
- bool can_inject = true;
- /* try to reinject previous events if any */
+ /*
+ * Process nested events first, as nested VM-Exit supercedes event
+ * re-injection. If there's an event queued for re-injection, it will
+ * be saved into the appropriate vmc{b,s}12 fields on nested VM-Exit.
+ */
+ if (is_guest_mode(vcpu))
+ r = kvm_check_nested_events(vcpu);
+ else
+ r = 0;
- if (vcpu->arch.exception.injected) {
- kvm_inject_exception(vcpu);
- can_inject = false;
- }
/*
- * Do not inject an NMI or interrupt if there is a pending
- * exception. Exceptions and interrupts are recognized at
- * instruction boundaries, i.e. the start of an instruction.
- * Trap-like exceptions, e.g. #DB, have higher priority than
- * NMIs and interrupts, i.e. traps are recognized before an
- * NMI/interrupt that's pending on the same instruction.
- * Fault-like exceptions, e.g. #GP and #PF, are the lowest
- * priority, but are only generated (pended) during instruction
- * execution, i.e. a pending fault-like exception means the
- * fault occurred on the *previous* instruction and must be
- * serviced prior to recognizing any new events in order to
- * fully complete the previous instruction.
+ * Re-inject exceptions and events *especially* if immediate entry+exit
+ * to/from L2 is needed, as any event that has already been injected
+ * into L2 needs to complete its lifecycle before injecting a new event.
+ *
+ * Don't re-inject an NMI or interrupt if there is a pending exception.
+ * This collision arises if an exception occurred while vectoring the
+ * injected event, KVM intercepted said exception, and KVM ultimately
+ * determined the fault belongs to the guest and queues the exception
+ * for injection back into the guest.
+ *
+ * "Injected" interrupts can also collide with pending exceptions if
+ * userspace ignores the "ready for injection" flag and blindly queues
+ * an interrupt. In that case, prioritizing the exception is correct,
+ * as the exception "occurred" before the exit to userspace. Trap-like
+ * exceptions, e.g. most #DBs, have higher priority than interrupts.
+ * And while fault-like exceptions, e.g. #GP and #PF, are the lowest
+ * priority, they're only generated (pended) during instruction
+ * execution, and interrupts are recognized at instruction boundaries.
+ * Thus a pending fault-like exception means the fault occurred on the
+ * *previous* instruction and must be serviced prior to recognizing any
+ * new events in order to fully complete the previous instruction.
*/
- else if (!vcpu->arch.exception.pending) {
- if (vcpu->arch.nmi_injected) {
- static_call(kvm_x86_inject_nmi)(vcpu);
- can_inject = false;
- } else if (vcpu->arch.interrupt.injected) {
- static_call(kvm_x86_inject_irq)(vcpu, true);
- can_inject = false;
- }
- }
+ if (vcpu->arch.exception.injected)
+ kvm_inject_exception(vcpu);
+ else if (kvm_is_exception_pending(vcpu))
+ ; /* see above */
+ else if (vcpu->arch.nmi_injected)
+ static_call(kvm_x86_inject_nmi)(vcpu);
+ else if (vcpu->arch.interrupt.injected)
+ static_call(kvm_x86_inject_irq)(vcpu, true);
+ /*
+ * Exceptions that morph to VM-Exits are handled above, and pending
+ * exceptions on top of injected exceptions that do not VM-Exit should
+ * either morph to #DF or, sadly, override the injected exception.
+ */
WARN_ON_ONCE(vcpu->arch.exception.injected &&
vcpu->arch.exception.pending);
/*
- * Call check_nested_events() even if we reinjected a previous event
- * in order for caller to determine if it should require immediate-exit
- * from L2 to L1 due to pending L1 events which require exit
- * from L2 to L1.
+ * Bail if immediate entry+exit to/from the guest is needed to complete
+ * nested VM-Enter or event re-injection so that a different pending
+ * event can be serviced (or if KVM needs to exit to userspace).
+ *
+ * Otherwise, continue processing events even if VM-Exit occurred. The
+ * VM-Exit will have cleared exceptions that were meant for L2, but
+ * there may now be events that can be injected into L1.
*/
- if (is_guest_mode(vcpu)) {
- r = kvm_check_nested_events(vcpu);
- if (r < 0)
- goto out;
- }
+ if (r < 0)
+ goto out;
+
+ /*
+ * A pending exception VM-Exit should either result in nested VM-Exit
+ * or force an immediate re-entry and exit to/from L2, and exception
+ * VM-Exits cannot be injected (flag should _never_ be set).
+ */
+ WARN_ON_ONCE(vcpu->arch.exception_vmexit.injected ||
+ vcpu->arch.exception_vmexit.pending);
+
+ /*
+ * New events, other than exceptions, cannot be injected if KVM needs
+ * to re-inject a previous event. See above comments on re-injecting
+ * for why pending exceptions get priority.
+ */
+ can_inject = !kvm_event_needs_reinjection(vcpu);
- /* try to inject new event if pending */
if (vcpu->arch.exception.pending) {
- if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
+ /*
+ * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS
+ * value pushed on the stack. Trap-like exception and all #DBs
+ * leave RF as-is (KVM follows Intel's behavior in this regard;
+ * AMD states that code breakpoint #DBs excplitly clear RF=0).
+ *
+ * Note, most versions of Intel's SDM and AMD's APM incorrectly
+ * describe the behavior of General Detect #DBs, which are
+ * fault-like. They do _not_ set RF, a la code breakpoints.
+ */
+ if (exception_type(vcpu->arch.exception.vector) == EXCPT_FAULT)
__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
X86_EFLAGS_RF);
- if (vcpu->arch.exception.nr == DB_VECTOR) {
- kvm_deliver_exception_payload(vcpu);
+ if (vcpu->arch.exception.vector == DB_VECTOR) {
+ kvm_deliver_exception_payload(vcpu, &vcpu->arch.exception);
if (vcpu->arch.dr7 & DR7_GD) {
vcpu->arch.dr7 &= ~DR7_GD;
kvm_update_dr7(vcpu);
}
if (is_guest_mode(vcpu) &&
- kvm_x86_ops.nested_ops->hv_timer_pending &&
- kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
+ kvm_x86_ops.nested_ops->has_events &&
+ kvm_x86_ops.nested_ops->has_events(vcpu))
*req_immediate_exit = true;
- WARN_ON(vcpu->arch.exception.pending);
+ WARN_ON(kvm_is_exception_pending(vcpu));
return 0;
out:
* When APICv gets disabled, we may still have injected interrupts
* pending. At the same time, KVM_REQ_EVENT may not be set as APICv was
* still active when the interrupt got accepted. Make sure
- * inject_pending_event() is called to check for that.
+ * kvm_check_and_inject_events() is called to check for that.
*/
if (!apic->apicv_active)
kvm_make_request(KVM_REQ_EVENT, vcpu);
goto out;
}
- r = inject_pending_event(vcpu, &req_immediate_exit);
+ r = kvm_check_and_inject_events(vcpu, &req_immediate_exit);
if (r < 0) {
r = 0;
goto out;
if (hv_timer)
kvm_lapic_switch_to_hv_timer(vcpu);
- if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
+ /*
+ * If the vCPU is not runnable, a signal or another host event
+ * of some kind is pending; service it without changing the
+ * vCPU's activity state.
+ */
+ if (!kvm_arch_vcpu_runnable(vcpu))
return 1;
}
+ /*
+ * Evaluate nested events before exiting the halted state. This allows
+ * the halt state to be recorded properly in the VMCS12's activity
+ * state field (AMD does not have a similar field and a VM-Exit always
+ * causes a spurious wakeup from HLT).
+ */
+ if (is_guest_mode(vcpu)) {
+ if (kvm_check_nested_events(vcpu) < 0)
+ return 0;
+ }
+
if (kvm_apic_accept_events(vcpu) < 0)
return 0;
switch(vcpu->arch.mp_state) {
case KVM_MP_STATE_INIT_RECEIVED:
break;
default:
- return -EINTR;
+ WARN_ON_ONCE(1);
+ break;
}
return 1;
}
static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
{
- if (is_guest_mode(vcpu))
- kvm_check_nested_events(vcpu);
-
return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted);
}
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
+ struct kvm_queued_exception *ex = &vcpu->arch.exception;
struct kvm_run *kvm_run = vcpu->run;
int r;
r = 0;
goto out;
}
- kvm_clear_request(KVM_REQ_UNHALT, vcpu);
r = -EAGAIN;
if (signal_pending(current)) {
r = -EINTR;
}
}
+ /*
+ * If userspace set a pending exception and L2 is active, convert it to
+ * a pending VM-Exit if L1 wants to intercept the exception.
+ */
+ if (vcpu->arch.exception_from_userspace && is_guest_mode(vcpu) &&
+ kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, ex->vector,
+ ex->error_code)) {
+ kvm_queue_exception_vmexit(vcpu, ex->vector,
+ ex->has_error_code, ex->error_code,
+ ex->has_payload, ex->payload);
+ ex->injected = false;
+ ex->pending = false;
+ }
+ vcpu->arch.exception_from_userspace = false;
+
if (unlikely(vcpu->arch.complete_userspace_io)) {
int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
vcpu->arch.complete_userspace_io = NULL;
kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
vcpu->arch.exception.pending = false;
+ vcpu->arch.exception_vmexit.pending = false;
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
vcpu_load(vcpu);
- if (!lapic_in_kernel(vcpu) &&
- mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
+ switch (mp_state->mp_state) {
+ case KVM_MP_STATE_UNINITIALIZED:
+ case KVM_MP_STATE_HALTED:
+ case KVM_MP_STATE_AP_RESET_HOLD:
+ case KVM_MP_STATE_INIT_RECEIVED:
+ case KVM_MP_STATE_SIPI_RECEIVED:
+ if (!lapic_in_kernel(vcpu))
+ goto out;
+ break;
+
+ case KVM_MP_STATE_RUNNABLE:
+ break;
+
+ default:
goto out;
+ }
/*
- * KVM_MP_STATE_INIT_RECEIVED means the processor is in
- * INIT state; latched init should be reported using
- * KVM_SET_VCPU_EVENTS, so reject it here.
+ * Pending INITs are reported using KVM_SET_VCPU_EVENTS, disallow
+ * forcing the guest into INIT/SIPI if those events are supposed to be
+ * blocked. KVM prioritizes SMI over INIT, so reject INIT/SIPI state
+ * if an SMI is pending as well.
*/
- if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) &&
+ if ((!kvm_apic_init_sipi_allowed(vcpu) || vcpu->arch.smi_pending) &&
(mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
goto out;
if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
r = -EBUSY;
- if (vcpu->arch.exception.pending)
+ if (kvm_is_exception_pending(vcpu))
goto out;
if (dbg->control & KVM_GUESTDBG_INJECT_DB)
kvm_queue_exception(vcpu, DB_VECTOR);
vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64),
GFP_KERNEL_ACCOUNT);
if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks)
- goto fail_free_pio_data;
+ goto fail_free_mce_banks;
vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
fail_free_mce_banks:
kfree(vcpu->arch.mce_banks);
kfree(vcpu->arch.mci_ctl2_banks);
-fail_free_pio_data:
free_page((unsigned long)vcpu->arch.pio_data);
fail_free_lapic:
kvm_free_lapic(vcpu);
struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
/*
- * To avoid have the INIT path from kvm_apic_has_events() that be
- * called with loaded FPU and does not let userspace fix the state.
+ * All paths that lead to INIT are required to load the guest's
+ * FPU state (because most paths are buried in KVM_RUN).
*/
if (init_event)
kvm_put_guest_fpu(vcpu);
if (ret)
goto out_page_track;
+ ret = static_call(kvm_x86_vm_init)(kvm);
+ if (ret)
+ goto out_uninit_mmu;
+
INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
atomic_set(&kvm->arch.noncoherent_dma_count, 0);
kvm_hv_init_vm(kvm);
kvm_xen_init_vm(kvm);
- return static_call(kvm_x86_vm_init)(kvm);
+ return 0;
+ out_uninit_mmu:
+ kvm_mmu_uninit_vm(kvm);
out_page_track:
kvm_page_track_cleanup(kvm);
out:
if (!list_empty_careful(&vcpu->async_pf.done))
return true;
- if (kvm_apic_has_events(vcpu))
+ if (kvm_apic_has_pending_init_or_sipi(vcpu) &&
+ kvm_apic_init_sipi_allowed(vcpu))
return true;
if (vcpu->arch.pv.pv_unhalted)
return true;
- if (vcpu->arch.exception.pending)
+ if (kvm_is_exception_pending(vcpu))
return true;
if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
return true;
if (is_guest_mode(vcpu) &&
- kvm_x86_ops.nested_ops->hv_timer_pending &&
- kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
+ kvm_x86_ops.nested_ops->has_events &&
+ kvm_x86_ops.nested_ops->has_events(vcpu))
return true;
if (kvm_xen_has_pending_events(vcpu))
return true;
- if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu))
- return true;
-
return false;
}
{
if (unlikely(!lapic_in_kernel(vcpu) ||
kvm_event_needs_reinjection(vcpu) ||
- vcpu->arch.exception.pending))
+ kvm_is_exception_pending(vcpu)))
return false;
if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
- EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
EXPORT_SYMBOL_GPL(fs_reclaim_release);
#endif
+/*
+ * Zonelists may change due to hotplug during allocation. Detect when zonelists
+ * have been rebuilt so allocation retries. Reader side does not lock and
+ * retries the allocation if zonelist changes. Writer side is protected by the
+ * embedded spin_lock.
+ */
+static DEFINE_SEQLOCK(zonelist_update_seq);
+
+static unsigned int zonelist_iter_begin(void)
+{
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
+ return read_seqbegin(&zonelist_update_seq);
+
+ return 0;
+}
+
+static unsigned int check_retry_zonelist(unsigned int seq)
+{
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
+ return read_seqretry(&zonelist_update_seq, seq);
+
+ return seq;
+}
+
/* Perform direct synchronous page reclaim */
static unsigned long
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
int compaction_retries;
int no_progress_loops;
unsigned int cpuset_mems_cookie;
+ unsigned int zonelist_iter_cookie;
int reserve_flags;
/*
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
-retry_cpuset:
+restart:
compaction_retries = 0;
no_progress_loops = 0;
compact_priority = DEF_COMPACT_PRIORITY;
cpuset_mems_cookie = read_mems_allowed_begin();
+ zonelist_iter_cookie = zonelist_iter_begin();
/*
* The fast path uses conservative alloc_flags to succeed only until
goto retry;
- /* Deal with possible cpuset update races before we start OOM killing */
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
- goto retry_cpuset;
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * a unnecessary OOM kill.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
/* Reclaim has failed us, start killing things */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
}
nopage:
- /* Deal with possible cpuset update races before we fail */
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
- goto retry_cpuset;
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * a unnecessary OOM kill.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
/*
* Make sure that __GFP_NOFAIL request doesn't leak out and make sure
/* reset page count bias and offset to start of new frag */
nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
offset = size - fragsz;
+ if (unlikely(offset < 0)) {
+ /*
+ * The caller is trying to allocate a fragment
+ * with fragsz > PAGE_SIZE but the cache isn't big
+ * enough to satisfy the request, this may
+ * happen in low memory conditions.
+ * We don't release the cache page because
+ * it could make memory pressure worse
+ * so we simply return NULL here.
+ */
+ return NULL;
+ }
}
nc->pagecnt_bias--;
" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
" unevictable:%lu dirty:%lu writeback:%lu\n"
" slab_reclaimable:%lu slab_unreclaimable:%lu\n"
- " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
+ " mapped:%lu shmem:%lu pagetables:%lu\n"
+ " sec_pagetables:%lu bounce:%lu\n"
" kernel_misc_reclaimable:%lu\n"
" free:%lu free_pcp:%lu free_cma:%lu\n",
global_node_page_state(NR_ACTIVE_ANON),
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM),
global_node_page_state(NR_PAGETABLE),
+ global_node_page_state(NR_SECONDARY_PAGETABLE),
global_zone_page_state(NR_BOUNCE),
global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
global_zone_page_state(NR_FREE_PAGES),
" shadow_call_stack:%lukB"
#endif
" pagetables:%lukB"
+ " sec_pagetables:%lukB"
" all_unreclaimable? %s"
"\n",
pgdat->node_id,
node_page_state(pgdat, NR_KERNEL_SCS_KB),
#endif
K(node_page_state(pgdat, NR_PAGETABLE)),
+ K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
"yes" : "no");
}
int nid;
int __maybe_unused cpu;
pg_data_t *self = data;
- static DEFINE_SPINLOCK(lock);
- spin_lock(&lock);
+ write_seqlock(&zonelist_update_seq);
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
#endif
}
- spin_unlock(&lock);
+ write_sequnlock(&zonelist_update_seq);
}
static noinline void __init
#define TEXT_FOR_HIGHMEM(xx)
#endif
+#ifdef CONFIG_ZONE_DEVICE
+#define TEXT_FOR_DEVICE(xx) xx "_device",
+#else
+#define TEXT_FOR_DEVICE(xx)
+#endif
+
#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
- TEXT_FOR_HIGHMEM(xx) xx "_movable",
+ TEXT_FOR_HIGHMEM(xx) xx "_movable", \
+ TEXT_FOR_DEVICE(xx)
const char * const vmstat_text[] = {
/* enum zone_stat_item counters */
"nr_shadow_call_stack",
#endif
"nr_page_table_pages",
+ "nr_sec_page_table_pages",
#ifdef CONFIG_SWAP
"nr_swapcached",
#endif
LIBKVM += lib/sparsebit.c
LIBKVM += lib/test_util.c
+LIBKVM_STRING += lib/string_override.c
+
LIBKVM_x86_64 += lib/x86_64/apic.c
LIBKVM_x86_64 += lib/x86_64/handlers.S
LIBKVM_x86_64 += lib/x86_64/perf_test_util.c
TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test
TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
TEST_GEN_PROGS_x86_64 += x86_64/monitor_mwait_test
+ TEST_GEN_PROGS_x86_64 += x86_64/nested_exceptions_test
TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test
TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id
LIBKVM_S := $(filter %.S,$(LIBKVM))
LIBKVM_C_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_C))
LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S))
-LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ)
+LIBKVM_STRING_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_STRING))
+LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(LIBKVM_STRING_OBJ)
EXTRA_CLEAN += $(LIBKVM_OBJS) cscope.*
$(LIBKVM_S_OBJ): $(OUTPUT)/%.o: %.S
$(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@
+# Compile the string overrides as freestanding to prevent the compiler from
+# generating self-referential code, e.g. without "freestanding" the compiler may
+# "optimize" memcmp() by invoking memcmp(), thus causing infinite recursion.
+$(LIBKVM_STRING_OBJ): $(OUTPUT)/%.o: %.c
+ $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c -ffreestanding $< -o $@
+
x := $(shell mkdir -p $(sort $(dir $(TEST_GEN_PROGS))))
$(TEST_GEN_PROGS): $(LIBKVM_OBJS)
$(TEST_GEN_PROGS_EXTENDED): $(LIBKVM_OBJS)
#ifndef SELFTEST_KVM_VMX_H
#define SELFTEST_KVM_VMX_H
+ #include <asm/vmx.h>
+
#include <stdint.h>
#include "processor.h"
#include "apic.h"
#define VMX_EPT_VPID_CAP_AD_BITS 0x00200000
#define EXIT_REASON_FAILED_VMENTRY 0x80000000
- #define EXIT_REASON_EXCEPTION_NMI 0
- #define EXIT_REASON_EXTERNAL_INTERRUPT 1
- #define EXIT_REASON_TRIPLE_FAULT 2
- #define EXIT_REASON_INTERRUPT_WINDOW 7
- #define EXIT_REASON_NMI_WINDOW 8
- #define EXIT_REASON_TASK_SWITCH 9
- #define EXIT_REASON_CPUID 10
- #define EXIT_REASON_HLT 12
- #define EXIT_REASON_INVD 13
- #define EXIT_REASON_INVLPG 14
- #define EXIT_REASON_RDPMC 15
- #define EXIT_REASON_RDTSC 16
- #define EXIT_REASON_VMCALL 18
- #define EXIT_REASON_VMCLEAR 19
- #define EXIT_REASON_VMLAUNCH 20
- #define EXIT_REASON_VMPTRLD 21
- #define EXIT_REASON_VMPTRST 22
- #define EXIT_REASON_VMREAD 23
- #define EXIT_REASON_VMRESUME 24
- #define EXIT_REASON_VMWRITE 25
- #define EXIT_REASON_VMOFF 26
- #define EXIT_REASON_VMON 27
- #define EXIT_REASON_CR_ACCESS 28
- #define EXIT_REASON_DR_ACCESS 29
- #define EXIT_REASON_IO_INSTRUCTION 30
- #define EXIT_REASON_MSR_READ 31
- #define EXIT_REASON_MSR_WRITE 32
- #define EXIT_REASON_INVALID_STATE 33
- #define EXIT_REASON_MWAIT_INSTRUCTION 36
- #define EXIT_REASON_MONITOR_INSTRUCTION 39
- #define EXIT_REASON_PAUSE_INSTRUCTION 40
- #define EXIT_REASON_MCE_DURING_VMENTRY 41
- #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
- #define EXIT_REASON_APIC_ACCESS 44
- #define EXIT_REASON_EOI_INDUCED 45
- #define EXIT_REASON_EPT_VIOLATION 48
- #define EXIT_REASON_EPT_MISCONFIG 49
- #define EXIT_REASON_INVEPT 50
- #define EXIT_REASON_RDTSCP 51
- #define EXIT_REASON_PREEMPTION_TIMER 52
- #define EXIT_REASON_INVVPID 53
- #define EXIT_REASON_WBINVD 54
- #define EXIT_REASON_XSETBV 55
- #define EXIT_REASON_APIC_WRITE 56
- #define EXIT_REASON_INVPCID 58
- #define EXIT_REASON_PML_FULL 62
- #define EXIT_REASON_XSAVES 63
- #define EXIT_REASON_XRSTORS 64
- #define LAST_EXIT_REASON 64
enum vmcs_field {
VIRTUAL_PROCESSOR_ID = 0x00000000,
VMWRITE_BITMAP_HIGH = 0x00002029,
XSS_EXIT_BITMAP = 0x0000202C,
XSS_EXIT_BITMAP_HIGH = 0x0000202D,
+ ENCLS_EXITING_BITMAP = 0x0000202E,
+ ENCLS_EXITING_BITMAP_HIGH = 0x0000202F,
TSC_MULTIPLIER = 0x00002032,
TSC_MULTIPLIER_HIGH = 0x00002033,
GUEST_PHYSICAL_ADDRESS = 0x00002400,
uint32_t memslot);
void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm,
uint64_t addr, uint64_t size);
+bool kvm_vm_has_ept(struct kvm_vm *vm);
void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm,
uint32_t eptp_memslot);
void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm);