will allow the transition to secure guest mode. Otherwise KVM will
veto the transition.
+7.20 KVM_CAP_HALT_POLL
+----------------------
+
+:Architectures: all
+:Target: VM
+:Parameters: args[0] is the maximum poll time in nanoseconds
+:Returns: 0 on success; -1 on error
+
+This capability overrides the kvm module parameter halt_poll_ns for the
+target VM.
+
+VCPU polling allows a VCPU to poll for wakeup events instead of immediately
+scheduling during guest halts. The maximum time a VCPU can spend polling is
+controlled by the kvm module parameter halt_poll_ns. This capability allows
+the maximum halt time to specified on a per-VM basis, effectively overriding
+the module parameter for the target VM.
+
8. Other capabilities.
======================
natural_width cr4_guest_host_mask;
natural_width cr0_read_shadow;
natural_width cr4_read_shadow;
- natural_width cr3_target_value0;
- natural_width cr3_target_value1;
- natural_width cr3_target_value2;
- natural_width cr3_target_value3;
+ natural_width dead_space[4]; /* Last remnants of cr3_target_value[0-3]. */
natural_width exit_qualification;
natural_width guest_linear_address;
natural_width guest_cr0;
#include "trace.h"
-#define VM_STAT(x) { #x, offsetof(struct kvm, stat.x), KVM_STAT_VM }
-#define VCPU_STAT(x) { #x, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU }
-
struct kvm_stats_debugfs_item debugfs_entries[] = {
- VCPU_STAT(halt_successful_poll),
- VCPU_STAT(halt_attempted_poll),
- VCPU_STAT(halt_poll_invalid),
- VCPU_STAT(halt_wakeup),
- VCPU_STAT(hvc_exit_stat),
- VCPU_STAT(wfe_exit_stat),
- VCPU_STAT(wfi_exit_stat),
- VCPU_STAT(mmio_exit_user),
- VCPU_STAT(mmio_exit_kernel),
- VCPU_STAT(exits),
+ VCPU_STAT("halt_successful_poll", halt_successful_poll),
+ VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+ VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+ VCPU_STAT("halt_wakeup", halt_wakeup),
+ VCPU_STAT("hvc_exit_stat", hvc_exit_stat),
+ VCPU_STAT("wfe_exit_stat", wfe_exit_stat),
+ VCPU_STAT("wfi_exit_stat", wfi_exit_stat),
+ VCPU_STAT("mmio_exit_user", mmio_exit_user),
+ VCPU_STAT("mmio_exit_kernel", mmio_exit_kernel),
+ VCPU_STAT("exits", exits),
{ NULL }
};
#define VECTORSPACING 0x100 /* for EI/VI mode */
#endif
-#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x)
struct kvm_stats_debugfs_item debugfs_entries[] = {
- { "wait", VCPU_STAT(wait_exits), KVM_STAT_VCPU },
- { "cache", VCPU_STAT(cache_exits), KVM_STAT_VCPU },
- { "signal", VCPU_STAT(signal_exits), KVM_STAT_VCPU },
- { "interrupt", VCPU_STAT(int_exits), KVM_STAT_VCPU },
- { "cop_unusable", VCPU_STAT(cop_unusable_exits), KVM_STAT_VCPU },
- { "tlbmod", VCPU_STAT(tlbmod_exits), KVM_STAT_VCPU },
- { "tlbmiss_ld", VCPU_STAT(tlbmiss_ld_exits), KVM_STAT_VCPU },
- { "tlbmiss_st", VCPU_STAT(tlbmiss_st_exits), KVM_STAT_VCPU },
- { "addrerr_st", VCPU_STAT(addrerr_st_exits), KVM_STAT_VCPU },
- { "addrerr_ld", VCPU_STAT(addrerr_ld_exits), KVM_STAT_VCPU },
- { "syscall", VCPU_STAT(syscall_exits), KVM_STAT_VCPU },
- { "resvd_inst", VCPU_STAT(resvd_inst_exits), KVM_STAT_VCPU },
- { "break_inst", VCPU_STAT(break_inst_exits), KVM_STAT_VCPU },
- { "trap_inst", VCPU_STAT(trap_inst_exits), KVM_STAT_VCPU },
- { "msa_fpe", VCPU_STAT(msa_fpe_exits), KVM_STAT_VCPU },
- { "fpe", VCPU_STAT(fpe_exits), KVM_STAT_VCPU },
- { "msa_disabled", VCPU_STAT(msa_disabled_exits), KVM_STAT_VCPU },
- { "flush_dcache", VCPU_STAT(flush_dcache_exits), KVM_STAT_VCPU },
+ VCPU_STAT("wait", wait_exits),
+ VCPU_STAT("cache", cache_exits),
+ VCPU_STAT("signal", signal_exits),
+ VCPU_STAT("interrupt", int_exits),
+ VCPU_STAT("cop_unusable", cop_unusable_exits),
+ VCPU_STAT("tlbmod", tlbmod_exits),
+ VCPU_STAT("tlbmiss_ld", tlbmiss_ld_exits),
+ VCPU_STAT("tlbmiss_st", tlbmiss_st_exits),
+ VCPU_STAT("addrerr_st", addrerr_st_exits),
+ VCPU_STAT("addrerr_ld", addrerr_ld_exits),
+ VCPU_STAT("syscall", syscall_exits),
+ VCPU_STAT("resvd_inst", resvd_inst_exits),
+ VCPU_STAT("break_inst", break_inst_exits),
+ VCPU_STAT("trap_inst", trap_inst_exits),
+ VCPU_STAT("msa_fpe", msa_fpe_exits),
+ VCPU_STAT("fpe", fpe_exits),
+ VCPU_STAT("msa_disabled", msa_disabled_exits),
+ VCPU_STAT("flush_dcache", flush_dcache_exits),
#ifdef CONFIG_KVM_MIPS_VZ
- { "vz_gpsi", VCPU_STAT(vz_gpsi_exits), KVM_STAT_VCPU },
- { "vz_gsfc", VCPU_STAT(vz_gsfc_exits), KVM_STAT_VCPU },
- { "vz_hc", VCPU_STAT(vz_hc_exits), KVM_STAT_VCPU },
- { "vz_grr", VCPU_STAT(vz_grr_exits), KVM_STAT_VCPU },
- { "vz_gva", VCPU_STAT(vz_gva_exits), KVM_STAT_VCPU },
- { "vz_ghfc", VCPU_STAT(vz_ghfc_exits), KVM_STAT_VCPU },
- { "vz_gpa", VCPU_STAT(vz_gpa_exits), KVM_STAT_VCPU },
- { "vz_resvd", VCPU_STAT(vz_resvd_exits), KVM_STAT_VCPU },
+ VCPU_STAT("vz_gpsi", vz_gpsi_exits),
+ VCPU_STAT("vz_gsfc", vz_gsfc_exits),
+ VCPU_STAT("vz_hc", vz_hc_exits),
+ VCPU_STAT("vz_grr", vz_grr_exits),
+ VCPU_STAT("vz_gva", vz_gva_exits),
+ VCPU_STAT("vz_ghfc", vz_ghfc_exits),
+ VCPU_STAT("vz_gpa", vz_gpa_exits),
+ VCPU_STAT("vz_resvd", vz_resvd_exits),
#endif
- { "halt_successful_poll", VCPU_STAT(halt_successful_poll), KVM_STAT_VCPU },
- { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), KVM_STAT_VCPU },
- { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid), KVM_STAT_VCPU },
- { "halt_wakeup", VCPU_STAT(halt_wakeup), KVM_STAT_VCPU },
+ VCPU_STAT("halt_successful_poll", halt_successful_poll),
+ VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+ VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+ VCPU_STAT("halt_wakeup", halt_wakeup),
{NULL}
};
return -ENOIOCTLCMD;
}
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
+ struct kvm_run *run = vcpu->run;
int r = -EINTR;
vcpu_load(vcpu);
#include "book3s.h"
#include "trace.h"
-#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
-#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
-
/* #define EXIT_DEBUG */
struct kvm_stats_debugfs_item debugfs_entries[] = {
- { "exits", VCPU_STAT(sum_exits) },
- { "mmio", VCPU_STAT(mmio_exits) },
- { "sig", VCPU_STAT(signal_exits) },
- { "sysc", VCPU_STAT(syscall_exits) },
- { "inst_emu", VCPU_STAT(emulated_inst_exits) },
- { "dec", VCPU_STAT(dec_exits) },
- { "ext_intr", VCPU_STAT(ext_intr_exits) },
- { "queue_intr", VCPU_STAT(queue_intr) },
- { "halt_poll_success_ns", VCPU_STAT(halt_poll_success_ns) },
- { "halt_poll_fail_ns", VCPU_STAT(halt_poll_fail_ns) },
- { "halt_wait_ns", VCPU_STAT(halt_wait_ns) },
- { "halt_successful_poll", VCPU_STAT(halt_successful_poll), },
- { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), },
- { "halt_successful_wait", VCPU_STAT(halt_successful_wait) },
- { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
- { "halt_wakeup", VCPU_STAT(halt_wakeup) },
- { "pf_storage", VCPU_STAT(pf_storage) },
- { "sp_storage", VCPU_STAT(sp_storage) },
- { "pf_instruc", VCPU_STAT(pf_instruc) },
- { "sp_instruc", VCPU_STAT(sp_instruc) },
- { "ld", VCPU_STAT(ld) },
- { "ld_slow", VCPU_STAT(ld_slow) },
- { "st", VCPU_STAT(st) },
- { "st_slow", VCPU_STAT(st_slow) },
- { "pthru_all", VCPU_STAT(pthru_all) },
- { "pthru_host", VCPU_STAT(pthru_host) },
- { "pthru_bad_aff", VCPU_STAT(pthru_bad_aff) },
- { "largepages_2M", VM_STAT(num_2M_pages, .mode = 0444) },
- { "largepages_1G", VM_STAT(num_1G_pages, .mode = 0444) },
+ VCPU_STAT("exits", sum_exits),
+ VCPU_STAT("mmio", mmio_exits),
+ VCPU_STAT("sig", signal_exits),
+ VCPU_STAT("sysc", syscall_exits),
+ VCPU_STAT("inst_emu", emulated_inst_exits),
+ VCPU_STAT("dec", dec_exits),
+ VCPU_STAT("ext_intr", ext_intr_exits),
+ VCPU_STAT("queue_intr", queue_intr),
+ VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
+ VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+ VCPU_STAT("halt_wait_ns", halt_wait_ns),
+ VCPU_STAT("halt_successful_poll", halt_successful_poll),
+ VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+ VCPU_STAT("halt_successful_wait", halt_successful_wait),
+ VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+ VCPU_STAT("halt_wakeup", halt_wakeup),
+ VCPU_STAT("pf_storage", pf_storage),
+ VCPU_STAT("sp_storage", sp_storage),
+ VCPU_STAT("pf_instruc", pf_instruc),
+ VCPU_STAT("sp_instruc", sp_instruc),
+ VCPU_STAT("ld", ld),
+ VCPU_STAT("ld_slow", ld_slow),
+ VCPU_STAT("st", st),
+ VCPU_STAT("st_slow", st_slow),
+ VCPU_STAT("pthru_all", pthru_all),
+ VCPU_STAT("pthru_host", pthru_host),
+ VCPU_STAT("pthru_bad_aff", pthru_bad_aff),
+ VM_STAT("largepages_2M", num_2M_pages, .mode = 0444),
+ VM_STAT("largepages_1G", num_1G_pages, .mode = 0444),
{ NULL }
};
unsigned long kvmppc_booke_handlers;
-#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
-#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
-
struct kvm_stats_debugfs_item debugfs_entries[] = {
- { "mmio", VCPU_STAT(mmio_exits) },
- { "sig", VCPU_STAT(signal_exits) },
- { "itlb_r", VCPU_STAT(itlb_real_miss_exits) },
- { "itlb_v", VCPU_STAT(itlb_virt_miss_exits) },
- { "dtlb_r", VCPU_STAT(dtlb_real_miss_exits) },
- { "dtlb_v", VCPU_STAT(dtlb_virt_miss_exits) },
- { "sysc", VCPU_STAT(syscall_exits) },
- { "isi", VCPU_STAT(isi_exits) },
- { "dsi", VCPU_STAT(dsi_exits) },
- { "inst_emu", VCPU_STAT(emulated_inst_exits) },
- { "dec", VCPU_STAT(dec_exits) },
- { "ext_intr", VCPU_STAT(ext_intr_exits) },
- { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
- { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
- { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
- { "halt_wakeup", VCPU_STAT(halt_wakeup) },
- { "doorbell", VCPU_STAT(dbell_exits) },
- { "guest doorbell", VCPU_STAT(gdbell_exits) },
- { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
+ VCPU_STAT("mmio", mmio_exits),
+ VCPU_STAT("sig", signal_exits),
+ VCPU_STAT("itlb_r", itlb_real_miss_exits),
+ VCPU_STAT("itlb_v", itlb_virt_miss_exits),
+ VCPU_STAT("dtlb_r", dtlb_real_miss_exits),
+ VCPU_STAT("dtlb_v", dtlb_virt_miss_exits),
+ VCPU_STAT("sysc", syscall_exits),
+ VCPU_STAT("isi", isi_exits),
+ VCPU_STAT("dsi", dsi_exits),
+ VCPU_STAT("inst_emu", emulated_inst_exits),
+ VCPU_STAT("dec", dec_exits),
+ VCPU_STAT("ext_intr", ext_intr_exits),
+ VCPU_STAT("halt_successful_poll", halt_successful_poll),
+ VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+ VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+ VCPU_STAT("halt_wakeup", halt_wakeup),
+ VCPU_STAT("doorbell", dbell_exits),
+ VCPU_STAT("guest doorbell", gdbell_exits),
+ VM_STAT("remote_tlb_flush", remote_tlb_flush),
{ NULL }
};
return r;
}
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
+ struct kvm_run *run = vcpu->run;
int r;
vcpu_load(vcpu);
#define VCPU_IRQS_MAX_BUF (sizeof(struct kvm_s390_irq) * \
(KVM_MAX_VCPUS + LOCAL_IRQS))
-#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
-#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
-
struct kvm_stats_debugfs_item debugfs_entries[] = {
- { "userspace_handled", VCPU_STAT(exit_userspace) },
- { "exit_null", VCPU_STAT(exit_null) },
- { "exit_validity", VCPU_STAT(exit_validity) },
- { "exit_stop_request", VCPU_STAT(exit_stop_request) },
- { "exit_external_request", VCPU_STAT(exit_external_request) },
- { "exit_io_request", VCPU_STAT(exit_io_request) },
- { "exit_external_interrupt", VCPU_STAT(exit_external_interrupt) },
- { "exit_instruction", VCPU_STAT(exit_instruction) },
- { "exit_pei", VCPU_STAT(exit_pei) },
- { "exit_program_interruption", VCPU_STAT(exit_program_interruption) },
- { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) },
- { "exit_operation_exception", VCPU_STAT(exit_operation_exception) },
- { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
- { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
- { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
- { "halt_no_poll_steal", VCPU_STAT(halt_no_poll_steal) },
- { "halt_wakeup", VCPU_STAT(halt_wakeup) },
- { "instruction_lctlg", VCPU_STAT(instruction_lctlg) },
- { "instruction_lctl", VCPU_STAT(instruction_lctl) },
- { "instruction_stctl", VCPU_STAT(instruction_stctl) },
- { "instruction_stctg", VCPU_STAT(instruction_stctg) },
- { "deliver_ckc", VCPU_STAT(deliver_ckc) },
- { "deliver_cputm", VCPU_STAT(deliver_cputm) },
- { "deliver_emergency_signal", VCPU_STAT(deliver_emergency_signal) },
- { "deliver_external_call", VCPU_STAT(deliver_external_call) },
- { "deliver_service_signal", VCPU_STAT(deliver_service_signal) },
- { "deliver_virtio", VCPU_STAT(deliver_virtio) },
- { "deliver_stop_signal", VCPU_STAT(deliver_stop_signal) },
- { "deliver_prefix_signal", VCPU_STAT(deliver_prefix_signal) },
- { "deliver_restart_signal", VCPU_STAT(deliver_restart_signal) },
- { "deliver_program", VCPU_STAT(deliver_program) },
- { "deliver_io", VCPU_STAT(deliver_io) },
- { "deliver_machine_check", VCPU_STAT(deliver_machine_check) },
- { "exit_wait_state", VCPU_STAT(exit_wait_state) },
- { "inject_ckc", VCPU_STAT(inject_ckc) },
- { "inject_cputm", VCPU_STAT(inject_cputm) },
- { "inject_external_call", VCPU_STAT(inject_external_call) },
- { "inject_float_mchk", VM_STAT(inject_float_mchk) },
- { "inject_emergency_signal", VCPU_STAT(inject_emergency_signal) },
- { "inject_io", VM_STAT(inject_io) },
- { "inject_mchk", VCPU_STAT(inject_mchk) },
- { "inject_pfault_done", VM_STAT(inject_pfault_done) },
- { "inject_program", VCPU_STAT(inject_program) },
- { "inject_restart", VCPU_STAT(inject_restart) },
- { "inject_service_signal", VM_STAT(inject_service_signal) },
- { "inject_set_prefix", VCPU_STAT(inject_set_prefix) },
- { "inject_stop_signal", VCPU_STAT(inject_stop_signal) },
- { "inject_pfault_init", VCPU_STAT(inject_pfault_init) },
- { "inject_virtio", VM_STAT(inject_virtio) },
- { "instruction_epsw", VCPU_STAT(instruction_epsw) },
- { "instruction_gs", VCPU_STAT(instruction_gs) },
- { "instruction_io_other", VCPU_STAT(instruction_io_other) },
- { "instruction_lpsw", VCPU_STAT(instruction_lpsw) },
- { "instruction_lpswe", VCPU_STAT(instruction_lpswe) },
- { "instruction_pfmf", VCPU_STAT(instruction_pfmf) },
- { "instruction_ptff", VCPU_STAT(instruction_ptff) },
- { "instruction_stidp", VCPU_STAT(instruction_stidp) },
- { "instruction_sck", VCPU_STAT(instruction_sck) },
- { "instruction_sckpf", VCPU_STAT(instruction_sckpf) },
- { "instruction_spx", VCPU_STAT(instruction_spx) },
- { "instruction_stpx", VCPU_STAT(instruction_stpx) },
- { "instruction_stap", VCPU_STAT(instruction_stap) },
- { "instruction_iske", VCPU_STAT(instruction_iske) },
- { "instruction_ri", VCPU_STAT(instruction_ri) },
- { "instruction_rrbe", VCPU_STAT(instruction_rrbe) },
- { "instruction_sske", VCPU_STAT(instruction_sske) },
- { "instruction_ipte_interlock", VCPU_STAT(instruction_ipte_interlock) },
- { "instruction_essa", VCPU_STAT(instruction_essa) },
- { "instruction_stsi", VCPU_STAT(instruction_stsi) },
- { "instruction_stfl", VCPU_STAT(instruction_stfl) },
- { "instruction_tb", VCPU_STAT(instruction_tb) },
- { "instruction_tpi", VCPU_STAT(instruction_tpi) },
- { "instruction_tprot", VCPU_STAT(instruction_tprot) },
- { "instruction_tsch", VCPU_STAT(instruction_tsch) },
- { "instruction_sthyi", VCPU_STAT(instruction_sthyi) },
- { "instruction_sie", VCPU_STAT(instruction_sie) },
- { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
- { "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) },
- { "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) },
- { "instruction_sigp_emergency", VCPU_STAT(instruction_sigp_emergency) },
- { "instruction_sigp_cond_emergency", VCPU_STAT(instruction_sigp_cond_emergency) },
- { "instruction_sigp_start", VCPU_STAT(instruction_sigp_start) },
- { "instruction_sigp_stop", VCPU_STAT(instruction_sigp_stop) },
- { "instruction_sigp_stop_store_status", VCPU_STAT(instruction_sigp_stop_store_status) },
- { "instruction_sigp_store_status", VCPU_STAT(instruction_sigp_store_status) },
- { "instruction_sigp_store_adtl_status", VCPU_STAT(instruction_sigp_store_adtl_status) },
- { "instruction_sigp_set_arch", VCPU_STAT(instruction_sigp_arch) },
- { "instruction_sigp_set_prefix", VCPU_STAT(instruction_sigp_prefix) },
- { "instruction_sigp_restart", VCPU_STAT(instruction_sigp_restart) },
- { "instruction_sigp_cpu_reset", VCPU_STAT(instruction_sigp_cpu_reset) },
- { "instruction_sigp_init_cpu_reset", VCPU_STAT(instruction_sigp_init_cpu_reset) },
- { "instruction_sigp_unknown", VCPU_STAT(instruction_sigp_unknown) },
- { "instruction_diag_10", VCPU_STAT(diagnose_10) },
- { "instruction_diag_44", VCPU_STAT(diagnose_44) },
- { "instruction_diag_9c", VCPU_STAT(diagnose_9c) },
- { "diag_9c_ignored", VCPU_STAT(diagnose_9c_ignored) },
- { "instruction_diag_258", VCPU_STAT(diagnose_258) },
- { "instruction_diag_308", VCPU_STAT(diagnose_308) },
- { "instruction_diag_500", VCPU_STAT(diagnose_500) },
- { "instruction_diag_other", VCPU_STAT(diagnose_other) },
+ VCPU_STAT("userspace_handled", exit_userspace),
+ VCPU_STAT("exit_null", exit_null),
+ VCPU_STAT("exit_validity", exit_validity),
+ VCPU_STAT("exit_stop_request", exit_stop_request),
+ VCPU_STAT("exit_external_request", exit_external_request),
+ VCPU_STAT("exit_io_request", exit_io_request),
+ VCPU_STAT("exit_external_interrupt", exit_external_interrupt),
+ VCPU_STAT("exit_instruction", exit_instruction),
+ VCPU_STAT("exit_pei", exit_pei),
+ VCPU_STAT("exit_program_interruption", exit_program_interruption),
+ VCPU_STAT("exit_instr_and_program_int", exit_instr_and_program),
+ VCPU_STAT("exit_operation_exception", exit_operation_exception),
+ VCPU_STAT("halt_successful_poll", halt_successful_poll),
+ VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+ VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+ VCPU_STAT("halt_no_poll_steal", halt_no_poll_steal),
+ VCPU_STAT("halt_wakeup", halt_wakeup),
+ VCPU_STAT("instruction_lctlg", instruction_lctlg),
+ VCPU_STAT("instruction_lctl", instruction_lctl),
+ VCPU_STAT("instruction_stctl", instruction_stctl),
+ VCPU_STAT("instruction_stctg", instruction_stctg),
+ VCPU_STAT("deliver_ckc", deliver_ckc),
+ VCPU_STAT("deliver_cputm", deliver_cputm),
+ VCPU_STAT("deliver_emergency_signal", deliver_emergency_signal),
+ VCPU_STAT("deliver_external_call", deliver_external_call),
+ VCPU_STAT("deliver_service_signal", deliver_service_signal),
+ VCPU_STAT("deliver_virtio", deliver_virtio),
+ VCPU_STAT("deliver_stop_signal", deliver_stop_signal),
+ VCPU_STAT("deliver_prefix_signal", deliver_prefix_signal),
+ VCPU_STAT("deliver_restart_signal", deliver_restart_signal),
+ VCPU_STAT("deliver_program", deliver_program),
+ VCPU_STAT("deliver_io", deliver_io),
+ VCPU_STAT("deliver_machine_check", deliver_machine_check),
+ VCPU_STAT("exit_wait_state", exit_wait_state),
+ VCPU_STAT("inject_ckc", inject_ckc),
+ VCPU_STAT("inject_cputm", inject_cputm),
+ VCPU_STAT("inject_external_call", inject_external_call),
+ VM_STAT("inject_float_mchk", inject_float_mchk),
+ VCPU_STAT("inject_emergency_signal", inject_emergency_signal),
+ VM_STAT("inject_io", inject_io),
+ VCPU_STAT("inject_mchk", inject_mchk),
+ VM_STAT("inject_pfault_done", inject_pfault_done),
+ VCPU_STAT("inject_program", inject_program),
+ VCPU_STAT("inject_restart", inject_restart),
+ VM_STAT("inject_service_signal", inject_service_signal),
+ VCPU_STAT("inject_set_prefix", inject_set_prefix),
+ VCPU_STAT("inject_stop_signal", inject_stop_signal),
+ VCPU_STAT("inject_pfault_init", inject_pfault_init),
+ VM_STAT("inject_virtio", inject_virtio),
+ VCPU_STAT("instruction_epsw", instruction_epsw),
+ VCPU_STAT("instruction_gs", instruction_gs),
+ VCPU_STAT("instruction_io_other", instruction_io_other),
+ VCPU_STAT("instruction_lpsw", instruction_lpsw),
+ VCPU_STAT("instruction_lpswe", instruction_lpswe),
+ VCPU_STAT("instruction_pfmf", instruction_pfmf),
+ VCPU_STAT("instruction_ptff", instruction_ptff),
+ VCPU_STAT("instruction_stidp", instruction_stidp),
+ VCPU_STAT("instruction_sck", instruction_sck),
+ VCPU_STAT("instruction_sckpf", instruction_sckpf),
+ VCPU_STAT("instruction_spx", instruction_spx),
+ VCPU_STAT("instruction_stpx", instruction_stpx),
+ VCPU_STAT("instruction_stap", instruction_stap),
+ VCPU_STAT("instruction_iske", instruction_iske),
+ VCPU_STAT("instruction_ri", instruction_ri),
+ VCPU_STAT("instruction_rrbe", instruction_rrbe),
+ VCPU_STAT("instruction_sske", instruction_sske),
+ VCPU_STAT("instruction_ipte_interlock", instruction_ipte_interlock),
+ VCPU_STAT("instruction_essa", instruction_essa),
+ VCPU_STAT("instruction_stsi", instruction_stsi),
+ VCPU_STAT("instruction_stfl", instruction_stfl),
+ VCPU_STAT("instruction_tb", instruction_tb),
+ VCPU_STAT("instruction_tpi", instruction_tpi),
+ VCPU_STAT("instruction_tprot", instruction_tprot),
+ VCPU_STAT("instruction_tsch", instruction_tsch),
+ VCPU_STAT("instruction_sthyi", instruction_sthyi),
+ VCPU_STAT("instruction_sie", instruction_sie),
+ VCPU_STAT("instruction_sigp_sense", instruction_sigp_sense),
+ VCPU_STAT("instruction_sigp_sense_running", instruction_sigp_sense_running),
+ VCPU_STAT("instruction_sigp_external_call", instruction_sigp_external_call),
+ VCPU_STAT("instruction_sigp_emergency", instruction_sigp_emergency),
+ VCPU_STAT("instruction_sigp_cond_emergency", instruction_sigp_cond_emergency),
+ VCPU_STAT("instruction_sigp_start", instruction_sigp_start),
+ VCPU_STAT("instruction_sigp_stop", instruction_sigp_stop),
+ VCPU_STAT("instruction_sigp_stop_store_status", instruction_sigp_stop_store_status),
+ VCPU_STAT("instruction_sigp_store_status", instruction_sigp_store_status),
+ VCPU_STAT("instruction_sigp_store_adtl_status", instruction_sigp_store_adtl_status),
+ VCPU_STAT("instruction_sigp_set_arch", instruction_sigp_arch),
+ VCPU_STAT("instruction_sigp_set_prefix", instruction_sigp_prefix),
+ VCPU_STAT("instruction_sigp_restart", instruction_sigp_restart),
+ VCPU_STAT("instruction_sigp_cpu_reset", instruction_sigp_cpu_reset),
+ VCPU_STAT("instruction_sigp_init_cpu_reset", instruction_sigp_init_cpu_reset),
+ VCPU_STAT("instruction_sigp_unknown", instruction_sigp_unknown),
+ VCPU_STAT("instruction_diag_10", diagnose_10),
+ VCPU_STAT("instruction_diag_44", diagnose_44),
+ VCPU_STAT("instruction_diag_9c", diagnose_9c),
+ VCPU_STAT("diag_9c_ignored", diagnose_9c_ignored),
+ VCPU_STAT("instruction_diag_258", diagnose_258),
+ VCPU_STAT("instruction_diag_308", diagnose_308),
+ VCPU_STAT("instruction_diag_500", diagnose_500),
+ VCPU_STAT("instruction_diag_other", diagnose_other),
{ NULL }
};
store_regs_fmt2(vcpu, kvm_run);
}
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
+ struct kvm_run *kvm_run = vcpu->run;
int rc;
if (kvm_run->immediate_exit)
#define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24)
#define KVM_REQ_APICV_UPDATE \
KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26)
+#define KVM_REQ_HV_TLB_FLUSH \
+ KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP)
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
VCPU_EXREG_CR3,
VCPU_EXREG_RFLAGS,
VCPU_EXREG_SEGMENTS,
+ VCPU_EXREG_EXIT_INFO_1,
+ VCPU_EXREG_EXIT_INFO_2,
};
enum {
};
struct kvm_mmu_root_info {
- gpa_t cr3;
+ gpa_t pgd;
hpa_t hpa;
};
#define KVM_MMU_ROOT_INFO_INVALID \
- ((struct kvm_mmu_root_info) { .cr3 = INVALID_PAGE, .hpa = INVALID_PAGE })
+ ((struct kvm_mmu_root_info) { .pgd = INVALID_PAGE, .hpa = INVALID_PAGE })
#define KVM_MMU_NUM_PREV_ROOTS 3
void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 *spte, const void *pte);
hpa_t root_hpa;
- gpa_t root_cr3;
+ gpa_t root_pgd;
union kvm_mmu_role mmu_role;
u8 root_level;
u8 shadow_root_level;
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
- void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa);
+ void (*tlb_flush_all)(struct kvm_vcpu *vcpu);
+ void (*tlb_flush_current)(struct kvm_vcpu *vcpu);
int (*tlb_remote_flush)(struct kvm *kvm);
int (*tlb_remote_flush_with_range)(struct kvm *kvm,
struct kvm_tlb_range *range);
*/
void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr);
- void (*run)(struct kvm_vcpu *vcpu);
+ /*
+ * Flush any TLB entries created by the guest. Like tlb_flush_gva(),
+ * does not need to flush GPA->HPA mappings.
+ */
+ void (*tlb_flush_guest)(struct kvm_vcpu *vcpu);
+
+ enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu);
int (*handle_exit)(struct kvm_vcpu *vcpu,
enum exit_fastpath_completion exit_fastpath);
int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu);
void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
- void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
+ void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu);
int (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
struct x86_instruction_info *info,
enum x86_intercept_stage stage,
struct x86_exception *exception);
- void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu,
- enum exit_fastpath_completion *exit_fastpath);
+ void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
- int (*check_nested_events)(struct kvm_vcpu *vcpu);
void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
/* pmu operations of sub-arch */
const struct kvm_pmu_ops *pmu_ops;
+ const struct kvm_x86_nested_ops *nested_ops;
/*
* Architecture specific hooks for vCPU blocking due to
void (*setup_mce)(struct kvm_vcpu *vcpu);
- int (*get_nested_state)(struct kvm_vcpu *vcpu,
- struct kvm_nested_state __user *user_kvm_nested_state,
- unsigned user_data_size);
- int (*set_nested_state)(struct kvm_vcpu *vcpu,
- struct kvm_nested_state __user *user_kvm_nested_state,
- struct kvm_nested_state *kvm_state);
- bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
-
int (*smi_allowed)(struct kvm_vcpu *vcpu);
int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
int (*pre_leave_smm)(struct kvm_vcpu *vcpu, const char *smstate);
int (*get_msr_feature)(struct kvm_msr_entry *entry);
- int (*nested_enable_evmcs)(struct kvm_vcpu *vcpu,
- uint16_t *vmcs_version);
- uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu);
-
bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu);
bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
};
+struct kvm_x86_nested_ops {
+ int (*check_events)(struct kvm_vcpu *vcpu);
+ int (*get_state)(struct kvm_vcpu *vcpu,
+ struct kvm_nested_state __user *user_kvm_nested_state,
+ unsigned user_data_size);
+ int (*set_state)(struct kvm_vcpu *vcpu,
+ struct kvm_nested_state __user *user_kvm_nested_state,
+ struct kvm_nested_state *kvm_state);
+ bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
+
+ int (*enable_evmcs)(struct kvm_vcpu *vcpu,
+ uint16_t *vmcs_version);
+ uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu);
+};
+
struct kvm_x86_init_ops {
int (*cpu_has_kvm_support)(void);
int (*disabled_by_bios)(void);
void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
+bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault);
int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gfn_t gfn, void *data, int offset, int len,
u32 access);
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
void *insn, int insn_len);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
+void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gva_t gva, hpa_t root_hpa);
void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
-void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush);
+void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush,
+ bool skip_mmu_sync);
void kvm_configure_mmu(bool enable_tdp, int tdp_page_level);
cpuid_entry_override(entry, CPUID_8000_0001_EDX);
cpuid_entry_override(entry, CPUID_8000_0001_ECX);
break;
+ case 0x80000006:
+ /* L2 cache and TLB: pass through host info. */
+ break;
case 0x80000007: /* Advanced power management */
/* invariant TSC is CPUID.80000007H:EDX[8] */
entry->edx &= (1 << 8);
* vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
* analyze it here, flush TLB regardless of the specified address space.
*/
- kvm_make_vcpus_request_mask(kvm,
- KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP,
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH,
NULL, vcpu_mask, &hv_vcpu->tlb_flush);
ret_success:
};
int i, nent = ARRAY_SIZE(cpuid_entries);
- if (kvm_x86_ops.nested_get_evmcs_version)
- evmcs_ver = kvm_x86_ops.nested_get_evmcs_version(vcpu);
+ if (kvm_x86_ops.nested_ops->get_evmcs_version)
+ evmcs_ver = kvm_x86_ops.nested_ops->get_evmcs_version(vcpu);
/* Skip NESTED_FEATURES if eVMCS is not supported */
if (!evmcs_ver)
local_irq_restore(flags);
}
+static inline u64 tmict_to_ns(struct kvm_lapic *apic, u32 tmict)
+{
+ return (u64)tmict * APIC_BUS_CYCLE_NS * (u64)apic->divide_count;
+}
+
static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
{
ktime_t now, remaining;
u64 ns_remaining_old, ns_remaining_new;
- apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
- * APIC_BUS_CYCLE_NS * apic->divide_count;
+ apic->lapic_timer.period =
+ tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT));
limit_periodic_timer_frequency(apic);
now = ktime_get();
apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new);
}
-static bool set_target_expiration(struct kvm_lapic *apic)
+static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg)
{
ktime_t now;
u64 tscl = rdtsc();
+ s64 deadline;
now = ktime_get();
- apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
- * APIC_BUS_CYCLE_NS * apic->divide_count;
+ apic->lapic_timer.period =
+ tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT));
if (!apic->lapic_timer.period) {
apic->lapic_timer.tscdeadline = 0;
}
limit_periodic_timer_frequency(apic);
+ deadline = apic->lapic_timer.period;
+
+ if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
+ if (unlikely(count_reg != APIC_TMICT)) {
+ deadline = tmict_to_ns(apic,
+ kvm_lapic_get_reg(apic, count_reg));
+ if (unlikely(deadline <= 0))
+ deadline = apic->lapic_timer.period;
+ else if (unlikely(deadline > apic->lapic_timer.period)) {
+ pr_info_ratelimited(
+ "kvm: vcpu %i: requested lapic timer restore with "
+ "starting count register %#x=%u (%lld ns) > initial count (%lld ns). "
+ "Using initial count to start timer.\n",
+ apic->vcpu->vcpu_id,
+ count_reg,
+ kvm_lapic_get_reg(apic, count_reg),
+ deadline, apic->lapic_timer.period);
+ kvm_lapic_set_reg(apic, count_reg, 0);
+ deadline = apic->lapic_timer.period;
+ }
+ }
+ }
apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
- nsec_to_cycles(apic->vcpu, apic->lapic_timer.period);
- apic->lapic_timer.target_expiration = ktime_add_ns(now, apic->lapic_timer.period);
+ nsec_to_cycles(apic->vcpu, deadline);
+ apic->lapic_timer.target_expiration = ktime_add_ns(now, deadline);
return true;
}
restart_apic_timer(apic);
}
-static void start_apic_timer(struct kvm_lapic *apic)
+static void __start_apic_timer(struct kvm_lapic *apic, u32 count_reg)
{
atomic_set(&apic->lapic_timer.pending, 0);
if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
- && !set_target_expiration(apic))
+ && !set_target_expiration(apic, count_reg))
return;
restart_apic_timer(apic);
}
+static void start_apic_timer(struct kvm_lapic *apic)
+{
+ __start_apic_timer(apic, APIC_TMICT);
+}
+
static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
{
bool lvt0_in_nmi_mode = apic_lvt_nmi_mode(lvt0_val);
int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
{
memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s));
+
+ /*
+ * Get calculated timer current count for remaining timer period (if
+ * any) and store it in the returned register set.
+ */
+ __kvm_lapic_set_reg(s->regs, APIC_TMCCT,
+ __apic_read(vcpu->arch.apic, APIC_TMCCT));
+
return kvm_apic_state_fixup(vcpu, s, false);
}
apic_update_lvtt(apic);
apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
update_divide_count(apic);
- start_apic_timer(apic);
+ __start_apic_timer(apic, APIC_TMCCT);
kvm_apic_update_apicv(vcpu);
apic->highest_isr_cache = -1;
if (vcpu->arch.apicv_active) {
return *((u32 *) (apic->regs + reg_off));
}
+static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val)
+{
+ *((u32 *) (regs + reg_off)) = val;
+}
+
static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
{
- *((u32 *) (apic->regs + reg_off)) = val;
+ __kvm_lapic_set_reg(apic->regs, reg_off, val);
}
extern struct static_key kvm_no_apic_vcpu;
&nx_huge_pages_recovery_ratio, 0644);
__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
+static bool __read_mostly force_flush_and_sync_on_reuse;
+module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
+
/*
* When setting this variable to true it enables Two-Dimensional-Paging
* where the hardware walks 2 page tables:
return 0;
}
-static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root)
-{
-}
-
static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, u64 *spte,
const void *pte)
return;
if (local_flush)
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
#ifdef CONFIG_KVM_MMU_AUDIT
break;
WARN_ON(!list_empty(&invalid_list));
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
if (sp->unsync_children)
- kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
__clear_sp_write_flooding_count(sp);
trace_kvm_mmu_get_page(sp, false);
if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
if (write_fault)
ret = RET_PF_EMULATE;
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
&invalid_list);
mmu->root_hpa = INVALID_PAGE;
}
- mmu->root_cr3 = 0;
+ mmu->root_pgd = 0;
}
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
} else
BUG();
- /* root_cr3 is ignored for direct MMUs. */
- vcpu->arch.mmu->root_cr3 = 0;
+ /* root_pgd is ignored for direct MMUs. */
+ vcpu->arch.mmu->root_pgd = 0;
return 0;
}
{
struct kvm_mmu_page *sp;
u64 pdptr, pm_mask;
- gfn_t root_gfn, root_cr3;
+ gfn_t root_gfn, root_pgd;
int i;
- root_cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
- root_gfn = root_cr3 >> PAGE_SHIFT;
+ root_pgd = vcpu->arch.mmu->get_guest_pgd(vcpu);
+ root_gfn = root_pgd >> PAGE_SHIFT;
if (mmu_check_root(vcpu, root_gfn))
return 1;
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu->root_hpa = root;
- goto set_root_cr3;
+ goto set_root_pgd;
}
/*
vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
}
-set_root_cr3:
- vcpu->arch.mmu->root_cr3 = root_cr3;
+set_root_pgd:
+ vcpu->arch.mmu->root_pgd = root_pgd;
return 0;
}
gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
bool *writable)
{
- struct kvm_memory_slot *slot;
+ struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
bool async;
- /*
- * Don't expose private memslots to L2.
- */
- if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) {
+ /* Don't expose private memslots to L2. */
+ if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) {
*pfn = KVM_PFN_NOSLOT;
+ *writable = false;
return false;
}
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
async = false;
*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
if (!async)
context->page_fault = nonpaging_page_fault;
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->sync_page = nonpaging_sync_page;
- context->invlpg = nonpaging_invlpg;
+ context->invlpg = NULL;
context->update_pte = nonpaging_update_pte;
context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->nx = false;
}
-static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t cr3,
+static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
union kvm_mmu_page_role role)
{
- return (role.direct || cr3 == root->cr3) &&
+ return (role.direct || pgd == root->pgd) &&
VALID_PAGE(root->hpa) && page_header(root->hpa) &&
role.word == page_header(root->hpa)->role.word;
}
/*
- * Find out if a previously cached root matching the new CR3/role is available.
+ * Find out if a previously cached root matching the new pgd/role is available.
* The current root is also inserted into the cache.
* If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
* returned.
* Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
* false is returned. This root should now be freed by the caller.
*/
-static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
+static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_pgd,
union kvm_mmu_page_role new_role)
{
uint i;
struct kvm_mmu_root_info root;
struct kvm_mmu *mmu = vcpu->arch.mmu;
- root.cr3 = mmu->root_cr3;
+ root.pgd = mmu->root_pgd;
root.hpa = mmu->root_hpa;
- if (is_root_usable(&root, new_cr3, new_role))
+ if (is_root_usable(&root, new_pgd, new_role))
return true;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
swap(root, mmu->prev_roots[i]);
- if (is_root_usable(&root, new_cr3, new_role))
+ if (is_root_usable(&root, new_pgd, new_role))
break;
}
mmu->root_hpa = root.hpa;
- mmu->root_cr3 = root.cr3;
+ mmu->root_pgd = root.pgd;
return i < KVM_MMU_NUM_PREV_ROOTS;
}
-static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
- union kvm_mmu_page_role new_role,
- bool skip_tlb_flush)
+static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd,
+ union kvm_mmu_page_role new_role)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
* later if necessary.
*/
if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
- mmu->root_level >= PT64_ROOT_4LEVEL) {
- if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
- return false;
-
- if (cached_root_available(vcpu, new_cr3, new_role)) {
- /*
- * It is possible that the cached previous root page is
- * obsolete because of a change in the MMU generation
- * number. However, changing the generation number is
- * accompanied by KVM_REQ_MMU_RELOAD, which will free
- * the root set here and allocate a new one.
- */
- kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
- if (!skip_tlb_flush) {
- kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
- }
-
- /*
- * The last MMIO access's GVA and GPA are cached in the
- * VCPU. When switching to a new CR3, that GVA->GPA
- * mapping may no longer be valid. So clear any cached
- * MMIO info even when we don't need to sync the shadow
- * page tables.
- */
- vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
-
- __clear_sp_write_flooding_count(
- page_header(mmu->root_hpa));
-
- return true;
- }
- }
+ mmu->root_level >= PT64_ROOT_4LEVEL)
+ return !mmu_check_root(vcpu, new_pgd >> PAGE_SHIFT) &&
+ cached_root_available(vcpu, new_pgd, new_role);
return false;
}
-static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
+static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
union kvm_mmu_page_role new_role,
- bool skip_tlb_flush)
+ bool skip_tlb_flush, bool skip_mmu_sync)
{
- if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
- kvm_mmu_free_roots(vcpu, vcpu->arch.mmu,
- KVM_MMU_ROOT_CURRENT);
+ if (!fast_pgd_switch(vcpu, new_pgd, new_role)) {
+ kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT);
+ return;
+ }
+
+ /*
+ * It's possible that the cached previous root page is obsolete because
+ * of a change in the MMU generation number. However, changing the
+ * generation number is accompanied by KVM_REQ_MMU_RELOAD, which will
+ * free the root set here and allocate a new one.
+ */
+ kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
+
+ if (!skip_mmu_sync || force_flush_and_sync_on_reuse)
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ if (!skip_tlb_flush || force_flush_and_sync_on_reuse)
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+
+ /*
+ * The last MMIO access's GVA and GPA are cached in the VCPU. When
+ * switching to a new CR3, that GVA->GPA mapping may no longer be
+ * valid. So clear any cached MMIO info even when we don't need to sync
+ * the shadow page tables.
+ */
+ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
+
+ __clear_sp_write_flooding_count(page_header(vcpu->arch.mmu->root_hpa));
}
-void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
+void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush,
+ bool skip_mmu_sync)
{
- __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu),
- skip_tlb_flush);
+ __kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu),
+ skip_tlb_flush, skip_mmu_sync);
}
-EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3);
+EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
static unsigned long get_cr3(struct kvm_vcpu *vcpu)
{
return kvm_read_cr3(vcpu);
}
-static void inject_page_fault(struct kvm_vcpu *vcpu,
- struct x86_exception *fault)
-{
- vcpu->arch.mmu->inject_page_fault(vcpu, fault);
-}
-
static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
unsigned int access, int *nr_present)
{
context->mmu_role.as_u64 = new_role.as_u64;
context->page_fault = kvm_tdp_page_fault;
context->sync_page = nonpaging_sync_page;
- context->invlpg = nonpaging_invlpg;
+ context->invlpg = NULL;
context->update_pte = nonpaging_update_pte;
context->shadow_root_level = kvm_x86_ops.get_tdp_level(vcpu);
context->direct_map = true;
kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
execonly, level);
- __kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false);
+ __kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base, true, true);
if (new_role.as_u64 == context->mmu_role.as_u64)
return;
g_context->get_pdptr = kvm_pdptr_read;
g_context->inject_page_fault = kvm_inject_page_fault;
+ /*
+ * L2 page tables are never shadowed, so there is no need to sync
+ * SPTEs.
+ */
+ g_context->invlpg = NULL;
+
/*
* Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
* L1's nested page tables (e.g. EPT12). The nested translation
if (r)
goto out;
kvm_mmu_load_pgd(vcpu);
- kvm_x86_ops.tlb_flush(vcpu, true);
+ kvm_x86_ops.tlb_flush_current(vcpu);
out:
return r;
}
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
-void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gva_t gva, hpa_t root_hpa)
{
- struct kvm_mmu *mmu = vcpu->arch.mmu;
int i;
- /* INVLPG on a * non-canonical address is a NOP according to the SDM. */
- if (is_noncanonical_address(gva, vcpu))
+ /* It's actually a GPA for vcpu->arch.guest_mmu. */
+ if (mmu != &vcpu->arch.guest_mmu) {
+ /* INVLPG on a non-canonical address is a NOP according to the SDM. */
+ if (is_noncanonical_address(gva, vcpu))
+ return;
+
+ kvm_x86_ops.tlb_flush_gva(vcpu, gva);
+ }
+
+ if (!mmu->invlpg)
return;
- mmu->invlpg(vcpu, gva, mmu->root_hpa);
+ if (root_hpa == INVALID_PAGE) {
+ mmu->invlpg(vcpu, gva, mmu->root_hpa);
- /*
- * INVLPG is required to invalidate any global mappings for the VA,
- * irrespective of PCID. Since it would take us roughly similar amount
- * of work to determine whether any of the prev_root mappings of the VA
- * is marked global, or to just sync it blindly, so we might as well
- * just always sync it.
- *
- * Mappings not reachable via the current cr3 or the prev_roots will be
- * synced when switching to that cr3, so nothing needs to be done here
- * for them.
- */
- for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
- if (VALID_PAGE(mmu->prev_roots[i].hpa))
- mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
+ /*
+ * INVLPG is required to invalidate any global mappings for the VA,
+ * irrespective of PCID. Since it would take us roughly similar amount
+ * of work to determine whether any of the prev_root mappings of the VA
+ * is marked global, or to just sync it blindly, so we might as well
+ * just always sync it.
+ *
+ * Mappings not reachable via the current cr3 or the prev_roots will be
+ * synced when switching to that cr3, so nothing needs to be done here
+ * for them.
+ */
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (VALID_PAGE(mmu->prev_roots[i].hpa))
+ mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
+ } else {
+ mmu->invlpg(vcpu, gva, root_hpa);
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_gva);
- kvm_x86_ops.tlb_flush_gva(vcpu, gva);
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ kvm_mmu_invalidate_gva(vcpu, vcpu->arch.mmu, gva, INVALID_PAGE);
++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
+
void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
- pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) {
+ pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) {
mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
tlb_flush = true;
}
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
- vcpu->arch.root_mmu.root_cr3 = 0;
+ vcpu->arch.root_mmu.root_pgd = 0;
vcpu->arch.root_mmu.translate_gpa = translate_gpa;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
- vcpu->arch.guest_mmu.root_cr3 = 0;
+ vcpu->arch.guest_mmu.root_pgd = 0;
vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
if (!r) {
pgprintk("%s: guest page fault\n", __func__);
if (!prefault)
- inject_page_fault(vcpu, &walker.fault);
+ kvm_inject_emulated_page_fault(vcpu, &walker.fault);
return RET_PF_RETRY;
}
if ((vmcb->save.efer & EFER_SVME) == 0)
return false;
+ if (((vmcb->save.cr0 & X86_CR0_CD) == 0) &&
+ (vmcb->save.cr0 & X86_CR0_NW))
+ return false;
+
if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
return false;
svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
svm->nested.intercept = nested_vmcb->control.intercept;
- svm_flush_tlb(&svm->vcpu, true);
+ svm_flush_tlb(&svm->vcpu);
svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
svm->vcpu.arch.hflags |= HF_VINTR_MASK;
struct kvm_host_map map;
u64 vmcb_gpa;
- vmcb_gpa = svm->vmcb->save.rax;
+ if (is_smm(&svm->vcpu)) {
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+ }
+ vmcb_gpa = svm->vmcb->save.rax;
ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map);
if (ret == -EINVAL) {
kvm_inject_gp(&svm->vcpu, 0);
return (svm->nested.intercept & 1ULL);
}
-int svm_check_nested_events(struct kvm_vcpu *vcpu)
+static int svm_check_nested_events(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
bool block_nested_events =
return NESTED_EXIT_CONTINUE;
}
+
+struct kvm_x86_nested_ops svm_nested_ops = {
+ .check_events = svm_check_nested_events,
+};
#include <asm/debugreg.h>
#include <asm/kvm_para.h>
#include <asm/irq_remapping.h>
+#include <asm/mce.h>
#include <asm/spec-ctrl.h>
#include <asm/cpu_device_id.h>
return 1;
if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
- svm_flush_tlb(vcpu, true);
+ svm_flush_tlb(vcpu);
vcpu->arch.cr4 = cr4;
if (!npt_enabled)
return true;
}
+/*
+ * Trigger machine check on the host. We assume all the MSRs are already set up
+ * by the CPU and that we still run on the same CPU as the MCE occurred on.
+ * We pass a fake environment to the machine check handler because we want
+ * the guest to be always treated like user space, no matter what context
+ * it used internally.
+ */
+static void kvm_machine_check(void)
+{
+#if defined(CONFIG_X86_MCE)
+ struct pt_regs regs = {
+ .cs = 3, /* Fake ring 3 no matter what the guest ran on */
+ .flags = X86_EFLAGS_IF,
+ };
+
+ do_machine_check(®s, 0);
+#endif
+}
+
static void svm_handle_mce(struct vcpu_svm *svm)
{
if (is_erratum_383()) {
* On an #MC intercept the MCE handler is not called automatically in
* the host. So do it by hand here.
*/
- asm volatile (
- "int $0x12\n");
- /* not sure if we ever come back to this point */
-
- return;
+ kvm_machine_check();
}
static int mc_interception(struct vcpu_svm *svm)
return 0;
}
-void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
+void svm_flush_tlb(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ /*
+ * Flush only the current ASID even if the TLB flush was invoked via
+ * kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all
+ * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
+ * unconditionally does a TLB flush on both nested VM-Enter and nested
+ * VM-Exit (via kvm_mmu_reset_context()).
+ */
if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
else
svm_complete_interrupts(svm);
}
+static enum exit_fastpath_completion svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+{
+ if (!is_guest_mode(vcpu) &&
+ to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
+ to_svm(vcpu)->vmcb->control.exit_info_1)
+ return handle_fastpath_set_msr_irqoff(vcpu);
+
+ return EXIT_FASTPATH_NONE;
+}
+
void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs);
-static void svm_vcpu_run(struct kvm_vcpu *vcpu)
+static enum exit_fastpath_completion svm_vcpu_run(struct kvm_vcpu *vcpu)
{
+ enum exit_fastpath_completion exit_fastpath;
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
* again.
*/
if (unlikely(svm->nested.exit_required))
- return;
+ return EXIT_FASTPATH_NONE;
/*
* Disable singlestep if we're injecting an interrupt/exception.
stgi();
/* Any pending NMI will happen here */
+ exit_fastpath = svm_exit_handlers_fastpath(vcpu);
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
kvm_after_interrupt(&svm->vcpu);
svm_handle_mce(svm);
mark_all_clean(svm->vmcb);
+ return exit_fastpath;
}
static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root)
return ret;
}
-static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu,
- enum exit_fastpath_completion *exit_fastpath)
+static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
- if (!is_guest_mode(vcpu) &&
- to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
- to_svm(vcpu)->vmcb->control.exit_info_1)
- *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu);
}
static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
bool smap = cr4 & X86_CR4_SMAP;
bool is_user = svm_get_cpl(vcpu) == 3;
+ /*
+ * If RIP is invalid, go ahead with emulation which will cause an
+ * internal error exit.
+ */
+ if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT))
+ return true;
+
/*
* Detect and workaround Errata 1096 Fam_17h_00_0Fh.
*
/*
* TODO: Last condition latch INIT signals on vCPU when
* vCPU is in guest-mode and vmcb12 defines intercept on INIT.
- * To properly emulate the INIT intercept, SVM should implement
- * kvm_x86_ops.check_nested_events() and call nested_svm_vmexit()
- * there if an INIT signal is pending.
+ * To properly emulate the INIT intercept,
+ * svm_check_nested_events() should call nested_svm_vmexit()
+ * if an INIT signal is pending.
*/
return !gif_set(svm) ||
(svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT));
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
- .tlb_flush = svm_flush_tlb,
+ .tlb_flush_all = svm_flush_tlb,
+ .tlb_flush_current = svm_flush_tlb,
.tlb_flush_gva = svm_flush_tlb_gva,
+ .tlb_flush_guest = svm_flush_tlb,
.run = svm_vcpu_run,
.handle_exit = handle_exit,
.sched_in = svm_sched_in,
.pmu_ops = &amd_pmu_ops,
+ .nested_ops = &svm_nested_ops,
+
.deliver_posted_interrupt = svm_deliver_avic_intr,
.dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt,
.update_pi_irte = svm_update_pi_irte,
.mem_enc_reg_region = svm_register_enc_region,
.mem_enc_unreg_region = svm_unregister_enc_region,
- .nested_enable_evmcs = NULL,
- .nested_get_evmcs_version = NULL,
-
.need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
.apic_init_signal_blocked = svm_apic_init_signal_blocked,
-
- .check_nested_events = svm_check_nested_events,
};
static struct kvm_x86_init_ops svm_init_ops __initdata = {
void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
-void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
+void svm_flush_tlb(struct kvm_vcpu *vcpu);
void disable_nmi_singlestep(struct vcpu_svm *svm);
/* nested.c */
int nested_svm_check_permissions(struct vcpu_svm *svm);
int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
-int svm_check_nested_events(struct kvm_vcpu *vcpu);
int nested_svm_exit_special(struct vcpu_svm *svm);
+extern struct kvm_x86_nested_ops svm_nested_ops;
+
/* avic.c */
#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
#include <linux/smp.h>
#include "../hyperv.h"
+#include "../cpuid.h"
#include "evmcs.h"
#include "vmcs.h"
#include "vmx.h"
HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr,
HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
- EVMCS1_FIELD(CR3_TARGET_VALUE0, cr3_target_value0,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
- EVMCS1_FIELD(CR3_TARGET_VALUE1, cr3_target_value1,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
- EVMCS1_FIELD(CR3_TARGET_VALUE2, cr3_target_value2,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
- EVMCS1_FIELD(CR3_TARGET_VALUE3, cr3_target_value3,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
/* 32 bit rw */
EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold,
uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- /*
- * vmcs_version represents the range of supported Enlightened VMCS
- * versions: lower 8 bits is the minimal version, higher 8 bits is the
- * maximum supported version. KVM supports versions from 1 to
- * KVM_EVMCS_VERSION.
- */
- if (vmx->nested.enlightened_vmcs_enabled)
- return (KVM_EVMCS_VERSION << 8) | 1;
-
- return 0;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ /*
+ * vmcs_version represents the range of supported Enlightened VMCS
+ * versions: lower 8 bits is the minimal version, higher 8 bits is the
+ * maximum supported version. KVM supports versions from 1 to
+ * KVM_EVMCS_VERSION.
+ */
+ if (kvm_cpu_cap_get(X86_FEATURE_VMX) &&
+ vmx->nested.enlightened_vmcs_enabled)
+ return (KVM_EVMCS_VERSION << 8) | 1;
+
+ return 0;
}
void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata)
vmx_sync_vmcs_host_state(vmx, prev);
put_cpu();
- vmx_segment_cache_clear(vmx);
+ vmx_register_cache_reset(vcpu);
}
/*
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 exit_reason;
+ u32 vm_exit_reason;
unsigned long exit_qualification = vcpu->arch.exit_qualification;
if (vmx->nested.pml_full) {
- exit_reason = EXIT_REASON_PML_FULL;
+ vm_exit_reason = EXIT_REASON_PML_FULL;
vmx->nested.pml_full = false;
exit_qualification &= INTR_INFO_UNBLOCK_NMI;
} else if (fault->error_code & PFERR_RSVD_MASK)
- exit_reason = EXIT_REASON_EPT_MISCONFIG;
+ vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
else
- exit_reason = EXIT_REASON_EPT_VIOLATION;
+ vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
- nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
+ nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
vmcs12->guest_physical_address = fault->address;
}
return (val & invalid_mask) == 0;
}
+/*
+ * Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit.
+ * tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't
+ * enable VPID for L2 (implying it expects a TLB flush on VMX transitions).
+ * Here's why.
+ *
+ * If EPT is enabled by L0 a sync is never needed:
+ * - if it is disabled by L1, then L0 is not shadowing L1 or L2 PTEs, there
+ * cannot be unsync'd SPTEs for either L1 or L2.
+ *
+ * - if it is also enabled by L1, then L0 doesn't need to sync on VM-Enter
+ * VM-Enter as VM-Enter isn't required to invalidate guest-physical mappings
+ * (irrespective of VPID), i.e. L1 can't rely on the (virtual) CPU to flush
+ * stale guest-physical mappings for L2 from the TLB. And as above, L0 isn't
+ * shadowing L1 PTEs so there are no unsync'd SPTEs to sync on VM-Exit.
+ *
+ * If EPT is disabled by L0:
+ * - if VPID is enabled by L1 (for L2), the situation is similar to when L1
+ * enables EPT: L0 doesn't need to sync as VM-Enter and VM-Exit aren't
+ * required to invalidate linear mappings (EPT is disabled so there are
+ * no combined or guest-physical mappings), i.e. L1 can't rely on the
+ * (virtual) CPU to flush stale linear mappings for either L2 or itself (L1).
+ *
+ * - however if VPID is disabled by L1, then a sync is needed as L1 expects all
+ * linear mappings (EPT is disabled so there are no combined or guest-physical
+ * mappings) to be invalidated on both VM-Enter and VM-Exit.
+ *
+ * Note, this logic is subtly different than nested_has_guest_tlb_tag(), which
+ * additionally checks that L2 has been assigned a VPID (when EPT is disabled).
+ * Whether or not L2 has been assigned a VPID by L0 is irrelevant with respect
+ * to L1's expectations, e.g. L0 needs to invalidate hardware TLB entries if L2
+ * doesn't have a unique VPID to prevent reusing L1's entries (assuming L1 has
+ * been assigned a VPID), but L0 doesn't need to do a MMU sync because L1
+ * doesn't expect stale (virtual) TLB entries to be flushed, i.e. L1 doesn't
+ * know that L0 will flush the TLB and so L1 will do INVVPID as needed to flush
+ * stale TLB entries, at which point L0 will sync L2's MMU.
+ */
+static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu)
+{
+ return !enable_ept && !nested_cpu_has_vpid(get_vmcs12(vcpu));
+}
+
/*
* Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are
* emulating VM-Entry into a guest with EPT enabled. On failure, the expected
}
}
+ /*
+ * Unconditionally skip the TLB flush on fast CR3 switch, all TLB
+ * flushes are handled by nested_vmx_transition_tlb_flush(). See
+ * nested_vmx_transition_mmu_sync for details on skipping the MMU sync.
+ */
if (!nested_ept)
- kvm_mmu_new_cr3(vcpu, cr3, false);
+ kvm_mmu_new_pgd(vcpu, cr3, true,
+ !nested_vmx_transition_mmu_sync(vcpu));
vcpu->arch.cr3 = cr3;
kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
(nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}
-static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
+static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12,
+ bool is_vmenter)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
+ /*
+ * If VPID is disabled, linear and combined mappings are flushed on
+ * VM-Enter/VM-Exit, and guest-physical mappings are valid only for
+ * their associated EPTP.
+ */
+ if (!enable_vpid)
+ return;
+
+ /*
+ * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
+ * for *all* contexts to be flushed on VM-Enter/VM-Exit.
+ *
+ * If VPID is enabled and used by vmc12, but L2 does not have a unique
+ * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate
+ * a VPID for L2, flush the current context as the effective ASID is
+ * common to both L1 and L2.
+ *
+ * Defer the flush so that it runs after vmcs02.EPTP has been set by
+ * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid
+ * redundant flushes further down the nested pipeline.
+ *
+ * If a TLB flush isn't required due to any of the above, and vpid12 is
+ * changing then the new "virtual" VPID (vpid12) will reuse the same
+ * "real" VPID (vpid02), and so needs to be sync'd. There is no direct
+ * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for
+ * all nested vCPUs.
+ */
+ if (!nested_cpu_has_vpid(vmcs12)) {
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ } else if (!nested_has_guest_tlb_tag(vcpu)) {
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ } else if (is_vmenter &&
+ vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
+ vmx->nested.last_vpid = vmcs12->virtual_processor_id;
+ vpid_sync_context(nested_get_vpid02(vcpu));
+ }
}
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
* vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
* vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
* vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
- * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0;
- * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1;
- * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2;
- * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3;
* vmcs12->page_fault_error_code_mask =
* evmcs->page_fault_error_code_mask;
* vmcs12->page_fault_error_code_match =
* evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
* evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
* evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
- * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0;
- * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1;
- * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2;
- * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3;
* evmcs->tpr_threshold = vmcs12->tpr_threshold;
* evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
* evmcs->exception_bitmap = vmcs12->exception_bitmap;
if (kvm_has_tsc_control)
decache_tsc_multiplier(vmx);
- if (enable_vpid) {
- /*
- * There is no direct mapping between vpid02 and vpid12, the
- * vpid02 is per-vCPU for L0 and reused while the value of
- * vpid12 is changed w/ one invvpid during nested vmentry.
- * The vpid12 is allocated by L1 for L2, so it will not
- * influence global bitmap(for vpid01 and vpid02 allocation)
- * even if spawn a lot of nested vCPUs.
- */
- if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
- if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
- vmx->nested.last_vpid = vmcs12->virtual_processor_id;
- __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false);
- }
- } else {
- /*
- * If L1 use EPT, then L0 needs to execute INVEPT on
- * EPTP02 instead of EPTP01. Therefore, delay TLB
- * flush until vmcs02->eptp is fully updated by
- * KVM_REQ_LOAD_MMU_PGD. Note that this assumes
- * KVM_REQ_TLB_FLUSH is evaluated after
- * KVM_REQ_LOAD_MMU_PGD in vcpu_enter_guest().
- */
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
- }
- }
+ nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
if (nested_cpu_has_ept(vmcs12))
nested_ept_init_mmu_context(vcpu);
u32 exit_reason = EXIT_REASON_INVALID_STATE;
u32 exit_qual;
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
+ kvm_vcpu_flush_tlb_current(vcpu);
+
evaluate_pending_interrupts = exec_controls_get(vmx) &
(CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
* which already writes to vmcs12 directly.
*/
static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
- u32 exit_reason, u32 exit_intr_info,
+ u32 vm_exit_reason, u32 exit_intr_info,
unsigned long exit_qualification)
{
/* update exit information fields: */
- vmcs12->vm_exit_reason = exit_reason;
+ vmcs12->vm_exit_reason = vm_exit_reason;
vmcs12->exit_qualification = exit_qualification;
vmcs12->vm_exit_intr_info = exit_intr_info;
if (!enable_ept)
vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
- /*
- * If vmcs01 doesn't use VPID, CPU flushes TLB on every
- * VMEntry/VMExit. Thus, no need to flush TLB.
- *
- * If vmcs12 doesn't use VPID, L1 expects TLB to be
- * flushed on every VMEntry/VMExit.
- *
- * Otherwise, we can preserve TLB entries as long as we are
- * able to tag L1 TLB entries differently than L2 TLB entries.
- *
- * If vmcs12 uses EPT, we need to execute this flush on EPTP01
- * and therefore we request the TLB flush to happen only after VMCS EPTP
- * has been set by KVM_REQ_LOAD_MMU_PGD.
- */
- if (enable_vpid &&
- (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
- }
+ nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
* VMFail, like everything else we just need to ensure our
* software model is up-to-date.
*/
- if (enable_ept)
+ if (enable_ept && is_pae_paging(vcpu))
ept_save_pdptrs(vcpu);
kvm_mmu_reset_context(vcpu);
* and modify vmcs12 to make it see what it would expect to see there if
* L2 was its real guest. Must only be called when in L2 (is_guest_mode())
*/
-void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
+void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
u32 exit_intr_info, unsigned long exit_qualification)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
/* trying to cancel vmlaunch/vmresume is a bug */
WARN_ON_ONCE(vmx->nested.nested_run_pending);
+ /* Service the TLB flush request for L2 before switching to L1. */
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
+ kvm_vcpu_flush_tlb_current(vcpu);
+
leave_guest_mode(vcpu);
if (nested_cpu_has_preemption_timer(vmcs12))
if (likely(!vmx->fail)) {
sync_vmcs02_to_vmcs12(vcpu, vmcs12);
- if (exit_reason != -1)
- prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
- exit_qualification);
+ if (vm_exit_reason != -1)
+ prepare_vmcs12(vcpu, vmcs12, vm_exit_reason,
+ exit_intr_info, exit_qualification);
/*
* Must happen outside of sync_vmcs02_to_vmcs12() as it will
kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
vmx->nested.pi_desc = NULL;
- /*
- * We are now running in L2, mmu_notifier will force to reload the
- * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1.
- */
- kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+ if (vmx->nested.reload_vmcs01_apic_access_page) {
+ vmx->nested.reload_vmcs01_apic_access_page = false;
+ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+ }
- if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs))
+ if ((vm_exit_reason != -1) &&
+ (enable_shadow_vmcs || vmx->nested.hv_evmcs))
vmx->nested.need_vmcs12_to_shadow_sync = true;
/* in case we halted in L2 */
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
if (likely(!vmx->fail)) {
- if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
+ if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
nested_exit_intr_ack_set(vcpu)) {
int irq = kvm_cpu_get_interrupt(vcpu);
WARN_ON(irq < 0);
INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
}
- if (exit_reason != -1)
+ if (vm_exit_reason != -1)
trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
vmcs12->exit_qualification,
vmcs12->idt_vectoring_info_field,
gva_t gva;
struct x86_exception e;
- if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
vmcs_read32(VMX_INSTRUCTION_INFO), false,
sizeof(*vmpointer), &gva))
return 1;
if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
- kvm_inject_page_fault(vcpu, &e);
+ kvm_inject_emulated_page_fault(vcpu, &e);
return 1;
}
{
struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
: get_vmcs12(vcpu);
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct x86_exception e;
return 1;
/* _system ok, nested_vmx_check_permission has verified cpl=0 */
if (kvm_write_guest_virt_system(vcpu, gva, &value, len, &e)) {
- kvm_inject_page_fault(vcpu, &e);
+ kvm_inject_emulated_page_fault(vcpu, &e);
return 1;
}
}
{
struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
: get_vmcs12(vcpu);
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct x86_exception e;
instr_info, false, len, &gva))
return 1;
if (kvm_read_guest_virt(vcpu, gva, &value, len, &e)) {
- kvm_inject_page_fault(vcpu, &e);
+ kvm_inject_emulated_page_fault(vcpu, &e);
return 1;
}
}
/* Emulate the VMPTRST instruction */
static int handle_vmptrst(struct kvm_vcpu *vcpu)
{
- unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION);
+ unsigned long exit_qual = vmx_get_exit_qual(vcpu);
u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
struct x86_exception e;
/* *_system ok, nested_vmx_check_permission has verified cpl=0 */
if (kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr,
sizeof(gpa_t), &e)) {
- kvm_inject_page_fault(vcpu, &e);
+ kvm_inject_emulated_page_fault(vcpu, &e);
return 1;
}
return nested_vmx_succeed(vcpu);
}
+#define EPTP_PA_MASK GENMASK_ULL(51, 12)
+
+static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
+{
+ return VALID_PAGE(root_hpa) &&
+ ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
+}
+
/* Emulate the INVEPT instruction */
static int handle_invept(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 vmx_instruction_info, types;
- unsigned long type;
+ unsigned long type, roots_to_free;
+ struct kvm_mmu *mmu;
gva_t gva;
struct x86_exception e;
struct {
u64 eptp, gpa;
} operand;
+ int i;
if (!(vmx->nested.msrs.secondary_ctls_high &
SECONDARY_EXEC_ENABLE_EPT) ||
/* According to the Intel VMX instruction reference, the memory
* operand is read even if it isn't needed (e.g., for type==global)
*/
- if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
vmx_instruction_info, false, sizeof(operand), &gva))
return 1;
if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
- kvm_inject_page_fault(vcpu, &e);
+ kvm_inject_emulated_page_fault(vcpu, &e);
return 1;
}
- switch (type) {
- case VMX_EPT_EXTENT_GLOBAL:
- case VMX_EPT_EXTENT_CONTEXT:
/*
- * TODO: Sync the necessary shadow EPT roots here, rather than
- * at the next emulated VM-entry.
+ * Nested EPT roots are always held through guest_mmu,
+ * not root_mmu.
*/
+ mmu = &vcpu->arch.guest_mmu;
+
+ switch (type) {
+ case VMX_EPT_EXTENT_CONTEXT:
+ if (!nested_vmx_check_eptp(vcpu, operand.eptp))
+ return nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+
+ roots_to_free = 0;
+ if (nested_ept_root_matches(mmu->root_hpa, mmu->root_pgd,
+ operand.eptp))
+ roots_to_free |= KVM_MMU_ROOT_CURRENT;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ if (nested_ept_root_matches(mmu->prev_roots[i].hpa,
+ mmu->prev_roots[i].pgd,
+ operand.eptp))
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+ }
+ break;
+ case VMX_EPT_EXTENT_GLOBAL:
+ roots_to_free = KVM_MMU_ROOTS_ALL;
break;
default:
BUG();
break;
}
+ if (roots_to_free)
+ kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
+
return nested_vmx_succeed(vcpu);
}
/* according to the intel vmx instruction reference, the memory
* operand is read even if it isn't needed (e.g., for type==global)
*/
- if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
vmx_instruction_info, false, sizeof(operand), &gva))
return 1;
if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
- kvm_inject_page_fault(vcpu, &e);
+ kvm_inject_emulated_page_fault(vcpu, &e);
return 1;
}
if (operand.vpid >> 16)
is_noncanonical_address(operand.gla, vcpu))
return nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
- if (cpu_has_vmx_invvpid_individual_addr()) {
- __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
- vpid02, operand.gla);
- } else
- __vmx_flush_tlb(vcpu, vpid02, false);
+ vpid_sync_vcpu_addr(vpid02, operand.gla);
break;
case VMX_VPID_EXTENT_SINGLE_CONTEXT:
case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
if (!operand.vpid)
return nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
- __vmx_flush_tlb(vcpu, vpid02, false);
+ vpid_sync_context(vpid02);
break;
case VMX_VPID_EXTENT_ALL_CONTEXT:
- __vmx_flush_tlb(vcpu, vpid02, false);
+ vpid_sync_context(vpid02);
break;
default:
WARN_ON_ONCE(1);
return kvm_skip_emulated_instruction(vcpu);
}
+ /*
+ * Sync the shadow page tables if EPT is disabled, L1 is invalidating
+ * linear mappings for L2 (tagged with L2's VPID). Free all roots as
+ * VPIDs are not tracked in the MMU role.
+ *
+ * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share
+ * an MMU when EPT is disabled.
+ *
+ * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR.
+ */
+ if (!enable_ept)
+ kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu,
+ KVM_MMU_ROOTS_ALL);
+
return nested_vmx_succeed(vcpu);
}
fail:
nested_vmx_vmexit(vcpu, vmx->exit_reason,
- vmcs_read32(VM_EXIT_INTR_INFO),
- vmcs_readl(EXIT_QUALIFICATION));
+ vmx_get_intr_info(vcpu),
+ vmx_get_exit_qual(vcpu));
return 1;
}
if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ exit_qualification = vmx_get_exit_qual(vcpu);
port = exit_qualification >> 16;
size = (exit_qualification & 7) + 1;
static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
int cr = exit_qualification & 15;
int reg;
unsigned long val;
return true;
break;
case 3:
- if ((vmcs12->cr3_target_count >= 1 &&
- vmcs12->cr3_target_value0 == val) ||
- (vmcs12->cr3_target_count >= 2 &&
- vmcs12->cr3_target_value1 == val) ||
- (vmcs12->cr3_target_count >= 3 &&
- vmcs12->cr3_target_value2 == val) ||
- (vmcs12->cr3_target_count >= 4 &&
- vmcs12->cr3_target_value3 == val))
- return false;
if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
return true;
break;
}
/*
- * Return true if we should exit from L2 to L1 to handle an exit, or false if we
- * should handle it ourselves in L0 (and then continue L2). Only call this
- * when in is_guest_mode (L2).
+ * Return true if L0 wants to handle an exit from L2 regardless of whether or not
+ * L1 wants the exit. Only call this when in is_guest_mode (L2).
*/
-bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
+static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
{
- u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-
- WARN_ON_ONCE(vmx->nested.nested_run_pending);
-
- if (unlikely(vmx->fail)) {
- trace_kvm_nested_vmenter_failed(
- "hardware VM-instruction error: ",
- vmcs_read32(VM_INSTRUCTION_ERROR));
- return true;
- }
-
- trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
- vmcs_readl(EXIT_QUALIFICATION),
- vmx->idt_vectoring_info,
- intr_info,
- vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
- KVM_ISA_VMX);
+ u32 intr_info;
switch (exit_reason) {
case EXIT_REASON_EXCEPTION_NMI:
+ intr_info = vmx_get_intr_info(vcpu);
if (is_nmi(intr_info))
- return false;
+ return true;
else if (is_page_fault(intr_info))
- return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
+ return vcpu->arch.apf.host_apf_reason || !enable_ept;
else if (is_debug(intr_info) &&
vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
- return false;
+ return true;
else if (is_breakpoint(intr_info) &&
vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
- return false;
+ return true;
+ return false;
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
+ return true;
+ case EXIT_REASON_MCE_DURING_VMENTRY:
+ return true;
+ case EXIT_REASON_EPT_VIOLATION:
+ /*
+ * L0 always deals with the EPT violation. If nested EPT is
+ * used, and the nested mmu code discovers that the address is
+ * missing in the guest EPT table (EPT12), the EPT violation
+ * will be injected with nested_ept_inject_page_fault()
+ */
+ return true;
+ case EXIT_REASON_EPT_MISCONFIG:
+ /*
+ * L2 never uses directly L1's EPT, but rather L0's own EPT
+ * table (shadow on EPT) or a merged EPT table that L0 built
+ * (EPT on EPT). So any problems with the structure of the
+ * table is L0's fault.
+ */
+ return true;
+ case EXIT_REASON_PREEMPTION_TIMER:
+ return true;
+ case EXIT_REASON_PML_FULL:
+ /* We emulate PML support to L1. */
+ return true;
+ case EXIT_REASON_VMFUNC:
+ /* VM functions are emulated through L2->L0 vmexits. */
+ return true;
+ case EXIT_REASON_ENCLS:
+ /* SGX is never exposed to L1 */
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+/*
+ * Return 1 if L1 wants to intercept an exit from L2. Only call this when in
+ * is_guest_mode (L2).
+ */
+static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ u32 intr_info;
+
+ switch (exit_reason) {
+ case EXIT_REASON_EXCEPTION_NMI:
+ intr_info = vmx_get_intr_info(vcpu);
+ if (is_nmi(intr_info))
+ return true;
+ else if (is_page_fault(intr_info))
+ return true;
return vmcs12->exception_bitmap &
(1u << (intr_info & INTR_INFO_VECTOR_MASK));
case EXIT_REASON_EXTERNAL_INTERRUPT:
- return false;
+ return nested_exit_on_intr(vcpu);
case EXIT_REASON_TRIPLE_FAULT:
return true;
case EXIT_REASON_INTERRUPT_WINDOW:
nested_cpu_has2(vmcs12,
SECONDARY_EXEC_PAUSE_LOOP_EXITING);
case EXIT_REASON_MCE_DURING_VMENTRY:
- return false;
+ return true;
case EXIT_REASON_TPR_BELOW_THRESHOLD:
return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
case EXIT_REASON_APIC_ACCESS:
* delivery" only come from vmcs12.
*/
return true;
- case EXIT_REASON_EPT_VIOLATION:
- /*
- * L0 always deals with the EPT violation. If nested EPT is
- * used, and the nested mmu code discovers that the address is
- * missing in the guest EPT table (EPT12), the EPT violation
- * will be injected with nested_ept_inject_page_fault()
- */
- return false;
- case EXIT_REASON_EPT_MISCONFIG:
- /*
- * L2 never uses directly L1's EPT, but rather L0's own EPT
- * table (shadow on EPT) or a merged EPT table that L0 built
- * (EPT on EPT). So any problems with the structure of the
- * table is L0's fault.
- */
- return false;
case EXIT_REASON_INVPCID:
return
nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
* the XSS exit bitmap in vmcs12.
*/
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
- case EXIT_REASON_PREEMPTION_TIMER:
- return false;
- case EXIT_REASON_PML_FULL:
- /* We emulate PML support to L1. */
- return false;
- case EXIT_REASON_VMFUNC:
- /* VM functions are emulated through L2->L0 vmexits. */
- return false;
- case EXIT_REASON_ENCLS:
- /* SGX is never exposed to L1 */
- return false;
case EXIT_REASON_UMWAIT:
case EXIT_REASON_TPAUSE:
return nested_cpu_has2(vmcs12,
}
}
+/*
+ * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was
+ * reflected into L1.
+ */
+bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 exit_reason = vmx->exit_reason;
+ unsigned long exit_qual;
+ u32 exit_intr_info;
+
+ WARN_ON_ONCE(vmx->nested.nested_run_pending);
+
+ /*
+ * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
+ * has already loaded L2's state.
+ */
+ if (unlikely(vmx->fail)) {
+ trace_kvm_nested_vmenter_failed(
+ "hardware VM-instruction error: ",
+ vmcs_read32(VM_INSTRUCTION_ERROR));
+ exit_intr_info = 0;
+ exit_qual = 0;
+ goto reflect_vmexit;
+ }
+
+ exit_intr_info = vmx_get_intr_info(vcpu);
+ exit_qual = vmx_get_exit_qual(vcpu);
+
+ trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, exit_qual,
+ vmx->idt_vectoring_info, exit_intr_info,
+ vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
+ KVM_ISA_VMX);
+
+ /* If L0 (KVM) wants the exit, it trumps L1's desires. */
+ if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
+ return false;
+
+ /* If L1 doesn't want the exit, handle it in L0. */
+ if (!nested_vmx_l1_wants_exit(vcpu, exit_reason))
+ return false;
+
+ /*
+ * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For
+ * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would
+ * need to be synthesized by querying the in-kernel LAPIC, but external
+ * interrupts are never reflected to L1 so it's a non-issue.
+ */
+ if ((exit_intr_info &
+ (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
+ (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ vmcs12->vm_exit_intr_error_code =
+ vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ }
+
+reflect_vmexit:
+ nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, exit_qual);
+ return true;
+}
static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
struct kvm_nested_state __user *user_kvm_nested_state,
* reason is that if one of these bits is necessary, it will appear
* in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
* fields of vmcs01 and vmcs02, will turn these bits off - and
- * nested_vmx_exit_reflected() will not pass related exits to L1.
+ * nested_vmx_l1_wants_exit() will not pass related exits to L1.
* These rules have exceptions below.
*/
exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid;
exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc;
- ops->check_nested_events = vmx_check_nested_events;
- ops->get_nested_state = vmx_get_nested_state;
- ops->set_nested_state = vmx_set_nested_state;
- ops->get_vmcs12_pages = nested_get_vmcs12_pages;
- ops->nested_enable_evmcs = nested_enable_evmcs;
- ops->nested_get_evmcs_version = nested_get_evmcs_version;
-
return 0;
}
+
+struct kvm_x86_nested_ops vmx_nested_ops = {
+ .check_events = vmx_check_nested_events,
+ .get_state = vmx_get_nested_state,
+ .set_state = vmx_set_nested_state,
+ .get_vmcs12_pages = nested_get_vmcs12_pages,
+ .enable_evmcs = nested_enable_evmcs,
+ .get_evmcs_version = nested_get_evmcs_version,
+};
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu);
enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
bool from_vmentry);
-bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason);
-void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
+bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu);
+void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
u32 exit_intr_info, unsigned long exit_qualification);
void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu);
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
vmx->nested.hv_evmcs;
}
+static inline u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
+}
+
static inline unsigned long nested_ept_get_eptp(struct kvm_vcpu *vcpu)
{
/* return the page table to be shadowed - in our case, EPT12 */
return nested_ept_get_eptp(vcpu) & VMX_EPTP_AD_ENABLE_BIT;
}
-/*
- * Reflect a VM Exit into L1.
- */
-static inline int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu,
- u32 exit_reason)
-{
- u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
- /*
- * At this point, the exit interruption info in exit_intr_info
- * is only valid for EXCEPTION_NMI exits. For EXTERNAL_INTERRUPT
- * we need to query the in-kernel LAPIC.
- */
- WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
- if ((exit_intr_info &
- (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
- (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
- struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-
- vmcs12->vm_exit_intr_error_code =
- vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
- }
-
- nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info,
- vmcs_readl(EXIT_QUALIFICATION));
- return 1;
-}
-
/*
* Return the cr0 value that a nested guest would read. This is a combination
* of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
#define nested_guest_cr4_valid nested_cr4_valid
#define nested_host_cr4_valid nested_cr4_valid
+extern struct kvm_x86_nested_ops vmx_nested_ops;
+
#endif /* __KVM_X86_VMX_NESTED_H */
vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp, gpa);
}
-static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr)
-{
- if (vpid == 0)
- return true;
-
- if (cpu_has_vmx_invvpid_individual_addr()) {
- __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
- return true;
- }
-
- return false;
-}
-
static inline void vpid_sync_vcpu_single(int vpid)
{
if (vpid == 0)
return;
- if (cpu_has_vmx_invvpid_single())
- __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
+ __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
}
static inline void vpid_sync_vcpu_global(void)
{
- if (cpu_has_vmx_invvpid_global())
- __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
+ __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
}
static inline void vpid_sync_context(int vpid)
{
if (cpu_has_vmx_invvpid_single())
vpid_sync_vcpu_single(vpid);
- else
+ else if (vpid != 0)
vpid_sync_vcpu_global();
}
+static inline void vpid_sync_vcpu_addr(int vpid, gva_t addr)
+{
+ if (vpid == 0)
+ return;
+
+ if (cpu_has_vmx_invvpid_individual_addr())
+ __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
+ else
+ vpid_sync_context(vpid);
+}
+
static inline void ept_sync_global(void)
{
__invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
FIELD(CR0_READ_SHADOW, cr0_read_shadow),
FIELD(CR4_READ_SHADOW, cr4_read_shadow),
- FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
- FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
- FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
- FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
FIELD(EXIT_QUALIFICATION, exit_qualification),
FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
FIELD(GUEST_CR0, guest_cr0),
natural_width cr4_guest_host_mask;
natural_width cr0_read_shadow;
natural_width cr4_read_shadow;
- natural_width cr3_target_value0;
- natural_width cr3_target_value1;
- natural_width cr3_target_value2;
- natural_width cr3_target_value3;
+ natural_width dead_space[4]; /* Last remnants of cr3_target_value[0-3]. */
natural_width exit_qualification;
natural_width guest_linear_address;
natural_width guest_cr0;
CHECK_OFFSET(cr4_guest_host_mask, 352);
CHECK_OFFSET(cr0_read_shadow, 360);
CHECK_OFFSET(cr4_read_shadow, 368);
- CHECK_OFFSET(cr3_target_value0, 376);
- CHECK_OFFSET(cr3_target_value1, 384);
- CHECK_OFFSET(cr3_target_value2, 392);
- CHECK_OFFSET(cr3_target_value3, 400);
+ CHECK_OFFSET(dead_space, 376);
CHECK_OFFSET(exit_qualification, 408);
CHECK_OFFSET(guest_linear_address, 416);
CHECK_OFFSET(guest_cr0, 424);
VMX_SEGMENT_FIELD(LDTR),
};
+static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
+{
+ vmx->segment_cache.bitmask = 0;
+}
+
static unsigned long host_idt_base;
/*
void *gdt = get_current_gdt_ro();
unsigned long sysenter_esp;
+ /*
+ * Flush all EPTP/VPID contexts, the new pCPU may have stale
+ * TLB entries from its previous association with the vCPU.
+ */
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
/*
#endif
-static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
+static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
+ * the CPU is not required to invalidate guest-physical mappings on
+ * VM-Entry, even if VPID is disabled. Guest-physical mappings are
+ * associated with the root EPT structure and not any particular VPID
+ * (INVVPID also isn't required to invalidate guest-physical mappings).
+ */
+ if (enable_ept) {
+ ept_sync_global();
+ } else if (enable_vpid) {
+ if (cpu_has_vmx_invvpid_global()) {
+ vpid_sync_vcpu_global();
+ } else {
+ vpid_sync_vcpu_single(vmx->vpid);
+ vpid_sync_vcpu_single(vmx->nested.vpid02);
+ }
+ }
+}
+
+static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
- int vpid = to_vmx(vcpu)->vpid;
+ u64 root_hpa = vcpu->arch.mmu->root_hpa;
+
+ /* No flush required if the current context is invalid. */
+ if (!VALID_PAGE(root_hpa))
+ return;
+
+ if (enable_ept)
+ ept_sync_context(construct_eptp(vcpu, root_hpa));
+ else if (!is_guest_mode(vcpu))
+ vpid_sync_context(to_vmx(vcpu)->vpid);
+ else
+ vpid_sync_context(nested_get_vpid02(vcpu));
+}
- if (!vpid_sync_vcpu_addr(vpid, addr))
- vpid_sync_context(vpid);
+static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
+{
+ /*
+ * vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in
+ * vmx_flush_tlb_guest() for an explanation of why this is ok.
+ */
+ vpid_sync_vcpu_addr(to_vmx(vcpu)->vpid, addr);
+}
+static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
+{
/*
- * If VPIDs are not supported or enabled, then the above is a no-op.
- * But we don't really need a TLB flush in that case anyway, because
- * each VM entry/exit includes an implicit flush when VPID is 0.
+ * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0
+ * or a vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit
+ * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is
+ * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
+ * i.e. no explicit INVVPID is necessary.
*/
+ vpid_sync_context(to_vmx(vcpu)->vpid);
}
static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- if (is_pae_paging(vcpu)) {
- mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
- mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
- mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
- mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
- }
+ if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
+ return;
+
+ mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
+ mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
+ mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
+ mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
}
return eptp;
}
-void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3)
+void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd)
{
struct kvm *kvm = vcpu->kvm;
bool update_guest_cr3 = true;
unsigned long guest_cr3;
u64 eptp;
- guest_cr3 = cr3;
if (enable_ept) {
- eptp = construct_eptp(vcpu, cr3);
+ eptp = construct_eptp(vcpu, pgd);
vmcs_write64(EPT_POINTER, eptp);
if (kvm_x86_ops.tlb_remote_flush) {
else /* vmcs01.GUEST_CR3 is already up-to-date. */
update_guest_cr3 = false;
ept_load_pdptrs(vcpu);
+ } else {
+ guest_cr3 = pgd;
}
if (update_guest_cr3)
}
if (is_page_fault(intr_info)) {
- cr2 = vmcs_readl(EXIT_QUALIFICATION);
+ cr2 = vmx_get_exit_qual(vcpu);
/* EPT won't cause page fault directly */
WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
switch (ex_no) {
case DB_VECTOR:
- dr6 = vmcs_readl(EXIT_QUALIFICATION);
+ dr6 = vmx_get_exit_qual(vcpu);
if (!(vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
if (is_icebp(intr_info))
int size, in, string;
unsigned port;
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ exit_qualification = vmx_get_exit_qual(vcpu);
string = (exit_qualification & 16) != 0;
++vcpu->stat.io_exits;
int err;
int ret;
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ exit_qualification = vmx_get_exit_qual(vcpu);
cr = exit_qualification & 15;
reg = (exit_qualification >> 8) & 15;
switch ((exit_qualification >> 4) & 3) {
unsigned long exit_qualification;
int dr, dr7, reg;
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ exit_qualification = vmx_get_exit_qual(vcpu);
dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
/* First, if DR does not exist, trigger UD */
static int handle_invlpg(struct kvm_vcpu *vcpu)
{
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
kvm_mmu_invlpg(vcpu, exit_qualification);
return kvm_skip_emulated_instruction(vcpu);
static int handle_apic_access(struct kvm_vcpu *vcpu)
{
if (likely(fasteoi)) {
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
int access_type, offset;
access_type = exit_qualification & APIC_ACCESS_TYPE;
static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
{
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
int vector = exit_qualification & 0xff;
/* EOI-induced VM exit is trap-like and thus no need to adjust IP */
static int handle_apic_write(struct kvm_vcpu *vcpu)
{
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
u32 offset = exit_qualification & 0xfff;
/* APIC-write VM exit is trap-like and thus no need to adjust IP */
idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ exit_qualification = vmx_get_exit_qual(vcpu);
reason = (u32)exit_qualification >> 30;
if (reason == TASK_SWITCH_GATE && idt_v) {
gpa_t gpa;
u64 error_code;
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ exit_qualification = vmx_get_exit_qual(vcpu);
/*
* EPT violation happened while executing iret from NMI,
/* According to the Intel instruction reference, the memory operand
* is read even if it isn't needed (e.g., for type==all)
*/
- if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
vmx_instruction_info, false,
sizeof(operand), &gva))
return 1;
if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
- kvm_inject_page_fault(vcpu, &e);
+ kvm_inject_emulated_page_fault(vcpu, &e);
return 1;
}
if (kvm_get_active_pcid(vcpu) == operand.pcid) {
kvm_mmu_sync_roots(vcpu);
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
- if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3)
+ if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
== operand.pcid)
roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
trace_kvm_pml_full(vcpu->vcpu_id);
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ exit_qualification = vmx_get_exit_qual(vcpu);
/*
* PML buffer FULL happened while executing iret from NMI,
static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
{
- *info1 = vmcs_readl(EXIT_QUALIFICATION);
- *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
+ *info1 = vmx_get_exit_qual(vcpu);
+ *info2 = vmx_get_intr_info(vcpu);
}
static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
unsigned long cr4;
u64 efer;
- int i, n;
if (!dump_invalid_vmcs) {
pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
- n = vmcs_read32(CR3_TARGET_COUNT);
- for (i = 0; i + 1 < n; i += 4)
- pr_err("CR3 target%u=%016lx target%u=%016lx\n",
- i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
- i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
- if (i < n)
- pr_err("CR3 target%u=%016lx\n",
- i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
pr_err("PLE Gap=%08x Window=%08x\n",
vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
*/
nested_mark_vmcs12_pages_dirty(vcpu);
- if (nested_vmx_exit_reflected(vcpu, exit_reason))
- return nested_vmx_reflect_vmexit(vcpu, exit_reason);
+ if (nested_vmx_reflect_vmexit(vcpu))
+ return 1;
}
if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
if (flexpriority_enabled) {
sec_exec_control |=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
- vmx_flush_tlb(vcpu, true);
+ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+
+ /*
+ * Flush the TLB, reloading the APIC access page will
+ * only do so if its physical address has changed, but
+ * the guest may have inserted a non-APIC mapping into
+ * the TLB while the APIC access page was disabled.
+ */
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
break;
case LAPIC_MODE_X2APIC:
vmx_update_msr_bitmap(vcpu);
}
-static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
+static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
{
- if (!is_guest_mode(vcpu)) {
- vmcs_write64(APIC_ACCESS_ADDR, hpa);
- vmx_flush_tlb(vcpu, true);
+ struct page *page;
+
+ /* Defer reload until vmcs01 is the current VMCS. */
+ if (is_guest_mode(vcpu)) {
+ to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
+ return;
}
+
+ if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+ return;
+
+ page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
+ if (is_error_page(page))
+ return;
+
+ vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
+ vmx_flush_tlb_current(vcpu);
+
+ /*
+ * Do not pin apic access page in memory, the MMU notifier
+ * will call us again if it is migrated or swapped out.
+ */
+ put_page(page);
}
static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
{
- vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
/* if exit due to PF check for async PF */
- if (is_page_fault(vmx->exit_intr_info)) {
+ if (is_page_fault(intr_info)) {
vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
/* Handle machine checks before interrupts are enabled */
- } else if (is_machine_check(vmx->exit_intr_info)) {
+ } else if (is_machine_check(intr_info)) {
kvm_machine_check();
/* We need to handle NMIs before interrupts are enabled */
- } else if (is_nmi(vmx->exit_intr_info)) {
+ } else if (is_nmi(intr_info)) {
kvm_before_interrupt(&vmx->vcpu);
asm("int $2");
kvm_after_interrupt(&vmx->vcpu);
unsigned long tmp;
#endif
gate_desc *desc;
- u32 intr_info;
+ u32 intr_info = vmx_get_intr_info(vcpu);
- intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
if (WARN_ONCE(!is_external_intr(intr_info),
"KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
}
STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);
-static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu,
- enum exit_fastpath_completion *exit_fastpath)
+static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
handle_external_interrupt_irqoff(vcpu);
else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
handle_exception_nmi_irqoff(vmx);
- else if (!is_guest_mode(vcpu) &&
- vmx->exit_reason == EXIT_REASON_MSR_WRITE)
- *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu);
}
static bool vmx_has_emulated_msr(int index)
if (enable_vnmi) {
if (vmx->loaded_vmcs->nmi_known_unmasked)
return;
- /*
- * Can't use vmx->exit_intr_info since we're not sure what
- * the exit reason is.
- */
- exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
/*
bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
-static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
+static enum exit_fastpath_completion vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
+ enum exit_fastpath_completion exit_fastpath;
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
/* Don't enter VMX if guest state is invalid, let the exit handler
start emulation until we arrive back to a valid state */
if (vmx->emulation_required)
- return;
+ return EXIT_FASTPATH_NONE;
if (vmx->ple_window_dirty) {
vmx->ple_window_dirty = false;
loadsegment(es, __USER_DS);
#endif
- vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
- | (1 << VCPU_EXREG_RFLAGS)
- | (1 << VCPU_EXREG_PDPTR)
- | (1 << VCPU_EXREG_SEGMENTS)
- | (1 << VCPU_EXREG_CR3));
- vcpu->arch.regs_dirty = 0;
+ vmx_register_cache_reset(vcpu);
pt_guest_exit(vmx);
vmx->nested.nested_run_pending = 0;
vmx->idt_vectoring_info = 0;
- vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
- if ((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
+ if (unlikely(vmx->fail)) {
+ vmx->exit_reason = 0xdead;
+ return EXIT_FASTPATH_NONE;
+ }
+
+ vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+ if (unlikely((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY))
kvm_machine_check();
- if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
- return;
+ if (unlikely(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+ return EXIT_FASTPATH_NONE;
+
+ if (!is_guest_mode(vcpu) && vmx->exit_reason == EXIT_REASON_MSR_WRITE)
+ exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu);
+ else
+ exit_fastpath = EXIT_FASTPATH_NONE;
vmx->loaded_vmcs->launched = 1;
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
+
+ return exit_fastpath;
}
static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
- .tlb_flush = vmx_flush_tlb,
+ .tlb_flush_all = vmx_flush_tlb_all,
+ .tlb_flush_current = vmx_flush_tlb_current,
.tlb_flush_gva = vmx_flush_tlb_gva,
+ .tlb_flush_guest = vmx_flush_tlb_guest,
.run = vmx_vcpu_run,
.handle_exit = vmx_handle_exit,
.post_block = vmx_post_block,
.pmu_ops = &intel_pmu_ops,
+ .nested_ops = &vmx_nested_ops,
.update_pi_irte = vmx_update_pi_irte,
.pre_leave_smm = vmx_pre_leave_smm,
.enable_smi_window = enable_smi_window,
- .check_nested_events = NULL,
- .get_nested_state = NULL,
- .set_nested_state = NULL,
- .get_vmcs12_pages = NULL,
- .nested_enable_evmcs = NULL,
- .nested_get_evmcs_version = NULL,
.need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
};
#include <asm/intel_pt.h>
#include "capabilities.h"
+#include "kvm_cache_regs.h"
#include "ops.h"
#include "vmcs.h"
bool vmcs02_initialized;
bool change_vmcs01_virtual_apic_mode;
+ bool reload_vmcs01_apic_access_page;
/*
* Enlightened VMCS has been enabled. It does not mean that L1 has to
*/
bool guest_state_loaded;
+ unsigned long exit_qualification;
u32 exit_intr_info;
u32 idt_vectoring_info;
ulong rflags;
BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL)
BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL)
-static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
+static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu)
{
- vmx->segment_cache.bitmask = 0;
+ vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
+ | (1 << VCPU_EXREG_RFLAGS)
+ | (1 << VCPU_EXREG_PDPTR)
+ | (1 << VCPU_EXREG_SEGMENTS)
+ | (1 << VCPU_EXREG_CR3)
+ | (1 << VCPU_EXREG_EXIT_INFO_1)
+ | (1 << VCPU_EXREG_EXIT_INFO_2));
+ vcpu->arch.regs_dirty = 0;
}
static inline u32 vmx_vmentry_ctrl(void)
return &(to_vmx(vcpu)->pi_desc);
}
+static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_1)) {
+ kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
+ vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ }
+ return vmx->exit_qualification;
+}
+
+static inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_2)) {
+ kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
+ vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ }
+ return vmx->exit_intr_info;
+}
+
struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags);
void free_vmcs(struct vmcs *vmcs);
int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
-static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid,
- bool invalidate_gpa)
-{
- if (enable_ept && (invalidate_gpa || !enable_vpid)) {
- if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
- return;
- ept_sync_context(construct_eptp(vcpu,
- vcpu->arch.mmu->root_hpa));
- } else {
- vpid_sync_context(vpid);
- }
-}
-
-static inline void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
-{
- __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa);
-}
-
static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx)
{
vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
-#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
-#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
-
#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
EXPORT_SYMBOL_GPL(supported_xss);
struct kvm_stats_debugfs_item debugfs_entries[] = {
- { "pf_fixed", VCPU_STAT(pf_fixed) },
- { "pf_guest", VCPU_STAT(pf_guest) },
- { "tlb_flush", VCPU_STAT(tlb_flush) },
- { "invlpg", VCPU_STAT(invlpg) },
- { "exits", VCPU_STAT(exits) },
- { "io_exits", VCPU_STAT(io_exits) },
- { "mmio_exits", VCPU_STAT(mmio_exits) },
- { "signal_exits", VCPU_STAT(signal_exits) },
- { "irq_window", VCPU_STAT(irq_window_exits) },
- { "nmi_window", VCPU_STAT(nmi_window_exits) },
- { "halt_exits", VCPU_STAT(halt_exits) },
- { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
- { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
- { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
- { "halt_wakeup", VCPU_STAT(halt_wakeup) },
- { "hypercalls", VCPU_STAT(hypercalls) },
- { "request_irq", VCPU_STAT(request_irq_exits) },
- { "irq_exits", VCPU_STAT(irq_exits) },
- { "host_state_reload", VCPU_STAT(host_state_reload) },
- { "fpu_reload", VCPU_STAT(fpu_reload) },
- { "insn_emulation", VCPU_STAT(insn_emulation) },
- { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
- { "irq_injections", VCPU_STAT(irq_injections) },
- { "nmi_injections", VCPU_STAT(nmi_injections) },
- { "req_event", VCPU_STAT(req_event) },
- { "l1d_flush", VCPU_STAT(l1d_flush) },
- { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
- { "mmu_pte_write", VM_STAT(mmu_pte_write) },
- { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
- { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
- { "mmu_flooded", VM_STAT(mmu_flooded) },
- { "mmu_recycled", VM_STAT(mmu_recycled) },
- { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
- { "mmu_unsync", VM_STAT(mmu_unsync) },
- { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
- { "largepages", VM_STAT(lpages, .mode = 0444) },
- { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) },
- { "max_mmu_page_hash_collisions",
- VM_STAT(max_mmu_page_hash_collisions) },
+ VCPU_STAT("pf_fixed", pf_fixed),
+ VCPU_STAT("pf_guest", pf_guest),
+ VCPU_STAT("tlb_flush", tlb_flush),
+ VCPU_STAT("invlpg", invlpg),
+ VCPU_STAT("exits", exits),
+ VCPU_STAT("io_exits", io_exits),
+ VCPU_STAT("mmio_exits", mmio_exits),
+ VCPU_STAT("signal_exits", signal_exits),
+ VCPU_STAT("irq_window", irq_window_exits),
+ VCPU_STAT("nmi_window", nmi_window_exits),
+ VCPU_STAT("halt_exits", halt_exits),
+ VCPU_STAT("halt_successful_poll", halt_successful_poll),
+ VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+ VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+ VCPU_STAT("halt_wakeup", halt_wakeup),
+ VCPU_STAT("hypercalls", hypercalls),
+ VCPU_STAT("request_irq", request_irq_exits),
+ VCPU_STAT("irq_exits", irq_exits),
+ VCPU_STAT("host_state_reload", host_state_reload),
+ VCPU_STAT("fpu_reload", fpu_reload),
+ VCPU_STAT("insn_emulation", insn_emulation),
+ VCPU_STAT("insn_emulation_fail", insn_emulation_fail),
+ VCPU_STAT("irq_injections", irq_injections),
+ VCPU_STAT("nmi_injections", nmi_injections),
+ VCPU_STAT("req_event", req_event),
+ VCPU_STAT("l1d_flush", l1d_flush),
+ VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
+ VM_STAT("mmu_pte_write", mmu_pte_write),
+ VM_STAT("mmu_pte_updated", mmu_pte_updated),
+ VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
+ VM_STAT("mmu_flooded", mmu_flooded),
+ VM_STAT("mmu_recycled", mmu_recycled),
+ VM_STAT("mmu_cache_miss", mmu_cache_miss),
+ VM_STAT("mmu_unsync", mmu_unsync),
+ VM_STAT("remote_tlb_flush", remote_tlb_flush),
+ VM_STAT("largepages", lpages, .mode = 0444),
+ VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444),
+ VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions),
{ NULL }
};
}
EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
-static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
{
- if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
- vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
- else
- vcpu->arch.mmu->inject_page_fault(vcpu, fault);
+ struct kvm_mmu *fault_mmu;
+ WARN_ON_ONCE(fault->vector != PF_VECTOR);
+
+ fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
+ vcpu->arch.walk_mmu;
+
+ /*
+ * Invalidate the TLB entry for the faulting address, if it exists,
+ * else the access will fault indefinitely (and to emulate hardware).
+ */
+ if ((fault->error_code & PFERR_PRESENT_MASK) &&
+ !(fault->error_code & PFERR_RSVD_MASK))
+ kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
+ fault_mmu->root_hpa);
+ fault_mmu->inject_page_fault(vcpu, fault);
return fault->nested_page_fault;
}
+EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
if (!skip_tlb_flush) {
kvm_mmu_sync_roots(vcpu);
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
return 0;
}
!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
return 1;
- kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
+ kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
vcpu->arch.cr3 = cr3;
kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
vcpu->arch.time = 0;
}
-static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
+static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- kvm_x86_ops.tlb_flush(vcpu, invalidate_gpa);
+ kvm_x86_ops.tlb_flush_all(vcpu);
+}
+
+static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.tlb_flush;
+ kvm_x86_ops.tlb_flush_guest(vcpu);
}
static void record_steal_time(struct kvm_vcpu *vcpu)
trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
st->preempted & KVM_VCPU_FLUSH_TLB);
if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
- kvm_vcpu_flush_tlb(vcpu, false);
+ kvm_vcpu_flush_tlb_guest(vcpu);
vcpu->arch.st.preempted = 0;
r = KVM_X2APIC_API_VALID_FLAGS;
break;
case KVM_CAP_NESTED_STATE:
- r = kvm_x86_ops.get_nested_state ?
- kvm_x86_ops.get_nested_state(NULL, NULL, 0) : 0;
+ r = kvm_x86_ops.nested_ops->get_state ?
+ kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
break;
case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
r = kvm_x86_ops.enable_direct_tlbflush != NULL;
break;
case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
- r = kvm_x86_ops.nested_enable_evmcs != NULL;
+ r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
break;
default:
break;
return kvm_hv_activate_synic(vcpu, cap->cap ==
KVM_CAP_HYPERV_SYNIC2);
case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
- if (!kvm_x86_ops.nested_enable_evmcs)
+ if (!kvm_x86_ops.nested_ops->enable_evmcs)
return -ENOTTY;
- r = kvm_x86_ops.nested_enable_evmcs(vcpu, &vmcs_version);
+ r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version);
if (!r) {
user_ptr = (void __user *)(uintptr_t)cap->args[0];
if (copy_to_user(user_ptr, &vmcs_version,
u32 user_data_size;
r = -EINVAL;
- if (!kvm_x86_ops.get_nested_state)
+ if (!kvm_x86_ops.nested_ops->get_state)
break;
BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
if (get_user(user_data_size, &user_kvm_nested_state->size))
break;
- r = kvm_x86_ops.get_nested_state(vcpu, user_kvm_nested_state,
- user_data_size);
+ r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state,
+ user_data_size);
if (r < 0)
break;
int idx;
r = -EINVAL;
- if (!kvm_x86_ops.set_nested_state)
+ if (!kvm_x86_ops.nested_ops->set_state)
break;
r = -EFAULT;
break;
idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = kvm_x86_ops.set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
+ r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
break;
}
{
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
if (ctxt->exception.vector == PF_VECTOR)
- return kvm_propagate_fault(vcpu, &ctxt->exception);
+ return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
if (ctxt->exception.error_code_valid)
kvm_queue_exception_e(vcpu, ctxt->exception.vector,
* from L2 to L1 due to pending L1 events which require exit
* from L2 to L1.
*/
- if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events) {
- r = kvm_x86_ops.check_nested_events(vcpu);
+ if (is_guest_mode(vcpu)) {
+ r = kvm_x86_ops.nested_ops->check_events(vcpu);
if (r != 0)
return r;
}
* proposal and current concerns. Perhaps we should be setting
* KVM_REQ_EVENT only on certain events and not unconditionally?
*/
- if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events) {
- r = kvm_x86_ops.check_nested_events(vcpu);
+ if (is_guest_mode(vcpu)) {
+ r = kvm_x86_ops.nested_ops->check_events(vcpu);
if (r != 0)
return r;
}
void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
{
- struct page *page = NULL;
-
if (!lapic_in_kernel(vcpu))
return;
if (!kvm_x86_ops.set_apic_access_page_addr)
return;
- page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
- if (is_error_page(page))
- return;
- kvm_x86_ops.set_apic_access_page_addr(vcpu, page_to_phys(page));
-
- /*
- * Do not pin apic access page in memory, the MMU notifier
- * will call us again if it is migrated or swapped out.
- */
- put_page(page);
+ kvm_x86_ops.set_apic_access_page_addr(vcpu);
}
void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
bool req_int_win =
dm_request_for_irq_injection(vcpu) &&
kvm_cpu_accept_dm_intr(vcpu);
- enum exit_fastpath_completion exit_fastpath = EXIT_FASTPATH_NONE;
+ enum exit_fastpath_completion exit_fastpath;
bool req_immediate_exit = false;
if (kvm_request_pending(vcpu)) {
if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) {
- if (unlikely(!kvm_x86_ops.get_vmcs12_pages(vcpu))) {
+ if (unlikely(!kvm_x86_ops.nested_ops->get_vmcs12_pages(vcpu))) {
r = 0;
goto out;
}
kvm_mmu_sync_roots(vcpu);
if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
kvm_mmu_load_pgd(vcpu);
- if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
- kvm_vcpu_flush_tlb(vcpu, true);
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
+ kvm_vcpu_flush_tlb_all(vcpu);
+
+ /* Flushing all ASIDs flushes the current ASID... */
+ kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ }
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
+ kvm_vcpu_flush_tlb_current(vcpu);
+ if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
+ kvm_vcpu_flush_tlb_guest(vcpu);
+
if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
r = 0;
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
}
- kvm_x86_ops.run(vcpu);
+ exit_fastpath = kvm_x86_ops.run(vcpu);
/*
* Do this here before restoring debug registers on the host. And
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
- kvm_x86_ops.handle_exit_irqoff(vcpu, &exit_fastpath);
+ kvm_x86_ops.handle_exit_irqoff(vcpu);
/*
* Consume any pending interrupts, including the possible source of
static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
{
- if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events)
- kvm_x86_ops.check_nested_events(vcpu);
+ if (is_guest_mode(vcpu))
+ kvm_x86_ops.nested_ops->check_events(vcpu);
return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted);
trace_kvm_fpu(0);
}
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
+ struct kvm_run *kvm_run = vcpu->run;
int r;
vcpu_load(vcpu);
r = -EAGAIN;
if (signal_pending(current)) {
r = -EINTR;
- vcpu->run->exit_reason = KVM_EXIT_INTR;
+ kvm_run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.signal_exits;
}
goto out;
}
- if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
+ if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
r = -EINVAL;
goto out;
}
- if (vcpu->run->kvm_dirty_regs) {
+ if (kvm_run->kvm_dirty_regs) {
r = sync_regs(vcpu);
if (r != 0)
goto out;
out:
kvm_put_guest_fpu(vcpu);
- if (vcpu->run->kvm_valid_regs)
+ if (kvm_run->kvm_valid_regs)
store_regs(vcpu);
post_kvm_run_save(vcpu);
kvm_sigset_deactivate(vcpu);
return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
}
+static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.tlb_flush;
+ kvm_x86_ops.tlb_flush_current(vcpu);
+}
+
static inline int is_pae(struct kvm_vcpu *vcpu)
{
return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
struct srcu_struct srcu;
struct srcu_struct irq_srcu;
pid_t userspace_pid;
+ unsigned int max_halt_poll_ns;
};
#define kvm_err(fmt, ...) \
struct kvm_mp_state *mp_state);
int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
struct kvm_guest_debug *dbg);
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu);
int kvm_arch_init(void *opaque);
void kvm_arch_exit(void);
#define KVM_DBGFS_GET_MODE(dbgfs_item) \
((dbgfs_item)->mode ? (dbgfs_item)->mode : 0644)
+#define VM_STAT(n, x, ...) \
+ { n, offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ }
+#define VCPU_STAT(n, x, ...) \
+ { n, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ }
+
extern struct kvm_stats_debugfs_item debugfs_entries[];
extern struct dentry *kvm_debugfs_dir;
}
#endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
+static inline bool kvm_is_visible_memslot(struct kvm_memory_slot *memslot)
+{
+ return (memslot && memslot->id < KVM_USER_MEM_SLOTS &&
+ !(memslot->flags & KVM_MEMSLOT_INVALID));
+}
+
struct kvm_vcpu *kvm_get_running_vcpu(void);
struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
#define KVM_CAP_S390_VCPU_RESETS 179
#define KVM_CAP_S390_PROTECTED 180
#define KVM_CAP_PPC_SECURE_GUEST 181
+#define KVM_CAP_HALT_POLL 182
#ifdef KVM_CAP_IRQ_ROUTING
import struct
import re
import subprocess
+import signal
from collections import defaultdict, namedtuple
from functools import reduce
from datetime import datetime
'RESET': 0x00002403,
}
+signal_received = False
+
ENCODING = locale.getpreferredencoding(False)
TRACE_FILTER = re.compile(r'^[^\(]*$')
def get_banner(self):
return self._banner
- @staticmethod
- def get_statline(keys, s):
+ def get_statline(self, keys, s):
res = ''
for key in keys:
res += ' %9d' % s[key].delta
def get_banner(self):
return self._banner
- @staticmethod
- def get_statline(keys, s):
+ def get_statline(self, keys, s):
return reduce(lambda res, key: "{},{!s}".format(res, s[key].delta),
keys, '')
def log(stats, opts, frmt, keys):
"""Prints statistics as reiterating key block, multiple value blocks."""
+ global signal_received
line = 0
banner_repeat = 20
+ f = None
+
+ def do_banner(opts):
+ nonlocal f
+ if opts.log_to_file:
+ if not f:
+ try:
+ f = open(opts.log_to_file, 'a')
+ except (IOError, OSError):
+ sys.exit("Error: Could not open file: %s" %
+ opts.log_to_file)
+ if isinstance(frmt, CSVFormat) and f.tell() != 0:
+ return
+ print(frmt.get_banner(), file=f or sys.stdout)
+
+ def do_statline(opts, values):
+ statline = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + \
+ frmt.get_statline(keys, values)
+ print(statline, file=f or sys.stdout)
+
+ do_banner(opts)
+ banner_printed = True
while True:
try:
time.sleep(opts.set_delay)
- if line % banner_repeat == 0:
- print(frmt.get_banner())
- print(datetime.now().strftime("%Y-%m-%d %H:%M:%S") +
- frmt.get_statline(keys, stats.get()))
- line += 1
+ if signal_received:
+ banner_printed = True
+ line = 0
+ f.close()
+ do_banner(opts)
+ signal_received = False
+ if (line % banner_repeat == 0 and not banner_printed and
+ not (opts.log_to_file and isinstance(frmt, CSVFormat))):
+ do_banner(opts)
+ banner_printed = True
+ values = stats.get()
+ if (not opts.skip_zero_records or
+ any(values[k].delta != 0 for k in keys)):
+ do_statline(opts, values)
+ line += 1
+ banner_printed = False
except KeyboardInterrupt:
break
+ if opts.log_to_file:
+ f.close()
+
+
+def handle_signal(sig, frame):
+ global signal_received
+
+ signal_received = True
+
+ return
+
def is_delay_valid(delay):
"""Verify delay is in valid value range."""
argparser.add_argument('-c', '--csv',
action='store_true',
default=False,
- help='log in csv format - requires option -l/--log',
+ help='log in csv format - requires option -l/-L',
)
argparser.add_argument('-d', '--debugfs',
action='store_true',
default=False,
help='run in logging mode (like vmstat)',
)
+ argparser.add_argument('-L', '--log-to-file',
+ type=str,
+ metavar='FILE',
+ help="like '--log', but logging to a file"
+ )
argparser.add_argument('-p', '--pid',
type=int,
default=0,
default=False,
help='retrieve statistics from tracepoints',
)
+ argparser.add_argument('-z', '--skip-zero-records',
+ action='store_true',
+ default=False,
+ help='omit records with all zeros in logging mode',
+ )
options = argparser.parse_args()
- if options.csv and not options.log:
+ if options.csv and not (options.log or options.log_to_file):
sys.exit('Error: Option -c/--csv requires -l/--log')
+ if options.skip_zero_records and not (options.log or options.log_to_file):
+ sys.exit('Error: Option -z/--skip-zero-records requires -l/-L')
try:
# verify that we were passed a valid regex up front
re.compile(options.fields)
sys.stdout.write(' ' + '\n '.join(sorted(set(event_list))) + '\n')
sys.exit(0)
- if options.log:
+ if options.log or options.log_to_file:
+ if options.log_to_file:
+ signal.signal(signal.SIGHUP, handle_signal)
keys = sorted(stats.get().keys())
if options.csv:
frmt = CSVFormat(keys)
--- /dev/null
+# SPDX-License-Identifier: GPL-2.0-only
+
+[Unit]
+Description=Service that logs KVM kernel module trace events
+Before=qemu-kvm.service
+
+[Service]
+Type=simple
+ExecStart=/usr/bin/kvm_stat -dtcz -s 10 -L /var/log/kvm_stat.csv
+ExecReload=/bin/kill -HUP $MAINPID
+Restart=always
+SyslogIdentifier=kvm_stat
+SyslogLevel=debug
+
+[Install]
+WantedBy=multi-user.target
run in batch mode for one second
-c::
---csv=<file>::
- log in csv format - requires option -l/--log
+--csv::
+ log in csv format. Requires option -l/--log or -L/--log-to-file.
+ When used with option -L/--log-to-file, the header is only ever
+ written to start of file to preserve the format.
-d::
--debugfs::
--log::
run in logging mode (like vmstat)
+
+-L<file>::
+--log-to-file=<file>::
+ like -l/--log, but logging to a file. Appends to existing files.
+
-p<pid>::
--pid=<pid>::
limit statistics to one virtual machine (pid)
--tracepoints::
retrieve statistics from tracepoints
+*z*::
+--skip-zero-records::
+ omit records with all zeros in logging mode
+
SEE ALSO
--------
'perf'(1), 'trace-cmd'(1)
/x86_64/hyperv_cpuid
/x86_64/mmio_warning_test
/x86_64/platform_info_test
-/x86_64/set_memory_region_test
/x86_64/set_sregs_test
/x86_64/smm_test
/x86_64/state_test
/demand_paging_test
/dirty_log_test
/kvm_create_max_vcpus
+/set_memory_region_test
/steal_time
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
-TEST_GEN_PROGS_x86_64 += x86_64/set_memory_region_test
TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
TEST_GEN_PROGS_x86_64 += x86_64/smm_test
TEST_GEN_PROGS_x86_64 += x86_64/state_test
TEST_GEN_PROGS_x86_64 += demand_paging_test
TEST_GEN_PROGS_x86_64 += dirty_log_test
TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
+TEST_GEN_PROGS_x86_64 += set_memory_region_test
TEST_GEN_PROGS_x86_64 += steal_time
TEST_GEN_PROGS_aarch64 += clear_dirty_log_test
TEST_GEN_PROGS_aarch64 += demand_paging_test
TEST_GEN_PROGS_aarch64 += dirty_log_test
TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus
+TEST_GEN_PROGS_aarch64 += set_memory_region_test
TEST_GEN_PROGS_aarch64 += steal_time
TEST_GEN_PROGS_s390x = s390x/memop
TEST_GEN_PROGS_s390x += demand_paging_test
TEST_GEN_PROGS_s390x += dirty_log_test
TEST_GEN_PROGS_s390x += kvm_create_max_vcpus
+TEST_GEN_PROGS_s390x += set_memory_region_test
TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M))
LIBKVM += $(LIBKVM_$(UNAME_M))
#include "test_util.h"
#include "asm/kvm.h"
+#include "linux/list.h"
#include "linux/kvm.h"
#include <sys/ioctl.h>
void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg);
void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags);
void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa);
+void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot);
void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid);
vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
uint32_t data_memslot, uint32_t pgd_memslot);
unsigned int vm_get_page_size(struct kvm_vm *vm);
unsigned int vm_get_page_shift(struct kvm_vm *vm);
unsigned int vm_get_max_gfn(struct kvm_vm *vm);
+int vm_get_fd(struct kvm_vm *vm);
unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size);
unsigned int vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages);
#define GUEST_SYNC(stage) ucall(UCALL_SYNC, 2, "hello", stage)
#define GUEST_DONE() ucall(UCALL_DONE, 0)
-#define GUEST_ASSERT(_condition) do { \
- if (!(_condition)) \
- ucall(UCALL_ABORT, 2, \
- "Failed guest assert: " \
- #_condition, __LINE__); \
+#define __GUEST_ASSERT(_condition, _nargs, _args...) do { \
+ if (!(_condition)) \
+ ucall(UCALL_ABORT, 2 + _nargs, \
+ "Failed guest assert: " \
+ #_condition, __LINE__, _args); \
} while (0)
+#define GUEST_ASSERT(_condition) \
+ __GUEST_ASSERT((_condition), 0, 0)
+
+#define GUEST_ASSERT_1(_condition, arg1) \
+ __GUEST_ASSERT((_condition), 1, (arg1))
+
+#define GUEST_ASSERT_2(_condition, arg1, arg2) \
+ __GUEST_ASSERT((_condition), 2, (arg1), (arg2))
+
+#define GUEST_ASSERT_3(_condition, arg1, arg2, arg3) \
+ __GUEST_ASSERT((_condition), 3, (arg1), (arg2), (arg3))
+
+#define GUEST_ASSERT_4(_condition, arg1, arg2, arg3, arg4) \
+ __GUEST_ASSERT((_condition), 4, (arg1), (arg2), (arg3), (arg4))
+
#endif /* SELFTEST_KVM_UTIL_H */
vm = calloc(1, sizeof(*vm));
TEST_ASSERT(vm != NULL, "Insufficient Memory");
+ INIT_LIST_HEAD(&vm->vcpus);
+ INIT_LIST_HEAD(&vm->userspace_mem_regions);
+
vm->mode = mode;
vm->type = 0;
if (vmp->has_irqchip)
vm_create_irqchip(vmp);
- for (region = vmp->userspace_mem_region_head; region;
- region = region->next) {
+ list_for_each_entry(region, &vmp->userspace_mem_regions, list) {
int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region);
TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
" rc: %i errno: %i\n"
{
struct userspace_mem_region *region;
- for (region = vm->userspace_mem_region_head; region;
- region = region->next) {
+ list_for_each_entry(region, &vm->userspace_mem_regions, list) {
uint64_t existing_start = region->region.guest_phys_addr;
uint64_t existing_end = region->region.guest_phys_addr
+ region->region.memory_size - 1;
*/
struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid)
{
- struct vcpu *vcpup;
+ struct vcpu *vcpu;
- for (vcpup = vm->vcpu_head; vcpup; vcpup = vcpup->next) {
- if (vcpup->id == vcpuid)
- return vcpup;
+ list_for_each_entry(vcpu, &vm->vcpus, list) {
+ if (vcpu->id == vcpuid)
+ return vcpu;
}
return NULL;
* VM VCPU Remove
*
* Input Args:
- * vm - Virtual Machine
- * vcpuid - VCPU ID
+ * vcpu - VCPU to remove
*
* Output Args: None
*
* Return: None, TEST_ASSERT failures for all error conditions
*
- * Within the VM specified by vm, removes the VCPU given by vcpuid.
+ * Removes a vCPU from a VM and frees its resources.
*/
-static void vm_vcpu_rm(struct kvm_vm *vm, uint32_t vcpuid)
+static void vm_vcpu_rm(struct vcpu *vcpu)
{
- struct vcpu *vcpu = vcpu_find(vm, vcpuid);
int ret;
ret = munmap(vcpu->state, sizeof(*vcpu->state));
TEST_ASSERT(ret == 0, "Close of VCPU fd failed, rc: %i "
"errno: %i", ret, errno);
- if (vcpu->next)
- vcpu->next->prev = vcpu->prev;
- if (vcpu->prev)
- vcpu->prev->next = vcpu->next;
- else
- vm->vcpu_head = vcpu->next;
+ list_del(&vcpu->list);
free(vcpu);
}
void kvm_vm_release(struct kvm_vm *vmp)
{
+ struct vcpu *vcpu, *tmp;
int ret;
- while (vmp->vcpu_head)
- vm_vcpu_rm(vmp, vmp->vcpu_head->id);
+ list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list)
+ vm_vcpu_rm(vcpu);
ret = close(vmp->fd);
TEST_ASSERT(ret == 0, "Close of vm fd failed,\n"
" vmp->kvm_fd: %i rc: %i errno: %i", vmp->kvm_fd, ret, errno);
}
+static void __vm_mem_region_delete(struct kvm_vm *vm,
+ struct userspace_mem_region *region)
+{
+ int ret;
+
+ list_del(®ion->list);
+
+ region->region.memory_size = 0;
+ ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region);
+ TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed, "
+ "rc: %i errno: %i", ret, errno);
+
+ sparsebit_free(®ion->unused_phy_pages);
+ ret = munmap(region->mmap_start, region->mmap_size);
+ TEST_ASSERT(ret == 0, "munmap failed, rc: %i errno: %i", ret, errno);
+
+ free(region);
+}
+
/*
* Destroys and frees the VM pointed to by vmp.
*/
void kvm_vm_free(struct kvm_vm *vmp)
{
- int ret;
+ struct userspace_mem_region *region, *tmp;
if (vmp == NULL)
return;
/* Free userspace_mem_regions. */
- while (vmp->userspace_mem_region_head) {
- struct userspace_mem_region *region
- = vmp->userspace_mem_region_head;
-
- region->region.memory_size = 0;
- ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION,
- ®ion->region);
- TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed, "
- "rc: %i errno: %i", ret, errno);
-
- vmp->userspace_mem_region_head = region->next;
- sparsebit_free(®ion->unused_phy_pages);
- ret = munmap(region->mmap_start, region->mmap_size);
- TEST_ASSERT(ret == 0, "munmap failed, rc: %i errno: %i",
- ret, errno);
-
- free(region);
- }
+ list_for_each_entry_safe(region, tmp, &vmp->userspace_mem_regions, list)
+ __vm_mem_region_delete(vmp, region);
/* Free sparsebit arrays. */
sparsebit_free(&vmp->vpages_valid);
(uint64_t) region->region.memory_size);
/* Confirm no region with the requested slot already exists. */
- for (region = vm->userspace_mem_region_head; region;
- region = region->next) {
- if (region->region.slot == slot)
- break;
- }
- if (region != NULL)
+ list_for_each_entry(region, &vm->userspace_mem_regions, list) {
+ if (region->region.slot != slot)
+ continue;
+
TEST_FAIL("A mem region with the requested slot "
"already exists.\n"
" requested slot: %u paddr: 0x%lx npages: 0x%lx\n"
region->region.slot,
(uint64_t) region->region.guest_phys_addr,
(uint64_t) region->region.memory_size);
+ }
/* Allocate and initialize new mem region structure. */
region = calloc(1, sizeof(*region));
guest_paddr, (uint64_t) region->region.memory_size);
/* Add to linked-list of memory regions. */
- if (vm->userspace_mem_region_head)
- vm->userspace_mem_region_head->prev = region;
- region->next = vm->userspace_mem_region_head;
- vm->userspace_mem_region_head = region;
+ list_add(®ion->list, &vm->userspace_mem_regions);
}
/*
{
struct userspace_mem_region *region;
- for (region = vm->userspace_mem_region_head; region;
- region = region->next) {
+ list_for_each_entry(region, &vm->userspace_mem_regions, list) {
if (region->region.slot == memslot)
- break;
- }
- if (region == NULL) {
- fprintf(stderr, "No mem region with the requested slot found,\n"
- " requested slot: %u\n", memslot);
- fputs("---- vm dump ----\n", stderr);
- vm_dump(stderr, vm, 2);
- TEST_FAIL("Mem region not found");
+ return region;
}
- return region;
+ fprintf(stderr, "No mem region with the requested slot found,\n"
+ " requested slot: %u\n", memslot);
+ fputs("---- vm dump ----\n", stderr);
+ vm_dump(stderr, vm, 2);
+ TEST_FAIL("Mem region not found");
+ return NULL;
}
/*
ret, errno, slot, new_gpa);
}
+/*
+ * VM Memory Region Delete
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * slot - Slot of the memory region to delete
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Delete a memory region.
+ */
+void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot)
+{
+ __vm_mem_region_delete(vm, memslot2region(vm, slot));
+}
+
/*
* VCPU mmap Size
*
"vcpu id: %u errno: %i", vcpuid, errno);
/* Add to linked-list of VCPUs. */
- if (vm->vcpu_head)
- vm->vcpu_head->prev = vcpu;
- vcpu->next = vm->vcpu_head;
- vm->vcpu_head = vcpu;
+ list_add(&vcpu->list, &vm->vcpus);
}
/*
void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa)
{
struct userspace_mem_region *region;
- for (region = vm->userspace_mem_region_head; region;
- region = region->next) {
+
+ list_for_each_entry(region, &vm->userspace_mem_regions, list) {
if ((gpa >= region->region.guest_phys_addr)
&& (gpa <= (region->region.guest_phys_addr
+ region->region.memory_size - 1)))
vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva)
{
struct userspace_mem_region *region;
- for (region = vm->userspace_mem_region_head; region;
- region = region->next) {
+
+ list_for_each_entry(region, &vm->userspace_mem_regions, list) {
if ((hva >= region->host_mem)
&& (hva <= (region->host_mem
+ region->region.memory_size - 1)))
fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd);
fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size);
fprintf(stream, "%*sMem Regions:\n", indent, "");
- for (region = vm->userspace_mem_region_head; region;
- region = region->next) {
+ list_for_each_entry(region, &vm->userspace_mem_regions, list) {
fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx "
"host_virt: %p\n", indent + 2, "",
(uint64_t) region->region.guest_phys_addr,
virt_dump(stream, vm, indent + 4);
}
fprintf(stream, "%*sVCPUs:\n", indent, "");
- for (vcpu = vm->vcpu_head; vcpu; vcpu = vcpu->next)
+ list_for_each_entry(vcpu, &vm->vcpus, list)
vcpu_dump(stream, vm, vcpu->id, indent + 2);
}
return vm->max_gfn;
}
+int vm_get_fd(struct kvm_vm *vm)
+{
+ return vm->fd;
+}
+
static unsigned int vm_calc_num_pages(unsigned int num_pages,
unsigned int page_shift,
unsigned int new_page_shift,
#define KVM_DEV_PATH "/dev/kvm"
struct userspace_mem_region {
- struct userspace_mem_region *next, *prev;
struct kvm_userspace_memory_region region;
struct sparsebit *unused_phy_pages;
int fd;
void *host_mem;
void *mmap_start;
size_t mmap_size;
+ struct list_head list;
};
struct vcpu {
- struct vcpu *next, *prev;
+ struct list_head list;
uint32_t id;
int fd;
struct kvm_run *state;
unsigned int pa_bits;
unsigned int va_bits;
uint64_t max_gfn;
- struct vcpu *vcpu_head;
- struct userspace_mem_region *userspace_mem_region_head;
+ struct list_head vcpus;
+ struct list_head userspace_mem_regions;
struct sparsebit *vpages_valid;
struct sparsebit *vpages_mapped;
bool has_irqchip;
void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent)
{
- struct vcpu *vcpu = vm->vcpu_head;
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+
+ if (!vcpu)
+ return;
fprintf(stream, "%*spstate: psw: 0x%.16llx:0x%.16llx\n",
indent, "", vcpu->state->psw_mask, vcpu->state->psw_addr);
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <pthread.h>
+#include <sched.h>
+#include <semaphore.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include <linux/compiler.h>
+
+#include <test_util.h>
+#include <kvm_util.h>
+#include <processor.h>
+
+#define VCPU_ID 0
+
+/*
+ * s390x needs at least 1MB alignment, and the x86_64 MOVE/DELETE tests need a
+ * 2MB sized and aligned region so that the initial region corresponds to
+ * exactly one large page.
+ */
+#define MEM_REGION_SIZE 0x200000
+
+#ifdef __x86_64__
+/*
+ * Somewhat arbitrary location and slot, intended to not overlap anything.
+ */
+#define MEM_REGION_GPA 0xc0000000
+#define MEM_REGION_SLOT 10
+
+static const uint64_t MMIO_VAL = 0xbeefull;
+
+extern const uint64_t final_rip_start;
+extern const uint64_t final_rip_end;
+
+static sem_t vcpu_ready;
+
+static inline uint64_t guest_spin_on_val(uint64_t spin_val)
+{
+ uint64_t val;
+
+ do {
+ val = READ_ONCE(*((uint64_t *)MEM_REGION_GPA));
+ } while (val == spin_val);
+
+ GUEST_SYNC(0);
+ return val;
+}
+
+static void *vcpu_worker(void *data)
+{
+ struct kvm_vm *vm = data;
+ struct kvm_run *run;
+ struct ucall uc;
+ uint64_t cmd;
+
+ /*
+ * Loop until the guest is done. Re-enter the guest on all MMIO exits,
+ * which will occur if the guest attempts to access a memslot after it
+ * has been deleted or while it is being moved .
+ */
+ run = vcpu_state(vm, VCPU_ID);
+
+ while (1) {
+ vcpu_run(vm, VCPU_ID);
+
+ if (run->exit_reason == KVM_EXIT_IO) {
+ cmd = get_ucall(vm, VCPU_ID, &uc);
+ if (cmd != UCALL_SYNC)
+ break;
+
+ sem_post(&vcpu_ready);
+ continue;
+ }
+
+ if (run->exit_reason != KVM_EXIT_MMIO)
+ break;
+
+ TEST_ASSERT(!run->mmio.is_write, "Unexpected exit mmio write");
+ TEST_ASSERT(run->mmio.len == 8,
+ "Unexpected exit mmio size = %u", run->mmio.len);
+
+ TEST_ASSERT(run->mmio.phys_addr == MEM_REGION_GPA,
+ "Unexpected exit mmio address = 0x%llx",
+ run->mmio.phys_addr);
+ memcpy(run->mmio.data, &MMIO_VAL, 8);
+ }
+
+ if (run->exit_reason == KVM_EXIT_IO && cmd == UCALL_ABORT)
+ TEST_FAIL("%s at %s:%ld, val = %lu", (const char *)uc.args[0],
+ __FILE__, uc.args[1], uc.args[2]);
+
+ return NULL;
+}
+
+static void wait_for_vcpu(void)
+{
+ struct timespec ts;
+
+ TEST_ASSERT(!clock_gettime(CLOCK_REALTIME, &ts),
+ "clock_gettime() failed: %d\n", errno);
+
+ ts.tv_sec += 2;
+ TEST_ASSERT(!sem_timedwait(&vcpu_ready, &ts),
+ "sem_timedwait() failed: %d\n", errno);
+
+ /* Wait for the vCPU thread to reenter the guest. */
+ usleep(100000);
+}
+
+static struct kvm_vm *spawn_vm(pthread_t *vcpu_thread, void *guest_code)
+{
+ struct kvm_vm *vm;
+ uint64_t *hva;
+ uint64_t gpa;
+
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_THP,
+ MEM_REGION_GPA, MEM_REGION_SLOT,
+ MEM_REGION_SIZE / getpagesize(), 0);
+
+ /*
+ * Allocate and map two pages so that the GPA accessed by guest_code()
+ * stays valid across the memslot move.
+ */
+ gpa = vm_phy_pages_alloc(vm, 2, MEM_REGION_GPA, MEM_REGION_SLOT);
+ TEST_ASSERT(gpa == MEM_REGION_GPA, "Failed vm_phy_pages_alloc\n");
+
+ virt_map(vm, MEM_REGION_GPA, MEM_REGION_GPA, 2, 0);
+
+ /* Ditto for the host mapping so that both pages can be zeroed. */
+ hva = addr_gpa2hva(vm, MEM_REGION_GPA);
+ memset(hva, 0, 2 * 4096);
+
+ pthread_create(vcpu_thread, NULL, vcpu_worker, vm);
+
+ /* Ensure the guest thread is spun up. */
+ wait_for_vcpu();
+
+ return vm;
+}
+
+
+static void guest_code_move_memory_region(void)
+{
+ uint64_t val;
+
+ GUEST_SYNC(0);
+
+ /*
+ * Spin until the memory region is moved to a misaligned address. This
+ * may or may not trigger MMIO, as the window where the memslot is
+ * invalid is quite small.
+ */
+ val = guest_spin_on_val(0);
+ GUEST_ASSERT_1(val == 1 || val == MMIO_VAL, val);
+
+ /* Spin until the memory region is realigned. */
+ val = guest_spin_on_val(MMIO_VAL);
+ GUEST_ASSERT_1(val == 1, val);
+
+ GUEST_DONE();
+}
+
+static void test_move_memory_region(void)
+{
+ pthread_t vcpu_thread;
+ struct kvm_vm *vm;
+ uint64_t *hva;
+
+ vm = spawn_vm(&vcpu_thread, guest_code_move_memory_region);
+
+ hva = addr_gpa2hva(vm, MEM_REGION_GPA);
+
+ /*
+ * Shift the region's base GPA. The guest should not see "2" as the
+ * hva->gpa translation is misaligned, i.e. the guest is accessing a
+ * different host pfn.
+ */
+ vm_mem_region_move(vm, MEM_REGION_SLOT, MEM_REGION_GPA - 4096);
+ WRITE_ONCE(*hva, 2);
+
+ /*
+ * The guest _might_ see an invalid memslot and trigger MMIO, but it's
+ * a tiny window. Spin and defer the sync until the memslot is
+ * restored and guest behavior is once again deterministic.
+ */
+ usleep(100000);
+
+ /*
+ * Note, value in memory needs to be changed *before* restoring the
+ * memslot, else the guest could race the update and see "2".
+ */
+ WRITE_ONCE(*hva, 1);
+
+ /* Restore the original base, the guest should see "1". */
+ vm_mem_region_move(vm, MEM_REGION_SLOT, MEM_REGION_GPA);
+ wait_for_vcpu();
+ /* Defered sync from when the memslot was misaligned (above). */
+ wait_for_vcpu();
+
+ pthread_join(vcpu_thread, NULL);
+
+ kvm_vm_free(vm);
+}
+
+static void guest_code_delete_memory_region(void)
+{
+ uint64_t val;
+
+ GUEST_SYNC(0);
+
+ /* Spin until the memory region is deleted. */
+ val = guest_spin_on_val(0);
+ GUEST_ASSERT_1(val == MMIO_VAL, val);
+
+ /* Spin until the memory region is recreated. */
+ val = guest_spin_on_val(MMIO_VAL);
+ GUEST_ASSERT_1(val == 0, val);
+
+ /* Spin until the memory region is deleted. */
+ val = guest_spin_on_val(0);
+ GUEST_ASSERT_1(val == MMIO_VAL, val);
+
+ asm("1:\n\t"
+ ".pushsection .rodata\n\t"
+ ".global final_rip_start\n\t"
+ "final_rip_start: .quad 1b\n\t"
+ ".popsection");
+
+ /* Spin indefinitely (until the code memslot is deleted). */
+ guest_spin_on_val(MMIO_VAL);
+
+ asm("1:\n\t"
+ ".pushsection .rodata\n\t"
+ ".global final_rip_end\n\t"
+ "final_rip_end: .quad 1b\n\t"
+ ".popsection");
+
+ GUEST_ASSERT_1(0, 0);
+}
+
+static void test_delete_memory_region(void)
+{
+ pthread_t vcpu_thread;
+ struct kvm_regs regs;
+ struct kvm_run *run;
+ struct kvm_vm *vm;
+
+ vm = spawn_vm(&vcpu_thread, guest_code_delete_memory_region);
+
+ /* Delete the memory region, the guest should not die. */
+ vm_mem_region_delete(vm, MEM_REGION_SLOT);
+ wait_for_vcpu();
+
+ /* Recreate the memory region. The guest should see "0". */
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_THP,
+ MEM_REGION_GPA, MEM_REGION_SLOT,
+ MEM_REGION_SIZE / getpagesize(), 0);
+ wait_for_vcpu();
+
+ /* Delete the region again so that there's only one memslot left. */
+ vm_mem_region_delete(vm, MEM_REGION_SLOT);
+ wait_for_vcpu();
+
+ /*
+ * Delete the primary memslot. This should cause an emulation error or
+ * shutdown due to the page tables getting nuked.
+ */
+ vm_mem_region_delete(vm, 0);
+
+ pthread_join(vcpu_thread, NULL);
+
+ run = vcpu_state(vm, VCPU_ID);
+
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN ||
+ run->exit_reason == KVM_EXIT_INTERNAL_ERROR,
+ "Unexpected exit reason = %d", run->exit_reason);
+
+ vcpu_regs_get(vm, VCPU_ID, ®s);
+
+ /*
+ * On AMD, after KVM_EXIT_SHUTDOWN the VMCB has been reinitialized already,
+ * so the instruction pointer would point to the reset vector.
+ */
+ if (run->exit_reason == KVM_EXIT_INTERNAL_ERROR)
+ TEST_ASSERT(regs.rip >= final_rip_start &&
+ regs.rip < final_rip_end,
+ "Bad rip, expected 0x%lx - 0x%lx, got 0x%llx\n",
+ final_rip_start, final_rip_end, regs.rip);
+
+ kvm_vm_free(vm);
+}
+
+static void test_zero_memory_regions(void)
+{
+ struct kvm_run *run;
+ struct kvm_vm *vm;
+
+ pr_info("Testing KVM_RUN with zero added memory regions\n");
+
+ vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR);
+ vm_vcpu_add(vm, VCPU_ID);
+
+ TEST_ASSERT(!ioctl(vm_get_fd(vm), KVM_SET_NR_MMU_PAGES, 64),
+ "KVM_SET_NR_MMU_PAGES failed, errno = %d\n", errno);
+ vcpu_run(vm, VCPU_ID);
+
+ run = vcpu_state(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR,
+ "Unexpected exit_reason = %u\n", run->exit_reason);
+
+ kvm_vm_free(vm);
+}
+#endif /* __x86_64__ */
+
+/*
+ * Test it can be added memory slots up to KVM_CAP_NR_MEMSLOTS, then any
+ * tentative to add further slots should fail.
+ */
+static void test_add_max_memory_regions(void)
+{
+ int ret;
+ struct kvm_vm *vm;
+ uint32_t max_mem_slots;
+ uint32_t slot;
+ uint64_t guest_addr = 0x0;
+ uint64_t mem_reg_npages;
+ void *mem;
+
+ max_mem_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS);
+ TEST_ASSERT(max_mem_slots > 0,
+ "KVM_CAP_NR_MEMSLOTS should be greater than 0");
+ pr_info("Allowed number of memory slots: %i\n", max_mem_slots);
+
+ vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR);
+
+ mem_reg_npages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, MEM_REGION_SIZE);
+
+ /* Check it can be added memory slots up to the maximum allowed */
+ pr_info("Adding slots 0..%i, each memory region with %dK size\n",
+ (max_mem_slots - 1), MEM_REGION_SIZE >> 10);
+ for (slot = 0; slot < max_mem_slots; slot++) {
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+ guest_addr, slot, mem_reg_npages,
+ 0);
+ guest_addr += MEM_REGION_SIZE;
+ }
+
+ /* Check it cannot be added memory slots beyond the limit */
+ mem = mmap(NULL, MEM_REGION_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ TEST_ASSERT(mem != MAP_FAILED, "Failed to mmap() host");
+
+ ret = ioctl(vm_get_fd(vm), KVM_SET_USER_MEMORY_REGION,
+ &(struct kvm_userspace_memory_region) {slot, 0, guest_addr,
+ MEM_REGION_SIZE, (uint64_t) mem});
+ TEST_ASSERT(ret == -1 && errno == EINVAL,
+ "Adding one more memory slot should fail with EINVAL");
+
+ munmap(mem, MEM_REGION_SIZE);
+ kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+#ifdef __x86_64__
+ int i, loops;
+#endif
+
+ /* Tell stdout not to buffer its content */
+ setbuf(stdout, NULL);
+
+#ifdef __x86_64__
+ /*
+ * FIXME: the zero-memslot test fails on aarch64 and s390x because
+ * KVM_RUN fails with ENOEXEC or EFAULT.
+ */
+ test_zero_memory_regions();
+#endif
+
+ test_add_max_memory_regions();
+
+#ifdef __x86_64__
+ if (argc > 1)
+ loops = atoi(argv[1]);
+ else
+ loops = 10;
+
+ pr_info("Testing MOVE of in-use region, %d loops\n", loops);
+ for (i = 0; i < loops; i++)
+ test_move_memory_region();
+
+ pr_info("Testing DELETE of in-use region, %d loops\n", loops);
+ for (i = 0; i < loops; i++)
+ test_delete_memory_region();
+#endif
+
+ return 0;
+}
+++ /dev/null
-// SPDX-License-Identifier: GPL-2.0
-#define _GNU_SOURCE /* for program_invocation_short_name */
-#include <fcntl.h>
-#include <pthread.h>
-#include <sched.h>
-#include <signal.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/ioctl.h>
-
-#include <linux/compiler.h>
-
-#include <test_util.h>
-#include <kvm_util.h>
-#include <processor.h>
-
-#define VCPU_ID 0
-
-/*
- * Somewhat arbitrary location and slot, intended to not overlap anything. The
- * location and size are specifically 2mb sized/aligned so that the initial
- * region corresponds to exactly one large page.
- */
-#define MEM_REGION_GPA 0xc0000000
-#define MEM_REGION_SIZE 0x200000
-#define MEM_REGION_SLOT 10
-
-static void guest_code(void)
-{
- uint64_t val;
-
- do {
- val = READ_ONCE(*((uint64_t *)MEM_REGION_GPA));
- } while (!val);
-
- if (val != 1)
- ucall(UCALL_ABORT, 1, val);
-
- GUEST_DONE();
-}
-
-static void *vcpu_worker(void *data)
-{
- struct kvm_vm *vm = data;
- struct kvm_run *run;
- struct ucall uc;
- uint64_t cmd;
-
- /*
- * Loop until the guest is done. Re-enter the guest on all MMIO exits,
- * which will occur if the guest attempts to access a memslot while it
- * is being moved.
- */
- run = vcpu_state(vm, VCPU_ID);
- do {
- vcpu_run(vm, VCPU_ID);
- } while (run->exit_reason == KVM_EXIT_MMIO);
-
- TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
- "Unexpected exit reason = %d", run->exit_reason);
-
- cmd = get_ucall(vm, VCPU_ID, &uc);
- TEST_ASSERT(cmd == UCALL_DONE, "Unexpected val in guest = %lu", uc.args[0]);
- return NULL;
-}
-
-static void test_move_memory_region(void)
-{
- pthread_t vcpu_thread;
- struct kvm_vm *vm;
- uint64_t *hva;
- uint64_t gpa;
-
- vm = vm_create_default(VCPU_ID, 0, guest_code);
-
- vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
-
- vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_THP,
- MEM_REGION_GPA, MEM_REGION_SLOT,
- MEM_REGION_SIZE / getpagesize(), 0);
-
- /*
- * Allocate and map two pages so that the GPA accessed by guest_code()
- * stays valid across the memslot move.
- */
- gpa = vm_phy_pages_alloc(vm, 2, MEM_REGION_GPA, MEM_REGION_SLOT);
- TEST_ASSERT(gpa == MEM_REGION_GPA, "Failed vm_phy_pages_alloc\n");
-
- virt_map(vm, MEM_REGION_GPA, MEM_REGION_GPA, 2, 0);
-
- /* Ditto for the host mapping so that both pages can be zeroed. */
- hva = addr_gpa2hva(vm, MEM_REGION_GPA);
- memset(hva, 0, 2 * 4096);
-
- pthread_create(&vcpu_thread, NULL, vcpu_worker, vm);
-
- /* Ensure the guest thread is spun up. */
- usleep(100000);
-
- /*
- * Shift the region's base GPA. The guest should not see "2" as the
- * hva->gpa translation is misaligned, i.e. the guest is accessing a
- * different host pfn.
- */
- vm_mem_region_move(vm, MEM_REGION_SLOT, MEM_REGION_GPA - 4096);
- WRITE_ONCE(*hva, 2);
-
- usleep(100000);
-
- /*
- * Note, value in memory needs to be changed *before* restoring the
- * memslot, else the guest could race the update and see "2".
- */
- WRITE_ONCE(*hva, 1);
-
- /* Restore the original base, the guest should see "1". */
- vm_mem_region_move(vm, MEM_REGION_SLOT, MEM_REGION_GPA);
-
- pthread_join(vcpu_thread, NULL);
-
- kvm_vm_free(vm);
-}
-
-int main(int argc, char *argv[])
-{
- int i, loops;
-
- /* Tell stdout not to buffer its content */
- setbuf(stdout, NULL);
-
- if (argc > 1)
- loops = atoi(argv[1]);
- else
- loops = 10;
-
- for (i = 0; i < loops; i++)
- test_move_memory_region();
-
- return 0;
-}
/**
* kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
* @vcpu: The VCPU pointer
- * @run: The kvm_run structure pointer used for userspace state exchange
*
* This function is called through the VCPU_RUN ioctl called from user space. It
* will execute VM code in a loop until the time slice for the process is used
* return with return value 0 and with the kvm_run structure filled in with the
* required data for the requested emulation.
*/
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
+ struct kvm_run *run = vcpu->run;
int ret;
if (unlikely(!kvm_vcpu_initialized(vcpu)))
return ret;
if (run->exit_reason == KVM_EXIT_MMIO) {
- ret = kvm_handle_mmio_return(vcpu, vcpu->run);
+ ret = kvm_handle_mmio_return(vcpu, run);
if (ret)
return ret;
}
goto out_err_no_arch_destroy_vm;
}
+ kvm->max_halt_poll_ns = halt_poll_ns;
+
r = kvm_arch_init_vm(kvm, type);
if (r)
goto out_err_no_arch_destroy_vm;
{
return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
}
+EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);
bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
- if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS ||
- memslot->flags & KVM_MEMSLOT_INVALID)
- return false;
-
- return true;
+ return kvm_is_visible_memslot(memslot);
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
if (!kvm_arch_no_poll(vcpu)) {
if (!vcpu_valid_wakeup(vcpu)) {
shrink_halt_poll_ns(vcpu);
- } else if (halt_poll_ns) {
+ } else if (vcpu->kvm->max_halt_poll_ns) {
if (block_ns <= vcpu->halt_poll_ns)
;
/* we had a long block, shrink polling */
- else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
+ else if (vcpu->halt_poll_ns &&
+ block_ns > vcpu->kvm->max_halt_poll_ns)
shrink_halt_poll_ns(vcpu);
/* we had a short halt and our poll time is too small */
- else if (vcpu->halt_poll_ns < halt_poll_ns &&
- block_ns < halt_poll_ns)
+ else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns &&
+ block_ns < vcpu->kvm->max_halt_poll_ns)
grow_halt_poll_ns(vcpu);
} else {
vcpu->halt_poll_ns = 0;
if (r)
goto vcpu_free_run_page;
- kvm_create_vcpu_debugfs(vcpu);
-
mutex_lock(&kvm->lock);
if (kvm_get_vcpu_by_id(kvm, id)) {
r = -EEXIST;
mutex_unlock(&kvm->lock);
kvm_arch_vcpu_postcreate(vcpu);
+ kvm_create_vcpu_debugfs(vcpu);
return r;
unlock_vcpu_destroy:
mutex_unlock(&kvm->lock);
- debugfs_remove_recursive(vcpu->debugfs_dentry);
kvm_arch_vcpu_destroy(vcpu);
vcpu_free_run_page:
free_page((unsigned long)vcpu->run);
synchronize_rcu();
put_pid(oldpid);
}
- r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
+ r = kvm_arch_vcpu_ioctl_run(vcpu);
trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
break;
}
case KVM_SET_REGS: {
struct kvm_regs *kvm_regs;
- r = -ENOMEM;
kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
if (IS_ERR(kvm_regs)) {
r = PTR_ERR(kvm_regs);
case KVM_CAP_IOEVENTFD_ANY_LENGTH:
case KVM_CAP_CHECK_EXTENSION_VM:
case KVM_CAP_ENABLE_CAP_VM:
+ case KVM_CAP_HALT_POLL:
return 1;
#ifdef CONFIG_KVM_MMIO
case KVM_CAP_COALESCED_MMIO:
return 0;
}
#endif
+ case KVM_CAP_HALT_POLL: {
+ if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
+ return -EINVAL;
+
+ kvm->max_halt_poll_ns = cap->args[0];
+ return 0;
+ }
default:
return kvm_vm_ioctl_enable_cap(kvm, cap);
}