Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author     Linus Torvalds <torvalds@linux-foundation.org>
           Sun, 9 Oct 2022 16:39:55 +0000 (09:39 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Sun, 9 Oct 2022 16:39:55 +0000 (09:39 -0700)
Pull kvm updates from Paolo Bonzini:
 "The first batch of KVM patches, mostly covering x86.

  ARM:

   - Account stage2 page table allocations in memory stats

  x86:

   - Account EPT/NPT page table allocations in memory stats

   - Tracepoint cleanups/fixes for nested VM-Enter and emulated MSR
     accesses

   - Drop eVMCS controls filtering for KVM on Hyper-V, as all known
     versions of Hyper-V now support eVMCS fields associated with
     features that are enumerated to the guest

   - Use KVM's sanitized VMCS config as the basis for the values of
     nested VMX capabilities MSRs

   - A myriad of event/exception fixes and cleanups. Most notably, pending
     exceptions morph into VM-Exits earlier, as soon as the exception is
     queued, instead of waiting until the next vmentry. This fixed a
     longstanding issue where the exceptions would incorrectly become
     double-faults instead of triggering a vmexit; the common case of
     page-fault vmexits had a special workaround, but now it's fixed for
     good

   - A handful of fixes for memory leaks in error paths

   - Cleanups for VMREAD trampoline and VMX's VM-Exit assembly flow

   - Never write to memory from non-sleepable kvm_vcpu_check_block()

   - Selftests refinements and cleanups

   - Misc typo cleanups

  Generic:

   - remove KVM_REQ_UNHALT"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (94 commits)
  KVM: remove KVM_REQ_UNHALT
  KVM: mips, x86: do not rely on KVM_REQ_UNHALT
  KVM: x86: never write to memory from kvm_vcpu_check_block()
  KVM: x86: Don't snapshot pending INIT/SIPI prior to checking nested events
  KVM: nVMX: Make event request on VMXOFF iff INIT/SIPI is pending
  KVM: nVMX: Make an event request if INIT or SIPI is pending on VM-Enter
  KVM: SVM: Make an event request if INIT or SIPI is pending when GIF is set
  KVM: x86: lapic does not have to process INIT if it is blocked
  KVM: x86: Rename kvm_apic_has_events() to make it INIT/SIPI specific
  KVM: x86: Rename and expose helper to detect if INIT/SIPI are allowed
  KVM: nVMX: Make an event request when pending an MTF nested VM-Exit
  KVM: x86: make vendor code check for all nested events
  mailmap: Update Oliver's email address
  KVM: x86: Allow force_emulation_prefix to be written without a reload
  KVM: selftests: Add an x86-only test to verify nested exception queueing
  KVM: selftests: Use uapi header to get VMX and SVM exit reasons/codes
  KVM: x86: Rename inject_pending_events() to kvm_check_and_inject_events()
  KVM: VMX: Update MTF and ICEBP comments to document KVM's subtle behavior
  KVM: x86: Treat pending TRIPLE_FAULT requests as pending exceptions
  KVM: x86: Morph pending exceptions to pending VM-Exits at queue time
  ...
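
For readers scanning the per-architecture hunks below: the KVM_REQ_UNHALT removal mostly reduces to deleting the request-clearing call that used to follow kvm_vcpu_halt().  A minimal sketch of the old pattern, assuming a generic WFI-style halt path (arch_wfi_sketch is a made-up name, not a function in this merge):

    /*
     * Sketch only: pre-series halt paths cleared KVM_REQ_UNHALT after
     * kvm_vcpu_halt(); the arm64 and s390 hunks below simply delete that
     * call, since the request no longer exists after this series.
     */
    static void arch_wfi_sketch(struct kvm_vcpu *vcpu)
    {
            kvm_vcpu_halt(vcpu);
            kvm_clear_request(KVM_REQ_UNHALT, vcpu);        /* removed */
    }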

13 files changed:
.mailmap
arch/arm64/kvm/arm.c
arch/s390/kvm/kvm-s390.c
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/cpuid.c
arch/x86/kvm/emulate.c
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/vmx/nested.c
arch/x86/kvm/x86.c
mm/page_alloc.c
mm/vmstat.c
tools/testing/selftests/kvm/Makefile
tools/testing/selftests/kvm/include/x86_64/vmx.h

diff --combined .mailmap
index 191778125ef19f6ac941d9e74da45b4f1eb1f5ae,d1f7ed1019cf4fb5b60a777d4762568487e2a709..3e63fb0b10883c814b6b629d39455609de72775b
+++ b/.mailmap
@@@ -71,9 -71,6 +71,9 @@@ Ben M Cahill <ben.m.cahill@intel.com
  Ben Widawsky <bwidawsk@kernel.org> <ben@bwidawsk.net>
  Ben Widawsky <bwidawsk@kernel.org> <ben.widawsky@intel.com>
  Ben Widawsky <bwidawsk@kernel.org> <benjamin.widawsky@intel.com>
 +Bjorn Andersson <andersson@kernel.org> <bjorn@kryo.se>
 +Bjorn Andersson <andersson@kernel.org> <bjorn.andersson@linaro.org>
 +Bjorn Andersson <andersson@kernel.org> <bjorn.andersson@sonymobile.com>
  Björn Steinbrink <B.Steinbrink@gmx.de>
  Björn Töpel <bjorn@kernel.org> <bjorn.topel@gmail.com>
  Björn Töpel <bjorn@kernel.org> <bjorn.topel@intel.com>
@@@ -101,7 -98,8 +101,7 @@@ Christian Brauner <brauner@kernel.org> 
  Christian Marangi <ansuelsmth@gmail.com>
  Christophe Ricard <christophe.ricard@gmail.com>
  Christoph Hellwig <hch@lst.de>
 -Colin Ian King <colin.king@intel.com> <colin.king@canonical.com>
 -Colin Ian King <colin.king@intel.com> <colin.i.king@gmail.com>
 +Colin Ian King <colin.i.king@gmail.com> <colin.king@canonical.com>
  Corey Minyard <minyard@acm.org>
  Damian Hobson-Garcia <dhobsong@igel.co.jp>
  Daniel Borkmann <daniel@iogearbox.net> <danborkmann@googlemail.com>
@@@ -152,8 -150,6 +152,8 @@@ Greg Kroah-Hartman <gregkh@suse.de
  Greg Kroah-Hartman <greg@kroah.com>
  Greg Kurz <groug@kaod.org> <gkurz@linux.vnet.ibm.com>
  Gregory CLEMENT <gregory.clement@bootlin.com> <gregory.clement@free-electrons.com>
 +Guilherme G. Piccoli <kernel@gpiccoli.net> <gpiccoli@linux.vnet.ibm.com>
 +Guilherme G. Piccoli <kernel@gpiccoli.net> <gpiccoli@canonical.com>
  Guo Ren <guoren@kernel.org> <guoren@linux.alibaba.com>
  Guo Ren <guoren@kernel.org> <ren_guo@c-sky.com>
  Gustavo Padovan <gustavo@las.ic.unicamp.br>
@@@ -257,7 -253,6 +257,7 @@@ Linus Lüssing <linus.luessing@c0d3.blu
  Li Yang <leoyang.li@nxp.com> <leoli@freescale.com>
  Li Yang <leoyang.li@nxp.com> <leo@zh-kernel.org>
  Lorenzo Pieralisi <lpieralisi@kernel.org> <lorenzo.pieralisi@arm.com>
 +Luca Ceresoli <luca.ceresoli@bootlin.com> <luca@lucaceresoli.net>
  Lukasz Luba <lukasz.luba@arm.com> <l.luba@partner.samsung.com>
  Maciej W. Rozycki <macro@mips.com> <macro@imgtec.com>
  Maciej W. Rozycki <macro@orcam.me.uk> <macro@linux-mips.org>
@@@ -318,7 -313,6 +318,7 @@@ Morten Welinder <welinder@troll.com
  Mythri P K <mythripk@ti.com>
  Nadia Yvette Chambers <nyc@holomorphy.com> William Lee Irwin III <wli@holomorphy.com>
  Nathan Chancellor <nathan@kernel.org> <natechancellor@gmail.com>
 +Neil Armstrong <neil.armstrong@linaro.org> <narmstrong@baylibre.com>
  Nguyen Anh Quynh <aquynh@gmail.com>
  Nicholas Piggin <npiggin@gmail.com> <npiggen@suse.de>
  Nicholas Piggin <npiggin@gmail.com> <npiggin@kernel.dk>
@@@ -336,6 -330,7 +336,7 @@@ Oleksij Rempel <linux@rempel-privat.de
  Oleksij Rempel <linux@rempel-privat.de> <fixed-term.Oleksij.Rempel@de.bosch.com>
  Oleksij Rempel <linux@rempel-privat.de> <o.rempel@pengutronix.de>
  Oleksij Rempel <linux@rempel-privat.de> <ore@pengutronix.de>
+ Oliver Upton <oliver.upton@linux.dev> <oupton@google.com>
  Pali Rohár <pali@kernel.org> <pali.rohar@gmail.com>
  Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
  Patrick Mochel <mochel@digitalimplant.org>
diff --combined arch/arm64/kvm/arm.c
index 917086be5c6b105186f7b4d8de5754738876c640,4f949b64fdc91afab531f36c55b5c177f610d03e..446f628a9de1837023f4cb354bda9d2e649fdad3
@@@ -666,7 -666,6 +666,6 @@@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu
  
        kvm_vcpu_halt(vcpu);
        vcpu_clear_flag(vcpu, IN_WFIT);
-       kvm_clear_request(KVM_REQ_UNHALT, vcpu);
  
        preempt_disable();
        vgic_v4_load(vcpu);
@@@ -2114,7 -2113,7 +2113,7 @@@ static int finalize_hyp_mode(void
         * at, which would end badly once inaccessible.
         */
        kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start);
 -      kmemleak_free_part(__va(hyp_mem_base), hyp_mem_size);
 +      kmemleak_free_part_phys(hyp_mem_base, hyp_mem_size);
        return pkvm_drop_host_privileges();
  }
  
diff --combined arch/s390/kvm/kvm-s390.c
index b7ef0b71014df3dd8d92467cad741bfbdf0f06ea,aa39ea4582bd1acfa83f33d360ba1d1b69bac8f7..45d4b8182b0734c27e70e8582d043b8ffba7b554
@@@ -505,7 -505,7 +505,7 @@@ int kvm_arch_init(void *opaque
                goto out;
        }
  
 -      if (kvm_s390_pci_interp_allowed()) {
 +      if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) {
                rc = kvm_s390_pci_init();
                if (rc) {
                        pr_err("Unable to allocate AIFT for PCI\n");
@@@ -527,7 -527,7 +527,7 @@@ out
  void kvm_arch_exit(void)
  {
        kvm_s390_gib_destroy();
 -      if (kvm_s390_pci_interp_allowed())
 +      if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM))
                kvm_s390_pci_exit();
        debug_unregister(kvm_s390_dbf);
        debug_unregister(kvm_s390_dbf_uv);
@@@ -4343,8 -4343,6 +4343,6 @@@ retry
                goto retry;
        }
  
-       /* nothing to do, just clear the request */
-       kvm_clear_request(KVM_REQ_UNHALT, vcpu);
        /* we left the vsie handler, nothing to do, just clear the request */
        kvm_clear_request(KVM_REQ_VSIE_RESTART, vcpu);
  
diff --combined arch/x86/include/asm/kvm_host.h
index aa381ab69a1911ffb5afde4e49dc26ab67edc783,d40206b16d6cc260e3141f73484390cd00bcc4fb..61b9dd34d333ec928521908971116d954db574ee
@@@ -615,6 -615,8 +615,8 @@@ struct kvm_vcpu_hv 
                u32 enlightenments_eax; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EAX */
                u32 enlightenments_ebx; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EBX */
                u32 syndbg_cap_eax; /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */
+               u32 nested_eax; /* HYPERV_CPUID_NESTED_FEATURES.EAX */
+               u32 nested_ebx; /* HYPERV_CPUID_NESTED_FEATURES.EBX */
        } cpuid_cache;
  };
  
@@@ -639,6 -641,16 +641,16 @@@ struct kvm_vcpu_xen 
        struct timer_list poll_timer;
  };
  
+ struct kvm_queued_exception {
+       bool pending;
+       bool injected;
+       bool has_error_code;
+       u8 vector;
+       u32 error_code;
+       unsigned long payload;
+       bool has_payload;
+ };
  struct kvm_vcpu_arch {
        /*
         * rip and regs accesses must go through
        struct fpu_guest guest_fpu;
  
        u64 xcr0;
 +      u64 guest_supported_xcr0;
  
        struct kvm_pio_request pio;
        void *pio_data;
  
        u8 event_exit_inst_len;
  
-       struct kvm_queued_exception {
-               bool pending;
-               bool injected;
-               bool has_error_code;
-               u8 nr;
-               u32 error_code;
-               unsigned long payload;
-               bool has_payload;
-               u8 nested_apf;
-       } exception;
+       bool exception_from_userspace;
+       /* Exceptions to be injected to the guest. */
+       struct kvm_queued_exception exception;
+       /* Exception VM-Exits to be synthesized to L1. */
+       struct kvm_queued_exception exception_vmexit;
  
        struct kvm_queued_interrupt {
                bool injected;
                u32 id;
                bool send_user_only;
                u32 host_apf_flags;
-               unsigned long nested_apf_token;
                bool delivery_as_pf_vmexit;
                bool pageready_pending;
        } apf;
@@@ -1524,7 -1530,7 +1531,7 @@@ struct kvm_x86_ops 
                                unsigned char *hypercall_addr);
        void (*inject_irq)(struct kvm_vcpu *vcpu, bool reinjected);
        void (*inject_nmi)(struct kvm_vcpu *vcpu);
-       void (*queue_exception)(struct kvm_vcpu *vcpu);
+       void (*inject_exception)(struct kvm_vcpu *vcpu);
        void (*cancel_injection)(struct kvm_vcpu *vcpu);
        int (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
        int (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
  
  struct kvm_x86_nested_ops {
        void (*leave_nested)(struct kvm_vcpu *vcpu);
+       bool (*is_exception_vmexit)(struct kvm_vcpu *vcpu, u8 vector,
+                                   u32 error_code);
        int (*check_events)(struct kvm_vcpu *vcpu);
-       bool (*handle_page_fault_workaround)(struct kvm_vcpu *vcpu,
-                                            struct x86_exception *fault);
-       bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
+       bool (*has_events)(struct kvm_vcpu *vcpu);
        void (*triple_fault)(struct kvm_vcpu *vcpu);
        int (*get_state)(struct kvm_vcpu *vcpu,
                         struct kvm_nested_state __user *user_kvm_nested_state,
@@@ -1863,7 -1869,7 +1870,7 @@@ void kvm_queue_exception_p(struct kvm_v
  void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
  void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
  void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
- bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+ void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
                                    struct x86_exception *fault);
  bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
  bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr);
diff --combined arch/x86/kvm/cpuid.c
index 2796dde06302a901a2dba41df1e7bb64d04992c9,ffdc28684cb7981f7395fae81e54dd5e361c727f..7065462378e2933d7c76711a54cc64c70140443a
@@@ -311,10 -311,20 +311,19 @@@ void kvm_update_cpuid_runtime(struct kv
  }
  EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime);
  
+ static bool kvm_cpuid_has_hyperv(struct kvm_cpuid_entry2 *entries, int nent)
+ {
+       struct kvm_cpuid_entry2 *entry;
+       entry = cpuid_entry2_find(entries, nent, HYPERV_CPUID_INTERFACE,
+                                 KVM_CPUID_INDEX_NOT_SIGNIFICANT);
+       return entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX;
+ }
  static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
        struct kvm_cpuid_entry2 *best;
 -      u64 guest_supported_xcr0;
  
        best = kvm_find_cpuid_entry(vcpu, 1);
        if (best && apic) {
                kvm_apic_set_version(vcpu);
        }
  
 -      guest_supported_xcr0 =
 +      vcpu->arch.guest_supported_xcr0 =
                cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
  
 -      vcpu->arch.guest_fpu.fpstate->user_xfeatures = guest_supported_xcr0;
 +      /*
 +       * FP+SSE can always be saved/restored via KVM_{G,S}ET_XSAVE, even if
 +       * XSAVE/XCR0 are not exposed to the guest, and even if XSAVE isn't
 +       * supported by the host.
 +       */
 +      vcpu->arch.guest_fpu.fpstate->user_xfeatures = vcpu->arch.guest_supported_xcr0 |
 +                                                     XFEATURE_MASK_FPSSE;
  
        kvm_update_pv_runtime(vcpu);
  
        vcpu->arch.cr4_guest_rsvd_bits =
            __cr4_reserved_bits(guest_cpuid_has, vcpu);
  
-       kvm_hv_set_cpuid(vcpu);
+       kvm_hv_set_cpuid(vcpu, kvm_cpuid_has_hyperv(vcpu->arch.cpuid_entries,
+                                                   vcpu->arch.cpuid_nent));
  
        /* Invoke the vendor callback only after the above state is updated. */
        static_call(kvm_x86_vcpu_after_set_cpuid)(vcpu);
@@@ -409,6 -414,12 +419,12 @@@ static int kvm_set_cpuid(struct kvm_vcp
                return 0;
        }
  
+       if (kvm_cpuid_has_hyperv(e2, nent)) {
+               r = kvm_hv_vcpu_init(vcpu);
+               if (r)
+                       return r;
+       }
        r = kvm_check_cpuid(vcpu, e2, nent);
        if (r)
                return r;
@@@ -902,6 -913,8 +918,6 @@@ static inline int __do_cpuid_func(struc
                        entry->edx = 0;
                }
                break;
 -      case 9:
 -              break;
        case 0xa: { /* Architectural Performance Monitoring */
                union cpuid10_eax eax;
                union cpuid10_edx edx;
diff --combined arch/x86/kvm/emulate.c
index aacb28c83e437d83712ed029c47f59ffc7c6cde7,b6180032dfd6c0f888655d1bfe77c33d4ed994cb..3b27622d46425b58c7285f11f60b346a550c8f84
@@@ -479,7 -479,7 +479,7 @@@ FOP_END
  
  /*
   * XXX: inoutclob user must know where the argument is being expanded.
 - *      Relying on CONFIG_CC_HAS_ASM_GOTO would allow us to remove _fault.
 + *      Using asm goto would allow us to remove _fault.
   */
  #define asm_safe(insn, inoutclob...) \
  ({ \
@@@ -1137,9 -1137,11 +1137,11 @@@ static int em_fnstsw(struct x86_emulate
  static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
                                    struct operand *op)
  {
-       unsigned reg = ctxt->modrm_reg;
+       unsigned int reg;
  
-       if (!(ctxt->d & ModRM))
+       if (ctxt->d & ModRM)
+               reg = ctxt->modrm_reg;
+       else
                reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3);
  
        if (ctxt->d & Sse) {
@@@ -1953,7 -1955,7 +1955,7 @@@ static int em_pop_sreg(struct x86_emula
        if (rc != X86EMUL_CONTINUE)
                return rc;
  
-       if (ctxt->modrm_reg == VCPU_SREG_SS)
+       if (seg == VCPU_SREG_SS)
                ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
        if (ctxt->op_bytes > 2)
                rsp_increment(ctxt, ctxt->op_bytes - 2);
@@@ -3645,13 -3647,10 +3647,10 @@@ static int em_wrmsr(struct x86_emulate_
                | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
        r = ctxt->ops->set_msr_with_filter(ctxt, msr_index, msr_data);
  
-       if (r == X86EMUL_IO_NEEDED)
-               return r;
-       if (r > 0)
+       if (r == X86EMUL_PROPAGATE_FAULT)
                return emulate_gp(ctxt, 0);
  
-       return r < 0 ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
+       return r;
  }
  
  static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
  
        r = ctxt->ops->get_msr_with_filter(ctxt, msr_index, &msr_data);
  
-       if (r == X86EMUL_IO_NEEDED)
-               return r;
-       if (r)
+       if (r == X86EMUL_PROPAGATE_FAULT)
                return emulate_gp(ctxt, 0);
  
-       *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data;
-       *reg_write(ctxt, VCPU_REGS_RDX) = msr_data >> 32;
-       return X86EMUL_CONTINUE;
+       if (r == X86EMUL_CONTINUE) {
+               *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data;
+               *reg_write(ctxt, VCPU_REGS_RDX) = msr_data >> 32;
+       }
+       return r;
  }
  
  static int em_store_sreg(struct x86_emulate_ctxt *ctxt, int segment)
@@@ -4132,9 -4130,6 +4130,9 @@@ static int em_xsetbv(struct x86_emulate
  {
        u32 eax, ecx, edx;
  
 +      if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSXSAVE))
 +              return emulate_ud(ctxt);
 +
        eax = reg_read(ctxt, VCPU_REGS_RAX);
        edx = reg_read(ctxt, VCPU_REGS_RDX);
        ecx = reg_read(ctxt, VCPU_REGS_RCX);
@@@ -4171,8 -4166,7 +4169,7 @@@ static int check_dr7_gd(struct x86_emul
  
        ctxt->ops->get_dr(ctxt, 7, &dr7);
  
-       /* Check if DR7.Global_Enable is set */
-       return dr7 & (1 << 13);
+       return dr7 & DR7_GD;
  }
  
  static int check_dr_read(struct x86_emulate_ctxt *ctxt)
diff --combined arch/x86/kvm/mmu/mmu.c
index 3552e6af3684437f66f1d6a864499095bf176c74,40feb5ec761e7e2ce0c7464699e1a25e6d4daf48..6f81539061d6485905e5a2e50a49293096f16035
@@@ -1596,8 -1596,6 +1596,8 @@@ static void __rmap_add(struct kvm *kvm
        rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
        rmap_count = pte_list_add(cache, spte, rmap_head);
  
 +      if (rmap_count > kvm->stat.max_mmu_rmap_size)
 +              kvm->stat.max_mmu_rmap_size = rmap_count;
        if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
                kvm_zap_all_rmap_sptes(kvm, rmap_head);
                kvm_flush_remote_tlbs_with_address(
@@@ -1667,6 -1665,18 +1667,18 @@@ static inline void kvm_mod_used_mmu_pag
        percpu_counter_add(&kvm_total_used_mmu_pages, nr);
  }
  
+ static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+ {
+       kvm_mod_used_mmu_pages(kvm, +1);
+       kvm_account_pgtable_pages((void *)sp->spt, +1);
+ }
+ static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+ {
+       kvm_mod_used_mmu_pages(kvm, -1);
+       kvm_account_pgtable_pages((void *)sp->spt, -1);
+ }
  static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
  {
        MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
@@@ -2124,7 -2134,7 +2136,7 @@@ static struct kvm_mmu_page *kvm_mmu_all
         */
        sp->mmu_valid_gen = kvm->arch.mmu_valid_gen;
        list_add(&sp->link, &kvm->arch.active_mmu_pages);
-       kvm_mod_used_mmu_pages(kvm, +1);
+       kvm_account_mmu_page(kvm, sp);
  
        sp->gfn = gfn;
        sp->role = role;
@@@ -2458,7 -2468,7 +2470,7 @@@ static bool __kvm_mmu_prepare_zap_page(
                        list_add(&sp->link, invalid_list);
                else
                        list_move(&sp->link, invalid_list);
-               kvm_mod_used_mmu_pages(kvm, -1);
+               kvm_unaccount_mmu_page(kvm, sp);
        } else {
                /*
                 * Remove the active root from the active page list, the root
@@@ -4292,7 -4302,7 +4304,7 @@@ int kvm_handle_page_fault(struct kvm_vc
  
        vcpu->arch.l1tf_flush_l1d = true;
        if (!flags) {
-               trace_kvm_page_fault(fault_address, error_code);
+               trace_kvm_page_fault(vcpu, fault_address, error_code);
  
                if (kvm_event_needs_reinjection(vcpu))
                        kvm_mmu_unprotect_page_virt(vcpu, fault_address);
@@@ -6704,10 -6714,12 +6716,12 @@@ int kvm_mmu_vendor_module_init(void
  
        ret = register_shrinker(&mmu_shrinker, "x86-mmu");
        if (ret)
-               goto out;
+               goto out_shrinker;
  
        return 0;
  
+ out_shrinker:
+       percpu_counter_destroy(&kvm_total_used_mmu_pages);
  out:
        mmu_destroy_caches();
        return ret;
diff --combined arch/x86/kvm/vmx/nested.c
index 7eaf96064cb0e1375a1b01547b3a75db4080b70a,8f67a9c4a28706360abd9a8c841ca7e0a132225c..0c62352dda6abc9bf72dfaaaa760cc5bb78bbcbf
@@@ -439,61 -439,22 +439,22 @@@ static bool nested_vmx_is_page_fault_vm
        return inequality ^ bit;
  }
  
- /*
-  * KVM wants to inject page-faults which it got to the guest. This function
-  * checks whether in a nested guest, we need to inject them to L1 or L2.
-  */
- static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
+ static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
+                                          u32 error_code)
  {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-       unsigned int nr = vcpu->arch.exception.nr;
-       bool has_payload = vcpu->arch.exception.has_payload;
-       unsigned long payload = vcpu->arch.exception.payload;
-       if (nr == PF_VECTOR) {
-               if (vcpu->arch.exception.nested_apf) {
-                       *exit_qual = vcpu->arch.apf.nested_apf_token;
-                       return 1;
-               }
-               if (nested_vmx_is_page_fault_vmexit(vmcs12,
-                                                   vcpu->arch.exception.error_code)) {
-                       *exit_qual = has_payload ? payload : vcpu->arch.cr2;
-                       return 1;
-               }
-       } else if (vmcs12->exception_bitmap & (1u << nr)) {
-               if (nr == DB_VECTOR) {
-                       if (!has_payload) {
-                               payload = vcpu->arch.dr6;
-                               payload &= ~DR6_BT;
-                               payload ^= DR6_ACTIVE_LOW;
-                       }
-                       *exit_qual = payload;
-               } else
-                       *exit_qual = 0;
-               return 1;
-       }
  
-       return 0;
- }
- static bool nested_vmx_handle_page_fault_workaround(struct kvm_vcpu *vcpu,
-                                                   struct x86_exception *fault)
- {
-       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-       WARN_ON(!is_guest_mode(vcpu));
+       /*
+        * Drop bits 31:16 of the error code when performing the #PF mask+match
+        * check.  All VMCS fields involved are 32 bits, but Intel CPUs never
+        * set bits 31:16 and VMX disallows setting bits 31:16 in the injected
+        * error code.  Including the to-be-dropped bits in the check might
+        * result in an "impossible" or missed exit from L1's perspective.
+        */
+       if (vector == PF_VECTOR)
+               return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code);
  
-       if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
-           !WARN_ON_ONCE(to_vmx(vcpu)->nested.nested_run_pending)) {
-               vmcs12->vm_exit_intr_error_code = fault->error_code;
-               nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
-                                 PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
-                                 INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
-                                 fault->address);
-               return true;
-       }
-       return false;
+       return (vmcs12->exception_bitmap & (1u << vector));
  }
  
  static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
@@@ -1607,6 -1568,10 +1568,10 @@@ static void copy_enlightened_to_vmcs12(
                vmcs12->guest_rflags = evmcs->guest_rflags;
                vmcs12->guest_interruptibility_info =
                        evmcs->guest_interruptibility_info;
+               /*
+                * Not present in struct vmcs12:
+                * vmcs12->guest_ssp = evmcs->guest_ssp;
+                */
        }
  
        if (unlikely(!(hv_clean_fields &
                vmcs12->host_fs_selector = evmcs->host_fs_selector;
                vmcs12->host_gs_selector = evmcs->host_gs_selector;
                vmcs12->host_tr_selector = evmcs->host_tr_selector;
+               vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl;
+               /*
+                * Not present in struct vmcs12:
+                * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet;
+                * vmcs12->host_ssp = evmcs->host_ssp;
+                * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr;
+                */
        }
  
        if (unlikely(!(hv_clean_fields &
                vmcs12->tsc_offset = evmcs->tsc_offset;
                vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
                vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
+               vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap;
+               vmcs12->tsc_multiplier = evmcs->tsc_multiplier;
        }
  
        if (unlikely(!(hv_clean_fields &
                vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
                vmcs12->guest_activity_state = evmcs->guest_activity_state;
                vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
+               vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl;
+               /*
+                * Not present in struct vmcs12:
+                * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet;
+                * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl;
+                * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr;
+                */
        }
  
        /*
@@@ -1869,12 -1850,23 +1850,23 @@@ static void copy_vmcs12_to_enlightened(
         * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
         * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
         * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
+        * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl;
+        * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl;
+        * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap;
+        * evmcs->tsc_multiplier = vmcs12->tsc_multiplier;
         *
         * Not present in struct vmcs12:
         * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
         * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
         * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
         * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
+        * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet;
+        * evmcs->host_ssp = vmcs12->host_ssp;
+        * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr;
+        * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet;
+        * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl;
+        * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr;
+        * evmcs->guest_ssp = vmcs12->guest_ssp;
         */
  
        evmcs->guest_es_selector = vmcs12->guest_es_selector;
@@@ -1982,7 -1974,7 +1974,7 @@@ static enum nested_evmptrld_status nest
        bool evmcs_gpa_changed = false;
        u64 evmcs_gpa;
  
-       if (likely(!vmx->nested.enlightened_vmcs_enabled))
+       if (likely(!guest_cpuid_has_evmcs(vcpu)))
                return EVMPTRLD_DISABLED;
  
        if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) {
@@@ -2328,9 -2320,14 +2320,14 @@@ static void prepare_vmcs02_early(struc
         * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
         * on the related bits (if supported by the CPU) in the hope that
         * we can avoid VMWrites during vmx_set_efer().
+        *
+        * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is
+        * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to
+        * do the same for L2.
         */
        exec_control = __vm_entry_controls_get(vmcs01);
-       exec_control |= vmcs12->vm_entry_controls;
+       exec_control |= (vmcs12->vm_entry_controls &
+                        ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
        exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
        if (cpu_has_load_ia32_efer()) {
                if (guest_efer & EFER_LMA)
@@@ -2570,7 -2567,7 +2567,7 @@@ static int prepare_vmcs02(struct kvm_vc
         * bits which we consider mandatory enabled.
         * The CR0_READ_SHADOW is what L2 should have expected to read given
         * the specifications by L1; It's not enough to take
 -       * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we
 +       * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we
         * have more bits than L1 expected.
         */
        vmx_set_cr0(vcpu, vmcs12->guest_cr0);
@@@ -2863,7 -2860,7 +2860,7 @@@ static int nested_vmx_check_controls(st
            nested_check_vm_entry_controls(vcpu, vmcs12))
                return -EINVAL;
  
-       if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled)
+       if (guest_cpuid_has_evmcs(vcpu))
                return nested_evmcs_check_controls(vmcs12);
  
        return 0;
@@@ -3145,7 -3142,7 +3142,7 @@@ static bool nested_get_evmcs_page(struc
         * L2 was running), map it here to make sure vmcs12 changes are
         * properly reflected.
         */
-       if (vmx->nested.enlightened_vmcs_enabled &&
+       if (guest_cpuid_has_evmcs(vcpu) &&
            vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
                enum nested_evmptrld_status evmptrld_status =
                        nested_vmx_handle_enlightened_vmptrld(vcpu, false);
@@@ -3364,12 -3361,24 +3361,24 @@@ enum nvmx_vmentry_status nested_vmx_ent
        };
        u32 failed_index;
  
+       trace_kvm_nested_vmenter(kvm_rip_read(vcpu),
+                                vmx->nested.current_vmptr,
+                                vmcs12->guest_rip,
+                                vmcs12->guest_intr_status,
+                                vmcs12->vm_entry_intr_info_field,
+                                vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT,
+                                vmcs12->ept_pointer,
+                                vmcs12->guest_cr3,
+                                KVM_ISA_VMX);
        kvm_service_local_tlb_flush_requests(vcpu);
  
        evaluate_pending_interrupts = exec_controls_get(vmx) &
                (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
        if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
                evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
+       if (!evaluate_pending_interrupts)
+               evaluate_pending_interrupts |= kvm_apic_has_pending_init_or_sipi(vcpu);
  
        if (!vmx->nested.nested_run_pending ||
            !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
        }
  
        /*
-        * If L1 had a pending IRQ/NMI until it executed
-        * VMLAUNCH/VMRESUME which wasn't delivered because it was
-        * disallowed (e.g. interrupts disabled), L0 needs to
-        * evaluate if this pending event should cause an exit from L2
-        * to L1 or delivered directly to L2 (e.g. In case L1 don't
-        * intercept EXTERNAL_INTERRUPT).
-        *
-        * Usually this would be handled by the processor noticing an
-        * IRQ/NMI window request, or checking RVI during evaluation of
-        * pending virtual interrupts.  However, this setting was done
-        * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
-        * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
+        * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI
+        * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can
+        * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit
+        * unconditionally.
         */
        if (unlikely(evaluate_pending_interrupts))
                kvm_make_request(KVM_REQ_EVENT, vcpu);
@@@ -3718,7 -3719,7 +3719,7 @@@ static void vmcs12_save_pending_event(s
             is_double_fault(exit_intr_info))) {
                vmcs12->idt_vectoring_info_field = 0;
        } else if (vcpu->arch.exception.injected) {
-               nr = vcpu->arch.exception.nr;
+               nr = vcpu->arch.exception.vector;
                idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
  
                if (kvm_exception_is_soft(nr)) {
@@@ -3819,19 -3820,40 +3820,40 @@@ mmio_needed
        return -ENXIO;
  }
  
- static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
-                                              unsigned long exit_qual)
+ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu)
  {
+       struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
+       u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-       unsigned int nr = vcpu->arch.exception.nr;
-       u32 intr_info = nr | INTR_INFO_VALID_MASK;
+       unsigned long exit_qual;
+       if (ex->has_payload) {
+               exit_qual = ex->payload;
+       } else if (ex->vector == PF_VECTOR) {
+               exit_qual = vcpu->arch.cr2;
+       } else if (ex->vector == DB_VECTOR) {
+               exit_qual = vcpu->arch.dr6;
+               exit_qual &= ~DR6_BT;
+               exit_qual ^= DR6_ACTIVE_LOW;
+       } else {
+               exit_qual = 0;
+       }
  
-       if (vcpu->arch.exception.has_error_code) {
-               vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
+       if (ex->has_error_code) {
+               /*
+                * Intel CPUs do not generate error codes with bits 31:16 set,
+                * and more importantly VMX disallows setting bits 31:16 in the
+                * injected error code for VM-Entry.  Drop the bits to mimic
+                * hardware and avoid inducing failure on nested VM-Entry if L1
+                * chooses to inject the exception back to L2.  AMD CPUs _do_
+                * generate "full" 32-bit error codes, so KVM allows userspace
+                * to inject exception error codes with bits 31:16 set.
+                */
+               vmcs12->vm_exit_intr_error_code = (u16)ex->error_code;
                intr_info |= INTR_INFO_DELIVER_CODE_MASK;
        }
  
-       if (kvm_exception_is_soft(nr))
+       if (kvm_exception_is_soft(ex->vector))
                intr_info |= INTR_TYPE_SOFT_EXCEPTION;
        else
                intr_info |= INTR_TYPE_HARD_EXCEPTION;
  }
  
  /*
-  * Returns true if a debug trap is pending delivery.
+  * Returns true if a debug trap is (likely) pending delivery.  Infer the class
+  * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6).
+  * Using the payload is flawed because code breakpoints (fault-like) and data
+  * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e.
+  * this will return false positives if a to-be-injected code breakpoint #DB is
+  * pending (from KVM's perspective, but not "pending" across an instruction
+  * boundary).  ICEBP, a.k.a. INT1, is also not reflected here even though it
+  * too is trap-like.
   *
-  * In KVM, debug traps bear an exception payload. As such, the class of a #DB
-  * exception may be inferred from the presence of an exception payload.
+  * KVM "works" despite these flaws as ICEBP isn't currently supported by the
+  * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the
+  * #DB has already happened), and MTF isn't marked pending on code breakpoints
+  * from the emulator (because such #DBs are fault-like and thus don't trigger
+  * actions that fire on instruction retire).
+  */
+ static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex)
+ {
+       if (!ex->pending || ex->vector != DB_VECTOR)
+               return 0;
+       /* General Detect #DBs are always fault-like. */
+       return ex->payload & ~DR6_BD;
+ }
+ /*
+  * Returns true if there's a pending #DB exception that is lower priority than
+  * a pending Monitor Trap Flag VM-Exit.  TSS T-flag #DBs are not emulated by
+  * KVM, but could theoretically be injected by userspace.  Note, this code is
+  * imperfect, see above.
   */
- static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu)
+ static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex)
  {
-       return vcpu->arch.exception.pending &&
-                       vcpu->arch.exception.nr == DB_VECTOR &&
-                       vcpu->arch.exception.payload;
+       return vmx_get_pending_dbg_trap(ex) & ~DR6_BT;
  }
  
  /*
   */
  static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
  {
-       if (vmx_pending_dbg_trap(vcpu))
-               vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
-                           vcpu->arch.exception.payload);
+       unsigned long pending_dbg;
+       pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception);
+       if (pending_dbg)
+               vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg);
  }
  
  static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
               to_vmx(vcpu)->nested.preemption_timer_expired;
  }
  
+ static bool vmx_has_nested_events(struct kvm_vcpu *vcpu)
+ {
+       return nested_vmx_preemption_timer_pending(vcpu) ||
+              to_vmx(vcpu)->nested.mtf_pending;
+ }
+ /*
+  * Per the Intel SDM's table "Priority Among Concurrent Events", with minor
+  * edits to fill in missing examples, e.g. #DB due to split-lock accesses,
+  * and less minor edits to splice in the priority of VMX Non-Root specific
+  * events, e.g. MTF and NMI/INTR-window exiting.
+  *
+  * 1 Hardware Reset and Machine Checks
+  *    - RESET
+  *    - Machine Check
+  *
+  * 2 Trap on Task Switch
+  *    - T flag in TSS is set (on task switch)
+  *
+  * 3 External Hardware Interventions
+  *    - FLUSH
+  *    - STOPCLK
+  *    - SMI
+  *    - INIT
+  *
+  * 3.5 Monitor Trap Flag (MTF) VM-exit[1]
+  *
+  * 4 Traps on Previous Instruction
+  *    - Breakpoints
+  *    - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O
+  *      breakpoint, or #DB due to a split-lock access)
+  *
+  * 4.3        VMX-preemption timer expired VM-exit
+  *
+  * 4.6        NMI-window exiting VM-exit[2]
+  *
+  * 5 Nonmaskable Interrupts (NMI)
+  *
+  * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery
+  *
+  * 6 Maskable Hardware Interrupts
+  *
+  * 7 Code Breakpoint Fault
+  *
+  * 8 Faults from Fetching Next Instruction
+  *    - Code-Segment Limit Violation
+  *    - Code Page Fault
+  *    - Control protection exception (missing ENDBRANCH at target of indirect
+  *                                    call or jump)
+  *
+  * 9 Faults from Decoding Next Instruction
+  *    - Instruction length > 15 bytes
+  *    - Invalid Opcode
+  *    - Coprocessor Not Available
+  *
+  *10 Faults on Executing Instruction
+  *    - Overflow
+  *    - Bound error
+  *    - Invalid TSS
+  *    - Segment Not Present
+  *    - Stack fault
+  *    - General Protection
+  *    - Data Page Fault
+  *    - Alignment Check
+  *    - x86 FPU Floating-point exception
+  *    - SIMD floating-point exception
+  *    - Virtualization exception
+  *    - Control protection exception
+  *
+  * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs),
+  *     INIT signals, and higher priority events take priority over MTF VM exits.
+  *     MTF VM exits take priority over debug-trap exceptions and lower priority
+  *     events.
+  *
+  * [2] Debug-trap exceptions and higher priority events take priority over VM exits
+  *     caused by the VMX-preemption timer.  VM exits caused by the VMX-preemption
+  *     timer take priority over VM exits caused by the "NMI-window exiting"
+  *     VM-execution control and lower priority events.
+  *
+  * [3] Debug-trap exceptions and higher priority events take priority over VM exits
+  *     caused by "NMI-window exiting".  VM exits caused by this control take
+  *     priority over non-maskable interrupts (NMIs) and lower priority events.
+  *
+  * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to
+  *     the 1-setting of the "interrupt-window exiting" VM-execution control.  Thus,
+  *     non-maskable interrupts (NMIs) and higher priority events take priority over
+  *     delivery of a virtual interrupt; delivery of a virtual interrupt takes
+  *     priority over external interrupts and lower priority events.
+  */
  static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
  {
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       unsigned long exit_qual;
-       bool block_nested_events =
-           vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
-       bool mtf_pending = vmx->nested.mtf_pending;
        struct kvm_lapic *apic = vcpu->arch.apic;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
        /*
-        * Clear the MTF state. If a higher priority VM-exit is delivered first,
-        * this state is discarded.
+        * Only a pending nested run blocks a pending exception.  If there is a
+        * previously injected event, the pending exception occurred while said
+        * event was being delivered and thus needs to be handled.
         */
-       if (!block_nested_events)
-               vmx->nested.mtf_pending = false;
+       bool block_nested_exceptions = vmx->nested.nested_run_pending;
+       /*
+        * New events (not exceptions) are only recognized at instruction
+        * boundaries.  If an event needs reinjection, then KVM is handling a
+        * VM-Exit that occurred _during_ instruction execution; new events are
+        * blocked until the instruction completes.
+        */
+       bool block_nested_events = block_nested_exceptions ||
+                                  kvm_event_needs_reinjection(vcpu);
  
        if (lapic_in_kernel(vcpu) &&
                test_bit(KVM_APIC_INIT, &apic->pending_events)) {
                clear_bit(KVM_APIC_INIT, &apic->pending_events);
                if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
                        nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
+               /* MTF is discarded if the vCPU is in WFS. */
+               vmx->nested.mtf_pending = false;
                return 0;
        }
  
                        return -EBUSY;
  
                clear_bit(KVM_APIC_SIPI, &apic->pending_events);
-               if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
+               if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
                        nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0,
                                                apic->sipi_vector & 0xFFUL);
-               return 0;
+                       return 0;
+               }
+               /* Fallthrough, the SIPI is completely ignored. */
        }
  
        /*
-        * Process any exceptions that are not debug traps before MTF.
+        * Process exceptions that are higher priority than Monitor Trap Flag:
+        * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but
+        * could theoretically come in from userspace), and ICEBP (INT1).
         *
-        * Note that only a pending nested run can block a pending exception.
-        * Otherwise an injected NMI/interrupt should either be
-        * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO,
-        * while delivering the pending exception.
+        * TODO: SMIs have higher priority than MTF and trap-like #DBs (except
+        * for TSS T flag #DBs).  KVM also doesn't save/restore pending MTF
+        * across SMI/RSM as it should; that needs to be addressed in order to
+        * prioritize SMI over MTF and trap-like #DBs.
         */
-       if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) {
-               if (vmx->nested.nested_run_pending)
+       if (vcpu->arch.exception_vmexit.pending &&
+           !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) {
+               if (block_nested_exceptions)
                        return -EBUSY;
-               if (!nested_vmx_check_exception(vcpu, &exit_qual))
-                       goto no_vmexit;
-               nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+               nested_vmx_inject_exception_vmexit(vcpu);
                return 0;
        }
  
-       if (mtf_pending) {
+       if (vcpu->arch.exception.pending &&
+           !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) {
+               if (block_nested_exceptions)
+                       return -EBUSY;
+               goto no_vmexit;
+       }
+       if (vmx->nested.mtf_pending) {
                if (block_nested_events)
                        return -EBUSY;
                nested_vmx_update_pending_dbg(vcpu);
                return 0;
        }
  
-       if (vcpu->arch.exception.pending) {
-               if (vmx->nested.nested_run_pending)
+       if (vcpu->arch.exception_vmexit.pending) {
+               if (block_nested_exceptions)
                        return -EBUSY;
-               if (!nested_vmx_check_exception(vcpu, &exit_qual))
-                       goto no_vmexit;
-               nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+               nested_vmx_inject_exception_vmexit(vcpu);
                return 0;
        }
  
+       if (vcpu->arch.exception.pending) {
+               if (block_nested_exceptions)
+                       return -EBUSY;
+               goto no_vmexit;
+       }
        if (nested_vmx_preemption_timer_pending(vcpu)) {
                if (block_nested_events)
                        return -EBUSY;
@@@ -4255,14 -4412,6 +4412,6 @@@ static void prepare_vmcs12(struct kvm_v
                        nested_vmx_abort(vcpu,
                                         VMX_ABORT_SAVE_GUEST_MSR_FAIL);
        }
-       /*
-        * Drop what we picked up for L2 via vmx_complete_interrupts. It is
-        * preserved above and would only end up incorrectly in L1.
-        */
-       vcpu->arch.nmi_injected = false;
-       kvm_clear_exception_queue(vcpu);
-       kvm_clear_interrupt_queue(vcpu);
  }
  
  /*
@@@ -4538,6 -4687,9 +4687,9 @@@ void nested_vmx_vmexit(struct kvm_vcpu 
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
  
+       /* Pending MTF traps are discarded on VM-Exit. */
+       vmx->nested.mtf_pending = false;
        /* trying to cancel vmlaunch/vmresume is a bug */
        WARN_ON_ONCE(vmx->nested.nested_run_pending);
  
                WARN_ON_ONCE(nested_early_check);
        }
  
+       /*
+        * Drop events/exceptions that were queued for re-injection to L2
+        * (picked up via vmx_complete_interrupts()), as well as exceptions
+        * that were pending for L2.  Note, this must NOT be hoisted above
+        * prepare_vmcs12(), events/exceptions queued for re-injection need to
+        * be captured in vmcs12 (see vmcs12_save_pending_event()).
+        */
+       vcpu->arch.nmi_injected = false;
+       kvm_clear_exception_queue(vcpu);
+       kvm_clear_interrupt_queue(vcpu);
        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
  
        /* Update any VMCS fields that might have changed while L2 ran */
@@@ -5030,8 -5193,8 +5193,8 @@@ static int handle_vmxoff(struct kvm_vcp
  
        free_nested(vcpu);
  
-       /* Process a latched INIT during time CPU was in VMX operation */
-       kvm_make_request(KVM_REQ_EVENT, vcpu);
+       if (kvm_apic_has_pending_init_or_sipi(vcpu))
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
  
        return nested_vmx_succeed(vcpu);
  }
@@@ -5067,7 -5230,7 +5230,7 @@@ static int handle_vmclear(struct kvm_vc
         * state. It is possible that the area will stay mapped as
         * vmx->nested.hv_evmcs but this shouldn't be a problem.
         */
-       if (likely(!vmx->nested.enlightened_vmcs_enabled ||
+       if (likely(!guest_cpuid_has_evmcs(vcpu) ||
                   !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
                if (vmptr == vmx->nested.current_vmptr)
                        nested_release_vmcs12(vcpu);
@@@ -6463,6 -6626,9 +6626,9 @@@ static int vmx_set_nested_state(struct 
        if (ret)
                goto error_guest_mode;
  
+       if (vmx->nested.mtf_pending)
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
        return 0;
  
  error_guest_mode:
@@@ -6522,8 -6688,10 +6688,10 @@@ static u64 nested_vmx_calc_vmcs_enum_ms
   * bit in the high half is on if the corresponding bit in the control field
   * may be on. See also vmx_control_verify().
   */
- void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
+ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
  {
+       struct nested_vmx_msrs *msrs = &vmcs_conf->nested;
        /*
         * Note that as a general rule, the high half of the MSRs (bits in
         * the control fields which may be 1) should be initialized by the
         */
  
        /* pin-based controls */
-       rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
-               msrs->pinbased_ctls_low,
-               msrs->pinbased_ctls_high);
-       msrs->pinbased_ctls_low |=
+       msrs->pinbased_ctls_low =
                PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+       msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl;
        msrs->pinbased_ctls_high &=
                PIN_BASED_EXT_INTR_MASK |
                PIN_BASED_NMI_EXITING |
                PIN_BASED_VMX_PREEMPTION_TIMER;
  
        /* exit controls */
-       rdmsr(MSR_IA32_VMX_EXIT_CTLS,
-               msrs->exit_ctls_low,
-               msrs->exit_ctls_high);
        msrs->exit_ctls_low =
                VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
  
+       msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl;
        msrs->exit_ctls_high &=
  #ifdef CONFIG_X86_64
                VM_EXIT_HOST_ADDR_SPACE_SIZE |
  #endif
                VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
-               VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+               VM_EXIT_CLEAR_BNDCFGS;
        msrs->exit_ctls_high |=
                VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
                VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
-               VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
+               VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT |
+               VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
  
        /* We support free control of debug control saving. */
        msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
  
        /* entry controls */
-       rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
-               msrs->entry_ctls_low,
-               msrs->entry_ctls_high);
        msrs->entry_ctls_low =
                VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
+       msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl;
        msrs->entry_ctls_high &=
  #ifdef CONFIG_X86_64
                VM_ENTRY_IA32E_MODE |
  #endif
-               VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS |
-               VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+               VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
        msrs->entry_ctls_high |=
-               (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
+               (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER |
+                VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
  
        /* We support free control of debug control loading. */
        msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
  
        /* cpu-based controls */
-       rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
-               msrs->procbased_ctls_low,
-               msrs->procbased_ctls_high);
        msrs->procbased_ctls_low =
                CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+       msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl;
        msrs->procbased_ctls_high &=
                CPU_BASED_INTR_WINDOW_EXITING |
                CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
         * depend on CPUID bits, they are added later by
         * vmx_vcpu_after_set_cpuid.
         */
-       if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
-               rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
-                     msrs->secondary_ctls_low,
-                     msrs->secondary_ctls_high);
        msrs->secondary_ctls_low = 0;
+       msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
        msrs->secondary_ctls_high &=
                SECONDARY_EXEC_DESC |
                SECONDARY_EXEC_ENABLE_RDTSCP |
                msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
  
        /* miscellaneous data */
-       rdmsr(MSR_IA32_VMX_MISC,
-               msrs->misc_low,
-               msrs->misc_high);
-       msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
+       msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
        msrs->misc_low |=
                MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
                VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
@@@ -6814,9 -6972,9 +6972,9 @@@ __init int nested_vmx_hardware_setup(in
  
  struct kvm_x86_nested_ops vmx_nested_ops = {
        .leave_nested = vmx_leave_nested,
+       .is_exception_vmexit = nested_vmx_is_exception_vmexit,
        .check_events = vmx_check_nested_events,
-       .handle_page_fault_workaround = nested_vmx_handle_page_fault_workaround,
-       .hv_timer_pending = nested_vmx_preemption_timer_pending,
+       .has_events = vmx_has_nested_events,
        .triple_fault = nested_vmx_triple_fault,
        .get_state = vmx_get_nested_state,
        .set_state = vmx_set_nested_state,
diff --combined arch/x86/kvm/x86.c
index b0c47b41c264982c993a098e738fa2ef8f9e6add,eb9d2c23fb04ecdd94728f4541c18655c6e3579c..4bd5f8a751de91ffeb666e1be9c5db8ae3b65f36
@@@ -173,8 -173,13 +173,13 @@@ bool __read_mostly enable_vmware_backdo
  module_param(enable_vmware_backdoor, bool, S_IRUGO);
  EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
  
- static bool __read_mostly force_emulation_prefix = false;
- module_param(force_emulation_prefix, bool, S_IRUGO);
+ /*
+  * Flags to manipulate forced emulation behavior (any non-zero value will
+  * enable forced emulation).
+  */
+ #define KVM_FEP_CLEAR_RFLAGS_RF       BIT(1)
+ static int __read_mostly force_emulation_prefix;
+ module_param(force_emulation_prefix, int, 0644);
  
  int __read_mostly pi_inject_timer = -1;
  module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
@@@ -528,6 -533,7 +533,7 @@@ static int exception_class(int vector
  #define EXCPT_TRAP            1
  #define EXCPT_ABORT           2
  #define EXCPT_INTERRUPT               3
+ #define EXCPT_DB              4
  
  static int exception_type(int vector)
  {
  
        mask = 1 << vector;
  
-       /* #DB is trap, as instruction watchpoints are handled elsewhere */
-       if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
+       /*
+        * #DBs can be trap-like or fault-like, the caller must check other CPU
+        * state, e.g. DR6, to determine whether a #DB is a trap or fault.
+        */
+       if (mask & (1 << DB_VECTOR))
+               return EXCPT_DB;
+       if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR)))
                return EXCPT_TRAP;
  
        if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
        return EXCPT_FAULT;
  }
  
- void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
+ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
+                                  struct kvm_queued_exception *ex)
  {
-       unsigned nr = vcpu->arch.exception.nr;
-       bool has_payload = vcpu->arch.exception.has_payload;
-       unsigned long payload = vcpu->arch.exception.payload;
-       if (!has_payload)
+       if (!ex->has_payload)
                return;
  
-       switch (nr) {
+       switch (ex->vector) {
        case DB_VECTOR:
                /*
                 * "Certain debug exceptions may clear bit 0-3.  The
                 * So they need to be flipped for DR6.
                 */
                vcpu->arch.dr6 |= DR6_ACTIVE_LOW;
-               vcpu->arch.dr6 |= payload;
-               vcpu->arch.dr6 ^= payload & DR6_ACTIVE_LOW;
+               vcpu->arch.dr6 |= ex->payload;
+               vcpu->arch.dr6 ^= ex->payload & DR6_ACTIVE_LOW;
  
                /*
                 * The #DB payload is defined as compatible with the 'pending
                vcpu->arch.dr6 &= ~BIT(12);
                break;
        case PF_VECTOR:
-               vcpu->arch.cr2 = payload;
+               vcpu->arch.cr2 = ex->payload;
                break;
        }
  
-       vcpu->arch.exception.has_payload = false;
-       vcpu->arch.exception.payload = 0;
+       ex->has_payload = false;
+       ex->payload = 0;
  }
  EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
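As a worked example of the #DB payload merge above, the standalone sketch below applies the same bit operations to a bare dr6 value. DR6_ACTIVE_LOW, the RTM bit, and the bit-12 clearing are assumptions mirroring the x86 layout (0xffff0ff0, bit 16, and the undefined bit 12 respectively), not values taken from this patch.

#include <stdint.h>
#include <stdio.h>

/* Assumed constants mirroring the x86 DR6 layout. */
#define DR6_ACTIVE_LOW	0xffff0ff0ULL
#define DR6_RTM		(1ULL << 16)

/* Same merge as the DB_VECTOR case of kvm_deliver_exception_payload(). */
static uint64_t merge_db_payload(uint64_t dr6, uint64_t payload)
{
	dr6 |= DR6_ACTIVE_LOW;		 /* start with all active-low bits set */
	dr6 |= payload;			 /* set the positive-polarity bits */
	dr6 ^= payload & DR6_ACTIVE_LOW; /* flip the active-low bits back */
	dr6 &= ~(1ULL << 12);		 /* bit 12 is not defined in DR6 */
	return dr6;
}

int main(void)
{
	/* B1 hit: payload 0x2 yields DR6 = 0xffff0ff2 (RTM=1, i.e. not in RTM). */
	printf("%#llx\n", (unsigned long long)merge_db_payload(0, 0x2));
	/* B1 hit inside an RTM region: RTM is flipped to 0 -> 0xfffe0ff2. */
	printf("%#llx\n", (unsigned long long)merge_db_payload(0, 0x2 | DR6_RTM));
	return 0;
}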
  
+ static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vector,
+                                      bool has_error_code, u32 error_code,
+                                      bool has_payload, unsigned long payload)
+ {
+       struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
+       ex->vector = vector;
+       ex->injected = false;
+       ex->pending = true;
+       ex->has_error_code = has_error_code;
+       ex->error_code = error_code;
+       ex->has_payload = has_payload;
+       ex->payload = payload;
+ }
  static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
                unsigned nr, bool has_error, u32 error_code,
                bool has_payload, unsigned long payload, bool reinject)
  
        kvm_make_request(KVM_REQ_EVENT, vcpu);
  
+       /*
+        * If the exception is destined for L2 and isn't being reinjected,
+        * morph it to a VM-Exit if L1 wants to intercept the exception.  A
+        * previously injected exception is not checked because it was checked
+        * when it was originally queued, and re-checking is incorrect if _L1_
+        * injected the exception, in which case it's exempt from interception.
+        */
+       if (!reinject && is_guest_mode(vcpu) &&
+           kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) {
+               kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code,
+                                          has_payload, payload);
+               return;
+       }
        if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
        queue:
                if (reinject) {
                        /*
-                        * On vmentry, vcpu->arch.exception.pending is only
-                        * true if an event injection was blocked by
-                        * nested_run_pending.  In that case, however,
-                        * vcpu_enter_guest requests an immediate exit,
-                        * and the guest shouldn't proceed far enough to
-                        * need reinjection.
+                        * On VM-Entry, an exception can be pending if and only
+                        * if event injection was blocked by nested_run_pending.
+                        * In that case, however, vcpu_enter_guest() requests an
+                        * immediate exit, and the guest shouldn't proceed far
+                        * enough to need reinjection.
                         */
-                       WARN_ON_ONCE(vcpu->arch.exception.pending);
+                       WARN_ON_ONCE(kvm_is_exception_pending(vcpu));
                        vcpu->arch.exception.injected = true;
                        if (WARN_ON_ONCE(has_payload)) {
                                /*
                        vcpu->arch.exception.injected = false;
                }
                vcpu->arch.exception.has_error_code = has_error;
-               vcpu->arch.exception.nr = nr;
+               vcpu->arch.exception.vector = nr;
                vcpu->arch.exception.error_code = error_code;
                vcpu->arch.exception.has_payload = has_payload;
                vcpu->arch.exception.payload = payload;
                if (!is_guest_mode(vcpu))
-                       kvm_deliver_exception_payload(vcpu);
+                       kvm_deliver_exception_payload(vcpu,
+                                                     &vcpu->arch.exception);
                return;
        }
  
        /* to check exception */
-       prev_nr = vcpu->arch.exception.nr;
+       prev_nr = vcpu->arch.exception.vector;
        if (prev_nr == DF_VECTOR) {
                /* triple fault -> shutdown */
                kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
        }
        class1 = exception_class(prev_nr);
        class2 = exception_class(nr);
-       if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
-               || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
+       if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) ||
+           (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
                /*
-                * Generate double fault per SDM Table 5-5.  Set
-                * exception.pending = true so that the double fault
-                * can trigger a nested vmexit.
+                * Synthesize #DF.  Clear the previously injected or pending
+                * exception so as not to incorrectly trigger shutdown.
                 */
-               vcpu->arch.exception.pending = true;
                vcpu->arch.exception.injected = false;
-               vcpu->arch.exception.has_error_code = true;
-               vcpu->arch.exception.nr = DF_VECTOR;
-               vcpu->arch.exception.error_code = 0;
-               vcpu->arch.exception.has_payload = false;
-               vcpu->arch.exception.payload = 0;
-       } else
+               vcpu->arch.exception.pending = false;
+               kvm_queue_exception_e(vcpu, DF_VECTOR, 0);
+       } else {
                /* Replace the previous exception with a new one in the hope
                   that instruction re-execution will regenerate the lost
                   exception. */
                goto queue;
+       }
  }
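A compact way to read the benign/contributory/page-fault combination above is as a predicate over the two vectors. The sketch below mirrors that logic with local, illustrative definitions; only the vector numbers and the class rules follow the SDM, everything else is made up for the example (a pending #DF is handled separately above, as a triple fault).

#include <stdbool.h>

/* Illustrative vector numbers for the exceptions classified above. */
enum { DE = 0, TS = 10, NP = 11, SS = 12, GP = 13, PF = 14 };

enum exc_class { BENIGN, CONTRIBUTORY, PAGE_FAULT };

static enum exc_class classify(int vector)
{
	switch (vector) {
	case DE: case TS: case NP: case SS: case GP:
		return CONTRIBUTORY;
	case PF:
		return PAGE_FAULT;
	default:
		return BENIGN;
	}
}

/* True if hitting 'second' while delivering 'first' should synthesize #DF. */
static bool is_double_fault(int first, int second)
{
	enum exc_class c1 = classify(first), c2 = classify(second);

	return (c1 == CONTRIBUTORY && c2 == CONTRIBUTORY) ||
	       (c1 == PAGE_FAULT && c2 != BENIGN);
}

For instance, is_double_fault(PF, GP) is true (a #GP hit while delivering a #PF becomes #DF), whereas is_double_fault(GP, PF) is false, so that pair instead takes the replace-and-requeue path above.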
  
  void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
@@@ -729,20 -764,22 +764,22 @@@ static int complete_emulated_insn_gp(st
  void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
  {
        ++vcpu->stat.pf_guest;
-       vcpu->arch.exception.nested_apf =
-               is_guest_mode(vcpu) && fault->async_page_fault;
-       if (vcpu->arch.exception.nested_apf) {
-               vcpu->arch.apf.nested_apf_token = fault->address;
-               kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
-       } else {
+       /*
+        * Async #PF in L2 is always forwarded to L1 as a VM-Exit regardless of
+        * whether or not L1 wants to intercept "regular" #PF.
+        */
+       if (is_guest_mode(vcpu) && fault->async_page_fault)
+               kvm_queue_exception_vmexit(vcpu, PF_VECTOR,
+                                          true, fault->error_code,
+                                          true, fault->address);
+       else
                kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
                                        fault->address);
-       }
  }
  EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
  
- /* Returns true if the page fault was immediately morphed into a VM-Exit. */
- bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+ void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
                                    struct x86_exception *fault)
  {
        struct kvm_mmu *fault_mmu;
                kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
                                       fault_mmu->root.hpa);
  
-       /*
-        * A workaround for KVM's bad exception handling.  If KVM injected an
-        * exception into L2, and L2 encountered a #PF while vectoring the
-        * injected exception, manually check to see if L1 wants to intercept
-        * #PF, otherwise queuing the #PF will lead to #DF or a lost exception.
-        * In all other cases, defer the check to nested_ops->check_events(),
-        * which will correctly handle priority (this does not).  Note, other
-        * exceptions, e.g. #GP, are theoretically affected, #PF is simply the
-        * most problematic, e.g. when L0 and L1 are both intercepting #PF for
-        * shadow paging.
-        *
-        * TODO: Rewrite exception handling to track injected and pending
-        *       (VM-Exit) exceptions separately.
-        */
-       if (unlikely(vcpu->arch.exception.injected && is_guest_mode(vcpu)) &&
-           kvm_x86_ops.nested_ops->handle_page_fault_workaround(vcpu, fault))
-               return true;
        fault_mmu->inject_page_fault(vcpu, fault);
-       return false;
  }
  EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
  
@@@ -1011,10 -1029,15 +1029,10 @@@ void kvm_load_host_xsave_state(struct k
  }
  EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
  
 -static inline u64 kvm_guest_supported_xcr0(struct kvm_vcpu *vcpu)
 -{
 -      return vcpu->arch.guest_fpu.fpstate->user_xfeatures;
 -}
 -
  #ifdef CONFIG_X86_64
  static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu)
  {
 -      return kvm_guest_supported_xcr0(vcpu) & XFEATURE_MASK_USER_DYNAMIC;
 +      return vcpu->arch.guest_supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC;
  }
  #endif
  
@@@ -1037,7 -1060,7 +1055,7 @@@ static int __kvm_set_xcr(struct kvm_vcp
         * saving.  However, xcr0 bit 0 is always set, even if the
         * emulated CPU does not support XSAVE (see kvm_vcpu_reset()).
         */
 -      valid_bits = kvm_guest_supported_xcr0(vcpu) | XFEATURE_MASK_FP;
 +      valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
        if (xcr0 & ~valid_bits)
                return 1;
  
  
  int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
  {
 +      /* Note, #UD due to CR4.OSXSAVE=0 has priority over the intercept. */
        if (static_call(kvm_x86_get_cpl)(vcpu) != 0 ||
            __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) {
                kvm_inject_gp(vcpu, 0);
@@@ -1553,32 -1575,12 +1571,32 @@@ static const u32 msr_based_features_all
  static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
  static unsigned int num_msr_based_features;
  
 +/*
 + * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
 + * does not yet virtualize. These include:
 + *   10 - MISC_PACKAGE_CTRLS
 + *   11 - ENERGY_FILTERING_CTL
 + *   12 - DOITM
 + *   18 - FB_CLEAR_CTRL
 + *   21 - XAPIC_DISABLE_STATUS
 + *   23 - OVERCLOCKING_STATUS
 + */
 +
 +#define KVM_SUPPORTED_ARCH_CAP \
 +      (ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \
 +       ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
 +       ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
 +       ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
 +       ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO)
 +
  static u64 kvm_get_arch_capabilities(void)
  {
        u64 data = 0;
  
 -      if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
 +      if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
                rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
 +              data &= KVM_SUPPORTED_ARCH_CAP;
 +      }
  
        /*
         * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
                 */
        }
  
 -      /* Guests don't need to know "Fill buffer clear control" exists */
 -      data &= ~ARCH_CAP_FB_CLEAR_CTRL;
 -
        return data;
  }
  
@@@ -4841,7 -4846,7 +4859,7 @@@ static int kvm_vcpu_ready_for_interrupt
        return (kvm_arch_interrupt_allowed(vcpu) &&
                kvm_cpu_accept_dm_intr(vcpu) &&
                !kvm_event_needs_reinjection(vcpu) &&
-               !vcpu->arch.exception.pending);
+               !kvm_is_exception_pending(vcpu));
  }
  
  static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
@@@ -5016,25 -5021,38 +5034,38 @@@ static int kvm_vcpu_ioctl_x86_set_mce(s
  static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
                                               struct kvm_vcpu_events *events)
  {
+       struct kvm_queued_exception *ex;
        process_nmi(vcpu);
  
        if (kvm_check_request(KVM_REQ_SMI, vcpu))
                process_smi(vcpu);
  
        /*
-        * In guest mode, payload delivery should be deferred,
-        * so that the L1 hypervisor can intercept #PF before
-        * CR2 is modified (or intercept #DB before DR6 is
-        * modified under nVMX). Unless the per-VM capability,
-        * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of
-        * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we
-        * opportunistically defer the exception payload, deliver it if the
-        * capability hasn't been requested before processing a
-        * KVM_GET_VCPU_EVENTS.
+        * KVM's ABI only allows for one exception to be migrated.  Luckily,
+        * the only time there can be two queued exceptions is if there's a
+        * non-exiting _injected_ exception, and a pending exiting exception.
+        * In that case, ignore the VM-Exiting exception as it's an extension
+        * of the injected exception.
+        */
+       if (vcpu->arch.exception_vmexit.pending &&
+           !vcpu->arch.exception.pending &&
+           !vcpu->arch.exception.injected)
+               ex = &vcpu->arch.exception_vmexit;
+       else
+               ex = &vcpu->arch.exception;
+       /*
+        * In guest mode, payload delivery should be deferred if the exception
+        * will be intercepted by L1, e.g. KVM should not modify CR2 if L1
+        * intercepts #PF, ditto for DR6 and #DBs.  If the per-VM capability,
+        * KVM_CAP_EXCEPTION_PAYLOAD, is not set, userspace may or may not
+        * propagate the payload and so it cannot be safely deferred.  Deliver
+        * the payload if the capability hasn't been requested.
         */
        if (!vcpu->kvm->arch.exception_payload_enabled &&
-           vcpu->arch.exception.pending && vcpu->arch.exception.has_payload)
-               kvm_deliver_exception_payload(vcpu);
+           ex->pending && ex->has_payload)
+               kvm_deliver_exception_payload(vcpu, ex);
  
        /*
         * The API doesn't provide the instruction length for software
         * isn't advanced, we should expect to encounter the exception
         * again.
         */
-       if (kvm_exception_is_soft(vcpu->arch.exception.nr)) {
+       if (kvm_exception_is_soft(ex->vector)) {
                events->exception.injected = 0;
                events->exception.pending = 0;
        } else {
-               events->exception.injected = vcpu->arch.exception.injected;
-               events->exception.pending = vcpu->arch.exception.pending;
+               events->exception.injected = ex->injected;
+               events->exception.pending = ex->pending;
                /*
                 * For ABI compatibility, deliberately conflate
                 * pending and injected exceptions when
                 * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
                 */
                if (!vcpu->kvm->arch.exception_payload_enabled)
-                       events->exception.injected |=
-                               vcpu->arch.exception.pending;
+                       events->exception.injected |= ex->pending;
        }
-       events->exception.nr = vcpu->arch.exception.nr;
-       events->exception.has_error_code = vcpu->arch.exception.has_error_code;
-       events->exception.error_code = vcpu->arch.exception.error_code;
-       events->exception_has_payload = vcpu->arch.exception.has_payload;
-       events->exception_payload = vcpu->arch.exception.payload;
+       events->exception.nr = ex->vector;
+       events->exception.has_error_code = ex->has_error_code;
+       events->exception.error_code = ex->error_code;
+       events->exception_has_payload = ex->has_payload;
+       events->exception_payload = ex->payload;
  
        events->interrupt.injected =
                vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
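For context, the single slot filled in above is what userspace reads back with KVM_GET_VCPU_EVENTS. A minimal, illustrative sketch follows; error handling is elided and vcpu_fd is assumed to be an already-open vCPU file descriptor.

#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

/* Dump the one migratable exception slot of an existing vCPU. */
static void dump_exception(int vcpu_fd)
{
	struct kvm_vcpu_events events;

	if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &events))
		return;

	printf("vector=%u pending=%u injected=%u err=%u/%#x payload=%u/%#llx\n",
	       events.exception.nr, events.exception.pending,
	       events.exception.injected, events.exception.has_error_code,
	       events.exception.error_code, events.exception_has_payload,
	       (unsigned long long)events.exception_payload);
}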
@@@ -5131,9 -5148,22 +5161,22 @@@ static int kvm_vcpu_ioctl_x86_set_vcpu_
                return -EINVAL;
  
        process_nmi(vcpu);
+       /*
+        * Flag that userspace is stuffing an exception; the next KVM_RUN will
+        * morph the exception to a VM-Exit if appropriate.  Do this only for
+        * pending exceptions, as already-injected exceptions are not subject to
+        * interception.  Note, userspace that conflates pending and injected
+        * is hosed, and will incorrectly convert an injected exception into a
+        * pending exception, which in turn may cause a spurious VM-Exit.
+        */
+       vcpu->arch.exception_from_userspace = events->exception.pending;
+       vcpu->arch.exception_vmexit.pending = false;
        vcpu->arch.exception.injected = events->exception.injected;
        vcpu->arch.exception.pending = events->exception.pending;
-       vcpu->arch.exception.nr = events->exception.nr;
+       vcpu->arch.exception.vector = events->exception.nr;
        vcpu->arch.exception.has_error_code = events->exception.has_error_code;
        vcpu->arch.exception.error_code = events->exception.error_code;
        vcpu->arch.exception.has_payload = events->exception_has_payload;
@@@ -7257,6 -7287,7 +7300,7 @@@ static int kvm_can_emulate_insn(struct 
  int handle_ud(struct kvm_vcpu *vcpu)
  {
        static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
+       int fep_flags = READ_ONCE(force_emulation_prefix);
        int emul_type = EMULTYPE_TRAP_UD;
        char sig[5]; /* ud2; .ascii "kvm" */
        struct x86_exception e;
        if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0)))
                return 1;
  
-       if (force_emulation_prefix &&
+       if (fep_flags &&
            kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
                                sig, sizeof(sig), &e) == 0 &&
            memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) {
+               if (fep_flags & KVM_FEP_CLEAR_RFLAGS_RF)
+                       kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) & ~X86_EFLAGS_RF);
                kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
                emul_type = EMULTYPE_TRAP_UD_FORCED;
        }
@@@ -7933,14 -7966,20 +7979,20 @@@ static int emulator_get_msr_with_filter
        int r;
  
        r = kvm_get_msr_with_filter(vcpu, msr_index, pdata);
+       if (r < 0)
+               return X86EMUL_UNHANDLEABLE;
  
-       if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
-                                   complete_emulated_rdmsr, r)) {
-               /* Bounce to user space */
-               return X86EMUL_IO_NEEDED;
+       if (r) {
+               if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
+                                      complete_emulated_rdmsr, r))
+                       return X86EMUL_IO_NEEDED;
+               trace_kvm_msr_read_ex(msr_index);
+               return X86EMUL_PROPAGATE_FAULT;
        }
  
-       return r;
+       trace_kvm_msr_read(msr_index, *pdata);
+       return X86EMUL_CONTINUE;
  }
  
  static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt,
        int r;
  
        r = kvm_set_msr_with_filter(vcpu, msr_index, data);
+       if (r < 0)
+               return X86EMUL_UNHANDLEABLE;
  
-       if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
-                                   complete_emulated_msr_access, r)) {
-               /* Bounce to user space */
-               return X86EMUL_IO_NEEDED;
+       if (r) {
+               if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
+                                      complete_emulated_msr_access, r))
+                       return X86EMUL_IO_NEEDED;
+               trace_kvm_msr_write_ex(msr_index, data);
+               return X86EMUL_PROPAGATE_FAULT;
        }
  
-       return r;
+       trace_kvm_msr_write(msr_index, data);
+       return X86EMUL_CONTINUE;
  }
  
  static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
@@@ -8161,18 -8206,17 +8219,17 @@@ static void toggle_interruptibility(str
        }
  }
  
- static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
+ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
  {
        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
-       if (ctxt->exception.vector == PF_VECTOR)
-               return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
  
-       if (ctxt->exception.error_code_valid)
+       if (ctxt->exception.vector == PF_VECTOR)
+               kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
+       else if (ctxt->exception.error_code_valid)
                kvm_queue_exception_e(vcpu, ctxt->exception.vector,
                                      ctxt->exception.error_code);
        else
                kvm_queue_exception(vcpu, ctxt->exception.vector);
-       return false;
  }
  
  static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
@@@ -8548,8 -8592,46 +8605,46 @@@ int kvm_skip_emulated_instruction(struc
  }
  EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
  
- static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int *r)
+ static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu)
  {
+       u32 shadow;
+       if (kvm_get_rflags(vcpu) & X86_EFLAGS_RF)
+               return true;
+       /*
+        * Intel CPUs inhibit code #DBs when MOV/POP SS blocking is active,
+        * but AMD CPUs do not.  MOV/POP SS blocking is rare, check that first
+        * to avoid the relatively expensive CPUID lookup.
+        */
+       shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+       return (shadow & KVM_X86_SHADOW_INT_MOV_SS) &&
+              guest_cpuid_is_intel(vcpu);
+ }
+ static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu,
+                                          int emulation_type, int *r)
+ {
+       WARN_ON_ONCE(emulation_type & EMULTYPE_NO_DECODE);
+       /*
+        * Do not check for code breakpoints if hardware has already done the
+        * checks, as inferred from the emulation type.  On NO_DECODE and SKIP,
+        * the instruction has passed all exception checks, and all intercepted
+        * exceptions that trigger emulation have lower priority than code
+        * breakpoints, i.e. the fact that the intercepted exception occurred
+        * means any code breakpoints have already been serviced.
+        *
+        * Note, KVM needs to check for code #DBs on EMULTYPE_TRAP_UD_FORCED as
+        * hardware has checked the RIP of the magic prefix, but not the RIP of
+        * the instruction being emulated.  The intent of forced emulation is
+        * to behave as if KVM intercepted the instruction without an exception
+        * and without a prefix.
+        */
+       if (emulation_type & (EMULTYPE_NO_DECODE | EMULTYPE_SKIP |
+                             EMULTYPE_TRAP_UD | EMULTYPE_VMWARE_GP | EMULTYPE_PF))
+               return false;
        if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
            (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
                struct kvm_run *kvm_run = vcpu->run;
        }
  
        if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
-           !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) {
+           !kvm_is_code_breakpoint_inhibited(vcpu)) {
                unsigned long eip = kvm_get_linear_rip(vcpu);
                u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
                                           vcpu->arch.dr7,
@@@ -8671,8 -8753,7 +8766,7 @@@ int x86_emulate_instruction(struct kvm_
                 * are fault-like and are higher priority than any faults on
                 * the code fetch itself.
                 */
-               if (!(emulation_type & EMULTYPE_SKIP) &&
-                   kvm_vcpu_check_code_breakpoint(vcpu, &r))
+               if (kvm_vcpu_check_code_breakpoint(vcpu, emulation_type, &r))
                        return r;
  
                r = x86_decode_emulated_instruction(vcpu, emulation_type,
@@@ -8770,8 -8851,7 +8864,7 @@@ restart
  
        if (ctxt->have_exception) {
                r = 1;
-               if (inject_emulated_exception(vcpu))
-                       return r;
+               inject_emulated_exception(vcpu);
        } else if (vcpu->arch.pio.count) {
                if (!vcpu->arch.pio.in) {
                        /* FIXME: return into emulator if single-stepping.  */
@@@ -8801,6 -8881,12 +8894,12 @@@ writeback
                unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
                toggle_interruptibility(vcpu, ctxt->interruptibility);
                vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+               /*
+                * Note, EXCPT_DB is assumed to be fault-like as the emulator
+                * only supports code breakpoints and general detect #DB, both
+                * of which are fault-like.
+                */
                if (!ctxt->have_exception ||
                    exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
                        kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
@@@ -9662,74 -9748,155 +9761,155 @@@ int kvm_check_nested_events(struct kvm_
  
  static void kvm_inject_exception(struct kvm_vcpu *vcpu)
  {
-       trace_kvm_inj_exception(vcpu->arch.exception.nr,
+       trace_kvm_inj_exception(vcpu->arch.exception.vector,
                                vcpu->arch.exception.has_error_code,
                                vcpu->arch.exception.error_code,
                                vcpu->arch.exception.injected);
  
        if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
                vcpu->arch.exception.error_code = false;
-       static_call(kvm_x86_queue_exception)(vcpu);
+       static_call(kvm_x86_inject_exception)(vcpu);
  }
  
- static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
+ /*
+  * Check for any event (interrupt or exception) that is ready to be injected,
+  * and if there is at least one event, inject the event with the highest
+  * priority.  This handles both "pending" events, i.e. events that have never
+  * been injected into the guest, and "injected" events, i.e. events that were
+  * injected as part of a previous VM-Enter, but weren't successfully delivered
+  * and need to be re-injected.
+  *
+  * Note, this is not guaranteed to be invoked on a guest instruction boundary,
+  * i.e. doesn't guarantee that there's an event window in the guest.  KVM must
+  * be able to inject exceptions in the "middle" of an instruction, and so must
+  * also be able to re-inject NMIs and IRQs in the middle of an instruction.
+  * I.e. for exceptions and re-injected events, NOT invoking this on instruction
+  * boundaries is necessary and correct.
+  *
+  * For simplicity, KVM uses a single path to inject all events (except events
+  * that are injected directly from L1 to L2) and doesn't explicitly track
+  * instruction boundaries for asynchronous events.  However, because VM-Exits
+  * that can occur during instruction execution typically result in KVM skipping
+  * the instruction or injecting an exception, e.g. instruction and exception
+  * intercepts, and because pending exceptions have higher priority than pending
+  * interrupts, KVM still honors instruction boundaries in most scenarios.
+  *
+  * But, if a VM-Exit occurs during instruction execution, and KVM does NOT skip
+  * the instruction or inject an exception, then KVM can incorrectly inject a new
+  * asynchronous event if the event became pending after the CPU fetched the
+  * instruction (in the guest).  E.g. if a page fault (#PF, #NPF, EPT violation)
+  * occurs and is resolved by KVM, a coincident NMI, SMI, IRQ, etc... can be
+  * injected on the restarted instruction instead of being deferred until the
+  * instruction completes.
+  *
+  * In practice, this virtualization hole is unlikely to be observed by the
+  * guest, and even less likely to cause functional problems.  To detect the
+  * hole, the guest would have to trigger an event on a side effect of an early
+  * phase of instruction execution, e.g. on the instruction fetch from memory.
+  * And for it to be a functional problem, the guest would need to depend on the
+  * ordering between that side effect, the instruction completing, _and_ the
+  * delivery of the asynchronous event.
+  */
+ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
+                                      bool *req_immediate_exit)
  {
+       bool can_inject;
        int r;
-       bool can_inject = true;
  
-       /* try to reinject previous events if any */
+       /*
+        * Process nested events first, as nested VM-Exit supersedes event
+        * re-injection.  If there's an event queued for re-injection, it will
+        * be saved into the appropriate vmc{b,s}12 fields on nested VM-Exit.
+        */
+       if (is_guest_mode(vcpu))
+               r = kvm_check_nested_events(vcpu);
+       else
+               r = 0;
  
-       if (vcpu->arch.exception.injected) {
-               kvm_inject_exception(vcpu);
-               can_inject = false;
-       }
        /*
-        * Do not inject an NMI or interrupt if there is a pending
-        * exception.  Exceptions and interrupts are recognized at
-        * instruction boundaries, i.e. the start of an instruction.
-        * Trap-like exceptions, e.g. #DB, have higher priority than
-        * NMIs and interrupts, i.e. traps are recognized before an
-        * NMI/interrupt that's pending on the same instruction.
-        * Fault-like exceptions, e.g. #GP and #PF, are the lowest
-        * priority, but are only generated (pended) during instruction
-        * execution, i.e. a pending fault-like exception means the
-        * fault occurred on the *previous* instruction and must be
-        * serviced prior to recognizing any new events in order to
-        * fully complete the previous instruction.
+        * Re-inject exceptions and events *especially* if immediate entry+exit
+        * to/from L2 is needed, as any event that has already been injected
+        * into L2 needs to complete its lifecycle before injecting a new event.
+        *
+        * Don't re-inject an NMI or interrupt if there is a pending exception.
+        * This collision arises if an exception occurred while vectoring the
+        * injected event, KVM intercepted said exception, and KVM ultimately
+        * determined the fault belongs to the guest and queues the exception
+        * for injection back into the guest.
+        *
+        * "Injected" interrupts can also collide with pending exceptions if
+        * userspace ignores the "ready for injection" flag and blindly queues
+        * an interrupt.  In that case, prioritizing the exception is correct,
+        * as the exception "occurred" before the exit to userspace.  Trap-like
+        * exceptions, e.g. most #DBs, have higher priority than interrupts.
+        * And while fault-like exceptions, e.g. #GP and #PF, are the lowest
+        * priority, they're only generated (pended) during instruction
+        * execution, and interrupts are recognized at instruction boundaries.
+        * Thus a pending fault-like exception means the fault occurred on the
+        * *previous* instruction and must be serviced prior to recognizing any
+        * new events in order to fully complete the previous instruction.
         */
-       else if (!vcpu->arch.exception.pending) {
-               if (vcpu->arch.nmi_injected) {
-                       static_call(kvm_x86_inject_nmi)(vcpu);
-                       can_inject = false;
-               } else if (vcpu->arch.interrupt.injected) {
-                       static_call(kvm_x86_inject_irq)(vcpu, true);
-                       can_inject = false;
-               }
-       }
+       if (vcpu->arch.exception.injected)
+               kvm_inject_exception(vcpu);
+       else if (kvm_is_exception_pending(vcpu))
+               ; /* see above */
+       else if (vcpu->arch.nmi_injected)
+               static_call(kvm_x86_inject_nmi)(vcpu);
+       else if (vcpu->arch.interrupt.injected)
+               static_call(kvm_x86_inject_irq)(vcpu, true);
  
+       /*
+        * Exceptions that morph to VM-Exits are handled above, and pending
+        * exceptions on top of injected exceptions that do not VM-Exit should
+        * either morph to #DF or, sadly, override the injected exception.
+        */
        WARN_ON_ONCE(vcpu->arch.exception.injected &&
                     vcpu->arch.exception.pending);
  
        /*
-        * Call check_nested_events() even if we reinjected a previous event
-        * in order for caller to determine if it should require immediate-exit
-        * from L2 to L1 due to pending L1 events which require exit
-        * from L2 to L1.
+        * Bail if immediate entry+exit to/from the guest is needed to complete
+        * nested VM-Enter or event re-injection so that a different pending
+        * event can be serviced (or if KVM needs to exit to userspace).
+        *
+        * Otherwise, continue processing events even if VM-Exit occurred.  The
+        * VM-Exit will have cleared exceptions that were meant for L2, but
+        * there may now be events that can be injected into L1.
         */
-       if (is_guest_mode(vcpu)) {
-               r = kvm_check_nested_events(vcpu);
-               if (r < 0)
-                       goto out;
-       }
+       if (r < 0)
+               goto out;
+       /*
+        * A pending exception VM-Exit should either result in nested VM-Exit
+        * or force an immediate re-entry and exit to/from L2, and exception
+        * VM-Exits cannot be injected (flag should _never_ be set).
+        */
+       WARN_ON_ONCE(vcpu->arch.exception_vmexit.injected ||
+                    vcpu->arch.exception_vmexit.pending);
+       /*
+        * New events, other than exceptions, cannot be injected if KVM needs
+        * to re-inject a previous event.  See above comments on re-injecting
+        * for why pending exceptions get priority.
+        */
+       can_inject = !kvm_event_needs_reinjection(vcpu);
  
-       /* try to inject new event if pending */
        if (vcpu->arch.exception.pending) {
-               if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
+               /*
+                * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS
+                * value pushed on the stack.  Trap-like exceptions and all #DBs
+                * leave RF as-is (KVM follows Intel's behavior in this regard;
+                * AMD states that code breakpoint #DBs explicitly clear RF=0).
+                *
+                * Note, most versions of Intel's SDM and AMD's APM incorrectly
+                * describe the behavior of General Detect #DBs, which are
+                * fault-like.  They do _not_ set RF, a la code breakpoints.
+                */
+               if (exception_type(vcpu->arch.exception.vector) == EXCPT_FAULT)
                        __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
                                             X86_EFLAGS_RF);
  
-               if (vcpu->arch.exception.nr == DB_VECTOR) {
-                       kvm_deliver_exception_payload(vcpu);
+               if (vcpu->arch.exception.vector == DB_VECTOR) {
+                       kvm_deliver_exception_payload(vcpu, &vcpu->arch.exception);
                        if (vcpu->arch.dr7 & DR7_GD) {
                                vcpu->arch.dr7 &= ~DR7_GD;
                                kvm_update_dr7(vcpu);
        }
  
        if (is_guest_mode(vcpu) &&
-           kvm_x86_ops.nested_ops->hv_timer_pending &&
-           kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
+           kvm_x86_ops.nested_ops->has_events &&
+           kvm_x86_ops.nested_ops->has_events(vcpu))
                *req_immediate_exit = true;
  
-       WARN_ON(vcpu->arch.exception.pending);
+       WARN_ON(kvm_is_exception_pending(vcpu));
        return 0;
  
  out:
@@@ -10110,7 -10277,7 +10290,7 @@@ void kvm_vcpu_update_apicv(struct kvm_v
         * When APICv gets disabled, we may still have injected interrupts
         * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was
         * still active when the interrupt got accepted. Make sure
-        * inject_pending_event() is called to check for that.
+        * kvm_check_and_inject_events() is called to check for that.
         */
        if (!apic->apicv_active)
                kvm_make_request(KVM_REQ_EVENT, vcpu);
@@@ -10407,7 -10574,7 +10587,7 @@@ static int vcpu_enter_guest(struct kvm_
                        goto out;
                }
  
-               r = inject_pending_event(vcpu, &req_immediate_exit);
+               r = kvm_check_and_inject_events(vcpu, &req_immediate_exit);
                if (r < 0) {
                        r = 0;
                        goto out;
@@@ -10646,10 -10813,26 +10826,26 @@@ static inline int vcpu_block(struct kvm
                if (hv_timer)
                        kvm_lapic_switch_to_hv_timer(vcpu);
  
-               if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
+               /*
+                * If the vCPU is not runnable, a signal or another host event
+                * of some kind is pending; service it without changing the
+                * vCPU's activity state.
+                */
+               if (!kvm_arch_vcpu_runnable(vcpu))
                        return 1;
        }
  
+       /*
+        * Evaluate nested events before exiting the halted state.  This allows
+        * the halt state to be recorded properly in the VMCS12's activity
+        * state field (AMD does not have a similar field and a VM-Exit always
+        * causes a spurious wakeup from HLT).
+        */
+       if (is_guest_mode(vcpu)) {
+               if (kvm_check_nested_events(vcpu) < 0)
+                       return 0;
+       }
        if (kvm_apic_accept_events(vcpu) < 0)
                return 0;
        switch(vcpu->arch.mp_state) {
        case KVM_MP_STATE_INIT_RECEIVED:
                break;
        default:
 -              return -EINTR;
 +              WARN_ON_ONCE(1);
 +              break;
        }
        return 1;
  }
  
  static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
  {
-       if (is_guest_mode(vcpu))
-               kvm_check_nested_events(vcpu);
        return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
                !vcpu->arch.apf.halted);
  }
@@@ -10824,6 -11003,7 +11017,7 @@@ static void kvm_put_guest_fpu(struct kv
  
  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
  {
+       struct kvm_queued_exception *ex = &vcpu->arch.exception;
        struct kvm_run *kvm_run = vcpu->run;
        int r;
  
                        r = 0;
                        goto out;
                }
-               kvm_clear_request(KVM_REQ_UNHALT, vcpu);
                r = -EAGAIN;
                if (signal_pending(current)) {
                        r = -EINTR;
                }
        }
  
+       /*
+        * If userspace set a pending exception and L2 is active, convert it to
+        * a pending VM-Exit if L1 wants to intercept the exception.
+        */
+       if (vcpu->arch.exception_from_userspace && is_guest_mode(vcpu) &&
+           kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, ex->vector,
+                                                       ex->error_code)) {
+               kvm_queue_exception_vmexit(vcpu, ex->vector,
+                                          ex->has_error_code, ex->error_code,
+                                          ex->has_payload, ex->payload);
+               ex->injected = false;
+               ex->pending = false;
+       }
+       vcpu->arch.exception_from_userspace = false;
        if (unlikely(vcpu->arch.complete_userspace_io)) {
                int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
                vcpu->arch.complete_userspace_io = NULL;
@@@ -10988,6 -11182,7 +11196,7 @@@ static void __set_regs(struct kvm_vcpu 
        kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
  
        vcpu->arch.exception.pending = false;
+       vcpu->arch.exception_vmexit.pending = false;
  
        kvm_make_request(KVM_REQ_EVENT, vcpu);
  }
@@@ -11107,29 -11302,17 +11316,30 @@@ int kvm_arch_vcpu_ioctl_set_mpstate(str
  
        vcpu_load(vcpu);
  
 -      if (!lapic_in_kernel(vcpu) &&
 -          mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
 +      switch (mp_state->mp_state) {
 +      case KVM_MP_STATE_UNINITIALIZED:
 +      case KVM_MP_STATE_HALTED:
 +      case KVM_MP_STATE_AP_RESET_HOLD:
 +      case KVM_MP_STATE_INIT_RECEIVED:
 +      case KVM_MP_STATE_SIPI_RECEIVED:
 +              if (!lapic_in_kernel(vcpu))
 +                      goto out;
 +              break;
 +
 +      case KVM_MP_STATE_RUNNABLE:
 +              break;
 +
 +      default:
                goto out;
 +      }
  
        /*
-        * KVM_MP_STATE_INIT_RECEIVED means the processor is in
-        * INIT state; latched init should be reported using
-        * KVM_SET_VCPU_EVENTS, so reject it here.
+        * Pending INITs are reported using KVM_SET_VCPU_EVENTS; disallow
+        * forcing the guest into INIT/SIPI if those events are supposed to be
+        * blocked.  KVM prioritizes SMI over INIT, so reject INIT/SIPI state
+        * if an SMI is pending as well.
         */
-       if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) &&
+       if ((!kvm_apic_init_sipi_allowed(vcpu) || vcpu->arch.smi_pending) &&
            (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
             mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
                goto out;
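The states validated above arrive via the KVM_SET_MP_STATE ioctl. A minimal userspace sketch, with vcpu_fd assumed to be an open vCPU file descriptor and errors reduced to the ioctl return value:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/*
 * Ask KVM to place the vCPU in a given activity state, e.g.
 * KVM_MP_STATE_RUNNABLE or KVM_MP_STATE_HALTED.  With the checks above,
 * INIT/SIPI-related states are rejected without an in-kernel local APIC,
 * and also while INIT is blocked or an SMI is pending.
 */
static int set_mp_state(int vcpu_fd, __u32 state)
{
	struct kvm_mp_state mp = { .mp_state = state };

	return ioctl(vcpu_fd, KVM_SET_MP_STATE, &mp);
}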
@@@ -11368,7 -11551,7 +11578,7 @@@ int kvm_arch_vcpu_ioctl_set_guest_debug
  
        if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
                r = -EBUSY;
-               if (vcpu->arch.exception.pending)
+               if (kvm_is_exception_pending(vcpu))
                        goto out;
                if (dbg->control & KVM_GUESTDBG_INJECT_DB)
                        kvm_queue_exception(vcpu, DB_VECTOR);
@@@ -11590,7 -11773,7 +11800,7 @@@ int kvm_arch_vcpu_create(struct kvm_vcp
        vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64),
                                            GFP_KERNEL_ACCOUNT);
        if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks)
 -              goto fail_free_pio_data;
 +              goto fail_free_mce_banks;
        vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
  
        if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
@@@ -11644,6 -11827,7 +11854,6 @@@ free_wbinvd_dirty_mask
  fail_free_mce_banks:
        kfree(vcpu->arch.mce_banks);
        kfree(vcpu->arch.mci_ctl2_banks);
 -fail_free_pio_data:
        free_page((unsigned long)vcpu->arch.pio_data);
  fail_free_lapic:
        kvm_free_lapic(vcpu);
@@@ -11750,8 -11934,8 +11960,8 @@@ void kvm_vcpu_reset(struct kvm_vcpu *vc
                struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
  
                /*
-                * To avoid have the INIT path from kvm_apic_has_events() that be
-                * called with loaded FPU and does not let userspace fix the state.
+                * All paths that lead to INIT are required to load the guest's
+                * FPU state (because most paths are buried in KVM_RUN).
                 */
                if (init_event)
                        kvm_put_guest_fpu(vcpu);
@@@ -12080,6 -12264,10 +12290,10 @@@ int kvm_arch_init_vm(struct kvm *kvm, u
        if (ret)
                goto out_page_track;
  
+       ret = static_call(kvm_x86_vm_init)(kvm);
+       if (ret)
+               goto out_uninit_mmu;
        INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
        INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
        atomic_set(&kvm->arch.noncoherent_dma_count, 0);
        kvm_hv_init_vm(kvm);
        kvm_xen_init_vm(kvm);
  
-       return static_call(kvm_x86_vm_init)(kvm);
+       return 0;
  
+ out_uninit_mmu:
+       kvm_mmu_uninit_vm(kvm);
  out_page_track:
        kvm_page_track_cleanup(kvm);
  out:
@@@ -12589,13 -12779,14 +12805,14 @@@ static inline bool kvm_vcpu_has_events(
        if (!list_empty_careful(&vcpu->async_pf.done))
                return true;
  
-       if (kvm_apic_has_events(vcpu))
+       if (kvm_apic_has_pending_init_or_sipi(vcpu) &&
+           kvm_apic_init_sipi_allowed(vcpu))
                return true;
  
        if (vcpu->arch.pv.pv_unhalted)
                return true;
  
-       if (vcpu->arch.exception.pending)
+       if (kvm_is_exception_pending(vcpu))
                return true;
  
        if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
                return true;
  
        if (is_guest_mode(vcpu) &&
-           kvm_x86_ops.nested_ops->hv_timer_pending &&
-           kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
+           kvm_x86_ops.nested_ops->has_events &&
+           kvm_x86_ops.nested_ops->has_events(vcpu))
                return true;
  
        if (kvm_xen_has_pending_events(vcpu))
                return true;
  
-       if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu))
-               return true;
        return false;
  }
  
@@@ -12850,7 -13038,7 +13064,7 @@@ bool kvm_can_do_async_pf(struct kvm_vcp
  {
        if (unlikely(!lapic_in_kernel(vcpu) ||
                     kvm_event_needs_reinjection(vcpu) ||
-                    vcpu->arch.exception.pending))
+                    kvm_is_exception_pending(vcpu)))
                return false;
  
        if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
@@@ -13401,7 -13589,7 +13615,7 @@@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_vi
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
- EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
diff --combined mm/page_alloc.c
index d04211f0ef0b142c9625b47e4115e0515804590e,90461bd947448ed0e3934435aa912463f9b896c8..08522a831c7a7294a2f5c37879464b2e1cde00ff
@@@ -4708,30 -4708,6 +4708,30 @@@ void fs_reclaim_release(gfp_t gfp_mask
  EXPORT_SYMBOL_GPL(fs_reclaim_release);
  #endif
  
 +/*
 + * Zonelists may change due to hotplug during allocation. Detect when zonelists
 + * have been rebuilt so the allocation can be retried. The reader side does not
 + * lock and simply retries the allocation if the zonelist changed. The writer
 + * side is protected by the seqlock's embedded spin_lock.
 + */
 +static DEFINE_SEQLOCK(zonelist_update_seq);
 +
 +static unsigned int zonelist_iter_begin(void)
 +{
 +      if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
 +              return read_seqbegin(&zonelist_update_seq);
 +
 +      return 0;
 +}
 +
 +static unsigned int check_retry_zonelist(unsigned int seq)
 +{
 +      if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
 +              return read_seqretry(&zonelist_update_seq, seq);
 +
 +      return seq;
 +}
 +
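The sketch below illustrates the reader-retry idea in plain C11 atomics, using sequentially consistent operations for simplicity; it is a model of the pattern, not the kernel's seqlock_t (which relies on carefully placed barriers plus an embedded spinlock for writers), and all names are local to the example.

#include <stdatomic.h>
#include <stdbool.h>

/* Even: zonelists stable.  Odd: a rebuild is in progress. */
static atomic_uint zonelist_seq;

static unsigned int zonelist_read_begin(void)
{
	unsigned int seq;

	do {
		seq = atomic_load(&zonelist_seq);
	} while (seq & 1);	/* wait out an in-flight rebuild */
	return seq;
}

static bool zonelist_read_retry(unsigned int seq)
{
	return atomic_load(&zonelist_seq) != seq;
}

static void zonelist_rebuild(void (*rebuild)(void))
{
	atomic_fetch_add(&zonelist_seq, 1);	/* readers will now retry */
	rebuild();
	atomic_fetch_add(&zonelist_seq, 1);	/* back to even: stable again */
}

An allocator-style caller pairs zonelist_read_begin() with zonelist_read_retry() around its slow path and restarts when the cookie goes stale, which is what the restart label further down in this hunk does with zonelist_iter_begin()/check_retry_zonelist().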
  /* Perform direct synchronous page reclaim */
  static unsigned long
  __perform_reclaim(gfp_t gfp_mask, unsigned int order,
@@@ -5025,7 -5001,6 +5025,7 @@@ __alloc_pages_slowpath(gfp_t gfp_mask, 
        int compaction_retries;
        int no_progress_loops;
        unsigned int cpuset_mems_cookie;
 +      unsigned int zonelist_iter_cookie;
        int reserve_flags;
  
        /*
                                (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
                gfp_mask &= ~__GFP_ATOMIC;
  
 -retry_cpuset:
 +restart:
        compaction_retries = 0;
        no_progress_loops = 0;
        compact_priority = DEF_COMPACT_PRIORITY;
        cpuset_mems_cookie = read_mems_allowed_begin();
 +      zonelist_iter_cookie = zonelist_iter_begin();
  
        /*
         * The fast path uses conservative alloc_flags to succeed only until
@@@ -5213,13 -5187,9 +5213,13 @@@ retry
                goto retry;
  
  
 -      /* Deal with possible cpuset update races before we start OOM killing */
 -      if (check_retry_cpuset(cpuset_mems_cookie, ac))
 -              goto retry_cpuset;
 +      /*
 +       * Deal with possible cpuset update races or zonelist updates to avoid
 +       * an unnecessary OOM kill.
 +       */
 +      if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
 +          check_retry_zonelist(zonelist_iter_cookie))
 +              goto restart;
  
        /* Reclaim has failed us, start killing things */
        page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
        }
  
  nopage:
 -      /* Deal with possible cpuset update races before we fail */
 -      if (check_retry_cpuset(cpuset_mems_cookie, ac))
 -              goto retry_cpuset;
 +      /*
 +       * Deal with possible cpuset update races or zonelist updates to avoid
 +       * an unnecessary OOM kill.
 +       */
 +      if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
 +          check_retry_zonelist(zonelist_iter_cookie))
 +              goto restart;
  
        /*
         * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
@@@ -5740,18 -5706,6 +5740,18 @@@ refill
                /* reset page count bias and offset to start of new frag */
                nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
                offset = size - fragsz;
 +              if (unlikely(offset < 0)) {
 +                      /*
 +                       * The caller is trying to allocate a fragment
 +                       * with fragsz > PAGE_SIZE but the cache isn't big
 +                       * enough to satisfy the request; this may
 +                       * happen in low memory conditions.
 +                       * We don't release the cache page because
 +                       * it could make memory pressure worse
 +                       * so we simply return NULL here.
 +                       */
 +                      return NULL;
 +              }
        }
  
        nc->pagecnt_bias--;
@@@ -6085,7 -6039,8 +6085,8 @@@ void show_free_areas(unsigned int filte
                " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
                " unevictable:%lu dirty:%lu writeback:%lu\n"
                " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
-               " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
+               " mapped:%lu shmem:%lu pagetables:%lu\n"
+               " sec_pagetables:%lu bounce:%lu\n"
                " kernel_misc_reclaimable:%lu\n"
                " free:%lu free_pcp:%lu free_cma:%lu\n",
                global_node_page_state(NR_ACTIVE_ANON),
                global_node_page_state(NR_FILE_MAPPED),
                global_node_page_state(NR_SHMEM),
                global_node_page_state(NR_PAGETABLE),
+               global_node_page_state(NR_SECONDARY_PAGETABLE),
                global_zone_page_state(NR_BOUNCE),
                global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
                global_zone_page_state(NR_FREE_PAGES),
                        " shadow_call_stack:%lukB"
  #endif
                        " pagetables:%lukB"
+                       " sec_pagetables:%lukB"
                        " all_unreclaimable? %s"
                        "\n",
                        pgdat->node_id,
                        node_page_state(pgdat, NR_KERNEL_SCS_KB),
  #endif
                        K(node_page_state(pgdat, NR_PAGETABLE)),
+                       K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
                        pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
                                "yes" : "no");
        }
@@@ -6560,8 -6518,9 +6564,8 @@@ static void __build_all_zonelists(void 
        int nid;
        int __maybe_unused cpu;
        pg_data_t *self = data;
 -      static DEFINE_SPINLOCK(lock);
  
 -      spin_lock(&lock);
 +      write_seqlock(&zonelist_update_seq);
  
  #ifdef CONFIG_NUMA
        memset(node_load, 0, sizeof(node_load));
  #endif
        }
  
 -      spin_unlock(&lock);
 +      write_sequnlock(&zonelist_update_seq);
  }
  
  static noinline void __init
diff --combined mm/vmstat.c
index 90af9a8572f5a7073520ddaf2f4d1d3aaec2b7ac,b937eba681d1533e773e201fd62ad9edcbe38cc2..da264a040c5567d0ac192ed001134ee67e6291b7
@@@ -1168,15 -1168,8 +1168,15 @@@ int fragmentation_index(struct zone *zo
  #define TEXT_FOR_HIGHMEM(xx)
  #endif
  
 +#ifdef CONFIG_ZONE_DEVICE
 +#define TEXT_FOR_DEVICE(xx) xx "_device",
 +#else
 +#define TEXT_FOR_DEVICE(xx)
 +#endif
 +
  #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
 -                                      TEXT_FOR_HIGHMEM(xx) xx "_movable",
 +                                      TEXT_FOR_HIGHMEM(xx) xx "_movable", \
 +                                      TEXT_FOR_DEVICE(xx)
  
  const char * const vmstat_text[] = {
        /* enum zone_stat_item counters */
        "nr_shadow_call_stack",
  #endif
        "nr_page_table_pages",
+       "nr_sec_page_table_pages",
  #ifdef CONFIG_SWAP
        "nr_swapcached",
  #endif
index 6448cb9f710f5fc9095a872edaaa52589094f00a,8b1b32628ac8426657566db7817c8ef27390b399..fde3ae8cfa4c7cc1b909d9caa17a152365ffb8b1
@@@ -48,8 -48,6 +48,8 @@@ LIBKVM += lib/rbtree.
  LIBKVM += lib/sparsebit.c
  LIBKVM += lib/test_util.c
  
 +LIBKVM_STRING += lib/string_override.c
 +
  LIBKVM_x86_64 += lib/x86_64/apic.c
  LIBKVM_x86_64 += lib/x86_64/handlers.S
  LIBKVM_x86_64 += lib/x86_64/perf_test_util.c
@@@ -91,6 -89,7 +91,7 @@@ TEST_GEN_PROGS_x86_64 += x86_64/kvm_clo
  TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test
  TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
  TEST_GEN_PROGS_x86_64 += x86_64/monitor_mwait_test
+ TEST_GEN_PROGS_x86_64 += x86_64/nested_exceptions_test
  TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
  TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test
  TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id
@@@ -222,8 -221,7 +223,8 @@@ LIBKVM_C := $(filter %.c,$(LIBKVM)
  LIBKVM_S := $(filter %.S,$(LIBKVM))
  LIBKVM_C_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_C))
  LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S))
 -LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ)
 +LIBKVM_STRING_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_STRING))
 +LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(LIBKVM_STRING_OBJ)
  
  EXTRA_CLEAN += $(LIBKVM_OBJS) cscope.*
  
@@@ -234,12 -232,6 +235,12 @@@ $(LIBKVM_C_OBJ): $(OUTPUT)/%.o: %.
  $(LIBKVM_S_OBJ): $(OUTPUT)/%.o: %.S
        $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@
  
 +# Compile the string overrides as freestanding to prevent the compiler from
 +# generating self-referential code, e.g. without "freestanding" the compiler may
 +# "optimize" memcmp() by invoking memcmp(), thus causing infinite recursion.
 +$(LIBKVM_STRING_OBJ): $(OUTPUT)/%.o: %.c
 +      $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c -ffreestanding $< -o $@
 +
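To make the failure mode concrete: an override along the lines of the hypothetical one below is exactly the kind of code that, when built without -ffreestanding (or -fno-builtin), the optimizer is allowed to recognize as the standard routine and lower back into a call to memcmp(), i.e. into itself. This is a sketch, not the selftests' actual lib/string_override.c.

#include <stddef.h>

/* Hypothetical string override; see the note above. */
int memcmp(const void *a, const void *b, size_t len)
{
	const unsigned char *pa = a, *pb = b;
	size_t i;

	/*
	 * Byte-by-byte comparison that the compiler may pattern-match back
	 * into a memcmp() call unless the file is built freestanding.
	 */
	for (i = 0; i < len; i++) {
		if (pa[i] != pb[i])
			return pa[i] < pb[i] ? -1 : 1;
	}
	return 0;
}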
  x := $(shell mkdir -p $(sort $(dir $(TEST_GEN_PROGS))))
  $(TEST_GEN_PROGS): $(LIBKVM_OBJS)
  $(TEST_GEN_PROGS_EXTENDED): $(LIBKVM_OBJS)
index 790c6d1ecb3482b1219f61f94516c9c2a297f67f,d07f13c9fced12527a14eb4745739f961f9e28d0..71b290b6469d6ba9f4230c016e3d10156c034648
@@@ -8,6 -8,8 +8,8 @@@
  #ifndef SELFTEST_KVM_VMX_H
  #define SELFTEST_KVM_VMX_H
  
+ #include <asm/vmx.h>
  #include <stdint.h>
  #include "processor.h"
  #include "apic.h"
  #define VMX_EPT_VPID_CAP_AD_BITS              0x00200000
  
  #define EXIT_REASON_FAILED_VMENTRY    0x80000000
- #define EXIT_REASON_EXCEPTION_NMI     0
- #define EXIT_REASON_EXTERNAL_INTERRUPT        1
- #define EXIT_REASON_TRIPLE_FAULT      2
- #define EXIT_REASON_INTERRUPT_WINDOW  7
- #define EXIT_REASON_NMI_WINDOW                8
- #define EXIT_REASON_TASK_SWITCH               9
- #define EXIT_REASON_CPUID             10
- #define EXIT_REASON_HLT                       12
- #define EXIT_REASON_INVD              13
- #define EXIT_REASON_INVLPG            14
- #define EXIT_REASON_RDPMC             15
- #define EXIT_REASON_RDTSC             16
- #define EXIT_REASON_VMCALL            18
- #define EXIT_REASON_VMCLEAR           19
- #define EXIT_REASON_VMLAUNCH          20
- #define EXIT_REASON_VMPTRLD           21
- #define EXIT_REASON_VMPTRST           22
- #define EXIT_REASON_VMREAD            23
- #define EXIT_REASON_VMRESUME          24
- #define EXIT_REASON_VMWRITE           25
- #define EXIT_REASON_VMOFF             26
- #define EXIT_REASON_VMON              27
- #define EXIT_REASON_CR_ACCESS         28
- #define EXIT_REASON_DR_ACCESS         29
- #define EXIT_REASON_IO_INSTRUCTION    30
- #define EXIT_REASON_MSR_READ          31
- #define EXIT_REASON_MSR_WRITE         32
- #define EXIT_REASON_INVALID_STATE     33
- #define EXIT_REASON_MWAIT_INSTRUCTION 36
- #define EXIT_REASON_MONITOR_INSTRUCTION 39
- #define EXIT_REASON_PAUSE_INSTRUCTION 40
- #define EXIT_REASON_MCE_DURING_VMENTRY        41
- #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
- #define EXIT_REASON_APIC_ACCESS               44
- #define EXIT_REASON_EOI_INDUCED               45
- #define EXIT_REASON_EPT_VIOLATION     48
- #define EXIT_REASON_EPT_MISCONFIG     49
- #define EXIT_REASON_INVEPT            50
- #define EXIT_REASON_RDTSCP            51
- #define EXIT_REASON_PREEMPTION_TIMER  52
- #define EXIT_REASON_INVVPID           53
- #define EXIT_REASON_WBINVD            54
- #define EXIT_REASON_XSETBV            55
- #define EXIT_REASON_APIC_WRITE                56
- #define EXIT_REASON_INVPCID           58
- #define EXIT_REASON_PML_FULL          62
- #define EXIT_REASON_XSAVES            63
- #define EXIT_REASON_XRSTORS           64
- #define LAST_EXIT_REASON              64
  
  enum vmcs_field {
        VIRTUAL_PROCESSOR_ID            = 0x00000000,
        VMWRITE_BITMAP_HIGH             = 0x00002029,
        XSS_EXIT_BITMAP                 = 0x0000202C,
        XSS_EXIT_BITMAP_HIGH            = 0x0000202D,
+       ENCLS_EXITING_BITMAP            = 0x0000202E,
+       ENCLS_EXITING_BITMAP_HIGH       = 0x0000202F,
        TSC_MULTIPLIER                  = 0x00002032,
        TSC_MULTIPLIER_HIGH             = 0x00002033,
        GUEST_PHYSICAL_ADDRESS          = 0x00002400,
@@@ -617,7 -572,6 +572,7 @@@ void nested_map_memslot(struct vmx_page
                        uint32_t memslot);
  void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm,
                            uint64_t addr, uint64_t size);
 +bool kvm_vm_has_ept(struct kvm_vm *vm);
  void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm,
                  uint32_t eptp_memslot);
  void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm);