Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author     Linus Torvalds <torvalds@linux-foundation.org>
           Sun, 9 Oct 2022 16:39:55 +0000 (09:39 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Sun, 9 Oct 2022 16:39:55 +0000 (09:39 -0700)
Pull kvm updates from Paolo Bonzini:
 "The first batch of KVM patches, mostly covering x86.

  ARM:

   - Account stage2 page table allocations in memory stats

  x86:

   - Account EPT/NPT page table allocations in memory stats

   - Tracepoint cleanups/fixes for nested VM-Enter and emulated MSR
     accesses

   - Drop eVMCS controls filtering for KVM on Hyper-V, as all known
     versions of Hyper-V now support eVMCS fields associated with
     features that are enumerated to the guest

   - Use KVM's sanitized VMCS config as the basis for the values of
     nested VMX capabilities MSRs

   - A myriad of event/exception fixes and cleanups. Most notably, pending
     exceptions morph into VM-Exits earlier, as soon as the exception is
     queued, instead of waiting until the next vmentry. This fixed a
     longstanding issue where the exceptions would incorrectly become
     double-faults instead of triggering a vmexit; the common case of
     page-fault vmexits had a special workaround, but now it's fixed for
     good

   - A handful of fixes for memory leaks in error paths

   - Cleanups for VMREAD trampoline and VMX's VM-Exit assembly flow

   - Never write to memory from non-sleepable kvm_vcpu_check_block()

   - Selftests refinements and cleanups

   - Misc typo cleanups

  Generic:

   - remove KVM_REQ_UNHALT"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (94 commits)
  KVM: remove KVM_REQ_UNHALT
  KVM: mips, x86: do not rely on KVM_REQ_UNHALT
  KVM: x86: never write to memory from kvm_vcpu_check_block()
  KVM: x86: Don't snapshot pending INIT/SIPI prior to checking nested events
  KVM: nVMX: Make event request on VMXOFF iff INIT/SIPI is pending
  KVM: nVMX: Make an event request if INIT or SIPI is pending on VM-Enter
  KVM: SVM: Make an event request if INIT or SIPI is pending when GIF is set
  KVM: x86: lapic does not have to process INIT if it is blocked
  KVM: x86: Rename kvm_apic_has_events() to make it INIT/SIPI specific
  KVM: x86: Rename and expose helper to detect if INIT/SIPI are allowed
  KVM: nVMX: Make an event request when pending an MTF nested VM-Exit
  KVM: x86: make vendor code check for all nested events
  mailmap: Update Oliver's email address
  KVM: x86: Allow force_emulation_prefix to be written without a reload
  KVM: selftests: Add an x86-only test to verify nested exception queueing
  KVM: selftests: Use uapi header to get VMX and SVM exit reasons/codes
  KVM: x86: Rename inject_pending_events() to kvm_check_and_inject_events()
  KVM: VMX: Update MTF and ICEBP comments to document KVM's subtle behavior
  KVM: x86: Treat pending TRIPLE_FAULT requests as pending exceptions
  KVM: x86: Morph pending exceptions to pending VM-Exits at queue time
  ...
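
For readers scanning the per-architecture hunks below: the KVM_REQ_UNHALT removal mostly reduces to deleting the request-clearing call that used to follow kvm_vcpu_halt().  A minimal sketch of the old pattern, assuming a generic WFI-style halt path (arch_wfi_sketch is a made-up name, not a function in this merge):

    /*
     * Sketch only: pre-series halt paths cleared KVM_REQ_UNHALT after
     * kvm_vcpu_halt(); the arm64 and s390 hunks below simply delete that
     * call, since the request no longer exists after this series.
     */
    static void arch_wfi_sketch(struct kvm_vcpu *vcpu)
    {
            kvm_vcpu_halt(vcpu);
            kvm_clear_request(KVM_REQ_UNHALT, vcpu);        /* removed */
    }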

13 files changed:
.mailmap
arch/arm64/kvm/arm.c
arch/s390/kvm/kvm-s390.c
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/cpuid.c
arch/x86/kvm/emulate.c
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/vmx/nested.c
arch/x86/kvm/x86.c
mm/page_alloc.c
mm/vmstat.c
tools/testing/selftests/kvm/Makefile
tools/testing/selftests/kvm/include/x86_64/vmx.h

diff --combined .mailmap
index 191778125ef19f6ac941d9e74da45b4f1eb1f5ae,d1f7ed1019cf4fb5b60a777d4762568487e2a709..3e63fb0b10883c814b6b629d39455609de72775b
+++ b/.mailmap
@@@ -71,9 -71,6 +71,9 @@@ Ben M Cahill <ben.m.cahill@intel.com
  Ben Widawsky <bwidawsk@kernel.org> <ben@bwidawsk.net>
  Ben Widawsky <bwidawsk@kernel.org> <ben.widawsky@intel.com>
  Ben Widawsky <bwidawsk@kernel.org> <benjamin.widawsky@intel.com>
 +Bjorn Andersson <andersson@kernel.org> <bjorn@kryo.se>
 +Bjorn Andersson <andersson@kernel.org> <bjorn.andersson@linaro.org>
 +Bjorn Andersson <andersson@kernel.org> <bjorn.andersson@sonymobile.com>
  Björn Steinbrink <B.Steinbrink@gmx.de>
  Björn Töpel <bjorn@kernel.org> <bjorn.topel@gmail.com>
  Björn Töpel <bjorn@kernel.org> <bjorn.topel@intel.com>
@@@ -101,7 -98,8 +101,7 @@@ Christian Brauner <brauner@kernel.org> 
  Christian Marangi <ansuelsmth@gmail.com>
  Christophe Ricard <christophe.ricard@gmail.com>
  Christoph Hellwig <hch@lst.de>
 -Colin Ian King <colin.king@intel.com> <colin.king@canonical.com>
 -Colin Ian King <colin.king@intel.com> <colin.i.king@gmail.com>
 +Colin Ian King <colin.i.king@gmail.com> <colin.king@canonical.com>
  Corey Minyard <minyard@acm.org>
  Damian Hobson-Garcia <dhobsong@igel.co.jp>
  Daniel Borkmann <daniel@iogearbox.net> <danborkmann@googlemail.com>
@@@ -152,8 -150,6 +152,8 @@@ Greg Kroah-Hartman <gregkh@suse.de
  Greg Kroah-Hartman <greg@kroah.com>
  Greg Kurz <groug@kaod.org> <gkurz@linux.vnet.ibm.com>
  Gregory CLEMENT <gregory.clement@bootlin.com> <gregory.clement@free-electrons.com>
 +Guilherme G. Piccoli <kernel@gpiccoli.net> <gpiccoli@linux.vnet.ibm.com>
 +Guilherme G. Piccoli <kernel@gpiccoli.net> <gpiccoli@canonical.com>
  Guo Ren <guoren@kernel.org> <guoren@linux.alibaba.com>
  Guo Ren <guoren@kernel.org> <ren_guo@c-sky.com>
  Gustavo Padovan <gustavo@las.ic.unicamp.br>
@@@ -257,7 -253,6 +257,7 @@@ Linus Lüssing <linus.luessing@c0d3.blu
  Li Yang <leoyang.li@nxp.com> <leoli@freescale.com>
  Li Yang <leoyang.li@nxp.com> <leo@zh-kernel.org>
  Lorenzo Pieralisi <lpieralisi@kernel.org> <lorenzo.pieralisi@arm.com>
 +Luca Ceresoli <luca.ceresoli@bootlin.com> <luca@lucaceresoli.net>
  Lukasz Luba <lukasz.luba@arm.com> <l.luba@partner.samsung.com>
  Maciej W. Rozycki <macro@mips.com> <macro@imgtec.com>
  Maciej W. Rozycki <macro@orcam.me.uk> <macro@linux-mips.org>
@@@ -318,7 -313,6 +318,7 @@@ Morten Welinder <welinder@troll.com
  Mythri P K <mythripk@ti.com>
  Nadia Yvette Chambers <nyc@holomorphy.com> William Lee Irwin III <wli@holomorphy.com>
  Nathan Chancellor <nathan@kernel.org> <natechancellor@gmail.com>
 +Neil Armstrong <neil.armstrong@linaro.org> <narmstrong@baylibre.com>
  Nguyen Anh Quynh <aquynh@gmail.com>
  Nicholas Piggin <npiggin@gmail.com> <npiggen@suse.de>
  Nicholas Piggin <npiggin@gmail.com> <npiggin@kernel.dk>
@@@ -336,6 -330,7 +336,7 @@@ Oleksij Rempel <linux@rempel-privat.de
  Oleksij Rempel <linux@rempel-privat.de> <fixed-term.Oleksij.Rempel@de.bosch.com>
  Oleksij Rempel <linux@rempel-privat.de> <o.rempel@pengutronix.de>
  Oleksij Rempel <linux@rempel-privat.de> <ore@pengutronix.de>
+ Oliver Upton <oliver.upton@linux.dev> <oupton@google.com>
  Pali Rohár <pali@kernel.org> <pali.rohar@gmail.com>
  Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
  Patrick Mochel <mochel@digitalimplant.org>
diff --combined arch/arm64/kvm/arm.c
index 917086be5c6b105186f7b4d8de5754738876c640,4f949b64fdc91afab531f36c55b5c177f610d03e..446f628a9de1837023f4cb354bda9d2e649fdad3
@@@ -666,7 -666,6 +666,6 @@@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu
  
        kvm_vcpu_halt(vcpu);
        vcpu_clear_flag(vcpu, IN_WFIT);
-       kvm_clear_request(KVM_REQ_UNHALT, vcpu);
  
        preempt_disable();
        vgic_v4_load(vcpu);
@@@ -2114,7 -2113,7 +2113,7 @@@ static int finalize_hyp_mode(void
         * at, which would end badly once inaccessible.
         */
        kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start);
 -      kmemleak_free_part(__va(hyp_mem_base), hyp_mem_size);
 +      kmemleak_free_part_phys(hyp_mem_base, hyp_mem_size);
        return pkvm_drop_host_privileges();
  }
  
diff --combined arch/s390/kvm/kvm-s390.c
index b7ef0b71014df3dd8d92467cad741bfbdf0f06ea,aa39ea4582bd1acfa83f33d360ba1d1b69bac8f7..45d4b8182b0734c27e70e8582d043b8ffba7b554
@@@ -505,7 -505,7 +505,7 @@@ int kvm_arch_init(void *opaque
                goto out;
        }
  
 -      if (kvm_s390_pci_interp_allowed()) {
 +      if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) {
                rc = kvm_s390_pci_init();
                if (rc) {
                        pr_err("Unable to allocate AIFT for PCI\n");
@@@ -527,7 -527,7 +527,7 @@@ out
  void kvm_arch_exit(void)
  {
        kvm_s390_gib_destroy();
 -      if (kvm_s390_pci_interp_allowed())
 +      if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM))
                kvm_s390_pci_exit();
        debug_unregister(kvm_s390_dbf);
        debug_unregister(kvm_s390_dbf_uv);
@@@ -4343,8 -4343,6 +4343,6 @@@ retry
                goto retry;
        }
  
-       /* nothing to do, just clear the request */
-       kvm_clear_request(KVM_REQ_UNHALT, vcpu);
        /* we left the vsie handler, nothing to do, just clear the request */
        kvm_clear_request(KVM_REQ_VSIE_RESTART, vcpu);
  
diff --combined arch/x86/include/asm/kvm_host.h
index aa381ab69a1911ffb5afde4e49dc26ab67edc783,d40206b16d6cc260e3141f73484390cd00bcc4fb..61b9dd34d333ec928521908971116d954db574ee
@@@ -615,6 -615,8 +615,8 @@@ struct kvm_vcpu_hv 
                u32 enlightenments_eax; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EAX */
                u32 enlightenments_ebx; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EBX */
                u32 syndbg_cap_eax; /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */
+               u32 nested_eax; /* HYPERV_CPUID_NESTED_FEATURES.EAX */
+               u32 nested_ebx; /* HYPERV_CPUID_NESTED_FEATURES.EBX */
        } cpuid_cache;
  };
  
@@@ -639,6 -641,16 +641,16 @@@ struct kvm_vcpu_xen 
        struct timer_list poll_timer;
  };
  
+ struct kvm_queued_exception {
+       bool pending;
+       bool injected;
+       bool has_error_code;
+       u8 vector;
+       u32 error_code;
+       unsigned long payload;
+       bool has_payload;
+ };
  struct kvm_vcpu_arch {
        /*
         * rip and regs accesses must go through
        struct fpu_guest guest_fpu;
  
        u64 xcr0;
 +      u64 guest_supported_xcr0;
  
        struct kvm_pio_request pio;
        void *pio_data;
  
        u8 event_exit_inst_len;
  
-       struct kvm_queued_exception {
-               bool pending;
-               bool injected;
-               bool has_error_code;
-               u8 nr;
-               u32 error_code;
-               unsigned long payload;
-               bool has_payload;
-               u8 nested_apf;
-       } exception;
+       bool exception_from_userspace;
+       /* Exceptions to be injected to the guest. */
+       struct kvm_queued_exception exception;
+       /* Exception VM-Exits to be synthesized to L1. */
+       struct kvm_queued_exception exception_vmexit;
  
        struct kvm_queued_interrupt {
                bool injected;
                u32 id;
                bool send_user_only;
                u32 host_apf_flags;
-               unsigned long nested_apf_token;
                bool delivery_as_pf_vmexit;
                bool pageready_pending;
        } apf;
@@@ -1524,7 -1530,7 +1531,7 @@@ struct kvm_x86_ops 
                                unsigned char *hypercall_addr);
        void (*inject_irq)(struct kvm_vcpu *vcpu, bool reinjected);
        void (*inject_nmi)(struct kvm_vcpu *vcpu);
-       void (*queue_exception)(struct kvm_vcpu *vcpu);
+       void (*inject_exception)(struct kvm_vcpu *vcpu);
        void (*cancel_injection)(struct kvm_vcpu *vcpu);
        int (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
        int (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
  
  struct kvm_x86_nested_ops {
        void (*leave_nested)(struct kvm_vcpu *vcpu);
+       bool (*is_exception_vmexit)(struct kvm_vcpu *vcpu, u8 vector,
+                                   u32 error_code);
        int (*check_events)(struct kvm_vcpu *vcpu);
-       bool (*handle_page_fault_workaround)(struct kvm_vcpu *vcpu,
-                                            struct x86_exception *fault);
-       bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
+       bool (*has_events)(struct kvm_vcpu *vcpu);
        void (*triple_fault)(struct kvm_vcpu *vcpu);
        int (*get_state)(struct kvm_vcpu *vcpu,
                         struct kvm_nested_state __user *user_kvm_nested_state,
@@@ -1863,7 -1869,7 +1870,7 @@@ void kvm_queue_exception_p(struct kvm_v
  void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
  void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
  void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
- bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+ void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
                                    struct x86_exception *fault);
  bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
  bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr);
diff --combined arch/x86/kvm/cpuid.c
index 2796dde06302a901a2dba41df1e7bb64d04992c9,ffdc28684cb7981f7395fae81e54dd5e361c727f..7065462378e2933d7c76711a54cc64c70140443a
@@@ -311,10 -311,20 +311,19 @@@ void kvm_update_cpuid_runtime(struct kv
  }
  EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime);
  
+ static bool kvm_cpuid_has_hyperv(struct kvm_cpuid_entry2 *entries, int nent)
+ {
+       struct kvm_cpuid_entry2 *entry;
+       entry = cpuid_entry2_find(entries, nent, HYPERV_CPUID_INTERFACE,
+                                 KVM_CPUID_INDEX_NOT_SIGNIFICANT);
+       return entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX;
+ }
  static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
        struct kvm_cpuid_entry2 *best;
 -      u64 guest_supported_xcr0;
  
        best = kvm_find_cpuid_entry(vcpu, 1);
        if (best && apic) {
                kvm_apic_set_version(vcpu);
        }
  
 -      guest_supported_xcr0 =
 +      vcpu->arch.guest_supported_xcr0 =
                cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
  
 -      vcpu->arch.guest_fpu.fpstate->user_xfeatures = guest_supported_xcr0;
 +      /*
 +       * FP+SSE can always be saved/restored via KVM_{G,S}ET_XSAVE, even if
 +       * XSAVE/XCR0 are not exposed to the guest, and even if XSAVE isn't
 +       * supported by the host.
 +       */
 +      vcpu->arch.guest_fpu.fpstate->user_xfeatures = vcpu->arch.guest_supported_xcr0 |
 +                                                     XFEATURE_MASK_FPSSE;
  
        kvm_update_pv_runtime(vcpu);
  
        vcpu->arch.cr4_guest_rsvd_bits =
            __cr4_reserved_bits(guest_cpuid_has, vcpu);
  
-       kvm_hv_set_cpuid(vcpu);
+       kvm_hv_set_cpuid(vcpu, kvm_cpuid_has_hyperv(vcpu->arch.cpuid_entries,
+                                                   vcpu->arch.cpuid_nent));
  
        /* Invoke the vendor callback only after the above state is updated. */
        static_call(kvm_x86_vcpu_after_set_cpuid)(vcpu);
@@@ -409,6 -414,12 +419,12 @@@ static int kvm_set_cpuid(struct kvm_vcp
                return 0;
        }
  
+       if (kvm_cpuid_has_hyperv(e2, nent)) {
+               r = kvm_hv_vcpu_init(vcpu);
+               if (r)
+                       return r;
+       }
        r = kvm_check_cpuid(vcpu, e2, nent);
        if (r)
                return r;
@@@ -902,6 -913,8 +918,6 @@@ static inline int __do_cpuid_func(struc
                        entry->edx = 0;
                }
                break;
 -      case 9:
 -              break;
        case 0xa: { /* Architectural Performance Monitoring */
                union cpuid10_eax eax;
                union cpuid10_edx edx;
diff --combined arch/x86/kvm/emulate.c
index aacb28c83e437d83712ed029c47f59ffc7c6cde7,b6180032dfd6c0f888655d1bfe77c33d4ed994cb..3b27622d46425b58c7285f11f60b346a550c8f84
@@@ -479,7 -479,7 +479,7 @@@ FOP_END
  
  /*
   * XXX: inoutclob user must know where the argument is being expanded.
 - *      Relying on CONFIG_CC_HAS_ASM_GOTO would allow us to remove _fault.
 + *      Using asm goto would allow us to remove _fault.
   */
  #define asm_safe(insn, inoutclob...) \
  ({ \
@@@ -1137,9 -1137,11 +1137,11 @@@ static int em_fnstsw(struct x86_emulate
  static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
                                    struct operand *op)
  {
-       unsigned reg = ctxt->modrm_reg;
+       unsigned int reg;
  
-       if (!(ctxt->d & ModRM))
+       if (ctxt->d & ModRM)
+               reg = ctxt->modrm_reg;
+       else
                reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3);
  
        if (ctxt->d & Sse) {
@@@ -1953,7 -1955,7 +1955,7 @@@ static int em_pop_sreg(struct x86_emula
        if (rc != X86EMUL_CONTINUE)
                return rc;
  
-       if (ctxt->modrm_reg == VCPU_SREG_SS)
+       if (seg == VCPU_SREG_SS)
                ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
        if (ctxt->op_bytes > 2)
                rsp_increment(ctxt, ctxt->op_bytes - 2);
@@@ -3645,13 -3647,10 +3647,10 @@@ static int em_wrmsr(struct x86_emulate_
                | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
        r = ctxt->ops->set_msr_with_filter(ctxt, msr_index, msr_data);
  
-       if (r == X86EMUL_IO_NEEDED)
-               return r;
-       if (r > 0)
+       if (r == X86EMUL_PROPAGATE_FAULT)
                return emulate_gp(ctxt, 0);
  
-       return r < 0 ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
+       return r;
  }
  
  static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
  
        r = ctxt->ops->get_msr_with_filter(ctxt, msr_index, &msr_data);
  
-       if (r == X86EMUL_IO_NEEDED)
-               return r;
-       if (r)
+       if (r == X86EMUL_PROPAGATE_FAULT)
                return emulate_gp(ctxt, 0);
  
-       *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data;
-       *reg_write(ctxt, VCPU_REGS_RDX) = msr_data >> 32;
-       return X86EMUL_CONTINUE;
+       if (r == X86EMUL_CONTINUE) {
+               *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data;
+               *reg_write(ctxt, VCPU_REGS_RDX) = msr_data >> 32;
+       }
+       return r;
  }
  
  static int em_store_sreg(struct x86_emulate_ctxt *ctxt, int segment)
@@@ -4132,9 -4130,6 +4130,9 @@@ static int em_xsetbv(struct x86_emulate
  {
        u32 eax, ecx, edx;
  
 +      if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSXSAVE))
 +              return emulate_ud(ctxt);
 +
        eax = reg_read(ctxt, VCPU_REGS_RAX);
        edx = reg_read(ctxt, VCPU_REGS_RDX);
        ecx = reg_read(ctxt, VCPU_REGS_RCX);
@@@ -4171,8 -4166,7 +4169,7 @@@ static int check_dr7_gd(struct x86_emul
  
        ctxt->ops->get_dr(ctxt, 7, &dr7);
  
-       /* Check if DR7.Global_Enable is set */
-       return dr7 & (1 << 13);
+       return dr7 & DR7_GD;
  }
  
  static int check_dr_read(struct x86_emulate_ctxt *ctxt)
diff --combined arch/x86/kvm/mmu/mmu.c
index 3552e6af3684437f66f1d6a864499095bf176c74,40feb5ec761e7e2ce0c7464699e1a25e6d4daf48..6f81539061d6485905e5a2e50a49293096f16035
@@@ -1596,8 -1596,6 +1596,8 @@@ static void __rmap_add(struct kvm *kvm
        rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
        rmap_count = pte_list_add(cache, spte, rmap_head);
  
 +      if (rmap_count > kvm->stat.max_mmu_rmap_size)
 +              kvm->stat.max_mmu_rmap_size = rmap_count;
        if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
                kvm_zap_all_rmap_sptes(kvm, rmap_head);
                kvm_flush_remote_tlbs_with_address(
@@@ -1667,6 -1665,18 +1667,18 @@@ static inline void kvm_mod_used_mmu_pag
        percpu_counter_add(&kvm_total_used_mmu_pages, nr);
  }
  
+ static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+ {
+       kvm_mod_used_mmu_pages(kvm, +1);
+       kvm_account_pgtable_pages((void *)sp->spt, +1);
+ }
+ static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+ {
+       kvm_mod_used_mmu_pages(kvm, -1);
+       kvm_account_pgtable_pages((void *)sp->spt, -1);
+ }
  static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
  {
        MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
@@@ -2124,7 -2134,7 +2136,7 @@@ static struct kvm_mmu_page *kvm_mmu_all
         */
        sp->mmu_valid_gen = kvm->arch.mmu_valid_gen;
        list_add(&sp->link, &kvm->arch.active_mmu_pages);
-       kvm_mod_used_mmu_pages(kvm, +1);
+       kvm_account_mmu_page(kvm, sp);
  
        sp->gfn = gfn;
        sp->role = role;
@@@ -2458,7 -2468,7 +2470,7 @@@ static bool __kvm_mmu_prepare_zap_page(
                        list_add(&sp->link, invalid_list);
                else
                        list_move(&sp->link, invalid_list);
-               kvm_mod_used_mmu_pages(kvm, -1);
+               kvm_unaccount_mmu_page(kvm, sp);
        } else {
                /*
                 * Remove the active root from the active page list, the root
@@@ -4292,7 -4302,7 +4304,7 @@@ int kvm_handle_page_fault(struct kvm_vc
  
        vcpu->arch.l1tf_flush_l1d = true;
        if (!flags) {
-               trace_kvm_page_fault(fault_address, error_code);
+               trace_kvm_page_fault(vcpu, fault_address, error_code);
  
                if (kvm_event_needs_reinjection(vcpu))
                        kvm_mmu_unprotect_page_virt(vcpu, fault_address);
@@@ -6704,10 -6714,12 +6716,12 @@@ int kvm_mmu_vendor_module_init(void
  
        ret = register_shrinker(&mmu_shrinker, "x86-mmu");
        if (ret)
-               goto out;
+               goto out_shrinker;
  
        return 0;
  
+ out_shrinker:
+       percpu_counter_destroy(&kvm_total_used_mmu_pages);
  out:
        mmu_destroy_caches();
        return ret;
diff --combined arch/x86/kvm/vmx/nested.c
index 7eaf96064cb0e1375a1b01547b3a75db4080b70a,8f67a9c4a28706360abd9a8c841ca7e0a132225c..0c62352dda6abc9bf72dfaaaa760cc5bb78bbcbf
@@@ -439,61 -439,22 +439,22 @@@ static bool nested_vmx_is_page_fault_vm
        return inequality ^ bit;
  }
  
- /*
-  * KVM wants to inject page-faults which it got to the guest. This function
-  * checks whether in a nested guest, we need to inject them to L1 or L2.
-  */
- static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
+ static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
+                                          u32 error_code)
  {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-       unsigned int nr = vcpu->arch.exception.nr;
-       bool has_payload = vcpu->arch.exception.has_payload;
-       unsigned long payload = vcpu->arch.exception.payload;
-       if (nr == PF_VECTOR) {
-               if (vcpu->arch.exception.nested_apf) {
-                       *exit_qual = vcpu->arch.apf.nested_apf_token;
-                       return 1;
-               }
-               if (nested_vmx_is_page_fault_vmexit(vmcs12,
-                                                   vcpu->arch.exception.error_code)) {
-                       *exit_qual = has_payload ? payload : vcpu->arch.cr2;
-                       return 1;
-               }
-       } else if (vmcs12->exception_bitmap & (1u << nr)) {
-               if (nr == DB_VECTOR) {
-                       if (!has_payload) {
-                               payload = vcpu->arch.dr6;
-                               payload &= ~DR6_BT;
-                               payload ^= DR6_ACTIVE_LOW;
-                       }
-                       *exit_qual = payload;
-               } else
-                       *exit_qual = 0;
-               return 1;
-       }
  
-       return 0;
- }
- static bool nested_vmx_handle_page_fault_workaround(struct kvm_vcpu *vcpu,
-                                                   struct x86_exception *fault)
- {
-       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-       WARN_ON(!is_guest_mode(vcpu));
+       /*
+        * Drop bits 31:16 of the error code when performing the #PF mask+match
+        * check.  All VMCS fields involved are 32 bits, but Intel CPUs never
+        * set bits 31:16 and VMX disallows setting bits 31:16 in the injected
+        * error code.  Including the to-be-dropped bits in the check might
+        * result in an "impossible" or missed exit from L1's perspective.
+        */
+       if (vector == PF_VECTOR)
+               return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code);
  
-       if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
-           !WARN_ON_ONCE(to_vmx(vcpu)->nested.nested_run_pending)) {
-               vmcs12->vm_exit_intr_error_code = fault->error_code;
-               nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
-                                 PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
-                                 INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
-                                 fault->address);
-               return true;
-       }
-       return false;
+       return (vmcs12->exception_bitmap & (1u << vector));
  }
  
  static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
@@@ -1607,6 -1568,10 +1568,10 @@@ static void copy_enlightened_to_vmcs12(
                vmcs12->guest_rflags = evmcs->guest_rflags;
                vmcs12->guest_interruptibility_info =
                        evmcs->guest_interruptibility_info;
+               /*
+                * Not present in struct vmcs12:
+                * vmcs12->guest_ssp = evmcs->guest_ssp;
+                */
        }
  
        if (unlikely(!(hv_clean_fields &
                vmcs12->host_fs_selector = evmcs->host_fs_selector;
                vmcs12->host_gs_selector = evmcs->host_gs_selector;
                vmcs12->host_tr_selector = evmcs->host_tr_selector;
+               vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl;
+               /*
+                * Not present in struct vmcs12:
+                * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet;
+                * vmcs12->host_ssp = evmcs->host_ssp;
+                * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr;
+                */
        }
  
        if (unlikely(!(hv_clean_fields &
                vmcs12->tsc_offset = evmcs->tsc_offset;
                vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
                vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
+               vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap;
+               vmcs12->tsc_multiplier = evmcs->tsc_multiplier;
        }
  
        if (unlikely(!(hv_clean_fields &
                vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
                vmcs12->guest_activity_state = evmcs->guest_activity_state;
                vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
+               vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl;
+               /*
+                * Not present in struct vmcs12:
+                * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet;
+                * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl;
+                * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr;
+                */
        }
  
        /*
@@@ -1869,12 -1850,23 +1850,23 @@@ static void copy_vmcs12_to_enlightened(
         * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
         * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
         * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
+        * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl;
+        * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl;
+        * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap;
+        * evmcs->tsc_multiplier = vmcs12->tsc_multiplier;
         *
         * Not present in struct vmcs12:
         * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
         * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
         * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
         * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
+        * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet;
+        * evmcs->host_ssp = vmcs12->host_ssp;
+        * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr;
+        * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet;
+        * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl;
+        * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr;
+        * evmcs->guest_ssp = vmcs12->guest_ssp;
         */
  
        evmcs->guest_es_selector = vmcs12->guest_es_selector;
@@@ -1982,7 -1974,7 +1974,7 @@@ static enum nested_evmptrld_status nest
        bool evmcs_gpa_changed = false;
        u64 evmcs_gpa;
  
-       if (likely(!vmx->nested.enlightened_vmcs_enabled))
+       if (likely(!guest_cpuid_has_evmcs(vcpu)))
                return EVMPTRLD_DISABLED;
  
        if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) {
@@@ -2328,9 -2320,14 +2320,14 @@@ static void prepare_vmcs02_early(struc
         * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
         * on the related bits (if supported by the CPU) in the hope that
         * we can avoid VMWrites during vmx_set_efer().
+        *
+        * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is
+        * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to
+        * do the same for L2.
         */
        exec_control = __vm_entry_controls_get(vmcs01);
-       exec_control |= vmcs12->vm_entry_controls;
+       exec_control |= (vmcs12->vm_entry_controls &
+                        ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
        exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
        if (cpu_has_load_ia32_efer()) {
                if (guest_efer & EFER_LMA)
@@@ -2570,7 -2567,7 +2567,7 @@@ static int prepare_vmcs02(struct kvm_vc
         * bits which we consider mandatory enabled.
         * The CR0_READ_SHADOW is what L2 should have expected to read given
         * the specifications by L1; It's not enough to take
 -       * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we
 +       * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we
         * have more bits than L1 expected.
         */
        vmx_set_cr0(vcpu, vmcs12->guest_cr0);
@@@ -2863,7 -2860,7 +2860,7 @@@ static int nested_vmx_check_controls(st
            nested_check_vm_entry_controls(vcpu, vmcs12))
                return -EINVAL;
  
-       if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled)
+       if (guest_cpuid_has_evmcs(vcpu))
                return nested_evmcs_check_controls(vmcs12);
  
        return 0;
@@@ -3145,7 -3142,7 +3142,7 @@@ static bool nested_get_evmcs_page(struc
         * L2 was running), map it here to make sure vmcs12 changes are
         * properly reflected.
         */
-       if (vmx->nested.enlightened_vmcs_enabled &&
+       if (guest_cpuid_has_evmcs(vcpu) &&
            vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
                enum nested_evmptrld_status evmptrld_status =
                        nested_vmx_handle_enlightened_vmptrld(vcpu, false);
@@@ -3364,12 -3361,24 +3361,24 @@@ enum nvmx_vmentry_status nested_vmx_ent
        };
        u32 failed_index;
  
+       trace_kvm_nested_vmenter(kvm_rip_read(vcpu),
+                                vmx->nested.current_vmptr,
+                                vmcs12->guest_rip,
+                                vmcs12->guest_intr_status,
+                                vmcs12->vm_entry_intr_info_field,
+                                vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT,
+                                vmcs12->ept_pointer,
+                                vmcs12->guest_cr3,
+                                KVM_ISA_VMX);
        kvm_service_local_tlb_flush_requests(vcpu);
  
        evaluate_pending_interrupts = exec_controls_get(vmx) &
                (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
        if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
                evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
+       if (!evaluate_pending_interrupts)
+               evaluate_pending_interrupts |= kvm_apic_has_pending_init_or_sipi(vcpu);
  
        if (!vmx->nested.nested_run_pending ||
            !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
        }
  
        /*
-        * If L1 had a pending IRQ/NMI until it executed
-        * VMLAUNCH/VMRESUME which wasn't delivered because it was
-        * disallowed (e.g. interrupts disabled), L0 needs to
-        * evaluate if this pending event should cause an exit from L2
-        * to L1 or delivered directly to L2 (e.g. In case L1 don't
-        * intercept EXTERNAL_INTERRUPT).
-        *
-        * Usually this would be handled by the processor noticing an
-        * IRQ/NMI window request, or checking RVI during evaluation of
-        * pending virtual interrupts.  However, this setting was done
-        * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
-        * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
+        * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI
+        * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can
+        * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit
+        * unconditionally.
         */
        if (unlikely(evaluate_pending_interrupts))
                kvm_make_request(KVM_REQ_EVENT, vcpu);
@@@ -3718,7 -3719,7 +3719,7 @@@ static void vmcs12_save_pending_event(s
             is_double_fault(exit_intr_info))) {
                vmcs12->idt_vectoring_info_field = 0;
        } else if (vcpu->arch.exception.injected) {
-               nr = vcpu->arch.exception.nr;
+               nr = vcpu->arch.exception.vector;
                idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
  
                if (kvm_exception_is_soft(nr)) {
@@@ -3819,19 -3820,40 +3820,40 @@@ mmio_needed
        return -ENXIO;
  }
  
- static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
-                                              unsigned long exit_qual)
+ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu)
  {
+       struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
+       u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-       unsigned int nr = vcpu->arch.exception.nr;
-       u32 intr_info = nr | INTR_INFO_VALID_MASK;
+       unsigned long exit_qual;
+       if (ex->has_payload) {
+               exit_qual = ex->payload;
+       } else if (ex->vector == PF_VECTOR) {
+               exit_qual = vcpu->arch.cr2;
+       } else if (ex->vector == DB_VECTOR) {
+               exit_qual = vcpu->arch.dr6;
+               exit_qual &= ~DR6_BT;
+               exit_qual ^= DR6_ACTIVE_LOW;
+       } else {
+               exit_qual = 0;
+       }
  
-       if (vcpu->arch.exception.has_error_code) {
-               vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
+       if (ex->has_error_code) {
+               /*
+                * Intel CPUs do not generate error codes with bits 31:16 set,
+                * and more importantly VMX disallows setting bits 31:16 in the
+                * injected error code for VM-Entry.  Drop the bits to mimic
+                * hardware and avoid inducing failure on nested VM-Entry if L1
+                * chooses to inject the exception back to L2.  AMD CPUs _do_
+                * generate "full" 32-bit error codes, so KVM allows userspace
+                * to inject exception error codes with bits 31:16 set.
+                */
+               vmcs12->vm_exit_intr_error_code = (u16)ex->error_code;
                intr_info |= INTR_INFO_DELIVER_CODE_MASK;
        }
  
-       if (kvm_exception_is_soft(nr))
+       if (kvm_exception_is_soft(ex->vector))
                intr_info |= INTR_TYPE_SOFT_EXCEPTION;
        else
                intr_info |= INTR_TYPE_HARD_EXCEPTION;
  }
  
  /*
-  * Returns true if a debug trap is pending delivery.
+  * Returns true if a debug trap is (likely) pending delivery.  Infer the class
+  * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6).
+  * Using the payload is flawed because code breakpoints (fault-like) and data
+  * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e.
+  * this will return false positives if a to-be-injected code breakpoint #DB is
+  * pending (from KVM's perspective, but not "pending" across an instruction
+  * boundary).  ICEBP, a.k.a. INT1, is also not reflected here even though it
+  * too is trap-like.
   *
-  * In KVM, debug traps bear an exception payload. As such, the class of a #DB
-  * exception may be inferred from the presence of an exception payload.
+  * KVM "works" despite these flaws as ICEBP isn't currently supported by the
+  * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the
+  * #DB has already happened), and MTF isn't marked pending on code breakpoints
+  * from the emulator (because such #DBs are fault-like and thus don't trigger
+  * actions that fire on instruction retire).
+  */
+ static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex)
+ {
+       if (!ex->pending || ex->vector != DB_VECTOR)
+               return 0;
+       /* General Detect #DBs are always fault-like. */
+       return ex->payload & ~DR6_BD;
+ }
+ /*
+  * Returns true if there's a pending #DB exception that is lower priority than
+  * a pending Monitor Trap Flag VM-Exit.  TSS T-flag #DBs are not emulated by
+  * KVM, but could theoretically be injected by userspace.  Note, this code is
+  * imperfect, see above.
   */
- static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu)
+ static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex)
  {
-       return vcpu->arch.exception.pending &&
-                       vcpu->arch.exception.nr == DB_VECTOR &&
-                       vcpu->arch.exception.payload;
+       return vmx_get_pending_dbg_trap(ex) & ~DR6_BT;
  }
  
  /*
   */
  static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
  {
-       if (vmx_pending_dbg_trap(vcpu))
-               vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
-                           vcpu->arch.exception.payload);
+       unsigned long pending_dbg;
+       pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception);
+       if (pending_dbg)
+               vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg);
  }
  
  static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
               to_vmx(vcpu)->nested.preemption_timer_expired;
  }
  
+ static bool vmx_has_nested_events(struct kvm_vcpu *vcpu)
+ {
+       return nested_vmx_preemption_timer_pending(vcpu) ||
+              to_vmx(vcpu)->nested.mtf_pending;
+ }
+ /*
+  * Per the Intel SDM's table "Priority Among Concurrent Events", with minor
+  * edits to fill in missing examples, e.g. #DB due to split-lock accesses,
+  * and less minor edits to splice in the priority of VMX Non-Root specific
+  * events, e.g. MTF and NMI/INTR-window exiting.
+  *
+  * 1 Hardware Reset and Machine Checks
+  *    - RESET
+  *    - Machine Check
+  *
+  * 2 Trap on Task Switch
+  *    - T flag in TSS is set (on task switch)
+  *
+  * 3 External Hardware Interventions
+  *    - FLUSH
+  *    - STOPCLK
+  *    - SMI
+  *    - INIT
+  *
+  * 3.5 Monitor Trap Flag (MTF) VM-exit[1]
+  *
+  * 4 Traps on Previous Instruction
+  *    - Breakpoints
+  *    - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O
+  *      breakpoint, or #DB due to a split-lock access)
+  *
+  * 4.3        VMX-preemption timer expired VM-exit
+  *
+  * 4.6        NMI-window exiting VM-exit[2]
+  *
+  * 5 Nonmaskable Interrupts (NMI)
+  *
+  * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery
+  *
+  * 6 Maskable Hardware Interrupts
+  *
+  * 7 Code Breakpoint Fault
+  *
+  * 8 Faults from Fetching Next Instruction
+  *    - Code-Segment Limit Violation
+  *    - Code Page Fault
+  *    - Control protection exception (missing ENDBRANCH at target of indirect
+  *                                    call or jump)
+  *
+  * 9 Faults from Decoding Next Instruction
+  *    - Instruction length > 15 bytes
+  *    - Invalid Opcode
+  *    - Coprocessor Not Available
+  *
+  *10 Faults on Executing Instruction
+  *    - Overflow
+  *    - Bound error
+  *    - Invalid TSS
+  *    - Segment Not Present
+  *    - Stack fault
+  *    - General Protection
+  *    - Data Page Fault
+  *    - Alignment Check
+  *    - x86 FPU Floating-point exception
+  *    - SIMD floating-point exception
+  *    - Virtualization exception
+  *    - Control protection exception
+  *
+  * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs),
+  *     INIT signals, and higher priority events take priority over MTF VM exits.
+  *     MTF VM exits take priority over debug-trap exceptions and lower priority
+  *     events.
+  *
+  * [2] Debug-trap exceptions and higher priority events take priority over VM exits
+  *     caused by the VMX-preemption timer.  VM exits caused by the VMX-preemption
+  *     timer take priority over VM exits caused by the "NMI-window exiting"
+  *     VM-execution control and lower priority events.
+  *
+  * [3] Debug-trap exceptions and higher priority events take priority over VM exits
+  *     caused by "NMI-window exiting".  VM exits caused by this control take
+  *     priority over non-maskable interrupts (NMIs) and lower priority events.
+  *
+  * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to
+  *     the 1-setting of the "interrupt-window exiting" VM-execution control.  Thus,
+  *     non-maskable interrupts (NMIs) and higher priority events take priority over
+  *     delivery of a virtual interrupt; delivery of a virtual interrupt takes
+  *     priority over external interrupts and lower priority events.
+  */
  static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
  {
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       unsigned long exit_qual;
-       bool block_nested_events =
-           vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
-       bool mtf_pending = vmx->nested.mtf_pending;
        struct kvm_lapic *apic = vcpu->arch.apic;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
        /*
-        * Clear the MTF state. If a higher priority VM-exit is delivered first,
-        * this state is discarded.
+        * Only a pending nested run blocks a pending exception.  If there is a
+        * previously injected event, the pending exception occurred while said
+        * event was being delivered and thus needs to be handled.
         */
-       if (!block_nested_events)
-               vmx->nested.mtf_pending = false;
+       bool block_nested_exceptions = vmx->nested.nested_run_pending;
+       /*
+        * New events (not exceptions) are only recognized at instruction
+        * boundaries.  If an event needs reinjection, then KVM is handling a
+        * VM-Exit that occurred _during_ instruction execution; new events are
+        * blocked until the instruction completes.
+        */
+       bool block_nested_events = block_nested_exceptions ||
+                                  kvm_event_needs_reinjection(vcpu);
  
        if (lapic_in_kernel(vcpu) &&
                test_bit(KVM_APIC_INIT, &apic->pending_events)) {
                clear_bit(KVM_APIC_INIT, &apic->pending_events);
                if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
                        nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
+               /* MTF is discarded if the vCPU is in WFS. */
+               vmx->nested.mtf_pending = false;
                return 0;
        }
  
                        return -EBUSY;
  
                clear_bit(KVM_APIC_SIPI, &apic->pending_events);
-               if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
+               if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
                        nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0,
                                                apic->sipi_vector & 0xFFUL);
-               return 0;
+                       return 0;
+               }
+               /* Fallthrough, the SIPI is completely ignored. */
        }
  
        /*
-        * Process any exceptions that are not debug traps before MTF.
+        * Process exceptions that are higher priority than Monitor Trap Flag:
+        * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but
+        * could theoretically come in from userspace), and ICEBP (INT1).
         *
-        * Note that only a pending nested run can block a pending exception.
-        * Otherwise an injected NMI/interrupt should either be
-        * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO,
-        * while delivering the pending exception.
+        * TODO: SMIs have higher priority than MTF and trap-like #DBs (except
+        * for TSS T flag #DBs).  KVM also doesn't save/restore pending MTF
+        * across SMI/RSM as it should; that needs to be addressed in order to
+        * prioritize SMI over MTF and trap-like #DBs.
         */
-       if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) {
-               if (vmx->nested.nested_run_pending)
+       if (vcpu->arch.exception_vmexit.pending &&
+           !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) {
+               if (block_nested_exceptions)
                        return -EBUSY;
-               if (!nested_vmx_check_exception(vcpu, &exit_qual))
-                       goto no_vmexit;
-               nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+               nested_vmx_inject_exception_vmexit(vcpu);
                return 0;
        }
  
-       if (mtf_pending) {
+       if (vcpu->arch.exception.pending &&
+           !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) {
+               if (block_nested_exceptions)
+                       return -EBUSY;
+               goto no_vmexit;
+       }
+       if (vmx->nested.mtf_pending) {
                if (block_nested_events)
                        return -EBUSY;
                nested_vmx_update_pending_dbg(vcpu);
                return 0;
        }
  
-       if (vcpu->arch.exception.pending) {
-               if (vmx->nested.nested_run_pending)
+       if (vcpu->arch.exception_vmexit.pending) {
+               if (block_nested_exceptions)
                        return -EBUSY;
-               if (!nested_vmx_check_exception(vcpu, &exit_qual))
-                       goto no_vmexit;
-               nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+               nested_vmx_inject_exception_vmexit(vcpu);
                return 0;
        }
  
+       if (vcpu->arch.exception.pending) {
+               if (block_nested_exceptions)
+                       return -EBUSY;
+               goto no_vmexit;
+       }
        if (nested_vmx_preemption_timer_pending(vcpu)) {
                if (block_nested_events)
                        return -EBUSY;
@@@ -4255,14 -4412,6 +4412,6 @@@ static void prepare_vmcs12(struct kvm_v
                        nested_vmx_abort(vcpu,
                                         VMX_ABORT_SAVE_GUEST_MSR_FAIL);
        }
-       /*
-        * Drop what we picked up for L2 via vmx_complete_interrupts. It is
-        * preserved above and would only end up incorrectly in L1.
-        */
-       vcpu->arch.nmi_injected = false;
-       kvm_clear_exception_queue(vcpu);
-       kvm_clear_interrupt_queue(vcpu);
  }
  
  /*
@@@ -4538,6 -4687,9 +4687,9 @@@ void nested_vmx_vmexit(struct kvm_vcpu 
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
  
+       /* Pending MTF traps are discarded on VM-Exit. */
+       vmx->nested.mtf_pending = false;
        /* trying to cancel vmlaunch/vmresume is a bug */
        WARN_ON_ONCE(vmx->nested.nested_run_pending);
  
                WARN_ON_ONCE(nested_early_check);
        }
  
+       /*
+        * Drop events/exceptions that were queued for re-injection to L2
+        * (picked up via vmx_complete_interrupts()), as well as exceptions
+        * that were pending for L2.  Note, this must NOT be hoisted above
+        * prepare_vmcs12(), events/exceptions queued for re-injection need to
+        * be captured in vmcs12 (see vmcs12_save_pending_event()).
+        */
+       vcpu->arch.nmi_injected = false;
+       kvm_clear_exception_queue(vcpu);
+       kvm_clear_interrupt_queue(vcpu);
        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
  
        /* Update any VMCS fields that might have changed while L2 ran */
@@@ -5030,8 -5193,8 +5193,8 @@@ static int handle_vmxoff(struct kvm_vcp
  
        free_nested(vcpu);
  
-       /* Process a latched INIT during time CPU was in VMX operation */
-       kvm_make_request(KVM_REQ_EVENT, vcpu);
+       if (kvm_apic_has_pending_init_or_sipi(vcpu))
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
  
        return nested_vmx_succeed(vcpu);
  }
@@@ -5067,7 -5230,7 +5230,7 @@@ static int handle_vmclear(struct kvm_vc
         * state. It is possible that the area will stay mapped as
         * vmx->nested.hv_evmcs but this shouldn't be a problem.
         */
-       if (likely(!vmx->nested.enlightened_vmcs_enabled ||
+       if (likely(!guest_cpuid_has_evmcs(vcpu) ||
                   !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
                if (vmptr == vmx->nested.current_vmptr)
                        nested_release_vmcs12(vcpu);
@@@ -6463,6 -6626,9 +6626,9 @@@ static int vmx_set_nested_state(struct 
        if (ret)
                goto error_guest_mode;
  
+       if (vmx->nested.mtf_pending)
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
        return 0;
  
  error_guest_mode:
@@@ -6522,8 -6688,10 +6688,10 @@@ static u64 nested_vmx_calc_vmcs_enum_ms
   * bit in the high half is on if the corresponding bit in the control field
   * may be on. See also vmx_control_verify().
   */
- void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
+ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
  {
+       struct nested_vmx_msrs *msrs = &vmcs_conf->nested;
        /*
         * Note that as a general rule, the high half of the MSRs (bits in
         * the control fields which may be 1) should be initialized by the
         */
  
        /* pin-based controls */
-       rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
-               msrs->pinbased_ctls_low,
-               msrs->pinbased_ctls_high);
-       msrs->pinbased_ctls_low |=
+       msrs->pinbased_ctls_low =
                PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+       msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl;
        msrs->pinbased_ctls_high &=
                PIN_BASED_EXT_INTR_MASK |
                PIN_BASED_NMI_EXITING |
                PIN_BASED_VMX_PREEMPTION_TIMER;
  
        /* exit controls */
-       rdmsr(MSR_IA32_VMX_EXIT_CTLS,
-               msrs->exit_ctls_low,
-               msrs->exit_ctls_high);
        msrs->exit_ctls_low =
                VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
  
+       msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl;
        msrs->exit_ctls_high &=
  #ifdef CONFIG_X86_64
                VM_EXIT_HOST_ADDR_SPACE_SIZE |
  #endif
                VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
-               VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+               VM_EXIT_CLEAR_BNDCFGS;
        msrs->exit_ctls_high |=
                VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
                VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
-               VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
+               VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT |
+               VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
  
        /* We support free control of debug control saving. */
        msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
  
        /* entry controls */
-       rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
-               msrs->entry_ctls_low,
-               msrs->entry_ctls_high);
        msrs->entry_ctls_low =
                VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
+       msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl;
        msrs->entry_ctls_high &=
  #ifdef CONFIG_X86_64
                VM_ENTRY_IA32E_MODE |
  #endif
-               VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS |
-               VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+               VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
        msrs->entry_ctls_high |=
-               (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
+               (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER |
+                VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
  
        /* We support free control of debug control loading. */
        msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
  
        /* cpu-based controls */
-       rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
-               msrs->procbased_ctls_low,
-               msrs->procbased_ctls_high);
        msrs->procbased_ctls_low =
                CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+       msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl;
        msrs->procbased_ctls_high &=
                CPU_BASED_INTR_WINDOW_EXITING |
                CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
         * depend on CPUID bits, they are added later by
         * vmx_vcpu_after_set_cpuid.
         */
-       if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
-               rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
-                     msrs->secondary_ctls_low,
-                     msrs->secondary_ctls_high);
        msrs->secondary_ctls_low = 0;
+       msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
        msrs->secondary_ctls_high &=
                SECONDARY_EXEC_DESC |
                SECONDARY_EXEC_ENABLE_RDTSCP |
                msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
  
        /* miscellaneous data */
-       rdmsr(MSR_IA32_VMX_MISC,
-               msrs->misc_low,
-               msrs->misc_high);
-       msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
+       msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
        msrs->misc_low |=
                MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
                VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
@@@ -6814,9 -6972,9 +6972,9 @@@ __init int nested_vmx_hardware_setup(in
  
  struct kvm_x86_nested_ops vmx_nested_ops = {
        .leave_nested = vmx_leave_nested,
+       .is_exception_vmexit = nested_vmx_is_exception_vmexit,
        .check_events = vmx_check_nested_events,
-       .handle_page_fault_workaround = nested_vmx_handle_page_fault_workaround,
-       .hv_timer_pending = nested_vmx_preemption_timer_pending,
+       .has_events = vmx_has_nested_events,
        .triple_fault = nested_vmx_triple_fault,
        .get_state = vmx_get_nested_state,
        .set_state = vmx_set_nested_state,
diff --combined arch/x86/kvm/x86.c
index b0c47b41c264982c993a098e738fa2ef8f9e6add,eb9d2c23fb04ecdd94728f4541c18655c6e3579c..4bd5f8a751de91ffeb666e1be9c5db8ae3b65f36
@@@ -173,8 -173,13 +173,13 @@@ bool __read_mostly enable_vmware_backdo
  module_param(enable_vmware_backdoor, bool, S_IRUGO);
  EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
  
- static bool __read_mostly force_emulation_prefix = false;
- module_param(force_emulation_prefix, bool, S_IRUGO);
+ /*
+  * Flags to manipulate forced emulation behavior (any non-zero value will
+  * enable forced emulation).
+  */
+ #define KVM_FEP_CLEAR_RFLAGS_RF       BIT(1)
+ static int __read_mostly force_emulation_prefix;
+ module_param(force_emulation_prefix, int, 0644);
  
  int __read_mostly pi_inject_timer = -1;
  module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
@@@ -528,6 -533,7 +533,7 @@@ static int exception_class(int vector
  #define EXCPT_TRAP            1
  #define EXCPT_ABORT           2
  #define EXCPT_INTERRUPT               3
+ #define EXCPT_DB              4
  
  static int exception_type(int vector)
  {
  
        mask = 1 << vector;
  
-       /* #DB is trap, as instruction watchpoints are handled elsewhere */
-       if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
+       /*
+        * #DBs can be trap-like or fault-like, the caller must check other CPU
+        * state, e.g. DR6, to determine whether a #DB is a trap or fault.
+        */
+       if (mask & (1 << DB_VECTOR))
+               return EXCPT_DB;
+       if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR)))
                return EXCPT_TRAP;
  
        if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
        return EXCPT_FAULT;
  }
  
- void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
+ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
+                                  struct kvm_queued_exception *ex)
  {
-       unsigned nr = vcpu->arch.exception.nr;
-       bool has_payload = vcpu->arch.exception.has_payload;
-       unsigned long payload = vcpu->arch.exception.payload;
-       if (!has_payload)
+       if (!ex->has_payload)
                return;
  
-       switch (nr) {
+       switch (ex->vector) {
        case DB_VECTOR:
                /*
                 * "Certain debug exceptions may clear bit 0-3.  The
                 * So they need to be flipped for DR6.
                 */
                vcpu->arch.dr6 |= DR6_ACTIVE_LOW;
-               vcpu->arch.dr6 |= payload;
-               vcpu->arch.dr6 ^= payload & DR6_ACTIVE_LOW;
+               vcpu->arch.dr6 |= ex->payload;
+               vcpu->arch.dr6 ^= ex->payload & DR6_ACTIVE_LOW;
  
                /*
                 * The #DB payload is defined as compatible with the 'pending
                vcpu->arch.dr6 &= ~BIT(12);
                break;
        case PF_VECTOR:
-               vcpu->arch.cr2 = payload;
+               vcpu->arch.cr2 = ex->payload;
                break;
        }
  
-       vcpu->arch.exception.has_payload = false;
-       vcpu->arch.exception.payload = 0;
+       ex->has_payload = false;
+       ex->payload = 0;
  }
  EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
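As a worked example of the #DB payload merge above, the standalone sketch below applies the same bit operations to a bare dr6 value. DR6_ACTIVE_LOW, the RTM bit, and the bit-12 clearing are assumptions mirroring the x86 layout (0xffff0ff0, bit 16, and the undefined bit 12 respectively), not values taken from this patch.

#include <stdint.h>
#include <stdio.h>

/* Assumed constants mirroring the x86 DR6 layout. */
#define DR6_ACTIVE_LOW	0xffff0ff0ULL
#define DR6_RTM		(1ULL << 16)

/* Same merge as the DB_VECTOR case of kvm_deliver_exception_payload(). */
static uint64_t merge_db_payload(uint64_t dr6, uint64_t payload)
{
	dr6 |= DR6_ACTIVE_LOW;		 /* start with all active-low bits set */
	dr6 |= payload;			 /* set the positive-polarity bits */
	dr6 ^= payload & DR6_ACTIVE_LOW; /* flip the active-low bits back */
	dr6 &= ~(1ULL << 12);		 /* bit 12 is not defined in DR6 */
	return dr6;
}

int main(void)
{
	/* B1 hit: payload 0x2 yields DR6 = 0xffff0ff2 (RTM=1, i.e. not in RTM). */
	printf("%#llx\n", (unsigned long long)merge_db_payload(0, 0x2));
	/* B1 hit inside an RTM region: RTM is flipped to 0 -> 0xfffe0ff2. */
	printf("%#llx\n", (unsigned long long)merge_db_payload(0, 0x2 | DR6_RTM));
	return 0;
}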
  
+ static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vector,
+                                      bool has_error_code, u32 error_code,
+                                      bool has_payload, unsigned long payload)
+ {
+       struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
+       ex->vector = vector;
+       ex->injected = false;
+       ex->pending = true;
+       ex->has_error_code = has_error_code;
+       ex->error_code = error_code;
+       ex->has_payload = has_payload;
+       ex->payload = payload;
+ }
  static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
                unsigned nr, bool has_error, u32 error_code,
                bool has_payload, unsigned long payload, bool reinject)
  
        kvm_make_request(KVM_REQ_EVENT, vcpu);
  
+       /*
+        * If the exception is destined for L2 and isn't being reinjected,
+        * morph it to a VM-Exit if L1 wants to intercept the exception.  A
+        * previously injected exception is not checked because it was checked
+        * when it was originally queued, and re-checking is incorrect if _L1_
+        * injected the exception, in which case it's exempt from interception.
+        */
+       if (!reinject && is_guest_mode(vcpu) &&
+           kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) {
+               kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code,
+                                          has_payload, payload);
+               return;
+       }
        if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
        queue:
                if (reinject) {
                        /*
-                        * On vmentry, vcpu->arch.exception.pending is only
-                        * true if an event injection was blocked by
-                        * nested_run_pending.  In that case, however,
-                        * vcpu_enter_guest requests an immediate exit,
-                        * and the guest shouldn't proceed far enough to
-                        * need reinjection.
+                        * On VM-Entry, an exception can be pending if and only
+                        * if event injection was blocked by nested_run_pending.
+                        * In that case, however, vcpu_enter_guest() requests an
+                        * immediate exit, and the guest shouldn't proceed far
+                        * enough to need reinjection.
                         */
-                       WARN_ON_ONCE(vcpu->arch.exception.pending);
+                       WARN_ON_ONCE(kvm_is_exception_pending(vcpu));
                        vcpu->arch.exception.injected = true;
                        if (WARN_ON_ONCE(has_payload)) {
                                /*
                        vcpu->arch.exception.injected = false;
                }
                vcpu->arch.exception.has_error_code = has_error;
-               vcpu->arch.exception.nr = nr;
+               vcpu->arch.exception.vector = nr;
                vcpu->arch.exception.error_code = error_code;
                vcpu->arch.exception.has_payload = has_payload;
                vcpu->arch.exception.payload = payload;
                if (!is_guest_mode(vcpu))
-                       kvm_deliver_exception_payload(vcpu);
+                       kvm_deliver_exception_payload(vcpu,
+                                                     &vcpu->arch.exception);
                return;
        }
  
        /* to check exception */
-       prev_nr = vcpu->arch.exception.nr;
+       prev_nr = vcpu->arch.exception.vector;
        if (prev_nr == DF_VECTOR) {
                /* triple fault -> shutdown */
                kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
        }
        class1 = exception_class(prev_nr);
        class2 = exception_class(nr);
-       if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
-               || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
+       if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) ||
+           (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
                /*
-                * Generate double fault per SDM Table 5-5.  Set
-                * exception.pending = true so that the double fault
-                * can trigger a nested vmexit.
+                * Synthesize #DF.  Clear the previously injected or pending
+                * exception so as not to incorrectly trigger shutdown.
                 */
-               vcpu->arch.exception.pending = true;
                vcpu->arch.exception.injected = false;
-               vcpu->arch.exception.has_error_code = true;
-               vcpu->arch.exception.nr = DF_VECTOR;
-               vcpu->arch.exception.error_code = 0;
-               vcpu->arch.exception.has_payload = false;
-               vcpu->arch.exception.payload = 0;
-       } else
+               vcpu->arch.exception.pending = false;
+               kvm_queue_exception_e(vcpu, DF_VECTOR, 0);
+       } else {
                /* Replace the previous exception with a new one in the hope
                   that instruction re-execution will regenerate the lost
                   exception. */
                goto queue;
+       }
  }
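A compact way to read the benign/contributory/page-fault combination above is as a predicate over the two vectors. The sketch below mirrors that logic with local, illustrative definitions; only the vector numbers and the class rules follow the SDM, everything else is made up for the example (a pending #DF is handled separately above, as a triple fault).

#include <stdbool.h>

/* Illustrative vector numbers for the exceptions classified above. */
enum { DE = 0, TS = 10, NP = 11, SS = 12, GP = 13, PF = 14 };

enum exc_class { BENIGN, CONTRIBUTORY, PAGE_FAULT };

static enum exc_class classify(int vector)
{
	switch (vector) {
	case DE: case TS: case NP: case SS: case GP:
		return CONTRIBUTORY;
	case PF:
		return PAGE_FAULT;
	default:
		return BENIGN;
	}
}

/* True if hitting 'second' while delivering 'first' should synthesize #DF. */
static bool is_double_fault(int first, int second)
{
	enum exc_class c1 = classify(first), c2 = classify(second);

	return (c1 == CONTRIBUTORY && c2 == CONTRIBUTORY) ||
	       (c1 == PAGE_FAULT && c2 != BENIGN);
}

For instance, is_double_fault(PF, GP) is true (a #GP hit while delivering a #PF becomes #DF), whereas is_double_fault(GP, PF) is false, so that pair instead takes the replace-and-requeue path above.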
  
  void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
@@@ -729,20 -764,22 +764,22 @@@ static int complete_emulated_insn_gp(st
  void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
  {
        ++vcpu->stat.pf_guest;
-       vcpu->arch.exception.nested_apf =
-               is_guest_mode(vcpu) && fault->async_page_fault;
-       if (vcpu->arch.exception.nested_apf) {
-               vcpu->arch.apf.nested_apf_token = fault->address;
-               kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
-       } else {
+       /*
+        * Async #PF in L2 is always forwarded to L1 as a VM-Exit regardless of
+        * whether or not L1 wants to intercept "regular" #PF.
+        */
+       if (is_guest_mode(vcpu) && fault->async_page_fault)
+               kvm_queue_exception_vmexit(vcpu, PF_VECTOR,
+                                          true, fault->error_code,
+                                          true, fault->address);
+       else
                kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
                                        fault->address);
-       }
  }
  EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
  
- /* Returns true if the page fault was immediately morphed into a VM-Exit. */
- bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+ void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
                                    struct x86_exception *fault)
  {
        struct kvm_mmu *fault_mmu;
                kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
                                       fault_mmu->root.hpa);
  
-       /*
-        * A workaround for KVM's bad exception handling.  If KVM injected an
-        * exception into L2, and L2 encountered a #PF while vectoring the
-        * injected exception, manually check to see if L1 wants to intercept
-        * #PF, otherwise queuing the #PF will lead to #DF or a lost exception.
-        * In all other cases, defer the check to nested_ops->check_events(),
-        * which will correctly handle priority (this does not).  Note, other
-        * exceptions, e.g. #GP, are theoretically affected, #PF is simply the
-        * most problematic, e.g. when L0 and L1 are both intercepting #PF for
-        * shadow paging.
-        *
-        * TODO: Rewrite exception handling to track injected and pending
-        *       (VM-Exit) exceptions separately.
-        */
-       if (unlikely(vcpu->arch.exception.injected && is_guest_mode(vcpu)) &&
-           kvm_x86_ops.nested_ops->handle_page_fault_workaround(vcpu, fault))
-               return true;
        fault_mmu->inject_page_fault(vcpu, fault);
-       return false;
  }
  EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
  
@@@ -1011,10 -1029,15 +1029,10 @@@ void kvm_load_host_xsave_state(struct k
  }
  EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
  
 -static inline u64 kvm_guest_supported_xcr0(struct kvm_vcpu *vcpu)
 -{
 -      return vcpu->arch.guest_fpu.fpstate->user_xfeatures;
 -}
 -
  #ifdef CONFIG_X86_64
  static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu)
  {
 -      return kvm_guest_supported_xcr0(vcpu) & XFEATURE_MASK_USER_DYNAMIC;
 +      return vcpu->arch.guest_supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC;
  }
  #endif
  
@@@ -1037,7 -1060,7 +1055,7 @@@ static int __kvm_set_xcr(struct kvm_vcp
         * saving.  However, xcr0 bit 0 is always set, even if the
         * emulated CPU does not support XSAVE (see kvm_vcpu_reset()).
         */
 -      valid_bits = kvm_guest_supported_xcr0(vcpu) | XFEATURE_MASK_FP;
 +      valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
        if (xcr0 & ~valid_bits)
                return 1;
  
  
  int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
  {
 +      /* Note, #UD due to CR4.OSXSAVE=0 has priority over the intercept. */
        if (static_call(kvm_x86_get_cpl)(vcpu) != 0 ||
            __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) {
                kvm_inject_gp(vcpu, 0);
@@@ -1553,32 -1575,12 +1571,32 @@@ static const u32 msr_based_features_all
  static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
  static unsigned int num_msr_based_features;
  
 +/*
 + * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
 + * does not yet virtualize. These include:
 + *   10 - MISC_PACKAGE_CTRLS
 + *   11 - ENERGY_FILTERING_CTL
 + *   12 - DOITM
 + *   18 - FB_CLEAR_CTRL
 + *   21 - XAPIC_DISABLE_STATUS
 + *   23 - OVERCLOCKING_STATUS
 + */
 +
 +#define KVM_SUPPORTED_ARCH_CAP \
 +      (ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \
 +       ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
 +       ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
 +       ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
 +       ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO)
 +
  static u64 kvm_get_arch_capabilities(void)
  {
        u64 data = 0;
  
 -      if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
 +      if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
                rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
 +              data &= KVM_SUPPORTED_ARCH_CAP;
 +      }
  
        /*
         * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
                 */
        }
  
 -      /* Guests don't need to know "Fill buffer clear control" exists */
 -      data &= ~ARCH_CAP_FB_CLEAR_CTRL;
 -
        return data;
  }
  
@@@ -4841,7 -4846,7 +4859,7 @@@ static int kvm_vcpu_ready_for_interrupt
        return (kvm_arch_interrupt_allowed(vcpu) &&
                kvm_cpu_accept_dm_intr(vcpu) &&
                !kvm_event_needs_reinjection(vcpu) &&
-               !vcpu->arch.exception.pending);
+               !kvm_is_exception_pending(vcpu));
  }
  
  static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
@@@ -5016,25 -5021,38 +5034,38 @@@ static int kvm_vcpu_ioctl_x86_set_mce(s
  static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
                                               struct kvm_vcpu_events *events)
  {
+       struct kvm_queued_exception *ex;
        process_nmi(vcpu);
  
        if (kvm_check_request(KVM_REQ_SMI, vcpu))
                process_smi(vcpu);
  
        /*
-        * In guest mode, payload delivery should be deferred,
-        * so that the L1 hypervisor can intercept #PF before
-        * CR2 is modified (or intercept #DB before DR6 is
-        * modified under nVMX). Unless the per-VM capability,
-        * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of
-        * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we
-        * opportunistically defer the exception payload, deliver it if the
-        * capability hasn't been requested before processing a
-        * KVM_GET_VCPU_EVENTS.
+        * KVM's ABI only allows for one exception to be migrated.  Luckily,
+        * the only time there can be two queued exceptions is if there's a
+        * non-exiting _injected_ exception, and a pending exiting exception.
+        * In that case, ignore the VM-Exiting exception as it's an extension
+        * of the injected exception.
+        */
+       if (vcpu->arch.exception_vmexit.pending &&
+           !vcpu->arch.exception.pending &&
+           !vcpu->arch.exception.injected)
+               ex = &vcpu->arch.exception_vmexit;
+       else
+               ex = &vcpu->arch.exception;
+       /*
+        * In guest mode, payload delivery should be deferred if the exception
+        * will be intercepted by L1, e.g. KVM should not modify CR2 if L1
+        * intercepts #PF, ditto for DR6 and #DBs.  If the per-VM capability,
+        * KVM_CAP_EXCEPTION_PAYLOAD, is not set, userspace may or may not
+        * propagate the payload and so it cannot be safely deferred.  Deliver
+        * the payload if the capability hasn't been requested.
         */
        if (!vcpu->kvm->arch.exception_payload_enabled &&
-           vcpu->arch.exception.pending && vcpu->arch.exception.has_payload)
-               kvm_deliver_exception_payload(vcpu);
+           ex->pending && ex->has_payload)
+               kvm_deliver_exception_payload(vcpu, ex);
  
        /*
         * The API doesn't provide the instruction length for software
         * isn't advanced, we should expect to encounter the exception
         * again.
         */
-       if (kvm_exception_is_soft(vcpu->arch.exception.nr)) {
+       if (kvm_exception_is_soft(ex->vector)) {
                events->exception.injected = 0;
                events->exception.pending = 0;
        } else {
-               events->exception.injected = vcpu->arch.exception.injected;
-               events->exception.pending = vcpu->arch.exception.pending;
+               events->exception.injected = ex->injected;
+               events->exception.pending = ex->pending;
                /*
                 * For ABI compatibility, deliberately conflate
                 * pending and injected exceptions when
                 * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
                 */
                if (!vcpu->kvm->arch.exception_payload_enabled)
-                       events->exception.injected |=
-                               vcpu->arch.exception.pending;
+                       events->exception.injected |= ex->pending;
        }
-       events->exception.nr = vcpu->arch.exception.nr;
-       events->exception.has_error_code = vcpu->arch.exception.has_error_code;
-       events->exception.error_code = vcpu->arch.exception.error_code;
-       events->exception_has_payload = vcpu->arch.exception.has_payload;
-       events->exception_payload = vcpu->arch.exception.payload;
+       events->exception.nr = ex->vector;
+       events->exception.has_error_code = ex->has_error_code;
+       events->exception.error_code = ex->error_code;
+       events->exception_has_payload = ex->has_payload;
+       events->exception_payload = ex->payload;
  
        events->interrupt.injected =
                vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
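For context, the single slot filled in above is what userspace reads back with KVM_GET_VCPU_EVENTS. A minimal, illustrative sketch follows; error handling is elided and vcpu_fd is assumed to be an already-open vCPU file descriptor.

#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

/* Dump the one migratable exception slot of an existing vCPU. */
static void dump_exception(int vcpu_fd)
{
	struct kvm_vcpu_events events;

	if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &events))
		return;

	printf("vector=%u pending=%u injected=%u err=%u/%#x payload=%u/%#llx\n",
	       events.exception.nr, events.exception.pending,
	       events.exception.injected, events.exception.has_error_code,
	       events.exception.error_code, events.exception_has_payload,
	       (unsigned long long)events.exception_payload);
}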
@@@ -5131,9 -5148,22 +5161,22 @@@ static int kvm_vcpu_ioctl_x86_set_vcpu_
                return -EINVAL;
  
        process_nmi(vcpu);
+       /*
+        * Flag that userspace is stuffing an exception; the next KVM_RUN will
+        * morph the exception to a VM-Exit if appropriate.  Do this only for
+        * pending exceptions, as already-injected exceptions are not subject to
+        * interception.  Note, userspace that conflates pending and injected
+        * is hosed, and will incorrectly convert an injected exception into a
+        * pending exception, which in turn may cause a spurious VM-Exit.
+        */
+       vcpu->arch.exception_from_userspace = events->exception.pending;
+       vcpu->arch.exception_vmexit.pending = false;
        vcpu->arch.exception.injected = events->exception.injected;
        vcpu->arch.exception.pending = events->exception.pending;
-       vcpu->arch.exception.nr = events->exception.nr;
+       vcpu->arch.exception.vector = events->exception.nr;
        vcpu->arch.exception.has_error_code = events->exception.has_error_code;
        vcpu->arch.exception.error_code = events->exception.error_code;
        vcpu->arch.exception.has_payload = events->exception_has_payload;
@@@ -7257,6 -7287,7 +7300,7 @@@ static int kvm_can_emulate_insn(struct 
  int handle_ud(struct kvm_vcpu *vcpu)
  {
        static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
+       int fep_flags = READ_ONCE(force_emulation_prefix);
        int emul_type = EMULTYPE_TRAP_UD;
        char sig[5]; /* ud2; .ascii "kvm" */
        struct x86_exception e;
        if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0)))
                return 1;
  
-       if (force_emulation_prefix &&
+       if (fep_flags &&
            kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
                                sig, sizeof(sig), &e) == 0 &&
            memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) {
+               if (fep_flags & KVM_FEP_CLEAR_RFLAGS_RF)
+                       kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) & ~X86_EFLAGS_RF);
                kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
                emul_type = EMULTYPE_TRAP_UD_FORCED;
        }
@@@ -7933,14 -7966,20 +7979,20 @@@ static int emulator_get_msr_with_filter
        int r;
  
        r = kvm_get_msr_with_filter(vcpu, msr_index, pdata);
+       if (r < 0)
+               return X86EMUL_UNHANDLEABLE;
  
-       if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
-                                   complete_emulated_rdmsr, r)) {
-               /* Bounce to user space */
-               return X86EMUL_IO_NEEDED;
+       if (r) {
+               if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
+                                      complete_emulated_rdmsr, r))
+                       return X86EMUL_IO_NEEDED;
+               trace_kvm_msr_read_ex(msr_index);
+               return X86EMUL_PROPAGATE_FAULT;
        }
  
-       return r;
+       trace_kvm_msr_read(msr_index, *pdata);
+       return X86EMUL_CONTINUE;
  }
  
  static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt,
        int r;
  
        r = kvm_set_msr_with_filter(vcpu, msr_index, data);
+       if (r < 0)
+               return X86EMUL_UNHANDLEABLE;
  
-       if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
-                                   complete_emulated_msr_access, r)) {
-               /* Bounce to user space */
-               return X86EMUL_IO_NEEDED;
+       if (r) {
+               if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
+                                      complete_emulated_msr_access, r))
+                       return X86EMUL_IO_NEEDED;
+               trace_kvm_msr_write_ex(msr_index, data);
+               return X86EMUL_PROPAGATE_FAULT;
        }
  
-       return r;
+       trace_kvm_msr_write(msr_index, data);
+       return X86EMUL_CONTINUE;
  }
  
  static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
@@@ -8161,18 -8206,17 +8219,17 @@@ static void toggle_interruptibility(str
        }
  }
  
- static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
+ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
  {
        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
-       if (ctxt->exception.vector == PF_VECTOR)
-               return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
  
-       if (ctxt->exception.error_code_valid)
+       if (ctxt->exception.vector == PF_VECTOR)
+               kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
+       else if (ctxt->exception.error_code_valid)
                kvm_queue_exception_e(vcpu, ctxt->exception.vector,
                                      ctxt->exception.error_code);
        else
                kvm_queue_exception(vcpu, ctxt->exception.vector);
-       return false;
  }
  
  static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
@@@ -8548,8 -8592,46 +8605,46 @@@ int kvm_skip_emulated_instruction(struc
  }
  EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
  
- static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int *r)
+ static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu)
  {
+       u32 shadow;
+       if (kvm_get_rflags(vcpu) & X86_EFLAGS_RF)
+               return true;
+       /*
+        * Intel CPUs inhibit code #DBs when MOV/POP SS blocking is active,
+        * but AMD CPUs do not.  MOV/POP SS blocking is rare, check that first
+        * to avoid the relatively expensive CPUID lookup.
+        */
+       shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+       return (shadow & KVM_X86_SHADOW_INT_MOV_SS) &&
+              guest_cpuid_is_intel(vcpu);
+ }
+ static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu,
+                                          int emulation_type, int *r)
+ {
+       WARN_ON_ONCE(emulation_type & EMULTYPE_NO_DECODE);
+       /*
+        * Do not check for code breakpoints if hardware has already done the
+        * checks, as inferred from the emulation type.  On NO_DECODE and SKIP,
+        * the instruction has passed all exception checks, and all intercepted
+        * exceptions that trigger emulation have lower priority than code
+        * breakpoints, i.e. the fact that the intercepted exception occurred
+        * means any code breakpoints have already been serviced.
+        *
+        * Note, KVM needs to check for code #DBs on EMULTYPE_TRAP_UD_FORCED as
+        * hardware has checked the RIP of the magic prefix, but not the RIP of
+        * the instruction being emulated.  The intent of forced emulation is
+        * to behave as if KVM intercepted the instruction without an exception
+        * and without a prefix.
+        */
+       if (emulation_type & (EMULTYPE_NO_DECODE | EMULTYPE_SKIP |
+                             EMULTYPE_TRAP_UD | EMULTYPE_VMWARE_GP | EMULTYPE_PF))
+               return false;
        if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
            (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
                struct kvm_run *kvm_run = vcpu->run;
        }
  
        if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
-           !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) {
+           !kvm_is_code_breakpoint_inhibited(vcpu)) {
                unsigned long eip = kvm_get_linear_rip(vcpu);
                u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
                                           vcpu->arch.dr7,
@@@ -8671,8 -8753,7 +8766,7 @@@ int x86_emulate_instruction(struct kvm_
                 * are fault-like and are higher priority than any faults on
                 * the code fetch itself.
                 */
-               if (!(emulation_type & EMULTYPE_SKIP) &&
-                   kvm_vcpu_check_code_breakpoint(vcpu, &r))
+               if (kvm_vcpu_check_code_breakpoint(vcpu, emulation_type, &r))
                        return r;
  
                r = x86_decode_emulated_instruction(vcpu, emulation_type,
@@@ -8770,8 -8851,7 +8864,7 @@@ restart
  
        if (ctxt->have_exception) {
                r = 1;
-               if (inject_emulated_exception(vcpu))
-                       return r;
+               inject_emulated_exception(vcpu);
        } else if (vcpu->arch.pio.count) {
                if (!vcpu->arch.pio.in) {
                        /* FIXME: return into emulator if single-stepping.  */
@@@ -8801,6 -8881,12 +8894,12 @@@ writeback
                unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
                toggle_interruptibility(vcpu, ctxt->interruptibility);
                vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+               /*
+                * Note, EXCPT_DB is assumed to be fault-like as the emulator
+                * only supports code breakpoints and general detect #DB, both
+                * of which are fault-like.
+                */
                if (!ctxt->have_exception ||
                    exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
                        kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
@@@ -9662,74 -9748,155 +9761,155 @@@ int kvm_check_nested_events(struct kvm_
  
  static void kvm_inject_exception(struct kvm_vcpu *vcpu)
  {
-       trace_kvm_inj_exception(vcpu->arch.exception.nr,
+       trace_kvm_inj_exception(vcpu->arch.exception.vector,
                                vcpu->arch.exception.has_error_code,
                                vcpu->arch.exception.error_code,
                                vcpu->arch.exception.injected);
  
        if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
                vcpu->arch.exception.error_code = false;
-       static_call(kvm_x86_queue_exception)(vcpu);
+       static_call(kvm_x86_inject_exception)(vcpu);
  }
  
- static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
+ /*
+  * Check for any event (interrupt or exception) that is ready to be injected,
+  * and if there is at least one event, inject the event with the highest
+  * priority.  This handles both "pending" events, i.e. events that have never
+  * been injected into the guest, and "injected" events, i.e. events that were
+  * injected as part of a previous VM-Enter, but weren't successfully delivered
+  * and need to be re-injected.
+  *
+  * Note, this is not guaranteed to be invoked on a guest instruction boundary,
+  * i.e. doesn't guarantee that there's an event window in the guest.  KVM must
+  * be able to inject exceptions in the "middle" of an instruction, and so must
+  * also be able to re-inject NMIs and IRQs in the middle of an instruction.
+  * I.e. for exceptions and re-injected events, NOT invoking this on instruction
+  * boundaries is necessary and correct.
+  *
+  * For simplicity, KVM uses a single path to inject all events (except events
+  * that are injected directly from L1 to L2) and doesn't explicitly track
+  * instruction boundaries for asynchronous events.  However, because VM-Exits
+  * that can occur during instruction execution typically result in KVM skipping
+  * the instruction or injecting an exception, e.g. instruction and exception
+  * intercepts, and because pending exceptions have higher priority than pending
+  * interrupts, KVM still honors instruction boundaries in most scenarios.
+  *
+  * But, if a VM-Exit occurs during instruction execution, and KVM does NOT skip
+  * the instruction or inject an exception, then KVM can incorrectly inject a new
+  * asynchronous event if the event became pending after the CPU fetched the
+  * instruction (in the guest).  E.g. if a page fault (#PF, #NPF, EPT violation)
+  * occurs and is resolved by KVM, a coincident NMI, SMI, IRQ, etc... can be
+  * injected on the restarted instruction instead of being deferred until the
+  * instruction completes.
+  *
+  * In practice, this virtualization hole is unlikely to be observed by the
+  * guest, and even less likely to cause functional problems.  To detect the
+  * hole, the guest would have to trigger an event on a side effect of an early
+  * phase of instruction execution, e.g. on the instruction fetch from memory.
+  * And for it to be a functional problem, the guest would need to depend on the
+  * ordering between that side effect, the instruction completing, _and_ the
+  * delivery of the asynchronous event.
+  */
+ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
+                                      bool *req_immediate_exit)
  {
+       bool can_inject;
        int r;
-       bool can_inject = true;
  
-       /* try to reinject previous events if any */
+       /*
+        * Process nested events first, as nested VM-Exit supersedes event
+        * re-injection.  If there's an event queued for re-injection, it will
+        * be saved into the appropriate vmc{b,s}12 fields on nested VM-Exit.
+        */
+       if (is_guest_mode(vcpu))
+               r = kvm_check_nested_events(vcpu);
+       else
+               r = 0;
  
-       if (vcpu->arch.exception.injected) {
-               kvm_inject_exception(vcpu);
-               can_inject = false;
-       }
        /*
-        * Do not inject an NMI or interrupt if there is a pending
-        * exception.  Exceptions and interrupts are recognized at
-        * instruction boundaries, i.e. the start of an instruction.
-        * Trap-like exceptions, e.g. #DB, have higher priority than
-        * NMIs and interrupts, i.e. traps are recognized before an
-        * NMI/interrupt that's pending on the same instruction.
-        * Fault-like exceptions, e.g. #GP and #PF, are the lowest
-        * priority, but are only generated (pended) during instruction
-        * execution, i.e. a pending fault-like exception means the
-        * fault occurred on the *previous* instruction and must be
-        * serviced prior to recognizing any new events in order to
-        * fully complete the previous instruction.
+        * Re-inject exceptions and events *especially* if immediate entry+exit
+        * to/from L2 is needed, as any event that has already been injected
+        * into L2 needs to complete its lifecycle before injecting a new event.
+        *
+        * Don't re-inject an NMI or interrupt if there is a pending exception.
+        * This collision arises if an exception occurred while vectoring the
+        * injected event, KVM intercepted said exception, and KVM ultimately
+        * determined the fault belongs to the guest and queues the exception
+        * for injection back into the guest.
+        *
+        * "Injected" interrupts can also collide with pending exceptions if
+        * userspace ignores the "ready for injection" flag and blindly queues
+        * an interrupt.  In that case, prioritizing the exception is correct,
+        * as the exception "occurred" before the exit to userspace.  Trap-like
+        * exceptions, e.g. most #DBs, have higher priority than interrupts.
+        * And while fault-like exceptions, e.g. #GP and #PF, are the lowest
+        * priority, they're only generated (pended) during instruction
+        * execution, and interrupts are recognized at instruction boundaries.
+        * Thus a pending fault-like exception means the fault occurred on the
+        * *previous* instruction and must be serviced prior to recognizing any
+        * new events in order to fully complete the previous instruction.
         */
-       else if (!vcpu->arch.exception.pending) {
-               if (vcpu->arch.nmi_injected) {
-                       static_call(kvm_x86_inject_nmi)(vcpu);
-                       can_inject = false;
-               } else if (vcpu->arch.interrupt.injected) {
-                       static_call(kvm_x86_inject_irq)(vcpu, true);
-                       can_inject = false;
-               }
-       }
+       if (vcpu->arch.exception.injected)
+               kvm_inject_exception(vcpu);
+       else if (kvm_is_exception_pending(vcpu))
+               ; /* see above */
+       else if (vcpu->arch.nmi_injected)
+               static_call(kvm_x86_inject_nmi)(vcpu);
+       else if (vcpu->arch.interrupt.injected)
+               static_call(kvm_x86_inject_irq)(vcpu, true);
  
+       /*
+        * Exceptions that morph to VM-Exits are handled above, and pending
+        * exceptions on top of injected exceptions that do not VM-Exit should
+        * either morph to #DF or, sadly, override the injected exception.
+        */
        WARN_ON_ONCE(vcpu->arch.exception.injected &&
                     vcpu->arch.exception.pending);
  
        /*
-        * Call check_nested_events() even if we reinjected a previous event
-        * in order for caller to determine if it should require immediate-exit
-        * from L2 to L1 due to pending L1 events which require exit
-        * from L2 to L1.
+        * Bail if immediate entry+exit to/from the guest is needed to complete
+        * nested VM-Enter or event re-injection so that a different pending
+        * event can be serviced (or if KVM needs to exit to userspace).
+        *
+        * Otherwise, continue processing events even if VM-Exit occurred.  The
+        * VM-Exit will have cleared exceptions that were meant for L2, but
+        * there may now be events that can be injected into L1.
         */
-       if (is_guest_mode(vcpu)) {
-               r = kvm_check_nested_events(vcpu);
-               if (r < 0)
-                       goto out;
-       }
+       if (r < 0)
+               goto out;
+       /*
+        * A pending exception VM-Exit should either result in nested VM-Exit
+        * or force an immediate re-entry and exit to/from L2, and exception
+        * VM-Exits cannot be injected (flag should _never_ be set).
+        */
+       WARN_ON_ONCE(vcpu->arch.exception_vmexit.injected ||
+                    vcpu->arch.exception_vmexit.pending);
+       /*
+        * New events, other than exceptions, cannot be injected if KVM needs
+        * to re-inject a previous event.  See above comments on re-injecting
+        * for why pending exceptions get priority.
+        */
+       can_inject = !kvm_event_needs_reinjection(vcpu);
  
-       /* try to inject new event if pending */
        if (vcpu->arch.exception.pending) {
-               if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
+               /*
+                * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS
+                * value pushed on the stack.  Trap-like exceptions and all #DBs
+                * leave RF as-is (KVM follows Intel's behavior in this regard;
+                * AMD states that code breakpoint #DBs explicitly clear RF=0).
+                *
+                * Note, most versions of Intel's SDM and AMD's APM incorrectly
+                * describe the behavior of General Detect #DBs, which are
+                * fault-like.  They do _not_ set RF, a la code breakpoints.
+                */
+               if (exception_type(vcpu->arch.exception.vector) == EXCPT_FAULT)
                        __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
                                             X86_EFLAGS_RF);
  
-               if (vcpu->arch.exception.nr == DB_VECTOR) {
-                       kvm_deliver_exception_payload(vcpu);
+               if (vcpu->arch.exception.vector == DB_VECTOR) {
+                       kvm_deliver_exception_payload(vcpu, &vcpu->arch.exception);
                        if (vcpu->arch.dr7 & DR7_GD) {
                                vcpu->arch.dr7 &= ~DR7_GD;
                                kvm_update_dr7(vcpu);
        }
  
        if (is_guest_mode(vcpu) &&
-           kvm_x86_ops.nested_ops->hv_timer_pending &&
-           kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
+           kvm_x86_ops.nested_ops->has_events &&
+           kvm_x86_ops.nested_ops->has_events(vcpu))
                *req_immediate_exit = true;
  
-       WARN_ON(vcpu->arch.exception.pending);
+       WARN_ON(kvm_is_exception_pending(vcpu));
        return 0;
  
  out:
@@@ -10110,7 -10277,7 +10290,7 @@@ void kvm_vcpu_update_apicv(struct kvm_v
         * When APICv gets disabled, we may still have injected interrupts
         * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was
         * still active when the interrupt got accepted. Make sure
-        * inject_pending_event() is called to check for that.
+        * kvm_check_and_inject_events() is called to check for that.
         */
        if (!apic->apicv_active)
                kvm_make_request(KVM_REQ_EVENT, vcpu);
@@@ -10407,7 -10574,7 +10587,7 @@@ static int vcpu_enter_guest(struct kvm_
                        goto out;
                }
  
-               r = inject_pending_event(vcpu, &req_immediate_exit);
+               r = kvm_check_and_inject_events(vcpu, &req_immediate_exit);
                if (r < 0) {
                        r = 0;
                        goto out;
@@@ -10646,10 -10813,26 +10826,26 @@@ static inline int vcpu_block(struct kvm
                if (hv_timer)
                        kvm_lapic_switch_to_hv_timer(vcpu);
  
-               if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
+               /*
+                * If the vCPU is not runnable, a signal or another host event
+                * of some kind is pending; service it without changing the
+                * vCPU's activity state.
+                */
+               if (!kvm_arch_vcpu_runnable(vcpu))
                        return 1;
        }
  
+       /*
+        * Evaluate nested events before exiting the halted state.  This allows
+        * the halt state to be recorded properly in the VMCS12's activity
+        * state field (AMD does not have a similar field and a VM-Exit always
+        * causes a spurious wakeup from HLT).
+        */
+       if (is_guest_mode(vcpu)) {
+               if (kvm_check_nested_events(vcpu) < 0)
+                       return 0;
+       }
        if (kvm_apic_accept_events(vcpu) < 0)
                return 0;
        switch(vcpu->arch.mp_state) {
        case KVM_MP_STATE_INIT_RECEIVED:
                break;
        default:
 -              return -EINTR;
 +              WARN_ON_ONCE(1);
 +              break;
        }
        return 1;
  }
  
  static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
  {
-       if (is_guest_mode(vcpu))
-               kvm_check_nested_events(vcpu);
        return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
                !vcpu->arch.apf.halted);
  }
@@@ -10824,6 -11003,7 +11017,7 @@@ static void kvm_put_guest_fpu(struct kv
  
  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
  {
+       struct kvm_queued_exception *ex = &vcpu->arch.exception;
        struct kvm_run *kvm_run = vcpu->run;
        int r;
  
                        r = 0;
                        goto out;
                }
-               kvm_clear_request(KVM_REQ_UNHALT, vcpu);
                r = -EAGAIN;
                if (signal_pending(current)) {
                        r = -EINTR;
                }
        }
  
+       /*
+        * If userspace set a pending exception and L2 is active, convert it to
+        * a pending VM-Exit if L1 wants to intercept the exception.
+        */
+       if (vcpu->arch.exception_from_userspace && is_guest_mode(vcpu) &&
+           kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, ex->vector,
+                                                       ex->error_code)) {
+               kvm_queue_exception_vmexit(vcpu, ex->vector,
+                                          ex->has_error_code, ex->error_code,
+                                          ex->has_payload, ex->payload);
+               ex->injected = false;
+               ex->pending = false;
+       }
+       vcpu->arch.exception_from_userspace = false;
        if (unlikely(vcpu->arch.complete_userspace_io)) {
                int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
                vcpu->arch.complete_userspace_io = NULL;
@@@ -10988,6 -11182,7 +11196,7 @@@ static void __set_regs(struct kvm_vcpu 
        kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
  
        vcpu->arch.exception.pending = false;
+       vcpu->arch.exception_vmexit.pending = false;
  
        kvm_make_request(KVM_REQ_EVENT, vcpu);
  }
@@@ -11107,29 -11302,17 +11316,30 @@@ int kvm_arch_vcpu_ioctl_set_mpstate(str
  
        vcpu_load(vcpu);
  
 -      if (!lapic_in_kernel(vcpu) &&
 -          mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
 +      switch (mp_state->mp_state) {
 +      case KVM_MP_STATE_UNINITIALIZED:
 +      case KVM_MP_STATE_HALTED:
 +      case KVM_MP_STATE_AP_RESET_HOLD:
 +      case KVM_MP_STATE_INIT_RECEIVED:
 +      case KVM_MP_STATE_SIPI_RECEIVED:
 +              if (!lapic_in_kernel(vcpu))
 +                      goto out;
 +              break;
 +
 +      case KVM_MP_STATE_RUNNABLE:
 +              break;
 +
 +      default:
                goto out;
 +      }
  
        /*
-        * KVM_MP_STATE_INIT_RECEIVED means the processor is in
-        * INIT state; latched init should be reported using
-        * KVM_SET_VCPU_EVENTS, so reject it here.
+        * Pending INITs are reported using KVM_SET_VCPU_EVENTS; disallow
+        * forcing the guest into INIT/SIPI if those events are supposed to be
+        * blocked.  KVM prioritizes SMI over INIT, so reject INIT/SIPI state
+        * if an SMI is pending as well.
         */
-       if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) &&
+       if ((!kvm_apic_init_sipi_allowed(vcpu) || vcpu->arch.smi_pending) &&
            (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
             mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
                goto out;
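The states validated above arrive via the KVM_SET_MP_STATE ioctl. A minimal userspace sketch, with vcpu_fd assumed to be an open vCPU file descriptor and errors reduced to the ioctl return value:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/*
 * Ask KVM to place the vCPU in a given activity state, e.g.
 * KVM_MP_STATE_RUNNABLE or KVM_MP_STATE_HALTED.  With the checks above,
 * INIT/SIPI-related states are rejected without an in-kernel local APIC,
 * and also while INIT is blocked or an SMI is pending.
 */
static int set_mp_state(int vcpu_fd, __u32 state)
{
	struct kvm_mp_state mp = { .mp_state = state };

	return ioctl(vcpu_fd, KVM_SET_MP_STATE, &mp);
}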
@@@ -11368,7 -11551,7 +11578,7 @@@ int kvm_arch_vcpu_ioctl_set_guest_debug
  
        if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
                r = -EBUSY;
-               if (vcpu->arch.exception.pending)
+               if (kvm_is_exception_pending(vcpu))
                        goto out;
                if (dbg->control & KVM_GUESTDBG_INJECT_DB)
                        kvm_queue_exception(vcpu, DB_VECTOR);
@@@ -11590,7 -11773,7 +11800,7 @@@ int kvm_arch_vcpu_create(struct kvm_vcp
        vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64),
                                            GFP_KERNEL_ACCOUNT);
        if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks)
 -              goto fail_free_pio_data;
 +              goto fail_free_mce_banks;
        vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
  
        if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
@@@ -11644,6 -11827,7 +11854,6 @@@ free_wbinvd_dirty_mask
  fail_free_mce_banks:
        kfree(vcpu->arch.mce_banks);
        kfree(vcpu->arch.mci_ctl2_banks);
 -fail_free_pio_data:
        free_page((unsigned long)vcpu->arch.pio_data);
  fail_free_lapic:
        kvm_free_lapic(vcpu);
@@@ -11750,8 -11934,8 +11960,8 @@@ void kvm_vcpu_reset(struct kvm_vcpu *vc
                struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
  
                /*
-                * To avoid have the INIT path from kvm_apic_has_events() that be
-                * called with loaded FPU and does not let userspace fix the state.
+                * All paths that lead to INIT are required to load the guest's
+                * FPU state (because most paths are buried in KVM_RUN).
                 */
                if (init_event)
                        kvm_put_guest_fpu(vcpu);
@@@ -12080,6 -12264,10 +12290,10 @@@ int kvm_arch_init_vm(struct kvm *kvm, u
        if (ret)
                goto out_page_track;
  
+       ret = static_call(kvm_x86_vm_init)(kvm);
+       if (ret)
+               goto out_uninit_mmu;
        INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
        INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
        atomic_set(&kvm->arch.noncoherent_dma_count, 0);
        kvm_hv_init_vm(kvm);
        kvm_xen_init_vm(kvm);
  
-       return static_call(kvm_x86_vm_init)(kvm);
+       return 0;
  
+ out_uninit_mmu:
+       kvm_mmu_uninit_vm(kvm);
  out_page_track:
        kvm_page_track_cleanup(kvm);
  out:
@@@ -12589,13 -12779,14 +12805,14 @@@ static inline bool kvm_vcpu_has_events(
        if (!list_empty_careful(&vcpu->async_pf.done))
                return true;
  
-       if (kvm_apic_has_events(vcpu))
+       if (kvm_apic_has_pending_init_or_sipi(vcpu) &&
+           kvm_apic_init_sipi_allowed(vcpu))
                return true;
  
        if (vcpu->arch.pv.pv_unhalted)
                return true;
  
-       if (vcpu->arch.exception.pending)
+       if (kvm_is_exception_pending(vcpu))
                return true;
  
        if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
                return true;
  
        if (is_guest_mode(vcpu) &&
-           kvm_x86_ops.nested_ops->hv_timer_pending &&
-           kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
+           kvm_x86_ops.nested_ops->has_events &&
+           kvm_x86_ops.nested_ops->has_events(vcpu))
                return true;
  
        if (kvm_xen_has_pending_events(vcpu))
                return true;
  
-       if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu))
-               return true;
        return false;
  }
  
@@@ -12850,7 -13038,7 +13064,7 @@@ bool kvm_can_do_async_pf(struct kvm_vcp
  {
        if (unlikely(!lapic_in_kernel(vcpu) ||
                     kvm_event_needs_reinjection(vcpu) ||
-                    vcpu->arch.exception.pending))
+                    kvm_is_exception_pending(vcpu)))
                return false;
  
        if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
@@@ -13401,7 -13589,7 +13615,7 @@@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_vi
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
- EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
diff --combined mm/page_alloc.c
index d04211f0ef0b142c9625b47e4115e0515804590e,90461bd947448ed0e3934435aa912463f9b896c8..08522a831c7a7294a2f5c37879464b2e1cde00ff
@@@ -4708,30 -4708,6 +4708,30 @@@ void fs_reclaim_release(gfp_t gfp_mask
  EXPORT_SYMBOL_GPL(fs_reclaim_release);
  #endif
  
 +/*
 + * Zonelists may change due to hotplug during allocation. Detect when zonelists
 + * have been rebuilt so the allocation can be retried. The reader side does not
 + * lock and simply retries the allocation if the zonelist changed. The writer
 + * side is protected by the seqlock's embedded spin_lock.
 + */
 +static DEFINE_SEQLOCK(zonelist_update_seq);
 +
 +static unsigned int zonelist_iter_begin(void)
 +{
 +      if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
 +              return read_seqbegin(&zonelist_update_seq);
 +
 +      return 0;
 +}
 +
 +static unsigned int check_retry_zonelist(unsigned int seq)
 +{
 +      if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
 +              return read_seqretry(&zonelist_update_seq, seq);
 +
 +      return seq;
 +}
 +
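The sketch below illustrates the reader-retry idea in plain C11 atomics, using sequentially consistent operations for simplicity; it is a model of the pattern, not the kernel's seqlock_t (which relies on carefully placed barriers plus an embedded spinlock for writers), and all names are local to the example.

#include <stdatomic.h>
#include <stdbool.h>

/* Even: zonelists stable.  Odd: a rebuild is in progress. */
static atomic_uint zonelist_seq;

static unsigned int zonelist_read_begin(void)
{
	unsigned int seq;

	do {
		seq = atomic_load(&zonelist_seq);
	} while (seq & 1);	/* wait out an in-flight rebuild */
	return seq;
}

static bool zonelist_read_retry(unsigned int seq)
{
	return atomic_load(&zonelist_seq) != seq;
}

static void zonelist_rebuild(void (*rebuild)(void))
{
	atomic_fetch_add(&zonelist_seq, 1);	/* readers will now retry */
	rebuild();
	atomic_fetch_add(&zonelist_seq, 1);	/* back to even: stable again */
}

An allocator-style caller pairs zonelist_read_begin() with zonelist_read_retry() around its slow path and restarts when the cookie goes stale, which is what the restart label further down in this hunk does with zonelist_iter_begin()/check_retry_zonelist().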
  /* Perform direct synchronous page reclaim */
  static unsigned long
  __perform_reclaim(gfp_t gfp_mask, unsigned int order,
@@@ -5025,7 -5001,6 +5025,7 @@@ __alloc_pages_slowpath(gfp_t gfp_mask, 
        int compaction_retries;
        int no_progress_loops;
        unsigned int cpuset_mems_cookie;
 +      unsigned int zonelist_iter_cookie;
        int reserve_flags;
  
        /*
                                (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
                gfp_mask &= ~__GFP_ATOMIC;
  
 -retry_cpuset:
 +restart:
        compaction_retries = 0;
        no_progress_loops = 0;
        compact_priority = DEF_COMPACT_PRIORITY;
        cpuset_mems_cookie = read_mems_allowed_begin();
 +      zonelist_iter_cookie = zonelist_iter_begin();
  
        /*
         * The fast path uses conservative alloc_flags to succeed only until
@@@ -5213,13 -5187,9 +5213,13 @@@ retry
                goto retry;
  
  
 -      /* Deal with possible cpuset update races before we start OOM killing */
 -      if (check_retry_cpuset(cpuset_mems_cookie, ac))
 -              goto retry_cpuset;
 +      /*
 +       * Deal with possible cpuset update races or zonelist updates to avoid
 +       * an unnecessary OOM kill.
 +       */
 +      if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
 +          check_retry_zonelist(zonelist_iter_cookie))
 +              goto restart;
  
        /* Reclaim has failed us, start killing things */
        page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
        }
  
  nopage:
 -      /* Deal with possible cpuset update races before we fail */
 -      if (check_retry_cpuset(cpuset_mems_cookie, ac))
 -              goto retry_cpuset;
 +      /*
 +       * Deal with possible cpuset update races or zonelist updates to avoid
 +       * an unnecessary OOM kill.
 +       */
 +      if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
 +          check_retry_zonelist(zonelist_iter_cookie))
 +              goto restart;
  
        /*
         * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
@@@ -5740,18 -5706,6 +5740,18 @@@ refill
                /* reset page count bias and offset to start of new frag */
                nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
                offset = size - fragsz;
 +              if (unlikely(offset < 0)) {
 +                      /*
 +                       * The caller is trying to allocate a fragment
 +                       * with fragsz > PAGE_SIZE but the cache isn't big
 +                       * enough to satisfy the request; this may
 +                       * happen in low memory conditions.
 +                       * We don't release the cache page because
 +                       * it could make memory pressure worse
 +                       * so we simply return NULL here.
 +                       */
 +                      return NULL;
 +              }
        }
  
        nc->pagecnt_bias--;
@@@ -6085,7 -6039,8 +6085,8 @@@ void show_free_areas(unsigned int filte
                " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
                " unevictable:%lu dirty:%lu writeback:%lu\n"
                " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
-               " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
+               " mapped:%lu shmem:%lu pagetables:%lu\n"
+               " sec_pagetables:%lu bounce:%lu\n"
                " kernel_misc_reclaimable:%lu\n"
                " free:%lu free_pcp:%lu free_cma:%lu\n",
                global_node_page_state(NR_ACTIVE_ANON),
                global_node_page_state(NR_FILE_MAPPED),
                global_node_page_state(NR_SHMEM),
                global_node_page_state(NR_PAGETABLE),
+               global_node_page_state(NR_SECONDARY_PAGETABLE),
                global_zone_page_state(NR_BOUNCE),
                global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
                global_zone_page_state(NR_FREE_PAGES),
                        " shadow_call_stack:%lukB"
  #endif
                        " pagetables:%lukB"
+                       " sec_pagetables:%lukB"
                        " all_unreclaimable? %s"
                        "\n",
                        pgdat->node_id,
                        node_page_state(pgdat, NR_KERNEL_SCS_KB),
  #endif
                        K(node_page_state(pgdat, NR_PAGETABLE)),
+                       K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
                        pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
                                "yes" : "no");
        }
@@@ -6560,8 -6518,9 +6564,8 @@@ static void __build_all_zonelists(void 
        int nid;
        int __maybe_unused cpu;
        pg_data_t *self = data;
 -      static DEFINE_SPINLOCK(lock);
  
 -      spin_lock(&lock);
 +      write_seqlock(&zonelist_update_seq);
  
  #ifdef CONFIG_NUMA
        memset(node_load, 0, sizeof(node_load));
  #endif
        }
  
 -      spin_unlock(&lock);
 +      write_sequnlock(&zonelist_update_seq);
  }
  
  static noinline void __init
diff --combined mm/vmstat.c
index 90af9a8572f5a7073520ddaf2f4d1d3aaec2b7ac,b937eba681d1533e773e201fd62ad9edcbe38cc2..da264a040c5567d0ac192ed001134ee67e6291b7
@@@ -1168,15 -1168,8 +1168,15 @@@ int fragmentation_index(struct zone *zo
  #define TEXT_FOR_HIGHMEM(xx)
  #endif
  
 +#ifdef CONFIG_ZONE_DEVICE
 +#define TEXT_FOR_DEVICE(xx) xx "_device",
 +#else
 +#define TEXT_FOR_DEVICE(xx)
 +#endif
 +
  #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
 -                                      TEXT_FOR_HIGHMEM(xx) xx "_movable",
 +                                      TEXT_FOR_HIGHMEM(xx) xx "_movable", \
 +                                      TEXT_FOR_DEVICE(xx)
  
  const char * const vmstat_text[] = {
        /* enum zone_stat_item counters */
        "nr_shadow_call_stack",
  #endif
        "nr_page_table_pages",
+       "nr_sec_page_table_pages",
  #ifdef CONFIG_SWAP
        "nr_swapcached",
  #endif
index 6448cb9f710f5fc9095a872edaaa52589094f00a,8b1b32628ac8426657566db7817c8ef27390b399..fde3ae8cfa4c7cc1b909d9caa17a152365ffb8b1
@@@ -48,8 -48,6 +48,8 @@@ LIBKVM += lib/rbtree.
  LIBKVM += lib/sparsebit.c
  LIBKVM += lib/test_util.c
  
 +LIBKVM_STRING += lib/string_override.c
 +
  LIBKVM_x86_64 += lib/x86_64/apic.c
  LIBKVM_x86_64 += lib/x86_64/handlers.S
  LIBKVM_x86_64 += lib/x86_64/perf_test_util.c
@@@ -91,6 -89,7 +91,7 @@@ TEST_GEN_PROGS_x86_64 += x86_64/kvm_clo
  TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test
  TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
  TEST_GEN_PROGS_x86_64 += x86_64/monitor_mwait_test
+ TEST_GEN_PROGS_x86_64 += x86_64/nested_exceptions_test
  TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
  TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test
  TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id
@@@ -222,8 -221,7 +223,8 @@@ LIBKVM_C := $(filter %.c,$(LIBKVM)
  LIBKVM_S := $(filter %.S,$(LIBKVM))
  LIBKVM_C_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_C))
  LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S))
 -LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ)
 +LIBKVM_STRING_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_STRING))
 +LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(LIBKVM_STRING_OBJ)
  
  EXTRA_CLEAN += $(LIBKVM_OBJS) cscope.*
  
@@@ -234,12 -232,6 +235,12 @@@ $(LIBKVM_C_OBJ): $(OUTPUT)/%.o: %.
  $(LIBKVM_S_OBJ): $(OUTPUT)/%.o: %.S
        $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@
  
 +# Compile the string overrides as freestanding to prevent the compiler from
 +# generating self-referential code, e.g. without "freestanding" the compiler may
 +# "optimize" memcmp() by invoking memcmp(), thus causing infinite recursion.
 +$(LIBKVM_STRING_OBJ): $(OUTPUT)/%.o: %.c
 +      $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c -ffreestanding $< -o $@
 +
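To make the failure mode concrete: an override along the lines of the hypothetical one below is exactly the kind of code that, when built without -ffreestanding (or -fno-builtin), the optimizer is allowed to recognize as the standard routine and lower back into a call to memcmp(), i.e. into itself. This is a sketch, not the selftests' actual lib/string_override.c.

#include <stddef.h>

/* Hypothetical string override; see the note above. */
int memcmp(const void *a, const void *b, size_t len)
{
	const unsigned char *pa = a, *pb = b;
	size_t i;

	/*
	 * Byte-by-byte comparison that the compiler may pattern-match back
	 * into a memcmp() call unless the file is built freestanding.
	 */
	for (i = 0; i < len; i++) {
		if (pa[i] != pb[i])
			return pa[i] < pb[i] ? -1 : 1;
	}
	return 0;
}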
  x := $(shell mkdir -p $(sort $(dir $(TEST_GEN_PROGS))))
  $(TEST_GEN_PROGS): $(LIBKVM_OBJS)
  $(TEST_GEN_PROGS_EXTENDED): $(LIBKVM_OBJS)
index 790c6d1ecb3482b1219f61f94516c9c2a297f67f,d07f13c9fced12527a14eb4745739f961f9e28d0..71b290b6469d6ba9f4230c016e3d10156c034648
@@@ -8,6 -8,8 +8,8 @@@
  #ifndef SELFTEST_KVM_VMX_H
  #define SELFTEST_KVM_VMX_H
  
+ #include <asm/vmx.h>
  #include <stdint.h>
  #include "processor.h"
  #include "apic.h"
  #define VMX_EPT_VPID_CAP_AD_BITS              0x00200000
  
  #define EXIT_REASON_FAILED_VMENTRY    0x80000000
- #define EXIT_REASON_EXCEPTION_NMI     0
- #define EXIT_REASON_EXTERNAL_INTERRUPT        1
- #define EXIT_REASON_TRIPLE_FAULT      2
- #define EXIT_REASON_INTERRUPT_WINDOW  7
- #define EXIT_REASON_NMI_WINDOW                8
- #define EXIT_REASON_TASK_SWITCH               9
- #define EXIT_REASON_CPUID             10
- #define EXIT_REASON_HLT                       12
- #define EXIT_REASON_INVD              13
- #define EXIT_REASON_INVLPG            14
- #define EXIT_REASON_RDPMC             15
- #define EXIT_REASON_RDTSC             16
- #define EXIT_REASON_VMCALL            18
- #define EXIT_REASON_VMCLEAR           19
- #define EXIT_REASON_VMLAUNCH          20
- #define EXIT_REASON_VMPTRLD           21
- #define EXIT_REASON_VMPTRST           22
- #define EXIT_REASON_VMREAD            23
- #define EXIT_REASON_VMRESUME          24
- #define EXIT_REASON_VMWRITE           25
- #define EXIT_REASON_VMOFF             26
- #define EXIT_REASON_VMON              27
- #define EXIT_REASON_CR_ACCESS         28
- #define EXIT_REASON_DR_ACCESS         29
- #define EXIT_REASON_IO_INSTRUCTION    30
- #define EXIT_REASON_MSR_READ          31
- #define EXIT_REASON_MSR_WRITE         32
- #define EXIT_REASON_INVALID_STATE     33
- #define EXIT_REASON_MWAIT_INSTRUCTION 36
- #define EXIT_REASON_MONITOR_INSTRUCTION 39
- #define EXIT_REASON_PAUSE_INSTRUCTION 40
- #define EXIT_REASON_MCE_DURING_VMENTRY        41
- #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
- #define EXIT_REASON_APIC_ACCESS               44
- #define EXIT_REASON_EOI_INDUCED               45
- #define EXIT_REASON_EPT_VIOLATION     48
- #define EXIT_REASON_EPT_MISCONFIG     49
- #define EXIT_REASON_INVEPT            50
- #define EXIT_REASON_RDTSCP            51
- #define EXIT_REASON_PREEMPTION_TIMER  52
- #define EXIT_REASON_INVVPID           53
- #define EXIT_REASON_WBINVD            54
- #define EXIT_REASON_XSETBV            55
- #define EXIT_REASON_APIC_WRITE                56
- #define EXIT_REASON_INVPCID           58
- #define EXIT_REASON_PML_FULL          62
- #define EXIT_REASON_XSAVES            63
- #define EXIT_REASON_XRSTORS           64
- #define LAST_EXIT_REASON              64
  
  enum vmcs_field {
        VIRTUAL_PROCESSOR_ID            = 0x00000000,
        VMWRITE_BITMAP_HIGH             = 0x00002029,
        XSS_EXIT_BITMAP                 = 0x0000202C,
        XSS_EXIT_BITMAP_HIGH            = 0x0000202D,
+       ENCLS_EXITING_BITMAP            = 0x0000202E,
+       ENCLS_EXITING_BITMAP_HIGH       = 0x0000202F,
        TSC_MULTIPLIER                  = 0x00002032,
        TSC_MULTIPLIER_HIGH             = 0x00002033,
        GUEST_PHYSICAL_ADDRESS          = 0x00002400,
@@@ -617,7 -572,6 +572,7 @@@ void nested_map_memslot(struct vmx_page
                        uint32_t memslot);
  void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm,
                            uint64_t addr, uint64_t size);
 +bool kvm_vm_has_ept(struct kvm_vm *vm);
  void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm,
                  uint32_t eptp_memslot);
  void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm);