KVM: x86/mmu: Use EMULTYPE flag to track write #PFs to shadow pages
author Sean Christopherson <seanjc@google.com>
Thu, 2 Feb 2023 18:28:15 +0000 (18:28 +0000)
committer Paolo Bonzini <pbonzini@redhat.com>
Tue, 14 Mar 2023 14:28:56 +0000 (10:28 -0400)
Use a new EMULTYPE flag, EMULTYPE_WRITE_PF_TO_SP, to track page faults
on self-changing writes to shadowed page tables instead of propagating
that information to the emulator via a semi-persistent vCPU flag.  Using
a flag in "struct kvm_vcpu_arch" is confusing, especially as implemented,
as it's not at all obvious that clearing the flag only when emulation
actually occurs is correct.

E.g. if KVM sets the flag and then retries the fault without ever getting
to the emulator, the flag will be left set for future calls into the
emulator.  But because the flag is consumed if and only if both
EMULTYPE_PF and EMULTYPE_ALLOW_RETRY_PF are set, and because
EMULTYPE_ALLOW_RETRY_PF is deliberately not set for direct MMUs, emulated
MMIO, or while L2 is active, KVM avoids false positives on a stale flag
since FNAME(page_fault) is guaranteed to be run and refresh the flag
before it's ultimately consumed by the tail end of reexecute_instruction().

Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20230202182817.407394-2-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/mmu/mmu_internal.h
arch/x86/kvm/mmu/paging_tmpl.h
arch/x86/kvm/x86.c

index 808c292ad3f4935b50864d1287fd7a7941432c55..a45de1118a42d6976d659055363c3de86e5c4630 100644 (file)
@@ -947,23 +947,6 @@ struct kvm_vcpu_arch {
 
        u64 msr_kvm_poll_control;
 
-       /*
-        * Indicates the guest is trying to write a gfn that contains one or
-        * more of the PTEs used to translate the write itself, i.e. the access
-        * is changing its own translation in the guest page tables.  KVM exits
-        * to userspace if emulation of the faulting instruction fails and this
-        * flag is set, as KVM cannot make forward progress.
-        *
-        * If emulation fails for a write to guest page tables, KVM unprotects
-        * (zaps) the shadow page for the target gfn and resumes the guest to
-        * retry the non-emulatable instruction (on hardware).  Unprotecting the
-        * gfn doesn't allow forward progress for a self-changing access because
-        * doing so also zaps the translation for the gfn, i.e. retrying the
-        * instruction will hit a !PRESENT fault, which results in a new shadow
-        * page and sends KVM back to square one.
-        */
-       bool write_fault_to_shadow_pgtable;
-
        /* set at EPT violation at this point */
        unsigned long exit_qualification;
 
@@ -1907,6 +1890,25 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
  * EMULTYPE_COMPLETE_USER_EXIT - Set when the emulator should update interruptibility
  *                              state and inject single-step #DBs after skipping
  *                              an instruction (after completing userspace I/O).
+ *
+ * EMULTYPE_WRITE_PF_TO_SP - Set when emulating an intercepted page fault that
+ *                          is attempting to write a gfn that contains one or
+ *                          more of the PTEs used to translate the write itself,
+ *                          and the owning page table is being shadowed by KVM.
+ *                          If emulation of the faulting instruction fails and
+ *                          this flag is set, KVM will exit to userspace instead
+ *                          of retrying emulation as KVM cannot make forward
+ *                          progress.
+ *
+ *                          If emulation fails for a write to guest page tables,
+ *                          KVM unprotects (zaps) the shadow page for the target
+ *                          gfn and resumes the guest to retry the non-emulatable
+ *                          instruction (on hardware).  Unprotecting the gfn
+ *                          doesn't allow forward progress for a self-changing
+ *                          access because doing so also zaps the translation for
+ *                          the gfn, i.e. retrying the instruction will hit a
+ *                          !PRESENT fault, which results in a new shadow page
+ *                          and sends KVM back to square one.
  */
 #define EMULTYPE_NO_DECODE         (1 << 0)
 #define EMULTYPE_TRAP_UD           (1 << 1)
@@ -1916,6 +1918,7 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
 #define EMULTYPE_VMWARE_GP         (1 << 5)
 #define EMULTYPE_PF                (1 << 6)
 #define EMULTYPE_COMPLETE_USER_EXIT (1 << 7)
+#define EMULTYPE_WRITE_PF_TO_SP            (1 << 8)
 
 int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
 int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
index c8ebe542c565f33a097e4d2428a8492e659854ee..144c5a01cd778092034ee5fd002a2f0e14e33135 100644 (file)
@@ -4203,7 +4203,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
              work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
                return;
 
-       kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
+       kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
 }
 
 static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
@@ -5664,7 +5664,8 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
 
        if (r == RET_PF_INVALID) {
                r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
-                                         lower_32_bits(error_code), false);
+                                         lower_32_bits(error_code), false,
+                                         &emulation_type);
                if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
                        return -EIO;
        }
index cc58631e233682c103a1549f6a79b4e50d01222e..2cbb155c686c7dea434d2f9a5d1eed1da9be5c0c 100644 (file)
@@ -240,6 +240,13 @@ struct kvm_page_fault {
        kvm_pfn_t pfn;
        hva_t hva;
        bool map_writable;
+
+       /*
+        * Indicates the guest is trying to write a gfn that contains one or
+        * more of the PTEs used to translate the write itself, i.e. the access
+        * is changing its own translation in the guest page tables.
+        */
+       bool write_fault_to_shadow_pgtable;
 };
 
 int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
@@ -273,7 +280,7 @@ enum {
 };
 
 static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
-                                       u32 err, bool prefetch)
+                                       u32 err, bool prefetch, int *emulation_type)
 {
        struct kvm_page_fault fault = {
                .addr = cr2_or_gpa,
@@ -312,6 +319,9 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
        else
                r = vcpu->arch.mmu->page_fault(vcpu, &fault);
 
+       if (fault.write_fault_to_shadow_pgtable && emulation_type)
+               *emulation_type |= EMULTYPE_WRITE_PF_TO_SP;
+
        /*
         * Similar to above, prefetch faults aren't truly spurious, and the
         * async #PF path doesn't do emulation.  Do count faults that are fixed
index 57f0b75c80f9d52a8d0401e8ebadb39f2d6d1283..5d2958299b4ff17ae9dac50c4cbc9eb2c709ed0f 100644 (file)
@@ -825,10 +825,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
        if (r)
                return r;
 
-       vcpu->arch.write_fault_to_shadow_pgtable = false;
-
        is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
-             &walker, fault->user, &vcpu->arch.write_fault_to_shadow_pgtable);
+             &walker, fault->user, &fault->write_fault_to_shadow_pgtable);
 
        if (is_self_change_mapping)
                fault->max_level = PG_LEVEL_4K;
index 7713420abab093b19d6c19e41d9ca9454c006a08..ff7f398a0c6a98117cdf1578d2893c9265d0252e 100644 (file)
@@ -8463,7 +8463,6 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
 }
 
 static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
-                                 bool write_fault_to_shadow_pgtable,
                                  int emulation_type)
 {
        gpa_t gpa = cr2_or_gpa;
@@ -8534,7 +8533,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
         * be fixed by unprotecting shadow page and it should
         * be reported to userspace.
         */
-       return !write_fault_to_shadow_pgtable;
+       return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP);
 }
 
 static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
@@ -8782,20 +8781,12 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
        int r;
        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        bool writeback = true;
-       bool write_fault_to_spt;
 
        if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len)))
                return 1;
 
        vcpu->arch.l1tf_flush_l1d = true;
 
-       /*
-        * Clear write_fault_to_shadow_pgtable here to ensure it is
-        * never reused.
-        */
-       write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
-       vcpu->arch.write_fault_to_shadow_pgtable = false;
-
        if (!(emulation_type & EMULTYPE_NO_DECODE)) {
                kvm_clear_exception_queue(vcpu);
 
@@ -8816,7 +8807,6 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                                return 1;
                        }
                        if (reexecute_instruction(vcpu, cr2_or_gpa,
-                                                 write_fault_to_spt,
                                                  emulation_type))
                                return 1;
 
@@ -8895,8 +8885,7 @@ restart:
                return 1;
 
        if (r == EMULATION_FAILED) {
-               if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
-                                       emulation_type))
+               if (reexecute_instruction(vcpu, cr2_or_gpa, emulation_type))
                        return 1;
 
                return handle_emulation_failure(vcpu, emulation_type);