KVM: x86/mmu: Split huge pages mapped by the TDP MMU during KVM_CLEAR_DIRTY_LOG
authorDavid Matlack <dmatlack@google.com>
Wed, 19 Jan 2022 23:07:37 +0000 (23:07 +0000)
committerPaolo Bonzini <pbonzini@redhat.com>
Thu, 10 Feb 2022 18:50:43 +0000 (13:50 -0500)
When using KVM_DIRTY_LOG_INITIALLY_SET, huge pages are not
write-protected when dirty logging is enabled on the memslot. Instead
they are write-protected once userspace invokes KVM_CLEAR_DIRTY_LOG for
the first time and only for the specific sub-region being cleared.

Enhance KVM_CLEAR_DIRTY_LOG to also try to split huge pages prior to
write-protecting to avoid causing write-protection faults on vCPU
threads. This also allows userspace to smear the cost of huge page
splitting across multiple ioctls, rather than splitting the entire
memslot as is the case when initially-all-set is not used.

Signed-off-by: David Matlack <dmatlack@google.com>
Message-Id: <20220119230739.2234394-17-dmatlack@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Documentation/admin-guide/kernel-parameters.txt
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/mmu/tdp_mmu.c
arch/x86/kvm/mmu/tdp_mmu.h
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h

index 5d80a0faa0961cd3a45a97f6d8ef44cd8963bf9a..2a9746fe6c4ab64638d2caca2cf648a5f0f35c83 100644 (file)
                        KVM_DIRTY_LOG_INITIALLY_SET is enabled or disabled. If
                        disabled, all huge pages in a memslot will be eagerly
                        split when dirty logging is enabled on that memslot. If
-                       enabled, huge pages will not be eagerly split.
+                       enabled, eager page splitting will be performed during
+                       the KVM_CLEAR_DIRTY_LOG ioctl, and only for the pages
+                       being cleared.
 
                        Eager page splitting currently only supports splitting
                        huge pages mapped by the TDP MMU.
index 8bfb069fb3df3872e777684ad24cfe54f2f2794e..10815b672a26962529e53bb3ed250d471f97491b 100644 (file)
@@ -1590,6 +1590,10 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
                                       const struct kvm_memory_slot *memslot,
                                       int target_level);
+void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
+                                 const struct kvm_memory_slot *memslot,
+                                 u64 start, u64 end,
+                                 int target_level);
 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
                                   const struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
index 308c8b21f9b19d53fe651eb493bfc84d26ab8cb1..296f8723f9ae92653b4c6023bd9bb0343b6212c2 100644 (file)
@@ -1358,6 +1358,9 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
                gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
                gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
 
+               if (READ_ONCE(eager_page_split))
+                       kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);
+
                kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
 
                /* Cross two large pages? */
@@ -5830,16 +5833,32 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
                kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
 }
 
+/* Must be called with the mmu_lock held in write-mode. */
+void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
+                                  const struct kvm_memory_slot *memslot,
+                                  u64 start, u64 end,
+                                  int target_level)
+{
+       if (is_tdp_mmu_enabled(kvm))
+               kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end,
+                                                target_level, false);
+
+       /*
+        * A TLB flush is unnecessary at this point for the same reasons as in
+        * kvm_mmu_slot_try_split_huge_pages().
+        */
+}
+
 void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
-                                      const struct kvm_memory_slot *memslot,
-                                      int target_level)
+                                       const struct kvm_memory_slot *memslot,
+                                       int target_level)
 {
        u64 start = memslot->base_gfn;
        u64 end = start + memslot->npages;
 
        if (is_tdp_mmu_enabled(kvm)) {
                read_lock(&kvm->mmu_lock);
-               kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
+               kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
                read_unlock(&kvm->mmu_lock);
        }
 
index 6dfd6db154d8624614b1853ab7c064ed93fcdde5..dae2cebcf8b5c41d6f66f40d2635d8a2f318e78d 100644 (file)
@@ -963,27 +963,33 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
 }
 
 /*
- * tdp_mmu_link_sp_atomic - Atomically replace the given spte with an spte
- * pointing to the provided page table.
+ * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
+ * provided page table.
  *
  * @kvm: kvm instance
  * @iter: a tdp_iter instance currently on the SPTE that should be set
  * @sp: The new TDP page table to install.
  * @account_nx: True if this page table is being installed to split a
  *              non-executable huge page.
+ * @shared: This operation is running under the MMU lock in read mode.
  *
  * Returns: 0 if the new page table was installed. Non-0 if the page table
  *          could not be installed (e.g. the atomic compare-exchange failed).
  */
-static int tdp_mmu_link_sp_atomic(struct kvm *kvm, struct tdp_iter *iter,
-                                 struct kvm_mmu_page *sp, bool account_nx)
+static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
+                          struct kvm_mmu_page *sp, bool account_nx,
+                          bool shared)
 {
        u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
-       int ret;
+       int ret = 0;
 
-       ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
-       if (ret)
-               return ret;
+       if (shared) {
+               ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
+               if (ret)
+                       return ret;
+       } else {
+               tdp_mmu_set_spte(kvm, iter, spte);
+       }
 
        spin_lock(&kvm->arch.tdp_mmu_pages_lock);
        list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
@@ -1051,7 +1057,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
                        sp = tdp_mmu_alloc_sp(vcpu);
                        tdp_mmu_init_child_sp(sp, &iter);
 
-                       if (tdp_mmu_link_sp_atomic(vcpu->kvm, &iter, sp, account_nx)) {
+                       if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
                                tdp_mmu_free_sp(sp);
                                break;
                        }
@@ -1277,12 +1283,11 @@ static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
 }
 
 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
-                                                      struct tdp_iter *iter)
+                                                      struct tdp_iter *iter,
+                                                      bool shared)
 {
        struct kvm_mmu_page *sp;
 
-       lockdep_assert_held_read(&kvm->mmu_lock);
-
        /*
         * Since we are allocating while under the MMU lock we have to be
         * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
@@ -1297,20 +1302,27 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
                return sp;
 
        rcu_read_unlock();
-       read_unlock(&kvm->mmu_lock);
+
+       if (shared)
+               read_unlock(&kvm->mmu_lock);
+       else
+               write_unlock(&kvm->mmu_lock);
 
        iter->yielded = true;
        sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
 
-       read_lock(&kvm->mmu_lock);
+       if (shared)
+               read_lock(&kvm->mmu_lock);
+       else
+               write_lock(&kvm->mmu_lock);
+
        rcu_read_lock();
 
        return sp;
 }
 
-static int tdp_mmu_split_huge_page_atomic(struct kvm *kvm,
-                                         struct tdp_iter *iter,
-                                         struct kvm_mmu_page *sp)
+static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
+                                  struct kvm_mmu_page *sp, bool shared)
 {
        const u64 huge_spte = iter->old_spte;
        const int level = iter->level;
@@ -1333,7 +1345,7 @@ static int tdp_mmu_split_huge_page_atomic(struct kvm *kvm,
         * correctness standpoint since the translation will be the same either
         * way.
         */
-       ret = tdp_mmu_link_sp_atomic(kvm, iter, sp, false);
+       ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
        if (ret)
                return ret;
 
@@ -1350,7 +1362,7 @@ static int tdp_mmu_split_huge_page_atomic(struct kvm *kvm,
 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
                                         struct kvm_mmu_page *root,
                                         gfn_t start, gfn_t end,
-                                        int target_level)
+                                        int target_level, bool shared)
 {
        struct kvm_mmu_page *sp = NULL;
        struct tdp_iter iter;
@@ -1371,14 +1383,14 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
         */
        for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
 retry:
-               if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
+               if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
                        continue;
 
                if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
                        continue;
 
                if (!sp) {
-                       sp = tdp_mmu_alloc_sp_for_split(kvm, &iter);
+                       sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
                        if (!sp) {
                                ret = -ENOMEM;
                                break;
@@ -1388,7 +1400,7 @@ retry:
                                continue;
                }
 
-               if (tdp_mmu_split_huge_page_atomic(kvm, &iter, sp))
+               if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
                        goto retry;
 
                sp = NULL;
@@ -1408,23 +1420,24 @@ retry:
        return ret;
 }
 
+
 /*
  * Try to split all huge pages mapped by the TDP MMU down to the target level.
  */
 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
                                      const struct kvm_memory_slot *slot,
                                      gfn_t start, gfn_t end,
-                                     int target_level)
+                                     int target_level, bool shared)
 {
        struct kvm_mmu_page *root;
        int r = 0;
 
-       lockdep_assert_held_read(&kvm->mmu_lock);
+       kvm_lockdep_assert_mmu_lock_held(kvm, shared);
 
-       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) {
-               r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level);
+       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
+               r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
                if (r) {
-                       kvm_tdp_mmu_put_root(kvm, root, true);
+                       kvm_tdp_mmu_put_root(kvm, root, shared);
                        break;
                }
        }
index fdb3a886e50f94de799b98570d6c289826d52b0e..3f987785702a4b1938c67319a07a41fe469976c2 100644 (file)
@@ -70,7 +70,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
                                      const struct kvm_memory_slot *slot,
                                      gfn_t start, gfn_t end,
-                                     int target_level);
+                                     int target_level, bool shared);
 
 static inline void kvm_tdp_mmu_walk_lockless_begin(void)
 {
index ffef31feac3a882b5835efff00ee4f4d13bcb4cd..803b2e4c7b758ffe5df8e5b387b4a3a9fd3a4ea8 100644 (file)
@@ -192,7 +192,7 @@ bool __read_mostly enable_pmu = true;
 EXPORT_SYMBOL_GPL(enable_pmu);
 module_param(enable_pmu, bool, 0444);
 
-static bool __read_mostly eager_page_split = true;
+bool __read_mostly eager_page_split = true;
 module_param(eager_page_split, bool, 0644);
 
 /*
index 767ec7f9951608f984b4ac69a1c3205ce0a93ebc..aa86abad914d2f6e2d6d8ce14920d1c5cad20dfa 100644 (file)
@@ -307,6 +307,8 @@ extern int pi_inject_timer;
 
 extern bool report_ignored_msrs;
 
+extern bool eager_page_split;
+
 static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
 {
        return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,