// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

static bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);

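/*
 * Usage sketch (an assumption about the runtime interface, not taken from
 * this file): since tdp_mmu.c is built into kvm.ko, the knob above should
 * surface as /sys/module/kvm/parameters/tdp_mmu. kvm_mmu_init_tdp_mmu()
 * below snapshots it with READ_ONCE() at VM creation, so toggling it only
 * affects VMs created afterwards.
 */
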
/* Initializes the TDP MMU for the VM, if enabled. */
bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
		return false;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;

	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);

	return true;
}

static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.
	 */
	rcu_barrier();
}

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush,
			  bool shared);

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
			  bool shared)
{
	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	WARN_ON(!root->tdp_mmu_page);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

	zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);

	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/*
 * Finds the next valid root after prev_root (or the first valid root if
 * prev_root is NULL), takes a reference on it, and returns that next root.
 * If prev_root is not NULL, this thread should have already taken a
 * reference on it, and that reference will be dropped. If no valid root is
 * found, this function will return NULL.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
					      struct kvm_mmu_page *prev_root,
					      bool shared)
{
	struct kvm_mmu_page *next_root;

	rcu_read_lock();

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
				&next_root->link, typeof(*next_root), link);

	rcu_read_unlock();

	if (prev_root)
		kvm_tdp_mmu_put_root(kvm, prev_root, shared);

	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode. In the unlikely event that this thread must free a root, the lock
 * will be temporarily dropped and reacquired in write mode.
 */
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared);		\
	     _root;							\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared))		\
		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else

#define for_each_tdp_mmu_root(_kvm, _root, _as_id)				\
	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link,		\
				lockdep_is_held_type(&kvm->mmu_lock, 0) ||	\
				lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock))	\
		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else

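/*
 * Illustrative use of the two iterators above (a sketch modeled on callers
 * later in this file, e.g. __kvm_tdp_mmu_zap_gfn_range()):
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
 *		flush = zap_gfn_range(kvm, root, start, end, true, flush,
 *				      false);
 *
 * The "if (...) { } else" tail of each macro filters out roots belonging to
 * a different address space ID while still letting the caller attach an
 * ordinary loop body.
 */
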
static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
						   int level)
{
	union kvm_mmu_page_role role;

	role = vcpu->arch.mmu->mmu_role.base;
	role.level = level;
	role.direct = true;
	role.gpte_is_8_bytes = true;
	role.access = ACC_ALL;

	return role;
}

static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
					       int level)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role.word = page_role_for_level(vcpu, level).word;
	sp->gfn = gfn;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);

	return sp;
}

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);

	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

	/* Check for an existing root before allocating a new one. */
	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
		if (root->role.word == role.word &&
		    kvm_tdp_mmu_get_root(kvm, root))
			goto out;
	}

	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
	refcount_set(&root->tdp_mmu_root_count, 1);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

out:
	return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(kvm, slot, gfn);
	}
}

/**
 * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the new page
 * @account_nx: This page replaces a NX large page and should be marked for
 *		eventual reclaim.
 */
static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool account_nx)
{
	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
	if (account_nx)
		account_huge_nx_page(kvm, sp);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 */
static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				bool shared)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_del(&sp->link);
	if (sp->lpage_disallowed)
		unaccount_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
					bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	u64 old_child_spte;
	u64 *sptep;
	gfn_t gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_page(kvm, sp, shared);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		sptep = rcu_dereference(pt) + i;
		gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so
			 * retry the exchange until the value read back is
			 * something other than the removed SPTE value.
			 */
			for (;;) {
				old_child_spte = xchg(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_child_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_child_spte = READ_ONCE(*sptep);
			if (!is_shadow_present_pte(old_child_spte))
				continue;

			/*
			 * Marking the SPTE as a removed SPTE is not
			 * strictly necessary here as the MMU lock will
			 * stop other threads from concurrently modifying
			 * this SPTE. Using the removed SPTE value keeps
			 * the two branches consistent and simplifies
			 * the function.
			 */
			WRITE_ONCE(*sptep, REMOVED_SPTE);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_child_spte, REMOVED_SPTE, level,
				    shared);
	}

	kvm_flush_remote_tlbs_with_address(kvm, gfn,
					   KVM_PAGES_PER_HPAGE(level + 1));

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/**
 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level,
				  bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or removed SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) &&
			    !is_mmio_spte(new_spte) &&
			    !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (is_leaf != was_leaf)
		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.
	 */
	if (was_present && !was_leaf && (pfn_changed || !is_present))
		handle_removed_tdp_mmu_page(kvm,
				spte_to_child_pt(old_spte, level), shared);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
			      shared);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping. Do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Returns: true if the SPTE was set, false if it was not. If false is returned,
 *	    this function will have no side-effects.
 */
static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
					   struct tdp_iter *iter,
					   u64 new_spte)
{
	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Do not change removed SPTEs. Only the thread that froze the SPTE
	 * may modify it.
	 */
	if (is_removed_spte(iter->old_spte))
		return false;

	/*
	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
	 * does not hold the mmu_lock.
	 */
	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
		      new_spte) != iter->old_spte)
		return false;

	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, true);
	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);

	return true;
}

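/*
 * Illustrative caller pattern for tdp_mmu_set_spte_atomic() (a sketch of
 * what the walkers later in this file do): on failure the iterator's cached
 * old_spte is stale and must be explicitly re-read before retrying.
 *
 *	if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) {
 *		iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
 *		goto retry;
 *	}
 */
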
static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					   struct tdp_iter *iter)
{
	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value. This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
		return false;

	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
					   KVM_PAGES_PER_HPAGE(iter->level));

	/*
	 * No other thread can overwrite the removed SPTE as they
	 * must either wait on the MMU lock or use
	 * tdp_mmu_set_spte_atomic which will not overwrite the
	 * special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present
	 * to non-present.
	 */
	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);

	return true;
}

/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *		      of the page. Should be set unless handling an MMU
 *		      notifier for access tracking. Leaving record_acc_track
 *		      unset in that case prevents page accesses from being
 *		      double counted.
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *		      appropriate for the change being made. Should be set
 *		      unless performing certain dirty logging operations.
 *		      Leaving record_dirty_log unset in that case prevents page
 *		      writes from being double counted.
 */
static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				      u64 new_spte, bool record_acc_track,
				      bool record_dirty_log)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used. If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
	 */
	WARN_ON(is_removed_spte(iter->old_spte));

	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);

	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, false);
	if (record_acc_track)
		handle_changed_spte_acc_track(iter->old_spte, new_spte,
					      iter->level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
					      iter->old_spte, new_spte,
					      iter->level);
}

static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}

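/*
 * As used later in this file: age_gfn_range() goes through
 * tdp_mmu_set_spte_no_acc_track() so that clearing the accessed bit is not
 * itself counted as an access, and clear_dirty_pt_masked() goes through
 * tdp_mmu_set_spte_no_dirty_log() so that clearing dirty state is not
 * recorded as a new write in the dirty bitmap.
 */
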
#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)	\
		if (!is_shadow_present_pte(_iter.old_spte) ||	\
		    !is_last_spte(_iter.old_spte, _iter.level))	\
			continue;				\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
			 _mmu->shadow_root_level, _start, _end)

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, it will also reset the tdp_iter's walk over the
 * paging structure and the calling function should skip to the next
 * iteration to allow the iterator to continue its traversal from the
 * paging structure root.
 *
 * Return true if this function yielded and the iterator's traversal was reset.
 * Return false if a yield was not needed.
 */
static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
					     struct tdp_iter *iter, bool flush,
					     bool shared)
{
	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		rcu_read_unlock();

		if (flush)
			kvm_flush_remote_tlbs(kvm);

		if (shared)
			cond_resched_rwlock_read(&kvm->mmu_lock);
		else
			cond_resched_rwlock_write(&kvm->mmu_lock);

		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);

		tdp_iter_restart(iter);

		return true;
	}

	return false;
}

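/*
 * Illustrative loop shape for the walkers below (a sketch of how
 * zap_gfn_range() uses this helper): a yield restarts the iterator, so the
 * caller must skip to the next iteration rather than act on a stale entry.
 *
 *	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
 *				   min_level, start, end) {
 *		if (can_yield &&
 *		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
 *			flush = false;
 *			continue;
 *		}
 *		...
 *	}
 */
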
/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 *
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 *
 * If shared is true, this thread holds the MMU lock in read mode and must
 * account for the possibility that other threads are modifying the paging
 * structures concurrently. If shared is false, this thread should hold the
 * MMU lock in write mode.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush,
			  bool shared)
{
	gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	bool zap_all = (start == 0 && end >= max_gfn_host);
	struct tdp_iter iter;

	/*
	 * No need to try to step down in the iterator when zapping all SPTEs,
	 * zapping the top-level non-leaf SPTEs will recurse on their children.
	 */
	int min_level = zap_all ? root->role.level : PG_LEVEL_4K;

	/*
	 * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
	 * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
	 * and so KVM will never install a SPTE for such addresses.
	 */
	end = min(end, max_gfn_host);

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
retry:
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level, except when zapping all SPTEs.
		 */
		if (!zap_all &&
		    (iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		if (!shared) {
			tdp_mmu_set_spte(kvm, &iter, 0);
			flush = true;
		} else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
			/*
			 * The iter must explicitly re-read the SPTE because
			 * the atomic cmpxchg failed.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
			goto retry;
		}
	}

	rcu_read_unlock();
	return flush;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
				 gfn_t end, bool can_yield, bool flush)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
				      false);

	return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	bool flush = false;
	int i;

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush);

	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
						  struct kvm_mmu_page *prev_root)
{
	struct kvm_mmu_page *next_root;

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root && !(next_root->role.invalid &&
			      refcount_read(&next_root->tdp_mmu_root_count)))
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &next_root->link,
						  typeof(*next_root), link);

	return next_root;
}

/*
 * Since kvm_tdp_mmu_invalidate_all_roots() has acquired a reference to each
 * invalidated root, they will not be freed until this function drops the
 * reference. Before dropping that reference, tear down the paging
 * structure so that whichever thread does drop the last reference
 * only has to do a trivial amount of work. Since the roots are invalid,
 * no new SPTEs should be created under them.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *next_root;
	struct kvm_mmu_page *root;
	bool flush = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	rcu_read_lock();

	root = next_invalidated_root(kvm, NULL);

	while (root) {
		next_root = next_invalidated_root(kvm, root);

		rcu_read_unlock();

		flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true);

		/*
		 * Put the reference acquired in
		 * kvm_tdp_mmu_invalidate_all_roots().
		 */
		kvm_tdp_mmu_put_root(kvm, root, true);

		root = next_root;

		rcu_read_lock();
	}

	rcu_read_unlock();

	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

/*
 * Mark each TDP MMU root as invalid so that other threads
 * will drop their references and allow the root count to
 * go to 0.
 *
 * Also take a reference on all roots so that this thread
 * can do the bulk of the work required to free the roots
 * once they are invalidated. Without this reference, a
 * vCPU thread might drop the last reference to a root and
 * get stuck with tearing down the entire paging structure.
 *
 * Roots which have a zero refcount should be skipped as
 * they're already being torn down.
 * Already invalid roots should be referenced again so that
 * they aren't freed before kvm_tdp_mmu_zap_invalidated_roots()
 * is done with them.
 *
 * This has essentially the same effect for the TDP MMU
 * as updating mmu_valid_gen does for the shadow MMU.
 */
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
		if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
			root->role.invalid = true;
}

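/*
 * Illustrative caller sequence for the fast zap (a sketch of
 * kvm_mmu_zap_all_fast() in mmu.c): roots are invalidated with the MMU lock
 * held for write, then torn down with the lock held only for read.
 *
 *	write_lock(&kvm->mmu_lock);
 *	kvm_tdp_mmu_invalidate_all_roots(kvm);
 *	write_unlock(&kvm->mmu_lock);
 *	...
 *	read_lock(&kvm->mmu_lock);
 *	kvm_tdp_mmu_zap_invalidated_roots(kvm);
 *	read_unlock(&kvm->mmu_lock);
 */
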
/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
					   struct kvm_page_fault *fault,
					   struct tdp_iter *iter)
{
	u64 new_spte;
	int ret = RET_PF_FIXED;
	int make_spte_ret = 0;

	if (unlikely(is_noslot_pfn(fault->pfn)))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
					  fault->pfn, iter->old_spte, fault->prefault, true,
					  fault->map_writable, !shadow_accessed_mask,
					  &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
		if (fault->write)
			ret = RET_PF_EMULATE;
	}

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte))) {
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else {
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	/*
	 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
	 * consistent with legacy MMU behavior.
	 */
	if (ret != RET_PF_SPURIOUS)
		vcpu->stat.pf_fixed++;

	return ret;
}

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	u64 *child_pt;
	u64 new_spte;
	int ret;

	kvm_mmu_hugepage_adjust(vcpu, fault);

	trace_kvm_mmu_spte_requested(fault);

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
		if (fault->nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);

		if (iter.level == fault->goal_level)
			break;

		/*
		 * If there is an SPTE mapping a large page at a higher level
		 * than the target, that SPTE must be cleared and replaced
		 * with a non-leaf SPTE.
		 */
		if (is_shadow_present_pte(iter.old_spte) &&
		    is_large_pte(iter.old_spte)) {
			if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
				break;

			/*
			 * The iter must explicitly re-read the spte here
			 * because the new value informs the !present
			 * path below.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
		}

		if (!is_shadow_present_pte(iter.old_spte)) {
			/*
			 * If SPTE has been frozen by another thread, just
			 * give up and retry, avoiding unnecessary page table
			 * allocation and free.
			 */
			if (is_removed_spte(iter.old_spte))
				break;

			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1);
			child_pt = sp->spt;

			new_spte = make_nonleaf_spte(child_pt,
						     !shadow_accessed_mask);

			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) {
				tdp_mmu_link_page(vcpu->kvm, sp,
						  fault->huge_page_disallowed &&
						  fault->req_level >= iter.level);

				trace_kvm_mmu_get_page(sp, true);
			} else {
				tdp_mmu_free_sp(sp);
				break;
			}
		}
	}

	if (iter.level != fault->goal_level) {
		rcu_read_unlock();
		return RET_PF_RETRY;
	}

	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
	rcu_read_unlock();

	return ret;
}

bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
		flush |= zap_gfn_range(kvm, root, range->start, range->end,
				       range->may_block, flush, false);

	return flush;
}

typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
			      struct kvm_gfn_range *range);

static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
						   struct kvm_gfn_range *range,
						   tdp_handler_t handler)
{
	struct kvm_mmu_page *root;
	struct tdp_iter iter;
	bool ret = false;

	rcu_read_lock();

	/*
	 * Don't support rescheduling, none of the MMU notifiers that funnel
	 * into this helper allow blocking; it'd be dead, wasteful code.
	 */
	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
			ret |= handler(kvm, &iter, range);
	}

	rcu_read_unlock();

	return ret;
}

/*
 * Mark the SPTEs in the range of GFNs [start, end) unaccessed and return
 * true if any of the GFNs in the range have been accessed.
 */
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
			  struct kvm_gfn_range *range)
{
	u64 new_spte = 0;

	/* If we have a non-accessed entry we don't need to change the pte. */
	if (!is_accessed_spte(iter->old_spte))
		return false;

	new_spte = iter->old_spte;

	if (spte_ad_enabled(new_spte)) {
		new_spte &= ~shadow_accessed_mask;
	} else {
		/*
		 * Capture the dirty status of the page, so that it doesn't get
		 * lost when the SPTE is marked for access tracking.
		 */
		if (is_writable_pte(new_spte))
			kvm_set_pfn_dirty(spte_to_pfn(new_spte));

		new_spte = mark_spte_for_access_track(new_spte);
	}

	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);

	return true;
}

bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}

static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	return is_accessed_spte(iter->old_spte);
}

bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}

static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	u64 new_spte;

	/* Huge pages aren't expected to be modified without first being zapped. */
	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);

	if (iter->level != PG_LEVEL_4K ||
	    !is_shadow_present_pte(iter->old_spte))
		return false;

	/*
	 * Note, when changing a read-only SPTE, it's not strictly necessary to
	 * zero the SPTE before setting the new PFN, but doing so preserves the
	 * invariant that the PFN of a present leaf SPTE can never change.
	 * See __handle_changed_spte().
	 */
	tdp_mmu_set_spte(kvm, iter, 0);

	if (!pte_write(range->pte)) {
		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
								  pte_pfn(range->pte));

		tdp_mmu_set_spte(kvm, iter, new_spte);
	}

	return true;
}

/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
 */
bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);

	/* FIXME: return 'flush' instead of flushing here. */
	if (flush)
		kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);

	return false;
}

/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) {
			/*
			 * The iter must explicitly re-read the SPTE because
			 * the atomic cmpxchg failed.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
			goto retry;
		}
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
			     const struct kvm_memory_slot *slot, int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
					     slot->base_gfn + slot->npages, min_level);

	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) {
			/*
			 * The iter must explicitly re-read the SPTE because
			 * the atomic cmpxchg failed.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
			goto retry;
		}
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
				  const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
						  slot->base_gfn + slot->npages);

	return spte_set;
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	struct tdp_iter iter;
	u64 new_spte;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				    gfn + BITS_PER_LONG) {
		if (!mask)
			break;

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		mask &= ~(1UL << (iter.gfn - gfn));

		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
	}

	rcu_read_unlock();
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}

/*
 * Clear leaf entries which could be replaced by large mappings, for
 * GFNs within the slot.
 */
static bool zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       const struct kvm_memory_slot *slot,
				       bool flush)
{
	gfn_t start = slot->base_gfn;
	gfn_t end = start + slot->npages;
	struct tdp_iter iter;
	kvm_pfn_t pfn;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		pfn = spte_to_pfn(iter.old_spte);
		if (kvm_is_reserved_pfn(pfn) ||
		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
							    pfn, PG_LEVEL_NUM))
			continue;

		if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
			/*
			 * The iter must explicitly re-read the SPTE because
			 * the atomic cmpxchg failed.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
			goto retry;
		}
		flush = true;
	}

	rcu_read_unlock();

	return flush;
}

/*
 * Clear non-leaf entries (and free associated page tables) which could
 * be replaced by large mappings, for GFNs within the slot.
 */
bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       const struct kvm_memory_slot *slot,
				       bool flush)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		flush = zap_collapsible_spte_range(kvm, root, slot, flush);

	return flush;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, gfn, gfn + 1) {
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		if (!is_writable_pte(iter.old_spte))
			break;

		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);

		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();

	return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn,
				   int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);

	return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 *
 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->shadow_root_level;

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf] = iter.old_spte;
	}

	return leaf;
}

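/*
 * Illustrative usage of kvm_tdp_mmu_get_walk() (a sketch; the in-tree
 * caller is get_mmio_spte() in mmu.c, which brackets the walk via
 * walk_shadow_page_lockless_begin/end()):
 *
 *	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
 *	int leaf, root_level;
 *
 *	kvm_tdp_mmu_walk_lockless_begin();
 *	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
 *	kvm_tdp_mmu_walk_lockless_end();
 */
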
/*
 * Returns the last level spte pointer of the shadow page walk for the given
 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
 * walk could be performed, returns NULL and *spte does not contain valid data.
 *
 * Contract:
 *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
 *
 * WARNING: This function is only intended to be called during fast_page_fault.
 */
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
					u64 *spte)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	tdp_ptep_t sptep = NULL;

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		*spte = iter.old_spte;
		sptep = iter.sptep;
	}

	/*
	 * Perform the rcu_dereference to get the raw spte pointer value since
	 * we are passing it up to fast_page_fault, which is shared with the
	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
	 * annotation.
	 *
	 * This is safe since fast_page_fault obeys the contracts of this
	 * function as well as all TDP MMU contracts around modifying SPTEs
	 * outside of mmu_lock.
	 */
	return rcu_dereference(sptep);
}