// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

static bool __read_mostly tdp_mmu_enabled = false;

static bool is_tdp_mmu_enabled(void)
{
#ifdef CONFIG_X86_64
	return tdp_enabled && READ_ONCE(tdp_mmu_enabled);
#else
	return false;
#endif /* CONFIG_X86_64 */
}

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	if (!is_tdp_mmu_enabled())
		return;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;

	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}

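/*
 * Tears down the VM's TDP MMU state. By this point all TDP MMU roots should
 * already have been freed, hence the WARN on a non-empty root list.
 */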
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
}

#define for_each_tdp_mmu_root(_kvm, _root)				\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)

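/* Returns true if the root page table at @hpa is managed by the TDP MMU. */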
bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
{
	struct kvm_mmu_page *sp;

	sp = to_shadow_page(hpa);

	return sp->tdp_mmu_page && sp->root_count;
}

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end);

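/*
 * Frees a TDP MMU root: unlinks it from the VM's root list, zaps every SPTE
 * beneath it, and releases the pages backing the root itself.
 */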
void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);

	lockdep_assert_held(&kvm->mmu_lock);

	WARN_ON(root->root_count);
	WARN_ON(!root->tdp_mmu_page);

	list_del(&root->link);

	zap_gfn_range(kvm, root, 0, max_gfn);

	free_page((unsigned long)root->spt);
	kmem_cache_free(mmu_page_header_cache, root);
}

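/*
 * Computes the kvm_mmu_page_role used for TDP MMU shadow pages at the given
 * level. TDP MMU pages always map guest memory with ACC_ALL permissions.
 */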
static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
						   int level)
{
	union kvm_mmu_page_role role;

	role = vcpu->arch.mmu->mmu_role.base;
	role.level = level;
	role.direct = true;
	role.gpte_is_8_bytes = true;
	role.access = ACC_ALL;

	return role;
}

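/*
 * Allocates a TDP MMU page table page from the vCPU's memory caches and
 * links the backing struct page back to its kvm_mmu_page via page_private.
 */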
static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
					       int level)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role.word = page_role_for_level(vcpu, level).word;
	sp->gfn = gfn;
	sp->tdp_mmu_page = true;

	return sp;
}

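/*
 * Returns the root page table for the vCPU, reusing an existing root with a
 * matching role when one exists and allocating a new one otherwise.
 */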
static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

	spin_lock(&kvm->mmu_lock);

	/* Check for an existing root before allocating a new one. */
	for_each_tdp_mmu_root(kvm, root) {
		if (root->role.word == role.word) {
			kvm_mmu_get_root(kvm, root);
			spin_unlock(&kvm->mmu_lock);
			return root;
		}
	}

	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
	root->root_count = 1;

	list_add(&root->link, &kvm->arch.tdp_mmu_roots);

	spin_unlock(&kvm->mmu_lock);

	return root;
}

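/*
 * Returns the physical address of the vCPU's TDP MMU root. The caller is
 * expected to load this into mmu->root_hpa, from which it eventually reaches
 * the hardware paging structure pointer (EPTP/nCR3).
 */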
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *root;

	root = get_tdp_mmu_vcpu_root(vcpu);

	return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level);

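/* Returns the address space ID (non-SMM vs. SMM) of the given MMU page. */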
static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
{
	return sp->role.smm ? 1 : 0;
}

/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
	u64 *pt;
	struct kvm_mmu_page *sp;
	u64 old_child_spte;
	int i;

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn % KVM_PAGES_PER_HPAGE(level));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE, it is
		 * unexpected. Log the change, though it should not impact the
		 * guest since both the former and current SPTEs are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

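	/*
	 * Mark the backing page dirty when a dirty leaf SPTE is removed,
	 * loses its dirty bit, or is changed to map a different PFN.
	 */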
	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.
	 */
	if (was_present && !was_leaf && (pfn_changed || !is_present)) {
		pt = spte_to_child_pt(old_spte, level);
		sp = sptep_to_sp(pt);

		list_del(&sp->link);

		for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
			old_child_spte = READ_ONCE(*(pt + i));
			WRITE_ONCE(*(pt + i), 0);
			handle_changed_spte(kvm, as_id,
				gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)),
				old_child_spte, 0, level - 1);
		}

		kvm_flush_remote_tlbs_with_address(kvm, gfn,
						   KVM_PAGES_PER_HPAGE(level));

		free_page((unsigned long)pt);
		kmem_cache_free(mmu_page_header_cache, sp);
	}
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level);
}

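/*
 * Writes the new SPTE value and performs the bookkeeping the change implies.
 * All TDP MMU SPTE modifications should funnel through this helper so that
 * handle_changed_spte() is always called.
 */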
static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	u64 *root_pt = tdp_iter_root_pt(iter);
	struct kvm_mmu_page *root = sptep_to_sp(root_pt);
	int as_id = kvm_mmu_page_as_id(root);

	*iter->sptep = new_spte;

	handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
			    iter->level);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end)		\
	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)			\
	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),			\
			 _mmu->shadow_root_level, _start, _end)

/*
 * Flush the TLB and reschedule if the MMU lock is contended or this thread
 * needs to yield the CPU. Returns whether the caller still needs to flush
 * the TLB, i.e. false if this helper already flushed it.
 */
static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
{
	if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
		kvm_flush_remote_tlbs(kvm);
		cond_resched_lock(&kvm->mmu_lock);
		tdp_iter_refresh_walk(iter);
		return false;
	} else {
		return true;
	}
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	bool flush_needed = false;

	tdp_root_for_each_pte(iter, root, start, end) {
		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level.
		 */
		if ((iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);

		flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter);
	}

	return flush_needed;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), for every TDP
 * MMU root and frees the non-root pages mapping GFNs strictly within that
 * range. Returns true if SPTEs have been cleared and a TLB flush is needed
 * before releasing the MMU lock.
 */
bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
{
	struct kvm_mmu_page *root;
	bool flush = false;

	for_each_tdp_mmu_root(kvm, root) {
		/*
		 * Take a reference on the root so that it cannot be freed if
		 * this thread releases the MMU lock and yields in this loop.
		 */
		kvm_mmu_get_root(kvm, root);

		flush |= zap_gfn_range(kvm, root, start, end);

		kvm_mmu_put_root(kvm, root);
	}

	return flush;
}

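/*
 * Zaps every SPTE in every TDP MMU root, covering the entire range of guest
 * physical addresses the host could map.
 */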
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);
	bool flush;

	flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
					   int map_writable,
					   struct tdp_iter *iter,
					   kvm_pfn_t pfn, bool prefault)
{
	u64 new_spte;
	int ret = 0;
	int make_spte_ret = 0;

	if (unlikely(is_noslot_pfn(pfn))) {
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
		trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte);
	} else
		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
					  pfn, iter->old_spte, prefault, true,
					  map_writable, !shadow_accessed_mask,
					  &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else
		tdp_mmu_set_spte(vcpu->kvm, iter, new_spte);

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
		if (write)
			ret = RET_PF_EMULATE;
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
	}

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte)))
		ret = RET_PF_EMULATE;

	trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep);
	if (!prefault)
		vcpu->stat.pf_fixed++;

	return ret;
}

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
		    int map_writable, int max_level, kvm_pfn_t pfn,
		    bool prefault)
{
	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
	bool write = error_code & PFERR_WRITE_MASK;
	bool exec = error_code & PFERR_FETCH_MASK;
	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	u64 *child_pt;
	u64 new_spte;
	int ret;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int level;
	int req_level;

	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;
	if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;

	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
					huge_page_disallowed, &req_level);

	trace_kvm_mmu_spte_requested(gpa, level, pfn);
	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		if (nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(iter.old_spte, gfn,
						   iter.level, &pfn, &level);

		if (iter.level == level)
			break;

		/*
		 * If there is an SPTE mapping a large page at a higher level
		 * than the target, that SPTE must be cleared and replaced
		 * with a non-leaf SPTE.
		 */
		if (is_shadow_present_pte(iter.old_spte) &&
		    is_large_pte(iter.old_spte)) {
			tdp_mmu_set_spte(vcpu->kvm, &iter, 0);

			kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
					KVM_PAGES_PER_HPAGE(iter.level));

			/*
			 * The iter must explicitly re-read the spte here
			 * because the new value informs the !present
			 * path below.
			 */
			iter.old_spte = READ_ONCE(*iter.sptep);
		}

		if (!is_shadow_present_pte(iter.old_spte)) {
			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
			list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages);
			child_pt = sp->spt;
			clear_page(child_pt);
			new_spte = make_nonleaf_spte(child_pt,
						     !shadow_accessed_mask);

			trace_kvm_mmu_get_page(sp, true);
			tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte);
		}
	}

	if (WARN_ON(iter.level != level))
		return RET_PF_RETRY;

	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
					      pfn, prefault);

	return ret;
}

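/*
 * Call-site sketch (not part of this file): the common MMU fault handler in
 * mmu.c is expected to dispatch TDP faults here roughly as follows, assuming
 * it holds mmu_lock and has already resolved the faulting pfn:
 *
 *	if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
 *		r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable,
 *				    max_level, pfn, prefault);
 */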