arch/x86/kvm/mmu/mmu.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Kernel-based Virtual Machine driver for Linux
   4  *
   5  * This module enables machines with Intel VT-x extensions to run virtual
   6  * machines without emulation or binary translation.
   7  *
   8  * MMU support
   9  *
  10  * Copyright (C) 2006 Qumranet, Inc.
  11  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  12  *
  13  * Authors:
  14  *   Yaniv Kamay  <yaniv@qumranet.com>
  15  *   Avi Kivity   <avi@qumranet.com>
  16  */
  17 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  18
  19 #include "irq.h"
  20 #include "ioapic.h"
  21 #include "mmu.h"
  22 #include "mmu_internal.h"
  23 #include "tdp_mmu.h"
  24 #include "x86.h"
  25 #include "kvm_cache_regs.h"
  26 #include "smm.h"
  27 #include "kvm_emulate.h"
  28 #include "cpuid.h"
  29 #include "spte.h"
  30
  31 #include <linux/kvm_host.h>
  32 #include <linux/types.h>
  33 #include <linux/string.h>
  34 #include <linux/mm.h>
  35 #include <linux/highmem.h>
  36 #include <linux/moduleparam.h>
  37 #include <linux/export.h>
  38 #include <linux/swap.h>
  39 #include <linux/hugetlb.h>
  40 #include <linux/compiler.h>
  41 #include <linux/srcu.h>
  42 #include <linux/slab.h>
  43 #include <linux/sched/signal.h>
  44 #include <linux/uaccess.h>
  45 #include <linux/hash.h>
  46 #include <linux/kern_levels.h>
  47 #include <linux/kstrtox.h>
  48 #include <linux/kthread.h>
  49
  50 #include <asm/page.h>
  51 #include <asm/memtype.h>
  52 #include <asm/cmpxchg.h>
  53 #include <asm/io.h>
  54 #include <asm/set_memory.h>
  55 #include <asm/vmx.h>
  56 #include <asm/kvm_page_track.h>
  57 #include "trace.h"
  58
/* Defined outside this file. */
extern bool itlb_multihit_kvm_mitigation;

/*
 * Backing storage for the "nx_huge_pages" module parameter registered below.
 * NOTE(review): the initial -1 presumably means "not resolved yet" -- confirm
 * against the code that consumes it.
 */
int __read_mostly nx_huge_pages = -1;
/* Backing storage for the "nx_huge_pages_recovery_period_ms" parameter. */
static uint __read_mostly nx_huge_pages_recovery_period_ms;
#ifdef CONFIG_PREEMPT_RT
/* Recovery can cause latency spikes, disable it for PREEMPT_RT.  */
static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
#else
static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
#endif
  69
/* Custom setters for the NX huge page parameters; defined later in the file. */
static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp);

/* "nx_huge_pages": custom setter, read back through the generic bool getter. */
static const struct kernel_param_ops nx_huge_pages_ops = {
        .set = set_nx_huge_pages,
        .get = param_get_bool,
};

/* Shared by both recovery knobs: custom setter, generic uint getter. */
static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = {
        .set = set_nx_huge_pages_recovery_param,
        .get = param_get_uint,
};

/* Register the three parameters (mode 0644) and advertise their types. */
module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
__MODULE_PARM_TYPE(nx_huge_pages, "bool");
module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops,
                &nx_huge_pages_recovery_ratio, 0644);
__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops,
                &nx_huge_pages_recovery_period_ms, 0644);
__MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint");
  91
/* Exposed as the "flush_on_reuse" module parameter (mode 0644). */
static bool __read_mostly force_flush_and_sync_on_reuse;
module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);

/*
 * Setting this variable to true enables Two-Dimensional Paging (TDP), where
 * the hardware walks two sets of page tables:
 * 1. guest-virtual to guest-physical
 * 2. while doing 1., guest-physical to host-physical
 * If the hardware supports that, shadow paging is not needed.
 */
bool tdp_enabled = false;

static bool __ro_after_init tdp_mmu_allowed;

#ifdef CONFIG_X86_64
/* Exposed read-only (mode 0444) as the "tdp_mmu" module parameter. */
bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444);
#endif

static int max_huge_page_level __read_mostly;
static int tdp_root_level __read_mostly;
static int max_tdp_level __read_mostly;

#ifdef MMU_DEBUG
/* Debug switch, only built (and exposed as "dbg") when MMU_DEBUG is defined. */
bool dbg = 0;
module_param(dbg, bool, 0644);
#endif
 119
 120 #define PTE_PREFETCH_NUM                8
 121
 122 #include <trace/events/kvm.h>
 123
/* make pte_list_desc fit well in cache lines */
#define PTE_LIST_EXT 14

/*
 * struct pte_list_desc is the core data structure used to implement a custom
 * list for tracking a set of related SPTEs, e.g. all the SPTEs that map a
 * given GFN when used in the context of rmaps.  Using a custom list allows KVM
 * to optimize for the common case where many GFNs will have at most a handful
 * of SPTEs pointing at them, i.e. allows packing multiple SPTEs into a small
 * memory footprint, which in turn improves runtime performance by exploiting
 * cache locality.
 *
 * A list is comprised of one or more pte_list_desc objects (descriptors).
 * Each individual descriptor stores up to PTE_LIST_EXT SPTEs.  If a descriptor
 * is full and a new SPTE needs to be added, a new descriptor is allocated and
 * becomes the head of the list.  This means that, by definition, all tail
 * descriptors are full.
 *
 * Note, the metadata fields are deliberately placed at the start of the
 * structure to optimize the cacheline layout; accessing the descriptor will
 * touch only a single cacheline so long as @spte_count<=6 (or if only the
 * descriptor's metadata is accessed).
 */
struct pte_list_desc {
        /* Next descriptor in the list. */
        struct pte_list_desc *more;
        /* The number of PTEs stored in _this_ descriptor. */
        u32 spte_count;
        /* The number of PTEs stored in all tails of this descriptor. */
        u32 tail_count;
        u64 *sptes[PTE_LIST_EXT];
};
 155
 156 struct kvm_shadow_walk_iterator {
 157         u64 addr;
 158         hpa_t shadow_addr;
 159         u64 *sptep;
 160         int level;
 161         unsigned index;
 162 };
 163
/*
 * Iterate @_walker over the shadow entries for @_addr, starting the walk at
 * the caller-supplied @_root rather than at the vCPU's current root.
 */
#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)     \
        for (shadow_walk_init_using_root(&(_walker), (_vcpu),              \
                                         (_root), (_addr));                \
             shadow_walk_okay(&(_walker));                                 \
             shadow_walk_next(&(_walker)))
 169
 170 #define for_each_shadow_entry(_vcpu, _addr, _walker)            \
 171         for (shadow_walk_init(&(_walker), _vcpu, _addr);        \
 172              shadow_walk_okay(&(_walker));                      \
 173              shadow_walk_next(&(_walker)))
 174
 175 #define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)     \
 176         for (shadow_walk_init(&(_walker), _vcpu, _addr);                \
 177              shadow_walk_okay(&(_walker)) &&                            \
 178                 ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });  \
 179              __shadow_walk_next(&(_walker), spte))
 180
/* Slab cache that pte_list_desc objects are returned to when freed. */
static struct kmem_cache *pte_list_desc_cache;
struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;

/* Forward declaration; used by mark_mmio_spte() ahead of the definition. */
static void mmu_spte_set(u64 *sptep, u64 spte);
 186
/*
 * Immutable snapshot of the register state an MMU role is derived from.  See
 * vcpu_to_role_regs(), which fills it from the vCPU's role-relevant CR0/CR4
 * bits and EFER.
 */
struct kvm_mmu_role_regs {
        const unsigned long cr0;
        const unsigned long cr4;
        const u64 efer;
};
 192
 193 #define CREATE_TRACE_POINTS
 194 #include "mmutrace.h"
 195
/*
 * Yes, lots of underscores.  They're a hint that you probably shouldn't be
 * reading from the role_regs.  Once the root_role is constructed, it becomes
 * the single source of truth for the MMU's state.
 *
 * Each instantiation defines ____is_<reg>_<name>(regs), which returns true if
 * @flag is set in regs-><reg>.
 */
#define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag)                   \
static inline bool __maybe_unused                                       \
____is_##reg##_##name(const struct kvm_mmu_role_regs *regs)             \
{                                                                       \
        return !!(regs->reg & flag);                                    \
}
BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
 217
/*
 * The MMU itself (with a valid role) is the single source of truth for the
 * MMU.  Do not use the regs used to build the MMU/role, nor the vCPU.  The
 * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1,
 * and the vCPU may be incorrect/irrelevant.
 *
 * Each instantiation defines is_<reg>_<name>(mmu), which reports the
 * <reg>_<name> bit stored in mmu->cpu_role.<base_or_ext>.
 */
#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name)         \
static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu)        \
{                                                               \
        return !!(mmu->cpu_role. base_or_ext . reg##_##name);   \
}
BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pse);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smep);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smap);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pke);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, la57);
BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
BUILD_MMU_ROLE_ACCESSOR(ext,  efer, lma);
 237
 238 static inline bool is_cr0_pg(struct kvm_mmu *mmu)
 239 {
 240         return mmu->cpu_role.base.level > 0;
 241 }
 242
 243 static inline bool is_cr4_pae(struct kvm_mmu *mmu)
 244 {
 245         return !mmu->cpu_role.base.has_4_byte_gpte;
 246 }
 247
 248 static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
 249 {
 250         struct kvm_mmu_role_regs regs = {
 251                 .cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS),
 252                 .cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS),
 253                 .efer = vcpu->arch.efer,
 254         };
 255
 256         return regs;
 257 }
 258
 259 static inline bool kvm_available_flush_tlb_with_range(void)
 260 {
 261         return kvm_x86_ops.tlb_remote_flush_with_range;
 262 }
 263
 264 void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn,
 265                                  gfn_t nr_pages)
 266 {
 267         struct kvm_tlb_range range;
 268         int ret = -EOPNOTSUPP;
 269
 270         range.start_gfn = start_gfn;
 271         range.pages = nr_pages;
 272
 273         if (kvm_x86_ops.tlb_remote_flush_with_range)
 274                 ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, &range);
 275
 276         if (ret)
 277                 kvm_flush_remote_tlbs(kvm);
 278 }
 279
 280 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
 281
 282 /* Flush the range of guest memory mapped by the given SPTE. */
 283 static void kvm_flush_remote_tlbs_sptep(struct kvm *kvm, u64 *sptep)
 284 {
 285         struct kvm_mmu_page *sp = sptep_to_sp(sptep);
 286         gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(sptep));
 287
 288         kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level);
 289 }
 290
 291 static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
 292                            unsigned int access)
 293 {
 294         u64 spte = make_mmio_spte(vcpu, gfn, access);
 295
 296         trace_mark_mmio_spte(sptep, gfn, spte);
 297         mmu_spte_set(sptep, spte);
 298 }
 299
 300 static gfn_t get_mmio_spte_gfn(u64 spte)
 301 {
 302         u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
 303
 304         gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
 305                & shadow_nonpresent_or_rsvd_mask;
 306
 307         return gpa >> PAGE_SHIFT;
 308 }
 309
 310 static unsigned get_mmio_spte_access(u64 spte)
 311 {
 312         return spte & shadow_mmio_access_mask;
 313 }
 314
 315 static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
 316 {
 317         u64 kvm_gen, spte_gen, gen;
 318
 319         gen = kvm_vcpu_memslots(vcpu)->generation;
 320         if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
 321                 return false;
 322
 323         kvm_gen = gen & MMIO_SPTE_GEN_MASK;
 324         spte_gen = get_mmio_spte_generation(spte);
 325
 326         trace_check_mmio_spte(spte, kvm_gen, spte_gen);
 327         return likely(kvm_gen == spte_gen);
 328 }
 329
 330 static int is_cpuid_PSE36(void)
 331 {
 332         return 1;
 333 }
 334
 335 #ifdef CONFIG_X86_64
/* 64-bit build: the SPTE is stored with a single WRITE_ONCE(). */
static void __set_spte(u64 *sptep, u64 spte)
{
        WRITE_ONCE(*sptep, spte);
}
 340
 341 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
 342 {
 343         WRITE_ONCE(*sptep, spte);
 344 }
 345
 346 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
 347 {
 348         return xchg(sptep, spte);
 349 }
 350
 351 static u64 __get_spte_lockless(u64 *sptep)
 352 {
 353         return READ_ONCE(*sptep);
 354 }
 355 #else
/*
 * View of a 64-bit SPTE as two 32-bit halves, used by the non-64-bit helpers
 * below, which access the low and high halves separately.
 */
union split_spte {
        struct {
                u32 spte_low;
                u32 spte_high;
        };
        u64 spte;
};
 363
 364 static void count_spte_clear(u64 *sptep, u64 spte)
 365 {
 366         struct kvm_mmu_page *sp =  sptep_to_sp(sptep);
 367
 368         if (is_shadow_present_pte(spte))
 369                 return;
 370
 371         /* Ensure the spte is completely set before we increase the count */
 372         smp_wmb();
 373         sp->clear_spte_count++;
 374 }
 375
 376 static void __set_spte(u64 *sptep, u64 spte)
 377 {
 378         union split_spte *ssptep, sspte;
 379
 380         ssptep = (union split_spte *)sptep;
 381         sspte = (union split_spte)spte;
 382
 383         ssptep->spte_high = sspte.spte_high;
 384
 385         /*
 386          * If we map the spte from nonpresent to present, We should store
 387          * the high bits firstly, then set present bit, so cpu can not
 388          * fetch this spte while we are setting the spte.
 389          */
 390         smp_wmb();
 391
 392         WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
 393 }
 394
 395 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
 396 {
 397         union split_spte *ssptep, sspte;
 398
 399         ssptep = (union split_spte *)sptep;
 400         sspte = (union split_spte)spte;
 401
 402         WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
 403
 404         /*
 405          * If we map the spte from present to nonpresent, we should clear
 406          * present bit firstly to avoid vcpu fetch the old high bits.
 407          */
 408         smp_wmb();
 409
 410         ssptep->spte_high = sspte.spte_high;
 411         count_spte_clear(sptep, spte);
 412 }
 413
 414 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
 415 {
 416         union split_spte *ssptep, sspte, orig;
 417
 418         ssptep = (union split_spte *)sptep;
 419         sspte = (union split_spte)spte;
 420
 421         /* xchg acts as a barrier before the setting of the high bits */
 422         orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
 423         orig.spte_high = ssptep->spte_high;
 424         ssptep->spte_high = sspte.spte_high;
 425         count_spte_clear(sptep, spte);
 426
 427         return orig.spte;
 428 }
 429
 430 /*
 431  * The idea using the light way get the spte on x86_32 guest is from
 432  * gup_get_pte (mm/gup.c).
 433  *
 434  * An spte tlb flush may be pending, because kvm_set_pte_rmap
 435  * coalesces them and we are running out of the MMU lock.  Therefore
 436  * we need to protect against in-progress updates of the spte.
 437  *
 438  * Reading the spte while an update is in progress may get the old value
 439  * for the high part of the spte.  The race is fine for a present->non-present
 440  * change (because the high part of the spte is ignored for non-present spte),
 441  * but for a present->present change we must reread the spte.
 442  *
 443  * All such changes are done in two steps (present->non-present and
 444  * non-present->present), hence it is enough to count the number of
 445  * present->non-present updates: if it changed while reading the spte,
 446  * we might have hit the race.  This is done using clear_spte_count.
 447  */
 448 static u64 __get_spte_lockless(u64 *sptep)
 449 {
 450         struct kvm_mmu_page *sp =  sptep_to_sp(sptep);
 451         union split_spte spte, *orig = (union split_spte *)sptep;
 452         int count;
 453
 454 retry:
 455         count = sp->clear_spte_count;
 456         smp_rmb();
 457
 458         spte.spte_low = orig->spte_low;
 459         smp_rmb();
 460
 461         spte.spte_high = orig->spte_high;
 462         smp_rmb();
 463
 464         if (unlikely(spte.spte_low != orig->spte_low ||
 465               count != sp->clear_spte_count))
 466                 goto retry;
 467
 468         return spte.spte;
 469 }
 470 #endif
 471
 472 /* Rules for using mmu_spte_set:
 473  * Set the sptep from nonpresent to present.
 474  * Note: the sptep being assigned *must* be either not present
 475  * or in a state where the hardware will not attempt to update
 476  * the spte.
 477  */
 478 static void mmu_spte_set(u64 *sptep, u64 new_spte)
 479 {
 480         WARN_ON(is_shadow_present_pte(*sptep));
 481         __set_spte(sptep, new_spte);
 482 }
 483
 484 /*
 485  * Update the SPTE (excluding the PFN), but do not track changes in its
 486  * accessed/dirty status.
 487  */
 488 static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
 489 {
 490         u64 old_spte = *sptep;
 491
 492         WARN_ON(!is_shadow_present_pte(new_spte));
 493         check_spte_writable_invariants(new_spte);
 494
 495         if (!is_shadow_present_pte(old_spte)) {
 496                 mmu_spte_set(sptep, new_spte);
 497                 return old_spte;
 498         }
 499
 500         if (!spte_has_volatile_bits(old_spte))
 501                 __update_clear_spte_fast(sptep, new_spte);
 502         else
 503                 old_spte = __update_clear_spte_slow(sptep, new_spte);
 504
 505         WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
 506
 507         return old_spte;
 508 }
 509
 510 /* Rules for using mmu_spte_update:
 511  * Update the state bits, it means the mapped pfn is not changed.
 512  *
 513  * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
 514  * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only
 515  * spte, even though the writable spte might be cached on a CPU's TLB.
 516  *
 517  * Returns true if the TLB needs to be flushed
 518  */
 519 static bool mmu_spte_update(u64 *sptep, u64 new_spte)
 520 {
 521         bool flush = false;
 522         u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
 523
 524         if (!is_shadow_present_pte(old_spte))
 525                 return false;
 526
 527         /*
 528          * For the spte updated out of mmu-lock is safe, since
 529          * we always atomically update it, see the comments in
 530          * spte_has_volatile_bits().
 531          */
 532         if (is_mmu_writable_spte(old_spte) &&
 533               !is_writable_pte(new_spte))
 534                 flush = true;
 535
 536         /*
 537          * Flush TLB when accessed/dirty states are changed in the page tables,
 538          * to guarantee consistency between TLB and page tables.
 539          */
 540
 541         if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
 542                 flush = true;
 543                 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
 544         }
 545
 546         if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
 547                 flush = true;
 548                 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
 549         }
 550
 551         return flush;
 552 }
 553
 554 /*
 555  * Rules for using mmu_spte_clear_track_bits:
 556  * It sets the sptep from present to nonpresent, and track the
 557  * state bits, it is used to clear the last level sptep.
 558  * Returns the old PTE.
 559  */
 560 static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
 561 {
 562         kvm_pfn_t pfn;
 563         u64 old_spte = *sptep;
 564         int level = sptep_to_sp(sptep)->role.level;
 565         struct page *page;
 566
 567         if (!is_shadow_present_pte(old_spte) ||
 568             !spte_has_volatile_bits(old_spte))
 569                 __update_clear_spte_fast(sptep, 0ull);
 570         else
 571                 old_spte = __update_clear_spte_slow(sptep, 0ull);
 572
 573         if (!is_shadow_present_pte(old_spte))
 574                 return old_spte;
 575
 576         kvm_update_page_stats(kvm, level, -1);
 577
 578         pfn = spte_to_pfn(old_spte);
 579
 580         /*
 581          * KVM doesn't hold a reference to any pages mapped into the guest, and
 582          * instead uses the mmu_notifier to ensure that KVM unmaps any pages
 583          * before they are reclaimed.  Sanity check that, if the pfn is backed
 584          * by a refcounted page, the refcount is elevated.
 585          */
 586         page = kvm_pfn_to_refcounted_page(pfn);
 587         WARN_ON(page && !page_count(page));
 588
 589         if (is_accessed_spte(old_spte))
 590                 kvm_set_pfn_accessed(pfn);
 591
 592         if (is_dirty_spte(old_spte))
 593                 kvm_set_pfn_dirty(pfn);
 594
 595         return old_spte;
 596 }
 597
 598 /*
 599  * Rules for using mmu_spte_clear_no_track:
 600  * Directly clear spte without caring the state bits of sptep,
 601  * it is used to set the upper level spte.
 602  */
 603 static void mmu_spte_clear_no_track(u64 *sptep)
 604 {
 605         __update_clear_spte_fast(sptep, 0ull);
 606 }
 607
 608 static u64 mmu_spte_get_lockless(u64 *sptep)
 609 {
 610         return __get_spte_lockless(sptep);
 611 }
 612
 613 /* Returns the Accessed status of the PTE and resets it at the same time. */
 614 static bool mmu_spte_age(u64 *sptep)
 615 {
 616         u64 spte = mmu_spte_get_lockless(sptep);
 617
 618         if (!is_accessed_spte(spte))
 619                 return false;
 620
 621         if (spte_ad_enabled(spte)) {
 622                 clear_bit((ffs(shadow_accessed_mask) - 1),
 623                           (unsigned long *)sptep);
 624         } else {
 625                 /*
 626                  * Capture the dirty status of the page, so that it doesn't get
 627                  * lost when the SPTE is marked for access tracking.
 628                  */
 629                 if (is_writable_pte(spte))
 630                         kvm_set_pfn_dirty(spte_to_pfn(spte));
 631
 632                 spte = mark_spte_for_access_track(spte);
 633                 mmu_spte_update_no_track(sptep, spte);
 634         }
 635
 636         return true;
 637 }
 638
 639 static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu)
 640 {
 641         return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct;
 642 }
 643
 644 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
 645 {
 646         if (is_tdp_mmu_active(vcpu)) {
 647                 kvm_tdp_mmu_walk_lockless_begin();
 648         } else {
 649                 /*
 650                  * Prevent page table teardown by making any free-er wait during
 651                  * kvm_flush_remote_tlbs() IPI to all active vcpus.
 652                  */
 653                 local_irq_disable();
 654
 655                 /*
 656                  * Make sure a following spte read is not reordered ahead of the write
 657                  * to vcpu->mode.
 658                  */
 659                 smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
 660         }
 661 }
 662
 663 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
 664 {
 665         if (is_tdp_mmu_active(vcpu)) {
 666                 kvm_tdp_mmu_walk_lockless_end();
 667         } else {
 668                 /*
 669                  * Make sure the write to vcpu->mode is not reordered in front of
 670                  * reads to sptes.  If it does, kvm_mmu_commit_zap_page() can see us
 671                  * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
 672                  */
 673                 smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
 674                 local_irq_enable();
 675         }
 676 }
 677
 678 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
 679 {
 680         int r;
 681
 682         /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
 683         r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
 684                                        1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
 685         if (r)
 686                 return r;
 687         r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
 688                                        PT64_ROOT_MAX_LEVEL);
 689         if (r)
 690                 return r;
 691         if (maybe_indirect) {
 692                 r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache,
 693                                                PT64_ROOT_MAX_LEVEL);
 694                 if (r)
 695                         return r;
 696         }
 697         return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
 698                                           PT64_ROOT_MAX_LEVEL);
 699 }
 700
/* Free the four per-vCPU caches that mmu_topup_memory_caches() tops up. */
static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
        kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
        kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
        kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
        kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
}
 708
/* Return a descriptor to pte_list_desc_cache. */
static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
{
        kmem_cache_free(pte_list_desc_cache, pte_list_desc);
}
 713
 714 static bool sp_has_gptes(struct kvm_mmu_page *sp);
 715
 716 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
 717 {
 718         if (sp->role.passthrough)
 719                 return sp->gfn;
 720
 721         if (!sp->role.direct)
 722                 return sp->shadowed_translation[index] >> PAGE_SHIFT;
 723
 724         return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS));
 725 }
 726
 727 /*
 728  * For leaf SPTEs, fetch the *guest* access permissions being shadowed. Note
 729  * that the SPTE itself may have a more constrained access permissions that
 730  * what the guest enforces. For example, a guest may create an executable
 731  * huge PTE but KVM may disallow execution to mitigate iTLB multihit.
 732  */
 733 static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
 734 {
 735         if (sp_has_gptes(sp))
 736                 return sp->shadowed_translation[index] & ACC_ALL;
 737
 738         /*
 739          * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs,
 740          * KVM is not shadowing any guest page tables, so the "guest access
 741          * permissions" are just ACC_ALL.
 742          *
 743          * For direct SPs in indirect MMUs (shadow paging), i.e. when KVM
 744          * is shadowing a guest huge page with small pages, the guest access
 745          * permissions being shadowed are the access permissions of the huge
 746          * page.
 747          *
 748          * In both cases, sp->role.access contains the correct access bits.
 749          */
 750         return sp->role.access;
 751 }
 752
 753 static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index,
 754                                          gfn_t gfn, unsigned int access)
 755 {
 756         if (sp_has_gptes(sp)) {
 757                 sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access;
 758                 return;
 759         }
 760
 761         WARN_ONCE(access != kvm_mmu_page_get_access(sp, index),
 762                   "access mismatch under %s page %llx (expected %u, got %u)\n",
 763                   sp->role.passthrough ? "passthrough" : "direct",
 764                   sp->gfn, kvm_mmu_page_get_access(sp, index), access);
 765
 766         WARN_ONCE(gfn != kvm_mmu_page_get_gfn(sp, index),
 767                   "gfn mismatch under %s page %llx (expected %llx, got %llx)\n",
 768                   sp->role.passthrough ? "passthrough" : "direct",
 769                   sp->gfn, kvm_mmu_page_get_gfn(sp, index), gfn);
 770 }
 771
 772 static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index,
 773                                     unsigned int access)
 774 {
 775         gfn_t gfn = kvm_mmu_page_get_gfn(sp, index);
 776
 777         kvm_mmu_page_set_translation(sp, index, gfn, access);
 778 }
 779
 780 /*
 781  * Return the pointer to the large page information for a given gfn,
 782  * handling slots that are not large page aligned.
 783  */
 784 static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
 785                 const struct kvm_memory_slot *slot, int level)
 786 {
 787         unsigned long idx;
 788
 789         idx = gfn_to_index(gfn, slot->base_gfn, level);
 790         return &slot->arch.lpage_info[level - 2][idx];
 791 }
 792
 793 static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
 794                                             gfn_t gfn, int count)
 795 {
 796         struct kvm_lpage_info *linfo;
 797         int i;
 798
 799         for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
 800                 linfo = lpage_info_slot(gfn, slot, i);
 801                 linfo->disallow_lpage += count;
 802                 WARN_ON(linfo->disallow_lpage < 0);
 803         }
 804 }
 805
/* Increment the disallow_lpage count of @gfn at every huge page level. */
void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
{
        update_gfn_disallow_lpage_count(slot, gfn, 1);
}
 810
/* Decrement the disallow_lpage count of @gfn at every huge page level. */
void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
{
        update_gfn_disallow_lpage_count(slot, gfn, -1);
}
 815
 816 static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 817 {
 818         struct kvm_memslots *slots;
 819         struct kvm_memory_slot *slot;
 820         gfn_t gfn;
 821
 822         kvm->arch.indirect_shadow_pages++;
 823         gfn = sp->gfn;
 824         slots = kvm_memslots_for_spte_role(kvm, sp->role);
 825         slot = __gfn_to_memslot(slots, gfn);
 826
 827         /* the non-leaf shadow pages are keeping readonly. */
 828         if (sp->role.level > PG_LEVEL_4K)
 829                 return kvm_slot_page_track_add_page(kvm, slot, gfn,
 830                                                     KVM_PAGE_TRACK_WRITE);
 831
 832         kvm_mmu_gfn_disallow_lpage(slot, gfn);
 833
 834         if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
 835                 kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K);
 836 }
 837
 838 void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 839 {
 840         /*
 841          * If it's possible to replace the shadow page with an NX huge page,
 842          * i.e. if the shadow page is the only thing currently preventing KVM
 843          * from using a huge page, add the shadow page to the list of "to be
 844          * zapped for NX recovery" pages.  Note, the shadow page can already be
 845          * on the list if KVM is reusing an existing shadow page, i.e. if KVM
 846          * links a shadow page at multiple points.
 847          */
 848         if (!list_empty(&sp->possible_nx_huge_page_link))
 849                 return;
 850
 851         ++kvm->stat.nx_lpage_splits;
 852         list_add_tail(&sp->possible_nx_huge_page_link,
 853                       &kvm->arch.possible_nx_huge_pages);
 854 }
 855
 856 static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 857                                  bool nx_huge_page_possible)
 858 {
 859         sp->nx_huge_page_disallowed = true;
 860
 861         if (nx_huge_page_possible)
 862                 track_possible_nx_huge_page(kvm, sp);
 863 }
 864
 865 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 866 {
 867         struct kvm_memslots *slots;
 868         struct kvm_memory_slot *slot;
 869         gfn_t gfn;
 870
 871         kvm->arch.indirect_shadow_pages--;
 872         gfn = sp->gfn;
 873         slots = kvm_memslots_for_spte_role(kvm, sp->role);
 874         slot = __gfn_to_memslot(slots, gfn);
 875         if (sp->role.level > PG_LEVEL_4K)
 876                 return kvm_slot_page_track_remove_page(kvm, slot, gfn,
 877                                                        KVM_PAGE_TRACK_WRITE);
 878
 879         kvm_mmu_gfn_allow_lpage(slot, gfn);
 880 }
 881
 882 void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 883 {
 884         if (list_empty(&sp->possible_nx_huge_page_link))
 885                 return;
 886
 887         --kvm->stat.nx_lpage_splits;
 888         list_del_init(&sp->possible_nx_huge_page_link);
 889 }
 890
/*
 * Clear @sp's nx_huge_page_disallowed flag and stop tracking it as a possible
 * NX huge page (untrack_possible_nx_huge_page() is a nop if @sp was never
 * tracked).
 */
static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
        sp->nx_huge_page_disallowed = false;

        untrack_possible_nx_huge_page(kvm, sp);
}
 897
 898 static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu,
 899                                                            gfn_t gfn,
 900                                                            bool no_dirty_log)
 901 {
 902         struct kvm_memory_slot *slot;
 903
 904         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
 905         if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
 906                 return NULL;
 907         if (no_dirty_log && kvm_slot_dirty_track_enabled(slot))
 908                 return NULL;
 909
 910         return slot;
 911 }
 912
 913 /*
 914  * About rmap_head encoding:
 915  *
 916  * If the bit zero of rmap_head->val is clear, then it points to the only spte
 917  * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
 918  * pte_list_desc containing more mappings.
 919  */
 920
 921 /*
 922  * Returns the number of pointers in the rmap chain, not counting the new one.
 923  */
 924 static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
 925                         struct kvm_rmap_head *rmap_head)
 926 {
 927         struct pte_list_desc *desc;
 928         int count = 0;
 929
 930         if (!rmap_head->val) {
 931                 rmap_printk("%p %llx 0->1\n", spte, *spte);
 932                 rmap_head->val = (unsigned long)spte;
 933         } else if (!(rmap_head->val & 1)) {
 934                 rmap_printk("%p %llx 1->many\n", spte, *spte);
 935                 desc = kvm_mmu_memory_cache_alloc(cache);
 936                 desc->sptes[0] = (u64 *)rmap_head->val;
 937                 desc->sptes[1] = spte;
 938                 desc->spte_count = 2;
 939                 desc->tail_count = 0;
 940                 rmap_head->val = (unsigned long)desc | 1;
 941                 ++count;
 942         } else {
 943                 rmap_printk("%p %llx many->many\n", spte, *spte);
 944                 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
 945                 count = desc->tail_count + desc->spte_count;
 946
 947                 /*
 948                  * If the previous head is full, allocate a new head descriptor
 949                  * as tail descriptors are always kept full.
 950                  */
 951                 if (desc->spte_count == PTE_LIST_EXT) {
 952                         desc = kvm_mmu_memory_cache_alloc(cache);
 953                         desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul);
 954                         desc->spte_count = 0;
 955                         desc->tail_count = count;
 956                         rmap_head->val = (unsigned long)desc | 1;
 957                 }
 958                 desc->sptes[desc->spte_count++] = spte;
 959         }
 960         return count;
 961 }
 962
 963 static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
 964                                        struct pte_list_desc *desc, int i)
 965 {
 966         struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
 967         int j = head_desc->spte_count - 1;
 968
 969         /*
 970          * The head descriptor should never be empty.  A new head is added only
 971          * when adding an entry and the previous head is full, and heads are
 972          * removed (this flow) when they become empty.
 973          */
 974         BUG_ON(j < 0);
 975
 976         /*
 977          * Replace the to-be-freed SPTE with the last valid entry from the head
 978          * descriptor to ensure that tail descriptors are full at all times.
 979          * Note, this also means that tail_count is stable for each descriptor.
 980          */
 981         desc->sptes[i] = head_desc->sptes[j];
 982         head_desc->sptes[j] = NULL;
 983         head_desc->spte_count--;
 984         if (head_desc->spte_count)
 985                 return;
 986
 987         /*
 988          * The head descriptor is empty.  If there are no tail descriptors,
 989          * nullify the rmap head to mark the list as emtpy, else point the rmap
 990          * head at the next descriptor, i.e. the new head.
 991          */
 992         if (!head_desc->more)
 993                 rmap_head->val = 0;
 994         else
 995                 rmap_head->val = (unsigned long)head_desc->more | 1;
 996         mmu_free_pte_list_desc(head_desc);
 997 }
 998
 999 static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
1000 {
1001         struct pte_list_desc *desc;
1002         int i;
1003
1004         if (!rmap_head->val) {
1005                 pr_err("%s: %p 0->BUG\n", __func__, spte);
1006                 BUG();
1007         } else if (!(rmap_head->val & 1)) {
1008                 rmap_printk("%p 1->0\n", spte);
1009                 if ((u64 *)rmap_head->val != spte) {
1010                         pr_err("%s:  %p 1->BUG\n", __func__, spte);
1011                         BUG();
1012                 }
1013                 rmap_head->val = 0;
1014         } else {
1015                 rmap_printk("%p many->many\n", spte);
1016                 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1017                 while (desc) {
1018                         for (i = 0; i < desc->spte_count; ++i) {
1019                                 if (desc->sptes[i] == spte) {
1020                                         pte_list_desc_remove_entry(rmap_head, desc, i);
1021                                         return;
1022                                 }
1023                         }
1024                         desc = desc->more;
1025                 }
1026                 pr_err("%s: %p many->many\n", __func__, spte);
1027                 BUG();
1028         }
1029 }
1030
/*
 * Zap a single SPTE: clear it via mmu_spte_clear_track_bits() and then unlink
 * @sptep from the rmap chain rooted at @rmap_head.  pte_list_remove() BUG()s
 * if @sptep is not on that chain.
 */
static void kvm_zap_one_rmap_spte(struct kvm *kvm,
                                  struct kvm_rmap_head *rmap_head, u64 *sptep)
{
        mmu_spte_clear_track_bits(kvm, sptep);
        pte_list_remove(sptep, rmap_head);
}
1037
1038 /* Return true if at least one SPTE was zapped, false otherwise */
1039 static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
1040                                    struct kvm_rmap_head *rmap_head)
1041 {
1042         struct pte_list_desc *desc, *next;
1043         int i;
1044
1045         if (!rmap_head->val)
1046                 return false;
1047
1048         if (!(rmap_head->val & 1)) {
1049                 mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
1050                 goto out;
1051         }
1052
1053         desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1054
1055         for (; desc; desc = next) {
1056                 for (i = 0; i < desc->spte_count; i++)
1057                         mmu_spte_clear_track_bits(kvm, desc->sptes[i]);
1058                 next = desc->more;
1059                 mmu_free_pte_list_desc(desc);
1060         }
1061 out:
1062         /* rmap_head is meaningless now, remember to reset it */
1063         rmap_head->val = 0;
1064         return true;
1065 }
1066
1067 unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
1068 {
1069         struct pte_list_desc *desc;
1070
1071         if (!rmap_head->val)
1072                 return 0;
1073         else if (!(rmap_head->val & 1))
1074                 return 1;
1075
1076         desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1077         return desc->tail_count + desc->spte_count;
1078 }
1079
1080 static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
1081                                          const struct kvm_memory_slot *slot)
1082 {
1083         unsigned long idx;
1084
1085         idx = gfn_to_index(gfn, slot->base_gfn, level);
1086         return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
1087 }
1088
1089 static void rmap_remove(struct kvm *kvm, u64 *spte)
1090 {
1091         struct kvm_memslots *slots;
1092         struct kvm_memory_slot *slot;
1093         struct kvm_mmu_page *sp;
1094         gfn_t gfn;
1095         struct kvm_rmap_head *rmap_head;
1096
1097         sp = sptep_to_sp(spte);
1098         gfn = kvm_mmu_page_get_gfn(sp, spte_index(spte));
1099
1100         /*
1101          * Unlike rmap_add, rmap_remove does not run in the context of a vCPU
1102          * so we have to determine which memslots to use based on context
1103          * information in sp->role.
1104          */
1105         slots = kvm_memslots_for_spte_role(kvm, sp->role);
1106
1107         slot = __gfn_to_memslot(slots, gfn);
1108         rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
1109
1110         pte_list_remove(spte, rmap_head);
1111 }
1112
/*
 * Used by the following functions to iterate through the sptes linked by a
 * rmap.  All fields are private and not assumed to be used outside.
 *
 * The iterator is initialized by rmap_get_first() and advanced by
 * rmap_get_next().
 */
struct rmap_iterator {
        /* private fields */
        /*
         * Descriptor holding the current sptep, or NULL when the rmap head
         * points directly at a single spte (in which case @pos is not set).
         */
        struct pte_list_desc *desc;     /* holds the sptep if not NULL */
        /* Index of the current sptep within desc->sptes[]. */
        int pos;                        /* index of the sptep */
};
1122
1123 /*
1124  * Iteration must be started by this function.  This should also be used after
1125  * removing/dropping sptes from the rmap link because in such cases the
1126  * information in the iterator may not be valid.
1127  *
1128  * Returns sptep if found, NULL otherwise.
1129  */
1130 static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
1131                            struct rmap_iterator *iter)
1132 {
1133         u64 *sptep;
1134
1135         if (!rmap_head->val)
1136                 return NULL;
1137
1138         if (!(rmap_head->val & 1)) {
1139                 iter->desc = NULL;
1140                 sptep = (u64 *)rmap_head->val;
1141                 goto out;
1142         }
1143
1144         iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1145         iter->pos = 0;
1146         sptep = iter->desc->sptes[iter->pos];
1147 out:
1148         BUG_ON(!is_shadow_present_pte(*sptep));
1149         return sptep;
1150 }
1151
1152 /*
1153  * Must be used with a valid iterator: e.g. after rmap_get_first().
1154  *
1155  * Returns sptep if found, NULL otherwise.
1156  */
1157 static u64 *rmap_get_next(struct rmap_iterator *iter)
1158 {
1159         u64 *sptep;
1160
1161         if (iter->desc) {
1162                 if (iter->pos < PTE_LIST_EXT - 1) {
1163                         ++iter->pos;
1164                         sptep = iter->desc->sptes[iter->pos];
1165                         if (sptep)
1166                                 goto out;
1167                 }
1168
1169                 iter->desc = iter->desc->more;
1170
1171                 if (iter->desc) {
1172                         iter->pos = 0;
1173                         /* desc->sptes[0] cannot be NULL */
1174                         sptep = iter->desc->sptes[iter->pos];
1175                         goto out;
1176                 }
1177         }
1178
1179         return NULL;
1180 out:
1181         BUG_ON(!is_shadow_present_pte(*sptep));
1182         return sptep;
1183 }
1184
/*
 * Iterate over every sptep on the rmap chain rooted at @_rmap_head_, using
 * @_iter_ (a struct rmap_iterator *) as the cursor and storing the current
 * sptep in @_spte_.  Per rmap_get_first(), the walk must be restarted from
 * scratch if sptes are removed from the chain inside the loop body.
 */
#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)                 \
        for (_spte_ = rmap_get_first(_rmap_head_, _iter_);              \
             _spte_; _spte_ = rmap_get_next(_iter_))
1188
1189 static void drop_spte(struct kvm *kvm, u64 *sptep)
1190 {
1191         u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep);
1192
1193         if (is_shadow_present_pte(old_spte))
1194                 rmap_remove(kvm, sptep);
1195 }
1196
1197 static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush)
1198 {
1199         struct kvm_mmu_page *sp;
1200
1201         sp = sptep_to_sp(sptep);
1202         WARN_ON(sp->role.level == PG_LEVEL_4K);
1203
1204         drop_spte(kvm, sptep);
1205
1206         if (flush)
1207                 kvm_flush_remote_tlbs_sptep(kvm, sptep);
1208 }
1209
1210 /*
1211  * Write-protect on the specified @sptep, @pt_protect indicates whether
1212  * spte write-protection is caused by protecting shadow page table.
1213  *
1214  * Note: write protection is difference between dirty logging and spte
1215  * protection:
1216  * - for dirty logging, the spte can be set to writable at anytime if
1217  *   its dirty bitmap is properly set.
1218  * - for spte protection, the spte can be writable only after unsync-ing
1219  *   shadow page.
1220  *
1221  * Return true if tlb need be flushed.
1222  */
1223 static bool spte_write_protect(u64 *sptep, bool pt_protect)
1224 {
1225         u64 spte = *sptep;
1226
1227         if (!is_writable_pte(spte) &&
1228             !(pt_protect && is_mmu_writable_spte(spte)))
1229                 return false;
1230
1231         rmap_printk("spte %p %llx\n", sptep, *sptep);
1232
1233         if (pt_protect)
1234                 spte &= ~shadow_mmu_writable_mask;
1235         spte = spte & ~PT_WRITABLE_MASK;
1236
1237         return mmu_spte_update(sptep, spte);
1238 }
1239
1240 static bool rmap_write_protect(struct kvm_rmap_head *rmap_head,
1241                                bool pt_protect)
1242 {
1243         u64 *sptep;
1244         struct rmap_iterator iter;
1245         bool flush = false;
1246
1247         for_each_rmap_spte(rmap_head, &iter, sptep)
1248                 flush |= spte_write_protect(sptep, pt_protect);
1249
1250         return flush;
1251 }
1252
1253 static bool spte_clear_dirty(u64 *sptep)
1254 {
1255         u64 spte = *sptep;
1256
1257         rmap_printk("spte %p %llx\n", sptep, *sptep);
1258
1259         MMU_WARN_ON(!spte_ad_enabled(spte));
1260         spte &= ~shadow_dirty_mask;
1261         return mmu_spte_update(sptep, spte);
1262 }
1263
1264 static bool spte_wrprot_for_clear_dirty(u64 *sptep)
1265 {
1266         bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
1267                                                (unsigned long *)sptep);
1268         if (was_writable && !spte_ad_enabled(*sptep))
1269                 kvm_set_pfn_dirty(spte_to_pfn(*sptep));
1270
1271         return was_writable;
1272 }
1273
1274 /*
1275  * Gets the GFN ready for another round of dirty logging by clearing the
1276  *      - D bit on ad-enabled SPTEs, and
1277  *      - W bit on ad-disabled SPTEs.
1278  * Returns true iff any D or W bits were cleared.
1279  */
1280 static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1281                                const struct kvm_memory_slot *slot)
1282 {
1283         u64 *sptep;
1284         struct rmap_iterator iter;
1285         bool flush = false;
1286
1287         for_each_rmap_spte(rmap_head, &iter, sptep)
1288                 if (spte_ad_need_write_protect(*sptep))
1289                         flush |= spte_wrprot_for_clear_dirty(sptep);
1290                 else
1291                         flush |= spte_clear_dirty(sptep);
1292
1293         return flush;
1294 }
1295
1296 /**
1297  * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
1298  * @kvm: kvm instance
1299  * @slot: slot to protect
1300  * @gfn_offset: start of the BITS_PER_LONG pages we care about
1301  * @mask: indicates which pages we should protect
1302  *
1303  * Used when we do not need to care about huge page mappings.
1304  */
1305 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1306                                      struct kvm_memory_slot *slot,
1307                                      gfn_t gfn_offset, unsigned long mask)
1308 {
1309         struct kvm_rmap_head *rmap_head;
1310
1311         if (tdp_mmu_enabled)
1312                 kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
1313                                 slot->base_gfn + gfn_offset, mask, true);
1314
1315         if (!kvm_memslots_have_rmaps(kvm))
1316                 return;
1317
1318         while (mask) {
1319                 rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1320                                         PG_LEVEL_4K, slot);
1321                 rmap_write_protect(rmap_head, false);
1322
1323                 /* clear the first set bit */
1324                 mask &= mask - 1;
1325         }
1326 }
1327
1328 /**
1329  * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
1330  * protect the page if the D-bit isn't supported.
1331  * @kvm: kvm instance
1332  * @slot: slot to clear D-bit
1333  * @gfn_offset: start of the BITS_PER_LONG pages we care about
1334  * @mask: indicates which pages we should clear D-bit
1335  *
1336  * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
1337  */
1338 static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1339                                          struct kvm_memory_slot *slot,
1340                                          gfn_t gfn_offset, unsigned long mask)
1341 {
1342         struct kvm_rmap_head *rmap_head;
1343
1344         if (tdp_mmu_enabled)
1345                 kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
1346                                 slot->base_gfn + gfn_offset, mask, false);
1347
1348         if (!kvm_memslots_have_rmaps(kvm))
1349                 return;
1350
1351         while (mask) {
1352                 rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1353                                         PG_LEVEL_4K, slot);
1354                 __rmap_clear_dirty(kvm, rmap_head, slot);
1355
1356                 /* clear the first set bit */
1357                 mask &= mask - 1;
1358         }
1359 }
1360
1361 /**
1362  * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1363  * PT level pages.
1364  *
1365  * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1366  * enable dirty logging for them.
1367  *
1368  * We need to care about huge page mappings: e.g. during dirty logging we may
1369  * have such mappings.
1370  */
1371 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1372                                 struct kvm_memory_slot *slot,
1373                                 gfn_t gfn_offset, unsigned long mask)
1374 {
1375         /*
1376          * Huge pages are NOT write protected when we start dirty logging in
1377          * initially-all-set mode; must write protect them here so that they
1378          * are split to 4K on the first write.
1379          *
1380          * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
1381          * of memslot has no such restriction, so the range can cross two large
1382          * pages.
1383          */
1384         if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
1385                 gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
1386                 gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
1387
1388                 if (READ_ONCE(eager_page_split))
1389                         kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);
1390
1391                 kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
1392
1393                 /* Cross two large pages? */
1394                 if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
1395                     ALIGN(end << PAGE_SHIFT, PMD_SIZE))
1396                         kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
1397                                                        PG_LEVEL_2M);
1398         }
1399
1400         /* Now handle 4K PTEs.  */
1401         if (kvm_x86_ops.cpu_dirty_log_size)
1402                 kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
1403         else
1404                 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1405 }
1406
/*
 * Return the vendor-provided kvm_x86_ops.cpu_dirty_log_size.  A non-zero
 * value makes kvm_arch_mmu_enable_log_dirty_pt_masked() clear D-bits instead
 * of write-protecting 4K PTEs.
 */
int kvm_cpu_dirty_log_size(void)
{
        return kvm_x86_ops.cpu_dirty_log_size;
}
1411
1412 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
1413                                     struct kvm_memory_slot *slot, u64 gfn,
1414                                     int min_level)
1415 {
1416         struct kvm_rmap_head *rmap_head;
1417         int i;
1418         bool write_protected = false;
1419
1420         if (kvm_memslots_have_rmaps(kvm)) {
1421                 for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
1422                         rmap_head = gfn_to_rmap(gfn, i, slot);
1423                         write_protected |= rmap_write_protect(rmap_head, true);
1424                 }
1425         }
1426
1427         if (tdp_mmu_enabled)
1428                 write_protected |=
1429                         kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);
1430
1431         return write_protected;
1432 }
1433
1434 static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
1435 {
1436         struct kvm_memory_slot *slot;
1437
1438         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1439         return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
1440 }
1441
/*
 * Zap every SPTE on the rmap chain rooted at @rmap_head.  @slot is unused by
 * the body.  Returns true if at least one SPTE was zapped.
 */
static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
                           const struct kvm_memory_slot *slot)
{
        return kvm_zap_all_rmap_sptes(kvm, rmap_head);
}
1447
/*
 * rmap_handler_t-compatible wrapper around __kvm_zap_rmap(); @gfn, @level and
 * the pte argument are ignored.  Returns true if at least one SPTE was zapped.
 */
static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
                         struct kvm_memory_slot *slot, gfn_t gfn, int level,
                         pte_t unused)
{
        return __kvm_zap_rmap(kvm, rmap_head, slot);
}
1454
1455 static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1456                              struct kvm_memory_slot *slot, gfn_t gfn, int level,
1457                              pte_t pte)
1458 {
1459         u64 *sptep;
1460         struct rmap_iterator iter;
1461         bool need_flush = false;
1462         u64 new_spte;
1463         kvm_pfn_t new_pfn;
1464
1465         WARN_ON(pte_huge(pte));
1466         new_pfn = pte_pfn(pte);
1467
1468 restart:
1469         for_each_rmap_spte(rmap_head, &iter, sptep) {
1470                 rmap_printk("spte %p %llx gfn %llx (%d)\n",
1471                             sptep, *sptep, gfn, level);
1472
1473                 need_flush = true;
1474
1475                 if (pte_write(pte)) {
1476                         kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
1477                         goto restart;
1478                 } else {
1479                         new_spte = kvm_mmu_changed_pte_notifier_make_spte(
1480                                         *sptep, new_pfn);
1481
1482                         mmu_spte_clear_track_bits(kvm, sptep);
1483                         mmu_spte_set(sptep, new_spte);
1484                 }
1485         }
1486
1487         if (need_flush && kvm_available_flush_tlb_with_range()) {
1488                 kvm_flush_remote_tlbs_gfn(kvm, gfn, level);
1489                 return false;
1490         }
1491
1492         return need_flush;
1493 }
1494
/*
 * State for walking the rmap heads of a memslot over a gfn range and a range
 * of page-table levels; driven by for_each_slot_rmap_range().
 */
struct slot_rmap_walk_iterator {
        /* input fields. */
        const struct kvm_memory_slot *slot;
        gfn_t start_gfn;
        /* Inclusive: the rmap head for end_gfn is the last one visited. */
        gfn_t end_gfn;
        int start_level;
        /* Inclusive: the walk stops once level exceeds end_level. */
        int end_level;

        /* output fields. */
        gfn_t gfn;
        /* Current rmap head, NULL once the walk is complete. */
        struct kvm_rmap_head *rmap;
        int level;

        /* private field. */
        /* Last rmap head to visit at the current level. */
        struct kvm_rmap_head *end_rmap;
};
1511
1512 static void rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator,
1513                                  int level)
1514 {
1515         iterator->level = level;
1516         iterator->gfn = iterator->start_gfn;
1517         iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot);
1518         iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot);
1519 }
1520
1521 static void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
1522                                 const struct kvm_memory_slot *slot,
1523                                 int start_level, int end_level,
1524                                 gfn_t start_gfn, gfn_t end_gfn)
1525 {
1526         iterator->slot = slot;
1527         iterator->start_level = start_level;
1528         iterator->end_level = end_level;
1529         iterator->start_gfn = start_gfn;
1530         iterator->end_gfn = end_gfn;
1531
1532         rmap_walk_init_level(iterator, iterator->start_level);
1533 }
1534
1535 static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
1536 {
1537         return !!iterator->rmap;
1538 }
1539
1540 static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
1541 {
1542         while (++iterator->rmap <= iterator->end_rmap) {
1543                 iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
1544
1545                 if (iterator->rmap->val)
1546                         return;
1547         }
1548
1549         if (++iterator->level > iterator->end_level) {
1550                 iterator->rmap = NULL;
1551                 return;
1552         }
1553
1554         rmap_walk_init_level(iterator, iterator->level);
1555 }
1556
/*
 * Walk the rmap heads of @_slot_ for gfns [@_start_gfn, @_end_gfn] (inclusive)
 * at each level in [@_start_level_, @_end_level_] (inclusive), using @_iter_
 * (a struct slot_rmap_walk_iterator *) as the cursor.
 *
 * Empty rmap heads are skipped when advancing within a level; the first rmap
 * head of each level is yielded without being checked for emptiness.
 */
#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,    \
           _start_gfn, _end_gfn, _iter_)                                \
        for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,         \
                                 _end_level_, _start_gfn, _end_gfn);    \
             slot_rmap_walk_okay(_iter_);                               \
             slot_rmap_walk_next(_iter_))
1563
/*
 * Callback invoked by kvm_handle_gfn_range() for each rmap head it visits.
 * @gfn and @level identify the rmap head, @pte is the range's pte payload
 * (ignored by several handlers).  The boolean results of all invocations are
 * OR-ed together by kvm_handle_gfn_range().
 */
typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
                               struct kvm_memory_slot *slot, gfn_t gfn,
                               int level, pte_t pte);
1567
1568 static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
1569                                                  struct kvm_gfn_range *range,
1570                                                  rmap_handler_t handler)
1571 {
1572         struct slot_rmap_walk_iterator iterator;
1573         bool ret = false;
1574
1575         for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
1576                                  range->start, range->end - 1, &iterator)
1577                 ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
1578                                iterator.level, range->pte);
1579
1580         return ret;
1581 }
1582
1583 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1584 {
1585         bool flush = false;
1586
1587         if (kvm_memslots_have_rmaps(kvm))
1588                 flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap);
1589
1590         if (tdp_mmu_enabled)
1591                 flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
1592
1593         return flush;
1594 }
1595
1596 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1597 {
1598         bool flush = false;
1599
1600         if (kvm_memslots_have_rmaps(kvm))
1601                 flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap);
1602
1603         if (tdp_mmu_enabled)
1604                 flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
1605
1606         return flush;
1607 }
1608
1609 static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1610                          struct kvm_memory_slot *slot, gfn_t gfn, int level,
1611                          pte_t unused)
1612 {
1613         u64 *sptep;
1614         struct rmap_iterator iter;
1615         int young = 0;
1616
1617         for_each_rmap_spte(rmap_head, &iter, sptep)
1618                 young |= mmu_spte_age(sptep);
1619
1620         return young;
1621 }
1622
1623 static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1624                               struct kvm_memory_slot *slot, gfn_t gfn,
1625                               int level, pte_t unused)
1626 {
1627         u64 *sptep;
1628         struct rmap_iterator iter;
1629
1630         for_each_rmap_spte(rmap_head, &iter, sptep)
1631                 if (is_accessed_spte(*sptep))
1632                         return true;
1633         return false;
1634 }
1635
1636 #define RMAP_RECYCLE_THRESHOLD 1000
1637
1638 static void __rmap_add(struct kvm *kvm,
1639                        struct kvm_mmu_memory_cache *cache,
1640                        const struct kvm_memory_slot *slot,
1641                        u64 *spte, gfn_t gfn, unsigned int access)
1642 {
1643         struct kvm_mmu_page *sp;
1644         struct kvm_rmap_head *rmap_head;
1645         int rmap_count;
1646
1647         sp = sptep_to_sp(spte);
1648         kvm_mmu_page_set_translation(sp, spte_index(spte), gfn, access);
1649         kvm_update_page_stats(kvm, sp->role.level, 1);
1650
1651         rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
1652         rmap_count = pte_list_add(cache, spte, rmap_head);
1653
1654         if (rmap_count > kvm->stat.max_mmu_rmap_size)
1655                 kvm->stat.max_mmu_rmap_size = rmap_count;
1656         if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
1657                 kvm_zap_all_rmap_sptes(kvm, rmap_head);
1658                 kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level);
1659         }
1660 }
1661
1662 static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
1663                      u64 *spte, gfn_t gfn, unsigned int access)
1664 {
1665         struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_pte_list_desc_cache;
1666
1667         __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access);
1668 }
1669
1670 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1671 {
1672         bool young = false;
1673
1674         if (kvm_memslots_have_rmaps(kvm))
1675                 young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap);
1676
1677         if (tdp_mmu_enabled)
1678                 young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
1679
1680         return young;
1681 }
1682
1683 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1684 {
1685         bool young = false;
1686
1687         if (kvm_memslots_have_rmaps(kvm))
1688                 young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap);
1689
1690         if (tdp_mmu_enabled)
1691                 young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
1692
1693         return young;
1694 }
1695
#ifdef MMU_DEBUG
/*
 * Debug helper: returns 1 if none of the SPTE_ENT_PER_PAGE entries in @spt is
 * shadow-present.  Otherwise logs the first present entry and returns 0.
 */
static int is_empty_shadow_page(u64 *spt)
{
        int i;

        for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
                if (!is_shadow_present_pte(spt[i]))
                        continue;

                printk(KERN_ERR "%s: %p %llx\n", __func__,
                       &spt[i], spt[i]);
                return 0;
        }

        return 1;
}
#endif
1711
1712 /*
1713  * This value is the sum of all of the kvm instances's
1714  * kvm->arch.n_used_mmu_pages values.  We need a global,
1715  * aggregate version in order to make the slab shrinker
1716  * faster
1717  */
1718 static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
1719 {
1720         kvm->arch.n_used_mmu_pages += nr;
1721         percpu_counter_add(&kvm_total_used_mmu_pages, nr);
1722 }
1723
1724 static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1725 {
1726         kvm_mod_used_mmu_pages(kvm, +1);
1727         kvm_account_pgtable_pages((void *)sp->spt, +1);
1728 }
1729
1730 static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1731 {
1732         kvm_mod_used_mmu_pages(kvm, -1);
1733         kvm_account_pgtable_pages((void *)sp->spt, -1);
1734 }
1735
1736 static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
1737 {
1738         MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
1739         hlist_del(&sp->hash_link);
1740         list_del(&sp->link);
1741         free_page((unsigned long)sp->spt);
1742         if (!sp->role.direct)
1743                 free_page((unsigned long)sp->shadowed_translation);
1744         kmem_cache_free(mmu_page_header_cache, sp);
1745 }
1746
1747 static unsigned kvm_page_table_hashfn(gfn_t gfn)
1748 {
1749         return hash_64(gfn, KVM_MMU_HASH_SHIFT);
1750 }
1751
1752 static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache,
1753                                     struct kvm_mmu_page *sp, u64 *parent_pte)
1754 {
1755         if (!parent_pte)
1756                 return;
1757
1758         pte_list_add(cache, parent_pte, &sp->parent_ptes);
1759 }
1760
1761 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1762                                        u64 *parent_pte)
1763 {
1764         pte_list_remove(parent_pte, &sp->parent_ptes);
1765 }
1766
1767 static void drop_parent_pte(struct kvm_mmu_page *sp,
1768                             u64 *parent_pte)
1769 {
1770         mmu_page_remove_parent_pte(sp, parent_pte);
1771         mmu_spte_clear_no_track(parent_pte);
1772 }
1773
1774 static void mark_unsync(u64 *spte);
1775 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1776 {
1777         u64 *sptep;
1778         struct rmap_iterator iter;
1779
1780         for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
1781                 mark_unsync(sptep);
1782         }
1783 }
1784
1785 static void mark_unsync(u64 *spte)
1786 {
1787         struct kvm_mmu_page *sp;
1788
1789         sp = sptep_to_sp(spte);
1790         if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap))
1791                 return;
1792         if (sp->unsync_children++)
1793                 return;
1794         kvm_mmu_mark_parents_unsync(sp);
1795 }
1796
/* Number of entries in a struct kvm_mmu_pages batch. */
#define KVM_PAGE_ARRAY_NR 16

/*
 * Fixed-size batch of shadow pages filled by mmu_pages_add().  Each entry
 * pairs a shadow page with the index it was added under; @nr is the number
 * of valid entries in @page.
 */
struct kvm_mmu_pages {
	struct mmu_page_and_offset {
		struct kvm_mmu_page *sp;
		unsigned int idx;
	} page[KVM_PAGE_ARRAY_NR];
	unsigned int nr;
};
1806
1807 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
1808                          int idx)
1809 {
1810         int i;
1811
1812         if (sp->unsync)
1813                 for (i=0; i < pvec->nr; i++)
1814                         if (pvec->page[i].sp == sp)
1815                                 return 0;
1816
1817         pvec->page[pvec->nr].sp = sp;
1818         pvec->page[pvec->nr].idx = idx;
1819         pvec->nr++;
1820         return (pvec->nr == KVM_PAGE_ARRAY_NR);
1821 }
1822
1823 static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
1824 {
1825         --sp->unsync_children;
1826         WARN_ON((int)sp->unsync_children < 0);
1827         __clear_bit(idx, sp->unsync_child_bitmap);
1828 }
1829
1830 static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1831                            struct kvm_mmu_pages *pvec)
1832 {
1833         int i, ret, nr_unsync_leaf = 0;
1834
1835         for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
1836                 struct kvm_mmu_page *child;
1837                 u64 ent = sp->spt[i];
1838
1839                 if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
1840                         clear_unsync_child_bit(sp, i);
1841                         continue;
1842                 }
1843
1844                 child = spte_to_child_sp(ent);
1845
1846                 if (child->unsync_children) {
1847                         if (mmu_pages_add(pvec, child, i))
1848                                 return -ENOSPC;
1849
1850                         ret = __mmu_unsync_walk(child, pvec);
1851                         if (!ret) {
1852                                 clear_unsync_child_bit(sp, i);
1853                                 continue;
1854                         } else if (ret > 0) {
1855                                 nr_unsync_leaf += ret;
1856                         } else
1857                                 return ret;
1858                 } else if (child->unsync) {
1859                         nr_unsync_leaf++;
1860                         if (mmu_pages_add(pvec, child, i))
1861                                 return -ENOSPC;
1862                 } else
1863                         clear_unsync_child_bit(sp, i);
1864         }
1865
1866         return nr_unsync_leaf;
1867 }
1868
1869 #define INVALID_INDEX (-1)
1870
1871 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1872                            struct kvm_mmu_pages *pvec)
1873 {
1874         pvec->nr = 0;
1875         if (!sp->unsync_children)
1876                 return 0;
1877
1878         mmu_pages_add(pvec, sp, INVALID_INDEX);
1879         return __mmu_unsync_walk(sp, pvec);
1880 }
1881
1882 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1883 {
1884         WARN_ON(!sp->unsync);
1885         trace_kvm_mmu_sync_page(sp);
1886         sp->unsync = 0;
1887         --kvm->stat.mmu_unsync;
1888 }
1889
/* Forward declarations for the zap helpers that are called further below. */
static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				     struct list_head *invalid_list);
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
				    struct list_head *invalid_list);
1894
1895 static bool sp_has_gptes(struct kvm_mmu_page *sp)
1896 {
1897         if (sp->role.direct)
1898                 return false;
1899
1900         if (sp->role.passthrough)
1901                 return false;
1902
1903         return true;
1904 }
1905
/*
 * Iterate over the shadow pages on hash list @_list, skipping any page for
 * which is_obsolete_sp() is true.  The empty-if/else form makes the caller's
 * loop body bind to the trailing else.
 */
#define for_each_valid_sp(_kvm, _sp, _list)				\
	hlist_for_each_entry(_sp, _list, hash_link)			\
		if (is_obsolete_sp((_kvm), (_sp))) {			\
		} else

/*
 * Iterate over the non-obsolete shadow pages in @_gfn's hash bucket whose
 * gfn equals @_gfn and for which sp_has_gptes() is true.
 */
#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn)		\
	for_each_valid_sp(_kvm, _sp,					\
	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)])	\
		if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
1915
1916 static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1917 {
1918         union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role;
1919
1920         /*
1921          * Ignore various flags when verifying that it's safe to sync a shadow
1922          * page using the current MMU context.
1923          *
1924          *  - level: not part of the overall MMU role and will never match as the MMU's
1925          *           level tracks the root level
1926          *  - access: updated based on the new guest PTE
1927          *  - quadrant: not part of the overall MMU role (similar to level)
1928          */
1929         const union kvm_mmu_page_role sync_role_ign = {
1930                 .level = 0xf,
1931                 .access = 0x7,
1932                 .quadrant = 0x3,
1933                 .passthrough = 0x1,
1934         };
1935
1936         /*
1937          * Direct pages can never be unsync, and KVM should never attempt to
1938          * sync a shadow page for a different MMU context, e.g. if the role
1939          * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
1940          * reserved bits checks will be wrong, etc...
1941          */
1942         if (WARN_ON_ONCE(sp->role.direct || !vcpu->arch.mmu->sync_spte ||
1943                          (sp->role.word ^ root_role.word) & ~sync_role_ign.word))
1944                 return false;
1945
1946         return true;
1947 }
1948
1949 static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
1950 {
1951         if (!sp->spt[i])
1952                 return 0;
1953
1954         return vcpu->arch.mmu->sync_spte(vcpu, sp, i);
1955 }
1956
/*
 * Sync every entry of @sp using @vcpu's MMU context.
 *
 * Returns -1 if kvm_sync_page_check() rejects the page; otherwise the
 * bitwise OR of the per-entry kvm_sync_spte() results.
 */
static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
	int flush = 0;
	int i;

	if (!kvm_sync_page_check(vcpu, sp))
		return -1;

	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
		int ret = kvm_sync_spte(vcpu, sp, i);

		/*
		 * NOTE(review): a result of exactly -1 does not take this
		 * early return; it is OR'ed into flush below, which leaves
		 * flush at -1, so the final result is still negative but the
		 * remaining entries are processed first.  Confirm against the
		 * sync_spte implementations that this is intended.
		 */
		if (ret < -1)
			return -1;
		flush |= ret;
	}

	/*
	 * Note, any flush is purely for KVM's correctness, e.g. when dropping
	 * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier
	 * unmap or dirty logging event doesn't fail to flush.  The guest is
	 * responsible for flushing the TLB to ensure any changes in protection
	 * bits are recognized, i.e. until the guest flushes or page faults on
	 * a relevant address, KVM is architecturally allowed to let vCPUs use
	 * cached translations with the old protection bits.
	 */
	return flush;
}
1984
1985 static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1986                          struct list_head *invalid_list)
1987 {
1988         int ret = __kvm_sync_page(vcpu, sp);
1989
1990         if (ret < 0)
1991                 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1992         return ret;
1993 }
1994
1995 static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
1996                                         struct list_head *invalid_list,
1997                                         bool remote_flush)
1998 {
1999         if (!remote_flush && list_empty(invalid_list))
2000                 return false;
2001
2002         if (!list_empty(invalid_list))
2003                 kvm_mmu_commit_zap_page(kvm, invalid_list);
2004         else
2005                 kvm_flush_remote_tlbs(kvm);
2006         return true;
2007 }
2008
2009 static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
2010 {
2011         if (sp->role.invalid)
2012                 return true;
2013
2014         /* TDP MMU pages do not use the MMU generation. */
2015         return !is_tdp_mmu_page(sp) &&
2016                unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
2017 }
2018
/*
 * Per-level bookkeeping used while iterating a struct kvm_mmu_pages batch.
 * mmu_pages_next() stores a page of level N in parent[N - 2] and the index
 * recorded for it in idx[N - 1].
 */
struct mmu_page_path {
	struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
	unsigned int idx[PT64_ROOT_MAX_LEVEL];
};
2023
/*
 * Iterate over the entries of @pvec selected by mmu_pages_first() and
 * mmu_pages_next(), assigning each to @sp and keeping @parents up to date.
 * @pvec and @parents are passed as objects, not pointers.
 */
#define for_each_sp(pvec, sp, parents, i)			\
		for (i = mmu_pages_first(&pvec, &parents);	\
			i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});	\
			i = mmu_pages_next(&pvec, &parents, i))
2028
2029 static int mmu_pages_next(struct kvm_mmu_pages *pvec,
2030                           struct mmu_page_path *parents,
2031                           int i)
2032 {
2033         int n;
2034
2035         for (n = i+1; n < pvec->nr; n++) {
2036                 struct kvm_mmu_page *sp = pvec->page[n].sp;
2037                 unsigned idx = pvec->page[n].idx;
2038                 int level = sp->role.level;
2039
2040                 parents->idx[level-1] = idx;
2041                 if (level == PG_LEVEL_4K)
2042                         break;
2043
2044                 parents->parent[level-2] = sp;
2045         }
2046
2047         return n;
2048 }
2049
2050 static int mmu_pages_first(struct kvm_mmu_pages *pvec,
2051                            struct mmu_page_path *parents)
2052 {
2053         struct kvm_mmu_page *sp;
2054         int level;
2055
2056         if (pvec->nr == 0)
2057                 return 0;
2058
2059         WARN_ON(pvec->page[0].idx != INVALID_INDEX);
2060
2061         sp = pvec->page[0].sp;
2062         level = sp->role.level;
2063         WARN_ON(level == PG_LEVEL_4K);
2064
2065         parents->parent[level-2] = sp;
2066
2067         /* Also set up a sentinel.  Further entries in pvec are all
2068          * children of sp, so this element is never overwritten.
2069          */
2070         parents->parent[level-1] = NULL;
2071         return mmu_pages_next(pvec, parents, 0);
2072 }
2073
2074 static void mmu_pages_clear_parents(struct mmu_page_path *parents)
2075 {
2076         struct kvm_mmu_page *sp;
2077         unsigned int level = 0;
2078
2079         do {
2080                 unsigned int idx = parents->idx[level];
2081                 sp = parents->parent[level];
2082                 if (!sp)
2083                         return;
2084
2085                 WARN_ON(idx == INVALID_INDEX);
2086                 clear_unsync_child_bit(sp, idx);
2087                 level++;
2088         } while (!sp->unsync_children);
2089 }
2090
/*
 * Sync the unsync shadow pages below @parent, one batch at a time.
 *
 * Each batch collected by mmu_unsync_walk() is first write-protected, then
 * every page in it is unlinked from the unsync state and synced.  Returns 0
 * once no unsync children remain, or -EINTR if a reschedule or lock break is
 * needed and @can_yield is false (KVM_REQ_MMU_SYNC is requested in that
 * case).
 */
static int mmu_sync_children(struct kvm_vcpu *vcpu,
			     struct kvm_mmu_page *parent, bool can_yield)
{
	int i;
	struct kvm_mmu_page *sp;
	struct mmu_page_path parents;
	struct kvm_mmu_pages pages;
	LIST_HEAD(invalid_list);
	bool flush = false;

	/*
	 * Any nonzero result keeps the loop going, including the -ENOSPC that
	 * __mmu_unsync_walk() returns when the batch is full.
	 */
	while (mmu_unsync_walk(parent, &pages)) {
		bool protected = false;

		for_each_sp(pages, sp, parents, i)
			protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn);

		/* Flush (or commit zaps) before syncing if anything was write-protected. */
		if (protected) {
			kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true);
			flush = false;
		}

		for_each_sp(pages, sp, parents, i) {
			kvm_unlink_unsync_page(vcpu->kvm, sp);
			flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0;
			mmu_pages_clear_parents(&parents);
		}
		if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
			/* Do the pending flush/zap before yielding or bailing out. */
			kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
			if (!can_yield) {
				kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
				return -EINTR;
			}

			cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
			flush = false;
		}
	}

	kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
	return 0;
}
2132
2133 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
2134 {
2135         atomic_set(&sp->write_flooding_count,  0);
2136 }
2137
2138 static void clear_sp_write_flooding_count(u64 *spte)
2139 {
2140         __clear_sp_write_flooding_count(sptep_to_sp(spte));
2141 }
2142
/*
 * Look up an existing, non-obsolete shadow page for @gfn with exactly @role
 * on hash list @sp_list.  Returns the page, or NULL if none is usable, in
 * which case the mmu_cache_miss statistic is bumped.
 *
 * The vCPU is required when finding indirect shadow pages; the shadow
 * page may already exist and syncing it needs the vCPU pointer in
 * order to read guest page tables.  Direct shadow pages are never
 * unsync, thus @vcpu can be NULL if @role.direct is true.
 */
static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm,
						     struct kvm_vcpu *vcpu,
						     gfn_t gfn,
						     struct hlist_head *sp_list,
						     union kvm_mmu_page_role role)
{
	struct kvm_mmu_page *sp;
	int ret;
	int collisions = 0;
	LIST_HEAD(invalid_list);

	for_each_valid_sp(kvm, sp, sp_list) {
		/* Same hash bucket, different gfn: count it as a collision. */
		if (sp->gfn != gfn) {
			collisions++;
			continue;
		}

		if (sp->role.word != role.word) {
			/*
			 * If the guest is creating an upper-level page, zap
			 * unsync pages for the same gfn.  While it's possible
			 * the guest is using recursive page tables, in all
			 * likelihood the guest has stopped using the unsync
			 * page and is installing a completely unrelated page.
			 * Unsync pages must not be left as is, because the new
			 * upper-level page will be write-protected.
			 */
			if (role.level > PG_LEVEL_4K && sp->unsync)
				kvm_mmu_prepare_zap_page(kvm, sp,
							 &invalid_list);
			continue;
		}

		/* unsync and write-flooding only apply to indirect SPs. */
		if (sp->role.direct)
			goto out;

		if (sp->unsync) {
			if (KVM_BUG_ON(!vcpu, kvm))
				break;

			/*
			 * The page is good, but is stale.  kvm_sync_page does
			 * get the latest guest state, but (unlike mmu_sync_children)
			 * it doesn't write-protect the page or mark it synchronized!
			 * This way the validity of the mapping is ensured, but the
			 * overhead of write protection is not incurred until the
			 * guest invalidates the TLB mapping.  This allows multiple
			 * SPs for a single gfn to be unsync.
			 *
			 * If the sync fails, the page is zapped.  If so, break
			 * in order to rebuild it.
			 */
			ret = kvm_sync_page(vcpu, sp, &invalid_list);
			if (ret < 0)
				break;

			WARN_ON(!list_empty(&invalid_list));
			if (ret > 0)
				kvm_flush_remote_tlbs(kvm);
		}

		__clear_sp_write_flooding_count(sp);

		goto out;
	}

	/* Fell out of the loop without a usable page. */
	sp = NULL;
	++kvm->stat.mmu_cache_miss;

out:
	/* Commit any zaps queued during the lookup. */
	kvm_mmu_commit_zap_page(kvm, &invalid_list);

	if (collisions > kvm->stat.max_mmu_page_hash_collisions)
		kvm->stat.max_mmu_page_hash_collisions = collisions;
	return sp;
}
2226
/*
 * Caches used when allocating a new shadow page, as consumed by
 * kvm_mmu_alloc_shadow_page().
 */
struct shadow_page_caches {
	/* Source of the struct kvm_mmu_page header. */
	struct kvm_mmu_memory_cache *page_header_cache;
	/* Source of the sp->spt page table page. */
	struct kvm_mmu_memory_cache *shadow_page_cache;
	/* Source of sp->shadowed_translation; used only for non-direct pages. */
	struct kvm_mmu_memory_cache *shadowed_info_cache;
};
2233
2234 static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
2235                                                       struct shadow_page_caches *caches,
2236                                                       gfn_t gfn,
2237                                                       struct hlist_head *sp_list,
2238                                                       union kvm_mmu_page_role role)
2239 {
2240         struct kvm_mmu_page *sp;
2241
2242         sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
2243         sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
2244         if (!role.direct)
2245                 sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
2246
2247         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
2248
2249         INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
2250
2251         /*
2252          * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
2253          * depends on valid pages being added to the head of the list.  See
2254          * comments in kvm_zap_obsolete_pages().
2255          */
2256         sp->mmu_valid_gen = kvm->arch.mmu_valid_gen;
2257         list_add(&sp->link, &kvm->arch.active_mmu_pages);
2258         kvm_account_mmu_page(kvm, sp);
2259
2260         sp->gfn = gfn;
2261         sp->role = role;
2262         hlist_add_head(&sp->hash_link, sp_list);
2263         if (sp_has_gptes(sp))
2264                 account_shadowed(kvm, sp);
2265
2266         return sp;
2267 }
2268
2269 /* Note, @vcpu may be NULL if @role.direct is true; see kvm_mmu_find_shadow_page. */
2270 static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm,
2271                                                       struct kvm_vcpu *vcpu,
2272                                                       struct shadow_page_caches *caches,
2273                                                       gfn_t gfn,
2274                                                       union kvm_mmu_page_role role)
2275 {
2276         struct hlist_head *sp_list;
2277         struct kvm_mmu_page *sp;
2278         bool created = false;
2279
2280         sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
2281
2282         sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role);
2283         if (!sp) {
2284                 created = true;
2285                 sp = kvm_mmu_alloc_shadow_page(kvm, caches, gfn, sp_list, role);
2286         }
2287
2288         trace_kvm_mmu_get_page(sp, created);
2289         return sp;
2290 }
2291
2292 static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
2293                                                     gfn_t gfn,
2294                                                     union kvm_mmu_page_role role)
2295 {
2296         struct shadow_page_caches caches = {
2297                 .page_header_cache = &vcpu->arch.mmu_page_header_cache,
2298                 .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
2299                 .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
2300         };
2301
2302         return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role);
2303 }
2304
2305 static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct,
2306                                                   unsigned int access)
2307 {
2308         struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep);
2309         union kvm_mmu_page_role role;
2310
2311         role = parent_sp->role;
2312         role.level--;
2313         role.access = access;
2314         role.direct = direct;
2315         role.passthrough = 0;
2316
2317         /*
2318          * If the guest has 4-byte PTEs then that means it's using 32-bit,
2319          * 2-level, non-PAE paging. KVM shadows such guests with PAE paging
2320          * (i.e. 8-byte PTEs). The difference in PTE size means that KVM must
2321          * shadow each guest page table with multiple shadow page tables, which
2322          * requires extra bookkeeping in the role.
2323          *
2324          * Specifically, to shadow the guest's page directory (which covers a
2325          * 4GiB address space), KVM uses 4 PAE page directories, each mapping
2326          * 1GiB of the address space. @role.quadrant encodes which quarter of
2327          * the address space each maps.
2328          *
2329          * To shadow the guest's page tables (which each map a 4MiB region), KVM
2330          * uses 2 PAE page tables, each mapping a 2MiB region. For these,
2331          * @role.quadrant encodes which half of the region they map.
2332          *
2333          * Concretely, a 4-byte PDE consumes bits 31:22, while an 8-byte PDE
2334          * consumes bits 29:21.  To consume bits 31:30, KVM's uses 4 shadow
2335          * PDPTEs; those 4 PAE page directories are pre-allocated and their
2336          * quadrant is assigned in mmu_alloc_root().   A 4-byte PTE consumes
2337          * bits 21:12, while an 8-byte PTE consumes bits 20:12.  To consume
2338          * bit 21 in the PTE (the child here), KVM propagates that bit to the
2339          * quadrant, i.e. sets quadrant to '0' or '1'.  The parent 8-byte PDE
2340          * covers bit 21 (see above), thus the quadrant is calculated from the
2341          * _least_ significant bit of the PDE index.
2342          */
2343         if (role.has_4_byte_gpte) {
2344                 WARN_ON_ONCE(role.level != PG_LEVEL_4K);
2345                 role.quadrant = spte_index(sptep) & 1;
2346         }
2347
2348         return role;
2349 }
2350
2351 static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu,
2352                                                  u64 *sptep, gfn_t gfn,
2353                                                  bool direct, unsigned int access)
2354 {
2355         union kvm_mmu_page_role role;
2356
2357         if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
2358                 return ERR_PTR(-EEXIST);
2359
2360         role = kvm_mmu_child_role(sptep, direct, access);
2361         return kvm_mmu_get_shadow_page(vcpu, gfn, role);
2362 }
2363
2364 static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
2365                                         struct kvm_vcpu *vcpu, hpa_t root,
2366                                         u64 addr)
2367 {
2368         iterator->addr = addr;
2369         iterator->shadow_addr = root;
2370         iterator->level = vcpu->arch.mmu->root_role.level;
2371
2372         if (iterator->level >= PT64_ROOT_4LEVEL &&
2373             vcpu->arch.mmu->cpu_role.base.level < PT64_ROOT_4LEVEL &&
2374             !vcpu->arch.mmu->root_role.direct)
2375                 iterator->level = PT32E_ROOT_LEVEL;
2376
2377         if (iterator->level == PT32E_ROOT_LEVEL) {
2378                 /*
2379                  * prev_root is currently only used for 64-bit hosts. So only
2380                  * the active root_hpa is valid here.
2381                  */
2382                 BUG_ON(root != vcpu->arch.mmu->root.hpa);
2383
2384                 iterator->shadow_addr
2385                         = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
2386                 iterator->shadow_addr &= SPTE_BASE_ADDR_MASK;
2387                 --iterator->level;
2388                 if (!iterator->shadow_addr)
2389                         iterator->level = 0;
2390         }
2391 }
2392
2393 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2394                              struct kvm_vcpu *vcpu, u64 addr)
2395 {
2396         shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root.hpa,
2397                                     addr);
2398 }
2399
2400 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
2401 {
2402         if (iterator->level < PG_LEVEL_4K)
2403                 return false;
2404
2405         iterator->index = SPTE_INDEX(iterator->addr, iterator->level);
2406         iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
2407         return true;
2408 }
2409
2410 static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2411                                u64 spte)
2412 {
2413         if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) {
2414                 iterator->level = 0;
2415                 return;
2416         }
2417
2418         iterator->shadow_addr = spte & SPTE_BASE_ADDR_MASK;
2419         --iterator->level;
2420 }
2421
2422 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2423 {
2424         __shadow_walk_next(iterator, *iterator->sptep);
2425 }
2426
2427 static void __link_shadow_page(struct kvm *kvm,
2428                                struct kvm_mmu_memory_cache *cache, u64 *sptep,
2429                                struct kvm_mmu_page *sp, bool flush)
2430 {
2431         u64 spte;
2432
2433         BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2434
2435         /*
2436          * If an SPTE is present already, it must be a leaf and therefore
2437          * a large one.  Drop it, and flush the TLB if needed, before
2438          * installing sp.
2439          */
2440         if (is_shadow_present_pte(*sptep))
2441                 drop_large_spte(kvm, sptep, flush);
2442
2443         spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
2444
2445         mmu_spte_set(sptep, spte);
2446
2447         mmu_page_add_parent_pte(cache, sp, sptep);
2448
2449         /*
2450          * The non-direct sub-pagetable must be updated before linking.  For
2451          * L1 sp, the pagetable is updated via kvm_sync_page() in
2452          * kvm_mmu_find_shadow_page() without write-protecting the gfn,
2453          * so sp->unsync can be true or false.  For higher level non-direct
2454          * sp, the pagetable is updated/synced via mmu_sync_children() in
2455          * FNAME(fetch)(), so sp->unsync_children can only be false.
2456          * WARN_ON_ONCE() if anything happens unexpectedly.
2457          */
2458         if (WARN_ON_ONCE(sp->unsync_children) || sp->unsync)
2459                 mark_unsync(sptep);
2460 }
2461
2462 static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2463                              struct kvm_mmu_page *sp)
2464 {
2465         __link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp, true);
2466 }
2467
2468 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2469                                    unsigned direct_access)
2470 {
2471         if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
2472                 struct kvm_mmu_page *child;
2473
2474                 /*
2475                  * For the direct sp, if the guest pte's dirty bit
2476                  * changed form clean to dirty, it will corrupt the
2477                  * sp's access: allow writable in the read-only sp,
2478                  * so we should update the spte at this point to get
2479                  * a new sp with the correct access.
2480                  */
2481                 child = spte_to_child_sp(*sptep);
2482                 if (child->role.access == direct_access)
2483                         return;
2484
2485                 drop_parent_pte(child, sptep);
2486                 kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep);
2487         }
2488 }
2489
2490 /* Returns the number of zapped non-leaf child shadow pages. */
2491 static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
2492                             u64 *spte, struct list_head *invalid_list)
2493 {
2494         u64 pte;
2495         struct kvm_mmu_page *child;
2496
2497         pte = *spte;
2498         if (is_shadow_present_pte(pte)) {
2499                 if (is_last_spte(pte, sp->role.level)) {
2500                         drop_spte(kvm, spte);
2501                 } else {
2502                         child = spte_to_child_sp(pte);
2503                         drop_parent_pte(child, spte);
2504
2505                         /*
2506                          * Recursively zap nested TDP SPs, parentless SPs are
2507                          * unlikely to be used again in the near future.  This
2508                          * avoids retaining a large number of stale nested SPs.
2509                          */
2510                         if (tdp_enabled && invalid_list &&
2511                             child->role.guest_mode && !child->parent_ptes.val)
2512                                 return kvm_mmu_prepare_zap_page(kvm, child,
2513                                                                 invalid_list);
2514                 }
2515         } else if (is_mmio_spte(pte)) {
2516                 mmu_spte_clear_no_track(spte);
2517         }
2518         return 0;
2519 }
2520
2521 static int kvm_mmu_page_unlink_children(struct kvm *kvm,
2522                                         struct kvm_mmu_page *sp,
2523                                         struct list_head *invalid_list)
2524 {
2525         int zapped = 0;
2526         unsigned i;
2527
2528         for (i = 0; i < SPTE_ENT_PER_PAGE; ++i)
2529                 zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list);
2530
2531         return zapped;
2532 }
2533
2534 static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp)
2535 {
2536         u64 *sptep;
2537         struct rmap_iterator iter;
2538
2539         while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
2540                 drop_parent_pte(sp, sptep);
2541 }
2542
/*
 * Zap the shadow pages found by walking the unsync children of @parent,
 * queueing them on @invalid_list via kvm_mmu_prepare_zap_page().
 *
 * Returns the number of shadow pages visited and zapped by the walk; always 0
 * when @parent is a 4K-level page.
 */
static int mmu_zap_unsync_children(struct kvm *kvm,
                                   struct kvm_mmu_page *parent,
                                   struct list_head *invalid_list)
{
        int i, zapped = 0;
        struct mmu_page_path parents;
        struct kvm_mmu_pages pages;

        if (parent->role.level == PG_LEVEL_4K)
                return 0;

        /* Keep collecting batches until the walk reports nothing left. */
        while (mmu_unsync_walk(parent, &pages)) {
                struct kvm_mmu_page *sp;

                for_each_sp(pages, sp, parents, i) {
                        kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
                        mmu_pages_clear_parents(&parents);
                        zapped++;
                }
        }

        return zapped;
}
2566
/*
 * Prepare @sp for zapping: zap its unsync children, unlink its child entries
 * and its parent pointers, undo its accounting, and mark it invalid.  If @sp
 * is not in use as a root (root_count == 0) it is placed on @invalid_list;
 * otherwise it is only removed from its current list and is freed later, once
 * its root_count drops to zero.
 *
 * Must be called with mmu_lock held for write.
 *
 * @nr_zapped is set to the number of pages zapped by this call, including
 * @sp itself when it is not an in-use root.
 *
 * Returns true if any children were zapped, i.e. if active_mmu_pages may have
 * become unstable for a caller that is iterating over it.
 */
static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
                                       struct kvm_mmu_page *sp,
                                       struct list_head *invalid_list,
                                       int *nr_zapped)
{
        bool list_unstable, zapped_root = false;

        lockdep_assert_held_write(&kvm->mmu_lock);
        trace_kvm_mmu_prepare_zap_page(sp);
        ++kvm->stat.mmu_shadow_zapped;
        *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
        *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
        kvm_mmu_unlink_parents(sp);

        /* Zapping children means active_mmu_pages has become unstable. */
        list_unstable = *nr_zapped;

        /* Only pages that are still valid and shadow guest PTEs are accounted. */
        if (!sp->role.invalid && sp_has_gptes(sp))
                unaccount_shadowed(kvm, sp);

        if (sp->unsync)
                kvm_unlink_unsync_page(kvm, sp);
        if (!sp->root_count) {
                /* Count self */
                (*nr_zapped)++;

                /*
                 * Already invalid pages (previously active roots) are not on
                 * the active page list.  See list_del() in the "else" case of
                 * !sp->root_count.
                 */
                if (sp->role.invalid)
                        list_add(&sp->link, invalid_list);
                else
                        list_move(&sp->link, invalid_list);
                kvm_unaccount_mmu_page(kvm, sp);
        } else {
                /*
                 * Remove the active root from the active page list, the root
                 * will be explicitly freed when the root_count hits zero.
                 */
                list_del(&sp->link);

                /*
                 * Obsolete pages cannot be used on any vCPUs, see the comment
                 * in kvm_mmu_zap_all_fast().  Note, is_obsolete_sp() also
                 * treats invalid shadow pages as being obsolete.
                 */
                zapped_root = !is_obsolete_sp(kvm, sp);
        }

        if (sp->nx_huge_page_disallowed)
                unaccount_nx_huge_page(kvm, sp);

        sp->role.invalid = 1;

        /*
         * Make the request to free obsolete roots after marking the root
         * invalid, otherwise other vCPUs may not see it as invalid.
         */
        if (zapped_root)
                kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
        return list_unstable;
}
2631
2632 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2633                                      struct list_head *invalid_list)
2634 {
2635         int nr_zapped;
2636
2637         __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
2638         return nr_zapped;
2639 }
2640
2641 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2642                                     struct list_head *invalid_list)
2643 {
2644         struct kvm_mmu_page *sp, *nsp;
2645
2646         if (list_empty(invalid_list))
2647                 return;
2648
2649         /*
2650          * We need to make sure everyone sees our modifications to
2651          * the page tables and see changes to vcpu->mode here. The barrier
2652          * in the kvm_flush_remote_tlbs() achieves this. This pairs
2653          * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
2654          *
2655          * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
2656          * guest mode and/or lockless shadow page table walks.
2657          */
2658         kvm_flush_remote_tlbs(kvm);
2659
2660         list_for_each_entry_safe(sp, nsp, invalid_list, link) {
2661                 WARN_ON(!sp->role.invalid || sp->root_count);
2662                 kvm_mmu_free_shadow_page(sp);
2663         }
2664 }
2665
/*
 * Walk kvm->arch.active_mmu_pages in reverse and zap shadow pages, skipping
 * pages with a non-zero root_count, until at least @nr_to_zap pages have been
 * zapped or the whole list has been walked.  The zapped pages are then freed
 * via kvm_mmu_commit_zap_page() and the total is added to stat.mmu_recycled.
 *
 * Returns the number of pages zapped, which may be more or less than
 * @nr_to_zap.
 */
static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
                                                  unsigned long nr_to_zap)
{
        unsigned long total_zapped = 0;
        struct kvm_mmu_page *sp, *tmp;
        LIST_HEAD(invalid_list);
        bool unstable;
        int nr_zapped;

        if (list_empty(&kvm->arch.active_mmu_pages))
                return 0;

restart:
        list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) {
                /*
                 * Don't zap active root pages, the page itself can't be freed
                 * and zapping it will just force vCPUs to realloc and reload.
                 */
                if (sp->root_count)
                        continue;

                unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
                                                      &nr_zapped);
                total_zapped += nr_zapped;
                if (total_zapped >= nr_to_zap)
                        break;

                /*
                 * Children were zapped too, so the saved next pointer (tmp)
                 * can no longer be trusted; start the walk over.
                 */
                if (unstable)
                        goto restart;
        }

        kvm_mmu_commit_zap_page(kvm, &invalid_list);

        kvm->stat.mmu_recycled += total_zapped;
        return total_zapped;
}
2702
2703 static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
2704 {
2705         if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
2706                 return kvm->arch.n_max_mmu_pages -
2707                         kvm->arch.n_used_mmu_pages;
2708
2709         return 0;
2710 }
2711
2712 static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
2713 {
2714         unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
2715
2716         if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
2717                 return 0;
2718
2719         kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
2720
2721         /*
2722          * Note, this check is intentionally soft, it only guarantees that one
2723          * page is available, while the caller may end up allocating as many as
2724          * four pages, e.g. for PAE roots or for 5-level paging.  Temporarily
2725          * exceeding the (arbitrary by default) limit will not harm the host,
2726          * being too aggressive may unnecessarily kill the guest, and getting an
2727          * exact count is far more trouble than it's worth, especially in the
2728          * page fault paths.
2729          */
2730         if (!kvm_mmu_available_pages(vcpu->kvm))
2731                 return -ENOSPC;
2732         return 0;
2733 }
2734
2735 /*
2736  * Changing the number of mmu pages allocated to the vm
2737  * Note: if goal_nr_mmu_pages is too small, you will get dead lock
2738  */
2739 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
2740 {
2741         write_lock(&kvm->mmu_lock);
2742
2743         if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2744                 kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
2745                                                   goal_nr_mmu_pages);
2746
2747                 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2748         }
2749
2750         kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2751
2752         write_unlock(&kvm->mmu_lock);
2753 }
2754
2755 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2756 {
2757         struct kvm_mmu_page *sp;
2758         LIST_HEAD(invalid_list);
2759         int r;
2760
2761         pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
2762         r = 0;
2763         write_lock(&kvm->mmu_lock);
2764         for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
2765                 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
2766                          sp->role.word);
2767                 r = 1;
2768                 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
2769         }
2770         kvm_mmu_commit_zap_page(kvm, &invalid_list);
2771         write_unlock(&kvm->mmu_lock);
2772
2773         return r;
2774 }
2775
2776 static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2777 {
2778         gpa_t gpa;
2779         int r;
2780
2781         if (vcpu->arch.mmu->root_role.direct)
2782                 return 0;
2783
2784         gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
2785
2786         r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2787
2788         return r;
2789 }
2790
2791 static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
2792 {
2793         trace_kvm_mmu_unsync_page(sp);
2794         ++kvm->stat.mmu_unsync;
2795         sp->unsync = 1;
2796
2797         kvm_mmu_mark_parents_unsync(sp);
2798 }
2799
/*
 * Attempt to unsync any shadow pages that can be reached by the specified gfn,
 * KVM is creating a writable mapping for said gfn.
 *
 * Returns 0 if all pages were marked unsync (or if there is no shadow page),
 * -EPERM if the SPTE must be write-protected (the gfn is write-tracked, or a
 * shadow page exists and @can_unsync is false), or -EEXIST if @prefetch is
 * true and a shadow page that is not yet unsync was found.
 */
int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
                            gfn_t gfn, bool can_unsync, bool prefetch)
{
        struct kvm_mmu_page *sp;
        bool locked = false;

        /*
         * Force write-protection if the page is being tracked.  Note, the page
         * track machinery is used to write-protect upper-level shadow pages,
         * i.e. this guards the role.level == 4K assertion below!
         */
        if (kvm_slot_page_track_is_active(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE))
                return -EPERM;

        /*
         * The page is not write-tracked, mark existing shadow pages unsync
         * unless KVM is synchronizing an unsync SP (can_unsync = false).  In
         * that case, KVM must complete emulation of the guest TLB flush before
         * allowing shadow pages to become unsync (writable by the guest).
         */
        for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
                if (!can_unsync)
                        return -EPERM;

                if (sp->unsync)
                        continue;

                /*
                 * Both early returns in this loop happen before the spinlock
                 * below can have been taken: @can_unsync and @prefetch are
                 * loop-invariant.
                 */
                if (prefetch)
                        return -EEXIST;

                /*
                 * TDP MMU page faults require an additional spinlock as they
                 * run with mmu_lock held for read, not write, and the unsync
                 * logic is not thread safe.  Take the spinlock regardless of
                 * the MMU type to avoid extra conditionals/parameters, there's
                 * no meaningful penalty if mmu_lock is held for write.
                 */
                if (!locked) {
                        locked = true;
                        spin_lock(&kvm->arch.mmu_unsync_pages_lock);

                        /*
                         * Recheck after taking the spinlock, a different vCPU
                         * may have since marked the page unsync.  A false
                         * positive on the unprotected check above is not
                         * possible as clearing sp->unsync _must_ hold mmu_lock
                         * for write, i.e. unsync cannot transition from 1->0
                         * while this CPU holds mmu_lock for read (or write).
                         */
                        if (READ_ONCE(sp->unsync))
                                continue;
                }

                WARN_ON(sp->role.level != PG_LEVEL_4K);
                kvm_unsync_page(kvm, sp);
        }
        if (locked)
                spin_unlock(&kvm->arch.mmu_unsync_pages_lock);

        /*
         * We need to ensure that the marking of unsync pages is visible
         * before the SPTE is updated to allow writes because
         * kvm_mmu_sync_roots() checks the unsync flags without holding
         * the MMU lock and so can race with this. If the SPTE was updated
         * before the page had been marked as unsync-ed, something like the
         * following could happen:
         *
         * CPU 1                    CPU 2
         * ---------------------------------------------------------------------
         * 1.2 Host updates SPTE
         *     to be writable
         *                      2.1 Guest writes a GPTE for GVA X.
         *                          (GPTE being in the guest page table shadowed
         *                           by the SP from CPU 1.)
         *                          This reads SPTE during the page table walk.
         *                          Since SPTE.W is read as 1, there is no
         *                          fault.
         *
         *                      2.2 Guest issues TLB flush.
         *                          That causes a VM Exit.
         *
         *                      2.3 Walking of unsync pages sees sp->unsync is
         *                          false and skips the page.
         *
         *                      2.4 Guest accesses GVA X.
         *                          Since the mapping in the SP was not updated,
         *                          so the old mapping for GVA X incorrectly
         *                          gets used.
         * 1.1 Host marks SP
         *     as unsync
         *     (sp->unsync = true)
         *
         * The write barrier below ensures that 1.1 happens before 1.2 and thus
         * the situation in 2.4 does not arise.  It pairs with the read barrier
         * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3.
         */
        smp_wmb();

        return 0;
}
2906
/*
 * Install or update the SPTE at @sptep so that it maps @gfn to @pfn with
 * @pte_access permissions.
 *
 * @fault may be NULL, which is how the prefetch path calls this function; in
 * that case the mapping is treated as a prefetch of a host-writable pfn and
 * not as a write fault.
 *
 * Returns:
 *   RET_PF_EMULATE  - @pfn is a no-slot pfn (an MMIO SPTE was installed), or
 *                     make_spte() reported write-protection for a write fault.
 *   RET_PF_SPURIOUS - the SPTE already held the computed value.
 *   RET_PF_FIXED    - otherwise.
 */
static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
                        u64 *sptep, unsigned int pte_access, gfn_t gfn,
                        kvm_pfn_t pfn, struct kvm_page_fault *fault)
{
        struct kvm_mmu_page *sp = sptep_to_sp(sptep);
        int level = sp->role.level;
        int was_rmapped = 0;
        int ret = RET_PF_FIXED;
        bool flush = false;
        bool wrprot;
        u64 spte;

        /* Prefetching always gets a writable pfn.  */
        bool host_writable = !fault || fault->map_writable;
        bool prefetch = !fault || fault->prefetch;
        bool write_fault = fault && fault->write;

        pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
                 *sptep, write_fault, gfn);

        if (unlikely(is_noslot_pfn(pfn))) {
                vcpu->stat.pf_mmio_spte_created++;
                mark_mmio_spte(vcpu, sptep, gfn, pte_access);
                return RET_PF_EMULATE;
        }

        if (is_shadow_present_pte(*sptep)) {
                /*
                 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
                 * the parent of the now unreachable PTE.
                 */
                if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
                        struct kvm_mmu_page *child;
                        u64 pte = *sptep;

                        child = spte_to_child_sp(pte);
                        drop_parent_pte(child, sptep);
                        flush = true;
                } else if (pfn != spte_to_pfn(*sptep)) {
                        /* The existing leaf maps a different pfn; drop it. */
                        pgprintk("hfn old %llx new %llx\n",
                                 spte_to_pfn(*sptep), pfn);
                        drop_spte(vcpu->kvm, sptep);
                        flush = true;
                } else
                        was_rmapped = 1;
        }

        wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
                           true, host_writable, &spte);

        if (*sptep == spte) {
                ret = RET_PF_SPURIOUS;
        } else {
                flush |= mmu_spte_update(sptep, spte);
                trace_kvm_mmu_set_spte(level, gfn, sptep);
        }

        /* A write fault on a write-protected SPTE has to be emulated. */
        if (wrprot) {
                if (write_fault)
                        ret = RET_PF_EMULATE;
        }

        if (flush)
                kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);

        pgprintk("%s: setting spte %llx\n", __func__, *sptep);

        if (!was_rmapped) {
                WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
                rmap_add(vcpu, slot, sptep, gfn, pte_access);
        } else {
                /* Already rmapped but the pte_access bits may have changed. */
                kvm_mmu_page_set_access(sp, spte_index(sptep), pte_access);
        }

        return ret;
}
2984
2985 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2986                                     struct kvm_mmu_page *sp,
2987                                     u64 *start, u64 *end)
2988 {
2989         struct page *pages[PTE_PREFETCH_NUM];
2990         struct kvm_memory_slot *slot;
2991         unsigned int access = sp->role.access;
2992         int i, ret;
2993         gfn_t gfn;
2994
2995         gfn = kvm_mmu_page_get_gfn(sp, spte_index(start));
2996         slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
2997         if (!slot)
2998                 return -1;
2999
3000         ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
3001         if (ret <= 0)
3002                 return -1;
3003
3004         for (i = 0; i < ret; i++, gfn++, start++) {
3005                 mmu_set_spte(vcpu, slot, start, access, gfn,
3006                              page_to_pfn(pages[i]), NULL);
3007                 put_page(pages[i]);
3008         }
3009
3010         return 0;
3011 }
3012
3013 static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
3014                                   struct kvm_mmu_page *sp, u64 *sptep)
3015 {
3016         u64 *spte, *start = NULL;
3017         int i;
3018
3019         WARN_ON(!sp->role.direct);
3020
3021         i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1);
3022         spte = sp->spt + i;
3023
3024         for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
3025                 if (is_shadow_present_pte(*spte) || spte == sptep) {
3026                         if (!start)
3027                                 continue;
3028                         if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
3029                                 return;
3030                         start = NULL;
3031                 } else if (!start)
3032                         start = spte;
3033         }
3034         if (start)
3035                 direct_pte_prefetch_many(vcpu, sp, start, spte);
3036 }
3037
3038 static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
3039 {
3040         struct kvm_mmu_page *sp;
3041
3042         sp = sptep_to_sp(sptep);
3043
3044         /*
3045          * Without accessed bits, there's no way to distinguish between
3046          * actually accessed translations and prefetched, so disable pte
3047          * prefetch if accessed bits aren't available.
3048          */
3049         if (sp_ad_disabled(sp))
3050                 return;
3051
3052         if (sp->role.level > PG_LEVEL_4K)
3053                 return;
3054
3055         /*
3056          * If addresses are being invalidated, skip prefetching to avoid
3057          * accidentally prefetching those addresses.
3058          */
3059         if (unlikely(vcpu->kvm->mmu_invalidate_in_progress))
3060                 return;
3061
3062         __direct_pte_prefetch(vcpu, sp, sptep);
3063 }
3064
/*
 * Lookup the mapping level for @gfn in the current mm.
 *
 * WARNING!  Use of host_pfn_mapping_level() requires the caller and the end
 * consumer to be tied into KVM's handlers for MMU notifier events!
 *
 * There are several ways to safely use this helper:
 *
 * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before
 *   consuming it.  In this case, mmu_lock doesn't need to be held during the
 *   lookup, but it does need to be held while checking the MMU notifier.
 *
 * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation
 *   event for the hva.  This can be done by explicitly checking the MMU
 *   notifier or by ensuring that KVM already has a valid mapping that covers
 *   the hva.
 *
 * - Do not use the result to install new mappings, e.g. use the host mapping
 *   level only to decide whether or not to zap an entry.  In this case, it's
 *   not required to hold mmu_lock (though it's highly likely the caller will
 *   want to hold mmu_lock anyways, e.g. to modify SPTEs).
 *
 * Note!  The lookup can still race with modifications to host page tables, but
 * the above "rules" ensure KVM will not _consume_ the result of the walk if a
 * race with the primary MMU occurs.
 *
 * Returns PG_LEVEL_1G or PG_LEVEL_2M if the walk finds a large PUD or PMD
 * respectively, and PG_LEVEL_4K in all other cases, including when the walk
 * hits a none/not-present entry.
 */
static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
                                  const struct kvm_memory_slot *slot)
{
        int level = PG_LEVEL_4K;
        unsigned long hva;
        unsigned long flags;
        pgd_t pgd;
        p4d_t p4d;
        pud_t pud;
        pmd_t pmd;

        /*
         * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
         * is not solely for performance, it's also necessary to avoid the
         * "writable" check in __gfn_to_hva_many(), which will always fail on
         * read-only memslots due to gfn_to_hva() assuming writes.  Earlier
         * page fault steps have already verified the guest isn't writing a
         * read-only memslot.
         */
        hva = __gfn_to_hva_memslot(slot, gfn);

        /*
         * Disable IRQs to prevent concurrent tear down of host page tables,
         * e.g. if the primary MMU promotes a P*D to a huge page and then frees
         * the original page table.
         */
        local_irq_save(flags);

        /*
         * Read each entry once.  As above, a non-leaf entry can be promoted to
         * a huge page _during_ this walk.  Re-reading the entry could send the
         * walk into the weeds, e.g. p*d_large() returns false (sees the old
         * value) and then p*d_offset() walks into the target huge page instead
         * of the old page table (sees the new value).
         */
        pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
        if (pgd_none(pgd))
                goto out;

        p4d = READ_ONCE(*p4d_offset(&pgd, hva));
        if (p4d_none(p4d) || !p4d_present(p4d))
                goto out;

        pud = READ_ONCE(*pud_offset(&p4d, hva));
        if (pud_none(pud) || !pud_present(pud))
                goto out;

        if (pud_large(pud)) {
                level = PG_LEVEL_1G;
                goto out;
        }

        pmd = READ_ONCE(*pmd_offset(&pud, hva));
        if (pmd_none(pmd) || !pmd_present(pmd))
                goto out;

        if (pmd_large(pmd))
                level = PG_LEVEL_2M;

out:
        local_irq_restore(flags);
        return level;
}
3153
3154 int kvm_mmu_max_mapping_level(struct kvm *kvm,
3155                               const struct kvm_memory_slot *slot, gfn_t gfn,
3156                               int max_level)
3157 {
3158         struct kvm_lpage_info *linfo;
3159         int host_level;
3160
3161         max_level = min(max_level, max_huge_page_level);
3162         for ( ; max_level > PG_LEVEL_4K; max_level--) {
3163                 linfo = lpage_info_slot(gfn, slot, max_level);
3164                 if (!linfo->disallow_lpage)
3165                         break;
3166         }
3167
3168         if (max_level == PG_LEVEL_4K)
3169                 return PG_LEVEL_4K;
3170
3171         host_level = host_pfn_mapping_level(kvm, gfn, slot);
3172         return min(host_level, max_level);
3173 }
3174
3175 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
3176 {
3177         struct kvm_memory_slot *slot = fault->slot;
3178         kvm_pfn_t mask;
3179
3180         fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled;
3181
3182         if (unlikely(fault->max_level == PG_LEVEL_4K))
3183                 return;
3184
3185         if (is_error_noslot_pfn(fault->pfn))
3186                 return;
3187
3188         if (kvm_slot_dirty_track_enabled(slot))
3189                 return;
3190
3191         /*
3192          * Enforce the iTLB multihit workaround after capturing the requested
3193          * level, which will be used to do precise, accurate accounting.
3194          */
3195         fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
3196                                                      fault->gfn, fault->max_level);
3197         if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
3198                 return;
3199
3200         /*
3201          * mmu_invalidate_retry() was successful and mmu_lock is held, so
3202          * the pmd can't be split from under us.
3203          */
3204         fault->goal_level = fault->req_level;
3205         mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1;
3206         VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask));
3207         fault->pfn &= ~mask;
3208 }
3209
3210 void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level)
3211 {
3212         if (cur_level > PG_LEVEL_4K &&
3213             cur_level == fault->goal_level &&
3214             is_shadow_present_pte(spte) &&
3215             !is_large_pte(spte) &&
3216             spte_to_child_sp(spte)->nx_huge_page_disallowed) {
3217                 /*
3218                  * A small SPTE exists for this pfn, but FNAME(fetch),
3219                  * direct_map(), or kvm_tdp_mmu_map() would like to create a
3220                  * large PTE instead: just force them to go down another level,
3221                  * patching back for them into pfn the next 9 bits of the
3222                  * address.
3223                  */
3224                 u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) -
3225                                 KVM_PAGES_PER_HPAGE(cur_level - 1);
3226                 fault->pfn |= fault->gfn & page_mask;
3227                 fault->goal_level--;
3228         }
3229 }
3230
3231 static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
3232 {
3233         struct kvm_shadow_walk_iterator it;
3234         struct kvm_mmu_page *sp;
3235         int ret;
3236         gfn_t base_gfn = fault->gfn;
3237
3238         kvm_mmu_hugepage_adjust(vcpu, fault);
3239
3240         trace_kvm_mmu_spte_requested(fault);
3241         for_each_shadow_entry(vcpu, fault->addr, it) {
3242                 /*
3243                  * We cannot overwrite existing page tables with an NX
3244                  * large page, as the leaf could be executable.
3245                  */
3246                 if (fault->nx_huge_page_workaround_enabled)
3247                         disallowed_hugepage_adjust(fault, *it.sptep, it.level);
3248
3249                 base_gfn = gfn_round_for_level(fault->gfn, it.level);
3250                 if (it.level == fault->goal_level)
3251                         break;
3252
3253                 sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL);
3254                 if (sp == ERR_PTR(-EEXIST))
3255                         continue;
3256
3257                 link_shadow_page(vcpu, it.sptep, sp);
3258                 if (fault->huge_page_disallowed)
3259                         account_nx_huge_page(vcpu->kvm, sp,
3260                                              fault->req_level >= it.level);
3261         }
3262
3263         if (WARN_ON_ONCE(it.level != fault->goal_level))
3264                 return -EFAULT;
3265
3266         ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL,
3267                            base_gfn, fault->pfn, fault);
3268         if (ret == RET_PF_SPURIOUS)
3269                 return ret;
3270
3271         direct_pte_prefetch(vcpu, it.sptep);
3272         return ret;
3273 }
3274
3275 static void kvm_send_hwpoison_signal(struct kvm_memory_slot *slot, gfn_t gfn)
3276 {
3277         unsigned long hva = gfn_to_hva_memslot(slot, gfn);
3278
3279         send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, PAGE_SHIFT, current);
3280 }
3281
3282 static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
3283 {
3284         if (is_sigpending_pfn(fault->pfn)) {
3285                 kvm_handle_signal_exit(vcpu);
3286                 return -EINTR;
3287         }
3288
3289         /*
3290          * Do not cache the mmio info caused by writing the readonly gfn
3291          * into the spte otherwise read access on readonly gfn also can
3292          * caused mmio page fault and treat it as mmio access.
3293          */
3294         if (fault->pfn == KVM_PFN_ERR_RO_FAULT)
3295                 return RET_PF_EMULATE;
3296
3297         if (fault->pfn == KVM_PFN_ERR_HWPOISON) {
3298                 kvm_send_hwpoison_signal(fault->slot, fault->gfn);
3299                 return RET_PF_RETRY;
3300         }
3301
3302         return -EFAULT;
3303 }
3304
3305 static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
3306                                    struct kvm_page_fault *fault,
3307                                    unsigned int access)
3308 {
3309         gva_t gva = fault->is_tdp ? 0 : fault->addr;
3310
3311         vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
3312                              access & shadow_mmio_access_mask);
3313
3314         /*
3315          * If MMIO caching is disabled, emulate immediately without
3316          * touching the shadow page tables as attempting to install an
3317          * MMIO SPTE will just be an expensive nop.
3318          */
3319         if (unlikely(!enable_mmio_caching))
3320                 return RET_PF_EMULATE;
3321
3322         /*
3323          * Do not create an MMIO SPTE for a gfn greater than host.MAXPHYADDR,
3324          * any guest that generates such gfns is running nested and is being
3325          * tricked by L0 userspace (you can observe gfn > L1.MAXPHYADDR if and
3326          * only if L1's MAXPHYADDR is inaccurate with respect to the
3327          * hardware's).
3328          */
3329         if (unlikely(fault->gfn > kvm_mmu_max_gfn()))
3330                 return RET_PF_EMULATE;
3331
3332         return RET_PF_CONTINUE;
3333 }
3334
3335 static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
3336 {
3337         /*
3338          * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only
3339          * reach the common page fault handler if the SPTE has an invalid MMIO
3340          * generation number.  Refreshing the MMIO generation needs to go down
3341          * the slow path.  Note, EPT Misconfigs do NOT set the PRESENT flag!
3342          */
3343         if (fault->rsvd)
3344                 return false;
3345
3346         /*
3347          * #PF can be fast if:
3348          *
3349          * 1. The shadow page table entry is not present and A/D bits are
3350          *    disabled _by KVM_, which could mean that the fault is potentially
3351          *    caused by access tracking (if enabled).  If A/D bits are enabled
3352          *    by KVM, but disabled by L1 for L2, KVM is forced to disable A/D
3353          *    bits for L2 and employ access tracking, but the fast page fault
3354          *    mechanism only supports direct MMUs.
3355          * 2. The shadow page table entry is present, the access is a write,
3356          *    and no reserved bits are set (MMIO SPTEs cannot be "fixed"), i.e.
3357          *    the fault was caused by a write-protection violation.  If the
3358          *    SPTE is MMU-writable (determined later), the fault can be fixed
3359          *    by setting the Writable bit, which can be done out of mmu_lock.
3360          */
3361         if (!fault->present)
3362                 return !kvm_ad_enabled();
3363
3364         /*
3365          * Note, instruction fetches and writes are mutually exclusive, ignore
3366          * the "exec" flag.
3367          */
3368         return fault->write;
3369 }
3370
3371 /*
3372  * Returns true if the SPTE was fixed successfully. Otherwise,
3373  * someone else modified the SPTE from its original value.
3374  */
3375 static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu,
3376                                     struct kvm_page_fault *fault,
3377                                     u64 *sptep, u64 old_spte, u64 new_spte)
3378 {
3379         /*
3380          * Theoretically we could also set dirty bit (and flush TLB) here in
3381          * order to eliminate unnecessary PML logging. See comments in
3382          * set_spte. But fast_page_fault is very unlikely to happen with PML
3383          * enabled, so we do not do this. This might result in the same GPA
3384          * to be logged in PML buffer again when the write really happens, and
3385          * eventually to be called by mark_page_dirty twice. But it's also no
3386          * harm. This also avoids the TLB flush needed after setting dirty bit
3387          * so non-PML cases won't be impacted.
3388          *
3389          * Compare with set_spte where instead shadow_dirty_mask is set.
3390          */
3391         if (!try_cmpxchg64(sptep, &old_spte, new_spte))
3392                 return false;
3393
3394         if (is_writable_pte(new_spte) && !is_writable_pte(old_spte))
3395                 mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn);
3396
3397         return true;
3398 }
3399
3400 static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
3401 {
3402         if (fault->exec)
3403                 return is_executable_pte(spte);
3404
3405         if (fault->write)
3406                 return is_writable_pte(spte);
3407
3408         /* Fault was on Read access */
3409         return spte & PT_PRESENT_MASK;
3410 }
3411
3412 /*
3413  * Returns the last level spte pointer of the shadow page walk for the given
3414  * gpa, and sets *spte to the spte value. This spte may be non-preset. If no
3415  * walk could be performed, returns NULL and *spte does not contain valid data.
3416  *
3417  * Contract:
3418  *  - Must be called between walk_shadow_page_lockless_{begin,end}.
3419  *  - The returned sptep must not be used after walk_shadow_page_lockless_end.
3420  */
3421 static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
3422 {
3423         struct kvm_shadow_walk_iterator iterator;
3424         u64 old_spte;
3425         u64 *sptep = NULL;
3426
3427         for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) {
3428                 sptep = iterator.sptep;
3429                 *spte = old_spte;
3430         }
3431
3432         return sptep;
3433 }
3434
3435 /*
3436  * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
3437  */
3438 static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
3439 {
3440         struct kvm_mmu_page *sp;
3441         int ret = RET_PF_INVALID;
3442         u64 spte = 0ull;
3443         u64 *sptep = NULL;
3444         uint retry_count = 0;
3445
3446         if (!page_fault_can_be_fast(fault))
3447                 return ret;
3448
3449         walk_shadow_page_lockless_begin(vcpu);
3450
3451         do {
3452                 u64 new_spte;
3453
3454                 if (tdp_mmu_enabled)
3455                         sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
3456                 else
3457                         sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
3458
3459                 if (!is_shadow_present_pte(spte))
3460                         break;
3461
3462                 sp = sptep_to_sp(sptep);
3463                 if (!is_last_spte(spte, sp->role.level))
3464                         break;
3465
3466                 /*
3467                  * Check whether the memory access that caused the fault would
3468                  * still cause it if it were to be performed right now. If not,
3469                  * then this is a spurious fault caused by TLB lazily flushed,
3470                  * or some other CPU has already fixed the PTE after the
3471                  * current CPU took the fault.
3472                  *
3473                  * Need not check the access of upper level table entries since
3474                  * they are always ACC_ALL.
3475                  */
3476                 if (is_access_allowed(fault, spte)) {
3477                         ret = RET_PF_SPURIOUS;
3478                         break;
3479                 }
3480
3481                 new_spte = spte;
3482
3483                 /*
3484                  * KVM only supports fixing page faults outside of MMU lock for
3485                  * direct MMUs, nested MMUs are always indirect, and KVM always
3486                  * uses A/D bits for non-nested MMUs.  Thus, if A/D bits are
3487                  * enabled, the SPTE can't be an access-tracked SPTE.
3488                  */
3489                 if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte))
3490                         new_spte = restore_acc_track_spte(new_spte);
3491
3492                 /*
3493                  * To keep things simple, only SPTEs that are MMU-writable can
3494                  * be made fully writable outside of mmu_lock, e.g. only SPTEs
3495                  * that were write-protected for dirty-logging or access
3496                  * tracking are handled here.  Don't bother checking if the
3497                  * SPTE is writable to prioritize running with A/D bits enabled.
3498                  * The is_access_allowed() check above handles the common case
3499                  * of the fault being spurious, and the SPTE is known to be
3500                  * shadow-present, i.e. except for access tracking restoration
3501                  * making the new SPTE writable, the check is wasteful.
3502                  */
3503                 if (fault->write && is_mmu_writable_spte(spte)) {
3504                         new_spte |= PT_WRITABLE_MASK;
3505
3506                         /*
3507                          * Do not fix write-permission on the large spte when
3508                          * dirty logging is enabled. Since we only dirty the
3509                          * first page into the dirty-bitmap in
3510                          * fast_pf_fix_direct_spte(), other pages are missed
3511                          * if its slot has dirty logging enabled.
3512                          *
3513                          * Instead, we let the slow page fault path create a
3514                          * normal spte to fix the access.
3515                          */
3516                         if (sp->role.level > PG_LEVEL_4K &&
3517                             kvm_slot_dirty_track_enabled(fault->slot))
3518                                 break;
3519                 }
3520
3521                 /* Verify that the fault can be handled in the fast path */
3522                 if (new_spte == spte ||
3523                     !is_access_allowed(fault, new_spte))
3524                         break;
3525
3526                 /*
3527                  * Currently, fast page fault only works for direct mapping
3528                  * since the gfn is not stable for indirect shadow page. See
3529                  * Documentation/virt/kvm/locking.rst to get more detail.
3530                  */
3531                 if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) {
3532                         ret = RET_PF_FIXED;
3533                         break;
3534                 }
3535
3536                 if (++retry_count > 4) {
3537                         pr_warn_once("Fast #PF retrying more than 4 times.\n");
3538                         break;
3539                 }
3540
3541         } while (true);
3542
3543         trace_fast_page_fault(vcpu, fault, sptep, spte, ret);
3544         walk_shadow_page_lockless_end(vcpu);
3545
3546         if (ret != RET_PF_INVALID)
3547                 vcpu->stat.pf_fast++;
3548
3549         return ret;
3550 }
3551
3552 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
3553                                struct list_head *invalid_list)
3554 {
3555         struct kvm_mmu_page *sp;
3556
3557         if (!VALID_PAGE(*root_hpa))
3558                 return;
3559
3560         /*
3561          * The "root" may be a special root, e.g. a PAE entry, treat it as a
3562          * SPTE to ensure any non-PA bits are dropped.
3563          */
3564         sp = spte_to_child_sp(*root_hpa);
3565         if (WARN_ON(!sp))
3566                 return;
3567
3568         if (is_tdp_mmu_page(sp))
3569                 kvm_tdp_mmu_put_root(kvm, sp, false);
3570         else if (!--sp->root_count && sp->role.invalid)
3571                 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
3572
3573         *root_hpa = INVALID_PAGE;
3574 }
3575
3576 /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
3577 void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
3578                         ulong roots_to_free)
3579 {
3580         int i;
3581         LIST_HEAD(invalid_list);
3582         bool free_active_root;
3583
3584         WARN_ON_ONCE(roots_to_free & ~KVM_MMU_ROOTS_ALL);
3585
3586         BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
3587
3588         /* Before acquiring the MMU lock, see if we need to do any real work. */
3589         free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT)
3590                 && VALID_PAGE(mmu->root.hpa);
3591
3592         if (!free_active_root) {
3593                 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3594                         if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
3595                             VALID_PAGE(mmu->prev_roots[i].hpa))
3596                                 break;
3597
3598                 if (i == KVM_MMU_NUM_PREV_ROOTS)
3599                         return;
3600         }
3601
3602         write_lock(&kvm->mmu_lock);
3603
3604         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3605                 if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
3606                         mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa,
3607                                            &invalid_list);
3608
3609         if (free_active_root) {
3610                 if (to_shadow_page(mmu->root.hpa)) {
3611                         mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
3612                 } else if (mmu->pae_root) {
3613                         for (i = 0; i < 4; ++i) {
3614                                 if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
3615                                         continue;
3616
3617                                 mmu_free_root_page(kvm, &mmu->pae_root[i],
3618                                                    &invalid_list);
3619                                 mmu->pae_root[i] = INVALID_PAE_ROOT;
3620                         }
3621                 }
3622                 mmu->root.hpa = INVALID_PAGE;
3623                 mmu->root.pgd = 0;
3624         }
3625
3626         kvm_mmu_commit_zap_page(kvm, &invalid_list);
3627         write_unlock(&kvm->mmu_lock);
3628 }
3629 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
3630
3631 void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
3632 {
3633         unsigned long roots_to_free = 0;
3634         hpa_t root_hpa;
3635         int i;
3636
3637         /*
3638          * This should not be called while L2 is active, L2 can't invalidate
3639          * _only_ its own roots, e.g. INVVPID unconditionally exits.
3640          */
3641         WARN_ON_ONCE(mmu->root_role.guest_mode);
3642
3643         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
3644                 root_hpa = mmu->prev_roots[i].hpa;
3645                 if (!VALID_PAGE(root_hpa))
3646                         continue;
3647
3648                 if (!to_shadow_page(root_hpa) ||
3649                         to_shadow_page(root_hpa)->role.guest_mode)
3650                         roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
3651         }
3652
3653         kvm_mmu_free_roots(kvm, mmu, roots_to_free);
3654 }
3655 EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);
3656
3657
3658 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
3659 {
3660         int ret = 0;
3661
3662         if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
3663                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3664                 ret = 1;
3665         }
3666
3667         return ret;
3668 }
3669
3670 static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
3671                             u8 level)
3672 {
3673         union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
3674         struct kvm_mmu_page *sp;
3675
3676         role.level = level;
3677         role.quadrant = quadrant;
3678
3679         WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte);
3680         WARN_ON_ONCE(role.direct && role.has_4_byte_gpte);
3681
3682         sp = kvm_mmu_get_shadow_page(vcpu, gfn, role);
3683         ++sp->root_count;
3684
3685         return __pa(sp->spt);
3686 }
3687
3688 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3689 {
3690         struct kvm_mmu *mmu = vcpu->arch.mmu;
3691         u8 shadow_root_level = mmu->root_role.level;
3692         hpa_t root;
3693         unsigned i;
3694         int r;
3695
3696         write_lock(&vcpu->kvm->mmu_lock);
3697         r = make_mmu_pages_available(vcpu);
3698         if (r < 0)
3699                 goto out_unlock;
3700
3701         if (tdp_mmu_enabled) {
3702                 root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
3703                 mmu->root.hpa = root;
3704         } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
3705                 root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level);
3706                 mmu->root.hpa = root;
3707         } else if (shadow_root_level == PT32E_ROOT_LEVEL) {
3708                 if (WARN_ON_ONCE(!mmu->pae_root)) {
3709                         r = -EIO;
3710                         goto out_unlock;
3711                 }
3712
3713                 for (i = 0; i < 4; ++i) {
3714                         WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
3715
3716                         root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 0,
3717                                               PT32_ROOT_LEVEL);
3718                         mmu->pae_root[i] = root | PT_PRESENT_MASK |
3719                                            shadow_me_value;
3720                 }
3721                 mmu->root.hpa = __pa(mmu->pae_root);
3722         } else {
3723                 WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
3724                 r = -EIO;
3725                 goto out_unlock;
3726         }
3727
3728         /* root.pgd is ignored for direct MMUs. */
3729         mmu->root.pgd = 0;
3730 out_unlock:
3731         write_unlock(&vcpu->kvm->mmu_lock);
3732         return r;
3733 }
3734
3735 static int mmu_first_shadow_root_alloc(struct kvm *kvm)
3736 {
3737         struct kvm_memslots *slots;
3738         struct kvm_memory_slot *slot;
3739         int r = 0, i, bkt;
3740
3741         /*
3742          * Check if this is the first shadow root being allocated before
3743          * taking the lock.
3744          */
3745         if (kvm_shadow_root_allocated(kvm))
3746                 return 0;
3747
3748         mutex_lock(&kvm->slots_arch_lock);
3749
3750         /* Recheck, under the lock, whether this is the first shadow root. */
3751         if (kvm_shadow_root_allocated(kvm))
3752                 goto out_unlock;
3753
3754         /*
3755          * Check if anything actually needs to be allocated, e.g. all metadata
3756          * will be allocated upfront if TDP is disabled.
3757          */
3758         if (kvm_memslots_have_rmaps(kvm) &&
3759             kvm_page_track_write_tracking_enabled(kvm))
3760                 goto out_success;
3761
3762         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
3763                 slots = __kvm_memslots(kvm, i);
3764                 kvm_for_each_memslot(slot, bkt, slots) {
3765                         /*
3766                          * Both of these functions are no-ops if the target is
3767                          * already allocated, so unconditionally calling both
3768                          * is safe.  Intentionally do NOT free allocations on
3769                          * failure to avoid having to track which allocations
3770                          * were made now versus when the memslot was created.
3771                          * The metadata is guaranteed to be freed when the slot
3772                          * is freed, and will be kept/used if userspace retries
3773                          * KVM_RUN instead of killing the VM.
3774                          */
3775                         r = memslot_rmap_alloc(slot, slot->npages);
3776                         if (r)
3777                                 goto out_unlock;
3778                         r = kvm_page_track_write_tracking_alloc(slot);
3779                         if (r)
3780                                 goto out_unlock;
3781                 }
3782         }
3783
3784         /*
3785          * Ensure that shadow_root_allocated becomes true strictly after
3786          * all the related pointers are set.
3787          */
3788 out_success:
3789         smp_store_release(&kvm->arch.shadow_root_allocated, true);
3790
3791 out_unlock:
3792         mutex_unlock(&kvm->slots_arch_lock);
3793         return r;
3794 }
3795
3796 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3797 {
3798         struct kvm_mmu *mmu = vcpu->arch.mmu;
3799         u64 pdptrs[4], pm_mask;
3800         gfn_t root_gfn, root_pgd;
3801         int quadrant, i, r;
3802         hpa_t root;
3803
3804         root_pgd = mmu->get_guest_pgd(vcpu);
3805         root_gfn = root_pgd >> PAGE_SHIFT;
3806
3807         if (mmu_check_root(vcpu, root_gfn))
3808                 return 1;
3809
3810         /*
3811          * On SVM, reading PDPTRs might access guest memory, which might fault
3812          * and thus might sleep.  Grab the PDPTRs before acquiring mmu_lock.
3813          */
3814         if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
3815                 for (i = 0; i < 4; ++i) {
3816                         pdptrs[i] = mmu->get_pdptr(vcpu, i);
3817                         if (!(pdptrs[i] & PT_PRESENT_MASK))
3818                                 continue;
3819
3820                         if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT))
3821                                 return 1;
3822                 }
3823         }
3824
3825         r = mmu_first_shadow_root_alloc(vcpu->kvm);
3826         if (r)
3827                 return r;
3828
3829         write_lock(&vcpu->kvm->mmu_lock);
3830         r = make_mmu_pages_available(vcpu);
3831         if (r < 0)
3832                 goto out_unlock;
3833
3834         /*
3835          * Do we shadow a long mode page table? If so we need to
3836          * write-protect the guests page table root.
3837          */
3838         if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
3839                 root = mmu_alloc_root(vcpu, root_gfn, 0,
3840                                       mmu->root_role.level);
3841                 mmu->root.hpa = root;
3842                 goto set_root_pgd;
3843         }
3844
3845         if (WARN_ON_ONCE(!mmu->pae_root)) {
3846                 r = -EIO;
3847                 goto out_unlock;
3848         }
3849
3850         /*
3851          * We shadow a 32 bit page table. This may be a legacy 2-level
3852          * or a PAE 3-level page table. In either case we need to be aware that
3853          * the shadow page table may be a PAE or a long mode page table.
3854          */
3855         pm_mask = PT_PRESENT_MASK | shadow_me_value;
3856         if (mmu->root_role.level >= PT64_ROOT_4LEVEL) {
3857                 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
3858
3859                 if (WARN_ON_ONCE(!mmu->pml4_root)) {
3860                         r = -EIO;
3861                         goto out_unlock;
3862                 }
3863                 mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
3864
3865                 if (mmu->root_role.level == PT64_ROOT_5LEVEL) {
3866                         if (WARN_ON_ONCE(!mmu->pml5_root)) {
3867                                 r = -EIO;
3868                                 goto out_unlock;
3869                         }
3870                         mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask;
3871                 }
3872         }
3873
3874         for (i = 0; i < 4; ++i) {
3875                 WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
3876
3877                 if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
3878                         if (!(pdptrs[i] & PT_PRESENT_MASK)) {
3879                                 mmu->pae_root[i] = INVALID_PAE_ROOT;
3880                                 continue;
3881                         }
3882                         root_gfn = pdptrs[i] >> PAGE_SHIFT;
3883                 }
3884
3885                 /*
3886                  * If shadowing 32-bit non-PAE page tables, each PAE page
3887                  * directory maps one quarter of the guest's non-PAE page
3888                  * directory. Othwerise each PAE page direct shadows one guest
3889                  * PAE page directory so that quadrant should be 0.
3890                  */
3891                 quadrant = (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0;
3892
3893                 root = mmu_alloc_root(vcpu, root_gfn, quadrant, PT32_ROOT_LEVEL);
3894                 mmu->pae_root[i] = root | pm_mask;
3895         }
3896
3897         if (mmu->root_role.level == PT64_ROOT_5LEVEL)
3898                 mmu->root.hpa = __pa(mmu->pml5_root);
3899         else if (mmu->root_role.level == PT64_ROOT_4LEVEL)
3900                 mmu->root.hpa = __pa(mmu->pml4_root);
3901         else
3902                 mmu->root.hpa = __pa(mmu->pae_root);
3903
3904 set_root_pgd:
3905         mmu->root.pgd = root_pgd;
3906 out_unlock:
3907         write_unlock(&vcpu->kvm->mmu_lock);
3908
3909         return r;
3910 }
3911
3912 static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
3913 {
3914         struct kvm_mmu *mmu = vcpu->arch.mmu;
3915         bool need_pml5 = mmu->root_role.level > PT64_ROOT_4LEVEL;
3916         u64 *pml5_root = NULL;
3917         u64 *pml4_root = NULL;
3918         u64 *pae_root;
3919
3920         /*
3921          * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
3922          * tables are allocated and initialized at root creation as there is no
3923          * equivalent level in the guest's NPT to shadow.  Allocate the tables
3924          * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
3925          */
3926         if (mmu->root_role.direct ||
3927             mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL ||
3928             mmu->root_role.level < PT64_ROOT_4LEVEL)
3929                 return 0;
3930
3931         /*
3932          * NPT, the only paging mode that uses this horror, uses a fixed number
3933          * of levels for the shadow page tables, e.g. all MMUs are 4-level or
3934          * all MMus are 5-level.  Thus, this can safely require that pml5_root
3935          * is allocated if the other roots are valid and pml5 is needed, as any
3936          * prior MMU would also have required pml5.
3937          */
3938         if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root))
3939                 return 0;
3940
3941         /*
3942          * The special roots should always be allocated in concert.  Yell and
3943          * bail if KVM ends up in a state where only one of the roots is valid.
3944          */
3945         if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root ||
3946                          (need_pml5 && mmu->pml5_root)))
3947                 return -EIO;
3948
3949         /*
3950          * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and
3951          * doesn't need to be decrypted.
3952          */
3953         pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3954         if (!pae_root)
3955                 return -ENOMEM;
3956
3957 #ifdef CONFIG_X86_64
3958         pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3959         if (!pml4_root)
3960                 goto err_pml4;
3961
3962         if (need_pml5) {
3963                 pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3964                 if (!pml5_root)
3965                         goto err_pml5;
3966         }
3967 #endif
3968
3969         mmu->pae_root = pae_root;
3970         mmu->pml4_root = pml4_root;
3971         mmu->pml5_root = pml5_root;
3972
3973         return 0;
3974
3975 #ifdef CONFIG_X86_64
3976 err_pml5:
3977         free_page((unsigned long)pml4_root);
3978 err_pml4:
3979         free_page((unsigned long)pae_root);
3980         return -ENOMEM;
3981 #endif
3982 }
3983
3984 static bool is_unsync_root(hpa_t root)
3985 {
3986         struct kvm_mmu_page *sp;
3987
3988         if (!VALID_PAGE(root))
3989                 return false;
3990
3991         /*
3992          * The read barrier orders the CPU's read of SPTE.W during the page table
3993          * walk before the reads of sp->unsync/sp->unsync_children here.
3994          *
3995          * Even if another CPU was marking the SP as unsync-ed simultaneously,
3996          * any guest page table changes are not guaranteed to be visible anyway
3997          * until this VCPU issues a TLB flush strictly after those changes are
3998          * made.  We only need to ensure that the other CPU sets these flags
3999          * before any actual changes to the page tables are made.  The comments
4000          * in mmu_try_to_unsync_pages() describe what could go wrong if this
4001          * requirement isn't satisfied.
4002          */
4003         smp_rmb();
4004         sp = to_shadow_page(root);
4005
4006         /*
4007          * PAE roots (somewhat arbitrarily) aren't backed by shadow pages, the
4008          * PDPTEs for a given PAE root need to be synchronized individually.
4009          */
4010         if (WARN_ON_ONCE(!sp))
4011                 return false;
4012
4013         if (sp->unsync || sp->unsync_children)
4014                 return true;
4015
4016         return false;
4017 }
4018
4019 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
4020 {
4021         int i;
4022         struct kvm_mmu_page *sp;
4023
4024         if (vcpu->arch.mmu->root_role.direct)
4025                 return;
4026
4027         if (!VALID_PAGE(vcpu->arch.mmu->root.hpa))
4028                 return;
4029
4030         vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4031
4032         if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
4033                 hpa_t root = vcpu->arch.mmu->root.hpa;
4034                 sp = to_shadow_page(root);
4035
4036                 if (!is_unsync_root(root))
4037                         return;
4038
4039                 write_lock(&vcpu->kvm->mmu_lock);
4040                 mmu_sync_children(vcpu, sp, true);
4041                 write_unlock(&vcpu->kvm->mmu_lock);
4042                 return;
4043         }
4044
4045         write_lock(&vcpu->kvm->mmu_lock);
4046
4047         for (i = 0; i < 4; ++i) {
4048                 hpa_t root = vcpu->arch.mmu->pae_root[i];
4049
4050                 if (IS_VALID_PAE_ROOT(root)) {
4051                         sp = spte_to_child_sp(root);
4052                         mmu_sync_children(vcpu, sp, true);
4053                 }
4054         }
4055
4056         write_unlock(&vcpu->kvm->mmu_lock);
4057 }
4058
4059 void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu)
4060 {
4061         unsigned long roots_to_free = 0;
4062         int i;
4063
4064         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
4065                 if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa))
4066                         roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
4067
4068         /* sync prev_roots by simply freeing them */
4069         kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free);
4070 }
4071
4072 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
4073                                   gpa_t vaddr, u64 access,
4074                                   struct x86_exception *exception)
4075 {
4076         if (exception)
4077                 exception->error_code = 0;
4078         return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception);
4079 }
4080
4081 static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
4082 {
4083         /*
4084          * A nested guest cannot use the MMIO cache if it is using nested
4085          * page tables, because cr2 is a nGPA while the cache stores GPAs.
4086          */
4087         if (mmu_is_nested(vcpu))
4088                 return false;
4089
4090         if (direct)
4091                 return vcpu_match_mmio_gpa(vcpu, addr);
4092
4093         return vcpu_match_mmio_gva(vcpu, addr);
4094 }
4095
4096 /*
4097  * Return the level of the lowest level SPTE added to sptes.
4098  * That SPTE may be non-present.
4099  *
4100  * Must be called between walk_shadow_page_lockless_{begin,end}.
4101  */
4102 static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level)
4103 {
4104         struct kvm_shadow_walk_iterator iterator;
4105         int leaf = -1;
4106         u64 spte;
4107
4108         for (shadow_walk_init(&iterator, vcpu, addr),
4109              *root_level = iterator.level;
4110              shadow_walk_okay(&iterator);
4111              __shadow_walk_next(&iterator, spte)) {
4112                 leaf = iterator.level;
4113                 spte = mmu_spte_get_lockless(iterator.sptep);
4114
4115                 sptes[leaf] = spte;
4116         }
4117
4118         return leaf;
4119 }
4120
4121 /* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */
4122 static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
4123 {
4124         u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
4125         struct rsvd_bits_validate *rsvd_check;
4126         int root, leaf, level;
4127         bool reserved = false;
4128
4129         walk_shadow_page_lockless_begin(vcpu);
4130
4131         if (is_tdp_mmu_active(vcpu))
4132                 leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
4133         else
4134                 leaf = get_walk(vcpu, addr, sptes, &root);
4135
4136         walk_shadow_page_lockless_end(vcpu);
4137
4138         if (unlikely(leaf < 0)) {
4139                 *sptep = 0ull;
4140                 return reserved;
4141         }
4142
4143         *sptep = sptes[leaf];
4144
4145         /*
4146          * Skip reserved bits checks on the terminal leaf if it's not a valid
4147          * SPTE.  Note, this also (intentionally) skips MMIO SPTEs, which, by
4148          * design, always have reserved bits set.  The purpose of the checks is
4149          * to detect reserved bits on non-MMIO SPTEs. i.e. buggy SPTEs.
4150          */
4151         if (!is_shadow_present_pte(sptes[leaf]))
4152                 leaf++;
4153
4154         rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
4155
4156         for (level = root; level >= leaf; level--)
4157                 reserved |= is_rsvd_spte(rsvd_check, sptes[level], level);
4158
4159         if (reserved) {
4160                 pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
4161                        __func__, addr);
4162                 for (level = root; level >= leaf; level--)
4163                         pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
4164                                sptes[level], level,
4165                                get_rsvd_bits(rsvd_check, sptes[level], level));
4166         }
4167
4168         return reserved;
4169 }
4170
4171 static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
4172 {
4173         u64 spte;
4174         bool reserved;
4175
4176         if (mmio_info_in_cache(vcpu, addr, direct))
4177                 return RET_PF_EMULATE;
4178
4179         reserved = get_mmio_spte(vcpu, addr, &spte);
4180         if (WARN_ON(reserved))
4181                 return -EINVAL;
4182
4183         if (is_mmio_spte(spte)) {
4184                 gfn_t gfn = get_mmio_spte_gfn(spte);
4185                 unsigned int access = get_mmio_spte_access(spte);
4186
4187                 if (!check_mmio_spte(vcpu, spte))
4188                         return RET_PF_INVALID;
4189
4190                 if (direct)
4191                         addr = 0;
4192
4193                 trace_handle_mmio_page_fault(addr, gfn, access);
4194                 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
4195                 return RET_PF_EMULATE;
4196         }
4197
4198         /*
4199          * If the page table is zapped by other cpus, let CPU fault again on
4200          * the address.
4201          */
4202         return RET_PF_RETRY;
4203 }
4204
4205 static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
4206                                          struct kvm_page_fault *fault)
4207 {
4208         if (unlikely(fault->rsvd))
4209                 return false;
4210
4211         if (!fault->present || !fault->write)
4212                 return false;
4213
4214         /*
4215          * guest is writing the page which is write tracked which can
4216          * not be fixed by page fault handler.
4217          */
4218         if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE))
4219                 return true;
4220
4221         return false;
4222 }
4223
4224 static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
4225 {
4226         struct kvm_shadow_walk_iterator iterator;
4227         u64 spte;
4228
4229         walk_shadow_page_lockless_begin(vcpu);
4230         for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
4231                 clear_sp_write_flooding_count(iterator.sptep);
4232         walk_shadow_page_lockless_end(vcpu);
4233 }
4234
4235 static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
4236 {
4237         /* make sure the token value is not 0 */
4238         u32 id = vcpu->arch.apf.id;
4239
4240         if (id << 12 == 0)
4241                 vcpu->arch.apf.id = 1;
4242
4243         return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
4244 }
4245
4246 static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
4247                                     gfn_t gfn)
4248 {
4249         struct kvm_arch_async_pf arch;
4250
4251         arch.token = alloc_apf_token(vcpu);
4252         arch.gfn = gfn;
4253         arch.direct_map = vcpu->arch.mmu->root_role.direct;
4254         arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
4255
4256         return kvm_setup_async_pf(vcpu, cr2_or_gpa,
4257                                   kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
4258 }
4259
4260 void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
4261 {
4262         int r;
4263
4264         if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) ||
4265               work->wakeup_all)
4266                 return;
4267
4268         r = kvm_mmu_reload(vcpu);
4269         if (unlikely(r))
4270                 return;
4271
4272         if (!vcpu->arch.mmu->root_role.direct &&
4273               work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
4274                 return;
4275
4276         kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
4277 }
4278
4279 static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
4280 {
4281         struct kvm_memory_slot *slot = fault->slot;
4282         bool async;
4283
4284         /*
4285          * Retry the page fault if the gfn hit a memslot that is being deleted
4286          * or moved.  This ensures any existing SPTEs for the old memslot will
4287          * be zapped before KVM inserts a new MMIO SPTE for the gfn.
4288          */
4289         if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
4290                 return RET_PF_RETRY;
4291
4292         if (!kvm_is_visible_memslot(slot)) {
4293                 /* Don't expose private memslots to L2. */
4294                 if (is_guest_mode(vcpu)) {
4295                         fault->slot = NULL;
4296                         fault->pfn = KVM_PFN_NOSLOT;
4297                         fault->map_writable = false;
4298                         return RET_PF_CONTINUE;
4299                 }
4300                 /*
4301                  * If the APIC access page exists but is disabled, go directly
4302                  * to emulation without caching the MMIO access or creating a
4303                  * MMIO SPTE.  That way the cache doesn't need to be purged
4304                  * when the AVIC is re-enabled.
4305                  */
4306                 if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
4307                     !kvm_apicv_activated(vcpu->kvm))
4308                         return RET_PF_EMULATE;
4309         }
4310
4311         async = false;
4312         fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
4313                                           fault->write, &fault->map_writable,
4314                                           &fault->hva);
4315         if (!async)
4316                 return RET_PF_CONTINUE; /* *pfn has correct page already */
4317
4318         if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
4319                 trace_kvm_try_async_get_page(fault->addr, fault->gfn);
4320                 if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) {
4321                         trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn);
4322                         kvm_make_request(KVM_REQ_APF_HALT, vcpu);
4323                         return RET_PF_RETRY;
4324                 } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) {
4325                         return RET_PF_RETRY;
4326                 }
4327         }
4328
4329         /*
4330          * Allow gup to bail on pending non-fatal signals when it's also allowed
4331          * to wait for IO.  Note, gup always bails if it is unable to quickly
4332          * get a page and a fatal signal, i.e. SIGKILL, is pending.
4333          */
4334         fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL,
4335                                           fault->write, &fault->map_writable,
4336                                           &fault->hva);
4337         return RET_PF_CONTINUE;
4338 }
4339
4340 static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
4341                            unsigned int access)
4342 {
4343         int ret;
4344
4345         fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
4346         smp_rmb();
4347
4348         ret = __kvm_faultin_pfn(vcpu, fault);
4349         if (ret != RET_PF_CONTINUE)
4350                 return ret;
4351
4352         if (unlikely(is_error_pfn(fault->pfn)))
4353                 return kvm_handle_error_pfn(vcpu, fault);
4354
4355         if (unlikely(!fault->slot))
4356                 return kvm_handle_noslot_fault(vcpu, fault, access);
4357
4358         return RET_PF_CONTINUE;
4359 }
4360
4361 /*
4362  * Returns true if the page fault is stale and needs to be retried, i.e. if the
4363  * root was invalidated by a memslot update or a relevant mmu_notifier fired.
4364  */
4365 static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
4366                                 struct kvm_page_fault *fault)
4367 {
4368         struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa);
4369
4370         /* Special roots, e.g. pae_root, are not backed by shadow pages. */
4371         if (sp && is_obsolete_sp(vcpu->kvm, sp))
4372                 return true;
4373
4374         /*
4375          * Roots without an associated shadow page are considered invalid if
4376          * there is a pending request to free obsolete roots.  The request is
4377          * only a hint that the current root _may_ be obsolete and needs to be
4378          * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a
4379          * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
4380          * to reload even if no vCPU is actively using the root.
4381          */
4382         if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
4383                 return true;
4384
4385         return fault->slot &&
4386                mmu_invalidate_retry_hva(vcpu->kvm, fault->mmu_seq, fault->hva);
4387 }
4388
4389 static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
4390 {
4391         int r;
4392
4393         if (page_fault_handle_page_track(vcpu, fault))
4394                 return RET_PF_EMULATE;
4395
4396         r = fast_page_fault(vcpu, fault);
4397         if (r != RET_PF_INVALID)
4398                 return r;
4399
4400         r = mmu_topup_memory_caches(vcpu, false);
4401         if (r)
4402                 return r;
4403
4404         r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
4405         if (r != RET_PF_CONTINUE)
4406                 return r;
4407
4408         r = RET_PF_RETRY;
4409         write_lock(&vcpu->kvm->mmu_lock);
4410
4411         if (is_page_fault_stale(vcpu, fault))
4412                 goto out_unlock;
4413
4414         r = make_mmu_pages_available(vcpu);
4415         if (r)
4416                 goto out_unlock;
4417
4418         r = direct_map(vcpu, fault);
4419
4420 out_unlock:
4421         write_unlock(&vcpu->kvm->mmu_lock);
4422         kvm_release_pfn_clean(fault->pfn);
4423         return r;
4424 }
4425
4426 static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
4427                                 struct kvm_page_fault *fault)
4428 {
4429         pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code);
4430
4431         /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
4432         fault->max_level = PG_LEVEL_2M;
4433         return direct_page_fault(vcpu, fault);
4434 }
4435
4436 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
4437                                 u64 fault_address, char *insn, int insn_len)
4438 {
4439         int r = 1;
4440         u32 flags = vcpu->arch.apf.host_apf_flags;
4441
4442 #ifndef CONFIG_X86_64
4443         /* A 64-bit CR2 should be impossible on 32-bit KVM. */
4444         if (WARN_ON_ONCE(fault_address >> 32))
4445                 return -EFAULT;
4446 #endif
4447
4448         vcpu->arch.l1tf_flush_l1d = true;
4449         if (!flags) {
4450                 trace_kvm_page_fault(vcpu, fault_address, error_code);
4451
4452                 if (kvm_event_needs_reinjection(vcpu))
4453                         kvm_mmu_unprotect_page_virt(vcpu, fault_address);
4454                 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
4455                                 insn_len);
4456         } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
4457                 vcpu->arch.apf.host_apf_flags = 0;
4458                 local_irq_disable();
4459                 kvm_async_pf_task_wait_schedule(fault_address);
4460                 local_irq_enable();
4461         } else {
4462                 WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
4463         }
4464
4465         return r;
4466 }
4467 EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
4468
#ifdef CONFIG_X86_64
/*
 * TDP MMU flavour of the direct page fault handler: same sequence of page
 * tracking, fast path, cache top-up and pfn lookup, but the mapping is
 * installed by kvm_tdp_mmu_map() with mmu_lock held for read.
 */
static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
                                  struct kvm_page_fault *fault)
{
        int ret;

        if (page_fault_handle_page_track(vcpu, fault))
                return RET_PF_EMULATE;

        ret = fast_page_fault(vcpu, fault);
        if (ret != RET_PF_INVALID)
                return ret;

        ret = mmu_topup_memory_caches(vcpu, false);
        if (ret)
                return ret;

        ret = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
        if (ret != RET_PF_CONTINUE)
                return ret;

        read_lock(&vcpu->kvm->mmu_lock);

        if (is_page_fault_stale(vcpu, fault))
                ret = RET_PF_RETRY;
        else
                ret = kvm_tdp_mmu_map(vcpu, fault);

        read_unlock(&vcpu->kvm->mmu_lock);

        kvm_release_pfn_clean(fault->pfn);
        return ret;
}
#endif
4504
4505 int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
4506 {
4507         /*
4508          * If the guest's MTRRs may be used to compute the "real" memtype,
4509          * restrict the mapping level to ensure KVM uses a consistent memtype
4510          * across the entire mapping.  If the host MTRRs are ignored by TDP
4511          * (shadow_memtype_mask is non-zero), and the VM has non-coherent DMA
4512          * (DMA doesn't snoop CPU caches), KVM's ABI is to honor the memtype
4513          * from the guest's MTRRs so that guest accesses to memory that is
4514          * DMA'd aren't cached against the guest's wishes.
4515          *
4516          * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs,
4517          * e.g. KVM will force UC memtype for host MMIO.
4518          */
4519         if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
4520                 for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
4521                         int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
4522                         gfn_t base = gfn_round_for_level(fault->gfn,
4523                                                          fault->max_level);
4524
4525                         if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
4526                                 break;
4527                 }
4528         }
4529
4530 #ifdef CONFIG_X86_64
4531         if (tdp_mmu_enabled)
4532                 return kvm_tdp_mmu_page_fault(vcpu, fault);
4533 #endif
4534
4535         return direct_page_fault(vcpu, fault);
4536 }
4537
4538 static void nonpaging_init_context(struct kvm_mmu *context)
4539 {
4540         context->page_fault = nonpaging_page_fault;
4541         context->gva_to_gpa = nonpaging_gva_to_gpa;
4542         context->sync_spte = NULL;
4543 }
4544
4545 static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
4546                                   union kvm_mmu_page_role role)
4547 {
4548         return (role.direct || pgd == root->pgd) &&
4549                VALID_PAGE(root->hpa) &&
4550                role.word == to_shadow_page(root->hpa)->role.word;
4551 }
4552
4553 /*
4554  * Find out if a previously cached root matching the new pgd/role is available,
4555  * and insert the current root as the MRU in the cache.
4556  * If a matching root is found, it is assigned to kvm_mmu->root and
4557  * true is returned.
4558  * If no match is found, kvm_mmu->root is left invalid, the LRU root is
4559  * evicted to make room for the current root, and false is returned.
4560  */
4561 static bool cached_root_find_and_keep_current(struct kvm *kvm, struct kvm_mmu *mmu,
4562                                               gpa_t new_pgd,
4563                                               union kvm_mmu_page_role new_role)
4564 {
4565         uint i;
4566
4567         if (is_root_usable(&mmu->root, new_pgd, new_role))
4568                 return true;
4569
4570         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
4571                 /*
4572                  * The swaps end up rotating the cache like this:
4573                  *   C   0 1 2 3   (on entry to the function)
4574                  *   0   C 1 2 3
4575                  *   1   C 0 2 3
4576                  *   2   C 0 1 3
4577                  *   3   C 0 1 2   (on exit from the loop)
4578                  */
4579                 swap(mmu->root, mmu->prev_roots[i]);
4580                 if (is_root_usable(&mmu->root, new_pgd, new_role))
4581                         return true;
4582         }
4583
4584         kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
4585         return false;
4586 }
4587
4588 /*
4589  * Find out if a previously cached root matching the new pgd/role is available.
4590  * On entry, mmu->root is invalid.
4591  * If a matching root is found, it is assigned to kvm_mmu->root, the LRU entry
4592  * of the cache becomes invalid, and true is returned.
4593  * If no match is found, kvm_mmu->root is left invalid and false is returned.
4594  */
4595 static bool cached_root_find_without_current(struct kvm *kvm, struct kvm_mmu *mmu,
4596                                              gpa_t new_pgd,
4597                                              union kvm_mmu_page_role new_role)
4598 {
4599         uint i;
4600
4601         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
4602                 if (is_root_usable(&mmu->prev_roots[i], new_pgd, new_role))
4603                         goto hit;
4604
4605         return false;
4606
4607 hit:
4608         swap(mmu->root, mmu->prev_roots[i]);
4609         /* Bubble up the remaining roots.  */
4610         for (; i < KVM_MMU_NUM_PREV_ROOTS - 1; i++)
4611                 mmu->prev_roots[i] = mmu->prev_roots[i + 1];
4612         mmu->prev_roots[i].hpa = INVALID_PAGE;
4613         return true;
4614 }
4615
4616 static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
4617                             gpa_t new_pgd, union kvm_mmu_page_role new_role)
4618 {
4619         /*
4620          * For now, limit the caching to 64-bit hosts+VMs in order to avoid
4621          * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
4622          * later if necessary.
4623          */
4624         if (VALID_PAGE(mmu->root.hpa) && !to_shadow_page(mmu->root.hpa))
4625                 kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
4626
4627         if (VALID_PAGE(mmu->root.hpa))
4628                 return cached_root_find_and_keep_current(kvm, mmu, new_pgd, new_role);
4629         else
4630                 return cached_root_find_without_current(kvm, mmu, new_pgd, new_role);
4631 }
4632
4633 void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
4634 {
4635         struct kvm_mmu *mmu = vcpu->arch.mmu;
4636         union kvm_mmu_page_role new_role = mmu->root_role;
4637
4638         /*
4639          * Return immediately if no usable root was found, kvm_mmu_reload()
4640          * will establish a valid root prior to the next VM-Enter.
4641          */
4642         if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role))
4643                 return;
4644
4645         /*
4646          * It's possible that the cached previous root page is obsolete because
4647          * of a change in the MMU generation number. However, changing the
4648          * generation number is accompanied by KVM_REQ_MMU_FREE_OBSOLETE_ROOTS,
4649          * which will free the root set here and allocate a new one.
4650          */
4651         kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
4652
4653         if (force_flush_and_sync_on_reuse) {
4654                 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
4655                 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
4656         }
4657
4658         /*
4659          * The last MMIO access's GVA and GPA are cached in the VCPU. When
4660          * switching to a new CR3, that GVA->GPA mapping may no longer be
4661          * valid. So clear any cached MMIO info even when we don't need to sync
4662          * the shadow page tables.
4663          */
4664         vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4665
4666         /*
4667          * If this is a direct root page, it doesn't have a write flooding
4668          * count. Otherwise, clear the write flooding count.
4669          */
4670         if (!new_role.direct)
4671                 __clear_sp_write_flooding_count(
4672                                 to_shadow_page(vcpu->arch.mmu->root.hpa));
4673 }
4674 EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
4675
/* Return the vCPU's CR3 as read by kvm_read_cr3(). */
static unsigned long get_cr3(struct kvm_vcpu *vcpu)
{
        unsigned long cr3 = kvm_read_cr3(vcpu);

        return cr3;
}
4680
4681 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
4682                            unsigned int access)
4683 {
4684         if (unlikely(is_mmio_spte(*sptep))) {
4685                 if (gfn != get_mmio_spte_gfn(*sptep)) {
4686                         mmu_spte_clear_no_track(sptep);
4687                         return true;
4688                 }
4689
4690                 mark_mmio_spte(vcpu, sptep, gfn, access);
4691                 return true;
4692         }
4693
4694         return false;
4695 }
4696
4697 #define PTTYPE_EPT 18 /* arbitrary */
4698 #define PTTYPE PTTYPE_EPT
4699 #include "paging_tmpl.h"
4700 #undef PTTYPE
4701
4702 #define PTTYPE 64
4703 #include "paging_tmpl.h"
4704 #undef PTTYPE
4705
4706 #define PTTYPE 32
4707 #include "paging_tmpl.h"
4708 #undef PTTYPE
4709
4710 static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
4711                                     u64 pa_bits_rsvd, int level, bool nx,
4712                                     bool gbpages, bool pse, bool amd)
4713 {
4714         u64 gbpages_bit_rsvd = 0;
4715         u64 nonleaf_bit8_rsvd = 0;
4716         u64 high_bits_rsvd;
4717
4718         rsvd_check->bad_mt_xwr = 0;
4719
4720         if (!gbpages)
4721                 gbpages_bit_rsvd = rsvd_bits(7, 7);
4722
4723         if (level == PT32E_ROOT_LEVEL)
4724                 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62);
4725         else
4726                 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
4727
4728         /* Note, NX doesn't exist in PDPTEs, this is handled below. */
4729         if (!nx)
4730                 high_bits_rsvd |= rsvd_bits(63, 63);
4731
4732         /*
4733          * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
4734          * leaf entries) on AMD CPUs only.
4735          */
4736         if (amd)
4737                 nonleaf_bit8_rsvd = rsvd_bits(8, 8);
4738
4739         switch (level) {
4740         case PT32_ROOT_LEVEL:
4741                 /* no rsvd bits for 2 level 4K page table entries */
4742                 rsvd_check->rsvd_bits_mask[0][1] = 0;
4743                 rsvd_check->rsvd_bits_mask[0][0] = 0;
4744                 rsvd_check->rsvd_bits_mask[1][0] =
4745                         rsvd_check->rsvd_bits_mask[0][0];
4746
4747                 if (!pse) {
4748                         rsvd_check->rsvd_bits_mask[1][1] = 0;
4749                         break;
4750                 }
4751
4752                 if (is_cpuid_PSE36())
4753                         /* 36bits PSE 4MB page */
4754                         rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
4755                 else
4756                         /* 32 bits PSE 4MB page */
4757                         rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
4758                 break;
4759         case PT32E_ROOT_LEVEL:
4760                 rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) |
4761                                                    high_bits_rsvd |
4762                                                    rsvd_bits(5, 8) |
4763                                                    rsvd_bits(1, 2);     /* PDPTE */
4764                 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;      /* PDE */
4765                 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;      /* PTE */
4766                 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
4767                                                    rsvd_bits(13, 20);   /* large page */
4768                 rsvd_check->rsvd_bits_mask[1][0] =
4769                         rsvd_check->rsvd_bits_mask[0][0];
4770                 break;
4771         case PT64_ROOT_5LEVEL:
4772                 rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd |
4773                                                    nonleaf_bit8_rsvd |
4774                                                    rsvd_bits(7, 7);
4775                 rsvd_check->rsvd_bits_mask[1][4] =
4776                         rsvd_check->rsvd_bits_mask[0][4];
4777                 fallthrough;
4778         case PT64_ROOT_4LEVEL:
4779                 rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd |
4780                                                    nonleaf_bit8_rsvd |
4781                                                    rsvd_bits(7, 7);
4782                 rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd |
4783                                                    gbpages_bit_rsvd;
4784                 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;
4785                 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
4786                 rsvd_check->rsvd_bits_mask[1][3] =
4787                         rsvd_check->rsvd_bits_mask[0][3];
4788                 rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd |
4789                                                    gbpages_bit_rsvd |
4790                                                    rsvd_bits(13, 29);
4791                 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
4792                                                    rsvd_bits(13, 20); /* large page */
4793                 rsvd_check->rsvd_bits_mask[1][0] =
4794                         rsvd_check->rsvd_bits_mask[0][0];
4795                 break;
4796         }
4797 }
4798
4799 static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu)
4800 {
4801         /*
4802          * If TDP is enabled, let the guest use GBPAGES if they're supported in
4803          * hardware.  The hardware page walker doesn't let KVM disable GBPAGES,
4804          * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
4805          * walk for performance and complexity reasons.  Not to mention KVM
4806          * _can't_ solve the problem because GVA->GPA walks aren't visible to
4807          * KVM once a TDP translation is installed.  Mimic hardware behavior so
4808          * that KVM's is at least consistent, i.e. doesn't randomly inject #PF.
4809          */
4810         return tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
4811                              guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
4812 }
4813
4814 static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4815                                         struct kvm_mmu *context)
4816 {
4817         __reset_rsvds_bits_mask(&context->guest_rsvd_check,
4818                                 vcpu->arch.reserved_gpa_bits,
4819                                 context->cpu_role.base.level, is_efer_nx(context),
4820                                 guest_can_use_gbpages(vcpu),
4821                                 is_cr4_pse(context),
4822                                 guest_cpuid_is_amd_or_hygon(vcpu));
4823 }
4824
4825 static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
4826                                         u64 pa_bits_rsvd, bool execonly,
4827                                         int huge_page_level)
4828 {
4829         u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
4830         u64 large_1g_rsvd = 0, large_2m_rsvd = 0;
4831         u64 bad_mt_xwr;
4832
4833         if (huge_page_level < PG_LEVEL_1G)
4834                 large_1g_rsvd = rsvd_bits(7, 7);
4835         if (huge_page_level < PG_LEVEL_2M)
4836                 large_2m_rsvd = rsvd_bits(7, 7);
4837
4838         rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
4839         rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
4840         rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd;
4841         rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd;
4842         rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
4843
4844         /* large page */
4845         rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
4846         rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
4847         rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd;
4848         rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd;
4849         rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
4850
4851         bad_mt_xwr = 0xFFull << (2 * 8);        /* bits 3..5 must not be 2 */
4852         bad_mt_xwr |= 0xFFull << (3 * 8);       /* bits 3..5 must not be 3 */
4853         bad_mt_xwr |= 0xFFull << (7 * 8);       /* bits 3..5 must not be 7 */
4854         bad_mt_xwr |= REPEAT_BYTE(1ull << 2);   /* bits 0..2 must not be 010 */
4855         bad_mt_xwr |= REPEAT_BYTE(1ull << 6);   /* bits 0..2 must not be 110 */
4856         if (!execonly) {
4857                 /* bits 0..2 must not be 100 unless VMX capabilities allow it */
4858                 bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
4859         }
4860         rsvd_check->bad_mt_xwr = bad_mt_xwr;
4861 }
4862
4863 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
4864                 struct kvm_mmu *context, bool execonly, int huge_page_level)
4865 {
4866         __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
4867                                     vcpu->arch.reserved_gpa_bits, execonly,
4868                                     huge_page_level);
4869 }
4870
4871 static inline u64 reserved_hpa_bits(void)
4872 {
4873         return rsvd_bits(shadow_phys_bits, 63);
4874 }
4875
4876 /*
4877  * the page table on host is the shadow page table for the page
4878  * table in guest or amd nested guest, its mmu features completely
4879  * follow the features in guest.
4880  */
4881 static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4882                                         struct kvm_mmu *context)
4883 {
4884         /* @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. */
4885         bool is_amd = true;
4886         /* KVM doesn't use 2-level page tables for the shadow MMU. */
4887         bool is_pse = false;
4888         struct rsvd_bits_validate *shadow_zero_check;
4889         int i;
4890
4891         WARN_ON_ONCE(context->root_role.level < PT32E_ROOT_LEVEL);
4892
4893         shadow_zero_check = &context->shadow_zero_check;
4894         __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
4895                                 context->root_role.level,
4896                                 context->root_role.efer_nx,
4897                                 guest_can_use_gbpages(vcpu), is_pse, is_amd);
4898
4899         if (!shadow_me_mask)
4900                 return;
4901
4902         for (i = context->root_role.level; --i >= 0;) {
4903                 /*
4904                  * So far shadow_me_value is a constant during KVM's life
4905                  * time.  Bits in shadow_me_value are allowed to be set.
4906                  * Bits in shadow_me_mask but not in shadow_me_value are
4907                  * not allowed to be set.
4908                  */
4909                 shadow_zero_check->rsvd_bits_mask[0][i] |= shadow_me_mask;
4910                 shadow_zero_check->rsvd_bits_mask[1][i] |= shadow_me_mask;
4911                 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_value;
4912                 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_value;
4913         }
4914
4915 }
4916
4917 static inline bool boot_cpu_is_amd(void)
4918 {
4919         WARN_ON_ONCE(!tdp_enabled);
4920         return shadow_x_mask == 0;
4921 }
4922
4923 /*
4924  * the direct page table on host, use as much mmu features as
4925  * possible, however, kvm currently does not do execution-protection.
4926  */
4927 static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
4928 {
4929         struct rsvd_bits_validate *shadow_zero_check;
4930         int i;
4931
4932         shadow_zero_check = &context->shadow_zero_check;
4933
4934         if (boot_cpu_is_amd())
4935                 __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
4936                                         context->root_role.level, true,
4937                                         boot_cpu_has(X86_FEATURE_GBPAGES),
4938                                         false, true);
4939         else
4940                 __reset_rsvds_bits_mask_ept(shadow_zero_check,
4941                                             reserved_hpa_bits(), false,
4942                                             max_huge_page_level);
4943
4944         if (!shadow_me_mask)
4945                 return;
4946
4947         for (i = context->root_role.level; --i >= 0;) {
4948                 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4949                 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4950         }
4951 }
4952
4953 /*
4954  * as the comments in reset_shadow_zero_bits_mask() except it
4955  * is the shadow page table for intel nested guest.
4956  */
4957 static void
4958 reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
4959 {
4960         __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
4961                                     reserved_hpa_bits(), execonly,
4962                                     max_huge_page_level);
4963 }
4964
/*
 * Expand @access into an 8-bit bitmap: bit N (for N in 1..7) is set iff
 * (N & access) is non-zero.  Bit 0 is never set.
 */
#define BYTE_MASK(access) \
	((((access) & 1) ? 0x02 : 0) | \
	 (((access) & 2) ? 0x04 : 0) | \
	 (((access) & 3) ? 0x08 : 0) | \
	 (((access) & 4) ? 0x10 : 0) | \
	 (((access) & 5) ? 0x20 : 0) | \
	 (((access) & 6) ? 0x40 : 0) | \
	 (((access) & 7) ? 0x80 : 0))
4973
4974
4975 static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
4976 {
4977         unsigned byte;
4978
4979         const u8 x = BYTE_MASK(ACC_EXEC_MASK);
4980         const u8 w = BYTE_MASK(ACC_WRITE_MASK);
4981         const u8 u = BYTE_MASK(ACC_USER_MASK);
4982
4983         bool cr4_smep = is_cr4_smep(mmu);
4984         bool cr4_smap = is_cr4_smap(mmu);
4985         bool cr0_wp = is_cr0_wp(mmu);
4986         bool efer_nx = is_efer_nx(mmu);
4987
4988         for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
4989                 unsigned pfec = byte << 1;
4990
4991                 /*
4992                  * Each "*f" variable has a 1 bit for each UWX value
4993                  * that causes a fault with the given PFEC.
4994                  */
4995
4996                 /* Faults from writes to non-writable pages */
4997                 u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
4998                 /* Faults from user mode accesses to supervisor pages */
4999                 u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
5000                 /* Faults from fetches of non-executable pages*/
5001                 u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
5002                 /* Faults from kernel mode fetches of user pages */
5003                 u8 smepf = 0;
5004                 /* Faults from kernel mode accesses of user pages */
5005                 u8 smapf = 0;
5006
5007                 if (!ept) {
5008                         /* Faults from kernel mode accesses to user pages */
5009                         u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
5010
5011                         /* Not really needed: !nx will cause pte.nx to fault */
5012                         if (!efer_nx)
5013                                 ff = 0;
5014
5015                         /* Allow supervisor writes if !cr0.wp */
5016                         if (!cr0_wp)
5017                                 wf = (pfec & PFERR_USER_MASK) ? wf : 0;
5018
5019                         /* Disallow supervisor fetches of user code if cr4.smep */
5020                         if (cr4_smep)
5021                                 smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
5022
5023                         /*
5024                          * SMAP:kernel-mode data accesses from user-mode
5025                          * mappings should fault. A fault is considered
5026                          * as a SMAP violation if all of the following
5027                          * conditions are true:
5028                          *   - X86_CR4_SMAP is set in CR4
5029                          *   - A user page is accessed
5030                          *   - The access is not a fetch
5031                          *   - The access is supervisor mode
5032                          *   - If implicit supervisor access or X86_EFLAGS_AC is clear
5033                          *
5034                          * Here, we cover the first four conditions.
5035                          * The fifth is computed dynamically in permission_fault();
5036                          * PFERR_RSVD_MASK bit will be set in PFEC if the access is
5037                          * *not* subject to SMAP restrictions.
5038                          */
5039                         if (cr4_smap)
5040                                 smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
5041                 }
5042
5043                 mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
5044         }
5045 }
5046
5047 /*
5048 * PKU is an additional mechanism by which the paging controls access to
5049 * user-mode addresses based on the value in the PKRU register.  Protection
5050 * key violations are reported through a bit in the page fault error code.
5051 * Unlike other bits of the error code, the PK bit is not known at the
5052 * call site of e.g. gva_to_gpa; it must be computed directly in
5053 * permission_fault based on two bits of PKRU, on some machine state (CR4,
5054 * CR0, EFER, CPL), and on other bits of the error code and the page tables.
5055 *
5056 * In particular the following conditions come from the error code, the
5057 * page tables and the machine state:
5058 * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
5059 * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
5060 * - PK is always zero if U=0 in the page tables
5061 * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
5062 *
5063 * The PKRU bitmask caches the result of these four conditions.  The error
5064 * code (minus the P bit) and the page table's U bit form an index into the
5065 * PKRU bitmask.  Two bits of the PKRU bitmask are then extracted and ANDed
5066 * with the two bits of the PKRU register corresponding to the protection key.
5067 * For the first three conditions above the bits will be 00, thus masking
5068 * away both AD and WD.  For all reads or if the last condition holds, WD
5069 * only will be masked away.
5070 */
5071 static void update_pkru_bitmask(struct kvm_mmu *mmu)
5072 {
5073         unsigned bit;
5074         bool wp;
5075
5076         mmu->pkru_mask = 0;
5077
5078         if (!is_cr4_pke(mmu))
5079                 return;
5080
5081         wp = is_cr0_wp(mmu);
5082
5083         for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
5084                 unsigned pfec, pkey_bits;
5085                 bool check_pkey, check_write, ff, uf, wf, pte_user;
5086
5087                 pfec = bit << 1;
5088                 ff = pfec & PFERR_FETCH_MASK;
5089                 uf = pfec & PFERR_USER_MASK;
5090                 wf = pfec & PFERR_WRITE_MASK;
5091
5092                 /* PFEC.RSVD is replaced by ACC_USER_MASK. */
5093                 pte_user = pfec & PFERR_RSVD_MASK;
5094
5095                 /*
5096                  * Only need to check the access which is not an
5097                  * instruction fetch and is to a user page.
5098                  */
5099                 check_pkey = (!ff && pte_user);
5100                 /*
5101                  * write access is controlled by PKRU if it is a
5102                  * user access or CR0.WP = 1.
5103                  */
5104                 check_write = check_pkey && wf && (uf || wp);
5105
5106                 /* PKRU.AD stops both read and write access. */
5107                 pkey_bits = !!check_pkey;
5108                 /* PKRU.WD stops write access. */
5109                 pkey_bits |= (!!check_write) << 1;
5110
5111                 mmu->pkru_mask |= (pkey_bits & 3) << pfec;
5112         }
5113 }
5114
5115 static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
5116                                         struct kvm_mmu *mmu)
5117 {
5118         if (!is_cr0_pg(mmu))
5119                 return;
5120
5121         reset_guest_rsvds_bits_mask(vcpu, mmu);
5122         update_permission_bitmask(mmu, false);
5123         update_pkru_bitmask(mmu);
5124 }
5125
5126 static void paging64_init_context(struct kvm_mmu *context)
5127 {
5128         context->page_fault = paging64_page_fault;
5129         context->gva_to_gpa = paging64_gva_to_gpa;
5130         context->sync_spte = paging64_sync_spte;
5131 }
5132
5133 static void paging32_init_context(struct kvm_mmu *context)
5134 {
5135         context->page_fault = paging32_page_fault;
5136         context->gva_to_gpa = paging32_gva_to_gpa;
5137         context->sync_spte = paging32_sync_spte;
5138 }
5139
5140 static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu,
5141                                             const struct kvm_mmu_role_regs *regs)
5142 {
5143         union kvm_cpu_role role = {0};
5144
5145         role.base.access = ACC_ALL;
5146         role.base.smm = is_smm(vcpu);
5147         role.base.guest_mode = is_guest_mode(vcpu);
5148         role.ext.valid = 1;
5149
5150         if (!____is_cr0_pg(regs)) {
5151                 role.base.direct = 1;
5152                 return role;
5153         }
5154
5155         role.base.efer_nx = ____is_efer_nx(regs);
5156         role.base.cr0_wp = ____is_cr0_wp(regs);
5157         role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs);
5158         role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs);
5159         role.base.has_4_byte_gpte = !____is_cr4_pae(regs);
5160
5161         if (____is_efer_lma(regs))
5162                 role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL
5163                                                         : PT64_ROOT_4LEVEL;
5164         else if (____is_cr4_pae(regs))
5165                 role.base.level = PT32E_ROOT_LEVEL;
5166         else
5167                 role.base.level = PT32_ROOT_LEVEL;
5168
5169         role.ext.cr4_smep = ____is_cr4_smep(regs);
5170         role.ext.cr4_smap = ____is_cr4_smap(regs);
5171         role.ext.cr4_pse = ____is_cr4_pse(regs);
5172
5173         /* PKEY and LA57 are active iff long mode is active. */
5174         role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
5175         role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
5176         role.ext.efer_lma = ____is_efer_lma(regs);
5177         return role;
5178 }
5179
5180 static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
5181 {
5182         /* tdp_root_level is architecture forced level, use it if nonzero */
5183         if (tdp_root_level)
5184                 return tdp_root_level;
5185
5186         /* Use 5-level TDP if and only if it's useful/necessary. */
5187         if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
5188                 return 4;
5189
5190         return max_tdp_level;
5191 }
5192
5193 static union kvm_mmu_page_role
5194 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
5195                                 union kvm_cpu_role cpu_role)
5196 {
5197         union kvm_mmu_page_role role = {0};
5198
5199         role.access = ACC_ALL;
5200         role.cr0_wp = true;
5201         role.efer_nx = true;
5202         role.smm = cpu_role.base.smm;
5203         role.guest_mode = cpu_role.base.guest_mode;
5204         role.ad_disabled = !kvm_ad_enabled();
5205         role.level = kvm_mmu_get_tdp_level(vcpu);
5206         role.direct = true;
5207         role.has_4_byte_gpte = false;
5208
5209         return role;
5210 }
5211
5212 static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
5213                              union kvm_cpu_role cpu_role)
5214 {
5215         struct kvm_mmu *context = &vcpu->arch.root_mmu;
5216         union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_role);
5217
5218         if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
5219             root_role.word == context->root_role.word)
5220                 return;
5221
5222         context->cpu_role.as_u64 = cpu_role.as_u64;
5223         context->root_role.word = root_role.word;
5224         context->page_fault = kvm_tdp_page_fault;
5225         context->sync_spte = NULL;
5226         context->get_guest_pgd = get_cr3;
5227         context->get_pdptr = kvm_pdptr_read;
5228         context->inject_page_fault = kvm_inject_page_fault;
5229
5230         if (!is_cr0_pg(context))
5231                 context->gva_to_gpa = nonpaging_gva_to_gpa;
5232         else if (is_cr4_pae(context))
5233                 context->gva_to_gpa = paging64_gva_to_gpa;
5234         else
5235                 context->gva_to_gpa = paging32_gva_to_gpa;
5236
5237         reset_guest_paging_metadata(vcpu, context);
5238         reset_tdp_shadow_zero_bits_mask(context);
5239 }
5240
5241 static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
5242                                     union kvm_cpu_role cpu_role,
5243                                     union kvm_mmu_page_role root_role)
5244 {
5245         if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
5246             root_role.word == context->root_role.word)
5247                 return;
5248
5249         context->cpu_role.as_u64 = cpu_role.as_u64;
5250         context->root_role.word = root_role.word;
5251
5252         if (!is_cr0_pg(context))
5253                 nonpaging_init_context(context);
5254         else if (is_cr4_pae(context))
5255                 paging64_init_context(context);
5256         else
5257                 paging32_init_context(context);
5258
5259         reset_guest_paging_metadata(vcpu, context);
5260         reset_shadow_zero_bits_mask(vcpu, context);
5261 }
5262
5263 static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
5264                                 union kvm_cpu_role cpu_role)
5265 {
5266         struct kvm_mmu *context = &vcpu->arch.root_mmu;
5267         union kvm_mmu_page_role root_role;
5268
5269         root_role = cpu_role.base;
5270
5271         /* KVM uses PAE paging whenever the guest isn't using 64-bit paging. */
5272         root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL);
5273
5274         /*
5275          * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role.
5276          * KVM uses NX when TDP is disabled to handle a variety of scenarios,
5277          * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
5278          * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
5279          * The iTLB multi-hit workaround can be toggled at any time, so assume
5280          * NX can be used by any non-nested shadow MMU to avoid having to reset
5281          * MMU contexts.
5282          */
5283         root_role.efer_nx = true;
5284
5285         shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
5286 }
5287
5288 void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
5289                              unsigned long cr4, u64 efer, gpa_t nested_cr3)
5290 {
5291         struct kvm_mmu *context = &vcpu->arch.guest_mmu;
5292         struct kvm_mmu_role_regs regs = {
5293                 .cr0 = cr0,
5294                 .cr4 = cr4 & ~X86_CR4_PKE,
5295                 .efer = efer,
5296         };
5297         union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
5298         union kvm_mmu_page_role root_role;
5299
5300         /* NPT requires CR0.PG=1. */
5301         WARN_ON_ONCE(cpu_role.base.direct);
5302
5303         root_role = cpu_role.base;
5304         root_role.level = kvm_mmu_get_tdp_level(vcpu);
5305         if (root_role.level == PT64_ROOT_5LEVEL &&
5306             cpu_role.base.level == PT64_ROOT_4LEVEL)
5307                 root_role.passthrough = 1;
5308
5309         shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
5310         kvm_mmu_new_pgd(vcpu, nested_cr3);
5311 }
5312 EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
5313
5314 static union kvm_cpu_role
5315 kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
5316                                    bool execonly, u8 level)
5317 {
5318         union kvm_cpu_role role = {0};
5319
5320         /*
5321          * KVM does not support SMM transfer monitors, and consequently does not
5322          * support the "entry to SMM" control either.  role.base.smm is always 0.
5323          */
5324         WARN_ON_ONCE(is_smm(vcpu));
5325         role.base.level = level;
5326         role.base.has_4_byte_gpte = false;
5327         role.base.direct = false;
5328         role.base.ad_disabled = !accessed_dirty;
5329         role.base.guest_mode = true;
5330         role.base.access = ACC_ALL;
5331
5332         role.ext.word = 0;
5333         role.ext.execonly = execonly;
5334         role.ext.valid = 1;
5335
5336         return role;
5337 }
5338
5339 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
5340                              int huge_page_level, bool accessed_dirty,
5341                              gpa_t new_eptp)
5342 {
5343         struct kvm_mmu *context = &vcpu->arch.guest_mmu;
5344         u8 level = vmx_eptp_page_walk_level(new_eptp);
5345         union kvm_cpu_role new_mode =
5346                 kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
5347                                                    execonly, level);
5348
5349         if (new_mode.as_u64 != context->cpu_role.as_u64) {
5350                 /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
5351                 context->cpu_role.as_u64 = new_mode.as_u64;
5352                 context->root_role.word = new_mode.base.word;
5353
5354                 context->page_fault = ept_page_fault;
5355                 context->gva_to_gpa = ept_gva_to_gpa;
5356                 context->sync_spte = ept_sync_spte;
5357
5358                 update_permission_bitmask(context, true);
5359                 context->pkru_mask = 0;
5360                 reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
5361                 reset_ept_shadow_zero_bits_mask(context, execonly);
5362         }
5363
5364         kvm_mmu_new_pgd(vcpu, new_eptp);
5365 }
5366 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
5367
5368 static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
5369                              union kvm_cpu_role cpu_role)
5370 {
5371         struct kvm_mmu *context = &vcpu->arch.root_mmu;
5372
5373         kvm_init_shadow_mmu(vcpu, cpu_role);
5374
5375         context->get_guest_pgd     = get_cr3;
5376         context->get_pdptr         = kvm_pdptr_read;
5377         context->inject_page_fault = kvm_inject_page_fault;
5378 }
5379
5380 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
5381                                 union kvm_cpu_role new_mode)
5382 {
5383         struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
5384
5385         if (new_mode.as_u64 == g_context->cpu_role.as_u64)
5386                 return;
5387
5388         g_context->cpu_role.as_u64   = new_mode.as_u64;
5389         g_context->get_guest_pgd     = get_cr3;
5390         g_context->get_pdptr         = kvm_pdptr_read;
5391         g_context->inject_page_fault = kvm_inject_page_fault;
5392
5393         /*
5394          * L2 page tables are never shadowed, so there is no need to sync
5395          * SPTEs.
5396          */
5397         g_context->sync_spte         = NULL;
5398
5399         /*
5400          * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
5401          * L1's nested page tables (e.g. EPT12). The nested translation
5402          * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
5403          * L2's page tables as the first level of translation and L1's
5404          * nested page tables as the second level of translation. Basically
5405          * the gva_to_gpa functions between mmu and nested_mmu are swapped.
5406          */
5407         if (!is_paging(vcpu))
5408                 g_context->gva_to_gpa = nonpaging_gva_to_gpa;
5409         else if (is_long_mode(vcpu))
5410                 g_context->gva_to_gpa = paging64_gva_to_gpa;
5411         else if (is_pae(vcpu))
5412                 g_context->gva_to_gpa = paging64_gva_to_gpa;
5413         else
5414                 g_context->gva_to_gpa = paging32_gva_to_gpa;
5415
5416         reset_guest_paging_metadata(vcpu, g_context);
5417 }
5418
5419 void kvm_init_mmu(struct kvm_vcpu *vcpu)
5420 {
5421         struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
5422         union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
5423
5424         if (mmu_is_nested(vcpu))
5425                 init_kvm_nested_mmu(vcpu, cpu_role);
5426         else if (tdp_enabled)
5427                 init_kvm_tdp_mmu(vcpu, cpu_role);
5428         else
5429                 init_kvm_softmmu(vcpu, cpu_role);
5430 }
5431 EXPORT_SYMBOL_GPL(kvm_init_mmu);
5432
5433 void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
5434 {
5435         /*
5436          * Invalidate all MMU roles to force them to reinitialize as CPUID
5437          * information is factored into reserved bit calculations.
5438          *
5439          * Correctly handling multiple vCPU models with respect to paging and
5440          * physical address properties) in a single VM would require tracking
5441          * all relevant CPUID information in kvm_mmu_page_role. That is very
5442          * undesirable as it would increase the memory requirements for
5443          * gfn_track (see struct kvm_mmu_page_role comments).  For now that
5444          * problem is swept under the rug; KVM's CPUID API is horrific and
5445          * it's all but impossible to solve it without introducing a new API.
5446          */
5447         vcpu->arch.root_mmu.root_role.word = 0;
5448         vcpu->arch.guest_mmu.root_role.word = 0;
5449         vcpu->arch.nested_mmu.root_role.word = 0;
5450         vcpu->arch.root_mmu.cpu_role.ext.valid = 0;
5451         vcpu->arch.guest_mmu.cpu_role.ext.valid = 0;
5452         vcpu->arch.nested_mmu.cpu_role.ext.valid = 0;
5453         kvm_mmu_reset_context(vcpu);
5454
5455         /*
5456          * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
5457          * kvm_arch_vcpu_ioctl().
5458          */
5459         KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm);
5460 }
5461
/* Unload the vCPU's MMU and then initialize it again from current state. */
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
	/* Unload first, then re-run the MMU initialization. */
	kvm_mmu_unload(vcpu);

	kvm_init_mmu(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
5468
5469 int kvm_mmu_load(struct kvm_vcpu *vcpu)
5470 {
5471         int r;
5472
5473         r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct);
5474         if (r)
5475                 goto out;
5476         r = mmu_alloc_special_roots(vcpu);
5477         if (r)
5478                 goto out;
5479         if (vcpu->arch.mmu->root_role.direct)
5480                 r = mmu_alloc_direct_roots(vcpu);
5481         else
5482                 r = mmu_alloc_shadow_roots(vcpu);
5483         if (r)
5484                 goto out;
5485
5486         kvm_mmu_sync_roots(vcpu);
5487
5488         kvm_mmu_load_pgd(vcpu);
5489
5490         /*
5491          * Flush any TLB entries for the new root, the provenance of the root
5492          * is unknown.  Even if KVM ensures there are no stale TLB entries
5493          * for a freed root, in theory another hypervisor could have left
5494          * stale entries.  Flushing on alloc also allows KVM to skip the TLB
5495          * flush when freeing a root (see kvm_tdp_mmu_put_root()).
5496          */
5497         static_call(kvm_x86_flush_tlb_current)(vcpu);
5498 out:
5499         return r;
5500 }
5501
5502 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
5503 {
5504         struct kvm *kvm = vcpu->kvm;
5505
5506         kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
5507         WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
5508         kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
5509         WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
5510         vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
5511 }
5512
5513 static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa)
5514 {
5515         struct kvm_mmu_page *sp;
5516
5517         if (!VALID_PAGE(root_hpa))
5518                 return false;
5519
5520         /*
5521          * When freeing obsolete roots, treat roots as obsolete if they don't
5522          * have an associated shadow page.  This does mean KVM will get false
5523          * positives and free roots that don't strictly need to be freed, but
5524          * such false positives are relatively rare:
5525          *
5526          *  (a) only PAE paging and nested NPT has roots without shadow pages
5527          *  (b) remote reloads due to a memslot update obsoletes _all_ roots
5528          *  (c) KVM doesn't track previous roots for PAE paging, and the guest
5529          *      is unlikely to zap an in-use PGD.
5530          */
5531         sp = to_shadow_page(root_hpa);
5532         return !sp || is_obsolete_sp(kvm, sp);
5533 }
5534
5535 static void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu)
5536 {
5537         unsigned long roots_to_free = 0;
5538         int i;
5539
5540         if (is_obsolete_root(kvm, mmu->root.hpa))
5541                 roots_to_free |= KVM_MMU_ROOT_CURRENT;
5542
5543         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5544                 if (is_obsolete_root(kvm, mmu->prev_roots[i].hpa))
5545                         roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
5546         }
5547
5548         if (roots_to_free)
5549                 kvm_mmu_free_roots(kvm, mmu, roots_to_free);
5550 }
5551
5552 void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
5553 {
5554         __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu);
5555         __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);
5556 }
5557
5558 static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
5559                                     int *bytes)
5560 {
5561         u64 gentry = 0;
5562         int r;
5563
5564         /*
5565          * Assume that the pte write on a page table of the same type
5566          * as the current vcpu paging mode since we update the sptes only
5567          * when they have the same mode.
5568          */
5569         if (is_pae(vcpu) && *bytes == 4) {
5570                 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
5571                 *gpa &= ~(gpa_t)7;
5572                 *bytes = 8;
5573         }
5574
5575         if (*bytes == 4 || *bytes == 8) {
5576                 r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
5577                 if (r)
5578                         gentry = 0;
5579         }
5580
5581         return gentry;
5582 }
5583
5584 /*
5585  * If we're seeing too many writes to a page, it may no longer be a page table,
5586  * or we may be forking, in which case it is better to unmap the page.
5587  */
5588 static bool detect_write_flooding(struct kvm_mmu_page *sp)
5589 {
5590         /*
5591          * Skip write-flooding detected for the sp whose level is 1, because
5592          * it can become unsync, then the guest page is not write-protected.
5593          */
5594         if (sp->role.level == PG_LEVEL_4K)
5595                 return false;
5596
5597         atomic_inc(&sp->write_flooding_count);
5598         return atomic_read(&sp->write_flooding_count) >= 3;
5599 }
5600
5601 /*
5602  * Misaligned accesses are too much trouble to fix up; also, they usually
5603  * indicate a page is not used as a page table.
5604  */
5605 static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
5606                                     int bytes)
5607 {
5608         unsigned offset, pte_size, misaligned;
5609
5610         pgprintk("misaligned: gpa %llx bytes %d role %x\n",
5611                  gpa, bytes, sp->role.word);
5612
5613         offset = offset_in_page(gpa);
5614         pte_size = sp->role.has_4_byte_gpte ? 4 : 8;
5615
5616         /*
5617          * Sometimes, the OS only writes the last one bytes to update status
5618          * bits, for example, in linux, andb instruction is used in clear_bit().
5619          */
5620         if (!(offset & (pte_size - 1)) && bytes == 1)
5621                 return false;
5622
5623         misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
5624         misaligned |= bytes < 4;
5625
5626         return misaligned;
5627 }
5628
5629 static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
5630 {
5631         unsigned page_offset, quadrant;
5632         u64 *spte;
5633         int level;
5634
5635         page_offset = offset_in_page(gpa);
5636         level = sp->role.level;
5637         *nspte = 1;
5638         if (sp->role.has_4_byte_gpte) {
5639                 page_offset <<= 1;      /* 32->64 */
5640                 /*
5641                  * A 32-bit pde maps 4MB while the shadow pdes map
5642                  * only 2MB.  So we need to double the offset again
5643                  * and zap two pdes instead of one.
5644                  */
5645                 if (level == PT32_ROOT_LEVEL) {
5646                         page_offset &= ~7; /* kill rounding error */
5647                         page_offset <<= 1;
5648                         *nspte = 2;
5649                 }
5650                 quadrant = page_offset >> PAGE_SHIFT;
5651                 page_offset &= ~PAGE_MASK;
5652                 if (quadrant != sp->role.quadrant)
5653                         return NULL;
5654         }
5655
5656         spte = &sp->spt[page_offset / sizeof(*spte)];
5657         return spte;
5658 }
5659
5660 static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
5661                               const u8 *new, int bytes,
5662                               struct kvm_page_track_notifier_node *node)
5663 {
5664         gfn_t gfn = gpa >> PAGE_SHIFT;
5665         struct kvm_mmu_page *sp;
5666         LIST_HEAD(invalid_list);
5667         u64 entry, gentry, *spte;
5668         int npte;
5669         bool flush = false;
5670
5671         /*
5672          * If we don't have indirect shadow pages, it means no page is
5673          * write-protected, so we can exit simply.
5674          */
5675         if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
5676                 return;
5677
5678         pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
5679
5680         write_lock(&vcpu->kvm->mmu_lock);
5681
5682         gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
5683
5684         ++vcpu->kvm->stat.mmu_pte_write;
5685
5686         for_each_gfn_valid_sp_with_gptes(vcpu->kvm, sp, gfn) {
5687                 if (detect_write_misaligned(sp, gpa, bytes) ||
5688                       detect_write_flooding(sp)) {
5689                         kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
5690                         ++vcpu->kvm->stat.mmu_flooded;
5691                         continue;
5692                 }
5693
5694                 spte = get_written_sptes(sp, gpa, &npte);
5695                 if (!spte)
5696                         continue;
5697
5698                 while (npte--) {
5699                         entry = *spte;
5700                         mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
5701                         if (gentry && sp->role.level != PG_LEVEL_4K)
5702                                 ++vcpu->kvm->stat.mmu_pde_zapped;
5703                         if (is_shadow_present_pte(entry))
5704                                 flush = true;
5705                         ++spte;
5706                 }
5707         }
5708         kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
5709         write_unlock(&vcpu->kvm->mmu_lock);
5710 }
5711
5712 int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
5713                        void *insn, int insn_len)
5714 {
5715         int r, emulation_type = EMULTYPE_PF;
5716         bool direct = vcpu->arch.mmu->root_role.direct;
5717
5718         if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
5719                 return RET_PF_RETRY;
5720
5721         r = RET_PF_INVALID;
5722         if (unlikely(error_code & PFERR_RSVD_MASK)) {
5723                 r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
5724                 if (r == RET_PF_EMULATE)
5725                         goto emulate;
5726         }
5727
5728         if (r == RET_PF_INVALID) {
5729                 r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
5730                                           lower_32_bits(error_code), false,
5731                                           &emulation_type);
5732                 if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
5733                         return -EIO;
5734         }
5735
5736         if (r < 0)
5737                 return r;
5738         if (r != RET_PF_EMULATE)
5739                 return 1;
5740
5741         /*
5742          * Before emulating the instruction, check if the error code
5743          * was due to a RO violation while translating the guest page.
5744          * This can occur when using nested virtualization with nested
5745          * paging in both guests. If true, we simply unprotect the page
5746          * and resume the guest.
5747          */
5748         if (vcpu->arch.mmu->root_role.direct &&
5749             (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
5750                 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
5751                 return 1;
5752         }
5753
5754         /*
5755          * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
5756          * optimistically try to just unprotect the page and let the processor
5757          * re-execute the instruction that caused the page fault.  Do not allow
5758          * retrying MMIO emulation, as it's not only pointless but could also
5759          * cause us to enter an infinite loop because the processor will keep
5760          * faulting on the non-existent MMIO address.  Retrying an instruction
5761          * from a nested guest is also pointless and dangerous as we are only
5762          * explicitly shadowing L1's page tables, i.e. unprotecting something
5763          * for L1 isn't going to magically fix whatever issue cause L2 to fail.
5764          */
5765         if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
5766                 emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
5767 emulate:
5768         return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
5769                                        insn_len);
5770 }
5771 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
5772
5773 static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
5774                                       u64 addr, hpa_t root_hpa)
5775 {
5776         struct kvm_shadow_walk_iterator iterator;
5777
5778         vcpu_clear_mmio_info(vcpu, addr);
5779
5780         if (!VALID_PAGE(root_hpa))
5781                 return;
5782
5783         write_lock(&vcpu->kvm->mmu_lock);
5784         for_each_shadow_entry_using_root(vcpu, root_hpa, addr, iterator) {
5785                 struct kvm_mmu_page *sp = sptep_to_sp(iterator.sptep);
5786
5787                 if (sp->unsync) {
5788                         int ret = kvm_sync_spte(vcpu, sp, iterator.index);
5789
5790                         if (ret < 0)
5791                                 mmu_page_zap_pte(vcpu->kvm, sp, iterator.sptep, NULL);
5792                         if (ret)
5793                                 kvm_flush_remote_tlbs_sptep(vcpu->kvm, iterator.sptep);
5794                 }
5795
5796                 if (!sp->unsync_children)
5797                         break;
5798         }
5799         write_unlock(&vcpu->kvm->mmu_lock);
5800 }
5801
5802 void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
5803                              u64 addr, unsigned long roots)
5804 {
5805         int i;
5806
5807         WARN_ON_ONCE(roots & ~KVM_MMU_ROOTS_ALL);
5808
5809         /* It's actually a GPA for vcpu->arch.guest_mmu.  */
5810         if (mmu != &vcpu->arch.guest_mmu) {
5811                 /* INVLPG on a non-canonical address is a NOP according to the SDM.  */
5812                 if (is_noncanonical_address(addr, vcpu))
5813                         return;
5814
5815                 static_call(kvm_x86_flush_tlb_gva)(vcpu, addr);
5816         }
5817
5818         if (!mmu->sync_spte)
5819                 return;
5820
5821         if (roots & KVM_MMU_ROOT_CURRENT)
5822                 __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->root.hpa);
5823
5824         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5825                 if (roots & KVM_MMU_ROOT_PREVIOUS(i))
5826                         __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->prev_roots[i].hpa);
5827         }
5828 }
5829 EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_addr);
5830
5831 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
5832 {
5833         /*
5834          * INVLPG is required to invalidate any global mappings for the VA,
5835          * irrespective of PCID.  Blindly sync all roots as it would take
5836          * roughly the same amount of work/time to determine whether any of the
5837          * previous roots have a global mapping.
5838          *
5839          * Mappings not reachable via the current or previous cached roots will
5840          * be synced when switching to that new cr3, so nothing needs to be
5841          * done here for them.
5842          */
5843         kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva, KVM_MMU_ROOTS_ALL);
5844         ++vcpu->stat.invlpg;
5845 }
5846 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
5847
5848
5849 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
5850 {
5851         struct kvm_mmu *mmu = vcpu->arch.mmu;
5852         unsigned long roots = 0;
5853         uint i;
5854
5855         if (pcid == kvm_get_active_pcid(vcpu))
5856                 roots |= KVM_MMU_ROOT_CURRENT;
5857
5858         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5859                 if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
5860                     pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd))
5861                         roots |= KVM_MMU_ROOT_PREVIOUS(i);
5862         }
5863
5864         if (roots)
5865                 kvm_mmu_invalidate_addr(vcpu, mmu, gva, roots);
5866         ++vcpu->stat.invlpg;
5867
5868         /*
5869          * Mappings not reachable via the current cr3 or the prev_roots will be
5870          * synced when switching to that cr3, so nothing needs to be done here
5871          * for them.
5872          */
5873 }
5874
5875 void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
5876                        int tdp_max_root_level, int tdp_huge_page_level)
5877 {
5878         tdp_enabled = enable_tdp;
5879         tdp_root_level = tdp_forced_root_level;
5880         max_tdp_level = tdp_max_root_level;
5881
5882 #ifdef CONFIG_X86_64
5883         tdp_mmu_enabled = tdp_mmu_allowed && tdp_enabled;
5884 #endif
5885         /*
5886          * max_huge_page_level reflects KVM's MMU capabilities irrespective
5887          * of kernel support, e.g. KVM may be capable of using 1GB pages when
5888          * the kernel is not.  But, KVM never creates a page size greater than
5889          * what is used by the kernel for any given HVA, i.e. the kernel's
5890          * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
5891          */
5892         if (tdp_enabled)
5893                 max_huge_page_level = tdp_huge_page_level;
5894         else if (boot_cpu_has(X86_FEATURE_GBPAGES))
5895                 max_huge_page_level = PG_LEVEL_1G;
5896         else
5897                 max_huge_page_level = PG_LEVEL_2M;
5898 }
5899 EXPORT_SYMBOL_GPL(kvm_configure_mmu);
5900
5901 /* The return value indicates if tlb flush on all vcpus is needed. */
5902 typedef bool (*slot_rmaps_handler) (struct kvm *kvm,
5903                                     struct kvm_rmap_head *rmap_head,
5904                                     const struct kvm_memory_slot *slot);
5905
5906 static __always_inline bool __walk_slot_rmaps(struct kvm *kvm,
5907                                               const struct kvm_memory_slot *slot,
5908                                               slot_rmaps_handler fn,
5909                                               int start_level, int end_level,
5910                                               gfn_t start_gfn, gfn_t end_gfn,
5911                                               bool flush_on_yield, bool flush)
5912 {
5913         struct slot_rmap_walk_iterator iterator;
5914
5915         lockdep_assert_held_write(&kvm->mmu_lock);
5916
5917         for_each_slot_rmap_range(slot, start_level, end_level, start_gfn,
5918                         end_gfn, &iterator) {
5919                 if (iterator.rmap)
5920                         flush |= fn(kvm, iterator.rmap, slot);
5921
5922                 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
5923                         if (flush && flush_on_yield) {
5924                                 kvm_flush_remote_tlbs_range(kvm, start_gfn,
5925                                                             iterator.gfn - start_gfn + 1);
5926                                 flush = false;
5927                         }
5928                         cond_resched_rwlock_write(&kvm->mmu_lock);
5929                 }
5930         }
5931
5932         return flush;
5933 }
5934
5935 static __always_inline bool walk_slot_rmaps(struct kvm *kvm,
5936                                             const struct kvm_memory_slot *slot,
5937                                             slot_rmaps_handler fn,
5938                                             int start_level, int end_level,
5939                                             bool flush_on_yield)
5940 {
5941         return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level,
5942                                  slot->base_gfn, slot->base_gfn + slot->npages - 1,
5943                                  flush_on_yield, false);
5944 }
5945
5946 static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm,
5947                                                const struct kvm_memory_slot *slot,
5948                                                slot_rmaps_handler fn,
5949                                                bool flush_on_yield)
5950 {
5951         return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield);
5952 }
5953
5954 static void free_mmu_pages(struct kvm_mmu *mmu)
5955 {
5956         if (!tdp_enabled && mmu->pae_root)
5957                 set_memory_encrypted((unsigned long)mmu->pae_root, 1);
5958         free_page((unsigned long)mmu->pae_root);
5959         free_page((unsigned long)mmu->pml4_root);
5960         free_page((unsigned long)mmu->pml5_root);
5961 }
5962
5963 static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
5964 {
5965         struct page *page;
5966         int i;
5967
5968         mmu->root.hpa = INVALID_PAGE;
5969         mmu->root.pgd = 0;
5970         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5971                 mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5972
5973         /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */
5974         if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu)
5975                 return 0;
5976
5977         /*
5978          * When using PAE paging, the four PDPTEs are treated as 'root' pages,
5979          * while the PDP table is a per-vCPU construct that's allocated at MMU
5980          * creation.  When emulating 32-bit mode, cr3 is only 32 bits even on
5981          * x86_64.  Therefore we need to allocate the PDP table in the first
5982          * 4GB of memory, which happens to fit the DMA32 zone.  TDP paging
5983          * generally doesn't use PAE paging and can skip allocating the PDP
5984          * table.  The main exception, handled here, is SVM's 32-bit NPT.  The
5985          * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
5986          * KVM; that horror is handled on-demand by mmu_alloc_special_roots().
5987          */
5988         if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
5989                 return 0;
5990
5991         page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
5992         if (!page)
5993                 return -ENOMEM;
5994
5995         mmu->pae_root = page_address(page);
5996
5997         /*
5998          * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
5999          * get the CPU to treat the PDPTEs as encrypted.  Decrypt the page so
6000          * that KVM's writes and the CPU's reads get along.  Note, this is
6001          * only necessary when using shadow paging, as 64-bit NPT can get at
6002          * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
6003          * by 32-bit kernels (when KVM itself uses 32-bit NPT).
6004          */
6005         if (!tdp_enabled)
6006                 set_memory_decrypted((unsigned long)mmu->pae_root, 1);
6007         else
6008                 WARN_ON_ONCE(shadow_me_value);
6009
6010         for (i = 0; i < 4; ++i)
6011                 mmu->pae_root[i] = INVALID_PAE_ROOT;
6012
6013         return 0;
6014 }
6015
6016 int kvm_mmu_create(struct kvm_vcpu *vcpu)
6017 {
6018         int ret;
6019
6020         vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
6021         vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO;
6022
6023         vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
6024         vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
6025
6026         vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
6027
6028         vcpu->arch.mmu = &vcpu->arch.root_mmu;
6029         vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
6030
6031         ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
6032         if (ret)
6033                 return ret;
6034
6035         ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
6036         if (ret)
6037                 goto fail_allocate_root;
6038
6039         return ret;
6040  fail_allocate_root:
6041         free_mmu_pages(&vcpu->arch.guest_mmu);
6042         return ret;
6043 }
6044
6045 #define BATCH_ZAP_PAGES 10
6046 static void kvm_zap_obsolete_pages(struct kvm *kvm)
6047 {
6048         struct kvm_mmu_page *sp, *node;
6049         int nr_zapped, batch = 0;
6050         bool unstable;
6051
6052 restart:
6053         list_for_each_entry_safe_reverse(sp, node,
6054               &kvm->arch.active_mmu_pages, link) {
6055                 /*
6056                  * No obsolete valid page exists before a newly created page
6057                  * since active_mmu_pages is a FIFO list.
6058                  */
6059                 if (!is_obsolete_sp(kvm, sp))
6060                         break;
6061
6062                 /*
6063                  * Invalid pages should never land back on the list of active
6064                  * pages.  Skip the bogus page, otherwise we'll get stuck in an
6065                  * infinite loop if the page gets put back on the list (again).
6066                  */
6067                 if (WARN_ON(sp->role.invalid))
6068                         continue;
6069
6070                 /*
6071                  * No need to flush the TLB since we're only zapping shadow
6072                  * pages with an obsolete generation number and all vCPUS have
6073                  * loaded a new root, i.e. the shadow pages being zapped cannot
6074                  * be in active use by the guest.
6075                  */
6076                 if (batch >= BATCH_ZAP_PAGES &&
6077                     cond_resched_rwlock_write(&kvm->mmu_lock)) {
6078                         batch = 0;
6079                         goto restart;
6080                 }
6081
6082                 unstable = __kvm_mmu_prepare_zap_page(kvm, sp,
6083                                 &kvm->arch.zapped_obsolete_pages, &nr_zapped);
6084                 batch += nr_zapped;
6085
6086                 if (unstable)
6087                         goto restart;
6088         }
6089
6090         /*
6091          * Kick all vCPUs (via remote TLB flush) before freeing the page tables
6092          * to ensure KVM is not in the middle of a lockless shadow page table
6093          * walk, which may reference the pages.  The remote TLB flush itself is
6094          * not required and is simply a convenient way to kick vCPUs as needed.
6095          * KVM performs a local TLB flush when allocating a new root (see
6096          * kvm_mmu_load()), and the reload in the caller ensure no vCPUs are
6097          * running with an obsolete MMU.
6098          */
6099         kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
6100 }
6101
6102 /*
6103  * Fast invalidate all shadow pages and use lock-break technique
6104  * to zap obsolete pages.
6105  *
6106  * It's required when memslot is being deleted or VM is being
6107  * destroyed, in these cases, we should ensure that KVM MMU does
6108  * not use any resource of the being-deleted slot or all slots
6109  * after calling the function.
6110  */
6111 static void kvm_mmu_zap_all_fast(struct kvm *kvm)
6112 {
6113         lockdep_assert_held(&kvm->slots_lock);
6114
6115         write_lock(&kvm->mmu_lock);
6116         trace_kvm_mmu_zap_all_fast(kvm);
6117
6118         /*
6119          * Toggle mmu_valid_gen between '0' and '1'.  Because slots_lock is
6120          * held for the entire duration of zapping obsolete pages, it's
6121          * impossible for there to be multiple invalid generations associated
6122          * with *valid* shadow pages at any given time, i.e. there is exactly
6123          * one valid generation and (at most) one invalid generation.
6124          */
6125         kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
6126
6127         /*
6128          * In order to ensure all vCPUs drop their soon-to-be invalid roots,
6129          * invalidating TDP MMU roots must be done while holding mmu_lock for
6130          * write and in the same critical section as making the reload request,
6131          * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
6132          */
6133         if (tdp_mmu_enabled)
6134                 kvm_tdp_mmu_invalidate_all_roots(kvm);
6135
6136         /*
6137          * Notify all vcpus to reload its shadow page table and flush TLB.
6138          * Then all vcpus will switch to new shadow page table with the new
6139          * mmu_valid_gen.
6140          *
6141          * Note: we need to do this under the protection of mmu_lock,
6142          * otherwise, vcpu would purge shadow page but miss tlb flush.
6143          */
6144         kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
6145
6146         kvm_zap_obsolete_pages(kvm);
6147
6148         write_unlock(&kvm->mmu_lock);
6149
6150         /*
6151          * Zap the invalidated TDP MMU roots, all SPTEs must be dropped before
6152          * returning to the caller, e.g. if the zap is in response to a memslot
6153          * deletion, mmu_notifier callbacks will be unable to reach the SPTEs
6154          * associated with the deleted memslot once the update completes, and
6155          * Deferring the zap until the final reference to the root is put would
6156          * lead to use-after-free.
6157          */
6158         if (tdp_mmu_enabled)
6159                 kvm_tdp_mmu_zap_invalidated_roots(kvm);
6160 }
6161
6162 static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
6163 {
6164         return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
6165 }
6166
/*
 * Page-track flush-slot callback: zaps everything via kvm_mmu_zap_all_fast().
 * @slot and @node are not used.
 */
static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
						    struct kvm_memory_slot *slot,
						    struct kvm_page_track_notifier_node *node)
{
	kvm_mmu_zap_all_fast(kvm);
}
6173
6174 int kvm_mmu_init_vm(struct kvm *kvm)
6175 {
6176         struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
6177         int r;
6178
6179         INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
6180         INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
6181         INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
6182         spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
6183
6184         if (tdp_mmu_enabled) {
6185                 r = kvm_mmu_init_tdp_mmu(kvm);
6186                 if (r < 0)
6187                         return r;
6188         }
6189
6190         node->track_write = kvm_mmu_pte_write;
6191         node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
6192         kvm_page_track_register_notifier(kvm, node);
6193
6194         kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
6195         kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
6196
6197         kvm->arch.split_shadow_page_cache.gfp_zero = __GFP_ZERO;
6198
6199         kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
6200         kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
6201
6202         return 0;
6203 }
6204
6205 static void mmu_free_vm_memory_caches(struct kvm *kvm)
6206 {
6207         kvm_mmu_free_memory_cache(&kvm->arch.split_desc_cache);
6208         kvm_mmu_free_memory_cache(&kvm->arch.split_page_header_cache);
6209         kvm_mmu_free_memory_cache(&kvm->arch.split_shadow_page_cache);
6210 }
6211
6212 void kvm_mmu_uninit_vm(struct kvm *kvm)
6213 {
6214         struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
6215
6216         kvm_page_track_unregister_notifier(kvm, node);
6217
6218         if (tdp_mmu_enabled)
6219                 kvm_mmu_uninit_tdp_mmu(kvm);
6220
6221         mmu_free_vm_memory_caches(kvm);
6222 }
6223
6224 static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
6225 {
6226         const struct kvm_memory_slot *memslot;
6227         struct kvm_memslots *slots;
6228         struct kvm_memslot_iter iter;
6229         bool flush = false;
6230         gfn_t start, end;
6231         int i;
6232
6233         if (!kvm_memslots_have_rmaps(kvm))
6234                 return flush;
6235
6236         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
6237                 slots = __kvm_memslots(kvm, i);
6238
6239                 kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
6240                         memslot = iter.slot;
6241                         start = max(gfn_start, memslot->base_gfn);
6242                         end = min(gfn_end, memslot->base_gfn + memslot->npages);
6243                         if (WARN_ON_ONCE(start >= end))
6244                                 continue;
6245
6246                         flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap,
6247                                                   PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
6248                                                   start, end - 1, true, flush);
6249                 }
6250         }
6251
6252         return flush;
6253 }
6254
6255 /*
6256  * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end
6257  * (not including it)
6258  */
6259 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
6260 {
6261         bool flush;
6262         int i;
6263
6264         if (WARN_ON_ONCE(gfn_end <= gfn_start))
6265                 return;
6266
6267         write_lock(&kvm->mmu_lock);
6268
6269         kvm_mmu_invalidate_begin(kvm, 0, -1ul);
6270
6271         flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
6272
6273         if (tdp_mmu_enabled) {
6274                 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
6275                         flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start,
6276                                                       gfn_end, true, flush);
6277         }
6278
6279         if (flush)
6280                 kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start);
6281
6282         kvm_mmu_invalidate_end(kvm, 0, -1ul);
6283
6284         write_unlock(&kvm->mmu_lock);
6285 }
6286
6287 static bool slot_rmap_write_protect(struct kvm *kvm,
6288                                     struct kvm_rmap_head *rmap_head,
6289                                     const struct kvm_memory_slot *slot)
6290 {
6291         return rmap_write_protect(rmap_head, false);
6292 }
6293
6294 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
6295                                       const struct kvm_memory_slot *memslot,
6296                                       int start_level)
6297 {
6298         if (kvm_memslots_have_rmaps(kvm)) {
6299                 write_lock(&kvm->mmu_lock);
6300                 walk_slot_rmaps(kvm, memslot, slot_rmap_write_protect,
6301                                 start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
6302                 write_unlock(&kvm->mmu_lock);
6303         }
6304
6305         if (tdp_mmu_enabled) {
6306                 read_lock(&kvm->mmu_lock);
6307                 kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
6308                 read_unlock(&kvm->mmu_lock);
6309         }
6310 }
6311
6312 static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min)
6313 {
6314         return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
6315 }
6316
6317 static bool need_topup_split_caches_or_resched(struct kvm *kvm)
6318 {
6319         if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
6320                 return true;
6321
6322         /*
6323          * In the worst case, SPLIT_DESC_CACHE_MIN_NR_OBJECTS descriptors are needed
6324          * to split a single huge page. Calculating how many are actually needed
6325          * is possible but not worth the complexity.
6326          */
6327         return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) ||
6328                need_topup(&kvm->arch.split_page_header_cache, 1) ||
6329                need_topup(&kvm->arch.split_shadow_page_cache, 1);
6330 }
6331
6332 static int topup_split_caches(struct kvm *kvm)
6333 {
6334         /*
6335          * Allocating rmap list entries when splitting huge pages for nested
6336          * MMUs is uncommon as KVM needs to use a list if and only if there is
6337          * more than one rmap entry for a gfn, i.e. requires an L1 gfn to be
6338          * aliased by multiple L2 gfns and/or from multiple nested roots with
6339          * different roles.  Aliasing gfns when using TDP is atypical for VMMs;
6340          * a few gfns are often aliased during boot, e.g. when remapping BIOS,
6341          * but aliasing rarely occurs post-boot or for many gfns.  If there is
6342          * only one rmap entry, rmap->val points directly at that one entry and
6343          * doesn't need to allocate a list.  Buffer the cache by the default
6344          * capacity so that KVM doesn't have to drop mmu_lock to topup if KVM
6345          * encounters an aliased gfn or two.
6346          */
6347         const int capacity = SPLIT_DESC_CACHE_MIN_NR_OBJECTS +
6348                              KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE;
6349         int r;
6350
6351         lockdep_assert_held(&kvm->slots_lock);
6352
6353         r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache, capacity,
6354                                          SPLIT_DESC_CACHE_MIN_NR_OBJECTS);
6355         if (r)
6356                 return r;
6357
6358         r = kvm_mmu_topup_memory_cache(&kvm->arch.split_page_header_cache, 1);
6359         if (r)
6360                 return r;
6361
6362         return kvm_mmu_topup_memory_cache(&kvm->arch.split_shadow_page_cache, 1);
6363 }
6364
6365 static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *huge_sptep)
6366 {
6367         struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
6368         struct shadow_page_caches caches = {};
6369         union kvm_mmu_page_role role;
6370         unsigned int access;
6371         gfn_t gfn;
6372
6373         gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
6374         access = kvm_mmu_page_get_access(huge_sp, spte_index(huge_sptep));
6375
6376         /*
6377          * Note, huge page splitting always uses direct shadow pages, regardless
6378          * of whether the huge page itself is mapped by a direct or indirect
6379          * shadow page, since the huge page region itself is being directly
6380          * mapped with smaller pages.
6381          */
6382         role = kvm_mmu_child_role(huge_sptep, /*direct=*/true, access);
6383
6384         /* Direct SPs do not require a shadowed_info_cache. */
6385         caches.page_header_cache = &kvm->arch.split_page_header_cache;
6386         caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache;
6387
6388         /* Safe to pass NULL for vCPU since requesting a direct SP. */
6389         return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role);
6390 }
6391
6392 static void shadow_mmu_split_huge_page(struct kvm *kvm,
6393                                        const struct kvm_memory_slot *slot,
6394                                        u64 *huge_sptep)
6395
6396 {
6397         struct kvm_mmu_memory_cache *cache = &kvm->arch.split_desc_cache;
6398         u64 huge_spte = READ_ONCE(*huge_sptep);
6399         struct kvm_mmu_page *sp;
6400         bool flush = false;
6401         u64 *sptep, spte;
6402         gfn_t gfn;
6403         int index;
6404
6405         sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep);
6406
6407         for (index = 0; index < SPTE_ENT_PER_PAGE; index++) {
6408                 sptep = &sp->spt[index];
6409                 gfn = kvm_mmu_page_get_gfn(sp, index);
6410
6411                 /*
6412                  * The SP may already have populated SPTEs, e.g. if this huge
6413                  * page is aliased by multiple sptes with the same access
6414                  * permissions. These entries are guaranteed to map the same
6415                  * gfn-to-pfn translation since the SP is direct, so no need to
6416                  * modify them.
6417                  *
6418                  * However, if a given SPTE points to a lower level page table,
6419                  * that lower level page table may only be partially populated.
6420                  * Installing such SPTEs would effectively unmap a potion of the
6421                  * huge page. Unmapping guest memory always requires a TLB flush
6422                  * since a subsequent operation on the unmapped regions would
6423                  * fail to detect the need to flush.
6424                  */
6425                 if (is_shadow_present_pte(*sptep)) {
6426                         flush |= !is_last_spte(*sptep, sp->role.level);
6427                         continue;
6428                 }
6429
6430                 spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index);
6431                 mmu_spte_set(sptep, spte);
6432                 __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access);
6433         }
6434
6435         __link_shadow_page(kvm, cache, huge_sptep, sp, flush);
6436 }
6437
6438 static int shadow_mmu_try_split_huge_page(struct kvm *kvm,
6439                                           const struct kvm_memory_slot *slot,
6440                                           u64 *huge_sptep)
6441 {
6442         struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
6443         int level, r = 0;
6444         gfn_t gfn;
6445         u64 spte;
6446
6447         /* Grab information for the tracepoint before dropping the MMU lock. */
6448         gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
6449         level = huge_sp->role.level;
6450         spte = *huge_sptep;
6451
6452         if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) {
6453                 r = -ENOSPC;
6454                 goto out;
6455         }
6456
6457         if (need_topup_split_caches_or_resched(kvm)) {
6458                 write_unlock(&kvm->mmu_lock);
6459                 cond_resched();
6460                 /*
6461                  * If the topup succeeds, return -EAGAIN to indicate that the
6462                  * rmap iterator should be restarted because the MMU lock was
6463                  * dropped.
6464                  */
6465                 r = topup_split_caches(kvm) ?: -EAGAIN;
6466                 write_lock(&kvm->mmu_lock);
6467                 goto out;
6468         }
6469
6470         shadow_mmu_split_huge_page(kvm, slot, huge_sptep);
6471
6472 out:
6473         trace_kvm_mmu_split_huge_page(gfn, spte, level, r);
6474         return r;
6475 }
6476
6477 static bool shadow_mmu_try_split_huge_pages(struct kvm *kvm,
6478                                             struct kvm_rmap_head *rmap_head,
6479                                             const struct kvm_memory_slot *slot)
6480 {
6481         struct rmap_iterator iter;
6482         struct kvm_mmu_page *sp;
6483         u64 *huge_sptep;
6484         int r;
6485
6486 restart:
6487         for_each_rmap_spte(rmap_head, &iter, huge_sptep) {
6488                 sp = sptep_to_sp(huge_sptep);
6489
6490                 /* TDP MMU is enabled, so rmap only contains nested MMU SPs. */
6491                 if (WARN_ON_ONCE(!sp->role.guest_mode))
6492                         continue;
6493
6494                 /* The rmaps should never contain non-leaf SPTEs. */
6495                 if (WARN_ON_ONCE(!is_large_pte(*huge_sptep)))
6496                         continue;
6497
6498                 /* SPs with level >PG_LEVEL_4K should never by unsync. */
6499                 if (WARN_ON_ONCE(sp->unsync))
6500                         continue;
6501
6502                 /* Don't bother splitting huge pages on invalid SPs. */
6503                 if (sp->role.invalid)
6504                         continue;
6505
6506                 r = shadow_mmu_try_split_huge_page(kvm, slot, huge_sptep);
6507
6508                 /*
6509                  * The split succeeded or needs to be retried because the MMU
6510                  * lock was dropped. Either way, restart the iterator to get it
6511                  * back into a consistent state.
6512                  */
6513                 if (!r || r == -EAGAIN)
6514                         goto restart;
6515
6516                 /* The split failed and shouldn't be retried (e.g. -ENOMEM). */
6517                 break;
6518         }
6519
6520         return false;
6521 }
6522
6523 static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm,
6524                                                 const struct kvm_memory_slot *slot,
6525                                                 gfn_t start, gfn_t end,
6526                                                 int target_level)
6527 {
6528         int level;
6529
6530         /*
6531          * Split huge pages starting with KVM_MAX_HUGEPAGE_LEVEL and working
6532          * down to the target level. This ensures pages are recursively split
6533          * all the way to the target level. There's no need to split pages
6534          * already at the target level.
6535          */
6536         for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--)
6537                 __walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages,
6538                                   level, level, start, end - 1, true, false);
6539 }
6540
6541 /* Must be called with the mmu_lock held in write-mode. */
6542 void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
6543                                    const struct kvm_memory_slot *memslot,
6544                                    u64 start, u64 end,
6545                                    int target_level)
6546 {
6547         if (!tdp_mmu_enabled)
6548                 return;
6549
6550         if (kvm_memslots_have_rmaps(kvm))
6551                 kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
6552
6553         kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false);
6554
6555         /*
6556          * A TLB flush is unnecessary at this point for the same resons as in
6557          * kvm_mmu_slot_try_split_huge_pages().
6558          */
6559 }
6560
6561 void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
6562                                         const struct kvm_memory_slot *memslot,
6563                                         int target_level)
6564 {
6565         u64 start = memslot->base_gfn;
6566         u64 end = start + memslot->npages;
6567
6568         if (!tdp_mmu_enabled)
6569                 return;
6570
6571         if (kvm_memslots_have_rmaps(kvm)) {
6572                 write_lock(&kvm->mmu_lock);
6573                 kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
6574                 write_unlock(&kvm->mmu_lock);
6575         }
6576
6577         read_lock(&kvm->mmu_lock);
6578         kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
6579         read_unlock(&kvm->mmu_lock);
6580
6581         /*
6582          * No TLB flush is necessary here. KVM will flush TLBs after
6583          * write-protecting and/or clearing dirty on the newly split SPTEs to
6584          * ensure that guest writes are reflected in the dirty log before the
6585          * ioctl to enable dirty logging on this memslot completes. Since the
6586          * split SPTEs retain the write and dirty bits of the huge SPTE, it is
6587          * safe for KVM to decide if a TLB flush is necessary based on the split
6588          * SPTEs.
6589          */
6590 }
6591
6592 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
6593                                          struct kvm_rmap_head *rmap_head,
6594                                          const struct kvm_memory_slot *slot)
6595 {
6596         u64 *sptep;
6597         struct rmap_iterator iter;
6598         int need_tlb_flush = 0;
6599         struct kvm_mmu_page *sp;
6600
6601 restart:
6602         for_each_rmap_spte(rmap_head, &iter, sptep) {
6603                 sp = sptep_to_sp(sptep);
6604
6605                 /*
6606                  * We cannot do huge page mapping for indirect shadow pages,
6607                  * which are found on the last rmap (level = 1) when not using
6608                  * tdp; such shadow pages are synced with the page table in
6609                  * the guest, and the guest page table is using 4K page size
6610                  * mapping if the indirect sp has level = 1.
6611                  */
6612                 if (sp->role.direct &&
6613                     sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
6614                                                                PG_LEVEL_NUM)) {
6615                         kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
6616
6617                         if (kvm_available_flush_tlb_with_range())
6618                                 kvm_flush_remote_tlbs_sptep(kvm, sptep);
6619                         else
6620                                 need_tlb_flush = 1;
6621
6622                         goto restart;
6623                 }
6624         }
6625
6626         return need_tlb_flush;
6627 }
6628
6629 static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
6630                                            const struct kvm_memory_slot *slot)
6631 {
6632         /*
6633          * Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap
6634          * pages that are already mapped at the maximum hugepage level.
6635          */
6636         if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte,
6637                             PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true))
6638                 kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
6639 }
6640
6641 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
6642                                    const struct kvm_memory_slot *slot)
6643 {
6644         if (kvm_memslots_have_rmaps(kvm)) {
6645                 write_lock(&kvm->mmu_lock);
6646                 kvm_rmap_zap_collapsible_sptes(kvm, slot);
6647                 write_unlock(&kvm->mmu_lock);
6648         }
6649
6650         if (tdp_mmu_enabled) {
6651                 read_lock(&kvm->mmu_lock);
6652                 kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
6653                 read_unlock(&kvm->mmu_lock);
6654         }
6655 }
6656
6657 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
6658                                         const struct kvm_memory_slot *memslot)
6659 {
6660         /*
6661          * All current use cases for flushing the TLBs for a specific memslot
6662          * related to dirty logging, and many do the TLB flush out of mmu_lock.
6663          * The interaction between the various operations on memslot must be
6664          * serialized by slots_locks to ensure the TLB flush from one operation
6665          * is observed by any other operation on the same memslot.
6666          */
6667         lockdep_assert_held(&kvm->slots_lock);
6668         kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
6669 }
6670
6671 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
6672                                    const struct kvm_memory_slot *memslot)
6673 {
6674         if (kvm_memslots_have_rmaps(kvm)) {
6675                 write_lock(&kvm->mmu_lock);
6676                 /*
6677                  * Clear dirty bits only on 4k SPTEs since the legacy MMU only
6678                  * support dirty logging at a 4k granularity.
6679                  */
6680                 walk_slot_rmaps_4k(kvm, memslot, __rmap_clear_dirty, false);
6681                 write_unlock(&kvm->mmu_lock);
6682         }
6683
6684         if (tdp_mmu_enabled) {
6685                 read_lock(&kvm->mmu_lock);
6686                 kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
6687                 read_unlock(&kvm->mmu_lock);
6688         }
6689
6690         /*
6691          * The caller will flush the TLBs after this function returns.
6692          *
6693          * It's also safe to flush TLBs out of mmu lock here as currently this
6694          * function is only used for dirty logging, in which case flushing TLB
6695          * out of mmu lock also guarantees no dirty pages will be lost in
6696          * dirty_bitmap.
6697          */
6698 }
6699
6700 void kvm_mmu_zap_all(struct kvm *kvm)
6701 {
6702         struct kvm_mmu_page *sp, *node;
6703         LIST_HEAD(invalid_list);
6704         int ign;
6705
6706         write_lock(&kvm->mmu_lock);
6707 restart:
6708         list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
6709                 if (WARN_ON(sp->role.invalid))
6710                         continue;
6711                 if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
6712                         goto restart;
6713                 if (cond_resched_rwlock_write(&kvm->mmu_lock))
6714                         goto restart;
6715         }
6716
6717         kvm_mmu_commit_zap_page(kvm, &invalid_list);
6718
6719         if (tdp_mmu_enabled)
6720                 kvm_tdp_mmu_zap_all(kvm);
6721
6722         write_unlock(&kvm->mmu_lock);
6723 }
6724
6725 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
6726 {
6727         WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
6728
6729         gen &= MMIO_SPTE_GEN_MASK;
6730
6731         /*
6732          * Generation numbers are incremented in multiples of the number of
6733          * address spaces in order to provide unique generations across all
6734          * address spaces.  Strip what is effectively the address space
6735          * modifier prior to checking for a wrap of the MMIO generation so
6736          * that a wrap in any address space is detected.
6737          */
6738         gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
6739
6740         /*
6741          * The very rare case: if the MMIO generation number has wrapped,
6742          * zap all shadow pages.
6743          */
6744         if (unlikely(gen == 0)) {
6745                 kvm_debug_ratelimited("zapping shadow pages for mmio generation wraparound\n");
6746                 kvm_mmu_zap_all_fast(kvm);
6747         }
6748 }
6749
6750 static unsigned long mmu_shrink_scan(struct shrinker *shrink,
6751                                      struct shrink_control *sc)
6752 {
6753         struct kvm *kvm;
6754         int nr_to_scan = sc->nr_to_scan;
6755         unsigned long freed = 0;
6756
6757         mutex_lock(&kvm_lock);
6758
6759         list_for_each_entry(kvm, &vm_list, vm_list) {
6760                 int idx;
6761                 LIST_HEAD(invalid_list);
6762
6763                 /*
6764                  * Never scan more than sc->nr_to_scan VM instances.
6765                  * Will not hit this condition practically since we do not try
6766                  * to shrink more than one VM and it is very unlikely to see
6767                  * !n_used_mmu_pages so many times.
6768                  */
6769                 if (!nr_to_scan--)
6770                         break;
6771                 /*
6772                  * n_used_mmu_pages is accessed without holding kvm->mmu_lock
6773                  * here. We may skip a VM instance errorneosly, but we do not
6774                  * want to shrink a VM that only started to populate its MMU
6775                  * anyway.
6776                  */
6777                 if (!kvm->arch.n_used_mmu_pages &&
6778                     !kvm_has_zapped_obsolete_pages(kvm))
6779                         continue;
6780
6781                 idx = srcu_read_lock(&kvm->srcu);
6782                 write_lock(&kvm->mmu_lock);
6783
6784                 if (kvm_has_zapped_obsolete_pages(kvm)) {
6785                         kvm_mmu_commit_zap_page(kvm,
6786                               &kvm->arch.zapped_obsolete_pages);
6787                         goto unlock;
6788                 }
6789
6790                 freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
6791
6792 unlock:
6793                 write_unlock(&kvm->mmu_lock);
6794                 srcu_read_unlock(&kvm->srcu, idx);
6795
6796                 /*
6797                  * unfair on small ones
6798                  * per-vm shrinkers cry out
6799                  * sadness comes quickly
6800                  */
6801                 list_move_tail(&kvm->vm_list, &vm_list);
6802                 break;
6803         }
6804
6805         mutex_unlock(&kvm_lock);
6806         return freed;
6807 }
6808
6809 static unsigned long mmu_shrink_count(struct shrinker *shrink,
6810                                       struct shrink_control *sc)
6811 {
6812         return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
6813 }
6814
6815 static struct shrinker mmu_shrinker = {
6816         .count_objects = mmu_shrink_count,
6817         .scan_objects = mmu_shrink_scan,
6818         .seeks = DEFAULT_SEEKS * 10,
6819 };
6820
6821 static void mmu_destroy_caches(void)
6822 {
6823         kmem_cache_destroy(pte_list_desc_cache);
6824         kmem_cache_destroy(mmu_page_header_cache);
6825 }
6826
6827 static bool get_nx_auto_mode(void)
6828 {
6829         /* Return true when CPU has the bug, and mitigations are ON */
6830         return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
6831 }
6832
6833 static void __set_nx_huge_pages(bool val)
6834 {
6835         nx_huge_pages = itlb_multihit_kvm_mitigation = val;
6836 }
6837
6838 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
6839 {
6840         bool old_val = nx_huge_pages;
6841         bool new_val;
6842
6843         /* In "auto" mode deploy workaround only if CPU has the bug. */
6844         if (sysfs_streq(val, "off"))
6845                 new_val = 0;
6846         else if (sysfs_streq(val, "force"))
6847                 new_val = 1;
6848         else if (sysfs_streq(val, "auto"))
6849                 new_val = get_nx_auto_mode();
6850         else if (kstrtobool(val, &new_val) < 0)
6851                 return -EINVAL;
6852
6853         __set_nx_huge_pages(new_val);
6854
6855         if (new_val != old_val) {
6856                 struct kvm *kvm;
6857
6858                 mutex_lock(&kvm_lock);
6859
6860                 list_for_each_entry(kvm, &vm_list, vm_list) {
6861                         mutex_lock(&kvm->slots_lock);
6862                         kvm_mmu_zap_all_fast(kvm);
6863                         mutex_unlock(&kvm->slots_lock);
6864
6865                         wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
6866                 }
6867                 mutex_unlock(&kvm_lock);
6868         }
6869
6870         return 0;
6871 }
6872
6873 /*
6874  * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as
6875  * its default value of -1 is technically undefined behavior for a boolean.
6876  * Forward the module init call to SPTE code so that it too can handle module
6877  * params that need to be resolved/snapshot.
6878  */
6879 void __init kvm_mmu_x86_module_init(void)
6880 {
6881         if (nx_huge_pages == -1)
6882                 __set_nx_huge_pages(get_nx_auto_mode());
6883
6884         /*
6885          * Snapshot userspace's desire to enable the TDP MMU. Whether or not the
6886          * TDP MMU is actually enabled is determined in kvm_configure_mmu()
6887          * when the vendor module is loaded.
6888          */
6889         tdp_mmu_allowed = tdp_mmu_enabled;
6890
6891         kvm_mmu_spte_module_init();
6892 }
6893
6894 /*
6895  * The bulk of the MMU initialization is deferred until the vendor module is
6896  * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need
6897  * to be reset when a potentially different vendor module is loaded.
6898  */
6899 int kvm_mmu_vendor_module_init(void)
6900 {
6901         int ret = -ENOMEM;
6902
6903         /*
6904          * MMU roles use union aliasing which is, generally speaking, an
6905          * undefined behavior. However, we supposedly know how compilers behave
6906          * and the current status quo is unlikely to change. Guardians below are
6907          * supposed to let us know if the assumption becomes false.
6908          */
6909         BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
6910         BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
6911         BUILD_BUG_ON(sizeof(union kvm_cpu_role) != sizeof(u64));
6912
6913         kvm_mmu_reset_all_pte_masks();
6914
6915         pte_list_desc_cache = kmem_cache_create("pte_list_desc",
6916                                             sizeof(struct pte_list_desc),
6917                                             0, SLAB_ACCOUNT, NULL);
6918         if (!pte_list_desc_cache)
6919                 goto out;
6920
6921         mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
6922                                                   sizeof(struct kvm_mmu_page),
6923                                                   0, SLAB_ACCOUNT, NULL);
6924         if (!mmu_page_header_cache)
6925                 goto out;
6926
6927         if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
6928                 goto out;
6929
6930         ret = register_shrinker(&mmu_shrinker, "x86-mmu");
6931         if (ret)
6932                 goto out_shrinker;
6933
6934         return 0;
6935
6936 out_shrinker:
6937         percpu_counter_destroy(&kvm_total_used_mmu_pages);
6938 out:
6939         mmu_destroy_caches();
6940         return ret;
6941 }
6942
6943 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
6944 {
6945         kvm_mmu_unload(vcpu);
6946         free_mmu_pages(&vcpu->arch.root_mmu);
6947         free_mmu_pages(&vcpu->arch.guest_mmu);
6948         mmu_free_memory_caches(vcpu);
6949 }
6950
6951 void kvm_mmu_vendor_module_exit(void)
6952 {
6953         mmu_destroy_caches();
6954         percpu_counter_destroy(&kvm_total_used_mmu_pages);
6955         unregister_shrinker(&mmu_shrinker);
6956 }
6957
6958 /*
6959  * Calculate the effective recovery period, accounting for '0' meaning "let KVM
6960  * select a halving time of 1 hour".  Returns true if recovery is enabled.
6961  */
6962 static bool calc_nx_huge_pages_recovery_period(uint *period)
6963 {
6964         /*
6965          * Use READ_ONCE to get the params, this may be called outside of the
6966          * param setters, e.g. by the kthread to compute its next timeout.
6967          */
6968         bool enabled = READ_ONCE(nx_huge_pages);
6969         uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
6970
6971         if (!enabled || !ratio)
6972                 return false;
6973
6974         *period = READ_ONCE(nx_huge_pages_recovery_period_ms);
6975         if (!*period) {
6976                 /* Make sure the period is not less than one second.  */
6977                 ratio = min(ratio, 3600u);
6978                 *period = 60 * 60 * 1000 / ratio;
6979         }
6980         return true;
6981 }
6982
6983 static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
6984 {
6985         bool was_recovery_enabled, is_recovery_enabled;
6986         uint old_period, new_period;
6987         int err;
6988
6989         was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);
6990
6991         err = param_set_uint(val, kp);
6992         if (err)
6993                 return err;
6994
6995         is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period);
6996
6997         if (is_recovery_enabled &&
6998             (!was_recovery_enabled || old_period > new_period)) {
6999                 struct kvm *kvm;
7000
7001                 mutex_lock(&kvm_lock);
7002
7003                 list_for_each_entry(kvm, &vm_list, vm_list)
7004                         wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
7005
7006                 mutex_unlock(&kvm_lock);
7007         }
7008
7009         return err;
7010 }
7011
7012 static void kvm_recover_nx_huge_pages(struct kvm *kvm)
7013 {
7014         unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
7015         struct kvm_memory_slot *slot;
7016         int rcu_idx;
7017         struct kvm_mmu_page *sp;
7018         unsigned int ratio;
7019         LIST_HEAD(invalid_list);
7020         bool flush = false;
7021         ulong to_zap;
7022
7023         rcu_idx = srcu_read_lock(&kvm->srcu);
7024         write_lock(&kvm->mmu_lock);
7025
7026         /*
7027          * Zapping TDP MMU shadow pages, including the remote TLB flush, must
7028          * be done under RCU protection, because the pages are freed via RCU
7029          * callback.
7030          */
7031         rcu_read_lock();
7032
7033         ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
7034         to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
7035         for ( ; to_zap; --to_zap) {
7036                 if (list_empty(&kvm->arch.possible_nx_huge_pages))
7037                         break;
7038
7039                 /*
7040                  * We use a separate list instead of just using active_mmu_pages
7041                  * because the number of shadow pages that be replaced with an
7042                  * NX huge page is expected to be relatively small compared to
7043                  * the total number of shadow pages.  And because the TDP MMU
7044                  * doesn't use active_mmu_pages.
7045                  */
7046                 sp = list_first_entry(&kvm->arch.possible_nx_huge_pages,
7047                                       struct kvm_mmu_page,
7048                                       possible_nx_huge_page_link);
7049                 WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
7050                 WARN_ON_ONCE(!sp->role.direct);
7051
7052                 /*
7053                  * Unaccount and do not attempt to recover any NX Huge Pages
7054                  * that are being dirty tracked, as they would just be faulted
7055                  * back in as 4KiB pages. The NX Huge Pages in this slot will be
7056                  * recovered, along with all the other huge pages in the slot,
7057                  * when dirty logging is disabled.
7058                  *
7059                  * Since gfn_to_memslot() is relatively expensive, it helps to
7060                  * skip it if it the test cannot possibly return true.  On the
7061                  * other hand, if any memslot has logging enabled, chances are
7062                  * good that all of them do, in which case unaccount_nx_huge_page()
7063                  * is much cheaper than zapping the page.
7064                  *
7065                  * If a memslot update is in progress, reading an incorrect value
7066                  * of kvm->nr_memslots_dirty_logging is not a problem: if it is
7067                  * becoming zero, gfn_to_memslot() will be done unnecessarily; if
7068                  * it is becoming nonzero, the page will be zapped unnecessarily.
7069                  * Either way, this only affects efficiency in racy situations,
7070                  * and not correctness.
7071                  */
7072                 slot = NULL;
7073                 if (atomic_read(&kvm->nr_memslots_dirty_logging)) {
7074                         slot = gfn_to_memslot(kvm, sp->gfn);
7075                         WARN_ON_ONCE(!slot);
7076                 }
7077
7078                 if (slot && kvm_slot_dirty_track_enabled(slot))
7079                         unaccount_nx_huge_page(kvm, sp);
7080                 else if (is_tdp_mmu_page(sp))
7081                         flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
7082                 else
7083                         kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
7084                 WARN_ON_ONCE(sp->nx_huge_page_disallowed);
7085
7086                 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
7087                         kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
7088                         rcu_read_unlock();
7089
7090                         cond_resched_rwlock_write(&kvm->mmu_lock);
7091                         flush = false;
7092
7093                         rcu_read_lock();
7094                 }
7095         }
7096         kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
7097
7098         rcu_read_unlock();
7099
7100         write_unlock(&kvm->mmu_lock);
7101         srcu_read_unlock(&kvm->srcu, rcu_idx);
7102 }
7103
7104 static long get_nx_huge_page_recovery_timeout(u64 start_time)
7105 {
7106         bool enabled;
7107         uint period;
7108
7109         enabled = calc_nx_huge_pages_recovery_period(&period);
7110
7111         return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
7112                        : MAX_SCHEDULE_TIMEOUT;
7113 }
7114
7115 static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data)
7116 {
7117         u64 start_time;
7118         long remaining_time;
7119
7120         while (true) {
7121                 start_time = get_jiffies_64();
7122                 remaining_time = get_nx_huge_page_recovery_timeout(start_time);
7123
7124                 set_current_state(TASK_INTERRUPTIBLE);
7125                 while (!kthread_should_stop() && remaining_time > 0) {
7126                         schedule_timeout(remaining_time);
7127                         remaining_time = get_nx_huge_page_recovery_timeout(start_time);
7128                         set_current_state(TASK_INTERRUPTIBLE);
7129                 }
7130
7131                 set_current_state(TASK_RUNNING);
7132
7133                 if (kthread_should_stop())
7134                         return 0;
7135
7136                 kvm_recover_nx_huge_pages(kvm);
7137         }
7138 }
7139
7140 int kvm_mmu_post_init_vm(struct kvm *kvm)
7141 {
7142         int err;
7143
7144         err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0,
7145                                           "kvm-nx-lpage-recovery",
7146                                           &kvm->arch.nx_huge_page_recovery_thread);
7147         if (!err)
7148                 kthread_unpark(kvm->arch.nx_huge_page_recovery_thread);
7149
7150         return err;
7151 }
7152
7153 void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
7154 {
7155         if (kvm->arch.nx_huge_page_recovery_thread)
7156                 kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
7157 }