KVM: x86/mmu: Separate TDP MMU shadow page allocation and initialization
arch/x86/kvm/mmu/tdp_mmu.c
// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

static bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);

/* Initializes the TDP MMU for the VM, if enabled. */
bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
        if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
                return false;

        /* This should not be changed for the lifetime of the VM. */
        kvm->arch.tdp_mmu_enabled = true;

        INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
        spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
        INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);

        return true;
}

static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
                                                             bool shared)
{
        if (shared)
                lockdep_assert_held_read(&kvm->mmu_lock);
        else
                lockdep_assert_held_write(&kvm->mmu_lock);
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
        if (!kvm->arch.tdp_mmu_enabled)
                return;

        WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
        WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

        /*
         * Ensure that all the outstanding RCU callbacks to free shadow pages
         * can run before the VM is torn down.
         */
        rcu_barrier();
}

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                          gfn_t start, gfn_t end, bool can_yield, bool flush,
                          bool shared);

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
        free_page((unsigned long)sp->spt);
        kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
        struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
                                               rcu_head);

        tdp_mmu_free_sp(sp);
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
                          bool shared)
{
        kvm_lockdep_assert_mmu_lock_held(kvm, shared);

        if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
                return;

        WARN_ON(!root->tdp_mmu_page);

        spin_lock(&kvm->arch.tdp_mmu_pages_lock);
        list_del_rcu(&root->link);
        spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

        zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);

        call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/*
 * Returns the next root after @prev_root (or the first root if @prev_root is
 * NULL).  A reference to the returned root is acquired, and the reference to
 * @prev_root is released (the caller obviously must hold a reference to
 * @prev_root if it's non-NULL).
 *
 * If @only_valid is true, invalid roots are skipped.
 *
 * Returns NULL if the end of tdp_mmu_roots was reached.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
                                              struct kvm_mmu_page *prev_root,
                                              bool shared, bool only_valid)
{
        struct kvm_mmu_page *next_root;

        rcu_read_lock();

        if (prev_root)
                next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
                                                  &prev_root->link,
                                                  typeof(*prev_root), link);
        else
                next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
                                                   typeof(*next_root), link);

        while (next_root) {
                if ((!only_valid || !next_root->role.invalid) &&
                    kvm_tdp_mmu_get_root(next_root))
                        break;

                next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
                                &next_root->link, typeof(*next_root), link);
        }

        rcu_read_unlock();

        if (prev_root)
                kvm_tdp_mmu_put_root(kvm, prev_root, shared);

        return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode. In the unlikely event that this thread must free a root, the lock
 * will be temporarily dropped and reacquired in write mode.
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
        for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);       \
             _root;                                                             \
             _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))      \
                if (kvm_mmu_page_as_id(_root) != _as_id) {                      \
                } else

#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)    \
        __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)

#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)          \
        __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, false)

#define for_each_tdp_mmu_root(_kvm, _root, _as_id)                              \
        list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link,         \
                                lockdep_is_held_type(&kvm->mmu_lock, 0) ||      \
                                lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock)) \
                if (kvm_mmu_page_as_id(_root) != _as_id) {                      \
                } else

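/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * a typical walk over all roots of one address space using the yield-safe
 * iterator above.  The iterator takes a reference on each root it returns,
 * so breaking out of the loop early without kvm_tdp_mmu_put_root() would
 * leak that reference.  "example_visit_root" is a hypothetical callback.
 *
 *      struct kvm_mmu_page *root;
 *
 *      for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
 *              example_visit_root(kvm, root);
 */
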
static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu_page *sp;

        sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
        sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);

        return sp;
}

static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, gfn_t gfn,
                            union kvm_mmu_page_role role)
{
        set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

        sp->role = role;
        sp->gfn = gfn;
        sp->tdp_mmu_page = true;

        trace_kvm_mmu_get_page(sp, true);
}

static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
                                  struct tdp_iter *iter)
{
        struct kvm_mmu_page *parent_sp;
        union kvm_mmu_page_role role;

        parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));

        role = parent_sp->role;
        role.level--;

        tdp_mmu_init_sp(child_sp, iter->gfn, role);
}

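/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * shadow page allocation and initialization are separate steps, so a caller
 * allocates first and then initializes the page for its role, e.g. for a
 * child page table discovered via a tdp_iter:
 *
 *      sp = tdp_mmu_alloc_sp(vcpu);
 *      tdp_mmu_init_child_sp(sp, &iter);
 *
 * Roots follow the same pattern with tdp_mmu_init_sp(root, 0, role), as in
 * kvm_tdp_mmu_get_vcpu_root_hpa() below.
 */
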
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
        union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base;
        struct kvm *kvm = vcpu->kvm;
        struct kvm_mmu_page *root;

        lockdep_assert_held_write(&kvm->mmu_lock);

        /*
         * Check for an existing root before allocating a new one.  Note, the
         * role check prevents consuming an invalid root.
         */
        for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
                if (root->role.word == role.word &&
                    kvm_tdp_mmu_get_root(root))
                        goto out;
        }

        root = tdp_mmu_alloc_sp(vcpu);
        tdp_mmu_init_sp(root, 0, role);

        refcount_set(&root->tdp_mmu_root_count, 1);

        spin_lock(&kvm->arch.tdp_mmu_pages_lock);
        list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
        spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

out:
        return __pa(root->spt);
}
2f2fad08
BG
240
241static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
9a77daac
BG
242 u64 old_spte, u64 new_spte, int level,
243 bool shared);
2f2fad08 244
f8e14497
BG
245static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
246{
f8e14497
BG
247 if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
248 return;
249
250 if (is_accessed_spte(old_spte) &&
64bb2769
SC
251 (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
252 spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
f8e14497
BG
253 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
254}
255
a6a0b05d
BG
256static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
257 u64 old_spte, u64 new_spte, int level)
258{
259 bool pfn_changed;
260 struct kvm_memory_slot *slot;
261
262 if (level > PG_LEVEL_4K)
263 return;
264
265 pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
266
267 if ((!is_writable_pte(old_spte) || pfn_changed) &&
268 is_writable_pte(new_spte)) {
269 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
fb04a1ed 270 mark_page_dirty_in_slot(kvm, slot, gfn);
a6a0b05d
BG
271 }
272}
273
a9442f59 274/**
c298a30c 275 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
a9442f59
BG
276 *
277 * @kvm: kvm instance
278 * @sp: the page to be removed
9a77daac
BG
279 * @shared: This operation may not be running under the exclusive use of
280 * the MMU lock and the operation must synchronize with other
281 * threads that might be adding or removing pages.
a9442f59 282 */
c298a30c
DM
283static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
284 bool shared)
a9442f59 285{
9a77daac
BG
286 if (shared)
287 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
288 else
289 lockdep_assert_held_write(&kvm->mmu_lock);
a9442f59
BG
290
291 list_del(&sp->link);
292 if (sp->lpage_disallowed)
293 unaccount_huge_nx_page(kvm, sp);
9a77daac
BG
294
295 if (shared)
296 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
a9442f59
BG
297}
298
a066e61f 299/**
0f53dfa3 300 * handle_removed_pt() - handle a page table removed from the TDP structure
a066e61f
BG
301 *
302 * @kvm: kvm instance
303 * @pt: the page removed from the paging structure
9a77daac
BG
304 * @shared: This operation may not be running under the exclusive use
305 * of the MMU lock and the operation must synchronize with other
306 * threads that might be modifying SPTEs.
a066e61f
BG
307 *
308 * Given a page table that has been removed from the TDP paging structure,
309 * iterates through the page table to clear SPTEs and free child page tables.
70fb3e41
BG
310 *
311 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
312 * protection. Since this thread removed it from the paging structure,
313 * this thread will be responsible for ensuring the page is freed. Hence the
314 * early rcu_dereferences in the function.
a066e61f 315 */
0f53dfa3 316static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
a066e61f 317{
70fb3e41 318 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
a066e61f 319 int level = sp->role.level;
e25f0e0c 320 gfn_t base_gfn = sp->gfn;
a066e61f
BG
321 int i;
322
323 trace_kvm_mmu_prepare_zap_page(sp);
324
c298a30c 325 tdp_mmu_unlink_sp(kvm, sp, shared);
a066e61f
BG
326
327 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
574c3c55
BG
328 u64 *sptep = rcu_dereference(pt) + i;
329 gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
330 u64 old_child_spte;
9a77daac
BG
331
332 if (shared) {
e25f0e0c
BG
333 /*
334 * Set the SPTE to a nonpresent value that other
335 * threads will not overwrite. If the SPTE was
336 * already marked as removed then another thread
337 * handling a page fault could overwrite it, so
338 * set the SPTE until it is set from some other
339 * value to the removed SPTE value.
340 */
341 for (;;) {
342 old_child_spte = xchg(sptep, REMOVED_SPTE);
343 if (!is_removed_spte(old_child_spte))
344 break;
345 cpu_relax();
346 }
9a77daac 347 } else {
8df9f1af
SC
348 /*
349 * If the SPTE is not MMU-present, there is no backing
350 * page associated with the SPTE and so no side effects
351 * that need to be recorded, and exclusive ownership of
352 * mmu_lock ensures the SPTE can't be made present.
353 * Note, zapping MMIO SPTEs is also unnecessary as they
354 * are guarded by the memslots generation, not by being
355 * unreachable.
356 */
9a77daac 357 old_child_spte = READ_ONCE(*sptep);
8df9f1af
SC
358 if (!is_shadow_present_pte(old_child_spte))
359 continue;
e25f0e0c
BG
360
361 /*
362 * Marking the SPTE as a removed SPTE is not
363 * strictly necessary here as the MMU lock will
364 * stop other threads from concurrently modifying
365 * this SPTE. Using the removed SPTE value keeps
366 * the two branches consistent and simplifies
367 * the function.
368 */
369 WRITE_ONCE(*sptep, REMOVED_SPTE);
9a77daac 370 }
e25f0e0c 371 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
f1b83255 372 old_child_spte, REMOVED_SPTE, level,
e25f0e0c 373 shared);
a066e61f
BG
374 }
375
574c3c55 376 kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
f1b83255 377 KVM_PAGES_PER_HPAGE(level + 1));
a066e61f 378
7cca2d0b 379 call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
a066e61f
BG
380}
381
2f2fad08 382/**
7f6231a3 383 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
2f2fad08
BG
384 * @kvm: kvm instance
385 * @as_id: the address space of the paging structure the SPTE was a part of
386 * @gfn: the base GFN that was mapped by the SPTE
387 * @old_spte: The value of the SPTE before the change
388 * @new_spte: The value of the SPTE after the change
389 * @level: the level of the PT the SPTE is part of in the paging structure
9a77daac
BG
390 * @shared: This operation may not be running under the exclusive use of
391 * the MMU lock and the operation must synchronize with other
392 * threads that might be modifying SPTEs.
2f2fad08
BG
393 *
394 * Handle bookkeeping that might result from the modification of a SPTE.
395 * This function must be called for all TDP SPTE modifications.
396 */
397static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
9a77daac
BG
398 u64 old_spte, u64 new_spte, int level,
399 bool shared)
2f2fad08
BG
400{
401 bool was_present = is_shadow_present_pte(old_spte);
402 bool is_present = is_shadow_present_pte(new_spte);
403 bool was_leaf = was_present && is_last_spte(old_spte, level);
404 bool is_leaf = is_present && is_last_spte(new_spte, level);
405 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
2f2fad08
BG
406
407 WARN_ON(level > PT64_ROOT_MAX_LEVEL);
408 WARN_ON(level < PG_LEVEL_4K);
764388ce 409 WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
2f2fad08
BG
410
411 /*
412 * If this warning were to trigger it would indicate that there was a
413 * missing MMU notifier or a race with some notifier handler.
414 * A present, leaf SPTE should never be directly replaced with another
d9f6e12f 415 * present leaf SPTE pointing to a different PFN. A notifier handler
2f2fad08
BG
416 * should be zapping the SPTE before the main MM's page table is
417 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
418 * thread before replacement.
419 */
420 if (was_leaf && is_leaf && pfn_changed) {
421 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
422 "SPTE with another present leaf SPTE mapping a\n"
423 "different PFN!\n"
424 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
425 as_id, gfn, old_spte, new_spte, level);
426
427 /*
428 * Crash the host to prevent error propagation and guest data
d9f6e12f 429 * corruption.
2f2fad08
BG
430 */
431 BUG();
432 }
433
434 if (old_spte == new_spte)
435 return;
436
b9a98c34
BG
437 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
438
115111ef
DM
439 if (is_leaf)
440 check_spte_writable_invariants(new_spte);
441
2f2fad08
BG
442 /*
443 * The only times a SPTE should be changed from a non-present to
444 * non-present state is when an MMIO entry is installed/modified/
445 * removed. In that case, there is nothing to do here.
446 */
447 if (!was_present && !is_present) {
448 /*
08f07c80
BG
449 * If this change does not involve a MMIO SPTE or removed SPTE,
450 * it is unexpected. Log the change, though it should not
451 * impact the guest since both the former and current SPTEs
452 * are nonpresent.
2f2fad08 453 */
08f07c80
BG
454 if (WARN_ON(!is_mmio_spte(old_spte) &&
455 !is_mmio_spte(new_spte) &&
456 !is_removed_spte(new_spte)))
2f2fad08
BG
457 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
458 "should not be replaced with another,\n"
459 "different nonpresent SPTE, unless one or both\n"
08f07c80
BG
460 "are MMIO SPTEs, or the new SPTE is\n"
461 "a temporary removed SPTE.\n"
2f2fad08
BG
462 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
463 as_id, gfn, old_spte, new_spte, level);
464 return;
465 }
466
71f51d2c
MZ
467 if (is_leaf != was_leaf)
468 kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
2f2fad08
BG
469
470 if (was_leaf && is_dirty_spte(old_spte) &&
64bb2769 471 (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
2f2fad08
BG
472 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
473
474 /*
475 * Recursively handle child PTs if the change removed a subtree from
476 * the paging structure.
477 */
a066e61f 478 if (was_present && !was_leaf && (pfn_changed || !is_present))
0f53dfa3 479 handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
2f2fad08
BG
480}
481
482static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
9a77daac
BG
483 u64 old_spte, u64 new_spte, int level,
484 bool shared)
2f2fad08 485{
9a77daac
BG
486 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
487 shared);
f8e14497 488 handle_changed_spte_acc_track(old_spte, new_spte, level);
a6a0b05d
BG
489 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
490 new_spte, level);
2f2fad08 491}
faaf05b0 492
9a77daac 493/*
6ccf4438
PB
494 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
495 * and handle the associated bookkeeping. Do not mark the page dirty
24ae4cfa 496 * in KVM's dirty bitmaps.
9a77daac 497 *
3255530a
DM
498 * If setting the SPTE fails because it has changed, iter->old_spte will be
499 * refreshed to the current value of the spte.
500 *
9a77daac
BG
501 * @kvm: kvm instance
502 * @iter: a tdp_iter instance currently on the SPTE that should be set
503 * @new_spte: The value the SPTE should be set to
3e72c791
DM
504 * Return:
505 * * 0 - If the SPTE was set.
506 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
507 * no side-effects other than setting iter->old_spte to the last
508 * known value of the spte.
9a77daac 509 */
3e72c791
DM
510static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
511 struct tdp_iter *iter,
512 u64 new_spte)
9a77daac 513{
3255530a
DM
514 u64 *sptep = rcu_dereference(iter->sptep);
515 u64 old_spte;
516
3a0f64de
SC
517 WARN_ON_ONCE(iter->yielded);
518
9a77daac
BG
519 lockdep_assert_held_read(&kvm->mmu_lock);
520
08f07c80
BG
521 /*
522 * Do not change removed SPTEs. Only the thread that froze the SPTE
523 * may modify it.
524 */
7a51393a 525 if (is_removed_spte(iter->old_spte))
3e72c791 526 return -EBUSY;
08f07c80 527
6e8eb206
DM
528 /*
529 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
530 * does not hold the mmu_lock.
531 */
3255530a
DM
532 old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
533 if (old_spte != iter->old_spte) {
534 /*
535 * The page table entry was modified by a different logical
536 * CPU. Refresh iter->old_spte with the current value so the
537 * caller operates on fresh data, e.g. if it retries
538 * tdp_mmu_set_spte_atomic().
539 */
540 iter->old_spte = old_spte;
3e72c791 541 return -EBUSY;
3255530a 542 }
9a77daac 543
24ae4cfa
BG
544 __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
545 new_spte, iter->level, true);
546 handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
9a77daac 547
3e72c791 548 return 0;
9a77daac
BG
549}
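/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * callers running under the MMU lock in read mode typically treat the -EBUSY
 * return from tdp_mmu_set_spte_atomic() as "lost the race" and retry with the
 * refreshed iter->old_spte, e.g.:
 *
 *      do {
 *              new_spte = example_compute_spte(iter.old_spte);
 *      } while (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte));
 *
 * "example_compute_spte" is a hypothetical helper; real callers such as
 * wrprot_gfn_range() later in this file use a retry label instead of a loop.
 */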
550
3e72c791
DM
551static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
552 struct tdp_iter *iter)
08f07c80 553{
3e72c791
DM
554 int ret;
555
08f07c80
BG
556 /*
557 * Freeze the SPTE by setting it to a special,
558 * non-present value. This will stop other threads from
559 * immediately installing a present entry in its place
560 * before the TLBs are flushed.
561 */
3e72c791
DM
562 ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
563 if (ret)
564 return ret;
08f07c80
BG
565
566 kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
567 KVM_PAGES_PER_HPAGE(iter->level));
568
569 /*
570 * No other thread can overwrite the removed SPTE as they
571 * must either wait on the MMU lock or use
d9f6e12f 572 * tdp_mmu_set_spte_atomic which will not overwrite the
08f07c80
BG
573 * special removed SPTE value. No bookkeeping is needed
574 * here since the SPTE is going from non-present
575 * to non-present.
576 */
14f6fec2 577 WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
08f07c80 578
3e72c791 579 return 0;
08f07c80
BG
580}
581
9a77daac 582
fe43fa2f
BG
583/*
584 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
585 * @kvm: kvm instance
586 * @iter: a tdp_iter instance currently on the SPTE that should be set
587 * @new_spte: The value the SPTE should be set to
588 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
589 * of the page. Should be set unless handling an MMU
590 * notifier for access tracking. Leaving record_acc_track
591 * unset in that case prevents page accesses from being
592 * double counted.
593 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
594 * appropriate for the change being made. Should be set
595 * unless performing certain dirty logging operations.
596 * Leaving record_dirty_log unset in that case prevents page
597 * writes from being double counted.
598 */
f8e14497 599static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
a6a0b05d
BG
600 u64 new_spte, bool record_acc_track,
601 bool record_dirty_log)
faaf05b0 602{
3a0f64de
SC
603 WARN_ON_ONCE(iter->yielded);
604
531810ca 605 lockdep_assert_held_write(&kvm->mmu_lock);
3a9a4aa5 606
08f07c80
BG
607 /*
608 * No thread should be using this function to set SPTEs to the
609 * temporary removed SPTE value.
610 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
611 * should be used. If operating under the MMU lock in write mode, the
612 * use of the removed SPTE should not be necessary.
613 */
7a51393a 614 WARN_ON(is_removed_spte(iter->old_spte));
08f07c80 615
7cca2d0b 616 WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
f8e14497 617
08889894
SC
618 __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
619 new_spte, iter->level, false);
f8e14497
BG
620 if (record_acc_track)
621 handle_changed_spte_acc_track(iter->old_spte, new_spte,
622 iter->level);
a6a0b05d 623 if (record_dirty_log)
08889894 624 handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
a6a0b05d
BG
625 iter->old_spte, new_spte,
626 iter->level);
f8e14497
BG
627}
628
629static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
630 u64 new_spte)
631{
a6a0b05d 632 __tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
f8e14497 633}
faaf05b0 634
f8e14497
BG
635static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
636 struct tdp_iter *iter,
637 u64 new_spte)
638{
a6a0b05d
BG
639 __tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
640}
641
642static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
643 struct tdp_iter *iter,
644 u64 new_spte)
645{
646 __tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
faaf05b0
BG
647}
648
649#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
77aa6075 650 for_each_tdp_pte(_iter, _root, _start, _end)
faaf05b0 651
f8e14497
BG
652#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
653 tdp_root_for_each_pte(_iter, _root, _start, _end) \
654 if (!is_shadow_present_pte(_iter.old_spte) || \
655 !is_last_spte(_iter.old_spte, _iter.level)) \
656 continue; \
657 else
658
bb18842e 659#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
77aa6075 660 for_each_tdp_pte(_iter, to_shadow_page(_mmu->root_hpa), _start, _end)
bb18842e 661
e28a436c
BG
662/*
663 * Yield if the MMU lock is contended or this thread needs to return control
664 * to the scheduler.
665 *
e139a34e
BG
666 * If this function should yield and flush is set, it will perform a remote
667 * TLB flush before yielding.
668 *
3a0f64de
SC
669 * If this function yields, iter->yielded is set and the caller must skip to
670 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
671 * over the paging structures to allow the iterator to continue its traversal
672 * from the paging structure root.
e28a436c 673 *
3a0f64de 674 * Returns true if this function yielded.
e28a436c 675 */
3a0f64de
SC
676static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
677 struct tdp_iter *iter,
678 bool flush, bool shared)
a6a0b05d 679{
3a0f64de
SC
680 WARN_ON(iter->yielded);
681
ed5e484b
BG
682 /* Ensure forward progress has been made before yielding. */
683 if (iter->next_last_level_gfn == iter->yielded_gfn)
684 return false;
685
531810ca 686 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
7cca2d0b
BG
687 rcu_read_unlock();
688
e139a34e
BG
689 if (flush)
690 kvm_flush_remote_tlbs(kvm);
691
6103bc07
BG
692 if (shared)
693 cond_resched_rwlock_read(&kvm->mmu_lock);
694 else
695 cond_resched_rwlock_write(&kvm->mmu_lock);
696
7cca2d0b 697 rcu_read_lock();
ed5e484b
BG
698
699 WARN_ON(iter->gfn > iter->next_last_level_gfn);
700
3a0f64de 701 iter->yielded = true;
a6a0b05d 702 }
e28a436c 703
3a0f64de 704 return iter->yielded;
a6a0b05d
BG
705}
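/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * long-running walks call tdp_mmu_iter_cond_resched() at the top of each loop
 * iteration and simply continue when it yields, since the iterator restarts
 * its walk from the paging structure root on the next step, e.g.:
 *
 *      for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
 *              if (can_yield &&
 *                  tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
 *                      flush = false;
 *                      continue;
 *              }
 *              ...
 *      }
 *
 * This mirrors the pattern used by zap_gfn_range() below.
 */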
706
faaf05b0
BG
707/*
708 * Tears down the mappings for the range of gfns, [start, end), and frees the
709 * non-root pages mapping GFNs strictly within that range. Returns true if
710 * SPTEs have been cleared and a TLB flush is needed before releasing the
711 * MMU lock.
6103bc07 712 *
063afacd
BG
713 * If can_yield is true, will release the MMU lock and reschedule if the
714 * scheduler needs the CPU or there is contention on the MMU lock. If this
715 * function cannot yield, it will not release the MMU lock or reschedule and
716 * the caller must ensure it does not supply too large a GFN range, or the
6103bc07
BG
717 * operation can cause a soft lockup.
718 *
719 * If shared is true, this thread holds the MMU lock in read mode and must
720 * account for the possibility that other threads are modifying the paging
721 * structures concurrently. If shared is false, this thread should hold the
722 * MMU lock in write mode.
faaf05b0
BG
723 */
724static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
6103bc07
BG
725 gfn_t start, gfn_t end, bool can_yield, bool flush,
726 bool shared)
faaf05b0 727{
524a1e4e
SC
728 gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
729 bool zap_all = (start == 0 && end >= max_gfn_host);
faaf05b0 730 struct tdp_iter iter;
faaf05b0 731
0103098f
SC
732 /*
733 * No need to try to step down in the iterator when zapping all SPTEs,
734 * zapping the top-level non-leaf SPTEs will recurse on their children.
735 */
736 int min_level = zap_all ? root->role.level : PG_LEVEL_4K;
737
524a1e4e
SC
738 /*
739 * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
740 * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
741 * and so KVM will never install a SPTE for such addresses.
742 */
743 end = min(end, max_gfn_host);
744
6103bc07
BG
745 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
746
7cca2d0b
BG
747 rcu_read_lock();
748
77aa6075 749 for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
6103bc07 750retry:
1af4a960 751 if (can_yield &&
6103bc07 752 tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
a835429c 753 flush = false;
1af4a960
BG
754 continue;
755 }
756
faaf05b0
BG
757 if (!is_shadow_present_pte(iter.old_spte))
758 continue;
759
760 /*
761 * If this is a non-last-level SPTE that covers a larger range
762 * than should be zapped, continue, and zap the mappings at a
524a1e4e 763 * lower level, except when zapping all SPTEs.
faaf05b0 764 */
524a1e4e
SC
765 if (!zap_all &&
766 (iter.gfn < start ||
faaf05b0
BG
767 iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
768 !is_last_spte(iter.old_spte, iter.level))
769 continue;
770
6103bc07
BG
771 if (!shared) {
772 tdp_mmu_set_spte(kvm, &iter, 0);
773 flush = true;
3e72c791 774 } else if (tdp_mmu_zap_spte_atomic(kvm, &iter)) {
6103bc07
BG
775 goto retry;
776 }
faaf05b0 777 }
7cca2d0b
BG
778
779 rcu_read_unlock();
a835429c 780 return flush;
faaf05b0
BG
781}
782
783/*
784 * Tears down the mappings for the range of gfns, [start, end), and frees the
785 * non-root pages mapping GFNs strictly within that range. Returns true if
786 * SPTEs have been cleared and a TLB flush is needed before releasing the
787 * MMU lock.
788 */
2b9663d8 789bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
5a324c24 790 gfn_t end, bool can_yield, bool flush)
faaf05b0
BG
791{
792 struct kvm_mmu_page *root;
faaf05b0 793
5a324c24 794 for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
6103bc07 795 flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
5a324c24 796 false);
faaf05b0 797
faaf05b0
BG
798 return flush;
799}
800
801void kvm_tdp_mmu_zap_all(struct kvm *kvm)
802{
2b9663d8
SC
803 bool flush = false;
804 int i;
805
806 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
5a324c24 807 flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush);
faaf05b0 808
faaf05b0
BG
809 if (flush)
810 kvm_flush_remote_tlbs(kvm);
811}
bb18842e 812
4c6654bd
BG
813static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
814 struct kvm_mmu_page *prev_root)
815{
816 struct kvm_mmu_page *next_root;
817
818 if (prev_root)
819 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
820 &prev_root->link,
821 typeof(*prev_root), link);
822 else
823 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
824 typeof(*next_root), link);
825
826 while (next_root && !(next_root->role.invalid &&
827 refcount_read(&next_root->tdp_mmu_root_count)))
828 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
829 &next_root->link,
830 typeof(*next_root), link);
831
832 return next_root;
833}
834
835/*
836 * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each
837 * invalidated root, they will not be freed until this function drops the
838 * reference. Before dropping that reference, tear down the paging
839 * structure so that whichever thread does drop the last reference
840 * only has to do a trivial amount of work. Since the roots are invalid,
841 * no new SPTEs should be created under them.
842 */
843void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
844{
4c6654bd
BG
845 struct kvm_mmu_page *next_root;
846 struct kvm_mmu_page *root;
847 bool flush = false;
848
849 lockdep_assert_held_read(&kvm->mmu_lock);
850
851 rcu_read_lock();
852
853 root = next_invalidated_root(kvm, NULL);
854
855 while (root) {
856 next_root = next_invalidated_root(kvm, root);
857
858 rcu_read_unlock();
859
524a1e4e 860 flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true);
4c6654bd
BG
861
862 /*
863 * Put the reference acquired in
864 * kvm_tdp_mmu_invalidate_roots
865 */
866 kvm_tdp_mmu_put_root(kvm, root, true);
867
868 root = next_root;
869
870 rcu_read_lock();
871 }
872
873 rcu_read_unlock();
faaf05b0 874
faaf05b0
BG
875 if (flush)
876 kvm_flush_remote_tlbs(kvm);
877}
bb18842e 878
b7cccd39
BG
879/*
880 * Mark each TDP MMU root as invalid so that other threads
881 * will drop their references and allow the root count to
882 * go to 0.
883 *
4c6654bd
BG
884 * Also take a reference on all roots so that this thread
885 * can do the bulk of the work required to free the roots
886 * once they are invalidated. Without this reference, a
887 * vCPU thread might drop the last reference to a root and
888 * get stuck with tearing down the entire paging structure.
889 *
890 * Roots which have a zero refcount should be skipped as
891 * they're already being torn down.
892 * Already invalid roots should be referenced again so that
893 * they aren't freed before kvm_tdp_mmu_zap_all_fast is
894 * done with them.
895 *
b7cccd39
BG
896 * This has essentially the same effect for the TDP MMU
897 * as updating mmu_valid_gen does for the shadow MMU.
898 */
899void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
900{
901 struct kvm_mmu_page *root;
902
903 lockdep_assert_held_write(&kvm->mmu_lock);
904 list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
4c6654bd
BG
905 if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
906 root->role.invalid = true;
b7cccd39
BG
907}
908
bb18842e
BG
909/*
910 * Installs a last-level SPTE to handle a TDP page fault.
911 * (NPT/EPT violation/misconfiguration)
912 */
cdc47767
PB
913static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
914 struct kvm_page_fault *fault,
915 struct tdp_iter *iter)
bb18842e 916{
c435d4b7 917 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
bb18842e 918 u64 new_spte;
57a3e96d 919 int ret = RET_PF_FIXED;
ad67e480 920 bool wrprot = false;
bb18842e 921
7158bee4 922 WARN_ON(sp->role.level != fault->goal_level);
e710c5f6 923 if (unlikely(!fault->slot))
bb18842e 924 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
9a77daac 925 else
53597858 926 wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
2839180c 927 fault->pfn, iter->old_spte, fault->prefetch, true,
7158bee4 928 fault->map_writable, &new_spte);
bb18842e
BG
929
930 if (new_spte == iter->old_spte)
931 ret = RET_PF_SPURIOUS;
3e72c791 932 else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
9a77daac 933 return RET_PF_RETRY;
bb18842e
BG
934
935 /*
936 * If the page fault was caused by a write but the page is write
937 * protected, emulation is needed. If the emulation was skipped,
938 * the vCPU would have the same fault again.
939 */
ad67e480 940 if (wrprot) {
cdc47767 941 if (fault->write)
bb18842e 942 ret = RET_PF_EMULATE;
bb18842e
BG
943 }
944
945 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
9a77daac
BG
946 if (unlikely(is_mmio_spte(new_spte))) {
947 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
948 new_spte);
bb18842e 949 ret = RET_PF_EMULATE;
3849e092 950 } else {
9a77daac
BG
951 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
952 rcu_dereference(iter->sptep));
3849e092 953 }
bb18842e 954
857f8474
KH
955 /*
956 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
957 * consistent with legacy MMU behavior.
958 */
959 if (ret != RET_PF_SPURIOUS)
bb18842e
BG
960 vcpu->stat.pf_fixed++;
961
962 return ret;
963}
964
7b7e1ab6
DM
965/*
966 * tdp_mmu_link_sp_atomic - Atomically replace the given spte with an spte
967 * pointing to the provided page table.
968 *
969 * @kvm: kvm instance
970 * @iter: a tdp_iter instance currently on the SPTE that should be set
971 * @sp: The new TDP page table to install.
972 * @account_nx: True if this page table is being installed to split a
973 * non-executable huge page.
974 *
975 * Returns: 0 if the new page table was installed. Non-0 if the page table
976 * could not be installed (e.g. the atomic compare-exchange failed).
977 */
978static int tdp_mmu_link_sp_atomic(struct kvm *kvm, struct tdp_iter *iter,
979 struct kvm_mmu_page *sp, bool account_nx)
980{
981 u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
982 int ret;
983
984 ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
985 if (ret)
986 return ret;
987
988 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
989 list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
990 if (account_nx)
991 account_huge_nx_page(kvm, sp);
992 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
993
994 return 0;
995}
996
bb18842e
BG
997/*
998 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
999 * page tables and SPTEs to translate the faulting guest physical address.
1000 */
2f6305dd 1001int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
bb18842e 1002{
bb18842e
BG
1003 struct kvm_mmu *mmu = vcpu->arch.mmu;
1004 struct tdp_iter iter;
89c0fd49 1005 struct kvm_mmu_page *sp;
bb18842e 1006 int ret;
bb18842e 1007
73a3c659 1008 kvm_mmu_hugepage_adjust(vcpu, fault);
bb18842e 1009
f0066d94 1010 trace_kvm_mmu_spte_requested(fault);
7cca2d0b
BG
1011
1012 rcu_read_lock();
1013
2f6305dd 1014 tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
73a3c659 1015 if (fault->nx_huge_page_workaround_enabled)
536f0e6a 1016 disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
bb18842e 1017
73a3c659 1018 if (iter.level == fault->goal_level)
bb18842e
BG
1019 break;
1020
1021 /*
1022 * If there is an SPTE mapping a large page at a higher level
1023 * than the target, that SPTE must be cleared and replaced
1024 * with a non-leaf SPTE.
1025 */
1026 if (is_shadow_present_pte(iter.old_spte) &&
1027 is_large_pte(iter.old_spte)) {
3e72c791 1028 if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
9a77daac 1029 break;
bb18842e 1030
bb18842e
BG
1031 /*
1032 * The iter must explicitly re-read the spte here
1033 * because the new value informs the !present
1034 * path below.
1035 */
7cca2d0b 1036 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
bb18842e
BG
1037 }
1038
1039 if (!is_shadow_present_pte(iter.old_spte)) {
7b7e1ab6
DM
1040 bool account_nx = fault->huge_page_disallowed &&
1041 fault->req_level >= iter.level;
1042
ff76d506 1043 /*
c4342633 1044 * If SPTE has been frozen by another thread, just
ff76d506
KH
1045 * give up and retry, avoiding unnecessary page table
1046 * allocation and free.
1047 */
1048 if (is_removed_spte(iter.old_spte))
1049 break;
1050
a82070b6
DM
1051 sp = tdp_mmu_alloc_sp(vcpu);
1052 tdp_mmu_init_child_sp(sp, &iter);
1053
7b7e1ab6 1054 if (tdp_mmu_link_sp_atomic(vcpu->kvm, &iter, sp, account_nx)) {
9a77daac
BG
1055 tdp_mmu_free_sp(sp);
1056 break;
1057 }
bb18842e
BG
1058 }
1059 }
1060
73a3c659 1061 if (iter.level != fault->goal_level) {
7cca2d0b 1062 rcu_read_unlock();
bb18842e 1063 return RET_PF_RETRY;
7cca2d0b 1064 }
bb18842e 1065
cdc47767 1066 ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
7cca2d0b 1067 rcu_read_unlock();
bb18842e
BG
1068
1069 return ret;
1070}
063afacd 1071
3039bcc7
SC
1072bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1073 bool flush)
063afacd 1074{
83b83a02
SC
1075 return __kvm_tdp_mmu_zap_gfn_range(kvm, range->slot->as_id, range->start,
1076 range->end, range->may_block, flush);
063afacd
BG
1077}
1078
3039bcc7
SC
1079typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1080 struct kvm_gfn_range *range);
063afacd 1081
3039bcc7
SC
1082static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1083 struct kvm_gfn_range *range,
1084 tdp_handler_t handler)
063afacd 1085{
3039bcc7
SC
1086 struct kvm_mmu_page *root;
1087 struct tdp_iter iter;
1088 bool ret = false;
1089
1090 rcu_read_lock();
1091
e1eed584
SC
1092 /*
1093 * Don't support rescheduling, none of the MMU notifiers that funnel
1094 * into this helper allow blocking; it'd be dead, wasteful code.
1095 */
3039bcc7
SC
1096 for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1097 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1098 ret |= handler(kvm, &iter, range);
1099 }
1100
1101 rcu_read_unlock();
1102
1103 return ret;
063afacd 1104}
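/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * MMU notifier events are funneled through kvm_tdp_mmu_handle_gfn() by
 * passing a small per-SPTE callback matching tdp_handler_t, e.g.:
 *
 *      static bool example_handler(struct kvm *kvm, struct tdp_iter *iter,
 *                                  struct kvm_gfn_range *range)
 *      {
 *              return is_accessed_spte(iter->old_spte);
 *      }
 *
 *      bool example_hook(struct kvm *kvm, struct kvm_gfn_range *range)
 *      {
 *              return kvm_tdp_mmu_handle_gfn(kvm, range, example_handler);
 *      }
 *
 * "example_handler" and "example_hook" are hypothetical names; test_age_gfn()
 * below is the real minimal example of this pattern.
 */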
f8e14497
BG
1105
1106/*
1107 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
1108 * if any of the GFNs in the range have been accessed.
1109 */
3039bcc7
SC
1110static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1111 struct kvm_gfn_range *range)
f8e14497 1112{
f8e14497
BG
1113 u64 new_spte = 0;
1114
3039bcc7
SC
1115 /* If we have a non-accessed entry we don't need to change the pte. */
1116 if (!is_accessed_spte(iter->old_spte))
1117 return false;
7cca2d0b 1118
3039bcc7
SC
1119 new_spte = iter->old_spte;
1120
1121 if (spte_ad_enabled(new_spte)) {
1122 new_spte &= ~shadow_accessed_mask;
1123 } else {
f8e14497 1124 /*
3039bcc7
SC
1125 * Capture the dirty status of the page, so that it doesn't get
1126 * lost when the SPTE is marked for access tracking.
f8e14497 1127 */
3039bcc7
SC
1128 if (is_writable_pte(new_spte))
1129 kvm_set_pfn_dirty(spte_to_pfn(new_spte));
f8e14497 1130
3039bcc7 1131 new_spte = mark_spte_for_access_track(new_spte);
f8e14497
BG
1132 }
1133
3039bcc7 1134 tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
7cca2d0b 1135
3039bcc7 1136 return true;
f8e14497
BG
1137}
1138
3039bcc7 1139bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
f8e14497 1140{
3039bcc7 1141 return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
f8e14497
BG
1142}
1143
3039bcc7
SC
1144static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1145 struct kvm_gfn_range *range)
f8e14497 1146{
3039bcc7 1147 return is_accessed_spte(iter->old_spte);
f8e14497
BG
1148}
1149
3039bcc7 1150bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
f8e14497 1151{
3039bcc7 1152 return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
f8e14497 1153}
1d8dd6b3 1154
3039bcc7
SC
1155static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1156 struct kvm_gfn_range *range)
1d8dd6b3 1157{
1d8dd6b3 1158 u64 new_spte;
7cca2d0b 1159
3039bcc7
SC
1160 /* Huge pages aren't expected to be modified without first being zapped. */
1161 WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1d8dd6b3 1162
3039bcc7
SC
1163 if (iter->level != PG_LEVEL_4K ||
1164 !is_shadow_present_pte(iter->old_spte))
1165 return false;
1d8dd6b3 1166
3039bcc7
SC
1167 /*
1168 * Note, when changing a read-only SPTE, it's not strictly necessary to
1169 * zero the SPTE before setting the new PFN, but doing so preserves the
1170 * invariant that the PFN of a present * leaf SPTE can never change.
1171 * See __handle_changed_spte().
1172 */
1173 tdp_mmu_set_spte(kvm, iter, 0);
1d8dd6b3 1174
3039bcc7
SC
1175 if (!pte_write(range->pte)) {
1176 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1177 pte_pfn(range->pte));
1d8dd6b3 1178
3039bcc7 1179 tdp_mmu_set_spte(kvm, iter, new_spte);
1d8dd6b3
BG
1180 }
1181
3039bcc7 1182 return true;
1d8dd6b3
BG
1183}
1184
3039bcc7
SC
1185/*
1186 * Handle the changed_pte MMU notifier for the TDP MMU.
1187 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1188 * notifier.
1189 * Returns non-zero if a flush is needed before releasing the MMU lock.
1190 */
1191bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1d8dd6b3 1192{
3039bcc7
SC
1193 bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1194
1195 /* FIXME: return 'flush' instead of flushing here. */
1196 if (flush)
1197 kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
1198
1199 return false;
1d8dd6b3
BG
1200}
1201
a6a0b05d 1202/*
bedd9195
DM
1203 * Remove write access from all SPTEs at or above min_level that map GFNs
1204 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1205 * be flushed.
a6a0b05d
BG
1206 */
1207static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1208 gfn_t start, gfn_t end, int min_level)
1209{
1210 struct tdp_iter iter;
1211 u64 new_spte;
1212 bool spte_set = false;
1213
7cca2d0b
BG
1214 rcu_read_lock();
1215
a6a0b05d
BG
1216 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1217
77aa6075 1218 for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
24ae4cfa
BG
1219retry:
1220 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1af4a960
BG
1221 continue;
1222
a6a0b05d 1223 if (!is_shadow_present_pte(iter.old_spte) ||
0f99ee2c
BG
1224 !is_last_spte(iter.old_spte, iter.level) ||
1225 !(iter.old_spte & PT_WRITABLE_MASK))
a6a0b05d
BG
1226 continue;
1227
1228 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1229
3e72c791 1230 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
24ae4cfa 1231 goto retry;
3255530a 1232
a6a0b05d 1233 spte_set = true;
a6a0b05d 1234 }
7cca2d0b
BG
1235
1236 rcu_read_unlock();
a6a0b05d
BG
1237 return spte_set;
1238}
1239
1240/*
1241 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1242 * only affect leaf SPTEs down to min_level.
1243 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1244 */
269e9552
HM
1245bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1246 const struct kvm_memory_slot *slot, int min_level)
a6a0b05d
BG
1247{
1248 struct kvm_mmu_page *root;
a6a0b05d
BG
1249 bool spte_set = false;
1250
24ae4cfa 1251 lockdep_assert_held_read(&kvm->mmu_lock);
a6a0b05d 1252
d62007ed 1253 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
a6a0b05d
BG
1254 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1255 slot->base_gfn + slot->npages, min_level);
a6a0b05d
BG
1256
1257 return spte_set;
1258}
1259
1260/*
1261 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1262 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1263 * If AD bits are not enabled, this will require clearing the writable bit on
1264 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1265 * be flushed.
1266 */
1267static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1268 gfn_t start, gfn_t end)
1269{
1270 struct tdp_iter iter;
1271 u64 new_spte;
1272 bool spte_set = false;
1273
7cca2d0b
BG
1274 rcu_read_lock();
1275
a6a0b05d 1276 tdp_root_for_each_leaf_pte(iter, root, start, end) {
24ae4cfa
BG
1277retry:
1278 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1af4a960
BG
1279 continue;
1280
a6a0b05d
BG
1281 if (spte_ad_need_write_protect(iter.old_spte)) {
1282 if (is_writable_pte(iter.old_spte))
1283 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1284 else
1285 continue;
1286 } else {
1287 if (iter.old_spte & shadow_dirty_mask)
1288 new_spte = iter.old_spte & ~shadow_dirty_mask;
1289 else
1290 continue;
1291 }
1292
3e72c791 1293 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
24ae4cfa 1294 goto retry;
3255530a 1295
a6a0b05d 1296 spte_set = true;
a6a0b05d 1297 }
7cca2d0b
BG
1298
1299 rcu_read_unlock();
a6a0b05d
BG
1300 return spte_set;
1301}
1302
1303/*
1304 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1305 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1306 * If AD bits are not enabled, this will require clearing the writable bit on
1307 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1308 * be flushed.
1309 */
269e9552
HM
1310bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1311 const struct kvm_memory_slot *slot)
a6a0b05d
BG
1312{
1313 struct kvm_mmu_page *root;
a6a0b05d
BG
1314 bool spte_set = false;
1315
24ae4cfa 1316 lockdep_assert_held_read(&kvm->mmu_lock);
a6a0b05d 1317
d62007ed 1318 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
a6a0b05d
BG
1319 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1320 slot->base_gfn + slot->npages);
a6a0b05d
BG
1321
1322 return spte_set;
1323}
1324
1325/*
1326 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1327 * set in mask, starting at gfn. The given memslot is expected to contain all
1328 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1329 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1330 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1331 */
1332static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1333 gfn_t gfn, unsigned long mask, bool wrprot)
1334{
1335 struct tdp_iter iter;
1336 u64 new_spte;
1337
7cca2d0b
BG
1338 rcu_read_lock();
1339
a6a0b05d
BG
1340 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1341 gfn + BITS_PER_LONG) {
1342 if (!mask)
1343 break;
1344
1345 if (iter.level > PG_LEVEL_4K ||
1346 !(mask & (1UL << (iter.gfn - gfn))))
1347 continue;
1348
f1b3b06a
BG
1349 mask &= ~(1UL << (iter.gfn - gfn));
1350
a6a0b05d
BG
1351 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1352 if (is_writable_pte(iter.old_spte))
1353 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1354 else
1355 continue;
1356 } else {
1357 if (iter.old_spte & shadow_dirty_mask)
1358 new_spte = iter.old_spte & ~shadow_dirty_mask;
1359 else
1360 continue;
1361 }
1362
1363 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
a6a0b05d 1364 }
7cca2d0b
BG
1365
1366 rcu_read_unlock();
a6a0b05d
BG
1367}
1368
1369/*
1370 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1371 * set in mask, starting at gfn. The given memslot is expected to contain all
1372 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1373 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1374 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1375 */
1376void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1377 struct kvm_memory_slot *slot,
1378 gfn_t gfn, unsigned long mask,
1379 bool wrprot)
1380{
1381 struct kvm_mmu_page *root;
a6a0b05d 1382
531810ca 1383 lockdep_assert_held_write(&kvm->mmu_lock);
a3f15bda 1384 for_each_tdp_mmu_root(kvm, root, slot->as_id)
a6a0b05d 1385 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
a6a0b05d
BG
1386}
1387
14881998 1388/*
87aa9ec9
BG
1389 * Clear leaf entries which could be replaced by large mappings, for
1390 * GFNs within the slot.
14881998 1391 */
4b85c921 1392static void zap_collapsible_spte_range(struct kvm *kvm,
14881998 1393 struct kvm_mmu_page *root,
4b85c921 1394 const struct kvm_memory_slot *slot)
14881998 1395{
9eba50f8
SC
1396 gfn_t start = slot->base_gfn;
1397 gfn_t end = start + slot->npages;
14881998
BG
1398 struct tdp_iter iter;
1399 kvm_pfn_t pfn;
14881998 1400
7cca2d0b
BG
1401 rcu_read_lock();
1402
14881998 1403 tdp_root_for_each_pte(iter, root, start, end) {
2db6f772 1404retry:
4b85c921 1405 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1af4a960 1406 continue;
1af4a960 1407
14881998 1408 if (!is_shadow_present_pte(iter.old_spte) ||
87aa9ec9 1409 !is_last_spte(iter.old_spte, iter.level))
14881998
BG
1410 continue;
1411
1412 pfn = spte_to_pfn(iter.old_spte);
1413 if (kvm_is_reserved_pfn(pfn) ||
9eba50f8
SC
1414 iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1415 pfn, PG_LEVEL_NUM))
14881998
BG
1416 continue;
1417
4b85c921 1418 /* Note, a successful atomic zap also does a remote TLB flush. */
3e72c791 1419 if (tdp_mmu_zap_spte_atomic(kvm, &iter))
2db6f772 1420 goto retry;
14881998
BG
1421 }
1422
7cca2d0b 1423 rcu_read_unlock();
14881998
BG
1424}
1425
1426/*
1427 * Clear non-leaf entries (and free associated page tables) which could
1428 * be replaced by large mappings, for GFNs within the slot.
1429 */
4b85c921
SC
1430void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1431 const struct kvm_memory_slot *slot)
14881998
BG
1432{
1433 struct kvm_mmu_page *root;
14881998 1434
2db6f772 1435 lockdep_assert_held_read(&kvm->mmu_lock);
14881998 1436
d62007ed 1437 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
4b85c921 1438 zap_collapsible_spte_range(kvm, root, slot);
14881998 1439}
46044f72
BG
1440
1441/*
1442 * Removes write access on the last level SPTE mapping this GFN and unsets the
5fc3424f 1443 * MMU-writable bit to ensure future writes continue to be intercepted.
46044f72
BG
1444 * Returns true if an SPTE was set and a TLB flush is needed.
1445 */
1446static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
3ad93562 1447 gfn_t gfn, int min_level)
46044f72
BG
1448{
1449 struct tdp_iter iter;
1450 u64 new_spte;
1451 bool spte_set = false;
1452
3ad93562
KZ
1453 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1454
7cca2d0b
BG
1455 rcu_read_lock();
1456
77aa6075 1457 for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
3ad93562
KZ
1458 if (!is_shadow_present_pte(iter.old_spte) ||
1459 !is_last_spte(iter.old_spte, iter.level))
1460 continue;
1461
46044f72 1462 new_spte = iter.old_spte &
5fc3424f 1463 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
46044f72 1464
7c8a4742
DM
1465 if (new_spte == iter.old_spte)
1466 break;
1467
46044f72
BG
1468 tdp_mmu_set_spte(kvm, &iter, new_spte);
1469 spte_set = true;
1470 }
1471
7cca2d0b
BG
1472 rcu_read_unlock();
1473
46044f72
BG
1474 return spte_set;
1475}
1476
1477/*
1478 * Removes write access on the last level SPTE mapping this GFN and unsets the
5fc3424f 1479 * MMU-writable bit to ensure future writes continue to be intercepted.
46044f72
BG
1480 * Returns true if an SPTE was set and a TLB flush is needed.
1481 */
1482bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
3ad93562
KZ
1483 struct kvm_memory_slot *slot, gfn_t gfn,
1484 int min_level)
46044f72
BG
1485{
1486 struct kvm_mmu_page *root;
46044f72
BG
1487 bool spte_set = false;
1488
531810ca 1489 lockdep_assert_held_write(&kvm->mmu_lock);
a3f15bda 1490 for_each_tdp_mmu_root(kvm, root, slot->as_id)
3ad93562 1491 spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
a3f15bda 1492
46044f72
BG
1493 return spte_set;
1494}
1495
95fb5b02
BG
1496/*
1497 * Return the level of the lowest level SPTE added to sptes.
1498 * That SPTE may be non-present.
c5c8c7c5
DM
1499 *
1500 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
95fb5b02 1501 */
39b4d43e
SC
1502int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1503 int *root_level)
95fb5b02
BG
1504{
1505 struct tdp_iter iter;
1506 struct kvm_mmu *mmu = vcpu->arch.mmu;
95fb5b02 1507 gfn_t gfn = addr >> PAGE_SHIFT;
2aa07893 1508 int leaf = -1;
95fb5b02 1509
39b4d43e 1510 *root_level = vcpu->arch.mmu->shadow_root_level;
95fb5b02
BG
1511
1512 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1513 leaf = iter.level;
dde81f94 1514 sptes[leaf] = iter.old_spte;
95fb5b02
BG
1515 }
1516
1517 return leaf;
1518}
6e8eb206
DM
1519
1520/*
1521 * Returns the last level spte pointer of the shadow page walk for the given
1522 * gpa, and sets *spte to the spte value. This spte may be non-preset. If no
1523 * walk could be performed, returns NULL and *spte does not contain valid data.
1524 *
1525 * Contract:
1526 * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1527 * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1528 *
1529 * WARNING: This function is only intended to be called during fast_page_fault.
1530 */
1531u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1532 u64 *spte)
1533{
1534 struct tdp_iter iter;
1535 struct kvm_mmu *mmu = vcpu->arch.mmu;
1536 gfn_t gfn = addr >> PAGE_SHIFT;
1537 tdp_ptep_t sptep = NULL;
1538
1539 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1540 *spte = iter.old_spte;
1541 sptep = iter.sptep;
1542 }
1543
1544 /*
1545 * Perform the rcu_dereference to get the raw spte pointer value since
1546 * we are passing it up to fast_page_fault, which is shared with the
1547 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1548 * annotation.
1549 *
1550 * This is safe since fast_page_fault obeys the contracts of this
1551 * function as well as all TDP MMU contracts around modifying SPTEs
1552 * outside of mmu_lock.
1553 */
1554 return rcu_dereference(sptep);
1555}