KVM: x86/mmu: Document that zapping invalidated roots doesn't need to flush
arch/x86/kvm/mmu/tdp_mmu.c
1// SPDX-License-Identifier: GPL-2.0
2
02c00b3a
BG
3#include "mmu.h"
4#include "mmu_internal.h"
bb18842e 5#include "mmutrace.h"
2f2fad08 6#include "tdp_iter.h"
fe5db27d 7#include "tdp_mmu.h"
02c00b3a 8#include "spte.h"
fe5db27d 9
9a77daac 10#include <asm/cmpxchg.h>
33dd3574
BG
11#include <trace/events/kvm.h>
12
71ba3f31 13static bool __read_mostly tdp_mmu_enabled = true;
95fb5b02 14module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
fe5db27d
BG
15
16/* Initializes the TDP MMU for the VM, if enabled. */
d501f747 17bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
fe5db27d 18{
897218ff 19 if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
d501f747 20 return false;
fe5db27d
BG
21
22 /* This should not be changed for the lifetime of the VM. */
23 kvm->arch.tdp_mmu_enabled = true;
02c00b3a
BG
24
25 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
9a77daac 26 spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
89c0fd49 27 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
d501f747
BG
28
29 return true;
fe5db27d
BG
30}
31
6103bc07
BG
32static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
33 bool shared)
34{
35 if (shared)
36 lockdep_assert_held_read(&kvm->mmu_lock);
37 else
38 lockdep_assert_held_write(&kvm->mmu_lock);
39}
40
fe5db27d
BG
41void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
42{
43 if (!kvm->arch.tdp_mmu_enabled)
44 return;
02c00b3a 45
524a1e4e 46 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
02c00b3a 47 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
7cca2d0b
BG
48
49 /*
50 * Ensure that all the outstanding RCU callbacks to free shadow pages
51 * can run before the VM is torn down.
52 */
53 rcu_barrier();
02c00b3a
BG
54}
55
2bdb3d84 56static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
6103bc07
BG
57 gfn_t start, gfn_t end, bool can_yield, bool flush,
58 bool shared);
2bdb3d84
BG
59
60static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
a889ea54 61{
2bdb3d84
BG
62 free_page((unsigned long)sp->spt);
63 kmem_cache_free(mmu_page_header_cache, sp);
a889ea54
BG
64}
65
c0e64238
BG
66/*
67 * This is called through call_rcu in order to free TDP page table memory
68 * safely with respect to other kernel threads that may be operating on
69 * the memory.
70 * By only accessing TDP MMU page table memory in an RCU read critical
71 * section, and freeing it after a grace period, lockless access to that
72 * memory won't use it after it is freed.
73 */
74static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
a889ea54 75{
c0e64238
BG
76 struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
77 rcu_head);
a889ea54 78
c0e64238
BG
79 tdp_mmu_free_sp(sp);
80}
a889ea54 81
6103bc07
BG
82void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
83 bool shared)
2bdb3d84 84{
6103bc07 85 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
a889ea54 86
11cccf5c 87 if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
2bdb3d84
BG
88 return;
89
90 WARN_ON(!root->tdp_mmu_page);
91
c0e64238
BG
92 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
93 list_del_rcu(&root->link);
94 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
2bdb3d84 95
db01416b
SC
96 /*
97 * A TLB flush is not necessary as KVM performs a local TLB flush when
98 * allocating a new root (see kvm_mmu_load()), and when migrating a vCPU
99 * to a different pCPU. Note, the local TLB flush on reuse also
100 * invalidates any paging-structure-cache entries, i.e. TLB entries for
101 * intermediate paging structures, that may be zapped, as such entries
102 * are associated with the ASID on both VMX and SVM.
103 */
104 (void)zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);
2bdb3d84 105
c0e64238 106 call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
a889ea54
BG
107}
108
cfc10997 109/*
d62007ed
SC
110 * Returns the next root after @prev_root (or the first root if @prev_root is
111 * NULL). A reference to the returned root is acquired, and the reference to
112 * @prev_root is released (the caller obviously must hold a reference to
113 * @prev_root if it's non-NULL).
114 *
115 * If @only_valid is true, invalid roots are skipped.
116 *
117 * Returns NULL if the end of tdp_mmu_roots was reached.
cfc10997
BG
118 */
119static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
6103bc07 120 struct kvm_mmu_page *prev_root,
d62007ed 121 bool shared, bool only_valid)
a889ea54
BG
122{
123 struct kvm_mmu_page *next_root;
124
c0e64238
BG
125 rcu_read_lock();
126
cfc10997 127 if (prev_root)
c0e64238
BG
128 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
129 &prev_root->link,
130 typeof(*prev_root), link);
cfc10997 131 else
c0e64238
BG
132 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
133 typeof(*next_root), link);
a889ea54 134
04dc4e6c 135 while (next_root) {
d62007ed 136 if ((!only_valid || !next_root->role.invalid) &&
ad6d6b94 137 kvm_tdp_mmu_get_root(next_root))
04dc4e6c
SC
138 break;
139
c0e64238
BG
140 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
141 &next_root->link, typeof(*next_root), link);
04dc4e6c 142 }
fb101293 143
c0e64238 144 rcu_read_unlock();
a889ea54 145
cfc10997 146 if (prev_root)
6103bc07 147 kvm_tdp_mmu_put_root(kvm, prev_root, shared);
a889ea54 148
a889ea54
BG
149 return next_root;
150}
151
152/*
153 * Note: this iterator gets and puts references to the roots it iterates over.
154 * This makes it safe to release the MMU lock and yield within the loop, but
155 * if exiting the loop early, the caller must drop the reference to the most
156 * recent root. (Unless keeping a live reference is desirable.)
6103bc07
BG
157 *
158 * If shared is set, this function is operating under the MMU lock in read
159 * mode. In the unlikely event that this thread must free a root, the lock
160 * will be temporarily dropped and reacquired in write mode.
a889ea54 161 */
d62007ed
SC
162#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
163 for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \
164 _root; \
165 _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \
166 if (kvm_mmu_page_as_id(_root) != _as_id) { \
a3f15bda 167 } else
a889ea54 168
d62007ed
SC
169#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
170 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
171
172#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
173 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, false)
174
c0e64238
BG
175#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
176 list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link, \
177 lockdep_is_held_type(&kvm->mmu_lock, 0) || \
178 lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock)) \
a3f15bda
SC
179 if (kvm_mmu_page_as_id(_root) != _as_id) { \
180 } else
02c00b3a 181
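/*
 * Allocate a TDP MMU shadow page and its backing page table page from the
 * vCPU's MMU memory caches (mmu_page_header_cache and mmu_shadow_page_cache).
 */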
a82070b6 182static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
02c00b3a
BG
183{
184 struct kvm_mmu_page *sp;
185
186 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
187 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
a82070b6
DM
188
189 return sp;
190}
191
192static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, gfn_t gfn,
193 union kvm_mmu_page_role role)
194{
02c00b3a
BG
195 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
196
a3aca4de 197 sp->role = role;
02c00b3a
BG
198 sp->gfn = gfn;
199 sp->tdp_mmu_page = true;
200
33dd3574 201 trace_kvm_mmu_get_page(sp, true);
02c00b3a
BG
202}
203
a82070b6
DM
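/*
 * Initialize @child_sp as a child of the page table @iter currently points
 * into: inherit the parent's role with the level decremented by one, using
 * iter->gfn as the base GFN.
 */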
204static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
205 struct tdp_iter *iter)
02c00b3a 206{
a3aca4de 207 struct kvm_mmu_page *parent_sp;
02c00b3a 208 union kvm_mmu_page_role role;
a3aca4de
DM
209
210 parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
211
212 role = parent_sp->role;
213 role.level--;
214
a82070b6 215 tdp_mmu_init_sp(child_sp, iter->gfn, role);
a3aca4de
DM
216}
217
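/*
 * Return the physical address of the vCPU's TDP MMU root page table. Reuse
 * an existing root with a matching role (taking a reference on it) if one
 * exists, otherwise allocate a new root and publish it on tdp_mmu_roots.
 * The caller must hold mmu_lock for write.
 */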
218hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
219{
220 union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base;
02c00b3a
BG
221 struct kvm *kvm = vcpu->kvm;
222 struct kvm_mmu_page *root;
223
6e6ec584 224 lockdep_assert_held_write(&kvm->mmu_lock);
02c00b3a 225
04dc4e6c
SC
226 /*
227 * Check for an existing root before allocating a new one. Note, the
228 * role check prevents consuming an invalid root.
229 */
a3f15bda 230 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
fb101293 231 if (root->role.word == role.word &&
ad6d6b94 232 kvm_tdp_mmu_get_root(root))
6e6ec584 233 goto out;
02c00b3a
BG
234 }
235
a82070b6
DM
236 root = tdp_mmu_alloc_sp(vcpu);
237 tdp_mmu_init_sp(root, 0, role);
238
11cccf5c 239 refcount_set(&root->tdp_mmu_root_count, 1);
02c00b3a 240
c0e64238
BG
241 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
242 list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
243 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
02c00b3a 244
6e6ec584 245out:
02c00b3a 246 return __pa(root->spt);
fe5db27d 247}
2f2fad08
BG
248
249static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
9a77daac
BG
250 u64 old_spte, u64 new_spte, int level,
251 bool shared);
2f2fad08 252
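/*
 * Propagate accessed state to the primary MM: if a present leaf SPTE that
 * was accessed is being removed, or loses its accessed bit or PFN, mark the
 * old PFN as accessed via kvm_set_pfn_accessed().
 */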
f8e14497
BG
253static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
254{
f8e14497
BG
255 if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
256 return;
257
258 if (is_accessed_spte(old_spte) &&
64bb2769
SC
259 (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
260 spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
f8e14497
BG
261 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
262}
263
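/*
 * Update dirty logging when a 4K SPTE becomes writable: mark the GFN dirty
 * in its memslot (see mark_page_dirty_in_slot()).
 */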
a6a0b05d
BG
264static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
265 u64 old_spte, u64 new_spte, int level)
266{
267 bool pfn_changed;
268 struct kvm_memory_slot *slot;
269
270 if (level > PG_LEVEL_4K)
271 return;
272
273 pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
274
275 if ((!is_writable_pte(old_spte) || pfn_changed) &&
276 is_writable_pte(new_spte)) {
277 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
fb04a1ed 278 mark_page_dirty_in_slot(kvm, slot, gfn);
a6a0b05d
BG
279 }
280}
281
a9442f59 282/**
c298a30c 283 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
a9442f59
BG
284 *
285 * @kvm: kvm instance
286 * @sp: the page to be removed
9a77daac
BG
287 * @shared: This operation may not be running under the exclusive use of
288 * the MMU lock and the operation must synchronize with other
289 * threads that might be adding or removing pages.
a9442f59 290 */
c298a30c
DM
291static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
292 bool shared)
a9442f59 293{
9a77daac
BG
294 if (shared)
295 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
296 else
297 lockdep_assert_held_write(&kvm->mmu_lock);
a9442f59
BG
298
299 list_del(&sp->link);
300 if (sp->lpage_disallowed)
301 unaccount_huge_nx_page(kvm, sp);
9a77daac
BG
302
303 if (shared)
304 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
a9442f59
BG
305}
306
a066e61f 307/**
0f53dfa3 308 * handle_removed_pt() - handle a page table removed from the TDP structure
a066e61f
BG
309 *
310 * @kvm: kvm instance
311 * @pt: the page removed from the paging structure
9a77daac
BG
312 * @shared: This operation may not be running under the exclusive use
313 * of the MMU lock and the operation must synchronize with other
314 * threads that might be modifying SPTEs.
a066e61f
BG
315 *
316 * Given a page table that has been removed from the TDP paging structure,
317 * iterates through the page table to clear SPTEs and free child page tables.
70fb3e41
BG
318 *
319 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
320 * protection. Since this thread removed it from the paging structure,
321 * this thread will be responsible for ensuring the page is freed. Hence the
322 * early rcu_dereferences in the function.
a066e61f 323 */
0f53dfa3 324static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
a066e61f 325{
70fb3e41 326 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
a066e61f 327 int level = sp->role.level;
e25f0e0c 328 gfn_t base_gfn = sp->gfn;
a066e61f
BG
329 int i;
330
331 trace_kvm_mmu_prepare_zap_page(sp);
332
c298a30c 333 tdp_mmu_unlink_sp(kvm, sp, shared);
a066e61f
BG
334
335 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
574c3c55
BG
336 u64 *sptep = rcu_dereference(pt) + i;
337 gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
338 u64 old_child_spte;
9a77daac
BG
339
340 if (shared) {
e25f0e0c
BG
341 /*
342 * Set the SPTE to a nonpresent value that other
343 * threads will not overwrite. If the SPTE was
344 * already marked as removed then another thread
345 * handling a page fault could overwrite it, so
346 * keep retrying until this thread is the one that sets the SPTE
347 * from some other value to the removed SPTE value.
348 */
349 for (;;) {
350 old_child_spte = xchg(sptep, REMOVED_SPTE);
351 if (!is_removed_spte(old_child_spte))
352 break;
353 cpu_relax();
354 }
9a77daac 355 } else {
8df9f1af
SC
356 /*
357 * If the SPTE is not MMU-present, there is no backing
358 * page associated with the SPTE and so no side effects
359 * that need to be recorded, and exclusive ownership of
360 * mmu_lock ensures the SPTE can't be made present.
361 * Note, zapping MMIO SPTEs is also unnecessary as they
362 * are guarded by the memslots generation, not by being
363 * unreachable.
364 */
9a77daac 365 old_child_spte = READ_ONCE(*sptep);
8df9f1af
SC
366 if (!is_shadow_present_pte(old_child_spte))
367 continue;
e25f0e0c
BG
368
369 /*
370 * Marking the SPTE as a removed SPTE is not
371 * strictly necessary here as the MMU lock will
372 * stop other threads from concurrently modifying
373 * this SPTE. Using the removed SPTE value keeps
374 * the two branches consistent and simplifies
375 * the function.
376 */
377 WRITE_ONCE(*sptep, REMOVED_SPTE);
9a77daac 378 }
e25f0e0c 379 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
f1b83255 380 old_child_spte, REMOVED_SPTE, level,
e25f0e0c 381 shared);
a066e61f
BG
382 }
383
574c3c55 384 kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
f1b83255 385 KVM_PAGES_PER_HPAGE(level + 1));
a066e61f 386
7cca2d0b 387 call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
a066e61f
BG
388}
389
2f2fad08 390/**
7f6231a3 391 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
2f2fad08
BG
392 * @kvm: kvm instance
393 * @as_id: the address space of the paging structure the SPTE was a part of
394 * @gfn: the base GFN that was mapped by the SPTE
395 * @old_spte: The value of the SPTE before the change
396 * @new_spte: The value of the SPTE after the change
397 * @level: the level of the PT the SPTE is part of in the paging structure
9a77daac
BG
398 * @shared: This operation may not be running under the exclusive use of
399 * the MMU lock and the operation must synchronize with other
400 * threads that might be modifying SPTEs.
2f2fad08
BG
401 *
402 * Handle bookkeeping that might result from the modification of a SPTE.
403 * This function must be called for all TDP SPTE modifications.
404 */
405static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
9a77daac
BG
406 u64 old_spte, u64 new_spte, int level,
407 bool shared)
2f2fad08
BG
408{
409 bool was_present = is_shadow_present_pte(old_spte);
410 bool is_present = is_shadow_present_pte(new_spte);
411 bool was_leaf = was_present && is_last_spte(old_spte, level);
412 bool is_leaf = is_present && is_last_spte(new_spte, level);
413 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
2f2fad08
BG
414
415 WARN_ON(level > PT64_ROOT_MAX_LEVEL);
416 WARN_ON(level < PG_LEVEL_4K);
764388ce 417 WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
2f2fad08
BG
418
419 /*
420 * If this warning were to trigger it would indicate that there was a
421 * missing MMU notifier or a race with some notifier handler.
422 * A present, leaf SPTE should never be directly replaced with another
d9f6e12f 423 * present leaf SPTE pointing to a different PFN. A notifier handler
2f2fad08
BG
424 * should be zapping the SPTE before the main MM's page table is
425 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
426 * thread before replacement.
427 */
428 if (was_leaf && is_leaf && pfn_changed) {
429 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
430 "SPTE with another present leaf SPTE mapping a\n"
431 "different PFN!\n"
432 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
433 as_id, gfn, old_spte, new_spte, level);
434
435 /*
436 * Crash the host to prevent error propagation and guest data
d9f6e12f 437 * corruption.
2f2fad08
BG
438 */
439 BUG();
440 }
441
442 if (old_spte == new_spte)
443 return;
444
b9a98c34
BG
445 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
446
115111ef
DM
447 if (is_leaf)
448 check_spte_writable_invariants(new_spte);
449
2f2fad08
BG
450 /*
451 * The only times a SPTE should be changed from a non-present to
452 * non-present state is when an MMIO entry is installed/modified/
453 * removed. In that case, there is nothing to do here.
454 */
455 if (!was_present && !is_present) {
456 /*
08f07c80
BG
457 * If this change does not involve a MMIO SPTE or removed SPTE,
458 * it is unexpected. Log the change, though it should not
459 * impact the guest since both the former and current SPTEs
460 * are nonpresent.
2f2fad08 461 */
08f07c80
BG
462 if (WARN_ON(!is_mmio_spte(old_spte) &&
463 !is_mmio_spte(new_spte) &&
464 !is_removed_spte(new_spte)))
2f2fad08
BG
465 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
466 "should not be replaced with another,\n"
467 "different nonpresent SPTE, unless one or both\n"
08f07c80
BG
468 "are MMIO SPTEs, or the new SPTE is\n"
469 "a temporary removed SPTE.\n"
2f2fad08
BG
470 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
471 as_id, gfn, old_spte, new_spte, level);
472 return;
473 }
474
71f51d2c
MZ
475 if (is_leaf != was_leaf)
476 kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
2f2fad08
BG
477
478 if (was_leaf && is_dirty_spte(old_spte) &&
64bb2769 479 (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
2f2fad08
BG
480 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
481
482 /*
483 * Recursively handle child PTs if the change removed a subtree from
484 * the paging structure.
485 */
a066e61f 486 if (was_present && !was_leaf && (pfn_changed || !is_present))
0f53dfa3 487 handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
2f2fad08
BG
488}
489
490static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
9a77daac
BG
491 u64 old_spte, u64 new_spte, int level,
492 bool shared)
2f2fad08 493{
9a77daac
BG
494 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
495 shared);
f8e14497 496 handle_changed_spte_acc_track(old_spte, new_spte, level);
a6a0b05d
BG
497 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
498 new_spte, level);
2f2fad08 499}
faaf05b0 500
9a77daac 501/*
6ccf4438
PB
502 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
503 * and handle the associated bookkeeping. Do not mark the page dirty
24ae4cfa 504 * in KVM's dirty bitmaps.
9a77daac 505 *
3255530a
DM
506 * If setting the SPTE fails because it has changed, iter->old_spte will be
507 * refreshed to the current value of the spte.
508 *
9a77daac
BG
509 * @kvm: kvm instance
510 * @iter: a tdp_iter instance currently on the SPTE that should be set
511 * @new_spte: The value the SPTE should be set to
3e72c791
DM
512 * Return:
513 * * 0 - If the SPTE was set.
514 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
515 * no side-effects other than setting iter->old_spte to the last
516 * known value of the spte.
9a77daac 517 */
3e72c791
DM
518static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
519 struct tdp_iter *iter,
520 u64 new_spte)
9a77daac 521{
3255530a
DM
522 u64 *sptep = rcu_dereference(iter->sptep);
523 u64 old_spte;
524
3a0f64de
SC
525 WARN_ON_ONCE(iter->yielded);
526
9a77daac
BG
527 lockdep_assert_held_read(&kvm->mmu_lock);
528
08f07c80
BG
529 /*
530 * Do not change removed SPTEs. Only the thread that froze the SPTE
531 * may modify it.
532 */
7a51393a 533 if (is_removed_spte(iter->old_spte))
3e72c791 534 return -EBUSY;
08f07c80 535
6e8eb206
DM
536 /*
537 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
538 * does not hold the mmu_lock.
539 */
3255530a
DM
540 old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
541 if (old_spte != iter->old_spte) {
542 /*
543 * The page table entry was modified by a different logical
544 * CPU. Refresh iter->old_spte with the current value so the
545 * caller operates on fresh data, e.g. if it retries
546 * tdp_mmu_set_spte_atomic().
547 */
548 iter->old_spte = old_spte;
3e72c791 549 return -EBUSY;
3255530a 550 }
9a77daac 551
24ae4cfa
BG
552 __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
553 new_spte, iter->level, true);
554 handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
9a77daac 555
3e72c791 556 return 0;
9a77daac
BG
557}
558
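/*
 * Zap the SPTE at @iter while holding mmu_lock for read: freeze the SPTE with
 * the special removed value, flush remote TLBs for the range it mapped, and
 * only then clear it so that other threads cannot install a present entry
 * before the flush completes. Returns 0 on success, -EBUSY if the SPTE
 * changed underneath this thread.
 */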
3e72c791
DM
559static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
560 struct tdp_iter *iter)
08f07c80 561{
3e72c791
DM
562 int ret;
563
08f07c80
BG
564 /*
565 * Freeze the SPTE by setting it to a special,
566 * non-present value. This will stop other threads from
567 * immediately installing a present entry in its place
568 * before the TLBs are flushed.
569 */
3e72c791
DM
570 ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
571 if (ret)
572 return ret;
08f07c80
BG
573
574 kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
575 KVM_PAGES_PER_HPAGE(iter->level));
576
577 /*
578 * No other thread can overwrite the removed SPTE as they
579 * must either wait on the MMU lock or use
d9f6e12f 580 * tdp_mmu_set_spte_atomic which will not overwrite the
08f07c80
BG
581 * special removed SPTE value. No bookkeeping is needed
582 * here since the SPTE is going from non-present
583 * to non-present.
584 */
14f6fec2 585 WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
08f07c80 586
3e72c791 587 return 0;
08f07c80
BG
588}
589
9a77daac 590
fe43fa2f
BG
591/*
592 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
593 * @kvm: kvm instance
594 * @iter: a tdp_iter instance currently on the SPTE that should be set
595 * @new_spte: The value the SPTE should be set to
596 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
597 * of the page. Should be set unless handling an MMU
598 * notifier for access tracking. Leaving record_acc_track
599 * unset in that case prevents page accesses from being
600 * double counted.
601 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
602 * appropriate for the change being made. Should be set
603 * unless performing certain dirty logging operations.
604 * Leaving record_dirty_log unset in that case prevents page
605 * writes from being double counted.
606 */
f8e14497 607static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
a6a0b05d
BG
608 u64 new_spte, bool record_acc_track,
609 bool record_dirty_log)
faaf05b0 610{
3a0f64de
SC
611 WARN_ON_ONCE(iter->yielded);
612
531810ca 613 lockdep_assert_held_write(&kvm->mmu_lock);
3a9a4aa5 614
08f07c80
BG
615 /*
616 * No thread should be using this function to set SPTEs to the
617 * temporary removed SPTE value.
618 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
619 * should be used. If operating under the MMU lock in write mode, the
620 * use of the removed SPTE should not be necessary.
621 */
7a51393a 622 WARN_ON(is_removed_spte(iter->old_spte));
08f07c80 623
7cca2d0b 624 WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
f8e14497 625
08889894
SC
626 __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
627 new_spte, iter->level, false);
f8e14497
BG
628 if (record_acc_track)
629 handle_changed_spte_acc_track(iter->old_spte, new_spte,
630 iter->level);
a6a0b05d 631 if (record_dirty_log)
08889894 632 handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
a6a0b05d
BG
633 iter->old_spte, new_spte,
634 iter->level);
f8e14497
BG
635}
636
637static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
638 u64 new_spte)
639{
a6a0b05d 640 __tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
f8e14497 641}
faaf05b0 642
f8e14497
BG
643static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
644 struct tdp_iter *iter,
645 u64 new_spte)
646{
a6a0b05d
BG
647 __tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
648}
649
650static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
651 struct tdp_iter *iter,
652 u64 new_spte)
653{
654 __tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
faaf05b0
BG
655}
656
657#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
77aa6075 658 for_each_tdp_pte(_iter, _root, _start, _end)
faaf05b0 659
f8e14497
BG
660#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
661 tdp_root_for_each_pte(_iter, _root, _start, _end) \
662 if (!is_shadow_present_pte(_iter.old_spte) || \
663 !is_last_spte(_iter.old_spte, _iter.level)) \
664 continue; \
665 else
666
bb18842e 667#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
b9e5603c 668 for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
bb18842e 669
e28a436c
BG
670/*
671 * Yield if the MMU lock is contended or this thread needs to return control
672 * to the scheduler.
673 *
e139a34e
BG
674 * If this function should yield and flush is set, it will perform a remote
675 * TLB flush before yielding.
676 *
3a0f64de
SC
677 * If this function yields, iter->yielded is set and the caller must skip to
678 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
679 * over the paging structures to allow the iterator to continue its traversal
680 * from the paging structure root.
e28a436c 681 *
3a0f64de 682 * Returns true if this function yielded.
e28a436c 683 */
3a0f64de
SC
684static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
685 struct tdp_iter *iter,
686 bool flush, bool shared)
a6a0b05d 687{
3a0f64de
SC
688 WARN_ON(iter->yielded);
689
ed5e484b
BG
690 /* Ensure forward progress has been made before yielding. */
691 if (iter->next_last_level_gfn == iter->yielded_gfn)
692 return false;
693
531810ca 694 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
7cca2d0b
BG
695 rcu_read_unlock();
696
e139a34e
BG
697 if (flush)
698 kvm_flush_remote_tlbs(kvm);
699
6103bc07
BG
700 if (shared)
701 cond_resched_rwlock_read(&kvm->mmu_lock);
702 else
703 cond_resched_rwlock_write(&kvm->mmu_lock);
704
7cca2d0b 705 rcu_read_lock();
ed5e484b
BG
706
707 WARN_ON(iter->gfn > iter->next_last_level_gfn);
708
3a0f64de 709 iter->yielded = true;
a6a0b05d 710 }
e28a436c 711
3a0f64de 712 return iter->yielded;
a6a0b05d
BG
713}
714
faaf05b0
BG
715/*
716 * Tears down the mappings for the range of gfns, [start, end), and frees the
717 * non-root pages mapping GFNs strictly within that range. Returns true if
718 * SPTEs have been cleared and a TLB flush is needed before releasing the
719 * MMU lock.
6103bc07 720 *
063afacd
BG
721 * If can_yield is true, will release the MMU lock and reschedule if the
722 * scheduler needs the CPU or there is contention on the MMU lock. If this
723 * function cannot yield, it will not release the MMU lock or reschedule and
724 * the caller must ensure it does not supply too large a GFN range, or the
6103bc07
BG
725 * operation can cause a soft lockup.
726 *
727 * If shared is true, this thread holds the MMU lock in read mode and must
728 * account for the possibility that other threads are modifying the paging
729 * structures concurrently. If shared is false, this thread should hold the
730 * MMU lock in write mode.
faaf05b0
BG
731 */
732static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
6103bc07
BG
733 gfn_t start, gfn_t end, bool can_yield, bool flush,
734 bool shared)
faaf05b0 735{
524a1e4e
SC
736 gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
737 bool zap_all = (start == 0 && end >= max_gfn_host);
faaf05b0 738 struct tdp_iter iter;
faaf05b0 739
0103098f
SC
740 /*
741 * No need to try to step down in the iterator when zapping all SPTEs;
742 * zapping the top-level non-leaf SPTEs will recurse on their children.
743 */
744 int min_level = zap_all ? root->role.level : PG_LEVEL_4K;
745
524a1e4e
SC
746 /*
747 * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
748 * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
749 * and so KVM will never install a SPTE for such addresses.
750 */
751 end = min(end, max_gfn_host);
752
6103bc07
BG
753 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
754
7cca2d0b
BG
755 rcu_read_lock();
756
77aa6075 757 for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
6103bc07 758retry:
1af4a960 759 if (can_yield &&
6103bc07 760 tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
a835429c 761 flush = false;
1af4a960
BG
762 continue;
763 }
764
faaf05b0
BG
765 if (!is_shadow_present_pte(iter.old_spte))
766 continue;
767
768 /*
769 * If this is a non-last-level SPTE that covers a larger range
770 * than should be zapped, continue, and zap the mappings at a
524a1e4e 771 * lower level, except when zapping all SPTEs.
faaf05b0 772 */
524a1e4e
SC
773 if (!zap_all &&
774 (iter.gfn < start ||
faaf05b0
BG
775 iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
776 !is_last_spte(iter.old_spte, iter.level))
777 continue;
778
6103bc07
BG
779 if (!shared) {
780 tdp_mmu_set_spte(kvm, &iter, 0);
781 flush = true;
3e72c791 782 } else if (tdp_mmu_zap_spte_atomic(kvm, &iter)) {
6103bc07
BG
783 goto retry;
784 }
faaf05b0 785 }
7cca2d0b
BG
786
787 rcu_read_unlock();
a835429c 788 return flush;
faaf05b0
BG
789}
790
791/*
792 * Tears down the mappings for the range of gfns, [start, end), and frees the
793 * non-root pages mapping GFNs strictly within that range. Returns true if
794 * SPTEs have been cleared and a TLB flush is needed before releasing the
795 * MMU lock.
796 */
2b9663d8 797bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
5a324c24 798 gfn_t end, bool can_yield, bool flush)
faaf05b0
BG
799{
800 struct kvm_mmu_page *root;
faaf05b0 801
5a324c24 802 for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
6103bc07 803 flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
5a324c24 804 false);
faaf05b0 805
faaf05b0
BG
806 return flush;
807}
808
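/*
 * Zap all SPTEs in every root, for all address spaces, and flush the TLBs
 * once at the end if anything was zapped.
 */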
809void kvm_tdp_mmu_zap_all(struct kvm *kvm)
810{
2b9663d8
SC
811 bool flush = false;
812 int i;
813
814 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
5a324c24 815 flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush);
faaf05b0 816
faaf05b0
BG
817 if (flush)
818 kvm_flush_remote_tlbs(kvm);
819}
bb18842e 820
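/*
 * Return the next root after @prev_root (or the first root if @prev_root is
 * NULL) that is invalid and still has a non-zero refcount, i.e. an
 * invalidated root that still needs to be zapped. Callers walk the list
 * under rcu_read_lock().
 */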
4c6654bd
BG
821static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
822 struct kvm_mmu_page *prev_root)
823{
824 struct kvm_mmu_page *next_root;
825
826 if (prev_root)
827 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
828 &prev_root->link,
829 typeof(*prev_root), link);
830 else
831 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
832 typeof(*next_root), link);
833
834 while (next_root && !(next_root->role.invalid &&
835 refcount_read(&next_root->tdp_mmu_root_count)))
836 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
837 &next_root->link,
838 typeof(*next_root), link);
839
840 return next_root;
841}
842
843/*
f28e9c7f
SC
844 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
845 * zap" completes. Since kvm_tdp_mmu_invalidate_all_roots() has acquired a
846 * reference to each invalidated root, roots will not be freed until after this
847 * function drops the gifted reference, e.g. so that vCPUs don't get stuck with
848 * tearing down paging structures.
4c6654bd
BG
849 */
850void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
851{
4c6654bd
BG
852 struct kvm_mmu_page *next_root;
853 struct kvm_mmu_page *root;
4c6654bd
BG
854
855 lockdep_assert_held_read(&kvm->mmu_lock);
856
857 rcu_read_lock();
858
859 root = next_invalidated_root(kvm, NULL);
860
861 while (root) {
862 next_root = next_invalidated_root(kvm, root);
863
864 rcu_read_unlock();
865
7ae5840e
SC
866 /*
867 * A TLB flush is unnecessary, invalidated roots are guaranteed
868 * to be unreachable by the guest (see kvm_tdp_mmu_put_root()
869 * for more details), and unlike the legacy MMU, no vCPU kick
870 * is needed to play nice with lockless shadow walks as the TDP
871 * MMU protects its paging structures via RCU. Note, zapping
872 * will still flush on yield, but that's a minor performance
873 * blip and not a functional issue.
874 */
875 (void)zap_gfn_range(kvm, root, 0, -1ull, true, false, true);
4c6654bd
BG
876
877 /*
878 * Put the reference acquired in
879 * kvm_tdp_mmu_invalidate_all_roots().
880 */
881 kvm_tdp_mmu_put_root(kvm, root, true);
882
883 root = next_root;
884
885 rcu_read_lock();
886 }
887
888 rcu_read_unlock();
faaf05b0 889}
bb18842e 890
b7cccd39 891/*
f28e9c7f
SC
892 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
893 * is about to be zapped, e.g. in response to a memslots update. The caller is
894 * responsible for invoking kvm_tdp_mmu_zap_invalidated_roots() to do the actual
895 * zapping.
b7cccd39 896 *
f28e9c7f
SC
897 * Take a reference on all roots to prevent the root from being freed before it
898 * is zapped by this thread. Freeing a root is not a correctness issue, but if
899 * a vCPU drops the last reference to a root prior to the root being zapped, it
900 * will get stuck with tearing down the entire paging structure.
4c6654bd 901 *
f28e9c7f
SC
902 * Get a reference even if the root is already invalid, as
903 * kvm_tdp_mmu_zap_invalidated_roots() assumes it was gifted a reference to all
904 * invalid roots, e.g. there's no epoch to identify roots that were invalidated
905 * by a previous call. Roots stay on the list until the last reference is
906 * dropped, so even though all invalid roots are zapped, a root may not go away
907 * for quite some time, e.g. if a vCPU blocks across multiple memslot updates.
908 *
909 * Because mmu_lock is held for write, it should be impossible to observe a
910 * root with zero refcount, i.e. the list of roots cannot be stale.
4c6654bd 911 *
b7cccd39
BG
912 * This has essentially the same effect for the TDP MMU
913 * as updating mmu_valid_gen does for the shadow MMU.
914 */
915void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
916{
917 struct kvm_mmu_page *root;
918
919 lockdep_assert_held_write(&kvm->mmu_lock);
f28e9c7f
SC
920 list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
921 if (!WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
4c6654bd 922 root->role.invalid = true;
f28e9c7f 923 }
b7cccd39
BG
924}
925
bb18842e
BG
926/*
927 * Installs a last-level SPTE to handle a TDP page fault.
928 * (NPT/EPT violation/misconfiguration)
929 */
cdc47767
PB
930static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
931 struct kvm_page_fault *fault,
932 struct tdp_iter *iter)
bb18842e 933{
c435d4b7 934 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
bb18842e 935 u64 new_spte;
57a3e96d 936 int ret = RET_PF_FIXED;
ad67e480 937 bool wrprot = false;
bb18842e 938
7158bee4 939 WARN_ON(sp->role.level != fault->goal_level);
e710c5f6 940 if (unlikely(!fault->slot))
bb18842e 941 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
9a77daac 942 else
53597858 943 wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
2839180c 944 fault->pfn, iter->old_spte, fault->prefetch, true,
7158bee4 945 fault->map_writable, &new_spte);
bb18842e
BG
946
947 if (new_spte == iter->old_spte)
948 ret = RET_PF_SPURIOUS;
3e72c791 949 else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
9a77daac 950 return RET_PF_RETRY;
bb18842e
BG
951
952 /*
953 * If the page fault was caused by a write but the page is write
954 * protected, emulation is needed. If the emulation was skipped,
955 * the vCPU would have the same fault again.
956 */
ad67e480 957 if (wrprot) {
cdc47767 958 if (fault->write)
bb18842e 959 ret = RET_PF_EMULATE;
bb18842e
BG
960 }
961
962 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
9a77daac
BG
963 if (unlikely(is_mmio_spte(new_spte))) {
964 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
965 new_spte);
bb18842e 966 ret = RET_PF_EMULATE;
3849e092 967 } else {
9a77daac
BG
968 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
969 rcu_dereference(iter->sptep));
3849e092 970 }
bb18842e 971
857f8474
KH
972 /*
973 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
974 * consistent with legacy MMU behavior.
975 */
976 if (ret != RET_PF_SPURIOUS)
bb18842e
BG
977 vcpu->stat.pf_fixed++;
978
979 return ret;
980}
981
7b7e1ab6 982/*
cb00a70b
DM
983 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
984 * provided page table.
7b7e1ab6
DM
985 *
986 * @kvm: kvm instance
987 * @iter: a tdp_iter instance currently on the SPTE that should be set
988 * @sp: The new TDP page table to install.
989 * @account_nx: True if this page table is being installed to split a
990 * non-executable huge page.
cb00a70b 991 * @shared: This operation is running under the MMU lock in read mode.
7b7e1ab6
DM
992 *
993 * Returns: 0 if the new page table was installed. Non-0 if the page table
994 * could not be installed (e.g. the atomic compare-exchange failed).
995 */
cb00a70b
DM
996static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
997 struct kvm_mmu_page *sp, bool account_nx,
998 bool shared)
7b7e1ab6
DM
999{
1000 u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
cb00a70b 1001 int ret = 0;
7b7e1ab6 1002
cb00a70b
DM
1003 if (shared) {
1004 ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1005 if (ret)
1006 return ret;
1007 } else {
1008 tdp_mmu_set_spte(kvm, iter, spte);
1009 }
7b7e1ab6
DM
1010
1011 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1012 list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
1013 if (account_nx)
1014 account_huge_nx_page(kvm, sp);
1015 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1016
1017 return 0;
1018}
1019
bb18842e
BG
1020/*
1021 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1022 * page tables and SPTEs to translate the faulting guest physical address.
1023 */
2f6305dd 1024int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
bb18842e 1025{
bb18842e
BG
1026 struct kvm_mmu *mmu = vcpu->arch.mmu;
1027 struct tdp_iter iter;
89c0fd49 1028 struct kvm_mmu_page *sp;
bb18842e 1029 int ret;
bb18842e 1030
73a3c659 1031 kvm_mmu_hugepage_adjust(vcpu, fault);
bb18842e 1032
f0066d94 1033 trace_kvm_mmu_spte_requested(fault);
7cca2d0b
BG
1034
1035 rcu_read_lock();
1036
2f6305dd 1037 tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
73a3c659 1038 if (fault->nx_huge_page_workaround_enabled)
536f0e6a 1039 disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
bb18842e 1040
73a3c659 1041 if (iter.level == fault->goal_level)
bb18842e
BG
1042 break;
1043
1044 /*
1045 * If there is an SPTE mapping a large page at a higher level
1046 * than the target, that SPTE must be cleared and replaced
1047 * with a non-leaf SPTE.
1048 */
1049 if (is_shadow_present_pte(iter.old_spte) &&
1050 is_large_pte(iter.old_spte)) {
3e72c791 1051 if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
9a77daac 1052 break;
bb18842e 1053
bb18842e
BG
1054 /*
1055 * The iter must explicitly re-read the spte here
1056 * because the new value informs the !present
1057 * path below.
1058 */
7cca2d0b 1059 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
bb18842e
BG
1060 }
1061
1062 if (!is_shadow_present_pte(iter.old_spte)) {
7b7e1ab6
DM
1063 bool account_nx = fault->huge_page_disallowed &&
1064 fault->req_level >= iter.level;
1065
ff76d506 1066 /*
c4342633 1067 * If SPTE has been frozen by another thread, just
ff76d506
KH
1068 * give up and retry, avoiding unnecessary page table
1069 * allocation and free.
1070 */
1071 if (is_removed_spte(iter.old_spte))
1072 break;
1073
a82070b6
DM
1074 sp = tdp_mmu_alloc_sp(vcpu);
1075 tdp_mmu_init_child_sp(sp, &iter);
1076
cb00a70b 1077 if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
9a77daac
BG
1078 tdp_mmu_free_sp(sp);
1079 break;
1080 }
bb18842e
BG
1081 }
1082 }
1083
73a3c659 1084 if (iter.level != fault->goal_level) {
7cca2d0b 1085 rcu_read_unlock();
bb18842e 1086 return RET_PF_RETRY;
7cca2d0b 1087 }
bb18842e 1088
cdc47767 1089 ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
7cca2d0b 1090 rcu_read_unlock();
bb18842e
BG
1091
1092 return ret;
1093}
063afacd 1094
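/*
 * MMU notifier unmap hook: zap the GFN range described by @range in its
 * slot's address space, honoring range->may_block. Returns true if a TLB
 * flush is needed (including the incoming @flush).
 */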
3039bcc7
SC
1095bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1096 bool flush)
063afacd 1097{
83b83a02
SC
1098 return __kvm_tdp_mmu_zap_gfn_range(kvm, range->slot->as_id, range->start,
1099 range->end, range->may_block, flush);
063afacd
BG
1100}
1101
3039bcc7
SC
1102typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1103 struct kvm_gfn_range *range);
063afacd 1104
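/*
 * Invoke @handler on each leaf SPTE mapping a GFN in @range, for every root
 * in @range's address space, and return the OR of the handler's results.
 */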
3039bcc7
SC
1105static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1106 struct kvm_gfn_range *range,
1107 tdp_handler_t handler)
063afacd 1108{
3039bcc7
SC
1109 struct kvm_mmu_page *root;
1110 struct tdp_iter iter;
1111 bool ret = false;
1112
1113 rcu_read_lock();
1114
e1eed584
SC
1115 /*
1116 * Don't support rescheduling, none of the MMU notifiers that funnel
1117 * into this helper allow blocking; it'd be dead, wasteful code.
1118 */
3039bcc7
SC
1119 for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1120 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1121 ret |= handler(kvm, &iter, range);
1122 }
1123
1124 rcu_read_unlock();
1125
1126 return ret;
063afacd 1127}
f8e14497
BG
1128
1129/*
1130 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
1131 * if any of the GFNs in the range have been accessed.
1132 */
3039bcc7
SC
1133static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1134 struct kvm_gfn_range *range)
f8e14497 1135{
f8e14497
BG
1136 u64 new_spte = 0;
1137
3039bcc7
SC
1138 /* If we have a non-accessed entry we don't need to change the pte. */
1139 if (!is_accessed_spte(iter->old_spte))
1140 return false;
7cca2d0b 1141
3039bcc7
SC
1142 new_spte = iter->old_spte;
1143
1144 if (spte_ad_enabled(new_spte)) {
1145 new_spte &= ~shadow_accessed_mask;
1146 } else {
f8e14497 1147 /*
3039bcc7
SC
1148 * Capture the dirty status of the page, so that it doesn't get
1149 * lost when the SPTE is marked for access tracking.
f8e14497 1150 */
3039bcc7
SC
1151 if (is_writable_pte(new_spte))
1152 kvm_set_pfn_dirty(spte_to_pfn(new_spte));
f8e14497 1153
3039bcc7 1154 new_spte = mark_spte_for_access_track(new_spte);
f8e14497
BG
1155 }
1156
3039bcc7 1157 tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
7cca2d0b 1158
3039bcc7 1159 return true;
f8e14497
BG
1160}
1161
3039bcc7 1162bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
f8e14497 1163{
3039bcc7 1164 return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
f8e14497
BG
1165}
1166
3039bcc7
SC
1167static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1168 struct kvm_gfn_range *range)
f8e14497 1169{
3039bcc7 1170 return is_accessed_spte(iter->old_spte);
f8e14497
BG
1171}
1172
3039bcc7 1173bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
f8e14497 1174{
3039bcc7 1175 return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
f8e14497 1176}
1d8dd6b3 1177
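/*
 * change_pte handler for a single 4K GFN: zap the existing SPTE and, if the
 * new host PTE is read-only, install a new SPTE for the updated PFN.
 */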
3039bcc7
SC
1178static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1179 struct kvm_gfn_range *range)
1d8dd6b3 1180{
1d8dd6b3 1181 u64 new_spte;
7cca2d0b 1182
3039bcc7
SC
1183 /* Huge pages aren't expected to be modified without first being zapped. */
1184 WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1d8dd6b3 1185
3039bcc7
SC
1186 if (iter->level != PG_LEVEL_4K ||
1187 !is_shadow_present_pte(iter->old_spte))
1188 return false;
1d8dd6b3 1189
3039bcc7
SC
1190 /*
1191 * Note, when changing a read-only SPTE, it's not strictly necessary to
1192 * zero the SPTE before setting the new PFN, but doing so preserves the
1193 * invariant that the PFN of a present leaf SPTE can never change.
1194 * See __handle_changed_spte().
1195 */
1196 tdp_mmu_set_spte(kvm, iter, 0);
1d8dd6b3 1197
3039bcc7
SC
1198 if (!pte_write(range->pte)) {
1199 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1200 pte_pfn(range->pte));
1d8dd6b3 1201
3039bcc7 1202 tdp_mmu_set_spte(kvm, iter, new_spte);
1d8dd6b3
BG
1203 }
1204
3039bcc7 1205 return true;
1d8dd6b3
BG
1206}
1207
3039bcc7
SC
1208/*
1209 * Handle the changed_pte MMU notifier for the TDP MMU.
1210 * range->pte holds the new pte_t mapping the HVA specified by the MMU
1211 * notifier.
1212 * Returns true if a flush is needed before releasing the MMU lock.
1213 */
1214bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1d8dd6b3 1215{
3039bcc7
SC
1216 bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1217
1218 /* FIXME: return 'flush' instead of flushing here. */
1219 if (flush)
1220 kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
1221
1222 return false;
1d8dd6b3
BG
1223}
1224
a6a0b05d 1225/*
bedd9195
DM
1226 * Remove write access from all SPTEs at or above min_level that map GFNs
1227 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1228 * be flushed.
a6a0b05d
BG
1229 */
1230static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1231 gfn_t start, gfn_t end, int min_level)
1232{
1233 struct tdp_iter iter;
1234 u64 new_spte;
1235 bool spte_set = false;
1236
7cca2d0b
BG
1237 rcu_read_lock();
1238
a6a0b05d
BG
1239 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1240
77aa6075 1241 for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
24ae4cfa
BG
1242retry:
1243 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1af4a960
BG
1244 continue;
1245
a6a0b05d 1246 if (!is_shadow_present_pte(iter.old_spte) ||
0f99ee2c
BG
1247 !is_last_spte(iter.old_spte, iter.level) ||
1248 !(iter.old_spte & PT_WRITABLE_MASK))
a6a0b05d
BG
1249 continue;
1250
1251 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1252
3e72c791 1253 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
24ae4cfa 1254 goto retry;
3255530a 1255
a6a0b05d 1256 spte_set = true;
a6a0b05d 1257 }
7cca2d0b
BG
1258
1259 rcu_read_unlock();
a6a0b05d
BG
1260 return spte_set;
1261}
1262
1263/*
1264 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1265 * only affect leaf SPTEs down to min_level.
1266 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1267 */
269e9552
HM
1268bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1269 const struct kvm_memory_slot *slot, int min_level)
a6a0b05d
BG
1270{
1271 struct kvm_mmu_page *root;
a6a0b05d
BG
1272 bool spte_set = false;
1273
24ae4cfa 1274 lockdep_assert_held_read(&kvm->mmu_lock);
a6a0b05d 1275
d62007ed 1276 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
a6a0b05d
BG
1277 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1278 slot->base_gfn + slot->npages, min_level);
a6a0b05d
BG
1279
1280 return spte_set;
1281}
1282
a3fe5dbd
DM
1283static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1284{
1285 struct kvm_mmu_page *sp;
1286
1287 gfp |= __GFP_ZERO;
1288
1289 sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1290 if (!sp)
1291 return NULL;
1292
1293 sp->spt = (void *)__get_free_page(gfp);
1294 if (!sp->spt) {
1295 kmem_cache_free(mmu_page_header_cache, sp);
1296 return NULL;
1297 }
1298
1299 return sp;
1300}
1301
1302static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
cb00a70b
DM
1303 struct tdp_iter *iter,
1304 bool shared)
a3fe5dbd
DM
1305{
1306 struct kvm_mmu_page *sp;
1307
a3fe5dbd
DM
1308 /*
1309 * Since we are allocating while under the MMU lock we have to be
1310 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1311 * reclaim and to avoid making any filesystem callbacks (which can end
1312 * up invoking KVM MMU notifiers, resulting in a deadlock).
1313 *
1314 * If this allocation fails we drop the lock and retry with reclaim
1315 * allowed.
1316 */
1317 sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1318 if (sp)
1319 return sp;
1320
1321 rcu_read_unlock();
cb00a70b
DM
1322
1323 if (shared)
1324 read_unlock(&kvm->mmu_lock);
1325 else
1326 write_unlock(&kvm->mmu_lock);
a3fe5dbd
DM
1327
1328 iter->yielded = true;
1329 sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1330
cb00a70b
DM
1331 if (shared)
1332 read_lock(&kvm->mmu_lock);
1333 else
1334 write_lock(&kvm->mmu_lock);
1335
a3fe5dbd
DM
1336 rcu_read_lock();
1337
1338 return sp;
1339}
1340
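/*
 * Split the huge SPTE at @iter: initialize @sp as its child page table, fill
 * it with SPTEs mapping the same range at the next lower level, and link it
 * in place of the huge SPTE (atomically if @shared). On success, update the
 * page stats for the newly present child mappings.
 */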
cb00a70b
DM
1341static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1342 struct kvm_mmu_page *sp, bool shared)
a3fe5dbd
DM
1343{
1344 const u64 huge_spte = iter->old_spte;
1345 const int level = iter->level;
1346 int ret, i;
1347
1348 tdp_mmu_init_child_sp(sp, iter);
1349
1350 /*
1351 * No need for atomics when writing to sp->spt since the page table has
1352 * not been linked in yet and thus is not reachable from any other CPU.
1353 */
1354 for (i = 0; i < PT64_ENT_PER_PAGE; i++)
1355 sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);
1356
1357 /*
1358 * Replace the huge spte with a pointer to the populated lower level
1359 * page table. Since we are making this change without a TLB flush vCPUs
1360 * will see a mix of the split mappings and the original huge mapping,
1361 * depending on what's currently in their TLB. This is fine from a
1362 * correctness standpoint since the translation will be the same either
1363 * way.
1364 */
cb00a70b 1365 ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
a3fe5dbd 1366 if (ret)
e0b728b1 1367 goto out;
a3fe5dbd
DM
1368
1369 /*
1370 * tdp_mmu_link_sp() will handle subtracting the huge page we
1371 * are overwriting from the page stats. But we have to manually update
1372 * the page stats with the new present child pages.
1373 */
1374 kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);
1375
e0b728b1
DM
1376out:
1377 trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1378 return ret;
a3fe5dbd
DM
1379}
1380
1381static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1382 struct kvm_mmu_page *root,
1383 gfn_t start, gfn_t end,
cb00a70b 1384 int target_level, bool shared)
a3fe5dbd
DM
1385{
1386 struct kvm_mmu_page *sp = NULL;
1387 struct tdp_iter iter;
1388 int ret = 0;
1389
1390 rcu_read_lock();
1391
1392 /*
1393 * Traverse the page table splitting all huge pages above the target
1394 * level into one lower level. For example, if we encounter a 1GB page
1395 * we split it into 512 2MB pages.
1396 *
1397 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1398 * to visit an SPTE before ever visiting its children, which means we
1399 * will correctly recursively split huge pages that are more than one
1400 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1401 * and then splitting each of those to 512 4KB pages).
1402 */
1403 for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1404retry:
cb00a70b 1405 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
a3fe5dbd
DM
1406 continue;
1407
1408 if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1409 continue;
1410
1411 if (!sp) {
cb00a70b 1412 sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
a3fe5dbd
DM
1413 if (!sp) {
1414 ret = -ENOMEM;
e0b728b1
DM
1415 trace_kvm_mmu_split_huge_page(iter.gfn,
1416 iter.old_spte,
1417 iter.level, ret);
a3fe5dbd
DM
1418 break;
1419 }
1420
1421 if (iter.yielded)
1422 continue;
1423 }
1424
cb00a70b 1425 if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
a3fe5dbd
DM
1426 goto retry;
1427
1428 sp = NULL;
1429 }
1430
1431 rcu_read_unlock();
1432
1433 /*
1434 * It's possible to exit the loop having never used the last sp if, for
1435 * example, a vCPU doing HugePage NX splitting wins the race and
1436 * installs its own sp in place of the last sp we tried to split.
1437 */
1438 if (sp)
1439 tdp_mmu_free_sp(sp);
1440
a3fe5dbd
DM
1441 return ret;
1442}
1443
cb00a70b 1444
a3fe5dbd
DM
1445/*
1446 * Try to split all huge pages mapped by the TDP MMU down to the target level.
1447 */
1448void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1449 const struct kvm_memory_slot *slot,
1450 gfn_t start, gfn_t end,
cb00a70b 1451 int target_level, bool shared)
a3fe5dbd
DM
1452{
1453 struct kvm_mmu_page *root;
1454 int r = 0;
1455
cb00a70b 1456 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
a3fe5dbd 1457
cb00a70b
DM
1458 for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
1459 r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
a3fe5dbd 1460 if (r) {
cb00a70b 1461 kvm_tdp_mmu_put_root(kvm, root, shared);
a3fe5dbd
DM
1462 break;
1463 }
1464 }
1465}
1466
a6a0b05d
BG
1467/*
1468 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1469 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1470 * If AD bits are not enabled, this will require clearing the writable bit on
1471 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1472 * be flushed.
1473 */
1474static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1475 gfn_t start, gfn_t end)
1476{
1477 struct tdp_iter iter;
1478 u64 new_spte;
1479 bool spte_set = false;
1480
7cca2d0b
BG
1481 rcu_read_lock();
1482
a6a0b05d 1483 tdp_root_for_each_leaf_pte(iter, root, start, end) {
24ae4cfa
BG
1484retry:
1485 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1af4a960
BG
1486 continue;
1487
3354ef5a
SC
1488 if (!is_shadow_present_pte(iter.old_spte))
1489 continue;
1490
a6a0b05d
BG
1491 if (spte_ad_need_write_protect(iter.old_spte)) {
1492 if (is_writable_pte(iter.old_spte))
1493 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1494 else
1495 continue;
1496 } else {
1497 if (iter.old_spte & shadow_dirty_mask)
1498 new_spte = iter.old_spte & ~shadow_dirty_mask;
1499 else
1500 continue;
1501 }
1502
3e72c791 1503 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
24ae4cfa 1504 goto retry;
3255530a 1505
a6a0b05d 1506 spte_set = true;
a6a0b05d 1507 }
7cca2d0b
BG
1508
1509 rcu_read_unlock();
a6a0b05d
BG
1510 return spte_set;
1511}
1512
1513/*
1514 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1515 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1516 * If AD bits are not enabled, this will require clearing the writable bit on
1517 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1518 * be flushed.
1519 */
269e9552
HM
1520bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1521 const struct kvm_memory_slot *slot)
a6a0b05d
BG
1522{
1523 struct kvm_mmu_page *root;
a6a0b05d
BG
1524 bool spte_set = false;
1525
24ae4cfa 1526 lockdep_assert_held_read(&kvm->mmu_lock);
a6a0b05d 1527
d62007ed 1528 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
a6a0b05d
BG
1529 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1530 slot->base_gfn + slot->npages);
a6a0b05d
BG
1531
1532 return spte_set;
1533}
1534
1535/*
1536 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1537 * set in mask, starting at gfn. The given memslot is expected to contain all
1538 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1539 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1540 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1541 */
1542static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1543 gfn_t gfn, unsigned long mask, bool wrprot)
1544{
1545 struct tdp_iter iter;
1546 u64 new_spte;
1547
7cca2d0b
BG
1548 rcu_read_lock();
1549
a6a0b05d
BG
1550 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1551 gfn + BITS_PER_LONG) {
1552 if (!mask)
1553 break;
1554
1555 if (iter.level > PG_LEVEL_4K ||
1556 !(mask & (1UL << (iter.gfn - gfn))))
1557 continue;
1558
f1b3b06a
BG
1559 mask &= ~(1UL << (iter.gfn - gfn));
1560
a6a0b05d
BG
1561 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1562 if (is_writable_pte(iter.old_spte))
1563 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1564 else
1565 continue;
1566 } else {
1567 if (iter.old_spte & shadow_dirty_mask)
1568 new_spte = iter.old_spte & ~shadow_dirty_mask;
1569 else
1570 continue;
1571 }
1572
1573 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
a6a0b05d 1574 }
7cca2d0b
BG
1575
1576 rcu_read_unlock();
a6a0b05d
BG
1577}
1578
1579/*
1580 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1581 * set in mask, starting at gfn. The given memslot is expected to contain all
1582 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1583 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1584 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1585 */
1586void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1587 struct kvm_memory_slot *slot,
1588 gfn_t gfn, unsigned long mask,
1589 bool wrprot)
1590{
1591 struct kvm_mmu_page *root;
a6a0b05d 1592
531810ca 1593 lockdep_assert_held_write(&kvm->mmu_lock);
a3f15bda 1594 for_each_tdp_mmu_root(kvm, root, slot->as_id)
a6a0b05d 1595 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
a6a0b05d
BG
1596}
1597
14881998 1598/*
87aa9ec9
BG
1599 * Clear leaf entries which could be replaced by large mappings, for
1600 * GFNs within the slot.
14881998 1601 */
4b85c921 1602static void zap_collapsible_spte_range(struct kvm *kvm,
14881998 1603 struct kvm_mmu_page *root,
4b85c921 1604 const struct kvm_memory_slot *slot)
14881998 1605{
9eba50f8
SC
1606 gfn_t start = slot->base_gfn;
1607 gfn_t end = start + slot->npages;
14881998
BG
1608 struct tdp_iter iter;
1609 kvm_pfn_t pfn;
14881998 1610
7cca2d0b
BG
1611 rcu_read_lock();
1612
14881998 1613 tdp_root_for_each_pte(iter, root, start, end) {
2db6f772 1614retry:
4b85c921 1615 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1af4a960 1616 continue;
1af4a960 1617
14881998 1618 if (!is_shadow_present_pte(iter.old_spte) ||
87aa9ec9 1619 !is_last_spte(iter.old_spte, iter.level))
14881998
BG
1620 continue;
1621
1622 pfn = spte_to_pfn(iter.old_spte);
1623 if (kvm_is_reserved_pfn(pfn) ||
9eba50f8
SC
1624 iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1625 pfn, PG_LEVEL_NUM))
14881998
BG
1626 continue;
1627
4b85c921 1628 /* Note, a successful atomic zap also does a remote TLB flush. */
3e72c791 1629 if (tdp_mmu_zap_spte_atomic(kvm, &iter))
2db6f772 1630 goto retry;
14881998
BG
1631 }
1632
7cca2d0b 1633 rcu_read_unlock();
14881998
BG
1634}
1635
1636/*
1637 * Zap leaf SPTEs which could be replaced by large mappings, for GFNs
1638 * within the slot, so that they can be recreated as huge mappings.
1639 */
4b85c921
SC
1640void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1641 const struct kvm_memory_slot *slot)
14881998
BG
1642{
1643 struct kvm_mmu_page *root;
14881998 1644
2db6f772 1645 lockdep_assert_held_read(&kvm->mmu_lock);
14881998 1646
d62007ed 1647 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
4b85c921 1648 zap_collapsible_spte_range(kvm, root, slot);
14881998 1649}
46044f72
BG
1650
1651/*
1652 * Removes write access on the last level SPTE mapping this GFN and unsets the
5fc3424f 1653 * MMU-writable bit to ensure future writes continue to be intercepted.
46044f72
BG
1654 * Returns true if an SPTE was set and a TLB flush is needed.
1655 */
1656static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
3ad93562 1657 gfn_t gfn, int min_level)
46044f72
BG
1658{
1659 struct tdp_iter iter;
1660 u64 new_spte;
1661 bool spte_set = false;
1662
3ad93562
KZ
1663 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1664
7cca2d0b
BG
1665 rcu_read_lock();
1666
77aa6075 1667 for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
3ad93562
KZ
1668 if (!is_shadow_present_pte(iter.old_spte) ||
1669 !is_last_spte(iter.old_spte, iter.level))
1670 continue;
1671
46044f72 1672 new_spte = iter.old_spte &
5fc3424f 1673 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
46044f72 1674
7c8a4742
DM
1675 if (new_spte == iter.old_spte)
1676 break;
1677
46044f72
BG
1678 tdp_mmu_set_spte(kvm, &iter, new_spte);
1679 spte_set = true;
1680 }
1681
7cca2d0b
BG
1682 rcu_read_unlock();
1683
46044f72
BG
1684 return spte_set;
1685}
1686
1687/*
1688 * Removes write access on the last level SPTE mapping this GFN and unsets the
5fc3424f 1689 * MMU-writable bit to ensure future writes continue to be intercepted.
46044f72
BG
1690 * Returns true if an SPTE was set and a TLB flush is needed.
1691 */
1692bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
3ad93562
KZ
1693 struct kvm_memory_slot *slot, gfn_t gfn,
1694 int min_level)
46044f72
BG
1695{
1696 struct kvm_mmu_page *root;
46044f72
BG
1697 bool spte_set = false;
1698
531810ca 1699 lockdep_assert_held_write(&kvm->mmu_lock);
a3f15bda 1700 for_each_tdp_mmu_root(kvm, root, slot->as_id)
3ad93562 1701 spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
a3f15bda 1702
46044f72
BG
1703 return spte_set;
1704}
1705
95fb5b02
BG
1706/*
1707 * Return the level of the lowest level SPTE added to sptes.
1708 * That SPTE may be non-present.
c5c8c7c5
DM
1709 *
1710 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
95fb5b02 1711 */
39b4d43e
SC
1712int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1713 int *root_level)
95fb5b02
BG
1714{
1715 struct tdp_iter iter;
1716 struct kvm_mmu *mmu = vcpu->arch.mmu;
95fb5b02 1717 gfn_t gfn = addr >> PAGE_SHIFT;
2aa07893 1718 int leaf = -1;
95fb5b02 1719
39b4d43e 1720 *root_level = vcpu->arch.mmu->shadow_root_level;
95fb5b02
BG
1721
1722 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1723 leaf = iter.level;
dde81f94 1724 sptes[leaf] = iter.old_spte;
95fb5b02
BG
1725 }
1726
1727 return leaf;
1728}
6e8eb206
DM
1729
1730/*
1731 * Returns the last level spte pointer of the shadow page walk for the given
1732 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1733 * walk could be performed, returns NULL and *spte does not contain valid data.
1734 *
1735 * Contract:
1736 * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1737 * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1738 *
1739 * WARNING: This function is only intended to be called during fast_page_fault.
1740 */
1741u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1742 u64 *spte)
1743{
1744 struct tdp_iter iter;
1745 struct kvm_mmu *mmu = vcpu->arch.mmu;
1746 gfn_t gfn = addr >> PAGE_SHIFT;
1747 tdp_ptep_t sptep = NULL;
1748
1749 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1750 *spte = iter.old_spte;
1751 sptep = iter.sptep;
1752 }
1753
1754 /*
1755 * Perform the rcu_dereference to get the raw spte pointer value since
1756 * we are passing it up to fast_page_fault, which is shared with the
1757 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1758 * annotation.
1759 *
1760 * This is safe since fast_page_fault obeys the contracts of this
1761 * function as well as all TDP MMU contracts around modifying SPTEs
1762 * outside of mmu_lock.
1763 */
1764 return rcu_dereference(sptep);
1765}