KVM: x86/mmu: Refactor low-level TDP MMU set SPTE helper to take raw values
arch/x86/kvm/mmu/tdp_mmu.c
// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

static bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);

/* Initializes the TDP MMU for the VM, if enabled. */
bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
		return false;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;

	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);

	return true;
}

/* Arbitrarily returns true so that this may be used in if statements. */
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	return true;
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.
	 */
	rcu_barrier();
}

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush,
			  bool shared);

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
			  bool shared)
{
	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	WARN_ON(!root->tdp_mmu_page);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

	/*
	 * A TLB flush is not necessary as KVM performs a local TLB flush when
	 * allocating a new root (see kvm_mmu_load()), and when migrating a
	 * vCPU to a different pCPU. Note, the local TLB flush on reuse also
	 * invalidates any paging-structure-cache entries, i.e. TLB entries for
	 * intermediate paging structures, that may be zapped, as such entries
	 * are associated with the ASID on both VMX and SVM.
	 */
	(void)zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);

	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/*
 * Returns the next root after @prev_root (or the first root if @prev_root is
 * NULL). A reference to the returned root is acquired, and the reference to
 * @prev_root is released (the caller obviously must hold a reference to
 * @prev_root if it's non-NULL).
 *
 * If @only_valid is true, invalid roots are skipped.
 *
 * Returns NULL if the end of tdp_mmu_roots was reached.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
					      struct kvm_mmu_page *prev_root,
					      bool shared, bool only_valid)
{
	struct kvm_mmu_page *next_root;

	rcu_read_lock();

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root) {
		if ((!only_valid || !next_root->role.invalid) &&
		    kvm_tdp_mmu_get_root(next_root))
			break;

		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
				&next_root->link, typeof(*next_root), link);
	}

	rcu_read_unlock();

	if (prev_root)
		kvm_tdp_mmu_put_root(kvm, prev_root, shared);

	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode. In the unlikely event that this thread must free a root, the lock
 * will be temporarily dropped and reacquired in write mode.
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
	     _root;								\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
		    kvm_mmu_page_as_id(_root) != _as_id) {			\
		} else

#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)

#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)

/*
 * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
 * the implication being that any flow that holds mmu_lock for read is
 * inherently yield-friendly and should use the yield-safe variant above.
 * Holding mmu_lock for write obviates the need for RCU protection as the list
 * is guaranteed to be stable.
 */
#define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
		    kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else
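
/*
 * Illustrative sketch, not part of the original file: callers walk the roots
 * for one address space with the yield-safe variant while holding mmu_lock,
 * mirroring __kvm_tdp_mmu_zap_gfn_range() below:
 *
 *	struct kvm_mmu_page *root;
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
 *		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
 *				      false);
 */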

static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);

	return sp;
}

static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, gfn_t gfn,
			    union kvm_mmu_page_role role)
{
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role = role;
	sp->gfn = gfn;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);
}

static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
				  struct tdp_iter *iter)
{
	struct kvm_mmu_page *parent_sp;
	union kvm_mmu_page_role role;

	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));

	role = parent_sp->role;
	role.level--;

	tdp_mmu_init_sp(child_sp, iter->gfn, role);
}

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * Check for an existing root before allocating a new one. Note, the
	 * role check prevents consuming an invalid root.
	 */
	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
		if (root->role.word == role.word &&
		    kvm_tdp_mmu_get_root(root))
			goto out;
	}

	root = tdp_mmu_alloc_sp(vcpu);
	tdp_mmu_init_sp(root, 0, role);

	refcount_set(&root->tdp_mmu_root_count, 1);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

out:
	return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(kvm, slot, gfn);
	}
}

/**
 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 */
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool shared)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_del(&sp->link);
	if (sp->lpage_disallowed)
		unaccount_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * handle_removed_pt() - handle a page table removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_sp(kvm, sp, shared);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		u64 *sptep = rcu_dereference(pt) + i;
		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
		u64 old_child_spte;

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so
			 * set the SPTE until it is set from some other
			 * value to the removed SPTE value.
			 */
			for (;;) {
				old_child_spte = xchg(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_child_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_child_spte = READ_ONCE(*sptep);
			if (!is_shadow_present_pte(old_child_spte))
				continue;

			/*
			 * Marking the SPTE as a removed SPTE is not
			 * strictly necessary here as the MMU lock will
			 * stop other threads from concurrently modifying
			 * this SPTE. Using the removed SPTE value keeps
			 * the two branches consistent and simplifies
			 * the function.
			 */
			WRITE_ONCE(*sptep, REMOVED_SPTE);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_child_spte, REMOVED_SPTE, level,
				    shared);
	}

	kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
					   KVM_PAGES_PER_HPAGE(level + 1));

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/**
 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level,
				  bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	if (is_leaf)
		check_spte_writable_invariants(new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or removed SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) &&
			    !is_mmio_spte(new_spte) &&
			    !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (is_leaf != was_leaf)
		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure. Note the WARN on the PFN changing without the
	 * SPTE being converted to a hugepage (leaf) or being zapped. Shadow
	 * pages are kernel allocations and should never be migrated.
	 */
	if (was_present && !was_leaf &&
	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
			      shared);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping. Do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * If setting the SPTE fails because it has changed, iter->old_spte will be
 * refreshed to the current value of the spte.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Return:
 * * 0      - If the SPTE was set.
 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
 *            no side-effects other than setting iter->old_spte to the last
 *            known value of the spte.
 */
static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter,
					  u64 new_spte)
{
	u64 *sptep = rcu_dereference(iter->sptep);
	u64 old_spte;

	WARN_ON_ONCE(iter->yielded);

	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Do not change removed SPTEs. Only the thread that froze the SPTE
	 * may modify it.
	 */
	if (is_removed_spte(iter->old_spte))
		return -EBUSY;

	/*
	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
	 * does not hold the mmu_lock.
	 */
	old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
	if (old_spte != iter->old_spte) {
		/*
		 * The page table entry was modified by a different logical
		 * CPU. Refresh iter->old_spte with the current value so the
		 * caller operates on fresh data, e.g. if it retries
		 * tdp_mmu_set_spte_atomic().
		 */
		iter->old_spte = old_spte;
		return -EBUSY;
	}

	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, true);
	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);

	return 0;
}
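
/*
 * Illustrative sketch, not part of the original file: callers running under
 * the read-mode MMU lock typically retry on -EBUSY using the refreshed
 * iter->old_spte, as wrprot_gfn_range() below does:
 *
 *	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
 * retry:
 *		...
 *		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
 *		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
 *			goto retry;
 *	}
 */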

static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter)
{
	int ret;

	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value. This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
	if (ret)
		return ret;

	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
					   KVM_PAGES_PER_HPAGE(iter->level));

	/*
	 * No other thread can overwrite the removed SPTE as they
	 * must either wait on the MMU lock or use
	 * tdp_mmu_set_spte_atomic which will not overwrite the
	 * special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present
	 * to non-present.
	 */
	kvm_tdp_mmu_write_spte(iter->sptep, 0);

	return 0;
}

/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm:	      KVM instance
 * @as_id:	      Address space ID, i.e. regular vs. SMM
 * @sptep:	      Pointer to the SPTE
 * @old_spte:	      The current value of the SPTE
 * @new_spte:	      The new value that will be set for the SPTE
 * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
 * @level:	      The level _containing_ the SPTE (its parent PT's level)
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *		      of the page. Should be set unless handling an MMU
 *		      notifier for access tracking. Leaving record_acc_track
 *		      unset in that case prevents page accesses from being
 *		      double counted.
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *		      appropriate for the change being made. Should be set
 *		      unless performing certain dirty logging operations.
 *		      Leaving record_dirty_log unset in that case prevents page
 *		      writes from being double counted.
 */
static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
			       u64 old_spte, u64 new_spte, gfn_t gfn, int level,
			       bool record_acc_track, bool record_dirty_log)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to or from the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used. If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
	 */
	WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));

	kvm_tdp_mmu_write_spte(sptep, new_spte);

	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);

	if (record_acc_track)
		handle_changed_spte_acc_track(old_spte, new_spte, level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
					      new_spte, level);
}

static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				     u64 new_spte, bool record_acc_track,
				     bool record_dirty_log)
{
	WARN_ON_ONCE(iter->yielded);

	__tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte,
			   new_spte, iter->gfn, iter->level,
			   record_acc_track, record_dirty_log);
}

static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}
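
/*
 * Illustrative note, not part of the original file: the wrappers above select
 * which bookkeeping to skip. For example, the aging path below uses the
 * _no_acc_track() variant so that clearing the accessed state is not itself
 * reported back as an access:
 *
 *	new_spte = mark_spte_for_access_track(iter->old_spte);
 *	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
 */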

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)	\
		if (!is_shadow_present_pte(_iter.old_spte) ||	\
		    !is_last_spte(_iter.old_spte, _iter.level))	\
			continue;				\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, iter->yielded is set and the caller must skip to
 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
 * over the paging structures to allow the iterator to continue its traversal
 * from the paging structure root.
 *
 * Returns true if this function yielded.
 */
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
							  struct tdp_iter *iter,
							  bool flush, bool shared)
{
	WARN_ON(iter->yielded);

	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		rcu_read_unlock();

		if (flush)
			kvm_flush_remote_tlbs(kvm);

		if (shared)
			cond_resched_rwlock_read(&kvm->mmu_lock);
		else
			cond_resched_rwlock_write(&kvm->mmu_lock);

		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);

		iter->yielded = true;
	}

	return iter->yielded;
}
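
/*
 * Illustrative sketch, not part of the original file, mirroring
 * zap_gfn_range() below: callers that accumulate a pending TLB flush pass it
 * in and clear it when the helper yields, since yielding flushed on their
 * behalf:
 *
 *	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
 *		if (can_yield &&
 *		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
 *			flush = false;
 *			continue;
 *		}
 *		...
 *	}
 */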

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 *
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 *
 * If shared is true, this thread holds the MMU lock in read mode and must
 * account for the possibility that other threads are modifying the paging
 * structures concurrently. If shared is false, this thread should hold the
 * MMU lock in write mode.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush,
			  bool shared)
{
	gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	bool zap_all = (start == 0 && end >= max_gfn_host);
	struct tdp_iter iter;

	/*
	 * No need to try to step down in the iterator when zapping all SPTEs,
	 * zapping the top-level non-leaf SPTEs will recurse on their children.
	 */
	int min_level = zap_all ? root->role.level : PG_LEVEL_4K;

	/*
	 * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
	 * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
	 * and so KVM will never install a SPTE for such addresses.
	 */
	end = min(end, max_gfn_host);

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
retry:
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level, except when zapping all SPTEs.
		 */
		if (!zap_all &&
		    (iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		if (!shared) {
			tdp_mmu_set_spte(kvm, &iter, 0);
			flush = true;
		} else if (tdp_mmu_zap_spte_atomic(kvm, &iter)) {
			goto retry;
		}
	}

	rcu_read_unlock();
	return flush;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
				 gfn_t end, bool can_yield, bool flush)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
				      false);

	return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	bool flush = false;
	int i;

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush);

	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
						  struct kvm_mmu_page *prev_root)
{
	struct kvm_mmu_page *next_root;

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root && !(next_root->role.invalid &&
			      refcount_read(&next_root->tdp_mmu_root_count)))
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &next_root->link,
						  typeof(*next_root), link);

	return next_root;
}

/*
 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
 * zap" completes. Since kvm_tdp_mmu_invalidate_all_roots() has acquired a
 * reference to each invalidated root, roots will not be freed until after this
 * function drops the gifted reference, e.g. so that vCPUs don't get stuck with
 * tearing down paging structures.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *next_root;
	struct kvm_mmu_page *root;

	lockdep_assert_held_read(&kvm->mmu_lock);

	rcu_read_lock();

	root = next_invalidated_root(kvm, NULL);

	while (root) {
		next_root = next_invalidated_root(kvm, root);

		rcu_read_unlock();

		/*
		 * A TLB flush is unnecessary, invalidated roots are guaranteed
		 * to be unreachable by the guest (see kvm_tdp_mmu_put_root()
		 * for more details), and unlike the legacy MMU, no vCPU kick
		 * is needed to play nice with lockless shadow walks as the TDP
		 * MMU protects its paging structures via RCU. Note, zapping
		 * will still flush on yield, but that's a minor performance
		 * blip and not a functional issue.
		 */
		(void)zap_gfn_range(kvm, root, 0, -1ull, true, false, true);

		/*
		 * Put the reference acquired in
		 * kvm_tdp_mmu_invalidate_all_roots().
		 */
		kvm_tdp_mmu_put_root(kvm, root, true);

		root = next_root;

		rcu_read_lock();
	}

	rcu_read_unlock();
}

/*
 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
 * is about to be zapped, e.g. in response to a memslots update. The caller is
 * responsible for invoking kvm_tdp_mmu_zap_invalidated_roots() to do the actual
 * zapping.
 *
 * Take a reference on all roots to prevent the root from being freed before it
 * is zapped by this thread. Freeing a root is not a correctness issue, but if
 * a vCPU drops the last reference to a root prior to the root being zapped, it
 * will get stuck with tearing down the entire paging structure.
 *
 * Get a reference even if the root is already invalid,
 * kvm_tdp_mmu_zap_invalidated_roots() assumes it was gifted a reference to all
 * invalid roots, e.g. there's no epoch to identify roots that were invalidated
 * by a previous call. Roots stay on the list until the last reference is
 * dropped, so even though all invalid roots are zapped, a root may not go away
 * for quite some time, e.g. if a vCPU blocks across multiple memslot updates.
 *
 * Because mmu_lock is held for write, it should be impossible to observe a
 * root with zero refcount, i.e. the list of roots cannot be stale.
 *
 * This has essentially the same effect for the TDP MMU
 * as updating mmu_valid_gen does for the shadow MMU.
 */
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
		if (!WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
			root->role.invalid = true;
	}
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
					   struct kvm_page_fault *fault,
					   struct tdp_iter *iter)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
	u64 new_spte;
	int ret = RET_PF_FIXED;
	bool wrprot = false;

	WARN_ON(sp->role.level != fault->goal_level);
	if (unlikely(!fault->slot))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
				   fault->pfn, iter->old_spte, fault->prefetch, true,
				   fault->map_writable, &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (wrprot) {
		if (fault->write)
			ret = RET_PF_EMULATE;
	}

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte))) {
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else {
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	/*
	 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
	 * consistent with legacy MMU behavior.
	 */
	if (ret != RET_PF_SPURIOUS)
		vcpu->stat.pf_fixed++;

	return ret;
}

/*
 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
 * provided page table.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @sp: The new TDP page table to install.
 * @account_nx: True if this page table is being installed to split a
 *		non-executable huge page.
 * @shared: This operation is running under the MMU lock in read mode.
 *
 * Returns: 0 if the new page table was installed. Non-0 if the page table
 *	    could not be installed (e.g. the atomic compare-exchange failed).
 */
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
			   struct kvm_mmu_page *sp, bool account_nx,
			   bool shared)
{
	u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
	int ret = 0;

	if (shared) {
		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
		if (ret)
			return ret;
	} else {
		tdp_mmu_set_spte(kvm, iter, spte);
	}

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
	if (account_nx)
		account_huge_nx_page(kvm, sp);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

	return 0;
}
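
/*
 * Illustrative sketch, not part of the original file, mirroring
 * kvm_tdp_mmu_map() below: a non-present SPTE is filled by allocating a child
 * page table, initializing it from the iterator, and linking it in; on
 * failure the unused page is freed:
 *
 *	sp = tdp_mmu_alloc_sp(vcpu);
 *	tdp_mmu_init_child_sp(sp, &iter);
 *	if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true))
 *		tdp_mmu_free_sp(sp);
 */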

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	int ret;

	kvm_mmu_hugepage_adjust(vcpu, fault);

	trace_kvm_mmu_spte_requested(fault);

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
		if (fault->nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);

		if (iter.level == fault->goal_level)
			break;

		/*
		 * If there is an SPTE mapping a large page at a higher level
		 * than the target, that SPTE must be cleared and replaced
		 * with a non-leaf SPTE.
		 */
		if (is_shadow_present_pte(iter.old_spte) &&
		    is_large_pte(iter.old_spte)) {
			if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
				break;

			/*
			 * The iter must explicitly re-read the spte here
			 * because the new value informs the !present
			 * path below.
			 */
			iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
		}

		if (!is_shadow_present_pte(iter.old_spte)) {
			bool account_nx = fault->huge_page_disallowed &&
					  fault->req_level >= iter.level;

			/*
			 * If SPTE has been frozen by another thread, just
			 * give up and retry, avoiding unnecessary page table
			 * allocation and free.
			 */
			if (is_removed_spte(iter.old_spte))
				break;

			sp = tdp_mmu_alloc_sp(vcpu);
			tdp_mmu_init_child_sp(sp, &iter);

			if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
				tdp_mmu_free_sp(sp);
				break;
			}
		}
	}

	if (iter.level != fault->goal_level) {
		rcu_read_unlock();
		return RET_PF_RETRY;
	}

	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
	rcu_read_unlock();

	return ret;
}

bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	return __kvm_tdp_mmu_zap_gfn_range(kvm, range->slot->as_id, range->start,
					   range->end, range->may_block, flush);
}

typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
			      struct kvm_gfn_range *range);

static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
						   struct kvm_gfn_range *range,
						   tdp_handler_t handler)
{
	struct kvm_mmu_page *root;
	struct tdp_iter iter;
	bool ret = false;

	/*
	 * Don't support rescheduling, none of the MMU notifiers that funnel
	 * into this helper allow blocking; it'd be dead, wasteful code.
	 */
	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
		rcu_read_lock();

		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
			ret |= handler(kvm, &iter, range);

		rcu_read_unlock();
	}

	return ret;
}

/*
 * Mark the SPTEs mapping the range of GFNs [start, end) unaccessed and return
 * non-zero if any of the GFNs in the range have been accessed.
 */
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
			  struct kvm_gfn_range *range)
{
	u64 new_spte = 0;

	/* If we have a non-accessed entry we don't need to change the pte. */
	if (!is_accessed_spte(iter->old_spte))
		return false;

	new_spte = iter->old_spte;

	if (spte_ad_enabled(new_spte)) {
		new_spte &= ~shadow_accessed_mask;
	} else {
		/*
		 * Capture the dirty status of the page, so that it doesn't get
		 * lost when the SPTE is marked for access tracking.
		 */
		if (is_writable_pte(new_spte))
			kvm_set_pfn_dirty(spte_to_pfn(new_spte));

		new_spte = mark_spte_for_access_track(new_spte);
	}

	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);

	return true;
}

bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}

static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	return is_accessed_spte(iter->old_spte);
}

bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}

static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	u64 new_spte;

	/* Huge pages aren't expected to be modified without first being zapped. */
	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);

	if (iter->level != PG_LEVEL_4K ||
	    !is_shadow_present_pte(iter->old_spte))
		return false;

	/*
	 * Note, when changing a read-only SPTE, it's not strictly necessary to
	 * zero the SPTE before setting the new PFN, but doing so preserves the
	 * invariant that the PFN of a present leaf SPTE can never change.
	 * See __handle_changed_spte().
	 */
	tdp_mmu_set_spte(kvm, iter, 0);

	if (!pte_write(range->pte)) {
		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
								  pte_pfn(range->pte));

		tdp_mmu_set_spte(kvm, iter, new_spte);
	}

	return true;
}

/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
 */
bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	/*
	 * No need to handle the remote TLB flush under RCU protection, the
	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
	 * shadow page. See the WARN on pfn_changed in __handle_changed_spte().
	 */
	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
}

/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
			goto retry;

		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
			     const struct kvm_memory_slot *slot, int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
			     slot->base_gfn + slot->npages, min_level);

	return spte_set;
}

static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
{
	struct kvm_mmu_page *sp;

	gfp |= __GFP_ZERO;

	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
	if (!sp)
		return NULL;

	sp->spt = (void *)__get_free_page(gfp);
	if (!sp->spt) {
		kmem_cache_free(mmu_page_header_cache, sp);
		return NULL;
	}

	return sp;
}

static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
						       struct tdp_iter *iter,
						       bool shared)
{
	struct kvm_mmu_page *sp;

	/*
	 * Since we are allocating while under the MMU lock we have to be
	 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
	 * reclaim and to avoid making any filesystem callbacks (which can end
	 * up invoking KVM MMU notifiers, resulting in a deadlock).
	 *
	 * If this allocation fails we drop the lock and retry with reclaim
	 * allowed.
	 */
	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
	if (sp)
		return sp;

	rcu_read_unlock();

	if (shared)
		read_unlock(&kvm->mmu_lock);
	else
		write_unlock(&kvm->mmu_lock);

	iter->yielded = true;
	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);

	if (shared)
		read_lock(&kvm->mmu_lock);
	else
		write_lock(&kvm->mmu_lock);

	rcu_read_lock();

	return sp;
}

static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared)
{
	const u64 huge_spte = iter->old_spte;
	const int level = iter->level;
	int ret, i;

	tdp_mmu_init_child_sp(sp, iter);

	/*
	 * No need for atomics when writing to sp->spt since the page table has
	 * not been linked in yet and thus is not reachable from any other CPU.
	 */
	for (i = 0; i < PT64_ENT_PER_PAGE; i++)
		sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);

	/*
	 * Replace the huge spte with a pointer to the populated lower level
	 * page table. Since we are making this change without a TLB flush vCPUs
	 * will see a mix of the split mappings and the original huge mapping,
	 * depending on what's currently in their TLB. This is fine from a
	 * correctness standpoint since the translation will be the same either
	 * way.
	 */
	ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
	if (ret)
		goto out;

	/*
	 * tdp_mmu_link_sp() will handle subtracting the huge page we
	 * are overwriting from the page stats. But we have to manually update
	 * the page stats with the new present child pages.
	 */
	kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);

out:
	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
	return ret;
}

static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
					 struct kvm_mmu_page *root,
					 gfn_t start, gfn_t end,
					 int target_level, bool shared)
{
	struct kvm_mmu_page *sp = NULL;
	struct tdp_iter iter;
	int ret = 0;

	rcu_read_lock();

	/*
	 * Traverse the page table splitting all huge pages above the target
	 * level into one lower level. For example, if we encounter a 1GB page
	 * we split it into 512 2MB pages.
	 *
	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
	 * to visit an SPTE before ever visiting its children, which means we
	 * will correctly recursively split huge pages that are more than one
	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
	 * and then splitting each of those to 512 4KB pages).
	 */
	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
			continue;

		if (!sp) {
			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
			if (!sp) {
				ret = -ENOMEM;
				trace_kvm_mmu_split_huge_page(iter.gfn,
							      iter.old_spte,
							      iter.level, ret);
				break;
			}

			if (iter.yielded)
				continue;
		}

		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
			goto retry;

		sp = NULL;
	}

	rcu_read_unlock();

	/*
	 * It's possible to exit the loop having never used the last sp if, for
	 * example, a vCPU doing HugePage NX splitting wins the race and
	 * installs its own sp in place of the last sp we tried to split.
	 */
	if (sp)
		tdp_mmu_free_sp(sp);

	return ret;
}

/*
 * Try to split all huge pages mapped by the TDP MMU down to the target level.
 */
void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
				      const struct kvm_memory_slot *slot,
				      gfn_t start, gfn_t end,
				      int target_level, bool shared)
{
	struct kvm_mmu_page *root;
	int r = 0;

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
		if (r) {
			kvm_tdp_mmu_put_root(kvm, root, shared);
			break;
		}
	}
}
1489
a6a0b05d
BG
1490/*
1491 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1492 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1493 * If AD bits are not enabled, this will require clearing the writable bit on
1494 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1495 * be flushed.
1496 */
1497static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1498 gfn_t start, gfn_t end)
1499{
1500 struct tdp_iter iter;
1501 u64 new_spte;
1502 bool spte_set = false;
1503
7cca2d0b
BG
1504 rcu_read_lock();
1505
a6a0b05d 1506 tdp_root_for_each_leaf_pte(iter, root, start, end) {
24ae4cfa
BG
1507retry:
1508 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1af4a960
BG
1509 continue;
1510
3354ef5a
SC
1511 if (!is_shadow_present_pte(iter.old_spte))
1512 continue;
1513
a6a0b05d
BG
1514 if (spte_ad_need_write_protect(iter.old_spte)) {
1515 if (is_writable_pte(iter.old_spte))
1516 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1517 else
1518 continue;
1519 } else {
1520 if (iter.old_spte & shadow_dirty_mask)
1521 new_spte = iter.old_spte & ~shadow_dirty_mask;
1522 else
1523 continue;
1524 }
1525
3e72c791 1526 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
24ae4cfa 1527 goto retry;
3255530a 1528
a6a0b05d 1529 spte_set = true;
a6a0b05d 1530 }
7cca2d0b
BG
1531
1532 rcu_read_unlock();
a6a0b05d
BG
1533 return spte_set;
1534}
1535
1536/*
1537 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1538 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1539 * If AD bits are not enabled, this will require clearing the writable bit on
1540 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1541 * be flushed.
1542 */
269e9552
HM
1543bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1544 const struct kvm_memory_slot *slot)
a6a0b05d
BG
1545{
1546 struct kvm_mmu_page *root;
a6a0b05d
BG
1547 bool spte_set = false;
1548
24ae4cfa 1549 lockdep_assert_held_read(&kvm->mmu_lock);
a6a0b05d 1550
d62007ed 1551 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
a6a0b05d
BG
1552 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1553 slot->base_gfn + slot->npages);
a6a0b05d
BG
1554
1555 return spte_set;
1556}
1557
1558/*
1559 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1560 * set in mask, starting at gfn. The given memslot is expected to contain all
1561 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1562 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1563 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1564 */
1565static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1566 gfn_t gfn, unsigned long mask, bool wrprot)
1567{
1568 struct tdp_iter iter;
1569 u64 new_spte;
1570
7cca2d0b
BG
1571 rcu_read_lock();
1572
a6a0b05d
BG
1573 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1574 gfn + BITS_PER_LONG) {
1575 if (!mask)
1576 break;
1577
1578 if (iter.level > PG_LEVEL_4K ||
1579 !(mask & (1UL << (iter.gfn - gfn))))
1580 continue;
1581
f1b3b06a
BG
1582 mask &= ~(1UL << (iter.gfn - gfn));
1583
a6a0b05d
BG
1584 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1585 if (is_writable_pte(iter.old_spte))
1586 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1587 else
1588 continue;
1589 } else {
1590 if (iter.old_spte & shadow_dirty_mask)
1591 new_spte = iter.old_spte & ~shadow_dirty_mask;
1592 else
1593 continue;
1594 }
1595
1596 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
a6a0b05d 1597 }
7cca2d0b
BG
1598
1599 rcu_read_unlock();
a6a0b05d
BG
1600}
1601
1602/*
1603 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1604 * set in mask, starting at gfn. The given memslot is expected to contain all
1605 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1606 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1607 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1608 */
1609void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1610 struct kvm_memory_slot *slot,
1611 gfn_t gfn, unsigned long mask,
1612 bool wrprot)
1613{
1614 struct kvm_mmu_page *root;
a6a0b05d 1615
531810ca 1616 lockdep_assert_held_write(&kvm->mmu_lock);
a3f15bda 1617 for_each_tdp_mmu_root(kvm, root, slot->as_id)
a6a0b05d 1618 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
a6a0b05d
BG
1619}
1620
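/*
 * Illustrative sketch, not part of the upstream file: the mask<->GFN
 * mapping that clear_dirty_pt_masked() relies on. Bit i of @mask marks the
 * 4K page at @gfn + i, which is why the walk starts at gfn + __ffs(mask),
 * is bounded by gfn + BITS_PER_LONG, and stops once every set bit has been
 * consumed. The helper name below is hypothetical.
 */
#if 0	/* example only, not compiled */
static void demo_walk_dirty_mask(gfn_t gfn, unsigned long mask)
{
	while (mask) {
		unsigned long bit = __ffs(mask);	/* lowest set bit */

		/*
		 * clear_dirty_pt_masked() clears the dirty (or writable)
		 * bit on the 4K SPTE mapping gfn + bit at this point.
		 */
		mask &= ~(1UL << bit);			/* consume the bit */
	}
}
#endif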
14881998 1621/*
87aa9ec9
BG
1622 * Clear leaf entries which could be replaced by large mappings, for
1623 * GFNs within the slot.
14881998 1624 */
4b85c921 1625static void zap_collapsible_spte_range(struct kvm *kvm,
14881998 1626 struct kvm_mmu_page *root,
4b85c921 1627 const struct kvm_memory_slot *slot)
14881998 1628{
9eba50f8
SC
1629 gfn_t start = slot->base_gfn;
1630 gfn_t end = start + slot->npages;
14881998
BG
1631 struct tdp_iter iter;
1632 kvm_pfn_t pfn;
14881998 1633
7cca2d0b
BG
1634 rcu_read_lock();
1635
14881998 1636 tdp_root_for_each_pte(iter, root, start, end) {
2db6f772 1637retry:
4b85c921 1638 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1af4a960 1639 continue;
1af4a960 1640
14881998 1641 if (!is_shadow_present_pte(iter.old_spte) ||
87aa9ec9 1642 !is_last_spte(iter.old_spte, iter.level))
14881998
BG
1643 continue;
1644
1645 pfn = spte_to_pfn(iter.old_spte);
1646 if (kvm_is_reserved_pfn(pfn) ||
9eba50f8
SC
1647 iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1648 pfn, PG_LEVEL_NUM))
14881998
BG
1649 continue;
1650
4b85c921 1651 /* Note, a successful atomic zap also does a remote TLB flush. */
3e72c791 1652 if (tdp_mmu_zap_spte_atomic(kvm, &iter))
2db6f772 1653 goto retry;
14881998
BG
1654 }
1655
7cca2d0b 1656 rcu_read_unlock();
14881998
BG
1657}
1658
1659/*
1660 * Zap leaf SPTEs which could be replaced by large mappings, for GFNs
1661 * within the slot, so they can be recreated as huge pages on the next fault.
1662 */
4b85c921
SC
1663void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1664 const struct kvm_memory_slot *slot)
14881998
BG
1665{
1666 struct kvm_mmu_page *root;
14881998 1667
2db6f772 1668 lockdep_assert_held_read(&kvm->mmu_lock);
14881998 1669
d62007ed 1670 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
4b85c921 1671 zap_collapsible_spte_range(kvm, root, slot);
14881998 1672}
46044f72
BG
1673
1674/*
1675 * Removes write access on the last level SPTE mapping this GFN and unsets the
5fc3424f 1676 * MMU-writable bit to ensure future writes continue to be intercepted.
46044f72
BG
1677 * Returns true if an SPTE was set and a TLB flush is needed.
1678 */
1679static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
3ad93562 1680 gfn_t gfn, int min_level)
46044f72
BG
1681{
1682 struct tdp_iter iter;
1683 u64 new_spte;
1684 bool spte_set = false;
1685
3ad93562
KZ
1686 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1687
7cca2d0b
BG
1688 rcu_read_lock();
1689
77aa6075 1690 for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
3ad93562
KZ
1691 if (!is_shadow_present_pte(iter.old_spte) ||
1692 !is_last_spte(iter.old_spte, iter.level))
1693 continue;
1694
46044f72 1695 new_spte = iter.old_spte &
5fc3424f 1696 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
46044f72 1697
7c8a4742
DM
1698 if (new_spte == iter.old_spte)
1699 break;
1700
46044f72
BG
1701 tdp_mmu_set_spte(kvm, &iter, new_spte);
1702 spte_set = true;
1703 }
1704
7cca2d0b
BG
1705 rcu_read_unlock();
1706
46044f72
BG
1707 return spte_set;
1708}
1709
1710/*
1711 * Removes write access on the last level SPTE mapping this GFN and unsets the
5fc3424f 1712 * MMU-writable bit to ensure future writes continue to be intercepted.
46044f72
BG
1713 * Returns true if an SPTE was set and a TLB flush is needed.
1714 */
1715bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
3ad93562
KZ
1716 struct kvm_memory_slot *slot, gfn_t gfn,
1717 int min_level)
46044f72
BG
1718{
1719 struct kvm_mmu_page *root;
46044f72
BG
1720 bool spte_set = false;
1721
531810ca 1722 lockdep_assert_held_write(&kvm->mmu_lock);
a3f15bda 1723 for_each_tdp_mmu_root(kvm, root, slot->as_id)
3ad93562 1724 spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
a3f15bda 1725
46044f72
BG
1726 return spte_set;
1727}
1728
95fb5b02
BG
1729/*
1730 * Return the level of the lowest level SPTE added to sptes.
1731 * That SPTE may be non-present.
c5c8c7c5
DM
1732 *
1733 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
95fb5b02 1734 */
39b4d43e
SC
1735int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1736 int *root_level)
95fb5b02
BG
1737{
1738 struct tdp_iter iter;
1739 struct kvm_mmu *mmu = vcpu->arch.mmu;
95fb5b02 1740 gfn_t gfn = addr >> PAGE_SHIFT;
2aa07893 1741 int leaf = -1;
95fb5b02 1742
39b4d43e 1743 *root_level = vcpu->arch.mmu->shadow_root_level;
95fb5b02
BG
1744
1745 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1746 leaf = iter.level;
dde81f94 1747 sptes[leaf] = iter.old_spte;
95fb5b02
BG
1748 }
1749
1750 return leaf;
1751}
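
/*
 * Hypothetical caller-side sketch, not taken from mmu.c: sptes[] is filled
 * from *root_level down to the returned leaf level, so a consumer scans
 * that range top-down, bracketed by the lockless walk helpers. The helper
 * name and the pr_info() output are illustrative only.
 */
#if 0	/* example only, not compiled */
static void demo_dump_tdp_walk(struct kvm_vcpu *vcpu, u64 addr)
{
	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
	int leaf, level, root_level;

	kvm_tdp_mmu_walk_lockless_begin();
	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
	kvm_tdp_mmu_walk_lockless_end();

	if (leaf < 0)
		return;		/* no SPTEs were walked for this address */

	for (level = root_level; level >= leaf; level--)
		pr_info("level %d: spte = 0x%llx\n", level, sptes[level]);
}
#endif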
6e8eb206
DM
1752
1753/*
1754 * Returns the last level spte pointer of the shadow page walk for the given
1755 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1756 * walk could be performed, returns NULL and *spte does not contain valid data.
1757 *
1758 * Contract:
1759 * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1760 * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1761 *
1762 * WARNING: This function is only intended to be called during fast_page_fault.
1763 */
1764u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1765 u64 *spte)
1766{
1767 struct tdp_iter iter;
1768 struct kvm_mmu *mmu = vcpu->arch.mmu;
1769 gfn_t gfn = addr >> PAGE_SHIFT;
1770 tdp_ptep_t sptep = NULL;
1771
1772 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1773 *spte = iter.old_spte;
1774 sptep = iter.sptep;
1775 }
1776
1777 /*
1778 * Perform the rcu_dereference to get the raw spte pointer value since
1779 * we are passing it up to fast_page_fault, which is shared with the
1780 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1781 * annotation.
1782 *
1783 * This is safe since fast_page_fault obeys the contracts of this
1784 * function as well as all TDP MMU contracts around modifying SPTEs
1785 * outside of mmu_lock.
1786 */
1787 return rcu_dereference(sptep);
1788}