KVM: x86/mmu: Zap only the target TDP MMU shadow page in NX recovery
arch/x86/kvm/mmu/tdp_mmu.c
// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>

#include <trace/events/kvm.h>

static bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);

/* Initializes the TDP MMU for the VM, if enabled. */
bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
		return false;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;

	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);

	return true;
}

/* Arbitrarily returns true so that this may be used in if statements. */
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	return true;
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.
	 */
	rcu_barrier();
}

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush,
			  bool shared);

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
			  bool shared)
{
	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	WARN_ON(!root->tdp_mmu_page);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

	/*
	 * A TLB flush is not necessary as KVM performs a local TLB flush when
	 * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
	 * to a different pCPU.  Note, the local TLB flush on reuse also
	 * invalidates any paging-structure-cache entries, i.e. TLB entries for
	 * intermediate paging structures, that may be zapped, as such entries
	 * are associated with the ASID on both VMX and SVM.
	 */
	(void)zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);

	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/*
 * Returns the next root after @prev_root (or the first root if @prev_root is
 * NULL).  A reference to the returned root is acquired, and the reference to
 * @prev_root is released (the caller obviously must hold a reference to
 * @prev_root if it's non-NULL).
 *
 * If @only_valid is true, invalid roots are skipped.
 *
 * Returns NULL if the end of tdp_mmu_roots was reached.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
					      struct kvm_mmu_page *prev_root,
					      bool shared, bool only_valid)
{
	struct kvm_mmu_page *next_root;

	rcu_read_lock();

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root) {
		if ((!only_valid || !next_root->role.invalid) &&
		    kvm_tdp_mmu_get_root(next_root))
			break;

		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
				&next_root->link, typeof(*next_root), link);
	}

	rcu_read_unlock();

	if (prev_root)
		kvm_tdp_mmu_put_root(kvm, prev_root, shared);

	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode. In the unlikely event that this thread must free a root, the lock
 * will be temporarily dropped and reacquired in write mode.
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
	     _root;								\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
		    kvm_mmu_page_as_id(_root) != _as_id) {			\
		} else

#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)

#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)

/*
 * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
 * the implication being that any flow that holds mmu_lock for read is
 * inherently yield-friendly and should use the yield-safe variant above.
 * Holding mmu_lock for write obviates the need for RCU protection as the list
 * is guaranteed to be stable.
 */
#define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
		    kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else

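/* Allocate a shadow page and its page table from the vCPU's pre-filled memory caches. */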
static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);

	return sp;
}

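/*
 * Initialize a freshly allocated shadow page: link its struct page back to
 * the kvm_mmu_page, record the role, base gfn and parent SPTE pointer, and
 * mark it as a TDP MMU page.
 */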
static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
			    gfn_t gfn, union kvm_mmu_page_role role)
{
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role = role;
	sp->gfn = gfn;
	sp->ptep = sptep;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);
}

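/*
 * Initialize a child shadow page for the SPTE the iterator currently points
 * at, deriving the role from the parent with the level decremented by one.
 */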
static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
				  struct tdp_iter *iter)
{
	struct kvm_mmu_page *parent_sp;
	union kvm_mmu_page_role role;

	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));

	role = parent_sp->role;
	role.level--;

	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}

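/*
 * Get the root page table for the vCPU's current MMU role, reusing an
 * existing valid root with a matching role if one exists, otherwise
 * allocating and publishing a new one.  Returns the physical address of
 * the root's page table.
 */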
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * Check for an existing root before allocating a new one.  Note, the
	 * role check prevents consuming an invalid root.
	 */
	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
		if (root->role.word == role.word &&
		    kvm_tdp_mmu_get_root(root))
			goto out;
	}

	root = tdp_mmu_alloc_sp(vcpu);
	tdp_mmu_init_sp(root, NULL, 0, role);

	refcount_set(&root->tdp_mmu_root_count, 1);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

out:
	return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

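/*
 * Propagate the accessed state of a zapped or modified leaf SPTE to the
 * primary MM, i.e. call kvm_set_pfn_accessed() if the old SPTE was accessed
 * and the new SPTE is non-present, lost the accessed bit, or points at a
 * different PFN.
 */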
static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

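/*
 * Update the dirty bitmap when a 4K SPTE becomes writable: mark the gfn
 * dirty in its memslot so that dirty logging picks up the page.
 */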
static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(kvm, slot, gfn);
	}
}

/**
 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 */
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool shared)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_del(&sp->link);
	if (sp->lpage_disallowed)
		unaccount_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * handle_removed_pt() - handle a page table removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_sp(kvm, sp, shared);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		u64 *sptep = rcu_dereference(pt) + i;
		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
		u64 old_child_spte;

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so
			 * retry the exchange until the SPTE changes from
			 * some other value to the removed SPTE value.
			 */
			for (;;) {
				old_child_spte = xchg(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_child_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_child_spte = READ_ONCE(*sptep);
			if (!is_shadow_present_pte(old_child_spte))
				continue;

			/*
			 * Marking the SPTE as a removed SPTE is not
			 * strictly necessary here as the MMU lock will
			 * stop other threads from concurrently modifying
			 * this SPTE. Using the removed SPTE value keeps
			 * the two branches consistent and simplifies
			 * the function.
			 */
			WRITE_ONCE(*sptep, REMOVED_SPTE);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_child_spte, REMOVED_SPTE, level,
				    shared);
	}

	kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
					   KVM_PAGES_PER_HPAGE(level + 1));

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/**
 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level,
				  bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	if (is_leaf)
		check_spte_writable_invariants(new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or removed SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) &&
			    !is_mmio_spte(new_spte) &&
			    !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (is_leaf != was_leaf)
		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.  Note the WARN on the PFN changing without the
	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
	 * pages are kernel allocations and should never be migrated.
	 */
	if (was_present && !was_leaf &&
	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
}

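/*
 * Wrapper around __handle_changed_spte() that also performs the accessed and
 * dirty bookkeeping for the change.
 */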
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
			      shared);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping.  Do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * If setting the SPTE fails because it has changed, iter->old_spte will be
 * refreshed to the current value of the spte.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Return:
 * * 0      - If the SPTE was set.
 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
 *            no side-effects other than setting iter->old_spte to the last
 *            known value of the spte.
 */
static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter,
					  u64 new_spte)
{
	u64 *sptep = rcu_dereference(iter->sptep);
	u64 old_spte;

	WARN_ON_ONCE(iter->yielded);

	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Do not change removed SPTEs. Only the thread that froze the SPTE
	 * may modify it.
	 */
	if (is_removed_spte(iter->old_spte))
		return -EBUSY;

	/*
	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
	 * does not hold the mmu_lock.
	 */
	old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
	if (old_spte != iter->old_spte) {
		/*
		 * The page table entry was modified by a different logical
		 * CPU. Refresh iter->old_spte with the current value so the
		 * caller operates on fresh data, e.g. if it retries
		 * tdp_mmu_set_spte_atomic().
		 */
		iter->old_spte = old_spte;
		return -EBUSY;
	}

	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, true);
	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);

	return 0;
}

static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter)
{
	int ret;

	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value. This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
	if (ret)
		return ret;

	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
					   KVM_PAGES_PER_HPAGE(iter->level));

	/*
	 * No other thread can overwrite the removed SPTE as they
	 * must either wait on the MMU lock or use
	 * tdp_mmu_set_spte_atomic which will not overwrite the
	 * special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present
	 * to non-present.
	 */
	kvm_tdp_mmu_write_spte(iter->sptep, 0);

	return 0;
}


/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm:	      KVM instance
 * @as_id:	      Address space ID, i.e. regular vs. SMM
 * @sptep:	      Pointer to the SPTE
 * @old_spte:	      The current value of the SPTE
 * @new_spte:	      The new value that will be set for the SPTE
 * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
 * @level:	      The level _containing_ the SPTE (its parent PT's level)
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *		      of the page. Should be set unless handling an MMU
 *		      notifier for access tracking. Leaving record_acc_track
 *		      unset in that case prevents page accesses from being
 *		      double counted.
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *		      appropriate for the change being made. Should be set
 *		      unless performing certain dirty logging operations.
 *		      Leaving record_dirty_log unset in that case prevents page
 *		      writes from being double counted.
 */
static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
			       u64 old_spte, u64 new_spte, gfn_t gfn, int level,
			       bool record_acc_track, bool record_dirty_log)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to or from the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used. If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
	 */
	WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));

	kvm_tdp_mmu_write_spte(sptep, new_spte);

	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);

	if (record_acc_track)
		handle_changed_spte_acc_track(old_spte, new_spte, level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
					      new_spte, level);
}

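/*
 * Convenience wrappers around __tdp_mmu_set_spte() that take a tdp_iter and
 * optionally suppress the accessed and/or dirty bookkeeping, see the
 * record_acc_track and record_dirty_log parameters documented above.
 */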
static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				     u64 new_spte, bool record_acc_track,
				     bool record_dirty_log)
{
	WARN_ON_ONCE(iter->yielded);

	__tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte,
			   new_spte, iter->gfn, iter->level,
			   record_acc_track, record_dirty_log);
}

static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)	\
		if (!is_shadow_present_pte(_iter.old_spte) ||	\
		    !is_last_spte(_iter.old_spte, _iter.level))	\
			continue;				\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, iter->yielded is set and the caller must skip to
 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
 * over the paging structures to allow the iterator to continue its traversal
 * from the paging structure root.
 *
 * Returns true if this function yielded.
 */
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
							  struct tdp_iter *iter,
							  bool flush, bool shared)
{
	WARN_ON(iter->yielded);

	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		rcu_read_unlock();

		if (flush)
			kvm_flush_remote_tlbs(kvm);

		if (shared)
			cond_resched_rwlock_read(&kvm->mmu_lock);
		else
			cond_resched_rwlock_write(&kvm->mmu_lock);

		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);

		iter->yielded = true;
	}

	return iter->yielded;
}

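/*
 * Zap the SPTE pointing at the given shadow page, i.e. remove exactly that
 * page (and its children) from the paging structure, as used when NX huge
 * page recovery zaps only the target TDP MMU shadow page.  Returns false if
 * the page is a root or its parent SPTE is no longer present.
 */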
bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	u64 old_spte;

	/*
	 * This helper intentionally doesn't allow zapping a root shadow page,
	 * which doesn't have a parent page table and thus no associated entry.
	 */
	if (WARN_ON_ONCE(!sp->ptep))
		return false;

	rcu_read_lock();

	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte))) {
		rcu_read_unlock();
		return false;
	}

	__tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
			   sp->gfn, sp->role.level + 1, true, true);

	rcu_read_unlock();

	return true;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 *
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 *
 * If shared is true, this thread holds the MMU lock in read mode and must
 * account for the possibility that other threads are modifying the paging
 * structures concurrently. If shared is false, this thread should hold the
 * MMU lock in write mode.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush,
			  bool shared)
{
	gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	bool zap_all = (start == 0 && end >= max_gfn_host);
	struct tdp_iter iter;

	/*
	 * No need to try to step down in the iterator when zapping all SPTEs,
	 * zapping the top-level non-leaf SPTEs will recurse on their children.
	 */
	int min_level = zap_all ? root->role.level : PG_LEVEL_4K;

	/*
	 * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
	 * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
	 * and so KVM will never install a SPTE for such addresses.
	 */
	end = min(end, max_gfn_host);

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
retry:
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level, except when zapping all SPTEs.
		 */
		if (!zap_all &&
		    (iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		if (!shared) {
			tdp_mmu_set_spte(kvm, &iter, 0);
			flush = true;
		} else if (tdp_mmu_zap_spte_atomic(kvm, &iter)) {
			goto retry;
		}
	}

	rcu_read_unlock();
	return flush;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
				 gfn_t end, bool can_yield, bool flush)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
				      false);

	return flush;
}

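/* Zap all SPTEs in all address spaces and flush the TLBs if anything was zapped. */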
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	bool flush = false;
	int i;

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush);

	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

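/*
 * Return the next root after @prev_root that is invalid and still has a
 * non-zero refcount, i.e. still holds the reference gifted by
 * kvm_tdp_mmu_invalidate_all_roots(), or NULL if no such root remains.
 * Called under RCU read lock.
 */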
static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
						  struct kvm_mmu_page *prev_root)
{
	struct kvm_mmu_page *next_root;

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root && !(next_root->role.invalid &&
			      refcount_read(&next_root->tdp_mmu_root_count)))
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &next_root->link,
						  typeof(*next_root), link);

	return next_root;
}

/*
 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
 * zap" completes.  Since kvm_tdp_mmu_invalidate_all_roots() has acquired a
 * reference to each invalidated root, roots will not be freed until after this
 * function drops the gifted reference, e.g. so that vCPUs don't get stuck with
 * tearing down paging structures.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *next_root;
	struct kvm_mmu_page *root;

	lockdep_assert_held_read(&kvm->mmu_lock);

	rcu_read_lock();

	root = next_invalidated_root(kvm, NULL);

	while (root) {
		next_root = next_invalidated_root(kvm, root);

		rcu_read_unlock();

		/*
		 * A TLB flush is unnecessary, invalidated roots are guaranteed
		 * to be unreachable by the guest (see kvm_tdp_mmu_put_root()
		 * for more details), and unlike the legacy MMU, no vCPU kick
		 * is needed to play nice with lockless shadow walks as the TDP
		 * MMU protects its paging structures via RCU.  Note, zapping
		 * will still flush on yield, but that's a minor performance
		 * blip and not a functional issue.
		 */
		(void)zap_gfn_range(kvm, root, 0, -1ull, true, false, true);

		/*
		 * Put the reference acquired in
		 * kvm_tdp_mmu_invalidate_all_roots().
		 */
		kvm_tdp_mmu_put_root(kvm, root, true);

		root = next_root;

		rcu_read_lock();
	}

	rcu_read_unlock();
}

/*
 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
 * is about to be zapped, e.g. in response to a memslots update.  The caller is
 * responsible for invoking kvm_tdp_mmu_zap_invalidated_roots() to do the actual
 * zapping.
 *
 * Take a reference on all roots to prevent the root from being freed before it
 * is zapped by this thread.  Freeing a root is not a correctness issue, but if
 * a vCPU drops the last reference to a root prior to the root being zapped, it
 * will get stuck with tearing down the entire paging structure.
 *
 * Get a reference even if the root is already invalid,
 * kvm_tdp_mmu_zap_invalidated_roots() assumes it was gifted a reference to all
 * invalid roots, e.g. there's no epoch to identify roots that were invalidated
 * by a previous call.  Roots stay on the list until the last reference is
 * dropped, so even though all invalid roots are zapped, a root may not go away
 * for quite some time, e.g. if a vCPU blocks across multiple memslot updates.
 *
 * Because mmu_lock is held for write, it should be impossible to observe a
 * root with zero refcount, i.e. the list of roots cannot be stale.
 *
 * This has essentially the same effect for the TDP MMU
 * as updating mmu_valid_gen does for the shadow MMU.
 */
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
		if (!WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
			root->role.invalid = true;
	}
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
					   struct kvm_page_fault *fault,
					   struct tdp_iter *iter)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
	u64 new_spte;
	int ret = RET_PF_FIXED;
	bool wrprot = false;

	WARN_ON(sp->role.level != fault->goal_level);
	if (unlikely(!fault->slot))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
				   fault->pfn, iter->old_spte, fault->prefetch, true,
				   fault->map_writable, &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (wrprot) {
		if (fault->write)
			ret = RET_PF_EMULATE;
	}

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte))) {
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else {
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	/*
	 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
	 * consistent with legacy MMU behavior.
	 */
	if (ret != RET_PF_SPURIOUS)
		vcpu->stat.pf_fixed++;

	return ret;
}

/*
 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
 * provided page table.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @sp: The new TDP page table to install.
 * @account_nx: True if this page table is being installed to split a
 *              non-executable huge page.
 * @shared: This operation is running under the MMU lock in read mode.
 *
 * Returns: 0 if the new page table was installed. Non-0 if the page table
 *          could not be installed (e.g. the atomic compare-exchange failed).
 */
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
			   struct kvm_mmu_page *sp, bool account_nx,
			   bool shared)
{
	u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
	int ret = 0;

	if (shared) {
		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
		if (ret)
			return ret;
	} else {
		tdp_mmu_set_spte(kvm, iter, spte);
	}

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
	if (account_nx)
		account_huge_nx_page(kvm, sp);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

	return 0;
}

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	int ret;

	kvm_mmu_hugepage_adjust(vcpu, fault);

	trace_kvm_mmu_spte_requested(fault);

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
		if (fault->nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);

		if (iter.level == fault->goal_level)
			break;

		/*
		 * If there is an SPTE mapping a large page at a higher level
		 * than the target, that SPTE must be cleared and replaced
		 * with a non-leaf SPTE.
		 */
		if (is_shadow_present_pte(iter.old_spte) &&
		    is_large_pte(iter.old_spte)) {
			if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
				break;

			/*
			 * The iter must explicitly re-read the spte here
			 * because the new value informs the !present
			 * path below.
			 */
			iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
		}

		if (!is_shadow_present_pte(iter.old_spte)) {
			bool account_nx = fault->huge_page_disallowed &&
					  fault->req_level >= iter.level;

			/*
			 * If SPTE has been frozen by another thread, just
			 * give up and retry, avoiding unnecessary page table
			 * allocation and free.
			 */
			if (is_removed_spte(iter.old_spte))
				break;

			sp = tdp_mmu_alloc_sp(vcpu);
			tdp_mmu_init_child_sp(sp, &iter);

			if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
				tdp_mmu_free_sp(sp);
				break;
			}
		}
	}

	if (iter.level != fault->goal_level) {
		rcu_read_unlock();
		return RET_PF_RETRY;
	}

	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
	rcu_read_unlock();

	return ret;
}

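/*
 * MMU notifier unmap handler for the TDP MMU: zap the GFN range covered by
 * the notifier event in the affected address space.
 */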
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	return __kvm_tdp_mmu_zap_gfn_range(kvm, range->slot->as_id, range->start,
					   range->end, range->may_block, flush);
}

typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
			      struct kvm_gfn_range *range);

static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
						   struct kvm_gfn_range *range,
						   tdp_handler_t handler)
{
	struct kvm_mmu_page *root;
	struct tdp_iter iter;
	bool ret = false;

	/*
	 * Don't support rescheduling, none of the MMU notifiers that funnel
	 * into this helper allow blocking; it'd be dead, wasteful code.
	 */
	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
		rcu_read_lock();

		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
			ret |= handler(kvm, &iter, range);

		rcu_read_unlock();
	}

	return ret;
}

/*
 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
 * if any of the GFNs in the range have been accessed.
 */
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
			  struct kvm_gfn_range *range)
{
	u64 new_spte = 0;

	/* If we have a non-accessed entry we don't need to change the pte. */
	if (!is_accessed_spte(iter->old_spte))
		return false;

	new_spte = iter->old_spte;

	if (spte_ad_enabled(new_spte)) {
		new_spte &= ~shadow_accessed_mask;
	} else {
		/*
		 * Capture the dirty status of the page, so that it doesn't get
		 * lost when the SPTE is marked for access tracking.
		 */
		if (is_writable_pte(new_spte))
			kvm_set_pfn_dirty(spte_to_pfn(new_spte));

		new_spte = mark_spte_for_access_track(new_spte);
	}

	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);

	return true;
}

bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}

static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	return is_accessed_spte(iter->old_spte);
}

bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}

static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	u64 new_spte;

	/* Huge pages aren't expected to be modified without first being zapped. */
	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);

	if (iter->level != PG_LEVEL_4K ||
	    !is_shadow_present_pte(iter->old_spte))
		return false;

	/*
	 * Note, when changing a read-only SPTE, it's not strictly necessary to
	 * zero the SPTE before setting the new PFN, but doing so preserves the
	 * invariant that the PFN of a present leaf SPTE can never change.
	 * See __handle_changed_spte().
	 */
	tdp_mmu_set_spte(kvm, iter, 0);

	if (!pte_write(range->pte)) {
		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
								  pte_pfn(range->pte));

		tdp_mmu_set_spte(kvm, iter, new_spte);
	}

	return true;
}

/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
 */
bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	/*
	 * No need to handle the remote TLB flush under RCU protection, the
	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
	 * shadow page.  See the WARN on pfn_changed in __handle_changed_spte().
	 */
	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
}

/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
			goto retry;

		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
			     const struct kvm_memory_slot *slot, int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
			     slot->base_gfn + slot->npages, min_level);

	return spte_set;
}

static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
{
	struct kvm_mmu_page *sp;

	gfp |= __GFP_ZERO;

	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
	if (!sp)
		return NULL;

	sp->spt = (void *)__get_free_page(gfp);
	if (!sp->spt) {
		kmem_cache_free(mmu_page_header_cache, sp);
		return NULL;
	}

	return sp;
}

static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
						       struct tdp_iter *iter,
						       bool shared)
{
	struct kvm_mmu_page *sp;

	/*
	 * Since we are allocating while under the MMU lock we have to be
	 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
	 * reclaim and to avoid making any filesystem callbacks (which can end
	 * up invoking KVM MMU notifiers, resulting in a deadlock).
	 *
	 * If this allocation fails we drop the lock and retry with reclaim
	 * allowed.
	 */
	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
	if (sp)
		return sp;

	rcu_read_unlock();

	if (shared)
		read_unlock(&kvm->mmu_lock);
	else
		write_unlock(&kvm->mmu_lock);

	iter->yielded = true;
	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);

	if (shared)
		read_lock(&kvm->mmu_lock);
	else
		write_lock(&kvm->mmu_lock);

	rcu_read_lock();

	return sp;
}

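/*
 * Split the given huge page: populate the new lower level page table with
 * SPTEs mapping the same range, then replace the huge SPTE with a non-leaf
 * SPTE pointing at the new page table.
 */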
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared)
{
	const u64 huge_spte = iter->old_spte;
	const int level = iter->level;
	int ret, i;

	tdp_mmu_init_child_sp(sp, iter);

	/*
	 * No need for atomics when writing to sp->spt since the page table has
	 * not been linked in yet and thus is not reachable from any other CPU.
	 */
	for (i = 0; i < PT64_ENT_PER_PAGE; i++)
		sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);

	/*
	 * Replace the huge spte with a pointer to the populated lower level
	 * page table. Since we are making this change without a TLB flush vCPUs
	 * will see a mix of the split mappings and the original huge mapping,
	 * depending on what's currently in their TLB. This is fine from a
	 * correctness standpoint since the translation will be the same either
	 * way.
	 */
	ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
	if (ret)
		goto out;

	/*
	 * tdp_mmu_link_sp() will handle subtracting the huge page we are
	 * overwriting from the page stats. But we have to manually update
	 * the page stats with the new present child pages.
	 */
1425 kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);
1426
e0b728b1
DM
1427out:
1428 trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1429 return ret;
a3fe5dbd
DM
1430}
1431
1432static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1433 struct kvm_mmu_page *root,
1434 gfn_t start, gfn_t end,
cb00a70b 1435 int target_level, bool shared)
a3fe5dbd
DM
1436{
1437 struct kvm_mmu_page *sp = NULL;
1438 struct tdp_iter iter;
1439 int ret = 0;
1440
1441 rcu_read_lock();
1442
1443 /*
1444 * Traverse the page table splitting all huge pages above the target
1445 * level into one lower level. For example, if we encounter a 1GB page
1446 * we split it into 512 2MB pages.
1447 *
1448 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1449 * to visit an SPTE before ever visiting its children, which means we
1450 * will correctly recursively split huge pages that are more than one
1451 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1452 * and then splitting each of those to 512 4KB pages).
1453 */
1454 for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1455retry:
cb00a70b 1456 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
a3fe5dbd
DM
1457 continue;
1458
1459 if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1460 continue;
1461
1462 if (!sp) {
cb00a70b 1463 sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
a3fe5dbd
DM
1464 if (!sp) {
1465 ret = -ENOMEM;
e0b728b1
DM
1466 trace_kvm_mmu_split_huge_page(iter.gfn,
1467 iter.old_spte,
1468 iter.level, ret);
a3fe5dbd
DM
1469 break;
1470 }
1471
1472 if (iter.yielded)
1473 continue;
1474 }
1475
cb00a70b 1476 if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
a3fe5dbd
DM
1477 goto retry;
1478
1479 sp = NULL;
1480 }
1481
1482 rcu_read_unlock();
1483
1484 /*
1485 * It's possible to exit the loop having never used the last sp if, for
1486 * example, a vCPU doing HugePage NX splitting wins the race and
1487 * installs its own sp in place of the last sp we tried to split.
1488 */
1489 if (sp)
1490 tdp_mmu_free_sp(sp);
1491
a3fe5dbd
DM
1492 return ret;
1493}
1494
cb00a70b 1495
a3fe5dbd
DM
1496/*
1497 * Try to split all huge pages mapped by the TDP MMU down to the target level.
1498 */
1499void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1500 const struct kvm_memory_slot *slot,
1501 gfn_t start, gfn_t end,
cb00a70b 1502 int target_level, bool shared)
a3fe5dbd
DM
1503{
1504 struct kvm_mmu_page *root;
1505 int r = 0;
1506
cb00a70b 1507 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
a3fe5dbd 1508
7c554d8e 1509 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
cb00a70b 1510 r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
a3fe5dbd 1511 if (r) {
cb00a70b 1512 kvm_tdp_mmu_put_root(kvm, root, shared);
a3fe5dbd
DM
1513 break;
1514 }
1515 }
1516}
1517
a6a0b05d
BG
1518/*
1519 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1520 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1521 * If AD bits are not enabled, this will require clearing the writable bit on
1522 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1523 * be flushed.
1524 */
1525static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1526 gfn_t start, gfn_t end)
1527{
1528 struct tdp_iter iter;
1529 u64 new_spte;
1530 bool spte_set = false;
1531
7cca2d0b
BG
1532 rcu_read_lock();
1533
a6a0b05d 1534 tdp_root_for_each_leaf_pte(iter, root, start, end) {
24ae4cfa
BG
1535retry:
1536 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1af4a960
BG
1537 continue;
1538
3354ef5a
SC
1539 if (!is_shadow_present_pte(iter.old_spte))
1540 continue;
1541
a6a0b05d
BG
1542 if (spte_ad_need_write_protect(iter.old_spte)) {
1543 if (is_writable_pte(iter.old_spte))
1544 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1545 else
1546 continue;
1547 } else {
1548 if (iter.old_spte & shadow_dirty_mask)
1549 new_spte = iter.old_spte & ~shadow_dirty_mask;
1550 else
1551 continue;
1552 }
1553
3e72c791 1554 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
24ae4cfa 1555 goto retry;
3255530a 1556
a6a0b05d 1557 spte_set = true;
a6a0b05d 1558 }
7cca2d0b
BG
1559
1560 rcu_read_unlock();
a6a0b05d
BG
1561 return spte_set;
1562}
1563
1564/*
1565 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1566 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1567 * If AD bits are not enabled, this will require clearing the writable bit on
1568 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1569 * be flushed.
1570 */
269e9552
HM
1571bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1572 const struct kvm_memory_slot *slot)
a6a0b05d
BG
1573{
1574 struct kvm_mmu_page *root;
a6a0b05d
BG
1575 bool spte_set = false;
1576
24ae4cfa 1577 lockdep_assert_held_read(&kvm->mmu_lock);
a6a0b05d 1578
d62007ed 1579 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
a6a0b05d
BG
1580 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1581 slot->base_gfn + slot->npages);
a6a0b05d
BG
1582
1583 return spte_set;
1584}
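/*
 * Hypothetical caller sketch (only kvm_tdp_mmu_clear_dirty_slot() is taken
 * from this file; the locking and flush calls are assumptions based on
 * x86's rwlock mmu_lock): a slot-wide clear might look roughly like
 *
 *	bool flush;
 *
 *	read_lock(&kvm->mmu_lock);
 *	flush = kvm_tdp_mmu_clear_dirty_slot(kvm, slot);
 *	read_unlock(&kvm->mmu_lock);
 *	if (flush)
 *		kvm_flush_remote_tlbs(kvm);
 */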
1585
1586/*
1587 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1588 * set in mask, starting at gfn. The given memslot is expected to contain all
1589 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1590 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1591 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1592 */
1593static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1594 gfn_t gfn, unsigned long mask, bool wrprot)
1595{
1596 struct tdp_iter iter;
1597 u64 new_spte;
1598
7cca2d0b
BG
1599 rcu_read_lock();
1600
a6a0b05d
BG
1601 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1602 gfn + BITS_PER_LONG) {
1603 if (!mask)
1604 break;
1605
1606 if (iter.level > PG_LEVEL_4K ||
1607 !(mask & (1UL << (iter.gfn - gfn))))
1608 continue;
1609
f1b3b06a
BG
1610 mask &= ~(1UL << (iter.gfn - gfn));
1611
a6a0b05d
BG
1612 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1613 if (is_writable_pte(iter.old_spte))
1614 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1615 else
1616 continue;
1617 } else {
1618 if (iter.old_spte & shadow_dirty_mask)
1619 new_spte = iter.old_spte & ~shadow_dirty_mask;
1620 else
1621 continue;
1622 }
1623
1624 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
a6a0b05d 1625 }
7cca2d0b
BG
1626
1627 rcu_read_unlock();
a6a0b05d
BG
1628}
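/*
 * Worked example for the mask handling above: with gfn = 0x1000 and
 * mask = 0x21 (bits 0 and 5 set), the walk starts at GFN 0x1000
 * (__ffs(mask) == 0) and only the 4KB SPTEs mapping GFNs 0x1000 and 0x1005
 * are considered; each processed GFN has its bit cleared from the local
 * copy of mask, so the loop breaks as soon as mask reaches zero.
 */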
1629
1630/*
1631 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1632 * set in mask, starting at gfn. The given memslot is expected to contain all
1633 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1634 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1635 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1636 */
1637void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1638 struct kvm_memory_slot *slot,
1639 gfn_t gfn, unsigned long mask,
1640 bool wrprot)
1641{
1642 struct kvm_mmu_page *root;
a6a0b05d 1643
531810ca 1644 lockdep_assert_held_write(&kvm->mmu_lock);
a3f15bda 1645 for_each_tdp_mmu_root(kvm, root, slot->as_id)
a6a0b05d 1646 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
a6a0b05d
BG
1647}
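/*
 * Note the locking contrast with kvm_tdp_mmu_clear_dirty_slot() above: the
 * slot-wide variant runs with mmu_lock held for read and must update SPTEs
 * atomically (retrying on contention), whereas this masked variant requires
 * mmu_lock held for write and can therefore use the non-atomic
 * tdp_mmu_set_spte_no_dirty_log() and walk without yielding.
 */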
1648
14881998 1649/*
87aa9ec9
BG
1650 * Clear leaf entries which could be replaced by large mappings, for
1651 * GFNs within the slot.
14881998 1652 */
4b85c921 1653static void zap_collapsible_spte_range(struct kvm *kvm,
14881998 1654 struct kvm_mmu_page *root,
4b85c921 1655 const struct kvm_memory_slot *slot)
14881998 1656{
9eba50f8
SC
1657 gfn_t start = slot->base_gfn;
1658 gfn_t end = start + slot->npages;
14881998
BG
1659 struct tdp_iter iter;
1660 kvm_pfn_t pfn;
14881998 1661
7cca2d0b
BG
1662 rcu_read_lock();
1663
14881998 1664 tdp_root_for_each_pte(iter, root, start, end) {
2db6f772 1665retry:
4b85c921 1666 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1af4a960 1667 continue;
1af4a960 1668
14881998 1669 if (!is_shadow_present_pte(iter.old_spte) ||
87aa9ec9 1670 !is_last_spte(iter.old_spte, iter.level))
14881998
BG
1671 continue;
1672
1673 pfn = spte_to_pfn(iter.old_spte);
1674 if (kvm_is_reserved_pfn(pfn) ||
9eba50f8
SC
1675 iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1676 pfn, PG_LEVEL_NUM))
14881998
BG
1677 continue;
1678
4b85c921 1679 /* Note, a successful atomic zap also does a remote TLB flush. */
3e72c791 1680 if (tdp_mmu_zap_spte_atomic(kvm, &iter))
2db6f772 1681 goto retry;
14881998
BG
1682 }
1683
7cca2d0b 1684 rcu_read_unlock();
14881998
BG
1685}
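/*
 * Illustrative scenario (a sketch, not authoritative): dirty logging forces
 * 4KB mappings, so once it is disabled a slot can be full of 4KB leaf SPTEs
 * whose GFNs could again be mapped at 2MB or 1GB.  For such an SPTE,
 * kvm_mmu_max_mapping_level() reports a level above iter.level, the SPTE is
 * zapped here, and the next guest access faults the range back in at the
 * larger page size.
 */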
1686
1687/*
 1688 * Zap leaf SPTEs which could be replaced by large mappings, for GFNs
 1689 * within the slot.
1690 */
4b85c921
SC
1691void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1692 const struct kvm_memory_slot *slot)
14881998
BG
1693{
1694 struct kvm_mmu_page *root;
14881998 1695
2db6f772 1696 lockdep_assert_held_read(&kvm->mmu_lock);
14881998 1697
d62007ed 1698 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
4b85c921 1699 zap_collapsible_spte_range(kvm, root, slot);
14881998 1700}
46044f72
BG
1701
1702/*
1703 * Removes write access on the last level SPTE mapping this GFN and unsets the
5fc3424f 1704 * MMU-writable bit to ensure future writes continue to be intercepted.
46044f72
BG
1705 * Returns true if an SPTE was set and a TLB flush is needed.
1706 */
1707static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
3ad93562 1708 gfn_t gfn, int min_level)
46044f72
BG
1709{
1710 struct tdp_iter iter;
1711 u64 new_spte;
1712 bool spte_set = false;
1713
3ad93562
KZ
1714 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1715
7cca2d0b
BG
1716 rcu_read_lock();
1717
77aa6075 1718 for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
3ad93562
KZ
1719 if (!is_shadow_present_pte(iter.old_spte) ||
1720 !is_last_spte(iter.old_spte, iter.level))
1721 continue;
1722
46044f72 1723 new_spte = iter.old_spte &
5fc3424f 1724 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
46044f72 1725
7c8a4742
DM
1726 if (new_spte == iter.old_spte)
1727 break;
1728
46044f72
BG
1729 tdp_mmu_set_spte(kvm, &iter, new_spte);
1730 spte_set = true;
1731 }
1732
7cca2d0b
BG
1733 rcu_read_unlock();
1734
46044f72
BG
1735 return spte_set;
1736}
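/*
 * Note on clearing both bits above (a reading of the code, not
 * authoritative): dropping only PT_WRITABLE_MASK would leave the SPTE
 * eligible for the lockless fast page fault path to restore write access,
 * since that path keys off shadow_mmu_writable_mask.  Clearing the
 * MMU-writable bit as well ensures every future write to this GFN takes a
 * full write-protection fault.
 */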
1737
1738/*
1739 * Removes write access on the last level SPTE mapping this GFN and unsets the
5fc3424f 1740 * MMU-writable bit to ensure future writes continue to be intercepted.
46044f72
BG
1741 * Returns true if an SPTE was set and a TLB flush is needed.
1742 */
1743bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
3ad93562
KZ
1744 struct kvm_memory_slot *slot, gfn_t gfn,
1745 int min_level)
46044f72
BG
1746{
1747 struct kvm_mmu_page *root;
46044f72
BG
1748 bool spte_set = false;
1749
531810ca 1750 lockdep_assert_held_write(&kvm->mmu_lock);
a3f15bda 1751 for_each_tdp_mmu_root(kvm, root, slot->as_id)
3ad93562 1752 spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
a3f15bda 1753
46044f72
BG
1754 return spte_set;
1755}
1756
95fb5b02
BG
1757/*
1758 * Return the level of the lowest level SPTE added to sptes.
1759 * That SPTE may be non-present.
c5c8c7c5
DM
1760 *
1761 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
95fb5b02 1762 */
39b4d43e
SC
1763int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1764 int *root_level)
95fb5b02
BG
1765{
1766 struct tdp_iter iter;
1767 struct kvm_mmu *mmu = vcpu->arch.mmu;
95fb5b02 1768 gfn_t gfn = addr >> PAGE_SHIFT;
2aa07893 1769 int leaf = -1;
95fb5b02 1770
39b4d43e 1771 *root_level = vcpu->arch.mmu->shadow_root_level;
95fb5b02
BG
1772
1773 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1774 leaf = iter.level;
dde81f94 1775 sptes[leaf] = iter.old_spte;
95fb5b02
BG
1776 }
1777
1778 return leaf;
1779}
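/*
 * Illustrative example: with a 4-level TDP root and a 2MB mapping, the walk
 * above fills sptes[4], sptes[3] and sptes[2], sets *root_level to 4 and
 * returns 2 (PG_LEVEL_2M).  If the walk instead stops at a non-present
 * entry, the returned level indexes that last, non-present SPTE, which is
 * why callers must inspect the stored value before trusting it.
 */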
6e8eb206
DM
1780
1781/*
1782 * Returns the last level spte pointer of the shadow page walk for the given
 1783 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1784 * walk could be performed, returns NULL and *spte does not contain valid data.
1785 *
1786 * Contract:
1787 * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1788 * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1789 *
1790 * WARNING: This function is only intended to be called during fast_page_fault.
1791 */
1792u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1793 u64 *spte)
1794{
1795 struct tdp_iter iter;
1796 struct kvm_mmu *mmu = vcpu->arch.mmu;
1797 gfn_t gfn = addr >> PAGE_SHIFT;
1798 tdp_ptep_t sptep = NULL;
1799
1800 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1801 *spte = iter.old_spte;
1802 sptep = iter.sptep;
1803 }
1804
1805 /*
1806 * Perform the rcu_dereference to get the raw spte pointer value since
1807 * we are passing it up to fast_page_fault, which is shared with the
1808 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1809 * annotation.
1810 *
1811 * This is safe since fast_page_fault obeys the contracts of this
1812 * function as well as all TDP MMU contracts around modifying SPTEs
1813 * outside of mmu_lock.
1814 */
1815 return rcu_dereference(sptep);
1816}
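/*
 * Hypothetical usage sketch (the local variable names are assumptions; the
 * lockless helpers are the ones named in the contract above):
 *
 *	u64 old_spte;
 *	u64 *sptep;
 *
 *	kvm_tdp_mmu_walk_lockless_begin();
 *	sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, addr, &old_spte);
 *	... inspect old_spte, possibly update it through sptep ...
 *	kvm_tdp_mmu_walk_lockless_end();
 */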