// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

static bool __read_mostly tdp_mmu_enabled = false;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
		return;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;

	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}

static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);
}

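/*
 * Tears down TDP MMU bookkeeping when the VM is destroyed. All roots are
 * expected to have been freed by this point; rcu_barrier() only waits for
 * outstanding page table free callbacks.
 */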
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.
	 */
	rcu_barrier();
}

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush,
			  bool shared);

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
			  bool shared)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	WARN_ON(!root->tdp_mmu_page);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

	zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);

	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/*
 * Finds the next valid root after root (or the first valid root if root
 * is NULL), takes a reference on it, and returns that next root. If root
 * is not NULL, this thread should have already taken a reference on it, and
 * that reference will be dropped. If no valid root is found, this
 * function will return NULL.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
					      struct kvm_mmu_page *prev_root,
					      bool shared)
{
	struct kvm_mmu_page *next_root;

	rcu_read_lock();

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
				&next_root->link, typeof(*next_root), link);

	rcu_read_unlock();

	if (prev_root)
		kvm_tdp_mmu_put_root(kvm, prev_root, shared);

	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode. In the unlikely event that this thread must free a root, the lock
 * will be temporarily dropped and reacquired in write mode.
 */
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared);		\
	     _root;							\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared))		\
		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else

#define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link,	\
				lockdep_is_held_type(&kvm->mmu_lock, 0) ||	\
				lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock))	\
		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else

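/*
 * Illustrative use of the root iterators above, mirroring callers later in
 * this file (e.g. __kvm_tdp_mmu_zap_gfn_range()): walk every root belonging
 * to one address space, optionally yielding between roots:
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
 *		flush = zap_gfn_range(kvm, root, start, end, true, flush,
 *				      shared);
 */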
static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
						   int level)
{
	union kvm_mmu_page_role role;

	role = vcpu->arch.mmu->mmu_role.base;
	role.level = level;
	role.direct = true;
	role.gpte_is_8_bytes = true;
	role.access = ACC_ALL;

	return role;
}

static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
					       int level)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role.word = page_role_for_level(vcpu, level).word;
	sp->gfn = gfn;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);

	return sp;
}

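/*
 * Returns the physical address of a TDP MMU root for the vCPU's current MMU
 * role, reusing an existing root with a matching role when one can still be
 * referenced and otherwise allocating a new one and publishing it on
 * tdp_mmu_roots. The caller must hold mmu_lock for write.
 */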
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);

	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

	/* Check for an existing root before allocating a new one. */
	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
		if (root->role.word == role.word &&
		    kvm_tdp_mmu_get_root(kvm, root))
			goto out;
	}

	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
	refcount_set(&root->tdp_mmu_root_count, 1);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

out:
	return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(kvm, slot, gfn);
	}
}

/**
 * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the new page
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 * @account_nx: This page replaces a NX large page and should be marked for
 *		eventual reclaim.
 */
static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool shared, bool account_nx)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
	if (account_nx)
		account_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 */
static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				bool shared)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_del(&sp->link);
	if (sp->lpage_disallowed)
		unaccount_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
					bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	u64 old_child_spte;
	u64 *sptep;
	gfn_t gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_page(kvm, sp, shared);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		sptep = rcu_dereference(pt) + i;
		gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so
			 * set the SPTE until it is set from some other
			 * value to the removed SPTE value.
			 */
			for (;;) {
				old_child_spte = xchg(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_child_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_child_spte = READ_ONCE(*sptep);
			if (!is_shadow_present_pte(old_child_spte))
				continue;

			/*
			 * Marking the SPTE as a removed SPTE is not
			 * strictly necessary here as the MMU lock will
			 * stop other threads from concurrently modifying
			 * this SPTE. Using the removed SPTE value keeps
			 * the two branches consistent and simplifies
			 * the function.
			 */
			WRITE_ONCE(*sptep, REMOVED_SPTE);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_child_spte, REMOVED_SPTE, level - 1,
				    shared);
	}

	kvm_flush_remote_tlbs_with_address(kvm, gfn,
					   KVM_PAGES_PER_HPAGE(level));

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level,
				  bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	if (is_large_pte(old_spte) != is_large_pte(new_spte)) {
		if (is_large_pte(old_spte))
			atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages);
		else
			atomic64_add(1, (atomic64_t*)&kvm->stat.lpages);
	}

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or removed SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) &&
			    !is_mmio_spte(new_spte) &&
			    !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.
	 */
	if (was_present && !was_leaf && (pfn_changed || !is_present))
		handle_removed_tdp_mmu_page(kvm,
				spte_to_child_pt(old_spte, level), shared);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
			      shared);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}

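/*
 * The SPTE update helpers below come in two flavors: the
 * tdp_mmu_set_spte_atomic*() variants, which update the SPTE with cmpxchg64
 * and may be used while holding mmu_lock for read, and __tdp_mmu_set_spte()
 * with its wrappers, which require mmu_lock to be held for write.
 */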
/*
 * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping, but do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Returns: true if the SPTE was set, false if it was not. If false is returned,
 *	    this function will have no side-effects.
 */
static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
							struct tdp_iter *iter,
							u64 new_spte)
{
	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Do not change removed SPTEs. Only the thread that froze the SPTE
	 * may modify it.
	 */
	if (is_removed_spte(iter->old_spte))
		return false;

	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
		      new_spte) != iter->old_spte)
		return false;

	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, true);
	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);

	return true;
}

static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
					   struct tdp_iter *iter,
					   u64 new_spte)
{
	if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte))
		return false;

	handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
				      iter->old_spte, new_spte, iter->level);
	return true;
}

static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					   struct tdp_iter *iter)
{
	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value. This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
		return false;

	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
					   KVM_PAGES_PER_HPAGE(iter->level));

	/*
	 * No other thread can overwrite the removed SPTE as they
	 * must either wait on the MMU lock or use
	 * tdp_mmu_set_spte_atomic which will not overwrite the
	 * special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present
	 * to non-present.
	 */
	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);

	return true;
}

/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *		      of the page. Should be set unless handling an MMU
 *		      notifier for access tracking. Leaving record_acc_track
 *		      unset in that case prevents page accesses from being
 *		      double counted.
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *		      appropriate for the change being made. Should be set
 *		      unless performing certain dirty logging operations.
 *		      Leaving record_dirty_log unset in that case prevents page
 *		      writes from being double counted.
 */
static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				      u64 new_spte, bool record_acc_track,
				      bool record_dirty_log)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used. If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
	 */
	WARN_ON(is_removed_spte(iter->old_spte));

	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);

	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, false);
	if (record_acc_track)
		handle_changed_spte_acc_track(iter->old_spte, new_spte,
					      iter->level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
					      iter->old_spte, new_spte,
					      iter->level);
}

static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
			 _mmu->shadow_root_level, _start, _end)

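/*
 * Illustrative use of the SPTE iterators above, mirroring callers later in
 * this file (e.g. clear_dirty_gfn_range()): visit every leaf SPTE a root
 * maps in a GFN range:
 *
 *	tdp_root_for_each_leaf_pte(iter, root, start, end) {
 *		if (!is_writable_pte(iter.old_spte))
 *			continue;
 *		...
 *	}
 */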
/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, it will also reset the tdp_iter's walk over the
 * paging structure and the calling function should skip to the next
 * iteration to allow the iterator to continue its traversal from the
 * paging structure root.
 *
 * Return true if this function yielded and the iterator's traversal was reset.
 * Return false if a yield was not needed.
 */
static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
					     struct tdp_iter *iter, bool flush,
					     bool shared)
{
	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		rcu_read_unlock();

		if (flush)
			kvm_flush_remote_tlbs(kvm);

		if (shared)
			cond_resched_rwlock_read(&kvm->mmu_lock);
		else
			cond_resched_rwlock_write(&kvm->mmu_lock);

		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);

		tdp_iter_restart(iter);

		return true;
	}

	return false;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 *
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 *
 * If shared is true, this thread holds the MMU lock in read mode and must
 * account for the possibility that other threads are modifying the paging
 * structures concurrently. If shared is false, this thread should hold the
 * MMU lock in write mode.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush,
			  bool shared)
{
	struct tdp_iter iter;

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
retry:
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level.
		 */
		if ((iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		if (!shared) {
			tdp_mmu_set_spte(kvm, &iter, 0);
			flush = true;
		} else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
			/*
			 * The iter must explicitly re-read the SPTE because
			 * the atomic cmpxchg failed.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
			goto retry;
		}
	}

	rcu_read_unlock();
	return flush;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 *
 * If shared is true, this thread holds the MMU lock in read mode and must
 * account for the possibility that other threads are modifying the paging
 * structures concurrently. If shared is false, this thread should hold the
 * MMU lock in write mode.
 */
bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
				 gfn_t end, bool can_yield, bool flush,
				 bool shared)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
				      shared);

	return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	bool flush = false;
	int i;

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
						  flush, false);

	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

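/*
 * Returns the next root after @prev_root (or the first root if @prev_root is
 * NULL) that is still marked invalid and has an elevated refcount, i.e. a
 * root whose teardown is owned by kvm_tdp_mmu_zap_invalidated_roots(). Unlike
 * tdp_mmu_next_root(), no new reference is taken here; the caller relies on
 * the reference taken when the root was invalidated.
 */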
static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
						  struct kvm_mmu_page *prev_root)
{
	struct kvm_mmu_page *next_root;

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root && !(next_root->role.invalid &&
			      refcount_read(&next_root->tdp_mmu_root_count)))
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &next_root->link,
						  typeof(*next_root), link);

	return next_root;
}

/*
 * Since kvm_tdp_mmu_invalidate_all_roots() has acquired a reference to each
 * invalidated root, they will not be freed until this function drops the
 * reference. Before dropping that reference, tear down the paging
 * structure so that whichever thread does drop the last reference
 * only has to do a trivial amount of work. Since the roots are invalid,
 * no new SPTEs should be created under them.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	struct kvm_mmu_page *next_root;
	struct kvm_mmu_page *root;
	bool flush = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	rcu_read_lock();

	root = next_invalidated_root(kvm, NULL);

	while (root) {
		next_root = next_invalidated_root(kvm, root);

		rcu_read_unlock();

		flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush,
				      true);

		/*
		 * Put the reference acquired in
		 * kvm_tdp_mmu_invalidate_all_roots().
		 */
		kvm_tdp_mmu_put_root(kvm, root, true);

		root = next_root;

		rcu_read_lock();
	}

	rcu_read_unlock();

	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

/*
 * Mark each TDP MMU root as invalid so that other threads
 * will drop their references and allow the root count to
 * go to 0.
 *
 * Also take a reference on all roots so that this thread
 * can do the bulk of the work required to free the roots
 * once they are invalidated. Without this reference, a
 * vCPU thread might drop the last reference to a root and
 * get stuck with tearing down the entire paging structure.
 *
 * Roots which have a zero refcount should be skipped as
 * they're already being torn down.
 * Already invalid roots should be referenced again so that
 * they aren't freed before kvm_tdp_mmu_zap_invalidated_roots()
 * is done with them.
 *
 * This has essentially the same effect for the TDP MMU
 * as updating mmu_valid_gen does for the shadow MMU.
 */
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
		if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
			root->role.invalid = true;
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
					   int map_writable,
					   struct tdp_iter *iter,
					   kvm_pfn_t pfn, bool prefault)
{
	u64 new_spte;
	int ret = 0;
	int make_spte_ret = 0;

	if (unlikely(is_noslot_pfn(pfn)))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
					  pfn, iter->old_spte, prefault, true,
					  map_writable, !shadow_accessed_mask,
					  &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
		if (write)
			ret = RET_PF_EMULATE;
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
	}

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte))) {
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else {
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	if (!prefault)
		vcpu->stat.pf_fixed++;

	return ret;
}

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
		    int map_writable, int max_level, kvm_pfn_t pfn,
		    bool prefault)
{
	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
	bool write = error_code & PFERR_WRITE_MASK;
	bool exec = error_code & PFERR_FETCH_MASK;
	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	u64 *child_pt;
	u64 new_spte;
	int ret;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int level;
	int req_level;

	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;
	if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;

	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
					huge_page_disallowed, &req_level);

	trace_kvm_mmu_spte_requested(gpa, level, pfn);

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		if (nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(iter.old_spte, gfn,
						   iter.level, &pfn, &level);

		if (iter.level == level)
			break;

		/*
		 * If there is an SPTE mapping a large page at a higher level
		 * than the target, that SPTE must be cleared and replaced
		 * with a non-leaf SPTE.
		 */
		if (is_shadow_present_pte(iter.old_spte) &&
		    is_large_pte(iter.old_spte)) {
			if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
				break;

			/*
			 * The iter must explicitly re-read the spte here
			 * because the new value informs the !present
			 * path below.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
		}

		if (!is_shadow_present_pte(iter.old_spte)) {
			/*
			 * If the SPTE has been frozen by another thread, just
			 * give up and retry, avoiding unnecessary page table
			 * allocation and free.
			 */
			if (is_removed_spte(iter.old_spte))
				break;

			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
			child_pt = sp->spt;

			new_spte = make_nonleaf_spte(child_pt,
						     !shadow_accessed_mask);

			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
						    new_spte)) {
				tdp_mmu_link_page(vcpu->kvm, sp, true,
						  huge_page_disallowed &&
						  req_level >= iter.level);

				trace_kvm_mmu_get_page(sp, true);
			} else {
				tdp_mmu_free_sp(sp);
				break;
			}
		}
	}

	if (iter.level != level) {
		rcu_read_unlock();
		return RET_PF_RETRY;
	}

	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
					      pfn, prefault);
	rcu_read_unlock();

	return ret;
}

bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
		flush |= zap_gfn_range(kvm, root, range->start, range->end,
				       range->may_block, flush, false);

	return flush;
}

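/*
 * The handlers below service the range-based MMU notifiers. A tdp_handler_t
 * is invoked for every leaf SPTE mapping a GFN in the notifier range;
 * kvm_tdp_mmu_handle_gfn() performs the iteration over roots and SPTEs.
 */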
typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
			      struct kvm_gfn_range *range);

static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
						   struct kvm_gfn_range *range,
						   tdp_handler_t handler)
{
	struct kvm_mmu_page *root;
	struct tdp_iter iter;
	bool ret = false;

	rcu_read_lock();

	/*
	 * Don't support rescheduling, none of the MMU notifiers that funnel
	 * into this helper allow blocking; it'd be dead, wasteful code.
	 */
	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
			ret |= handler(kvm, &iter, range);
	}

	rcu_read_unlock();

	return ret;
}

/*
 * Mark the SPTEs in the range of GFNs [start, end) unaccessed and return
 * non-zero if any of the GFNs in the range have been accessed.
 */
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
			  struct kvm_gfn_range *range)
{
	u64 new_spte = 0;

	/* If we have a non-accessed entry we don't need to change the pte. */
	if (!is_accessed_spte(iter->old_spte))
		return false;

	new_spte = iter->old_spte;

	if (spte_ad_enabled(new_spte)) {
		new_spte &= ~shadow_accessed_mask;
	} else {
		/*
		 * Capture the dirty status of the page, so that it doesn't get
		 * lost when the SPTE is marked for access tracking.
		 */
		if (is_writable_pte(new_spte))
			kvm_set_pfn_dirty(spte_to_pfn(new_spte));

		new_spte = mark_spte_for_access_track(new_spte);
	}

	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);

	return true;
}

bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}

static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	return is_accessed_spte(iter->old_spte);
}

bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}

static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	u64 new_spte;

	/* Huge pages aren't expected to be modified without first being zapped. */
	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);

	if (iter->level != PG_LEVEL_4K ||
	    !is_shadow_present_pte(iter->old_spte))
		return false;

	/*
	 * Note, when changing a read-only SPTE, it's not strictly necessary to
	 * zero the SPTE before setting the new PFN, but doing so preserves the
	 * invariant that the PFN of a present leaf SPTE can never change.
	 * See __handle_changed_spte().
	 */
	tdp_mmu_set_spte(kvm, iter, 0);

	if (!pte_write(range->pte)) {
		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
								  pte_pfn(range->pte));

		tdp_mmu_set_spte(kvm, iter, new_spte);
	}

	return true;
}

/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * range->pte holds the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
 */
bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);

	/* FIXME: return 'flush' instead of flushing here. */
	if (flush)
		kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);

	return false;
}

/*
 * Remove write access from all the leaf SPTEs mapping GFNs [start, end) at
 * levels >= min_level. Returns true if an SPTE has been changed and the TLBs
 * need to be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
							  new_spte)) {
			/*
			 * The iter must explicitly re-read the SPTE because
			 * the atomic cmpxchg failed.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
			goto retry;
		}
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
			     int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
			     slot->base_gfn + slot->npages, min_level);

	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
							  new_spte)) {
			/*
			 * The iter must explicitly re-read the SPTE because
			 * the atomic cmpxchg failed.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
			goto retry;
		}
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
				slot->base_gfn + slot->npages);

	return spte_set;
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	struct tdp_iter iter;
	u64 new_spte;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				   gfn + BITS_PER_LONG) {
		if (!mask)
			break;

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		mask &= ~(1UL << (iter.gfn - gfn));

		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
	}

	rcu_read_unlock();
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}

/*
 * Clear leaf entries which could be replaced by large mappings, for
 * GFNs within the slot.
 */
static bool zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       const struct kvm_memory_slot *slot,
				       bool flush)
{
	gfn_t start = slot->base_gfn;
	gfn_t end = start + slot->npages;
	struct tdp_iter iter;
	kvm_pfn_t pfn;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		pfn = spte_to_pfn(iter.old_spte);
		if (kvm_is_reserved_pfn(pfn) ||
		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
							    pfn, PG_LEVEL_NUM))
			continue;

		if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
			/*
			 * The iter must explicitly re-read the SPTE because
			 * the atomic cmpxchg failed.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
			goto retry;
		}
		flush = true;
	}

	rcu_read_unlock();

	return flush;
}

/*
 * Clear non-leaf entries (and free associated page tables) which could
 * be replaced by large mappings, for GFNs within the slot.
 */
bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       const struct kvm_memory_slot *slot,
				       bool flush)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		flush = zap_collapsible_spte_range(kvm, root, slot, flush);

	return flush;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
		if (!is_writable_pte(iter.old_spte))
			break;

		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);

		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();

	return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		spte_set |= write_protect_gfn(kvm, root, gfn);

	return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->shadow_root_level;

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf] = iter.old_spte;
	}

	rcu_read_unlock();

	return leaf;
}