// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

static bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
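
/*
 * Illustrative note, not part of the original file: because the knob is
 * registered with module_param_named() in the kvm module, it is normally
 * controlled as kvm.tdp_mmu=N on the kernel command line or via
 * /sys/module/kvm/parameters/tdp_mmu. kvm_mmu_init_tdp_mmu() below samples
 * the value with READ_ONCE() at VM creation, so a runtime write is assumed
 * to affect only VMs created afterwards.
 */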

/* Initializes the TDP MMU for the VM, if enabled. */
bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
		return false;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;

	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);

	return true;
}

static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.
	 */
	rcu_barrier();
}

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush,
			  bool shared);

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
			  bool shared)
{
	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	WARN_ON(!root->tdp_mmu_page);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

	zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);

	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/*
 * Finds the next valid root after root (or the first valid root if root
 * is NULL), takes a reference on it, and returns that next root. If root
 * is not NULL, this thread should have already taken a reference on it, and
 * that reference will be dropped. If no valid root is found, this
 * function will return NULL.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
					      struct kvm_mmu_page *prev_root,
					      bool shared)
{
	struct kvm_mmu_page *next_root;

	rcu_read_lock();

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
				&next_root->link, typeof(*next_root), link);

	rcu_read_unlock();

	if (prev_root)
		kvm_tdp_mmu_put_root(kvm, prev_root, shared);

	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode. In the unlikely event that this thread must free a root, the lock
 * will be temporarily dropped and reacquired in write mode.
 */
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared);		\
	     _root;							\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared))		\
		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else

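/*
 * Illustrative sketch, not part of the original file (it mirrors
 * __kvm_tdp_mmu_zap_gfn_range() later in this file): a typical caller walks
 * every root of one address space and may yield between iterations, e.g.
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
 *		flush = zap_gfn_range(kvm, root, start, end, true, flush, false);
 *
 * The trailing if/else in the macro skips roots belonging to a different
 * address space without terminating the loop.
 */
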
#define for_each_tdp_mmu_root(_kvm, _root, _as_id)				\
	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link,	\
				lockdep_is_held_type(&kvm->mmu_lock, 0) ||	\
				lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock))	\
		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else

static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
						   int level)
{
	union kvm_mmu_page_role role;

	role = vcpu->arch.mmu->mmu_role.base;
	role.level = level;
	role.direct = true;
	role.gpte_is_8_bytes = true;
	role.access = ACC_ALL;
	role.ad_disabled = !shadow_accessed_mask;

	return role;
}

static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
					       int level)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role.word = page_role_for_level(vcpu, level).word;
	sp->gfn = gfn;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);

	return sp;
}

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);

	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

	/* Check for an existing root before allocating a new one. */
	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
		if (root->role.word == role.word &&
		    kvm_tdp_mmu_get_root(kvm, root))
			goto out;
	}

	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
	refcount_set(&root->tdp_mmu_root_count, 1);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

out:
	return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(kvm, slot, gfn);
	}
}

/**
 * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the new page
 * @account_nx: This page replaces a NX large page and should be marked for
 *		eventual reclaim.
 */
static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool account_nx)
{
	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
	if (account_nx)
		account_huge_nx_page(kvm, sp);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 */
static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				bool shared)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_del(&sp->link);
	if (sp->lpage_disallowed)
		unaccount_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
					bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	u64 old_child_spte;
	u64 *sptep;
	gfn_t gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_page(kvm, sp, shared);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		sptep = rcu_dereference(pt) + i;
		gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so
			 * keep retrying the xchg until the old value is
			 * something other than the removed SPTE value.
			 */
			for (;;) {
				old_child_spte = xchg(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_child_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_child_spte = READ_ONCE(*sptep);
			if (!is_shadow_present_pte(old_child_spte))
				continue;

			/*
			 * Marking the SPTE as a removed SPTE is not
			 * strictly necessary here as the MMU lock will
			 * stop other threads from concurrently modifying
			 * this SPTE. Using the removed SPTE value keeps
			 * the two branches consistent and simplifies
			 * the function.
			 */
			WRITE_ONCE(*sptep, REMOVED_SPTE);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_child_spte, REMOVED_SPTE, level,
				    shared);
	}

	kvm_flush_remote_tlbs_with_address(kvm, gfn,
					   KVM_PAGES_PER_HPAGE(level + 1));

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/**
 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level,
				  bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or removed SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) &&
			    !is_mmio_spte(new_spte) &&
			    !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (is_leaf != was_leaf)
		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.
	 */
	if (was_present && !was_leaf && (pfn_changed || !is_present))
		handle_removed_tdp_mmu_page(kvm,
				spte_to_child_pt(old_spte, level), shared);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
			      shared);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping. Do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Returns: true if the SPTE was set, false if it was not. If false is returned,
 *	    this function will have no side-effects.
 */
static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
					   struct tdp_iter *iter,
					   u64 new_spte)
{
	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Do not change removed SPTEs. Only the thread that froze the SPTE
	 * may modify it.
	 */
	if (is_removed_spte(iter->old_spte))
		return false;

	/*
	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
	 * does not hold the mmu_lock.
	 */
	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
		      new_spte) != iter->old_spte)
		return false;

	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, true);
	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);

	return true;
}
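
/*
 * Illustrative sketch, not part of the original file: callers below (e.g.
 * zap_gfn_range() and wrprot_gfn_range()) handle a false return from
 * tdp_mmu_set_spte_atomic() by re-reading the SPTE and retrying:
 *
 *	if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) {
 *		iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
 *		goto retry;
 *	}
 */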

static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					   struct tdp_iter *iter)
{
	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value. This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
		return false;

	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
					   KVM_PAGES_PER_HPAGE(iter->level));

	/*
	 * No other thread can overwrite the removed SPTE as they
	 * must either wait on the MMU lock or use
	 * tdp_mmu_set_spte_atomic which will not overwrite the
	 * special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present
	 * to non-present.
	 */
	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);

	return true;
}


/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *		      of the page. Should be set unless handling an MMU
 *		      notifier for access tracking. Leaving record_acc_track
 *		      unset in that case prevents page accesses from being
 *		      double counted.
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *		      appropriate for the change being made. Should be set
 *		      unless performing certain dirty logging operations.
 *		      Leaving record_dirty_log unset in that case prevents page
 *		      writes from being double counted.
 */
static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				      u64 new_spte, bool record_acc_track,
				      bool record_dirty_log)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used. If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
	 */
	WARN_ON(is_removed_spte(iter->old_spte));

	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);

	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, false);
	if (record_acc_track)
		handle_changed_spte_acc_track(iter->old_spte, new_spte,
					      iter->level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
					      iter->old_spte, new_spte,
					      iter->level);
}

static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)	\
		if (!is_shadow_present_pte(_iter.old_spte) ||	\
		    !is_last_spte(_iter.old_spte, _iter.level))	\
			continue;				\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
			 _mmu->shadow_root_level, _start, _end)

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, it will also reset the tdp_iter's walk over the
 * paging structure and the calling function should skip to the next
 * iteration to allow the iterator to continue its traversal from the
 * paging structure root.
 *
 * Return true if this function yielded and the iterator's traversal was reset.
 * Return false if a yield was not needed.
 */
static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
					     struct tdp_iter *iter, bool flush,
					     bool shared)
{
	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		rcu_read_unlock();

		if (flush)
			kvm_flush_remote_tlbs(kvm);

		if (shared)
			cond_resched_rwlock_read(&kvm->mmu_lock);
		else
			cond_resched_rwlock_write(&kvm->mmu_lock);

		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);

		tdp_iter_restart(iter);

		return true;
	}

	return false;
}
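
/*
 * Illustrative note, not part of the original file: callers such as
 * zap_gfn_range() below treat a true return as "the iterator was restarted",
 * so they clear any flush obligation the helper already discharged and
 * continue with the next iteration, e.g.
 *
 *	if (can_yield &&
 *	    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
 *		flush = false;
 *		continue;
 *	}
 */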

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 *
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 *
 * If shared is true, this thread holds the MMU lock in read mode and must
 * account for the possibility that other threads are modifying the paging
 * structures concurrently. If shared is false, this thread should hold the
 * MMU lock in write mode.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush,
			  bool shared)
{
	gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	bool zap_all = (start == 0 && end >= max_gfn_host);
	struct tdp_iter iter;

	/*
	 * No need to try to step down in the iterator when zapping all SPTEs,
	 * zapping the top-level non-leaf SPTEs will recurse on their children.
	 */
	int min_level = zap_all ? root->role.level : PG_LEVEL_4K;

	/*
	 * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
	 * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
	 * and so KVM will never install a SPTE for such addresses.
	 */
	end = min(end, max_gfn_host);

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
retry:
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level, except when zapping all SPTEs.
		 */
		if (!zap_all &&
		    (iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		if (!shared) {
			tdp_mmu_set_spte(kvm, &iter, 0);
			flush = true;
		} else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
			/*
			 * The iter must explicitly re-read the SPTE because
			 * the atomic cmpxchg failed.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
			goto retry;
		}
	}

	rcu_read_unlock();
	return flush;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
				 gfn_t end, bool can_yield, bool flush)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
				      false);

	return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	bool flush = false;
	int i;

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush);

	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
						  struct kvm_mmu_page *prev_root)
{
	struct kvm_mmu_page *next_root;

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root && !(next_root->role.invalid &&
			      refcount_read(&next_root->tdp_mmu_root_count)))
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &next_root->link,
						  typeof(*next_root), link);

	return next_root;
}

/*
 * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each
 * invalidated root, they will not be freed until this function drops the
 * reference. Before dropping that reference, tear down the paging
 * structure so that whichever thread does drop the last reference
 * only has to do a trivial amount of work. Since the roots are invalid,
 * no new SPTEs should be created under them.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *next_root;
	struct kvm_mmu_page *root;
	bool flush = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	rcu_read_lock();

	root = next_invalidated_root(kvm, NULL);

	while (root) {
		next_root = next_invalidated_root(kvm, root);

		rcu_read_unlock();

		flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true);

		/*
		 * Put the reference acquired in
		 * kvm_tdp_mmu_invalidate_all_roots().
		 */
		kvm_tdp_mmu_put_root(kvm, root, true);

		root = next_root;

		rcu_read_lock();
	}

	rcu_read_unlock();

	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

/*
 * Mark each TDP MMU root as invalid so that other threads
 * will drop their references and allow the root count to
 * go to 0.
 *
 * Also take a reference on all roots so that this thread
 * can do the bulk of the work required to free the roots
 * once they are invalidated. Without this reference, a
 * vCPU thread might drop the last reference to a root and
 * get stuck with tearing down the entire paging structure.
 *
 * Roots which have a zero refcount should be skipped as
 * they're already being torn down.
 * Already invalid roots should be referenced again so that
 * they aren't freed before kvm_tdp_mmu_zap_all_fast is
 * done with them.
 *
 * This has essentially the same effect for the TDP MMU
 * as updating mmu_valid_gen does for the shadow MMU.
 */
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
		if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
			root->role.invalid = true;
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
					   struct kvm_page_fault *fault,
					   struct tdp_iter *iter)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
	u64 new_spte;
	int ret = RET_PF_FIXED;
	bool wrprot = false;

	WARN_ON(sp->role.level != fault->goal_level);
	if (unlikely(!fault->slot))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
				   fault->pfn, iter->old_spte, fault->prefetch, true,
				   fault->map_writable, &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (wrprot) {
		if (fault->write)
			ret = RET_PF_EMULATE;
	}

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte))) {
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else {
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	/*
	 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
	 * consistent with legacy MMU behavior.
	 */
	if (ret != RET_PF_SPURIOUS)
		vcpu->stat.pf_fixed++;

	return ret;
}

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	u64 *child_pt;
	u64 new_spte;
	int ret;

	kvm_mmu_hugepage_adjust(vcpu, fault);

	trace_kvm_mmu_spte_requested(fault);

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
		if (fault->nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);

		if (iter.level == fault->goal_level)
			break;

		/*
		 * If there is an SPTE mapping a large page at a higher level
		 * than the target, that SPTE must be cleared and replaced
		 * with a non-leaf SPTE.
		 */
		if (is_shadow_present_pte(iter.old_spte) &&
		    is_large_pte(iter.old_spte)) {
			if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
				break;

			/*
			 * The iter must explicitly re-read the spte here
			 * because the new value informs the !present
			 * path below.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
		}

		if (!is_shadow_present_pte(iter.old_spte)) {
			/*
			 * If SPTE has been frozen by another thread, just
			 * give up and retry, avoiding unnecessary page table
			 * allocation and free.
			 */
			if (is_removed_spte(iter.old_spte))
				break;

			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1);
			child_pt = sp->spt;

			new_spte = make_nonleaf_spte(child_pt,
						     !shadow_accessed_mask);

			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) {
				tdp_mmu_link_page(vcpu->kvm, sp,
						  fault->huge_page_disallowed &&
						  fault->req_level >= iter.level);

				trace_kvm_mmu_get_page(sp, true);
			} else {
				tdp_mmu_free_sp(sp);
				break;
			}
		}
	}

	if (iter.level != fault->goal_level) {
		rcu_read_unlock();
		return RET_PF_RETRY;
	}

	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
	rcu_read_unlock();

	return ret;
}

bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
		flush |= zap_gfn_range(kvm, root, range->start, range->end,
				       range->may_block, flush, false);

	return flush;
}

typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
			      struct kvm_gfn_range *range);

static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
						   struct kvm_gfn_range *range,
						   tdp_handler_t handler)
{
	struct kvm_mmu_page *root;
	struct tdp_iter iter;
	bool ret = false;

	rcu_read_lock();

	/*
	 * Don't support rescheduling, none of the MMU notifiers that funnel
	 * into this helper allow blocking; it'd be dead, wasteful code.
	 */
	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
			ret |= handler(kvm, &iter, range);
	}

	rcu_read_unlock();

	return ret;
}

/*
 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
 * if any of the GFNs in the range have been accessed.
 */
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
			  struct kvm_gfn_range *range)
{
	u64 new_spte = 0;

	/* If we have a non-accessed entry we don't need to change the pte. */
	if (!is_accessed_spte(iter->old_spte))
		return false;

	new_spte = iter->old_spte;

	if (spte_ad_enabled(new_spte)) {
		new_spte &= ~shadow_accessed_mask;
	} else {
		/*
		 * Capture the dirty status of the page, so that it doesn't get
		 * lost when the SPTE is marked for access tracking.
		 */
		if (is_writable_pte(new_spte))
			kvm_set_pfn_dirty(spte_to_pfn(new_spte));

		new_spte = mark_spte_for_access_track(new_spte);
	}

	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);

	return true;
}

bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}

static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	return is_accessed_spte(iter->old_spte);
}

bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}

static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	u64 new_spte;

	/* Huge pages aren't expected to be modified without first being zapped. */
	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);

	if (iter->level != PG_LEVEL_4K ||
	    !is_shadow_present_pte(iter->old_spte))
		return false;

	/*
	 * Note, when changing a read-only SPTE, it's not strictly necessary to
	 * zero the SPTE before setting the new PFN, but doing so preserves the
	 * invariant that the PFN of a present leaf SPTE can never change.
	 * See __handle_changed_spte().
	 */
	tdp_mmu_set_spte(kvm, iter, 0);

	if (!pte_write(range->pte)) {
		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
								  pte_pfn(range->pte));

		tdp_mmu_set_spte(kvm, iter, new_spte);
	}

	return true;
}

/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
 */
bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);

	/* FIXME: return 'flush' instead of flushing here. */
	if (flush)
		kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);

	return false;
}

/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) {
			/*
			 * The iter must explicitly re-read the SPTE because
			 * the atomic cmpxchg failed.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
			goto retry;
		}
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
			     const struct kvm_memory_slot *slot, int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
			     slot->base_gfn + slot->npages, min_level);

	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) {
			/*
			 * The iter must explicitly re-read the SPTE because
			 * the atomic cmpxchg failed.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
			goto retry;
		}
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
				  const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
				  slot->base_gfn + slot->npages);

	return spte_set;
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	struct tdp_iter iter;
	u64 new_spte;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				   gfn + BITS_PER_LONG) {
		if (!mask)
			break;

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		mask &= ~(1UL << (iter.gfn - gfn));

		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
	}

	rcu_read_unlock();
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}

/*
 * Clear leaf entries which could be replaced by large mappings, for
 * GFNs within the slot.
 */
static bool zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       const struct kvm_memory_slot *slot,
				       bool flush)
{
	gfn_t start = slot->base_gfn;
	gfn_t end = start + slot->npages;
	struct tdp_iter iter;
	kvm_pfn_t pfn;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		pfn = spte_to_pfn(iter.old_spte);
		if (kvm_is_reserved_pfn(pfn) ||
		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
							    pfn, PG_LEVEL_NUM))
			continue;

		if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
			/*
			 * The iter must explicitly re-read the SPTE because
			 * the atomic cmpxchg failed.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
			goto retry;
		}
		flush = true;
	}

	rcu_read_unlock();

	return flush;
}

/*
 * Clear non-leaf entries (and free associated page tables) which could
 * be replaced by large mappings, for GFNs within the slot.
 */
bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       const struct kvm_memory_slot *slot,
				       bool flush)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		flush = zap_collapsible_spte_range(kvm, root, slot, flush);

	return flush;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, gfn, gfn + 1) {
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		if (!is_writable_pte(iter.old_spte))
			break;

		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);

		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();

	return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn,
				   int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);

	return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 *
 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->shadow_root_level;

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf] = iter.old_spte;
	}

	return leaf;
}
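
/*
 * Illustrative sketch, not part of the original file: per the comment above,
 * a lockless walk is expected to be bracketed roughly as
 *
 *	kvm_tdp_mmu_walk_lockless_begin();
 *	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
 *	kvm_tdp_mmu_walk_lockless_end();
 *
 * with the results consumed before the walk is ended.
 */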

/*
 * Returns the last level spte pointer of the shadow page walk for the given
 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
 * walk could be performed, returns NULL and *spte does not contain valid data.
 *
 * Contract:
 *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
 *
 * WARNING: This function is only intended to be called during fast_page_fault.
 */
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
					u64 *spte)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	tdp_ptep_t sptep = NULL;

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		*spte = iter.old_spte;
		sptep = iter.sptep;
	}

	/*
	 * Perform the rcu_dereference to get the raw spte pointer value since
	 * we are passing it up to fast_page_fault, which is shared with the
	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
	 * annotation.
	 *
	 * This is safe since fast_page_fault obeys the contracts of this
	 * function as well as all TDP MMU contracts around modifying SPTEs
	 * outside of mmu_lock.
	 */
	return rcu_dereference(sptep);
}