KVM: x86/mmu: Move shadow-present check out of spte_has_volatile_bits()
[linux-block.git] / arch / x86 / kvm / mmu / tdp_mmu.c
fe5db27d
BG
1// SPDX-License-Identifier: GPL-2.0
2
02c00b3a
BG
3#include "mmu.h"
4#include "mmu_internal.h"
bb18842e 5#include "mmutrace.h"
2f2fad08 6#include "tdp_iter.h"
fe5db27d 7#include "tdp_mmu.h"
02c00b3a 8#include "spte.h"
fe5db27d 9
9a77daac 10#include <asm/cmpxchg.h>
33dd3574
BG
11#include <trace/events/kvm.h>
12
71ba3f31 13static bool __read_mostly tdp_mmu_enabled = true;
95fb5b02 14module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
fe5db27d
BG
15
16/* Initializes the TDP MMU for the VM, if enabled. */
a1a39128 17int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
fe5db27d 18{
a1a39128
PB
19 struct workqueue_struct *wq;
20
897218ff 21 if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
a1a39128
PB
22 return 0;
23
24 wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
25 if (!wq)
26 return -ENOMEM;
fe5db27d
BG
27
28 /* This should not be changed for the lifetime of the VM. */
29 kvm->arch.tdp_mmu_enabled = true;
02c00b3a 30 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
9a77daac 31 spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
89c0fd49 32 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
a1a39128
PB
33 kvm->arch.tdp_mmu_zap_wq = wq;
34 return 1;
fe5db27d
BG
35}
36
226b8c8f
SC
37/* Arbitrarily returns true so that this may be used in if statements. */
38static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
6103bc07
BG
39 bool shared)
40{
41 if (shared)
42 lockdep_assert_held_read(&kvm->mmu_lock);
43 else
44 lockdep_assert_held_write(&kvm->mmu_lock);
226b8c8f
SC
45
46 return true;
6103bc07
BG
47}
48
fe5db27d
BG
49void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
50{
51 if (!kvm->arch.tdp_mmu_enabled)
52 return;
02c00b3a 53
3203a56a 54 /* Also waits for any queued work items. */
22b94c4b
PB
55 destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
56
524a1e4e 57 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
02c00b3a 58 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
7cca2d0b
BG
59
60 /*
61 * Ensure that all the outstanding RCU callbacks to free shadow pages
22b94c4b
PB
62 * can run before the VM is torn down. Work items on tdp_mmu_zap_wq
63 * can call kvm_tdp_mmu_put_root and create new callbacks.
7cca2d0b
BG
64 */
65 rcu_barrier();
02c00b3a
BG
66}
67
2bdb3d84 68static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
a889ea54 69{
2bdb3d84
BG
70 free_page((unsigned long)sp->spt);
71 kmem_cache_free(mmu_page_header_cache, sp);
a889ea54
BG
72}
73
c0e64238
BG
74/*
75 * This is called through call_rcu in order to free TDP page table memory
76 * safely with respect to other kernel threads that may be operating on
77 * the memory.
78 * Because TDP MMU page table memory is only accessed in RCU read-side
79 * critical sections, and is freed only after a grace period has elapsed,
80 * lockless walkers can never dereference it after it has been freed.
81 */
82static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
a889ea54 83{
c0e64238
BG
84 struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
85 rcu_head);
a889ea54 86
c0e64238
BG
87 tdp_mmu_free_sp(sp);
88}
a889ea54 89
e2b5b21d
SC
90static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
91 bool shared);
92
22b94c4b
PB
93static void tdp_mmu_zap_root_work(struct work_struct *work)
94{
95 struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
96 tdp_mmu_async_work);
97 struct kvm *kvm = root->tdp_mmu_async_data;
98
99 read_lock(&kvm->mmu_lock);
100
101 /*
102 * A TLB flush is not necessary as KVM performs a local TLB flush when
103 * allocating a new root (see kvm_mmu_load()), and when a vCPU is migrated
104 * to a different pCPU. Note, the local TLB flush on reuse also
105 * invalidates any paging-structure-cache entries, i.e. TLB entries for
106 * intermediate paging structures, that may be zapped, as such entries
107 * are associated with the ASID on both VMX and SVM.
108 */
109 tdp_mmu_zap_root(kvm, root, true);
110
111 /*
112 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
113 * avoiding an infinite loop. By design, the root is reachable while
114 * it's being asynchronously zapped, thus a different task can put its
115 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
116 * asynchronously zapped root is unavoidable.
117 */
118 kvm_tdp_mmu_put_root(kvm, root, true);
119
120 read_unlock(&kvm->mmu_lock);
121}
122
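/*
 * Defer zapping an invalidated root to the per-VM zap workqueue. The worker,
 * tdp_mmu_zap_root_work(), zaps the root under mmu_lock held for read and then
 * drops the reference that was gifted to it by the caller.
 */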
123static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
124{
125 root->tdp_mmu_async_data = kvm;
126 INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
127 queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
128}
129
8351779c
PB
130static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
131{
132 union kvm_mmu_page_role role = page->role;
133 role.invalid = true;
134
135 /* No need to use cmpxchg, only the invalid bit can change. */
136 role.word = xchg(&page->role.word, role.word);
137 return role.invalid;
138}
139
6103bc07
BG
140void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
141 bool shared)
2bdb3d84 142{
6103bc07 143 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
a889ea54 144
11cccf5c 145 if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
2bdb3d84
BG
146 return;
147
148 WARN_ON(!root->tdp_mmu_page);
149
db01416b 150 /*
8351779c
PB
151 * The root now has refcount=0. It is valid, but readers already
152 * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
153 * rejects it. This remains true for the rest of the execution
154 * of this function, because readers visit valid roots only
155 * (except for tdp_mmu_zap_root_work(), which however
156 * does not acquire any reference itself).
157 *
158 * Even though there are flows that need to visit all roots for
159 * correctness, they all take mmu_lock for write, so they cannot yet
160 * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
161 * since the root still has refcount=0.
162 *
163 * However, tdp_mmu_zap_root can yield, and writers do not expect to
164 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
165 * So the root temporarily gets an extra reference, going to refcount=1
166 * while staying invalid. Readers still cannot acquire any reference;
167 * but writers are now allowed to run if tdp_mmu_zap_root yields and
efd995da
PB
168 * they might take an extra reference if they themselves yield.
169 * Therefore, when the reference is given back by the worker,
8351779c
PB
170 * there is no guarantee that the refcount is still 1. If not, whoever
171 * puts the last reference will free the page, but they will not have to
172 * zap the root because a root cannot go from invalid to valid.
db01416b 173 */
8351779c
PB
174 if (!kvm_tdp_root_mark_invalid(root)) {
175 refcount_set(&root->tdp_mmu_root_count, 1);
8351779c
PB
176
177 /*
efd995da
PB
178 * Zapping the root in a worker is not just "nice to have";
179 * it is required because kvm_tdp_mmu_invalidate_all_roots()
180 * skips already-invalid roots. If kvm_tdp_mmu_put_root() did
181 * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
182 * might return with some roots not zapped yet.
8351779c 183 */
efd995da
PB
184 tdp_mmu_schedule_zap_root(kvm, root);
185 return;
8351779c 186 }
2bdb3d84 187
8351779c
PB
188 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
189 list_del_rcu(&root->link);
190 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
c0e64238 191 call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
a889ea54
BG
192}
193
cfc10997 194/*
d62007ed
SC
195 * Returns the next root after @prev_root (or the first root if @prev_root is
196 * NULL). A reference to the returned root is acquired, and the reference to
197 * @prev_root is released (the caller obviously must hold a reference to
198 * @prev_root if it's non-NULL).
199 *
200 * If @only_valid is true, invalid roots are skipped.
201 *
202 * Returns NULL if the end of tdp_mmu_roots was reached.
cfc10997
BG
203 */
204static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
6103bc07 205 struct kvm_mmu_page *prev_root,
d62007ed 206 bool shared, bool only_valid)
a889ea54
BG
207{
208 struct kvm_mmu_page *next_root;
209
c0e64238
BG
210 rcu_read_lock();
211
cfc10997 212 if (prev_root)
c0e64238
BG
213 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
214 &prev_root->link,
215 typeof(*prev_root), link);
cfc10997 216 else
c0e64238
BG
217 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
218 typeof(*next_root), link);
a889ea54 219
04dc4e6c 220 while (next_root) {
d62007ed 221 if ((!only_valid || !next_root->role.invalid) &&
ad6d6b94 222 kvm_tdp_mmu_get_root(next_root))
04dc4e6c
SC
223 break;
224
c0e64238
BG
225 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
226 &next_root->link, typeof(*next_root), link);
04dc4e6c 227 }
fb101293 228
c0e64238 229 rcu_read_unlock();
a889ea54 230
cfc10997 231 if (prev_root)
6103bc07 232 kvm_tdp_mmu_put_root(kvm, prev_root, shared);
a889ea54 233
a889ea54
BG
234 return next_root;
235}
236
237/*
238 * Note: this iterator gets and puts references to the roots it iterates over.
239 * This makes it safe to release the MMU lock and yield within the loop, but
240 * if exiting the loop early, the caller must drop the reference to the most
241 * recent root. (Unless keeping a live reference is desirable.)
6103bc07
BG
242 *
243 * If shared is set, this function is operating under the MMU lock in read
244 * mode. In the unlikely event that this thread drops the last reference to a
245 * root, the root is zapped and freed asynchronously; the lock is never dropped.
a889ea54 246 */
d62007ed
SC
247#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
248 for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \
249 _root; \
250 _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \
614f6970
PB
251 if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \
252 kvm_mmu_page_as_id(_root) != _as_id) { \
a3f15bda 253 } else
a889ea54 254
d62007ed
SC
255#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
256 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
257
614f6970
PB
258#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
259 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
d62007ed 260
226b8c8f
SC
261/*
262 * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
263 * the implication being that any flow that holds mmu_lock for read is
264 * inherently yield-friendly and should use the yield-safe variant above.
265 * Holding mmu_lock for write obviates the need for RCU protection as the list
266 * is guaranteed to be stable.
267 */
268#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
269 list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
270 if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
271 kvm_mmu_page_as_id(_root) != _as_id) { \
a3f15bda 272 } else
02c00b3a 273
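/*
 * Allocate the struct kvm_mmu_page and its page table from the vCPU's memory
 * caches; initialization is done separately via tdp_mmu_init_sp() or
 * tdp_mmu_init_child_sp().
 */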
a82070b6 274static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
02c00b3a
BG
275{
276 struct kvm_mmu_page *sp;
277
278 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
279 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
a82070b6
DM
280
281 return sp;
282}
283
c10743a1
SC
284static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
285 gfn_t gfn, union kvm_mmu_page_role role)
a82070b6 286{
02c00b3a
BG
287 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
288
a3aca4de 289 sp->role = role;
02c00b3a 290 sp->gfn = gfn;
c10743a1 291 sp->ptep = sptep;
02c00b3a
BG
292 sp->tdp_mmu_page = true;
293
33dd3574 294 trace_kvm_mmu_get_page(sp, true);
02c00b3a
BG
295}
296
a82070b6
DM
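/*
 * Initialize a child shadow page for the iterator's current position, using
 * the parent's role with the level decremented by one.
 */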
297static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
298 struct tdp_iter *iter)
02c00b3a 299{
a3aca4de 300 struct kvm_mmu_page *parent_sp;
02c00b3a 301 union kvm_mmu_page_role role;
a3aca4de
DM
302
303 parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
304
305 role = parent_sp->role;
306 role.level--;
307
c10743a1 308 tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
a3aca4de
DM
309}
310
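/*
 * Get a reference to an existing TDP MMU root that matches the vCPU's MMU
 * role, or allocate and publish a new one, and return the root's physical
 * address for the vCPU to load.
 */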
311hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
312{
313 union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base;
02c00b3a
BG
314 struct kvm *kvm = vcpu->kvm;
315 struct kvm_mmu_page *root;
316
6e6ec584 317 lockdep_assert_held_write(&kvm->mmu_lock);
02c00b3a 318
04dc4e6c
SC
319 /*
320 * Check for an existing root before allocating a new one. Note, the
321 * role check prevents consuming an invalid root.
322 */
a3f15bda 323 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
fb101293 324 if (root->role.word == role.word &&
ad6d6b94 325 kvm_tdp_mmu_get_root(root))
6e6ec584 326 goto out;
02c00b3a
BG
327 }
328
a82070b6 329 root = tdp_mmu_alloc_sp(vcpu);
c10743a1 330 tdp_mmu_init_sp(root, NULL, 0, role);
a82070b6 331
11cccf5c 332 refcount_set(&root->tdp_mmu_root_count, 1);
02c00b3a 333
c0e64238
BG
334 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
335 list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
336 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
02c00b3a 337
6e6ec584 338out:
02c00b3a 339 return __pa(root->spt);
fe5db27d 340}
2f2fad08
BG
341
342static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
9a77daac
BG
343 u64 old_spte, u64 new_spte, int level,
344 bool shared);
2f2fad08 345
f8e14497
BG
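/*
 * Propagate accessed-bit state to the primary MMU: if a previously accessed
 * leaf SPTE is made non-present, loses its accessed bit, or is remapped to a
 * different PFN, mark the old PFN as accessed.
 */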
346static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
347{
f8e14497
BG
348 if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
349 return;
350
351 if (is_accessed_spte(old_spte) &&
64bb2769
SC
352 (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
353 spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
f8e14497
BG
354 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
355}
356
a6a0b05d
BG
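/*
 * Update the dirty bitmap when a 4K SPTE becomes writable, either because it
 * was newly made writable or because it was remapped to a different PFN.
 */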
357static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
358 u64 old_spte, u64 new_spte, int level)
359{
360 bool pfn_changed;
361 struct kvm_memory_slot *slot;
362
363 if (level > PG_LEVEL_4K)
364 return;
365
366 pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
367
368 if ((!is_writable_pte(old_spte) || pfn_changed) &&
369 is_writable_pte(new_spte)) {
370 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
fb04a1ed 371 mark_page_dirty_in_slot(kvm, slot, gfn);
a6a0b05d
BG
372 }
373}
374
a9442f59 375/**
c298a30c 376 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
a9442f59
BG
377 *
378 * @kvm: kvm instance
379 * @sp: the page to be removed
9a77daac
BG
380 * @shared: This operation may not be running under the exclusive use of
381 * the MMU lock and the operation must synchronize with other
382 * threads that might be adding or removing pages.
a9442f59 383 */
c298a30c
DM
384static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
385 bool shared)
a9442f59 386{
9a77daac
BG
387 if (shared)
388 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
389 else
390 lockdep_assert_held_write(&kvm->mmu_lock);
a9442f59
BG
391
392 list_del(&sp->link);
393 if (sp->lpage_disallowed)
394 unaccount_huge_nx_page(kvm, sp);
9a77daac
BG
395
396 if (shared)
397 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
a9442f59
BG
398}
399
a066e61f 400/**
0f53dfa3 401 * handle_removed_pt() - handle a page table removed from the TDP structure
a066e61f
BG
402 *
403 * @kvm: kvm instance
404 * @pt: the page removed from the paging structure
9a77daac
BG
405 * @shared: This operation may not be running under the exclusive use
406 * of the MMU lock and the operation must synchronize with other
407 * threads that might be modifying SPTEs.
a066e61f
BG
408 *
409 * Given a page table that has been removed from the TDP paging structure,
410 * iterates through the page table to clear SPTEs and free child page tables.
70fb3e41
BG
411 *
412 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
413 * protection. Since this thread removed it from the paging structure,
414 * this thread will be responsible for ensuring the page is freed. Hence the
415 * early rcu_dereferences in the function.
a066e61f 416 */
0f53dfa3 417static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
a066e61f 418{
70fb3e41 419 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
a066e61f 420 int level = sp->role.level;
e25f0e0c 421 gfn_t base_gfn = sp->gfn;
a066e61f
BG
422 int i;
423
424 trace_kvm_mmu_prepare_zap_page(sp);
425
c298a30c 426 tdp_mmu_unlink_sp(kvm, sp, shared);
a066e61f
BG
427
428 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
574c3c55
BG
429 u64 *sptep = rcu_dereference(pt) + i;
430 gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
431 u64 old_child_spte;
9a77daac
BG
432
433 if (shared) {
e25f0e0c
BG
434 /*
435 * Set the SPTE to a nonpresent value that other
436 * threads will not overwrite. If the SPTE was
437 * already marked as removed then another thread
438 * handling a page fault could overwrite it, so
439 * keep writing the removed SPTE value until this
440 * thread's write replaces a non-removed value.
441 */
442 for (;;) {
443 old_child_spte = xchg(sptep, REMOVED_SPTE);
444 if (!is_removed_spte(old_child_spte))
445 break;
446 cpu_relax();
447 }
9a77daac 448 } else {
8df9f1af
SC
449 /*
450 * If the SPTE is not MMU-present, there is no backing
451 * page associated with the SPTE and so no side effects
452 * that need to be recorded, and exclusive ownership of
453 * mmu_lock ensures the SPTE can't be made present.
454 * Note, zapping MMIO SPTEs is also unnecessary as they
455 * are guarded by the memslots generation, not by being
456 * unreachable.
457 */
9a77daac 458 old_child_spte = READ_ONCE(*sptep);
8df9f1af
SC
459 if (!is_shadow_present_pte(old_child_spte))
460 continue;
e25f0e0c
BG
461
462 /*
463 * Marking the SPTE as a removed SPTE is not
464 * strictly necessary here as the MMU lock will
465 * stop other threads from concurrently modifying
466 * this SPTE. Using the removed SPTE value keeps
467 * the two branches consistent and simplifies
468 * the function.
469 */
470 WRITE_ONCE(*sptep, REMOVED_SPTE);
9a77daac 471 }
e25f0e0c 472 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
f1b83255 473 old_child_spte, REMOVED_SPTE, level,
e25f0e0c 474 shared);
a066e61f
BG
475 }
476
7cca2d0b 477 call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
a066e61f
BG
478}
479
2f2fad08 480/**
7f6231a3 481 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
2f2fad08
BG
482 * @kvm: kvm instance
483 * @as_id: the address space of the paging structure the SPTE was a part of
484 * @gfn: the base GFN that was mapped by the SPTE
485 * @old_spte: The value of the SPTE before the change
486 * @new_spte: The value of the SPTE after the change
487 * @level: the level of the PT the SPTE is part of in the paging structure
9a77daac
BG
488 * @shared: This operation may not be running under the exclusive use of
489 * the MMU lock and the operation must synchronize with other
490 * threads that might be modifying SPTEs.
2f2fad08
BG
491 *
492 * Handle bookkeeping that might result from the modification of a SPTE.
493 * This function must be called for all TDP SPTE modifications.
494 */
495static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
9a77daac
BG
496 u64 old_spte, u64 new_spte, int level,
497 bool shared)
2f2fad08
BG
498{
499 bool was_present = is_shadow_present_pte(old_spte);
500 bool is_present = is_shadow_present_pte(new_spte);
501 bool was_leaf = was_present && is_last_spte(old_spte, level);
502 bool is_leaf = is_present && is_last_spte(new_spte, level);
503 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
2f2fad08
BG
504
505 WARN_ON(level > PT64_ROOT_MAX_LEVEL);
506 WARN_ON(level < PG_LEVEL_4K);
764388ce 507 WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
2f2fad08
BG
508
509 /*
510 * If this warning were to trigger it would indicate that there was a
511 * missing MMU notifier or a race with some notifier handler.
512 * A present, leaf SPTE should never be directly replaced with another
d9f6e12f 513 * present leaf SPTE pointing to a different PFN. A notifier handler
2f2fad08
BG
514 * should be zapping the SPTE before the main MM's page table is
515 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
516 * thread before replacement.
517 */
518 if (was_leaf && is_leaf && pfn_changed) {
519 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
520 "SPTE with another present leaf SPTE mapping a\n"
521 "different PFN!\n"
522 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
523 as_id, gfn, old_spte, new_spte, level);
524
525 /*
526 * Crash the host to prevent error propagation and guest data
d9f6e12f 527 * corruption.
2f2fad08
BG
528 */
529 BUG();
530 }
531
532 if (old_spte == new_spte)
533 return;
534
b9a98c34
BG
535 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
536
115111ef
DM
537 if (is_leaf)
538 check_spte_writable_invariants(new_spte);
539
2f2fad08
BG
540 /*
541 * The only times a SPTE should be changed from a non-present to
542 * non-present state is when an MMIO entry is installed/modified/
543 * removed. In that case, there is nothing to do here.
544 */
545 if (!was_present && !is_present) {
546 /*
08f07c80
BG
547 * If this change does not involve a MMIO SPTE or removed SPTE,
548 * it is unexpected. Log the change, though it should not
549 * impact the guest since both the former and current SPTEs
550 * are nonpresent.
2f2fad08 551 */
08f07c80
BG
552 if (WARN_ON(!is_mmio_spte(old_spte) &&
553 !is_mmio_spte(new_spte) &&
554 !is_removed_spte(new_spte)))
2f2fad08
BG
555 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
556 "should not be replaced with another,\n"
557 "different nonpresent SPTE, unless one or both\n"
08f07c80
BG
558 "are MMIO SPTEs, or the new SPTE is\n"
559 "a temporary removed SPTE.\n"
2f2fad08
BG
560 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
561 as_id, gfn, old_spte, new_spte, level);
562 return;
563 }
564
71f51d2c
MZ
565 if (is_leaf != was_leaf)
566 kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
2f2fad08
BG
567
568 if (was_leaf && is_dirty_spte(old_spte) &&
64bb2769 569 (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
2f2fad08
BG
570 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
571
572 /*
573 * Recursively handle child PTs if the change removed a subtree from
c8e5a0d0
SC
574 * the paging structure. Note the WARN on the PFN changing without the
575 * SPTE being converted to a hugepage (leaf) or being zapped. Shadow
576 * pages are kernel allocations and should never be migrated.
2f2fad08 577 */
c8e5a0d0
SC
578 if (was_present && !was_leaf &&
579 (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
0f53dfa3 580 handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
2f2fad08
BG
581}
582
583static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
9a77daac
BG
584 u64 old_spte, u64 new_spte, int level,
585 bool shared)
2f2fad08 586{
9a77daac
BG
587 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
588 shared);
f8e14497 589 handle_changed_spte_acc_track(old_spte, new_spte, level);
a6a0b05d
BG
590 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
591 new_spte, level);
2f2fad08 592}
faaf05b0 593
9a77daac 594/*
6ccf4438
PB
595 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
596 * and handle the associated bookkeeping. Do not mark the page dirty
24ae4cfa 597 * in KVM's dirty bitmaps.
9a77daac 598 *
3255530a
DM
599 * If setting the SPTE fails because it has changed, iter->old_spte will be
600 * refreshed to the current value of the spte.
601 *
9a77daac
BG
602 * @kvm: kvm instance
603 * @iter: a tdp_iter instance currently on the SPTE that should be set
604 * @new_spte: The value the SPTE should be set to
3e72c791
DM
605 * Return:
606 * * 0 - If the SPTE was set.
607 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
608 * no side-effects other than setting iter->old_spte to the last
609 * known value of the spte.
9a77daac 610 */
3e72c791
DM
611static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
612 struct tdp_iter *iter,
613 u64 new_spte)
9a77daac 614{
3255530a
DM
615 u64 *sptep = rcu_dereference(iter->sptep);
616 u64 old_spte;
617
08f07c80 618 /*
396fd74d
SC
619 * The caller is responsible for ensuring the old SPTE is not a REMOVED
620 * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE,
621 * and pre-checking before inserting a new SPTE is advantageous as it
622 * avoids unnecessary work.
08f07c80 623 */
396fd74d
SC
624 WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
625
626 lockdep_assert_held_read(&kvm->mmu_lock);
08f07c80 627
6e8eb206
DM
628 /*
629 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
630 * does not hold the mmu_lock.
631 */
3255530a
DM
632 old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
633 if (old_spte != iter->old_spte) {
634 /*
635 * The page table entry was modified by a different logical
636 * CPU. Refresh iter->old_spte with the current value so the
637 * caller operates on fresh data, e.g. if it retries
638 * tdp_mmu_set_spte_atomic().
639 */
640 iter->old_spte = old_spte;
3e72c791 641 return -EBUSY;
3255530a 642 }
9a77daac 643
24ae4cfa
BG
644 __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
645 new_spte, iter->level, true);
646 handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
9a77daac 647
3e72c791 648 return 0;
9a77daac
BG
649}
650
3e72c791
DM
651static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
652 struct tdp_iter *iter)
08f07c80 653{
3e72c791
DM
654 int ret;
655
08f07c80
BG
656 /*
657 * Freeze the SPTE by setting it to a special,
658 * non-present value. This will stop other threads from
659 * immediately installing a present entry in its place
660 * before the TLBs are flushed.
661 */
3e72c791
DM
662 ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
663 if (ret)
664 return ret;
08f07c80
BG
665
666 kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
667 KVM_PAGES_PER_HPAGE(iter->level));
668
669 /*
670 * No other thread can overwrite the removed SPTE as they
671 * must either wait on the MMU lock or use
d9f6e12f 672 * tdp_mmu_set_spte_atomic which will not overwrite the
08f07c80
BG
673 * special removed SPTE value. No bookkeeping is needed
674 * here since the SPTE is going from non-present
675 * to non-present.
676 */
0e587aa7 677 kvm_tdp_mmu_write_spte(iter->sptep, 0);
08f07c80 678
3e72c791 679 return 0;
08f07c80
BG
680}
681
9a77daac 682
fe43fa2f
BG
683/*
684 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
626808d1
SC
685 * @kvm: KVM instance
686 * @as_id: Address space ID, i.e. regular vs. SMM
687 * @sptep: Pointer to the SPTE
688 * @old_spte: The current value of the SPTE
689 * @new_spte: The new value that will be set for the SPTE
690 * @gfn: The base GFN that was (or will be) mapped by the SPTE
691 * @level: The level _containing_ the SPTE (its parent PT's level)
fe43fa2f
BG
692 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
693 * of the page. Should be set unless handling an MMU
694 * notifier for access tracking. Leaving record_acc_track
695 * unset in that case prevents page accesses from being
696 * double counted.
697 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
698 * appropriate for the change being made. Should be set
699 * unless performing certain dirty logging operations.
700 * Leaving record_dirty_log unset in that case prevents page
701 * writes from being double counted.
702 */
626808d1
SC
703static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
704 u64 old_spte, u64 new_spte, gfn_t gfn, int level,
705 bool record_acc_track, bool record_dirty_log)
faaf05b0 706{
531810ca 707 lockdep_assert_held_write(&kvm->mmu_lock);
3a9a4aa5 708
08f07c80 709 /*
966da62a 710 * No thread should be using this function to set SPTEs to or from the
08f07c80
BG
711 * temporary removed SPTE value.
712 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
713 * should be used. If operating under the MMU lock in write mode, the
714 * use of the removed SPTE should not be necessary.
715 */
626808d1 716 WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
08f07c80 717
626808d1
SC
718 kvm_tdp_mmu_write_spte(sptep, new_spte);
719
720 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
f8e14497 721
f8e14497 722 if (record_acc_track)
626808d1 723 handle_changed_spte_acc_track(old_spte, new_spte, level);
a6a0b05d 724 if (record_dirty_log)
626808d1
SC
725 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
726 new_spte, level);
727}
728
729static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
730 u64 new_spte, bool record_acc_track,
731 bool record_dirty_log)
732{
733 WARN_ON_ONCE(iter->yielded);
734
735 __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte,
736 new_spte, iter->gfn, iter->level,
737 record_acc_track, record_dirty_log);
f8e14497
BG
738}
739
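/*
 * Wrappers around _tdp_mmu_set_spte() that record both accessed and dirty
 * state, skip only the accessed-state update, or skip only the dirty-log
 * update, respectively.
 */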
740static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
741 u64 new_spte)
742{
626808d1 743 _tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
f8e14497 744}
faaf05b0 745
f8e14497
BG
746static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
747 struct tdp_iter *iter,
748 u64 new_spte)
749{
626808d1 750 _tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
a6a0b05d
BG
751}
752
753static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
754 struct tdp_iter *iter,
755 u64 new_spte)
756{
626808d1 757 _tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
faaf05b0
BG
758}
759
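/*
 * Iteration helpers: walk every SPTE in a root, only the present leaf SPTEs
 * in a root, or the paging structure of the MMU's current root, over the GFN
 * range [_start, _end).
 */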
760#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
77aa6075 761 for_each_tdp_pte(_iter, _root, _start, _end)
faaf05b0 762
f8e14497
BG
763#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
764 tdp_root_for_each_pte(_iter, _root, _start, _end) \
765 if (!is_shadow_present_pte(_iter.old_spte) || \
766 !is_last_spte(_iter.old_spte, _iter.level)) \
767 continue; \
768 else
769
bb18842e 770#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
b9e5603c 771 for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
bb18842e 772
e28a436c
BG
773/*
774 * Yield if the MMU lock is contended or this thread needs to return control
775 * to the scheduler.
776 *
e139a34e
BG
777 * If this function should yield and flush is set, it will perform a remote
778 * TLB flush before yielding.
779 *
3a0f64de
SC
780 * If this function yields, iter->yielded is set and the caller must skip to
781 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
782 * over the paging structures to allow the iterator to continue its traversal
783 * from the paging structure root.
e28a436c 784 *
3a0f64de 785 * Returns true if this function yielded.
e28a436c 786 */
3a0f64de
SC
787static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
788 struct tdp_iter *iter,
789 bool flush, bool shared)
a6a0b05d 790{
3a0f64de
SC
791 WARN_ON(iter->yielded);
792
ed5e484b
BG
793 /* Ensure forward progress has been made before yielding. */
794 if (iter->next_last_level_gfn == iter->yielded_gfn)
795 return false;
796
531810ca 797 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
e139a34e
BG
798 if (flush)
799 kvm_flush_remote_tlbs(kvm);
800
bd296779
SC
801 rcu_read_unlock();
802
6103bc07
BG
803 if (shared)
804 cond_resched_rwlock_read(&kvm->mmu_lock);
805 else
806 cond_resched_rwlock_write(&kvm->mmu_lock);
807
7cca2d0b 808 rcu_read_lock();
ed5e484b
BG
809
810 WARN_ON(iter->gfn > iter->next_last_level_gfn);
811
3a0f64de 812 iter->yielded = true;
a6a0b05d 813 }
e28a436c 814
3a0f64de 815 return iter->yielded;
a6a0b05d
BG
816}
817
86931ff7 818static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
e2b5b21d
SC
819{
820 /*
86931ff7
SC
821 * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with
822 * a gpa range that would exceed the max gfn, and KVM does not create
823 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
824 * the slow emulation path every time.
e2b5b21d 825 */
86931ff7 826 return kvm_mmu_max_gfn() + 1;
e2b5b21d
SC
827}
828
1b6043e8
SC
829static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
830 bool shared, int zap_level)
e2b5b21d 831{
e2b5b21d
SC
832 struct tdp_iter iter;
833
86931ff7 834 gfn_t end = tdp_mmu_max_gfn_exclusive();
e2b5b21d
SC
835 gfn_t start = 0;
836
1b6043e8
SC
837 for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
838retry:
839 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
840 continue;
841
842 if (!is_shadow_present_pte(iter.old_spte))
843 continue;
844
845 if (iter.level > zap_level)
846 continue;
847
848 if (!shared)
849 tdp_mmu_set_spte(kvm, &iter, 0);
850 else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
851 goto retry;
852 }
853}
854
855static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
856 bool shared)
857{
858
8351779c
PB
859 /*
860 * The root must have an elevated refcount so that it's reachable via
861 * mmu_notifier callbacks, which allows this path to yield and drop
862 * mmu_lock. When handling an unmap/release mmu_notifier command, KVM
863 * must drop all references to relevant pages prior to completing the
864 * callback. Dropping mmu_lock with an unreachable root would result
865 * in zapping SPTEs after a relevant mmu_notifier callback completes
866 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
867 * dirty accessed bits to the SPTE's associated struct page.
868 */
869 WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
870
e2b5b21d
SC
871 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
872
873 rcu_read_lock();
874
875 /*
1b6043e8
SC
876 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
877 * split the zap into two passes. On the first pass, zap at the 1gb
878 * level, and then zap top-level SPs on the second pass. "1gb" is not
879 * arbitrary, as KVM must be able to zap a 1gb shadow page without
880 * inducing a stall to allow in-place replacement with a 1gb hugepage.
881 *
882 * Because zapping a SP recurses on its children, stepping down to
883 * PG_LEVEL_4K in the iterator itself is unnecessary.
e2b5b21d 884 */
1b6043e8
SC
885 __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
886 __tdp_mmu_zap_root(kvm, root, shared, root->role.level);
e2b5b21d
SC
887
888 rcu_read_unlock();
889}
890
c10743a1
SC
891bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
892{
893 u64 old_spte;
894
895 /*
896 * This helper intentionally doesn't allow zapping a root shadow page,
897 * which doesn't have a parent page table and thus no associated entry.
898 */
899 if (WARN_ON_ONCE(!sp->ptep))
900 return false;
901
c10743a1 902 old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
bb95dfb9 903 if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
c10743a1 904 return false;
c10743a1
SC
905
906 __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
907 sp->gfn, sp->role.level + 1, true, true);
908
c10743a1
SC
909 return true;
910}
911
faaf05b0 912/*
f47e5bbb
SC
913 * Zap leaf SPTEs for the range of gfns, [start, end). Returns true if SPTEs
914 * have been cleared and a TLB flush is needed before releasing the MMU lock.
6103bc07 915 *
063afacd
BG
916 * If can_yield is true, will release the MMU lock and reschedule if the
917 * scheduler needs the CPU or there is contention on the MMU lock. If this
918 * function cannot yield, it will not release the MMU lock or reschedule and
919 * the caller must ensure it does not supply too large a GFN range, or the
6103bc07 920 * operation can cause a soft lockup.
faaf05b0 921 */
f47e5bbb
SC
922static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
923 gfn_t start, gfn_t end, bool can_yield, bool flush)
faaf05b0
BG
924{
925 struct tdp_iter iter;
faaf05b0 926
86931ff7 927 end = min(end, tdp_mmu_max_gfn_exclusive());
524a1e4e 928
acbda82a 929 lockdep_assert_held_write(&kvm->mmu_lock);
6103bc07 930
7cca2d0b
BG
931 rcu_read_lock();
932
f47e5bbb 933 for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
1af4a960 934 if (can_yield &&
acbda82a 935 tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
a835429c 936 flush = false;
1af4a960
BG
937 continue;
938 }
939
f47e5bbb 940 if (!is_shadow_present_pte(iter.old_spte) ||
faaf05b0
BG
941 !is_last_spte(iter.old_spte, iter.level))
942 continue;
943
acbda82a
SC
944 tdp_mmu_set_spte(kvm, &iter, 0);
945 flush = true;
faaf05b0 946 }
7cca2d0b 947
fcb93eb6
PB
948 rcu_read_unlock();
949
f47e5bbb
SC
950 /*
951 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
952 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
953 */
954 return flush;
faaf05b0
BG
955}
956
957/*
958 * Tears down the mappings for the range of gfns, [start, end), and frees the
959 * non-root pages mapping GFNs strictly within that range. Returns true if
960 * SPTEs have been cleared and a TLB flush is needed before releasing the
961 * MMU lock.
962 */
f47e5bbb
SC
963bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
964 bool can_yield, bool flush)
faaf05b0
BG
965{
966 struct kvm_mmu_page *root;
faaf05b0 967
614f6970 968 for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
f47e5bbb 969 flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
faaf05b0 970
faaf05b0
BG
971 return flush;
972}
973
974void kvm_tdp_mmu_zap_all(struct kvm *kvm)
975{
e2b5b21d 976 struct kvm_mmu_page *root;
2b9663d8
SC
977 int i;
978
77c8cd6b 979 /*
22b94c4b
PB
980 * Zap all roots, including invalid roots, as all SPTEs must be dropped
981 * before returning to the caller. Zap directly even if the root is
982 * also being zapped by a worker. Walking zapped top-level SPTEs isn't
983 * all that expensive and mmu_lock is already held, which means the
984 * worker has yielded, i.e. flushing the work instead of zapping here
985 * isn't guaranteed to be any faster.
986 *
77c8cd6b
SC
987 * A TLB flush is unnecessary, KVM zaps everything if and only if the VM
988 * is being destroyed or the userspace VMM has exited. In both cases,
989 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
990 */
e2b5b21d
SC
991 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
992 for_each_tdp_mmu_root_yield_safe(kvm, root, i)
993 tdp_mmu_zap_root(kvm, root, false);
994 }
faaf05b0 995}
bb18842e 996
4c6654bd 997/*
f28e9c7f 998 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
22b94c4b 999 * zap" completes.
4c6654bd
BG
1000 */
1001void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
1002{
22b94c4b 1003 flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
faaf05b0 1004}
bb18842e 1005
b7cccd39 1006/*
f28e9c7f 1007 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
22b94c4b
PB
1008 * is about to be zapped, e.g. in response to a memslots update. The actual
1009 * zapping is performed asynchronously, so a reference is taken on all roots.
1010 * Using a separate workqueue makes it easy to ensure that the destruction is
1011 * performed before the "fast zap" completes, without keeping a separate list
1012 * of invalidated roots; the list is effectively the list of work items in
1013 * the workqueue.
b7cccd39 1014 *
22b94c4b
PB
1015 * Take a reference on each valid root and gift it to the asynchronous worker;
1016 * already-invalid roots have been queued previously and are skipped. Because mmu_lock
1017 * is held for write, it should be impossible to observe a root with zero refcount,
1018 * i.e. the list of roots cannot be stale.
4c6654bd 1019 *
b7cccd39
BG
1020 * This has essentially the same effect for the TDP MMU
1021 * as updating mmu_valid_gen does for the shadow MMU.
1022 */
1023void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
1024{
1025 struct kvm_mmu_page *root;
1026
1027 lockdep_assert_held_write(&kvm->mmu_lock);
f28e9c7f 1028 list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
efd995da
PB
1029 if (!root->role.invalid &&
1030 !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
4c6654bd 1031 root->role.invalid = true;
22b94c4b
PB
1032 tdp_mmu_schedule_zap_root(kvm, root);
1033 }
f28e9c7f 1034 }
b7cccd39
BG
1035}
1036
bb18842e
BG
1037/*
1038 * Installs a last-level SPTE to handle a TDP page fault.
1039 * (NPT/EPT violation/misconfiguration)
1040 */
cdc47767
PB
1041static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
1042 struct kvm_page_fault *fault,
1043 struct tdp_iter *iter)
bb18842e 1044{
c435d4b7 1045 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
bb18842e 1046 u64 new_spte;
57a3e96d 1047 int ret = RET_PF_FIXED;
ad67e480 1048 bool wrprot = false;
bb18842e 1049
7158bee4 1050 WARN_ON(sp->role.level != fault->goal_level);
e710c5f6 1051 if (unlikely(!fault->slot))
bb18842e 1052 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
9a77daac 1053 else
53597858 1054 wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
2839180c 1055 fault->pfn, iter->old_spte, fault->prefetch, true,
7158bee4 1056 fault->map_writable, &new_spte);
bb18842e
BG
1057
1058 if (new_spte == iter->old_spte)
1059 ret = RET_PF_SPURIOUS;
3e72c791 1060 else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
9a77daac 1061 return RET_PF_RETRY;
bb95dfb9
SC
1062 else if (is_shadow_present_pte(iter->old_spte) &&
1063 !is_last_spte(iter->old_spte, iter->level))
1064 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1065 KVM_PAGES_PER_HPAGE(iter->level + 1));
bb18842e
BG
1066
1067 /*
1068 * If the page fault was caused by a write but the page is write
1069 * protected, emulation is needed. If the emulation was skipped,
1070 * the vCPU would have the same fault again.
1071 */
ad67e480 1072 if (wrprot) {
cdc47767 1073 if (fault->write)
bb18842e 1074 ret = RET_PF_EMULATE;
bb18842e
BG
1075 }
1076
1077 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
9a77daac
BG
1078 if (unlikely(is_mmio_spte(new_spte))) {
1079 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1080 new_spte);
bb18842e 1081 ret = RET_PF_EMULATE;
3849e092 1082 } else {
9a77daac
BG
1083 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1084 rcu_dereference(iter->sptep));
3849e092 1085 }
bb18842e 1086
857f8474
KH
1087 /*
1088 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
1089 * consistent with legacy MMU behavior.
1090 */
1091 if (ret != RET_PF_SPURIOUS)
bb18842e
BG
1092 vcpu->stat.pf_fixed++;
1093
1094 return ret;
1095}
1096
7b7e1ab6 1097/*
cb00a70b
DM
1098 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1099 * provided page table.
7b7e1ab6
DM
1100 *
1101 * @kvm: kvm instance
1102 * @iter: a tdp_iter instance currently on the SPTE that should be set
1103 * @sp: The new TDP page table to install.
1104 * @account_nx: True if this page table is being installed to split a
1105 * non-executable huge page.
cb00a70b 1106 * @shared: This operation is running under the MMU lock in read mode.
7b7e1ab6
DM
1107 *
1108 * Returns: 0 if the new page table was installed. Non-0 if the page table
1109 * could not be installed (e.g. the atomic compare-exchange failed).
1110 */
cb00a70b
DM
1111static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1112 struct kvm_mmu_page *sp, bool account_nx,
1113 bool shared)
7b7e1ab6
DM
1114{
1115 u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
cb00a70b 1116 int ret = 0;
7b7e1ab6 1117
cb00a70b
DM
1118 if (shared) {
1119 ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1120 if (ret)
1121 return ret;
1122 } else {
1123 tdp_mmu_set_spte(kvm, iter, spte);
1124 }
7b7e1ab6
DM
1125
1126 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1127 list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
1128 if (account_nx)
1129 account_huge_nx_page(kvm, sp);
1130 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1131
1132 return 0;
1133}
1134
bb18842e
BG
1135/*
1136 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1137 * page tables and SPTEs to translate the faulting guest physical address.
1138 */
2f6305dd 1139int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
bb18842e 1140{
bb18842e
BG
1141 struct kvm_mmu *mmu = vcpu->arch.mmu;
1142 struct tdp_iter iter;
89c0fd49 1143 struct kvm_mmu_page *sp;
bb18842e 1144 int ret;
bb18842e 1145
73a3c659 1146 kvm_mmu_hugepage_adjust(vcpu, fault);
bb18842e 1147
f0066d94 1148 trace_kvm_mmu_spte_requested(fault);
7cca2d0b
BG
1149
1150 rcu_read_lock();
1151
2f6305dd 1152 tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
73a3c659 1153 if (fault->nx_huge_page_workaround_enabled)
536f0e6a 1154 disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
bb18842e 1155
73a3c659 1156 if (iter.level == fault->goal_level)
bb18842e
BG
1157 break;
1158
1159 /*
1160 * If there is an SPTE mapping a large page at a higher level
1161 * than the target, that SPTE must be cleared and replaced
1162 * with a non-leaf SPTE.
1163 */
1164 if (is_shadow_present_pte(iter.old_spte) &&
1165 is_large_pte(iter.old_spte)) {
3e72c791 1166 if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
9a77daac 1167 break;
bb18842e 1168
bb18842e
BG
1169 /*
1170 * The iter must explicitly re-read the spte here
1171 * because the new value informs the !present
1172 * path below.
1173 */
0e587aa7 1174 iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
bb18842e
BG
1175 }
1176
1177 if (!is_shadow_present_pte(iter.old_spte)) {
7b7e1ab6
DM
1178 bool account_nx = fault->huge_page_disallowed &&
1179 fault->req_level >= iter.level;
1180
ff76d506 1181 /*
c4342633 1182 * If SPTE has been frozen by another thread, just
ff76d506
KH
1183 * give up and retry, avoiding unnecessary page table
1184 * allocation and free.
1185 */
1186 if (is_removed_spte(iter.old_spte))
1187 break;
1188
a82070b6
DM
1189 sp = tdp_mmu_alloc_sp(vcpu);
1190 tdp_mmu_init_child_sp(sp, &iter);
1191
cb00a70b 1192 if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
9a77daac
BG
1193 tdp_mmu_free_sp(sp);
1194 break;
1195 }
bb18842e
BG
1196 }
1197 }
1198
58298b06
SC
1199 /*
1200 * Force the guest to retry the access if the upper level SPTEs aren't
1201 * in place, or if the target leaf SPTE is frozen by another CPU.
1202 */
1203 if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) {
7cca2d0b 1204 rcu_read_unlock();
bb18842e 1205 return RET_PF_RETRY;
7cca2d0b 1206 }
bb18842e 1207
cdc47767 1208 ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
7cca2d0b 1209 rcu_read_unlock();
bb18842e
BG
1210
1211 return ret;
1212}
063afacd 1213
3039bcc7
SC
1214bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1215 bool flush)
063afacd 1216{
f47e5bbb
SC
1217 return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
1218 range->end, range->may_block, flush);
063afacd
BG
1219}
1220
3039bcc7
SC
1221typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1222 struct kvm_gfn_range *range);
063afacd 1223
3039bcc7
SC
1224static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1225 struct kvm_gfn_range *range,
1226 tdp_handler_t handler)
063afacd 1227{
3039bcc7
SC
1228 struct kvm_mmu_page *root;
1229 struct tdp_iter iter;
1230 bool ret = false;
1231
e1eed584
SC
1232 /*
1233 * Don't support rescheduling, none of the MMU notifiers that funnel
1234 * into this helper allow blocking; it'd be dead, wasteful code.
1235 */
3039bcc7 1236 for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
a151acec
SC
1237 rcu_read_lock();
1238
3039bcc7
SC
1239 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1240 ret |= handler(kvm, &iter, range);
3039bcc7 1241
a151acec
SC
1242 rcu_read_unlock();
1243 }
3039bcc7
SC
1244
1245 return ret;
063afacd 1246}
f8e14497
BG
1247
1248/*
1249 * Mark the SPTEs mapping GFNs in the range [start, end) unaccessed and return
1250 * true if any of the GFNs in the range have been accessed.
1251 */
3039bcc7
SC
1252static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1253 struct kvm_gfn_range *range)
f8e14497 1254{
f8e14497
BG
1255 u64 new_spte = 0;
1256
3039bcc7
SC
1257 /* If we have a non-accessed entry we don't need to change the pte. */
1258 if (!is_accessed_spte(iter->old_spte))
1259 return false;
7cca2d0b 1260
3039bcc7
SC
1261 new_spte = iter->old_spte;
1262
1263 if (spte_ad_enabled(new_spte)) {
1264 new_spte &= ~shadow_accessed_mask;
1265 } else {
f8e14497 1266 /*
3039bcc7
SC
1267 * Capture the dirty status of the page, so that it doesn't get
1268 * lost when the SPTE is marked for access tracking.
f8e14497 1269 */
3039bcc7
SC
1270 if (is_writable_pte(new_spte))
1271 kvm_set_pfn_dirty(spte_to_pfn(new_spte));
f8e14497 1272
3039bcc7 1273 new_spte = mark_spte_for_access_track(new_spte);
f8e14497
BG
1274 }
1275
3039bcc7 1276 tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
7cca2d0b 1277
3039bcc7 1278 return true;
f8e14497
BG
1279}
1280
3039bcc7 1281bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
f8e14497 1282{
3039bcc7 1283 return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
f8e14497
BG
1284}
1285
3039bcc7
SC
1286static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1287 struct kvm_gfn_range *range)
f8e14497 1288{
3039bcc7 1289 return is_accessed_spte(iter->old_spte);
f8e14497
BG
1290}
1291
3039bcc7 1292bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
f8e14497 1293{
3039bcc7 1294 return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
f8e14497 1295}
1d8dd6b3 1296
3039bcc7
SC
1297static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1298 struct kvm_gfn_range *range)
1d8dd6b3 1299{
1d8dd6b3 1300 u64 new_spte;
7cca2d0b 1301
3039bcc7
SC
1302 /* Huge pages aren't expected to be modified without first being zapped. */
1303 WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1d8dd6b3 1304
3039bcc7
SC
1305 if (iter->level != PG_LEVEL_4K ||
1306 !is_shadow_present_pte(iter->old_spte))
1307 return false;
1d8dd6b3 1308
3039bcc7
SC
1309 /*
1310 * Note, when changing a read-only SPTE, it's not strictly necessary to
1311 * zero the SPTE before setting the new PFN, but doing so preserves the
1312 * invariant that the PFN of a present leaf SPTE can never change.
1313 * See __handle_changed_spte().
1314 */
1315 tdp_mmu_set_spte(kvm, iter, 0);
1d8dd6b3 1316
3039bcc7
SC
1317 if (!pte_write(range->pte)) {
1318 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1319 pte_pfn(range->pte));
1d8dd6b3 1320
3039bcc7 1321 tdp_mmu_set_spte(kvm, iter, new_spte);
1d8dd6b3
BG
1322 }
1323
3039bcc7 1324 return true;
1d8dd6b3
BG
1325}
1326
3039bcc7
SC
1327/*
1328 * Handle the changed_pte MMU notifier for the TDP MMU.
1329 * range->pte contains the new pte_t mapping the HVA specified by the MMU
1330 * notifier.
1331 * Returns true if a flush is needed before releasing the MMU lock.
1332 */
1333bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1d8dd6b3 1334{
93fa50f6
SC
1335 /*
1336 * No need to handle the remote TLB flush under RCU protection, the
1337 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
1338 * shadow page. See the WARN on pfn_changed in __handle_changed_spte().
1339 */
1340 return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1d8dd6b3
BG
1341}
1342
a6a0b05d 1343/*
bedd9195
DM
1344 * Remove write access from all SPTEs at or above min_level that map GFNs
1345 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1346 * be flushed.
a6a0b05d
BG
1347 */
1348static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1349 gfn_t start, gfn_t end, int min_level)
1350{
1351 struct tdp_iter iter;
1352 u64 new_spte;
1353 bool spte_set = false;
1354
7cca2d0b
BG
1355 rcu_read_lock();
1356
a6a0b05d
BG
1357 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1358
77aa6075 1359 for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
24ae4cfa
BG
1360retry:
1361 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1af4a960
BG
1362 continue;
1363
a6a0b05d 1364 if (!is_shadow_present_pte(iter.old_spte) ||
0f99ee2c
BG
1365 !is_last_spte(iter.old_spte, iter.level) ||
1366 !(iter.old_spte & PT_WRITABLE_MASK))
a6a0b05d
BG
1367 continue;
1368
1369 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1370
3e72c791 1371 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
24ae4cfa 1372 goto retry;
3255530a 1373
a6a0b05d 1374 spte_set = true;
a6a0b05d 1375 }
7cca2d0b
BG
1376
1377 rcu_read_unlock();
a6a0b05d
BG
1378 return spte_set;
1379}
1380
1381/*
1382 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1383 * only affect leaf SPTEs down to min_level.
1384 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1385 */
269e9552
HM
1386bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1387 const struct kvm_memory_slot *slot, int min_level)
a6a0b05d
BG
1388{
1389 struct kvm_mmu_page *root;
a6a0b05d
BG
1390 bool spte_set = false;
1391
24ae4cfa 1392 lockdep_assert_held_read(&kvm->mmu_lock);
a6a0b05d 1393
d62007ed 1394 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
a6a0b05d
BG
1395 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1396 slot->base_gfn + slot->npages, min_level);
a6a0b05d
BG
1397
1398 return spte_set;
1399}
1400
a3fe5dbd
DM
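/*
 * Allocate a shadow page for eager huge page splitting. Unlike the page fault
 * path, this allocates directly rather than from the vCPU memory caches, as
 * splitting does not run in a vCPU context.
 */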
1401static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1402{
1403 struct kvm_mmu_page *sp;
1404
1405 gfp |= __GFP_ZERO;
1406
1407 sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1408 if (!sp)
1409 return NULL;
1410
1411 sp->spt = (void *)__get_free_page(gfp);
1412 if (!sp->spt) {
1413 kmem_cache_free(mmu_page_header_cache, sp);
1414 return NULL;
1415 }
1416
1417 return sp;
1418}
1419
1420static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
cb00a70b
DM
1421 struct tdp_iter *iter,
1422 bool shared)
a3fe5dbd
DM
1423{
1424 struct kvm_mmu_page *sp;
1425
a3fe5dbd
DM
1426 /*
1427 * Since we are allocating while under the MMU lock we have to be
1428 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1429 * reclaim and to avoid making any filesystem callbacks (which can end
1430 * up invoking KVM MMU notifiers, resulting in a deadlock).
1431 *
1432 * If this allocation fails we drop the lock and retry with reclaim
1433 * allowed.
1434 */
1435 sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1436 if (sp)
1437 return sp;
1438
1439 rcu_read_unlock();
cb00a70b
DM
1440
1441 if (shared)
1442 read_unlock(&kvm->mmu_lock);
1443 else
1444 write_unlock(&kvm->mmu_lock);
a3fe5dbd
DM
1445
1446 iter->yielded = true;
1447 sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1448
cb00a70b
DM
1449 if (shared)
1450 read_lock(&kvm->mmu_lock);
1451 else
1452 write_lock(&kvm->mmu_lock);
1453
a3fe5dbd
DM
1454 rcu_read_lock();
1455
1456 return sp;
1457}
1458
cb00a70b
DM
1459static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1460 struct kvm_mmu_page *sp, bool shared)
a3fe5dbd
DM
1461{
1462 const u64 huge_spte = iter->old_spte;
1463 const int level = iter->level;
1464 int ret, i;
1465
1466 tdp_mmu_init_child_sp(sp, iter);
1467
1468 /*
1469 * No need for atomics when writing to sp->spt since the page table has
1470 * not been linked in yet and thus is not reachable from any other CPU.
1471 */
1472 for (i = 0; i < PT64_ENT_PER_PAGE; i++)
1473 sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);
1474
1475 /*
1476 * Replace the huge spte with a pointer to the populated lower level
1477 * page table. Since we are making this change without a TLB flush vCPUs
1478 * will see a mix of the split mappings and the original huge mapping,
1479 * depending on what's currently in their TLB. This is fine from a
1480 * correctness standpoint since the translation will be the same either
1481 * way.
1482 */
cb00a70b 1483 ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
a3fe5dbd 1484 if (ret)
e0b728b1 1485 goto out;
a3fe5dbd
DM
1486
1487 /*
1488 * tdp_mmu_link_sp() will handle subtracting the huge page we
1489 * are overwriting from the page stats. But we have to manually update
1490 * the page stats with the new present child pages.
1491 */
1492 kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);
1493
e0b728b1
DM
1494out:
1495 trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1496 return ret;
a3fe5dbd
DM
1497}
1498
1499static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1500 struct kvm_mmu_page *root,
1501 gfn_t start, gfn_t end,
cb00a70b 1502 int target_level, bool shared)
a3fe5dbd
DM
1503{
1504 struct kvm_mmu_page *sp = NULL;
1505 struct tdp_iter iter;
1506 int ret = 0;
1507
1508 rcu_read_lock();
1509
1510 /*
1511 * Traverse the page table splitting all huge pages above the target
1512 * level into one lower level. For example, if we encounter a 1GB page
1513 * we split it into 512 2MB pages.
1514 *
1515 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1516 * to visit an SPTE before ever visiting its children, which means we
1517 * will correctly recursively split huge pages that are more than one
1518 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1519 * and then splitting each of those to 512 4KB pages).
1520 */
1521 for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1522retry:
cb00a70b 1523 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
a3fe5dbd
DM
1524 continue;
1525
1526 if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1527 continue;
1528
1529 if (!sp) {
cb00a70b 1530 sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
a3fe5dbd
DM
1531 if (!sp) {
1532 ret = -ENOMEM;
e0b728b1
DM
1533 trace_kvm_mmu_split_huge_page(iter.gfn,
1534 iter.old_spte,
1535 iter.level, ret);
a3fe5dbd
DM
1536 break;
1537 }
1538
1539 if (iter.yielded)
1540 continue;
1541 }
1542
cb00a70b 1543 if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
a3fe5dbd
DM
1544 goto retry;
1545
1546 sp = NULL;
1547 }
1548
1549 rcu_read_unlock();
1550
1551 /*
1552 * It's possible to exit the loop having never used the last sp if, for
1553 * example, a vCPU doing HugePage NX splitting wins the race and
1554 * installs its own sp in place of the last sp we tried to split.
1555 */
1556 if (sp)
1557 tdp_mmu_free_sp(sp);
1558
a3fe5dbd
DM
1559 return ret;
1560}
1561
cb00a70b 1562
a3fe5dbd
DM
1563/*
1564 * Try to split all huge pages mapped by the TDP MMU down to the target level.
1565 */
1566void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1567 const struct kvm_memory_slot *slot,
1568 gfn_t start, gfn_t end,
cb00a70b 1569 int target_level, bool shared)
a3fe5dbd
DM
1570{
1571 struct kvm_mmu_page *root;
1572 int r = 0;
1573
cb00a70b 1574 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
a3fe5dbd 1575
7c554d8e 1576 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
cb00a70b 1577 r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
a3fe5dbd 1578 if (r) {
cb00a70b 1579 kvm_tdp_mmu_put_root(kvm, root, shared);
a3fe5dbd
DM
1580 break;
1581 }
1582 }
1583}
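
A hedged usage sketch of the exported helper above; the caller shown is illustrative rather than the actual mmu.c call site, but it follows the documented contract (mmu_lock held in the mode matching @shared, and a [start, end) GFN range within the slot):

static void example_eager_split(struct kvm *kvm,
                                const struct kvm_memory_slot *slot)
{
        read_lock(&kvm->mmu_lock);
        kvm_tdp_mmu_try_split_huge_pages(kvm, slot, slot->base_gfn,
                                         slot->base_gfn + slot->npages,
                                         PG_LEVEL_4K, true);
        read_unlock(&kvm->mmu_lock);
}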
1584
a6a0b05d
BG
1585/*
1586 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1587 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1588 * If AD bits are not enabled, this will require clearing the writable bit on
1589 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1590 * be flushed.
1591 */
1592static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1593 gfn_t start, gfn_t end)
1594{
1595 struct tdp_iter iter;
1596 u64 new_spte;
1597 bool spte_set = false;
1598
7cca2d0b
BG
1599 rcu_read_lock();
1600
a6a0b05d 1601 tdp_root_for_each_leaf_pte(iter, root, start, end) {
24ae4cfa
BG
1602retry:
1603 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1af4a960
BG
1604 continue;
1605
3354ef5a
SC
1606 if (!is_shadow_present_pte(iter.old_spte))
1607 continue;
1608
a6a0b05d
BG
1609 if (spte_ad_need_write_protect(iter.old_spte)) {
1610 if (is_writable_pte(iter.old_spte))
1611 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1612 else
1613 continue;
1614 } else {
1615 if (iter.old_spte & shadow_dirty_mask)
1616 new_spte = iter.old_spte & ~shadow_dirty_mask;
1617 else
1618 continue;
1619 }
1620
3e72c791 1621 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
24ae4cfa 1622 goto retry;
3255530a 1623
a6a0b05d 1624 spte_set = true;
a6a0b05d 1625 }
7cca2d0b
BG
1626
1627 rcu_read_unlock();
a6a0b05d
BG
1628 return spte_set;
1629}
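
The per-SPTE decision above can be restated as a tiny pure function. The helper below is hypothetical and shown only to make the AD-bits-vs-write-protect choice explicit; the real code additionally retries when the atomic update races with a vCPU:

static u64 example_clean_spte(u64 spte)
{
        /* Without usable AD bits, dirty tracking falls back to write protection. */
        if (spte_ad_need_write_protect(spte))
                return is_writable_pte(spte) ? spte & ~PT_WRITABLE_MASK : spte;

        /* Otherwise just clear the hardware dirty bit. */
        return (spte & shadow_dirty_mask) ? spte & ~shadow_dirty_mask : spte;
}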
1630
1631/*
1632 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1633 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1634 * If AD bits are not enabled, this will require clearing the writable bit on
1635 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1636 * be flushed.
1637 */
269e9552
HM
1638bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1639 const struct kvm_memory_slot *slot)
a6a0b05d
BG
1640{
1641 struct kvm_mmu_page *root;
a6a0b05d
BG
1642 bool spte_set = false;
1643
24ae4cfa 1644 lockdep_assert_held_read(&kvm->mmu_lock);
a6a0b05d 1645
d62007ed 1646 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
a6a0b05d
BG
1647 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1648 slot->base_gfn + slot->npages);
a6a0b05d
BG
1649
1650 return spte_set;
1651}
1652
1653/*
1654 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1655 * set in mask, starting at gfn. The given memslot is expected to contain all
1656 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1657 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1658 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1659 */
1660static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1661 gfn_t gfn, unsigned long mask, bool wrprot)
1662{
1663 struct tdp_iter iter;
1664 u64 new_spte;
1665
7cca2d0b
BG
1666 rcu_read_lock();
1667
a6a0b05d
BG
1668 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1669 gfn + BITS_PER_LONG) {
1670 if (!mask)
1671 break;
1672
1673 if (iter.level > PG_LEVEL_4K ||
1674 !(mask & (1UL << (iter.gfn - gfn))))
1675 continue;
1676
f1b3b06a
BG
1677 mask &= ~(1UL << (iter.gfn - gfn));
1678
a6a0b05d
BG
1679 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1680 if (is_writable_pte(iter.old_spte))
1681 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1682 else
1683 continue;
1684 } else {
1685 if (iter.old_spte & shadow_dirty_mask)
1686 new_spte = iter.old_spte & ~shadow_dirty_mask;
1687 else
1688 continue;
1689 }
1690
1691 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
a6a0b05d 1692 }
7cca2d0b
BG
1693
1694 rcu_read_unlock();
a6a0b05d
BG
1695}
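
A small worked example of the mask arithmetic, with made-up values: for gfn = 0x1000 and mask = 0xa, only GFNs 0x1001 and 0x1003 are dirty; the walk starts at gfn + __ffs(mask), and each visited 4K SPTE clears bit (iter.gfn - gfn) until the mask empties. The loop below is a stripped-down sketch, not the real iterator:

static void example_mask_walk(void)
{
        gfn_t gfn = 0x1000;
        unsigned long mask = 0xa;       /* bits 1 and 3: GFNs 0x1001 and 0x1003 */
        gfn_t cur;

        for (cur = gfn + __ffs(mask); mask; cur++) {
                if (!(mask & (1UL << (cur - gfn))))
                        continue;

                mask &= ~(1UL << (cur - gfn));
                /* ...the real code clears the dirty state of 'cur' here... */
        }
}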
1696
1697/*
1698 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1699 * set in mask, starting at gfn. The given memslot is expected to contain all
1700 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1701 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1702 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1703 */
1704void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1705 struct kvm_memory_slot *slot,
1706 gfn_t gfn, unsigned long mask,
1707 bool wrprot)
1708{
1709 struct kvm_mmu_page *root;
a6a0b05d 1710
531810ca 1711 lockdep_assert_held_write(&kvm->mmu_lock);
a3f15bda 1712 for_each_tdp_mmu_root(kvm, root, slot->as_id)
a6a0b05d 1713 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
a6a0b05d
BG
1714}
1715
14881998 1716/*
87aa9ec9
BG
1717 * Clear leaf entries which could be replaced by large mappings, for
1718 * GFNs within the slot.
14881998 1719 */
4b85c921 1720static void zap_collapsible_spte_range(struct kvm *kvm,
14881998 1721 struct kvm_mmu_page *root,
4b85c921 1722 const struct kvm_memory_slot *slot)
14881998 1723{
9eba50f8
SC
1724 gfn_t start = slot->base_gfn;
1725 gfn_t end = start + slot->npages;
14881998
BG
1726 struct tdp_iter iter;
1727 kvm_pfn_t pfn;
14881998 1728
7cca2d0b
BG
1729 rcu_read_lock();
1730
14881998 1731 tdp_root_for_each_pte(iter, root, start, end) {
2db6f772 1732retry:
4b85c921 1733 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1af4a960 1734 continue;
1af4a960 1735
14881998 1736 if (!is_shadow_present_pte(iter.old_spte) ||
87aa9ec9 1737 !is_last_spte(iter.old_spte, iter.level))
14881998
BG
1738 continue;
1739
1740 pfn = spte_to_pfn(iter.old_spte);
1741 if (kvm_is_reserved_pfn(pfn) ||
9eba50f8
SC
1742 iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1743 pfn, PG_LEVEL_NUM))
14881998
BG
1744 continue;
1745
4b85c921 1746 /* Note, a successful atomic zap also does a remote TLB flush. */
3e72c791 1747 if (tdp_mmu_zap_spte_atomic(kvm, &iter))
2db6f772 1748 goto retry;
14881998
BG
1749 }
1750
7cca2d0b 1751 rcu_read_unlock();
14881998
BG
1752}
1753
1754/*
1755 * Clear non-leaf entries (and free associated page tables) which could
1756 * be replaced by large mappings, for GFNs within the slot.
1757 */
4b85c921
SC
1758void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1759 const struct kvm_memory_slot *slot)
14881998
BG
1760{
1761 struct kvm_mmu_page *root;
14881998 1762
2db6f772 1763 lockdep_assert_held_read(&kvm->mmu_lock);
14881998 1764
d62007ed 1765 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
4b85c921 1766 zap_collapsible_spte_range(kvm, root, slot);
14881998 1767}
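
A hedged usage sketch (the caller shown is illustrative, not the actual mmu.c call site): collapsible SPTEs are zapped with mmu_lock held for read, typically after dirty logging is disabled so that huge mappings can be re-created on subsequent faults:

static void example_recover_huge_pages(struct kvm *kvm,
                                       const struct kvm_memory_slot *slot)
{
        read_lock(&kvm->mmu_lock);
        kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
        read_unlock(&kvm->mmu_lock);
}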
46044f72
BG
1768
1769/*
1770 * Removes write access on the last level SPTE mapping this GFN and unsets the
5fc3424f 1771 * MMU-writable bit to ensure future writes continue to be intercepted.
46044f72
BG
1772 * Returns true if an SPTE was set and a TLB flush is needed.
1773 */
1774static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
3ad93562 1775 gfn_t gfn, int min_level)
46044f72
BG
1776{
1777 struct tdp_iter iter;
1778 u64 new_spte;
1779 bool spte_set = false;
1780
3ad93562
KZ
1781 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1782
7cca2d0b
BG
1783 rcu_read_lock();
1784
77aa6075 1785 for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
3ad93562
KZ
1786 if (!is_shadow_present_pte(iter.old_spte) ||
1787 !is_last_spte(iter.old_spte, iter.level))
1788 continue;
1789
46044f72 1790 new_spte = iter.old_spte &
5fc3424f 1791 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
46044f72 1792
7c8a4742
DM
1793 if (new_spte == iter.old_spte)
1794 break;
1795
46044f72
BG
1796 tdp_mmu_set_spte(kvm, &iter, new_spte);
1797 spte_set = true;
1798 }
1799
7cca2d0b
BG
1800 rcu_read_unlock();
1801
46044f72
BG
1802 return spte_set;
1803}
1804
1805/*
1806 * Removes write access on the last level SPTE mapping this GFN and unsets the
5fc3424f 1807 * MMU-writable bit to ensure future writes continue to be intercepted.
46044f72
BG
1808 * Returns true if an SPTE was set and a TLB flush is needed.
1809 */
1810bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
3ad93562
KZ
1811 struct kvm_memory_slot *slot, gfn_t gfn,
1812 int min_level)
46044f72
BG
1813{
1814 struct kvm_mmu_page *root;
46044f72
BG
1815 bool spte_set = false;
1816
531810ca 1817 lockdep_assert_held_write(&kvm->mmu_lock);
a3f15bda 1818 for_each_tdp_mmu_root(kvm, root, slot->as_id)
3ad93562 1819 spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
a3f15bda 1820
46044f72
BG
1821 return spte_set;
1822}
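
A hedged usage sketch of the wrapper above (illustrative caller only): the GFN is write-protected under mmu_lock held for write, and the caller is responsible for flushing TLBs when an SPTE was changed:

static void example_wrprot_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
                               gfn_t gfn)
{
        bool flush;

        write_lock(&kvm->mmu_lock);
        flush = kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, PG_LEVEL_4K);
        write_unlock(&kvm->mmu_lock);

        if (flush)
                kvm_flush_remote_tlbs(kvm);
}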
1823
95fb5b02
BG
1824/*
1825 * Return the level of the lowest level SPTE added to sptes.
1826 * That SPTE may be non-present.
c5c8c7c5
DM
1827 *
1828 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
95fb5b02 1829 */
39b4d43e
SC
1830int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1831 int *root_level)
95fb5b02
BG
1832{
1833 struct tdp_iter iter;
1834 struct kvm_mmu *mmu = vcpu->arch.mmu;
95fb5b02 1835 gfn_t gfn = addr >> PAGE_SHIFT;
2aa07893 1836 int leaf = -1;
95fb5b02 1837
39b4d43e 1838 *root_level = vcpu->arch.mmu->shadow_root_level;
95fb5b02
BG
1839
1840 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1841 leaf = iter.level;
dde81f94 1842 sptes[leaf] = iter.old_spte;
95fb5b02
BG
1843 }
1844
1845 return leaf;
1846}
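
A hedged usage sketch: sptes[] is indexed by level, so a caller walks from the returned leaf level up to *root_level, and the whole walk must sit inside a lockless-walk section. The dumping helper below is illustrative only:

static void example_dump_walk(struct kvm_vcpu *vcpu, u64 addr)
{
        u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
        int root_level, level, leaf;

        kvm_tdp_mmu_walk_lockless_begin();
        leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
        kvm_tdp_mmu_walk_lockless_end();

        if (leaf < 0)
                return;         /* No walk was possible. */

        for (level = root_level; level >= leaf; level--)
                pr_info("level %d: spte = 0x%llx\n", level, sptes[level]);
}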
6e8eb206
DM
1847
1848/*
1849 * Returns the last level spte pointer of the shadow page walk for the given
1850 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1851 * walk could be performed, returns NULL and *spte does not contain valid data.
1852 *
1853 * Contract:
1854 * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1855 * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1856 *
1857 * WARNING: This function is only intended to be called during fast_page_fault.
1858 */
1859u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1860 u64 *spte)
1861{
1862 struct tdp_iter iter;
1863 struct kvm_mmu *mmu = vcpu->arch.mmu;
1864 gfn_t gfn = addr >> PAGE_SHIFT;
1865 tdp_ptep_t sptep = NULL;
1866
1867 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1868 *spte = iter.old_spte;
1869 sptep = iter.sptep;
1870 }
1871
1872 /*
1873 * Perform the rcu_dereference to get the raw spte pointer value since
1874 * we are passing it up to fast_page_fault, which is shared with the
1875 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1876 * annotation.
1877 *
1878 * This is safe since fast_page_fault obeys the contracts of this
1879 * function as well as all TDP MMU contracts around modifying SPTEs
1880 * outside of mmu_lock.
1881 */
1882 return rcu_dereference(sptep);
1883}