KVM: x86/MMU: Allow NX huge pages to be disabled on a per-vm basis
arch/x86/kvm/mmu/tdp_mmu.c
1// SPDX-License-Identifier: GPL-2.0
2
3#include "mmu.h"
4#include "mmu_internal.h"
bb18842e 5#include "mmutrace.h"
2f2fad08 6#include "tdp_iter.h"
fe5db27d 7#include "tdp_mmu.h"
02c00b3a 8#include "spte.h"
fe5db27d 9
9a77daac 10#include <asm/cmpxchg.h>
11#include <trace/events/kvm.h>
12
71ba3f31 13static bool __read_mostly tdp_mmu_enabled = true;
95fb5b02 14module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
15
16/* Initializes the TDP MMU for the VM, if enabled. */
a1a39128 17int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
fe5db27d 18{
19 struct workqueue_struct *wq;
20
897218ff 21 if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
22 return 0;
23
24 wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
25 if (!wq)
26 return -ENOMEM;
27
28 /* This should not be changed for the lifetime of the VM. */
29 kvm->arch.tdp_mmu_enabled = true;
02c00b3a 30 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
9a77daac 31 spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
89c0fd49 32 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
33 kvm->arch.tdp_mmu_zap_wq = wq;
34 return 1;
35}
36
37/* Arbitrarily returns true so that this may be used in if statements. */
38static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
39 bool shared)
40{
41 if (shared)
42 lockdep_assert_held_read(&kvm->mmu_lock);
43 else
44 lockdep_assert_held_write(&kvm->mmu_lock);
45
46 return true;
47}
48
49void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
50{
51 if (!kvm->arch.tdp_mmu_enabled)
52 return;
02c00b3a 53
3203a56a 54 /* Also waits for any queued work items. */
55 destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
56
524a1e4e 57 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
02c00b3a 58 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
59
60 /*
61 * Ensure that all the outstanding RCU callbacks to free shadow pages
62 * can run before the VM is torn down. Work items on tdp_mmu_zap_wq
63 * can call kvm_tdp_mmu_put_root and create new callbacks.
64 */
65 rcu_barrier();
66}
67
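/* Free a shadow page's page table page and its struct kvm_mmu_page. */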
2bdb3d84 68static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
a889ea54 69{
70 free_page((unsigned long)sp->spt);
71 kmem_cache_free(mmu_page_header_cache, sp);
72}
73
74/*
75 * This is called through call_rcu in order to free TDP page table memory
76 * safely with respect to other kernel threads that may be operating on
77 * the memory.
78 * By only accessing TDP MMU page table memory in an RCU read critical
79 * section, and freeing it after a grace period, lockless access to that
80 * memory won't use it after it is freed.
81 */
82static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
a889ea54 83{
84 struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
85 rcu_head);
a889ea54 86
87 tdp_mmu_free_sp(sp);
88}
a889ea54 89
90static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
91 bool shared);
92
93static void tdp_mmu_zap_root_work(struct work_struct *work)
94{
95 struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
96 tdp_mmu_async_work);
97 struct kvm *kvm = root->tdp_mmu_async_data;
98
99 read_lock(&kvm->mmu_lock);
100
101 /*
102 * A TLB flush is not necessary as KVM performs a local TLB flush when
103 * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
104 * to a different pCPU. Note, the local TLB flush on reuse also
105 * invalidates any paging-structure-cache entries, i.e. TLB entries for
106 * intermediate paging structures, that may be zapped, as such entries
107 * are associated with the ASID on both VMX and SVM.
108 */
109 tdp_mmu_zap_root(kvm, root, true);
110
111 /*
112 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
113 * avoiding an infinite loop. By design, the root is reachable while
114 * it's being asynchronously zapped, thus a different task can put its
115 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
116 * asynchronously zapped root is unavoidable.
117 */
118 kvm_tdp_mmu_put_root(kvm, root, true);
119
120 read_unlock(&kvm->mmu_lock);
121}
122
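/*
 * Queue the root on the VM-wide workqueue so that it is zapped
 * asynchronously by tdp_mmu_zap_root_work().
 */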
123static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
124{
125 root->tdp_mmu_async_data = kvm;
126 INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
127 queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
128}
129
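/*
 * Atomically mark the root invalid. Returns true if the root was
 * already invalid.
 */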
130static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
131{
132 union kvm_mmu_page_role role = page->role;
133 role.invalid = true;
134
135 /* No need to use cmpxchg, only the invalid bit can change. */
136 role.word = xchg(&page->role.word, role.word);
137 return role.invalid;
138}
139
140void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
141 bool shared)
2bdb3d84 142{
6103bc07 143 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
a889ea54 144
11cccf5c 145 if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
146 return;
147
148 WARN_ON(!root->tdp_mmu_page);
149
db01416b 150 /*
151 * The root now has refcount=0. It is valid, but readers already
152 * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
153 * rejects it. This remains true for the rest of the execution
154 * of this function, because readers visit valid roots only
155 * (except for tdp_mmu_zap_root_work(), which however
156 * does not acquire any reference itself).
157 *
158 * Even though there are flows that need to visit all roots for
159 * correctness, they all take mmu_lock for write, so they cannot yet
160 * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
161 * since the root still has refcount=0.
162 *
163 * However, tdp_mmu_zap_root can yield, and writers do not expect to
164 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
165 * So the root temporarily gets an extra reference, going to refcount=1
166 * while staying invalid. Readers still cannot acquire any reference;
167 * but writers are now allowed to run if tdp_mmu_zap_root yields and
168 * they might take an extra reference if they themselves yield.
169 * Therefore, when the reference is given back by the worker,
170 * there is no guarantee that the refcount is still 1. If not, whoever
171 * puts the last reference will free the page, but they will not have to
172 * zap the root because a root cannot go from invalid to valid.
db01416b 173 */
174 if (!kvm_tdp_root_mark_invalid(root)) {
175 refcount_set(&root->tdp_mmu_root_count, 1);
176
177 /*
178 * Zapping the root in a worker is not just "nice to have";
179 * it is required because kvm_tdp_mmu_invalidate_all_roots()
180 * skips already-invalid roots. If kvm_tdp_mmu_put_root() did
181 * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
182 * might return with some roots not zapped yet.
8351779c 183 */
184 tdp_mmu_schedule_zap_root(kvm, root);
185 return;
8351779c 186 }
2bdb3d84 187
188 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
189 list_del_rcu(&root->link);
190 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
c0e64238 191 call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
192}
193
cfc10997 194/*
195 * Returns the next root after @prev_root (or the first root if @prev_root is
196 * NULL). A reference to the returned root is acquired, and the reference to
197 * @prev_root is released (the caller obviously must hold a reference to
198 * @prev_root if it's non-NULL).
199 *
200 * If @only_valid is true, invalid roots are skipped.
201 *
202 * Returns NULL if the end of tdp_mmu_roots was reached.
203 */
204static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
6103bc07 205 struct kvm_mmu_page *prev_root,
d62007ed 206 bool shared, bool only_valid)
207{
208 struct kvm_mmu_page *next_root;
209
210 rcu_read_lock();
211
cfc10997 212 if (prev_root)
213 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
214 &prev_root->link,
215 typeof(*prev_root), link);
cfc10997 216 else
217 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
218 typeof(*next_root), link);
a889ea54 219
04dc4e6c 220 while (next_root) {
d62007ed 221 if ((!only_valid || !next_root->role.invalid) &&
ad6d6b94 222 kvm_tdp_mmu_get_root(next_root))
223 break;
224
225 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
226 &next_root->link, typeof(*next_root), link);
04dc4e6c 227 }
fb101293 228
c0e64238 229 rcu_read_unlock();
a889ea54 230
cfc10997 231 if (prev_root)
6103bc07 232 kvm_tdp_mmu_put_root(kvm, prev_root, shared);
a889ea54 233
234 return next_root;
235}
236
237/*
238 * Note: this iterator gets and puts references to the roots it iterates over.
239 * This makes it safe to release the MMU lock and yield within the loop, but
240 * if exiting the loop early, the caller must drop the reference to the most
241 * recent root. (Unless keeping a live reference is desirable.)
242 *
243 * If shared is set, this function is operating under the MMU lock in read
244 * mode. In the unlikely event that this thread must free a root, the lock
245 * will be temporarily dropped and reacquired in write mode.
a889ea54 246 */
247#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
248 for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \
249 _root; \
250 _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \
251 if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \
252 kvm_mmu_page_as_id(_root) != _as_id) { \
a3f15bda 253 } else
a889ea54 254
255#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
256 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
257
258#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
259 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
d62007ed 260
261/*
262 * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
263 * the implication being that any flow that holds mmu_lock for read is
264 * inherently yield-friendly and should use the yield-safe variant above.
265 * Holding mmu_lock for write obviates the need for RCU protection as the list
266 * is guaranteed to be stable.
267 */
268#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
269 list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
270 if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
271 kvm_mmu_page_as_id(_root) != _as_id) { \
a3f15bda 272 } else
02c00b3a 273
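/* Allocate a shadow page and its page table page from the vCPU caches. */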
a82070b6 274static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
275{
276 struct kvm_mmu_page *sp;
277
278 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
279 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
280
281 return sp;
282}
283
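/* Initialize a newly allocated shadow page with its role, gfn and parent SPTE. */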
284static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
285 gfn_t gfn, union kvm_mmu_page_role role)
a82070b6 286{
287 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
288
a3aca4de 289 sp->role = role;
02c00b3a 290 sp->gfn = gfn;
c10743a1 291 sp->ptep = sptep;
292 sp->tdp_mmu_page = true;
293
33dd3574 294 trace_kvm_mmu_get_page(sp, true);
295}
296
297static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
298 struct tdp_iter *iter)
02c00b3a 299{
a3aca4de 300 struct kvm_mmu_page *parent_sp;
02c00b3a 301 union kvm_mmu_page_role role;
302
303 parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
304
305 role = parent_sp->role;
306 role.level--;
307
c10743a1 308 tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
309}
310
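/*
 * Return the physical address of the vCPU's TDP MMU root, reusing an
 * existing valid root with a matching role or allocating a new one.
 */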
311hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
312{
7a458f0e 313 union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
314 struct kvm *kvm = vcpu->kvm;
315 struct kvm_mmu_page *root;
316
6e6ec584 317 lockdep_assert_held_write(&kvm->mmu_lock);
02c00b3a 318
319 /*
320 * Check for an existing root before allocating a new one. Note, the
321 * role check prevents consuming an invalid root.
322 */
a3f15bda 323 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
fb101293 324 if (root->role.word == role.word &&
ad6d6b94 325 kvm_tdp_mmu_get_root(root))
6e6ec584 326 goto out;
327 }
328
a82070b6 329 root = tdp_mmu_alloc_sp(vcpu);
c10743a1 330 tdp_mmu_init_sp(root, NULL, 0, role);
a82070b6 331
11cccf5c 332 refcount_set(&root->tdp_mmu_root_count, 1);
02c00b3a 333
334 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
335 list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
336 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
02c00b3a 337
6e6ec584 338out:
02c00b3a 339 return __pa(root->spt);
fe5db27d 340}
341
342static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
343 u64 old_spte, u64 new_spte, int level,
344 bool shared);
2f2fad08 345
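/*
 * Propagate the Accessed bit to the primary MMU when an accessed leaf
 * SPTE is zapped, loses its Accessed bit, or changes PFN.
 */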
346static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
347{
348 if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
349 return;
350
351 if (is_accessed_spte(old_spte) &&
352 (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
353 spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
354 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
355}
356
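/*
 * Mark the page dirty in the memslot's dirty bitmap when a 4K SPTE is
 * made writable.
 */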
357static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
358 u64 old_spte, u64 new_spte, int level)
359{
360 bool pfn_changed;
361 struct kvm_memory_slot *slot;
362
363 if (level > PG_LEVEL_4K)
364 return;
365
366 pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
367
368 if ((!is_writable_pte(old_spte) || pfn_changed) &&
369 is_writable_pte(new_spte)) {
370 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
fb04a1ed 371 mark_page_dirty_in_slot(kvm, slot, gfn);
372 }
373}
374
a9442f59 375/**
c298a30c 376 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
377 *
378 * @kvm: kvm instance
379 * @sp: the page to be removed
380 * @shared: This operation may not be running under the exclusive use of
381 * the MMU lock and the operation must synchronize with other
382 * threads that might be adding or removing pages.
a9442f59 383 */
384static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
385 bool shared)
a9442f59 386{
387 if (shared)
388 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
389 else
390 lockdep_assert_held_write(&kvm->mmu_lock);
391
392 list_del(&sp->link);
393 if (sp->lpage_disallowed)
394 unaccount_huge_nx_page(kvm, sp);
395
396 if (shared)
397 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
398}
399
a066e61f 400/**
0f53dfa3 401 * handle_removed_pt() - handle a page table removed from the TDP structure
402 *
403 * @kvm: kvm instance
404 * @pt: the page removed from the paging structure
405 * @shared: This operation may not be running under the exclusive use
406 * of the MMU lock and the operation must synchronize with other
407 * threads that might be modifying SPTEs.
408 *
409 * Given a page table that has been removed from the TDP paging structure,
410 * iterates through the page table to clear SPTEs and free child page tables.
411 *
412 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
413 * protection. Since this thread removed it from the paging structure,
414 * this thread will be responsible for ensuring the page is freed. Hence the
415 * early rcu_dereferences in the function.
a066e61f 416 */
0f53dfa3 417static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
a066e61f 418{
70fb3e41 419 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
a066e61f 420 int level = sp->role.level;
e25f0e0c 421 gfn_t base_gfn = sp->gfn;
422 int i;
423
424 trace_kvm_mmu_prepare_zap_page(sp);
425
c298a30c 426 tdp_mmu_unlink_sp(kvm, sp, shared);
a066e61f 427
2ca3129e 428 for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
ba3a6120 429 tdp_ptep_t sptep = pt + i;
574c3c55 430 gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
ba3a6120 431 u64 old_spte;
432
433 if (shared) {
434 /*
435 * Set the SPTE to a nonpresent value that other
436 * threads will not overwrite. If the SPTE was
437 * already marked as removed then another thread
438 * handling a page fault could overwrite it, so
439 * keep writing until the value being replaced is
440 * something other than the removed SPTE value.
441 */
442 for (;;) {
443 old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
444 if (!is_removed_spte(old_spte))
445 break;
446 cpu_relax();
447 }
9a77daac 448 } else {
449 /*
450 * If the SPTE is not MMU-present, there is no backing
451 * page associated with the SPTE and so no side effects
452 * that need to be recorded, and exclusive ownership of
453 * mmu_lock ensures the SPTE can't be made present.
454 * Note, zapping MMIO SPTEs is also unnecessary as they
455 * are guarded by the memslots generation, not by being
456 * unreachable.
457 */
458 old_spte = kvm_tdp_mmu_read_spte(sptep);
459 if (!is_shadow_present_pte(old_spte))
8df9f1af 460 continue;
461
462 /*
463 * Use the common helper instead of a raw WRITE_ONCE as
464 * the SPTE needs to be updated atomically if it can be
465 * modified by a different vCPU outside of mmu_lock.
466 * Even though the parent SPTE is !PRESENT, the TLB
467 * hasn't yet been flushed, and both Intel and AMD
468 * document that A/D assists can use upper-level PxE
469 * entries that are cached in the TLB, i.e. the CPU can
470 * still access the page and mark it dirty.
471 *
472 * No retry is needed in the atomic update path as the
473 * sole concern is dropping a Dirty bit, i.e. no other
474 * task can zap/remove the SPTE as mmu_lock is held for
475 * write. Marking the SPTE as a removed SPTE is not
476 * strictly necessary for the same reason, but using
477 * the removed SPTE value keeps the shared/exclusive
478 * paths consistent and allows the handle_changed_spte()
479 * call below to hardcode the new value to REMOVED_SPTE.
480 *
481 * Note, even though dropping a Dirty bit is the only
482 * scenario where a non-atomic update could result in a
483 * functional bug, simply checking the Dirty bit isn't
484 * sufficient as a fast page fault could read the upper
485 * level SPTE before it is zapped, and then make this
486 * target SPTE writable, resume the guest, and set the
487 * Dirty bit between reading the SPTE above and writing
488 * it here.
e25f0e0c 489 */
490 old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
491 REMOVED_SPTE, level);
9a77daac 492 }
e25f0e0c 493 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
ba3a6120 494 old_spte, REMOVED_SPTE, level, shared);
495 }
496
7cca2d0b 497 call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
498}
499
2f2fad08 500/**
7f6231a3 501 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
502 * @kvm: kvm instance
503 * @as_id: the address space of the paging structure the SPTE was a part of
504 * @gfn: the base GFN that was mapped by the SPTE
505 * @old_spte: The value of the SPTE before the change
506 * @new_spte: The value of the SPTE after the change
507 * @level: the level of the PT the SPTE is part of in the paging structure
508 * @shared: This operation may not be running under the exclusive use of
509 * the MMU lock and the operation must synchronize with other
510 * threads that might be modifying SPTEs.
511 *
512 * Handle bookkeeping that might result from the modification of a SPTE.
513 * This function must be called for all TDP SPTE modifications.
514 */
515static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
516 u64 old_spte, u64 new_spte, int level,
517 bool shared)
518{
519 bool was_present = is_shadow_present_pte(old_spte);
520 bool is_present = is_shadow_present_pte(new_spte);
521 bool was_leaf = was_present && is_last_spte(old_spte, level);
522 bool is_leaf = is_present && is_last_spte(new_spte, level);
523 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
524
525 WARN_ON(level > PT64_ROOT_MAX_LEVEL);
526 WARN_ON(level < PG_LEVEL_4K);
764388ce 527 WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
528
529 /*
530 * If this warning were to trigger it would indicate that there was a
531 * missing MMU notifier or a race with some notifier handler.
532 * A present, leaf SPTE should never be directly replaced with another
d9f6e12f 533 * present leaf SPTE pointing to a different PFN. A notifier handler
534 * should be zapping the SPTE before the main MM's page table is
535 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
536 * thread before replacement.
537 */
538 if (was_leaf && is_leaf && pfn_changed) {
539 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
540 "SPTE with another present leaf SPTE mapping a\n"
541 "different PFN!\n"
542 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
543 as_id, gfn, old_spte, new_spte, level);
544
545 /*
546 * Crash the host to prevent error propagation and guest data
d9f6e12f 547 * corruption.
548 */
549 BUG();
550 }
551
552 if (old_spte == new_spte)
553 return;
554
555 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
556
557 if (is_leaf)
558 check_spte_writable_invariants(new_spte);
559
560 /*
561 * The only times a SPTE should be changed from a non-present to
562 * non-present state is when an MMIO entry is installed/modified/
563 * removed. In that case, there is nothing to do here.
564 */
565 if (!was_present && !is_present) {
566 /*
567 * If this change does not involve a MMIO SPTE or removed SPTE,
568 * it is unexpected. Log the change, though it should not
569 * impact the guest since both the former and current SPTEs
570 * are nonpresent.
2f2fad08 571 */
572 if (WARN_ON(!is_mmio_spte(old_spte) &&
573 !is_mmio_spte(new_spte) &&
574 !is_removed_spte(new_spte)))
575 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
576 "should not be replaced with another,\n"
577 "different nonpresent SPTE, unless one or both\n"
578 "are MMIO SPTEs, or the new SPTE is\n"
579 "a temporary removed SPTE.\n"
580 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
581 as_id, gfn, old_spte, new_spte, level);
582 return;
583 }
584
585 if (is_leaf != was_leaf)
586 kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
587
588 if (was_leaf && is_dirty_spte(old_spte) &&
64bb2769 589 (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
590 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
591
592 /*
593 * Recursively handle child PTs if the change removed a subtree from
594 * the paging structure. Note the WARN on the PFN changing without the
595 * SPTE being converted to a hugepage (leaf) or being zapped. Shadow
596 * pages are kernel allocations and should never be migrated.
2f2fad08 597 */
598 if (was_present && !was_leaf &&
599 (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
0f53dfa3 600 handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
601}
602
603static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
604 u64 old_spte, u64 new_spte, int level,
605 bool shared)
2f2fad08 606{
607 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
608 shared);
f8e14497 609 handle_changed_spte_acc_track(old_spte, new_spte, level);
610 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
611 new_spte, level);
2f2fad08 612}
faaf05b0 613
9a77daac 614/*
615 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
616 * and handle the associated bookkeeping. Do not mark the page dirty
24ae4cfa 617 * in KVM's dirty bitmaps.
9a77daac 618 *
619 * If setting the SPTE fails because it has changed, iter->old_spte will be
620 * refreshed to the current value of the spte.
621 *
622 * @kvm: kvm instance
623 * @iter: a tdp_iter instance currently on the SPTE that should be set
624 * @new_spte: The value the SPTE should be set to
625 * Return:
626 * * 0 - If the SPTE was set.
627 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
628 * no side-effects other than setting iter->old_spte to the last
629 * known value of the spte.
9a77daac 630 */
631static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
632 struct tdp_iter *iter,
633 u64 new_spte)
9a77daac 634{
3255530a 635 u64 *sptep = rcu_dereference(iter->sptep);
3255530a 636
08f07c80 637 /*
638 * The caller is responsible for ensuring the old SPTE is not a REMOVED
639 * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE,
640 * and pre-checking before inserting a new SPTE is advantageous as it
641 * avoids unnecessary work.
08f07c80 642 */
643 WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
644
645 lockdep_assert_held_read(&kvm->mmu_lock);
08f07c80 646
647 /*
648 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
649 * does not hold the mmu_lock.
650 */
aee98a68 651 if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
3e72c791 652 return -EBUSY;
9a77daac 653
654 __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
655 new_spte, iter->level, true);
656 handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
9a77daac 657
3e72c791 658 return 0;
659}
660
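/*
 * Zap a SPTE while holding mmu_lock for read: freeze it with the
 * removed SPTE value, flush remote TLBs, then clear it.
 */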
661static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
662 struct tdp_iter *iter)
08f07c80 663{
664 int ret;
665
666 /*
667 * Freeze the SPTE by setting it to a special,
668 * non-present value. This will stop other threads from
669 * immediately installing a present entry in its place
670 * before the TLBs are flushed.
671 */
672 ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
673 if (ret)
674 return ret;
675
676 kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
677 KVM_PAGES_PER_HPAGE(iter->level));
678
679 /*
680 * No other thread can overwrite the removed SPTE as they must either
681 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
682 * overwrite the special removed SPTE value. No bookkeeping is needed
683 * here since the SPTE is going from non-present to non-present. Use
684 * the raw write helper to avoid an unnecessary check on volatile bits.
08f07c80 685 */
ba3a6120 686 __kvm_tdp_mmu_write_spte(iter->sptep, 0);
08f07c80 687
3e72c791 688 return 0;
689}
690
9a77daac 691
692/*
693 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
694 * @kvm: KVM instance
695 * @as_id: Address space ID, i.e. regular vs. SMM
696 * @sptep: Pointer to the SPTE
697 * @old_spte: The current value of the SPTE
698 * @new_spte: The new value that will be set for the SPTE
699 * @gfn: The base GFN that was (or will be) mapped by the SPTE
700 * @level: The level _containing_ the SPTE (its parent PT's level)
701 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
702 * of the page. Should be set unless handling an MMU
703 * notifier for access tracking. Leaving record_acc_track
704 * unset in that case prevents page accesses from being
705 * double counted.
706 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
707 * appropriate for the change being made. Should be set
708 * unless performing certain dirty logging operations.
709 * Leaving record_dirty_log unset in that case prevents page
710 * writes from being double counted.
711 *
712 * Returns the old SPTE value, which _may_ be different than @old_spte if the
713 * SPTE had volatile bits.
fe43fa2f 714 */
715static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
716 u64 old_spte, u64 new_spte, gfn_t gfn, int level,
717 bool record_acc_track, bool record_dirty_log)
faaf05b0 718{
531810ca 719 lockdep_assert_held_write(&kvm->mmu_lock);
3a9a4aa5 720
08f07c80 721 /*
966da62a 722 * No thread should be using this function to set SPTEs to or from the
723 * temporary removed SPTE value.
724 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
725 * should be used. If operating under the MMU lock in write mode, the
726 * use of the removed SPTE should not be necessary.
727 */
626808d1 728 WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
08f07c80 729
ba3a6120 730 old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
731
732 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
f8e14497 733
f8e14497 734 if (record_acc_track)
626808d1 735 handle_changed_spte_acc_track(old_spte, new_spte, level);
a6a0b05d 736 if (record_dirty_log)
737 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
738 new_spte, level);
ba3a6120 739 return old_spte;
740}
741
742static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
743 u64 new_spte, bool record_acc_track,
744 bool record_dirty_log)
745{
746 WARN_ON_ONCE(iter->yielded);
747
748 iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
749 iter->old_spte, new_spte,
750 iter->gfn, iter->level,
751 record_acc_track, record_dirty_log);
752}
753
754static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
755 u64 new_spte)
756{
626808d1 757 _tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
f8e14497 758}
faaf05b0 759
760static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
761 struct tdp_iter *iter,
762 u64 new_spte)
763{
626808d1 764 _tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
765}
766
767static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
768 struct tdp_iter *iter,
769 u64 new_spte)
770{
626808d1 771 _tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
772}
773
774#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
77aa6075 775 for_each_tdp_pte(_iter, _root, _start, _end)
faaf05b0 776
777#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
778 tdp_root_for_each_pte(_iter, _root, _start, _end) \
779 if (!is_shadow_present_pte(_iter.old_spte) || \
780 !is_last_spte(_iter.old_spte, _iter.level)) \
781 continue; \
782 else
783
bb18842e 784#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
b9e5603c 785 for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
bb18842e 786
787/*
788 * Yield if the MMU lock is contended or this thread needs to return control
789 * to the scheduler.
790 *
791 * If this function should yield and flush is set, it will perform a remote
792 * TLB flush before yielding.
793 *
794 * If this function yields, iter->yielded is set and the caller must skip to
795 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
796 * over the paging structures to allow the iterator to continue its traversal
797 * from the paging structure root.
e28a436c 798 *
3a0f64de 799 * Returns true if this function yielded.
e28a436c 800 */
801static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
802 struct tdp_iter *iter,
803 bool flush, bool shared)
a6a0b05d 804{
805 WARN_ON(iter->yielded);
806
807 /* Ensure forward progress has been made before yielding. */
808 if (iter->next_last_level_gfn == iter->yielded_gfn)
809 return false;
810
531810ca 811 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
812 if (flush)
813 kvm_flush_remote_tlbs(kvm);
814
815 rcu_read_unlock();
816
817 if (shared)
818 cond_resched_rwlock_read(&kvm->mmu_lock);
819 else
820 cond_resched_rwlock_write(&kvm->mmu_lock);
821
7cca2d0b 822 rcu_read_lock();
823
824 WARN_ON(iter->gfn > iter->next_last_level_gfn);
825
3a0f64de 826 iter->yielded = true;
a6a0b05d 827 }
e28a436c 828
3a0f64de 829 return iter->yielded;
830}
831
86931ff7 832static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
833{
834 /*
835 * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with
836 * a gpa range that would exceed the max gfn, and KVM does not create
837 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
838 * the slow emulation path every time.
e2b5b21d 839 */
86931ff7 840 return kvm_mmu_max_gfn() + 1;
841}
842
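/* Zap all present SPTEs at or below @zap_level in @root, yielding as needed. */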
843static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
844 bool shared, int zap_level)
e2b5b21d 845{
846 struct tdp_iter iter;
847
86931ff7 848 gfn_t end = tdp_mmu_max_gfn_exclusive();
e2b5b21d
SC
849 gfn_t start = 0;
850
851 for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
852retry:
853 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
854 continue;
855
856 if (!is_shadow_present_pte(iter.old_spte))
857 continue;
858
859 if (iter.level > zap_level)
860 continue;
861
862 if (!shared)
863 tdp_mmu_set_spte(kvm, &iter, 0);
864 else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
865 goto retry;
866 }
867}
868
869static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
870 bool shared)
871{
872
8351779c
PB
873 /*
874 * The root must have an elevated refcount so that it's reachable via
875 * mmu_notifier callbacks, which allows this path to yield and drop
876 * mmu_lock. When handling an unmap/release mmu_notifier command, KVM
877 * must drop all references to relevant pages prior to completing the
878 * callback. Dropping mmu_lock with an unreachable root would result
879 * in zapping SPTEs after a relevant mmu_notifier callback completes
880 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
881 * dirty accessed bits to the SPTE's associated struct page.
882 */
883 WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
884
e2b5b21d
SC
885 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
886
887 rcu_read_lock();
888
889 /*
1b6043e8
SC
890 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
891 * split the zap into two passes. On the first pass, zap at the 1gb
892 * level, and then zap top-level SPs on the second pass. "1gb" is not
893 * arbitrary, as KVM must be able to zap a 1gb shadow page without
894 * inducing a stall to allow in-place replacement with a 1gb hugepage.
895 *
896 * Because zapping a SP recurses on its children, stepping down to
897 * PG_LEVEL_4K in the iterator itself is unnecessary.
e2b5b21d 898 */
1b6043e8
SC
899 __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
900 __tdp_mmu_zap_root(kvm, root, shared, root->role.level);
e2b5b21d
SC
901
902 rcu_read_unlock();
903}
904
c10743a1
SC
905bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
906{
907 u64 old_spte;
908
909 /*
910 * This helper intentionally doesn't allow zapping a root shadow page,
911 * which doesn't have a parent page table and thus no associated entry.
912 */
913 if (WARN_ON_ONCE(!sp->ptep))
914 return false;
915
c10743a1 916 old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
bb95dfb9 917 if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
c10743a1 918 return false;
c10743a1
SC
919
920 __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
921 sp->gfn, sp->role.level + 1, true, true);
922
c10743a1
SC
923 return true;
924}
925
faaf05b0 926/*
927 * Zap leaf SPTEs for the range of gfns, [start, end). Returns true if SPTEs
928 * have been cleared and a TLB flush is needed before releasing the MMU lock.
6103bc07 929 *
930 * If can_yield is true, will release the MMU lock and reschedule if the
931 * scheduler needs the CPU or there is contention on the MMU lock. If this
932 * function cannot yield, it will not release the MMU lock or reschedule and
933 * the caller must ensure it does not supply too large a GFN range, or the
6103bc07 934 * operation can cause a soft lockup.
faaf05b0 935 */
f47e5bbb
SC
936static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
937 gfn_t start, gfn_t end, bool can_yield, bool flush)
faaf05b0
BG
938{
939 struct tdp_iter iter;
faaf05b0 940
86931ff7 941 end = min(end, tdp_mmu_max_gfn_exclusive());
524a1e4e 942
acbda82a 943 lockdep_assert_held_write(&kvm->mmu_lock);
6103bc07 944
7cca2d0b
BG
945 rcu_read_lock();
946
f47e5bbb 947 for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
1af4a960 948 if (can_yield &&
acbda82a 949 tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
a835429c 950 flush = false;
1af4a960
BG
951 continue;
952 }
953
f47e5bbb 954 if (!is_shadow_present_pte(iter.old_spte) ||
faaf05b0
BG
955 !is_last_spte(iter.old_spte, iter.level))
956 continue;
957
acbda82a
SC
958 tdp_mmu_set_spte(kvm, &iter, 0);
959 flush = true;
faaf05b0 960 }
7cca2d0b 961
fcb93eb6
PB
962 rcu_read_unlock();
963
f47e5bbb
SC
964 /*
965 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
966 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
967 */
968 return flush;
faaf05b0
BG
969}
970
971/*
972 * Tears down the mappings for the range of gfns, [start, end), and frees the
973 * non-root pages mapping GFNs strictly within that range. Returns true if
974 * SPTEs have been cleared and a TLB flush is needed before releasing the
975 * MMU lock.
976 */
f47e5bbb
SC
977bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
978 bool can_yield, bool flush)
faaf05b0
BG
979{
980 struct kvm_mmu_page *root;
faaf05b0 981
614f6970 982 for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
f47e5bbb 983 flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
faaf05b0 984
faaf05b0
BG
985 return flush;
986}
987
988void kvm_tdp_mmu_zap_all(struct kvm *kvm)
989{
e2b5b21d 990 struct kvm_mmu_page *root;
2b9663d8
SC
991 int i;
992
77c8cd6b 993 /*
22b94c4b
PB
994 * Zap all roots, including invalid roots, as all SPTEs must be dropped
995 * before returning to the caller. Zap directly even if the root is
996 * also being zapped by a worker. Walking zapped top-level SPTEs isn't
997 * all that expensive and mmu_lock is already held, which means the
998 * worker has yielded, i.e. flushing the work instead of zapping here
999 * isn't guaranteed to be any faster.
1000 *
1001 * A TLB flush is unnecessary, KVM zaps everything if and only if the VM
1002 * is being destroyed or the userspace VMM has exited. In both cases,
1003 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
1004 */
e2b5b21d
SC
1005 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1006 for_each_tdp_mmu_root_yield_safe(kvm, root, i)
1007 tdp_mmu_zap_root(kvm, root, false);
1008 }
faaf05b0 1009}
bb18842e 1010
4c6654bd 1011/*
f28e9c7f 1012 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
22b94c4b 1013 * zap" completes.
4c6654bd
BG
1014 */
1015void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
1016{
22b94c4b 1017 flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
faaf05b0 1018}
bb18842e 1019
b7cccd39 1020/*
f28e9c7f 1021 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
22b94c4b
PB
1022 * is about to be zapped, e.g. in response to a memslots update. The actual
1023 * zapping is performed asynchronously, so a reference is taken on all roots.
1024 * Using a separate workqueue makes it easy to ensure that the destruction is
1025 * performed before the "fast zap" completes, without keeping a separate list
1026 * of invalidated roots; the list is effectively the list of work items in
1027 * the workqueue.
b7cccd39 1028 *
22b94c4b
PB
1029 * Get a reference even if the root is already invalid, the asynchronous worker
1030 * assumes it was gifted a reference to the root it processes. Because mmu_lock
1031 * is held for write, it should be impossible to observe a root with zero refcount,
1032 * i.e. the list of roots cannot be stale.
4c6654bd 1033 *
b7cccd39
BG
1034 * This has essentially the same effect for the TDP MMU
1035 * as updating mmu_valid_gen does for the shadow MMU.
1036 */
1037void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
1038{
1039 struct kvm_mmu_page *root;
1040
1041 lockdep_assert_held_write(&kvm->mmu_lock);
f28e9c7f 1042 list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
efd995da
PB
1043 if (!root->role.invalid &&
1044 !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
4c6654bd 1045 root->role.invalid = true;
22b94c4b
PB
1046 tdp_mmu_schedule_zap_root(kvm, root);
1047 }
f28e9c7f 1048 }
b7cccd39
BG
1049}
1050
bb18842e
BG
1051/*
1052 * Installs a last-level SPTE to handle a TDP page fault.
1053 * (NPT/EPT violation/misconfiguration)
1054 */
cdc47767
PB
1055static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
1056 struct kvm_page_fault *fault,
1057 struct tdp_iter *iter)
bb18842e 1058{
c435d4b7 1059 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
bb18842e 1060 u64 new_spte;
57a3e96d 1061 int ret = RET_PF_FIXED;
ad67e480 1062 bool wrprot = false;
bb18842e 1063
7158bee4 1064 WARN_ON(sp->role.level != fault->goal_level);
e710c5f6 1065 if (unlikely(!fault->slot))
bb18842e 1066 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
9a77daac 1067 else
53597858 1068 wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
2839180c 1069 fault->pfn, iter->old_spte, fault->prefetch, true,
7158bee4 1070 fault->map_writable, &new_spte);
bb18842e
BG
1071
1072 if (new_spte == iter->old_spte)
1073 ret = RET_PF_SPURIOUS;
3e72c791 1074 else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
9a77daac 1075 return RET_PF_RETRY;
1076 else if (is_shadow_present_pte(iter->old_spte) &&
1077 !is_last_spte(iter->old_spte, iter->level))
1078 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1079 KVM_PAGES_PER_HPAGE(iter->level + 1));
bb18842e
BG
1080
1081 /*
1082 * If the page fault was caused by a write but the page is write
1083 * protected, emulation is needed. If the emulation was skipped,
1084 * the vCPU would have the same fault again.
1085 */
ad67e480 1086 if (wrprot) {
cdc47767 1087 if (fault->write)
bb18842e 1088 ret = RET_PF_EMULATE;
bb18842e
BG
1089 }
1090
1091 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
9a77daac 1092 if (unlikely(is_mmio_spte(new_spte))) {
1075d41e 1093 vcpu->stat.pf_mmio_spte_created++;
9a77daac
BG
1094 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1095 new_spte);
bb18842e 1096 ret = RET_PF_EMULATE;
3849e092 1097 } else {
9a77daac
BG
1098 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1099 rcu_dereference(iter->sptep));
3849e092 1100 }
bb18842e 1101
bb18842e
BG
1102 return ret;
1103}
1104
7b7e1ab6 1105/*
cb00a70b
DM
1106 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1107 * provided page table.
7b7e1ab6
DM
1108 *
1109 * @kvm: kvm instance
1110 * @iter: a tdp_iter instance currently on the SPTE that should be set
1111 * @sp: The new TDP page table to install.
1112 * @account_nx: True if this page table is being installed to split a
1113 * non-executable huge page.
cb00a70b 1114 * @shared: This operation is running under the MMU lock in read mode.
7b7e1ab6
DM
1115 *
1116 * Returns: 0 if the new page table was installed. Non-0 if the page table
1117 * could not be installed (e.g. the atomic compare-exchange failed).
1118 */
cb00a70b
DM
1119static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1120 struct kvm_mmu_page *sp, bool account_nx,
1121 bool shared)
7b7e1ab6 1122{
54275f74 1123 u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
cb00a70b 1124 int ret = 0;
7b7e1ab6 1125
cb00a70b
DM
1126 if (shared) {
1127 ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1128 if (ret)
1129 return ret;
1130 } else {
1131 tdp_mmu_set_spte(kvm, iter, spte);
1132 }
7b7e1ab6
DM
1133
1134 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1135 list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
1136 if (account_nx)
1137 account_huge_nx_page(kvm, sp);
1138 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1139
1140 return 0;
1141}
1142
bb18842e
BG
1143/*
1144 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1145 * page tables and SPTEs to translate the faulting guest physical address.
1146 */
2f6305dd 1147int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
bb18842e 1148{
bb18842e
BG
1149 struct kvm_mmu *mmu = vcpu->arch.mmu;
1150 struct tdp_iter iter;
89c0fd49 1151 struct kvm_mmu_page *sp;
bb18842e 1152 int ret;
bb18842e 1153
73a3c659 1154 kvm_mmu_hugepage_adjust(vcpu, fault);
bb18842e 1155
f0066d94 1156 trace_kvm_mmu_spte_requested(fault);
7cca2d0b
BG
1157
1158 rcu_read_lock();
1159
2f6305dd 1160 tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
73a3c659 1161 if (fault->nx_huge_page_workaround_enabled)
536f0e6a 1162 disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
bb18842e 1163
73a3c659 1164 if (iter.level == fault->goal_level)
bb18842e
BG
1165 break;
1166
1167 /*
1168 * If there is an SPTE mapping a large page at a higher level
1169 * than the target, that SPTE must be cleared and replaced
1170 * with a non-leaf SPTE.
1171 */
1172 if (is_shadow_present_pte(iter.old_spte) &&
1173 is_large_pte(iter.old_spte)) {
3e72c791 1174 if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
9a77daac 1175 break;
bb18842e 1176
bb18842e
BG
1177 /*
1178 * The iter must explicitly re-read the spte here
1179 * because the new value informs the !present
1180 * path below.
1181 */
0e587aa7 1182 iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
bb18842e
BG
1183 }
1184
1185 if (!is_shadow_present_pte(iter.old_spte)) {
7b7e1ab6
DM
1186 bool account_nx = fault->huge_page_disallowed &&
1187 fault->req_level >= iter.level;
1188
ff76d506 1189 /*
c4342633 1190 * If the SPTE has been frozen by another thread, just
1191 * give up and retry, avoiding unnecessary page table
1192 * allocation and free.
1193 */
1194 if (is_removed_spte(iter.old_spte))
1195 break;
1196
a82070b6
DM
1197 sp = tdp_mmu_alloc_sp(vcpu);
1198 tdp_mmu_init_child_sp(sp, &iter);
1199
cb00a70b 1200 if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
9a77daac
BG
1201 tdp_mmu_free_sp(sp);
1202 break;
1203 }
bb18842e
BG
1204 }
1205 }
1206
58298b06
SC
1207 /*
1208 * Force the guest to retry the access if the upper level SPTEs aren't
1209 * in place, or if the target leaf SPTE is frozen by another CPU.
1210 */
1211 if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) {
7cca2d0b 1212 rcu_read_unlock();
bb18842e 1213 return RET_PF_RETRY;
7cca2d0b 1214 }
bb18842e 1215
cdc47767 1216 ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
7cca2d0b 1217 rcu_read_unlock();
bb18842e
BG
1218
1219 return ret;
1220}
063afacd 1221
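/* MMU notifier unmap hook: zap the leaf SPTEs covering the notifier range. */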
1222bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1223 bool flush)
063afacd 1224{
1225 return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
1226 range->end, range->may_block, flush);
063afacd
BG
1227}
1228
3039bcc7
SC
1229typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1230 struct kvm_gfn_range *range);
063afacd 1231
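/*
 * Invoke @handler on each leaf SPTE in the notifier range, for every
 * root in the range's address space.
 */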
1232static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1233 struct kvm_gfn_range *range,
1234 tdp_handler_t handler)
063afacd 1235{
1236 struct kvm_mmu_page *root;
1237 struct tdp_iter iter;
1238 bool ret = false;
1239
e1eed584
SC
1240 /*
1241 * Don't support rescheduling, none of the MMU notifiers that funnel
1242 * into this helper allow blocking; it'd be dead, wasteful code.
1243 */
3039bcc7 1244 for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
a151acec
SC
1245 rcu_read_lock();
1246
3039bcc7
SC
1247 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1248 ret |= handler(kvm, &iter, range);
3039bcc7 1249
a151acec
SC
1250 rcu_read_unlock();
1251 }
3039bcc7
SC
1252
1253 return ret;
063afacd 1254}
f8e14497
BG
1255
1256/*
1257 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
1258 * if any of the GFNs in the range have been accessed.
1259 */
3039bcc7
SC
1260static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1261 struct kvm_gfn_range *range)
f8e14497 1262{
f8e14497
BG
1263 u64 new_spte = 0;
1264
3039bcc7
SC
1265 /* If we have a non-accessed entry we don't need to change the pte. */
1266 if (!is_accessed_spte(iter->old_spte))
1267 return false;
7cca2d0b 1268
3039bcc7
SC
1269 new_spte = iter->old_spte;
1270
1271 if (spte_ad_enabled(new_spte)) {
1272 new_spte &= ~shadow_accessed_mask;
1273 } else {
f8e14497 1274 /*
3039bcc7
SC
1275 * Capture the dirty status of the page, so that it doesn't get
1276 * lost when the SPTE is marked for access tracking.
f8e14497 1277 */
3039bcc7
SC
1278 if (is_writable_pte(new_spte))
1279 kvm_set_pfn_dirty(spte_to_pfn(new_spte));
f8e14497 1280
3039bcc7 1281 new_spte = mark_spte_for_access_track(new_spte);
f8e14497
BG
1282 }
1283
3039bcc7 1284 tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
7cca2d0b 1285
3039bcc7 1286 return true;
f8e14497
BG
1287}
1288
3039bcc7 1289bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
f8e14497 1290{
3039bcc7 1291 return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
f8e14497
BG
1292}
1293
3039bcc7
SC
1294static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1295 struct kvm_gfn_range *range)
f8e14497 1296{
3039bcc7 1297 return is_accessed_spte(iter->old_spte);
f8e14497
BG
1298}
1299
3039bcc7 1300bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
f8e14497 1301{
3039bcc7 1302 return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
f8e14497 1303}
1d8dd6b3 1304
3039bcc7
SC
1305static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1306 struct kvm_gfn_range *range)
1d8dd6b3 1307{
1d8dd6b3 1308 u64 new_spte;
7cca2d0b 1309
3039bcc7
SC
1310 /* Huge pages aren't expected to be modified without first being zapped. */
1311 WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1d8dd6b3 1312
3039bcc7
SC
1313 if (iter->level != PG_LEVEL_4K ||
1314 !is_shadow_present_pte(iter->old_spte))
1315 return false;
1d8dd6b3 1316
1317 /*
1318 * Note, when changing a read-only SPTE, it's not strictly necessary to
1319 * zero the SPTE before setting the new PFN, but doing so preserves the
1320 * invariant that the PFN of a present leaf SPTE can never change.
1321 * See __handle_changed_spte().
1322 */
1323 tdp_mmu_set_spte(kvm, iter, 0);
1d8dd6b3 1324
1325 if (!pte_write(range->pte)) {
1326 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1327 pte_pfn(range->pte));
1d8dd6b3 1328
3039bcc7 1329 tdp_mmu_set_spte(kvm, iter, new_spte);
1d8dd6b3
BG
1330 }
1331
3039bcc7 1332 return true;
1d8dd6b3
BG
1333}
1334
3039bcc7
SC
1335/*
1336 * Handle the changed_pte MMU notifier for the TDP MMU.
1337 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1338 * notifier.
1339 * Returns non-zero if a flush is needed before releasing the MMU lock.
1340 */
1341bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1d8dd6b3 1342{
93fa50f6
SC
1343 /*
1344 * No need to handle the remote TLB flush under RCU protection, the
1345 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
1346 * shadow page. See the WARN on pfn_changed in __handle_changed_spte().
1347 */
1348 return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1d8dd6b3
BG
1349}
1350
a6a0b05d 1351/*
bedd9195
DM
1352 * Remove write access from all SPTEs at or above min_level that map GFNs
1353 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1354 * be flushed.
a6a0b05d
BG
1355 */
1356static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1357 gfn_t start, gfn_t end, int min_level)
1358{
1359 struct tdp_iter iter;
1360 u64 new_spte;
1361 bool spte_set = false;
1362
7cca2d0b
BG
1363 rcu_read_lock();
1364
a6a0b05d
BG
1365 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1366
77aa6075 1367 for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
24ae4cfa
BG
1368retry:
1369 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1af4a960
BG
1370 continue;
1371
a6a0b05d 1372 if (!is_shadow_present_pte(iter.old_spte) ||
0f99ee2c
BG
1373 !is_last_spte(iter.old_spte, iter.level) ||
1374 !(iter.old_spte & PT_WRITABLE_MASK))
a6a0b05d
BG
1375 continue;
1376
1377 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1378
3e72c791 1379 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
24ae4cfa 1380 goto retry;
3255530a 1381
a6a0b05d 1382 spte_set = true;
a6a0b05d 1383 }
7cca2d0b
BG
1384
1385 rcu_read_unlock();
a6a0b05d
BG
1386 return spte_set;
1387}
1388
1389/*
1390 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1391 * only affect leaf SPTEs down to min_level.
1392 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1393 */
269e9552
HM
1394bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1395 const struct kvm_memory_slot *slot, int min_level)
a6a0b05d
BG
1396{
1397 struct kvm_mmu_page *root;
a6a0b05d
BG
1398 bool spte_set = false;
1399
24ae4cfa 1400 lockdep_assert_held_read(&kvm->mmu_lock);
a6a0b05d 1401
d62007ed 1402 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
a6a0b05d
BG
1403 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1404 slot->base_gfn + slot->npages, min_level);
a6a0b05d
BG
1405
1406 return spte_set;
1407}
1408
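/*
 * Allocate a shadow page for eager huge page splitting, bypassing the
 * per-vCPU memory caches.
 */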
1409static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1410{
1411 struct kvm_mmu_page *sp;
1412
1413 gfp |= __GFP_ZERO;
1414
1415 sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1416 if (!sp)
1417 return NULL;
1418
1419 sp->spt = (void *)__get_free_page(gfp);
1420 if (!sp->spt) {
1421 kmem_cache_free(mmu_page_header_cache, sp);
1422 return NULL;
1423 }
1424
1425 return sp;
1426}
1427
1428static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
cb00a70b
DM
1429 struct tdp_iter *iter,
1430 bool shared)
a3fe5dbd
DM
1431{
1432 struct kvm_mmu_page *sp;
1433
a3fe5dbd
DM
1434 /*
1435 * Since we are allocating while under the MMU lock we have to be
1436 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1437 * reclaim and to avoid making any filesystem callbacks (which can end
1438 * up invoking KVM MMU notifiers, resulting in a deadlock).
1439 *
1440 * If this allocation fails we drop the lock and retry with reclaim
1441 * allowed.
1442 */
1443 sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1444 if (sp)
1445 return sp;
1446
1447 rcu_read_unlock();
cb00a70b
DM
1448
1449 if (shared)
1450 read_unlock(&kvm->mmu_lock);
1451 else
1452 write_unlock(&kvm->mmu_lock);
a3fe5dbd
DM
1453
1454 iter->yielded = true;
1455 sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1456
cb00a70b
DM
1457 if (shared)
1458 read_lock(&kvm->mmu_lock);
1459 else
1460 write_lock(&kvm->mmu_lock);
1461
a3fe5dbd
DM
1462 rcu_read_lock();
1463
1464 return sp;
1465}
1466
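/*
 * Split the huge page mapped at @iter into the next lower level,
 * installing @sp as the new child page table.
 */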
1467static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1468 struct kvm_mmu_page *sp, bool shared)
1469{
1470 const u64 huge_spte = iter->old_spte;
1471 const int level = iter->level;
1472 int ret, i;
1473
1474 tdp_mmu_init_child_sp(sp, iter);
1475
1476 /*
1477 * No need for atomics when writing to sp->spt since the page table has
1478 * not been linked in yet and thus is not reachable from any other CPU.
1479 */
2ca3129e 1480 for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
084cc29f 1481 sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, level, i);
1482
1483 /*
1484 * Replace the huge spte with a pointer to the populated lower level
1485 * page table. Since we are making this change without a TLB flush vCPUs
1486 * will see a mix of the split mappings and the original huge mapping,
1487 * depending on what's currently in their TLB. This is fine from a
1488 * correctness standpoint since the translation will be the same either
1489 * way.
1490 */
cb00a70b 1491 ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
a3fe5dbd 1492 if (ret)
e0b728b1 1493 goto out;
1494
1495 /*
 1496	 * tdp_mmu_link_sp() will handle subtracting the huge page we
1497 * are overwriting from the page stats. But we have to manually update
1498 * the page stats with the new present child pages.
1499 */
2ca3129e 1500 kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
a3fe5dbd 1501
1502out:
1503 trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1504 return ret;
1505}
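/*
 * Illustrative sketch, not part of this file: the gfn range covered by each
 * of the SPTE_ENT_PER_PAGE child entries installed above. When a 1GB page
 * (level == PG_LEVEL_1G) is split, child i is a 2MB mapping that starts
 * i * KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) small pages into the huge page. The
 * helper name is hypothetical.
 */
static inline gfn_t example_split_child_gfn(gfn_t huge_gfn, int level, int i)
{
	/* Each child entry spans KVM_PAGES_PER_HPAGE(level - 1) 4K pages. */
	return huge_gfn + i * KVM_PAGES_PER_HPAGE(level - 1);
}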
1506
1507static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1508 struct kvm_mmu_page *root,
1509 gfn_t start, gfn_t end,
cb00a70b 1510 int target_level, bool shared)
1511{
1512 struct kvm_mmu_page *sp = NULL;
1513 struct tdp_iter iter;
1514 int ret = 0;
1515
1516 rcu_read_lock();
1517
1518 /*
1519 * Traverse the page table splitting all huge pages above the target
1520 * level into one lower level. For example, if we encounter a 1GB page
1521 * we split it into 512 2MB pages.
1522 *
1523 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1524 * to visit an SPTE before ever visiting its children, which means we
1525 * will correctly recursively split huge pages that are more than one
 1526	 * level above the target level (e.g. splitting a 1GB page into 512 2MB pages,
1527 * and then splitting each of those to 512 4KB pages).
1528 */
1529 for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1530retry:
cb00a70b 1531 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1532 continue;
1533
1534 if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1535 continue;
1536
1537 if (!sp) {
cb00a70b 1538 sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
1539 if (!sp) {
1540 ret = -ENOMEM;
1541 trace_kvm_mmu_split_huge_page(iter.gfn,
1542 iter.old_spte,
1543 iter.level, ret);
1544 break;
1545 }
1546
1547 if (iter.yielded)
1548 continue;
1549 }
1550
cb00a70b 1551 if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1552 goto retry;
1553
1554 sp = NULL;
1555 }
1556
1557 rcu_read_unlock();
1558
1559 /*
1560 * It's possible to exit the loop having never used the last sp if, for
1561 * example, a vCPU doing HugePage NX splitting wins the race and
1562 * installs its own sp in place of the last sp we tried to split.
1563 */
1564 if (sp)
1565 tdp_mmu_free_sp(sp);
1566
1567 return ret;
1568}
1569
cb00a70b 1570
1571/*
1572 * Try to split all huge pages mapped by the TDP MMU down to the target level.
1573 */
1574void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1575 const struct kvm_memory_slot *slot,
1576 gfn_t start, gfn_t end,
cb00a70b 1577 int target_level, bool shared)
1578{
1579 struct kvm_mmu_page *root;
1580 int r = 0;
1581
cb00a70b 1582 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
a3fe5dbd 1583
7c554d8e 1584 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
cb00a70b 1585 r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
a3fe5dbd 1586 if (r) {
cb00a70b 1587 kvm_tdp_mmu_put_root(kvm, root, shared);
1588 break;
1589 }
1590 }
1591}
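/*
 * Illustrative sketch, not part of this file: eagerly splitting every huge
 * page in a memslot down to 4K mappings, roughly what eager page splitting
 * for dirty logging does with this API. The wrapper name is hypothetical;
 * shared == true matches the read-locked mmu_lock.
 */
static void __maybe_unused example_split_slot_to_4k(struct kvm *kvm,
						    const struct kvm_memory_slot *slot)
{
	read_lock(&kvm->mmu_lock);
	kvm_tdp_mmu_try_split_huge_pages(kvm, slot, slot->base_gfn,
					 slot->base_gfn + slot->npages,
					 PG_LEVEL_4K, true);
	read_unlock(&kvm->mmu_lock);
}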
1592
1593/*
1594 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1595 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1596 * If AD bits are not enabled, this will require clearing the writable bit on
1597 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1598 * be flushed.
1599 */
1600static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1601 gfn_t start, gfn_t end)
1602{
1603 struct tdp_iter iter;
1604 u64 new_spte;
1605 bool spte_set = false;
1606
1607 rcu_read_lock();
1608
a6a0b05d 1609 tdp_root_for_each_leaf_pte(iter, root, start, end) {
1610retry:
1611 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1612 continue;
1613
1614 if (!is_shadow_present_pte(iter.old_spte))
1615 continue;
1616
1617 if (spte_ad_need_write_protect(iter.old_spte)) {
1618 if (is_writable_pte(iter.old_spte))
1619 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1620 else
1621 continue;
1622 } else {
1623 if (iter.old_spte & shadow_dirty_mask)
1624 new_spte = iter.old_spte & ~shadow_dirty_mask;
1625 else
1626 continue;
1627 }
1628
3e72c791 1629 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
24ae4cfa 1630 goto retry;
3255530a 1631
a6a0b05d 1632 spte_set = true;
a6a0b05d 1633 }
1634
1635 rcu_read_unlock();
1636 return spte_set;
1637}
1638
1639/*
1640 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1641 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1642 * If AD bits are not enabled, this will require clearing the writable bit on
1643 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1644 * be flushed.
1645 */
1646bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1647 const struct kvm_memory_slot *slot)
1648{
1649 struct kvm_mmu_page *root;
1650 bool spte_set = false;
1651
24ae4cfa 1652 lockdep_assert_held_read(&kvm->mmu_lock);
a6a0b05d 1653
d62007ed 1654 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1655 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1656 slot->base_gfn + slot->npages);
1657
1658 return spte_set;
1659}
1660
1661/*
1662 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1663 * set in mask, starting at gfn. The given memslot is expected to contain all
1664 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1665 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1666 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1667 */
1668static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1669 gfn_t gfn, unsigned long mask, bool wrprot)
1670{
1671 struct tdp_iter iter;
1672 u64 new_spte;
1673
1674 rcu_read_lock();
1675
1676 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1677 gfn + BITS_PER_LONG) {
1678 if (!mask)
1679 break;
1680
1681 if (iter.level > PG_LEVEL_4K ||
1682 !(mask & (1UL << (iter.gfn - gfn))))
1683 continue;
1684
1685 mask &= ~(1UL << (iter.gfn - gfn));
1686
1687 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1688 if (is_writable_pte(iter.old_spte))
1689 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1690 else
1691 continue;
1692 } else {
1693 if (iter.old_spte & shadow_dirty_mask)
1694 new_spte = iter.old_spte & ~shadow_dirty_mask;
1695 else
1696 continue;
1697 }
1698
1699 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
a6a0b05d 1700 }
1701
1702 rcu_read_unlock();
1703}
1704
1705/*
1706 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1707 * set in mask, starting at gfn. The given memslot is expected to contain all
1708 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1709 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1710 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1711 */
1712void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1713 struct kvm_memory_slot *slot,
1714 gfn_t gfn, unsigned long mask,
1715 bool wrprot)
1716{
1717 struct kvm_mmu_page *root;
a6a0b05d 1718
531810ca 1719 lockdep_assert_held_write(&kvm->mmu_lock);
a3f15bda 1720 for_each_tdp_mmu_root(kvm, root, slot->as_id)
a6a0b05d 1721 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1722}
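/*
 * Illustrative sketch, not part of this file: how the mask selects gfns. For
 * gfn == 0x1000 and mask == 0x5 (bits 0 and 2 set), only gfns 0x1000 and
 * 0x1002 have their dirty state cleared; the other gfns in the 64-gfn window
 * starting at gfn are left untouched. The helper name is hypothetical.
 */
static void __maybe_unused example_show_masked_gfns(gfn_t gfn,
						    unsigned long mask)
{
	unsigned int bit;

	for_each_set_bit(bit, &mask, BITS_PER_LONG)
		pr_info("would clear dirty state of gfn 0x%llx\n", gfn + bit);
}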
1723
14881998 1724/*
1725 * Clear leaf entries which could be replaced by large mappings, for
1726 * GFNs within the slot.
14881998 1727 */
4b85c921 1728static void zap_collapsible_spte_range(struct kvm *kvm,
14881998 1729 struct kvm_mmu_page *root,
4b85c921 1730 const struct kvm_memory_slot *slot)
14881998 1731{
1732 gfn_t start = slot->base_gfn;
1733 gfn_t end = start + slot->npages;
14881998 1734 struct tdp_iter iter;
5ba7c4c6 1735 int max_mapping_level;
14881998 1736 kvm_pfn_t pfn;
14881998 1737
1738 rcu_read_lock();
1739
14881998 1740 tdp_root_for_each_pte(iter, root, start, end) {
4b85c921 1741 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1af4a960 1742 continue;
1af4a960 1743
14881998 1744 if (!is_shadow_present_pte(iter.old_spte) ||
87aa9ec9 1745 !is_last_spte(iter.old_spte, iter.level))
1746 continue;
1747
1748 /*
1749 * This is a leaf SPTE. Check if the PFN it maps can
1750 * be mapped at a higher level.
1751 */
14881998 1752 pfn = spte_to_pfn(iter.old_spte);
1753 max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
1754 iter.gfn, pfn, PG_LEVEL_NUM);
1755
1756 WARN_ON(max_mapping_level < iter.level);
1757
1758 /*
1759 * If this page is already mapped at the highest
1760 * viable level, there's nothing more to do.
1761 */
1762 if (max_mapping_level == iter.level)
1763 continue;
1764
1765 /*
1766 * The page can be remapped at a higher level, so step
1767 * up to zap the parent SPTE.
1768 */
1769 while (max_mapping_level > iter.level)
1770 tdp_iter_step_up(&iter);
1771
4b85c921 1772 /* Note, a successful atomic zap also does a remote TLB flush. */
1773 tdp_mmu_zap_spte_atomic(kvm, &iter);
1774
1775 /*
1776 * If the atomic zap fails, the iter will recurse back into
1777 * the same subtree to retry.
1778 */
1779 }
1780
7cca2d0b 1781 rcu_read_unlock();
1782}
1783
1784/*
1785 * Clear non-leaf entries (and free associated page tables) which could
1786 * be replaced by large mappings, for GFNs within the slot.
1787 */
1788void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1789 const struct kvm_memory_slot *slot)
1790{
1791 struct kvm_mmu_page *root;
14881998 1792
2db6f772 1793 lockdep_assert_held_read(&kvm->mmu_lock);
14881998 1794
d62007ed 1795 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
4b85c921 1796 zap_collapsible_spte_range(kvm, root, slot);
14881998 1797}
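/*
 * Illustrative sketch, not part of this file: recovering huge mappings is
 * only useful once dirty logging has been disabled for the slot (the kernel
 * drives this when a slot's dirty-logging flag is cleared), so a
 * hypothetical caller would look roughly like this.
 */
static void __maybe_unused example_recover_huge_mappings(struct kvm *kvm,
							 const struct kvm_memory_slot *slot)
{
	/* Nothing to collapse while the slot is still being dirty logged. */
	if (slot->flags & KVM_MEM_LOG_DIRTY_PAGES)
		return;

	read_lock(&kvm->mmu_lock);
	kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
	read_unlock(&kvm->mmu_lock);
}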
1798
1799/*
1800 * Removes write access on the last level SPTE mapping this GFN and unsets the
5fc3424f 1801 * MMU-writable bit to ensure future writes continue to be intercepted.
1802 * Returns true if an SPTE was set and a TLB flush is needed.
1803 */
1804static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
3ad93562 1805 gfn_t gfn, int min_level)
1806{
1807 struct tdp_iter iter;
1808 u64 new_spte;
1809 bool spte_set = false;
1810
1811 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1812
1813 rcu_read_lock();
1814
77aa6075 1815 for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
1816 if (!is_shadow_present_pte(iter.old_spte) ||
1817 !is_last_spte(iter.old_spte, iter.level))
1818 continue;
1819
46044f72 1820 new_spte = iter.old_spte &
5fc3424f 1821 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
46044f72 1822
1823 if (new_spte == iter.old_spte)
1824 break;
1825
1826 tdp_mmu_set_spte(kvm, &iter, new_spte);
1827 spte_set = true;
1828 }
1829
1830 rcu_read_unlock();
1831
1832 return spte_set;
1833}
1834
1835/*
1836 * Removes write access on the last level SPTE mapping this GFN and unsets the
5fc3424f 1837 * MMU-writable bit to ensure future writes continue to be intercepted.
1838 * Returns true if an SPTE was set and a TLB flush is needed.
1839 */
1840bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1841 struct kvm_memory_slot *slot, gfn_t gfn,
1842 int min_level)
1843{
1844 struct kvm_mmu_page *root;
1845 bool spte_set = false;
1846
531810ca 1847 lockdep_assert_held_write(&kvm->mmu_lock);
a3f15bda 1848 for_each_tdp_mmu_root(kvm, root, slot->as_id)
3ad93562 1849 spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
a3f15bda 1850
1851 return spte_set;
1852}
1853
1854/*
1855 * Return the level of the lowest level SPTE added to sptes.
1856 * That SPTE may be non-present.
1857 *
1858 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
95fb5b02 1859 */
1860int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1861 int *root_level)
1862{
1863 struct tdp_iter iter;
1864 struct kvm_mmu *mmu = vcpu->arch.mmu;
95fb5b02 1865 gfn_t gfn = addr >> PAGE_SHIFT;
2aa07893 1866 int leaf = -1;
95fb5b02 1867
a972e29c 1868 *root_level = vcpu->arch.mmu->root_role.level;
1869
1870 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1871 leaf = iter.level;
dde81f94 1872 sptes[leaf] = iter.old_spte;
1873 }
1874
1875 return leaf;
1876}
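/*
 * Illustrative sketch, not part of this file: consuming the walk results.
 * sptes[] is indexed by level, so the valid entries run from root_level down
 * to the returned leaf level; -1 means no SPTEs were found. The function
 * name is hypothetical.
 */
static void __maybe_unused example_dump_walk(struct kvm_vcpu *vcpu, u64 addr)
{
	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
	int leaf, level, root_level;

	kvm_tdp_mmu_walk_lockless_begin();
	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
	kvm_tdp_mmu_walk_lockless_end();

	if (leaf < 0)
		return;

	for (level = root_level; level >= leaf; level--)
		pr_info("level %d: spte = 0x%llx\n", level, sptes[level]);
}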
1877
1878/*
1879 * Returns the last level spte pointer of the shadow page walk for the given
 1880 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1881 * walk could be performed, returns NULL and *spte does not contain valid data.
1882 *
1883 * Contract:
1884 * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1885 * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1886 *
1887 * WARNING: This function is only intended to be called during fast_page_fault.
1888 */
1889u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1890 u64 *spte)
1891{
1892 struct tdp_iter iter;
1893 struct kvm_mmu *mmu = vcpu->arch.mmu;
1894 gfn_t gfn = addr >> PAGE_SHIFT;
1895 tdp_ptep_t sptep = NULL;
1896
1897 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1898 *spte = iter.old_spte;
1899 sptep = iter.sptep;
1900 }
1901
1902 /*
1903 * Perform the rcu_dereference to get the raw spte pointer value since
1904 * we are passing it up to fast_page_fault, which is shared with the
1905 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1906 * annotation.
1907 *
1908 * This is safe since fast_page_fault obeys the contracts of this
1909 * function as well as all TDP MMU contracts around modifying SPTEs
1910 * outside of mmu_lock.
1911 */
1912 return rcu_dereference(sptep);
1913}
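/*
 * Illustrative sketch, not part of this file: the contract documented above,
 * expressed as code. The returned sptep may only be dereferenced inside the
 * lockless walk (RCU) critical section. All names other than the KVM helpers
 * are hypothetical.
 */
static u64 __maybe_unused example_peek_last_spte(struct kvm_vcpu *vcpu,
						 u64 addr)
{
	u64 spte = 0;
	u64 *sptep;

	kvm_tdp_mmu_walk_lockless_begin();
	sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, addr, &spte);
	if (sptep)
		spte = READ_ONCE(*sptep);	/* re-read before leaving RCU */
	kvm_tdp_mmu_walk_lockless_end();

	return spte;
}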