// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>

#include <trace/events/kvm.h>

static bool __read_mostly tdp_mmu_enabled = false;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
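
/*
 * Note (commentary added for clarity, not in the original source): tdp_mmu is
 * a parameter of the kvm module, so it would typically be toggled via
 * something like kvm.tdp_mmu=1 on the kernel command line or the
 * corresponding /sys/module/kvm/parameters entry; the exact spelling of the
 * knob is an illustration. The value is sampled once per VM in
 * kvm_mmu_init_tdp_mmu() below and then never changes for that VM's lifetime.
 */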

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
		return;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;

	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.
	 */
	rcu_barrier();
}

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush);

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);

	lockdep_assert_held_write(&kvm->mmu_lock);

	if (--root->root_count)
		return;

	WARN_ON(!root->tdp_mmu_page);

	list_del(&root->link);

	zap_gfn_range(kvm, root, 0, max_gfn, false, false);

	tdp_mmu_free_sp(root);
}
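
/*
 * Design note (commentary added for clarity): dropping the last reference on
 * a root both zaps its paging structure and frees it, so callers only need to
 * pair kvm_tdp_mmu_get_root() with kvm_tdp_mmu_put_root(); there is no
 * separate "free root" entry point.
 */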

static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
					   struct kvm_mmu_page *root)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
		return false;

	kvm_tdp_mmu_get_root(kvm, root);
	return true;
}

static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
						     struct kvm_mmu_page *root)
{
	struct kvm_mmu_page *next_root;

	next_root = list_next_entry(root, link);
	kvm_tdp_mmu_put_root(kvm, root);
	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 */
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)		\
	for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots,	\
				      typeof(*_root), link);		\
	     tdp_mmu_next_root_valid(_kvm, _root);			\
	     _root = tdp_mmu_next_root(_kvm, _root))			\
		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else

#define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else
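
/*
 * Illustrative usage (commentary added for clarity, mirroring
 * __kvm_tdp_mmu_zap_gfn_range() below):
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
 *		flush = zap_gfn_range(kvm, root, start, end, true, flush);
 *
 * The _yield_safe variant takes a reference on each root before visiting it,
 * so the loop body may drop and reacquire the MMU lock; the plain
 * for_each_tdp_mmu_root() variant takes no references and so must not yield.
 */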

static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
						   int level)
{
	union kvm_mmu_page_role role;

	role = vcpu->arch.mmu->mmu_role.base;
	role.level = level;
	role.direct = true;
	role.gpte_is_8_bytes = true;
	role.access = ACC_ALL;

	return role;
}

static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
					       int level)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role.word = page_role_for_level(vcpu, level).word;
	sp->gfn = gfn;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);

	return sp;
}

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);

	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

	/* Check for an existing root before allocating a new one. */
	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
		if (root->role.word == role.word) {
			kvm_tdp_mmu_get_root(kvm, root);
			goto out;
		}
	}

	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
	root->root_count = 1;

	list_add(&root->link, &kvm->arch.tdp_mmu_roots);

out:
	return __pa(root->spt);
}

/*
 * This is called through call_rcu() in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read-side critical
 * section, and freeing it only after an RCU grace period, lockless walkers
 * are guaranteed never to use the memory after it has been freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}
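
/*
 * Reader-side counterpart (commentary added for clarity): lockless walkers
 * bracket their traversal with rcu_read_lock()/rcu_read_unlock(), e.g. as in
 * kvm_tdp_mmu_get_walk() at the bottom of this file:
 *
 *	rcu_read_lock();
 *	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1)
 *		sptes[iter.level] = iter.old_spte;
 *	rcu_read_unlock();
 *
 * which is what makes freeing page tables via call_rcu() above safe.
 */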

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(kvm, slot, gfn);
	}
}

/**
 * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the new page
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 * @account_nx: This page replaces a NX large page and should be marked for
 *		eventual reclaim.
 */
static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool shared, bool account_nx)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
	if (account_nx)
		account_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 */
static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				bool shared)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_del(&sp->link);
	if (sp->lpage_disallowed)
		unaccount_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
					bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	u64 old_child_spte;
	u64 *sptep;
	gfn_t gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_page(kvm, sp, shared);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		sptep = rcu_dereference(pt) + i;
		gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so
			 * retry setting the SPTE until the value read back
			 * is something other than the removed SPTE value.
			 */
			for (;;) {
				old_child_spte = xchg(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_child_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_child_spte = READ_ONCE(*sptep);
			if (!is_shadow_present_pte(old_child_spte))
				continue;

			/*
			 * Marking the SPTE as a removed SPTE is not
			 * strictly necessary here as the MMU lock will
			 * stop other threads from concurrently modifying
			 * this SPTE. Using the removed SPTE value keeps
			 * the two branches consistent and simplifies
			 * the function.
			 */
			WRITE_ONCE(*sptep, REMOVED_SPTE);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_child_spte, REMOVED_SPTE, level - 1,
				    shared);
	}

	kvm_flush_remote_tlbs_with_address(kvm, gfn,
					   KVM_PAGES_PER_HPAGE(level));

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/**
 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of an SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level,
				  bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	/*
	 * The only times an SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve an MMIO SPTE or removed SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) &&
			    !is_mmio_spte(new_spte) &&
			    !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.
	 */
	if (was_present && !was_leaf && (pfn_changed || !is_present))
		handle_removed_tdp_mmu_page(kvm,
				spte_to_child_pt(old_spte, level), shared);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
			      shared);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
 * associated bookkeeping
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Returns: true if the SPTE was set, false if it was not. If false is returned,
 *	    this function will have no side-effects.
 */
static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
					   struct tdp_iter *iter,
					   u64 new_spte)
{
	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Do not change removed SPTEs. Only the thread that froze the SPTE
	 * may modify it.
	 */
	if (is_removed_spte(iter->old_spte))
		return false;

	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
		      new_spte) != iter->old_spte)
		return false;

	handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			    new_spte, iter->level, true);

	return true;
}
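
/*
 * Typical caller pattern (commentary added for clarity, taken from the page
 * fault path below): if the cmpxchg loses a race with another thread, bail
 * out and let the fault be retried rather than spinning, e.g.
 *
 *	if (!tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte))
 *		return RET_PF_RETRY;
 */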

static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					   struct tdp_iter *iter)
{
	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value. This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
		return false;

	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
					   KVM_PAGES_PER_HPAGE(iter->level));

	/*
	 * No other thread can overwrite the removed SPTE as they
	 * must either wait on the MMU lock or use
	 * tdp_mmu_set_spte_atomic which will not overwrite the
	 * special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present
	 * to non-present.
	 */
	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);

	return true;
}


/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *		      of the page. Should be set unless handling an MMU
 *		      notifier for access tracking. Leaving record_acc_track
 *		      unset in that case prevents page accesses from being
 *		      double counted.
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *		      appropriate for the change being made. Should be set
 *		      unless performing certain dirty logging operations.
 *		      Leaving record_dirty_log unset in that case prevents page
 *		      writes from being double counted.
 */
static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				      u64 new_spte, bool record_acc_track,
				      bool record_dirty_log)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used. If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
	 */
	WARN_ON(is_removed_spte(iter->old_spte));

	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);

	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, false);
	if (record_acc_track)
		handle_changed_spte_acc_track(iter->old_spte, new_spte,
					      iter->level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
					      iter->old_spte, new_spte,
					      iter->level);
}

static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}
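
/*
 * Summary of the wrappers above (commentary added for clarity):
 * tdp_mmu_set_spte() records both accessed and dirty state,
 * tdp_mmu_set_spte_no_acc_track() skips accessed-state bookkeeping (used when
 * handling access-tracking MMU notifiers, e.g. age_gfn_range() below), and
 * tdp_mmu_set_spte_no_dirty_log() skips dirty-log bookkeeping (used by the
 * dirty logging helpers themselves).
 */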

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
			 _mmu->shadow_root_level, _start, _end)
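
/*
 * Simplified usage sketch of the leaf iterator (commentary added for clarity,
 * loosely mirroring write_protect_gfn() below; error handling omitted):
 *
 *	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
 *		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
 *		tdp_mmu_set_spte(kvm, &iter, new_spte);
 *	}
 */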

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, it will also reset the tdp_iter's walk over the
 * paging structure and the calling function should skip to the next
 * iteration to allow the iterator to continue its traversal from the
 * paging structure root.
 *
 * Return true if this function yielded and the iterator's traversal was reset.
 * Return false if a yield was not needed.
 */
static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
					     struct tdp_iter *iter, bool flush)
{
	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		rcu_read_unlock();

		if (flush)
			kvm_flush_remote_tlbs(kvm);

		cond_resched_rwlock_write(&kvm->mmu_lock);
		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);

		tdp_iter_restart(iter);

		return true;
	}

	return false;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup. Note, in some use cases a flush may be
 * required by prior actions. Ensure the pending flush is performed prior to
 * yielding.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush)
{
	struct tdp_iter iter;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level.
		 */
		if ((iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);
		flush = true;
	}

	rcu_read_unlock();
	return flush;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
				 gfn_t end, bool can_yield, bool flush)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);

	return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	bool flush = false;
	int i;

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn, flush);

	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
					   int map_writable,
					   struct tdp_iter *iter,
					   kvm_pfn_t pfn, bool prefault)
{
	u64 new_spte;
	int ret = 0;
	int make_spte_ret = 0;

	if (unlikely(is_noslot_pfn(pfn)))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
					  pfn, iter->old_spte, prefault, true,
					  map_writable, !shadow_accessed_mask,
					  &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
		if (write)
			ret = RET_PF_EMULATE;
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
	}

	/* If an MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte))) {
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else {
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	if (!prefault)
		vcpu->stat.pf_fixed++;

	return ret;
}

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
		    int map_writable, int max_level, kvm_pfn_t pfn,
		    bool prefault)
{
	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
	bool write = error_code & PFERR_WRITE_MASK;
	bool exec = error_code & PFERR_FETCH_MASK;
	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	u64 *child_pt;
	u64 new_spte;
	int ret;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int level;
	int req_level;

	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;
	if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;

	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
					huge_page_disallowed, &req_level);

	trace_kvm_mmu_spte_requested(gpa, level, pfn);

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		if (nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(iter.old_spte, gfn,
						   iter.level, &pfn, &level);

		if (iter.level == level)
			break;

		/*
		 * If there is an SPTE mapping a large page at a higher level
		 * than the target, that SPTE must be cleared and replaced
		 * with a non-leaf SPTE.
		 */
		if (is_shadow_present_pte(iter.old_spte) &&
		    is_large_pte(iter.old_spte)) {
			if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
				break;

			/*
			 * The iter must explicitly re-read the spte here
			 * because the new value informs the !present
			 * path below.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
		}

		if (!is_shadow_present_pte(iter.old_spte)) {
			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
			child_pt = sp->spt;

			new_spte = make_nonleaf_spte(child_pt,
						     !shadow_accessed_mask);

			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
						    new_spte)) {
				tdp_mmu_link_page(vcpu->kvm, sp, true,
						  huge_page_disallowed &&
						  req_level >= iter.level);

				trace_kvm_mmu_get_page(sp, true);
			} else {
				tdp_mmu_free_sp(sp);
				break;
			}
		}
	}

	if (iter.level != level) {
		rcu_read_unlock();
		return RET_PF_RETRY;
	}

	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
					      pfn, prefault);
	rcu_read_unlock();

	return ret;
}
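
/*
 * Note (commentary added for clarity): every SPTE modified on the fault path
 * above goes through tdp_mmu_set_spte_atomic()/tdp_mmu_zap_spte_atomic(),
 * which only require mmu_lock held for read, allowing vCPUs to handle TDP
 * page faults in parallel; losing a race simply results in RET_PF_RETRY.
 */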

bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
		flush |= zap_gfn_range(kvm, root, range->start, range->end,
				       range->may_block, flush);

	return flush;
}

typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
			      struct kvm_gfn_range *range);

static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
						   struct kvm_gfn_range *range,
						   tdp_handler_t handler)
{
	struct kvm_mmu_page *root;
	struct tdp_iter iter;
	bool ret = false;

	rcu_read_lock();

	/*
	 * Don't support rescheduling, none of the MMU notifiers that funnel
	 * into this helper allow blocking; it'd be dead, wasteful code.
	 */
	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
			ret |= handler(kvm, &iter, range);
	}

	rcu_read_unlock();

	return ret;
}
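
/*
 * The MMU notifier handlers below are thin wrappers around
 * kvm_tdp_mmu_handle_gfn(): each passes a per-SPTE callback (age_gfn_range,
 * test_age_gfn, set_spte_gfn) that is invoked for every present leaf SPTE in
 * the notifier's GFN range. (Commentary added for clarity.)
 */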

/*
 * Mark the SPTEs in the range of GFNs [start, end) unaccessed and return true
 * if any of the GFNs in the range have been accessed.
 */
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
			  struct kvm_gfn_range *range)
{
	u64 new_spte = 0;

	/* If we have a non-accessed entry we don't need to change the pte. */
	if (!is_accessed_spte(iter->old_spte))
		return false;

	new_spte = iter->old_spte;

	if (spte_ad_enabled(new_spte)) {
		new_spte &= ~shadow_accessed_mask;
	} else {
		/*
		 * Capture the dirty status of the page, so that it doesn't get
		 * lost when the SPTE is marked for access tracking.
		 */
		if (is_writable_pte(new_spte))
			kvm_set_pfn_dirty(spte_to_pfn(new_spte));

		new_spte = mark_spte_for_access_track(new_spte);
	}

	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);

	return true;
}

bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}

static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	return is_accessed_spte(iter->old_spte);
}

bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}

static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	u64 new_spte;

	/* Huge pages aren't expected to be modified without first being zapped. */
	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);

	if (iter->level != PG_LEVEL_4K ||
	    !is_shadow_present_pte(iter->old_spte))
		return false;

	/*
	 * Note, when changing a read-only SPTE, it's not strictly necessary to
	 * zero the SPTE before setting the new PFN, but doing so preserves the
	 * invariant that the PFN of a present leaf SPTE can never change.
	 * See __handle_changed_spte().
	 */
	tdp_mmu_set_spte(kvm, iter, 0);

	if (!pte_write(range->pte)) {
		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
								  pte_pfn(range->pte));

		tdp_mmu_set_spte(kvm, iter, new_spte);
	}

	return true;
}

/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * range->pte holds the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
 */
bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);

	/* FIXME: return 'flush' instead of flushing here. */
	if (flush)
		kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);

	return false;
}

/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
			     int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
			     slot->base_gfn + slot->npages, min_level);

	return spte_set;
}
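
/*
 * Sketch of the expected dirty-logging flow (commentary added for clarity;
 * the callers live outside this file): when dirty logging is enabled,
 * kvm_tdp_mmu_wrprot_slot() above write-protects the slot; as userspace
 * harvests the dirty bitmap, kvm_tdp_mmu_clear_dirty_pt_masked() below
 * re-arms write protection (or clears dirty bits when A/D bits are in use)
 * for exactly the GFNs whose bits were set in the mask.
 */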

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
			continue;

		if (spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
				slot->base_gfn + slot->npages);

	return spte_set;
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	struct tdp_iter iter;
	u64 new_spte;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				   gfn + BITS_PER_LONG) {
		if (!mask)
			break;

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		mask &= ~(1UL << (iter.gfn - gfn));

		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
	}

	rcu_read_unlock();
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}

/*
 * Clear leaf entries which could be replaced by large mappings, for
 * GFNs within the slot.
 */
static bool zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       const struct kvm_memory_slot *slot,
				       bool flush)
{
	gfn_t start = slot->base_gfn;
	gfn_t end = start + slot->npages;
	struct tdp_iter iter;
	kvm_pfn_t pfn;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		pfn = spte_to_pfn(iter.old_spte);
		if (kvm_is_reserved_pfn(pfn) ||
		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
							    pfn, PG_LEVEL_NUM))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);

		flush = true;
	}

	rcu_read_unlock();

	return flush;
}

/*
 * Clear non-leaf entries (and free associated page tables) which could
 * be replaced by large mappings, for GFNs within the slot.
 */
bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       const struct kvm_memory_slot *slot,
				       bool flush)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
		flush = zap_collapsible_spte_range(kvm, root, slot, flush);

	return flush;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
		if (!is_writable_pte(iter.old_spte))
			break;

		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);

		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();

	return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		spte_set |= write_protect_gfn(kvm, root, gfn);

	return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->shadow_root_level;

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf] = iter.old_spte;
	}

	rcu_read_unlock();

	return leaf;
}