Commit | Line | Data |
---|---|---|
fe5db27d | 1 | // SPDX-License-Identifier: GPL-2.0
2 | ||
02c00b3a | 3 | #include "mmu.h"
4 | #include "mmu_internal.h" | |
bb18842e | 5 | #include "mmutrace.h" |
2f2fad08 | 6 | #include "tdp_iter.h" |
fe5db27d | 7 | #include "tdp_mmu.h" |
02c00b3a | 8 | #include "spte.h" |
fe5db27d | 9 | |
33dd3574 | 10 | #include <trace/events/kvm.h>
11 | ||
95fb5b02 | 12 | #ifdef CONFIG_X86_64 |
fe5db27d | 13 | static bool __read_mostly tdp_mmu_enabled = false; |
95fb5b02 | 14 | module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
15 | #endif | |
fe5db27d | 16 |
17 | static bool is_tdp_mmu_enabled(void) | |
18 | { | |
19 | #ifdef CONFIG_X86_64 | |
20 | return tdp_enabled && READ_ONCE(tdp_mmu_enabled); | |
21 | #else | |
22 | return false; | |
23 | #endif /* CONFIG_X86_64 */ | |
24 | } | |
25 | ||
26 | /* Initializes the TDP MMU for the VM, if enabled. */ | |
27 | void kvm_mmu_init_tdp_mmu(struct kvm *kvm) | |
28 | { | |
29 | if (!is_tdp_mmu_enabled()) | |
30 | return; | |
31 | ||
32 | /* This should not be changed for the lifetime of the VM. */ | |
33 | kvm->arch.tdp_mmu_enabled = true; | |
02c00b3a | 34 |
35 | INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); | |
89c0fd49 | 36 | INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); |
fe5db27d | 37 | }
38 | ||
39 | void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) | |
40 | { | |
41 | if (!kvm->arch.tdp_mmu_enabled) | |
42 | return; | |
02c00b3a | 43 |
44 | WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); | |
45 | } | |
46 | ||
a889ea54 | 47 | static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
48 | { | |
49 | if (kvm_mmu_put_root(kvm, root)) | |
50 | kvm_tdp_mmu_free_root(kvm, root); | |
51 | } | |
52 | ||
53 | static inline bool tdp_mmu_next_root_valid(struct kvm *kvm, | |
54 | struct kvm_mmu_page *root) | |
55 | { | |
56 | lockdep_assert_held(&kvm->mmu_lock); | |
57 | ||
58 | if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link)) | |
59 | return false; | |
60 | ||
61 | kvm_mmu_get_root(kvm, root); | |
62 | return true; | |
63 | ||
64 | } | |
65 | ||
66 | static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, | |
67 | struct kvm_mmu_page *root) | |
68 | { | |
69 | struct kvm_mmu_page *next_root; | |
70 | ||
71 | next_root = list_next_entry(root, link); | |
72 | tdp_mmu_put_root(kvm, root); | |
73 | return next_root; | |
74 | } | |
75 | ||
76 | /* | |
77 | * Note: this iterator gets and puts references to the roots it iterates over. | |
78 | * This makes it safe to release the MMU lock and yield within the loop, but | |
79 | * if exiting the loop early, the caller must drop the reference to the most | |
80 | * recent root. (Unless keeping a live reference is desirable.) | |
81 | */ | |
82 | #define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \ | |
83 | for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots, \ | |
84 | typeof(*_root), link); \ | |
85 | tdp_mmu_next_root_valid(_kvm, _root); \ | |
86 | _root = tdp_mmu_next_root(_kvm, _root)) | |
87 | ||
88 | #define for_each_tdp_mmu_root(_kvm, _root) \ | |
02c00b3a | 89 | list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
90 | ||
91 | bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) | |
92 | { | |
93 | struct kvm_mmu_page *sp; | |
94 | ||
c887c9b9 | 95 | if (!kvm->arch.tdp_mmu_enabled)
96 | return false; | |
97 | if (WARN_ON(!VALID_PAGE(hpa))) | |
98 | return false; | |
99 | ||
02c00b3a | 100 | sp = to_shadow_page(hpa); |
c887c9b9 | 101 | if (WARN_ON(!sp))
102 | return false; | |
02c00b3a | 103 |
104 | return sp->tdp_mmu_page && sp->root_count; | |
105 | } | |
106 | ||
faaf05b0 | 107 | static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, |
063afacd | 108 | gfn_t start, gfn_t end, bool can_yield); |
faaf05b0 | 109 | |
02c00b3a | 110 | void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
111 | { | |
339f5a7f | 112 | gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); |
faaf05b0 | 113 | |
02c00b3a | 114 | lockdep_assert_held(&kvm->mmu_lock);
115 | ||
116 | WARN_ON(root->root_count); | |
117 | WARN_ON(!root->tdp_mmu_page); | |
118 | ||
119 | list_del(&root->link); | |
120 | ||
063afacd | 121 | zap_gfn_range(kvm, root, 0, max_gfn, false); |
faaf05b0 | 122 | |
02c00b3a | 123 | free_page((unsigned long)root->spt);
124 | kmem_cache_free(mmu_page_header_cache, root); | |
125 | } | |
126 | ||
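Aside: the max_gfn computation a few lines up (`1ULL << (shadow_phys_bits - PAGE_SHIFT)`) is plain bit arithmetic and can be tried outside the kernel. The sketch below is an illustrative, self-contained userspace program, not kernel code; the 52-bit physical-address width and 4 KiB page size are assumptions standing in for `shadow_phys_bits` and `PAGE_SHIFT`.

```c
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const unsigned int shadow_phys_bits = 52;	/* assumed MAXPHYADDR */
	const unsigned int page_shift = 12;		/* assumed 4 KiB pages */

	/* Every GFN the root could map lies in [0, max_gfn). */
	uint64_t max_gfn = 1ULL << (shadow_phys_bits - page_shift);

	printf("max_gfn = 0x%llx\n", (unsigned long long)max_gfn);
	return 0;
}
```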
127 | static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu, | |
128 | int level) | |
129 | { | |
130 | union kvm_mmu_page_role role; | |
131 | ||
132 | role = vcpu->arch.mmu->mmu_role.base; | |
133 | role.level = level; | |
134 | role.direct = true; | |
135 | role.gpte_is_8_bytes = true; | |
136 | role.access = ACC_ALL; | |
137 | ||
138 | return role; | |
139 | } | |
140 | ||
141 | static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn, | |
142 | int level) | |
143 | { | |
144 | struct kvm_mmu_page *sp; | |
145 | ||
146 | sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); | |
147 | sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); | |
148 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); | |
149 | ||
150 | sp->role.word = page_role_for_level(vcpu, level).word; | |
151 | sp->gfn = gfn; | |
152 | sp->tdp_mmu_page = true; | |
153 | ||
33dd3574 | 154 | trace_kvm_mmu_get_page(sp, true);
155 | ||
02c00b3a | 156 | return sp;
157 | } | |
158 | ||
159 | static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu) | |
160 | { | |
161 | union kvm_mmu_page_role role; | |
162 | struct kvm *kvm = vcpu->kvm; | |
163 | struct kvm_mmu_page *root; | |
164 | ||
165 | role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level); | |
166 | ||
167 | spin_lock(&kvm->mmu_lock); | |
168 | ||
169 | /* Check for an existing root before allocating a new one. */ | |
170 | for_each_tdp_mmu_root(kvm, root) { | |
171 | if (root->role.word == role.word) { | |
172 | kvm_mmu_get_root(kvm, root); | |
173 | spin_unlock(&kvm->mmu_lock); | |
174 | return root; | |
175 | } | |
176 | } | |
177 | ||
178 | root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level); | |
179 | root->root_count = 1; | |
180 | ||
181 | list_add(&root->link, &kvm->arch.tdp_mmu_roots); | |
182 | ||
183 | spin_unlock(&kvm->mmu_lock); | |
184 | ||
185 | return root; | |
186 | } | |
187 | ||
188 | hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) | |
189 | { | |
190 | struct kvm_mmu_page *root; | |
191 | ||
192 | root = get_tdp_mmu_vcpu_root(vcpu); | |
193 | if (!root) | |
194 | return INVALID_PAGE; | |
195 | ||
196 | return __pa(root->spt); | |
fe5db27d | 197 | } |
2f2fad08 | 198 |
199 | static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, | |
200 | u64 old_spte, u64 new_spte, int level); | |
201 | ||
faaf05b0 | 202 | static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
203 | { | |
204 | return sp->role.smm ? 1 : 0; | |
205 | } | |
206 | ||
f8e14497 | 207 | static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
208 | { | |
209 | bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); | |
210 | ||
211 | if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level)) | |
212 | return; | |
213 | ||
214 | if (is_accessed_spte(old_spte) && | |
215 | (!is_accessed_spte(new_spte) || pfn_changed)) | |
216 | kvm_set_pfn_accessed(spte_to_pfn(old_spte)); | |
217 | } | |
218 | ||
a6a0b05d | 219 | static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
220 | u64 old_spte, u64 new_spte, int level) | |
221 | { | |
222 | bool pfn_changed; | |
223 | struct kvm_memory_slot *slot; | |
224 | ||
225 | if (level > PG_LEVEL_4K) | |
226 | return; | |
227 | ||
228 | pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); | |
229 | ||
230 | if ((!is_writable_pte(old_spte) || pfn_changed) && | |
231 | is_writable_pte(new_spte)) { | |
232 | slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn); | |
fb04a1ed | 233 | mark_page_dirty_in_slot(kvm, slot, gfn); |
a6a0b05d | 234 | }
235 | } | |
236 | ||
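The condition above boils down to: update the dirty bitmap only when a 4K SPTE becomes writable, or stays writable while switching to a different PFN, since that is the moment the guest gains the ability to dirty the page. A minimal userspace sketch of that predicate (with a hypothetical `spte_view` struct and helper, not KVM's types) follows.

```c
#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

struct spte_view {
	bool writable;
	uint64_t pfn;
};

/* Mirrors: (!is_writable_pte(old) || pfn_changed) && is_writable_pte(new) */
static bool needs_mark_dirty(struct spte_view old, struct spte_view cur)
{
	bool pfn_changed = old.pfn != cur.pfn;

	return (!old.writable || pfn_changed) && cur.writable;
}

int main(void)
{
	struct spte_view ro  = { false, 0x1234 };
	struct spte_view rw  = { true,  0x1234 };
	struct spte_view rw2 = { true,  0x5678 };

	printf("%d %d %d\n",
	       needs_mark_dirty(ro, rw),	/* 1: became writable */
	       needs_mark_dirty(rw, rw),	/* 0: no change that matters */
	       needs_mark_dirty(rw, rw2));	/* 1: writable, new PFN */
	return 0;
}
```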
2f2fad08 | 237 | /**
238 | * handle_changed_spte - handle bookkeeping associated with an SPTE change | |
239 | * @kvm: kvm instance | |
240 | * @as_id: the address space of the paging structure the SPTE was a part of | |
241 | * @gfn: the base GFN that was mapped by the SPTE | |
242 | * @old_spte: The value of the SPTE before the change | |
243 | * @new_spte: The value of the SPTE after the change | |
244 | * @level: the level of the PT the SPTE is part of in the paging structure | |
245 | * | |
246 | * Handle bookkeeping that might result from the modification of a SPTE. | |
247 | * This function must be called for all TDP SPTE modifications. | |
248 | */ | |
249 | static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, | |
250 | u64 old_spte, u64 new_spte, int level) | |
251 | { | |
252 | bool was_present = is_shadow_present_pte(old_spte); | |
253 | bool is_present = is_shadow_present_pte(new_spte); | |
254 | bool was_leaf = was_present && is_last_spte(old_spte, level); | |
255 | bool is_leaf = is_present && is_last_spte(new_spte, level); | |
256 | bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); | |
257 | u64 *pt; | |
89c0fd49 | 258 | struct kvm_mmu_page *sp; |
2f2fad08 | 259 | u64 old_child_spte;
260 | int i; | |
261 | ||
262 | WARN_ON(level > PT64_ROOT_MAX_LEVEL); | |
263 | WARN_ON(level < PG_LEVEL_4K); | |
764388ce | 264 | WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); |
2f2fad08 | 265 |
266 | /* | |
267 | * If this warning were to trigger it would indicate that there was a | |
268 | * missing MMU notifier or a race with some notifier handler. | |
269 | * A present, leaf SPTE should never be directly replaced with another | |
270 | * present leaf SPTE pointing to a different PFN. A notifier handler
271 | * should be zapping the SPTE before the main MM's page table is | |
272 | * changed, or the SPTE should be zeroed, and the TLBs flushed by the | |
273 | * thread before replacement. | |
274 | */ | |
275 | if (was_leaf && is_leaf && pfn_changed) { | |
276 | pr_err("Invalid SPTE change: cannot replace a present leaf\n" | |
277 | "SPTE with another present leaf SPTE mapping a\n" | |
278 | "different PFN!\n" | |
279 | "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", | |
280 | as_id, gfn, old_spte, new_spte, level); | |
281 | ||
282 | /* | |
283 | * Crash the host to prevent error propagation and guest data | |
284 | * corruption.
285 | */ | |
286 | BUG(); | |
287 | } | |
288 | ||
289 | if (old_spte == new_spte) | |
290 | return; | |
291 | ||
b9a98c34 | 292 | trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
293 | ||
2f2fad08 | 294 | /*
295 | * The only time a SPTE should be changed from a non-present to
296 | * non-present state is when an MMIO entry is installed/modified/ | |
297 | * removed. In that case, there is nothing to do here. | |
298 | */ | |
299 | if (!was_present && !is_present) { | |
300 | /* | |
301 | * If this change does not involve a MMIO SPTE, it is | |
302 | * unexpected. Log the change, though it should not impact the | |
303 | * guest since both the former and current SPTEs are nonpresent. | |
304 | */ | |
305 | if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte))) | |
306 | pr_err("Unexpected SPTE change! Nonpresent SPTEs\n" | |
307 | "should not be replaced with another,\n" | |
308 | "different nonpresent SPTE, unless one or both\n" | |
309 | "are MMIO SPTEs.\n" | |
310 | "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", | |
311 | as_id, gfn, old_spte, new_spte, level); | |
312 | return; | |
313 | } | |
314 | ||
315 | ||
316 | if (was_leaf && is_dirty_spte(old_spte) && | |
317 | (!is_dirty_spte(new_spte) || pfn_changed)) | |
318 | kvm_set_pfn_dirty(spte_to_pfn(old_spte)); | |
319 | ||
320 | /* | |
321 | * Recursively handle child PTs if the change removed a subtree from | |
322 | * the paging structure. | |
323 | */ | |
324 | if (was_present && !was_leaf && (pfn_changed || !is_present)) { | |
325 | pt = spte_to_child_pt(old_spte, level); | |
89c0fd49 | 326 | sp = sptep_to_sp(pt);
327 | ||
33dd3574 | 328 | trace_kvm_mmu_prepare_zap_page(sp);
329 | ||
89c0fd49 | 330 | list_del(&sp->link); |
2f2fad08 | 331 | |
29cf0f50 | 332 | if (sp->lpage_disallowed)
333 | unaccount_huge_nx_page(kvm, sp); | |
334 | ||
2f2fad08 | 335 | for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
336 | old_child_spte = READ_ONCE(*(pt + i)); | |
337 | WRITE_ONCE(*(pt + i), 0); | |
338 | handle_changed_spte(kvm, as_id, | |
339 | gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)), | |
340 | old_child_spte, 0, level - 1); | |
341 | } | |
342 | ||
343 | kvm_flush_remote_tlbs_with_address(kvm, gfn, | |
344 | KVM_PAGES_PER_HPAGE(level)); | |
345 | ||
346 | free_page((unsigned long)pt); | |
89c0fd49 | 347 | kmem_cache_free(mmu_page_header_cache, sp); |
2f2fad08 | 348 | }
349 | } | |
350 | ||
351 | static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, | |
352 | u64 old_spte, u64 new_spte, int level) | |
353 | { | |
354 | __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level); | |
f8e14497 | 355 | handle_changed_spte_acc_track(old_spte, new_spte, level); |
a6a0b05d | 356 | handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
357 | new_spte, level); | |
2f2fad08 | 358 | } |
faaf05b0 | 359 | |
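Two pieces of GFN arithmetic in `__handle_changed_spte()` are worth spelling out: the alignment check `WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1))` and the child stride `gfn + i * KVM_PAGES_PER_HPAGE(level - 1)` used when a subtree is torn down. The sketch below is an illustrative userspace program, not kernel code; the 9-bits-per-level and 4 KiB-page constants are assumptions standing in for the x86 definitions.

```c
#include <stdio.h>
#include <stdint.h>

/* Assumed stand-in for KVM_PAGES_PER_HPAGE(): 4 KiB pages per entry at a level. */
static uint64_t pages_per_hpage(int level)
{
	return 1ULL << ((level - 1) * 9);	/* 1 = 4K, 2 = 2M, 3 = 1G */
}

int main(void)
{
	uint64_t gfn = 0x200;	/* base GFN mapped by a level-2 (2 MiB) SPTE */
	int level = 2;
	int i;

	/* Mirrors the WARN_ON(): the base GFN must be aligned to the SPTE's span. */
	printf("aligned: %d\n", !(gfn & (pages_per_hpage(level) - 1)));

	/* Base GFN of each child entry, as computed in the zap loop above. */
	for (i = 0; i < 4; i++)	/* first few of the 512 children */
		printf("child %d -> gfn 0x%llx\n", i,
		       (unsigned long long)(gfn + i * pages_per_hpage(level - 1)));
	return 0;
}
```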
fe43fa2f | 360 | /*
361 | * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping | |
362 | * @kvm: kvm instance | |
363 | * @iter: a tdp_iter instance currently on the SPTE that should be set | |
364 | * @new_spte: The value the SPTE should be set to | |
365 | * @record_acc_track: Notify the MM subsystem of changes to the accessed state | |
366 | * of the page. Should be set unless handling an MMU | |
367 | * notifier for access tracking. Leaving record_acc_track | |
368 | * unset in that case prevents page accesses from being | |
369 | * double counted. | |
370 | * @record_dirty_log: Record the page as dirty in the dirty bitmap if | |
371 | * appropriate for the change being made. Should be set | |
372 | * unless performing certain dirty logging operations. | |
373 | * Leaving record_dirty_log unset in that case prevents page | |
374 | * writes from being double counted. | |
375 | */ | |
f8e14497 | 376 | static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, |
a6a0b05d | 377 | u64 new_spte, bool record_acc_track,
378 | bool record_dirty_log) | |
faaf05b0 | 379 | {
380 | u64 *root_pt = tdp_iter_root_pt(iter); | |
381 | struct kvm_mmu_page *root = sptep_to_sp(root_pt); | |
382 | int as_id = kvm_mmu_page_as_id(root); | |
383 | ||
3a9a4aa5 | 384 | lockdep_assert_held(&kvm->mmu_lock);
385 | ||
f8e14497 | 386 | WRITE_ONCE(*iter->sptep, new_spte);
387 | ||
388 | __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, | |
389 | iter->level); | |
390 | if (record_acc_track) | |
391 | handle_changed_spte_acc_track(iter->old_spte, new_spte, | |
392 | iter->level); | |
a6a0b05d | 393 | if (record_dirty_log)
394 | handle_changed_spte_dirty_log(kvm, as_id, iter->gfn, | |
395 | iter->old_spte, new_spte, | |
396 | iter->level); | |
f8e14497 | 397 | }
398 | ||
399 | static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, | |
400 | u64 new_spte) | |
401 | { | |
a6a0b05d | 402 | __tdp_mmu_set_spte(kvm, iter, new_spte, true, true); |
f8e14497 | 403 | } |
faaf05b0 | 404 | |
f8e14497 | 405 | static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
406 | struct tdp_iter *iter, | |
407 | u64 new_spte) | |
408 | { | |
a6a0b05d | 409 | __tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
410 | } | |
411 | ||
412 | static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, | |
413 | struct tdp_iter *iter, | |
414 | u64 new_spte) | |
415 | { | |
416 | __tdp_mmu_set_spte(kvm, iter, new_spte, true, false); | |
faaf05b0 | 417 | }
418 | ||
419 | #define tdp_root_for_each_pte(_iter, _root, _start, _end) \ | |
420 | for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end) | |
421 | ||
f8e14497 | 422 | #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
423 | tdp_root_for_each_pte(_iter, _root, _start, _end) \ | |
424 | if (!is_shadow_present_pte(_iter.old_spte) || \ | |
425 | !is_last_spte(_iter.old_spte, _iter.level)) \ | |
426 | continue; \ | |
427 | else | |
428 | ||
bb18842e | 429 | #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
430 | for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \ | |
431 | _mmu->shadow_root_level, _start, _end) | |
432 | ||
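`tdp_root_for_each_leaf_pte()` above leans on a C macro idiom: the wrapper filters entries with `if (...) continue; else`, so whatever statement the caller writes after the macro still becomes the loop body and only present, leaf SPTEs reach it. A tiny standalone illustration of the same idiom (not kernel code):

```c
#include <stdio.h>

#define for_each_int(_i, _n)		for (_i = 0; _i < (_n); _i++)

/* Skip odd values inside the macro; the caller's statement is still the body. */
#define for_each_even_int(_i, _n)	\
	for_each_int(_i, _n)		\
		if ((_i) & 1)		\
			continue;	\
		else

int main(void)
{
	int i;

	for_each_even_int(i, 10)
		printf("%d\n", i);	/* prints 0 2 4 6 8 */
	return 0;
}
```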
faaf05b0 | 433 | /* |
e28a436c | 434 | * Flush the TLB and yield if the MMU lock is contended or this thread needs to
435 | * return control to the scheduler. | |
436 | * | |
437 | * If this function yields, it will also reset the tdp_iter's walk over the | |
438 | * paging structure and the calling function should allow the iterator to | |
439 | * continue its traversal from the paging structure root. | |
440 | * | |
441 | * Return true if this function yielded, the TLBs were flushed, and the | |
442 | * iterator's traversal was reset. Return false if a yield was not needed. | |
faaf05b0 | 443 | */
444 | static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter) | |
445 | { | |
446 | if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { | |
447 | kvm_flush_remote_tlbs(kvm); | |
448 | cond_resched_lock(&kvm->mmu_lock); | |
449 | tdp_iter_refresh_walk(iter); | |
faaf05b0 | 450 | return true;
451 | } | |
e28a436c | 452 |
453 | return false; | |
faaf05b0 | 454 | }
455 | ||
e28a436c | 456 | /*
457 | * Yield if the MMU lock is contended or this thread needs to return control | |
458 | * to the scheduler. | |
459 | * | |
460 | * If this function yields, it will also reset the tdp_iter's walk over the | |
461 | * paging structure and the calling function should allow the iterator to | |
462 | * continue its traversal from the paging structure root. | |
463 | * | |
464 | * Return true if this function yielded and the iterator's traversal was reset. | |
465 | * Return false if a yield was not needed. | |
466 | */ | |
467 | static bool tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter) | |
a6a0b05d | 468 | {
469 | if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { | |
470 | cond_resched_lock(&kvm->mmu_lock); | |
471 | tdp_iter_refresh_walk(iter); | |
e28a436c | 472 | return true; |
a6a0b05d | 473 | } |
e28a436c | 474 |
475 | return false; | |
a6a0b05d | 476 | }
477 | ||
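Callers combine these two helpers with a small bookkeeping rule, visible in `zap_gfn_range()` below: if the walk yielded, the TLBs were already flushed by the flushing variant, so nothing is pending; if it did not yield (or was not allowed to), the caller still owes a flush before dropping the MMU lock. An illustrative userspace sketch of that rule, with a hypothetical stand-in for the helper:

```c
#include <stdio.h>
#include <stdbool.h>

/* Stand-in for tdp_mmu_iter_flush_cond_resched(): true means "yielded and flushed". */
static bool maybe_yield_and_flush(bool contended)
{
	return contended;
}

int main(void)
{
	bool can_yield = true;
	bool flush_needed = false;
	bool contended[] = { false, true, false };
	int i;

	for (i = 0; i < 3; i++)
		flush_needed = !can_yield || !maybe_yield_and_flush(contended[i]);

	/* The last pass did not yield, so a flush is still owed: prints 1. */
	printf("flush still needed: %d\n", flush_needed);
	return 0;
}
```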
faaf05b0 | 478 | /*
479 | * Tears down the mappings for the range of gfns, [start, end), and frees the | |
480 | * non-root pages mapping GFNs strictly within that range. Returns true if | |
481 | * SPTEs have been cleared and a TLB flush is needed before releasing the | |
482 | * MMU lock. | |
063afacd | 483 | * If can_yield is true, will release the MMU lock and reschedule if the
484 | * scheduler needs the CPU or there is contention on the MMU lock. If this | |
485 | * function cannot yield, it will not release the MMU lock or reschedule and | |
486 | * the caller must ensure it does not supply too large a GFN range, or the | |
487 | * operation can cause a soft lockup. | |
faaf05b0 | 488 | */
489 | static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, | |
063afacd | 490 | gfn_t start, gfn_t end, bool can_yield) |
faaf05b0 | 491 | {
492 | struct tdp_iter iter; | |
493 | bool flush_needed = false; | |
494 | ||
495 | tdp_root_for_each_pte(iter, root, start, end) { | |
496 | if (!is_shadow_present_pte(iter.old_spte)) | |
497 | continue; | |
498 | ||
499 | /* | |
500 | * If this is a non-last-level SPTE that covers a larger range | |
501 | * than should be zapped, continue, and zap the mappings at a | |
502 | * lower level. | |
503 | */ | |
504 | if ((iter.gfn < start || | |
505 | iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) && | |
506 | !is_last_spte(iter.old_spte, iter.level)) | |
507 | continue; | |
508 | ||
509 | tdp_mmu_set_spte(kvm, &iter, 0); | |
510 | ||
e28a436c | 511 | flush_needed = !can_yield ||
512 | !tdp_mmu_iter_flush_cond_resched(kvm, &iter); | |
faaf05b0 | 513 | }
514 | return flush_needed; | |
515 | } | |
516 | ||
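The `continue` in the middle of `zap_gfn_range()` encodes a range-coverage rule: a non-leaf SPTE whose span sticks out past [start, end) must not be zapped wholesale, so the walk descends and zaps at a lower level instead. An illustrative userspace check of that rule (assumed x86-style level sizes, not kernel code):

```c
#include <stdio.h>
#include <stdint.h>

static uint64_t pages_per_hpage(int level)
{
	return 1ULL << ((level - 1) * 9);	/* assumed: 1 = 4K, 2 = 2M, 3 = 1G */
}

/* True if the entry at (gfn, level) lies entirely inside [start, end). */
static int spte_fully_covered(uint64_t gfn, int level, uint64_t start, uint64_t end)
{
	return gfn >= start && gfn + pages_per_hpage(level) <= end;
}

int main(void)
{
	/* Zapping GFNs [0x0, 0x400): a 2 MiB entry at 0x200 fits entirely... */
	printf("%d\n", spte_fully_covered(0x200, 2, 0x0, 0x400));	/* 1: zap it */
	/* ...but a 1 GiB entry at 0x0 does not, so descend instead. */
	printf("%d\n", spte_fully_covered(0x0, 3, 0x0, 0x400));	/* 0: descend */
	return 0;
}
```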
517 | /* | |
518 | * Tears down the mappings for the range of gfns, [start, end), and frees the | |
519 | * non-root pages mapping GFNs strictly within that range. Returns true if | |
520 | * SPTEs have been cleared and a TLB flush is needed before releasing the | |
521 | * MMU lock. | |
522 | */ | |
523 | bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end) | |
524 | { | |
525 | struct kvm_mmu_page *root; | |
526 | bool flush = false; | |
527 | ||
a889ea54 | 528 | for_each_tdp_mmu_root_yield_safe(kvm, root) |
063afacd | 529 | flush |= zap_gfn_range(kvm, root, start, end, true); |
faaf05b0 | 530 | |
faaf05b0 | 531 | return flush;
532 | } | |
533 | ||
534 | void kvm_tdp_mmu_zap_all(struct kvm *kvm) | |
535 | { | |
339f5a7f | 536 | gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); |
faaf05b0 | 537 | bool flush;
538 | ||
539 | flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn); | |
540 | if (flush) | |
541 | kvm_flush_remote_tlbs(kvm); | |
542 | } | |
bb18842e | 543 |
544 | /* | |
545 | * Installs a last-level SPTE to handle a TDP page fault. | |
546 | * (NPT/EPT violation/misconfiguration) | |
547 | */ | |
548 | static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, | |
549 | int map_writable, | |
550 | struct tdp_iter *iter, | |
551 | kvm_pfn_t pfn, bool prefault) | |
552 | { | |
553 | u64 new_spte; | |
554 | int ret = 0; | |
555 | int make_spte_ret = 0; | |
556 | ||
557 | if (unlikely(is_noslot_pfn(pfn))) { | |
558 | new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); | |
559 | trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte); | |
33dd3574 | 560 | } else { |
bb18842e | 561 | make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
562 | pfn, iter->old_spte, prefault, true, | |
563 | map_writable, !shadow_accessed_mask, | |
564 | &new_spte); | |
33dd3574 | 565 | trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep);
566 | } | |
bb18842e | 567 |
568 | if (new_spte == iter->old_spte) | |
569 | ret = RET_PF_SPURIOUS; | |
570 | else | |
571 | tdp_mmu_set_spte(vcpu->kvm, iter, new_spte); | |
572 | ||
573 | /* | |
574 | * If the page fault was caused by a write but the page is write | |
575 | * protected, emulation is needed. If the emulation was skipped, | |
576 | * the vCPU would have the same fault again. | |
577 | */ | |
578 | if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) { | |
579 | if (write) | |
580 | ret = RET_PF_EMULATE; | |
581 | kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); | |
582 | } | |
583 | ||
584 | /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ | |
585 | if (unlikely(is_mmio_spte(new_spte))) | |
586 | ret = RET_PF_EMULATE; | |
587 | ||
588 | trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep); | |
589 | if (!prefault) | |
590 | vcpu->stat.pf_fixed++; | |
591 | ||
592 | return ret; | |
593 | } | |
594 | ||
595 | /* | |
596 | * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing | |
597 | * page tables and SPTEs to translate the faulting guest physical address. | |
598 | */ | |
599 | int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, | |
600 | int map_writable, int max_level, kvm_pfn_t pfn, | |
601 | bool prefault) | |
602 | { | |
603 | bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); | |
604 | bool write = error_code & PFERR_WRITE_MASK; | |
605 | bool exec = error_code & PFERR_FETCH_MASK; | |
606 | bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; | |
607 | struct kvm_mmu *mmu = vcpu->arch.mmu; | |
608 | struct tdp_iter iter; | |
89c0fd49 | 609 | struct kvm_mmu_page *sp; |
bb18842e | 610 | u64 *child_pt;
611 | u64 new_spte; | |
612 | int ret; | |
613 | gfn_t gfn = gpa >> PAGE_SHIFT; | |
614 | int level; | |
615 | int req_level; | |
616 | ||
617 | if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) | |
618 | return RET_PF_RETRY; | |
619 | if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))) | |
620 | return RET_PF_RETRY; | |
621 | ||
622 | level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, | |
623 | huge_page_disallowed, &req_level); | |
624 | ||
625 | trace_kvm_mmu_spte_requested(gpa, level, pfn); | |
626 | tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { | |
627 | if (nx_huge_page_workaround_enabled) | |
628 | disallowed_hugepage_adjust(iter.old_spte, gfn, | |
629 | iter.level, &pfn, &level); | |
630 | ||
631 | if (iter.level == level) | |
632 | break; | |
633 | ||
634 | /* | |
635 | * If there is an SPTE mapping a large page at a higher level | |
636 | * than the target, that SPTE must be cleared and replaced | |
637 | * with a non-leaf SPTE. | |
638 | */ | |
639 | if (is_shadow_present_pte(iter.old_spte) && | |
640 | is_large_pte(iter.old_spte)) { | |
641 | tdp_mmu_set_spte(vcpu->kvm, &iter, 0); | |
642 | ||
643 | kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn, | |
644 | KVM_PAGES_PER_HPAGE(iter.level)); | |
645 | ||
646 | /* | |
647 | * The iter must explicitly re-read the spte here | |
648 | * because the new value informs the !present | |
649 | * path below. | |
650 | */ | |
651 | iter.old_spte = READ_ONCE(*iter.sptep); | |
652 | } | |
653 | ||
654 | if (!is_shadow_present_pte(iter.old_spte)) { | |
89c0fd49 | 655 | sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
656 | list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages); | |
657 | child_pt = sp->spt; | |
bb18842e | 658 | new_spte = make_nonleaf_spte(child_pt,
659 | !shadow_accessed_mask); | |
660 | ||
661 | trace_kvm_mmu_get_page(sp, true); | |
29cf0f50 | 662 | if (huge_page_disallowed && req_level >= iter.level)
663 | account_huge_nx_page(vcpu->kvm, sp); | |
664 | ||
bb18842e | 665 | tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte);
666 | } | |
667 | } | |
668 | ||
669 | if (WARN_ON(iter.level != level)) | |
670 | return RET_PF_RETRY; | |
671 | ||
672 | ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter, | |
673 | pfn, prefault); | |
674 | ||
675 | return ret; | |
676 | } | |
063afacd | 677 |
678 | static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start, | |
679 | unsigned long end, unsigned long data, | |
680 | int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot, | |
681 | struct kvm_mmu_page *root, gfn_t start, | |
682 | gfn_t end, unsigned long data)) | |
683 | { | |
684 | struct kvm_memslots *slots; | |
685 | struct kvm_memory_slot *memslot; | |
686 | struct kvm_mmu_page *root; | |
687 | int ret = 0; | |
688 | int as_id; | |
689 | ||
a889ea54 | 690 | for_each_tdp_mmu_root_yield_safe(kvm, root) { |
063afacd | 691 | as_id = kvm_mmu_page_as_id(root);
692 | slots = __kvm_memslots(kvm, as_id); | |
693 | kvm_for_each_memslot(memslot, slots) { | |
694 | unsigned long hva_start, hva_end; | |
695 | gfn_t gfn_start, gfn_end; | |
696 | ||
697 | hva_start = max(start, memslot->userspace_addr); | |
698 | hva_end = min(end, memslot->userspace_addr + | |
699 | (memslot->npages << PAGE_SHIFT)); | |
700 | if (hva_start >= hva_end) | |
701 | continue; | |
702 | /* | |
703 | * {gfn(page) | page intersects with [hva_start, hva_end)} = | |
704 | * {gfn_start, gfn_start+1, ..., gfn_end-1}. | |
705 | */ | |
706 | gfn_start = hva_to_gfn_memslot(hva_start, memslot); | |
707 | gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); | |
708 | ||
709 | ret |= handler(kvm, memslot, root, gfn_start, | |
710 | gfn_end, data); | |
711 | } | |
063afacd | 712 | }
713 | ||
714 | return ret; | |
715 | } | |
716 | ||
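The slot-clamping arithmetic in `kvm_tdp_mmu_handle_hva_range()` can be exercised on its own. The sketch below is an illustrative userspace program; the memslot layout mirrors KVM's (`userspace_addr`, `npages`, `base_gfn`) but the values and the `hva_to_gfn()` helper are assumptions for the example, and 4 KiB pages are assumed.

```c
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

struct memslot {
	uint64_t userspace_addr;	/* HVA where the slot is mapped */
	uint64_t npages;
	uint64_t base_gfn;
};

static uint64_t hva_to_gfn(uint64_t hva, const struct memslot *slot)
{
	return slot->base_gfn + ((hva - slot->userspace_addr) >> PAGE_SHIFT);
}

int main(void)
{
	struct memslot slot = { 0x7f0000000000ULL, 1024, 0x100000 };
	uint64_t start = 0x7f0000003000ULL, end = 0x7f0000006000ULL;

	/* Clamp the notifier range to the slot, as the handler loop does. */
	uint64_t hva_start = start > slot.userspace_addr ? start : slot.userspace_addr;
	uint64_t hva_limit = slot.userspace_addr + slot.npages * PAGE_SIZE;
	uint64_t hva_end = end < hva_limit ? end : hva_limit;

	uint64_t gfn_start = hva_to_gfn(hva_start, &slot);
	uint64_t gfn_end = hva_to_gfn(hva_end + PAGE_SIZE - 1, &slot);

	printf("gfn range: [0x%llx, 0x%llx)\n",
	       (unsigned long long)gfn_start, (unsigned long long)gfn_end);
	return 0;
}
```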
717 | static int zap_gfn_range_hva_wrapper(struct kvm *kvm, | |
718 | struct kvm_memory_slot *slot, | |
719 | struct kvm_mmu_page *root, gfn_t start, | |
720 | gfn_t end, unsigned long unused) | |
721 | { | |
722 | return zap_gfn_range(kvm, root, start, end, false); | |
723 | } | |
724 | ||
725 | int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, | |
726 | unsigned long end) | |
727 | { | |
728 | return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0, | |
729 | zap_gfn_range_hva_wrapper); | |
730 | } | |
f8e14497 | 731 |
732 | /* | |
733 | * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero | |
734 | * if any of the GFNs in the range have been accessed. | |
735 | */ | |
736 | static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, | |
737 | struct kvm_mmu_page *root, gfn_t start, gfn_t end, | |
738 | unsigned long unused) | |
739 | { | |
740 | struct tdp_iter iter; | |
741 | int young = 0; | |
742 | u64 new_spte = 0; | |
743 | ||
744 | tdp_root_for_each_leaf_pte(iter, root, start, end) { | |
745 | /* | |
746 | * If we have a non-accessed entry we don't need to change the | |
747 | * pte. | |
748 | */ | |
749 | if (!is_accessed_spte(iter.old_spte)) | |
750 | continue; | |
751 | ||
752 | new_spte = iter.old_spte; | |
753 | ||
754 | if (spte_ad_enabled(new_spte)) { | |
755 | clear_bit((ffs(shadow_accessed_mask) - 1), | |
756 | (unsigned long *)&new_spte); | |
757 | } else { | |
758 | /* | |
759 | * Capture the dirty status of the page, so that it doesn't get | |
760 | * lost when the SPTE is marked for access tracking. | |
761 | */ | |
762 | if (is_writable_pte(new_spte)) | |
763 | kvm_set_pfn_dirty(spte_to_pfn(new_spte)); | |
764 | ||
765 | new_spte = mark_spte_for_access_track(new_spte); | |
766 | } | |
a6a0b05d | 767 | new_spte &= ~shadow_dirty_mask; |
f8e14497 | 768 |
769 | tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte); | |
770 | young = 1; | |
33dd3574 | 771 |
772 | trace_kvm_age_page(iter.gfn, iter.level, slot, young); | |
f8e14497 | 773 | }
774 | ||
775 | return young; | |
776 | } | |
777 | ||
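When A/D bits are in use, `age_gfn_range()` clears the accessed bit with `clear_bit(ffs(shadow_accessed_mask) - 1, ...)`; since the mask has a single bit set, that is equivalent to `new_spte &= ~shadow_accessed_mask`. An illustrative userspace sketch of the equivalence; the EPT-style bit-8 mask and the SPTE value are assumptions, not KVM's definitions.

```c
#include <stdio.h>
#include <stdint.h>
#include <strings.h>	/* ffs() */

int main(void)
{
	uint64_t shadow_accessed_mask = 1ULL << 8;	/* assumed single-bit mask */
	uint64_t spte = 0x907;				/* some accessed, writable SPTE */

	/* clear_bit(ffs(mask) - 1, ...) boils down to clearing that one bit. */
	spte &= ~(1ULL << (ffs((int)shadow_accessed_mask) - 1));

	printf("accessed bit clear: %d\n", !(spte & shadow_accessed_mask));
	return 0;
}
```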
778 | int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start, | |
779 | unsigned long end) | |
780 | { | |
781 | return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0, | |
782 | age_gfn_range); | |
783 | } | |
784 | ||
785 | static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, | |
786 | struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused, | |
787 | unsigned long unused2) | |
788 | { | |
789 | struct tdp_iter iter; | |
790 | ||
791 | tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) | |
792 | if (is_accessed_spte(iter.old_spte)) | |
793 | return 1; | |
794 | ||
795 | return 0; | |
796 | } | |
797 | ||
798 | int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva) | |
799 | { | |
800 | return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0, | |
801 | test_age_gfn); | |
802 | } | |
1d8dd6b3 | 803 |
804 | /* | |
805 | * Handle the changed_pte MMU notifier for the TDP MMU. | |
806 | * data is a pointer to the new pte_t mapping the HVA specified by the MMU | |
807 | * notifier. | |
808 | * Returns non-zero if a flush is needed before releasing the MMU lock. | |
809 | */ | |
810 | static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, | |
811 | struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused, | |
812 | unsigned long data) | |
813 | { | |
814 | struct tdp_iter iter; | |
815 | pte_t *ptep = (pte_t *)data; | |
816 | kvm_pfn_t new_pfn; | |
817 | u64 new_spte; | |
818 | int need_flush = 0; | |
819 | ||
820 | WARN_ON(pte_huge(*ptep)); | |
821 | ||
822 | new_pfn = pte_pfn(*ptep); | |
823 | ||
824 | tdp_root_for_each_pte(iter, root, gfn, gfn + 1) { | |
825 | if (iter.level != PG_LEVEL_4K) | |
826 | continue; | |
827 | ||
828 | if (!is_shadow_present_pte(iter.old_spte)) | |
829 | break; | |
830 | ||
831 | tdp_mmu_set_spte(kvm, &iter, 0); | |
832 | ||
833 | kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1); | |
834 | ||
835 | if (!pte_write(*ptep)) { | |
836 | new_spte = kvm_mmu_changed_pte_notifier_make_spte( | |
837 | iter.old_spte, new_pfn); | |
838 | ||
839 | tdp_mmu_set_spte(kvm, &iter, new_spte); | |
840 | } | |
841 | ||
842 | need_flush = 1; | |
843 | } | |
844 | ||
845 | if (need_flush) | |
846 | kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); | |
847 | ||
848 | return 0; | |
849 | } | |
850 | ||
851 | int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address, | |
852 | pte_t *host_ptep) | |
853 | { | |
854 | return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1, | |
855 | (unsigned long)host_ptep, | |
856 | set_tdp_spte); | |
857 | } | |
858 | ||
a6a0b05d | 859 | /*
860 | * Remove write access from all the SPTEs mapping GFNs [start, end). Only
861 | * leaf SPTEs at or above min_level will be write-protected.
862 | * Returns true if an SPTE has been changed and the TLBs need to be flushed. | |
863 | */ | |
864 | static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, | |
865 | gfn_t start, gfn_t end, int min_level) | |
866 | { | |
867 | struct tdp_iter iter; | |
868 | u64 new_spte; | |
869 | bool spte_set = false; | |
870 | ||
871 | BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); | |
872 | ||
873 | for_each_tdp_pte_min_level(iter, root->spt, root->role.level, | |
874 | min_level, start, end) { | |
875 | if (!is_shadow_present_pte(iter.old_spte) || | |
876 | !is_last_spte(iter.old_spte, iter.level)) | |
877 | continue; | |
878 | ||
879 | new_spte = iter.old_spte & ~PT_WRITABLE_MASK; | |
880 | ||
881 | tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); | |
882 | spte_set = true; | |
883 | ||
884 | tdp_mmu_iter_cond_resched(kvm, &iter); | |
885 | } | |
886 | return spte_set; | |
887 | } | |
888 | ||
889 | /* | |
890 | * Remove write access from all the SPTEs mapping GFNs in the memslot. Will | |
891 | * only affect leaf SPTEs down to min_level. | |
892 | * Returns true if an SPTE has been changed and the TLBs need to be flushed. | |
893 | */ | |
894 | bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, | |
895 | int min_level) | |
896 | { | |
897 | struct kvm_mmu_page *root; | |
898 | int root_as_id; | |
899 | bool spte_set = false; | |
900 | ||
a889ea54 | 901 | for_each_tdp_mmu_root_yield_safe(kvm, root) { |
a6a0b05d | 902 | root_as_id = kvm_mmu_page_as_id(root);
903 | if (root_as_id != slot->as_id) | |
904 | continue; | |
905 | ||
a6a0b05d | 906 | spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
907 | slot->base_gfn + slot->npages, min_level); | |
a6a0b05d | 908 | }
909 | ||
910 | return spte_set; | |
911 | } | |
912 | ||
913 | /* | |
914 | * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If | |
915 | * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. | |
916 | * If AD bits are not enabled, this will require clearing the writable bit on | |
917 | * each SPTE. Returns true if an SPTE has been changed and the TLBs need to | |
918 | * be flushed. | |
919 | */ | |
920 | static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, | |
921 | gfn_t start, gfn_t end) | |
922 | { | |
923 | struct tdp_iter iter; | |
924 | u64 new_spte; | |
925 | bool spte_set = false; | |
926 | ||
927 | tdp_root_for_each_leaf_pte(iter, root, start, end) { | |
928 | if (spte_ad_need_write_protect(iter.old_spte)) { | |
929 | if (is_writable_pte(iter.old_spte)) | |
930 | new_spte = iter.old_spte & ~PT_WRITABLE_MASK; | |
931 | else | |
932 | continue; | |
933 | } else { | |
934 | if (iter.old_spte & shadow_dirty_mask) | |
935 | new_spte = iter.old_spte & ~shadow_dirty_mask; | |
936 | else | |
937 | continue; | |
938 | } | |
939 | ||
940 | tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); | |
941 | spte_set = true; | |
942 | ||
943 | tdp_mmu_iter_cond_resched(kvm, &iter); | |
944 | } | |
945 | return spte_set; | |
946 | } | |
947 | ||
948 | /* | |
949 | * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If | |
950 | * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. | |
951 | * If AD bits are not enabled, this will require clearing the writable bit on | |
952 | * each SPTE. Returns true if an SPTE has been changed and the TLBs need to | |
953 | * be flushed. | |
954 | */ | |
955 | bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot) | |
956 | { | |
957 | struct kvm_mmu_page *root; | |
958 | int root_as_id; | |
959 | bool spte_set = false; | |
960 | ||
a889ea54 | 961 | for_each_tdp_mmu_root_yield_safe(kvm, root) { |
a6a0b05d | 962 | root_as_id = kvm_mmu_page_as_id(root);
963 | if (root_as_id != slot->as_id) | |
964 | continue; | |
965 | ||
a6a0b05d | 966 | spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
967 | slot->base_gfn + slot->npages); | |
a6a0b05d | 968 | }
969 | ||
970 | return spte_set; | |
971 | } | |
972 | ||
973 | /* | |
974 | * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is | |
975 | * set in mask, starting at gfn. The given memslot is expected to contain all | |
976 | * the GFNs represented by set bits in the mask. If AD bits are enabled, | |
977 | * clearing the dirty status will involve clearing the dirty bit on each SPTE | |
978 | * or, if AD bits are not enabled, clearing the writable bit on each SPTE. | |
979 | */ | |
980 | static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, | |
981 | gfn_t gfn, unsigned long mask, bool wrprot) | |
982 | { | |
983 | struct tdp_iter iter; | |
984 | u64 new_spte; | |
985 | ||
986 | tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), | |
987 | gfn + BITS_PER_LONG) { | |
988 | if (!mask) | |
989 | break; | |
990 | ||
991 | if (iter.level > PG_LEVEL_4K || | |
992 | !(mask & (1UL << (iter.gfn - gfn)))) | |
993 | continue; | |
994 | ||
995 | if (wrprot || spte_ad_need_write_protect(iter.old_spte)) { | |
996 | if (is_writable_pte(iter.old_spte)) | |
997 | new_spte = iter.old_spte & ~PT_WRITABLE_MASK; | |
998 | else | |
999 | continue; | |
1000 | } else { | |
1001 | if (iter.old_spte & shadow_dirty_mask) | |
1002 | new_spte = iter.old_spte & ~shadow_dirty_mask; | |
1003 | else | |
1004 | continue; | |
1005 | } | |
1006 | ||
1007 | tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); | |
1008 | ||
1009 | mask &= ~(1UL << (iter.gfn - gfn)); | |
1010 | } | |
1011 | } | |
1012 | ||
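The mask handling in `clear_dirty_pt_masked()` is a small bitmap walk: bit i of `mask` selects `gfn + i`, each GFN handled clears its bit, and the loop can stop as soon as the mask is empty. An illustrative userspace version of just that walk (not kernel code):

```c
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t gfn = 0x1000;
	unsigned long mask = 0x15;	/* dirty-logged GFNs: gfn+0, gfn+2, gfn+4 */
	uint64_t cur;

	for (cur = gfn; mask; cur++) {
		if (!(mask & (1UL << (cur - gfn))))
			continue;

		printf("clear dirty state for gfn 0x%llx\n", (unsigned long long)cur);
		mask &= ~(1UL << (cur - gfn));
	}
	return 0;
}
```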
1013 | /* | |
1014 | * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is | |
1015 | * set in mask, starting at gfn. The given memslot is expected to contain all | |
1016 | * the GFNs represented by set bits in the mask. If AD bits are enabled, | |
1017 | * clearing the dirty status will involve clearing the dirty bit on each SPTE | |
1018 | * or, if AD bits are not enabled, clearing the writable bit on each SPTE. | |
1019 | */ | |
1020 | void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, | |
1021 | struct kvm_memory_slot *slot, | |
1022 | gfn_t gfn, unsigned long mask, | |
1023 | bool wrprot) | |
1024 | { | |
1025 | struct kvm_mmu_page *root; | |
1026 | int root_as_id; | |
1027 | ||
1028 | lockdep_assert_held(&kvm->mmu_lock); | |
1029 | for_each_tdp_mmu_root(kvm, root) { | |
1030 | root_as_id = kvm_mmu_page_as_id(root); | |
1031 | if (root_as_id != slot->as_id) | |
1032 | continue; | |
1033 | ||
1034 | clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); | |
1035 | } | |
1036 | } | |
1037 | ||
1038 | /* | |
1039 | * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is | |
1040 | * only used for PML, and so will involve setting the dirty bit on each SPTE. | |
1041 | * Returns true if an SPTE has been changed and the TLBs need to be flushed. | |
1042 | */ | |
1043 | static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, | |
1044 | gfn_t start, gfn_t end) | |
1045 | { | |
1046 | struct tdp_iter iter; | |
1047 | u64 new_spte; | |
1048 | bool spte_set = false; | |
1049 | ||
1050 | tdp_root_for_each_pte(iter, root, start, end) { | |
1051 | if (!is_shadow_present_pte(iter.old_spte)) | |
1052 | continue; | |
1053 | ||
1054 | new_spte = iter.old_spte | shadow_dirty_mask; | |
1055 | ||
1056 | tdp_mmu_set_spte(kvm, &iter, new_spte); | |
1057 | spte_set = true; | |
1058 | ||
1059 | tdp_mmu_iter_cond_resched(kvm, &iter); | |
1060 | } | |
1061 | ||
1062 | return spte_set; | |
1063 | } | |
1064 | ||
1065 | /* | |
1066 | * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is | |
1067 | * only used for PML, and so will involve setting the dirty bit on each SPTE. | |
1068 | * Returns true if an SPTE has been changed and the TLBs need to be flushed. | |
1069 | */ | |
1070 | bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) | |
1071 | { | |
1072 | struct kvm_mmu_page *root; | |
1073 | int root_as_id; | |
1074 | bool spte_set = false; | |
1075 | ||
a889ea54 | 1076 | for_each_tdp_mmu_root_yield_safe(kvm, root) { |
a6a0b05d | 1077 | root_as_id = kvm_mmu_page_as_id(root);
1078 | if (root_as_id != slot->as_id) | |
1079 | continue; | |
1080 | ||
a6a0b05d | 1081 | spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn,
1082 | slot->base_gfn + slot->npages); | |
a6a0b05d | 1083 | }
1084 | return spte_set; | |
1085 | } | |
1086 | ||
14881998 | 1087 | /* |
87aa9ec9 | 1088 | * Clear leaf entries which could be replaced by large mappings, for
1089 | * GFNs within the slot. | |
14881998 | 1090 | */
1091 | static void zap_collapsible_spte_range(struct kvm *kvm, | |
1092 | struct kvm_mmu_page *root, | |
1093 | gfn_t start, gfn_t end) | |
1094 | { | |
1095 | struct tdp_iter iter; | |
1096 | kvm_pfn_t pfn; | |
1097 | bool spte_set = false; | |
1098 | ||
1099 | tdp_root_for_each_pte(iter, root, start, end) { | |
1100 | if (!is_shadow_present_pte(iter.old_spte) || | |
87aa9ec9 | 1101 | !is_last_spte(iter.old_spte, iter.level)) |
14881998 | 1102 | continue;
1103 | ||
1104 | pfn = spte_to_pfn(iter.old_spte); | |
1105 | if (kvm_is_reserved_pfn(pfn) || | |
1106 | !PageTransCompoundMap(pfn_to_page(pfn))) | |
1107 | continue; | |
1108 | ||
1109 | tdp_mmu_set_spte(kvm, &iter, 0); | |
1110 | ||
e28a436c | 1111 | spte_set = !tdp_mmu_iter_flush_cond_resched(kvm, &iter); |
14881998 | 1112 | }
1113 | ||
1114 | if (spte_set) | |
1115 | kvm_flush_remote_tlbs(kvm); | |
1116 | } | |
1117 | ||
1118 | /* | |
1119 | * Clear non-leaf entries (and free associated page tables) which could | |
1120 | * be replaced by large mappings, for GFNs within the slot. | |
1121 | */ | |
1122 | void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, | |
1123 | const struct kvm_memory_slot *slot) | |
1124 | { | |
1125 | struct kvm_mmu_page *root; | |
1126 | int root_as_id; | |
1127 | ||
a889ea54 | 1128 | for_each_tdp_mmu_root_yield_safe(kvm, root) { |
14881998 | 1129 | root_as_id = kvm_mmu_page_as_id(root);
1130 | if (root_as_id != slot->as_id) | |
1131 | continue; | |
1132 | ||
14881998 | 1133 | zap_collapsible_spte_range(kvm, root, slot->base_gfn,
1134 | slot->base_gfn + slot->npages); | |
14881998 | 1135 | }
1136 | } | |
46044f72 | 1137 |
1138 | /* | |
1139 | * Removes write access on the last level SPTE mapping this GFN and unsets the | |
1140 | * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted. | |
1141 | * Returns true if an SPTE was set and a TLB flush is needed. | |
1142 | */ | |
1143 | static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, | |
1144 | gfn_t gfn) | |
1145 | { | |
1146 | struct tdp_iter iter; | |
1147 | u64 new_spte; | |
1148 | bool spte_set = false; | |
1149 | ||
1150 | tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) { | |
1151 | if (!is_writable_pte(iter.old_spte)) | |
1152 | break; | |
1153 | ||
1154 | new_spte = iter.old_spte & | |
1155 | ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); | |
1156 | ||
1157 | tdp_mmu_set_spte(kvm, &iter, new_spte); | |
1158 | spte_set = true; | |
1159 | } | |
1160 | ||
1161 | return spte_set; | |
1162 | } | |
1163 | ||
1164 | /* | |
1165 | * Removes write access on the last level SPTE mapping this GFN and unsets the | |
1166 | * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted. | |
1167 | * Returns true if an SPTE was set and a TLB flush is needed. | |
1168 | */ | |
1169 | bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, | |
1170 | struct kvm_memory_slot *slot, gfn_t gfn) | |
1171 | { | |
1172 | struct kvm_mmu_page *root; | |
1173 | int root_as_id; | |
1174 | bool spte_set = false; | |
1175 | ||
1176 | lockdep_assert_held(&kvm->mmu_lock); | |
1177 | for_each_tdp_mmu_root(kvm, root) { | |
1178 | root_as_id = kvm_mmu_page_as_id(root); | |
1179 | if (root_as_id != slot->as_id) | |
1180 | continue; | |
1181 | ||
1182 | spte_set |= write_protect_gfn(kvm, root, gfn); | |
1183 | } | |
1184 | return spte_set; | |
1185 | } | |
1186 | ||
95fb5b02 | 1187 | /*
1188 | * Return the level of the lowest level SPTE added to sptes. | |
1189 | * That SPTE may be non-present. | |
1190 | */ | |
39b4d43e | 1191 | int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1192 | int *root_level) | |
95fb5b02 | 1193 | {
1194 | struct tdp_iter iter; | |
1195 | struct kvm_mmu *mmu = vcpu->arch.mmu; | |
95fb5b02 | 1196 | gfn_t gfn = addr >> PAGE_SHIFT; |
2aa07893 | 1197 | int leaf = -1; |
95fb5b02 | 1198 | |
39b4d43e | 1199 | *root_level = vcpu->arch.mmu->shadow_root_level; |
95fb5b02 | 1200 |
1201 | tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { | |
1202 | leaf = iter.level; | |
dde81f94 | 1203 | sptes[leaf] = iter.old_spte; |
95fb5b02 | 1204 | }
1205 | ||
1206 | return leaf; | |
1207 | } |
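A caller of `kvm_tdp_mmu_get_walk()` gets one SPTE per level, indexed by level, plus the lowest level the walk reached (or -1 if the root was invalid). The sketch below only illustrates that indexing convention from the consumer's side; the array size, level values, and names are assumptions for the example, not KVM's actual caller.

```c
#include <stdio.h>
#include <stdint.h>

#define MAX_LEVEL	5	/* assumed upper bound on paging levels */

int main(void)
{
	uint64_t sptes[MAX_LEVEL + 1] = { 0 };	/* filled by the walk, indexed by level */
	int root_level = 4;			/* e.g. 4-level paging */
	int leaf = 1;				/* pretend the walk reached a 4 KiB leaf */
	int level;

	for (level = root_level; level >= leaf; level--)
		printf("level %d: spte = 0x%llx\n",
		       level, (unsigned long long)sptes[level]);
	return 0;
}
```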