Commit | Line | Data |
---|---|---|
fe5db27d BG |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
02c00b3a BG |
3 | #include "mmu.h" |
4 | #include "mmu_internal.h" | |
bb18842e | 5 | #include "mmutrace.h" |
2f2fad08 | 6 | #include "tdp_iter.h" |
fe5db27d | 7 | #include "tdp_mmu.h" |
02c00b3a | 8 | #include "spte.h" |
fe5db27d BG |
9 | |
10 | static bool __read_mostly tdp_mmu_enabled = false; | |
11 | ||
12 | static bool is_tdp_mmu_enabled(void) | |
13 | { | |
14 | #ifdef CONFIG_X86_64 | |
15 | return tdp_enabled && READ_ONCE(tdp_mmu_enabled); | |
16 | #else | |
17 | return false; | |
18 | #endif /* CONFIG_X86_64 */ | |
19 | } | |
20 | ||
21 | /* Initializes the TDP MMU for the VM, if enabled. */ | |
22 | void kvm_mmu_init_tdp_mmu(struct kvm *kvm) | |
23 | { | |
24 | if (!is_tdp_mmu_enabled()) | |
25 | return; | |
26 | ||
27 | /* This should not be changed for the lifetime of the VM. */ | |
28 | kvm->arch.tdp_mmu_enabled = true; | |
02c00b3a BG |
29 | |
30 | INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); | |
89c0fd49 | 31 | INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); |
fe5db27d BG |
32 | } |
33 | ||
34 | void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) | |
35 | { | |
36 | if (!kvm->arch.tdp_mmu_enabled) | |
37 | return; | |
02c00b3a BG |
38 | |
39 | WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); | |
40 | } | |
41 | ||
42 | #define for_each_tdp_mmu_root(_kvm, _root) \ | |
43 | list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) | |
44 | ||
45 | bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) | |
46 | { | |
47 | struct kvm_mmu_page *sp; | |
48 | ||
49 | sp = to_shadow_page(hpa); | |
50 | ||
51 | return sp->tdp_mmu_page && sp->root_count; | |
52 | } | |
53 | ||
faaf05b0 | 54 | static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, |
063afacd | 55 | gfn_t start, gfn_t end, bool can_yield); |
faaf05b0 | 56 | |
02c00b3a BG |
57 | void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) |
58 | { | |
faaf05b0 BG |
59 | gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT); |
60 | ||
02c00b3a BG |
61 | lockdep_assert_held(&kvm->mmu_lock); |
62 | ||
63 | WARN_ON(root->root_count); | |
64 | WARN_ON(!root->tdp_mmu_page); | |
65 | ||
66 | list_del(&root->link); | |
67 | ||
063afacd | 68 | zap_gfn_range(kvm, root, 0, max_gfn, false); |
faaf05b0 | 69 | |
02c00b3a BG |
70 | free_page((unsigned long)root->spt); |
71 | kmem_cache_free(mmu_page_header_cache, root); | |
72 | } | |
73 | ||
74 | static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu, | |
75 | int level) | |
76 | { | |
77 | union kvm_mmu_page_role role; | |
78 | ||
79 | role = vcpu->arch.mmu->mmu_role.base; | |
80 | role.level = level; | |
81 | role.direct = true; | |
82 | role.gpte_is_8_bytes = true; | |
83 | role.access = ACC_ALL; | |
84 | ||
85 | return role; | |
86 | } | |
87 | ||
88 | static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn, | |
89 | int level) | |
90 | { | |
91 | struct kvm_mmu_page *sp; | |
92 | ||
93 | sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); | |
94 | sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); | |
95 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); | |
96 | ||
97 | sp->role.word = page_role_for_level(vcpu, level).word; | |
98 | sp->gfn = gfn; | |
99 | sp->tdp_mmu_page = true; | |
100 | ||
101 | return sp; | |
102 | } | |
103 | ||
104 | static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu) | |
105 | { | |
106 | union kvm_mmu_page_role role; | |
107 | struct kvm *kvm = vcpu->kvm; | |
108 | struct kvm_mmu_page *root; | |
109 | ||
110 | role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level); | |
111 | ||
112 | spin_lock(&kvm->mmu_lock); | |
113 | ||
114 | /* Check for an existing root before allocating a new one. */ | |
115 | for_each_tdp_mmu_root(kvm, root) { | |
116 | if (root->role.word == role.word) { | |
117 | kvm_mmu_get_root(kvm, root); | |
118 | spin_unlock(&kvm->mmu_lock); | |
119 | return root; | |
120 | } | |
121 | } | |
122 | ||
123 | root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level); | |
124 | root->root_count = 1; | |
125 | ||
126 | list_add(&root->link, &kvm->arch.tdp_mmu_roots); | |
127 | ||
128 | spin_unlock(&kvm->mmu_lock); | |
129 | ||
130 | return root; | |
131 | } | |
132 | ||
133 | hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) | |
134 | { | |
135 | struct kvm_mmu_page *root; | |
136 | ||
137 | root = get_tdp_mmu_vcpu_root(vcpu); | |
138 | if (!root) | |
139 | return INVALID_PAGE; | |
140 | ||
141 | return __pa(root->spt); | |
fe5db27d | 142 | } |
2f2fad08 BG |
143 | |
144 | static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, | |
145 | u64 old_spte, u64 new_spte, int level); | |
146 | ||
faaf05b0 BG |
147 | static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) |
148 | { | |
149 | return sp->role.smm ? 1 : 0; | |
150 | } | |
151 | ||
f8e14497 BG |
152 | static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) |
153 | { | |
154 | bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); | |
155 | ||
156 | if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level)) | |
157 | return; | |
158 | ||
159 | if (is_accessed_spte(old_spte) && | |
160 | (!is_accessed_spte(new_spte) || pfn_changed)) | |
161 | kvm_set_pfn_accessed(spte_to_pfn(old_spte)); | |
162 | } | |
163 | ||
a6a0b05d BG |
164 | static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, |
165 | u64 old_spte, u64 new_spte, int level) | |
166 | { | |
167 | bool pfn_changed; | |
168 | struct kvm_memory_slot *slot; | |
169 | ||
170 | if (level > PG_LEVEL_4K) | |
171 | return; | |
172 | ||
173 | pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); | |
174 | ||
175 | if ((!is_writable_pte(old_spte) || pfn_changed) && | |
176 | is_writable_pte(new_spte)) { | |
177 | slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn); | |
178 | mark_page_dirty_in_slot(slot, gfn); | |
179 | } | |
180 | } | |
181 | ||
2f2fad08 BG |
182 | /** |
183 | * __handle_changed_spte - handle bookkeeping associated with an SPTE change | |
184 | * @kvm: kvm instance | |
185 | * @as_id: the address space of the paging structure the SPTE was a part of | |
186 | * @gfn: the base GFN that was mapped by the SPTE | |
187 | * @old_spte: The value of the SPTE before the change | |
188 | * @new_spte: The value of the SPTE after the change | |
189 | * @level: the level of the PT the SPTE is part of in the paging structure | |
190 | * | |
191 | * Handle bookkeeping that might result from the modification of a SPTE. | |
192 | * This function must be called for all TDP SPTE modifications. | |
193 | */ | |
194 | static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, | |
195 | u64 old_spte, u64 new_spte, int level) | |
196 | { | |
197 | bool was_present = is_shadow_present_pte(old_spte); | |
198 | bool is_present = is_shadow_present_pte(new_spte); | |
199 | bool was_leaf = was_present && is_last_spte(old_spte, level); | |
200 | bool is_leaf = is_present && is_last_spte(new_spte, level); | |
201 | bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); | |
202 | u64 *pt; | |
89c0fd49 | 203 | struct kvm_mmu_page *sp; |
2f2fad08 BG |
204 | u64 old_child_spte; |
205 | int i; | |
206 | ||
207 | WARN_ON(level > PT64_ROOT_MAX_LEVEL); | |
208 | WARN_ON(level < PG_LEVEL_4K); | |
209 | WARN_ON(gfn % KVM_PAGES_PER_HPAGE(level)); | |
210 | ||
211 | /* | |
212 | * If this warning were to trigger it would indicate that there was a | |
213 | * missing MMU notifier or a race with some notifier handler. | |
214 | * A present, leaf SPTE should never be directly replaced with another | |
215 | * present leaf SPTE pointing to a different PFN. A notifier handler | |
216 | * should be zapping the SPTE before the main MM's page table is | |
217 | * changed, or the SPTE should be zeroed, and the TLBs flushed by the | |
218 | * thread before replacement. | |
219 | */ | |
220 | if (was_leaf && is_leaf && pfn_changed) { | |
221 | pr_err("Invalid SPTE change: cannot replace a present leaf\n" | |
222 | "SPTE with another present leaf SPTE mapping a\n" | |
223 | "different PFN!\n" | |
224 | "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", | |
225 | as_id, gfn, old_spte, new_spte, level); | |
226 | ||
227 | /* | |
228 | * Crash the host to prevent error propagation and guest data | |
229 | * corruption. | |
230 | */ | |
231 | BUG(); | |
232 | } | |
233 | ||
234 | if (old_spte == new_spte) | |
235 | return; | |
236 | ||
237 | /* | |
238 | * The only time an SPTE should be changed from a non-present to | |
239 | * non-present state is when an MMIO entry is installed/modified/ | |
240 | * removed. In that case, there is nothing to do here. | |
241 | */ | |
242 | if (!was_present && !is_present) { | |
243 | /* | |
244 | * If this change does not involve an MMIO SPTE, it is | |
245 | * unexpected. Log the change, though it should not impact the | |
246 | * guest since both the former and current SPTEs are nonpresent. | |
247 | */ | |
248 | if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte))) | |
249 | pr_err("Unexpected SPTE change! Nonpresent SPTEs\n" | |
250 | "should not be replaced with another,\n" | |
251 | "different nonpresent SPTE, unless one or both\n" | |
252 | "are MMIO SPTEs.\n" | |
253 | "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", | |
254 | as_id, gfn, old_spte, new_spte, level); | |
255 | return; | |
256 | } | |
257 | ||
258 | ||
259 | if (was_leaf && is_dirty_spte(old_spte) && | |
260 | (!is_dirty_spte(new_spte) || pfn_changed)) | |
261 | kvm_set_pfn_dirty(spte_to_pfn(old_spte)); | |
262 | ||
263 | /* | |
264 | * Recursively handle child PTs if the change removed a subtree from | |
265 | * the paging structure. | |
266 | */ | |
267 | if (was_present && !was_leaf && (pfn_changed || !is_present)) { | |
268 | pt = spte_to_child_pt(old_spte, level); | |
89c0fd49 BG |
269 | sp = sptep_to_sp(pt); |
270 | ||
271 | list_del(&sp->link); | |
2f2fad08 BG |
272 | |
273 | for (i = 0; i < PT64_ENT_PER_PAGE; i++) { | |
274 | old_child_spte = READ_ONCE(*(pt + i)); | |
275 | WRITE_ONCE(*(pt + i), 0); | |
276 | handle_changed_spte(kvm, as_id, | |
277 | gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)), | |
278 | old_child_spte, 0, level - 1); | |
279 | } | |
280 | ||
281 | kvm_flush_remote_tlbs_with_address(kvm, gfn, | |
282 | KVM_PAGES_PER_HPAGE(level)); | |
283 | ||
284 | free_page((unsigned long)pt); | |
89c0fd49 | 285 | kmem_cache_free(mmu_page_header_cache, sp); |
2f2fad08 BG |
286 | } |
287 | } | |
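An aside on the transition matrix above: the was_present/was_leaf/is_present flags split every SPTE change into a few cases, and the recursive teardown only triggers when a non-leaf subtree is disconnected. Below is a stand-alone sketch of that classification in plain user-space C, with made-up bit positions (real SPTE layouts differ; illustration only):

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical bit layout for illustration; real SPTE bits differ. */
#define SPTE_PRESENT (1ULL << 0)
#define SPTE_LEAF    (1ULL << 7)

static const char *classify(uint64_t old_spte, uint64_t new_spte)
{
	bool was_present = old_spte & SPTE_PRESENT;
	bool is_present  = new_spte & SPTE_PRESENT;
	bool was_leaf    = was_present && (old_spte & SPTE_LEAF);

	if (!was_present && !is_present)
		return "nonpresent -> nonpresent: only legal for MMIO SPTEs";
	if (was_present && !was_leaf && !is_present)
		return "subtree removed: recurse into the child page table";
	return "ordinary install or update";
}

int main(void)
{
	printf("%s\n", classify(0, 0));
	printf("%s\n", classify(SPTE_PRESENT, 0));
	printf("%s\n", classify(0, SPTE_PRESENT | SPTE_LEAF));
	return 0;
}
```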
288 | ||
289 | static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, | |
290 | u64 old_spte, u64 new_spte, int level) | |
291 | { | |
292 | __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level); | |
f8e14497 | 293 | handle_changed_spte_acc_track(old_spte, new_spte, level); |
a6a0b05d BG |
294 | handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, |
295 | new_spte, level); | |
2f2fad08 | 296 | } |
faaf05b0 | 297 | |
f8e14497 | 298 | static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, |
a6a0b05d BG |
299 | u64 new_spte, bool record_acc_track, |
300 | bool record_dirty_log) | |
faaf05b0 BG |
301 | { |
302 | u64 *root_pt = tdp_iter_root_pt(iter); | |
303 | struct kvm_mmu_page *root = sptep_to_sp(root_pt); | |
304 | int as_id = kvm_mmu_page_as_id(root); | |
305 | ||
f8e14497 BG |
306 | WRITE_ONCE(*iter->sptep, new_spte); |
307 | ||
308 | __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, | |
309 | iter->level); | |
310 | if (record_acc_track) | |
311 | handle_changed_spte_acc_track(iter->old_spte, new_spte, | |
312 | iter->level); | |
a6a0b05d BG |
313 | if (record_dirty_log) |
314 | handle_changed_spte_dirty_log(kvm, as_id, iter->gfn, | |
315 | iter->old_spte, new_spte, | |
316 | iter->level); | |
f8e14497 BG |
317 | } |
318 | ||
319 | static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, | |
320 | u64 new_spte) | |
321 | { | |
a6a0b05d | 322 | __tdp_mmu_set_spte(kvm, iter, new_spte, true, true); |
f8e14497 | 323 | } |
faaf05b0 | 324 | |
f8e14497 BG |
325 | static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm, |
326 | struct tdp_iter *iter, | |
327 | u64 new_spte) | |
328 | { | |
a6a0b05d BG |
329 | __tdp_mmu_set_spte(kvm, iter, new_spte, false, true); |
330 | } | |
331 | ||
332 | static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, | |
333 | struct tdp_iter *iter, | |
334 | u64 new_spte) | |
335 | { | |
336 | __tdp_mmu_set_spte(kvm, iter, new_spte, true, false); | |
faaf05b0 BG |
337 | } |
338 | ||
339 | #define tdp_root_for_each_pte(_iter, _root, _start, _end) \ | |
340 | for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end) | |
341 | ||
f8e14497 BG |
342 | #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \ |
343 | tdp_root_for_each_pte(_iter, _root, _start, _end) \ | |
344 | if (!is_shadow_present_pte(_iter.old_spte) || \ | |
345 | !is_last_spte(_iter.old_spte, _iter.level)) \ | |
346 | continue; \ | |
347 | else | |
348 | ||
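The macro above relies on a classic C trick: a `for` loop whose last statement is `if (skip) continue; else`, so the caller's statement becomes the `else` branch and non-matching entries are filtered out without a helper function. A minimal stand-alone illustration (the `for_each_even` macro is hypothetical, not part of KVM):

```c
#include <stdio.h>

/* Iterate 0..n-1 but skip odd values inside the macro itself; the
 * trailing "else" lets a user-supplied body follow the macro. */
#define for_each_even(_i, _n)			\
	for ((_i) = 0; (_i) < (_n); (_i)++)	\
		if ((_i) & 1)			\
			continue;		\
		else

int main(void)
{
	int i;

	for_each_even(i, 10)
		printf("%d\n", i);	/* prints 0 2 4 6 8 */
	return 0;
}
```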
bb18842e BG |
349 | #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ |
350 | for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \ | |
351 | _mmu->shadow_root_level, _start, _end) | |
352 | ||
faaf05b0 BG |
353 | /* |
354 | * Flush the TLBs and reschedule if the MMU lock should be dropped. | |
355 | * Return whether the caller still needs to flush the TLBs. | |
356 | */ | |
357 | static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter) | |
358 | { | |
359 | if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { | |
360 | kvm_flush_remote_tlbs(kvm); | |
361 | cond_resched_lock(&kvm->mmu_lock); | |
362 | tdp_iter_refresh_walk(iter); | |
363 | return false; | |
364 | } else { | |
365 | return true; | |
366 | } | |
367 | } | |
368 | ||
a6a0b05d BG |
369 | static void tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter) |
370 | { | |
371 | if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { | |
372 | cond_resched_lock(&kvm->mmu_lock); | |
373 | tdp_iter_refresh_walk(iter); | |
374 | } | |
375 | } | |
376 | ||
faaf05b0 BG |
377 | /* |
378 | * Tears down the mappings for the range of gfns, [start, end), and frees the | |
379 | * non-root pages mapping GFNs strictly within that range. Returns true if | |
380 | * SPTEs have been cleared and a TLB flush is needed before releasing the | |
381 | * MMU lock. | |
063afacd BG |
382 | * If can_yield is true, this function will release the MMU lock and | |
383 | * reschedule if the scheduler needs the CPU or the MMU lock is contended. | |
384 | * If it cannot yield, it will not release the MMU lock or reschedule and | |
385 | * the caller must ensure it does not supply too large a GFN range, or the | |
386 | * operation can cause a soft lockup. | |
faaf05b0 BG |
387 | */ |
388 | static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, | |
063afacd | 389 | gfn_t start, gfn_t end, bool can_yield) |
faaf05b0 BG |
390 | { |
391 | struct tdp_iter iter; | |
392 | bool flush_needed = false; | |
393 | ||
394 | tdp_root_for_each_pte(iter, root, start, end) { | |
395 | if (!is_shadow_present_pte(iter.old_spte)) | |
396 | continue; | |
397 | ||
398 | /* | |
399 | * If this is a non-last-level SPTE that covers a larger range | |
400 | * than should be zapped, continue, and zap the mappings at a | |
401 | * lower level. | |
402 | */ | |
403 | if ((iter.gfn < start || | |
404 | iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) && | |
405 | !is_last_spte(iter.old_spte, iter.level)) | |
406 | continue; | |
407 | ||
408 | tdp_mmu_set_spte(kvm, &iter, 0); | |
409 | ||
063afacd BG |
410 | if (can_yield) |
411 | flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter); | |
412 | else | |
413 | flush_needed = true; | |
faaf05b0 BG |
414 | } |
415 | return flush_needed; | |
416 | } | |
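The partial-overlap test above is worth spelling out: a higher-level SPTE covers `KVM_PAGES_PER_HPAGE(level)` GFNs and may only be zapped wholesale when that whole span lies inside [start, end). A stand-alone sketch of the arithmetic (hypothetical `pages_per_hpage` helper, assuming the x86 4K/2M/1G geometry):

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t gfn_t;

/* 512 entries per table level: 4K pages at level 1, 2M at 2, 1G at 3. */
static gfn_t pages_per_hpage(int level)
{
	return 1ULL << ((level - 1) * 9);
}

/* A huge/non-leaf SPTE may only be zapped wholesale if the GFN span it
 * covers sits entirely inside [start, end); otherwise descend a level. */
static bool must_descend(gfn_t spte_gfn, int level, gfn_t start, gfn_t end)
{
	return spte_gfn < start || spte_gfn + pages_per_hpage(level) > end;
}

int main(void)
{
	/* A 2M mapping at GFN 0x200 straddles the start of [0x300, 0x800). */
	printf("%d\n", must_descend(0x200, 2, 0x300, 0x800)); /* 1: descend */
	printf("%d\n", must_descend(0x400, 2, 0x300, 0x800)); /* 0: zap whole */
	return 0;
}
```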
417 | ||
418 | /* | |
419 | * Tears down the mappings for the range of gfns, [start, end), and frees the | |
420 | * non-root pages mapping GFNs strictly within that range. Returns true if | |
421 | * SPTEs have been cleared and a TLB flush is needed before releasing the | |
422 | * MMU lock. | |
423 | */ | |
424 | bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end) | |
425 | { | |
426 | struct kvm_mmu_page *root; | |
427 | bool flush = false; | |
428 | ||
429 | for_each_tdp_mmu_root(kvm, root) { | |
430 | /* | |
431 | * Take a reference on the root so that it cannot be freed if | |
432 | * this thread releases the MMU lock and yields in this loop. | |
433 | */ | |
434 | kvm_mmu_get_root(kvm, root); | |
435 | ||
063afacd | 436 | flush |= zap_gfn_range(kvm, root, start, end, true); |
faaf05b0 BG |
437 | |
438 | kvm_mmu_put_root(kvm, root); | |
439 | } | |
440 | ||
441 | return flush; | |
442 | } | |
443 | ||
444 | void kvm_tdp_mmu_zap_all(struct kvm *kvm) | |
445 | { | |
446 | gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT); | |
447 | bool flush; | |
448 | ||
449 | flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn); | |
450 | if (flush) | |
451 | kvm_flush_remote_tlbs(kvm); | |
452 | } | |
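For reference, the `max_gfn` bound used here and in kvm_tdp_mmu_free_root() is simply the machine's physical address width minus the page shift. A tiny stand-alone example (46 physical bits is an assumed value; the real code reads it from boot_cpu_data, which comes from CPUID):

```c
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	/* Assumed physical address width for illustration. */
	int x86_phys_bits = 46;
	uint64_t max_gfn = 1ULL << (x86_phys_bits - PAGE_SHIFT);

	/* With 46 physical bits and 4K pages, GFNs run up to 2^34. */
	printf("max_gfn = %#llx\n", (unsigned long long)max_gfn);
	return 0;
}
```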
bb18842e BG |
453 | |
454 | /* | |
455 | * Installs a last-level SPTE to handle a TDP page fault. | |
456 | * (NPT/EPT violation/misconfiguration) | |
457 | */ | |
458 | static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, | |
459 | int map_writable, | |
460 | struct tdp_iter *iter, | |
461 | kvm_pfn_t pfn, bool prefault) | |
462 | { | |
463 | u64 new_spte; | |
464 | int ret = 0; | |
465 | int make_spte_ret = 0; | |
466 | ||
467 | if (unlikely(is_noslot_pfn(pfn))) { | |
468 | new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); | |
469 | trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte); | |
470 | } else | |
471 | make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn, | |
472 | pfn, iter->old_spte, prefault, true, | |
473 | map_writable, !shadow_accessed_mask, | |
474 | &new_spte); | |
475 | ||
476 | if (new_spte == iter->old_spte) | |
477 | ret = RET_PF_SPURIOUS; | |
478 | else | |
479 | tdp_mmu_set_spte(vcpu->kvm, iter, new_spte); | |
480 | ||
481 | /* | |
482 | * If the page fault was caused by a write but the page is write | |
483 | * protected, emulation is needed. If the emulation was skipped, | |
484 | * the vCPU would have the same fault again. | |
485 | */ | |
486 | if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) { | |
487 | if (write) | |
488 | ret = RET_PF_EMULATE; | |
489 | kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); | |
490 | } | |
491 | ||
492 | /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ | |
493 | if (unlikely(is_mmio_spte(new_spte))) | |
494 | ret = RET_PF_EMULATE; | |
495 | ||
496 | trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep); | |
497 | if (!prefault) | |
498 | vcpu->stat.pf_fixed++; | |
499 | ||
500 | return ret; | |
501 | } | |
502 | ||
503 | /* | |
504 | * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing | |
505 | * page tables and SPTEs to translate the faulting guest physical address. | |
506 | */ | |
507 | int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, | |
508 | int map_writable, int max_level, kvm_pfn_t pfn, | |
509 | bool prefault) | |
510 | { | |
511 | bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); | |
512 | bool write = error_code & PFERR_WRITE_MASK; | |
513 | bool exec = error_code & PFERR_FETCH_MASK; | |
514 | bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; | |
515 | struct kvm_mmu *mmu = vcpu->arch.mmu; | |
516 | struct tdp_iter iter; | |
89c0fd49 | 517 | struct kvm_mmu_page *sp; |
bb18842e BG |
518 | u64 *child_pt; |
519 | u64 new_spte; | |
520 | int ret; | |
521 | gfn_t gfn = gpa >> PAGE_SHIFT; | |
522 | int level; | |
523 | int req_level; | |
524 | ||
525 | if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) | |
526 | return RET_PF_RETRY; | |
527 | if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))) | |
528 | return RET_PF_RETRY; | |
529 | ||
530 | level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, | |
531 | huge_page_disallowed, &req_level); | |
532 | ||
533 | trace_kvm_mmu_spte_requested(gpa, level, pfn); | |
534 | tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { | |
535 | if (nx_huge_page_workaround_enabled) | |
536 | disallowed_hugepage_adjust(iter.old_spte, gfn, | |
537 | iter.level, &pfn, &level); | |
538 | ||
539 | if (iter.level == level) | |
540 | break; | |
541 | ||
542 | /* | |
543 | * If there is an SPTE mapping a large page at a higher level | |
544 | * than the target, that SPTE must be cleared and replaced | |
545 | * with a non-leaf SPTE. | |
546 | */ | |
547 | if (is_shadow_present_pte(iter.old_spte) && | |
548 | is_large_pte(iter.old_spte)) { | |
549 | tdp_mmu_set_spte(vcpu->kvm, &iter, 0); | |
550 | ||
551 | kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn, | |
552 | KVM_PAGES_PER_HPAGE(iter.level)); | |
553 | ||
554 | /* | |
555 | * The iter must explicitly re-read the spte here | |
556 | * because the new value informs the !present | |
557 | * path below. | |
558 | */ | |
559 | iter.old_spte = READ_ONCE(*iter.sptep); | |
560 | } | |
561 | ||
562 | if (!is_shadow_present_pte(iter.old_spte)) { | |
89c0fd49 BG |
563 | sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level); |
564 | list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages); | |
565 | child_pt = sp->spt; | |
bb18842e BG |
566 | clear_page(child_pt); |
567 | new_spte = make_nonleaf_spte(child_pt, | |
568 | !shadow_accessed_mask); | |
569 | ||
570 | trace_kvm_mmu_get_page(sp, true); | |
571 | tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte); | |
572 | } | |
573 | } | |
574 | ||
575 | if (WARN_ON(iter.level != level)) | |
576 | return RET_PF_RETRY; | |
577 | ||
578 | ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter, | |
579 | pfn, prefault); | |
580 | ||
581 | return ret; | |
582 | } | |
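For intuition about the walk in kvm_tdp_mmu_map(): each step down the paging structure translates nine more bits of the GFN, so the index into the page table at a given level is a 9-bit field. A stand-alone sketch (the `spte_index` helper is hypothetical and only mirrors the role of the iterator's index computation):

```c
#include <stdint.h>
#include <stdio.h>

/* Each level translates 9 bits of the GFN; the index into the page
 * table at a given level is the corresponding 9-bit field. */
static int spte_index(uint64_t gfn, int level)
{
	return (gfn >> ((level - 1) * 9)) & 0x1ff;
}

int main(void)
{
	uint64_t gfn = 0xfeedbeef;
	int level;

	for (level = 4; level >= 1; level--)
		printf("level %d -> index %d\n", level, spte_index(gfn, level));
	return 0;
}
```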
063afacd BG |
583 | |
584 | static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start, | |
585 | unsigned long end, unsigned long data, | |
586 | int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot, | |
587 | struct kvm_mmu_page *root, gfn_t start, | |
588 | gfn_t end, unsigned long data)) | |
589 | { | |
590 | struct kvm_memslots *slots; | |
591 | struct kvm_memory_slot *memslot; | |
592 | struct kvm_mmu_page *root; | |
593 | int ret = 0; | |
594 | int as_id; | |
595 | ||
596 | for_each_tdp_mmu_root(kvm, root) { | |
597 | /* | |
598 | * Take a reference on the root so that it cannot be freed if | |
599 | * this thread releases the MMU lock and yields in this loop. | |
600 | */ | |
601 | kvm_mmu_get_root(kvm, root); | |
602 | ||
603 | as_id = kvm_mmu_page_as_id(root); | |
604 | slots = __kvm_memslots(kvm, as_id); | |
605 | kvm_for_each_memslot(memslot, slots) { | |
606 | unsigned long hva_start, hva_end; | |
607 | gfn_t gfn_start, gfn_end; | |
608 | ||
609 | hva_start = max(start, memslot->userspace_addr); | |
610 | hva_end = min(end, memslot->userspace_addr + | |
611 | (memslot->npages << PAGE_SHIFT)); | |
612 | if (hva_start >= hva_end) | |
613 | continue; | |
614 | /* | |
615 | * {gfn(page) | page intersects with [hva_start, hva_end)} = | |
616 | * {gfn_start, gfn_start+1, ..., gfn_end-1}. | |
617 | */ | |
618 | gfn_start = hva_to_gfn_memslot(hva_start, memslot); | |
619 | gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); | |
620 | ||
621 | ret |= handler(kvm, memslot, root, gfn_start, | |
622 | gfn_end, data); | |
623 | } | |
624 | ||
625 | kvm_mmu_put_root(kvm, root); | |
626 | } | |
627 | ||
628 | return ret; | |
629 | } | |
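The hva-to-gfn clamping above deserves a worked example. The sketch below (a simplified stand-in for struct kvm_memory_slot with a hypothetical field subset) shows why adding `PAGE_SIZE - 1` to `hva_end` makes `gfn_end` exclusive while still covering a partially intersected final page:

```c
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Simplified stand-in for struct kvm_memory_slot; illustration only. */
struct memslot {
	unsigned long userspace_addr;	/* HVA where the slot starts */
	uint64_t base_gfn;		/* GFN the slot maps to */
	unsigned long npages;
};

static uint64_t hva_to_gfn(unsigned long hva, const struct memslot *slot)
{
	return slot->base_gfn + ((hva - slot->userspace_addr) >> PAGE_SHIFT);
}

int main(void)
{
	struct memslot slot = { 0x7f0000000000UL, 0x100000, 512 };
	unsigned long hva_start = 0x7f0000003800UL;	/* inside page 3 */
	unsigned long hva_end   = 0x7f0000005000UL;	/* exclusive */

	/* Rounding hva_end up by PAGE_SIZE - 1 makes gfn_end exclusive
	 * and still covers the partially intersected final page. */
	uint64_t gfn_start = hva_to_gfn(hva_start, &slot);
	uint64_t gfn_end   = hva_to_gfn(hva_end + PAGE_SIZE - 1, &slot);

	printf("[%#llx, %#llx)\n", (unsigned long long)gfn_start,
	       (unsigned long long)gfn_end);	/* [0x100003, 0x100005) */
	return 0;
}
```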
630 | ||
631 | static int zap_gfn_range_hva_wrapper(struct kvm *kvm, | |
632 | struct kvm_memory_slot *slot, | |
633 | struct kvm_mmu_page *root, gfn_t start, | |
634 | gfn_t end, unsigned long unused) | |
635 | { | |
636 | return zap_gfn_range(kvm, root, start, end, false); | |
637 | } | |
638 | ||
639 | int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, | |
640 | unsigned long end) | |
641 | { | |
642 | return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0, | |
643 | zap_gfn_range_hva_wrapper); | |
644 | } | |
f8e14497 BG |
645 | |
646 | /* | |
647 | * Mark the SPTEs mapping GFNs in the range [start, end) unaccessed and | |
648 | * return non-zero if any of them have been accessed. | |
649 | */ | |
650 | static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, | |
651 | struct kvm_mmu_page *root, gfn_t start, gfn_t end, | |
652 | unsigned long unused) | |
653 | { | |
654 | struct tdp_iter iter; | |
655 | int young = 0; | |
656 | u64 new_spte = 0; | |
657 | ||
658 | tdp_root_for_each_leaf_pte(iter, root, start, end) { | |
659 | /* | |
660 | * If we have a non-accessed entry we don't need to change the | |
661 | * pte. | |
662 | */ | |
663 | if (!is_accessed_spte(iter.old_spte)) | |
664 | continue; | |
665 | ||
666 | new_spte = iter.old_spte; | |
667 | ||
668 | if (spte_ad_enabled(new_spte)) { | |
669 | clear_bit((ffs(shadow_accessed_mask) - 1), | |
670 | (unsigned long *)&new_spte); | |
671 | } else { | |
672 | /* | |
673 | * Capture the dirty status of the page, so that it doesn't get | |
674 | * lost when the SPTE is marked for access tracking. | |
675 | */ | |
676 | if (is_writable_pte(new_spte)) | |
677 | kvm_set_pfn_dirty(spte_to_pfn(new_spte)); | |
678 | ||
679 | new_spte = mark_spte_for_access_track(new_spte); | |
680 | } | |
a6a0b05d | 681 | new_spte &= ~shadow_dirty_mask; |
f8e14497 BG |
682 | |
683 | tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte); | |
684 | young = 1; | |
685 | } | |
686 | ||
687 | return young; | |
688 | } | |
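The `clear_bit(ffs(shadow_accessed_mask) - 1, ...)` idiom above converts a single-bit mask into the bit index that the bitop helpers expect. A stand-alone equivalent using POSIX ffs() (the bit position is an assumed example; the real shadow_accessed_mask depends on the paging mode):

```c
#include <stdint.h>
#include <stdio.h>
#include <strings.h>	/* ffs() */

/* Assumed accessed-bit position for illustration, e.g. bit 8 as in the
 * EPT A/D format; the real mask varies with the paging mode. */
#define SHADOW_ACCESSED_MASK (1ULL << 8)

int main(void)
{
	uint64_t spte = 0xabc | SHADOW_ACCESSED_MASK;

	/* ffs() is 1-based, so ffs(mask) - 1 is the bit number that
	 * clear_bit()-style helpers expect. */
	int bit = ffs((int)SHADOW_ACCESSED_MASK) - 1;

	spte &= ~(1ULL << bit);
	printf("cleared bit %d: spte = %#llx\n", bit,
	       (unsigned long long)spte);
	return 0;
}
```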
689 | ||
690 | int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start, | |
691 | unsigned long end) | |
692 | { | |
693 | return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0, | |
694 | age_gfn_range); | |
695 | } | |
696 | ||
697 | static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, | |
698 | struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused, | |
699 | unsigned long unused2) | |
700 | { | |
701 | struct tdp_iter iter; | |
702 | ||
703 | tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) | |
704 | if (is_accessed_spte(iter.old_spte)) | |
705 | return 1; | |
706 | ||
707 | return 0; | |
708 | } | |
709 | ||
710 | int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva) | |
711 | { | |
712 | return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0, | |
713 | test_age_gfn); | |
714 | } | |
1d8dd6b3 BG |
715 | |
716 | /* | |
717 | * Handle the changed_pte MMU notifier for the TDP MMU. | |
718 | * data is a pointer to the new pte_t mapping the HVA specified by the MMU | |
719 | * notifier. | |
720 | * Returns zero; any TLB flush needed is performed before returning. | |
721 | */ | |
722 | static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, | |
723 | struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused, | |
724 | unsigned long data) | |
725 | { | |
726 | struct tdp_iter iter; | |
727 | pte_t *ptep = (pte_t *)data; | |
728 | kvm_pfn_t new_pfn; | |
729 | u64 new_spte; | |
730 | int need_flush = 0; | |
731 | ||
732 | WARN_ON(pte_huge(*ptep)); | |
733 | ||
734 | new_pfn = pte_pfn(*ptep); | |
735 | ||
736 | tdp_root_for_each_pte(iter, root, gfn, gfn + 1) { | |
737 | if (iter.level != PG_LEVEL_4K) | |
738 | continue; | |
739 | ||
740 | if (!is_shadow_present_pte(iter.old_spte)) | |
741 | break; | |
742 | ||
743 | tdp_mmu_set_spte(kvm, &iter, 0); | |
744 | ||
745 | kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1); | |
746 | ||
747 | if (!pte_write(*ptep)) { | |
748 | new_spte = kvm_mmu_changed_pte_notifier_make_spte( | |
749 | iter.old_spte, new_pfn); | |
750 | ||
751 | tdp_mmu_set_spte(kvm, &iter, new_spte); | |
752 | } | |
753 | ||
754 | need_flush = 1; | |
755 | } | |
756 | ||
757 | if (need_flush) | |
758 | kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); | |
759 | ||
760 | return 0; | |
761 | } | |
762 | ||
763 | int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address, | |
764 | pte_t *host_ptep) | |
765 | { | |
766 | return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1, | |
767 | (unsigned long)host_ptep, | |
768 | set_tdp_spte); | |
769 | } | |
770 | ||
a6a0b05d BG |
771 | /* |
772 | * Remove write access from all the SPTEs mapping GFNs [start, end). Only | |
773 | * SPTEs at or above min_level are write-protected. | |
774 | * Returns true if an SPTE has been changed and the TLBs need to be flushed. | |
775 | */ | |
776 | static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, | |
777 | gfn_t start, gfn_t end, int min_level) | |
778 | { | |
779 | struct tdp_iter iter; | |
780 | u64 new_spte; | |
781 | bool spte_set = false; | |
782 | ||
783 | BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); | |
784 | ||
785 | for_each_tdp_pte_min_level(iter, root->spt, root->role.level, | |
786 | min_level, start, end) { | |
787 | if (!is_shadow_present_pte(iter.old_spte) || | |
788 | !is_last_spte(iter.old_spte, iter.level)) | |
789 | continue; | |
790 | ||
791 | new_spte = iter.old_spte & ~PT_WRITABLE_MASK; | |
792 | ||
793 | tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); | |
794 | spte_set = true; | |
795 | ||
796 | tdp_mmu_iter_cond_resched(kvm, &iter); | |
797 | } | |
798 | return spte_set; | |
799 | } | |
800 | ||
801 | /* | |
802 | * Remove write access from all the SPTEs mapping GFNs in the memslot. Will | |
803 | * only affect leaf SPTEs down to min_level. | |
804 | * Returns true if an SPTE has been changed and the TLBs need to be flushed. | |
805 | */ | |
806 | bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, | |
807 | int min_level) | |
808 | { | |
809 | struct kvm_mmu_page *root; | |
810 | int root_as_id; | |
811 | bool spte_set = false; | |
812 | ||
813 | for_each_tdp_mmu_root(kvm, root) { | |
814 | root_as_id = kvm_mmu_page_as_id(root); | |
815 | if (root_as_id != slot->as_id) | |
816 | continue; | |
817 | ||
818 | /* | |
819 | * Take a reference on the root so that it cannot be freed if | |
820 | * this thread releases the MMU lock and yields in this loop. | |
821 | */ | |
822 | kvm_mmu_get_root(kvm, root); | |
823 | ||
824 | spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, | |
825 | slot->base_gfn + slot->npages, min_level); | |
826 | ||
827 | kvm_mmu_put_root(kvm, root); | |
828 | } | |
829 | ||
830 | return spte_set; | |
831 | } | |
832 | ||
833 | /* | |
834 | * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If | |
835 | * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. | |
836 | * If AD bits are not enabled, this will require clearing the writable bit on | |
837 | * each SPTE. Returns true if an SPTE has been changed and the TLBs need to | |
838 | * be flushed. | |
839 | */ | |
840 | static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, | |
841 | gfn_t start, gfn_t end) | |
842 | { | |
843 | struct tdp_iter iter; | |
844 | u64 new_spte; | |
845 | bool spte_set = false; | |
846 | ||
847 | tdp_root_for_each_leaf_pte(iter, root, start, end) { | |
848 | if (spte_ad_need_write_protect(iter.old_spte)) { | |
849 | if (is_writable_pte(iter.old_spte)) | |
850 | new_spte = iter.old_spte & ~PT_WRITABLE_MASK; | |
851 | else | |
852 | continue; | |
853 | } else { | |
854 | if (iter.old_spte & shadow_dirty_mask) | |
855 | new_spte = iter.old_spte & ~shadow_dirty_mask; | |
856 | else | |
857 | continue; | |
858 | } | |
859 | ||
860 | tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); | |
861 | spte_set = true; | |
862 | ||
863 | tdp_mmu_iter_cond_resched(kvm, &iter); | |
864 | } | |
865 | return spte_set; | |
866 | } | |
867 | ||
868 | /* | |
869 | * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If | |
870 | * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. | |
871 | * If AD bits are not enabled, this will require clearing the writable bit on | |
872 | * each SPTE. Returns true if an SPTE has been changed and the TLBs need to | |
873 | * be flushed. | |
874 | */ | |
875 | bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot) | |
876 | { | |
877 | struct kvm_mmu_page *root; | |
878 | int root_as_id; | |
879 | bool spte_set = false; | |
880 | ||
881 | for_each_tdp_mmu_root(kvm, root) { | |
882 | root_as_id = kvm_mmu_page_as_id(root); | |
883 | if (root_as_id != slot->as_id) | |
884 | continue; | |
885 | ||
886 | /* | |
887 | * Take a reference on the root so that it cannot be freed if | |
888 | * this thread releases the MMU lock and yields in this loop. | |
889 | */ | |
890 | kvm_mmu_get_root(kvm, root); | |
891 | ||
892 | spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, | |
893 | slot->base_gfn + slot->npages); | |
894 | ||
895 | kvm_mmu_put_root(kvm, root); | |
896 | } | |
897 | ||
898 | return spte_set; | |
899 | } | |
900 | ||
901 | /* | |
902 | * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is | |
903 | * set in mask, starting at gfn. The given memslot is expected to contain all | |
904 | * the GFNs represented by set bits in the mask. If AD bits are enabled, | |
905 | * clearing the dirty status will involve clearing the dirty bit on each SPTE | |
906 | * or, if AD bits are not enabled, clearing the writable bit on each SPTE. | |
907 | */ | |
908 | static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, | |
909 | gfn_t gfn, unsigned long mask, bool wrprot) | |
910 | { | |
911 | struct tdp_iter iter; | |
912 | u64 new_spte; | |
913 | ||
914 | tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), | |
915 | gfn + BITS_PER_LONG) { | |
916 | if (!mask) | |
917 | break; | |
918 | ||
919 | if (iter.level > PG_LEVEL_4K || | |
920 | !(mask & (1UL << (iter.gfn - gfn)))) | |
921 | continue; | |
922 | ||
923 | if (wrprot || spte_ad_need_write_protect(iter.old_spte)) { | |
924 | if (is_writable_pte(iter.old_spte)) | |
925 | new_spte = iter.old_spte & ~PT_WRITABLE_MASK; | |
926 | else | |
927 | continue; | |
928 | } else { | |
929 | if (iter.old_spte & shadow_dirty_mask) | |
930 | new_spte = iter.old_spte & ~shadow_dirty_mask; | |
931 | else | |
932 | continue; | |
933 | } | |
934 | ||
935 | tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); | |
936 | ||
937 | mask &= ~(1UL << (iter.gfn - gfn)); | |
938 | } | |
939 | } | |
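The mask bookkeeping in clear_dirty_pt_masked() can be checked in isolation: the walk starts at the lowest set bit, each handled GFN clears its bit, and the loop stops as soon as the mask is empty. A stand-alone sketch, using __builtin_ctzl in place of the kernel's __ffs():

```c
#include <stdint.h>
#include <stdio.h>

typedef uint64_t gfn_t;

int main(void)
{
	gfn_t gfn = 0x1000;		/* base GFN of the 64-page window */
	unsigned long mask = 0x25;	/* dirty bits for GFN offsets 0, 2, 5 */
	gfn_t g;

	/* Start at the lowest set bit, as the kernel loop does via __ffs(),
	 * and stop as soon as every requested bit has been handled. */
	for (g = gfn + __builtin_ctzl(mask); mask && g < gfn + 64; g++) {
		if (!(mask & (1UL << (g - gfn))))
			continue;
		printf("clear dirty state of gfn %#llx\n",
		       (unsigned long long)g);
		mask &= ~(1UL << (g - gfn));	/* done with this GFN */
	}
	return 0;
}
```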
940 | ||
941 | /* | |
942 | * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is | |
943 | * set in mask, starting at gfn. The given memslot is expected to contain all | |
944 | * the GFNs represented by set bits in the mask. If AD bits are enabled, | |
945 | * clearing the dirty status will involve clearing the dirty bit on each SPTE | |
946 | * or, if AD bits are not enabled, clearing the writable bit on each SPTE. | |
947 | */ | |
948 | void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, | |
949 | struct kvm_memory_slot *slot, | |
950 | gfn_t gfn, unsigned long mask, | |
951 | bool wrprot) | |
952 | { | |
953 | struct kvm_mmu_page *root; | |
954 | int root_as_id; | |
955 | ||
956 | lockdep_assert_held(&kvm->mmu_lock); | |
957 | for_each_tdp_mmu_root(kvm, root) { | |
958 | root_as_id = kvm_mmu_page_as_id(root); | |
959 | if (root_as_id != slot->as_id) | |
960 | continue; | |
961 | ||
962 | clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); | |
963 | } | |
964 | } | |
965 | ||
966 | /* | |
967 | * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is | |
968 | * only used for PML, and so will involve setting the dirty bit on each SPTE. | |
969 | * Returns true if an SPTE has been changed and the TLBs need to be flushed. | |
970 | */ | |
971 | static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, | |
972 | gfn_t start, gfn_t end) | |
973 | { | |
974 | struct tdp_iter iter; | |
975 | u64 new_spte; | |
976 | bool spte_set = false; | |
977 | ||
978 | tdp_root_for_each_pte(iter, root, start, end) { | |
979 | if (!is_shadow_present_pte(iter.old_spte)) | |
980 | continue; | |
981 | ||
982 | new_spte = iter.old_spte | shadow_dirty_mask; | |
983 | ||
984 | tdp_mmu_set_spte(kvm, &iter, new_spte); | |
985 | spte_set = true; | |
986 | ||
987 | tdp_mmu_iter_cond_resched(kvm, &iter); | |
988 | } | |
989 | ||
990 | return spte_set; | |
991 | } | |
992 | ||
993 | /* | |
994 | * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is | |
995 | * only used for PML, and so will involve setting the dirty bit on each SPTE. | |
996 | * Returns true if an SPTE has been changed and the TLBs need to be flushed. | |
997 | */ | |
998 | bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) | |
999 | { | |
1000 | struct kvm_mmu_page *root; | |
1001 | int root_as_id; | |
1002 | bool spte_set = false; | |
1003 | ||
1004 | for_each_tdp_mmu_root(kvm, root) { | |
1005 | root_as_id = kvm_mmu_page_as_id(root); | |
1006 | if (root_as_id != slot->as_id) | |
1007 | continue; | |
1008 | ||
1009 | /* | |
1010 | * Take a reference on the root so that it cannot be freed if | |
1011 | * this thread releases the MMU lock and yields in this loop. | |
1012 | */ | |
1013 | kvm_mmu_get_root(kvm, root); | |
1014 | ||
1015 | spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn, | |
1016 | slot->base_gfn + slot->npages); | |
1017 | ||
1018 | kvm_mmu_put_root(kvm, root); | |
1019 | } | |
1020 | return spte_set; | |
1021 | } | |
1022 |