// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables kernel and guest-mode vCPU access to guest physical
 * memory with suitable invalidation mechanisms.
 *
 * Copyright © 2021 Amazon.com, Inc. or its affiliates.
 *
 * Authors:
 *   David Woodhouse <dwmw2@infradead.org>
 */

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/errno.h>

#include "kvm_mm.h"
/*
 * MMU notifier 'invalidate_range_start' hook.
 */
void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
				       unsigned long end, bool may_block)
{
	DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
	struct gfn_to_pfn_cache *gpc;
	bool evict_vcpus = false;

	spin_lock(&kvm->gpc_lock);
	list_for_each_entry(gpc, &kvm->gpc_list, list) {
		write_lock_irq(&gpc->lock);

		/* Only a single page so no need to care about length */
		if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
		    gpc->uhva >= start && gpc->uhva < end) {
			gpc->valid = false;

			/*
			 * If a guest vCPU could be using the physical address,
			 * it needs to be forced out of guest mode.
			 */
			if (gpc->usage & KVM_GUEST_USES_PFN) {
				if (!evict_vcpus) {
					evict_vcpus = true;
					bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
				}
				__set_bit(gpc->vcpu->vcpu_idx, vcpu_bitmap);
			}
		}
		write_unlock_irq(&gpc->lock);
	}
	spin_unlock(&kvm->gpc_lock);

	if (evict_vcpus) {
		/*
		 * KVM needs to ensure the vCPU is fully out of guest context
		 * before allowing the invalidation to continue.
		 */
		unsigned int req = KVM_REQ_OUTSIDE_GUEST_MODE;
		bool called;

		/*
		 * If the OOM reaper is active, then all vCPUs should have
		 * been stopped already, so perform the request without
		 * KVM_REQUEST_WAIT and be sad if any needed to be IPI'd.
		 */
		if (!may_block)
			req &= ~KVM_REQUEST_WAIT;

		called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap);

		WARN_ON_ONCE(called && !may_block);
	}
}
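
/*
 * For context (informational, not part of this file's logic): the hook above
 * is invoked from kvm_mmu_notifier_invalidate_range_start() in
 * virt/kvm/kvm_main.c, after mn_active_invalidate_count has been elevated
 * and before the secondary MMUs are invalidated.  A rough sketch of that
 * call site, under those assumptions:
 *
 *	kvm->mn_active_invalidate_count++;
 *	...
 *	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
 *					  hva_range.may_block);
 *	...	// then invalidate the secondary MMUs
 *
 * That ordering is what lets mmu_notifier_retry_cache() below rely on
 * mn_active_invalidate_count being non-zero while an invalidation runs.
 */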

bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len)
{
	struct kvm_memslots *slots = kvm_memslots(gpc->kvm);

	if (!gpc->active)
		return false;

	if ((gpa & ~PAGE_MASK) + len > PAGE_SIZE)
		return false;

	if (gpc->gpa != gpa || gpc->generation != slots->generation ||
	    kvm_is_error_hva(gpc->uhva))
		return false;

	if (!gpc->valid)
		return false;

	return true;
}
EXPORT_SYMBOL_GPL(kvm_gpc_check);
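
/*
 * Illustrative caller pattern (a sketch only, modeled on how the x86 Xen
 * emulation uses these helpers; the surrounding names are assumed):
 *
 *	read_lock_irqsave(&gpc->lock, flags);
 *	while (!kvm_gpc_check(gpc, gpa, PAGE_SIZE)) {
 *		read_unlock_irqrestore(&gpc->lock, flags);
 *
 *		if (kvm_gpc_refresh(gpc, gpa, PAGE_SIZE))
 *			goto out;	// mapping could not be established
 *
 *		read_lock_irqsave(&gpc->lock, flags);
 *	}
 *	// ... access the page through gpc->khva ...
 *	read_unlock_irqrestore(&gpc->lock, flags);
 *
 * check() is cheap and runs under the read lock; refresh() may sleep and
 * must be called with the lock dropped, hence the retry loop.
 */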

static void gpc_unmap_khva(kvm_pfn_t pfn, void *khva)
{
	/* Unmap the old pfn/page if it was mapped before. */
	if (!is_error_noslot_pfn(pfn) && khva) {
		if (pfn_valid(pfn))
			kunmap(pfn_to_page(pfn));
#ifdef CONFIG_HAS_IOMEM
		else
			memunmap(khva);
#endif
	}
}

static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq)
{
	/*
	 * mn_active_invalidate_count acts for all intents and purposes
	 * like mmu_invalidate_in_progress here; but the latter cannot
	 * be used here because the invalidation of caches in the
	 * mmu_notifier event occurs _before_ mmu_invalidate_in_progress
	 * is elevated.
	 *
	 * Note, it does not matter that mn_active_invalidate_count
	 * is not protected by gpc->lock.  It is guaranteed to
	 * be elevated before the mmu_notifier acquires gpc->lock, and
	 * isn't dropped until after mmu_invalidate_seq is updated.
	 */
	if (kvm->mn_active_invalidate_count)
		return true;

	/*
	 * Ensure mn_active_invalidate_count is read before
	 * mmu_invalidate_seq.  This pairs with the smp_wmb() in
	 * mmu_notifier_invalidate_range_end() to guarantee either the
	 * old (non-zero) value of mn_active_invalidate_count or the
	 * new (incremented) value of mmu_invalidate_seq is observed.
	 */
	smp_rmb();
	return kvm->mmu_invalidate_seq != mmu_seq;
}

static int hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
{
	/* Note, the new page offset may be different than the old! */
	void *old_khva = gpc->khva - offset_in_page(gpc->khva);
	kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
	void *new_khva = NULL;
	unsigned long mmu_seq;

	lockdep_assert_held(&gpc->refresh_lock);

	lockdep_assert_held_write(&gpc->lock);

	/*
	 * Invalidate the cache prior to dropping gpc->lock, the gpa=>uhva
	 * assets have already been updated and so a concurrent check() from a
	 * different task may not fail the gpa/uhva/generation checks.
	 */
	gpc->valid = false;

	do {
		mmu_seq = gpc->kvm->mmu_invalidate_seq;
		smp_rmb();

		write_unlock_irq(&gpc->lock);

		/*
		 * If the previous iteration "failed" due to an mmu_notifier
		 * event, release the pfn and unmap the kernel virtual address
		 * from the previous attempt.  Unmapping might sleep, so this
		 * needs to be done after dropping the lock.  Opportunistically
		 * check for resched while the lock isn't held.
		 */
		if (new_pfn != KVM_PFN_ERR_FAULT) {
			/*
			 * Keep the mapping if the previous iteration reused
			 * the existing mapping and didn't create a new one.
			 */
			if (new_khva != old_khva)
				gpc_unmap_khva(new_pfn, new_khva);

			kvm_release_pfn_clean(new_pfn);

			cond_resched();
		}

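		/*
		 * The argument list below follows the hva_to_pfn() prototype
		 * in kvm_mm.h as of this series (assumed here):
		 * (addr, atomic, interruptible, async, write_fault, writable).
		 */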
		/* We always request a writeable mapping */
		new_pfn = hva_to_pfn(gpc->uhva, false, false, NULL, true, NULL);
		if (is_error_noslot_pfn(new_pfn))
			goto out_error;

		/*
		 * Obtain a new kernel mapping if KVM itself will access the
		 * pfn.  Note, kmap() and memremap() can both sleep, so this
		 * too must be done outside of gpc->lock!
		 */
		if (gpc->usage & KVM_HOST_USES_PFN) {
			if (new_pfn == gpc->pfn) {
				new_khva = old_khva;
			} else if (pfn_valid(new_pfn)) {
				new_khva = kmap(pfn_to_page(new_pfn));
#ifdef CONFIG_HAS_IOMEM
			} else {
				new_khva = memremap(pfn_to_hpa(new_pfn), PAGE_SIZE, MEMREMAP_WB);
#endif
			}
			if (!new_khva) {
				kvm_release_pfn_clean(new_pfn);
				goto out_error;
			}
		}

		write_lock_irq(&gpc->lock);

		/*
		 * Other tasks must wait for _this_ refresh to complete before
		 * attempting to refresh.
		 */
		WARN_ON_ONCE(gpc->valid);
	} while (mmu_notifier_retry_cache(gpc->kvm, mmu_seq));

	gpc->valid = true;
	gpc->pfn = new_pfn;
	gpc->khva = new_khva + (gpc->gpa & ~PAGE_MASK);

	/*
	 * Put the reference to the _new_ pfn.  The pfn is now tracked by the
	 * cache and can be safely migrated, swapped, etc... as the cache will
	 * invalidate any mappings in response to relevant mmu_notifier events.
	 */
	kvm_release_pfn_clean(new_pfn);

	return 0;

out_error:
	write_lock_irq(&gpc->lock);

	return -EFAULT;
}
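
/*
 * Design note (informational): the retry loop above follows the classic
 * mmu_notifier seqcount pattern.  mmu_invalidate_seq is snapshotted (with
 * an smp_rmb() paired against the invalidation side) *before* gpc->lock is
 * dropped for the sleepable hva=>pfn conversion; once the lock is re-taken,
 * mmu_notifier_retry_cache() detects whether an invalidation ran in the
 * window and, if so, the conversion is redone.
 */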

int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len)
{
	struct kvm_memslots *slots = kvm_memslots(gpc->kvm);
	unsigned long page_offset = gpa & ~PAGE_MASK;
	bool unmap_old = false;
	unsigned long old_uhva;
	kvm_pfn_t old_pfn;
	void *old_khva;
	int ret;

	/*
	 * The request must fit within a single page.  The 'len' argument
	 * exists only to enforce that.
	 */
	if (page_offset + len > PAGE_SIZE)
		return -EINVAL;

	/*
	 * If another task is refreshing the cache, wait for it to complete.
	 * There is no guarantee that concurrent refreshes will see the same
	 * gpa, memslots generation, etc..., so they must be fully serialized.
	 */
	mutex_lock(&gpc->refresh_lock);

	write_lock_irq(&gpc->lock);

	if (!gpc->active) {
		ret = -EINVAL;
		goto out_unlock;
	}

	old_pfn = gpc->pfn;
	old_khva = gpc->khva - offset_in_page(gpc->khva);
	old_uhva = gpc->uhva;

	/* If the userspace HVA is invalid, refresh that first */
	if (gpc->gpa != gpa || gpc->generation != slots->generation ||
	    kvm_is_error_hva(gpc->uhva)) {
		gfn_t gfn = gpa_to_gfn(gpa);

		gpc->gpa = gpa;
		gpc->generation = slots->generation;
		gpc->memslot = __gfn_to_memslot(slots, gfn);
		gpc->uhva = gfn_to_hva_memslot(gpc->memslot, gfn);

		if (kvm_is_error_hva(gpc->uhva)) {
			ret = -EFAULT;
			goto out;
		}
	}

	/*
	 * If the userspace HVA changed or the PFN was already invalid,
	 * drop the lock and do the HVA to PFN lookup again.
	 */
	if (!gpc->valid || old_uhva != gpc->uhva) {
		ret = hva_to_pfn_retry(gpc);
	} else {
		/*
		 * If the HVA→PFN mapping was already valid, don't unmap it.
		 * But do update gpc->khva because the offset within the page
		 * may have changed.
		 */
		gpc->khva = old_khva + page_offset;
		old_pfn = KVM_PFN_ERR_FAULT;
		old_khva = NULL;
		ret = 0;
	}

out:
	/*
	 * Invalidate the cache and purge the pfn/khva if the refresh failed.
	 * Some/all of the uhva, gpa, and memslot generation info may still be
	 * valid, leave it as is.
	 */
	if (ret) {
		gpc->valid = false;
		gpc->pfn = KVM_PFN_ERR_FAULT;
		gpc->khva = NULL;
	}

	/* Detect a pfn change before dropping the lock! */
	unmap_old = (old_pfn != gpc->pfn);

out_unlock:
	write_unlock_irq(&gpc->lock);

	mutex_unlock(&gpc->refresh_lock);

	if (unmap_old)
		gpc_unmap_khva(old_pfn, old_khva);

	return ret;
}
EXPORT_SYMBOL_GPL(kvm_gpc_refresh);
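
/*
 * Locking recap for the refresh path above (informational only):
 * refresh_lock serializes whole refresh operations against each other and
 * against unmap(); gpc->lock (a rwlock, taken irq-safe) protects the fields
 * that check() reads.  hva_to_pfn_retry() drops and re-takes gpc->lock
 * around the sleepable hva=>pfn conversion, which is why gpc->valid is
 * cleared first and the result is re-checked via mmu_notifier_retry_cache().
 */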

void kvm_gpc_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
{
	void *old_khva;
	kvm_pfn_t old_pfn;

	mutex_lock(&gpc->refresh_lock);
	write_lock_irq(&gpc->lock);

	gpc->valid = false;

	old_khva = gpc->khva - offset_in_page(gpc->khva);
	old_pfn = gpc->pfn;

	/*
	 * We can leave the GPA → uHVA map cache intact but the PFN
	 * lookup will need to be redone even for the same page.
	 */
	gpc->khva = NULL;
	gpc->pfn = KVM_PFN_ERR_FAULT;

	write_unlock_irq(&gpc->lock);
	mutex_unlock(&gpc->refresh_lock);

	gpc_unmap_khva(old_pfn, old_khva);
}
EXPORT_SYMBOL_GPL(kvm_gpc_unmap);

void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
		  struct kvm_vcpu *vcpu, enum pfn_cache_usage usage)
{
	WARN_ON_ONCE(!usage || (usage & KVM_GUEST_AND_HOST_USE_PFN) != usage);
	WARN_ON_ONCE((usage & KVM_GUEST_USES_PFN) && !vcpu);

	rwlock_init(&gpc->lock);
	mutex_init(&gpc->refresh_lock);

	gpc->kvm = kvm;
	gpc->vcpu = vcpu;
	gpc->usage = usage;
}
EXPORT_SYMBOL_GPL(kvm_gpc_init);
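
/*
 * Typical lifecycle, as a sketch (the embedding structure and the usage
 * flag are illustrative; see kvm_gpc_activate()/kvm_gpc_deactivate() below):
 *
 *	kvm_gpc_init(&v->runstate_cache, kvm, vcpu, KVM_HOST_USES_PFN);
 *
 *	if (kvm_gpc_activate(&v->runstate_cache, gpa, len))
 *		return -EFAULT;		// maps the page and validates it
 *
 *	// ... kvm_gpc_check()/kvm_gpc_refresh()/gpc->khva accesses ...
 *
 *	kvm_gpc_deactivate(&v->runstate_cache);	// unmaps and unlinks
 */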

int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len)
{
	struct kvm *kvm = gpc->kvm;

	if (!gpc->active) {
		gpc->khva = NULL;
		gpc->pfn = KVM_PFN_ERR_FAULT;
		gpc->uhva = KVM_HVA_ERR_BAD;
		gpc->valid = false;

		spin_lock(&kvm->gpc_lock);
		list_add(&gpc->list, &kvm->gpc_list);
		spin_unlock(&kvm->gpc_lock);

		/*
		 * Activate the cache after adding it to the list, a concurrent
		 * refresh must not establish a mapping until the cache is
		 * reachable by mmu_notifier events.
		 */
		write_lock_irq(&gpc->lock);
		gpc->active = true;
		write_unlock_irq(&gpc->lock);
	}
	return kvm_gpc_refresh(gpc, gpa, len);
}
EXPORT_SYMBOL_GPL(kvm_gpc_activate);

void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
{
	struct kvm *kvm = gpc->kvm;

	if (gpc->active) {
		/*
		 * Deactivate the cache before removing it from the list, KVM
		 * must stall mmu_notifier events until all users go away, i.e.
		 * until gpc->lock is dropped and refresh is guaranteed to fail.
		 */
		write_lock_irq(&gpc->lock);
		gpc->active = false;
		write_unlock_irq(&gpc->lock);

		spin_lock(&kvm->gpc_lock);
		list_del(&gpc->list);
		spin_unlock(&kvm->gpc_lock);

		kvm_gpc_unmap(kvm, gpc);
	}
}
EXPORT_SYMBOL_GPL(kvm_gpc_deactivate);