kvm: x86/mmu: Allocate struct kvm_mmu_pages for all pages in TDP MMU
arch/x86/kvm/mmu/tdp_mmu.c
// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

static bool __read_mostly tdp_mmu_enabled = false;

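/*
 * Returns true if the TDP MMU can be used, i.e. hardware two-dimensional
 * paging is enabled and the TDP MMU is enabled. The TDP MMU is only
 * available on 64-bit hosts.
 */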
static bool is_tdp_mmu_enabled(void)
{
#ifdef CONFIG_X86_64
        return tdp_enabled && READ_ONCE(tdp_mmu_enabled);
#else
        return false;
#endif /* CONFIG_X86_64 */
}

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
        if (!is_tdp_mmu_enabled())
                return;

        /* This should not be changed for the lifetime of the VM. */
        kvm->arch.tdp_mmu_enabled = true;

        INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
        INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}

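/*
 * Tears down the TDP MMU for the VM. By this point all TDP MMU roots should
 * already have been freed.
 */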
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
        if (!kvm->arch.tdp_mmu_enabled)
                return;

        WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
}

#define for_each_tdp_mmu_root(_kvm, _root)                          \
        list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)

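/*
 * Returns true if the given root HPA points to a TDP MMU page that is in use
 * as a root.
 */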
bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
{
        struct kvm_mmu_page *sp;

        sp = to_shadow_page(hpa);

        return sp->tdp_mmu_page && sp->root_count;
}

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                          gfn_t start, gfn_t end);

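/*
 * Frees a TDP MMU root that no longer has any references: zaps all of its
 * mappings and releases the root page table and its metadata.
 */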
void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
        gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);

        lockdep_assert_held(&kvm->mmu_lock);

        WARN_ON(root->root_count);
        WARN_ON(!root->tdp_mmu_page);

        list_del(&root->link);

        zap_gfn_range(kvm, root, 0, max_gfn);

        free_page((unsigned long)root->spt);
        kmem_cache_free(mmu_page_header_cache, root);
}

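/*
 * Computes the page role used for TDP MMU pages at the given level. TDP MMU
 * pages are always direct-mapped with full access.
 */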
static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
                                                   int level)
{
        union kvm_mmu_page_role role;

        role = vcpu->arch.mmu->mmu_role.base;
        role.level = level;
        role.direct = true;
        role.gpte_is_8_bytes = true;
        role.access = ACC_ALL;

        return role;
}

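/*
 * Allocates a TDP MMU page table page and its struct kvm_mmu_page from the
 * vCPU's memory caches and initializes the page's metadata.
 */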
static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
                                               int level)
{
        struct kvm_mmu_page *sp;

        sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
        sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
        set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

        sp->role.word = page_role_for_level(vcpu, level).word;
        sp->gfn = gfn;
        sp->tdp_mmu_page = true;

        return sp;
}

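/*
 * Returns the TDP MMU root matching the vCPU's current MMU role, reusing an
 * existing root if one matches and allocating a new one otherwise.
 */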
static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
{
        union kvm_mmu_page_role role;
        struct kvm *kvm = vcpu->kvm;
        struct kvm_mmu_page *root;

        role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

        spin_lock(&kvm->mmu_lock);

        /* Check for an existing root before allocating a new one. */
        for_each_tdp_mmu_root(kvm, root) {
                if (root->role.word == role.word) {
                        kvm_mmu_get_root(kvm, root);
                        spin_unlock(&kvm->mmu_lock);
                        return root;
                }
        }

        root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
        root->root_count = 1;

        list_add(&root->link, &kvm->arch.tdp_mmu_roots);

        spin_unlock(&kvm->mmu_lock);

        return root;
}

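/*
 * Returns the physical address of the vCPU's TDP MMU root page table, getting
 * or creating the root as needed.
 */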
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu_page *root;

        root = get_tdp_mmu_vcpu_root(vcpu);
        if (!root)
                return INVALID_PAGE;

        return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                u64 old_spte, u64 new_spte, int level);

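/*
 * Returns the address space ID of the paging structure the given page belongs
 * to: 1 for SMM, 0 otherwise.
 */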
static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
{
        return sp->role.smm ? 1 : 0;
}

/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                u64 old_spte, u64 new_spte, int level)
{
        bool was_present = is_shadow_present_pte(old_spte);
        bool is_present = is_shadow_present_pte(new_spte);
        bool was_leaf = was_present && is_last_spte(old_spte, level);
        bool is_leaf = is_present && is_last_spte(new_spte, level);
        bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
        u64 *pt;
        struct kvm_mmu_page *sp;
        u64 old_child_spte;
        int i;

        WARN_ON(level > PT64_ROOT_MAX_LEVEL);
        WARN_ON(level < PG_LEVEL_4K);
        WARN_ON(gfn % KVM_PAGES_PER_HPAGE(level));

        /*
         * If this warning were to trigger it would indicate that there was a
         * missing MMU notifier or a race with some notifier handler.
         * A present, leaf SPTE should never be directly replaced with another
         * present leaf SPTE pointing to a different PFN. A notifier handler
         * should be zapping the SPTE before the main MM's page table is
         * changed, or the SPTE should be zeroed, and the TLBs flushed by the
         * thread before replacement.
         */
        if (was_leaf && is_leaf && pfn_changed) {
                pr_err("Invalid SPTE change: cannot replace a present leaf\n"
                       "SPTE with another present leaf SPTE mapping a\n"
                       "different PFN!\n"
                       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
                       as_id, gfn, old_spte, new_spte, level);

                /*
                 * Crash the host to prevent error propagation and guest data
                 * corruption.
                 */
                BUG();
        }

        if (old_spte == new_spte)
                return;

        /*
         * The only times a SPTE should be changed from a non-present to
         * non-present state is when an MMIO entry is installed/modified/
         * removed. In that case, there is nothing to do here.
         */
        if (!was_present && !is_present) {
                /*
                 * If this change does not involve an MMIO SPTE, it is
                 * unexpected. Log the change, though it should not impact the
                 * guest since both the former and current SPTEs are nonpresent.
                 */
                if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte)))
                        pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
                               "should not be replaced with another,\n"
                               "different nonpresent SPTE, unless one or both\n"
                               "are MMIO SPTEs.\n"
                               "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
                               as_id, gfn, old_spte, new_spte, level);
                return;
        }

        if (was_leaf && is_dirty_spte(old_spte) &&
            (!is_dirty_spte(new_spte) || pfn_changed))
                kvm_set_pfn_dirty(spte_to_pfn(old_spte));

        /*
         * Recursively handle child PTs if the change removed a subtree from
         * the paging structure.
         */
        if (was_present && !was_leaf && (pfn_changed || !is_present)) {
                pt = spte_to_child_pt(old_spte, level);
                sp = sptep_to_sp(pt);

                list_del(&sp->link);

                for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
                        old_child_spte = READ_ONCE(*(pt + i));
                        WRITE_ONCE(*(pt + i), 0);
                        handle_changed_spte(kvm, as_id,
                                gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)),
                                old_child_spte, 0, level - 1);
                }

                kvm_flush_remote_tlbs_with_address(kvm, gfn,
                                                   KVM_PAGES_PER_HPAGE(level));

                free_page((unsigned long)pt);
                kmem_cache_free(mmu_page_header_cache, sp);
        }
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                u64 old_spte, u64 new_spte, int level)
{
        __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level);
}

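/*
 * Sets the SPTE pointed to by the iterator to new_spte and performs the
 * bookkeeping required for the change. The caller must hold the MMU lock.
 */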
static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
                                    u64 new_spte)
{
        u64 *root_pt = tdp_iter_root_pt(iter);
        struct kvm_mmu_page *root = sptep_to_sp(root_pt);
        int as_id = kvm_mmu_page_as_id(root);

        *iter->sptep = new_spte;

        handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
                            iter->level);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
        for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)         \
        for_each_tdp_pte(_iter, __va(_mmu->root_hpa),           \
                         _mmu->shadow_root_level, _start, _end)

/*
 * Flush the TLBs and yield the MMU lock if the lock is contended or this
 * thread needs to reschedule. Returns whether the caller still needs to
 * flush the TLBs.
 */
static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
{
        if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
                kvm_flush_remote_tlbs(kvm);
                cond_resched_lock(&kvm->mmu_lock);
                tdp_iter_refresh_walk(iter);
                return false;
        } else {
                return true;
        }
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                          gfn_t start, gfn_t end)
{
        struct tdp_iter iter;
        bool flush_needed = false;

        tdp_root_for_each_pte(iter, root, start, end) {
                if (!is_shadow_present_pte(iter.old_spte))
                        continue;

                /*
                 * If this is a non-last-level SPTE that covers a larger range
                 * than should be zapped, continue, and zap the mappings at a
                 * lower level.
                 */
                if ((iter.gfn < start ||
                     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
                    !is_last_spte(iter.old_spte, iter.level))
                        continue;

                tdp_mmu_set_spte(kvm, &iter, 0);

                flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter);
        }
        return flush_needed;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
{
        struct kvm_mmu_page *root;
        bool flush = false;

        for_each_tdp_mmu_root(kvm, root) {
                /*
                 * Take a reference on the root so that it cannot be freed if
                 * this thread releases the MMU lock and yields in this loop.
                 */
                kvm_mmu_get_root(kvm, root);

                flush |= zap_gfn_range(kvm, root, start, end);

                kvm_mmu_put_root(kvm, root);
        }

        return flush;
}

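/* Zaps all SPTEs in all TDP MMU roots and flushes the TLBs if any were cleared. */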
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
        gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);
        bool flush;

        flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
        if (flush)
                kvm_flush_remote_tlbs(kvm);
}

/*
 * Installs a last-level SPTE to handle a TDP page fault
 * (NPT/EPT violation/misconfiguration).
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
                                          int map_writable,
                                          struct tdp_iter *iter,
                                          kvm_pfn_t pfn, bool prefault)
{
        u64 new_spte;
        int ret = 0;
        int make_spte_ret = 0;

        if (unlikely(is_noslot_pfn(pfn))) {
                new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
                trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte);
        } else
                make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
                                         pfn, iter->old_spte, prefault, true,
                                         map_writable, !shadow_accessed_mask,
                                         &new_spte);

        if (new_spte == iter->old_spte)
                ret = RET_PF_SPURIOUS;
        else
                tdp_mmu_set_spte(vcpu->kvm, iter, new_spte);

        /*
         * If the page fault was caused by a write but the page is write
         * protected, emulation is needed. If the emulation was skipped,
         * the vCPU would have the same fault again.
         */
        if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
                if (write)
                        ret = RET_PF_EMULATE;
                kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
        }

        /* If an MMIO SPTE is installed, the MMIO will need to be emulated. */
        if (unlikely(is_mmio_spte(new_spte)))
                ret = RET_PF_EMULATE;

        trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep);
        if (!prefault)
                vcpu->stat.pf_fixed++;

        return ret;
}

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
                    int map_writable, int max_level, kvm_pfn_t pfn,
                    bool prefault)
{
        bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
        bool write = error_code & PFERR_WRITE_MASK;
        bool exec = error_code & PFERR_FETCH_MASK;
        bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
        struct kvm_mmu *mmu = vcpu->arch.mmu;
        struct tdp_iter iter;
        struct kvm_mmu_page *sp;
        u64 *child_pt;
        u64 new_spte;
        int ret;
        gfn_t gfn = gpa >> PAGE_SHIFT;
        int level;
        int req_level;

        if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
                return RET_PF_RETRY;
        if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
                return RET_PF_RETRY;

        level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
                                        huge_page_disallowed, &req_level);

        trace_kvm_mmu_spte_requested(gpa, level, pfn);
        tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
                if (nx_huge_page_workaround_enabled)
                        disallowed_hugepage_adjust(iter.old_spte, gfn,
                                                   iter.level, &pfn, &level);

                if (iter.level == level)
                        break;

                /*
                 * If there is an SPTE mapping a large page at a higher level
                 * than the target, that SPTE must be cleared and replaced
                 * with a non-leaf SPTE.
                 */
                if (is_shadow_present_pte(iter.old_spte) &&
                    is_large_pte(iter.old_spte)) {
                        tdp_mmu_set_spte(vcpu->kvm, &iter, 0);

                        kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
                                        KVM_PAGES_PER_HPAGE(iter.level));

                        /*
                         * The iter must explicitly re-read the spte here
                         * because the new value informs the !present
                         * path below.
                         */
                        iter.old_spte = READ_ONCE(*iter.sptep);
                }

                if (!is_shadow_present_pte(iter.old_spte)) {
                        sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
                        list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages);
                        child_pt = sp->spt;
                        clear_page(child_pt);
                        new_spte = make_nonleaf_spte(child_pt,
                                                     !shadow_accessed_mask);

                        trace_kvm_mmu_get_page(sp, true);
                        tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte);
                }
        }

        if (WARN_ON(iter.level != level))
                return RET_PF_RETRY;

        ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
                                              pfn, prefault);

        return ret;
}