kvm: x86/mmu: Support dirty logging for the TDP MMU
arch/x86/kvm/mmu/tdp_mmu.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "mmu.h"
4 #include "mmu_internal.h"
5 #include "mmutrace.h"
6 #include "tdp_iter.h"
7 #include "tdp_mmu.h"
8 #include "spte.h"
9
10 static bool __read_mostly tdp_mmu_enabled = false;
11
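/*
 * The TDP MMU is only available on 64-bit builds, and only when hardware
 * two-dimensional paging (tdp_enabled) and the tdp_mmu_enabled switch above
 * are both set.
 */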
12 static bool is_tdp_mmu_enabled(void)
13 {
14 #ifdef CONFIG_X86_64
15         return tdp_enabled && READ_ONCE(tdp_mmu_enabled);
16 #else
17         return false;
18 #endif /* CONFIG_X86_64 */
19 }
20
21 /* Initializes the TDP MMU for the VM, if enabled. */
22 void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
23 {
24         if (!is_tdp_mmu_enabled())
25                 return;
26
27         /* This should not be changed for the lifetime of the VM. */
28         kvm->arch.tdp_mmu_enabled = true;
29
30         INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
31         INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
32 }
33
34 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
35 {
36         if (!kvm->arch.tdp_mmu_enabled)
37                 return;
38
39         WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
40 }
41
42 #define for_each_tdp_mmu_root(_kvm, _root)                          \
43         list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
44
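/* Returns true if the page table root at @hpa is an in-use TDP MMU root. */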
45 bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
46 {
47         struct kvm_mmu_page *sp;
48
49         sp = to_shadow_page(hpa);
50
51         return sp->tdp_mmu_page && sp->root_count;
52 }
53
54 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
55                           gfn_t start, gfn_t end, bool can_yield);
56
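/*
 * Free a root whose reference count has dropped to zero: remove it from the
 * VM's root list, zap the entire paging structure below it, and release the
 * root page itself. The caller must hold the MMU lock.
 */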
57 void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
58 {
59         gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);
60
61         lockdep_assert_held(&kvm->mmu_lock);
62
63         WARN_ON(root->root_count);
64         WARN_ON(!root->tdp_mmu_page);
65
66         list_del(&root->link);
67
68         zap_gfn_range(kvm, root, 0, max_gfn, false);
69
70         free_page((unsigned long)root->spt);
71         kmem_cache_free(mmu_page_header_cache, root);
72 }
73
74 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
75                                                    int level)
76 {
77         union kvm_mmu_page_role role;
78
79         role = vcpu->arch.mmu->mmu_role.base;
80         role.level = level;
81         role.direct = true;
82         role.gpte_is_8_bytes = true;
83         role.access = ACC_ALL;
84
85         return role;
86 }
87
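/*
 * Allocate a struct kvm_mmu_page and its page table page from the vCPU's
 * memory caches and initialize them for use by the TDP MMU at @level.
 */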
88 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
89                                                int level)
90 {
91         struct kvm_mmu_page *sp;
92
93         sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
94         sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
95         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
96
97         sp->role.word = page_role_for_level(vcpu, level).word;
98         sp->gfn = gfn;
99         sp->tdp_mmu_page = true;
100
101         return sp;
102 }
103
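/*
 * Return a root matching the vCPU's current MMU role, taking a reference on
 * an existing root if one is found, or allocating and registering a new one
 * otherwise.
 */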
104 static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
105 {
106         union kvm_mmu_page_role role;
107         struct kvm *kvm = vcpu->kvm;
108         struct kvm_mmu_page *root;
109
110         role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
111
112         spin_lock(&kvm->mmu_lock);
113
114         /* Check for an existing root before allocating a new one. */
115         for_each_tdp_mmu_root(kvm, root) {
116                 if (root->role.word == role.word) {
117                         kvm_mmu_get_root(kvm, root);
118                         spin_unlock(&kvm->mmu_lock);
119                         return root;
120                 }
121         }
122
123         root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
124         root->root_count = 1;
125
126         list_add(&root->link, &kvm->arch.tdp_mmu_roots);
127
128         spin_unlock(&kvm->mmu_lock);
129
130         return root;
131 }
132
133 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
134 {
135         struct kvm_mmu_page *root;
136
137         root = get_tdp_mmu_vcpu_root(vcpu);
138         if (!root)
139                 return INVALID_PAGE;
140
141         return __pa(root->spt);
142 }
143
144 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
145                                 u64 old_spte, u64 new_spte, int level);
146
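/* Derive the memslot address space ID from the SMM bit of the page's role. */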
147 static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
148 {
149         return sp->role.smm ? 1 : 0;
150 }
151
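/*
 * Propagate accessed state to the primary MM when a previously accessed leaf
 * SPTE loses its accessed bit or changes PFN.
 */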
152 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
153 {
154         bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
155
156         if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
157                 return;
158
159         if (is_accessed_spte(old_spte) &&
160             (!is_accessed_spte(new_spte) || pfn_changed))
161                 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
162 }
163
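/*
 * Update the memslot's dirty bitmap when a 4k SPTE is made writable, since
 * the guest may then dirty the page without taking further faults.
 */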
164 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
165                                           u64 old_spte, u64 new_spte, int level)
166 {
167         bool pfn_changed;
168         struct kvm_memory_slot *slot;
169
170         if (level > PG_LEVEL_4K)
171                 return;
172
173         pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
174
175         if ((!is_writable_pte(old_spte) || pfn_changed) &&
176             is_writable_pte(new_spte)) {
177                 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
178                 mark_page_dirty_in_slot(slot, gfn);
179         }
180 }
181
182 /**
183  * handle_changed_spte - handle bookkeeping associated with an SPTE change
184  * @kvm: kvm instance
185  * @as_id: the address space of the paging structure the SPTE was a part of
186  * @gfn: the base GFN that was mapped by the SPTE
187  * @old_spte: The value of the SPTE before the change
188  * @new_spte: The value of the SPTE after the change
189  * @level: the level of the PT the SPTE is part of in the paging structure
190  *
191  * Handle bookkeeping that might result from the modification of a SPTE.
192  * This function must be called for all TDP SPTE modifications.
193  */
194 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
195                                 u64 old_spte, u64 new_spte, int level)
196 {
197         bool was_present = is_shadow_present_pte(old_spte);
198         bool is_present = is_shadow_present_pte(new_spte);
199         bool was_leaf = was_present && is_last_spte(old_spte, level);
200         bool is_leaf = is_present && is_last_spte(new_spte, level);
201         bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
202         u64 *pt;
203         struct kvm_mmu_page *sp;
204         u64 old_child_spte;
205         int i;
206
207         WARN_ON(level > PT64_ROOT_MAX_LEVEL);
208         WARN_ON(level < PG_LEVEL_4K);
209         WARN_ON(gfn % KVM_PAGES_PER_HPAGE(level));
210
211         /*
212          * If this warning were to trigger it would indicate that there was a
213          * missing MMU notifier or a race with some notifier handler.
214          * A present, leaf SPTE should never be directly replaced with another
215          * present leaf SPTE pointing to a different PFN. A notifier handler
216          * should be zapping the SPTE before the main MM's page table is
217          * changed, or the SPTE should be zeroed, and the TLBs flushed by the
218          * thread before replacement.
219          */
220         if (was_leaf && is_leaf && pfn_changed) {
221                 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
222                        "SPTE with another present leaf SPTE mapping a\n"
223                        "different PFN!\n"
224                        "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
225                        as_id, gfn, old_spte, new_spte, level);
226
227                 /*
228                  * Crash the host to prevent error propagation and guest data
229                  * corruption.
230                  */
231                 BUG();
232         }
233
234         if (old_spte == new_spte)
235                 return;
236
237         /*
238          * The only times a SPTE should be changed from a non-present to
239          * non-present state is when an MMIO entry is installed/modified/
240          * removed. In that case, there is nothing to do here.
241          */
242         if (!was_present && !is_present) {
243                 /*
244                  * If this change does not involve a MMIO SPTE, it is
245                  * unexpected. Log the change, though it should not impact the
246                  * guest since both the former and current SPTEs are nonpresent.
247                  */
248                 if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte)))
249                         pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
250                                "should not be replaced with another,\n"
251                                "different nonpresent SPTE, unless one or both\n"
252                                "are MMIO SPTEs.\n"
253                                "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
254                                as_id, gfn, old_spte, new_spte, level);
255                 return;
256         }
257
258
259         if (was_leaf && is_dirty_spte(old_spte) &&
260             (!is_dirty_spte(new_spte) || pfn_changed))
261                 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
262
263         /*
264          * Recursively handle child PTs if the change removed a subtree from
265          * the paging structure.
266          */
267         if (was_present && !was_leaf && (pfn_changed || !is_present)) {
268                 pt = spte_to_child_pt(old_spte, level);
269                 sp = sptep_to_sp(pt);
270
271                 list_del(&sp->link);
272
273                 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
274                         old_child_spte = READ_ONCE(*(pt + i));
275                         WRITE_ONCE(*(pt + i), 0);
276                         handle_changed_spte(kvm, as_id,
277                                 gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)),
278                                 old_child_spte, 0, level - 1);
279                 }
280
281                 kvm_flush_remote_tlbs_with_address(kvm, gfn,
282                                                    KVM_PAGES_PER_HPAGE(level));
283
284                 free_page((unsigned long)pt);
285                 kmem_cache_free(mmu_page_header_cache, sp);
286         }
287 }
288
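/*
 * Wrapper around __handle_changed_spte that also propagates accessed and
 * dirty state unconditionally.
 */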
289 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
290                                 u64 old_spte, u64 new_spte, int level)
291 {
292         __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level);
293         handle_changed_spte_acc_track(old_spte, new_spte, level);
294         handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
295                                       new_spte, level);
296 }
297
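/*
 * Write the new SPTE value through the iterator and perform the associated
 * bookkeeping. record_acc_track and record_dirty_log let callers that handle
 * access tracking or dirty logging themselves skip that handling here.
 */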
298 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
299                                       u64 new_spte, bool record_acc_track,
300                                       bool record_dirty_log)
301 {
302         u64 *root_pt = tdp_iter_root_pt(iter);
303         struct kvm_mmu_page *root = sptep_to_sp(root_pt);
304         int as_id = kvm_mmu_page_as_id(root);
305
306         WRITE_ONCE(*iter->sptep, new_spte);
307
308         __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
309                               iter->level);
310         if (record_acc_track)
311                 handle_changed_spte_acc_track(iter->old_spte, new_spte,
312                                               iter->level);
313         if (record_dirty_log)
314                 handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
315                                               iter->old_spte, new_spte,
316                                               iter->level);
317 }
318
319 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
320                                     u64 new_spte)
321 {
322         __tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
323 }
324
325 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
326                                                  struct tdp_iter *iter,
327                                                  u64 new_spte)
328 {
329         __tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
330 }
331
332 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
333                                                  struct tdp_iter *iter,
334                                                  u64 new_spte)
335 {
336         __tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
337 }
338
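/*
 * Iterator helpers: walk the SPTEs covering [_start, _end) under a given root
 * (optionally only present leaf SPTEs) or under the vCPU's current MMU root.
 */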
339 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
340         for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
341
342 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)  \
343         tdp_root_for_each_pte(_iter, _root, _start, _end)               \
344                 if (!is_shadow_present_pte(_iter.old_spte) ||           \
345                     !is_last_spte(_iter.old_spte, _iter.level))         \
346                         continue;                                       \
347                 else
348
349 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)         \
350         for_each_tdp_pte(_iter, __va(_mmu->root_hpa),           \
351                          _mmu->shadow_root_level, _start, _end)
352
353 /*
354  * Flush the TLBs and yield the MMU lock if the scheduler needs the CPU or
355  * the lock is contended. Returns whether the caller still needs to flush the TLB.
356  */
357 static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
358 {
359         if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
360                 kvm_flush_remote_tlbs(kvm);
361                 cond_resched_lock(&kvm->mmu_lock);
362                 tdp_iter_refresh_walk(iter);
363                 return false;
364         } else {
365                 return true;
366         }
367 }
368
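/*
 * Yield the MMU lock and reschedule if the scheduler needs the CPU or the
 * lock is contended, then re-establish the iterator's walk from the current
 * GFN. Unlike the helper above, no TLB flush is performed before yielding.
 */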
369 static void tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
370 {
371         if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
372                 cond_resched_lock(&kvm->mmu_lock);
373                 tdp_iter_refresh_walk(iter);
374         }
375 }
376
377 /*
378  * Tears down the mappings for the range of gfns, [start, end), and frees the
379  * non-root pages mapping GFNs strictly within that range. Returns true if
380  * SPTEs have been cleared and a TLB flush is needed before releasing the
381  * MMU lock.
382  * If can_yield is true, will release the MMU lock and reschedule if the
383  * scheduler needs the CPU or there is contention on the MMU lock. If this
384  * function cannot yield, it will not release the MMU lock or reschedule and
385  * the caller must ensure it does not supply too large a GFN range, or the
386  * operation can cause a soft lockup.
387  */
388 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
389                           gfn_t start, gfn_t end, bool can_yield)
390 {
391         struct tdp_iter iter;
392         bool flush_needed = false;
393
394         tdp_root_for_each_pte(iter, root, start, end) {
395                 if (!is_shadow_present_pte(iter.old_spte))
396                         continue;
397
398                 /*
399                  * If this is a non-last-level SPTE that covers a larger range
400                  * than should be zapped, continue, and zap the mappings at a
401                  * lower level.
402                  */
403                 if ((iter.gfn < start ||
404                      iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
405                     !is_last_spte(iter.old_spte, iter.level))
406                         continue;
407
408                 tdp_mmu_set_spte(kvm, &iter, 0);
409
410                 if (can_yield)
411                         flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter);
412                 else
413                         flush_needed = true;
414         }
415         return flush_needed;
416 }
417
418 /*
419  * Tears down the mappings for the range of gfns, [start, end), and frees the
420  * non-root pages mapping GFNs strictly within that range. Returns true if
421  * SPTEs have been cleared and a TLB flush is needed before releasing the
422  * MMU lock.
423  */
424 bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
425 {
426         struct kvm_mmu_page *root;
427         bool flush = false;
428
429         for_each_tdp_mmu_root(kvm, root) {
430                 /*
431                  * Take a reference on the root so that it cannot be freed if
432                  * this thread releases the MMU lock and yields in this loop.
433                  */
434                 kvm_mmu_get_root(kvm, root);
435
436                 flush |= zap_gfn_range(kvm, root, start, end, true);
437
438                 kvm_mmu_put_root(kvm, root);
439         }
440
441         return flush;
442 }
443
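/* Zap all SPTEs in every TDP MMU root and flush the TLBs if anything was zapped. */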
444 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
445 {
446         gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);
447         bool flush;
448
449         flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
450         if (flush)
451                 kvm_flush_remote_tlbs(kvm);
452 }
453
454 /*
455  * Installs a last-level SPTE to handle a TDP page fault.
456  * (NPT/EPT violation/misconfiguration)
457  */
458 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
459                                           int map_writable,
460                                           struct tdp_iter *iter,
461                                           kvm_pfn_t pfn, bool prefault)
462 {
463         u64 new_spte;
464         int ret = 0;
465         int make_spte_ret = 0;
466
467         if (unlikely(is_noslot_pfn(pfn))) {
468                 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
469                 trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte);
470         } else
471                 make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
472                                          pfn, iter->old_spte, prefault, true,
473                                          map_writable, !shadow_accessed_mask,
474                                          &new_spte);
475
476         if (new_spte == iter->old_spte)
477                 ret = RET_PF_SPURIOUS;
478         else
479                 tdp_mmu_set_spte(vcpu->kvm, iter, new_spte);
480
481         /*
482          * If the page fault was caused by a write but the page is write
483          * protected, emulation is needed. If the emulation was skipped,
484          * the vCPU would have the same fault again.
485          */
486         if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
487                 if (write)
488                         ret = RET_PF_EMULATE;
489                 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
490         }
491
492         /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
493         if (unlikely(is_mmio_spte(new_spte)))
494                 ret = RET_PF_EMULATE;
495
496         trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep);
497         if (!prefault)
498                 vcpu->stat.pf_fixed++;
499
500         return ret;
501 }
502
503 /*
504  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
505  * page tables and SPTEs to translate the faulting guest physical address.
506  */
507 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
508                     int map_writable, int max_level, kvm_pfn_t pfn,
509                     bool prefault)
510 {
511         bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
512         bool write = error_code & PFERR_WRITE_MASK;
513         bool exec = error_code & PFERR_FETCH_MASK;
514         bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
515         struct kvm_mmu *mmu = vcpu->arch.mmu;
516         struct tdp_iter iter;
517         struct kvm_mmu_page *sp;
518         u64 *child_pt;
519         u64 new_spte;
520         int ret;
521         gfn_t gfn = gpa >> PAGE_SHIFT;
522         int level;
523         int req_level;
524
525         if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
526                 return RET_PF_RETRY;
527         if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
528                 return RET_PF_RETRY;
529
530         level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
531                                         huge_page_disallowed, &req_level);
532
533         trace_kvm_mmu_spte_requested(gpa, level, pfn);
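        /*
         * Walk down to the target level, clearing any conflicting large
         * mappings and installing non-leaf SPTEs and page tables as needed
         * along the way.
         */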
534         tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
535                 if (nx_huge_page_workaround_enabled)
536                         disallowed_hugepage_adjust(iter.old_spte, gfn,
537                                                    iter.level, &pfn, &level);
538
539                 if (iter.level == level)
540                         break;
541
542                 /*
543                  * If there is an SPTE mapping a large page at a higher level
544                  * than the target, that SPTE must be cleared and replaced
545                  * with a non-leaf SPTE.
546                  */
547                 if (is_shadow_present_pte(iter.old_spte) &&
548                     is_large_pte(iter.old_spte)) {
549                         tdp_mmu_set_spte(vcpu->kvm, &iter, 0);
550
551                         kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
552                                         KVM_PAGES_PER_HPAGE(iter.level));
553
554                         /*
555                          * The iter must explicitly re-read the spte here
556                          * because the new value informs the !present
557                          * path below.
558                          */
559                         iter.old_spte = READ_ONCE(*iter.sptep);
560                 }
561
562                 if (!is_shadow_present_pte(iter.old_spte)) {
563                         sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
564                         list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages);
565                         child_pt = sp->spt;
566                         clear_page(child_pt);
567                         new_spte = make_nonleaf_spte(child_pt,
568                                                      !shadow_accessed_mask);
569
570                         trace_kvm_mmu_get_page(sp, true);
571                         tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte);
572                 }
573         }
574
575         if (WARN_ON(iter.level != level))
576                 return RET_PF_RETRY;
577
578         ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
579                                               pfn, prefault);
580
581         return ret;
582 }
583
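/*
 * Iterate over every TDP MMU root and, for each memslot in the root's address
 * space that overlaps the HVA range [start, end), invoke the handler on the
 * corresponding GFN range. The handlers' return values are OR'd together.
 */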
584 static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start,
585                 unsigned long end, unsigned long data,
586                 int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot,
587                                struct kvm_mmu_page *root, gfn_t start,
588                                gfn_t end, unsigned long data))
589 {
590         struct kvm_memslots *slots;
591         struct kvm_memory_slot *memslot;
592         struct kvm_mmu_page *root;
593         int ret = 0;
594         int as_id;
595
596         for_each_tdp_mmu_root(kvm, root) {
597                 /*
598                  * Take a reference on the root so that it cannot be freed if
599                  * this thread releases the MMU lock and yields in this loop.
600                  */
601                 kvm_mmu_get_root(kvm, root);
602
603                 as_id = kvm_mmu_page_as_id(root);
604                 slots = __kvm_memslots(kvm, as_id);
605                 kvm_for_each_memslot(memslot, slots) {
606                         unsigned long hva_start, hva_end;
607                         gfn_t gfn_start, gfn_end;
608
609                         hva_start = max(start, memslot->userspace_addr);
610                         hva_end = min(end, memslot->userspace_addr +
611                                       (memslot->npages << PAGE_SHIFT));
612                         if (hva_start >= hva_end)
613                                 continue;
614                         /*
615                          * {gfn(page) | page intersects with [hva_start, hva_end)} =
616                          * {gfn_start, gfn_start+1, ..., gfn_end-1}.
617                          */
618                         gfn_start = hva_to_gfn_memslot(hva_start, memslot);
619                         gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
620
621                         ret |= handler(kvm, memslot, root, gfn_start,
622                                        gfn_end, data);
623                 }
624
625                 kvm_mmu_put_root(kvm, root);
626         }
627
628         return ret;
629 }
630
631 static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
632                                      struct kvm_memory_slot *slot,
633                                      struct kvm_mmu_page *root, gfn_t start,
634                                      gfn_t end, unsigned long unused)
635 {
636         return zap_gfn_range(kvm, root, start, end, false);
637 }
638
639 int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
640                               unsigned long end)
641 {
642         return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
643                                             zap_gfn_range_hva_wrapper);
644 }
645
646 /*
647  * Mark the SPTEs mapping GFNs in the range [start, end) unaccessed and
648  * return non-zero if any of the GFNs in the range have been accessed.
649  */
650 static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
651                          struct kvm_mmu_page *root, gfn_t start, gfn_t end,
652                          unsigned long unused)
653 {
654         struct tdp_iter iter;
655         int young = 0;
656         u64 new_spte = 0;
657
658         tdp_root_for_each_leaf_pte(iter, root, start, end) {
659                 /*
660                  * If we have a non-accessed entry we don't need to change the
661                  * pte.
662                  */
663                 if (!is_accessed_spte(iter.old_spte))
664                         continue;
665
666                 new_spte = iter.old_spte;
667
668                 if (spte_ad_enabled(new_spte)) {
669                         clear_bit((ffs(shadow_accessed_mask) - 1),
670                                   (unsigned long *)&new_spte);
671                 } else {
672                         /*
673                          * Capture the dirty status of the page, so that it doesn't get
674                          * lost when the SPTE is marked for access tracking.
675                          */
676                         if (is_writable_pte(new_spte))
677                                 kvm_set_pfn_dirty(spte_to_pfn(new_spte));
678
679                         new_spte = mark_spte_for_access_track(new_spte);
680                 }
681                 new_spte &= ~shadow_dirty_mask;
682
683                 tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
684                 young = 1;
685         }
686
687         return young;
688 }
689
690 int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
691                               unsigned long end)
692 {
693         return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
694                                             age_gfn_range);
695 }
696
697 static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
698                         struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
699                         unsigned long unused2)
700 {
701         struct tdp_iter iter;
702
703         tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
704                 if (is_accessed_spte(iter.old_spte))
705                         return 1;
706
707         return 0;
708 }
709
710 int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
711 {
712         return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
713                                             test_age_gfn);
714 }
715
716 /*
717  * Handle the changed_pte MMU notifier for the TDP MMU.
718  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
719  * notifier.
720  * Any needed TLB flush is performed within this handler, so it returns zero.
721  */
722 static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
723                         struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
724                         unsigned long data)
725 {
726         struct tdp_iter iter;
727         pte_t *ptep = (pte_t *)data;
728         kvm_pfn_t new_pfn;
729         u64 new_spte;
730         int need_flush = 0;
731
732         WARN_ON(pte_huge(*ptep));
733
734         new_pfn = pte_pfn(*ptep);
735
736         tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
737                 if (iter.level != PG_LEVEL_4K)
738                         continue;
739
740                 if (!is_shadow_present_pte(iter.old_spte))
741                         break;
742
743                 tdp_mmu_set_spte(kvm, &iter, 0);
744
745                 kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);
746
747                 if (!pte_write(*ptep)) {
748                         new_spte = kvm_mmu_changed_pte_notifier_make_spte(
749                                         iter.old_spte, new_pfn);
750
751                         tdp_mmu_set_spte(kvm, &iter, new_spte);
752                 }
753
754                 need_flush = 1;
755         }
756
757         if (need_flush)
758                 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
759
760         return 0;
761 }
762
763 int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
764                              pte_t *host_ptep)
765 {
766         return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
767                                             (unsigned long)host_ptep,
768                                             set_tdp_spte);
769 }
770
771 /*
772  * Remove write access from all the leaf SPTEs at or above min_level that
773  * map GFNs in the range [start, end).
774  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
775  */
776 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
777                              gfn_t start, gfn_t end, int min_level)
778 {
779         struct tdp_iter iter;
780         u64 new_spte;
781         bool spte_set = false;
782
783         BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
784
785         for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
786                                    min_level, start, end) {
787                 if (!is_shadow_present_pte(iter.old_spte) ||
788                     !is_last_spte(iter.old_spte, iter.level))
789                         continue;
790
791                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
792
793                 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
794                 spte_set = true;
795
796                 tdp_mmu_iter_cond_resched(kvm, &iter);
797         }
798         return spte_set;
799 }
800
801 /*
802  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
803  * only affect leaf SPTEs down to min_level.
804  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
805  */
806 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
807                              int min_level)
808 {
809         struct kvm_mmu_page *root;
810         int root_as_id;
811         bool spte_set = false;
812
813         for_each_tdp_mmu_root(kvm, root) {
814                 root_as_id = kvm_mmu_page_as_id(root);
815                 if (root_as_id != slot->as_id)
816                         continue;
817
818                 /*
819                  * Take a reference on the root so that it cannot be freed if
820                  * this thread releases the MMU lock and yields in this loop.
821                  */
822                 kvm_mmu_get_root(kvm, root);
823
824                 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
825                              slot->base_gfn + slot->npages, min_level);
826
827                 kvm_mmu_put_root(kvm, root);
828         }
829
830         return spte_set;
831 }
832
833 /*
834  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
835  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
836  * If AD bits are not enabled, this will require clearing the writable bit on
837  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
838  * be flushed.
839  */
840 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
841                            gfn_t start, gfn_t end)
842 {
843         struct tdp_iter iter;
844         u64 new_spte;
845         bool spte_set = false;
846
847         tdp_root_for_each_leaf_pte(iter, root, start, end) {
848                 if (spte_ad_need_write_protect(iter.old_spte)) {
849                         if (is_writable_pte(iter.old_spte))
850                                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
851                         else
852                                 continue;
853                 } else {
854                         if (iter.old_spte & shadow_dirty_mask)
855                                 new_spte = iter.old_spte & ~shadow_dirty_mask;
856                         else
857                                 continue;
858                 }
859
860                 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
861                 spte_set = true;
862
863                 tdp_mmu_iter_cond_resched(kvm, &iter);
864         }
865         return spte_set;
866 }
867
868 /*
869  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
870  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
871  * If AD bits are not enabled, this will require clearing the writable bit on
872  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
873  * be flushed.
874  */
875 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
876 {
877         struct kvm_mmu_page *root;
878         int root_as_id;
879         bool spte_set = false;
880
881         for_each_tdp_mmu_root(kvm, root) {
882                 root_as_id = kvm_mmu_page_as_id(root);
883                 if (root_as_id != slot->as_id)
884                         continue;
885
886                 /*
887                  * Take a reference on the root so that it cannot be freed if
888                  * this thread releases the MMU lock and yields in this loop.
889                  */
890                 kvm_mmu_get_root(kvm, root);
891
892                 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
893                                 slot->base_gfn + slot->npages);
894
895                 kvm_mmu_put_root(kvm, root);
896         }
897
898         return spte_set;
899 }
900
901 /*
902  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
903  * set in mask, starting at gfn. The given memslot is expected to contain all
904  * the GFNs represented by set bits in the mask. If AD bits are enabled,
905  * clearing the dirty status will involve clearing the dirty bit on each SPTE
906  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
907  */
908 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
909                                   gfn_t gfn, unsigned long mask, bool wrprot)
910 {
911         struct tdp_iter iter;
912         u64 new_spte;
913
914         tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
915                                     gfn + BITS_PER_LONG) {
916                 if (!mask)
917                         break;
918
919                 if (iter.level > PG_LEVEL_4K ||
920                     !(mask & (1UL << (iter.gfn - gfn))))
921                         continue;
922
923                 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
924                         if (is_writable_pte(iter.old_spte))
925                                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
926                         else
927                                 continue;
928                 } else {
929                         if (iter.old_spte & shadow_dirty_mask)
930                                 new_spte = iter.old_spte & ~shadow_dirty_mask;
931                         else
932                                 continue;
933                 }
934
935                 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
936
937                 mask &= ~(1UL << (iter.gfn - gfn));
938         }
939 }
940
941 /*
942  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
943  * set in mask, starting at gfn. The given memslot is expected to contain all
944  * the GFNs represented by set bits in the mask. If AD bits are enabled,
945  * clearing the dirty status will involve clearing the dirty bit on each SPTE
946  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
947  */
948 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
949                                        struct kvm_memory_slot *slot,
950                                        gfn_t gfn, unsigned long mask,
951                                        bool wrprot)
952 {
953         struct kvm_mmu_page *root;
954         int root_as_id;
955
956         lockdep_assert_held(&kvm->mmu_lock);
957         for_each_tdp_mmu_root(kvm, root) {
958                 root_as_id = kvm_mmu_page_as_id(root);
959                 if (root_as_id != slot->as_id)
960                         continue;
961
962                 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
963         }
964 }
965
966 /*
967  * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
968  * only used for PML, and so will involve setting the dirty bit on each SPTE.
969  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
970  */
971 static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
972                                 gfn_t start, gfn_t end)
973 {
974         struct tdp_iter iter;
975         u64 new_spte;
976         bool spte_set = false;
977
978         tdp_root_for_each_pte(iter, root, start, end) {
979                 if (!is_shadow_present_pte(iter.old_spte))
980                         continue;
981
982                 new_spte = iter.old_spte | shadow_dirty_mask;
983
984                 tdp_mmu_set_spte(kvm, &iter, new_spte);
985                 spte_set = true;
986
987                 tdp_mmu_iter_cond_resched(kvm, &iter);
988         }
989
990         return spte_set;
991 }
992
993 /*
994  * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
995  * only used for PML, and so will involve setting the dirty bit on each SPTE.
996  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
997  */
998 bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot)
999 {
1000         struct kvm_mmu_page *root;
1001         int root_as_id;
1002         bool spte_set = false;
1003
1004         for_each_tdp_mmu_root(kvm, root) {
1005                 root_as_id = kvm_mmu_page_as_id(root);
1006                 if (root_as_id != slot->as_id)
1007                         continue;
1008
1009                 /*
1010                  * Take a reference on the root so that it cannot be freed if
1011                  * this thread releases the MMU lock and yields in this loop.
1012                  */
1013                 kvm_mmu_get_root(kvm, root);
1014
1015                 spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn,
1016                                 slot->base_gfn + slot->npages);
1017
1018                 kvm_mmu_put_root(kvm, root);
1019         }
1020         return spte_set;
1021 }
1022