/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __KVM_X86_MMU_H
#define __KVM_X86_MMU_H

#include <linux/kvm_host.h>
#include "kvm_cache_regs.h"
#include "cpuid.h"

/*
 * Bit positions/masks of x86 page-table entry flags, shared by the
 * shadow/TDP MMU code (e.g. PT_USER_MASK is consulted in
 * permission_fault() below).
 */
#define PT_WRITABLE_SHIFT 1
#define PT_USER_SHIFT 2

#define PT_PRESENT_MASK (1ULL << 0)
#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
#define PT_USER_MASK (1ULL << PT_USER_SHIFT)
#define PT_PWT_MASK (1ULL << 3)
#define PT_PCD_MASK (1ULL << 4)
#define PT_ACCESSED_SHIFT 5
#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
#define PT_DIRTY_SHIFT 6
#define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT)
#define PT_PAGE_SIZE_SHIFT 7
#define PT_PAGE_SIZE_MASK (1ULL << PT_PAGE_SIZE_SHIFT)
/* NOTE: bit 7 is PAT in 4K PTEs but Page-Size in upper-level entries. */
#define PT_PAT_MASK (1ULL << 7)
#define PT_GLOBAL_MASK (1ULL << 8)
#define PT64_NX_SHIFT 63
#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)

#define PT_PAT_SHIFT 7
#define PT_DIR_PAT_SHIFT 12
#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)

/* Number of page-table levels for each paging mode. */
#define PT64_ROOT_5LEVEL 5
#define PT64_ROOT_4LEVEL 4
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3

/* CR4/CR0/EFER bits that factor into the MMU role. */
#define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PSE | X86_CR4_PAE | X86_CR4_LA57 | \
			       X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE)

#define KVM_MMU_CR0_ROLE_BITS (X86_CR0_PG | X86_CR0_WP)
#define KVM_MMU_EFER_ROLE_BITS (EFER_LME | EFER_NX)
/*
 * Build a mask with bits s..e (inclusive) set, e.g. rsvd_bits(3, 5) == 0x38.
 * Returns 0 for an empty range (e < s at runtime).
 */
static __always_inline u64 rsvd_bits(int s, int e)
{
	/* Reject obviously-inverted constant ranges at compile time. */
	BUILD_BUG_ON(__builtin_constant_p(e) && __builtin_constant_p(s) && e < s);

	/*
	 * A constant end bit above 63 is a build bug; a non-constant one is
	 * clamped so the shifts below stay within a u64.
	 */
	if (__builtin_constant_p(e))
		BUILD_BUG_ON(e > 63);
	else
		e &= 63;

	if (e < s)
		return 0;

	/*
	 * 2ULL << (e - s) instead of 1ULL << (e - s + 1): for the full 0..63
	 * range the latter would shift by 64, which is undefined behavior.
	 */
	return ((2ULL << (e - s)) - 1) << s;
}
57 | ||
86931ff7 SC |
58 | /* |
59 | * The number of non-reserved physical address bits irrespective of features | |
60 | * that repurpose legal bits, e.g. MKTME. | |
61 | */ | |
62 | extern u8 __read_mostly shadow_phys_bits; | |
63 | ||
64 | static inline gfn_t kvm_mmu_max_gfn(void) | |
65 | { | |
66 | /* | |
67 | * Note that this uses the host MAXPHYADDR, not the guest's. | |
68 | * EPT/NPT cannot support GPAs that would exceed host.MAXPHYADDR; | |
69 | * assuming KVM is running on bare metal, guest accesses beyond | |
70 | * host.MAXPHYADDR will hit a #PF(RSVD) and never cause a vmexit | |
71 | * (either EPT Violation/Misconfig or #NPF), and so KVM will never | |
72 | * install a SPTE for such addresses. If KVM is running as a VM | |
73 | * itself, on the other hand, it might see a MAXPHYADDR that is less | |
74 | * than hardware's real MAXPHYADDR. Using the host MAXPHYADDR | |
75 | * disallows such SPTEs entirely and simplifies the TDP MMU. | |
76 | */ | |
77 | int max_gpa_bits = likely(tdp_enabled) ? shadow_phys_bits : 52; | |
78 | ||
79 | return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1; | |
80 | } | |
81 | ||
3c5c3245 KH |
82 | static inline u8 kvm_get_shadow_phys_bits(void) |
83 | { | |
84 | /* | |
85 | * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected | |
86 | * in CPU detection code, but the processor treats those reduced bits as | |
87 | * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at | |
88 | * the physical address bits reported by CPUID. | |
89 | */ | |
90 | if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) | |
91 | return cpuid_eax(0x80000008) & 0xff; | |
92 | ||
93 | /* | |
94 | * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with | |
95 | * custom CPUID. Proceed with whatever the kernel found since these features | |
96 | * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). | |
97 | */ | |
98 | return boot_cpu_data.x86_phys_bits; | |
99 | } | |
100 | ||
8120337a | 101 | void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask); |
e54f1ff2 | 102 | void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask); |
e7b7bdea | 103 | void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only); |
b37fbea6 | 104 | |
c9060662 | 105 | void kvm_init_mmu(struct kvm_vcpu *vcpu); |
dbc4739b SC |
106 | void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, |
107 | unsigned long cr4, u64 efer, gpa_t nested_cr3); | |
ae1e2d10 | 108 | void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, |
cc022ae1 LJ |
109 | int huge_page_level, bool accessed_dirty, |
110 | gpa_t new_eptp); | |
9bc1f09f | 111 | bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); |
1261bfa3 | 112 | int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, |
d0006530 | 113 | u64 fault_address, char *insn, int insn_len); |
94d8b056 | 114 | |
61a1773e SC |
115 | int kvm_mmu_load(struct kvm_vcpu *vcpu); |
116 | void kvm_mmu_unload(struct kvm_vcpu *vcpu); | |
527d5cd7 | 117 | void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu); |
61a1773e | 118 | void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); |
61b05a9f | 119 | void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu); |
61a1773e | 120 | |
1d737c8a ZX |
121 | static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) |
122 | { | |
b9e5603c | 123 | if (likely(vcpu->arch.mmu->root.hpa != INVALID_PAGE)) |
1d737c8a ZX |
124 | return 0; |
125 | ||
126 | return kvm_mmu_load(vcpu); | |
127 | } | |
128 | ||
c9470a2e JS |
129 | static inline unsigned long kvm_get_pcid(struct kvm_vcpu *vcpu, gpa_t cr3) |
130 | { | |
131 | BUILD_BUG_ON((X86_CR3_PCID_MASK & PAGE_MASK) != 0); | |
132 | ||
133 | return kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE) | |
134 | ? cr3 & X86_CR3_PCID_MASK | |
135 | : 0; | |
136 | } | |
137 | ||
138 | static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu) | |
139 | { | |
140 | return kvm_get_pcid(vcpu, kvm_read_cr3(vcpu)); | |
141 | } | |
142 | ||
689f3bf2 | 143 | static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu) |
6e42782f | 144 | { |
b9e5603c | 145 | u64 root_hpa = vcpu->arch.mmu->root.hpa; |
2a40b900 SC |
146 | |
147 | if (!VALID_PAGE(root_hpa)) | |
148 | return; | |
149 | ||
e83bc09c | 150 | static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa, |
a972e29c | 151 | vcpu->arch.mmu->root_role.level); |
7a02674d SC |
152 | } |
153 | ||
/*
 * Check if a given access (described through the I/D, W/R and U/S bits of a
 * page fault error code pfec) causes a permission fault with the given PTE
 * access rights (in ACC_* format).
 *
 * Return zero if the access does not fault; return the page fault error code
 * if the access faults.
 */
static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
				  unsigned pte_access, unsigned pte_pkey,
				  u64 access)
{
	/* strip nested paging fault error codes */
	unsigned int pfec = access;
	unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);

	/*
	 * For explicit supervisor accesses, SMAP is disabled if EFLAGS.AC = 1.
	 * For implicit supervisor accesses, SMAP cannot be overridden.
	 *
	 * SMAP works on supervisor accesses only, so for user accesses
	 * not_smap can be set or clear without any bearing on the result.
	 *
	 * We put the SMAP-checking bit in place of the PFERR_RSVD_MASK bit;
	 * this bit will always be zero in pfec, but it will be one in index
	 * if SMAP checks are being disabled.
	 */
	u64 implicit_access = access & PFERR_IMPLICIT_ACCESS;
	bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC;
	/* Index into the precomputed per-pfec permission bitmap. */
	int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1;
	bool fault = (mmu->permissions[index] >> pte_access) & 1;
	u32 errcode = PFERR_PRESENT_MASK;

	/* PK and RSVD must never be set in the incoming error code. */
	WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK));
	if (unlikely(mmu->pkru_mask)) {
		u32 pkru_bits, offset;

		/*
		 * PKRU defines 32 bits, there are 16 domains and 2
		 * attribute bits per domain in pkru. pte_pkey is the
		 * index of the protection domain, so pte_pkey * 2 is
		 * the index of the first bit for the domain.
		 */
		pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3;

		/* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
		offset = (pfec & ~1) +
			((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));

		/* Keep only the PKRU bits pkru_mask declares relevant here. */
		pkru_bits &= mmu->pkru_mask >> offset;
		/* -pkru_bits is all-ones iff any bit survived. */
		errcode |= -pkru_bits & PFERR_PK_MASK;
		fault |= (pkru_bits != 0);
	}

	/* errcode if faulting, 0 otherwise (-(u32)fault is 0 or ~0). */
	return -(u32)fault & errcode;
}
97d64b78 | 211 | |
efdfe536 | 212 | void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); |
547ffaed | 213 | |
6ca9a6f3 | 214 | int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); |
1aa9b957 JS |
215 | |
216 | int kvm_mmu_post_init_vm(struct kvm *kvm); | |
217 | void kvm_mmu_pre_destroy_vm(struct kvm *kvm); | |
218 | ||
/* Has this VM ever allocated a shadow root (i.e. left TDP-only mode)? */
static inline bool kvm_shadow_root_allocated(struct kvm *kvm)
{
	/*
	 * Read shadow_root_allocated before related pointers. Hence, threads
	 * reading shadow_root_allocated in any lock context are guaranteed to
	 * see the pointers. Pairs with smp_store_release in
	 * mmu_first_shadow_root_alloc.
	 */
	return smp_load_acquire(&kvm->arch.shadow_root_allocated);
}
229 | ||
/* The TDP MMU only exists on 64-bit builds; elsewhere it is never enabled. */
#ifdef CONFIG_X86_64
static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; }
#else
static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; }
#endif
235 | ||
236 | static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) | |
237 | { | |
238 | return !is_tdp_mmu_enabled(kvm) || kvm_shadow_root_allocated(kvm); | |
e2209710 BG |
239 | } |
240 | ||
4139b197 PX |
241 | static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) |
242 | { | |
243 | /* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. */ | |
244 | return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - | |
245 | (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); | |
246 | } | |
247 | ||
248 | static inline unsigned long | |
249 | __kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, unsigned long npages, | |
250 | int level) | |
251 | { | |
252 | return gfn_to_index(slot->base_gfn + npages - 1, | |
253 | slot->base_gfn, level) + 1; | |
254 | } | |
255 | ||
/* Number of @level huge pages spanned by the entire @slot. */
static inline unsigned long
kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, int level)
{
	return __kvm_mmu_slot_lpages(slot, slot->npages, level);
}
261 | ||
/*
 * Adjust the per-level mapped-page statistic by @count (may be negative).
 * stat.pages[] is 0-indexed while page-table levels start at 1, hence the
 * "level - 1".
 */
static inline void kvm_update_page_stats(struct kvm *kvm, int level, int count)
{
	atomic64_add(count, &kvm->stat.pages[level - 1]);
}
c59a0f57 | 266 | |
5b22bbe7 | 267 | gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access, |
c59a0f57 LJ |
268 | struct x86_exception *exception); |
269 | ||
270 | static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu, | |
271 | struct kvm_mmu *mmu, | |
5b22bbe7 | 272 | gpa_t gpa, u64 access, |
c59a0f57 LJ |
273 | struct x86_exception *exception) |
274 | { | |
275 | if (mmu != &vcpu->arch.nested_mmu) | |
276 | return gpa; | |
277 | return translate_nested_gpa(vcpu, gpa, access, exception); | |
278 | } | |
1d737c8a | 279 | #endif |