// SPDX-License-Identifier: GPL-2.0-only

#ifndef KVM_X86_MMU_SPTE_H
#define KVM_X86_MMU_SPTE_H

#include <asm/vmx.h>

#include "mmu.h"
#include "mmu_internal.h"

/*
 * An MMU-present SPTE is backed by actual memory and may or may not be present
 * in hardware.  E.g. MMIO SPTEs are not considered present.  Use bit 11, as it
 * is ignored by all flavors of SPTEs and checking a low bit often generates
 * better code than for a high bit, e.g. 56+.  MMU-present checks are pervasive
 * enough that the improved code generation is noticeable in KVM's footprint.
 */
#define SPTE_MMU_PRESENT_MASK		BIT_ULL(11)

/*
 * TDP SPTEs (more specifically, EPT SPTEs) may not have A/D bits, and may also
 * be restricted to using write-protection (for L2 when CPU dirty logging, i.e.
 * PML, is enabled).  Use bits 52 and 53 to hold the type of A/D tracking that
 * must be employed for a given TDP SPTE.
 *
 * Note, the "enabled" mask must be '0', as bits 62:52 are _reserved_ for PAE
 * paging, including NPT PAE.  This scheme works because legacy shadow paging
 * is guaranteed to have A/D bits and write-protection is forced only for
 * TDP with CPU dirty logging (PML).  If NPT ever gains PML-like support, it
 * must be restricted to 64-bit KVM.
 */
#define SPTE_TDP_AD_SHIFT		52
#define SPTE_TDP_AD_MASK		(3ULL << SPTE_TDP_AD_SHIFT)
#define SPTE_TDP_AD_ENABLED		(0ULL << SPTE_TDP_AD_SHIFT)
#define SPTE_TDP_AD_DISABLED		(1ULL << SPTE_TDP_AD_SHIFT)
#define SPTE_TDP_AD_WRPROT_ONLY		(2ULL << SPTE_TDP_AD_SHIFT)
static_assert(SPTE_TDP_AD_ENABLED == 0);
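
/*
 * Illustrative sanity checks (added for exposition; the values follow
 * directly from the definitions above): the three A/D types occupy bits
 * 53:52, so "disabled" is exactly bit 52 and "wrprot only" is exactly
 * bit 53, leaving all-zeroes free for "enabled" as required by PAE.
 */
static_assert(SPTE_TDP_AD_DISABLED == BIT_ULL(52));
static_assert(SPTE_TDP_AD_WRPROT_ONLY == BIT_ULL(53));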

#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
#define SPTE_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
#else
#define SPTE_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
#endif

#define SPTE_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
			| shadow_x_mask | shadow_nx_mask | shadow_me_mask)

#define ACC_EXEC_MASK	1
#define ACC_WRITE_MASK	PT_WRITABLE_MASK
#define ACC_USER_MASK	PT_USER_MASK
#define ACC_ALL		(ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)

/* The mask for the R/X bits in EPT PTEs */
#define SPTE_EPT_READABLE_MASK		0x1ull
#define SPTE_EPT_EXECUTABLE_MASK	0x4ull

#define SPTE_LEVEL_BITS			9
#define SPTE_LEVEL_SHIFT(level)		__PT_LEVEL_SHIFT(level, SPTE_LEVEL_BITS)
#define SPTE_INDEX(address, level)	__PT_INDEX(address, level, SPTE_LEVEL_BITS)
#define SPTE_ENT_PER_PAGE		__PT_ENT_PER_PAGE(SPTE_LEVEL_BITS)

/*
 * The mask/shift to use for saving the original R/X bits when marking the PTE
 * as not-present for access tracking purposes.  We do not save the W bit as the
 * PTEs being access tracked also need to be dirty tracked, so the W bit will be
 * restored only when a write is attempted to the page.  This mask obviously
 * must not overlap the A/D type mask.
 */
#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (SPTE_EPT_READABLE_MASK | \
					  SPTE_EPT_EXECUTABLE_MASK)
#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54
#define SHADOW_ACC_TRACK_SAVED_MASK	(SHADOW_ACC_TRACK_SAVED_BITS_MASK << \
					 SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));

/*
 * {DEFAULT,EPT}_SPTE_{HOST,MMU}_WRITABLE are used to keep track of why a given
 * SPTE is write-protected.  See is_writable_pte() for details.
 */

/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
#define DEFAULT_SPTE_HOST_WRITABLE	BIT_ULL(9)
#define DEFAULT_SPTE_MMU_WRITABLE	BIT_ULL(10)

/*
 * Low ignored bits are at a premium for EPT, use high ignored bits, taking care
 * to not overlap the A/D type mask or the saved access bits of access-tracked
 * SPTEs when A/D bits are disabled.
 */
#define EPT_SPTE_HOST_WRITABLE		BIT_ULL(57)
#define EPT_SPTE_MMU_WRITABLE		BIT_ULL(58)

static_assert(!(EPT_SPTE_HOST_WRITABLE & SPTE_TDP_AD_MASK));
static_assert(!(EPT_SPTE_MMU_WRITABLE & SPTE_TDP_AD_MASK));
static_assert(!(EPT_SPTE_HOST_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));

/* Defined only to keep the above static asserts readable. */
#undef SHADOW_ACC_TRACK_SAVED_MASK

/*
 * Due to limited space in PTEs, the MMIO generation is a 19-bit subset of
 * the memslots generation and is derived as follows:
 *
 * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10
 * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62
 *
 * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
 * the MMIO generation number, as doing so would require stealing a bit from
 * the "real" generation number and thus effectively halve the maximum number
 * of MMIO generations that can be handled before encountering a wrap (which
 * requires a full MMU zap).  The flag is instead explicitly queried when
 * checking for MMIO spte cache hits.
 */

#define MMIO_SPTE_GEN_LOW_START		3
#define MMIO_SPTE_GEN_LOW_END		10

#define MMIO_SPTE_GEN_HIGH_START	52
#define MMIO_SPTE_GEN_HIGH_END		62

#define MMIO_SPTE_GEN_LOW_MASK		GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
						    MMIO_SPTE_GEN_LOW_START)
#define MMIO_SPTE_GEN_HIGH_MASK		GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
						    MMIO_SPTE_GEN_HIGH_START)
static_assert(!(SPTE_MMU_PRESENT_MASK &
		(MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK)));

/*
 * The SPTE MMIO mask must NOT overlap the MMIO generation bits or the
 * MMU-present bit.  The generation obviously co-exists with the magic MMIO
 * mask/value, and MMIO SPTEs are considered !MMU-present.
 *
 * The SPTE MMIO mask is allowed to use hardware "present" bits (i.e. all EPT
 * RWX bits), all physical address bits (legal PA bits are used for "fast" MMIO
 * and so they're off-limits for generation; additional checks ensure the mask
 * doesn't overlap legal PA bits), and bit 63 (carved out for future usage).
 */
#define SPTE_MMIO_ALLOWED_MASK (BIT_ULL(63) | GENMASK_ULL(51, 12) | GENMASK_ULL(2, 0))
static_assert(!(SPTE_MMIO_ALLOWED_MASK &
		(SPTE_MMU_PRESENT_MASK | MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK)));

#define MMIO_SPTE_GEN_LOW_BITS		(MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1)
#define MMIO_SPTE_GEN_HIGH_BITS		(MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)

/* remember to adjust the comment above as well if you change these */
static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);

#define MMIO_SPTE_GEN_LOW_SHIFT		(MMIO_SPTE_GEN_LOW_START - 0)
#define MMIO_SPTE_GEN_HIGH_SHIFT	(MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)

#define MMIO_SPTE_GEN_MASK		GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0)

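/*
 * Illustrative sketch (an assumption for exposition, not the kernel's code):
 * the inverse of get_mmio_spte_generation() below.  A 19-bit generation is
 * split so that its bits 0-7 land in SPTE bits 3-10 and its bits 8-18 land
 * in SPTE bits 52-62, matching the layout documented above.
 */
static inline u64 example_mmio_spte_gen_bits(u64 gen)
{
	u64 spte = 0;

	spte |= (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
	spte |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
	return spte;
}
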
/*
 * The non-present SPTE value needs to set bit 63 for TDX, in order to suppress
 * #VE and get EPT violations on non-present PTEs.  The same value can also be
 * used without TDX, for both VMX and SVM:
 *
 * For SVM NPT, for a non-present spte (bit 0 = 0), the other bits are ignored.
 * For VMX EPT, bit 63 is ignored if #VE is disabled (EPT_VIOLATION_VE=0);
 *	bit 63 is the #VE suppress bit if #VE is enabled (EPT_VIOLATION_VE=1).
 */
#ifdef CONFIG_X86_64
#define SHADOW_NONPRESENT_VALUE	BIT_ULL(63)
static_assert(!(SHADOW_NONPRESENT_VALUE & SPTE_MMU_PRESENT_MASK));
#else
#define SHADOW_NONPRESENT_VALUE	0ULL
#endif

extern u64 __read_mostly shadow_host_writable_mask;
extern u64 __read_mostly shadow_mmu_writable_mask;
extern u64 __read_mostly shadow_nx_mask;
extern u64 __read_mostly shadow_x_mask; /* mutually exclusive with nx_mask */
extern u64 __read_mostly shadow_user_mask;
extern u64 __read_mostly shadow_accessed_mask;
extern u64 __read_mostly shadow_dirty_mask;
extern u64 __read_mostly shadow_mmio_value;
extern u64 __read_mostly shadow_mmio_mask;
extern u64 __read_mostly shadow_mmio_access_mask;
extern u64 __read_mostly shadow_present_mask;
extern u64 __read_mostly shadow_memtype_mask;
extern u64 __read_mostly shadow_me_value;
extern u64 __read_mostly shadow_me_mask;

/*
 * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED;
 * shadow_acc_track_mask is the set of bits to be cleared in non-accessed
 * pages.
 */
extern u64 __read_mostly shadow_acc_track_mask;

/*
 * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
 * to guard against L1TF attacks.
 */
extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;

/*
 * The number of high-order 1 bits to use in the mask above.
 */
#define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5

/*
 * If a thread running without exclusive control of the MMU lock must perform a
 * multi-part operation on an SPTE, it can set the SPTE to FROZEN_SPTE as a
 * non-present intermediate value.  Other threads that encounter this value
 * should not modify the SPTE.
 *
 * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
 * both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create an
 * L1TF vulnerability.
 *
 * Only used by the TDP MMU.
 */
#define FROZEN_SPTE	(SHADOW_NONPRESENT_VALUE | 0x5a0ULL)

/* Frozen SPTEs must not be misconstrued as shadow-present PTEs. */
static_assert(!(FROZEN_SPTE & SPTE_MMU_PRESENT_MASK));

static inline bool is_frozen_spte(u64 spte)
{
	return spte == FROZEN_SPTE;
}
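
/*
 * Illustrative sketch (an assumption, heavily simplified from how the TDP MMU
 * actually uses FROZEN_SPTE in tdp_mmu.c): freeze an SPTE with an atomic
 * exchange before a multi-part update.  Concurrent walkers that observe
 * is_frozen_spte() must leave the SPTE alone (e.g. retry) until the final
 * value is installed.
 */
static inline u64 example_freeze_spte(u64 *sptep)
{
	/* Returns the old SPTE; the caller later installs the final value. */
	return xchg(sptep, FROZEN_SPTE);
}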

/* Get an SPTE's index into its parent's page table (and the spt array). */
static inline int spte_index(u64 *sptep)
{
	return ((unsigned long)sptep / sizeof(*sptep)) & (SPTE_ENT_PER_PAGE - 1);
}
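
/*
 * Worked example (illustrative): a page table page holds SPTE_ENT_PER_PAGE =
 * 512 eight-byte entries, so an sptep at byte offset 0x88 within its page
 * yields index 0x88 / 8 = 17.
 */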

/*
 * In some cases, we need to preserve the GFN of a non-present or reserved
 * SPTE when we usurp the upper five bits of the physical address space to
 * defend against L1TF, e.g. for MMIO SPTEs.  To preserve the GFN, we'll
 * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask
 * left into the reserved bits, i.e. the GFN in the SPTE will be split into
 * high and low parts.  This mask covers the lower bits of the GFN.
 */
extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;

static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
{
	struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT);

	return (struct kvm_mmu_page *)page_private(page);
}

static inline struct kvm_mmu_page *spte_to_child_sp(u64 spte)
{
	return to_shadow_page(spte & SPTE_BASE_ADDR_MASK);
}

static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
{
	return to_shadow_page(__pa(sptep));
}

static inline struct kvm_mmu_page *root_to_sp(hpa_t root)
{
	if (kvm_mmu_is_dummy_root(root))
		return NULL;

	/*
	 * The "root" may be a special root, e.g. a PAE entry, treat it as a
	 * SPTE to ensure any non-PA bits are dropped.
	 */
	return spte_to_child_sp(root);
}

static inline bool is_mmio_spte(struct kvm *kvm, u64 spte)
{
	return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value &&
	       likely(enable_mmio_caching);
}

static inline bool is_shadow_present_pte(u64 pte)
{
	return !!(pte & SPTE_MMU_PRESENT_MASK);
}

static inline bool is_ept_ve_possible(u64 spte)
{
	return (shadow_present_mask & VMX_EPT_SUPPRESS_VE_BIT) &&
	       !(spte & VMX_EPT_SUPPRESS_VE_BIT) &&
	       (spte & VMX_EPT_RWX_MASK) != VMX_EPT_MISCONFIG_WX_VALUE;
}

/*
 * Returns true if A/D bits are supported in hardware and are enabled by KVM.
 * When enabled, KVM uses A/D bits for all non-nested MMUs.  Because L1 can
 * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the
 * scenario where KVM is using A/D bits for L1, but not L2.
 */
static inline bool kvm_ad_enabled(void)
{
	return !!shadow_accessed_mask;
}

static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
{
	return sp->role.ad_disabled;
}

static inline bool spte_ad_enabled(u64 spte)
{
	KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
	return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED;
}

static inline bool spte_ad_need_write_protect(u64 spte)
{
	KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
	/*
	 * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED is '0',
	 * and non-TDP SPTEs will never set these bits.  Optimize for 64-bit
	 * TDP and do the A/D type check unconditionally.
	 */
	return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED;
}

static inline u64 spte_shadow_accessed_mask(u64 spte)
{
	KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
	return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
}

static inline u64 spte_shadow_dirty_mask(u64 spte)
{
	KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
	return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
}

static inline bool is_access_track_spte(u64 spte)
{
	return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
}

static inline bool is_large_pte(u64 pte)
{
	return pte & PT_PAGE_SIZE_MASK;
}

static inline bool is_last_spte(u64 pte, int level)
{
	return (level == PG_LEVEL_4K) || is_large_pte(pte);
}

static inline bool is_executable_pte(u64 spte)
{
	return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
}

static inline kvm_pfn_t spte_to_pfn(u64 pte)
{
	return (pte & SPTE_BASE_ADDR_MASK) >> PAGE_SHIFT;
}

static inline bool is_accessed_spte(u64 spte)
{
	u64 accessed_mask = spte_shadow_accessed_mask(spte);

	return accessed_mask ? spte & accessed_mask
			     : !is_access_track_spte(spte);
}

static inline bool is_dirty_spte(u64 spte)
{
	u64 dirty_mask = spte_shadow_dirty_mask(spte);

	return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
}

static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte,
				int level)
{
	int bit7 = (pte >> 7) & 1;

	return rsvd_check->rsvd_bits_mask[bit7][level-1];
}

static inline bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check,
				      u64 pte, int level)
{
	return pte & get_rsvd_bits(rsvd_check, pte, level);
}

static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check,
				   u64 pte)
{
	return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f);
}

static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
					 u64 spte, int level)
{
	return __is_bad_mt_xwr(rsvd_check, spte) ||
	       __is_rsvd_bits_set(rsvd_check, spte, level);
}

/*
 * A shadow-present leaf SPTE may be non-writable for 4 possible reasons:
 *
 *  1. To intercept writes for dirty logging.  KVM write-protects huge pages
 *     so that they can be split down into the dirty logging granularity
 *     (4KiB) whenever the guest writes to them.  KVM also write-protects
 *     4KiB pages so that writes can be recorded in the dirty log (e.g. if
 *     not using PML).  SPTEs are write-protected for dirty logging during
 *     the VM-ioctls that enable dirty logging.
 *
 *  2. To intercept writes to guest page tables that KVM is shadowing.  When a
 *     guest writes to its page table the corresponding shadow page table will
 *     be marked "unsync".  That way KVM knows which shadow page tables need to
 *     be updated on the next TLB flush, INVLPG, etc. and which do not.
 *
 *  3. To prevent guest writes to read-only memory, such as for memory in a
 *     read-only memslot or guest memory backed by a read-only VMA.  Writes to
 *     such pages are disallowed entirely.
 *
 *  4. To emulate the Accessed bit for SPTEs without A/D bits.  Note, in this
 *     case, the SPTE is access-protected, not just write-protected!
 *
 * For cases #1 and #4, KVM can safely make such SPTEs writable without taking
 * mmu_lock as capturing the Accessed/Dirty state doesn't require taking it.
 * To differentiate #1 and #4 from #2 and #3, KVM uses two software-only bits
 * in the SPTE:
 *
 *  shadow_mmu_writable_mask, aka MMU-writable -
 *    Cleared on SPTEs that KVM is currently write-protecting for shadow paging
 *    purposes (case 2 above).
 *
 *  shadow_host_writable_mask, aka Host-writable -
 *    Cleared on SPTEs that are not host-writable (case 3 above).
 *
 * Note, not all possible combinations of PT_WRITABLE_MASK,
 * shadow_mmu_writable_mask, and shadow_host_writable_mask are valid.  A given
 * SPTE can be in only one of the following states, which map to the
 * aforementioned cases:
 *
 *  shadow_host_writable_mask | shadow_mmu_writable_mask | PT_WRITABLE_MASK
 *  ------------------------- | ------------------------ | ----------------
 *                          1 |                        1 | 1    (writable)
 *                          1 |                        1 | 0    (case 1 or 4)
 *                          1 |                        0 | 0    (case 2)
 *                          0 |                        0 | 0    (case 3)
 *
 * The valid combinations of these bits are checked by
 * check_spte_writable_invariants() whenever an SPTE is modified.
 *
 * Clearing the MMU-writable bit is always done under the MMU lock and always
 * accompanied by a TLB flush before dropping the lock to avoid corrupting the
 * shadow page tables between vCPUs.  Write-protecting an SPTE for dirty logging
 * (which does not clear the MMU-writable bit) does not flush TLBs before
 * dropping the lock, as it only needs to synchronize guest writes with the
 * dirty bitmap.  Similarly, making the SPTE inaccessible (and non-writable) for
 * access-tracking via the clear_young() MMU notifier also does not flush TLBs.
 *
 * So, there is the problem: clearing the MMU-writable bit can encounter a
 * write-protected SPTE while CPUs still have writable mappings for that SPTE
 * cached in their TLB.  To address this, KVM always flushes TLBs when
 * write-protecting SPTEs if the MMU-writable bit is set on the old SPTE.
 *
 * The Host-writable bit is not modified on present SPTEs, it is only set or
 * cleared when an SPTE is first faulted in from non-present and then remains
 * immutable.
 */
static inline bool is_writable_pte(unsigned long pte)
{
	return pte & PT_WRITABLE_MASK;
}

/* Note: spte must be a shadow-present leaf SPTE. */
static inline void check_spte_writable_invariants(u64 spte)
{
	if (spte & shadow_mmu_writable_mask)
		WARN_ONCE(!(spte & shadow_host_writable_mask),
			  KBUILD_MODNAME ": MMU-writable SPTE is not Host-writable: %llx",
			  spte);
	else
		WARN_ONCE(is_writable_pte(spte),
			  KBUILD_MODNAME ": Writable SPTE is not MMU-writable: %llx", spte);
}

static inline bool is_mmu_writable_spte(u64 spte)
{
	return spte & shadow_mmu_writable_mask;
}

static inline u64 get_mmio_spte_generation(u64 spte)
{
	u64 gen;

	gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_SHIFT;
	gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_SHIFT;
	return gen;
}

bool spte_has_volatile_bits(u64 spte);

bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
	       const struct kvm_memory_slot *slot,
	       unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
	       u64 old_spte, bool prefetch, bool can_unsync,
	       bool host_writable, u64 *new_spte);
u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte,
			      union kvm_mmu_page_role role, int index);
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
u64 mark_spte_for_access_track(u64 spte);

/* Restore an acc-track PTE back to a regular PTE */
static inline u64 restore_acc_track_spte(u64 spte)
{
	u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
			 & SHADOW_ACC_TRACK_SAVED_BITS_MASK;

	spte &= ~shadow_acc_track_mask;
	spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
		  SHADOW_ACC_TRACK_SAVED_BITS_SHIFT);
	spte |= saved_bits;

	return spte;
}
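
/*
 * Illustrative sketch (an assumption; the real mark_spte_for_access_track(),
 * declared above, is implemented in spte.c): the inverse of
 * restore_acc_track_spte().  The R/X bits are stashed in the high ignored
 * bits and the shadow_acc_track_mask bits are cleared, so the next guest
 * access faults and can be recorded as an access.
 */
static inline u64 example_mark_spte_for_access_track(u64 spte)
{
	spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) <<
		SHADOW_ACC_TRACK_SAVED_BITS_SHIFT;
	spte &= ~shadow_acc_track_mask;

	return spte;
}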

void __init kvm_mmu_spte_module_init(void);
void kvm_mmu_reset_all_pte_masks(void);

#endif