// SPDX-License-Identifier: GPL-2.0-only

#ifndef KVM_X86_MMU_SPTE_H
#define KVM_X86_MMU_SPTE_H

#include "mmu_internal.h"

/*
 * An MMU-present SPTE is backed by actual memory and may or may not be present
 * in hardware.  E.g. MMIO SPTEs are not considered present.  Use bit 11, as it
 * is ignored by all flavors of SPTEs and checking a low bit often generates
 * better code than for a high bit, e.g. 56+.  MMU-present checks are pervasive
 * enough that the improved code generation is noticeable in KVM's footprint.
 */
#define SPTE_MMU_PRESENT_MASK		BIT_ULL(11)

/*
 * TDP SPTEs (more specifically, EPT SPTEs) may not have A/D bits, and may also
 * be restricted to using write-protection (for L2 when CPU dirty logging, i.e.
 * PML, is enabled).  Use bits 52 and 53 to hold the type of A/D tracking that
 * must be employed for a given TDP SPTE.
 *
 * Note, the "enabled" mask must be '0', as bits 62:52 are _reserved_ for PAE
 * paging, including NPT PAE.  This scheme works because legacy shadow paging
 * is guaranteed to have A/D bits and write-protection is forced only for
 * TDP with CPU dirty logging (PML).  If NPT ever gains PML-like support, it
 * must be restricted to 64-bit KVM.
 */
#define SPTE_TDP_AD_SHIFT		52
#define SPTE_TDP_AD_MASK		(3ULL << SPTE_TDP_AD_SHIFT)
#define SPTE_TDP_AD_ENABLED_MASK	(0ULL << SPTE_TDP_AD_SHIFT)
#define SPTE_TDP_AD_DISABLED_MASK	(1ULL << SPTE_TDP_AD_SHIFT)
#define SPTE_TDP_AD_WRPROT_ONLY_MASK	(2ULL << SPTE_TDP_AD_SHIFT)
static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
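
/*
 * For reference, how the three A/D types decode, per spte_ad_enabled() and
 * spte_ad_need_write_protect() below:
 *
 *   A/D type         | spte_ad_enabled() | spte_ad_need_write_protect()
 *   ---------------- | ----------------- | ----------------------------
 *   ENABLED     (0)  | true              | false
 *   DISABLED    (1)  | false             | true
 *   WRPROT_ONLY (2)  | true              | true
 */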

#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
#define SPTE_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
#else
#define SPTE_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
#endif

#define SPTE_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
			| shadow_x_mask | shadow_nx_mask | shadow_me_mask)

#define ACC_EXEC_MASK	1
#define ACC_WRITE_MASK	PT_WRITABLE_MASK
#define ACC_USER_MASK	PT_USER_MASK
#define ACC_ALL		(ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)

/* The mask for the R/X bits in EPT PTEs */
#define SPTE_EPT_READABLE_MASK		0x1ull
#define SPTE_EPT_EXECUTABLE_MASK	0x4ull

#define SPTE_LEVEL_BITS			9
#define SPTE_LEVEL_SHIFT(level)		__PT_LEVEL_SHIFT(level, SPTE_LEVEL_BITS)
#define SPTE_INDEX(address, level)	__PT_INDEX(address, level, SPTE_LEVEL_BITS)
#define SPTE_ENT_PER_PAGE		__PT_ENT_PER_PAGE(SPTE_LEVEL_BITS)
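
/*
 * Worked example, assuming 4KiB pages (PAGE_SHIFT == 12): with 9 bits per
 * level, SPTE_LEVEL_SHIFT(1) == 12 and SPTE_LEVEL_SHIFT(2) == 21, so
 * SPTE_INDEX(addr, 2) extracts bits 29:21 of the address, and
 * SPTE_ENT_PER_PAGE == 512, i.e. one page worth of 8-byte SPTEs per table.
 */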

/*
 * The mask/shift to use for saving the original R/X bits when marking the PTE
 * as not-present for access tracking purposes.  We do not save the W bit as the
 * PTEs being access tracked also need to be dirty tracked, so the W bit will be
 * restored only when a write is attempted to the page.  This mask obviously
 * must not overlap the A/D type mask.
 */
#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (SPTE_EPT_READABLE_MASK | \
					  SPTE_EPT_EXECUTABLE_MASK)
#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54
#define SHADOW_ACC_TRACK_SAVED_MASK	(SHADOW_ACC_TRACK_SAVED_BITS_MASK << \
					 SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
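
/*
 * The save side lives in spte.c's mark_spte_for_access_track(); a simplified
 * sketch of it (the inverse of restore_acc_track_spte() below) is:
 *
 *	spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) <<
 *		SHADOW_ACC_TRACK_SAVED_BITS_SHIFT;
 *	spte &= ~shadow_acc_track_mask;
 *
 * i.e. the R/X bits are stashed in the high ignored bits and the "present"
 * bits are cleared so that hardware treats the PTE as not-present.
 */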

/*
 * {DEFAULT,EPT}_SPTE_{HOST,MMU}_WRITABLE are used to keep track of why a given
 * SPTE is write-protected.  See is_writable_pte() for details.
 */

/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
#define DEFAULT_SPTE_HOST_WRITABLE	BIT_ULL(9)
#define DEFAULT_SPTE_MMU_WRITABLE	BIT_ULL(10)

/*
 * Low ignored bits are at a premium for EPT, use high ignored bits, taking care
 * to not overlap the A/D type mask or the saved access bits of access-tracked
 * SPTEs when A/D bits are disabled.
 */
#define EPT_SPTE_HOST_WRITABLE		BIT_ULL(57)
#define EPT_SPTE_MMU_WRITABLE		BIT_ULL(58)

static_assert(!(EPT_SPTE_HOST_WRITABLE & SPTE_TDP_AD_MASK));
static_assert(!(EPT_SPTE_MMU_WRITABLE & SPTE_TDP_AD_MASK));
static_assert(!(EPT_SPTE_HOST_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));

/* Defined only to keep the above static asserts readable. */
#undef SHADOW_ACC_TRACK_SAVED_MASK

/*
 * Due to limited space in PTEs, the MMIO generation is a 19-bit subset of
 * the memslots generation and is derived as follows:
 *
 * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10
 * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62
 *
 * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
 * the MMIO generation number, as doing so would require stealing a bit from
 * the "real" generation number and thus effectively halve the maximum number
 * of MMIO generations that can be handled before encountering a wrap (which
 * requires a full MMU zap).  The flag is instead explicitly queried when
 * checking for MMIO spte cache hits.
 */

#define MMIO_SPTE_GEN_LOW_START		3
#define MMIO_SPTE_GEN_LOW_END		10

#define MMIO_SPTE_GEN_HIGH_START	52
#define MMIO_SPTE_GEN_HIGH_END		62

#define MMIO_SPTE_GEN_LOW_MASK		GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
						    MMIO_SPTE_GEN_LOW_START)
#define MMIO_SPTE_GEN_HIGH_MASK		GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
						    MMIO_SPTE_GEN_HIGH_START)
static_assert(!(SPTE_MMU_PRESENT_MASK &
		(MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK)));

/*
 * The SPTE MMIO mask must NOT overlap the MMIO generation bits or the
 * MMU-present bit.  The generation obviously co-exists with the magic MMIO
 * mask/value, and MMIO SPTEs are considered !MMU-present.
 *
 * The SPTE MMIO mask is allowed to use hardware "present" bits (i.e. all EPT
 * RWX bits), all physical address bits (legal PA bits are used for "fast" MMIO
 * and so they're off-limits for generation; additional checks ensure the mask
 * doesn't overlap legal PA bits), and bit 63 (carved out for future usage).
 */
#define SPTE_MMIO_ALLOWED_MASK (BIT_ULL(63) | GENMASK_ULL(51, 12) | GENMASK_ULL(2, 0))
static_assert(!(SPTE_MMIO_ALLOWED_MASK &
		(SPTE_MMU_PRESENT_MASK | MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK)));

#define MMIO_SPTE_GEN_LOW_BITS		(MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1)
#define MMIO_SPTE_GEN_HIGH_BITS		(MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)

/* remember to adjust the comment above as well if you change these */
static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);

#define MMIO_SPTE_GEN_LOW_SHIFT		(MMIO_SPTE_GEN_LOW_START - 0)
#define MMIO_SPTE_GEN_HIGH_SHIFT	(MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)

#define MMIO_SPTE_GEN_MASK		GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0)
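
/*
 * The generation is folded into an MMIO SPTE in spte.c; a sketch of that
 * encode helper (the inverse of get_mmio_spte_generation() below) is:
 *
 *	u64 mask;
 *
 *	WARN_ON_ONCE(gen & ~MMIO_SPTE_GEN_MASK);
 *
 *	mask  = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
 *	mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
 *
 * i.e. the low 8 generation bits land in SPTE bits 10:3 and the remaining
 * 11 bits land in SPTE bits 62:52, per the layout described above.
 */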

extern u64 __read_mostly shadow_host_writable_mask;
extern u64 __read_mostly shadow_mmu_writable_mask;
extern u64 __read_mostly shadow_nx_mask;
extern u64 __read_mostly shadow_x_mask; /* mutually exclusive with nx_mask */
extern u64 __read_mostly shadow_user_mask;
extern u64 __read_mostly shadow_accessed_mask;
extern u64 __read_mostly shadow_dirty_mask;
extern u64 __read_mostly shadow_mmio_value;
extern u64 __read_mostly shadow_mmio_mask;
extern u64 __read_mostly shadow_mmio_access_mask;
extern u64 __read_mostly shadow_present_mask;
extern u64 __read_mostly shadow_memtype_mask;
extern u64 __read_mostly shadow_me_value;
extern u64 __read_mostly shadow_me_mask;

/*
 * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED_MASK;
 * shadow_acc_track_mask is the set of bits to be cleared in non-accessed
 * pages.
 */
extern u64 __read_mostly shadow_acc_track_mask;

/*
 * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
 * to guard against L1TF attacks.
 */
extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;

/*
 * The number of high-order 1 bits to use in the mask above.
 */
#define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5

/*
 * If a thread running without exclusive control of the MMU lock must perform a
 * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a
 * non-present intermediate value.  Other threads which encounter this value
 * should not modify the SPTE.
 *
 * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
 * both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create an
 * L1TF vulnerability.  Use only low bits to avoid 64-bit immediates.
 *
 * Only used by the TDP MMU.
 */
#define REMOVED_SPTE	0x5a0ULL

/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));

static inline bool is_removed_spte(u64 spte)
{
	return spte == REMOVED_SPTE;
}
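
/*
 * Sketch of the typical TDP MMU usage: a thread atomically installs
 * REMOVED_SPTE, e.g. via try_cmpxchg64(sptep, &old_spte, REMOVED_SPTE),
 * performs the multi-part update (flushing TLBs, freeing child page tables,
 * etc.), and only then writes the final value; concurrent walkers that
 * observe is_removed_spte() back off and retry.
 */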

/* Get an SPTE's index into its parent's page table (and the spt array). */
static inline int spte_index(u64 *sptep)
{
	return ((unsigned long)sptep / sizeof(*sptep)) & (SPTE_ENT_PER_PAGE - 1);
}
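
/*
 * E.g. with 512 SPTEs per table: for an sptep at page offset 0x38, the
 * computation yields 0x38 / 8 == 7, i.e. the SPTE is entry 7 of its table;
 * the higher address bits are masked off by (SPTE_ENT_PER_PAGE - 1).
 */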

/*
 * In some cases, we need to preserve the GFN of a non-present or reserved
 * SPTE when we usurp the upper five bits of the physical address space to
 * defend against L1TF, e.g. for MMIO SPTEs.  To preserve the GFN, we'll
 * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask
 * left into the reserved bits, i.e. the GFN in the SPTE will be split into
 * high and low parts.  This mask covers the lower bits of the GFN.
 */
extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;

static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
{
	struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT);

	return (struct kvm_mmu_page *)page_private(page);
}

static inline struct kvm_mmu_page *spte_to_child_sp(u64 spte)
{
	return to_shadow_page(spte & SPTE_BASE_ADDR_MASK);
}

static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
{
	return to_shadow_page(__pa(sptep));
}

static inline bool is_mmio_spte(u64 spte)
{
	return (spte & shadow_mmio_mask) == shadow_mmio_value &&
	       likely(enable_mmio_caching);
}

static inline bool is_shadow_present_pte(u64 pte)
{
	return !!(pte & SPTE_MMU_PRESENT_MASK);
}

/*
 * Returns true if A/D bits are supported in hardware and are enabled by KVM.
 * When enabled, KVM uses A/D bits for all non-nested MMUs.  Because L1 can
 * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the
 * scenario where KVM is using A/D bits for L1, but not L2.
 */
static inline bool kvm_ad_enabled(void)
{
	return !!shadow_accessed_mask;
}

static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
{
	return sp->role.ad_disabled;
}

static inline bool spte_ad_enabled(u64 spte)
{
	MMU_WARN_ON(!is_shadow_present_pte(spte));
	return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED_MASK;
}

static inline bool spte_ad_need_write_protect(u64 spte)
{
	MMU_WARN_ON(!is_shadow_present_pte(spte));
	/*
	 * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED_MASK is '0',
	 * and non-TDP SPTEs will never set these bits.  Optimize for 64-bit
	 * TDP and do the A/D type check unconditionally.
	 */
	return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED_MASK;
}

static inline u64 spte_shadow_accessed_mask(u64 spte)
{
	MMU_WARN_ON(!is_shadow_present_pte(spte));
	return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
}

static inline u64 spte_shadow_dirty_mask(u64 spte)
{
	MMU_WARN_ON(!is_shadow_present_pte(spte));
	return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
}

static inline bool is_access_track_spte(u64 spte)
{
	return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
}

static inline bool is_large_pte(u64 pte)
{
	return pte & PT_PAGE_SIZE_MASK;
}

static inline bool is_last_spte(u64 pte, int level)
{
	return (level == PG_LEVEL_4K) || is_large_pte(pte);
}

static inline bool is_executable_pte(u64 spte)
{
	return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
}

static inline kvm_pfn_t spte_to_pfn(u64 pte)
{
	return (pte & SPTE_BASE_ADDR_MASK) >> PAGE_SHIFT;
}

static inline bool is_accessed_spte(u64 spte)
{
	u64 accessed_mask = spte_shadow_accessed_mask(spte);

	return accessed_mask ? spte & accessed_mask
			     : !is_access_track_spte(spte);
}

static inline bool is_dirty_spte(u64 spte)
{
	u64 dirty_mask = spte_shadow_dirty_mask(spte);

	return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
}

static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte,
				int level)
{
	int bit7 = (pte >> 7) & 1;

	return rsvd_check->rsvd_bits_mask[bit7][level-1];
}

static inline bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check,
				      u64 pte, int level)
{
	return pte & get_rsvd_bits(rsvd_check, pte, level);
}

static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check,
				   u64 pte)
{
	return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f);
}

static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
					 u64 spte, int level)
{
	return __is_bad_mt_xwr(rsvd_check, spte) ||
	       __is_rsvd_bits_set(rsvd_check, spte, level);
}

/*
 * A shadow-present leaf SPTE may be non-writable for 4 possible reasons:
 *
 *  1. To intercept writes for dirty logging.  KVM write-protects huge pages
 *     so that they can be split down into the dirty logging granularity
 *     (4KiB) whenever the guest writes to them.  KVM also write-protects
 *     4KiB pages so that writes can be recorded in the dirty log (e.g. if
 *     not using PML).  SPTEs are write-protected for dirty logging during
 *     the VM-ioctls that enable dirty logging.
 *
 *  2. To intercept writes to guest page tables that KVM is shadowing.  When a
 *     guest writes to its page table the corresponding shadow page table will
 *     be marked "unsync".  That way KVM knows which shadow page tables need to
 *     be updated on the next TLB flush, INVLPG, etc. and which do not.
 *
 *  3. To prevent guest writes to read-only memory, such as for memory in a
 *     read-only memslot or guest memory backed by a read-only VMA.  Writes to
 *     such pages are disallowed entirely.
 *
 *  4. To emulate the Accessed bit for SPTEs without A/D bits.  Note, in this
 *     case, the SPTE is access-protected, not just write-protected!
 *
 * For cases #1 and #4, KVM can safely make such SPTEs writable without taking
 * mmu_lock as capturing the Accessed/Dirty state doesn't require taking it.
 * To differentiate #1 and #4 from #2 and #3, KVM uses two software-only bits
 * in the SPTE:
 *
 *  shadow_mmu_writable_mask, aka MMU-writable -
 *    Cleared on SPTEs that KVM is currently write-protecting for shadow paging
 *    purposes (case 2 above).
 *
 *  shadow_host_writable_mask, aka Host-writable -
 *    Cleared on SPTEs that are not host-writable (case 3 above).
 *
 * Note, not all possible combinations of PT_WRITABLE_MASK,
 * shadow_mmu_writable_mask, and shadow_host_writable_mask are valid.  A given
 * SPTE can be in only one of the following states, which map to the
 * aforementioned cases:
 *
 *   shadow_host_writable_mask | shadow_mmu_writable_mask | PT_WRITABLE_MASK
 *   ------------------------- | ------------------------ | ----------------
 *   1                         | 1                        | 1    (writable)
 *   1                         | 1                        | 0    (case 1)
 *   1                         | 0                        | 0    (case 2)
 *   0                         | 0                        | 0    (case 3)
 *
 * The valid combinations of these bits are checked by
 * check_spte_writable_invariants() whenever an SPTE is modified.
 *
 * Clearing the MMU-writable bit is always done under the MMU lock and always
 * accompanied by a TLB flush before dropping the lock to avoid corrupting the
 * shadow page tables between vCPUs.  Write-protecting an SPTE for dirty logging
 * (which does not clear the MMU-writable bit) does not flush TLBs before
 * dropping the lock, as it only needs to synchronize guest writes with the
 * dirty bitmap.  Similarly, making the SPTE inaccessible (and non-writable) for
 * access-tracking via the clear_young() MMU notifier also does not flush TLBs.
 *
 * So, there is the problem: clearing the MMU-writable bit can encounter a
 * write-protected SPTE while CPUs still have writable mappings for that SPTE
 * cached in their TLB.  To address this, KVM always flushes TLBs when
 * write-protecting SPTEs if the MMU-writable bit is set on the old SPTE.
 *
 * The Host-writable bit is not modified on present SPTEs, it is only set or
 * cleared when an SPTE is first faulted in from non-present and then remains
 * immutable.
 */
static inline bool is_writable_pte(unsigned long pte)
{
	return pte & PT_WRITABLE_MASK;
}

/* Note: spte must be a shadow-present leaf SPTE. */
static inline void check_spte_writable_invariants(u64 spte)
{
	if (spte & shadow_mmu_writable_mask)
		WARN_ONCE(!(spte & shadow_host_writable_mask),
			  "kvm: MMU-writable SPTE is not Host-writable: %llx",
			  spte);
	else
		WARN_ONCE(is_writable_pte(spte),
			  "kvm: Writable SPTE is not MMU-writable: %llx", spte);
}

static inline bool is_mmu_writable_spte(u64 spte)
{
	return spte & shadow_mmu_writable_mask;
}

static inline u64 get_mmio_spte_generation(u64 spte)
{
	u64 gen;

	gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_SHIFT;
	gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_SHIFT;
	return gen;
}

bool spte_has_volatile_bits(u64 spte);

bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
	       const struct kvm_memory_slot *slot,
	       unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
	       u64 old_spte, bool prefetch, bool can_unsync,
	       bool host_writable, u64 *new_spte);
u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte,
			      union kvm_mmu_page_role role, int index);
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
u64 mark_spte_for_access_track(u64 spte);

/* Restore an acc-track PTE back to a regular PTE */
static inline u64 restore_acc_track_spte(u64 spte)
{
	u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
			 & SHADOW_ACC_TRACK_SAVED_BITS_MASK;

	spte &= ~shadow_acc_track_mask;
	spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
		  SHADOW_ACC_TRACK_SAVED_BITS_SHIFT);
	spte |= saved_bits;

	return spte;
}

u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn);

void __init kvm_mmu_spte_module_init(void);
void kvm_mmu_reset_all_pte_masks(void);

#endif