// SPDX-License-Identifier: GPL-2.0
/*
 * This code is used on x86_64 to create page table identity mappings on
 * demand by building up a new set of page tables (or appending to the
 * existing ones), and then switching over to them when ready.
 *
 * Copyright (C) 2015-2016 Yinghai Lu
 * Copyright (C) 2016 Kees Cook
 */

/* No PAGE_TABLE_ISOLATION support needed either: */
#undef CONFIG_PAGE_TABLE_ISOLATION

#include "error.h"
#include "misc.h"

/* These actually do the work of building the kernel identity maps. */
#include <linux/pgtable.h>
#include <asm/cmpxchg.h>
#include <asm/trap_pf.h>
#include <asm/trapnr.h>
#include <asm/init.h>
/* Use the static base for this part of the boot process */
#undef __PAGE_OFFSET
#define __PAGE_OFFSET __PAGE_OFFSET_BASE
#include "../../mm/ident_map.c"
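/*
 * Note: the .c file is included directly so that this pre-boot stage can
 * reuse kernel_ident_mapping_init() from the main kernel. __PAGE_OFFSET is
 * redefined to the static base above so the shared code works before any
 * randomized page offset has been set up.
 */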

#define _SETUP
#include <asm/setup.h>	/* For COMMAND_LINE_SIZE */
#undef _SETUP

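/* Implemented in the decompressor's cmdline.c: returns the address of the kernel command line from boot_params. */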
extern unsigned long get_cmd_line_ptr(void);

/* Used by PAGE_KERN* macros: */
pteval_t __default_kernel_pte_mask __read_mostly = ~0;

/* Used to track our page table allocation area. */
struct alloc_pgt_data {
	unsigned char *pgt_buf;
	unsigned long pgt_buf_size;
	unsigned long pgt_buf_offset;
};

/*
 * Allocates space for a page table entry, using struct alloc_pgt_data
 * above. Besides the local callers, this is used as the allocation
 * callback in mapping_info below.
 */
static void *alloc_pgt_page(void *context)
{
	struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context;
	unsigned char *entry;

	/* Validate there is space available for a new page. */
	if (pages->pgt_buf_offset >= pages->pgt_buf_size) {
		debug_putstr("out of pgt_buf in " __FILE__ "!?\n");
		debug_putaddr(pages->pgt_buf_offset);
		debug_putaddr(pages->pgt_buf_size);
		return NULL;
	}

	/* Consumed more tables than expected? */
	if (pages->pgt_buf_offset == BOOT_PGT_SIZE_WARN) {
		debug_putstr("pgt_buf running low in " __FILE__ "\n");
		debug_putstr("Need to raise BOOT_PGT_SIZE?\n");
		debug_putaddr(pages->pgt_buf_offset);
		debug_putaddr(pages->pgt_buf_size);
	}

	entry = pages->pgt_buf + pages->pgt_buf_offset;
	pages->pgt_buf_offset += PAGE_SIZE;

	return entry;
}

/* Used to track our allocated page tables. */
static struct alloc_pgt_data pgt_data;

/* The physical address of the top-level page table. */
static unsigned long top_level_pgt;

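/*
 * Mask of valid physical address bits. The SME encryption mask is excluded
 * from it in initialize_identity_maps() so that the page table helpers
 * treat the encryption bit as an attribute, not as part of the address.
 */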
phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;

/*
 * Mapping information structure passed to kernel_ident_mapping_init().
 * Due to relocation, pointers must be assigned at run time, not build time.
 */
static struct x86_mapping_info mapping_info;

/*
 * Adds the specified range to the identity mappings.
 */
void kernel_add_identity_map(unsigned long start, unsigned long end)
{
	int ret;

	/* Align boundary to 2M. */
	start = round_down(start, PMD_SIZE);
	end = round_up(end, PMD_SIZE);
	if (start >= end)
		return;

	/* Build the mapping. */
	ret = kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt, start, end);
	if (ret)
		error("Error: kernel_ident_mapping_init() failed\n");
}

/* Locates and clears a region for a new top level page table. */
void initialize_identity_maps(void *rmode)
{
	unsigned long cmdline;
	struct setup_data *sd;

	/* Exclude the encryption mask from __PHYSICAL_MASK */
	physical_mask &= ~sme_me_mask;

	/* Init mapping_info with run-time function/buffer pointers. */
	mapping_info.alloc_pgt_page = alloc_pgt_page;
	mapping_info.context = &pgt_data;
	mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask;
	mapping_info.kernpg_flag = _KERNPG_TABLE;

	/*
	 * This should already be zero, but since calling this function a
	 * second time would rewind the other counters, make sure it is
	 * reset too.
	 */
	pgt_data.pgt_buf_offset = 0;

	/*
	 * If we came here via startup_32(), cr3 will be _pgtable already
	 * and we must append to the existing area instead of entirely
	 * overwriting it.
	 *
	 * With 5-level paging, '_pgtable' is used to allocate the p4d page
	 * table; the top-level page table is allocated separately.
	 *
	 * p4d_offset(top_level_pgt, 0) covers both the 4- and 5-level
	 * cases. On 4-level paging it is equal to 'top_level_pgt'.
	 */
	top_level_pgt = read_cr3_pa();
	if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {
		pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
		pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
		memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
	} else {
		pgt_data.pgt_buf = _pgtable;
		pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
		memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
		top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);
	}

	/*
	 * The new page-table is set up - map the kernel image, boot_params
	 * and the command line. The uncompressed kernel requires boot_params
	 * and the command line to be mapped in the identity mapping. Map them
	 * explicitly here in case the compressed kernel does not touch them,
	 * or does not touch all the pages covering them.
	 */
	kernel_add_identity_map((unsigned long)_head, (unsigned long)_end);
	boot_params = rmode;
	kernel_add_identity_map((unsigned long)boot_params, (unsigned long)(boot_params + 1));
	cmdline = get_cmd_line_ptr();
	kernel_add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE);

	/*
	 * Also map the setup_data entries passed via boot_params in case they
	 * need to be accessed by the uncompressed kernel via the identity
	 * mapping.
	 */
	sd = (struct setup_data *)boot_params->hdr.setup_data;
	while (sd) {
		unsigned long sd_addr = (unsigned long)sd;

		kernel_add_identity_map(sd_addr, sd_addr + sizeof(*sd) + sd->len);
		sd = (struct setup_data *)sd->next;
	}

	sev_prep_identity_maps(top_level_pgt);

	/* Load the new page-table. */
	write_cr3(top_level_pgt);

	/*
	 * Now that the required page table mappings are established and a
	 * GHCB can be used, check for SNP guest/HV feature compatibility.
	 */
	snp_check_features();
}

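/*
 * Splits the 2M mapping covering '__address' into 4K PTE mappings and
 * returns a pointer to the PTE for '__address', or NULL if no page could
 * be allocated for the new PTE table.
 */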
static pte_t *split_large_pmd(struct x86_mapping_info *info,
			      pmd_t *pmdp, unsigned long __address)
{
	unsigned long page_flags;
	unsigned long address;
	pte_t *pte;
	pmd_t pmd;
	int i;

	pte = (pte_t *)info->alloc_pgt_page(info->context);
	if (!pte)
		return NULL;

	address = __address & PMD_MASK;
	/* No large page - clear PSE flag */
	page_flags = info->page_flag & ~_PAGE_PSE;

	/* Populate the PTEs */
	for (i = 0; i < PTRS_PER_PMD; i++) {
		set_pte(&pte[i], __pte(address | page_flags));
		address += PAGE_SIZE;
	}

	/*
	 * Ideally we would clear the large PMD first and do a TLB flush
	 * before writing the new PMD. But the 2M range of the PMD might
	 * contain the code we execute and/or the stack we are on, so we
	 * can't do that. It is safe here anyway, because we are going from
	 * large to small mappings and we are the only user of the
	 * page-table, so there is no chance of a TLB multihit.
	 */
	pmd = __pmd((unsigned long)pte | info->kernpg_flag);
	set_pmd(pmdp, pmd);
	/* Flush TLB to establish the new PMD */
	write_cr3(top_level_pgt);

	return pte + pte_index(__address);
}

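/*
 * Flushes the 4K page at 'address' from the caches. Required before
 * changing the encryption attribute of a page, since cache lines are
 * tagged with the encryption state they were filled under.
 */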
static void clflush_page(unsigned long address)
{
	unsigned int flush_size;
	char *cl, *start, *end;

	/*
	 * Hardcode cl-size to 64 - CPUID can't be used here because that might
	 * cause another #VC exception and the GHCB is not ready to use yet.
	 */
	flush_size = 64;
	start = (char *)(address & PAGE_MASK);
	end = start + PAGE_SIZE;

	/*
	 * First make sure there are no pending writes on the cache-lines to
	 * flush.
	 */
	asm volatile("mfence" : : : "memory");

	for (cl = start; cl != end; cl += flush_size)
		clflush(cl);
}

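/*
 * Sets the flags in 'set' and clears the flags in 'clr' on the PTE mapping
 * 'address', splitting a covering 2M mapping into 4K PTEs first if needed.
 * Returns 0 on success or -ENOMEM if a PTE page could not be allocated.
 */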
static int set_clr_page_flags(struct x86_mapping_info *info,
			      unsigned long address,
			      pteval_t set, pteval_t clr)
{
	pgd_t *pgdp = (pgd_t *)top_level_pgt;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep, pte;

	/*
	 * First make sure there is a PMD mapping for 'address'.
	 * It should already exist, but keep things generic.
	 *
	 * To map the page just read from it and fault it in if there is no
	 * mapping yet. kernel_add_identity_map() can't be called here because
	 * that would unconditionally map the address on PMD level, destroying
	 * any PTE-level mappings that might already exist. Use assembly here
	 * so the access won't be optimized away.
	 */
	asm volatile("mov %[address], %%r9"
		     :: [address] "g" (*(unsigned long *)address)
		     : "r9", "memory");

	/*
	 * The page is mapped with at least PMD size - so skip checks and walk
	 * directly to the PMD.
	 */
	p4dp = p4d_offset(pgdp, address);
	pudp = pud_offset(p4dp, address);
	pmdp = pmd_offset(pudp, address);

	if (pmd_large(*pmdp))
		ptep = split_large_pmd(info, pmdp, address);
	else
		ptep = pte_offset_kernel(pmdp, address);

	if (!ptep)
		return -ENOMEM;

	/*
	 * Changing the encryption attributes of a page requires flushing it
	 * from the caches.
	 */
	if ((set | clr) & _PAGE_ENC) {
		clflush_page(address);

		/*
		 * If the encryption attribute is being cleared, change the
		 * page state to shared in the RMP table.
		 */
		if (clr)
			snp_set_page_shared(__pa(address & PAGE_MASK));
	}

	/* Update PTE */
	pte = *ptep;
	pte = pte_set_flags(pte, set);
	pte = pte_clear_flags(pte, clr);
	set_pte(ptep, pte);

	/*
	 * If the encryption attribute is being set, change the page state to
	 * private in the RMP entry. The page state change must be done after
	 * the PTE is updated.
	 */
	if (set & _PAGE_ENC)
		snp_set_page_private(__pa(address & PAGE_MASK));

	/* Flush TLB after changing encryption attribute */
	write_cr3(top_level_pgt);

	return 0;
}

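/*
 * Wrappers used by the SEV support code to change the encryption or
 * presence attributes of a single 4K page, e.g. when setting up or tearing
 * down the shared GHCB page.
 */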
int set_page_decrypted(unsigned long address)
{
	return set_clr_page_flags(&mapping_info, address, 0, _PAGE_ENC);
}

int set_page_encrypted(unsigned long address)
{
	return set_clr_page_flags(&mapping_info, address, _PAGE_ENC, 0);
}

int set_page_non_present(unsigned long address)
{
	return set_clr_page_flags(&mapping_info, address, 0, _PAGE_PRESENT);
}

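/* Prints fault diagnostics and halts via error(), which does not return. */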
static void do_pf_error(const char *msg, unsigned long error_code,
			unsigned long address, unsigned long ip)
{
	error_putstr(msg);

	error_putstr("\nError Code: ");
	error_puthex(error_code);
	error_putstr("\nCR2: 0x");
	error_puthex(address);
	error_putstr("\nRIP relative to _head: 0x");
	error_puthex(ip - (unsigned long)_head);
	error_putstr("\n");

	error("Stopping.\n");
}

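/*
 * The page-fault handler for this boot stage. A sane fault on a
 * not-yet-mapped address is resolved by identity-mapping the surrounding
 * 2M region on demand; everything else is treated as fatal.
 */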
void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	unsigned long address = native_read_cr2();
	unsigned long end;
	bool ghcb_fault;

	ghcb_fault = sev_es_check_ghcb_fault(address);

	address &= PMD_MASK;
	end = address + PMD_SIZE;

	/*
	 * Check for unexpected error codes. Unexpected are:
	 *	- Faults on present pages
	 *	- User faults
	 *	- Reserved bits set
	 */
	if (error_code & (X86_PF_PROT | X86_PF_USER | X86_PF_RSVD))
		do_pf_error("Unexpected page-fault:", error_code, address, regs->ip);
	else if (ghcb_fault)
		do_pf_error("Page-fault on GHCB page:", error_code, address, regs->ip);

	/*
	 * Error code is sane - now identity map the 2M region around
	 * the faulting address.
	 */
	kernel_add_identity_map(address, end);
}