// SPDX-License-Identifier: GPL-2.0
/*
 * kaslr.c
 *
 * This contains the routines needed to generate a reasonable level of
 * entropy to choose a randomized kernel base address offset in support
 * of Kernel Address Space Layout Randomization (KASLR). Additionally
 * handles walking the physical memory maps (and tracking memory regions
 * to avoid) in order to select a physical memory location that can
 * contain the entire properly aligned running kernel image.
 *
 */

/*
 * isspace() in linux/ctype.h is expected by next_arg() to filter
 * out "space/lf/tab". boot/ctype.h conflicts with linux/ctype.h,
 * since isdigit() is implemented in both of them; hence disable
 * boot/ctype.h here.
 */
#define BOOT_CTYPE_H
21 | ||
22 | /* | |
23 | * _ctype[] in lib/ctype.c is needed by isspace() of linux/ctype.h. | |
24 | * While both lib/ctype.c and lib/cmdline.c will bring EXPORT_SYMBOL | |
25 | * which is meaningless and will cause compiling error in some cases. | |
d52e7d5a | 26 | */ |
f922c4ab | 27 | #define __DISABLE_EXPORTS |

#include "misc.h"
#include "error.h"
#include "../string.h"

#include <generated/compile.h>
#include <linux/module.h>
#include <linux/uts.h>
#include <linux/utsname.h>
#include <linux/ctype.h>
#include <linux/efi.h>
#include <generated/utsrelease.h>
#include <asm/efi.h>

/* Macros used by the included decompressor code below. */
#define STATIC
#include <linux/decompress/mm.h>

#ifdef CONFIG_X86_5LEVEL
unsigned int __pgtable_l5_enabled;
unsigned int pgdir_shift __ro_after_init = 39;
unsigned int ptrs_per_p4d __ro_after_init = 1;
#endif

extern unsigned long get_cmd_line_ptr(void);

/* Used by PAGE_KERN* macros: */
pteval_t __default_kernel_pte_mask __read_mostly = ~0;

/* Simplified build-specific string for starting entropy. */
static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
		LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;

static unsigned long rotate_xor(unsigned long hash, const void *area,
				size_t size)
{
	size_t i;
	unsigned long *ptr = (unsigned long *)area;

	for (i = 0; i < size / sizeof(hash); i++) {
		/* Rotate by odd number of bits and XOR. */
		hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
		hash ^= ptr[i];
	}

	return hash;
}
75 | ||
76 | /* Attempt to create a simple but unpredictable starting entropy. */ | |
d899a7d1 | 77 | static unsigned long get_boot_seed(void) |
a653f356 KC |
78 | { |
79 | unsigned long hash = 0; | |
80 | ||
81 | hash = rotate_xor(hash, build_str, sizeof(build_str)); | |
6655e0aa | 82 | hash = rotate_xor(hash, boot_params, sizeof(*boot_params)); |
a653f356 KC |
83 | |
84 | return hash; | |
85 | } | |
86 | ||
#define KASLR_COMPRESSED_BOOT
#include "../../lib/kaslr.c"


/* Only supporting at most 4 unusable memmap regions with kaslr */
#define MAX_MEMMAP_REGIONS	4

static bool memmap_too_large;


/* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */
static unsigned long long mem_limit = ULLONG_MAX;

/* Number of immovable memory regions */
static int num_immovable_mem;

enum mem_avoid_index {
	MEM_AVOID_ZO_RANGE = 0,
	MEM_AVOID_INITRD,
	MEM_AVOID_CMDLINE,
	MEM_AVOID_BOOTPARAMS,
	MEM_AVOID_MEMMAP_BEGIN,
	MEM_AVOID_MEMMAP_END = MEM_AVOID_MEMMAP_BEGIN + MAX_MEMMAP_REGIONS - 1,
	MEM_AVOID_MAX,
};

static struct mem_vector mem_avoid[MEM_AVOID_MAX];

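/*
 * Do the two half-open ranges [start, start + size) intersect? Ranges
 * that merely touch end-to-start do not count as overlapping.
 */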
static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
{
	/* Item one is entirely before item two. */
	if (one->start + one->size <= two->start)
		return false;
	/* Item one is entirely after item two. */
	if (one->start >= two->start + two->size)
		return false;
	return true;
}

char *skip_spaces(const char *str)
{
	while (isspace(*str))
		++str;
	return (char *)str;
}
#include "../../../../lib/ctype.c"
#include "../../../../lib/cmdline.c"

enum parse_mode {
	PARSE_MEMMAP,
	PARSE_EFI,
};

static int
parse_memmap(char *p, unsigned long long *start, unsigned long long *size,
	     enum parse_mode mode)
{
	char *oldp;

	if (!p)
		return -EINVAL;

	/* We don't care about this option here */
	if (!strncmp(p, "exactmap", 8))
		return -EINVAL;

	oldp = p;
	*size = memparse(p, &p);
	if (p == oldp)
		return -EINVAL;

	switch (*p) {
	case '#':
	case '$':
	case '!':
		*start = memparse(p + 1, &p);
		return 0;
	case '@':
		if (mode == PARSE_MEMMAP) {
			/*
			 * memmap=nn@ss specifies a usable region and
			 * should be skipped.
			 */
			*size = 0;
		} else {
			unsigned long long flags;

			/*
			 * For efi_fake_mem=nn@ss:attr, the attr specifies
			 * flags that might imply a soft-reservation.
			 */
			*start = memparse(p + 1, &p);
			if (p && *p == ':') {
				p++;
				if (kstrtoull(p, 0, &flags) < 0)
					*size = 0;
				else if (flags & EFI_MEMORY_SP)
					return 0;
			}
			*size = 0;
		}
		/* Fall through */
	default:
		/*
		 * If no offset is given and only a size is specified,
		 * memmap=nn[KMG] behaves like mem=nn[KMG]: it limits the
		 * maximum address the system can use. Regions above the
		 * limit should be avoided.
		 */
		*start = 0;
		return 0;
	}

	return -EINVAL;
}

static void mem_avoid_memmap(enum parse_mode mode, char *str)
{
	static int i;

	if (i >= MAX_MEMMAP_REGIONS)
		return;

	while (str && (i < MAX_MEMMAP_REGIONS)) {
		int rc;
		unsigned long long start, size;
		char *k = strchr(str, ',');

		if (k)
			*k++ = 0;

		rc = parse_memmap(str, &start, &size, mode);
		if (rc < 0)
			break;
		str = k;

		if (start == 0) {
			/* Store the specified memory limit if size > 0 */
			if (size > 0)
				mem_limit = size;

			continue;
		}

		mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].start = start;
		mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].size = size;
		i++;
	}

	/* More than 4 memmaps, fail kaslr */
	if ((i >= MAX_MEMMAP_REGIONS) && str)
		memmap_too_large = true;
}

/* Store the number of 1GB huge pages which users specified: */
static unsigned long max_gb_huge_pages;

static void parse_gb_huge_pages(char *param, char *val)
{
	static bool gbpage_sz;
	char *p;

	if (!strcmp(param, "hugepagesz")) {
		p = val;
		if (memparse(p, &p) != PUD_SIZE) {
			gbpage_sz = false;
			return;
		}

		if (gbpage_sz)
			warn("Repeatedly set hugeTLB page size of 1G!\n");
		gbpage_sz = true;
		return;
	}

	if (!strcmp(param, "hugepages") && gbpage_sz) {
		p = val;
		max_gb_huge_pages = simple_strtoull(p, &p, 0);
		return;
	}
}

static void handle_mem_options(void)
{
	char *args = (char *)get_cmd_line_ptr();
	size_t len = strlen((char *)args);
	char *tmp_cmdline;
	char *param, *val;
	u64 mem_size;

	if (!strstr(args, "memmap=") && !strstr(args, "mem=") &&
	    !strstr(args, "hugepages"))
		return;

	tmp_cmdline = malloc(len + 1);
	if (!tmp_cmdline)
		error("Failed to allocate space for tmp_cmdline");

	memcpy(tmp_cmdline, args, len);
	tmp_cmdline[len] = 0;
	args = tmp_cmdline;

	/* Chew leading spaces */
	args = skip_spaces(args);

	while (*args) {
		args = next_arg(args, &param, &val);
		/* Stop at -- */
		if (!val && strcmp(param, "--") == 0) {
			warn("Only '--' specified in cmdline");
			goto out;
		}

		if (!strcmp(param, "memmap")) {
			mem_avoid_memmap(PARSE_MEMMAP, val);
		} else if (strstr(param, "hugepages")) {
			parse_gb_huge_pages(param, val);
		} else if (!strcmp(param, "mem")) {
			char *p = val;

			if (!strcmp(p, "nopentium"))
				continue;
			mem_size = memparse(p, &p);
			if (mem_size == 0)
				goto out;

			mem_limit = mem_size;
		} else if (!strcmp(param, "efi_fake_mem")) {
			mem_avoid_memmap(PARSE_EFI, val);
		}
	}

out:
	free(tmp_cmdline);
	return;
}

/*
 * In theory, KASLR can put the kernel anywhere in the range of [16M, 64T).
 * The mem_avoid array is used to store the ranges that need to be avoided
 * when KASLR searches for an appropriate random address. We must avoid any
 * regions that are unsafe to overlap with during decompression, and other
 * things like the initrd, cmdline and boot_params. This comment seeks to
 * explain mem_avoid as clearly as possible since incorrect mem_avoid
 * memory ranges lead to really hard-to-debug boot failures.
 *
 * The initrd, cmdline, and boot_params are trivial to identify for
 * avoiding. They are MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, and
 * MEM_AVOID_BOOTPARAMS respectively below.
 *
 * What is less obvious is how to avoid the range of memory that is used
 * during decompression (MEM_AVOID_ZO_RANGE below). This range must cover
 * the compressed kernel (ZO) and its run space, which is used to extract
 * the uncompressed kernel (VO) and relocs.
 *
 * ZO's full run size sits against the end of the decompression buffer, so
 * we can calculate where text, data, bss, etc. of ZO are positioned more
 * easily.
 *
 * For additional background, the decompression calculations can be found
 * in header.S, and the memory diagram is based on the one found in misc.c.
 *
 * The following conditions are already enforced by the image layouts and
 * associated code:
 *  - input + input_size >= output + output_size
 *  - kernel_total_size <= init_size
 *  - kernel_total_size <= output_size (see Note below)
 *  - output + init_size >= output + output_size
 *
 * (Note that kernel_total_size and output_size have no fundamental
 * relationship, but output_size is passed to choose_random_location
 * as a maximum of the two. The diagram is showing a case where
 * kernel_total_size is larger than output_size, but this case is
 * handled by bumping output_size.)
 *
 * The above conditions can be illustrated by a diagram:
 *
 * 0   output            input            input+input_size    output+init_size
 * |     |                 |                             |             |
 * |     |                 |                             |             |
 * |-----|--------|--------|--------------|-----------|--|-------------|
 *                |                       |           |
 *                |                       |           |
 * output+init_size-ZO_INIT_SIZE  output+output_size  output+kernel_total_size
 *
 * [output, output+init_size) is the entire memory range used for
 * extracting the compressed image.
 *
 * [output, output+kernel_total_size) is the range needed for the
 * uncompressed kernel (VO) and its run size (bss, brk, etc).
 *
 * [output, output+output_size) is VO plus relocs (i.e. the entire
 * uncompressed payload contained by ZO). This is the area of the buffer
 * written to during decompression.
 *
 * [output+init_size-ZO_INIT_SIZE, output+init_size) is the worst-case
 * range of the copied ZO and decompression code. (i.e. the range
 * covered backwards of size ZO_INIT_SIZE, starting from output+init_size.)
 *
 * [input, input+input_size) is the original copied compressed image (ZO)
 * (i.e. it does not include its run size). This range must be avoided
 * because it contains the data used for decompression.
 *
 * [input+input_size, output+init_size) is [_text, _end) for ZO. This
 * range includes ZO's heap and stack, and must be avoided since it
 * performs the decompression.
 *
 * Since the above two ranges need to be avoided and they are adjacent,
 * they can be merged, resulting in: [input, output+init_size) which
 * becomes the MEM_AVOID_ZO_RANGE below.
 */
static void mem_avoid_init(unsigned long input, unsigned long input_size,
			   unsigned long output)
{
	unsigned long init_size = boot_params->hdr.init_size;
	u64 initrd_start, initrd_size;
	u64 cmd_line, cmd_line_size;
	char *ptr;

	/*
	 * Avoid the region that is unsafe to overlap during
	 * decompression.
	 */
	mem_avoid[MEM_AVOID_ZO_RANGE].start = input;
	mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input;
	add_identity_map(mem_avoid[MEM_AVOID_ZO_RANGE].start,
			 mem_avoid[MEM_AVOID_ZO_RANGE].size);

	/* Avoid initrd. */
	initrd_start  = (u64)boot_params->ext_ramdisk_image << 32;
	initrd_start |= boot_params->hdr.ramdisk_image;
	initrd_size  = (u64)boot_params->ext_ramdisk_size << 32;
	initrd_size |= boot_params->hdr.ramdisk_size;
	mem_avoid[MEM_AVOID_INITRD].start = initrd_start;
	mem_avoid[MEM_AVOID_INITRD].size = initrd_size;
	/* No need to set mapping for initrd, it will be handled in VO. */

	/* Avoid kernel command line. */
	cmd_line  = (u64)boot_params->ext_cmd_line_ptr << 32;
	cmd_line |= boot_params->hdr.cmd_line_ptr;
	/* Calculate size of cmd_line. */
	ptr = (char *)(unsigned long)cmd_line;
	for (cmd_line_size = 0; ptr[cmd_line_size++];)
		;
	mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line;
	mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size;
	add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start,
			 mem_avoid[MEM_AVOID_CMDLINE].size);

	/* Avoid boot parameters. */
	mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params;
	mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params);
	add_identity_map(mem_avoid[MEM_AVOID_BOOTPARAMS].start,
			 mem_avoid[MEM_AVOID_BOOTPARAMS].size);

	/* We don't need to set a mapping for setup_data. */

	/* Mark the memmap regions we need to avoid */
	handle_mem_options();

	/* Enumerate the immovable memory regions */
	num_immovable_mem = count_immovable_mem_regions();

#ifdef CONFIG_X86_VERBOSE_BOOTUP
	/* Make sure video RAM can be used. */
	add_identity_map(0, PMD_SIZE);
#endif
}

/*
 * Does this memory vector overlap a known avoided area? If so, record the
 * overlap region with the lowest address.
 */
static bool mem_avoid_overlap(struct mem_vector *img,
			      struct mem_vector *overlap)
{
	int i;
	struct setup_data *ptr;
	unsigned long earliest = img->start + img->size;
	bool is_overlapping = false;

	for (i = 0; i < MEM_AVOID_MAX; i++) {
		if (mem_overlaps(img, &mem_avoid[i]) &&
		    mem_avoid[i].start < earliest) {
			*overlap = mem_avoid[i];
			earliest = overlap->start;
			is_overlapping = true;
		}
	}

	/* Avoid all entries in the setup_data linked list. */
	ptr = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data;
	while (ptr) {
		struct mem_vector avoid;

		avoid.start = (unsigned long)ptr;
		avoid.size = sizeof(*ptr) + ptr->len;

		if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) {
			*overlap = avoid;
			earliest = overlap->start;
			is_overlapping = true;
		}

		ptr = (struct setup_data *)(unsigned long)ptr->next;
	}

	return is_overlapping;
}

struct slot_area {
	unsigned long addr;
	int num;
};

#define MAX_SLOT_AREA 100

static struct slot_area slot_areas[MAX_SLOT_AREA];

static unsigned long slot_max;

static unsigned long slot_area_index;

static void store_slot_info(struct mem_vector *region, unsigned long image_size)
{
	struct slot_area slot_area;

	if (slot_area_index == MAX_SLOT_AREA)
		return;

	slot_area.addr = region->start;
	slot_area.num = (region->size - image_size) /
			CONFIG_PHYSICAL_ALIGN + 1;

	if (slot_area.num > 0) {
		slot_areas[slot_area_index++] = slot_area;
		slot_max += slot_area.num;
	}
}

/*
 * Skip as many 1GB huge pages as possible in the passed region
 * according to the number which users specified:
 */
static void
process_gb_huge_pages(struct mem_vector *region, unsigned long image_size)
{
	unsigned long addr, size = 0;
	struct mem_vector tmp;
	int i = 0;

	if (!max_gb_huge_pages) {
		store_slot_info(region, image_size);
		return;
	}

	addr = ALIGN(region->start, PUD_SIZE);
	/* Did we raise the address above the passed in memory entry? */
	if (addr < region->start + region->size)
		size = region->size - (addr - region->start);

	/* Check how many 1GB huge pages can be filtered out: */
	while (size > PUD_SIZE && max_gb_huge_pages) {
		size -= PUD_SIZE;
		max_gb_huge_pages--;
		i++;
	}

	/* No good 1GB huge pages found: */
	if (!i) {
		store_slot_info(region, image_size);
		return;
	}

	/*
	 * Skip those 'i'*1GB good huge pages, and continue checking and
	 * processing the remaining head or tail part of the passed region
	 * if available.
	 */

	if (addr >= region->start + image_size) {
		tmp.start = region->start;
		tmp.size = addr - region->start;
		store_slot_info(&tmp, image_size);
	}

	size = region->size - (addr - region->start) - i * PUD_SIZE;
	if (size >= image_size) {
		tmp.start = addr + i * PUD_SIZE;
		tmp.size = size;
		store_slot_info(&tmp, image_size);
	}
}

static unsigned long slots_fetch_random(void)
{
	unsigned long slot;
	int i;

	/* Handle case of no slots stored. */
	if (slot_max == 0)
		return 0;

	slot = kaslr_get_random_long("Physical") % slot_max;

	for (i = 0; i < slot_area_index; i++) {
		if (slot >= slot_areas[i].num) {
			slot -= slot_areas[i].num;
			continue;
		}
		return slot_areas[i].addr + slot * CONFIG_PHYSICAL_ALIGN;
	}

	if (i == slot_area_index)
		debug_putstr("slots_fetch_random() failed!?\n");
	return 0;
}

static void __process_mem_region(struct mem_vector *entry,
				 unsigned long minimum,
				 unsigned long image_size)
{
	struct mem_vector region, overlap;
	unsigned long start_orig, end;
	struct mem_vector cur_entry;

	/* On 32-bit, ignore entries entirely above our maximum. */
	if (IS_ENABLED(CONFIG_X86_32) && entry->start >= KERNEL_IMAGE_SIZE)
		return;

	/* Ignore entries entirely below our minimum. */
	if (entry->start + entry->size < minimum)
		return;

	/* Ignore entries above memory limit */
	end = min(entry->size + entry->start, mem_limit);
	if (entry->start >= end)
		return;
	cur_entry.start = entry->start;
	cur_entry.size = end - entry->start;

	region.start = cur_entry.start;
	region.size = cur_entry.size;

	/* Give up if slot area array is full. */
	while (slot_area_index < MAX_SLOT_AREA) {
		start_orig = region.start;

		/* Potentially raise address to minimum location. */
		if (region.start < minimum)
			region.start = minimum;

		/* Potentially raise address to meet alignment needs. */
		region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);

		/* Did we raise the address above the passed in memory entry? */
		if (region.start > cur_entry.start + cur_entry.size)
			return;

		/* Reduce size by any delta from the original address. */
		region.size -= region.start - start_orig;

		/* On 32-bit, reduce region size to fit within max size. */
		if (IS_ENABLED(CONFIG_X86_32) &&
		    region.start + region.size > KERNEL_IMAGE_SIZE)
			region.size = KERNEL_IMAGE_SIZE - region.start;

		/* Return if region can't contain decompressed kernel */
		if (region.size < image_size)
			return;

		/* If nothing overlaps, store the region and return. */
		if (!mem_avoid_overlap(&region, &overlap)) {
			process_gb_huge_pages(&region, image_size);
			return;
		}

		/* Store beginning of region if holds at least image_size. */
		if (overlap.start > region.start + image_size) {
			struct mem_vector beginning;

			beginning.start = region.start;
			beginning.size = overlap.start - region.start;
			process_gb_huge_pages(&beginning, image_size);
		}

		/* Return if overlap extends to or past end of region. */
		if (overlap.start + overlap.size >= region.start + region.size)
			return;

		/* Clip off the overlapping region and start over. */
		region.size -= overlap.start - region.start + overlap.size;
		region.start = overlap.start + overlap.size;
	}
}

static bool process_mem_region(struct mem_vector *region,
			       unsigned long long minimum,
			       unsigned long long image_size)
{
	int i;
	/*
	 * If no immovable memory found, or MEMORY_HOTREMOVE disabled,
	 * use @region directly.
	 */
	if (!num_immovable_mem) {
		__process_mem_region(region, minimum, image_size);

		if (slot_area_index == MAX_SLOT_AREA) {
			debug_putstr("Aborted e820/efi memmap scan (slot_areas full)!\n");
			return 1;
		}
		return 0;
	}

#if defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI)
	/*
	 * If immovable memory found, filter the intersection between
	 * immovable memory and @region.
	 */
	for (i = 0; i < num_immovable_mem; i++) {
		unsigned long long start, end, entry_end, region_end;
		struct mem_vector entry;

		if (!mem_overlaps(region, &immovable_mem[i]))
			continue;

		start = immovable_mem[i].start;
		end = start + immovable_mem[i].size;
		region_end = region->start + region->size;

		entry.start = clamp(region->start, start, end);
		entry_end = clamp(region_end, start, end);
		entry.size = entry_end - entry.start;

		__process_mem_region(&entry, minimum, image_size);

		if (slot_area_index == MAX_SLOT_AREA) {
			debug_putstr("Aborted e820/efi memmap scan when walking immovable regions(slot_areas full)!\n");
			return 1;
		}
	}
#endif
	return 0;
}

#ifdef CONFIG_EFI
/*
 * Returns true if the EFI memory map was processed and slot candidates
 * were gathered from it (only mirrored regions are used when any
 * EFI_MEMORY_MORE_RELIABLE region is present).
 */
static bool
process_efi_entries(unsigned long minimum, unsigned long image_size)
{
	struct efi_info *e = &boot_params->efi_info;
	bool efi_mirror_found = false;
	struct mem_vector region;
	efi_memory_desc_t *md;
	unsigned long pmap;
	char *signature;
	u32 nr_desc;
	int i;

	signature = (char *)&e->efi_loader_signature;
	if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) &&
	    strncmp(signature, EFI64_LOADER_SIGNATURE, 4))
		return false;

#ifdef CONFIG_X86_32
	/* Can't handle data above 4GB at this time */
	if (e->efi_memmap_hi) {
		warn("EFI memmap is above 4GB, can't be handled now on x86_32. EFI should be disabled.\n");
		return false;
	}
	pmap = e->efi_memmap;
#else
	pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32));
#endif

	nr_desc = e->efi_memmap_size / e->efi_memdesc_size;
	for (i = 0; i < nr_desc; i++) {
		md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);
		if (md->attribute & EFI_MEMORY_MORE_RELIABLE) {
			efi_mirror_found = true;
			break;
		}
	}

	for (i = 0; i < nr_desc; i++) {
		md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);

		/*
		 * Here we are more conservative in picking free memory than
		 * the EFI spec allows:
		 *
		 * According to the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also
		 * free memory and thus available to place the kernel image into,
		 * but in practice there's firmware where using that memory leads
		 * to crashes.
		 *
		 * Only EFI_CONVENTIONAL_MEMORY is guaranteed to be free.
		 */
		if (md->type != EFI_CONVENTIONAL_MEMORY)
			continue;

		if (efi_soft_reserve_enabled() &&
		    (md->attribute & EFI_MEMORY_SP))
			continue;

		if (efi_mirror_found &&
		    !(md->attribute & EFI_MEMORY_MORE_RELIABLE))
			continue;

		region.start = md->phys_addr;
		region.size = md->num_pages << EFI_PAGE_SHIFT;
		if (process_mem_region(&region, minimum, image_size))
			break;
	}
	return true;
}
#else
static inline bool
process_efi_entries(unsigned long minimum, unsigned long image_size)
{
	return false;
}
#endif

static void process_e820_entries(unsigned long minimum,
				 unsigned long image_size)
{
	int i;
	struct mem_vector region;
	struct boot_e820_entry *entry;

	/* Verify potential e820 positions, appending to slots list. */
	for (i = 0; i < boot_params->e820_entries; i++) {
		entry = &boot_params->e820_table[i];
		/* Skip non-RAM entries. */
		if (entry->type != E820_TYPE_RAM)
			continue;
		region.start = entry->addr;
		region.size = entry->size;
		if (process_mem_region(&region, minimum, image_size))
			break;
	}
}

static unsigned long find_random_phys_addr(unsigned long minimum,
					   unsigned long image_size)
{
	/* Check if we had too many memmaps. */
	if (memmap_too_large) {
		debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n");
		return 0;
	}

	/* Make sure minimum is aligned. */
	minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);

	if (process_efi_entries(minimum, image_size))
		return slots_fetch_random();

	process_e820_entries(minimum, image_size);
	return slots_fetch_random();
}

static unsigned long find_random_virt_addr(unsigned long minimum,
					   unsigned long image_size)
{
	unsigned long slots, random_addr;

	/* Make sure minimum is aligned. */
	minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
	/* Align image_size for easy slot calculations. */
	image_size = ALIGN(image_size, CONFIG_PHYSICAL_ALIGN);

	/*
	 * How many CONFIG_PHYSICAL_ALIGN-sized slots can hold image_size
	 * within the range of minimum to KERNEL_IMAGE_SIZE?
	 */
	slots = (KERNEL_IMAGE_SIZE - minimum - image_size) /
		 CONFIG_PHYSICAL_ALIGN + 1;

	random_addr = kaslr_get_random_long("Virtual") % slots;

	return random_addr * CONFIG_PHYSICAL_ALIGN + minimum;
}

/*
 * Since this function examines addresses much more numerically,
 * it takes the input and output pointers as 'unsigned long'.
 */
void choose_random_location(unsigned long input,
			    unsigned long input_size,
			    unsigned long *output,
			    unsigned long output_size,
			    unsigned long *virt_addr)
{
	unsigned long random_addr, min_addr;

	if (cmdline_find_option_bool("nokaslr")) {
		warn("KASLR disabled: 'nokaslr' on cmdline.");
		return;
	}

#ifdef CONFIG_X86_5LEVEL
	if (__read_cr4() & X86_CR4_LA57) {
		__pgtable_l5_enabled = 1;
		pgdir_shift = 48;
		ptrs_per_p4d = 512;
	}
#endif

	boot_params->hdr.loadflags |= KASLR_FLAG;

	/* Prepare to add new identity pagetables on demand. */
	initialize_identity_maps();

	/* Record the various known unsafe memory ranges. */
	mem_avoid_init(input, input_size, *output);

	/*
	 * Low end of the randomization range should be the
	 * smaller of 512M or the initial kernel image
	 * location:
	 */
	min_addr = min(*output, 512UL << 20);

	/* Walk available memory entries to find a random address. */
	random_addr = find_random_phys_addr(min_addr, output_size);
	if (!random_addr) {
		warn("Physical KASLR disabled: no suitable memory region!");
	} else {
		/* Update the new physical address location. */
		if (*output != random_addr) {
			add_identity_map(random_addr, output_size);
			*output = random_addr;
		}

		/*
		 * This loads the identity mapping page table.
		 * This should only be done if a new physical address
		 * is found for the kernel; otherwise we should keep
		 * the old page table so that it behaves like the
		 * "nokaslr" case.
		 */
		finalize_identity_maps();
	}

	/* Pick random virtual address starting from LOAD_PHYSICAL_ADDR. */
	if (IS_ENABLED(CONFIG_X86_64))
		random_addr = find_random_virt_addr(LOAD_PHYSICAL_ADDR, output_size);
	*virt_addr = random_addr;
}