Commit | Line | Data |
---|---|---|
b2441318 | 1 | // SPDX-License-Identifier: GPL-2.0 |
7de828df KC |
2 | /* |
3 | * kaslr.c | |
4 | * | |
5 | * This contains the routines needed to generate a reasonable level of | |
6 | * entropy to choose a randomized kernel base address offset in support | |
7 | * of Kernel Address Space Layout Randomization (KASLR). Additionally | |
8 | * handles walking the physical memory maps (and tracking memory regions | |
9 | * to avoid) in order to select a physical memory location that can | |
10 | * contain the entire properly aligned running kernel image. | |
11 | * | |
12 | */ | |
d52e7d5a BH |
13 | |
14 | /* | |
15 | * isspace() in linux/ctype.h is expected by next_args() to filter | |
16 | * out "space/lf/tab". While boot/ctype.h conflicts with linux/ctype.h, | |
17 | * since isdigit() is implemented in both of them. Hence disable it | |
18 | * here. | |
19 | */ | |
20 | #define BOOT_CTYPE_H | |
21 | ||
22 | /* | |
23 | * _ctype[] in lib/ctype.c is needed by isspace() of linux/ctype.h. | |
24 | * While both lib/ctype.c and lib/cmdline.c will bring EXPORT_SYMBOL | |
25 | * which is meaningless and will cause compiling error in some cases. | |
d52e7d5a | 26 | */ |
f922c4ab | 27 | #define __DISABLE_EXPORTS |
d52e7d5a | 28 | |
8ab3820f | 29 | #include "misc.h" |
dc425a6e | 30 | #include "error.h" |
5b8b9cf7 | 31 | #include "../string.h" |
8ab3820f | 32 | |
a653f356 KC |
33 | #include <generated/compile.h> |
34 | #include <linux/module.h> | |
35 | #include <linux/uts.h> | |
36 | #include <linux/utsname.h> | |
d52e7d5a | 37 | #include <linux/ctype.h> |
c05cd797 | 38 | #include <linux/efi.h> |
a653f356 | 39 | #include <generated/utsrelease.h> |
c05cd797 | 40 | #include <asm/efi.h> |
a653f356 | 41 | |
d52e7d5a BH |
42 | /* Macros used by the included decompressor code below. */ |
43 | #define STATIC | |
44 | #include <linux/decompress/mm.h> | |
45 | ||
e626e6bb | 46 | #ifdef CONFIG_X86_5LEVEL |
ad3fe525 | 47 | unsigned int __pgtable_l5_enabled; |
b16e770b KS |
48 | unsigned int pgdir_shift __ro_after_init = 39; |
49 | unsigned int ptrs_per_p4d __ro_after_init = 1; | |
e626e6bb KS |
50 | #endif |
51 | ||
d52e7d5a BH |
52 | extern unsigned long get_cmd_line_ptr(void); |
53 | ||
fb43d6cb DH |
54 | /* Used by PAGE_KERN* macros: */ |
55 | pteval_t __default_kernel_pte_mask __read_mostly = ~0; | |
56 | ||
a653f356 | 57 | /* Simplified build-specific string for starting entropy. */ |
327f7d72 | 58 | static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" |
a653f356 KC |
59 | LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; |
60 | ||
a653f356 KC |
61 | static unsigned long rotate_xor(unsigned long hash, const void *area, |
62 | size_t size) | |
63 | { | |
64 | size_t i; | |
65 | unsigned long *ptr = (unsigned long *)area; | |
66 | ||
67 | for (i = 0; i < size / sizeof(hash); i++) { | |
68 | /* Rotate by odd number of bits and XOR. */ | |
69 | hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7); | |
70 | hash ^= ptr[i]; | |
71 | } | |
72 | ||
73 | return hash; | |
74 | } | |
75 | ||
/*
 * Attempt to create a simple but unpredictable starting entropy.
 *
 * Folds the build identification string (differs per kernel build) and
 * the entire boot_params structure (differs per boot: e820 map, cmdline
 * pointer, etc.) into a single word via rotate_xor().
 */
static unsigned long get_boot_seed(void)
{
	unsigned long hash = 0;

	hash = rotate_xor(hash, build_str, sizeof(build_str));
	hash = rotate_xor(hash, boot_params, sizeof(*boot_params));

	return hash;
}
86 | ||
d899a7d1 TG |
87 | #define KASLR_COMPRESSED_BOOT |
88 | #include "../../lib/kaslr.c" | |
8ab3820f | 89 | |
82fa9637 | 90 | |
f2844249 DJ |
91 | /* Only supporting at most 4 unusable memmap regions with kaslr */ |
92 | #define MAX_MEMMAP_REGIONS 4 | |
93 | ||
94 | static bool memmap_too_large; | |
95 | ||
d52e7d5a | 96 | |
4cdba14f | 97 | /* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */ |
5db1b1e1 | 98 | static unsigned long long mem_limit = ULLONG_MAX; |
4cdba14f | 99 | |
690eaa53 CF |
100 | /* Number of immovable memory regions */ |
101 | static int num_immovable_mem; | |
4cdba14f | 102 | |
ed09acde KC |
103 | enum mem_avoid_index { |
104 | MEM_AVOID_ZO_RANGE = 0, | |
105 | MEM_AVOID_INITRD, | |
106 | MEM_AVOID_CMDLINE, | |
107 | MEM_AVOID_BOOTPARAMS, | |
f2844249 DJ |
108 | MEM_AVOID_MEMMAP_BEGIN, |
109 | MEM_AVOID_MEMMAP_END = MEM_AVOID_MEMMAP_BEGIN + MAX_MEMMAP_REGIONS - 1, | |
ed09acde KC |
110 | MEM_AVOID_MAX, |
111 | }; | |
112 | ||
e290e8c5 | 113 | static struct mem_vector mem_avoid[MEM_AVOID_MAX]; |
82fa9637 | 114 | |
82fa9637 KC |
115 | static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) |
116 | { | |
117 | /* Item one is entirely before item two. */ | |
118 | if (one->start + one->size <= two->start) | |
119 | return false; | |
120 | /* Item one is entirely after item two. */ | |
121 | if (one->start >= two->start + two->size) | |
122 | return false; | |
123 | return true; | |
124 | } | |
125 | ||
/* Return a pointer to the first non-whitespace character in @str. */
char *skip_spaces(const char *str)
{
	const char *s = str;

	for (; isspace(*s); s++)
		;
	/* Cast away const to match the historical kernel prototype. */
	return (char *)s;
}
d52e7d5a BH |
132 | #include "../../../../lib/ctype.c" |
133 | #include "../../../../lib/cmdline.c" | |
f2844249 DJ |
134 | |
/*
 * Parse one "memmap=" token of the form nn[KMG][@#$!]ss[KMG].
 *
 * On success returns 0 and stores the region size in @size and its
 * start in @start; a stored start of 0 means "size only", which the
 * caller treats as a memory limit (same as mem=nn[KMG]).
 * Returns -EINVAL for NULL input, "exactmap", or unparsable sizes.
 */
static int
parse_memmap(char *p, unsigned long long *start, unsigned long long *size)
{
	char *oldp;

	if (!p)
		return -EINVAL;

	/* We don't care about this option here */
	if (!strncmp(p, "exactmap", 8))
		return -EINVAL;

	oldp = p;
	*size = memparse(p, &p);
	/* memparse() consumed nothing: malformed number. */
	if (p == oldp)
		return -EINVAL;

	switch (*p) {
	case '#':
	case '$':
	case '!':
		/* nn<sep>ss: start follows the separator. */
		*start = memparse(p + 1, &p);
		return 0;
	case '@':
		/* memmap=nn@ss specifies usable region, should be skipped */
		*size = 0;
		/* Fall through */
	default:
		/*
		 * If w/o offset, only size specified, memmap=nn[KMG] has the
		 * same behaviour as mem=nn[KMG]. It limits the max address
		 * system can use. Region above the limit should be avoided.
		 */
		*start = 0;
		return 0;
	}

	/* Unreachable: every switch path above returns. */
	return -EINVAL;
}
174 | ||
/*
 * Parse a comma-separated "memmap=" value and record each reserved
 * region in mem_avoid[]. The index @i is static so repeated memmap=
 * options on the command line accumulate across calls.
 */
static void mem_avoid_memmap(char *str)
{
	static int i;

	if (i >= MAX_MEMMAP_REGIONS)
		return;

	while (str && (i < MAX_MEMMAP_REGIONS)) {
		int rc;
		unsigned long long start, size;
		char *k = strchr(str, ',');

		/* Terminate this token and remember the rest of the list. */
		if (k)
			*k++ = 0;

		rc = parse_memmap(str, &start, &size);
		if (rc < 0)
			break;
		str = k;

		if (start == 0) {
			/* Store the specified memory limit if size > 0 */
			if (size > 0)
				mem_limit = size;

			continue;
		}

		mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].start = start;
		mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].size = size;
		i++;
	}

	/* More than 4 memmaps, fail kaslr */
	if ((i >= MAX_MEMMAP_REGIONS) && str)
		memmap_too_large = true;
}
212 | ||
9b912485 BH |
213 | /* Store the number of 1GB huge pages which users specified: */ |
214 | static unsigned long max_gb_huge_pages; | |
215 | ||
216 | static void parse_gb_huge_pages(char *param, char *val) | |
217 | { | |
218 | static bool gbpage_sz; | |
219 | char *p; | |
220 | ||
221 | if (!strcmp(param, "hugepagesz")) { | |
222 | p = val; | |
223 | if (memparse(p, &p) != PUD_SIZE) { | |
224 | gbpage_sz = false; | |
225 | return; | |
226 | } | |
227 | ||
228 | if (gbpage_sz) | |
229 | warn("Repeatedly set hugeTLB page size of 1G!\n"); | |
230 | gbpage_sz = true; | |
231 | return; | |
232 | } | |
233 | ||
234 | if (!strcmp(param, "hugepages") && gbpage_sz) { | |
235 | p = val; | |
236 | max_gb_huge_pages = simple_strtoull(p, &p, 0); | |
237 | return; | |
238 | } | |
239 | } | |
240 | ||
241 | ||
/*
 * Scan the kernel command line for "mem=", "memmap=" and hugepage
 * options and update mem_limit / mem_avoid[] / max_gb_huge_pages
 * accordingly. Works on a heap copy because next_arg() modifies the
 * string in place.
 */
static void handle_mem_options(void)
{
	char *args = (char *)get_cmd_line_ptr();
	size_t len = strlen((char *)args);
	char *tmp_cmdline;
	char *param, *val;
	u64 mem_size;

	/* Fast path: nothing of interest on the command line. */
	if (!strstr(args, "memmap=") && !strstr(args, "mem=") &&
	    !strstr(args, "hugepages"))
		return;

	tmp_cmdline = malloc(len + 1);
	if (!tmp_cmdline)
		error("Failed to allocate space for tmp_cmdline");

	memcpy(tmp_cmdline, args, len);
	tmp_cmdline[len] = 0;
	args = tmp_cmdline;

	/* Chew leading spaces */
	args = skip_spaces(args);

	while (*args) {
		args = next_arg(args, &param, &val);
		/* Stop at -- */
		if (!val && strcmp(param, "--") == 0) {
			warn("Only '--' specified in cmdline");
			goto out;
		}

		if (!strcmp(param, "memmap")) {
			mem_avoid_memmap(val);
		} else if (strstr(param, "hugepages")) {
			parse_gb_huge_pages(param, val);
		} else if (!strcmp(param, "mem")) {
			char *p = val;

			/* Legacy "mem=nopentium" carries no limit. */
			if (!strcmp(p, "nopentium"))
				continue;
			mem_size = memparse(p, &p);
			if (mem_size == 0)
				goto out;

			mem_limit = mem_size;
		}
	}

out:
	/* Single exit so the temporary copy is always freed. */
	free(tmp_cmdline);
	return;
}
294 | ||
9dc1969c | 295 | /* |
ed09acde KC |
296 | * In theory, KASLR can put the kernel anywhere in the range of [16M, 64T). |
297 | * The mem_avoid array is used to store the ranges that need to be avoided | |
298 | * when KASLR searches for an appropriate random address. We must avoid any | |
9dc1969c | 299 | * regions that are unsafe to overlap with during decompression, and other |
ed09acde KC |
300 | * things like the initrd, cmdline and boot_params. This comment seeks to |
301 | * explain mem_avoid as clearly as possible since incorrect mem_avoid | |
302 | * memory ranges lead to really hard to debug boot failures. | |
303 | * | |
304 | * The initrd, cmdline, and boot_params are trivial to identify for | |
cb18ef0d | 305 | * avoiding. They are MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, and |
ed09acde KC |
306 | * MEM_AVOID_BOOTPARAMS respectively below. |
307 | * | |
308 | * What is not obvious how to avoid is the range of memory that is used | |
309 | * during decompression (MEM_AVOID_ZO_RANGE below). This range must cover | |
310 | * the compressed kernel (ZO) and its run space, which is used to extract | |
311 | * the uncompressed kernel (VO) and relocs. | |
312 | * | |
313 | * ZO's full run size sits against the end of the decompression buffer, so | |
314 | * we can calculate where text, data, bss, etc of ZO are positioned more | |
315 | * easily. | |
316 | * | |
317 | * For additional background, the decompression calculations can be found | |
318 | * in header.S, and the memory diagram is based on the one found in misc.c. | |
319 | * | |
320 | * The following conditions are already enforced by the image layouts and | |
321 | * associated code: | |
322 | * - input + input_size >= output + output_size | |
323 | * - kernel_total_size <= init_size | |
324 | * - kernel_total_size <= output_size (see Note below) | |
325 | * - output + init_size >= output + output_size | |
9dc1969c | 326 | * |
ed09acde KC |
327 | * (Note that kernel_total_size and output_size have no fundamental |
328 | * relationship, but output_size is passed to choose_random_location | |
329 | * as a maximum of the two. The diagram is showing a case where | |
330 | * kernel_total_size is larger than output_size, but this case is | |
331 | * handled by bumping output_size.) | |
9dc1969c | 332 | * |
ed09acde | 333 | * The above conditions can be illustrated by a diagram: |
9dc1969c | 334 | * |
ed09acde KC |
335 | * 0 output input input+input_size output+init_size |
336 | * | | | | | | |
337 | * | | | | | | |
338 | * |-----|--------|--------|--------------|-----------|--|-------------| | |
339 | * | | | | |
340 | * | | | | |
341 | * output+init_size-ZO_INIT_SIZE output+output_size output+kernel_total_size | |
9dc1969c | 342 | * |
ed09acde KC |
343 | * [output, output+init_size) is the entire memory range used for |
344 | * extracting the compressed image. | |
9dc1969c | 345 | * |
ed09acde KC |
346 | * [output, output+kernel_total_size) is the range needed for the |
347 | * uncompressed kernel (VO) and its run size (bss, brk, etc). | |
9dc1969c | 348 | * |
ed09acde KC |
349 | * [output, output+output_size) is VO plus relocs (i.e. the entire |
350 | * uncompressed payload contained by ZO). This is the area of the buffer | |
351 | * written to during decompression. | |
9dc1969c | 352 | * |
ed09acde KC |
353 | * [output+init_size-ZO_INIT_SIZE, output+init_size) is the worst-case |
354 | * range of the copied ZO and decompression code. (i.e. the range | |
355 | * covered backwards of size ZO_INIT_SIZE, starting from output+init_size.) | |
9dc1969c | 356 | * |
ed09acde KC |
357 | * [input, input+input_size) is the original copied compressed image (ZO) |
358 | * (i.e. it does not include its run size). This range must be avoided | |
359 | * because it contains the data used for decompression. | |
9dc1969c | 360 | * |
ed09acde KC |
361 | * [input+input_size, output+init_size) is [_text, _end) for ZO. This |
362 | * range includes ZO's heap and stack, and must be avoided since it | |
363 | * performs the decompression. | |
9dc1969c | 364 | * |
ed09acde KC |
365 | * Since the above two ranges need to be avoided and they are adjacent, |
366 | * they can be merged, resulting in: [input, output+init_size) which | |
367 | * becomes the MEM_AVOID_ZO_RANGE below. | |
9dc1969c | 368 | */ |
/*
 * Populate mem_avoid[] with every range that must not be chosen as the
 * kernel's new physical location: the decompression working range
 * [input, output+init_size), the initrd, the command line, the
 * boot_params block, and any user memmap= reservations. See the large
 * comment above for the memory-layout derivation of MEM_AVOID_ZO_RANGE.
 * Also adds identity mappings for ranges this code will touch.
 */
static void mem_avoid_init(unsigned long input, unsigned long input_size,
			   unsigned long output)
{
	unsigned long init_size = boot_params->hdr.init_size;
	u64 initrd_start, initrd_size;
	u64 cmd_line, cmd_line_size;
	char *ptr;

	/*
	 * Avoid the region that is unsafe to overlap during
	 * decompression.
	 */
	mem_avoid[MEM_AVOID_ZO_RANGE].start = input;
	mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input;
	add_identity_map(mem_avoid[MEM_AVOID_ZO_RANGE].start,
			 mem_avoid[MEM_AVOID_ZO_RANGE].size);

	/* Avoid initrd: 64-bit address assembled from ext_* high words. */
	initrd_start = (u64)boot_params->ext_ramdisk_image << 32;
	initrd_start |= boot_params->hdr.ramdisk_image;
	initrd_size = (u64)boot_params->ext_ramdisk_size << 32;
	initrd_size |= boot_params->hdr.ramdisk_size;
	mem_avoid[MEM_AVOID_INITRD].start = initrd_start;
	mem_avoid[MEM_AVOID_INITRD].size = initrd_size;
	/* No need to set mapping for initrd, it will be handled in VO. */

	/* Avoid kernel command line. */
	cmd_line = (u64)boot_params->ext_cmd_line_ptr << 32;
	cmd_line |= boot_params->hdr.cmd_line_ptr;
	/* Calculate size of cmd_line (strlen + terminating NUL). */
	ptr = (char *)(unsigned long)cmd_line;
	for (cmd_line_size = 0; ptr[cmd_line_size++];)
		;
	mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line;
	mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size;
	add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start,
			 mem_avoid[MEM_AVOID_CMDLINE].size);

	/* Avoid boot parameters. */
	mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params;
	mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params);
	add_identity_map(mem_avoid[MEM_AVOID_BOOTPARAMS].start,
			 mem_avoid[MEM_AVOID_BOOTPARAMS].size);

	/* We don't need to set a mapping for setup_data. */

	/* Mark the memmap regions we need to avoid */
	handle_mem_options();

	/* Enumerate the immovable memory regions */
	num_immovable_mem = count_immovable_mem_regions();

#ifdef CONFIG_X86_VERBOSE_BOOTUP
	/* Make sure video RAM can be used. */
	add_identity_map(0, PMD_SIZE);
#endif
}
426 | ||
06486d6c KC |
427 | /* |
428 | * Does this memory vector overlap a known avoided area? If so, record the | |
429 | * overlap region with the lowest address. | |
430 | */ | |
431 | static bool mem_avoid_overlap(struct mem_vector *img, | |
432 | struct mem_vector *overlap) | |
82fa9637 KC |
433 | { |
434 | int i; | |
0cacbfbe | 435 | struct setup_data *ptr; |
06486d6c KC |
436 | unsigned long earliest = img->start + img->size; |
437 | bool is_overlapping = false; | |
82fa9637 KC |
438 | |
439 | for (i = 0; i < MEM_AVOID_MAX; i++) { | |
06486d6c KC |
440 | if (mem_overlaps(img, &mem_avoid[i]) && |
441 | mem_avoid[i].start < earliest) { | |
442 | *overlap = mem_avoid[i]; | |
6daa2ec0 | 443 | earliest = overlap->start; |
06486d6c KC |
444 | is_overlapping = true; |
445 | } | |
82fa9637 KC |
446 | } |
447 | ||
0cacbfbe | 448 | /* Avoid all entries in the setup_data linked list. */ |
6655e0aa | 449 | ptr = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data; |
0cacbfbe KC |
450 | while (ptr) { |
451 | struct mem_vector avoid; | |
452 | ||
20cc2888 | 453 | avoid.start = (unsigned long)ptr; |
0cacbfbe KC |
454 | avoid.size = sizeof(*ptr) + ptr->len; |
455 | ||
06486d6c KC |
456 | if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) { |
457 | *overlap = avoid; | |
6daa2ec0 | 458 | earliest = overlap->start; |
06486d6c KC |
459 | is_overlapping = true; |
460 | } | |
0cacbfbe KC |
461 | |
462 | ptr = (struct setup_data *)(unsigned long)ptr->next; | |
463 | } | |
464 | ||
06486d6c | 465 | return is_overlapping; |
82fa9637 KC |
466 | } |
467 | ||
c401cf15 BH |
468 | struct slot_area { |
469 | unsigned long addr; | |
470 | int num; | |
471 | }; | |
472 | ||
473 | #define MAX_SLOT_AREA 100 | |
474 | ||
475 | static struct slot_area slot_areas[MAX_SLOT_AREA]; | |
476 | ||
e290e8c5 | 477 | static unsigned long slot_max; |
82fa9637 | 478 | |
c401cf15 BH |
479 | static unsigned long slot_area_index; |
480 | ||
481 | static void store_slot_info(struct mem_vector *region, unsigned long image_size) | |
482 | { | |
483 | struct slot_area slot_area; | |
484 | ||
485 | if (slot_area_index == MAX_SLOT_AREA) | |
486 | return; | |
487 | ||
488 | slot_area.addr = region->start; | |
489 | slot_area.num = (region->size - image_size) / | |
490 | CONFIG_PHYSICAL_ALIGN + 1; | |
491 | ||
492 | if (slot_area.num > 0) { | |
493 | slot_areas[slot_area_index++] = slot_area; | |
494 | slot_max += slot_area.num; | |
495 | } | |
496 | } | |
497 | ||
9b912485 BH |
498 | /* |
499 | * Skip as many 1GB huge pages as possible in the passed region | |
500 | * according to the number which users specified: | |
501 | */ | |
502 | static void | |
503 | process_gb_huge_pages(struct mem_vector *region, unsigned long image_size) | |
504 | { | |
505 | unsigned long addr, size = 0; | |
506 | struct mem_vector tmp; | |
507 | int i = 0; | |
508 | ||
509 | if (!max_gb_huge_pages) { | |
510 | store_slot_info(region, image_size); | |
511 | return; | |
512 | } | |
513 | ||
514 | addr = ALIGN(region->start, PUD_SIZE); | |
515 | /* Did we raise the address above the passed in memory entry? */ | |
516 | if (addr < region->start + region->size) | |
517 | size = region->size - (addr - region->start); | |
518 | ||
519 | /* Check how many 1GB huge pages can be filtered out: */ | |
520 | while (size > PUD_SIZE && max_gb_huge_pages) { | |
521 | size -= PUD_SIZE; | |
522 | max_gb_huge_pages--; | |
523 | i++; | |
524 | } | |
525 | ||
526 | /* No good 1GB huge pages found: */ | |
527 | if (!i) { | |
528 | store_slot_info(region, image_size); | |
529 | return; | |
530 | } | |
531 | ||
532 | /* | |
533 | * Skip those 'i'*1GB good huge pages, and continue checking and | |
534 | * processing the remaining head or tail part of the passed region | |
535 | * if available. | |
536 | */ | |
537 | ||
538 | if (addr >= region->start + image_size) { | |
539 | tmp.start = region->start; | |
540 | tmp.size = addr - region->start; | |
541 | store_slot_info(&tmp, image_size); | |
542 | } | |
543 | ||
544 | size = region->size - (addr - region->start) - i * PUD_SIZE; | |
545 | if (size >= image_size) { | |
546 | tmp.start = addr + i * PUD_SIZE; | |
547 | tmp.size = size; | |
548 | store_slot_info(&tmp, image_size); | |
549 | } | |
550 | } | |
551 | ||
82fa9637 KC |
552 | static unsigned long slots_fetch_random(void) |
553 | { | |
ed9f007e KC |
554 | unsigned long slot; |
555 | int i; | |
556 | ||
82fa9637 KC |
557 | /* Handle case of no slots stored. */ |
558 | if (slot_max == 0) | |
559 | return 0; | |
560 | ||
d899a7d1 | 561 | slot = kaslr_get_random_long("Physical") % slot_max; |
ed9f007e KC |
562 | |
563 | for (i = 0; i < slot_area_index; i++) { | |
564 | if (slot >= slot_areas[i].num) { | |
565 | slot -= slot_areas[i].num; | |
566 | continue; | |
567 | } | |
568 | return slot_areas[i].addr + slot * CONFIG_PHYSICAL_ALIGN; | |
569 | } | |
570 | ||
571 | if (i == slot_area_index) | |
572 | debug_putstr("slots_fetch_random() failed!?\n"); | |
573 | return 0; | |
82fa9637 KC |
574 | } |
575 | ||
690eaa53 CF |
576 | static void __process_mem_region(struct mem_vector *entry, |
577 | unsigned long minimum, | |
578 | unsigned long image_size) | |
82fa9637 | 579 | { |
ed9f007e | 580 | struct mem_vector region, overlap; |
4cdba14f | 581 | unsigned long start_orig, end; |
87891b01 | 582 | struct mem_vector cur_entry; |
82fa9637 | 583 | |
ed9f007e | 584 | /* On 32-bit, ignore entries entirely above our maximum. */ |
87891b01 | 585 | if (IS_ENABLED(CONFIG_X86_32) && entry->start >= KERNEL_IMAGE_SIZE) |
82fa9637 KC |
586 | return; |
587 | ||
588 | /* Ignore entries entirely below our minimum. */ | |
87891b01 | 589 | if (entry->start + entry->size < minimum) |
82fa9637 KC |
590 | return; |
591 | ||
4cdba14f | 592 | /* Ignore entries above memory limit */ |
87891b01 BH |
593 | end = min(entry->size + entry->start, mem_limit); |
594 | if (entry->start >= end) | |
4cdba14f | 595 | return; |
87891b01 BH |
596 | cur_entry.start = entry->start; |
597 | cur_entry.size = end - entry->start; | |
4cdba14f | 598 | |
87891b01 | 599 | region.start = cur_entry.start; |
4cdba14f | 600 | region.size = cur_entry.size; |
82fa9637 | 601 | |
ed9f007e KC |
602 | /* Give up if slot area array is full. */ |
603 | while (slot_area_index < MAX_SLOT_AREA) { | |
604 | start_orig = region.start; | |
82fa9637 | 605 | |
ed9f007e KC |
606 | /* Potentially raise address to minimum location. */ |
607 | if (region.start < minimum) | |
608 | region.start = minimum; | |
82fa9637 | 609 | |
ed9f007e KC |
610 | /* Potentially raise address to meet alignment needs. */ |
611 | region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); | |
82fa9637 | 612 | |
27aac205 | 613 | /* Did we raise the address above the passed in memory entry? */ |
87891b01 | 614 | if (region.start > cur_entry.start + cur_entry.size) |
ed9f007e | 615 | return; |
82fa9637 | 616 | |
ed9f007e KC |
617 | /* Reduce size by any delta from the original address. */ |
618 | region.size -= region.start - start_orig; | |
82fa9637 | 619 | |
ed9f007e KC |
620 | /* On 32-bit, reduce region size to fit within max size. */ |
621 | if (IS_ENABLED(CONFIG_X86_32) && | |
622 | region.start + region.size > KERNEL_IMAGE_SIZE) | |
623 | region.size = KERNEL_IMAGE_SIZE - region.start; | |
624 | ||
625 | /* Return if region can't contain decompressed kernel */ | |
626 | if (region.size < image_size) | |
627 | return; | |
628 | ||
629 | /* If nothing overlaps, store the region and return. */ | |
630 | if (!mem_avoid_overlap(®ion, &overlap)) { | |
747ff626 | 631 | process_gb_huge_pages(®ion, image_size); |
ed9f007e KC |
632 | return; |
633 | } | |
634 | ||
635 | /* Store beginning of region if holds at least image_size. */ | |
636 | if (overlap.start > region.start + image_size) { | |
637 | struct mem_vector beginning; | |
638 | ||
639 | beginning.start = region.start; | |
640 | beginning.size = overlap.start - region.start; | |
747ff626 | 641 | process_gb_huge_pages(&beginning, image_size); |
ed9f007e KC |
642 | } |
643 | ||
644 | /* Return if overlap extends to or past end of region. */ | |
645 | if (overlap.start + overlap.size >= region.start + region.size) | |
646 | return; | |
647 | ||
648 | /* Clip off the overlapping region and start over. */ | |
649 | region.size -= overlap.start - region.start + overlap.size; | |
650 | region.start = overlap.start + overlap.size; | |
82fa9637 KC |
651 | } |
652 | } | |
653 | ||
690eaa53 CF |
654 | static bool process_mem_region(struct mem_vector *region, |
655 | unsigned long long minimum, | |
656 | unsigned long long image_size) | |
657 | { | |
658 | int i; | |
659 | /* | |
660 | * If no immovable memory found, or MEMORY_HOTREMOVE disabled, | |
661 | * use @region directly. | |
662 | */ | |
663 | if (!num_immovable_mem) { | |
664 | __process_mem_region(region, minimum, image_size); | |
665 | ||
666 | if (slot_area_index == MAX_SLOT_AREA) { | |
667 | debug_putstr("Aborted e820/efi memmap scan (slot_areas full)!\n"); | |
668 | return 1; | |
669 | } | |
670 | return 0; | |
671 | } | |
672 | ||
82df8261 | 673 | #if defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI) |
690eaa53 CF |
674 | /* |
675 | * If immovable memory found, filter the intersection between | |
676 | * immovable memory and @region. | |
677 | */ | |
678 | for (i = 0; i < num_immovable_mem; i++) { | |
679 | unsigned long long start, end, entry_end, region_end; | |
680 | struct mem_vector entry; | |
681 | ||
682 | if (!mem_overlaps(region, &immovable_mem[i])) | |
683 | continue; | |
684 | ||
685 | start = immovable_mem[i].start; | |
686 | end = start + immovable_mem[i].size; | |
687 | region_end = region->start + region->size; | |
688 | ||
689 | entry.start = clamp(region->start, start, end); | |
690 | entry_end = clamp(region_end, start, end); | |
691 | entry.size = entry_end - entry.start; | |
692 | ||
693 | __process_mem_region(&entry, minimum, image_size); | |
694 | ||
695 | if (slot_area_index == MAX_SLOT_AREA) { | |
696 | debug_putstr("Aborted e820/efi memmap scan when walking immovable regions(slot_areas full)!\n"); | |
697 | return 1; | |
698 | } | |
699 | } | |
700 | return 0; | |
701 | #endif | |
702 | } | |
703 | ||
c05cd797 BH |
704 | #ifdef CONFIG_EFI |
705 | /* | |
706 | * Returns true if mirror region found (and must have been processed | |
707 | * for slots adding) | |
708 | */ | |
709 | static bool | |
710 | process_efi_entries(unsigned long minimum, unsigned long image_size) | |
711 | { | |
712 | struct efi_info *e = &boot_params->efi_info; | |
713 | bool efi_mirror_found = false; | |
714 | struct mem_vector region; | |
715 | efi_memory_desc_t *md; | |
716 | unsigned long pmap; | |
717 | char *signature; | |
718 | u32 nr_desc; | |
719 | int i; | |
720 | ||
721 | signature = (char *)&e->efi_loader_signature; | |
722 | if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) && | |
723 | strncmp(signature, EFI64_LOADER_SIGNATURE, 4)) | |
724 | return false; | |
725 | ||
726 | #ifdef CONFIG_X86_32 | |
727 | /* Can't handle data above 4GB at this time */ | |
728 | if (e->efi_memmap_hi) { | |
729 | warn("EFI memmap is above 4GB, can't be handled now on x86_32. EFI should be disabled.\n"); | |
730 | return false; | |
731 | } | |
732 | pmap = e->efi_memmap; | |
733 | #else | |
734 | pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32)); | |
735 | #endif | |
736 | ||
737 | nr_desc = e->efi_memmap_size / e->efi_memdesc_size; | |
738 | for (i = 0; i < nr_desc; i++) { | |
739 | md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i); | |
740 | if (md->attribute & EFI_MEMORY_MORE_RELIABLE) { | |
c05cd797 | 741 | efi_mirror_found = true; |
0982adc7 | 742 | break; |
c05cd797 BH |
743 | } |
744 | } | |
745 | ||
0982adc7 NH |
746 | for (i = 0; i < nr_desc; i++) { |
747 | md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i); | |
748 | ||
749 | /* | |
750 | * Here we are more conservative in picking free memory than | |
751 | * the EFI spec allows: | |
752 | * | |
753 | * According to the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also | |
754 | * free memory and thus available to place the kernel image into, | |
755 | * but in practice there's firmware where using that memory leads | |
756 | * to crashes. | |
757 | * | |
758 | * Only EFI_CONVENTIONAL_MEMORY is guaranteed to be free. | |
759 | */ | |
760 | if (md->type != EFI_CONVENTIONAL_MEMORY) | |
761 | continue; | |
762 | ||
763 | if (efi_mirror_found && | |
764 | !(md->attribute & EFI_MEMORY_MORE_RELIABLE)) | |
765 | continue; | |
766 | ||
767 | region.start = md->phys_addr; | |
768 | region.size = md->num_pages << EFI_PAGE_SHIFT; | |
690eaa53 | 769 | if (process_mem_region(®ion, minimum, image_size)) |
0982adc7 | 770 | break; |
0982adc7 NH |
771 | } |
772 | return true; | |
c05cd797 BH |
773 | } |
774 | #else | |
775 | static inline bool | |
776 | process_efi_entries(unsigned long minimum, unsigned long image_size) | |
777 | { | |
778 | return false; | |
779 | } | |
780 | #endif | |
781 | ||
f62995c9 BH |
782 | static void process_e820_entries(unsigned long minimum, |
783 | unsigned long image_size) | |
82fa9637 KC |
784 | { |
785 | int i; | |
87891b01 | 786 | struct mem_vector region; |
f62995c9 BH |
787 | struct boot_e820_entry *entry; |
788 | ||
789 | /* Verify potential e820 positions, appending to slots list. */ | |
790 | for (i = 0; i < boot_params->e820_entries; i++) { | |
791 | entry = &boot_params->e820_table[i]; | |
792 | /* Skip non-RAM entries. */ | |
793 | if (entry->type != E820_TYPE_RAM) | |
794 | continue; | |
87891b01 BH |
795 | region.start = entry->addr; |
796 | region.size = entry->size; | |
690eaa53 | 797 | if (process_mem_region(®ion, minimum, image_size)) |
f62995c9 | 798 | break; |
f62995c9 BH |
799 | } |
800 | } | |
82fa9637 | 801 | |
f62995c9 BH |
802 | static unsigned long find_random_phys_addr(unsigned long minimum, |
803 | unsigned long image_size) | |
804 | { | |
f2844249 DJ |
805 | /* Check if we had too many memmaps. */ |
806 | if (memmap_too_large) { | |
c05cd797 | 807 | debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n"); |
f2844249 DJ |
808 | return 0; |
809 | } | |
810 | ||
82fa9637 KC |
811 | /* Make sure minimum is aligned. */ |
812 | minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); | |
813 | ||
c05cd797 BH |
814 | if (process_efi_entries(minimum, image_size)) |
815 | return slots_fetch_random(); | |
816 | ||
f62995c9 | 817 | process_e820_entries(minimum, image_size); |
82fa9637 KC |
818 | return slots_fetch_random(); |
819 | } | |
820 | ||
071a7493 BH |
821 | static unsigned long find_random_virt_addr(unsigned long minimum, |
822 | unsigned long image_size) | |
823 | { | |
824 | unsigned long slots, random_addr; | |
825 | ||
826 | /* Make sure minimum is aligned. */ | |
827 | minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); | |
828 | /* Align image_size for easy slot calculations. */ | |
829 | image_size = ALIGN(image_size, CONFIG_PHYSICAL_ALIGN); | |
830 | ||
831 | /* | |
832 | * There are how many CONFIG_PHYSICAL_ALIGN-sized slots | |
833 | * that can hold image_size within the range of minimum to | |
834 | * KERNEL_IMAGE_SIZE? | |
835 | */ | |
836 | slots = (KERNEL_IMAGE_SIZE - minimum - image_size) / | |
837 | CONFIG_PHYSICAL_ALIGN + 1; | |
838 | ||
d899a7d1 | 839 | random_addr = kaslr_get_random_long("Virtual") % slots; |
071a7493 BH |
840 | |
841 | return random_addr * CONFIG_PHYSICAL_ALIGN + minimum; | |
842 | } | |
843 | ||
549f90db BP |
844 | /* |
845 | * Since this function examines addresses much more numerically, | |
846 | * it takes the input and output pointers as 'unsigned long'. | |
847 | */ | |
8391c73c BH |
848 | void choose_random_location(unsigned long input, |
849 | unsigned long input_size, | |
850 | unsigned long *output, | |
851 | unsigned long output_size, | |
852 | unsigned long *virt_addr) | |
8ab3820f | 853 | { |
e066cc47 | 854 | unsigned long random_addr, min_addr; |
8ab3820f KC |
855 | |
856 | if (cmdline_find_option_bool("nokaslr")) { | |
0f8ede1b | 857 | warn("KASLR disabled: 'nokaslr' on cmdline."); |
8391c73c | 858 | return; |
8ab3820f KC |
859 | } |
860 | ||
4c2b4058 KS |
861 | #ifdef CONFIG_X86_5LEVEL |
862 | if (__read_cr4() & X86_CR4_LA57) { | |
ad3fe525 | 863 | __pgtable_l5_enabled = 1; |
b16e770b KS |
864 | pgdir_shift = 48; |
865 | ptrs_per_p4d = 512; | |
4c2b4058 KS |
866 | } |
867 | #endif | |
868 | ||
6655e0aa | 869 | boot_params->hdr.loadflags |= KASLR_FLAG; |
78cac48c | 870 | |
11fdf97a KC |
871 | /* Prepare to add new identity pagetables on demand. */ |
872 | initialize_identity_maps(); | |
873 | ||
82fa9637 | 874 | /* Record the various known unsafe memory ranges. */ |
8391c73c | 875 | mem_avoid_init(input, input_size, *output); |
82fa9637 | 876 | |
e066cc47 YL |
877 | /* |
878 | * Low end of the randomization range should be the | |
879 | * smaller of 512M or the initial kernel image | |
880 | * location: | |
881 | */ | |
882 | min_addr = min(*output, 512UL << 20); | |
883 | ||
c05cd797 | 884 | /* Walk available memory entries to find a random address. */ |
e066cc47 | 885 | random_addr = find_random_phys_addr(min_addr, output_size); |
9016875d | 886 | if (!random_addr) { |
f2844249 | 887 | warn("Physical KASLR disabled: no suitable memory region!"); |
8391c73c BH |
888 | } else { |
889 | /* Update the new physical address location. */ | |
890 | if (*output != random_addr) { | |
891 | add_identity_map(random_addr, output_size); | |
892 | *output = random_addr; | |
893 | } | |
da63b6b2 BH |
894 | |
895 | /* | |
896 | * This loads the identity mapping page table. | |
897 | * This should only be done if a new physical address | |
898 | * is found for the kernel, otherwise we should keep | |
899 | * the old page table to make it be like the "nokaslr" | |
900 | * case. | |
901 | */ | |
902 | finalize_identity_maps(); | |
82fa9637 KC |
903 | } |
904 | ||
8391c73c BH |
905 | |
906 | /* Pick random virtual address starting from LOAD_PHYSICAL_ADDR. */ | |
907 | if (IS_ENABLED(CONFIG_X86_64)) | |
908 | random_addr = find_random_virt_addr(LOAD_PHYSICAL_ADDR, output_size); | |
909 | *virt_addr = random_addr; | |
8ab3820f | 910 | } |