Commit | Line | Data |
---|---|---|
b2441318 | 1 | // SPDX-License-Identifier: GPL-2.0 |
7de828df KC |
2 | /* |
3 | * kaslr.c | |
4 | * | |
5 | * This contains the routines needed to generate a reasonable level of | |
6 | * entropy to choose a randomized kernel base address offset in support | |
7 | * of Kernel Address Space Layout Randomization (KASLR). Additionally | |
8 | * handles walking the physical memory maps (and tracking memory regions | |
9 | * to avoid) in order to select a physical memory location that can | |
10 | * contain the entire properly aligned running kernel image. | |
11 | * | |
12 | */ | |
d52e7d5a BH |
13 | |
14 | /* | |
15 | * isspace() in linux/ctype.h is expected by next_args() to filter | |
16 | * out "space/lf/tab". While boot/ctype.h conflicts with linux/ctype.h, | |
17 | * since isdigit() is implemented in both of them. Hence disable it | |
18 | * here. | |
19 | */ | |
20 | #define BOOT_CTYPE_H | |
21 | ||
8ab3820f | 22 | #include "misc.h" |
dc425a6e | 23 | #include "error.h" |
5b8b9cf7 | 24 | #include "../string.h" |
5dc91f2d | 25 | #include "efi.h" |
8ab3820f | 26 | |
a653f356 | 27 | #include <generated/compile.h> |
2df8220c | 28 | #include <generated/utsversion.h> |
a653f356 | 29 | #include <generated/utsrelease.h> |
a653f356 | 30 | |
76167e5c AS |
31 | #define _SETUP |
32 | #include <asm/setup.h> /* For COMMAND_LINE_SIZE */ | |
33 | #undef _SETUP | |
34 | ||
d52e7d5a BH |
35 | extern unsigned long get_cmd_line_ptr(void); |
36 | ||
a653f356 | 37 | /* Simplified build-specific string for starting entropy. */ |
327f7d72 | 38 | static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" |
a653f356 KC |
39 | LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; |
40 | ||
/*
 * Fold @size bytes starting at @area into @hash, one unsigned long at a
 * time: the running hash is rotated right by 7 bits and the next word is
 * XORed in. Trailing bytes that do not fill a whole unsigned long are
 * not consumed.
 */
static unsigned long rotate_xor(unsigned long hash, const void *area,
				size_t size)
{
	const size_t nr_bits = sizeof(hash) * 8;
	const unsigned long *word = (const unsigned long *)area;
	const unsigned long *end = word + size / sizeof(hash);

	while (word < end) {
		/* Rotate by odd number of bits and XOR. */
		hash = (hash >> 7) | (hash << (nr_bits - 7));
		hash ^= *word++;
	}

	return hash;
}
55 | ||
56 | /* Attempt to create a simple but unpredictable starting entropy. */ | |
d899a7d1 | 57 | static unsigned long get_boot_seed(void) |
a653f356 KC |
58 | { |
59 | unsigned long hash = 0; | |
60 | ||
61 | hash = rotate_xor(hash, build_str, sizeof(build_str)); | |
d55d5bc5 | 62 | hash = rotate_xor(hash, boot_params_ptr, sizeof(*boot_params_ptr)); |
a653f356 KC |
63 | |
64 | return hash; | |
65 | } | |
66 | ||
d899a7d1 TG |
67 | #define KASLR_COMPRESSED_BOOT |
68 | #include "../../lib/kaslr.c" | |
8ab3820f | 69 | |
82fa9637 | 70 | |
f2844249 DJ |
71 | /* Only supporting at most 4 unusable memmap regions with kaslr */ |
72 | #define MAX_MEMMAP_REGIONS 4 | |
73 | ||
74 | static bool memmap_too_large; | |
75 | ||
d52e7d5a | 76 | |
45128694 AS |
77 | /* |
78 | * Store memory limit: MAXMEM on 64-bit and KERNEL_IMAGE_SIZE on 32-bit. | |
79 | * It may be reduced by "mem=nn[KMG]" or "memmap=nn[KMG]" command line options. | |
80 | */ | |
3a066990 | 81 | static u64 mem_limit; |
4cdba14f | 82 | |
690eaa53 CF |
83 | /* Number of immovable memory regions */ |
84 | static int num_immovable_mem; | |
4cdba14f | 85 | |
ed09acde KC |
86 | enum mem_avoid_index { |
87 | MEM_AVOID_ZO_RANGE = 0, | |
88 | MEM_AVOID_INITRD, | |
89 | MEM_AVOID_CMDLINE, | |
90 | MEM_AVOID_BOOTPARAMS, | |
f2844249 DJ |
91 | MEM_AVOID_MEMMAP_BEGIN, |
92 | MEM_AVOID_MEMMAP_END = MEM_AVOID_MEMMAP_BEGIN + MAX_MEMMAP_REGIONS - 1, | |
ed09acde KC |
93 | MEM_AVOID_MAX, |
94 | }; | |
95 | ||
e290e8c5 | 96 | static struct mem_vector mem_avoid[MEM_AVOID_MAX]; |
82fa9637 | 97 | |
82fa9637 KC |
98 | static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) |
99 | { | |
100 | /* Item one is entirely before item two. */ | |
101 | if (one->start + one->size <= two->start) | |
102 | return false; | |
103 | /* Item one is entirely after item two. */ | |
104 | if (one->start >= two->start + two->size) | |
105 | return false; | |
106 | return true; | |
107 | } | |
108 | ||
/* Return a pointer to the first character of @str for which isspace() is false. */
char *skip_spaces(const char *str)
{
	const char *cur;

	for (cur = str; isspace(*cur); cur++)
		;

	return (char *)cur;
}
d52e7d5a BH |
115 | #include "../../../../lib/ctype.c" |
116 | #include "../../../../lib/cmdline.c" | |
f2844249 DJ |
117 | |
118 | static int | |
37aee82c | 119 | parse_memmap(char *p, u64 *start, u64 *size) |
f2844249 DJ |
120 | { |
121 | char *oldp; | |
122 | ||
123 | if (!p) | |
124 | return -EINVAL; | |
125 | ||
126 | /* We don't care about this option here */ | |
127 | if (!strncmp(p, "exactmap", 8)) | |
128 | return -EINVAL; | |
129 | ||
130 | oldp = p; | |
d52e7d5a | 131 | *size = memparse(p, &p); |
f2844249 DJ |
132 | if (p == oldp) |
133 | return -EINVAL; | |
134 | ||
135 | switch (*p) { | |
f2844249 DJ |
136 | case '#': |
137 | case '$': | |
138 | case '!': | |
d52e7d5a | 139 | *start = memparse(p + 1, &p); |
f2844249 | 140 | return 0; |
4cdba14f | 141 | case '@': |
37aee82c AB |
142 | /* |
143 | * memmap=nn@ss specifies usable region, should | |
144 | * be skipped | |
145 | */ | |
146 | *size = 0; | |
df561f66 | 147 | fallthrough; |
4cdba14f BH |
148 | default: |
149 | /* | |
150 | * If w/o offset, only size specified, memmap=nn[KMG] has the | |
151 | * same behaviour as mem=nn[KMG]. It limits the max address | |
152 | * system can use. Region above the limit should be avoided. | |
153 | */ | |
154 | *start = 0; | |
f2844249 DJ |
155 | return 0; |
156 | } | |
157 | ||
158 | return -EINVAL; | |
159 | } | |
160 | ||
37aee82c | 161 | static void mem_avoid_memmap(char *str) |
f2844249 | 162 | { |
d52e7d5a | 163 | static int i; |
f2844249 | 164 | |
d52e7d5a | 165 | if (i >= MAX_MEMMAP_REGIONS) |
f2844249 DJ |
166 | return; |
167 | ||
f2844249 DJ |
168 | while (str && (i < MAX_MEMMAP_REGIONS)) { |
169 | int rc; | |
3a066990 | 170 | u64 start, size; |
f2844249 DJ |
171 | char *k = strchr(str, ','); |
172 | ||
173 | if (k) | |
174 | *k++ = 0; | |
175 | ||
37aee82c | 176 | rc = parse_memmap(str, &start, &size); |
f2844249 DJ |
177 | if (rc < 0) |
178 | break; | |
179 | str = k; | |
4cdba14f BH |
180 | |
181 | if (start == 0) { | |
182 | /* Store the specified memory limit if size > 0 */ | |
45128694 | 183 | if (size > 0 && size < mem_limit) |
4cdba14f BH |
184 | mem_limit = size; |
185 | ||
f2844249 | 186 | continue; |
4cdba14f | 187 | } |
f2844249 DJ |
188 | |
189 | mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].start = start; | |
190 | mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].size = size; | |
191 | i++; | |
192 | } | |
193 | ||
194 | /* More than 4 memmaps, fail kaslr */ | |
195 | if ((i >= MAX_MEMMAP_REGIONS) && str) | |
196 | memmap_too_large = true; | |
197 | } | |
198 | ||
9b912485 BH |
199 | /* Store the number of 1GB huge pages which users specified: */ |
200 | static unsigned long max_gb_huge_pages; | |
201 | ||
202 | static void parse_gb_huge_pages(char *param, char *val) | |
203 | { | |
204 | static bool gbpage_sz; | |
205 | char *p; | |
206 | ||
207 | if (!strcmp(param, "hugepagesz")) { | |
208 | p = val; | |
209 | if (memparse(p, &p) != PUD_SIZE) { | |
210 | gbpage_sz = false; | |
211 | return; | |
212 | } | |
213 | ||
214 | if (gbpage_sz) | |
215 | warn("Repeatedly set hugeTLB page size of 1G!\n"); | |
216 | gbpage_sz = true; | |
217 | return; | |
218 | } | |
219 | ||
220 | if (!strcmp(param, "hugepages") && gbpage_sz) { | |
221 | p = val; | |
222 | max_gb_huge_pages = simple_strtoull(p, &p, 0); | |
223 | return; | |
224 | } | |
225 | } | |
226 | ||
44060e8a | 227 | static void handle_mem_options(void) |
d52e7d5a BH |
228 | { |
229 | char *args = (char *)get_cmd_line_ptr(); | |
709709ac | 230 | size_t len; |
d52e7d5a BH |
231 | char *tmp_cmdline; |
232 | char *param, *val; | |
4cdba14f | 233 | u64 mem_size; |
d52e7d5a | 234 | |
709709ac | 235 | if (!args) |
44060e8a | 236 | return; |
d52e7d5a | 237 | |
76167e5c | 238 | len = strnlen(args, COMMAND_LINE_SIZE-1); |
d52e7d5a | 239 | tmp_cmdline = malloc(len + 1); |
69550d41 | 240 | if (!tmp_cmdline) |
d52e7d5a BH |
241 | error("Failed to allocate space for tmp_cmdline"); |
242 | ||
243 | memcpy(tmp_cmdline, args, len); | |
244 | tmp_cmdline[len] = 0; | |
245 | args = tmp_cmdline; | |
246 | ||
247 | /* Chew leading spaces */ | |
248 | args = skip_spaces(args); | |
249 | ||
250 | while (*args) { | |
251 | args = next_arg(args, ¶m, &val); | |
252 | /* Stop at -- */ | |
e2ee6173 AS |
253 | if (!val && strcmp(param, "--") == 0) |
254 | break; | |
d52e7d5a | 255 | |
4cdba14f | 256 | if (!strcmp(param, "memmap")) { |
37aee82c | 257 | mem_avoid_memmap(val); |
50def269 | 258 | } else if (IS_ENABLED(CONFIG_X86_64) && strstr(param, "hugepages")) { |
747ff626 | 259 | parse_gb_huge_pages(param, val); |
4cdba14f BH |
260 | } else if (!strcmp(param, "mem")) { |
261 | char *p = val; | |
262 | ||
263 | if (!strcmp(p, "nopentium")) | |
264 | continue; | |
265 | mem_size = memparse(p, &p); | |
44060e8a | 266 | if (mem_size == 0) |
e2ee6173 | 267 | break; |
44060e8a | 268 | |
45128694 AS |
269 | if (mem_size < mem_limit) |
270 | mem_limit = mem_size; | |
4cdba14f | 271 | } |
d52e7d5a BH |
272 | } |
273 | ||
274 | free(tmp_cmdline); | |
44060e8a | 275 | return; |
d52e7d5a BH |
276 | } |
277 | ||
9dc1969c | 278 | /* |
45128694 AS |
279 | * In theory, KASLR can put the kernel anywhere in the range of [16M, MAXMEM) |
280 | * on 64-bit, and [16M, KERNEL_IMAGE_SIZE) on 32-bit. | |
281 | * | |
ed09acde KC |
282 | * The mem_avoid array is used to store the ranges that need to be avoided |
283 | * when KASLR searches for an appropriate random address. We must avoid any | |
9dc1969c | 284 | * regions that are unsafe to overlap with during decompression, and other |
ed09acde KC |
285 | * things like the initrd, cmdline and boot_params. This comment seeks to |
286 | * explain mem_avoid as clearly as possible since incorrect mem_avoid | |
287 | * memory ranges lead to really hard to debug boot failures. | |
288 | * | |
289 | * The initrd, cmdline, and boot_params are trivial to identify for | |
cb18ef0d | 290 | * avoiding. They are MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, and |
ed09acde KC |
291 | * MEM_AVOID_BOOTPARAMS respectively below. |
292 | * | |
293 | * What is not obvious how to avoid is the range of memory that is used | |
294 | * during decompression (MEM_AVOID_ZO_RANGE below). This range must cover | |
295 | * the compressed kernel (ZO) and its run space, which is used to extract | |
296 | * the uncompressed kernel (VO) and relocs. | |
297 | * | |
298 | * ZO's full run size sits against the end of the decompression buffer, so | |
299 | * we can calculate where text, data, bss, etc of ZO are positioned more | |
300 | * easily. | |
301 | * | |
302 | * For additional background, the decompression calculations can be found | |
303 | * in header.S, and the memory diagram is based on the one found in misc.c. | |
304 | * | |
305 | * The following conditions are already enforced by the image layouts and | |
306 | * associated code: | |
307 | * - input + input_size >= output + output_size | |
308 | * - kernel_total_size <= init_size | |
309 | * - kernel_total_size <= output_size (see Note below) | |
310 | * - output + init_size >= output + output_size | |
9dc1969c | 311 | * |
ed09acde KC |
312 | * (Note that kernel_total_size and output_size have no fundamental |
313 | * relationship, but output_size is passed to choose_random_location | |
314 | * as a maximum of the two. The diagram is showing a case where | |
315 | * kernel_total_size is larger than output_size, but this case is | |
316 | * handled by bumping output_size.) | |
9dc1969c | 317 | * |
ed09acde | 318 | * The above conditions can be illustrated by a diagram: |
9dc1969c | 319 | * |
ed09acde KC |
320 | * 0 output input input+input_size output+init_size |
321 | * | | | | | | |
322 | * | | | | | | |
323 | * |-----|--------|--------|--------------|-----------|--|-------------| | |
324 | * | | | | |
325 | * | | | | |
326 | * output+init_size-ZO_INIT_SIZE output+output_size output+kernel_total_size | |
9dc1969c | 327 | * |
ed09acde KC |
328 | * [output, output+init_size) is the entire memory range used for |
329 | * extracting the compressed image. | |
9dc1969c | 330 | * |
ed09acde KC |
331 | * [output, output+kernel_total_size) is the range needed for the |
332 | * uncompressed kernel (VO) and its run size (bss, brk, etc). | |
9dc1969c | 333 | * |
ed09acde KC |
334 | * [output, output+output_size) is VO plus relocs (i.e. the entire |
335 | * uncompressed payload contained by ZO). This is the area of the buffer | |
336 | * written to during decompression. | |
9dc1969c | 337 | * |
ed09acde KC |
338 | * [output+init_size-ZO_INIT_SIZE, output+init_size) is the worst-case |
339 | * range of the copied ZO and decompression code. (i.e. the range | |
340 | * covered backwards of size ZO_INIT_SIZE, starting from output+init_size.) | |
9dc1969c | 341 | * |
ed09acde KC |
342 | * [input, input+input_size) is the original copied compressed image (ZO) |
343 | * (i.e. it does not include its run size). This range must be avoided | |
344 | * because it contains the data used for decompression. | |
9dc1969c | 345 | * |
ed09acde KC |
346 | * [input+input_size, output+init_size) is [_text, _end) for ZO. This |
347 | * range includes ZO's heap and stack, and must be avoided since it | |
348 | * performs the decompression. | |
9dc1969c | 349 | * |
ed09acde KC |
350 | * Since the above two ranges need to be avoided and they are adjacent, |
351 | * they can be merged, resulting in: [input, output+init_size) which | |
352 | * becomes the MEM_AVOID_ZO_RANGE below. | |
9dc1969c | 353 | */ |
82fa9637 | 354 | static void mem_avoid_init(unsigned long input, unsigned long input_size, |
9dc1969c | 355 | unsigned long output) |
82fa9637 | 356 | { |
d55d5bc5 | 357 | unsigned long init_size = boot_params_ptr->hdr.init_size; |
82fa9637 | 358 | u64 initrd_start, initrd_size; |
709709ac | 359 | unsigned long cmd_line, cmd_line_size; |
82fa9637 KC |
360 | |
361 | /* | |
362 | * Avoid the region that is unsafe to overlap during | |
9dc1969c | 363 | * decompression. |
82fa9637 | 364 | */ |
ed09acde KC |
365 | mem_avoid[MEM_AVOID_ZO_RANGE].start = input; |
366 | mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input; | |
82fa9637 KC |
367 | |
368 | /* Avoid initrd. */ | |
d55d5bc5 AB |
369 | initrd_start = (u64)boot_params_ptr->ext_ramdisk_image << 32; |
370 | initrd_start |= boot_params_ptr->hdr.ramdisk_image; | |
371 | initrd_size = (u64)boot_params_ptr->ext_ramdisk_size << 32; | |
372 | initrd_size |= boot_params_ptr->hdr.ramdisk_size; | |
ed09acde KC |
373 | mem_avoid[MEM_AVOID_INITRD].start = initrd_start; |
374 | mem_avoid[MEM_AVOID_INITRD].size = initrd_size; | |
3a94707d | 375 | /* No need to set mapping for initrd, it will be handled in VO. */ |
82fa9637 KC |
376 | |
377 | /* Avoid kernel command line. */ | |
709709ac | 378 | cmd_line = get_cmd_line_ptr(); |
82fa9637 | 379 | /* Calculate size of cmd_line. */ |
709709ac | 380 | if (cmd_line) { |
76167e5c | 381 | cmd_line_size = strnlen((char *)cmd_line, COMMAND_LINE_SIZE-1) + 1; |
709709ac AS |
382 | mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line; |
383 | mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size; | |
709709ac | 384 | } |
82fa9637 | 385 | |
ed09acde | 386 | /* Avoid boot parameters. */ |
d55d5bc5 AB |
387 | mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params_ptr; |
388 | mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params_ptr); | |
3a94707d KC |
389 | |
390 | /* We don't need to set a mapping for setup_data. */ | |
391 | ||
f2844249 | 392 | /* Mark the memmap regions we need to avoid */ |
747ff626 | 393 | handle_mem_options(); |
f2844249 | 394 | |
690eaa53 CF |
395 | /* Enumerate the immovable memory regions */ |
396 | num_immovable_mem = count_immovable_mem_regions(); | |
82fa9637 KC |
397 | } |
398 | ||
06486d6c KC |
399 | /* |
400 | * Does this memory vector overlap a known avoided area? If so, record the | |
401 | * overlap region with the lowest address. | |
402 | */ | |
403 | static bool mem_avoid_overlap(struct mem_vector *img, | |
404 | struct mem_vector *overlap) | |
82fa9637 KC |
405 | { |
406 | int i; | |
0cacbfbe | 407 | struct setup_data *ptr; |
0eb1a8af | 408 | u64 earliest = img->start + img->size; |
06486d6c | 409 | bool is_overlapping = false; |
82fa9637 KC |
410 | |
411 | for (i = 0; i < MEM_AVOID_MAX; i++) { | |
06486d6c KC |
412 | if (mem_overlaps(img, &mem_avoid[i]) && |
413 | mem_avoid[i].start < earliest) { | |
414 | *overlap = mem_avoid[i]; | |
6daa2ec0 | 415 | earliest = overlap->start; |
06486d6c KC |
416 | is_overlapping = true; |
417 | } | |
82fa9637 KC |
418 | } |
419 | ||
0cacbfbe | 420 | /* Avoid all entries in the setup_data linked list. */ |
d55d5bc5 | 421 | ptr = (struct setup_data *)(unsigned long)boot_params_ptr->hdr.setup_data; |
0cacbfbe KC |
422 | while (ptr) { |
423 | struct mem_vector avoid; | |
424 | ||
20cc2888 | 425 | avoid.start = (unsigned long)ptr; |
0cacbfbe KC |
426 | avoid.size = sizeof(*ptr) + ptr->len; |
427 | ||
06486d6c KC |
428 | if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) { |
429 | *overlap = avoid; | |
6daa2ec0 | 430 | earliest = overlap->start; |
06486d6c KC |
431 | is_overlapping = true; |
432 | } | |
0cacbfbe | 433 | |
b3c72fc9 DK |
434 | if (ptr->type == SETUP_INDIRECT && |
435 | ((struct setup_indirect *)ptr->data)->type != SETUP_INDIRECT) { | |
436 | avoid.start = ((struct setup_indirect *)ptr->data)->addr; | |
437 | avoid.size = ((struct setup_indirect *)ptr->data)->len; | |
438 | ||
439 | if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) { | |
440 | *overlap = avoid; | |
441 | earliest = overlap->start; | |
442 | is_overlapping = true; | |
443 | } | |
444 | } | |
445 | ||
0cacbfbe KC |
446 | ptr = (struct setup_data *)(unsigned long)ptr->next; |
447 | } | |
448 | ||
06486d6c | 449 | return is_overlapping; |
82fa9637 KC |
450 | } |
451 | ||
c401cf15 | 452 | struct slot_area { |
0eb1a8af | 453 | u64 addr; |
d6d0f36c | 454 | unsigned long num; |
c401cf15 BH |
455 | }; |
456 | ||
457 | #define MAX_SLOT_AREA 100 | |
458 | ||
459 | static struct slot_area slot_areas[MAX_SLOT_AREA]; | |
d6d0f36c | 460 | static unsigned int slot_area_index; |
e290e8c5 | 461 | static unsigned long slot_max; |
82fa9637 | 462 | |
c401cf15 BH |
463 | static void store_slot_info(struct mem_vector *region, unsigned long image_size) |
464 | { | |
465 | struct slot_area slot_area; | |
466 | ||
467 | if (slot_area_index == MAX_SLOT_AREA) | |
468 | return; | |
469 | ||
470 | slot_area.addr = region->start; | |
46a5b29a | 471 | slot_area.num = 1 + (region->size - image_size) / CONFIG_PHYSICAL_ALIGN; |
c401cf15 | 472 | |
46a5b29a AS |
473 | slot_areas[slot_area_index++] = slot_area; |
474 | slot_max += slot_area.num; | |
c401cf15 BH |
475 | } |
476 | ||
9b912485 BH |
477 | /* |
478 | * Skip as many 1GB huge pages as possible in the passed region | |
479 | * according to the number which users specified: | |
480 | */ | |
481 | static void | |
482 | process_gb_huge_pages(struct mem_vector *region, unsigned long image_size) | |
483 | { | |
0eb1a8af AS |
484 | u64 pud_start, pud_end; |
485 | unsigned long gb_huge_pages; | |
9b912485 | 486 | struct mem_vector tmp; |
9b912485 | 487 | |
50def269 | 488 | if (!IS_ENABLED(CONFIG_X86_64) || !max_gb_huge_pages) { |
9b912485 BH |
489 | store_slot_info(region, image_size); |
490 | return; | |
491 | } | |
492 | ||
be9e8d95 AS |
493 | /* Are there any 1GB pages in the region? */ |
494 | pud_start = ALIGN(region->start, PUD_SIZE); | |
495 | pud_end = ALIGN_DOWN(region->start + region->size, PUD_SIZE); | |
9b912485 BH |
496 | |
497 | /* No good 1GB huge pages found: */ | |
be9e8d95 | 498 | if (pud_start >= pud_end) { |
9b912485 BH |
499 | store_slot_info(region, image_size); |
500 | return; | |
501 | } | |
502 | ||
be9e8d95 AS |
503 | /* Check if the head part of the region is usable. */ |
504 | if (pud_start >= region->start + image_size) { | |
9b912485 | 505 | tmp.start = region->start; |
be9e8d95 | 506 | tmp.size = pud_start - region->start; |
9b912485 BH |
507 | store_slot_info(&tmp, image_size); |
508 | } | |
509 | ||
be9e8d95 AS |
510 | /* Skip the good 1GB pages. */ |
511 | gb_huge_pages = (pud_end - pud_start) >> PUD_SHIFT; | |
512 | if (gb_huge_pages > max_gb_huge_pages) { | |
513 | pud_end = pud_start + (max_gb_huge_pages << PUD_SHIFT); | |
514 | max_gb_huge_pages = 0; | |
515 | } else { | |
516 | max_gb_huge_pages -= gb_huge_pages; | |
517 | } | |
518 | ||
519 | /* Check if the tail part of the region is usable. */ | |
520 | if (region->start + region->size >= pud_end + image_size) { | |
521 | tmp.start = pud_end; | |
522 | tmp.size = region->start + region->size - pud_end; | |
9b912485 BH |
523 | store_slot_info(&tmp, image_size); |
524 | } | |
525 | } | |
526 | ||
0eb1a8af | 527 | static u64 slots_fetch_random(void) |
82fa9637 | 528 | { |
ed9f007e | 529 | unsigned long slot; |
d6d0f36c | 530 | unsigned int i; |
ed9f007e | 531 | |
82fa9637 KC |
532 | /* Handle case of no slots stored. */ |
533 | if (slot_max == 0) | |
534 | return 0; | |
535 | ||
d899a7d1 | 536 | slot = kaslr_get_random_long("Physical") % slot_max; |
ed9f007e KC |
537 | |
538 | for (i = 0; i < slot_area_index; i++) { | |
539 | if (slot >= slot_areas[i].num) { | |
540 | slot -= slot_areas[i].num; | |
541 | continue; | |
542 | } | |
0eb1a8af | 543 | return slot_areas[i].addr + ((u64)slot * CONFIG_PHYSICAL_ALIGN); |
ed9f007e KC |
544 | } |
545 | ||
546 | if (i == slot_area_index) | |
547 | debug_putstr("slots_fetch_random() failed!?\n"); | |
548 | return 0; | |
82fa9637 KC |
549 | } |
550 | ||
690eaa53 CF |
551 | static void __process_mem_region(struct mem_vector *entry, |
552 | unsigned long minimum, | |
553 | unsigned long image_size) | |
82fa9637 | 554 | { |
ed9f007e | 555 | struct mem_vector region, overlap; |
0eb1a8af | 556 | u64 region_end; |
82fa9637 | 557 | |
bf457be1 | 558 | /* Enforce minimum and memory limit. */ |
3a066990 | 559 | region.start = max_t(u64, entry->start, minimum); |
bf457be1 | 560 | region_end = min(entry->start + entry->size, mem_limit); |
82fa9637 | 561 | |
ed9f007e KC |
562 | /* Give up if slot area array is full. */ |
563 | while (slot_area_index < MAX_SLOT_AREA) { | |
ed9f007e KC |
564 | /* Potentially raise address to meet alignment needs. */ |
565 | region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); | |
82fa9637 | 566 | |
27aac205 | 567 | /* Did we raise the address above the passed in memory entry? */ |
bf457be1 | 568 | if (region.start > region_end) |
ed9f007e | 569 | return; |
82fa9637 | 570 | |
ed9f007e | 571 | /* Reduce size by any delta from the original address. */ |
bf457be1 | 572 | region.size = region_end - region.start; |
ed9f007e KC |
573 | |
574 | /* Return if region can't contain decompressed kernel */ | |
575 | if (region.size < image_size) | |
576 | return; | |
577 | ||
578 | /* If nothing overlaps, store the region and return. */ | |
579 | if (!mem_avoid_overlap(®ion, &overlap)) { | |
747ff626 | 580 | process_gb_huge_pages(®ion, image_size); |
ed9f007e KC |
581 | return; |
582 | } | |
583 | ||
584 | /* Store beginning of region if holds at least image_size. */ | |
8d1cf859 | 585 | if (overlap.start >= region.start + image_size) { |
ef7b07d5 AS |
586 | region.size = overlap.start - region.start; |
587 | process_gb_huge_pages(®ion, image_size); | |
ed9f007e KC |
588 | } |
589 | ||
ed9f007e | 590 | /* Clip off the overlapping region and start over. */ |
ed9f007e | 591 | region.start = overlap.start + overlap.size; |
82fa9637 KC |
592 | } |
593 | } | |
594 | ||
690eaa53 | 595 | static bool process_mem_region(struct mem_vector *region, |
e4cb955b AS |
596 | unsigned long minimum, |
597 | unsigned long image_size) | |
690eaa53 CF |
598 | { |
599 | int i; | |
600 | /* | |
601 | * If no immovable memory found, or MEMORY_HOTREMOVE disabled, | |
602 | * use @region directly. | |
603 | */ | |
604 | if (!num_immovable_mem) { | |
605 | __process_mem_region(region, minimum, image_size); | |
606 | ||
607 | if (slot_area_index == MAX_SLOT_AREA) { | |
608 | debug_putstr("Aborted e820/efi memmap scan (slot_areas full)!\n"); | |
21d6a7dc | 609 | return true; |
690eaa53 | 610 | } |
21d6a7dc | 611 | return false; |
690eaa53 CF |
612 | } |
613 | ||
82df8261 | 614 | #if defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI) |
690eaa53 CF |
615 | /* |
616 | * If immovable memory found, filter the intersection between | |
617 | * immovable memory and @region. | |
618 | */ | |
619 | for (i = 0; i < num_immovable_mem; i++) { | |
3a066990 | 620 | u64 start, end, entry_end, region_end; |
690eaa53 CF |
621 | struct mem_vector entry; |
622 | ||
623 | if (!mem_overlaps(region, &immovable_mem[i])) | |
624 | continue; | |
625 | ||
626 | start = immovable_mem[i].start; | |
627 | end = start + immovable_mem[i].size; | |
628 | region_end = region->start + region->size; | |
629 | ||
630 | entry.start = clamp(region->start, start, end); | |
631 | entry_end = clamp(region_end, start, end); | |
632 | entry.size = entry_end - entry.start; | |
633 | ||
634 | __process_mem_region(&entry, minimum, image_size); | |
635 | ||
636 | if (slot_area_index == MAX_SLOT_AREA) { | |
637 | debug_putstr("Aborted e820/efi memmap scan when walking immovable regions(slot_areas full)!\n"); | |
5b3fd8aa | 638 | return true; |
690eaa53 CF |
639 | } |
640 | } | |
690eaa53 | 641 | #endif |
ee92fa03 | 642 | return false; |
690eaa53 CF |
643 | } |
644 | ||
c05cd797 | 645 | #ifdef CONFIG_EFI |
3fd1239a KS |
646 | |
647 | /* | |
648 | * Only EFI_CONVENTIONAL_MEMORY and EFI_UNACCEPTED_MEMORY (if supported) are | |
649 | * guaranteed to be free. | |
650 | * | |
651 | * Pick free memory more conservatively than the EFI spec allows: according to | |
652 | * the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also free memory and thus | |
653 | * available to place the kernel image into, but in practice there's firmware | |
654 | * where using that memory leads to crashes. Buggy vendor EFI code registers | |
655 | * for an event that triggers on SetVirtualAddressMap(). The handler assumes | |
656 | * that EFI_BOOT_SERVICES_DATA memory has not been touched by loader yet, which | |
657 | * is probably true for Windows. | |
658 | * | |
659 | * Preserve EFI_BOOT_SERVICES_* regions until after SetVirtualAddressMap(). | |
660 | */ | |
661 | static inline bool memory_type_is_free(efi_memory_desc_t *md) | |
662 | { | |
663 | if (md->type == EFI_CONVENTIONAL_MEMORY) | |
664 | return true; | |
665 | ||
666 | if (IS_ENABLED(CONFIG_UNACCEPTED_MEMORY) && | |
667 | md->type == EFI_UNACCEPTED_MEMORY) | |
668 | return true; | |
669 | ||
670 | return false; | |
671 | } | |
672 | ||
c05cd797 | 673 | /* |
08705365 AS |
674 | * Returns true if we processed the EFI memmap, which we prefer over the E820 |
675 | * table if it is available. | |
c05cd797 BH |
676 | */ |
677 | static bool | |
678 | process_efi_entries(unsigned long minimum, unsigned long image_size) | |
679 | { | |
d55d5bc5 | 680 | struct efi_info *e = &boot_params_ptr->efi_info; |
c05cd797 BH |
681 | bool efi_mirror_found = false; |
682 | struct mem_vector region; | |
683 | efi_memory_desc_t *md; | |
684 | unsigned long pmap; | |
685 | char *signature; | |
686 | u32 nr_desc; | |
687 | int i; | |
688 | ||
689 | signature = (char *)&e->efi_loader_signature; | |
690 | if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) && | |
691 | strncmp(signature, EFI64_LOADER_SIGNATURE, 4)) | |
692 | return false; | |
693 | ||
694 | #ifdef CONFIG_X86_32 | |
695 | /* Can't handle data above 4GB at this time */ | |
696 | if (e->efi_memmap_hi) { | |
697 | warn("EFI memmap is above 4GB, can't be handled now on x86_32. EFI should be disabled.\n"); | |
698 | return false; | |
699 | } | |
700 | pmap = e->efi_memmap; | |
701 | #else | |
702 | pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32)); | |
703 | #endif | |
704 | ||
705 | nr_desc = e->efi_memmap_size / e->efi_memdesc_size; | |
706 | for (i = 0; i < nr_desc; i++) { | |
707 | md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i); | |
708 | if (md->attribute & EFI_MEMORY_MORE_RELIABLE) { | |
c05cd797 | 709 | efi_mirror_found = true; |
0982adc7 | 710 | break; |
c05cd797 BH |
711 | } |
712 | } | |
713 | ||
0982adc7 NH |
714 | for (i = 0; i < nr_desc; i++) { |
715 | md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i); | |
716 | ||
3fd1239a | 717 | if (!memory_type_is_free(md)) |
0982adc7 NH |
718 | continue; |
719 | ||
262b45ae DW |
720 | if (efi_soft_reserve_enabled() && |
721 | (md->attribute & EFI_MEMORY_SP)) | |
722 | continue; | |
723 | ||
0982adc7 NH |
724 | if (efi_mirror_found && |
725 | !(md->attribute & EFI_MEMORY_MORE_RELIABLE)) | |
726 | continue; | |
727 | ||
728 | region.start = md->phys_addr; | |
729 | region.size = md->num_pages << EFI_PAGE_SHIFT; | |
690eaa53 | 730 | if (process_mem_region(®ion, minimum, image_size)) |
0982adc7 | 731 | break; |
0982adc7 NH |
732 | } |
733 | return true; | |
c05cd797 BH |
734 | } |
735 | #else | |
/* Built without CONFIG_EFI: report that no EFI memmap was processed. */
static inline bool
process_efi_entries(unsigned long minimum, unsigned long image_size)
{
	return false;
}
741 | #endif | |
742 | ||
f62995c9 BH |
743 | static void process_e820_entries(unsigned long minimum, |
744 | unsigned long image_size) | |
82fa9637 KC |
745 | { |
746 | int i; | |
87891b01 | 747 | struct mem_vector region; |
f62995c9 BH |
748 | struct boot_e820_entry *entry; |
749 | ||
750 | /* Verify potential e820 positions, appending to slots list. */ | |
d55d5bc5 AB |
751 | for (i = 0; i < boot_params_ptr->e820_entries; i++) { |
752 | entry = &boot_params_ptr->e820_table[i]; | |
f62995c9 BH |
753 | /* Skip non-RAM entries. */ |
754 | if (entry->type != E820_TYPE_RAM) | |
755 | continue; | |
87891b01 BH |
756 | region.start = entry->addr; |
757 | region.size = entry->size; | |
690eaa53 | 758 | if (process_mem_region(®ion, minimum, image_size)) |
f62995c9 | 759 | break; |
f62995c9 BH |
760 | } |
761 | } | |
82fa9637 | 762 | |
a8ebb704 AG |
763 | /* |
764 | * If KHO is active, only process its scratch areas to ensure we are not | |
765 | * stepping onto preserved memory. | |
766 | */ | |
767 | static bool process_kho_entries(unsigned long minimum, unsigned long image_size) | |
768 | { | |
769 | struct kho_scratch *kho_scratch; | |
770 | struct setup_data *ptr; | |
771 | struct kho_data *kho; | |
772 | int i, nr_areas = 0; | |
773 | ||
774 | if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER)) | |
775 | return false; | |
776 | ||
777 | ptr = (struct setup_data *)(unsigned long)boot_params_ptr->hdr.setup_data; | |
778 | while (ptr) { | |
779 | if (ptr->type == SETUP_KEXEC_KHO) { | |
780 | kho = (struct kho_data *)(unsigned long)ptr->data; | |
781 | kho_scratch = (void *)(unsigned long)kho->scratch_addr; | |
782 | nr_areas = kho->scratch_size / sizeof(*kho_scratch); | |
783 | break; | |
784 | } | |
785 | ||
786 | ptr = (struct setup_data *)(unsigned long)ptr->next; | |
787 | } | |
788 | ||
789 | if (!nr_areas) | |
790 | return false; | |
791 | ||
792 | for (i = 0; i < nr_areas; i++) { | |
793 | struct kho_scratch *area = &kho_scratch[i]; | |
794 | struct mem_vector region = { | |
795 | .start = area->addr, | |
796 | .size = area->size, | |
797 | }; | |
798 | ||
799 | if (process_mem_region(®ion, minimum, image_size)) | |
800 | break; | |
801 | } | |
802 | ||
803 | return true; | |
804 | } | |
805 | ||
f62995c9 BH |
806 | static unsigned long find_random_phys_addr(unsigned long minimum, |
807 | unsigned long image_size) | |
808 | { | |
f49236ae AS |
809 | u64 phys_addr; |
810 | ||
45128694 AS |
811 | /* Bail out early if it's impossible to succeed. */ |
812 | if (minimum + image_size > mem_limit) | |
813 | return 0; | |
814 | ||
f2844249 DJ |
815 | /* Check if we had too many memmaps. */ |
816 | if (memmap_too_large) { | |
c05cd797 | 817 | debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n"); |
f2844249 DJ |
818 | return 0; |
819 | } | |
820 | ||
a8ebb704 AG |
821 | /* |
822 | * During kexec handover only process KHO scratch areas that are known | |
823 | * not to contain any data that must be preserved. | |
824 | */ | |
825 | if (!process_kho_entries(minimum, image_size) && | |
826 | !process_efi_entries(minimum, image_size)) | |
4268b4da | 827 | process_e820_entries(minimum, image_size); |
82fa9637 | 828 | |
f49236ae | 829 | phys_addr = slots_fetch_random(); |
c05cd797 | 830 | |
f49236ae AS |
831 | /* Perform a final check to make sure the address is in range. */ |
832 | if (phys_addr < minimum || phys_addr + image_size > mem_limit) { | |
833 | warn("Invalid physical address chosen!\n"); | |
834 | return 0; | |
835 | } | |
836 | ||
837 | return (unsigned long)phys_addr; | |
82fa9637 KC |
838 | } |
839 | ||
071a7493 BH |
840 | static unsigned long find_random_virt_addr(unsigned long minimum, |
841 | unsigned long image_size) | |
842 | { | |
843 | unsigned long slots, random_addr; | |
844 | ||
071a7493 BH |
845 | /* |
846 | * There are how many CONFIG_PHYSICAL_ALIGN-sized slots | |
847 | * that can hold image_size within the range of minimum to | |
848 | * KERNEL_IMAGE_SIZE? | |
849 | */ | |
eb38be6d | 850 | slots = 1 + (KERNEL_IMAGE_SIZE - minimum - image_size) / CONFIG_PHYSICAL_ALIGN; |
071a7493 | 851 | |
d899a7d1 | 852 | random_addr = kaslr_get_random_long("Virtual") % slots; |
071a7493 BH |
853 | |
854 | return random_addr * CONFIG_PHYSICAL_ALIGN + minimum; | |
855 | } | |
856 | ||
549f90db BP |
857 | /* |
858 | * Since this function examines addresses much more numerically, | |
859 | * it takes the input and output pointers as 'unsigned long'. | |
860 | */ | |
8391c73c BH |
861 | void choose_random_location(unsigned long input, |
862 | unsigned long input_size, | |
863 | unsigned long *output, | |
864 | unsigned long output_size, | |
865 | unsigned long *virt_addr) | |
8ab3820f | 866 | { |
e066cc47 | 867 | unsigned long random_addr, min_addr; |
8ab3820f KC |
868 | |
869 | if (cmdline_find_option_bool("nokaslr")) { | |
0f8ede1b | 870 | warn("KASLR disabled: 'nokaslr' on cmdline."); |
8391c73c | 871 | return; |
8ab3820f | 872 | } |
4c2b4058 | 873 | |
d55d5bc5 | 874 | boot_params_ptr->hdr.loadflags |= KASLR_FLAG; |
78cac48c | 875 | |
45128694 AS |
876 | if (IS_ENABLED(CONFIG_X86_32)) |
877 | mem_limit = KERNEL_IMAGE_SIZE; | |
878 | else | |
879 | mem_limit = MAXMEM; | |
880 | ||
82fa9637 | 881 | /* Record the various known unsafe memory ranges. */ |
8391c73c | 882 | mem_avoid_init(input, input_size, *output); |
82fa9637 | 883 | |
e066cc47 YL |
884 | /* |
885 | * Low end of the randomization range should be the | |
886 | * smaller of 512M or the initial kernel image | |
887 | * location: | |
888 | */ | |
889 | min_addr = min(*output, 512UL << 20); | |
45128694 AS |
890 | /* Make sure minimum is aligned. */ |
891 | min_addr = ALIGN(min_addr, CONFIG_PHYSICAL_ALIGN); | |
e066cc47 | 892 | |
c05cd797 | 893 | /* Walk available memory entries to find a random address. */ |
e066cc47 | 894 | random_addr = find_random_phys_addr(min_addr, output_size); |
9016875d | 895 | if (!random_addr) { |
f2844249 | 896 | warn("Physical KASLR disabled: no suitable memory region!"); |
8391c73c BH |
897 | } else { |
898 | /* Update the new physical address location. */ | |
8570978e | 899 | if (*output != random_addr) |
8391c73c | 900 | *output = random_addr; |
82fa9637 KC |
901 | } |
902 | ||
8391c73c BH |
903 | |
904 | /* Pick random virtual address starting from LOAD_PHYSICAL_ADDR. */ | |
905 | if (IS_ENABLED(CONFIG_X86_64)) | |
906 | random_addr = find_random_virt_addr(LOAD_PHYSICAL_ADDR, output_size); | |
907 | *virt_addr = random_addr; | |
8ab3820f | 908 | } |