Commit | Line | Data |
---|---|---|
b2441318 | 1 | // SPDX-License-Identifier: GPL-2.0 |
7de828df KC |
2 | /* |
3 | * kaslr.c | |
4 | * | |
5 | * This contains the routines needed to generate a reasonable level of | |
6 | * entropy to choose a randomized kernel base address offset in support | |
7 | * of Kernel Address Space Layout Randomization (KASLR). Additionally | |
8 | * handles walking the physical memory maps (and tracking memory regions | |
9 | * to avoid) in order to select a physical memory location that can | |
10 | * contain the entire properly aligned running kernel image. | |
11 | * | |
12 | */ | |
d52e7d5a BH |
13 | |
14 | /* | |
15 | * isspace() in linux/ctype.h is expected by next_args() to filter | |
16 | * out "space/lf/tab". While boot/ctype.h conflicts with linux/ctype.h, | |
17 | * since isdigit() is implemented in both of them. Hence disable it | |
18 | * here. | |
19 | */ | |
20 | #define BOOT_CTYPE_H | |
21 | ||
8ab3820f | 22 | #include "misc.h" |
dc425a6e | 23 | #include "error.h" |
5b8b9cf7 | 24 | #include "../string.h" |
5dc91f2d | 25 | #include "efi.h" |
8ab3820f | 26 | |
a653f356 KC |
27 | #include <generated/compile.h> |
28 | #include <linux/module.h> | |
29 | #include <linux/uts.h> | |
30 | #include <linux/utsname.h> | |
d52e7d5a | 31 | #include <linux/ctype.h> |
a653f356 | 32 | #include <generated/utsrelease.h> |
a653f356 | 33 | |
76167e5c AS |
34 | #define _SETUP |
35 | #include <asm/setup.h> /* For COMMAND_LINE_SIZE */ | |
36 | #undef _SETUP | |
37 | ||
d52e7d5a BH |
38 | extern unsigned long get_cmd_line_ptr(void); |
39 | ||
a653f356 | 40 | /* Simplified build-specific string for starting entropy. */ |
327f7d72 | 41 | static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" |
a653f356 KC |
42 | LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; |
43 | ||
a653f356 KC |
/*
 * Fold a memory area into @hash, one native word at a time.
 *
 * @hash: running hash value to start from
 * @area: start of the memory to mix in
 * @size: size of @area in bytes
 *
 * For every whole unsigned long in @area the hash is rotated right by 7
 * bits and XORed with that word. Bytes left over after the last whole
 * word (size % sizeof(unsigned long)) are not mixed in.
 *
 * Returns the updated hash; @hash is returned unchanged when @area holds
 * less than one whole word.
 */
static unsigned long rotate_xor(unsigned long hash, const void *area,
				size_t size)
{
	const unsigned long *word = area;
	const unsigned long *end = word + size / sizeof(hash);

	while (word < end) {
		/* Rotate by odd number of bits and XOR. */
		hash = (hash >> 7) | (hash << ((sizeof(hash) * 8) - 7));
		hash ^= *word++;
	}

	return hash;
}
58 | ||
59 | /* Attempt to create a simple but unpredictable starting entropy. */ | |
d899a7d1 | 60 | static unsigned long get_boot_seed(void) |
a653f356 KC |
61 | { |
62 | unsigned long hash = 0; | |
63 | ||
64 | hash = rotate_xor(hash, build_str, sizeof(build_str)); | |
6655e0aa | 65 | hash = rotate_xor(hash, boot_params, sizeof(*boot_params)); |
a653f356 KC |
66 | |
67 | return hash; | |
68 | } | |
69 | ||
d899a7d1 TG |
70 | #define KASLR_COMPRESSED_BOOT |
71 | #include "../../lib/kaslr.c" | |
8ab3820f | 72 | |
82fa9637 | 73 | |
f2844249 DJ |
74 | /* Only supporting at most 4 unusable memmap regions with kaslr */ |
75 | #define MAX_MEMMAP_REGIONS 4 | |
76 | ||
77 | static bool memmap_too_large; | |
78 | ||
d52e7d5a | 79 | |
45128694 AS |
80 | /* |
81 | * Store memory limit: MAXMEM on 64-bit and KERNEL_IMAGE_SIZE on 32-bit. | |
82 | * It may be reduced by "mem=nn[KMG]" or "memmap=nn[KMG]" command line options. | |
83 | */ | |
3a066990 | 84 | static u64 mem_limit; |
4cdba14f | 85 | |
690eaa53 CF |
86 | /* Number of immovable memory regions */ |
87 | static int num_immovable_mem; | |
4cdba14f | 88 | |
ed09acde KC |
89 | enum mem_avoid_index { |
90 | MEM_AVOID_ZO_RANGE = 0, | |
91 | MEM_AVOID_INITRD, | |
92 | MEM_AVOID_CMDLINE, | |
93 | MEM_AVOID_BOOTPARAMS, | |
f2844249 DJ |
94 | MEM_AVOID_MEMMAP_BEGIN, |
95 | MEM_AVOID_MEMMAP_END = MEM_AVOID_MEMMAP_BEGIN + MAX_MEMMAP_REGIONS - 1, | |
ed09acde KC |
96 | MEM_AVOID_MAX, |
97 | }; | |
98 | ||
e290e8c5 | 99 | static struct mem_vector mem_avoid[MEM_AVOID_MAX]; |
82fa9637 | 100 | |
82fa9637 KC |
101 | static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) |
102 | { | |
103 | /* Item one is entirely before item two. */ | |
104 | if (one->start + one->size <= two->start) | |
105 | return false; | |
106 | /* Item one is entirely after item two. */ | |
107 | if (one->start >= two->start + two->size) | |
108 | return false; | |
109 | return true; | |
110 | } | |
111 | ||
/*
 * Return a pointer to the first character of @str for which isspace()
 * is false (possibly the terminating NUL).
 */
char *skip_spaces(const char *str)
{
	for (; isspace(*str); str++)
		;

	return (char *)str;
}
d52e7d5a BH |
118 | #include "../../../../lib/ctype.c" |
119 | #include "../../../../lib/cmdline.c" | |
f2844249 | 120 | |
199c8471 DW |
/* Selects which command line option parse_memmap() is decoding. */
enum parse_mode {
	PARSE_MEMMAP,	/* value of a "memmap" option */
	PARSE_EFI,	/* value of an "efi_fake_mem" option */
};
125 | ||
f2844249 | 126 | static int |
3a066990 | 127 | parse_memmap(char *p, u64 *start, u64 *size, enum parse_mode mode) |
f2844249 DJ |
128 | { |
129 | char *oldp; | |
130 | ||
131 | if (!p) | |
132 | return -EINVAL; | |
133 | ||
134 | /* We don't care about this option here */ | |
135 | if (!strncmp(p, "exactmap", 8)) | |
136 | return -EINVAL; | |
137 | ||
138 | oldp = p; | |
d52e7d5a | 139 | *size = memparse(p, &p); |
f2844249 DJ |
140 | if (p == oldp) |
141 | return -EINVAL; | |
142 | ||
143 | switch (*p) { | |
f2844249 DJ |
144 | case '#': |
145 | case '$': | |
146 | case '!': | |
d52e7d5a | 147 | *start = memparse(p + 1, &p); |
f2844249 | 148 | return 0; |
4cdba14f | 149 | case '@': |
199c8471 DW |
150 | if (mode == PARSE_MEMMAP) { |
151 | /* | |
152 | * memmap=nn@ss specifies usable region, should | |
153 | * be skipped | |
154 | */ | |
155 | *size = 0; | |
156 | } else { | |
3a066990 | 157 | u64 flags; |
199c8471 DW |
158 | |
159 | /* | |
160 | * efi_fake_mem=nn@ss:attr the attr specifies | |
161 | * flags that might imply a soft-reservation. | |
162 | */ | |
163 | *start = memparse(p + 1, &p); | |
164 | if (p && *p == ':') { | |
165 | p++; | |
166 | if (kstrtoull(p, 0, &flags) < 0) | |
167 | *size = 0; | |
168 | else if (flags & EFI_MEMORY_SP) | |
169 | return 0; | |
170 | } | |
171 | *size = 0; | |
172 | } | |
df561f66 | 173 | fallthrough; |
4cdba14f BH |
174 | default: |
175 | /* | |
176 | * If w/o offset, only size specified, memmap=nn[KMG] has the | |
177 | * same behaviour as mem=nn[KMG]. It limits the max address | |
178 | * system can use. Region above the limit should be avoided. | |
179 | */ | |
180 | *start = 0; | |
f2844249 DJ |
181 | return 0; |
182 | } | |
183 | ||
184 | return -EINVAL; | |
185 | } | |
186 | ||
199c8471 | 187 | static void mem_avoid_memmap(enum parse_mode mode, char *str) |
f2844249 | 188 | { |
d52e7d5a | 189 | static int i; |
f2844249 | 190 | |
d52e7d5a | 191 | if (i >= MAX_MEMMAP_REGIONS) |
f2844249 DJ |
192 | return; |
193 | ||
f2844249 DJ |
194 | while (str && (i < MAX_MEMMAP_REGIONS)) { |
195 | int rc; | |
3a066990 | 196 | u64 start, size; |
f2844249 DJ |
197 | char *k = strchr(str, ','); |
198 | ||
199 | if (k) | |
200 | *k++ = 0; | |
201 | ||
199c8471 | 202 | rc = parse_memmap(str, &start, &size, mode); |
f2844249 DJ |
203 | if (rc < 0) |
204 | break; | |
205 | str = k; | |
4cdba14f BH |
206 | |
207 | if (start == 0) { | |
208 | /* Store the specified memory limit if size > 0 */ | |
45128694 | 209 | if (size > 0 && size < mem_limit) |
4cdba14f BH |
210 | mem_limit = size; |
211 | ||
f2844249 | 212 | continue; |
4cdba14f | 213 | } |
f2844249 DJ |
214 | |
215 | mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].start = start; | |
216 | mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].size = size; | |
217 | i++; | |
218 | } | |
219 | ||
220 | /* More than 4 memmaps, fail kaslr */ | |
221 | if ((i >= MAX_MEMMAP_REGIONS) && str) | |
222 | memmap_too_large = true; | |
223 | } | |
224 | ||
9b912485 BH |
225 | /* Store the number of 1GB huge pages which users specified: */ |
226 | static unsigned long max_gb_huge_pages; | |
227 | ||
228 | static void parse_gb_huge_pages(char *param, char *val) | |
229 | { | |
230 | static bool gbpage_sz; | |
231 | char *p; | |
232 | ||
233 | if (!strcmp(param, "hugepagesz")) { | |
234 | p = val; | |
235 | if (memparse(p, &p) != PUD_SIZE) { | |
236 | gbpage_sz = false; | |
237 | return; | |
238 | } | |
239 | ||
240 | if (gbpage_sz) | |
241 | warn("Repeatedly set hugeTLB page size of 1G!\n"); | |
242 | gbpage_sz = true; | |
243 | return; | |
244 | } | |
245 | ||
246 | if (!strcmp(param, "hugepages") && gbpage_sz) { | |
247 | p = val; | |
248 | max_gb_huge_pages = simple_strtoull(p, &p, 0); | |
249 | return; | |
250 | } | |
251 | } | |
252 | ||
44060e8a | 253 | static void handle_mem_options(void) |
d52e7d5a BH |
254 | { |
255 | char *args = (char *)get_cmd_line_ptr(); | |
709709ac | 256 | size_t len; |
d52e7d5a BH |
257 | char *tmp_cmdline; |
258 | char *param, *val; | |
4cdba14f | 259 | u64 mem_size; |
d52e7d5a | 260 | |
709709ac | 261 | if (!args) |
44060e8a | 262 | return; |
d52e7d5a | 263 | |
76167e5c | 264 | len = strnlen(args, COMMAND_LINE_SIZE-1); |
d52e7d5a | 265 | tmp_cmdline = malloc(len + 1); |
69550d41 | 266 | if (!tmp_cmdline) |
d52e7d5a BH |
267 | error("Failed to allocate space for tmp_cmdline"); |
268 | ||
269 | memcpy(tmp_cmdline, args, len); | |
270 | tmp_cmdline[len] = 0; | |
271 | args = tmp_cmdline; | |
272 | ||
273 | /* Chew leading spaces */ | |
274 | args = skip_spaces(args); | |
275 | ||
276 | while (*args) { | |
277 | args = next_arg(args, ¶m, &val); | |
278 | /* Stop at -- */ | |
e2ee6173 AS |
279 | if (!val && strcmp(param, "--") == 0) |
280 | break; | |
d52e7d5a | 281 | |
4cdba14f | 282 | if (!strcmp(param, "memmap")) { |
199c8471 | 283 | mem_avoid_memmap(PARSE_MEMMAP, val); |
50def269 | 284 | } else if (IS_ENABLED(CONFIG_X86_64) && strstr(param, "hugepages")) { |
747ff626 | 285 | parse_gb_huge_pages(param, val); |
4cdba14f BH |
286 | } else if (!strcmp(param, "mem")) { |
287 | char *p = val; | |
288 | ||
289 | if (!strcmp(p, "nopentium")) | |
290 | continue; | |
291 | mem_size = memparse(p, &p); | |
44060e8a | 292 | if (mem_size == 0) |
e2ee6173 | 293 | break; |
44060e8a | 294 | |
45128694 AS |
295 | if (mem_size < mem_limit) |
296 | mem_limit = mem_size; | |
199c8471 DW |
297 | } else if (!strcmp(param, "efi_fake_mem")) { |
298 | mem_avoid_memmap(PARSE_EFI, val); | |
4cdba14f | 299 | } |
d52e7d5a BH |
300 | } |
301 | ||
302 | free(tmp_cmdline); | |
44060e8a | 303 | return; |
d52e7d5a BH |
304 | } |
305 | ||
9dc1969c | 306 | /* |
45128694 AS |
307 | * In theory, KASLR can put the kernel anywhere in the range of [16M, MAXMEM) |
308 | * on 64-bit, and [16M, KERNEL_IMAGE_SIZE) on 32-bit. | |
309 | * | |
ed09acde KC |
310 | * The mem_avoid array is used to store the ranges that need to be avoided |
311 | * when KASLR searches for an appropriate random address. We must avoid any | |
9dc1969c | 312 | * regions that are unsafe to overlap with during decompression, and other |
ed09acde KC |
313 | * things like the initrd, cmdline and boot_params. This comment seeks to |
314 | * explain mem_avoid as clearly as possible since incorrect mem_avoid | |
315 | * memory ranges lead to really hard to debug boot failures. | |
316 | * | |
317 | * The initrd, cmdline, and boot_params are trivial to identify for | |
cb18ef0d | 318 | * avoiding. They are MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, and |
ed09acde KC |
319 | * MEM_AVOID_BOOTPARAMS respectively below. |
320 | * | |
321 | * What is not obvious how to avoid is the range of memory that is used | |
322 | * during decompression (MEM_AVOID_ZO_RANGE below). This range must cover | |
323 | * the compressed kernel (ZO) and its run space, which is used to extract | |
324 | * the uncompressed kernel (VO) and relocs. | |
325 | * | |
326 | * ZO's full run size sits against the end of the decompression buffer, so | |
327 | * we can calculate where text, data, bss, etc of ZO are positioned more | |
328 | * easily. | |
329 | * | |
330 | * For additional background, the decompression calculations can be found | |
331 | * in header.S, and the memory diagram is based on the one found in misc.c. | |
332 | * | |
333 | * The following conditions are already enforced by the image layouts and | |
334 | * associated code: | |
335 | * - input + input_size >= output + output_size | |
336 | * - kernel_total_size <= init_size | |
337 | * - kernel_total_size <= output_size (see Note below) | |
338 | * - output + init_size >= output + output_size | |
9dc1969c | 339 | * |
ed09acde KC |
340 | * (Note that kernel_total_size and output_size have no fundamental |
341 | * relationship, but output_size is passed to choose_random_location | |
342 | * as a maximum of the two. The diagram is showing a case where | |
343 | * kernel_total_size is larger than output_size, but this case is | |
344 | * handled by bumping output_size.) | |
9dc1969c | 345 | * |
ed09acde | 346 | * The above conditions can be illustrated by a diagram: |
9dc1969c | 347 | * |
ed09acde KC |
348 | * 0 output input input+input_size output+init_size |
349 | * | | | | | | |
350 | * | | | | | | |
351 | * |-----|--------|--------|--------------|-----------|--|-------------| | |
352 | * | | | | |
353 | * | | | | |
354 | * output+init_size-ZO_INIT_SIZE output+output_size output+kernel_total_size | |
9dc1969c | 355 | * |
ed09acde KC |
356 | * [output, output+init_size) is the entire memory range used for |
357 | * extracting the compressed image. | |
9dc1969c | 358 | * |
ed09acde KC |
359 | * [output, output+kernel_total_size) is the range needed for the |
360 | * uncompressed kernel (VO) and its run size (bss, brk, etc). | |
9dc1969c | 361 | * |
ed09acde KC |
362 | * [output, output+output_size) is VO plus relocs (i.e. the entire |
363 | * uncompressed payload contained by ZO). This is the area of the buffer | |
364 | * written to during decompression. | |
9dc1969c | 365 | * |
ed09acde KC |
366 | * [output+init_size-ZO_INIT_SIZE, output+init_size) is the worst-case |
367 | * range of the copied ZO and decompression code. (i.e. the range | |
368 | * covered backwards of size ZO_INIT_SIZE, starting from output+init_size.) | |
9dc1969c | 369 | * |
ed09acde KC |
370 | * [input, input+input_size) is the original copied compressed image (ZO) |
371 | * (i.e. it does not include its run size). This range must be avoided | |
372 | * because it contains the data used for decompression. | |
9dc1969c | 373 | * |
ed09acde KC |
374 | * [input+input_size, output+init_size) is [_text, _end) for ZO. This |
375 | * range includes ZO's heap and stack, and must be avoided since it | |
376 | * performs the decompression. | |
9dc1969c | 377 | * |
ed09acde KC |
378 | * Since the above two ranges need to be avoided and they are adjacent, |
379 | * they can be merged, resulting in: [input, output+init_size) which | |
380 | * becomes the MEM_AVOID_ZO_RANGE below. | |
9dc1969c | 381 | */ |
82fa9637 | 382 | static void mem_avoid_init(unsigned long input, unsigned long input_size, |
9dc1969c | 383 | unsigned long output) |
82fa9637 | 384 | { |
9dc1969c | 385 | unsigned long init_size = boot_params->hdr.init_size; |
82fa9637 | 386 | u64 initrd_start, initrd_size; |
709709ac | 387 | unsigned long cmd_line, cmd_line_size; |
82fa9637 KC |
388 | |
389 | /* | |
390 | * Avoid the region that is unsafe to overlap during | |
9dc1969c | 391 | * decompression. |
82fa9637 | 392 | */ |
ed09acde KC |
393 | mem_avoid[MEM_AVOID_ZO_RANGE].start = input; |
394 | mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input; | |
82fa9637 KC |
395 | |
396 | /* Avoid initrd. */ | |
6655e0aa KC |
397 | initrd_start = (u64)boot_params->ext_ramdisk_image << 32; |
398 | initrd_start |= boot_params->hdr.ramdisk_image; | |
399 | initrd_size = (u64)boot_params->ext_ramdisk_size << 32; | |
400 | initrd_size |= boot_params->hdr.ramdisk_size; | |
ed09acde KC |
401 | mem_avoid[MEM_AVOID_INITRD].start = initrd_start; |
402 | mem_avoid[MEM_AVOID_INITRD].size = initrd_size; | |
3a94707d | 403 | /* No need to set mapping for initrd, it will be handled in VO. */ |
82fa9637 KC |
404 | |
405 | /* Avoid kernel command line. */ | |
709709ac | 406 | cmd_line = get_cmd_line_ptr(); |
82fa9637 | 407 | /* Calculate size of cmd_line. */ |
709709ac | 408 | if (cmd_line) { |
76167e5c | 409 | cmd_line_size = strnlen((char *)cmd_line, COMMAND_LINE_SIZE-1) + 1; |
709709ac AS |
410 | mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line; |
411 | mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size; | |
709709ac | 412 | } |
82fa9637 | 413 | |
ed09acde KC |
414 | /* Avoid boot parameters. */ |
415 | mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params; | |
416 | mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params); | |
3a94707d KC |
417 | |
418 | /* We don't need to set a mapping for setup_data. */ | |
419 | ||
f2844249 | 420 | /* Mark the memmap regions we need to avoid */ |
747ff626 | 421 | handle_mem_options(); |
f2844249 | 422 | |
690eaa53 CF |
423 | /* Enumerate the immovable memory regions */ |
424 | num_immovable_mem = count_immovable_mem_regions(); | |
82fa9637 KC |
425 | } |
426 | ||
06486d6c KC |
427 | /* |
428 | * Does this memory vector overlap a known avoided area? If so, record the | |
429 | * overlap region with the lowest address. | |
430 | */ | |
431 | static bool mem_avoid_overlap(struct mem_vector *img, | |
432 | struct mem_vector *overlap) | |
82fa9637 KC |
433 | { |
434 | int i; | |
0cacbfbe | 435 | struct setup_data *ptr; |
0eb1a8af | 436 | u64 earliest = img->start + img->size; |
06486d6c | 437 | bool is_overlapping = false; |
82fa9637 KC |
438 | |
439 | for (i = 0; i < MEM_AVOID_MAX; i++) { | |
06486d6c KC |
440 | if (mem_overlaps(img, &mem_avoid[i]) && |
441 | mem_avoid[i].start < earliest) { | |
442 | *overlap = mem_avoid[i]; | |
6daa2ec0 | 443 | earliest = overlap->start; |
06486d6c KC |
444 | is_overlapping = true; |
445 | } | |
82fa9637 KC |
446 | } |
447 | ||
0cacbfbe | 448 | /* Avoid all entries in the setup_data linked list. */ |
6655e0aa | 449 | ptr = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data; |
0cacbfbe KC |
450 | while (ptr) { |
451 | struct mem_vector avoid; | |
452 | ||
20cc2888 | 453 | avoid.start = (unsigned long)ptr; |
0cacbfbe KC |
454 | avoid.size = sizeof(*ptr) + ptr->len; |
455 | ||
06486d6c KC |
456 | if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) { |
457 | *overlap = avoid; | |
6daa2ec0 | 458 | earliest = overlap->start; |
06486d6c KC |
459 | is_overlapping = true; |
460 | } | |
0cacbfbe | 461 | |
b3c72fc9 DK |
462 | if (ptr->type == SETUP_INDIRECT && |
463 | ((struct setup_indirect *)ptr->data)->type != SETUP_INDIRECT) { | |
464 | avoid.start = ((struct setup_indirect *)ptr->data)->addr; | |
465 | avoid.size = ((struct setup_indirect *)ptr->data)->len; | |
466 | ||
467 | if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) { | |
468 | *overlap = avoid; | |
469 | earliest = overlap->start; | |
470 | is_overlapping = true; | |
471 | } | |
472 | } | |
473 | ||
0cacbfbe KC |
474 | ptr = (struct setup_data *)(unsigned long)ptr->next; |
475 | } | |
476 | ||
06486d6c | 477 | return is_overlapping; |
82fa9637 KC |
478 | } |
479 | ||
c401cf15 | 480 | struct slot_area { |
0eb1a8af | 481 | u64 addr; |
d6d0f36c | 482 | unsigned long num; |
c401cf15 BH |
483 | }; |
484 | ||
485 | #define MAX_SLOT_AREA 100 | |
486 | ||
487 | static struct slot_area slot_areas[MAX_SLOT_AREA]; | |
d6d0f36c | 488 | static unsigned int slot_area_index; |
e290e8c5 | 489 | static unsigned long slot_max; |
82fa9637 | 490 | |
c401cf15 BH |
491 | static void store_slot_info(struct mem_vector *region, unsigned long image_size) |
492 | { | |
493 | struct slot_area slot_area; | |
494 | ||
495 | if (slot_area_index == MAX_SLOT_AREA) | |
496 | return; | |
497 | ||
498 | slot_area.addr = region->start; | |
46a5b29a | 499 | slot_area.num = 1 + (region->size - image_size) / CONFIG_PHYSICAL_ALIGN; |
c401cf15 | 500 | |
46a5b29a AS |
501 | slot_areas[slot_area_index++] = slot_area; |
502 | slot_max += slot_area.num; | |
c401cf15 BH |
503 | } |
504 | ||
9b912485 BH |
505 | /* |
506 | * Skip as many 1GB huge pages as possible in the passed region | |
507 | * according to the number which users specified: | |
508 | */ | |
509 | static void | |
510 | process_gb_huge_pages(struct mem_vector *region, unsigned long image_size) | |
511 | { | |
0eb1a8af AS |
512 | u64 pud_start, pud_end; |
513 | unsigned long gb_huge_pages; | |
9b912485 | 514 | struct mem_vector tmp; |
9b912485 | 515 | |
50def269 | 516 | if (!IS_ENABLED(CONFIG_X86_64) || !max_gb_huge_pages) { |
9b912485 BH |
517 | store_slot_info(region, image_size); |
518 | return; | |
519 | } | |
520 | ||
be9e8d95 AS |
521 | /* Are there any 1GB pages in the region? */ |
522 | pud_start = ALIGN(region->start, PUD_SIZE); | |
523 | pud_end = ALIGN_DOWN(region->start + region->size, PUD_SIZE); | |
9b912485 BH |
524 | |
525 | /* No good 1GB huge pages found: */ | |
be9e8d95 | 526 | if (pud_start >= pud_end) { |
9b912485 BH |
527 | store_slot_info(region, image_size); |
528 | return; | |
529 | } | |
530 | ||
be9e8d95 AS |
531 | /* Check if the head part of the region is usable. */ |
532 | if (pud_start >= region->start + image_size) { | |
9b912485 | 533 | tmp.start = region->start; |
be9e8d95 | 534 | tmp.size = pud_start - region->start; |
9b912485 BH |
535 | store_slot_info(&tmp, image_size); |
536 | } | |
537 | ||
be9e8d95 AS |
538 | /* Skip the good 1GB pages. */ |
539 | gb_huge_pages = (pud_end - pud_start) >> PUD_SHIFT; | |
540 | if (gb_huge_pages > max_gb_huge_pages) { | |
541 | pud_end = pud_start + (max_gb_huge_pages << PUD_SHIFT); | |
542 | max_gb_huge_pages = 0; | |
543 | } else { | |
544 | max_gb_huge_pages -= gb_huge_pages; | |
545 | } | |
546 | ||
547 | /* Check if the tail part of the region is usable. */ | |
548 | if (region->start + region->size >= pud_end + image_size) { | |
549 | tmp.start = pud_end; | |
550 | tmp.size = region->start + region->size - pud_end; | |
9b912485 BH |
551 | store_slot_info(&tmp, image_size); |
552 | } | |
553 | } | |
554 | ||
0eb1a8af | 555 | static u64 slots_fetch_random(void) |
82fa9637 | 556 | { |
ed9f007e | 557 | unsigned long slot; |
d6d0f36c | 558 | unsigned int i; |
ed9f007e | 559 | |
82fa9637 KC |
560 | /* Handle case of no slots stored. */ |
561 | if (slot_max == 0) | |
562 | return 0; | |
563 | ||
d899a7d1 | 564 | slot = kaslr_get_random_long("Physical") % slot_max; |
ed9f007e KC |
565 | |
566 | for (i = 0; i < slot_area_index; i++) { | |
567 | if (slot >= slot_areas[i].num) { | |
568 | slot -= slot_areas[i].num; | |
569 | continue; | |
570 | } | |
0eb1a8af | 571 | return slot_areas[i].addr + ((u64)slot * CONFIG_PHYSICAL_ALIGN); |
ed9f007e KC |
572 | } |
573 | ||
574 | if (i == slot_area_index) | |
575 | debug_putstr("slots_fetch_random() failed!?\n"); | |
576 | return 0; | |
82fa9637 KC |
577 | } |
578 | ||
690eaa53 CF |
579 | static void __process_mem_region(struct mem_vector *entry, |
580 | unsigned long minimum, | |
581 | unsigned long image_size) | |
82fa9637 | 582 | { |
ed9f007e | 583 | struct mem_vector region, overlap; |
0eb1a8af | 584 | u64 region_end; |
82fa9637 | 585 | |
bf457be1 | 586 | /* Enforce minimum and memory limit. */ |
3a066990 | 587 | region.start = max_t(u64, entry->start, minimum); |
bf457be1 | 588 | region_end = min(entry->start + entry->size, mem_limit); |
82fa9637 | 589 | |
ed9f007e KC |
590 | /* Give up if slot area array is full. */ |
591 | while (slot_area_index < MAX_SLOT_AREA) { | |
ed9f007e KC |
592 | /* Potentially raise address to meet alignment needs. */ |
593 | region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); | |
82fa9637 | 594 | |
27aac205 | 595 | /* Did we raise the address above the passed in memory entry? */ |
bf457be1 | 596 | if (region.start > region_end) |
ed9f007e | 597 | return; |
82fa9637 | 598 | |
ed9f007e | 599 | /* Reduce size by any delta from the original address. */ |
bf457be1 | 600 | region.size = region_end - region.start; |
ed9f007e KC |
601 | |
602 | /* Return if region can't contain decompressed kernel */ | |
603 | if (region.size < image_size) | |
604 | return; | |
605 | ||
606 | /* If nothing overlaps, store the region and return. */ | |
607 | if (!mem_avoid_overlap(®ion, &overlap)) { | |
747ff626 | 608 | process_gb_huge_pages(®ion, image_size); |
ed9f007e KC |
609 | return; |
610 | } | |
611 | ||
612 | /* Store beginning of region if holds at least image_size. */ | |
8d1cf859 | 613 | if (overlap.start >= region.start + image_size) { |
ef7b07d5 AS |
614 | region.size = overlap.start - region.start; |
615 | process_gb_huge_pages(®ion, image_size); | |
ed9f007e KC |
616 | } |
617 | ||
ed9f007e | 618 | /* Clip off the overlapping region and start over. */ |
ed9f007e | 619 | region.start = overlap.start + overlap.size; |
82fa9637 KC |
620 | } |
621 | } | |
622 | ||
690eaa53 | 623 | static bool process_mem_region(struct mem_vector *region, |
e4cb955b AS |
624 | unsigned long minimum, |
625 | unsigned long image_size) | |
690eaa53 CF |
626 | { |
627 | int i; | |
628 | /* | |
629 | * If no immovable memory found, or MEMORY_HOTREMOVE disabled, | |
630 | * use @region directly. | |
631 | */ | |
632 | if (!num_immovable_mem) { | |
633 | __process_mem_region(region, minimum, image_size); | |
634 | ||
635 | if (slot_area_index == MAX_SLOT_AREA) { | |
636 | debug_putstr("Aborted e820/efi memmap scan (slot_areas full)!\n"); | |
21d6a7dc | 637 | return true; |
690eaa53 | 638 | } |
21d6a7dc | 639 | return false; |
690eaa53 CF |
640 | } |
641 | ||
82df8261 | 642 | #if defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI) |
690eaa53 CF |
643 | /* |
644 | * If immovable memory found, filter the intersection between | |
645 | * immovable memory and @region. | |
646 | */ | |
647 | for (i = 0; i < num_immovable_mem; i++) { | |
3a066990 | 648 | u64 start, end, entry_end, region_end; |
690eaa53 CF |
649 | struct mem_vector entry; |
650 | ||
651 | if (!mem_overlaps(region, &immovable_mem[i])) | |
652 | continue; | |
653 | ||
654 | start = immovable_mem[i].start; | |
655 | end = start + immovable_mem[i].size; | |
656 | region_end = region->start + region->size; | |
657 | ||
658 | entry.start = clamp(region->start, start, end); | |
659 | entry_end = clamp(region_end, start, end); | |
660 | entry.size = entry_end - entry.start; | |
661 | ||
662 | __process_mem_region(&entry, minimum, image_size); | |
663 | ||
664 | if (slot_area_index == MAX_SLOT_AREA) { | |
665 | debug_putstr("Aborted e820/efi memmap scan when walking immovable regions(slot_areas full)!\n"); | |
5b3fd8aa | 666 | return true; |
690eaa53 CF |
667 | } |
668 | } | |
690eaa53 | 669 | #endif |
e4a0bd03 | 670 | return 0; |
690eaa53 CF |
671 | } |
672 | ||
c05cd797 BH |
673 | #ifdef CONFIG_EFI |
674 | /* | |
08705365 AS |
675 | * Returns true if we processed the EFI memmap, which we prefer over the E820 |
676 | * table if it is available. | |
c05cd797 BH |
677 | */ |
678 | static bool | |
679 | process_efi_entries(unsigned long minimum, unsigned long image_size) | |
680 | { | |
681 | struct efi_info *e = &boot_params->efi_info; | |
682 | bool efi_mirror_found = false; | |
683 | struct mem_vector region; | |
684 | efi_memory_desc_t *md; | |
685 | unsigned long pmap; | |
686 | char *signature; | |
687 | u32 nr_desc; | |
688 | int i; | |
689 | ||
690 | signature = (char *)&e->efi_loader_signature; | |
691 | if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) && | |
692 | strncmp(signature, EFI64_LOADER_SIGNATURE, 4)) | |
693 | return false; | |
694 | ||
695 | #ifdef CONFIG_X86_32 | |
696 | /* Can't handle data above 4GB at this time */ | |
697 | if (e->efi_memmap_hi) { | |
698 | warn("EFI memmap is above 4GB, can't be handled now on x86_32. EFI should be disabled.\n"); | |
699 | return false; | |
700 | } | |
701 | pmap = e->efi_memmap; | |
702 | #else | |
703 | pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32)); | |
704 | #endif | |
705 | ||
706 | nr_desc = e->efi_memmap_size / e->efi_memdesc_size; | |
707 | for (i = 0; i < nr_desc; i++) { | |
708 | md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i); | |
709 | if (md->attribute & EFI_MEMORY_MORE_RELIABLE) { | |
c05cd797 | 710 | efi_mirror_found = true; |
0982adc7 | 711 | break; |
c05cd797 BH |
712 | } |
713 | } | |
714 | ||
0982adc7 NH |
715 | for (i = 0; i < nr_desc; i++) { |
716 | md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i); | |
717 | ||
718 | /* | |
719 | * Here we are more conservative in picking free memory than | |
720 | * the EFI spec allows: | |
721 | * | |
722 | * According to the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also | |
723 | * free memory and thus available to place the kernel image into, | |
724 | * but in practice there's firmware where using that memory leads | |
725 | * to crashes. | |
726 | * | |
727 | * Only EFI_CONVENTIONAL_MEMORY is guaranteed to be free. | |
728 | */ | |
729 | if (md->type != EFI_CONVENTIONAL_MEMORY) | |
730 | continue; | |
731 | ||
262b45ae DW |
732 | if (efi_soft_reserve_enabled() && |
733 | (md->attribute & EFI_MEMORY_SP)) | |
734 | continue; | |
735 | ||
0982adc7 NH |
736 | if (efi_mirror_found && |
737 | !(md->attribute & EFI_MEMORY_MORE_RELIABLE)) | |
738 | continue; | |
739 | ||
740 | region.start = md->phys_addr; | |
741 | region.size = md->num_pages << EFI_PAGE_SHIFT; | |
690eaa53 | 742 | if (process_mem_region(®ion, minimum, image_size)) |
0982adc7 | 743 | break; |
0982adc7 NH |
744 | } |
745 | return true; | |
c05cd797 BH |
746 | } |
747 | #else | |
748 | static inline bool | |
749 | process_efi_entries(unsigned long minimum, unsigned long image_size) | |
750 | { | |
751 | return false; | |
752 | } | |
753 | #endif | |
754 | ||
f62995c9 BH |
755 | static void process_e820_entries(unsigned long minimum, |
756 | unsigned long image_size) | |
82fa9637 KC |
757 | { |
758 | int i; | |
87891b01 | 759 | struct mem_vector region; |
f62995c9 BH |
760 | struct boot_e820_entry *entry; |
761 | ||
762 | /* Verify potential e820 positions, appending to slots list. */ | |
763 | for (i = 0; i < boot_params->e820_entries; i++) { | |
764 | entry = &boot_params->e820_table[i]; | |
765 | /* Skip non-RAM entries. */ | |
766 | if (entry->type != E820_TYPE_RAM) | |
767 | continue; | |
87891b01 BH |
768 | region.start = entry->addr; |
769 | region.size = entry->size; | |
690eaa53 | 770 | if (process_mem_region(®ion, minimum, image_size)) |
f62995c9 | 771 | break; |
f62995c9 BH |
772 | } |
773 | } | |
82fa9637 | 774 | |
f62995c9 BH |
775 | static unsigned long find_random_phys_addr(unsigned long minimum, |
776 | unsigned long image_size) | |
777 | { | |
f49236ae AS |
778 | u64 phys_addr; |
779 | ||
45128694 AS |
780 | /* Bail out early if it's impossible to succeed. */ |
781 | if (minimum + image_size > mem_limit) | |
782 | return 0; | |
783 | ||
f2844249 DJ |
784 | /* Check if we had too many memmaps. */ |
785 | if (memmap_too_large) { | |
c05cd797 | 786 | debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n"); |
f2844249 DJ |
787 | return 0; |
788 | } | |
789 | ||
4268b4da AS |
790 | if (!process_efi_entries(minimum, image_size)) |
791 | process_e820_entries(minimum, image_size); | |
82fa9637 | 792 | |
f49236ae | 793 | phys_addr = slots_fetch_random(); |
c05cd797 | 794 | |
f49236ae AS |
795 | /* Perform a final check to make sure the address is in range. */ |
796 | if (phys_addr < minimum || phys_addr + image_size > mem_limit) { | |
797 | warn("Invalid physical address chosen!\n"); | |
798 | return 0; | |
799 | } | |
800 | ||
801 | return (unsigned long)phys_addr; | |
82fa9637 KC |
802 | } |
803 | ||
071a7493 BH |
804 | static unsigned long find_random_virt_addr(unsigned long minimum, |
805 | unsigned long image_size) | |
806 | { | |
807 | unsigned long slots, random_addr; | |
808 | ||
071a7493 BH |
809 | /* |
810 | * There are how many CONFIG_PHYSICAL_ALIGN-sized slots | |
811 | * that can hold image_size within the range of minimum to | |
812 | * KERNEL_IMAGE_SIZE? | |
813 | */ | |
eb38be6d | 814 | slots = 1 + (KERNEL_IMAGE_SIZE - minimum - image_size) / CONFIG_PHYSICAL_ALIGN; |
071a7493 | 815 | |
d899a7d1 | 816 | random_addr = kaslr_get_random_long("Virtual") % slots; |
071a7493 BH |
817 | |
818 | return random_addr * CONFIG_PHYSICAL_ALIGN + minimum; | |
819 | } | |
820 | ||
549f90db BP |
821 | /* |
822 | * Since this function examines addresses much more numerically, | |
823 | * it takes the input and output pointers as 'unsigned long'. | |
824 | */ | |
8391c73c BH |
825 | void choose_random_location(unsigned long input, |
826 | unsigned long input_size, | |
827 | unsigned long *output, | |
828 | unsigned long output_size, | |
829 | unsigned long *virt_addr) | |
8ab3820f | 830 | { |
e066cc47 | 831 | unsigned long random_addr, min_addr; |
8ab3820f KC |
832 | |
833 | if (cmdline_find_option_bool("nokaslr")) { | |
0f8ede1b | 834 | warn("KASLR disabled: 'nokaslr' on cmdline."); |
8391c73c | 835 | return; |
8ab3820f | 836 | } |
4c2b4058 | 837 | |
6655e0aa | 838 | boot_params->hdr.loadflags |= KASLR_FLAG; |
78cac48c | 839 | |
45128694 AS |
840 | if (IS_ENABLED(CONFIG_X86_32)) |
841 | mem_limit = KERNEL_IMAGE_SIZE; | |
842 | else | |
843 | mem_limit = MAXMEM; | |
844 | ||
82fa9637 | 845 | /* Record the various known unsafe memory ranges. */ |
8391c73c | 846 | mem_avoid_init(input, input_size, *output); |
82fa9637 | 847 | |
e066cc47 YL |
848 | /* |
849 | * Low end of the randomization range should be the | |
850 | * smaller of 512M or the initial kernel image | |
851 | * location: | |
852 | */ | |
853 | min_addr = min(*output, 512UL << 20); | |
45128694 AS |
854 | /* Make sure minimum is aligned. */ |
855 | min_addr = ALIGN(min_addr, CONFIG_PHYSICAL_ALIGN); | |
e066cc47 | 856 | |
c05cd797 | 857 | /* Walk available memory entries to find a random address. */ |
e066cc47 | 858 | random_addr = find_random_phys_addr(min_addr, output_size); |
9016875d | 859 | if (!random_addr) { |
f2844249 | 860 | warn("Physical KASLR disabled: no suitable memory region!"); |
8391c73c BH |
861 | } else { |
862 | /* Update the new physical address location. */ | |
8570978e | 863 | if (*output != random_addr) |
8391c73c | 864 | *output = random_addr; |
82fa9637 KC |
865 | } |
866 | ||
8391c73c BH |
867 | |
868 | /* Pick random virtual address starting from LOAD_PHYSICAL_ADDR. */ | |
869 | if (IS_ENABLED(CONFIG_X86_64)) | |
870 | random_addr = find_random_virt_addr(LOAD_PHYSICAL_ADDR, output_size); | |
871 | *virt_addr = random_addr; | |
8ab3820f | 872 | } |