Linux 6.16-rc6
[linux-block.git] / fs / coredump.c
CommitLineData
b2441318 1// SPDX-License-Identifier: GPL-2.0
10c28d93
AK
2#include <linux/slab.h>
3#include <linux/file.h>
4#include <linux/fdtable.h>
70d78fe7 5#include <linux/freezer.h>
10c28d93
AK
6#include <linux/mm.h>
7#include <linux/stat.h>
8#include <linux/fcntl.h>
9#include <linux/swap.h>
315c6926 10#include <linux/ctype.h>
10c28d93
AK
11#include <linux/string.h>
12#include <linux/init.h>
13#include <linux/pagemap.h>
14#include <linux/perf_event.h>
15#include <linux/highmem.h>
16#include <linux/spinlock.h>
17#include <linux/key.h>
18#include <linux/personality.h>
19#include <linux/binfmts.h>
179899fd 20#include <linux/coredump.h>
7d442a33 21#include <linux/sort.h>
f7ccbae4 22#include <linux/sched/coredump.h>
3f07c014 23#include <linux/sched/signal.h>
68db0cf1 24#include <linux/sched/task_stack.h>
10c28d93
AK
25#include <linux/utsname.h>
26#include <linux/pid_namespace.h>
27#include <linux/module.h>
28#include <linux/namei.h>
29#include <linux/mount.h>
30#include <linux/security.h>
31#include <linux/syscalls.h>
32#include <linux/tsacct_kern.h>
33#include <linux/cn_proc.h>
34#include <linux/audit.h>
10c28d93
AK
35#include <linux/kmod.h>
36#include <linux/fsnotify.h>
37#include <linux/fs_struct.h>
38#include <linux/pipe_fs_i.h>
39#include <linux/oom.h>
40#include <linux/compat.h>
378c6520
JH
41#include <linux/fs.h>
42#include <linux/path.h>
03927c8a 43#include <linux/timekeeping.h>
f0bc21b2 44#include <linux/sysctl.h>
84158b7f 45#include <linux/elf.h>
b5325b2a 46#include <linux/pidfs.h>
a9194f88
CB
47#include <linux/net.h>
48#include <linux/socket.h>
1d8db6fd 49#include <net/af_unix.h>
a9194f88 50#include <net/net_namespace.h>
1d8db6fd 51#include <net/sock.h>
b5325b2a 52#include <uapi/linux/pidfd.h>
a9194f88 53#include <uapi/linux/un.h>
10c28d93 54
7c0f6ba6 55#include <linux/uaccess.h>
10c28d93
AK
56#include <asm/mmu_context.h>
57#include <asm/tlb.h>
58#include <asm/exec.h>
59
60#include <trace/events/task.h>
61#include "internal.h"
62
63#include <trace/events/sched.h>
64
95c5436a 65static bool dump_vma_snapshot(struct coredump_params *cprm);
390031c9 66static void free_vma_snapshot(struct coredump_params *cprm);
95c5436a 67
4bbf9c3b
AP
68#define CORE_FILE_NOTE_SIZE_DEFAULT (4*1024*1024)
69/* Define a reasonable max cap */
70#define CORE_FILE_NOTE_SIZE_MAX (16*1024*1024)
b5325b2a
CB
71/*
72 * File descriptor number for the pidfd for the thread-group leader of
73 * the coredumping task installed into the usermode helper's file
74 * descriptor table.
75 */
76#define COREDUMP_PIDFD_NUMBER 3
4bbf9c3b 77
f0bc21b2
XN
78static int core_uses_pid;
79static unsigned int core_pipe_limit;
39ec9eaa 80static unsigned int core_sort_vma;
f0bc21b2 81static char core_pattern[CORENAME_MAX_SIZE] = "core";
3ceadcf6 82static int core_name_size = CORENAME_MAX_SIZE;
4bbf9c3b 83unsigned int core_file_note_size_limit = CORE_FILE_NOTE_SIZE_DEFAULT;
10c28d93 84
727b5510
CB
85enum coredump_type_t {
86 COREDUMP_FILE = 1,
87 COREDUMP_PIPE = 2,
a9194f88 88 COREDUMP_SOCK = 3,
727b5510
CB
89};
90
10c28d93
AK
91struct core_name {
92 char *corename;
93 int used, size;
727b5510 94 enum coredump_type_t core_type;
10c28d93 95};
10c28d93 96
/*
 * Grow the corename buffer to at least @size bytes.
 *
 * On success cn->corename points at the (possibly moved) buffer and
 * cn->size holds the rounded-up capacity. On failure the old buffer is
 * left intact (krealloc semantics) and -ENOMEM is returned.
 */
static int expand_corename(struct core_name *cn, int size)
{
	char *corename;

	/* Round up to the actual slab object size so we use what we get. */
	size = kmalloc_size_roundup(size);
	corename = krealloc(cn->corename, size, GFP_KERNEL);

	if (!corename)
		return -ENOMEM;

	/*
	 * Remember the largest size ever needed so the next dump starts
	 * with a big-enough buffer.
	 */
	if (size > core_name_size) /* racy but harmless */
		core_name_size = size;

	cn->size = size;
	cn->corename = corename;
	return 0;
}
114
b4176b7c
NI
/*
 * Append a formatted string to @cn, growing the buffer on demand.
 *
 * vsnprintf() may consume @arg, so a fresh va_copy is made for every
 * attempt; after an undersized first pass the buffer is expanded to the
 * exact required size and the format is retried.
 *
 * Returns 0 on success or -ENOMEM if the buffer could not be grown.
 */
static __printf(2, 0) int cn_vprintf(struct core_name *cn, const char *fmt,
				     va_list arg)
{
	int free, need;
	va_list arg_copy;

again:
	free = cn->size - cn->used;

	va_copy(arg_copy, arg);
	need = vsnprintf(cn->corename + cn->used, free, fmt, arg_copy);
	va_end(arg_copy);

	/* need < free means the NUL terminator fitted as well. */
	if (need < free) {
		cn->used += need;
		return 0;
	}

	/* +1 for the terminating NUL that vsnprintf() could not place. */
	if (!expand_corename(cn, cn->size + need - free + 1))
		goto again;

	return -ENOMEM;
}
138
b4176b7c 139static __printf(2, 3) int cn_printf(struct core_name *cn, const char *fmt, ...)
bc03c691
ON
140{
141 va_list arg;
142 int ret;
143
144 va_start(arg, fmt);
145 ret = cn_vprintf(cn, fmt, arg);
146 va_end(arg);
147
148 return ret;
149}
150
b4176b7c
NI
/*
 * Like cn_printf(), but sanitize the appended component so it cannot
 * escape the intended directory: "." and ".." become "!…", empty
 * components are replaced by "!", and every '/' is rewritten to '!'.
 *
 * Returns the cn_vprintf()/cn_printf() result (0 or -ENOMEM).
 */
static __printf(2, 3)
int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
{
	int cur = cn->used;
	va_list arg;
	int ret;

	va_start(arg, fmt);
	ret = cn_vprintf(cn, fmt, arg);
	va_end(arg);

	if (ret == 0) {
		/*
		 * Ensure that this coredump name component can't cause the
		 * resulting corefile path to consist of a ".." or ".".
		 */
		if ((cn->used - cur == 1 && cn->corename[cur] == '.') ||
		    (cn->used - cur == 2 && cn->corename[cur] == '.'
				&& cn->corename[cur+1] == '.'))
			cn->corename[cur] = '!';

		/*
		 * Empty names are fishy and could be used to create a "//" in a
		 * corefile name, causing the coredump to happen one directory
		 * level too high. Enforce that all components of the core
		 * pattern are at least one character long.
		 */
		if (cn->used == cur)
			ret = cn_printf(cn, "!");
	}

	/* Neutralize path separators inside the freshly appended span. */
	for (; cur < cn->used; ++cur) {
		if (cn->corename[cur] == '/')
			cn->corename[cur] = '!';
	}
	return ret;
}
188
/*
 * Append the dumping task's executable path (or, with @name_only, just
 * its final path component) to @cn, escaped via cn_esc_printf().
 *
 * Falls back to "comm (path unknown)" when the mm has no exe_file.
 * Returns 0 on success or a negative errno.
 */
static int cn_print_exe_file(struct core_name *cn, bool name_only)
{
	struct file *exe_file;
	char *pathbuf, *path, *ptr;
	int ret;

	exe_file = get_mm_exe_file(current->mm);
	if (!exe_file)
		return cn_esc_printf(cn, "%s (path unknown)", current->comm);

	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!pathbuf) {
		ret = -ENOMEM;
		goto put_exe_file;
	}

	/* file_path() builds the path backwards inside pathbuf. */
	path = file_path(exe_file, pathbuf, PATH_MAX);
	if (IS_ERR(path)) {
		ret = PTR_ERR(path);
		goto free_buf;
	}

	if (name_only) {
		/* Keep only the basename. */
		ptr = strrchr(path, '/');
		if (ptr)
			path = ptr + 1;
	}
	ret = cn_esc_printf(cn, "%s", path);

free_buf:
	kfree(pathbuf);
put_exe_file:
	fput(exe_file);
	return ret;
}
224
/*
 * format_corename - expand the core_pattern template into a usable name.
 *
 * Inspects the global core_pattern, classifies the dump target
 * (COREDUMP_FILE, COREDUMP_PIPE for a leading '|', COREDUMP_SOCK for a
 * leading '@') into cn->core_type, and expands the %-specifiers into
 * cn->corename. For pipe patterns, *argv/*argc receive offsets of the
 * NUL-separated helper argv components inside cn->corename.
 *
 * Returns 0 on success, -ENOMEM on allocation failure or empty
 * pipe/socket pattern, -EINVAL for a malformed socket pattern.
 */
static int format_corename(struct core_name *cn, struct coredump_params *cprm,
			   size_t **argv, int *argc)
{
	const struct cred *cred = current_cred();
	const char *pat_ptr = core_pattern;
	bool was_space = false;
	int pid_in_pattern = 0;
	int err = 0;

	cn->used = 0;
	cn->corename = NULL;
	/* First byte of the pattern selects the delivery mechanism. */
	if (*pat_ptr == '|')
		cn->core_type = COREDUMP_PIPE;
	else if (*pat_ptr == '@')
		cn->core_type = COREDUMP_SOCK;
	else
		cn->core_type = COREDUMP_FILE;
	if (expand_corename(cn, core_name_size))
		return -ENOMEM;
	cn->corename[0] = '\0';

	switch (cn->core_type) {
	case COREDUMP_PIPE: {
		/*
		 * Worst case: every other character is a separator, so the
		 * helper argv can never need more than pattern-size/2 slots.
		 */
		int argvs = sizeof(core_pattern) / 2;
		(*argv) = kmalloc_array(argvs, sizeof(**argv), GFP_KERNEL);
		if (!(*argv))
			return -ENOMEM;
		(*argv)[(*argc)++] = 0;
		++pat_ptr;
		if (!(*pat_ptr))
			return -ENOMEM;
		break;
	}
	case COREDUMP_SOCK: {
		/* skip the @ */
		pat_ptr++;
		if (!(*pat_ptr))
			return -ENOMEM;

		err = cn_printf(cn, "%s", pat_ptr);
		if (err)
			return err;

		/* Require absolute paths. */
		if (cn->corename[0] != '/')
			return -EINVAL;

		/*
		 * Ensure we can use spaces to indicate additional
		 * parameters in the future.
		 */
		if (strchr(cn->corename, ' ')) {
			coredump_report_failure("Coredump socket may not %s contain spaces", cn->corename);
			return -EINVAL;
		}

		/*
		 * Currently no need to parse any other options.
		 * Relevant information can be retrieved from the peer
		 * pidfd retrievable via SO_PEERPIDFD by the receiver or
		 * via /proc/<pid>, using the SO_PEERPIDFD to guard
		 * against pid recycling when opening /proc/<pid>.
		 */
		return 0;
	}
	case COREDUMP_FILE:
		break;
	default:
		WARN_ON_ONCE(true);
		return -EINVAL;
	}

	/* Repeat as long as we have more pattern to process and more output
	   space */
	while (*pat_ptr) {
		/*
		 * Split on spaces before doing template expansion so that
		 * %e and %E don't get split if they have spaces in them
		 */
		if (cn->core_type == COREDUMP_PIPE) {
			if (isspace(*pat_ptr)) {
				if (cn->used != 0)
					was_space = true;
				pat_ptr++;
				continue;
			} else if (was_space) {
				/* Argument boundary: terminate and record offset. */
				was_space = false;
				err = cn_printf(cn, "%c", '\0');
				if (err)
					return err;
				(*argv)[(*argc)++] = cn->used;
			}
		}
		if (*pat_ptr != '%') {
			err = cn_printf(cn, "%c", *pat_ptr++);
		} else {
			switch (*++pat_ptr) {
			/* single % at the end, drop that */
			case 0:
				goto out;
			/* Double percent, output one percent */
			case '%':
				err = cn_printf(cn, "%c", '%');
				break;
			/* pid (in the dumping task's pid namespace) */
			case 'p':
				pid_in_pattern = 1;
				err = cn_printf(cn, "%d",
					      task_tgid_vnr(current));
				break;
			/* global pid (init namespace) */
			case 'P':
				err = cn_printf(cn, "%d",
					      task_tgid_nr(current));
				break;
			/* thread id, namespace-local */
			case 'i':
				err = cn_printf(cn, "%d",
					      task_pid_vnr(current));
				break;
			/* thread id, global */
			case 'I':
				err = cn_printf(cn, "%d",
					      task_pid_nr(current));
				break;
			/* uid */
			case 'u':
				err = cn_printf(cn, "%u",
						from_kuid(&init_user_ns,
							  cred->uid));
				break;
			/* gid */
			case 'g':
				err = cn_printf(cn, "%u",
						from_kgid(&init_user_ns,
							  cred->gid));
				break;
			/* dumpability of the crashing process */
			case 'd':
				err = cn_printf(cn, "%d",
					__get_dumpable(cprm->mm_flags));
				break;
			/* signal that caused the coredump */
			case 's':
				err = cn_printf(cn, "%d",
						cprm->siginfo->si_signo);
				break;
			/* UNIX time of coredump */
			case 't': {
				time64_t time;

				time = ktime_get_real_seconds();
				err = cn_printf(cn, "%lld", time);
				break;
			}
			/* hostname */
			case 'h':
				down_read(&uts_sem);
				err = cn_esc_printf(cn, "%s",
					      utsname()->nodename);
				up_read(&uts_sem);
				break;
			/* executable, could be changed by prctl PR_SET_NAME etc */
			case 'e':
				err = cn_esc_printf(cn, "%s", current->comm);
				break;
			/* file name of executable */
			case 'f':
				err = cn_print_exe_file(cn, true);
				break;
			/* full path of executable */
			case 'E':
				err = cn_print_exe_file(cn, false);
				break;
			/* core limit size */
			case 'c':
				err = cn_printf(cn, "%lu",
					      rlimit(RLIMIT_CORE));
				break;
			/* CPU the task ran on */
			case 'C':
				err = cn_printf(cn, "%d", cprm->cpu);
				break;
			/* pidfd number */
			case 'F': {
				/*
				 * Installing a pidfd only makes sense if
				 * we actually spawn a usermode helper.
				 */
				if (cn->core_type != COREDUMP_PIPE)
					break;

				/*
				 * Note that we'll install a pidfd for the
				 * thread-group leader. We know that task
				 * linkage hasn't been removed yet and even if
				 * this @current isn't the actual thread-group
				 * leader we know that the thread-group leader
				 * cannot be reaped until @current has exited.
				 */
				cprm->pid = task_tgid(current);
				err = cn_printf(cn, "%d", COREDUMP_PIDFD_NUMBER);
				break;
			}
			/* unknown specifier: silently ignored */
			default:
				break;
			}
			++pat_ptr;
		}

		if (err)
			return err;
	}

out:
	/* Backward compatibility with core_uses_pid:
	 *
	 * If core_pattern does not include a %p (as is the default)
	 * and core_uses_pid is set, then .%pid will be appended to
	 * the filename. Do not do this for piped commands. */
	if (cn->core_type == COREDUMP_FILE && !pid_in_pattern && core_uses_pid)
		return cn_printf(cn, ".%d", task_tgid_vnr(current));

	return 0;
}
450
/*
 * Mark the thread group as exiting with @exit_code and kick every other
 * live thread with SIGKILL so it funnels into coredump_task_exit().
 *
 * Caller must hold the sighand siglock. Returns the number of threads
 * the dumper must wait for (those still running and not yet past
 * PF_POSTCOREDUMP).
 */
static int zap_process(struct signal_struct *signal, int exit_code)
{
	struct task_struct *t;
	int nr = 0;

	signal->flags = SIGNAL_GROUP_EXIT;
	signal->group_exit_code = exit_code;
	signal->group_stop_count = 0;

	__for_each_thread(signal, t) {
		task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
		/* Threads already past the coredump point need no wakeup. */
		if (t != current && !(t->flags & PF_POSTCOREDUMP)) {
			sigaddset(&t->pending.signal, SIGKILL);
			signal_wake_up(t, 1);
			nr++;
		}
	}

	return nr;
}
471
/*
 * Claim the coredump for @tsk's thread group and kill its siblings.
 *
 * Publishes @core_state under siglock and returns the count of threads
 * to wait for, or -EAGAIN if the group is already exiting or execing
 * (in which case someone else owns the group's fate).
 */
static int zap_threads(struct task_struct *tsk,
			struct core_state *core_state, int exit_code)
{
	struct signal_struct *signal = tsk->signal;
	int nr = -EAGAIN;

	spin_lock_irq(&tsk->sighand->siglock);
	if (!(signal->flags & SIGNAL_GROUP_EXIT) && !signal->group_exec_task) {
		/* Allow SIGKILL, see prepare_signal() */
		signal->core_state = core_state;
		nr = zap_process(signal, exit_code);
		clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
		tsk->flags |= PF_DUMPCORE;
		atomic_set(&core_state->nr_threads, nr);
	}
	spin_unlock_irq(&tsk->sighand->siglock);
	return nr;
}
490
/*
 * Stop every other thread in the group and wait until they have all
 * parked themselves for the dump.
 *
 * Returns the number of threads that joined (>= 0) on success, or a
 * negative errno (-EAGAIN/-EBUSY path from zap_threads()).
 */
static int coredump_wait(int exit_code, struct core_state *core_state)
{
	struct task_struct *tsk = current;
	int core_waiters = -EBUSY;

	init_completion(&core_state->startup);
	core_state->dumper.task = tsk;
	core_state->dumper.next = NULL;

	core_waiters = zap_threads(tsk, core_state, exit_code);
	if (core_waiters > 0) {
		struct core_thread *ptr;

		/* Last joining thread completes ->startup. */
		wait_for_completion_state(&core_state->startup,
					  TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);
		/*
		 * Wait for all the threads to become inactive, so that
		 * all the thread context (extended register state, like
		 * fpu etc) gets copied to the memory.
		 */
		ptr = core_state->dumper.next;
		while (ptr != NULL) {
			wait_task_inactive(ptr->task, TASK_ANY);
			ptr = ptr->next;
		}
	}

	return core_waiters;
}
520
/*
 * Tear down the core_state and release every thread that parked in
 * coredump_task_exit(). If the dump succeeded, set the 0x80 "core
 * dumped" bit in the group exit code (unless a fatal signal is racing).
 */
static void coredump_finish(bool core_dumped)
{
	struct core_thread *curr, *next;
	struct task_struct *task;

	spin_lock_irq(&current->sighand->siglock);
	if (core_dumped && !__fatal_signal_pending(current))
		current->signal->group_exit_code |= 0x80;
	next = current->signal->core_state->dumper.next;
	current->signal->core_state = NULL;
	spin_unlock_irq(&current->sighand->siglock);

	while ((curr = next) != NULL) {
		next = curr->next;
		task = curr->task;
		/*
		 * see coredump_task_exit(), curr->task must not see
		 * ->task == NULL before we read ->next.
		 */
		smp_mb();
		curr->task = NULL;
		wake_up_process(task);
	}
}
545
528f827e
ON
546static bool dump_interrupted(void)
547{
548 /*
549 * SIGKILL or freezing() interrupt the coredumping. Perhaps we
550 * can do try_to_freeze() and check __fatal_signal_pending(),
551 * but then we need to teach dump_write() to restart and clear
552 * TIF_SIGPENDING.
553 */
a78282e2 554 return fatal_signal_pending(current) || freezing(current);
528f827e
ON
555}
556
10c28d93
AK
/*
 * Block until the usermode pipe helper has consumed the dump: pose as a
 * reader (so the helper's exit wakes us) while dropping our writer
 * count, then wait until we are the sole remaining reader.
 */
static void wait_for_dump_helpers(struct file *file)
{
	struct pipe_inode_info *pipe = file->private_data;

	pipe_lock(pipe);
	pipe->readers++;
	pipe->writers--;
	wake_up_interruptible_sync(&pipe->rd_wait);
	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	pipe_unlock(pipe);

	/*
	 * We actually want wait_event_freezable() but then we need
	 * to clear TIF_SIGPENDING and improve dump_interrupted().
	 */
	wait_event_interruptible(pipe->rd_wait, pipe->readers == 1);

	/* Restore the counts we borrowed above. */
	pipe_lock(pipe);
	pipe->readers--;
	pipe->writers++;
	pipe_unlock(pipe);
}
579
/*
 * umh_coredump_setup
 * helper function to customize the process used
 * to collect the core in userspace. Specifically
 * it sets up a pipe and installs it as fd 0 (stdin)
 * for the process. Returns 0 on success, or
 * PTR_ERR on failure.
 * Note that it also sets the core limit to 1. This
 * is a special value that we use to trap recursive
 * core dumps
 */
static int umh_coredump_setup(struct subprocess_info *info, struct cred *new)
{
	struct file *files[2];
	struct coredump_params *cp = (struct coredump_params *)info->data;
	int err;

	/* cp->pid is only set when the pattern contained %F. */
	if (cp->pid) {
		struct file *pidfs_file __free(fput) = NULL;

		pidfs_file = pidfs_alloc_file(cp->pid, 0);
		if (IS_ERR(pidfs_file))
			return PTR_ERR(pidfs_file);

		pidfs_coredump(cp);

		/*
		 * Usermode helpers are children of either
		 * system_unbound_wq or of kthreadd. So we know that
		 * we're starting off with a clean file descriptor
		 * table. So we should always be able to use
		 * COREDUMP_PIDFD_NUMBER as our file descriptor value.
		 */
		err = replace_fd(COREDUMP_PIDFD_NUMBER, pidfs_file, 0);
		if (err < 0)
			return err;
	}

	err = create_pipe_files(files, 0);
	if (err)
		return err;

	/* Write end stays with the kernel-side dumper. */
	cp->file = files[1];

	/* Read end becomes the helper's stdin. */
	err = replace_fd(0, files[0], 0);
	fput(files[0]);
	if (err < 0)
		return err;

	/* and disallow core files too */
	current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};

	return 0;
}
634
/*
 * do_coredump - produce a core dump for the current (crashing) task.
 *
 * Freezes the thread group, expands core_pattern into a target (regular
 * file, usermode-helper pipe, or AF_UNIX coredump socket), performs the
 * binfmt-specific dump, then optionally waits for the consumer and
 * unwinds all state. Errors are reported via coredump_report_failure()
 * and the function simply returns; the caller proceeds with task exit.
 */
void do_coredump(const kernel_siginfo_t *siginfo)
{
	struct core_state core_state;
	struct core_name cn;
	struct mm_struct *mm = current->mm;
	struct linux_binfmt * binfmt;
	const struct cred *old_cred;
	struct cred *cred;
	int retval = 0;
	size_t *argv = NULL;
	int argc = 0;
	/* require nonrelative corefile path and be extra careful */
	bool need_suid_safe = false;
	bool core_dumped = false;
	static atomic_t core_dump_count = ATOMIC_INIT(0);
	struct coredump_params cprm = {
		.siginfo = siginfo,
		.limit = rlimit(RLIMIT_CORE),
		/*
		 * We must use the same mm->flags while dumping core to avoid
		 * inconsistency of bit flags, since this flag is not protected
		 * by any locks.
		 */
		.mm_flags = mm->flags,
		.vma_meta = NULL,
		.cpu = raw_smp_processor_id(),
	};

	audit_core_dumps(siginfo->si_signo);

	binfmt = mm->binfmt;
	if (!binfmt || !binfmt->core_dump)
		goto fail;
	if (!__get_dumpable(cprm.mm_flags))
		goto fail;

	cred = prepare_creds();
	if (!cred)
		goto fail;
	/*
	 * We cannot trust fsuid as being the "true" uid of the process
	 * nor do we know its entire history. We only know it was tainted
	 * so we dump it as root in mode 2, and only into a controlled
	 * environment (pipe handler or fully qualified path).
	 */
	if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) {
		/* Setuid core dump mode */
		cred->fsuid = GLOBAL_ROOT_UID;	/* Dump root private */
		need_suid_safe = true;
	}

	retval = coredump_wait(siginfo->si_signo, &core_state);
	if (retval < 0)
		goto fail_creds;

	old_cred = override_creds(cred);

	retval = format_corename(&cn, &cprm, &argv, &argc);
	if (retval < 0) {
		coredump_report_failure("format_corename failed, aborting core");
		goto fail_unlock;
	}

	switch (cn.core_type) {
	case COREDUMP_FILE: {
		struct mnt_idmap *idmap;
		struct inode *inode;
		int open_flags = O_CREAT | O_WRONLY | O_NOFOLLOW |
				 O_LARGEFILE | O_EXCL;

		if (cprm.limit < binfmt->min_coredump)
			goto fail_unlock;

		if (need_suid_safe && cn.corename[0] != '/') {
			coredump_report_failure(
				"this process can only dump core to a fully qualified path, skipping core dump");
			goto fail_unlock;
		}

		/*
		 * Unlink the file if it exists unless this is a SUID
		 * binary - in that case, we're running around with root
		 * privs and don't want to unlink another user's coredump.
		 */
		if (!need_suid_safe) {
			/*
			 * If it doesn't exist, that's fine. If there's some
			 * other problem, we'll catch it at the filp_open().
			 */
			do_unlinkat(AT_FDCWD, getname_kernel(cn.corename));
		}

		/*
		 * There is a race between unlinking and creating the
		 * file, but if that causes an EEXIST here, that's
		 * fine - another process raced with us while creating
		 * the corefile, and the other process won. To userspace,
		 * what matters is that at least one of the two processes
		 * writes its coredump successfully, not which one.
		 */
		if (need_suid_safe) {
			/*
			 * Using user namespaces, normal user tasks can change
			 * their current->fs->root to point to arbitrary
			 * directories. Since the intention of the "only dump
			 * with a fully qualified path" rule is to control where
			 * coredumps may be placed using root privileges,
			 * current->fs->root must not be used. Instead, use the
			 * root directory of init_task.
			 */
			struct path root;

			task_lock(&init_task);
			get_fs_root(init_task.fs, &root);
			task_unlock(&init_task);
			cprm.file = file_open_root(&root, cn.corename,
						   open_flags, 0600);
			path_put(&root);
		} else {
			cprm.file = filp_open(cn.corename, open_flags, 0600);
		}
		if (IS_ERR(cprm.file))
			goto fail_unlock;

		inode = file_inode(cprm.file);
		if (inode->i_nlink > 1)
			goto close_fail;
		if (d_unhashed(cprm.file->f_path.dentry))
			goto close_fail;
		/*
		 * AK: actually i see no reason to not allow this for named
		 * pipes etc, but keep the previous behaviour for now.
		 */
		if (!S_ISREG(inode->i_mode))
			goto close_fail;
		/*
		 * Don't dump core if the filesystem changed owner or mode
		 * of the file during file creation. This is an issue when
		 * a process dumps core while its cwd is e.g. on a vfat
		 * filesystem.
		 */
		idmap = file_mnt_idmap(cprm.file);
		if (!vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode),
				    current_fsuid())) {
			coredump_report_failure("Core dump to %s aborted: "
				"cannot preserve file owner", cn.corename);
			goto close_fail;
		}
		if ((inode->i_mode & 0677) != 0600) {
			coredump_report_failure("Core dump to %s aborted: "
				"cannot preserve file permissions", cn.corename);
			goto close_fail;
		}
		if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
			goto close_fail;
		if (do_truncate(idmap, cprm.file->f_path.dentry,
				0, 0, cprm.file))
			goto close_fail;
		break;
	}
	case COREDUMP_PIPE: {
		int argi;
		int dump_count;
		char **helper_argv;
		struct subprocess_info *sub_info;

		if (cprm.limit == 1) {
			/* See umh_coredump_setup() which sets RLIMIT_CORE = 1.
			 *
			 * Normally core limits are irrelevant to pipes, since
			 * we're not writing to the file system, but we use
			 * cprm.limit of 1 here as a special value, this is a
			 * consistent way to catch recursive crashes.
			 * We can still crash if the core_pattern binary sets
			 * RLIM_CORE = !1, but it runs as root, and can do
			 * lots of stupid things.
			 *
			 * Note that we use task_tgid_vnr here to grab the pid
			 * of the process group leader. That way we get the
			 * right pid if a thread in a multi-threaded
			 * core_pattern process dies.
			 */
			coredump_report_failure("RLIMIT_CORE is set to 1, aborting core");
			goto fail_unlock;
		}
		cprm.limit = RLIM_INFINITY;

		dump_count = atomic_inc_return(&core_dump_count);
		if (core_pipe_limit && (core_pipe_limit < dump_count)) {
			coredump_report_failure("over core_pipe_limit, skipping core dump");
			goto fail_dropcount;
		}

		helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv),
					    GFP_KERNEL);
		if (!helper_argv) {
			coredump_report_failure("%s failed to allocate memory", __func__);
			goto fail_dropcount;
		}
		/* argv[] holds offsets into the NUL-separated corename. */
		for (argi = 0; argi < argc; argi++)
			helper_argv[argi] = cn.corename + argv[argi];
		helper_argv[argi] = NULL;

		retval = -ENOMEM;
		sub_info = call_usermodehelper_setup(helper_argv[0],
						helper_argv, NULL, GFP_KERNEL,
						umh_coredump_setup, NULL, &cprm);
		if (sub_info)
			retval = call_usermodehelper_exec(sub_info,
							  UMH_WAIT_EXEC);

		kfree(helper_argv);
		if (retval) {
			coredump_report_failure("|%s pipe failed", cn.corename);
			goto close_fail;
		}
		break;
	}
	case COREDUMP_SOCK: {
#ifdef CONFIG_UNIX
		struct file *file __free(fput) = NULL;
		struct sockaddr_un addr = {
			.sun_family = AF_UNIX,
		};
		ssize_t addr_len;
		struct socket *socket;

		addr_len = strscpy(addr.sun_path, cn.corename);
		if (addr_len < 0)
			goto close_fail;
		addr_len += offsetof(struct sockaddr_un, sun_path) + 1;

		/*
		 * It is possible that the userspace process which is
		 * supposed to handle the coredump and is listening on
		 * the AF_UNIX socket coredumps. Userspace should just
		 * mark itself non dumpable.
		 */

		retval = sock_create_kern(&init_net, AF_UNIX, SOCK_STREAM, 0, &socket);
		if (retval < 0)
			goto close_fail;

		file = sock_alloc_file(socket, 0, NULL);
		if (IS_ERR(file))
			goto close_fail;

		/*
		 * Set the thread-group leader pid which is used for the
		 * peer credentials during connect() below. Then
		 * immediately register it in pidfs...
		 */
		cprm.pid = task_tgid(current);
		retval = pidfs_register_pid(cprm.pid);
		if (retval)
			goto close_fail;

		/*
		 * ... and set the coredump information so userspace
		 * has it available after connect()...
		 */
		pidfs_coredump(&cprm);

		retval = kernel_connect(socket, (struct sockaddr *)(&addr),
					addr_len, O_NONBLOCK | SOCK_COREDUMP);

		/*
		 * ... Make sure to only put our reference after connect() took
		 * its own reference keeping the pidfs entry alive ...
		 */
		pidfs_put_pid(cprm.pid);

		if (retval) {
			if (retval == -EAGAIN)
				coredump_report_failure("Coredump socket %s receive queue full", addr.sun_path);
			else
				coredump_report_failure("Coredump socket connection %s failed %d", addr.sun_path, retval);
			goto close_fail;
		}

		/* ... and validate that @sk_peer_pid matches @cprm.pid. */
		if (WARN_ON_ONCE(unix_peer(socket->sk)->sk_peer_pid != cprm.pid))
			goto close_fail;

		cprm.limit = RLIM_INFINITY;
		cprm.file = no_free_ptr(file);
#else
		coredump_report_failure("Core dump socket support %s disabled", cn.corename);
		goto close_fail;
#endif
		break;
	}
	default:
		WARN_ON_ONCE(true);
		goto close_fail;
	}

	/* get us an unshared descriptor table; almost always a no-op */
	/* The cell spufs coredump code reads the file descriptor tables */
	retval = unshare_files();
	if (retval)
		goto close_fail;
	if (!dump_interrupted()) {
		/*
		 * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would
		 * have this set to NULL.
		 */
		if (!cprm.file) {
			coredump_report_failure("Core dump to |%s disabled", cn.corename);
			goto close_fail;
		}
		if (!dump_vma_snapshot(&cprm))
			goto close_fail;

		file_start_write(cprm.file);
		core_dumped = binfmt->core_dump(&cprm);
		/*
		 * Ensures that file size is big enough to contain the current
		 * file position. This prevents gdb from complaining about
		 * a truncated file if the last "write" to the file was
		 * dump_skip.
		 */
		if (cprm.to_skip) {
			cprm.to_skip--;
			dump_emit(&cprm, "", 1);
		}
		file_end_write(cprm.file);
		free_vma_snapshot(&cprm);
	}

#ifdef CONFIG_UNIX
	/* Let userspace know we're done processing the coredump. */
	if (sock_from_file(cprm.file))
		kernel_sock_shutdown(sock_from_file(cprm.file), SHUT_WR);
#endif

	/*
	 * When core_pipe_limit is set we wait for the coredump server
	 * or usermodehelper to finish before exiting so it can e.g.,
	 * inspect /proc/<pid>.
	 */
	if (core_pipe_limit) {
		switch (cn.core_type) {
		case COREDUMP_PIPE:
			wait_for_dump_helpers(cprm.file);
			break;
#ifdef CONFIG_UNIX
		case COREDUMP_SOCK: {
			ssize_t n;

			/*
			 * We use a simple read to wait for the coredump
			 * processing to finish. Either the socket is
			 * closed or we get sent unexpected data. In
			 * both cases, we're done.
			 */
			n = __kernel_read(cprm.file, &(char){ 0 }, 1, NULL);
			if (n != 0)
				coredump_report_failure("Unexpected data on coredump socket");
			break;
		}
#endif
		default:
			break;
		}
	}

close_fail:
	if (cprm.file)
		filp_close(cprm.file, NULL);
fail_dropcount:
	if (cn.core_type == COREDUMP_PIPE)
		atomic_dec(&core_dump_count);
fail_unlock:
	kfree(argv);
	kfree(cn.corename);
	coredump_finish(core_dumped);
	revert_creds(old_cred);
fail_creds:
	put_cred(cred);
fail:
	return;
}
1018
1019/*
1020 * Core dumping helper functions. These are the only things you should
1021 * do on a core-file: use only these functions to write out all the
1022 * necessary info.
1023 */
/*
 * Write @nr bytes at @addr to the core file, honoring the size limit
 * and pending-kill/freeze interruption.
 *
 * Returns 1 on full success, 0 on any failure (limit exceeded,
 * interrupted, or short write) — positions are only advanced when the
 * entire write succeeded.
 */
static int __dump_emit(struct coredump_params *cprm, const void *addr, int nr)
{
	struct file *file = cprm->file;
	loff_t pos = file->f_pos;
	ssize_t n;

	if (cprm->written + nr > cprm->limit)
		return 0;
	if (dump_interrupted())
		return 0;
	n = __kernel_write(file, addr, nr, &pos);
	if (n != nr)
		return 0;
	file->f_pos = pos;
	cprm->written += n;
	cprm->pos += n;

	return 1;
}
ecc8c772 1043
d0f1088b 1044static int __dump_skip(struct coredump_params *cprm, size_t nr)
10c28d93 1045{
9b56d543
AV
1046 static char zeroes[PAGE_SIZE];
1047 struct file *file = cprm->file;
1c587ee6 1048
4e3299ea 1049 if (file->f_mode & FMODE_LSEEK) {
1c587ee6 1050 if (dump_interrupted() || vfs_llseek(file, nr, SEEK_CUR) < 0)
10c28d93 1051 return 0;
1607f09c 1052 cprm->pos += nr;
9b56d543 1053 return 1;
10c28d93 1054 }
1c587ee6
CB
1055
1056 while (nr > PAGE_SIZE) {
1057 if (!__dump_emit(cprm, zeroes, PAGE_SIZE))
1058 return 0;
1059 nr -= PAGE_SIZE;
1060 }
1061
1062 return __dump_emit(cprm, zeroes, nr);
10c28d93 1063}
d0f1088b 1064
9c7417b5
GU
1065int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
1066{
1067 if (cprm->to_skip) {
1068 if (!__dump_skip(cprm, cprm->to_skip))
1069 return 0;
1070 cprm->to_skip = 0;
1071 }
1072 return __dump_emit(cprm, addr, nr);
1073}
1074EXPORT_SYMBOL(dump_emit);
1075
/*
 * Record a deferred skip up to absolute file position @pos.  The skip is
 * not performed here; it is applied lazily by the next dump_emit() or
 * dump_emit_page() call.
 */
void dump_skip_to(struct coredump_params *cprm, unsigned long pos)
{
	cprm->to_skip = pos - cprm->pos;
}
EXPORT_SYMBOL(dump_skip_to);
1081
/*
 * Record a deferred relative skip of @nr bytes.  Accumulates with any
 * previously requested skip; applied lazily on the next emit.
 */
void dump_skip(struct coredump_params *cprm, size_t nr)
{
	cprm->to_skip += nr;
}
EXPORT_SYMBOL(dump_skip);
1087
1088#ifdef CONFIG_ELF_CORE
06bbaa6d
AV
1089static int dump_emit_page(struct coredump_params *cprm, struct page *page)
1090{
cd598003 1091 struct bio_vec bvec;
06bbaa6d
AV
1092 struct iov_iter iter;
1093 struct file *file = cprm->file;
4f526fef 1094 loff_t pos;
06bbaa6d
AV
1095 ssize_t n;
1096
a50026bd
LT
1097 if (!page)
1098 return 0;
1099
06bbaa6d
AV
1100 if (cprm->to_skip) {
1101 if (!__dump_skip(cprm, cprm->to_skip))
1102 return 0;
1103 cprm->to_skip = 0;
1104 }
1105 if (cprm->written + PAGE_SIZE > cprm->limit)
1106 return 0;
1107 if (dump_interrupted())
1108 return 0;
4f526fef 1109 pos = file->f_pos;
cd598003 1110 bvec_set_page(&bvec, page, PAGE_SIZE, 0);
de4eda9d 1111 iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE);
06bbaa6d
AV
1112 n = __kernel_write_iter(cprm->file, &iter, &pos);
1113 if (n != PAGE_SIZE)
1114 return 0;
1115 file->f_pos = pos;
1116 cprm->written += PAGE_SIZE;
1117 cprm->pos += PAGE_SIZE;
1118
1119 return 1;
1120}
1121
a50026bd
LT
/*
 * If we might get machine checks from kernel accesses during the
 * core dump, let's get those errors early rather than during the
 * IO. This is not performance-critical enough to warrant having
 * all the machine check logic in the iovec paths.
 */
#ifdef copy_mc_to_kernel

/* Bounce page used to surface machine checks before the write path. */
#define dump_page_alloc() alloc_page(GFP_KERNEL)
#define dump_page_free(x) __free_page(x)
/*
 * Copy @src into the bounce page @dst with machine-check awareness.
 * Returns @dst on success, or NULL if the copy faulted (poisoned page);
 * the caller then fails the dump via dump_emit_page(NULL).
 */
static struct page *dump_page_copy(struct page *src, struct page *dst)
{
	void *buf = kmap_local_page(src);
	size_t left = copy_mc_to_kernel(page_address(dst), buf, PAGE_SIZE);
	kunmap_local(buf);
	return left ? NULL : dst;
}

#else

/* We just want to return non-NULL; it's never used. */
#define dump_page_alloc() ERR_PTR(-EINVAL)
#define dump_page_free(x) ((void)(x))
/* No machine-check concern: pass the source page straight through. */
static inline struct page *dump_page_copy(struct page *src, struct page *dst)
{
	return src;
}
#endif
1150
afc63a97
JH
/*
 * Dump the user address range [@start, @start + @len) into the core file,
 * one page at a time.  Unmapped/zero pages become sparse skips.  The
 * mmap_lock is taken in read mode around page lookup and dropped across
 * the (potentially blocking) write and across cond_resched(); the
 * @locked flag tracks lock ownership, and get_dump_page() may itself
 * drop the lock (it updates @locked).  Returns 1 on success, 0 on
 * failure or interruption.
 */
int dump_user_range(struct coredump_params *cprm, unsigned long start,
		    unsigned long len)
{
	unsigned long addr;
	struct page *dump_page;
	int locked, ret;

	/* Bounce page for machine-check-safe copies; see dump_page_copy(). */
	dump_page = dump_page_alloc();
	if (!dump_page)
		return 0;

	ret = 0;
	locked = 0;
	for (addr = start; addr < start + len; addr += PAGE_SIZE) {
		struct page *page;

		/* Re-take the mmap_lock if it was dropped last iteration. */
		if (!locked) {
			if (mmap_read_lock_killable(current->mm))
				goto out;
			locked = 1;
		}

		/*
		 * To avoid having to allocate page tables for virtual address
		 * ranges that have never been used yet, and also to make it
		 * easy to generate sparse core files, use a helper that returns
		 * NULL when encountering an empty page table entry that would
		 * otherwise have been filled with the zero page.
		 */
		page = get_dump_page(addr, &locked);
		if (page) {
			/* Drop the lock before the blocking write. */
			if (locked) {
				mmap_read_unlock(current->mm);
				locked = 0;
			}
			int stop = !dump_emit_page(cprm, dump_page_copy(page, dump_page));
			put_page(page);
			if (stop)
				goto out;
		} else {
			/* Hole: account a page of zeroes lazily. */
			dump_skip(cprm, PAGE_SIZE);
		}

		if (dump_interrupted())
			goto out;

		/* Yield the CPU if needed, dropping the lock first. */
		if (!need_resched())
			continue;
		if (locked) {
			mmap_read_unlock(current->mm);
			locked = 0;
		}
		cond_resched();
	}
	ret = 1;
out:
	if (locked)
		mmap_read_unlock(current->mm);

	dump_page_free(dump_page);
	return ret;
}
1213#endif
1214
22a8cb82
AV
1215int dump_align(struct coredump_params *cprm, int align)
1216{
d0f1088b 1217 unsigned mod = (cprm->pos + cprm->to_skip) & (align - 1);
22a8cb82 1218 if (align & (align - 1))
db51242d 1219 return 0;
d0f1088b
AV
1220 if (mod)
1221 cprm->to_skip += align - mod;
1222 return 1;
22a8cb82
AV
1223}
1224EXPORT_SYMBOL(dump_align);
4d22c75d 1225
f0bc21b2
XN
1226#ifdef CONFIG_SYSCTL
1227
1228void validate_coredump_safety(void)
1229{
1230 if (suid_dumpable == SUID_DUMP_ROOT &&
a9194f88 1231 core_pattern[0] != '/' && core_pattern[0] != '|' && core_pattern[0] != '@') {
c114e994
RK
1232
1233 coredump_report_failure("Unsafe core_pattern used with fs.suid_dumpable=2: "
1234 "pipe handler or fully qualified core dump path required. "
1235 "Set kernel.core_pattern before fs.suid_dumpable.");
f0bc21b2
XN
1236 }
1237}
1238
16195d2c
CB
1239static inline bool check_coredump_socket(void)
1240{
1241 if (core_pattern[0] != '@')
1242 return true;
1243
1244 /*
1245 * Coredump socket must be located in the initial mount
1246 * namespace. Don't give the impression that anything else is
1247 * supported right now.
1248 */
1249 if (current->nsproxy->mnt_ns != init_task.nsproxy->mnt_ns)
1250 return false;
1251
1252 /* Must be an absolute path. */
1253 if (*(core_pattern + 1) != '/')
1254 return false;
1255
1256 return true;
1257}
1258
78eb4ea2 1259static int proc_dostring_coredump(const struct ctl_table *table, int write,
f0bc21b2
XN
1260 void *buffer, size_t *lenp, loff_t *ppos)
1261{
16195d2c
CB
1262 int error;
1263 ssize_t retval;
1264 char old_core_pattern[CORENAME_MAX_SIZE];
1265
1266 retval = strscpy(old_core_pattern, core_pattern, CORENAME_MAX_SIZE);
1267
1268 error = proc_dostring(table, write, buffer, lenp, ppos);
1269 if (error)
1270 return error;
1271 if (!check_coredump_socket()) {
1272 strscpy(core_pattern, old_core_pattern, retval + 1);
1273 return -EINVAL;
1274 }
f0bc21b2 1275
16195d2c 1276 validate_coredump_safety();
f0bc21b2
XN
1277 return error;
1278}
1279
4bbf9c3b
AP
/* Bounds for kernel.core_file_note_size_limit. */
static const unsigned int core_file_note_size_min = CORE_FILE_NOTE_SIZE_DEFAULT;
static const unsigned int core_file_note_size_max = CORE_FILE_NOTE_SIZE_MAX;
/* Newline-separated list of supported core dump modes, read-only sysctl. */
static char core_modes[] = {
	"file\npipe"
#ifdef CONFIG_UNIX
	"\nsocket"
#endif
};

/* kernel.* sysctls controlling core dump behaviour. */
static const struct ctl_table coredump_sysctls[] = {
	{
		.procname	= "core_uses_pid",
		.data		= &core_uses_pid,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Validated/rolled back by the custom handler above. */
		.procname	= "core_pattern",
		.data		= core_pattern,
		.maxlen		= CORENAME_MAX_SIZE,
		.mode		= 0644,
		.proc_handler	= proc_dostring_coredump,
	},
	{
		.procname	= "core_pipe_limit",
		.data		= &core_pipe_limit,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_INT_MAX,
	},
	{
		.procname	= "core_file_note_size_limit",
		.data		= &core_file_note_size_limit,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_douintvec_minmax,
		.extra1		= (unsigned int *)&core_file_note_size_min,
		.extra2		= (unsigned int *)&core_file_note_size_max,
	},
	{
		/*
		 * NOTE(review): data is declared as int here but handled by
		 * proc_douintvec_minmax; harmless for the 0/1 range, but
		 * worth confirming against core_sort_vma's declaration.
		 */
		.procname	= "core_sort_vma",
		.data		= &core_sort_vma,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_douintvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{
		/* Read-only: exposes core_modes above. */
		.procname	= "core_modes",
		.data		= core_modes,
		.maxlen		= sizeof(core_modes) - 1,
		.mode		= 0444,
		.proc_handler	= proc_dostring,
	},
};
1339
/* Register the coredump sysctls under /proc/sys/kernel at fs initcall time. */
static int __init init_fs_coredump_sysctls(void)
{
	register_sysctl_init("kernel", coredump_sysctls);
	return 0;
}
fs_initcall(init_fs_coredump_sysctls);
1346#endif /* CONFIG_SYSCTL */
1347
429a22e7
JH
1348/*
1349 * The purpose of always_dump_vma() is to make sure that special kernel mappings
1350 * that are useful for post-mortem analysis are included in every core dump.
1351 * In that way we ensure that the core dump is fully interpretable later
1352 * without matching up the same kernel and hardware config to see what PC values
1353 * meant. These special mappings include - vDSO, vsyscall, and other
1354 * architecture specific mappings
1355 */
1356static bool always_dump_vma(struct vm_area_struct *vma)
1357{
1358 /* Any vsyscall mappings? */
1359 if (vma == get_gate_vma(vma->vm_mm))
1360 return true;
1361
1362 /*
1363 * Assume that all vmas with a .name op should always be dumped.
1364 * If this changes, a new vm_ops field can easily be added.
1365 */
1366 if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma))
1367 return true;
1368
1369 /*
1370 * arch_vma_name() returns non-NULL for special architecture mappings,
1371 * such as vDSO sections.
1372 */
1373 if (arch_vma_name(vma))
1374 return true;
1375
1376 return false;
1377}
1378
84158b7f
JH
1379#define DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER 1
1380
429a22e7
JH
1381/*
1382 * Decide how much of @vma's contents should be included in a core dump.
1383 */
a07279c9
JH
static unsigned long vma_dump_size(struct vm_area_struct *vma,
				   unsigned long mm_flags)
{
/* Test one MMF_DUMP_* bit of the mm's coredump filter. */
#define FILTER(type)	(mm_flags & (1UL << MMF_DUMP_##type))

	/* always dump the vdso and vsyscall sections */
	if (always_dump_vma(vma))
		goto whole;

	if (vma->vm_flags & VM_DONTDUMP)
		return 0;

	/* support for DAX */
	if (vma_is_dax(vma)) {
		if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED))
			goto whole;
		if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE))
			goto whole;
		return 0;
	}

	/* Hugetlb memory check */
	if (is_vm_hugetlb_page(vma)) {
		if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
			goto whole;
		if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
			goto whole;
		return 0;
	}

	/* Do not dump I/O mapped devices or special mappings */
	if (vma->vm_flags & VM_IO)
		return 0;

	/* By default, dump shared memory if mapped from an anonymous file. */
	if (vma->vm_flags & VM_SHARED) {
		/* i_nlink == 0 means the backing file was unlinked (anonymous). */
		if (file_inode(vma->vm_file)->i_nlink == 0 ?
		    FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED))
			goto whole;
		return 0;
	}

	/* Dump segments that have been written to. */
	if ((!IS_ENABLED(CONFIG_MMU) || vma->anon_vma) && FILTER(ANON_PRIVATE))
		goto whole;
	if (vma->vm_file == NULL)
		return 0;

	if (FILTER(MAPPED_PRIVATE))
		goto whole;

	/*
	 * If this is the beginning of an executable file mapping,
	 * dump the first page to aid in determining what was mapped here.
	 */
	if (FILTER(ELF_HEADERS) &&
	    vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) {
		/* Executable bit set anywhere (0111): dump the first page. */
		if ((READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0)
			return PAGE_SIZE;

		/*
		 * ELF libraries aren't always executable.
		 * We'll want to check whether the mapping starts with the ELF
		 * magic, but not now - we're holding the mmap lock,
		 * so copy_from_user() doesn't work here.
		 * Use a placeholder instead, and fix it up later in
		 * dump_vma_snapshot().
		 */
		return DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER;
	}

#undef FILTER

	return 0;

whole:
	return vma->vm_end - vma->vm_start;
}
a07279c9 1462
a07279c9
JH
1463/*
1464 * Helper function for iterating across a vma list. It ensures that the caller
1465 * will visit `gate_vma' prior to terminating the search.
1466 */
e552cdb8 1467static struct vm_area_struct *coredump_next_vma(struct vma_iterator *vmi,
182ea1d7 1468 struct vm_area_struct *vma,
a07279c9
JH
1469 struct vm_area_struct *gate_vma)
1470{
182ea1d7 1471 if (gate_vma && (vma == gate_vma))
a07279c9 1472 return NULL;
182ea1d7 1473
e552cdb8 1474 vma = vma_next(vmi);
182ea1d7
MWO
1475 if (vma)
1476 return vma;
a07279c9
JH
1477 return gate_vma;
1478}
1479
390031c9
EB
1480static void free_vma_snapshot(struct coredump_params *cprm)
1481{
1482 if (cprm->vma_meta) {
1483 int i;
1484 for (i = 0; i < cprm->vma_count; i++) {
1485 struct file *file = cprm->vma_meta[i].file;
1486 if (file)
1487 fput(file);
1488 }
1489 kvfree(cprm->vma_meta);
1490 cprm->vma_meta = NULL;
1491 }
1492}
1493
7d442a33
BM
1494static int cmp_vma_size(const void *vma_meta_lhs_ptr, const void *vma_meta_rhs_ptr)
1495{
1496 const struct core_vma_metadata *vma_meta_lhs = vma_meta_lhs_ptr;
1497 const struct core_vma_metadata *vma_meta_rhs = vma_meta_rhs_ptr;
1498
1499 if (vma_meta_lhs->dump_size < vma_meta_rhs->dump_size)
1500 return -1;
1501 if (vma_meta_lhs->dump_size > vma_meta_rhs->dump_size)
1502 return 1;
1503 return 0;
1504}
1505
a07279c9
JH
1506/*
1507 * Under the mmap_lock, take a snapshot of relevant information about the task's
1508 * VMAs.
1509 */
/*
 * Under the mmap_lock, take a snapshot of relevant information about the task's
 * VMAs into cprm->vma_meta, so the dump can proceed without holding the lock.
 * Also resolves ELF-header placeholder sizes (which need copy_from_user, so
 * that happens after the lock is dropped) and totals cprm->vma_data_size.
 * Returns true on success; false on signal or allocation failure.
 */
static bool dump_vma_snapshot(struct coredump_params *cprm)
{
	struct vm_area_struct *gate_vma, *vma = NULL;
	struct mm_struct *mm = current->mm;
	VMA_ITERATOR(vmi, mm, 0);
	int i = 0;

	/*
	 * Once the stack expansion code is fixed to not change VMA bounds
	 * under mmap_lock in read mode, this can be changed to take the
	 * mmap_lock in read mode.
	 */
	if (mmap_write_lock_killable(mm))
		return false;

	cprm->vma_data_size = 0;
	gate_vma = get_gate_vma(mm);
	/* One extra slot for the gate vma, if present. */
	cprm->vma_count = mm->map_count + (gate_vma ? 1 : 0);

	cprm->vma_meta = kvmalloc_array(cprm->vma_count, sizeof(*cprm->vma_meta), GFP_KERNEL);
	if (!cprm->vma_meta) {
		mmap_write_unlock(mm);
		return false;
	}

	/* Phase 1 (locked): copy per-VMA metadata and pin backing files. */
	while ((vma = coredump_next_vma(&vmi, vma, gate_vma)) != NULL) {
		struct core_vma_metadata *m = cprm->vma_meta + i;

		m->start = vma->vm_start;
		m->end = vma->vm_end;
		m->flags = vma->vm_flags;
		m->dump_size = vma_dump_size(vma, cprm->mm_flags);
		m->pgoff = vma->vm_pgoff;
		m->file = vma->vm_file;
		if (m->file)
			get_file(m->file);
		i++;
	}

	mmap_write_unlock(mm);

	/* Phase 2 (unlocked): resolve ELF-header placeholders, sum sizes. */
	for (i = 0; i < cprm->vma_count; i++) {
		struct core_vma_metadata *m = cprm->vma_meta + i;

		if (m->dump_size == DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER) {
			char elfmag[SELFMAG];

			/* Dump the first page only if it starts with ELF magic. */
			if (copy_from_user(elfmag, (void __user *)m->start, SELFMAG) ||
					memcmp(elfmag, ELFMAG, SELFMAG) != 0) {
				m->dump_size = 0;
			} else {
				m->dump_size = PAGE_SIZE;
			}
		}

		cprm->vma_data_size += m->dump_size;
	}

	/* Optionally sort by size so small VMAs survive a truncated dump. */
	if (core_sort_vma)
		sort(cprm->vma_meta, cprm->vma_count, sizeof(*cprm->vma_meta),
		     cmp_vma_size, NULL);

	return true;
}
a07279c9 1573}