perf header: Add die information in CPU topology
[linux-2.6-block.git] / tools / perf / builtin-record.c
... / ...
CommitLineData
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * builtin-record.c
4 *
5 * Builtin record command: Record the profile of a workload
6 * (or a CPU, or a PID) into the perf.data output file - for
7 * later analysis via perf report.
8 */
9#include "builtin.h"
10
11#include "perf.h"
12
13#include "util/build-id.h"
14#include "util/util.h"
15#include <subcmd/parse-options.h>
16#include "util/parse-events.h"
17#include "util/config.h"
18
19#include "util/callchain.h"
20#include "util/cgroup.h"
21#include "util/header.h"
22#include "util/event.h"
23#include "util/evlist.h"
24#include "util/evsel.h"
25#include "util/debug.h"
26#include "util/session.h"
27#include "util/tool.h"
28#include "util/symbol.h"
29#include "util/cpumap.h"
30#include "util/thread_map.h"
31#include "util/data.h"
32#include "util/perf_regs.h"
33#include "util/auxtrace.h"
34#include "util/tsc.h"
35#include "util/parse-branch-options.h"
36#include "util/parse-regs-options.h"
37#include "util/llvm-utils.h"
38#include "util/bpf-loader.h"
39#include "util/trigger.h"
40#include "util/perf-hooks.h"
41#include "util/cpu-set-sched.h"
42#include "util/time-utils.h"
43#include "util/units.h"
44#include "util/bpf-event.h"
45#include "asm/bug.h"
46
47#include <errno.h>
48#include <inttypes.h>
49#include <locale.h>
50#include <poll.h>
51#include <unistd.h>
52#include <sched.h>
53#include <signal.h>
54#include <sys/mman.h>
55#include <sys/wait.h>
56#include <linux/time64.h>
57
/*
 * Bookkeeping for rotating the output file ("switch output").
 */
struct switch_output {
	bool		 enabled;
	/* checked by switch_output_signal() together with the trigger state */
	bool		 signal;
	/* byte threshold compared against rec->bytes_written; 0 disables it */
	unsigned long	 size;
	/* non-zero enables the time based switch, see switch_output_time() */
	unsigned long	 time;
	/* presumably the raw option string given by the user - TODO confirm */
	const char	*str;
	bool		 set;
	/*
	 * Ring of previously dumped file names: record__switch_output()
	 * advances cur_file modulo num_files and removes the file that
	 * used to occupy the slot it is about to reuse.
	 */
	char		**filenames;
	int		 num_files;
	int		 cur_file;
};
69
70struct record {
71 struct perf_tool tool;
72 struct record_opts opts;
73 u64 bytes_written;
74 struct perf_data data;
75 struct auxtrace_record *itr;
76 struct perf_evlist *evlist;
77 struct perf_session *session;
78 int realtime_prio;
79 bool no_buildid;
80 bool no_buildid_set;
81 bool no_buildid_cache;
82 bool no_buildid_cache_set;
83 bool buildid_all;
84 bool timestamp_filename;
85 bool timestamp_boundary;
86 struct switch_output switch_output;
87 unsigned long long samples;
88 cpu_set_t affinity_mask;
89};
90
91static volatile int auxtrace_record__snapshot_started;
92static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
93static DEFINE_TRIGGER(switch_output_trigger);
94
95static const char *affinity_tags[PERF_AFFINITY_MAX] = {
96 "SYS", "NODE", "CPU"
97};
98
99static bool switch_output_signal(struct record *rec)
100{
101 return rec->switch_output.signal &&
102 trigger_is_ready(&switch_output_trigger);
103}
104
105static bool switch_output_size(struct record *rec)
106{
107 return rec->switch_output.size &&
108 trigger_is_ready(&switch_output_trigger) &&
109 (rec->bytes_written >= rec->switch_output.size);
110}
111
112static bool switch_output_time(struct record *rec)
113{
114 return rec->switch_output.time &&
115 trigger_is_ready(&switch_output_trigger);
116}
117
118static int record__write(struct record *rec, struct perf_mmap *map __maybe_unused,
119 void *bf, size_t size)
120{
121 struct perf_data_file *file = &rec->session->data->file;
122
123 if (perf_data_file__write(file, bf, size) < 0) {
124 pr_err("failed to write perf data, error: %m\n");
125 return -1;
126 }
127
128 rec->bytes_written += size;
129
130 if (switch_output_size(rec))
131 trigger_hit(&switch_output_trigger);
132
133 return 0;
134}
135
/*
 * Forward declarations: these are defined further down in the file but
 * are already needed by the AIO code below.
 */
static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static size_t zstd_compress(struct perf_session *session,
			    void *dst, size_t dst_size,
			    void *src, size_t src_size);
140
141#ifdef HAVE_AIO_SUPPORT
/*
 * Fill in @cblock and queue an asynchronous write of @size bytes from
 * @buf to @trace_fd at file offset @off.
 *
 * Queueing is retried for as long as aio_write() fails with EAGAIN.  On
 * any other failure the control block is marked free (aio_fildes = -1)
 * and the failure is logged.
 *
 * Returns the result of the last aio_write() call (0 on success).
 */
static int record__aio_write(struct aiocb *cblock, int trace_fd,
			     void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_fildes = trace_fd;
	cblock->aio_buf    = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	for (;;) {
		rc = aio_write(cblock);
		if (rc == 0)
			return rc;

		if (errno != EAGAIN) {
			cblock->aio_fildes = -1;
			pr_err("failed to queue perf data, error: %m\n");
			return rc;
		}
	}
}
166
/*
 * Check on the aio write request described by @cblock.
 *
 * Returns 1 when the whole chunk has been written; the control block is
 * then marked free and the reference on @md taken for the request is
 * dropped.  Returns 0 when the request is still in progress, or when
 * only part of the chunk was written and a new request for the
 * remainder has been queued.
 */
static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock)
{
	ssize_t aio_ret, written;
	size_t rem_size;
	off_t rem_off;
	void *rem_buf;
	int aio_errno = aio_error(cblock);

	if (aio_errno == EINPROGRESS)
		return 0;

	aio_ret = aio_return(cblock);
	written = aio_ret;
	if (aio_ret < 0) {
		if (aio_errno != EINTR)
			pr_err("failed to write perf data, error: %m\n");
		/* nothing was written: the whole chunk is retried below */
		written = 0;
	}

	rem_size = cblock->aio_nbytes - written;
	if (rem_size == 0) {
		cblock->aio_fildes = -1;
		/*
		 * record__aio_pushfn() took a reference on md for this
		 * request; the request is finished, so give it back.
		 */
		perf_mmap__put(md);
		return 1;
	}

	/*
	 * The kernel did not write the whole chunk at once, so restart
	 * the request for the part that is still missing.
	 */
	rem_off = cblock->aio_offset + written;
	rem_buf = (void *)(cblock->aio_buf + written);
	record__aio_write(cblock, cblock->aio_fildes, rem_buf, rem_size, rem_off);

	return 0;
}
212
213static int record__aio_sync(struct perf_mmap *md, bool sync_all)
214{
215 struct aiocb **aiocb = md->aio.aiocb;
216 struct aiocb *cblocks = md->aio.cblocks;
217 struct timespec timeout = { 0, 1000 * 1000 * 1 }; /* 1ms */
218 int i, do_suspend;
219
220 do {
221 do_suspend = 0;
222 for (i = 0; i < md->aio.nr_cblocks; ++i) {
223 if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
224 if (sync_all)
225 aiocb[i] = NULL;
226 else
227 return i;
228 } else {
229 /*
230 * Started aio write is not complete yet
231 * so it has to be waited before the
232 * next allocation.
233 */
234 aiocb[i] = &cblocks[i];
235 do_suspend = 1;
236 }
237 }
238 if (!do_suspend)
239 return -1;
240
241 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
242 if (!(errno == EAGAIN || errno == EINTR))
243 pr_err("failed to sync perf data, error: %m\n");
244 }
245 } while (1);
246}
247
/*
 * Context passed through perf_mmap__push() to record__aio_pushfn().
 */
struct record_aio {
	struct record	*rec;	/* owning record session */
	void		*data;	/* destination buffer being filled */
	size_t		 size;	/* bytes stored in @data so far */
};
253
254static int record__aio_pushfn(struct perf_mmap *map, void *to, void *buf, size_t size)
255{
256 struct record_aio *aio = to;
257
258 /*
259 * map->base data pointed by buf is copied into free map->aio.data[] buffer
260 * to release space in the kernel buffer as fast as possible, calling
261 * perf_mmap__consume() from perf_mmap__push() function.
262 *
263 * That lets the kernel to proceed with storing more profiling data into
264 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
265 *
266 * Coping can be done in two steps in case the chunk of profiling data
267 * crosses the upper bound of the kernel buffer. In this case we first move
268 * part of data from map->start till the upper bound and then the reminder
269 * from the beginning of the kernel buffer till the end of the data chunk.
270 */
271
272 if (record__comp_enabled(aio->rec)) {
273 size = zstd_compress(aio->rec->session, aio->data + aio->size,
274 perf_mmap__mmap_len(map) - aio->size,
275 buf, size);
276 } else {
277 memcpy(aio->data + aio->size, buf, size);
278 }
279
280 if (!aio->size) {
281 /*
282 * Increment map->refcount to guard map->aio.data[] buffer
283 * from premature deallocation because map object can be
284 * released earlier than aio write request started on
285 * map->aio.data[] buffer is complete.
286 *
287 * perf_mmap__put() is done at record__aio_complete()
288 * after started aio request completion or at record__aio_push()
289 * if the request failed to start.
290 */
291 perf_mmap__get(map);
292 }
293
294 aio->size += size;
295
296 return size;
297}
298
299static int record__aio_push(struct record *rec, struct perf_mmap *map, off_t *off)
300{
301 int ret, idx;
302 int trace_fd = rec->session->data->file.fd;
303 struct record_aio aio = { .rec = rec, .size = 0 };
304
305 /*
306 * Call record__aio_sync() to wait till map->aio.data[] buffer
307 * becomes available after previous aio write operation.
308 */
309
310 idx = record__aio_sync(map, false);
311 aio.data = map->aio.data[idx];
312 ret = perf_mmap__push(map, &aio, record__aio_pushfn);
313 if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
314 return ret;
315
316 rec->samples++;
317 ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
318 if (!ret) {
319 *off += aio.size;
320 rec->bytes_written += aio.size;
321 if (switch_output_size(rec))
322 trigger_hit(&switch_output_trigger);
323 } else {
324 /*
325 * Decrement map->refcount incremented in record__aio_pushfn()
326 * back if record__aio_write() operation failed to start, otherwise
327 * map->refcount is decremented in record__aio_complete() after
328 * aio write operation finishes successfully.
329 */
330 perf_mmap__put(map);
331 }
332
333 return ret;
334}
335
/* Return the current file offset of @trace_fd, as reported by lseek(). */
static off_t record__aio_get_pos(int trace_fd)
{
	off_t pos = lseek(trace_fd, 0, SEEK_CUR);

	return pos;
}
340
/* Move the file offset of @trace_fd to @pos; the lseek() result is ignored. */
static void record__aio_set_pos(int trace_fd, off_t pos)
{
	lseek(trace_fd, pos, SEEK_SET);
}
345
346static void record__aio_mmap_read_sync(struct record *rec)
347{
348 int i;
349 struct perf_evlist *evlist = rec->evlist;
350 struct perf_mmap *maps = evlist->mmap;
351
352 if (!record__aio_enabled(rec))
353 return;
354
355 for (i = 0; i < evlist->nr_mmaps; i++) {
356 struct perf_mmap *map = &maps[i];
357
358 if (map->base)
359 record__aio_sync(map, true);
360 }
361}
362
363static int nr_cblocks_default = 1;
364static int nr_cblocks_max = 4;
365
366static int record__aio_parse(const struct option *opt,
367 const char *str,
368 int unset)
369{
370 struct record_opts *opts = (struct record_opts *)opt->value;
371
372 if (unset) {
373 opts->nr_cblocks = 0;
374 } else {
375 if (str)
376 opts->nr_cblocks = strtol(str, NULL, 0);
377 if (!opts->nr_cblocks)
378 opts->nr_cblocks = nr_cblocks_default;
379 }
380
381 return 0;
382}
383#else /* HAVE_AIO_SUPPORT */
384static int nr_cblocks_max = 0;
385
386static int record__aio_push(struct record *rec __maybe_unused, struct perf_mmap *map __maybe_unused,
387 off_t *off __maybe_unused)
388{
389 return -1;
390}
391
392static off_t record__aio_get_pos(int trace_fd __maybe_unused)
393{
394 return -1;
395}
396
397static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
398{
399}
400
401static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
402{
403}
404#endif
405
406static int record__aio_enabled(struct record *rec)
407{
408 return rec->opts.nr_cblocks > 0;
409}
410
411#define MMAP_FLUSH_DEFAULT 1
412static int record__mmap_flush_parse(const struct option *opt,
413 const char *str,
414 int unset)
415{
416 int flush_max;
417 struct record_opts *opts = (struct record_opts *)opt->value;
418 static struct parse_tag tags[] = {
419 { .tag = 'B', .mult = 1 },
420 { .tag = 'K', .mult = 1 << 10 },
421 { .tag = 'M', .mult = 1 << 20 },
422 { .tag = 'G', .mult = 1 << 30 },
423 { .tag = 0 },
424 };
425
426 if (unset)
427 return 0;
428
429 if (str) {
430 opts->mmap_flush = parse_tag_value(str, tags);
431 if (opts->mmap_flush == (int)-1)
432 opts->mmap_flush = strtol(str, NULL, 0);
433 }
434
435 if (!opts->mmap_flush)
436 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
437
438 flush_max = perf_evlist__mmap_size(opts->mmap_pages);
439 flush_max /= 4;
440 if (opts->mmap_flush > flush_max)
441 opts->mmap_flush = flush_max;
442
443 return 0;
444}
445
446#ifdef HAVE_ZSTD_SUPPORT
447static unsigned int comp_level_default = 1;
448
449static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
450{
451 struct record_opts *opts = opt->value;
452
453 if (unset) {
454 opts->comp_level = 0;
455 } else {
456 if (str)
457 opts->comp_level = strtol(str, NULL, 0);
458 if (!opts->comp_level)
459 opts->comp_level = comp_level_default;
460 }
461
462 return 0;
463}
464#endif
465static unsigned int comp_level_max = 22;
466
467static int record__comp_enabled(struct record *rec)
468{
469 return rec->opts.comp_level > 0;
470}
471
472static int process_synthesized_event(struct perf_tool *tool,
473 union perf_event *event,
474 struct perf_sample *sample __maybe_unused,
475 struct machine *machine __maybe_unused)
476{
477 struct record *rec = container_of(tool, struct record, tool);
478 return record__write(rec, NULL, event, event->header.size);
479}
480
481static int record__pushfn(struct perf_mmap *map, void *to, void *bf, size_t size)
482{
483 struct record *rec = to;
484
485 if (record__comp_enabled(rec)) {
486 size = zstd_compress(rec->session, map->data, perf_mmap__mmap_len(map), bf, size);
487 bf = map->data;
488 }
489
490 rec->samples++;
491 return record__write(rec, map, bf, size);
492}
493
/* Flags shared between the signal handlers and the rest of the file. */
static volatile int done;
static volatile int signr = -1;
static volatile int child_finished;

/*
 * Generic signal handler: SIGCHLD marks the child as finished, any other
 * signal is remembered in signr.  Either way the 'done' flag is raised.
 */
static void sig_handler(int sig)
{
	if (sig != SIGCHLD)
		signr = sig;
	else
		child_finished = 1;

	done = 1;
}
507
/*
 * SIGSEGV handler: let the perf hooks code recover first, then hand the
 * signal to the stack dumping handler.
 */
static void sigsegv_handler(int sig)
{
	perf_hooks__recover();

	sighandler_dump_stack(sig);
}
513
514static void record__sig_exit(void)
515{
516 if (signr == -1)
517 return;
518
519 signal(signr, SIG_DFL);
520 raise(signr);
521}
522
523#ifdef HAVE_AUXTRACE_SUPPORT
524
525static int record__process_auxtrace(struct perf_tool *tool,
526 struct perf_mmap *map,
527 union perf_event *event, void *data1,
528 size_t len1, void *data2, size_t len2)
529{
530 struct record *rec = container_of(tool, struct record, tool);
531 struct perf_data *data = &rec->data;
532 size_t padding;
533 u8 pad[8] = {0};
534
535 if (!perf_data__is_pipe(data) && !perf_data__is_dir(data)) {
536 off_t file_offset;
537 int fd = perf_data__fd(data);
538 int err;
539
540 file_offset = lseek(fd, 0, SEEK_CUR);
541 if (file_offset == -1)
542 return -1;
543 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
544 event, file_offset);
545 if (err)
546 return err;
547 }
548
549 /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
550 padding = (len1 + len2) & 7;
551 if (padding)
552 padding = 8 - padding;
553
554 record__write(rec, map, event, event->header.size);
555 record__write(rec, map, data1, len1);
556 if (len2)
557 record__write(rec, map, data2, len2);
558 record__write(rec, map, &pad, padding);
559
560 return 0;
561}
562
563static int record__auxtrace_mmap_read(struct record *rec,
564 struct perf_mmap *map)
565{
566 int ret;
567
568 ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
569 record__process_auxtrace);
570 if (ret < 0)
571 return ret;
572
573 if (ret)
574 rec->samples++;
575
576 return 0;
577}
578
579static int record__auxtrace_mmap_read_snapshot(struct record *rec,
580 struct perf_mmap *map)
581{
582 int ret;
583
584 ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
585 record__process_auxtrace,
586 rec->opts.auxtrace_snapshot_size);
587 if (ret < 0)
588 return ret;
589
590 if (ret)
591 rec->samples++;
592
593 return 0;
594}
595
596static int record__auxtrace_read_snapshot_all(struct record *rec)
597{
598 int i;
599 int rc = 0;
600
601 for (i = 0; i < rec->evlist->nr_mmaps; i++) {
602 struct perf_mmap *map = &rec->evlist->mmap[i];
603
604 if (!map->auxtrace_mmap.base)
605 continue;
606
607 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
608 rc = -1;
609 goto out;
610 }
611 }
612out:
613 return rc;
614}
615
616static void record__read_auxtrace_snapshot(struct record *rec)
617{
618 pr_debug("Recording AUX area tracing snapshot\n");
619 if (record__auxtrace_read_snapshot_all(rec) < 0) {
620 trigger_error(&auxtrace_snapshot_trigger);
621 } else {
622 if (auxtrace_record__snapshot_finish(rec->itr))
623 trigger_error(&auxtrace_snapshot_trigger);
624 else
625 trigger_ready(&auxtrace_snapshot_trigger);
626 }
627}
628
629static int record__auxtrace_init(struct record *rec)
630{
631 int err;
632
633 if (!rec->itr) {
634 rec->itr = auxtrace_record__init(rec->evlist, &err);
635 if (err)
636 return err;
637 }
638
639 err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
640 rec->opts.auxtrace_snapshot_opts);
641 if (err)
642 return err;
643
644 return auxtrace_parse_filters(rec->evlist);
645}
646
647#else
648
649static inline
650int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
651 struct perf_mmap *map __maybe_unused)
652{
653 return 0;
654}
655
656static inline
657void record__read_auxtrace_snapshot(struct record *rec __maybe_unused)
658{
659}
660
661static inline
662int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
663{
664 return 0;
665}
666
667static int record__auxtrace_init(struct record *rec __maybe_unused)
668{
669 return 0;
670}
671
672#endif
673
674static int record__mmap_evlist(struct record *rec,
675 struct perf_evlist *evlist)
676{
677 struct record_opts *opts = &rec->opts;
678 char msg[512];
679
680 if (opts->affinity != PERF_AFFINITY_SYS)
681 cpu__setup_cpunode_map();
682
683 if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
684 opts->auxtrace_mmap_pages,
685 opts->auxtrace_snapshot_mode,
686 opts->nr_cblocks, opts->affinity,
687 opts->mmap_flush, opts->comp_level) < 0) {
688 if (errno == EPERM) {
689 pr_err("Permission error mapping pages.\n"
690 "Consider increasing "
691 "/proc/sys/kernel/perf_event_mlock_kb,\n"
692 "or try again with a smaller value of -m/--mmap_pages.\n"
693 "(current value: %u,%u)\n",
694 opts->mmap_pages, opts->auxtrace_mmap_pages);
695 return -errno;
696 } else {
697 pr_err("failed to mmap with %d (%s)\n", errno,
698 str_error_r(errno, msg, sizeof(msg)));
699 if (errno)
700 return -errno;
701 else
702 return -EINVAL;
703 }
704 }
705 return 0;
706}
707
708static int record__mmap(struct record *rec)
709{
710 return record__mmap_evlist(rec, rec->evlist);
711}
712
713static int record__open(struct record *rec)
714{
715 char msg[BUFSIZ];
716 struct perf_evsel *pos;
717 struct perf_evlist *evlist = rec->evlist;
718 struct perf_session *session = rec->session;
719 struct record_opts *opts = &rec->opts;
720 int rc = 0;
721
722 /*
723 * For initial_delay we need to add a dummy event so that we can track
724 * PERF_RECORD_MMAP while we wait for the initial delay to enable the
725 * real events, the ones asked by the user.
726 */
727 if (opts->initial_delay) {
728 if (perf_evlist__add_dummy(evlist))
729 return -ENOMEM;
730
731 pos = perf_evlist__first(evlist);
732 pos->tracking = 0;
733 pos = perf_evlist__last(evlist);
734 pos->tracking = 1;
735 pos->attr.enable_on_exec = 1;
736 }
737
738 perf_evlist__config(evlist, opts, &callchain_param);
739
740 evlist__for_each_entry(evlist, pos) {
741try_again:
742 if (perf_evsel__open(pos, pos->cpus, pos->threads) < 0) {
743 if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
744 if (verbose > 0)
745 ui__warning("%s\n", msg);
746 goto try_again;
747 }
748 if ((errno == EINVAL || errno == EBADF) &&
749 pos->leader != pos &&
750 pos->weak_group) {
751 pos = perf_evlist__reset_weak_group(evlist, pos);
752 goto try_again;
753 }
754 rc = -errno;
755 perf_evsel__open_strerror(pos, &opts->target,
756 errno, msg, sizeof(msg));
757 ui__error("%s\n", msg);
758 goto out;
759 }
760
761 pos->supported = true;
762 }
763
764 if (perf_evlist__apply_filters(evlist, &pos)) {
765 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
766 pos->filter, perf_evsel__name(pos), errno,
767 str_error_r(errno, msg, sizeof(msg)));
768 rc = -1;
769 goto out;
770 }
771
772 rc = record__mmap(rec);
773 if (rc)
774 goto out;
775
776 session->evlist = evlist;
777 perf_session__set_id_hdr_size(session);
778out:
779 return rc;
780}
781
782static int process_sample_event(struct perf_tool *tool,
783 union perf_event *event,
784 struct perf_sample *sample,
785 struct perf_evsel *evsel,
786 struct machine *machine)
787{
788 struct record *rec = container_of(tool, struct record, tool);
789
790 if (rec->evlist->first_sample_time == 0)
791 rec->evlist->first_sample_time = sample->time;
792
793 rec->evlist->last_sample_time = sample->time;
794
795 if (rec->buildid_all)
796 return 0;
797
798 rec->samples++;
799 return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
800}
801
802static int process_buildids(struct record *rec)
803{
804 struct perf_session *session = rec->session;
805
806 if (perf_data__size(&rec->data) == 0)
807 return 0;
808
809 /*
810 * During this process, it'll load kernel map and replace the
811 * dso->long_name to a real pathname it found. In this case
812 * we prefer the vmlinux path like
813 * /lib/modules/3.16.4/build/vmlinux
814 *
815 * rather than build-id path (in debug directory).
816 * $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
817 */
818 symbol_conf.ignore_vmlinux_buildid = true;
819
820 /*
821 * If --buildid-all is given, it marks all DSO regardless of hits,
822 * so no need to process samples. But if timestamp_boundary is enabled,
823 * it still needs to walk on all samples to get the timestamps of
824 * first/last samples.
825 */
826 if (rec->buildid_all && !rec->timestamp_boundary)
827 rec->tool.sample = NULL;
828
829 return perf_session__process_events(session);
830}
831
832static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
833{
834 int err;
835 struct perf_tool *tool = data;
836 /*
837 *As for guest kernel when processing subcommand record&report,
838 *we arrange module mmap prior to guest kernel mmap and trigger
839 *a preload dso because default guest module symbols are loaded
840 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
841 *method is used to avoid symbol missing when the first addr is
842 *in module instead of in guest kernel.
843 */
844 err = perf_event__synthesize_modules(tool, process_synthesized_event,
845 machine);
846 if (err < 0)
847 pr_err("Couldn't record guest kernel [%d]'s reference"
848 " relocation symbol.\n", machine->pid);
849
850 /*
851 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
852 * have no _text sometimes.
853 */
854 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
855 machine);
856 if (err < 0)
857 pr_err("Couldn't record guest kernel [%d]'s reference"
858 " relocation symbol.\n", machine->pid);
859}
860
861static struct perf_event_header finished_round_event = {
862 .size = sizeof(struct perf_event_header),
863 .type = PERF_RECORD_FINISHED_ROUND,
864};
865
866static void record__adjust_affinity(struct record *rec, struct perf_mmap *map)
867{
868 if (rec->opts.affinity != PERF_AFFINITY_SYS &&
869 !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
870 CPU_ZERO(&rec->affinity_mask);
871 CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
872 sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
873 }
874}
875
876static size_t process_comp_header(void *record, size_t increment)
877{
878 struct compressed_event *event = record;
879 size_t size = sizeof(*event);
880
881 if (increment) {
882 event->header.size += increment;
883 return increment;
884 }
885
886 event->header.type = PERF_RECORD_COMPRESSED;
887 event->header.size = size;
888
889 return size;
890}
891
892static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
893 void *src, size_t src_size)
894{
895 size_t compressed;
896 size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct compressed_event) - 1;
897
898 compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
899 max_record_size, process_comp_header);
900
901 session->bytes_transferred += src_size;
902 session->bytes_compressed += compressed;
903
904 return compressed;
905}
906
907static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evlist,
908 bool overwrite, bool synch)
909{
910 u64 bytes_written = rec->bytes_written;
911 int i;
912 int rc = 0;
913 struct perf_mmap *maps;
914 int trace_fd = rec->data.file.fd;
915 off_t off = 0;
916
917 if (!evlist)
918 return 0;
919
920 maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
921 if (!maps)
922 return 0;
923
924 if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
925 return 0;
926
927 if (record__aio_enabled(rec))
928 off = record__aio_get_pos(trace_fd);
929
930 for (i = 0; i < evlist->nr_mmaps; i++) {
931 u64 flush = 0;
932 struct perf_mmap *map = &maps[i];
933
934 if (map->base) {
935 record__adjust_affinity(rec, map);
936 if (synch) {
937 flush = map->flush;
938 map->flush = 1;
939 }
940 if (!record__aio_enabled(rec)) {
941 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
942 if (synch)
943 map->flush = flush;
944 rc = -1;
945 goto out;
946 }
947 } else {
948 if (record__aio_push(rec, map, &off) < 0) {
949 record__aio_set_pos(trace_fd, off);
950 if (synch)
951 map->flush = flush;
952 rc = -1;
953 goto out;
954 }
955 }
956 if (synch)
957 map->flush = flush;
958 }
959
960 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
961 record__auxtrace_mmap_read(rec, map) != 0) {
962 rc = -1;
963 goto out;
964 }
965 }
966
967 if (record__aio_enabled(rec))
968 record__aio_set_pos(trace_fd, off);
969
970 /*
971 * Mark the round finished in case we wrote
972 * at least one event.
973 */
974 if (bytes_written != rec->bytes_written)
975 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
976
977 if (overwrite)
978 perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
979out:
980 return rc;
981}
982
983static int record__mmap_read_all(struct record *rec, bool synch)
984{
985 int err;
986
987 err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
988 if (err)
989 return err;
990
991 return record__mmap_read_evlist(rec, rec->evlist, true, synch);
992}
993
994static void record__init_features(struct record *rec)
995{
996 struct perf_session *session = rec->session;
997 int feat;
998
999 for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1000 perf_header__set_feat(&session->header, feat);
1001
1002 if (rec->no_buildid)
1003 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1004
1005 if (!have_tracepoints(&rec->evlist->entries))
1006 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1007
1008 if (!rec->opts.branch_stack)
1009 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1010
1011 if (!rec->opts.full_auxtrace)
1012 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1013
1014 if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1015 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1016
1017 perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1018 if (!record__comp_enabled(rec))
1019 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1020
1021 perf_header__clear_feat(&session->header, HEADER_STAT);
1022}
1023
1024static void
1025record__finish_output(struct record *rec)
1026{
1027 struct perf_data *data = &rec->data;
1028 int fd = perf_data__fd(data);
1029
1030 if (data->is_pipe)
1031 return;
1032
1033 rec->session->header.data_size += rec->bytes_written;
1034 data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1035
1036 if (!rec->no_buildid) {
1037 process_buildids(rec);
1038
1039 if (rec->buildid_all)
1040 dsos__hit_all(rec->session);
1041 }
1042 perf_session__write_header(rec->session, rec->evlist, fd, true);
1043
1044 return;
1045}
1046
1047static int record__synthesize_workload(struct record *rec, bool tail)
1048{
1049 int err;
1050 struct thread_map *thread_map;
1051
1052 if (rec->opts.tail_synthesize != tail)
1053 return 0;
1054
1055 thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1056 if (thread_map == NULL)
1057 return -1;
1058
1059 err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1060 process_synthesized_event,
1061 &rec->session->machines.host,
1062 rec->opts.sample_address);
1063 thread_map__put(thread_map);
1064 return err;
1065}
1066
1067static int record__synthesize(struct record *rec, bool tail);
1068
1069static int
1070record__switch_output(struct record *rec, bool at_exit)
1071{
1072 struct perf_data *data = &rec->data;
1073 int fd, err;
1074 char *new_filename;
1075
1076 /* Same Size: "2015122520103046"*/
1077 char timestamp[] = "InvalidTimestamp";
1078
1079 record__aio_mmap_read_sync(rec);
1080
1081 record__synthesize(rec, true);
1082 if (target__none(&rec->opts.target))
1083 record__synthesize_workload(rec, true);
1084
1085 rec->samples = 0;
1086 record__finish_output(rec);
1087 err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1088 if (err) {
1089 pr_err("Failed to get current timestamp\n");
1090 return -EINVAL;
1091 }
1092
1093 fd = perf_data__switch(data, timestamp,
1094 rec->session->header.data_offset,
1095 at_exit, &new_filename);
1096 if (fd >= 0 && !at_exit) {
1097 rec->bytes_written = 0;
1098 rec->session->header.data_size = 0;
1099 }
1100
1101 if (!quiet)
1102 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1103 data->path, timestamp);
1104
1105 if (rec->switch_output.num_files) {
1106 int n = rec->switch_output.cur_file + 1;
1107
1108 if (n >= rec->switch_output.num_files)
1109 n = 0;
1110 rec->switch_output.cur_file = n;
1111 if (rec->switch_output.filenames[n]) {
1112 remove(rec->switch_output.filenames[n]);
1113 free(rec->switch_output.filenames[n]);
1114 }
1115 rec->switch_output.filenames[n] = new_filename;
1116 } else {
1117 free(new_filename);
1118 }
1119
1120 /* Output tracking events */
1121 if (!at_exit) {
1122 record__synthesize(rec, false);
1123
1124 /*
1125 * In 'perf record --switch-output' without -a,
1126 * record__synthesize() in record__switch_output() won't
1127 * generate tracking events because there's no thread_map
1128 * in evlist. Which causes newly created perf.data doesn't
1129 * contain map and comm information.
1130 * Create a fake thread_map and directly call
1131 * perf_event__synthesize_thread_map() for those events.
1132 */
1133 if (target__none(&rec->opts.target))
1134 record__synthesize_workload(rec, false);
1135 }
1136 return fd;
1137}
1138
1139static volatile int workload_exec_errno;
1140
1141/*
1142 * perf_evlist__prepare_workload will send a SIGUSR1
1143 * if the fork fails, since we asked by setting its
1144 * want_signal to true.
1145 */
1146static void workload_exec_failed_signal(int signo __maybe_unused,
1147 siginfo_t *info,
1148 void *ucontext __maybe_unused)
1149{
1150 workload_exec_errno = info->si_value.sival_int;
1151 done = 1;
1152 child_finished = 1;
1153}
1154
/* Signal handlers defined later in the file. */
static void snapshot_sig_handler(int sig);
static void alarm_sig_handler(int sig);
1157
1158int __weak
1159perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused,
1160 struct perf_tool *tool __maybe_unused,
1161 perf_event__handler_t process __maybe_unused,
1162 struct machine *machine __maybe_unused)
1163{
1164 return 0;
1165}
1166
1167static const struct perf_event_mmap_page *
1168perf_evlist__pick_pc(struct perf_evlist *evlist)
1169{
1170 if (evlist) {
1171 if (evlist->mmap && evlist->mmap[0].base)
1172 return evlist->mmap[0].base;
1173 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].base)
1174 return evlist->overwrite_mmap[0].base;
1175 }
1176 return NULL;
1177}
1178
1179static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1180{
1181 const struct perf_event_mmap_page *pc;
1182
1183 pc = perf_evlist__pick_pc(rec->evlist);
1184 if (pc)
1185 return pc;
1186 return NULL;
1187}
1188
1189static int record__synthesize(struct record *rec, bool tail)
1190{
1191 struct perf_session *session = rec->session;
1192 struct machine *machine = &session->machines.host;
1193 struct perf_data *data = &rec->data;
1194 struct record_opts *opts = &rec->opts;
1195 struct perf_tool *tool = &rec->tool;
1196 int fd = perf_data__fd(data);
1197 int err = 0;
1198
1199 if (rec->opts.tail_synthesize != tail)
1200 return 0;
1201
1202 if (data->is_pipe) {
1203 /*
1204 * We need to synthesize events first, because some
1205 * features works on top of them (on report side).
1206 */
1207 err = perf_event__synthesize_attrs(tool, rec->evlist,
1208 process_synthesized_event);
1209 if (err < 0) {
1210 pr_err("Couldn't synthesize attrs.\n");
1211 goto out;
1212 }
1213
1214 err = perf_event__synthesize_features(tool, session, rec->evlist,
1215 process_synthesized_event);
1216 if (err < 0) {
1217 pr_err("Couldn't synthesize features.\n");
1218 return err;
1219 }
1220
1221 if (have_tracepoints(&rec->evlist->entries)) {
1222 /*
1223 * FIXME err <= 0 here actually means that
1224 * there were no tracepoints so its not really
1225 * an error, just that we don't need to
1226 * synthesize anything. We really have to
1227 * return this more properly and also
1228 * propagate errors that now are calling die()
1229 */
1230 err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
1231 process_synthesized_event);
1232 if (err <= 0) {
1233 pr_err("Couldn't record tracing data.\n");
1234 goto out;
1235 }
1236 rec->bytes_written += err;
1237 }
1238 }
1239
1240 err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1241 process_synthesized_event, machine);
1242 if (err)
1243 goto out;
1244
1245 if (rec->opts.full_auxtrace) {
1246 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1247 session, process_synthesized_event);
1248 if (err)
1249 goto out;
1250 }
1251
1252 if (!perf_evlist__exclude_kernel(rec->evlist)) {
1253 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1254 machine);
1255 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1256 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1257 "Check /proc/kallsyms permission or run as root.\n");
1258
1259 err = perf_event__synthesize_modules(tool, process_synthesized_event,
1260 machine);
1261 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1262 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1263 "Check /proc/modules permission or run as root.\n");
1264 }
1265
1266 if (perf_guest) {
1267 machines__process_guests(&session->machines,
1268 perf_event__synthesize_guest_os, tool);
1269 }
1270
1271 err = perf_event__synthesize_extra_attr(&rec->tool,
1272 rec->evlist,
1273 process_synthesized_event,
1274 data->is_pipe);
1275 if (err)
1276 goto out;
1277
1278 err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->threads,
1279 process_synthesized_event,
1280 NULL);
1281 if (err < 0) {
1282 pr_err("Couldn't synthesize thread map.\n");
1283 return err;
1284 }
1285
1286 err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->cpus,
1287 process_synthesized_event, NULL);
1288 if (err < 0) {
1289 pr_err("Couldn't synthesize cpu map.\n");
1290 return err;
1291 }
1292
1293 err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1294 machine, opts);
1295 if (err < 0)
1296 pr_warning("Couldn't synthesize bpf events.\n");
1297
1298 err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
1299 process_synthesized_event, opts->sample_address,
1300 1);
1301out:
1302 return err;
1303}
1304
1305static int __cmd_record(struct record *rec, int argc, const char **argv)
1306{
1307 int err;
1308 int status = 0;
1309 unsigned long waking = 0;
1310 const bool forks = argc > 0;
1311 struct perf_tool *tool = &rec->tool;
1312 struct record_opts *opts = &rec->opts;
1313 struct perf_data *data = &rec->data;
1314 struct perf_session *session;
1315 bool disabled = false, draining = false;
1316 struct perf_evlist *sb_evlist = NULL;
1317 int fd;
1318 float ratio = 0;
1319
1320 atexit(record__sig_exit);
1321 signal(SIGCHLD, sig_handler);
1322 signal(SIGINT, sig_handler);
1323 signal(SIGTERM, sig_handler);
1324 signal(SIGSEGV, sigsegv_handler);
1325
1326 if (rec->opts.record_namespaces)
1327 tool->namespace_events = true;
1328
1329 if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1330 signal(SIGUSR2, snapshot_sig_handler);
1331 if (rec->opts.auxtrace_snapshot_mode)
1332 trigger_on(&auxtrace_snapshot_trigger);
1333 if (rec->switch_output.enabled)
1334 trigger_on(&switch_output_trigger);
1335 } else {
1336 signal(SIGUSR2, SIG_IGN);
1337 }
1338
1339 session = perf_session__new(data, false, tool);
1340 if (session == NULL) {
1341 pr_err("Perf session creation failed.\n");
1342 return -1;
1343 }
1344
1345 fd = perf_data__fd(data);
1346 rec->session = session;
1347
1348 if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1349 pr_err("Compression initialization failed.\n");
1350 return -1;
1351 }
1352
1353 session->header.env.comp_type = PERF_COMP_ZSTD;
1354 session->header.env.comp_level = rec->opts.comp_level;
1355
1356 record__init_features(rec);
1357
1358 if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1359 session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
1360
1361 if (forks) {
1362 err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
1363 argv, data->is_pipe,
1364 workload_exec_failed_signal);
1365 if (err < 0) {
1366 pr_err("Couldn't run the workload!\n");
1367 status = err;
1368 goto out_delete_session;
1369 }
1370 }
1371
1372 /*
1373 * If we have just single event and are sending data
1374 * through pipe, we need to force the ids allocation,
1375 * because we synthesize event name through the pipe
1376 * and need the id for that.
1377 */
1378 if (data->is_pipe && rec->evlist->nr_entries == 1)
1379 rec->opts.sample_id = true;
1380
1381 if (record__open(rec) != 0) {
1382 err = -1;
1383 goto out_child;
1384 }
1385 session->header.env.comp_mmap_len = session->evlist->mmap_len;
1386
1387 err = bpf__apply_obj_config();
1388 if (err) {
1389 char errbuf[BUFSIZ];
1390
1391 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1392 pr_err("ERROR: Apply config to BPF failed: %s\n",
1393 errbuf);
1394 goto out_child;
1395 }
1396
1397 /*
1398 * Normally perf_session__new would do this, but it doesn't have the
1399 * evlist.
1400 */
1401 if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
1402 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1403 rec->tool.ordered_events = false;
1404 }
1405
1406 if (!rec->evlist->nr_groups)
1407 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1408
1409 if (data->is_pipe) {
1410 err = perf_header__write_pipe(fd);
1411 if (err < 0)
1412 goto out_child;
1413 } else {
1414 err = perf_session__write_header(session, rec->evlist, fd, false);
1415 if (err < 0)
1416 goto out_child;
1417 }
1418
1419 if (!rec->no_buildid
1420 && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1421 pr_err("Couldn't generate buildids. "
1422 "Use --no-buildid to profile anyway.\n");
1423 err = -1;
1424 goto out_child;
1425 }
1426
1427 if (!opts->no_bpf_event)
1428 bpf_event__add_sb_event(&sb_evlist, &session->header.env);
1429
1430 if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
1431 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1432 opts->no_bpf_event = true;
1433 }
1434
1435 err = record__synthesize(rec, false);
1436 if (err < 0)
1437 goto out_child;
1438
1439 if (rec->realtime_prio) {
1440 struct sched_param param;
1441
1442 param.sched_priority = rec->realtime_prio;
1443 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1444 pr_err("Could not set realtime priority.\n");
1445 err = -1;
1446 goto out_child;
1447 }
1448 }
1449
1450 /*
1451 * When perf is starting the traced process, all the events
1452 * (apart from group members) have enable_on_exec=1 set,
1453 * so don't spoil it by prematurely enabling them.
1454 */
1455 if (!target__none(&opts->target) && !opts->initial_delay)
1456 perf_evlist__enable(rec->evlist);
1457
1458 /*
1459 * Let the child rip
1460 */
1461 if (forks) {
1462 struct machine *machine = &session->machines.host;
1463 union perf_event *event;
1464 pid_t tgid;
1465
1466 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1467 if (event == NULL) {
1468 err = -ENOMEM;
1469 goto out_child;
1470 }
1471
1472 /*
1473 * Some H/W events are generated before COMM event
1474 * which is emitted during exec(), so perf script
1475 * cannot see a correct process name for those events.
1476 * Synthesize COMM event to prevent it.
1477 */
1478 tgid = perf_event__synthesize_comm(tool, event,
1479 rec->evlist->workload.pid,
1480 process_synthesized_event,
1481 machine);
1482 free(event);
1483
1484 if (tgid == -1)
1485 goto out_child;
1486
1487 event = malloc(sizeof(event->namespaces) +
1488 (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1489 machine->id_hdr_size);
1490 if (event == NULL) {
1491 err = -ENOMEM;
1492 goto out_child;
1493 }
1494
1495 /*
1496 * Synthesize NAMESPACES event for the command specified.
1497 */
1498 perf_event__synthesize_namespaces(tool, event,
1499 rec->evlist->workload.pid,
1500 tgid, process_synthesized_event,
1501 machine);
1502 free(event);
1503
1504 perf_evlist__start_workload(rec->evlist);
1505 }
1506
1507 if (opts->initial_delay) {
1508 usleep(opts->initial_delay * USEC_PER_MSEC);
1509 perf_evlist__enable(rec->evlist);
1510 }
1511
1512 trigger_ready(&auxtrace_snapshot_trigger);
1513 trigger_ready(&switch_output_trigger);
1514 perf_hooks__invoke_record_start();
1515 for (;;) {
1516 unsigned long long hits = rec->samples;
1517
1518 /*
1519 * rec->evlist->bkw_mmap_state is possible to be
1520 * BKW_MMAP_EMPTY here: when done == true and
1521 * hits != rec->samples in previous round.
1522 *
1523 * perf_evlist__toggle_bkw_mmap ensure we never
1524 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1525 */
1526 if (trigger_is_hit(&switch_output_trigger) || done || draining)
1527 perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1528
1529 if (record__mmap_read_all(rec, false) < 0) {
1530 trigger_error(&auxtrace_snapshot_trigger);
1531 trigger_error(&switch_output_trigger);
1532 err = -1;
1533 goto out_child;
1534 }
1535
1536 if (auxtrace_record__snapshot_started) {
1537 auxtrace_record__snapshot_started = 0;
1538 if (!trigger_is_error(&auxtrace_snapshot_trigger))
1539 record__read_auxtrace_snapshot(rec);
1540 if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1541 pr_err("AUX area tracing snapshot failed\n");
1542 err = -1;
1543 goto out_child;
1544 }
1545 }
1546
1547 if (trigger_is_hit(&switch_output_trigger)) {
1548 /*
1549 * If switch_output_trigger is hit, the data in
1550 * overwritable ring buffer should have been collected,
1551 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1552 *
1553 * If SIGUSR2 raise after or during record__mmap_read_all(),
1554 * record__mmap_read_all() didn't collect data from
1555 * overwritable ring buffer. Read again.
1556 */
1557 if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1558 continue;
1559 trigger_ready(&switch_output_trigger);
1560
1561 /*
1562 * Reenable events in overwrite ring buffer after
1563 * record__mmap_read_all(): we should have collected
1564 * data from it.
1565 */
1566 perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1567
1568 if (!quiet)
1569 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1570 waking);
1571 waking = 0;
1572 fd = record__switch_output(rec, false);
1573 if (fd < 0) {
1574 pr_err("Failed to switch to new file\n");
1575 trigger_error(&switch_output_trigger);
1576 err = fd;
1577 goto out_child;
1578 }
1579
1580 /* re-arm the alarm */
1581 if (rec->switch_output.time)
1582 alarm(rec->switch_output.time);
1583 }
1584
1585 if (hits == rec->samples) {
1586 if (done || draining)
1587 break;
1588 err = perf_evlist__poll(rec->evlist, -1);
1589 /*
1590 * Propagate error, only if there's any. Ignore positive
1591 * number of returned events and interrupt error.
1592 */
1593 if (err > 0 || (err < 0 && errno == EINTR))
1594 err = 0;
1595 waking++;
1596
1597 if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1598 draining = true;
1599 }
1600
1601 /*
1602 * When perf is starting the traced process, at the end events
1603 * die with the process and we wait for that. Thus no need to
1604 * disable events in this case.
1605 */
1606 if (done && !disabled && !target__none(&opts->target)) {
1607 trigger_off(&auxtrace_snapshot_trigger);
1608 perf_evlist__disable(rec->evlist);
1609 disabled = true;
1610 }
1611 }
1612 trigger_off(&auxtrace_snapshot_trigger);
1613 trigger_off(&switch_output_trigger);
1614
1615 if (forks && workload_exec_errno) {
1616 char msg[STRERR_BUFSIZE];
1617 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1618 pr_err("Workload failed: %s\n", emsg);
1619 err = -1;
1620 goto out_child;
1621 }
1622
1623 if (!quiet)
1624 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1625
1626 if (target__none(&rec->opts.target))
1627 record__synthesize_workload(rec, true);
1628
1629out_child:
1630 record__mmap_read_all(rec, true);
1631 record__aio_mmap_read_sync(rec);
1632
1633 if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1634 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
1635 session->header.env.comp_ratio = ratio + 0.5;
1636 }
1637
1638 if (forks) {
1639 int exit_status;
1640
1641 if (!child_finished)
1642 kill(rec->evlist->workload.pid, SIGTERM);
1643
1644 wait(&exit_status);
1645
1646 if (err < 0)
1647 status = err;
1648 else if (WIFEXITED(exit_status))
1649 status = WEXITSTATUS(exit_status);
1650 else if (WIFSIGNALED(exit_status))
1651 signr = WTERMSIG(exit_status);
1652 } else
1653 status = err;
1654
1655 record__synthesize(rec, true);
1656 /* this will be recalculated during process_buildids() */
1657 rec->samples = 0;
1658
1659 if (!err) {
1660 if (!rec->timestamp_filename) {
1661 record__finish_output(rec);
1662 } else {
1663 fd = record__switch_output(rec, true);
1664 if (fd < 0) {
1665 status = fd;
1666 goto out_delete_session;
1667 }
1668 }
1669 }
1670
1671 perf_hooks__invoke_record_end();
1672
1673 if (!err && !quiet) {
1674 char samples[128];
1675 const char *postfix = rec->timestamp_filename ?
1676 ".<timestamp>" : "";
1677
1678 if (rec->samples && !rec->opts.full_auxtrace)
1679 scnprintf(samples, sizeof(samples),
1680 " (%" PRIu64 " samples)", rec->samples);
1681 else
1682 samples[0] = '\0';
1683
1684 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
1685 perf_data__size(data) / 1024.0 / 1024.0,
1686 data->path, postfix, samples);
1687 if (ratio) {
1688 fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
1689 rec->session->bytes_transferred / 1024.0 / 1024.0,
1690 ratio);
1691 }
1692 fprintf(stderr, " ]\n");
1693 }
1694
1695out_delete_session:
1696 zstd_fini(&session->zstd_data);
1697 perf_session__delete(session);
1698
1699 if (!opts->no_bpf_event)
1700 perf_evlist__stop_sb_thread(sb_evlist);
1701 return status;
1702}
1703
1704static void callchain_debug(struct callchain_param *callchain)
1705{
1706 static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1707
1708 pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1709
1710 if (callchain->record_mode == CALLCHAIN_DWARF)
1711 pr_debug("callchain: stack dump size %d\n",
1712 callchain->dump_size);
1713}
1714
1715int record_opts__parse_callchain(struct record_opts *record,
1716 struct callchain_param *callchain,
1717 const char *arg, bool unset)
1718{
1719 int ret;
1720 callchain->enabled = !unset;
1721
1722 /* --no-call-graph */
1723 if (unset) {
1724 callchain->record_mode = CALLCHAIN_NONE;
1725 pr_debug("callchain: disabled\n");
1726 return 0;
1727 }
1728
1729 ret = parse_callchain_record_opt(arg, callchain);
1730 if (!ret) {
1731 /* Enable data address sampling for DWARF unwind. */
1732 if (callchain->record_mode == CALLCHAIN_DWARF)
1733 record->sample_address = true;
1734 callchain_debug(callchain);
1735 }
1736
1737 return ret;
1738}
1739
1740int record_parse_callchain_opt(const struct option *opt,
1741 const char *arg,
1742 int unset)
1743{
1744 return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1745}
1746
1747int record_callchain_opt(const struct option *opt,
1748 const char *arg __maybe_unused,
1749 int unset __maybe_unused)
1750{
1751 struct callchain_param *callchain = opt->value;
1752
1753 callchain->enabled = true;
1754
1755 if (callchain->record_mode == CALLCHAIN_NONE)
1756 callchain->record_mode = CALLCHAIN_FP;
1757
1758 callchain_debug(callchain);
1759 return 0;
1760}
1761
1762static int perf_record_config(const char *var, const char *value, void *cb)
1763{
1764 struct record *rec = cb;
1765
1766 if (!strcmp(var, "record.build-id")) {
1767 if (!strcmp(value, "cache"))
1768 rec->no_buildid_cache = false;
1769 else if (!strcmp(value, "no-cache"))
1770 rec->no_buildid_cache = true;
1771 else if (!strcmp(value, "skip"))
1772 rec->no_buildid = true;
1773 else
1774 return -1;
1775 return 0;
1776 }
1777 if (!strcmp(var, "record.call-graph")) {
1778 var = "call-graph.record-mode";
1779 return perf_default_config(var, value, cb);
1780 }
1781#ifdef HAVE_AIO_SUPPORT
1782 if (!strcmp(var, "record.aio")) {
1783 rec->opts.nr_cblocks = strtol(value, NULL, 0);
1784 if (!rec->opts.nr_cblocks)
1785 rec->opts.nr_cblocks = nr_cblocks_default;
1786 }
1787#endif
1788
1789 return 0;
1790}
1791
/* One entry of the name -> clockid lookup table used by parse_clockid(). */
struct clockid_map {
	const char *name;
	int clockid;
};

/* Table entry binding clock name n to clockid c. */
#define CLOCKID_MAP(n, c)	\
	{ .name = (n), .clockid = (c) }

/* Sentinel entry: a NULL name terminates the table. */
#define CLOCKID_END	{ .name = NULL }


/*
 * Add the missing ones, we need to build on many distros...
 */
#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW 4
#endif
#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7
#endif
#ifndef CLOCK_TAI
#define CLOCK_TAI 11
#endif

static const struct clockid_map clockids[] = {
	/* available for all events, NMI safe */
	CLOCKID_MAP("monotonic",	CLOCK_MONOTONIC),
	CLOCKID_MAP("monotonic_raw",	CLOCK_MONOTONIC_RAW),

	/* available for some events */
	CLOCKID_MAP("realtime",		CLOCK_REALTIME),
	CLOCKID_MAP("boottime",		CLOCK_BOOTTIME),
	CLOCKID_MAP("tai",		CLOCK_TAI),

	/* available for the lazy: short aliases of the names above */
	CLOCKID_MAP("mono",		CLOCK_MONOTONIC),
	CLOCKID_MAP("raw",		CLOCK_MONOTONIC_RAW),
	CLOCKID_MAP("real",		CLOCK_REALTIME),
	CLOCKID_MAP("boot",		CLOCK_BOOTTIME),

	CLOCKID_END,
};
1834
1835static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
1836{
1837 struct timespec res;
1838
1839 *res_ns = 0;
1840 if (!clock_getres(clk_id, &res))
1841 *res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
1842 else
1843 pr_warning("WARNING: Failed to determine specified clock resolution.\n");
1844
1845 return 0;
1846}
1847
1848static int parse_clockid(const struct option *opt, const char *str, int unset)
1849{
1850 struct record_opts *opts = (struct record_opts *)opt->value;
1851 const struct clockid_map *cm;
1852 const char *ostr = str;
1853
1854 if (unset) {
1855 opts->use_clockid = 0;
1856 return 0;
1857 }
1858
1859 /* no arg passed */
1860 if (!str)
1861 return 0;
1862
1863 /* no setting it twice */
1864 if (opts->use_clockid)
1865 return -1;
1866
1867 opts->use_clockid = true;
1868
1869 /* if its a number, we're done */
1870 if (sscanf(str, "%d", &opts->clockid) == 1)
1871 return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
1872
1873 /* allow a "CLOCK_" prefix to the name */
1874 if (!strncasecmp(str, "CLOCK_", 6))
1875 str += 6;
1876
1877 for (cm = clockids; cm->name; cm++) {
1878 if (!strcasecmp(str, cm->name)) {
1879 opts->clockid = cm->clockid;
1880 return get_clockid_res(opts->clockid,
1881 &opts->clockid_res_ns);
1882 }
1883 }
1884
1885 opts->use_clockid = false;
1886 ui__warning("unknown clockid %s, check man page\n", ostr);
1887 return -1;
1888}
1889
1890static int record__parse_affinity(const struct option *opt, const char *str, int unset)
1891{
1892 struct record_opts *opts = (struct record_opts *)opt->value;
1893
1894 if (unset || !str)
1895 return 0;
1896
1897 if (!strcasecmp(str, "node"))
1898 opts->affinity = PERF_AFFINITY_NODE;
1899 else if (!strcasecmp(str, "cpu"))
1900 opts->affinity = PERF_AFFINITY_CPU;
1901
1902 return 0;
1903}
1904
1905static int record__parse_mmap_pages(const struct option *opt,
1906 const char *str,
1907 int unset __maybe_unused)
1908{
1909 struct record_opts *opts = opt->value;
1910 char *s, *p;
1911 unsigned int mmap_pages;
1912 int ret;
1913
1914 if (!str)
1915 return -EINVAL;
1916
1917 s = strdup(str);
1918 if (!s)
1919 return -ENOMEM;
1920
1921 p = strchr(s, ',');
1922 if (p)
1923 *p = '\0';
1924
1925 if (*s) {
1926 ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
1927 if (ret)
1928 goto out_free;
1929 opts->mmap_pages = mmap_pages;
1930 }
1931
1932 if (!p) {
1933 ret = 0;
1934 goto out_free;
1935 }
1936
1937 ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
1938 if (ret)
1939 goto out_free;
1940
1941 opts->auxtrace_mmap_pages = mmap_pages;
1942
1943out_free:
1944 free(s);
1945 return ret;
1946}
1947
1948static void switch_output_size_warn(struct record *rec)
1949{
1950 u64 wakeup_size = perf_evlist__mmap_size(rec->opts.mmap_pages);
1951 struct switch_output *s = &rec->switch_output;
1952
1953 wakeup_size /= 2;
1954
1955 if (s->size < wakeup_size) {
1956 char buf[100];
1957
1958 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
1959 pr_warning("WARNING: switch-output data size lower than "
1960 "wakeup kernel buffer size (%s) "
1961 "expect bigger perf.data sizes\n", buf);
1962 }
1963}
1964
1965static int switch_output_setup(struct record *rec)
1966{
1967 struct switch_output *s = &rec->switch_output;
1968 static struct parse_tag tags_size[] = {
1969 { .tag = 'B', .mult = 1 },
1970 { .tag = 'K', .mult = 1 << 10 },
1971 { .tag = 'M', .mult = 1 << 20 },
1972 { .tag = 'G', .mult = 1 << 30 },
1973 { .tag = 0 },
1974 };
1975 static struct parse_tag tags_time[] = {
1976 { .tag = 's', .mult = 1 },
1977 { .tag = 'm', .mult = 60 },
1978 { .tag = 'h', .mult = 60*60 },
1979 { .tag = 'd', .mult = 60*60*24 },
1980 { .tag = 0 },
1981 };
1982 unsigned long val;
1983
1984 if (!s->set)
1985 return 0;
1986
1987 if (!strcmp(s->str, "signal")) {
1988 s->signal = true;
1989 pr_debug("switch-output with SIGUSR2 signal\n");
1990 goto enabled;
1991 }
1992
1993 val = parse_tag_value(s->str, tags_size);
1994 if (val != (unsigned long) -1) {
1995 s->size = val;
1996 pr_debug("switch-output with %s size threshold\n", s->str);
1997 goto enabled;
1998 }
1999
2000 val = parse_tag_value(s->str, tags_time);
2001 if (val != (unsigned long) -1) {
2002 s->time = val;
2003 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2004 s->str, s->time);
2005 goto enabled;
2006 }
2007
2008 return -1;
2009
2010enabled:
2011 rec->timestamp_filename = true;
2012 s->enabled = true;
2013
2014 if (s->size && !rec->opts.no_buffering)
2015 switch_output_size_warn(rec);
2016
2017 return 0;
2018}
2019
/* Usage lines for 'perf record'; the list is NULL-terminated. */
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};

/* Non-static alias of the usage list. */
const char * const *record_usage = __record_usage;
2026
2027/*
2028 * XXX Ideally would be local to cmd_record() and passed to a record__new
2029 * because we need to have access to it in record__exit, that is called
2030 * after cmd_record() exits, but since record_options need to be accessible to
2031 * builtin-script, leave it here.
2032 *
2033 * At least we don't ouch it in all the other functions here directly.
2034 *
2035 * Just say no to tons of global variables, sigh.
2036 */
2037static struct record record = {
2038 .opts = {
2039 .sample_time = true,
2040 .mmap_pages = UINT_MAX,
2041 .user_freq = UINT_MAX,
2042 .user_interval = ULLONG_MAX,
2043 .freq = 4000,
2044 .target = {
2045 .uses_mmap = true,
2046 .default_per_cpu = true,
2047 },
2048 .mmap_flush = MMAP_FLUSH_DEFAULT,
2049 },
2050 .tool = {
2051 .sample = process_sample_event,
2052 .fork = perf_event__process_fork,
2053 .exit = perf_event__process_exit,
2054 .comm = perf_event__process_comm,
2055 .namespaces = perf_event__process_namespaces,
2056 .mmap = perf_event__process_mmap,
2057 .mmap2 = perf_event__process_mmap2,
2058 .ordered_events = true,
2059 },
2060};
2061
2062const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2063 "\n\t\t\t\tDefault: fp";
2064
2065static bool dry_run;
2066
2067/*
2068 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2069 * with it and switch to use the library functions in perf_evlist that came
2070 * from builtin-record.c, i.e. use record_opts,
2071 * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
2072 * using pipes, etc.
2073 */
2074static struct option __record_options[] = {
2075 OPT_CALLBACK('e', "event", &record.evlist, "event",
2076 "event selector. use 'perf list' to list available events",
2077 parse_events_option),
2078 OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2079 "event filter", parse_filter),
2080 OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2081 NULL, "don't record events from perf itself",
2082 exclude_perf),
2083 OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2084 "record events on existing process id"),
2085 OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2086 "record events on existing thread id"),
2087 OPT_INTEGER('r', "realtime", &record.realtime_prio,
2088 "collect data with this RT SCHED_FIFO priority"),
2089 OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2090 "collect data without buffering"),
2091 OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2092 "collect raw sample records from all opened counters"),
2093 OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2094 "system-wide collection from all CPUs"),
2095 OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2096 "list of cpus to monitor"),
2097 OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2098 OPT_STRING('o', "output", &record.data.path, "file",
2099 "output file name"),
2100 OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2101 &record.opts.no_inherit_set,
2102 "child tasks do not inherit counters"),
2103 OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2104 "synthesize non-sample events at the end of output"),
2105 OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2106 OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "record bpf events"),
2107 OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2108 "Fail if the specified frequency can't be used"),
2109 OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2110 "profile at this frequency",
2111 record__parse_freq),
2112 OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2113 "number of mmap data pages and AUX area tracing mmap pages",
2114 record__parse_mmap_pages),
2115 OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2116 "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
2117 record__mmap_flush_parse),
2118 OPT_BOOLEAN(0, "group", &record.opts.group,
2119 "put the counters into a counter group"),
2120 OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2121 NULL, "enables call-graph recording" ,
2122 &record_callchain_opt),
2123 OPT_CALLBACK(0, "call-graph", &record.opts,
2124 "record_mode[,record_size]", record_callchain_help,
2125 &record_parse_callchain_opt),
2126 OPT_INCR('v', "verbose", &verbose,
2127 "be more verbose (show counter open errors, etc)"),
2128 OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2129 OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2130 "per thread counts"),
2131 OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2132 OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2133 "Record the sample physical addresses"),
2134 OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2135 OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2136 &record.opts.sample_time_set,
2137 "Record the sample timestamps"),
2138 OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2139 "Record the sample period"),
2140 OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2141 "don't sample"),
2142 OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2143 &record.no_buildid_cache_set,
2144 "do not update the buildid cache"),
2145 OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2146 &record.no_buildid_set,
2147 "do not collect buildids in perf.data"),
2148 OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2149 "monitor event in cgroup name only",
2150 parse_cgroups),
2151 OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
2152 "ms to wait before starting measurement after program start"),
2153 OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2154 "user to profile"),
2155
2156 OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2157 "branch any", "sample any taken branches",
2158 parse_branch_stack),
2159
2160 OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2161 "branch filter mask", "branch stack filter modes",
2162 parse_branch_stack),
2163 OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2164 "sample by weight (on special events only)"),
2165 OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2166 "sample transaction flags (special events only)"),
2167 OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2168 "use per-thread mmaps"),
2169 OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2170 "sample selected machine registers on interrupt,"
2171 " use '-I?' to list register names", parse_intr_regs),
2172 OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2173 "sample selected machine registers on interrupt,"
2174 " use '--user-regs=?' to list register names", parse_user_regs),
2175 OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2176 "Record running/enabled time of read (:S) events"),
2177 OPT_CALLBACK('k', "clockid", &record.opts,
2178 "clockid", "clockid to use for events, see clock_gettime()",
2179 parse_clockid),
2180 OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2181 "opts", "AUX area tracing Snapshot Mode", ""),
2182 OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2183 "per thread proc mmap processing timeout in ms"),
2184 OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2185 "Record namespaces events"),
2186 OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
2187 "Record context switch events"),
2188 OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2189 "Configure all used events to run in kernel space.",
2190 PARSE_OPT_EXCLUSIVE),
2191 OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2192 "Configure all used events to run in user space.",
2193 PARSE_OPT_EXCLUSIVE),
2194 OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2195 "collect kernel callchains"),
2196 OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2197 "collect user callchains"),
2198 OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2199 "clang binary to use for compiling BPF scriptlets"),
2200 OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2201 "options passed to clang when compiling BPF scriptlets"),
2202 OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2203 "file", "vmlinux pathname"),
2204 OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2205 "Record build-id of all DSOs regardless of hits"),
2206 OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2207 "append timestamp to output filename"),
2208 OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2209 "Record timestamp boundary (time of first/last samples)"),
2210 OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2211 &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2212 "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2213 "signal"),
2214 OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2215 "Limit number of switch output generated files"),
2216 OPT_BOOLEAN(0, "dry-run", &dry_run,
2217 "Parse options then exit"),
2218#ifdef HAVE_AIO_SUPPORT
2219 OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2220 &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2221 record__aio_parse),
2222#endif
2223 OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2224 "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2225 record__parse_affinity),
2226#ifdef HAVE_ZSTD_SUPPORT
2227 OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2228 "n", "Compressed records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
2229 record__parse_comp_level),
2230#endif
2231 OPT_END()
2232};
2233
2234struct option *record_options = __record_options;
2235
2236int cmd_record(int argc, const char **argv)
2237{
2238 int err;
2239 struct record *rec = &record;
2240 char errbuf[BUFSIZ];
2241
2242 setlocale(LC_ALL, "");
2243
2244#ifndef HAVE_LIBBPF_SUPPORT
2245# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2246 set_nobuild('\0', "clang-path", true);
2247 set_nobuild('\0', "clang-opt", true);
2248# undef set_nobuild
2249#endif
2250
2251#ifndef HAVE_BPF_PROLOGUE
2252# if !defined (HAVE_DWARF_SUPPORT)
2253# define REASON "NO_DWARF=1"
2254# elif !defined (HAVE_LIBBPF_SUPPORT)
2255# define REASON "NO_LIBBPF=1"
2256# else
2257# define REASON "this architecture doesn't support BPF prologue"
2258# endif
2259# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2260 set_nobuild('\0', "vmlinux", true);
2261# undef set_nobuild
2262# undef REASON
2263#endif
2264
2265 CPU_ZERO(&rec->affinity_mask);
2266 rec->opts.affinity = PERF_AFFINITY_SYS;
2267
2268 rec->evlist = perf_evlist__new();
2269 if (rec->evlist == NULL)
2270 return -ENOMEM;
2271
2272 err = perf_config(perf_record_config, rec);
2273 if (err)
2274 return err;
2275
2276 argc = parse_options(argc, argv, record_options, record_usage,
2277 PARSE_OPT_STOP_AT_NON_OPTION);
2278 if (quiet)
2279 perf_quiet_option();
2280
2281 /* Make system wide (-a) the default target. */
2282 if (!argc && target__none(&rec->opts.target))
2283 rec->opts.target.system_wide = true;
2284
2285 if (nr_cgroups && !rec->opts.target.system_wide) {
2286 usage_with_options_msg(record_usage, record_options,
2287 "cgroup monitoring only available in system-wide mode");
2288
2289 }
2290
2291 if (rec->opts.comp_level != 0) {
2292 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2293 rec->no_buildid = true;
2294 }
2295
2296 if (rec->opts.record_switch_events &&
2297 !perf_can_record_switch_events()) {
2298 ui__error("kernel does not support recording context switch events\n");
2299 parse_options_usage(record_usage, record_options, "switch-events", 0);
2300 return -EINVAL;
2301 }
2302
2303 if (switch_output_setup(rec)) {
2304 parse_options_usage(record_usage, record_options, "switch-output", 0);
2305 return -EINVAL;
2306 }
2307
2308 if (rec->switch_output.time) {
2309 signal(SIGALRM, alarm_sig_handler);
2310 alarm(rec->switch_output.time);
2311 }
2312
2313 if (rec->switch_output.num_files) {
2314 rec->switch_output.filenames = calloc(sizeof(char *),
2315 rec->switch_output.num_files);
2316 if (!rec->switch_output.filenames)
2317 return -EINVAL;
2318 }
2319
2320 /*
2321 * Allow aliases to facilitate the lookup of symbols for address
2322 * filters. Refer to auxtrace_parse_filters().
2323 */
2324 symbol_conf.allow_aliases = true;
2325
2326 symbol__init(NULL);
2327
2328 err = record__auxtrace_init(rec);
2329 if (err)
2330 goto out;
2331
2332 if (dry_run)
2333 goto out;
2334
2335 err = bpf__setup_stdout(rec->evlist);
2336 if (err) {
2337 bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2338 pr_err("ERROR: Setup BPF stdout failed: %s\n",
2339 errbuf);
2340 goto out;
2341 }
2342
2343 err = -ENOMEM;
2344
2345 if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(rec->evlist))
2346 pr_warning(
2347"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
2348"check /proc/sys/kernel/kptr_restrict.\n\n"
2349"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
2350"file is not found in the buildid cache or in the vmlinux path.\n\n"
2351"Samples in kernel modules won't be resolved at all.\n\n"
2352"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
2353"even with a suitable vmlinux or kallsyms file.\n\n");
2354
2355 if (rec->no_buildid_cache || rec->no_buildid) {
2356 disable_buildid_cache();
2357 } else if (rec->switch_output.enabled) {
2358 /*
2359 * In 'perf record --switch-output', disable buildid
2360 * generation by default to reduce data file switching
2361 * overhead. Still generate buildid if they are required
2362 * explicitly using
2363 *
2364 * perf record --switch-output --no-no-buildid \
2365 * --no-no-buildid-cache
2366 *
2367 * Following code equals to:
2368 *
2369 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2370 * (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2371 * disable_buildid_cache();
2372 */
2373 bool disable = true;
2374
2375 if (rec->no_buildid_set && !rec->no_buildid)
2376 disable = false;
2377 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2378 disable = false;
2379 if (disable) {
2380 rec->no_buildid = true;
2381 rec->no_buildid_cache = true;
2382 disable_buildid_cache();
2383 }
2384 }
2385
2386 if (record.opts.overwrite)
2387 record.opts.tail_synthesize = true;
2388
2389 if (rec->evlist->nr_entries == 0 &&
2390 __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2391 pr_err("Not enough memory for event selector list\n");
2392 goto out;
2393 }
2394
2395 if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2396 rec->opts.no_inherit = true;
2397
2398 err = target__validate(&rec->opts.target);
2399 if (err) {
2400 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2401 ui__warning("%s\n", errbuf);
2402 }
2403
2404 err = target__parse_uid(&rec->opts.target);
2405 if (err) {
2406 int saved_errno = errno;
2407
2408 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2409 ui__error("%s", errbuf);
2410
2411 err = -saved_errno;
2412 goto out;
2413 }
2414
2415 /* Enable ignoring missing threads when -u/-p option is defined. */
2416 rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2417
2418 err = -ENOMEM;
2419 if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2420 usage_with_options(record_usage, record_options);
2421
2422 err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2423 if (err)
2424 goto out;
2425
2426 /*
2427 * We take all buildids when the file contains
2428 * AUX area tracing data because we do not decode the
2429 * trace because it would take too long.
2430 */
2431 if (rec->opts.full_auxtrace)
2432 rec->buildid_all = true;
2433
2434 if (record_opts__config(&rec->opts)) {
2435 err = -EINVAL;
2436 goto out;
2437 }
2438
2439 if (rec->opts.nr_cblocks > nr_cblocks_max)
2440 rec->opts.nr_cblocks = nr_cblocks_max;
2441 pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2442
2443 pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2444 pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2445
2446 if (rec->opts.comp_level > comp_level_max)
2447 rec->opts.comp_level = comp_level_max;
2448 pr_debug("comp level: %d\n", rec->opts.comp_level);
2449
2450 err = __cmd_record(&record, argc, argv);
2451out:
2452 perf_evlist__delete(rec->evlist);
2453 symbol__exit();
2454 auxtrace_record__free(rec->itr);
2455 return err;
2456}
2457
2458static void snapshot_sig_handler(int sig __maybe_unused)
2459{
2460 struct record *rec = &record;
2461
2462 if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2463 trigger_hit(&auxtrace_snapshot_trigger);
2464 auxtrace_record__snapshot_started = 1;
2465 if (auxtrace_record__snapshot_start(record.itr))
2466 trigger_error(&auxtrace_snapshot_trigger);
2467 }
2468
2469 if (switch_output_signal(rec))
2470 trigger_hit(&switch_output_trigger);
2471}
2472
2473static void alarm_sig_handler(int sig __maybe_unused)
2474{
2475 struct record *rec = &record;
2476
2477 if (switch_output_time(rec))
2478 trigger_hit(&switch_output_trigger);
2479}