Merge branch 'net-Add-address-attribute-to-control-metric-of-prefix-route'
[linux-2.6-block.git] / tools / perf / builtin-trace.c
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/cgroup.h"
23 #include "util/color.h"
24 #include "util/debug.h"
25 #include "util/env.h"
26 #include "util/event.h"
27 #include "util/evlist.h"
28 #include <subcmd/exec-cmd.h>
29 #include "util/machine.h"
30 #include "util/path.h"
31 #include "util/session.h"
32 #include "util/thread.h"
33 #include <subcmd/parse-options.h>
34 #include "util/strlist.h"
35 #include "util/intlist.h"
36 #include "util/thread_map.h"
37 #include "util/stat.h"
38 #include "trace/beauty/beauty.h"
39 #include "trace-event.h"
40 #include "util/parse-events.h"
41 #include "util/bpf-loader.h"
42 #include "callchain.h"
43 #include "print_binary.h"
44 #include "string2.h"
45 #include "syscalltbl.h"
46 #include "rb_resort.h"
47
48 #include <errno.h>
49 #include <inttypes.h>
50 #include <poll.h>
51 #include <signal.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <linux/err.h>
55 #include <linux/filter.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60 #include <fcntl.h>
61
62 #include "sane_ctype.h"
63
/* Fallback value, only used when the headers included above do not define O_CLOEXEC. */
#ifndef O_CLOEXEC
# define O_CLOEXEC              02000000
#endif

/*
 * Fallback value, only used when not already defined.  It is the offset of
 * the fcntl_linux_specific_cmds string table further down in this file.
 */
#ifndef F_LINUX_SPECIFIC_BASE
# define F_LINUX_SPECIFIC_BASE  1024
#endif
71
72 struct trace {
73         struct perf_tool        tool;
74         struct syscalltbl       *sctbl;
75         struct {
76                 int             max;
77                 struct syscall  *table;
78                 struct {
79                         struct perf_evsel *sys_enter,
80                                           *sys_exit;
81                 }               events;
82         } syscalls;
83         struct record_opts      opts;
84         struct perf_evlist      *evlist;
85         struct machine          *host;
86         struct thread           *current;
87         struct cgroup           *cgroup;
88         u64                     base_time;
89         FILE                    *output;
90         unsigned long           nr_events;
91         struct strlist          *ev_qualifier;
92         struct {
93                 size_t          nr;
94                 int             *entries;
95         }                       ev_qualifier_ids;
96         struct {
97                 size_t          nr;
98                 pid_t           *entries;
99         }                       filter_pids;
100         double                  duration_filter;
101         double                  runtime_ms;
102         struct {
103                 u64             vfs_getname,
104                                 proc_getname;
105         } stats;
106         unsigned int            max_stack;
107         unsigned int            min_stack;
108         bool                    not_ev_qualifier;
109         bool                    live;
110         bool                    full_time;
111         bool                    sched;
112         bool                    multiple_threads;
113         bool                    summary;
114         bool                    summary_only;
115         bool                    failure_only;
116         bool                    show_comm;
117         bool                    print_sample;
118         bool                    show_tool_stats;
119         bool                    trace_syscalls;
120         bool                    kernel_syscallchains;
121         bool                    force;
122         bool                    vfs_getname;
123         int                     trace_pgfaults;
124         int                     open_id;
125 };
126
/*
 * Accessor for one field of a tracepoint's raw payload: the byte offset of
 * the field inside sample->raw_data plus the function used to read it.
 * Only one of the two readers is meaningful for a given field, since they
 * share storage in the anonymous union.
 */
struct tp_field {
        int offset;
        union {
                /* Reads the field as an integer, returned as u64. */
                u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
                /* Returns a pointer to the field inside the raw payload. */
                void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
        };
};
134
135 #define TP_UINT_FIELD(bits) \
136 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
137 { \
138         u##bits value; \
139         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
140         return value;  \
141 }
142
143 TP_UINT_FIELD(8);
144 TP_UINT_FIELD(16);
145 TP_UINT_FIELD(32);
146 TP_UINT_FIELD(64);
147
148 #define TP_UINT_FIELD__SWAPPED(bits) \
149 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
150 { \
151         u##bits value; \
152         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
153         return bswap_##bits(value);\
154 }
155
156 TP_UINT_FIELD__SWAPPED(16);
157 TP_UINT_FIELD__SWAPPED(32);
158 TP_UINT_FIELD__SWAPPED(64);
159
160 static int tp_field__init_uint(struct tp_field *field,
161                                struct format_field *format_field,
162                                bool needs_swap)
163 {
164         field->offset = format_field->offset;
165
166         switch (format_field->size) {
167         case 1:
168                 field->integer = tp_field__u8;
169                 break;
170         case 2:
171                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
172                 break;
173         case 4:
174                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
175                 break;
176         case 8:
177                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
178                 break;
179         default:
180                 return -1;
181         }
182
183         return 0;
184 }
185
/* Return the address of the field inside the sample's raw data (no copy is made). */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
        return sample->raw_data + field->offset;
}
190
/*
 * Set up @field as a pointer accessor for @format_field: record the offset
 * and install tp_field__ptr() as the reader.  Always returns 0.
 */
static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
        field->offset = format_field->offset;
        field->pointer = tp_field__ptr;
        return 0;
}
197
/*
 * Field accessors stored in evsel->priv for the syscall tracepoints.
 * 'args' and 'ret' live in an anonymous union, so they share storage and a
 * given instance uses only one of them.
 */
struct syscall_tp {
        struct tp_field id;
        union {
                struct tp_field args, ret;
        };
};
204
205 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
206                                           struct tp_field *field,
207                                           const char *name)
208 {
209         struct format_field *format_field = perf_evsel__field(evsel, name);
210
211         if (format_field == NULL)
212                 return -1;
213
214         return tp_field__init_uint(field, format_field, evsel->needs_swap);
215 }
216
/*
 * Initialise the member called 'name' of the struct syscall_tp kept in
 * evsel->priv as an integer accessor for the tracepoint field of the same
 * name.  Evaluates to the result of perf_evsel__init_tp_uint_field().
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
        ({ struct syscall_tp *sc = evsel->priv;\
           perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
220
/*
 * Look up the tracepoint field called @name in @evsel and initialise @field
 * as a pointer accessor for it.
 *
 * Returns -1 if the field does not exist, otherwise the result of
 * tp_field__init_ptr().
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
                                         struct tp_field *field,
                                         const char *name)
{
        struct format_field *ff = perf_evsel__field(evsel, name);

        return ff != NULL ? tp_field__init_ptr(field, ff) : -1;
}
232
/*
 * Pointer-accessor counterpart of perf_evsel__init_sc_tp_uint_field():
 * initialises the member called 'name' of the struct syscall_tp kept in
 * evsel->priv from the tracepoint field of the same name.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
        ({ struct syscall_tp *sc = evsel->priv;\
           perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
236
/* Release evsel->priv via zfree() and then delete the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
        zfree(&evsel->priv);
        perf_evsel__delete(evsel);
}
242
243 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
244 {
245         evsel->priv = malloc(sizeof(struct syscall_tp));
246         if (evsel->priv != NULL) {
247                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
248                         goto out_delete;
249
250                 evsel->handler = handler;
251                 return 0;
252         }
253
254         return -ENOMEM;
255
256 out_delete:
257         zfree(&evsel->priv);
258         return -ENOENT;
259 }
260
/*
 * Create the evsel for the raw_syscalls:<direction> tracepoint, falling back
 * to syscalls:<direction>, and initialise it with
 * perf_evsel__init_syscall_tp().
 *
 * Returns the new evsel, or NULL if neither tracepoint could be created or
 * the initialisation failed (the evsel is deleted in the latter case).
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
        struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

        if (IS_ERR(evsel)) {
                /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
                evsel = perf_evsel__newtp("syscalls", direction);
                if (IS_ERR(evsel))
                        return NULL;
        }

        if (perf_evsel__init_syscall_tp(evsel, handler) != 0) {
                perf_evsel__delete_priv(evsel);
                return NULL;
        }

        return evsel;
}
281
/*
 * Read the integer field 'name' of the struct syscall_tp kept in evsel->priv
 * from 'sample', through the accessor installed for that field.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
        ({ struct syscall_tp *fields = evsel->priv; \
           fields->name.integer(&fields->name, sample); })

/* Same as above, but goes through the pointer accessor of the field. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
        ({ struct syscall_tp *fields = evsel->priv; \
           fields->name.pointer(&fields->name, sample); })
289
290 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
291 {
292         int idx = val - sa->offset;
293
294         if (idx < 0 || idx >= sa->nr_entries)
295                 return scnprintf(bf, size, intfmt, val);
296
297         return scnprintf(bf, size, "%s", sa->entries[idx]);
298 }
299
/*
 * Format arg->val with the string table passed in arg->parm, using @intfmt
 * for values the table cannot name.
 */
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
                                                const char *intfmt,
                                                struct syscall_arg *arg)
{
        return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
}
306
/* String table formatter that prints values missing from the table as "%d". */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
                                              struct syscall_arg *arg)
{
        return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
314
/* A list of string tables, tried in order by syscall_arg__scnprintf_strarrays(). */
struct strarrays {
        int             nr_entries;
        struct strarray **entries;
};

/* Define 'strarrays__<array>' wrapping the array of struct strarray pointers 'array'. */
#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
        .nr_entries = ARRAY_SIZE(array), \
        .entries = array, \
}
324
325 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
326                                         struct syscall_arg *arg)
327 {
328         struct strarrays *sas = arg->parm;
329         int i;
330
331         for (i = 0; i < sas->nr_entries; ++i) {
332                 struct strarray *sa = sas->entries[i];
333                 int idx = arg->val - sa->offset;
334
335                 if (idx >= 0 && idx < sa->nr_entries) {
336                         if (sa->entries[idx] == NULL)
337                                 break;
338                         return scnprintf(bf, size, "%s", sa->entries[idx]);
339                 }
340         }
341
342         return scnprintf(bf, size, "%d", arg->val);
343 }
344
345 #ifndef AT_FDCWD
346 #define AT_FDCWD        -100
347 #endif
348
349 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
350                                            struct syscall_arg *arg)
351 {
352         int fd = arg->val;
353
354         if (fd == AT_FDCWD)
355                 return scnprintf(bf, size, "CWD");
356
357         return syscall_arg__scnprintf_fd(bf, size, arg);
358 }
359
360 #define SCA_FDAT syscall_arg__scnprintf_fd_at
361
362 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
363                                               struct syscall_arg *arg);
364
365 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
366
/* Format arg->val with "%#lx". */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
        return scnprintf(bf, size, "%#lx", arg->val);
}
371
/*
 * Format arg->val with "%d".
 * NOTE(review): the hex and long formatters next to this one print the same
 * member with 'l' length modifiers - confirm that "%d" matches its type.
 */
size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
        return scnprintf(bf, size, "%d", arg->val);
}
376
/* Format arg->val with "%ld". */
size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
        return scnprintf(bf, size, "%ld", arg->val);
}
381
/*
 * String tables used to turn integer syscall arguments into names.
 *
 * Each array is wrapped with DEFINE_STRARRAY()/DEFINE_STRARRAY_OFFSET()
 * (defined outside this file - presumably producing the 'strarray__<name>'
 * objects referenced below).  strarray__scnprintf() looks a value up at
 * index 'value - offset', so the order of the strings matters.
 */
static const char *bpf_cmd[] = {
        "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
        "MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* Offset 1: the first string stands for the value 1. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
        "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
        "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
        "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
        "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
        "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* "DATA" and "HOLE" are only listed when the build headers define SEEK_DATA/SEEK_HOLE. */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
        "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
        "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
        "SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
        "GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/*
 * Sparse table: the '[5] =' designator leaves slots 3 and 4 as NULL, so
 * users of this table must cope with NULL entries.
 */
static const char *fcntl_linux_specific_cmds[] = {
        "SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
        "SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
        "GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

/* Both fcntl tables, in the order in which they are searched. */
static struct strarray *fcntl_cmds_arrays[] = {
        &strarray__fcntl_cmds,
        &strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

static const char *rlimit_resources[] = {
        "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
        "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
        "RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
        "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
        "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
        "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
        "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
        "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
        "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
        "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
        "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
        "ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
462
463 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
464                                                  struct syscall_arg *arg)
465 {
466         size_t printed = 0;
467         int mode = arg->val;
468
469         if (mode == F_OK) /* 0 */
470                 return scnprintf(bf, size, "F");
471 #define P_MODE(n) \
472         if (mode & n##_OK) { \
473                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
474                 mode &= ~n##_OK; \
475         }
476
477         P_MODE(R);
478         P_MODE(W);
479         P_MODE(X);
480 #undef P_MODE
481
482         if (mode)
483                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
484
485         return printed;
486 }
487
488 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
489
490 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
491                                               struct syscall_arg *arg);
492
493 #define SCA_FILENAME syscall_arg__scnprintf_filename
494
495 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
496                                                 struct syscall_arg *arg)
497 {
498         int printed = 0, flags = arg->val;
499
500 #define P_FLAG(n) \
501         if (flags & O_##n) { \
502                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
503                 flags &= ~O_##n; \
504         }
505
506         P_FLAG(CLOEXEC);
507         P_FLAG(NONBLOCK);
508 #undef P_FLAG
509
510         if (flags)
511                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
512
513         return printed;
514 }
515
516 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
517
518 #ifndef GRND_NONBLOCK
519 #define GRND_NONBLOCK   0x0001
520 #endif
521 #ifndef GRND_RANDOM
522 #define GRND_RANDOM     0x0002
523 #endif
524
525 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
526                                                    struct syscall_arg *arg)
527 {
528         int printed = 0, flags = arg->val;
529
530 #define P_FLAG(n) \
531         if (flags & GRND_##n) { \
532                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
533                 flags &= ~GRND_##n; \
534         }
535
536         P_FLAG(RANDOM);
537         P_FLAG(NONBLOCK);
538 #undef P_FLAG
539
540         if (flags)
541                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
542
543         return printed;
544 }
545
546 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
547
/*
 * Initializer for a struct syscall_arg_fmt that formats the argument with
 * the string table 'strarray__<array>'.  The 'name' parameter is not used by
 * the expansion; at the call sites it only tells the reader which argument
 * is meant.
 */
#define STRARRAY(name, array) \
          { .scnprintf  = SCA_STRARRAY, \
            .parm       = &strarray__##array, }
551
552 #include "trace/beauty/arch_errno_names.c"
553 #include "trace/beauty/eventfd.c"
554 #include "trace/beauty/futex_op.c"
555 #include "trace/beauty/futex_val3.c"
556 #include "trace/beauty/mmap.c"
557 #include "trace/beauty/mode_t.c"
558 #include "trace/beauty/msg_flags.c"
559 #include "trace/beauty/open_flags.c"
560 #include "trace/beauty/perf_event_open.c"
561 #include "trace/beauty/pid.c"
562 #include "trace/beauty/sched_policy.c"
563 #include "trace/beauty/seccomp.c"
564 #include "trace/beauty/signum.c"
565 #include "trace/beauty/socket_type.c"
566 #include "trace/beauty/waitid_options.c"
567
/* How to pretty-print one syscall argument. */
struct syscall_arg_fmt {
        /* Formatter writing the argument into bf; NULL when not set in the table. */
        size_t     (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
        /* Extra data for the formatter, e.g. the string table set by STRARRAY(). */
        void       *parm;
        const char *name;
        bool       show_zero;
};
574
575 static struct syscall_fmt {
576         const char *name;
577         const char *alias;
578         struct syscall_arg_fmt arg[6];
579         u8         nr_args;
580         bool       errpid;
581         bool       timeout;
582         bool       hexret;
583 } syscall_fmts[] = {
584         { .name     = "access",
585           .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
586         { .name     = "bpf",
587           .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
588         { .name     = "brk",        .hexret = true,
589           .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
590         { .name     = "clock_gettime",
591           .arg = { [0] = STRARRAY(clk_id, clockid), }, },
592         { .name     = "clone",      .errpid = true, .nr_args = 5,
593           .arg = { [0] = { .name = "flags",         .scnprintf = SCA_CLONE_FLAGS, },
594                    [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
595                    [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
596                    [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
597                    [4] = { .name = "tls",           .scnprintf = SCA_HEX, }, }, },
598         { .name     = "close",
599           .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
600         { .name     = "epoll_ctl",
601           .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
602         { .name     = "eventfd2",
603           .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
604         { .name     = "fchmodat",
605           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
606         { .name     = "fchownat",
607           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
608         { .name     = "fcntl",
609           .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
610                            .parm      = &strarrays__fcntl_cmds_arrays,
611                            .show_zero = true, },
612                    [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
613         { .name     = "flock",
614           .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
615         { .name     = "fstat", .alias = "newfstat", },
616         { .name     = "fstatat", .alias = "newfstatat", },
617         { .name     = "futex",
618           .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
619                    [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
620         { .name     = "futimesat",
621           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
622         { .name     = "getitimer",
623           .arg = { [0] = STRARRAY(which, itimers), }, },
624         { .name     = "getpid",     .errpid = true, },
625         { .name     = "getpgid",    .errpid = true, },
626         { .name     = "getppid",    .errpid = true, },
627         { .name     = "getrandom",
628           .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
629         { .name     = "getrlimit",
630           .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
631         { .name     = "gettid",     .errpid = true, },
632         { .name     = "ioctl",
633           .arg = {
634 #if defined(__i386__) || defined(__x86_64__)
635 /*
636  * FIXME: Make this available to all arches.
637  */
638                    [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
639                    [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
640 #else
641                    [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
642 #endif
643         { .name     = "kcmp",       .nr_args = 5,
644           .arg = { [0] = { .name = "pid1",      .scnprintf = SCA_PID, },
645                    [1] = { .name = "pid2",      .scnprintf = SCA_PID, },
646                    [2] = { .name = "type",      .scnprintf = SCA_KCMP_TYPE, },
647                    [3] = { .name = "idx1",      .scnprintf = SCA_KCMP_IDX, },
648                    [4] = { .name = "idx2",      .scnprintf = SCA_KCMP_IDX, }, }, },
649         { .name     = "keyctl",
650           .arg = { [0] = STRARRAY(option, keyctl_options), }, },
651         { .name     = "kill",
652           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
653         { .name     = "linkat",
654           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
655         { .name     = "lseek",
656           .arg = { [2] = STRARRAY(whence, whences), }, },
657         { .name     = "lstat", .alias = "newlstat", },
658         { .name     = "madvise",
659           .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
660                    [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
661         { .name     = "mkdirat",
662           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
663         { .name     = "mknodat",
664           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
665         { .name     = "mlock",
666           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
667         { .name     = "mlockall",
668           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
669         { .name     = "mmap",       .hexret = true,
670 /* The standard mmap maps to old_mmap on s390x */
671 #if defined(__s390x__)
672         .alias = "old_mmap",
673 #endif
674           .arg = { [0] = { .scnprintf = SCA_HEX,        /* addr */ },
675                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
676                    [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, },
677         { .name     = "mprotect",
678           .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
679                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ }, }, },
680         { .name     = "mq_unlink",
681           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
682         { .name     = "mremap",     .hexret = true,
683           .arg = { [0] = { .scnprintf = SCA_HEX,          /* addr */ },
684                    [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
685                    [4] = { .scnprintf = SCA_HEX,          /* new_addr */ }, }, },
686         { .name     = "munlock",
687           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
688         { .name     = "munmap",
689           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
690         { .name     = "name_to_handle_at",
691           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
692         { .name     = "newfstatat",
693           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
694         { .name     = "open",
695           .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
696         { .name     = "open_by_handle_at",
697           .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
698                    [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
699         { .name     = "openat",
700           .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
701                    [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
702         { .name     = "perf_event_open",
703           .arg = { [2] = { .scnprintf = SCA_INT,        /* cpu */ },
704                    [3] = { .scnprintf = SCA_FD,         /* group_fd */ },
705                    [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
706         { .name     = "pipe2",
707           .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
708         { .name     = "pkey_alloc",
709           .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,   /* access_rights */ }, }, },
710         { .name     = "pkey_free",
711           .arg = { [0] = { .scnprintf = SCA_INT,        /* key */ }, }, },
712         { .name     = "pkey_mprotect",
713           .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
714                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
715                    [3] = { .scnprintf = SCA_INT,        /* pkey */ }, }, },
716         { .name     = "poll", .timeout = true, },
717         { .name     = "ppoll", .timeout = true, },
718         { .name     = "prctl", .alias = "arch_prctl",
719           .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
720                    [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
721                    [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
722         { .name     = "pread", .alias = "pread64", },
723         { .name     = "preadv", .alias = "pread", },
724         { .name     = "prlimit64",
725           .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
726         { .name     = "pwrite", .alias = "pwrite64", },
727         { .name     = "readlinkat",
728           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
729         { .name     = "recvfrom",
730           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
731         { .name     = "recvmmsg",
732           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
733         { .name     = "recvmsg",
734           .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
735         { .name     = "renameat",
736           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
737         { .name     = "rt_sigaction",
738           .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
739         { .name     = "rt_sigprocmask",
740           .arg = { [0] = STRARRAY(how, sighow), }, },
741         { .name     = "rt_sigqueueinfo",
742           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
743         { .name     = "rt_tgsigqueueinfo",
744           .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
745         { .name     = "sched_setscheduler",
746           .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
747         { .name     = "seccomp",
748           .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,    /* op */ },
749                    [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
750         { .name     = "select", .timeout = true, },
751         { .name     = "sendmmsg",
752           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
753         { .name     = "sendmsg",
754           .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
755         { .name     = "sendto",
756           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
757         { .name     = "set_tid_address", .errpid = true, },
758         { .name     = "setitimer",
759           .arg = { [0] = STRARRAY(which, itimers), }, },
760         { .name     = "setrlimit",
761           .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
762         { .name     = "socket",
763           .arg = { [0] = STRARRAY(family, socket_families),
764                    [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
765         { .name     = "socketpair",
766           .arg = { [0] = STRARRAY(family, socket_families),
767                    [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
768         { .name     = "stat", .alias = "newstat", },
769         { .name     = "statx",
770           .arg = { [0] = { .scnprintf = SCA_FDAT,        /* fdat */ },
771                    [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
772                    [3] = { .scnprintf = SCA_STATX_MASK,  /* mask */ }, }, },
773         { .name     = "swapoff",
774           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
775         { .name     = "swapon",
776           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
777         { .name     = "symlinkat",
778           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
779         { .name     = "tgkill",
780           .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
781         { .name     = "tkill",
782           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
783         { .name     = "uname", .alias = "newuname", },
784         { .name     = "unlinkat",
785           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
786         { .name     = "utimensat",
787           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
788         { .name     = "wait4",      .errpid = true,
789           .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
790         { .name     = "waitid",     .errpid = true,
791           .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
792 };
793
794 static int syscall_fmt__cmp(const void *name, const void *fmtp)
795 {
796         const struct syscall_fmt *fmt = fmtp;
797         return strcmp(name, fmt->name);
798 }
799
800 static struct syscall_fmt *syscall_fmt__find(const char *name)
801 {
802         const int nmemb = ARRAY_SIZE(syscall_fmts);
803         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
804 }
805
/*
 * Per syscall id state, one slot of trace->syscalls.table, filled in by
 * trace__read_syscall_info().
 */
struct syscall {
	/* "syscalls:sys_enter_<name>" tracepoint format, may be an ERR_PTR() */
	struct event_format *tp_format;
	/* number of arguments to print */
	int		    nr_args;
	/* first argument field of tp_format, with the syscall number field skipped */
	struct format_field *args;
	/* name as returned by syscalltbl__name() */
	const char	    *name;
	/* true for "exit" and "exit_group" */
	bool		    is_exit;
	/* matching syscall_fmts[] entry, NULL when there is none */
	struct syscall_fmt  *fmt;
	/* per argument formatters, allocated by syscall__alloc_arg_fmts() */
	struct syscall_arg_fmt *arg_fmt;
};
815
816 /*
817  * We need to have this 'calculated' boolean because in some cases we really
818  * don't know what is the duration of a syscall, for instance, when we start
819  * a session and some threads are waiting for a syscall to finish, say 'poll',
820  * in which case all we can do is to print "( ? ) for duration and for the
821  * start timestamp.
822  */
823 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
824 {
825         double duration = (double)t / NSEC_PER_MSEC;
826         size_t printed = fprintf(fp, "(");
827
828         if (!calculated)
829                 printed += fprintf(fp, "         ");
830         else if (duration >= 1.0)
831                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
832         else if (duration >= 0.01)
833                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
834         else
835                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
836         return printed + fprintf(fp, "): ");
837 }
838
839 /**
840  * filename.ptr: The filename char pointer that will be vfs_getname'd
841  * filename.entry_str_pos: Where to insert the string translated from
842  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
843  * ret_scnprintf: syscall args may set this to a different syscall return
844  *                formatter, for instance, fcntl may return fds, file flags, etc.
845  */
846 struct thread_trace {
847         u64               entry_time;
848         bool              entry_pending;
849         unsigned long     nr_events;
850         unsigned long     pfmaj, pfmin;
851         char              *entry_str;
852         double            runtime_ms;
853         size_t            (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
854         struct {
855                 unsigned long ptr;
856                 short int     entry_str_pos;
857                 bool          pending_open;
858                 unsigned int  namelen;
859                 char          *name;
860         } filename;
861         struct {
862                 int       max;
863                 char      **table;
864         } paths;
865
866         struct intlist *syscall_stats;
867 };
868
869 static struct thread_trace *thread_trace__new(void)
870 {
871         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
872
873         if (ttrace)
874                 ttrace->paths.max = -1;
875
876         ttrace->syscall_stats = intlist__new(NULL);
877
878         return ttrace;
879 }
880
881 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
882 {
883         struct thread_trace *ttrace;
884
885         if (thread == NULL)
886                 goto fail;
887
888         if (thread__priv(thread) == NULL)
889                 thread__set_priv(thread, thread_trace__new());
890
891         if (thread__priv(thread) == NULL)
892                 goto fail;
893
894         ttrace = thread__priv(thread);
895         ++ttrace->nr_events;
896
897         return ttrace;
898 fail:
899         color_fprintf(fp, PERF_COLOR_RED,
900                       "WARNING: not enough memory, dropping samples!\n");
901         return NULL;
902 }
903
904
905 void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
906                                     size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
907 {
908         struct thread_trace *ttrace = thread__priv(arg->thread);
909
910         ttrace->ret_scnprintf = ret_scnprintf;
911 }
912
/*
 * Single bit flags.
 * NOTE(review): presumably these select major/minor page fault tracing,
 * confirm against the users of these macros.
 */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Size in bytes of the buffer allocated for thread_trace->entry_str. */
static const size_t trace__entry_str_size = 2048;
917
918 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
919 {
920         struct thread_trace *ttrace = thread__priv(thread);
921
922         if (fd > ttrace->paths.max) {
923                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
924
925                 if (npath == NULL)
926                         return -1;
927
928                 if (ttrace->paths.max != -1) {
929                         memset(npath + ttrace->paths.max + 1, 0,
930                                (fd - ttrace->paths.max) * sizeof(char *));
931                 } else {
932                         memset(npath, 0, (fd + 1) * sizeof(char *));
933                 }
934
935                 ttrace->paths.table = npath;
936                 ttrace->paths.max   = fd;
937         }
938
939         ttrace->paths.table[fd] = strdup(pathname);
940
941         return ttrace->paths.table[fd] != NULL ? 0 : -1;
942 }
943
944 static int thread__read_fd_path(struct thread *thread, int fd)
945 {
946         char linkname[PATH_MAX], pathname[PATH_MAX];
947         struct stat st;
948         int ret;
949
950         if (thread->pid_ == thread->tid) {
951                 scnprintf(linkname, sizeof(linkname),
952                           "/proc/%d/fd/%d", thread->pid_, fd);
953         } else {
954                 scnprintf(linkname, sizeof(linkname),
955                           "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
956         }
957
958         if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
959                 return -1;
960
961         ret = readlink(linkname, pathname, sizeof(pathname));
962
963         if (ret < 0 || ret > st.st_size)
964                 return -1;
965
966         pathname[ret] = '\0';
967         return trace__set_fd_pathname(thread, fd, pathname);
968 }
969
970 static const char *thread__fd_path(struct thread *thread, int fd,
971                                    struct trace *trace)
972 {
973         struct thread_trace *ttrace = thread__priv(thread);
974
975         if (ttrace == NULL)
976                 return NULL;
977
978         if (fd < 0)
979                 return NULL;
980
981         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
982                 if (!trace->live)
983                         return NULL;
984                 ++trace->stats.proc_getname;
985                 if (thread__read_fd_path(thread, fd))
986                         return NULL;
987         }
988
989         return ttrace->paths.table[fd];
990 }
991
992 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
993 {
994         int fd = arg->val;
995         size_t printed = scnprintf(bf, size, "%d", fd);
996         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
997
998         if (path)
999                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1000
1001         return printed;
1002 }
1003
1004 size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1005 {
1006         size_t printed = scnprintf(bf, size, "%d", fd);
1007         struct thread *thread = machine__find_thread(trace->host, pid, pid);
1008
1009         if (thread) {
1010                 const char *path = thread__fd_path(thread, fd, trace);
1011
1012                 if (path)
1013                         printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1014
1015                 thread__put(thread);
1016         }
1017
1018         return printed;
1019 }
1020
1021 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1022                                               struct syscall_arg *arg)
1023 {
1024         int fd = arg->val;
1025         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1026         struct thread_trace *ttrace = thread__priv(arg->thread);
1027
1028         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1029                 zfree(&ttrace->paths.table[fd]);
1030
1031         return printed;
1032 }
1033
1034 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1035                                      unsigned long ptr)
1036 {
1037         struct thread_trace *ttrace = thread__priv(thread);
1038
1039         ttrace->filename.ptr = ptr;
1040         ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1041 }
1042
1043 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1044                                               struct syscall_arg *arg)
1045 {
1046         unsigned long ptr = arg->val;
1047
1048         if (!arg->trace->vfs_getname)
1049                 return scnprintf(bf, size, "%#x", ptr);
1050
1051         thread__set_filename_pos(arg->thread, bf, ptr);
1052         return 0;
1053 }
1054
1055 static bool trace__filter_duration(struct trace *trace, double t)
1056 {
1057         return t < (trace->duration_filter * NSEC_PER_MSEC);
1058 }
1059
1060 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1061 {
1062         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1063
1064         return fprintf(fp, "%10.3f ", ts);
1065 }
1066
1067 /*
1068  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1069  * using ttrace->entry_time for a thread that receives a sys_exit without
1070  * first having received a sys_enter ("poll" issued before tracing session
1071  * starts, lost sys_enter exit due to ring buffer overflow).
1072  */
1073 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1074 {
1075         if (tstamp > 0)
1076                 return __trace__fprintf_tstamp(trace, tstamp, fp);
1077
1078         return fprintf(fp, "         ? ");
1079 }
1080
/*
 * Flags set from sig_handler().  They are written asynchronously from a
 * signal handler, so they are volatile sig_atomic_t rather than plain bool.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

/*
 * Signal handler: ask for termination and remember whether it was SIGINT
 * that asked for it.
 */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1089
1090 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1091                                         u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1092 {
1093         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1094         printed += fprintf_duration(duration, duration_calculated, fp);
1095
1096         if (trace->multiple_threads) {
1097                 if (trace->show_comm)
1098                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1099                 printed += fprintf(fp, "%d ", thread->tid);
1100         }
1101
1102         return printed;
1103 }
1104
1105 static int trace__process_event(struct trace *trace, struct machine *machine,
1106                                 union perf_event *event, struct perf_sample *sample)
1107 {
1108         int ret = 0;
1109
1110         switch (event->header.type) {
1111         case PERF_RECORD_LOST:
1112                 color_fprintf(trace->output, PERF_COLOR_RED,
1113                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1114                 ret = machine__process_lost_event(machine, event, sample);
1115                 break;
1116         default:
1117                 ret = machine__process_event(machine, event, sample);
1118                 break;
1119         }
1120
1121         return ret;
1122 }
1123
1124 static int trace__tool_process(struct perf_tool *tool,
1125                                union perf_event *event,
1126                                struct perf_sample *sample,
1127                                struct machine *machine)
1128 {
1129         struct trace *trace = container_of(tool, struct trace, tool);
1130         return trace__process_event(trace, machine, event, sample);
1131 }
1132
1133 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1134 {
1135         struct machine *machine = vmachine;
1136
1137         if (machine->kptr_restrict_warned)
1138                 return NULL;
1139
1140         if (symbol_conf.kptr_restrict) {
1141                 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1142                            "Check /proc/sys/kernel/kptr_restrict.\n\n"
1143                            "Kernel samples will not be resolved.\n");
1144                 machine->kptr_restrict_warned = true;
1145                 return NULL;
1146         }
1147
1148         return machine__resolve_kernel_addr(vmachine, addrp, modp);
1149 }
1150
1151 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1152 {
1153         int err = symbol__init(NULL);
1154
1155         if (err)
1156                 return err;
1157
1158         trace->host = machine__new_host();
1159         if (trace->host == NULL)
1160                 return -ENOMEM;
1161
1162         err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1163         if (err < 0)
1164                 goto out;
1165
1166         err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1167                                             evlist->threads, trace__tool_process, false,
1168                                             trace->opts.proc_map_timeout, 1);
1169 out:
1170         if (err)
1171                 symbol__exit();
1172
1173         return err;
1174 }
1175
1176 static void trace__symbols__exit(struct trace *trace)
1177 {
1178         machine__exit(trace->host);
1179         trace->host = NULL;
1180
1181         symbol__exit();
1182 }
1183
1184 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1185 {
1186         int idx;
1187
1188         if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1189                 nr_args = sc->fmt->nr_args;
1190
1191         sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1192         if (sc->arg_fmt == NULL)
1193                 return -1;
1194
1195         for (idx = 0; idx < nr_args; ++idx) {
1196                 if (sc->fmt)
1197                         sc->arg_fmt[idx] = sc->fmt->arg[idx];
1198         }
1199
1200         sc->nr_args = nr_args;
1201         return 0;
1202 }
1203
1204 static int syscall__set_arg_fmts(struct syscall *sc)
1205 {
1206         struct format_field *field;
1207         int idx = 0, len;
1208
1209         for (field = sc->args; field; field = field->next, ++idx) {
1210                 if (sc->fmt && sc->fmt->arg[idx].scnprintf)
1211                         continue;
1212
1213                 if (strcmp(field->type, "const char *") == 0 &&
1214                          (strcmp(field->name, "filename") == 0 ||
1215                           strcmp(field->name, "path") == 0 ||
1216                           strcmp(field->name, "pathname") == 0))
1217                         sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
1218                 else if (field->flags & FIELD_IS_POINTER)
1219                         sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
1220                 else if (strcmp(field->type, "pid_t") == 0)
1221                         sc->arg_fmt[idx].scnprintf = SCA_PID;
1222                 else if (strcmp(field->type, "umode_t") == 0)
1223                         sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
1224                 else if ((strcmp(field->type, "int") == 0 ||
1225                           strcmp(field->type, "unsigned int") == 0 ||
1226                           strcmp(field->type, "long") == 0) &&
1227                          (len = strlen(field->name)) >= 2 &&
1228                          strcmp(field->name + len - 2, "fd") == 0) {
1229                         /*
1230                          * /sys/kernel/tracing/events/syscalls/sys_enter*
1231                          * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1232                          * 65 int
1233                          * 23 unsigned int
1234                          * 7 unsigned long
1235                          */
1236                         sc->arg_fmt[idx].scnprintf = SCA_FD;
1237                 }
1238         }
1239
1240         return 0;
1241 }
1242
1243 static int trace__read_syscall_info(struct trace *trace, int id)
1244 {
1245         char tp_name[128];
1246         struct syscall *sc;
1247         const char *name = syscalltbl__name(trace->sctbl, id);
1248
1249         if (name == NULL)
1250                 return -1;
1251
1252         if (id > trace->syscalls.max) {
1253                 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1254
1255                 if (nsyscalls == NULL)
1256                         return -1;
1257
1258                 if (trace->syscalls.max != -1) {
1259                         memset(nsyscalls + trace->syscalls.max + 1, 0,
1260                                (id - trace->syscalls.max) * sizeof(*sc));
1261                 } else {
1262                         memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1263                 }
1264
1265                 trace->syscalls.table = nsyscalls;
1266                 trace->syscalls.max   = id;
1267         }
1268
1269         sc = trace->syscalls.table + id;
1270         sc->name = name;
1271
1272         sc->fmt  = syscall_fmt__find(sc->name);
1273
1274         snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1275         sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1276
1277         if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1278                 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1279                 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1280         }
1281
1282         if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
1283                 return -1;
1284
1285         if (IS_ERR(sc->tp_format))
1286                 return -1;
1287
1288         sc->args = sc->tp_format->format.fields;
1289         /*
1290          * We need to check and discard the first variable '__syscall_nr'
1291          * or 'nr' that mean the syscall number. It is needless here.
1292          * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1293          */
1294         if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1295                 sc->args = sc->args->next;
1296                 --sc->nr_args;
1297         }
1298
1299         sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1300
1301         return syscall__set_arg_fmts(sc);
1302 }
1303
1304 static int trace__validate_ev_qualifier(struct trace *trace)
1305 {
1306         int err = 0, i;
1307         size_t nr_allocated;
1308         struct str_node *pos;
1309
1310         trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1311         trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1312                                                  sizeof(trace->ev_qualifier_ids.entries[0]));
1313
1314         if (trace->ev_qualifier_ids.entries == NULL) {
1315                 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1316                        trace->output);
1317                 err = -EINVAL;
1318                 goto out;
1319         }
1320
1321         nr_allocated = trace->ev_qualifier_ids.nr;
1322         i = 0;
1323
1324         strlist__for_each_entry(pos, trace->ev_qualifier) {
1325                 const char *sc = pos->s;
1326                 int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;
1327
1328                 if (id < 0) {
1329                         id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
1330                         if (id >= 0)
1331                                 goto matches;
1332
1333                         if (err == 0) {
1334                                 fputs("Error:\tInvalid syscall ", trace->output);
1335                                 err = -EINVAL;
1336                         } else {
1337                                 fputs(", ", trace->output);
1338                         }
1339
1340                         fputs(sc, trace->output);
1341                 }
1342 matches:
1343                 trace->ev_qualifier_ids.entries[i++] = id;
1344                 if (match_next == -1)
1345                         continue;
1346
1347                 while (1) {
1348                         id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
1349                         if (id < 0)
1350                                 break;
1351                         if (nr_allocated == trace->ev_qualifier_ids.nr) {
1352                                 void *entries;
1353
1354                                 nr_allocated += 8;
1355                                 entries = realloc(trace->ev_qualifier_ids.entries,
1356                                                   nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
1357                                 if (entries == NULL) {
1358                                         err = -ENOMEM;
1359                                         fputs("\nError:\t Not enough memory for parsing\n", trace->output);
1360                                         goto out_free;
1361                                 }
1362                                 trace->ev_qualifier_ids.entries = entries;
1363                         }
1364                         trace->ev_qualifier_ids.nr++;
1365                         trace->ev_qualifier_ids.entries[i++] = id;
1366                 }
1367         }
1368
1369         if (err < 0) {
1370                 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1371                       "\nHint:\tand: 'man syscalls'\n", trace->output);
1372 out_free:
1373                 zfree(&trace->ev_qualifier_ids.entries);
1374                 trace->ev_qualifier_ids.nr = 0;
1375         }
1376 out:
1377         return err;
1378 }
1379
1380 /*
1381  * args is to be interpreted as a series of longs but we need to handle
1382  * 8-byte unaligned accesses. args points to raw_data within the event
1383  * and raw_data is guaranteed to be 8-byte unaligned because it is
1384  * preceded by raw_size which is a u32. So we need to copy args to a temp
1385  * variable to read it. Most notably this avoids extended load instructions
1386  * on unaligned addresses
1387  */
1388 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1389 {
1390         unsigned long val;
1391         unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1392
1393         memcpy(&val, p, sizeof(val));
1394         return val;
1395 }
1396
1397 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1398                                       struct syscall_arg *arg)
1399 {
1400         if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1401                 return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1402
1403         return scnprintf(bf, size, "arg%d: ", arg->idx);
1404 }
1405
1406 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1407                                      struct syscall_arg *arg, unsigned long val)
1408 {
1409         if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1410                 arg->val = val;
1411                 if (sc->arg_fmt[arg->idx].parm)
1412                         arg->parm = sc->arg_fmt[arg->idx].parm;
1413                 return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1414         }
1415         return scnprintf(bf, size, "%ld", val);
1416 }
1417
1418 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1419                                       unsigned char *args, struct trace *trace,
1420                                       struct thread *thread)
1421 {
1422         size_t printed = 0;
1423         unsigned long val;
1424         u8 bit = 1;
1425         struct syscall_arg arg = {
1426                 .args   = args,
1427                 .idx    = 0,
1428                 .mask   = 0,
1429                 .trace  = trace,
1430                 .thread = thread,
1431         };
1432         struct thread_trace *ttrace = thread__priv(thread);
1433
1434         /*
1435          * Things like fcntl will set this in its 'cmd' formatter to pick the
1436          * right formatter for the return value (an fd? file flags?), which is
1437          * not needed for syscalls that always return a given type, say an fd.
1438          */
1439         ttrace->ret_scnprintf = NULL;
1440
1441         if (sc->args != NULL) {
1442                 struct format_field *field;
1443
1444                 for (field = sc->args; field;
1445                      field = field->next, ++arg.idx, bit <<= 1) {
1446                         if (arg.mask & bit)
1447                                 continue;
1448
1449                         val = syscall_arg__val(&arg, arg.idx);
1450
1451                         /*
1452                          * Suppress this argument if its value is zero and
1453                          * and we don't have a string associated in an
1454                          * strarray for it.
1455                          */
1456                         if (val == 0 &&
1457                             !(sc->arg_fmt &&
1458                               (sc->arg_fmt[arg.idx].show_zero ||
1459                                sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
1460                                sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
1461                               sc->arg_fmt[arg.idx].parm))
1462                                 continue;
1463
1464                         printed += scnprintf(bf + printed, size - printed,
1465                                              "%s%s: ", printed ? ", " : "", field->name);
1466                         printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1467                 }
1468         } else if (IS_ERR(sc->tp_format)) {
1469                 /*
1470                  * If we managed to read the tracepoint /format file, then we
1471                  * may end up not having any args, like with gettid(), so only
1472                  * print the raw args when we didn't manage to read it.
1473                  */
1474                 while (arg.idx < sc->nr_args) {
1475                         if (arg.mask & bit)
1476                                 goto next_arg;
1477                         val = syscall_arg__val(&arg, arg.idx);
1478                         if (printed)
1479                                 printed += scnprintf(bf + printed, size - printed, ", ");
1480                         printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
1481                         printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1482 next_arg:
1483                         ++arg.idx;
1484                         bit <<= 1;
1485                 }
1486         }
1487
1488         return printed;
1489 }
1490
/* Signature shared by the per tracepoint sample handlers. */
typedef int (*tracepoint_handler)(struct trace *trace,
				  struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1494
1495 static struct syscall *trace__syscall_info(struct trace *trace,
1496                                            struct perf_evsel *evsel, int id)
1497 {
1498
1499         if (id < 0) {
1500
1501                 /*
1502                  * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1503                  * before that, leaving at a higher verbosity level till that is
1504                  * explained. Reproduced with plain ftrace with:
1505                  *
1506                  * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1507                  * grep "NR -1 " /t/trace_pipe
1508                  *
1509                  * After generating some load on the machine.
1510                  */
1511                 if (verbose > 1) {
1512                         static u64 n;
1513                         fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1514                                 id, perf_evsel__name(evsel), ++n);
1515                 }
1516                 return NULL;
1517         }
1518
1519         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1520             trace__read_syscall_info(trace, id))
1521                 goto out_cant_read;
1522
1523         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1524                 goto out_cant_read;
1525
1526         return &trace->syscalls.table[id];
1527
1528 out_cant_read:
1529         if (verbose > 0) {
1530                 fprintf(trace->output, "Problems reading syscall %d", id);
1531                 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1532                         fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1533                 fputs(" information\n", trace->output);
1534         }
1535         return NULL;
1536 }
1537
1538 static void thread__update_stats(struct thread_trace *ttrace,
1539                                  int id, struct perf_sample *sample)
1540 {
1541         struct int_node *inode;
1542         struct stats *stats;
1543         u64 duration = 0;
1544
1545         inode = intlist__findnew(ttrace->syscall_stats, id);
1546         if (inode == NULL)
1547                 return;
1548
1549         stats = inode->priv;
1550         if (stats == NULL) {
1551                 stats = malloc(sizeof(struct stats));
1552                 if (stats == NULL)
1553                         return;
1554                 init_stats(stats);
1555                 inode->priv = stats;
1556         }
1557
1558         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1559                 duration = sample->time - ttrace->entry_time;
1560
1561         update_stats(stats, duration);
1562 }
1563
1564 static int trace__printf_interrupted_entry(struct trace *trace)
1565 {
1566         struct thread_trace *ttrace;
1567         size_t printed;
1568
1569         if (trace->failure_only || trace->current == NULL)
1570                 return 0;
1571
1572         ttrace = thread__priv(trace->current);
1573
1574         if (!ttrace->entry_pending)
1575                 return 0;
1576
1577         printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1578         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1579         ttrace->entry_pending = false;
1580
1581         return printed;
1582 }
1583
1584 static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1585                                  struct perf_sample *sample, struct thread *thread)
1586 {
1587         int printed = 0;
1588
1589         if (trace->print_sample) {
1590                 double ts = (double)sample->time / NSEC_PER_MSEC;
1591
1592                 printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1593                                    perf_evsel__name(evsel), ts,
1594                                    thread__comm_str(thread),
1595                                    sample->pid, sample->tid, sample->cpu);
1596         }
1597
1598         return printed;
1599 }
1600
1601 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1602                             union perf_event *event __maybe_unused,
1603                             struct perf_sample *sample)
1604 {
1605         char *msg;
1606         void *args;
1607         size_t printed = 0;
1608         struct thread *thread;
1609         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1610         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1611         struct thread_trace *ttrace;
1612
1613         if (sc == NULL)
1614                 return -1;
1615
1616         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1617         ttrace = thread__trace(thread, trace->output);
1618         if (ttrace == NULL)
1619                 goto out_put;
1620
1621         trace__fprintf_sample(trace, evsel, sample, thread);
1622
1623         args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1624
1625         if (ttrace->entry_str == NULL) {
1626                 ttrace->entry_str = malloc(trace__entry_str_size);
1627                 if (!ttrace->entry_str)
1628                         goto out_put;
1629         }
1630
1631         if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1632                 trace__printf_interrupted_entry(trace);
1633
1634         ttrace->entry_time = sample->time;
1635         msg = ttrace->entry_str;
1636         printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1637
1638         printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1639                                            args, trace, thread);
1640
1641         if (sc->is_exit) {
1642                 if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
1643                         trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1644                         fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1645                 }
1646         } else {
1647                 ttrace->entry_pending = true;
1648                 /* See trace__vfs_getname & trace__sys_exit */
1649                 ttrace->filename.pending_open = false;
1650         }
1651
1652         if (trace->current != thread) {
1653                 thread__put(trace->current);
1654                 trace->current = thread__get(thread);
1655         }
1656         err = 0;
1657 out_put:
1658         thread__put(thread);
1659         return err;
1660 }
1661
1662 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1663                                     struct perf_sample *sample,
1664                                     struct callchain_cursor *cursor)
1665 {
1666         struct addr_location al;
1667         int max_stack = evsel->attr.sample_max_stack ?
1668                         evsel->attr.sample_max_stack :
1669                         trace->max_stack;
1670
1671         if (machine__resolve(trace->host, &al, sample) < 0 ||
1672             thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
1673                 return -1;
1674
1675         return 0;
1676 }
1677
1678 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1679 {
1680         /* TODO: user-configurable print_opts */
1681         const unsigned int print_opts = EVSEL__PRINT_SYM |
1682                                         EVSEL__PRINT_DSO |
1683                                         EVSEL__PRINT_UNKNOWN_AS_ADDR;
1684
1685         return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1686 }
1687
/*
 * Translate the errno value @err into its name, using the errno table of
 * the architecture recorded in the perf_env associated with @evsel.
 */
static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
	const char *arch = perf_env__arch(perf_evsel__env(evsel));

	return arch_syscalls__strerrno(arch, err);
}
1695
1696 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1697                            union perf_event *event __maybe_unused,
1698                            struct perf_sample *sample)
1699 {
1700         long ret;
1701         u64 duration = 0;
1702         bool duration_calculated = false;
1703         struct thread *thread;
1704         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1705         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1706         struct thread_trace *ttrace;
1707
1708         if (sc == NULL)
1709                 return -1;
1710
1711         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1712         ttrace = thread__trace(thread, trace->output);
1713         if (ttrace == NULL)
1714                 goto out_put;
1715
1716         trace__fprintf_sample(trace, evsel, sample, thread);
1717
1718         if (trace->summary)
1719                 thread__update_stats(ttrace, id, sample);
1720
1721         ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1722
1723         if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1724                 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1725                 ttrace->filename.pending_open = false;
1726                 ++trace->stats.vfs_getname;
1727         }
1728
1729         if (ttrace->entry_time) {
1730                 duration = sample->time - ttrace->entry_time;
1731                 if (trace__filter_duration(trace, duration))
1732                         goto out;
1733                 duration_calculated = true;
1734         } else if (trace->duration_filter)
1735                 goto out;
1736
1737         if (sample->callchain) {
1738                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1739                 if (callchain_ret == 0) {
1740                         if (callchain_cursor.nr < trace->min_stack)
1741                                 goto out;
1742                         callchain_ret = 1;
1743                 }
1744         }
1745
1746         if (trace->summary_only || (ret >= 0 && trace->failure_only))
1747                 goto out;
1748
1749         trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1750
1751         if (ttrace->entry_pending) {
1752                 fprintf(trace->output, "%-70s", ttrace->entry_str);
1753         } else {
1754                 fprintf(trace->output, " ... [");
1755                 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1756                 fprintf(trace->output, "]: %s()", sc->name);
1757         }
1758
1759         if (sc->fmt == NULL) {
1760                 if (ret < 0)
1761                         goto errno_print;
1762 signed_print:
1763                 fprintf(trace->output, ") = %ld", ret);
1764         } else if (ret < 0) {
1765 errno_print: {
1766                 char bf[STRERR_BUFSIZE];
1767                 const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1768                            *e = errno_to_name(evsel, -ret);
1769
1770                 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1771         }
1772         } else if (ret == 0 && sc->fmt->timeout)
1773                 fprintf(trace->output, ") = 0 Timeout");
1774         else if (ttrace->ret_scnprintf) {
1775                 char bf[1024];
1776                 struct syscall_arg arg = {
1777                         .val    = ret,
1778                         .thread = thread,
1779                         .trace  = trace,
1780                 };
1781                 ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
1782                 ttrace->ret_scnprintf = NULL;
1783                 fprintf(trace->output, ") = %s", bf);
1784         } else if (sc->fmt->hexret)
1785                 fprintf(trace->output, ") = %#lx", ret);
1786         else if (sc->fmt->errpid) {
1787                 struct thread *child = machine__find_thread(trace->host, ret, ret);
1788
1789                 if (child != NULL) {
1790                         fprintf(trace->output, ") = %ld", ret);
1791                         if (child->comm_set)
1792                                 fprintf(trace->output, " (%s)", thread__comm_str(child));
1793                         thread__put(child);
1794                 }
1795         } else
1796                 goto signed_print;
1797
1798         fputc('\n', trace->output);
1799
1800         if (callchain_ret > 0)
1801                 trace__fprintf_callchain(trace, sample);
1802         else if (callchain_ret < 0)
1803                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1804 out:
1805         ttrace->entry_pending = false;
1806         err = 0;
1807 out_put:
1808         thread__put(thread);
1809         return err;
1810 }
1811
1812 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1813                               union perf_event *event __maybe_unused,
1814                               struct perf_sample *sample)
1815 {
1816         struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1817         struct thread_trace *ttrace;
1818         size_t filename_len, entry_str_len, to_move;
1819         ssize_t remaining_space;
1820         char *pos;
1821         const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1822
1823         if (!thread)
1824                 goto out;
1825
1826         ttrace = thread__priv(thread);
1827         if (!ttrace)
1828                 goto out_put;
1829
1830         filename_len = strlen(filename);
1831         if (filename_len == 0)
1832                 goto out_put;
1833
1834         if (ttrace->filename.namelen < filename_len) {
1835                 char *f = realloc(ttrace->filename.name, filename_len + 1);
1836
1837                 if (f == NULL)
1838                         goto out_put;
1839
1840                 ttrace->filename.namelen = filename_len;
1841                 ttrace->filename.name = f;
1842         }
1843
1844         strcpy(ttrace->filename.name, filename);
1845         ttrace->filename.pending_open = true;
1846
1847         if (!ttrace->filename.ptr)
1848                 goto out_put;
1849
1850         entry_str_len = strlen(ttrace->entry_str);
1851         remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1852         if (remaining_space <= 0)
1853                 goto out_put;
1854
1855         if (filename_len > (size_t)remaining_space) {
1856                 filename += filename_len - remaining_space;
1857                 filename_len = remaining_space;
1858         }
1859
1860         to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1861         pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1862         memmove(pos + filename_len, pos, to_move);
1863         memcpy(pos, filename, filename_len);
1864
1865         ttrace->filename.ptr = 0;
1866         ttrace->filename.entry_str_pos = 0;
1867 out_put:
1868         thread__put(thread);
1869 out:
1870         return 0;
1871 }
1872
1873 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1874                                      union perf_event *event __maybe_unused,
1875                                      struct perf_sample *sample)
1876 {
1877         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1878         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1879         struct thread *thread = machine__findnew_thread(trace->host,
1880                                                         sample->pid,
1881                                                         sample->tid);
1882         struct thread_trace *ttrace = thread__trace(thread, trace->output);
1883
1884         if (ttrace == NULL)
1885                 goto out_dump;
1886
1887         ttrace->runtime_ms += runtime_ms;
1888         trace->runtime_ms += runtime_ms;
1889 out_put:
1890         thread__put(thread);
1891         return 0;
1892
1893 out_dump:
1894         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1895                evsel->name,
1896                perf_evsel__strval(evsel, sample, "comm"),
1897                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1898                runtime,
1899                perf_evsel__intval(evsel, sample, "vruntime"));
1900         goto out_put;
1901 }
1902
1903 static int bpf_output__printer(enum binary_printer_ops op,
1904                                unsigned int val, void *extra __maybe_unused, FILE *fp)
1905 {
1906         unsigned char ch = (unsigned char)val;
1907
1908         switch (op) {
1909         case BINARY_PRINT_CHAR_DATA:
1910                 return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1911         case BINARY_PRINT_DATA_BEGIN:
1912         case BINARY_PRINT_LINE_BEGIN:
1913         case BINARY_PRINT_ADDR:
1914         case BINARY_PRINT_NUM_DATA:
1915         case BINARY_PRINT_NUM_PAD:
1916         case BINARY_PRINT_SEP:
1917         case BINARY_PRINT_CHAR_PAD:
1918         case BINARY_PRINT_LINE_END:
1919         case BINARY_PRINT_DATA_END:
1920         default:
1921                 break;
1922         }
1923
1924         return 0;
1925 }
1926
1927 static void bpf_output__fprintf(struct trace *trace,
1928                                 struct perf_sample *sample)
1929 {
1930         binary__fprintf(sample->raw_data, sample->raw_size, 8,
1931                         bpf_output__printer, NULL, trace->output);
1932 }
1933
1934 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1935                                 union perf_event *event __maybe_unused,
1936                                 struct perf_sample *sample)
1937 {
1938         int callchain_ret = 0;
1939
1940         if (sample->callchain) {
1941                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1942                 if (callchain_ret == 0) {
1943                         if (callchain_cursor.nr < trace->min_stack)
1944                                 goto out;
1945                         callchain_ret = 1;
1946                 }
1947         }
1948
1949         trace__printf_interrupted_entry(trace);
1950         trace__fprintf_tstamp(trace, sample->time, trace->output);
1951
1952         if (trace->trace_syscalls)
1953                 fprintf(trace->output, "(         ): ");
1954
1955         fprintf(trace->output, "%s:", evsel->name);
1956
1957         if (perf_evsel__is_bpf_output(evsel)) {
1958                 bpf_output__fprintf(trace, sample);
1959         } else if (evsel->tp_format) {
1960                 event_format__fprintf(evsel->tp_format, sample->cpu,
1961                                       sample->raw_data, sample->raw_size,
1962                                       trace->output);
1963         }
1964
1965         fprintf(trace->output, "\n");
1966
1967         if (callchain_ret > 0)
1968                 trace__fprintf_callchain(trace, sample);
1969         else if (callchain_ret < 0)
1970                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1971 out:
1972         return 0;
1973 }
1974
1975 static void print_location(FILE *f, struct perf_sample *sample,
1976                            struct addr_location *al,
1977                            bool print_dso, bool print_sym)
1978 {
1979
1980         if ((verbose > 0 || print_dso) && al->map)
1981                 fprintf(f, "%s@", al->map->dso->long_name);
1982
1983         if ((verbose > 0 || print_sym) && al->sym)
1984                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1985                         al->addr - al->sym->start);
1986         else if (al->map)
1987                 fprintf(f, "0x%" PRIx64, al->addr);
1988         else
1989                 fprintf(f, "0x%" PRIx64, sample->addr);
1990 }
1991
1992 static int trace__pgfault(struct trace *trace,
1993                           struct perf_evsel *evsel,
1994                           union perf_event *event __maybe_unused,
1995                           struct perf_sample *sample)
1996 {
1997         struct thread *thread;
1998         struct addr_location al;
1999         char map_type = 'd';
2000         struct thread_trace *ttrace;
2001         int err = -1;
2002         int callchain_ret = 0;
2003
2004         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2005
2006         if (sample->callchain) {
2007                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2008                 if (callchain_ret == 0) {
2009                         if (callchain_cursor.nr < trace->min_stack)
2010                                 goto out_put;
2011                         callchain_ret = 1;
2012                 }
2013         }
2014
2015         ttrace = thread__trace(thread, trace->output);
2016         if (ttrace == NULL)
2017                 goto out_put;
2018
2019         if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2020                 ttrace->pfmaj++;
2021         else
2022                 ttrace->pfmin++;
2023
2024         if (trace->summary_only)
2025                 goto out;
2026
2027         thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
2028                               sample->ip, &al);
2029
2030         trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2031
2032         fprintf(trace->output, "%sfault [",
2033                 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2034                 "maj" : "min");
2035
2036         print_location(trace->output, sample, &al, false, true);
2037
2038         fprintf(trace->output, "] => ");
2039
2040         thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
2041                                    sample->addr, &al);
2042
2043         if (!al.map) {
2044                 thread__find_addr_location(thread, sample->cpumode,
2045                                            MAP__FUNCTION, sample->addr, &al);
2046
2047                 if (al.map)
2048                         map_type = 'x';
2049                 else
2050                         map_type = '?';
2051         }
2052
2053         print_location(trace->output, sample, &al, true, false);
2054
2055         fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2056
2057         if (callchain_ret > 0)
2058                 trace__fprintf_callchain(trace, sample);
2059         else if (callchain_ret < 0)
2060                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2061 out:
2062         err = 0;
2063 out_put:
2064         thread__put(thread);
2065         return err;
2066 }
2067
2068 static void trace__set_base_time(struct trace *trace,
2069                                  struct perf_evsel *evsel,
2070                                  struct perf_sample *sample)
2071 {
2072         /*
2073          * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2074          * and don't use sample->time unconditionally, we may end up having
2075          * some other event in the future without PERF_SAMPLE_TIME for good
2076          * reason, i.e. we may not be interested in its timestamps, just in
2077          * it taking place, picking some piece of information when it
2078          * appears in our event stream (vfs_getname comes to mind).
2079          */
2080         if (trace->base_time == 0 && !trace->full_time &&
2081             (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2082                 trace->base_time = sample->time;
2083 }
2084
2085 static int trace__process_sample(struct perf_tool *tool,
2086                                  union perf_event *event,
2087                                  struct perf_sample *sample,
2088                                  struct perf_evsel *evsel,
2089                                  struct machine *machine __maybe_unused)
2090 {
2091         struct trace *trace = container_of(tool, struct trace, tool);
2092         struct thread *thread;
2093         int err = 0;
2094
2095         tracepoint_handler handler = evsel->handler;
2096
2097         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2098         if (thread && thread__is_filtered(thread))
2099                 goto out;
2100
2101         trace__set_base_time(trace, evsel, sample);
2102
2103         if (handler) {
2104                 ++trace->nr_events;
2105                 handler(trace, evsel, event, sample);
2106         }
2107 out:
2108         thread__put(thread);
2109         return err;
2110 }
2111
2112 static int trace__record(struct trace *trace, int argc, const char **argv)
2113 {
2114         unsigned int rec_argc, i, j;
2115         const char **rec_argv;
2116         const char * const record_args[] = {
2117                 "record",
2118                 "-R",
2119                 "-m", "1024",
2120                 "-c", "1",
2121         };
2122
2123         const char * const sc_args[] = { "-e", };
2124         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2125         const char * const majpf_args[] = { "-e", "major-faults" };
2126         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2127         const char * const minpf_args[] = { "-e", "minor-faults" };
2128         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2129
2130         /* +1 is for the event string below */
2131         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2132                 majpf_args_nr + minpf_args_nr + argc;
2133         rec_argv = calloc(rec_argc + 1, sizeof(char *));
2134
2135         if (rec_argv == NULL)
2136                 return -ENOMEM;
2137
2138         j = 0;
2139         for (i = 0; i < ARRAY_SIZE(record_args); i++)
2140                 rec_argv[j++] = record_args[i];
2141
2142         if (trace->trace_syscalls) {
2143                 for (i = 0; i < sc_args_nr; i++)
2144                         rec_argv[j++] = sc_args[i];
2145
2146                 /* event string may be different for older kernels - e.g., RHEL6 */
2147                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2148                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2149                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2150                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2151                 else {
2152                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2153                         free(rec_argv);
2154                         return -1;
2155                 }
2156         }
2157
2158         if (trace->trace_pgfaults & TRACE_PFMAJ)
2159                 for (i = 0; i < majpf_args_nr; i++)
2160                         rec_argv[j++] = majpf_args[i];
2161
2162         if (trace->trace_pgfaults & TRACE_PFMIN)
2163                 for (i = 0; i < minpf_args_nr; i++)
2164                         rec_argv[j++] = minpf_args[i];
2165
2166         for (i = 0; i < (unsigned int)argc; i++)
2167                 rec_argv[j++] = argv[i];
2168
2169         return cmd_record(j, rec_argv);
2170 }
2171
2172 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2173
2174 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2175 {
2176         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2177
2178         if (IS_ERR(evsel))
2179                 return false;
2180
2181         if (perf_evsel__field(evsel, "pathname") == NULL) {
2182                 perf_evsel__delete(evsel);
2183                 return false;
2184         }
2185
2186         evsel->handler = trace__vfs_getname;
2187         perf_evlist__add(evlist, evsel);
2188         return true;
2189 }
2190
2191 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2192 {
2193         struct perf_evsel *evsel;
2194         struct perf_event_attr attr = {
2195                 .type = PERF_TYPE_SOFTWARE,
2196                 .mmap_data = 1,
2197         };
2198
2199         attr.config = config;
2200         attr.sample_period = 1;
2201
2202         event_attr_init(&attr);
2203
2204         evsel = perf_evsel__new(&attr);
2205         if (evsel)
2206                 evsel->handler = trace__pgfault;
2207
2208         return evsel;
2209 }
2210
2211 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2212 {
2213         const u32 type = event->header.type;
2214         struct perf_evsel *evsel;
2215
2216         if (type != PERF_RECORD_SAMPLE) {
2217                 trace__process_event(trace, trace->host, event, sample);
2218                 return;
2219         }
2220
2221         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2222         if (evsel == NULL) {
2223                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2224                 return;
2225         }
2226
2227         trace__set_base_time(trace, evsel, sample);
2228
2229         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2230             sample->raw_data == NULL) {
2231                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2232                        perf_evsel__name(evsel), sample->tid,
2233                        sample->cpu, sample->raw_size);
2234         } else {
2235                 tracepoint_handler handler = evsel->handler;
2236                 handler(trace, evsel, event, sample);
2237         }
2238 }
2239
2240 static int trace__add_syscall_newtp(struct trace *trace)
2241 {
2242         int ret = -1;
2243         struct perf_evlist *evlist = trace->evlist;
2244         struct perf_evsel *sys_enter, *sys_exit;
2245
2246         sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2247         if (sys_enter == NULL)
2248                 goto out;
2249
2250         if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2251                 goto out_delete_sys_enter;
2252
2253         sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2254         if (sys_exit == NULL)
2255                 goto out_delete_sys_enter;
2256
2257         if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2258                 goto out_delete_sys_exit;
2259
2260         perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
2261         perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
2262
2263         perf_evlist__add(evlist, sys_enter);
2264         perf_evlist__add(evlist, sys_exit);
2265
2266         if (callchain_param.enabled && !trace->kernel_syscallchains) {
2267                 /*
2268                  * We're interested only in the user space callchain
2269                  * leading to the syscall, allow overriding that for
2270                  * debugging reasons using --kernel_syscall_callchains
2271                  */
2272                 sys_exit->attr.exclude_callchain_kernel = 1;
2273         }
2274
2275         trace->syscalls.events.sys_enter = sys_enter;
2276         trace->syscalls.events.sys_exit  = sys_exit;
2277
2278         ret = 0;
2279 out:
2280         return ret;
2281
2282 out_delete_sys_exit:
2283         perf_evsel__delete_priv(sys_exit);
2284 out_delete_sys_enter:
2285         perf_evsel__delete_priv(sys_enter);
2286         goto out;
2287 }
2288
2289 static int trace__set_ev_qualifier_filter(struct trace *trace)
2290 {
2291         int err = -1;
2292         struct perf_evsel *sys_exit;
2293         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2294                                                 trace->ev_qualifier_ids.nr,
2295                                                 trace->ev_qualifier_ids.entries);
2296
2297         if (filter == NULL)
2298                 goto out_enomem;
2299
2300         if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2301                                           filter)) {
2302                 sys_exit = trace->syscalls.events.sys_exit;
2303                 err = perf_evsel__append_tp_filter(sys_exit, filter);
2304         }
2305
2306         free(filter);
2307 out:
2308         return err;
2309 out_enomem:
2310         errno = ENOMEM;
2311         goto out;
2312 }
2313
2314 static int trace__set_filter_loop_pids(struct trace *trace)
2315 {
2316         unsigned int nr = 1;
2317         pid_t pids[32] = {
2318                 getpid(),
2319         };
2320         struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2321
2322         while (thread && nr < ARRAY_SIZE(pids)) {
2323                 struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2324
2325                 if (parent == NULL)
2326                         break;
2327
2328                 if (!strcmp(thread__comm_str(parent), "sshd")) {
2329                         pids[nr++] = parent->tid;
2330                         break;
2331                 }
2332                 thread = parent;
2333         }
2334
2335         return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
2336 }
2337
/*
 * trace__run - live mode: add the requested events to trace->evlist, open and
 * mmap them, optionally fork the workload given in argv, then read and handle
 * events until the read/poll loop below finishes.
 *
 * @trace: tracing state; trace->evlist is deleted and set to NULL before
 *         returning (the only return statement sits after out_delete_evlist).
 * @argc, @argv: workload command line; a workload is prepared and started
 *         only when argc > 0.
 *
 * Returns 'err' as left by the last call that assigned it: it starts at -1,
 * is overwritten by each setup step and, inside the read loop, by every
 * perf_evlist__parse_sample() call. Setup failures print a message to
 * trace->output (or via pr_err) before returning.
 */
2338 static int trace__run(struct trace *trace, int argc, const char **argv)
2339 {
2340         struct perf_evlist *evlist = trace->evlist;
2341         struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2342         int err = -1, i;
2343         unsigned long before;
2344         const bool forks = argc > 0;
2345         bool draining = false;
2346
             /* Flag live mode for the duration of this call; cleared at out_delete_evlist. */
2347         trace->live = true;
2348
2349         if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2350                 goto out_error_raw_syscalls;
2351
2352         if (trace->trace_syscalls)
2353                 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2354
             /* Page fault events are only created for the classes selected in trace_pgfaults. */
2355         if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2356                 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2357                 if (pgfault_maj == NULL)
2358                         goto out_error_mem;
2359                 perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2360                 perf_evlist__add(evlist, pgfault_maj);
2361         }
2362
2363         if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2364                 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2365                 if (pgfault_min == NULL)
2366                         goto out_error_mem;
2367                 perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2368                 perf_evlist__add(evlist, pgfault_min);
2369         }
2370
2371         if (trace->sched &&
2372             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2373                                    trace__sched_stat_runtime))
2374                 goto out_error_sched_stat_runtime;
2375
2376         /*
2377          * If a global cgroup was set, apply it to all the events without an
2378          * explicit cgroup. I.e.:
2379          *
2380          *      trace -G A -e sched:*switch
2381          *
2382          * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
2383          * _and_ sched:sched_switch to the 'A' cgroup, while:
2384          *
2385          * trace -e sched:*switch -G A
2386          *
2387          * will only set the sched:sched_switch event to the 'A' cgroup, all the
2388          * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
2389          * a cgroup (on the root cgroup, sys wide, etc).
2390          *
2391          * Multiple cgroups:
2392          *
2393          * trace -G A -e sched:*switch -G B
2394          *
2395          * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
2396          * to the 'B' cgroup.
2397          *
2398          * evlist__set_default_cgroup() grabs a reference of the passed cgroup
2399          * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
2400          */
2401         if (trace->cgroup)
2402                 evlist__set_default_cgroup(trace->evlist, trace->cgroup);
2403
2404         err = perf_evlist__create_maps(evlist, &trace->opts.target);
2405         if (err < 0) {
2406                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2407                 goto out_delete_evlist;
2408         }
2409
2410         err = trace__symbols_init(trace, evlist);
2411         if (err < 0) {
2412                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2413                 goto out_delete_evlist;
2414         }
2415
2416         perf_evlist__config(evlist, &trace->opts, &callchain_param);
2417
             /*
              * sig_handler is defined outside this function; presumably it is
              * what sets the 'done' and 'interrupted' flags tested in the read
              * loop below -- confirm against its definition.
              */
2418         signal(SIGCHLD, sig_handler);
2419         signal(SIGINT, sig_handler);
2420
             /* The workload is only prepared here; it is started further down, after the mmap. */
2421         if (forks) {
2422                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2423                                                     argv, false, NULL);
2424                 if (err < 0) {
2425                         fprintf(trace->output, "Couldn't run the workload!\n");
2426                         goto out_delete_evlist;
2427                 }
2428         }
2429
2430         err = perf_evlist__open(evlist);
2431         if (err < 0)
2432                 goto out_error_open;
2433
2434         err = bpf__apply_obj_config();
2435         if (err) {
2436                 char errbuf[BUFSIZ];
2437
2438                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2439                 pr_err("ERROR: Apply config to BPF failed: %s\n",
2440                          errbuf);
                     /*
                      * NOTE(review): this path also prints the errno based
                      * perf_evlist__strerror_open() text at out_error_open, in
                      * addition to the BPF message above.
                      */
2441                 goto out_error_open;
2442         }
2443
2444         /*
2445          * Better not use !target__has_task() here because we need to cover the
2446          * case where no threads were specified in the command line, but a
2447          * workload was, and in that case we will fill in the thread_map when
2448          * we fork the workload in perf_evlist__prepare_workload.
2449          */
2450         if (trace->filter_pids.nr > 0)
2451                 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2452         else if (thread_map__pid(evlist->threads, 0) == -1)
2453                 err = trace__set_filter_loop_pids(trace);
2454
             /* If neither branch above ran, 'err' still holds the bpf__apply_obj_config() result (0). */
2455         if (err < 0)
2456                 goto out_error_mem;
2457
2458         if (trace->ev_qualifier_ids.nr > 0) {
2459                 err = trace__set_ev_qualifier_filter(trace);
2460                 if (err < 0)
2461                         goto out_errno;
2462
2463                 pr_debug("event qualifier tracepoint filter: %s\n",
2464                          trace->syscalls.events.sys_exit->filter);
2465         }
2466
             /* On failure 'evsel' is what out_error_apply_filters reports as the offending event. */
2467         err = perf_evlist__apply_filters(evlist, &evsel);
2468         if (err < 0)
2469                 goto out_error_apply_filters;
2470
2471         err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
2472         if (err < 0)
2473                 goto out_error_mmap;
2474
             /*
              * Events are enabled here only when target__none() is false and no
              * initial delay was requested; with an initial delay they are
              * enabled after the sleep below instead.
              */
2475         if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2476                 perf_evlist__enable(evlist);
2477
2478         if (forks)
2479                 perf_evlist__start_workload(evlist);
2480
             /* initial_delay is in msecs (see the --delay help text); usleep() takes usecs. */
2481         if (trace->opts.initial_delay) {
2482                 usleep(trace->opts.initial_delay * 1000);
2483                 perf_evlist__enable(evlist);
2484         }
2485
2486         trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2487                                   evlist->threads->nr > 1 ||
2488                                   perf_evlist__first(evlist)->attr.inherit;
2489
2490         /*
2491          * Now that we already used evsel->attr to ask the kernel to setup the
2492          * events, lets reuse evsel->attr.sample_max_stack as the limit in
2493          * trace__resolve_callchain(), allowing per-event max-stack settings
2494          * to override an explicitly set --max-stack global setting.
2495          */
2496         evlist__for_each_entry(evlist, evsel) {
2497                 if ((evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
2498                     evsel->attr.sample_max_stack == 0)
2499                         evsel->attr.sample_max_stack = trace->max_stack;
2500         }
             /*
              * Read loop: one pass over all mmaps per iteration. 'before' is the
              * event count at the start of the pass, used afterwards to tell
              * whether the pass consumed anything.
              */
2501 again:
2502         before = trace->nr_events;
2503
2504         for (i = 0; i < evlist->nr_mmaps; i++) {
2505                 union perf_event *event;
2506                 struct perf_mmap *md;
2507
2508                 md = &evlist->mmap[i];
2509                 if (perf_mmap__read_init(md) < 0)
2510                         continue;
2511
2512                 while ((event = perf_mmap__read_event(md)) != NULL) {
2513                         struct perf_sample sample;
2514
2515                         ++trace->nr_events;
2516
                             /* An unparsable sample is reported and skipped, but still counted and consumed. */
2517                         err = perf_evlist__parse_sample(evlist, event, &sample);
2518                         if (err) {
2519                                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2520                                 goto next_event;
2521                         }
2522
2523                         trace__handle_event(trace, event, &sample);
2524 next_event:
2525                         perf_mmap__consume(md);
2526
                             /* 'interrupted' stops right away, without reading what is left in the mmaps. */
2527                         if (interrupted)
2528                                 goto out_disable;
2529
                             /* Once 'done' is set, disable the events a single time and keep reading. */
2530                         if (done && !draining) {
2531                                 perf_evlist__disable(evlist);
2532                                 draining = true;
2533                         }
2534                 }
2535                 perf_mmap__read_done(md);
2536         }
2537
             /*
              * Nothing consumed in this pass: unless already draining, poll
              * (without timeout while !done, 100 otherwise) and go around again
              * if poll reported something. A pass that did consume events always
              * goes around again.
              */
2538         if (trace->nr_events == before) {
2539                 int timeout = done ? 100 : -1;
2540
2541                 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
                             /* Start draining when filtering on POLLERR | POLLHUP returns 0. */
2542                         if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2543                                 draining = true;
2544
2545                         goto again;
2546                 }
2547         } else {
2548                 goto again;
2549         }
2550
2551 out_disable:
2552         thread__zput(trace->current);
2553
2554         perf_evlist__disable(evlist);
2555
             /*
              * The summary and tool stats are printed only when 'err' is zero,
              * i.e. they are skipped if the last parsed sample failed.
              */
2556         if (!err) {
2557                 if (trace->summary)
2558                         trace__fprintf_thread_summary(trace, trace->output);
2559
2560                 if (trace->show_tool_stats) {
2561                         fprintf(trace->output, "Stats:\n "
2562                                                " vfs_getname : %" PRIu64 "\n"
2563                                                " proc_getname: %" PRIu64 "\n",
2564                                 trace->stats.vfs_getname,
2565                                 trace->stats.proc_getname);
2566                 }
2567         }
2568
             /* Common exit: every success and failure path ends up here. */
2569 out_delete_evlist:
2570         trace__symbols__exit(trace);
2571
2572         perf_evlist__delete(evlist);
2573         cgroup__put(trace->cgroup);
2574         trace->evlist = NULL;
2575         trace->live = false;
2576         return err;
     /*
      * Everything below is reachable only through goto (the return above
      * prevents fall-through). The anonymous block scopes 'errbuf' for the
      * labels that format a message into it.
      */
2577 {
2578         char errbuf[BUFSIZ];
2579
2580 out_error_sched_stat_runtime:
2581         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2582         goto out_error;
2583
2584 out_error_raw_syscalls:
2585         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2586         goto out_error;
2587
2588 out_error_mmap:
2589         perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2590         goto out_error;
2591
             /* out_error_open falls through into out_error to print errbuf. */
2592 out_error_open:
2593         perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2594
2595 out_error:
2596         fprintf(trace->output, "%s\n", errbuf);
2597         goto out_delete_evlist;
2598
2599 out_error_apply_filters:
2600         fprintf(trace->output,
2601                 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2602                 evsel->filter, perf_evsel__name(evsel), errno,
2603                 str_error_r(errno, errbuf, sizeof(errbuf)));
2604         goto out_delete_evlist;
2605 }
     /* These two labels need no errbuf, hence they sit outside the block above. */
2606 out_error_mem:
2607         fprintf(trace->output, "Not enough memory to run!\n");
2608         goto out_delete_evlist;
2609
2610 out_errno:
2611         fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2612         goto out_delete_evlist;
2613 }
2614
2615 static int trace__replay(struct trace *trace)
2616 {
2617         const struct perf_evsel_str_handler handlers[] = {
2618                 { "probe:vfs_getname",       trace__vfs_getname, },
2619         };
2620         struct perf_data data = {
2621                 .file      = {
2622                         .path = input_name,
2623                 },
2624                 .mode      = PERF_DATA_MODE_READ,
2625                 .force     = trace->force,
2626         };
2627         struct perf_session *session;
2628         struct perf_evsel *evsel;
2629         int err = -1;
2630
2631         trace->tool.sample        = trace__process_sample;
2632         trace->tool.mmap          = perf_event__process_mmap;
2633         trace->tool.mmap2         = perf_event__process_mmap2;
2634         trace->tool.comm          = perf_event__process_comm;
2635         trace->tool.exit          = perf_event__process_exit;
2636         trace->tool.fork          = perf_event__process_fork;
2637         trace->tool.attr          = perf_event__process_attr;
2638         trace->tool.tracing_data  = perf_event__process_tracing_data;
2639         trace->tool.build_id      = perf_event__process_build_id;
2640         trace->tool.namespaces    = perf_event__process_namespaces;
2641
2642         trace->tool.ordered_events = true;
2643         trace->tool.ordering_requires_timestamps = true;
2644
2645         /* add tid to output */
2646         trace->multiple_threads = true;
2647
2648         session = perf_session__new(&data, false, &trace->tool);
2649         if (session == NULL)
2650                 return -1;
2651
2652         if (trace->opts.target.pid)
2653                 symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2654
2655         if (trace->opts.target.tid)
2656                 symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2657
2658         if (symbol__init(&session->header.env) < 0)
2659                 goto out;
2660
2661         trace->host = &session->machines.host;
2662
2663         err = perf_session__set_tracepoints_handlers(session, handlers);
2664         if (err)
2665                 goto out;
2666
2667         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2668                                                      "raw_syscalls:sys_enter");
2669         /* older kernels have syscalls tp versus raw_syscalls */
2670         if (evsel == NULL)
2671                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2672                                                              "syscalls:sys_enter");
2673
2674         if (evsel &&
2675             (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2676             perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2677                 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2678                 goto out;
2679         }
2680
2681         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2682                                                      "raw_syscalls:sys_exit");
2683         if (evsel == NULL)
2684                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2685                                                              "syscalls:sys_exit");
2686         if (evsel &&
2687             (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2688             perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2689                 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2690                 goto out;
2691         }
2692
2693         evlist__for_each_entry(session->evlist, evsel) {
2694                 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2695                     (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2696                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2697                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2698                         evsel->handler = trace__pgfault;
2699         }
2700
2701         setup_pager();
2702
2703         err = perf_session__process_events(session);
2704         if (err)
2705                 pr_err("Failed to process events, error %d", err);
2706
2707         else if (trace->summary)
2708                 trace__fprintf_thread_summary(trace, trace->output);
2709
2710 out:
2711         perf_session__delete(session);
2712
2713         return err;
2714 }
2715
/*
 * Print the banner that precedes the per thread summary.
 *
 * Returns the fprintf() result converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
        return fprintf(fp, "\n Summary of events:\n\n");
}
2724
2725 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2726         struct stats    *stats;
2727         double          msecs;
2728         int             syscall;
2729 )
2730 {
2731         struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2732         struct stats *stats = source->priv;
2733
2734         entry->syscall = source->i;
2735         entry->stats   = stats;
2736         entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2737 }
2738
2739 static size_t thread__dump_stats(struct thread_trace *ttrace,
2740                                  struct trace *trace, FILE *fp)
2741 {
2742         size_t printed = 0;
2743         struct syscall *sc;
2744         struct rb_node *nd;
2745         DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2746
2747         if (syscall_stats == NULL)
2748                 return 0;
2749
2750         printed += fprintf(fp, "\n");
2751
2752         printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2753         printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2754         printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2755
2756         resort_rb__for_each_entry(nd, syscall_stats) {
2757                 struct stats *stats = syscall_stats_entry->stats;
2758                 if (stats) {
2759                         double min = (double)(stats->min) / NSEC_PER_MSEC;
2760                         double max = (double)(stats->max) / NSEC_PER_MSEC;
2761                         double avg = avg_stats(stats);
2762                         double pct;
2763                         u64 n = (u64) stats->n;
2764
2765                         pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2766                         avg /= NSEC_PER_MSEC;
2767
2768                         sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2769                         printed += fprintf(fp, "   %-15s", sc->name);
2770                         printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2771                                            n, syscall_stats_entry->msecs, min, avg);
2772                         printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2773                 }
2774         }
2775
2776         resort_rb__delete(syscall_stats);
2777         printed += fprintf(fp, "\n\n");
2778
2779         return printed;
2780 }
2781
2782 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2783 {
2784         size_t printed = 0;
2785         struct thread_trace *ttrace = thread__priv(thread);
2786         double ratio;
2787
2788         if (ttrace == NULL)
2789                 return 0;
2790
2791         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2792
2793         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2794         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2795         printed += fprintf(fp, "%.1f%%", ratio);
2796         if (ttrace->pfmaj)
2797                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2798         if (ttrace->pfmin)
2799                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2800         if (trace->sched)
2801                 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2802         else if (fputc('\n', fp) != EOF)
2803                 ++printed;
2804
2805         printed += thread__dump_stats(ttrace, trace, fp);
2806
2807         return printed;
2808 }
2809
2810 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2811 {
2812         return ttrace ? ttrace->nr_events : 0;
2813 }
2814
2815 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2816         struct thread *thread;
2817 )
2818 {
2819         entry->thread = rb_entry(nd, struct thread, rb_node);
2820 }
2821
2822 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2823 {
2824         size_t printed = trace__fprintf_threads_header(fp);
2825         struct rb_node *nd;
2826         int i;
2827
2828         for (i = 0; i < THREADS__TABLE_SIZE; i++) {
2829                 DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);
2830
2831                 if (threads == NULL) {
2832                         fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2833                         return 0;
2834                 }
2835
2836                 resort_rb__for_each_entry(nd, threads)
2837                         printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2838
2839                 resort_rb__delete(threads);
2840         }
2841         return printed;
2842 }
2843
2844 static int trace__set_duration(const struct option *opt, const char *str,
2845                                int unset __maybe_unused)
2846 {
2847         struct trace *trace = opt->value;
2848
2849         trace->duration_filter = atof(str);
2850         return 0;
2851 }
2852
2853 static int trace__set_filter_pids(const struct option *opt, const char *str,
2854                                   int unset __maybe_unused)
2855 {
2856         int ret = -1;
2857         size_t i;
2858         struct trace *trace = opt->value;
2859         /*
2860          * FIXME: introduce a intarray class, plain parse csv and create a
2861          * { int nr, int entries[] } struct...
2862          */
2863         struct intlist *list = intlist__new(str);
2864
2865         if (list == NULL)
2866                 return -1;
2867
2868         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2869         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2870
2871         if (trace->filter_pids.entries == NULL)
2872                 goto out;
2873
2874         trace->filter_pids.entries[0] = getpid();
2875
2876         for (i = 1; i < trace->filter_pids.nr; ++i)
2877                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2878
2879         intlist__delete(list);
2880         ret = 0;
2881 out:
2882         return ret;
2883 }
2884
2885 static int trace__open_output(struct trace *trace, const char *filename)
2886 {
2887         struct stat st;
2888
2889         if (!stat(filename, &st) && st.st_size) {
2890                 char oldname[PATH_MAX];
2891
2892                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2893                 unlink(oldname);
2894                 rename(filename, oldname);
2895         }
2896
2897         trace->output = fopen(filename, "w");
2898
2899         return trace->output == NULL ? -errno : 0;
2900 }
2901
2902 static int parse_pagefaults(const struct option *opt, const char *str,
2903                             int unset __maybe_unused)
2904 {
2905         int *trace_pgfaults = opt->value;
2906
2907         if (strcmp(str, "all") == 0)
2908                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2909         else if (strcmp(str, "maj") == 0)
2910                 *trace_pgfaults |= TRACE_PFMAJ;
2911         else if (strcmp(str, "min") == 0)
2912                 *trace_pgfaults |= TRACE_PFMIN;
2913         else
2914                 return -1;
2915
2916         return 0;
2917 }
2918
2919 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2920 {
2921         struct perf_evsel *evsel;
2922
2923         evlist__for_each_entry(evlist, evsel)
2924                 evsel->handler = handler;
2925 }
2926
2927 /*
2928  * XXX: Hackish, just splitting the combined -e+--event (syscalls
2929  * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2930  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2931  *
2932  * It'd be better to introduce a parse_options() variant that would return a
2933  * list with the terms it didn't match to an event...
2934  */
2935 static int trace__parse_events_option(const struct option *opt, const char *str,
2936                                       int unset __maybe_unused)
2937 {
2938         struct trace *trace = (struct trace *)opt->value;
2939         const char *s = str;
2940         char *sep = NULL, *lists[2] = { NULL, NULL, };
2941         int len = strlen(str) + 1, err = -1, list, idx;
2942         char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2943         char group_name[PATH_MAX];
2944
2945         if (strace_groups_dir == NULL)
2946                 return -1;
2947
2948         if (*s == '!') {
2949                 ++s;
2950                 trace->not_ev_qualifier = true;
2951         }
2952
2953         while (1) {
2954                 if ((sep = strchr(s, ',')) != NULL)
2955                         *sep = '\0';
2956
2957                 list = 0;
2958                 if (syscalltbl__id(trace->sctbl, s) >= 0 ||
2959                     syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
2960                         list = 1;
2961                 } else {
2962                         path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2963                         if (access(group_name, R_OK) == 0)
2964                                 list = 1;
2965                 }
2966
2967                 if (lists[list]) {
2968                         sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2969                 } else {
2970                         lists[list] = malloc(len);
2971                         if (lists[list] == NULL)
2972                                 goto out;
2973                         strcpy(lists[list], s);
2974                 }
2975
2976                 if (!sep)
2977                         break;
2978
2979                 *sep = ',';
2980                 s = sep + 1;
2981         }
2982
2983         if (lists[1] != NULL) {
2984                 struct strlist_config slist_config = {
2985                         .dirname = strace_groups_dir,
2986                 };
2987
2988                 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2989                 if (trace->ev_qualifier == NULL) {
2990                         fputs("Not enough memory to parse event qualifier", trace->output);
2991                         goto out;
2992                 }
2993
2994                 if (trace__validate_ev_qualifier(trace))
2995                         goto out;
2996         }
2997
2998         err = 0;
2999
3000         if (lists[0]) {
3001                 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
3002                                                "event selector. use 'perf list' to list available events",
3003                                                parse_events_option);
3004                 err = parse_events_option(&o, lists[0], 0);
3005         }
3006 out:
3007         if (sep)
3008                 *sep = ',';
3009
3010         return err;
3011 }
3012
3013 static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3014 {
3015         struct trace *trace = opt->value;
3016
3017         if (!list_empty(&trace->evlist->entries))
3018                 return parse_cgroups(opt, str, unset);
3019
3020         trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3021
3022         return 0;
3023 }
3024
3025 int cmd_trace(int argc, const char **argv)
3026 {
3027         const char *trace_usage[] = {
3028                 "perf trace [<options>] [<command>]",
3029                 "perf trace [<options>] -- <command> [<options>]",
3030                 "perf trace record [<options>] [<command>]",
3031                 "perf trace record [<options>] -- <command> [<options>]",
3032                 NULL
3033         };
3034         struct trace trace = {
3035                 .syscalls = {
3036                         . max = -1,
3037                 },
3038                 .opts = {
3039                         .target = {
3040                                 .uid       = UINT_MAX,
3041                                 .uses_mmap = true,
3042                         },
3043                         .user_freq     = UINT_MAX,
3044                         .user_interval = ULLONG_MAX,
3045                         .no_buffering  = true,
3046                         .mmap_pages    = UINT_MAX,
3047                         .proc_map_timeout  = 500,
3048                 },
3049                 .output = stderr,
3050                 .show_comm = true,
3051                 .trace_syscalls = true,
3052                 .kernel_syscallchains = false,
3053                 .max_stack = UINT_MAX,
3054         };
3055         const char *output_name = NULL;
3056         const struct option trace_options[] = {
3057         OPT_CALLBACK('e', "event", &trace, "event",
3058                      "event/syscall selector. use 'perf list' to list available events",
3059                      trace__parse_events_option),
3060         OPT_BOOLEAN(0, "comm", &trace.show_comm,
3061                     "show the thread COMM next to its id"),
3062         OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3063         OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
3064                      trace__parse_events_option),
3065         OPT_STRING('o', "output", &output_name, "file", "output file name"),
3066         OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3067         OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3068                     "trace events on existing process id"),
3069         OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3070                     "trace events on existing thread id"),
3071         OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3072                      "pids to filter (by the kernel)", trace__set_filter_pids),
3073         OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3074                     "system-wide collection from all CPUs"),
3075         OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3076                     "list of cpus to monitor"),
3077         OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3078                     "child tasks do not inherit counters"),
3079         OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3080                      "number of mmap data pages",
3081                      perf_evlist__parse_mmap_pages),
3082         OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3083                    "user to profile"),
3084         OPT_CALLBACK(0, "duration", &trace, "float",
3085                      "show only events with duration > N.M ms",
3086                      trace__set_duration),
3087         OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3088         OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3089         OPT_BOOLEAN('T', "time", &trace.full_time,
3090                     "Show full timestamp, not time relative to first start"),
3091         OPT_BOOLEAN(0, "failure", &trace.failure_only,
3092                     "Show only syscalls that failed"),
3093         OPT_BOOLEAN('s', "summary", &trace.summary_only,
3094                     "Show only syscall summary with statistics"),
3095         OPT_BOOLEAN('S', "with-summary", &trace.summary,
3096                     "Show all syscalls and summary with statistics"),
3097         OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3098                      "Trace pagefaults", parse_pagefaults, "maj"),
3099         OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3100         OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3101         OPT_CALLBACK(0, "call-graph", &trace.opts,
3102                      "record_mode[,record_size]", record_callchain_help,
3103                      &record_parse_callchain_opt),
3104         OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3105                     "Show the kernel callchains on the syscall exit path"),
3106         OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3107                      "Set the minimum stack depth when parsing the callchain, "
3108                      "anything below the specified depth will be ignored."),
3109         OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3110                      "Set the maximum stack depth when parsing the callchain, "
3111                      "anything beyond the specified depth will be ignored. "
3112                      "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3113         OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
3114                         "print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
3115         OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3116                         "per thread proc mmap processing timeout in ms"),
3117         OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
3118                      trace__parse_cgroups),
3119         OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3120                      "ms to wait before starting measurement after program "
3121                      "start"),
3122         OPT_END()
3123         };
3124         bool __maybe_unused max_stack_user_set = true;
3125         bool mmap_pages_user_set = true;
3126         const char * const trace_subcommands[] = { "record", NULL };
3127         int err;
3128         char bf[BUFSIZ];
3129
3130         signal(SIGSEGV, sighandler_dump_stack);
3131         signal(SIGFPE, sighandler_dump_stack);
3132
3133         trace.evlist = perf_evlist__new();
3134         trace.sctbl = syscalltbl__new();
3135
3136         if (trace.evlist == NULL || trace.sctbl == NULL) {
3137                 pr_err("Not enough memory to run!\n");
3138                 err = -ENOMEM;
3139                 goto out;
3140         }
3141
3142         argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3143                                  trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3144
3145         if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
3146                 usage_with_options_msg(trace_usage, trace_options,
3147                                        "cgroup monitoring only available in system-wide mode");
3148         }
3149
3150         err = bpf__setup_stdout(trace.evlist);
3151         if (err) {
3152                 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3153                 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3154                 goto out;
3155         }
3156
3157         err = -1;
3158
3159         if (trace.trace_pgfaults) {
3160                 trace.opts.sample_address = true;
3161                 trace.opts.sample_time = true;
3162         }
3163
3164         if (trace.opts.mmap_pages == UINT_MAX)
3165                 mmap_pages_user_set = false;
3166
3167         if (trace.max_stack == UINT_MAX) {
3168                 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
3169                 max_stack_user_set = false;
3170         }
3171
3172 #ifdef HAVE_DWARF_UNWIND_SUPPORT
3173         if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3174                 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3175         }
3176 #endif
3177
3178         if (callchain_param.enabled) {
3179                 if (!mmap_pages_user_set && geteuid() == 0)
3180                         trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3181
3182                 symbol_conf.use_callchain = true;
3183         }
3184
3185         if (trace.evlist->nr_entries > 0)
3186                 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3187
3188         if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3189                 return trace__record(&trace, argc-1, &argv[1]);
3190
3191         /* summary_only implies summary option, but don't overwrite summary if set */
3192         if (trace.summary_only)
3193                 trace.summary = trace.summary_only;
3194
3195         if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3196             trace.evlist->nr_entries == 0 /* Was --events used? */) {
3197                 pr_err("Please specify something to trace.\n");
3198                 return -1;
3199         }
3200
3201         if (!trace.trace_syscalls && trace.ev_qualifier) {
3202                 pr_err("The -e option can't be used with --no-syscalls.\n");
3203                 goto out;
3204         }
3205
3206         if (output_name != NULL) {
3207                 err = trace__open_output(&trace, output_name);
3208                 if (err < 0) {
3209                         perror("failed to create output file");
3210                         goto out;
3211                 }
3212         }
3213
3214         trace.open_id = syscalltbl__id(trace.sctbl, "open");
3215
3216         err = target__validate(&trace.opts.target);
3217         if (err) {
3218                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3219                 fprintf(trace.output, "%s", bf);
3220                 goto out_close;
3221         }
3222
3223         err = target__parse_uid(&trace.opts.target);
3224         if (err) {
3225                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3226                 fprintf(trace.output, "%s", bf);
3227                 goto out_close;
3228         }
3229
3230         if (!argc && target__none(&trace.opts.target))
3231                 trace.opts.target.system_wide = true;
3232
3233         if (input_name)
3234                 err = trace__replay(&trace);
3235         else
3236                 err = trace__run(&trace, argc, argv);
3237
3238 out_close:
3239         if (output_name != NULL)
3240                 fclose(trace.output);
3241 out:
3242         return err;
3243 }