Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
[linux-2.6-block.git] / tools / perf / builtin-trace.c
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/env.h"
25 #include "util/event.h"
26 #include "util/evlist.h"
27 #include <subcmd/exec-cmd.h>
28 #include "util/machine.h"
29 #include "util/path.h"
30 #include "util/session.h"
31 #include "util/thread.h"
32 #include <subcmd/parse-options.h>
33 #include "util/strlist.h"
34 #include "util/intlist.h"
35 #include "util/thread_map.h"
36 #include "util/stat.h"
37 #include "trace/beauty/beauty.h"
38 #include "trace-event.h"
39 #include "util/parse-events.h"
40 #include "util/bpf-loader.h"
41 #include "callchain.h"
42 #include "print_binary.h"
43 #include "string2.h"
44 #include "syscalltbl.h"
45 #include "rb_resort.h"
46
47 #include <errno.h>
48 #include <inttypes.h>
49 #include <poll.h>
50 #include <signal.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <linux/err.h>
54 #include <linux/filter.h>
55 #include <linux/kernel.h>
56 #include <linux/random.h>
57 #include <linux/stringify.h>
58 #include <linux/time64.h>
59 #include <fcntl.h>
60
61 #include "sane_ctype.h"
62
63 #ifndef O_CLOEXEC
64 # define O_CLOEXEC              02000000
65 #endif
66
67 #ifndef F_LINUX_SPECIFIC_BASE
68 # define F_LINUX_SPECIFIC_BASE  1024
69 #endif
70
/*
 * Global state for one 'perf trace' session: tool callbacks, the syscall
 * table, the evlist/machine being traced and all output/filter options.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;		/* syscall id <-> name table */
	struct {
		int		max;		/* highest syscall id in 'table' */
		struct syscall	*table;
		struct {
			/* raw_syscalls:sys_{enter,exit} tracepoint evsels */
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;	/* thread that last hit an event */
	u64			base_time;	/* timestamp of first sample, for relative times */
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* -e syscall name filter */
	struct {
		size_t		nr;
		int		*entries;	/* syscall ids resolved from ev_qualifier */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;	/* pids to exclude from tracing */
	}			filter_pids;
	double			duration_filter;	/* --duration, in ms */
	double			runtime_ms;
	struct {
		/* tool self-stats: getname events seen via each source */
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* ev_qualifier is a "not these" list */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			print_sample;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;
	int			open_id;	/* syscall id of "open", cached */
};
123
/*
 * Accessor for one tracepoint field: its byte offset inside the sample's
 * raw data plus a reader appropriate to the field's kind/size.  Only one
 * of the union members is ever set for a given field.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
131
/*
 * Generate tp_field__u{8,16,32,64}(): read an unsigned integer of the
 * given width from the sample's raw data at the field's offset.  memcpy
 * is used instead of a cast+deref to avoid unaligned accesses.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
144
/*
 * Generate tp_field__swapped_u{16,32,64}(): like TP_UINT_FIELD but
 * byte-swap the value, for perf.data files recorded on a host of the
 * opposite endianness.  (A single byte never needs swapping.)
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
156
157 static int tp_field__init_uint(struct tp_field *field,
158                                struct format_field *format_field,
159                                bool needs_swap)
160 {
161         field->offset = format_field->offset;
162
163         switch (format_field->size) {
164         case 1:
165                 field->integer = tp_field__u8;
166                 break;
167         case 2:
168                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
169                 break;
170         case 4:
171                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
172                 break;
173         case 8:
174                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
175                 break;
176         default:
177                 return -1;
178         }
179
180         return 0;
181 }
182
/* Return a pointer into the sample's raw data at the field's offset. */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}
187
/* Bind the pointer reader to 'field'.  Always succeeds, returns 0. */
static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
	field->offset = format_field->offset;
	field->pointer = tp_field__ptr;
	return 0;
}
194
/*
 * Per-evsel private data for the raw_syscalls tracepoints: the syscall
 * 'id' field plus either the entry 'args' or the exit 'ret' field — the
 * union reflects that a given evsel is either sys_enter or sys_exit.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
201
202 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
203                                           struct tp_field *field,
204                                           const char *name)
205 {
206         struct format_field *format_field = perf_evsel__field(evsel, name);
207
208         if (format_field == NULL)
209                 return -1;
210
211         return tp_field__init_uint(field, format_field, evsel->needs_swap);
212 }
213
/*
 * Initialize the integer member 'name' of the evsel's private
 * syscall_tp, using the stringified member name as the tracepoint
 * field name to look up.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
217
218 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
219                                          struct tp_field *field,
220                                          const char *name)
221 {
222         struct format_field *format_field = perf_evsel__field(evsel, name);
223
224         if (format_field == NULL)
225                 return -1;
226
227         return tp_field__init_ptr(field, format_field);
228 }
229
/*
 * Initialize the pointer member 'name' of the evsel's private
 * syscall_tp, using the stringified member name as the tracepoint
 * field name to look up.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
233
/* Free the evsel's private syscall_tp data, then the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
239
240 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
241 {
242         evsel->priv = malloc(sizeof(struct syscall_tp));
243         if (evsel->priv != NULL) {
244                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
245                         goto out_delete;
246
247                 evsel->handler = handler;
248                 return 0;
249         }
250
251         return -ENOMEM;
252
253 out_delete:
254         zfree(&evsel->priv);
255         return -ENOENT;
256 }
257
258 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
259 {
260         struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
261
262         /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
263         if (IS_ERR(evsel))
264                 evsel = perf_evsel__newtp("syscalls", direction);
265
266         if (IS_ERR(evsel))
267                 return NULL;
268
269         if (perf_evsel__init_syscall_tp(evsel, handler))
270                 goto out_delete;
271
272         return evsel;
273
274 out_delete:
275         perf_evsel__delete_priv(evsel);
276         return NULL;
277 }
278
/* Read the integer member 'name' (id/args/ret) from a sample via the
 * evsel's private syscall_tp accessors. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Same as above but for pointer members, returning a pointer into the
 * sample's raw data. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
286
287 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
288 {
289         int idx = val - sa->offset;
290
291         if (idx < 0 || idx >= sa->nr_entries)
292                 return scnprintf(bf, size, intfmt, val);
293
294         return scnprintf(bf, size, "%s", sa->entries[idx]);
295 }
296
/* Format a syscall arg via the strarray attached to it as ->parm, with a
 * caller-chosen integer fallback format. */
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
						struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
}
303
/* Common case: strarray formatting with a plain "%d" fallback. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}
309
310 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
311
/*
 * A set of strarrays tried in order, for values whose symbolic names are
 * split across ranges (e.g. fcntl's standard vs linux-specific commands).
 */
struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

/* Define a named struct strarrays wrapping an array of strarray pointers. */
#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
321
322 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
323                                         struct syscall_arg *arg)
324 {
325         struct strarrays *sas = arg->parm;
326         int i;
327
328         for (i = 0; i < sas->nr_entries; ++i) {
329                 struct strarray *sa = sas->entries[i];
330                 int idx = arg->val - sa->offset;
331
332                 if (idx >= 0 && idx < sa->nr_entries) {
333                         if (sa->entries[idx] == NULL)
334                                 break;
335                         return scnprintf(bf, size, "%s", sa->entries[idx]);
336                 }
337         }
338
339         return scnprintf(bf, size, "%d", arg->val);
340 }
341
342 #ifndef AT_FDCWD
343 #define AT_FDCWD        -100
344 #endif
345
346 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
347                                            struct syscall_arg *arg)
348 {
349         int fd = arg->val;
350
351         if (fd == AT_FDCWD)
352                 return scnprintf(bf, size, "CWD");
353
354         return syscall_arg__scnprintf_fd(bf, size, arg);
355 }
356
357 #define SCA_FDAT syscall_arg__scnprintf_fd_at
358
359 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
360                                               struct syscall_arg *arg);
361
362 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
363
/* Format the arg value in hex ("0x..."); assumes arg->val is an
 * unsigned long so "%#lx" matches — TODO confirm against beauty.h. */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}
368
369 size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
370 {
371         return scnprintf(bf, size, "%d", arg->val);
372 }
373
374 size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
375 {
376         return scnprintf(bf, size, "%ld", arg->val);
377 }
378
/* bpf(2) command names, indexed by enum bpf_cmd value. */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* epoll_ctl(2) ops; EPOLL_CTL_ADD starts at 1, hence the offset. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

/* {get,set}itimer(2) 'which' values. */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

/* keyctl(2) operation names, indexed by KEYCTL_* value. */
static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* lseek(2) whence names; DATA/HOLE only where the libc defines them. */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

/* fcntl(2) standard command names, indexed by F_* value. */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/*
 * fcntl(2) linux-specific commands, offset by F_LINUX_SPECIFIC_BASE
 * (1024).  Note the [5] designator: indexes 3 and 4 are unused, leaving
 * NULL holes in the array.
 */
static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

/* Both fcntl command ranges, tried in order when formatting. */
static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

/* {get,set}rlimit(2)/prlimit64(2) resource names, indexed by RLIMIT_*. */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* rt_sigprocmask(2) 'how' names, indexed by SIG_* value. */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

/* clock_gettime(2) etc. clock ids, indexed by CLOCK_* value. */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

/* socket(2) address family names, indexed by AF_* value. */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
459
460 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
461                                                  struct syscall_arg *arg)
462 {
463         size_t printed = 0;
464         int mode = arg->val;
465
466         if (mode == F_OK) /* 0 */
467                 return scnprintf(bf, size, "F");
468 #define P_MODE(n) \
469         if (mode & n##_OK) { \
470                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
471                 mode &= ~n##_OK; \
472         }
473
474         P_MODE(R);
475         P_MODE(W);
476         P_MODE(X);
477 #undef P_MODE
478
479         if (mode)
480                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
481
482         return printed;
483 }
484
485 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
486
487 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
488                                               struct syscall_arg *arg);
489
490 #define SCA_FILENAME syscall_arg__scnprintf_filename
491
492 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
493                                                 struct syscall_arg *arg)
494 {
495         int printed = 0, flags = arg->val;
496
497 #define P_FLAG(n) \
498         if (flags & O_##n) { \
499                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
500                 flags &= ~O_##n; \
501         }
502
503         P_FLAG(CLOEXEC);
504         P_FLAG(NONBLOCK);
505 #undef P_FLAG
506
507         if (flags)
508                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
509
510         return printed;
511 }
512
513 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
514
515 #ifndef GRND_NONBLOCK
516 #define GRND_NONBLOCK   0x0001
517 #endif
518 #ifndef GRND_RANDOM
519 #define GRND_RANDOM     0x0002
520 #endif
521
522 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
523                                                    struct syscall_arg *arg)
524 {
525         int printed = 0, flags = arg->val;
526
527 #define P_FLAG(n) \
528         if (flags & GRND_##n) { \
529                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
530                 flags &= ~GRND_##n; \
531         }
532
533         P_FLAG(RANDOM);
534         P_FLAG(NONBLOCK);
535 #undef P_FLAG
536
537         if (flags)
538                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
539
540         return printed;
541 }
542
543 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
544
545 #define STRARRAY(name, array) \
546           { .scnprintf  = SCA_STRARRAY, \
547             .parm       = &strarray__##array, }
548
549 #include "trace/beauty/arch_errno_names.c"
550 #include "trace/beauty/eventfd.c"
551 #include "trace/beauty/futex_op.c"
552 #include "trace/beauty/futex_val3.c"
553 #include "trace/beauty/mmap.c"
554 #include "trace/beauty/mode_t.c"
555 #include "trace/beauty/msg_flags.c"
556 #include "trace/beauty/open_flags.c"
557 #include "trace/beauty/perf_event_open.c"
558 #include "trace/beauty/pid.c"
559 #include "trace/beauty/sched_policy.c"
560 #include "trace/beauty/seccomp.c"
561 #include "trace/beauty/signum.c"
562 #include "trace/beauty/socket_type.c"
563 #include "trace/beauty/waitid_options.c"
564
/*
 * How to pretty-print one syscall argument: a formatter callback, an
 * optional parameter for it (e.g. a strarray), an override name, and
 * whether a zero value should still be shown.
 */
struct syscall_arg_fmt {
	size_t     (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void       *parm;
	const char *name;
	bool       show_zero;
};
571
/*
 * Per-syscall formatting overrides: argument formatters, aliases for
 * arch-specific names, and return-value handling (errpid/timeout/hexret).
 *
 * NOTE: this table MUST be kept sorted alphabetically by .name — it is
 * searched with bsearch() in syscall_fmt__find().
 */
static struct syscall_fmt {
	const char *name;
	const char *alias;
	struct syscall_arg_fmt arg[6];
	u8         nr_args;	/* only set when .arg[].name is provided */
	bool       errpid;	/* negative return is an error, else a pid */
	bool       timeout;	/* last arg is a timeout */
	bool       hexret;	/* print return value in hex */
} syscall_fmts[] = {
	{ .name     = "access",
	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
	{ .name     = "bpf",
	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
	{ .name     = "brk",        .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
	{ .name     = "clock_gettime",
	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
	{ .name     = "clone",      .errpid = true, .nr_args = 5,
	  .arg = { [0] = { .name = "flags",         .scnprintf = SCA_CLONE_FLAGS, },
		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
		   [4] = { .name = "tls",           .scnprintf = SCA_HEX, }, }, },
	{ .name     = "close",
	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
	{ .name     = "epoll_ctl",
	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
	{ .name     = "eventfd2",
	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
	{ .name     = "fchmodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name     = "fchownat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name     = "fcntl",
	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
			   .parm      = &strarrays__fcntl_cmds_arrays,
			   .show_zero = true, },
		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
	{ .name     = "flock",
	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
	{ .name     = "fstat", .alias = "newfstat", },
	{ .name     = "fstatat", .alias = "newfstatat", },
	{ .name     = "futex",
	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
		   [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
	{ .name     = "futimesat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name     = "getitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name     = "getpid",     .errpid = true, },
	{ .name     = "getpgid",    .errpid = true, },
	{ .name     = "getppid",    .errpid = true, },
	{ .name     = "getrandom",
	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
	{ .name     = "getrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name     = "gettid",     .errpid = true, },
	{ .name     = "ioctl",
	  .arg = {
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#else
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#endif
	{ .name     = "kcmp",       .nr_args = 5,
	  .arg = { [0] = { .name = "pid1",      .scnprintf = SCA_PID, },
		   [1] = { .name = "pid2",      .scnprintf = SCA_PID, },
		   [2] = { .name = "type",      .scnprintf = SCA_KCMP_TYPE, },
		   [3] = { .name = "idx1",      .scnprintf = SCA_KCMP_IDX, },
		   [4] = { .name = "idx2",      .scnprintf = SCA_KCMP_IDX, }, }, },
	{ .name     = "keyctl",
	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
	{ .name     = "kill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name     = "linkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name     = "lseek",
	  .arg = { [2] = STRARRAY(whence, whences), }, },
	{ .name     = "lstat", .alias = "newlstat", },
	{ .name     = "madvise",
	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
	{ .name     = "mkdirat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name     = "mknodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name     = "mlock",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name     = "mlockall",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name     = "mmap",       .hexret = true,
/* The standard mmap maps to old_mmap on s390x */
#if defined(__s390x__)
	.alias = "old_mmap",
#endif
	  .arg = { [0] = { .scnprintf = SCA_HEX,        /* addr */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
		   [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, },
	{ .name     = "mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ }, }, },
	{ .name     = "mq_unlink",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
	{ .name     = "mremap",     .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX,          /* addr */ },
		   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
		   [4] = { .scnprintf = SCA_HEX,          /* new_addr */ }, }, },
	{ .name     = "munlock",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name     = "munmap",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name     = "name_to_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name     = "newfstatat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name     = "open",
	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name     = "open_by_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name     = "openat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name     = "perf_event_open",
	  .arg = { [2] = { .scnprintf = SCA_INT,        /* cpu */ },
		   [3] = { .scnprintf = SCA_FD,         /* group_fd */ },
		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
	{ .name     = "pipe2",
	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
	{ .name     = "pkey_alloc",
	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,   /* access_rights */ }, }, },
	{ .name     = "pkey_free",
	  .arg = { [0] = { .scnprintf = SCA_INT,        /* key */ }, }, },
	{ .name     = "pkey_mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
		   [3] = { .scnprintf = SCA_INT,        /* pkey */ }, }, },
	{ .name     = "poll", .timeout = true, },
	{ .name     = "ppoll", .timeout = true, },
	{ .name     = "prctl", .alias = "arch_prctl",
	  .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
		   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
		   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
	{ .name     = "pread", .alias = "pread64", },
	/* NOTE(review): alias "pread" looks suspicious — preadv64? verify */
	{ .name     = "preadv", .alias = "pread", },
	{ .name     = "prlimit64",
	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
	{ .name     = "pwrite", .alias = "pwrite64", },
	{ .name     = "readlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name     = "recvfrom",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name     = "recvmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name     = "recvmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name     = "renameat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name     = "rt_sigaction",
	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name     = "rt_sigprocmask",
	  .arg = { [0] = STRARRAY(how, sighow), }, },
	{ .name     = "rt_sigqueueinfo",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name     = "rt_tgsigqueueinfo",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name     = "sched_setscheduler",
	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
	{ .name     = "seccomp",
	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,    /* op */ },
		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
	{ .name     = "select", .timeout = true, },
	{ .name     = "sendmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name     = "sendmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name     = "sendto",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name     = "set_tid_address", .errpid = true, },
	{ .name     = "setitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name     = "setrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name     = "socket",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
	{ .name     = "socketpair",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
	{ .name     = "stat", .alias = "newstat", },
	{ .name     = "statx",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,        /* fdat */ },
		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
		   [3] = { .scnprintf = SCA_STATX_MASK,  /* mask */ }, }, },
	{ .name     = "swapoff",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name     = "swapon",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name     = "symlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name     = "tgkill",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name     = "tkill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name     = "uname", .alias = "newuname", },
	{ .name     = "unlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name     = "utimensat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
	{ .name     = "wait4",      .errpid = true,
	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
	{ .name     = "waitid",     .errpid = true,
	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
};
790
791 static int syscall_fmt__cmp(const void *name, const void *fmtp)
792 {
793         const struct syscall_fmt *fmt = fmtp;
794         return strcmp(name, fmt->name);
795 }
796
797 static struct syscall_fmt *syscall_fmt__find(const char *name)
798 {
799         const int nmemb = ARRAY_SIZE(syscall_fmts);
800         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
801 }
802
/*
 * Per-syscall descriptor, lazily filled in by trace__read_syscall_info()
 * and kept in trace->syscalls.table, indexed by syscall id.
 */
struct syscall {
	struct event_format *tp_format;	/* sys_enter_NAME tracepoint format */
	int                 nr_args;	/* number of entries in arg_fmt */
	struct format_field *args;	/* tracepoint fields, syscall nr field dropped */
	const char          *name;	/* from syscalltbl__name() */
	bool                is_exit;	/* exit/exit_group: no sys_exit will follow */
	struct syscall_fmt  *fmt;	/* hand-written formatter entry, may be NULL */
	struct syscall_arg_fmt *arg_fmt; /* per-argument scnprintf methods */
};
812
813 /*
814  * We need to have this 'calculated' boolean because in some cases we really
815  * don't know what is the duration of a syscall, for instance, when we start
816  * a session and some threads are waiting for a syscall to finish, say 'poll',
817  * in which case all we can do is to print "( ? ) for duration and for the
818  * start timestamp.
819  */
820 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
821 {
822         double duration = (double)t / NSEC_PER_MSEC;
823         size_t printed = fprintf(fp, "(");
824
825         if (!calculated)
826                 printed += fprintf(fp, "         ");
827         else if (duration >= 1.0)
828                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
829         else if (duration >= 0.01)
830                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
831         else
832                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
833         return printed + fprintf(fp, "): ");
834 }
835
836 /**
837  * filename.ptr: The filename char pointer that will be vfs_getname'd
838  * filename.entry_str_pos: Where to insert the string translated from
839  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
840  * ret_scnprintf: syscall args may set this to a different syscall return
841  *                formatter, for instance, fcntl may return fds, file flags, etc.
842  */
843 struct thread_trace {
844         u64               entry_time;
845         bool              entry_pending;
846         unsigned long     nr_events;
847         unsigned long     pfmaj, pfmin;
848         char              *entry_str;
849         double            runtime_ms;
850         size_t            (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
851         struct {
852                 unsigned long ptr;
853                 short int     entry_str_pos;
854                 bool          pending_open;
855                 unsigned int  namelen;
856                 char          *name;
857         } filename;
858         struct {
859                 int       max;
860                 char      **table;
861         } paths;
862
863         struct intlist *syscall_stats;
864 };
865
866 static struct thread_trace *thread_trace__new(void)
867 {
868         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
869
870         if (ttrace)
871                 ttrace->paths.max = -1;
872
873         ttrace->syscall_stats = intlist__new(NULL);
874
875         return ttrace;
876 }
877
878 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
879 {
880         struct thread_trace *ttrace;
881
882         if (thread == NULL)
883                 goto fail;
884
885         if (thread__priv(thread) == NULL)
886                 thread__set_priv(thread, thread_trace__new());
887
888         if (thread__priv(thread) == NULL)
889                 goto fail;
890
891         ttrace = thread__priv(thread);
892         ++ttrace->nr_events;
893
894         return ttrace;
895 fail:
896         color_fprintf(fp, PERF_COLOR_RED,
897                       "WARNING: not enough memory, dropping samples!\n");
898         return NULL;
899 }
900
901
/*
 * Install a custom return-value formatter for the thread's in-flight
 * syscall; used by arg beautifiers such as fcntl's 'cmd', whose return
 * may be an fd, file flags, etc. Cleared in syscall__scnprintf_args().
 */
void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
{
	struct thread_trace *ttrace = thread__priv(arg->thread);

	ttrace->ret_scnprintf = ret_scnprintf;
}
909
/* Bitmask of page fault kinds to trace: major (PFMAJ) and/or minor (PFMIN). */
#define TRACE_PFMAJ             (1 << 0)
#define TRACE_PFMIN             (1 << 1)

/* Size of the per-thread ttrace->entry_str syscall entry line buffer. */
static const size_t trace__entry_str_size = 2048;
914
/*
 * Cache 'pathname' for 'fd' in the thread's paths table, growing it as
 * needed. Returns 0 on success, -1 when growing the table or duplicating
 * the string fails.
 */
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (fd > ttrace->paths.max) {
		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

		if (npath == NULL)
			return -1;

		if (ttrace->paths.max != -1) {
			/* Zero only the newly added slots. */
			memset(npath + ttrace->paths.max + 1, 0,
			       (fd - ttrace->paths.max) * sizeof(char *));
		} else {
			/* First allocation: zero the whole table. */
			memset(npath, 0, (fd + 1) * sizeof(char *));
		}

		ttrace->paths.table = npath;
		ttrace->paths.max   = fd;
	}

	ttrace->paths.table[fd] = strdup(pathname);

	return ttrace->paths.table[fd] != NULL ? 0 : -1;
}
940
/*
 * Resolve 'fd' to a path by reading its /proc/PID/fd (or, for non-leader
 * threads, /proc/PID/task/TID/fd) symlink, then cache the result via
 * trace__set_fd_pathname(). Returns 0 on success, -1 on failure.
 */
static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	/* lstat() reports the link target length; bail out if it won't fit. */
	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	/* The link may have changed between lstat() and readlink(). */
	if (ret < 0 || ret > st.st_size)
		return -1;

	pathname[ret] = '\0';	/* readlink() does not NUL-terminate */
	return trace__set_fd_pathname(thread, fd, pathname);
}
966
967 static const char *thread__fd_path(struct thread *thread, int fd,
968                                    struct trace *trace)
969 {
970         struct thread_trace *ttrace = thread__priv(thread);
971
972         if (ttrace == NULL)
973                 return NULL;
974
975         if (fd < 0)
976                 return NULL;
977
978         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
979                 if (!trace->live)
980                         return NULL;
981                 ++trace->stats.proc_getname;
982                 if (thread__read_fd_path(thread, fd))
983                         return NULL;
984         }
985
986         return ttrace->paths.table[fd];
987 }
988
989 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
990 {
991         int fd = arg->val;
992         size_t printed = scnprintf(bf, size, "%d", fd);
993         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
994
995         if (path)
996                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
997
998         return printed;
999 }
1000
1001 size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1002 {
1003         size_t printed = scnprintf(bf, size, "%d", fd);
1004         struct thread *thread = machine__find_thread(trace->host, pid, pid);
1005
1006         if (thread) {
1007                 const char *path = thread__fd_path(thread, fd, trace);
1008
1009                 if (path)
1010                         printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1011
1012                 thread__put(thread);
1013         }
1014
1015         return printed;
1016 }
1017
1018 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1019                                               struct syscall_arg *arg)
1020 {
1021         int fd = arg->val;
1022         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1023         struct thread_trace *ttrace = thread__priv(arg->thread);
1024
1025         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1026                 zfree(&ttrace->paths.table[fd]);
1027
1028         return printed;
1029 }
1030
/*
 * Remember where in ttrace->entry_str the filename for 'ptr' should be
 * spliced in once the vfs_getname tracepoint/kprobe delivers the string.
 */
static void thread__set_filename_pos(struct thread *thread, const char *bf,
				     unsigned long ptr)
{
	struct thread_trace *ttrace = thread__priv(thread);

	ttrace->filename.ptr = ptr;
	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
}
1039
1040 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1041                                               struct syscall_arg *arg)
1042 {
1043         unsigned long ptr = arg->val;
1044
1045         if (!arg->trace->vfs_getname)
1046                 return scnprintf(bf, size, "%#x", ptr);
1047
1048         thread__set_filename_pos(arg->thread, bf, ptr);
1049         return 0;
1050 }
1051
1052 static bool trace__filter_duration(struct trace *trace, double t)
1053 {
1054         return t < (trace->duration_filter * NSEC_PER_MSEC);
1055 }
1056
1057 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1058 {
1059         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1060
1061         return fprintf(fp, "%10.3f ", ts);
1062 }
1063
1064 /*
1065  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1066  * using ttrace->entry_time for a thread that receives a sys_exit without
1067  * first having received a sys_enter ("poll" issued before tracing session
1068  * starts, lost sys_enter exit due to ring buffer overflow).
1069  */
1070 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1071 {
1072         if (tstamp > 0)
1073                 return __trace__fprintf_tstamp(trace, tstamp, fp);
1074
1075         return fprintf(fp, "         ? ");
1076 }
1077
/*
 * Written from the signal handler, read from the main event loop: plain
 * 'bool' is not guaranteed to be async-signal safe, the C standard only
 * sanctions 'volatile sig_atomic_t' for this.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;	/* SIGINT: stop, but still print summary */
}
1086
1087 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1088                                         u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1089 {
1090         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1091         printed += fprintf_duration(duration, duration_calculated, fp);
1092
1093         if (trace->multiple_threads) {
1094                 if (trace->show_comm)
1095                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1096                 printed += fprintf(fp, "%d ", thread->tid);
1097         }
1098
1099         return printed;
1100 }
1101
1102 static int trace__process_event(struct trace *trace, struct machine *machine,
1103                                 union perf_event *event, struct perf_sample *sample)
1104 {
1105         int ret = 0;
1106
1107         switch (event->header.type) {
1108         case PERF_RECORD_LOST:
1109                 color_fprintf(trace->output, PERF_COLOR_RED,
1110                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1111                 ret = machine__process_lost_event(machine, event, sample);
1112                 break;
1113         default:
1114                 ret = machine__process_event(machine, event, sample);
1115                 break;
1116         }
1117
1118         return ret;
1119 }
1120
1121 static int trace__tool_process(struct perf_tool *tool,
1122                                union perf_event *event,
1123                                struct perf_sample *sample,
1124                                struct machine *machine)
1125 {
1126         struct trace *trace = container_of(tool, struct trace, tool);
1127         return trace__process_event(trace, machine, event, sample);
1128 }
1129
/*
 * Wrapper around machine__resolve_kernel_addr() that warns once when
 * kptr_restrict prevents resolving kernel addresses, then returns NULL
 * for every subsequent call.
 */
static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
	struct machine *machine = vmachine;

	if (machine->kptr_restrict_warned)
		return NULL;

	if (symbol_conf.kptr_restrict) {
		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
			   "Kernel samples will not be resolved.\n");
		machine->kptr_restrict_warned = true;
		return NULL;
	}

	return machine__resolve_kernel_addr(vmachine, addrp, modp);
}
1147
1148 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1149 {
1150         int err = symbol__init(NULL);
1151
1152         if (err)
1153                 return err;
1154
1155         trace->host = machine__new_host();
1156         if (trace->host == NULL)
1157                 return -ENOMEM;
1158
1159         err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1160         if (err < 0)
1161                 goto out;
1162
1163         err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1164                                             evlist->threads, trace__tool_process, false,
1165                                             trace->opts.proc_map_timeout, 1);
1166 out:
1167         if (err)
1168                 symbol__exit();
1169
1170         return err;
1171 }
1172
/* Undo trace__symbols_init(): drop the host machine and the symbol subsystem. */
static void trace__symbols__exit(struct trace *trace)
{
	machine__exit(trace->host);
	trace->host = NULL;

	symbol__exit();
}
1180
1181 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1182 {
1183         int idx;
1184
1185         if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1186                 nr_args = sc->fmt->nr_args;
1187
1188         sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1189         if (sc->arg_fmt == NULL)
1190                 return -1;
1191
1192         for (idx = 0; idx < nr_args; ++idx) {
1193                 if (sc->fmt)
1194                         sc->arg_fmt[idx] = sc->fmt->arg[idx];
1195         }
1196
1197         sc->nr_args = nr_args;
1198         return 0;
1199 }
1200
/*
 * For each tracepoint field without an explicit formatter, pick a default
 * scnprintf method from the field's type/name. Order matters: the
 * filename check must precede the generic pointer check.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	for (field = sc->args; field; field = field->next, ++idx) {
		/* A hand-written formatter takes precedence. */
		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
			continue;

		if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
		}
	}

	return 0;
}
1239
/*
 * Lazily fill in trace->syscalls.table[id]: name, hand-written formatter,
 * sys_enter tracepoint format and per-argument formatters, growing the
 * table as needed. Returns 0 on success, -1 on failure.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			/* Zero only the newly added entries. */
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			/* First allocation: zero everything. */
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Some syscalls are only known under an alias, e.g. stat -> newstat. */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	/* No tracepoint format available: assume the generic 6 syscall args. */
	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1300
/*
 * Translate the user-supplied syscall name list (trace->ev_qualifier)
 * into ids in trace->ev_qualifier_ids. Names may be globs matching
 * several syscalls, so the ids array can grow beyond its initial size.
 * Returns 0 on success, a negative error (freeing the ids) otherwise.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	size_t nr_allocated;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	/* Start with one slot per name, grow as globs match more syscalls. */
	nr_allocated = trace->ev_qualifier_ids.nr;
	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			/* Not an exact name, maybe a glob ("open*", etc). */
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			/* Accumulate all invalid names into one error line. */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}
matches:
		trace->ev_qualifier_ids.entries[i++] = id;
		if (match_next == -1)
			continue;

		/* Collect the remaining syscalls matching this glob. */
		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == trace->ev_qualifier_ids.nr) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.nr++;
			trace->ev_qualifier_ids.entries[i++] = id;
		}
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1376
1377 /*
1378  * args is to be interpreted as a series of longs but we need to handle
1379  * 8-byte unaligned accesses. args points to raw_data within the event
1380  * and raw_data is guaranteed to be 8-byte unaligned because it is
1381  * preceded by raw_size which is a u32. So we need to copy args to a temp
1382  * variable to read it. Most notably this avoids extended load instructions
1383  * on unaligned addresses
1384  */
1385 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1386 {
1387         unsigned long val;
1388         unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1389
1390         memcpy(&val, p, sizeof(val));
1391         return val;
1392 }
1393
1394 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1395                                       struct syscall_arg *arg)
1396 {
1397         if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1398                 return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1399
1400         return scnprintf(bf, size, "arg%d: ", arg->idx);
1401 }
1402
1403 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1404                                      struct syscall_arg *arg, unsigned long val)
1405 {
1406         if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1407                 arg->val = val;
1408                 if (sc->arg_fmt[arg->idx].parm)
1409                         arg->parm = sc->arg_fmt[arg->idx].parm;
1410                 return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1411         }
1412         return scnprintf(bf, size, "%ld", val);
1413 }
1414
/*
 * Format all of sc's arguments from the raw 'args' area into 'bf',
 * suppressing zero-valued args that have no strarray string mapping.
 * Returns the number of characters printed.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;	/* tracks arg.idx, tested against arg.mask */
	struct syscall_arg arg = {
		.args	= args,
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* A beautifier may mask args it already consumed. */
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and
			 * and we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
1487
/* Signature shared by the per-tracepoint sample handlers in this file. */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1491
/*
 * Return the (lazily initialized) descriptor for syscall 'id', or NULL
 * when the id is invalid or its info can't be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* The entry may still be unset if reading the info partially failed. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1534
1535 static void thread__update_stats(struct thread_trace *ttrace,
1536                                  int id, struct perf_sample *sample)
1537 {
1538         struct int_node *inode;
1539         struct stats *stats;
1540         u64 duration = 0;
1541
1542         inode = intlist__findnew(ttrace->syscall_stats, id);
1543         if (inode == NULL)
1544                 return;
1545
1546         stats = inode->priv;
1547         if (stats == NULL) {
1548                 stats = malloc(sizeof(struct stats));
1549                 if (stats == NULL)
1550                         return;
1551                 init_stats(stats);
1552                 inode->priv = stats;
1553         }
1554
1555         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1556                 duration = sample->time - ttrace->entry_time;
1557
1558         update_stats(stats, duration);
1559 }
1560
1561 static int trace__printf_interrupted_entry(struct trace *trace)
1562 {
1563         struct thread_trace *ttrace;
1564         size_t printed;
1565
1566         if (trace->current == NULL)
1567                 return 0;
1568
1569         ttrace = thread__priv(trace->current);
1570
1571         if (!ttrace->entry_pending)
1572                 return 0;
1573
1574         printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1575         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1576         ttrace->entry_pending = false;
1577
1578         return printed;
1579 }
1580
1581 static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1582                                  struct perf_sample *sample, struct thread *thread)
1583 {
1584         int printed = 0;
1585
1586         if (trace->print_sample) {
1587                 double ts = (double)sample->time / NSEC_PER_MSEC;
1588
1589                 printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1590                                    perf_evsel__name(evsel), ts,
1591                                    thread__comm_str(thread),
1592                                    sample->pid, sample->tid, sample->cpu);
1593         }
1594
1595         return printed;
1596 }
1597
/*
 * raw_syscalls:sys_enter handler: format the "name(args" half of the
 * strace-like line into ttrace->entry_str.  For most syscalls the line is
 * left pending until the matching sys_exit supplies the return value;
 * exit-like syscalls (sc->is_exit) are printed right away since no
 * sys_exit will ever arrive for them.  Returns 0 on success, -1 on error.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	/* Raw tracepoint "args" payload, consumed by the arg beautifiers. */
	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the per-thread line buffer. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	/*
	 * Flush a previously interrupted entry, unless filtering/summary
	 * modes would suppress that line anyway.
	 */
	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* No sys_exit coming: print the complete line immediately. */
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Track the last thread seen, to detect interrupted entries later. */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1658
1659 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1660                                     struct perf_sample *sample,
1661                                     struct callchain_cursor *cursor)
1662 {
1663         struct addr_location al;
1664         int max_stack = evsel->attr.sample_max_stack ?
1665                         evsel->attr.sample_max_stack :
1666                         trace->max_stack;
1667
1668         if (machine__resolve(trace->host, &al, sample) < 0 ||
1669             thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
1670                 return -1;
1671
1672         return 0;
1673 }
1674
1675 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1676 {
1677         /* TODO: user-configurable print_opts */
1678         const unsigned int print_opts = EVSEL__PRINT_SYM |
1679                                         EVSEL__PRINT_DSO |
1680                                         EVSEL__PRINT_UNKNOWN_AS_ADDR;
1681
1682         return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1683 }
1684
/*
 * Map a positive errno value to its symbolic name using the syscall table
 * for the architecture the event's perf environment reports.
 */
static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
	struct perf_env *env = perf_evsel__env(evsel);

	return arch_syscalls__strerrno(perf_env__arch(env), err);
}
1692
/*
 * raw_syscalls:sys_exit handler: complete the line started at sys_enter
 * with the syscall return value, formatted per-syscall when fmt hints or
 * a one-shot ret_scnprintf exist; also computes the duration, applies the
 * duration filter, updates --summary stats and prints the callchain.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* Successful open-like syscall: tie the captured pathname to the fd. */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* The sys_enter line was flushed early, see trace__printf_interrupted_entry(). */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	/*
	 * Return value formatting: negative values become "-1 ENAME (msg)",
	 * otherwise per-syscall hints (timeout/hexret/errpid/ret_scnprintf)
	 * pick the representation, defaulting to plain signed decimal.
	 * Note the cross-branch gotos into the labels below.
	 */
	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = errno_to_name(evsel, -ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (ttrace->ret_scnprintf) {
		char bf[1024];
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;	/* one-shot, armed by an arg beautifier */
		fprintf(trace->output, ") = %s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* Syscalls returning a pid: also print the child's comm when known. */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1808
1809 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1810                               union perf_event *event __maybe_unused,
1811                               struct perf_sample *sample)
1812 {
1813         struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1814         struct thread_trace *ttrace;
1815         size_t filename_len, entry_str_len, to_move;
1816         ssize_t remaining_space;
1817         char *pos;
1818         const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1819
1820         if (!thread)
1821                 goto out;
1822
1823         ttrace = thread__priv(thread);
1824         if (!ttrace)
1825                 goto out_put;
1826
1827         filename_len = strlen(filename);
1828         if (filename_len == 0)
1829                 goto out_put;
1830
1831         if (ttrace->filename.namelen < filename_len) {
1832                 char *f = realloc(ttrace->filename.name, filename_len + 1);
1833
1834                 if (f == NULL)
1835                         goto out_put;
1836
1837                 ttrace->filename.namelen = filename_len;
1838                 ttrace->filename.name = f;
1839         }
1840
1841         strcpy(ttrace->filename.name, filename);
1842         ttrace->filename.pending_open = true;
1843
1844         if (!ttrace->filename.ptr)
1845                 goto out_put;
1846
1847         entry_str_len = strlen(ttrace->entry_str);
1848         remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1849         if (remaining_space <= 0)
1850                 goto out_put;
1851
1852         if (filename_len > (size_t)remaining_space) {
1853                 filename += filename_len - remaining_space;
1854                 filename_len = remaining_space;
1855         }
1856
1857         to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1858         pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1859         memmove(pos + filename_len, pos, to_move);
1860         memcpy(pos, filename, filename_len);
1861
1862         ttrace->filename.ptr = 0;
1863         ttrace->filename.entry_str_pos = 0;
1864 out_put:
1865         thread__put(thread);
1866 out:
1867         return 0;
1868 }
1869
/*
 * sched:sched_stat_runtime handler (--sched): accumulate on-CPU time per
 * thread and globally, for the summary printed at exit.  Always returns 0.
 */
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
				     union perf_event *event __maybe_unused,
				     struct perf_sample *sample)
{
	u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
	struct thread *thread = machine__findnew_thread(trace->host,
							sample->pid,
							sample->tid);
	struct thread_trace *ttrace = thread__trace(thread, trace->output);

	if (ttrace == NULL)
		goto out_dump;

	ttrace->runtime_ms += runtime_ms;
	trace->runtime_ms += runtime_ms;
out_put:
	thread__put(thread);
	return 0;

out_dump:
	/* No per-thread state could be set up: just dump the raw fields. */
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
	       evsel->name,
	       perf_evsel__strval(evsel, sample, "comm"),
	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
	       runtime,
	       perf_evsel__intval(evsel, sample, "vruntime"));
	goto out_put;
}
1899
1900 static int bpf_output__printer(enum binary_printer_ops op,
1901                                unsigned int val, void *extra __maybe_unused, FILE *fp)
1902 {
1903         unsigned char ch = (unsigned char)val;
1904
1905         switch (op) {
1906         case BINARY_PRINT_CHAR_DATA:
1907                 return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1908         case BINARY_PRINT_DATA_BEGIN:
1909         case BINARY_PRINT_LINE_BEGIN:
1910         case BINARY_PRINT_ADDR:
1911         case BINARY_PRINT_NUM_DATA:
1912         case BINARY_PRINT_NUM_PAD:
1913         case BINARY_PRINT_SEP:
1914         case BINARY_PRINT_CHAR_PAD:
1915         case BINARY_PRINT_LINE_END:
1916         case BINARY_PRINT_DATA_END:
1917         default:
1918                 break;
1919         }
1920
1921         return 0;
1922 }
1923
1924 static void bpf_output__fprintf(struct trace *trace,
1925                                 struct perf_sample *sample)
1926 {
1927         binary__fprintf(sample->raw_data, sample->raw_size, 8,
1928                         bpf_output__printer, NULL, trace->output);
1929 }
1930
/*
 * Handler for non-syscall events (--event tracepoints, bpf-output): print a
 * timestamped line with the event's decoded fields or a text dump of the
 * raw payload, flushing any interrupted syscall entry first and appending
 * the callchain when available.  Always returns 0.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Below --min-stack: suppress the whole line. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Blank duration column, presumably to align with syscall lines - verify. */
	if (trace->trace_syscalls)
		fprintf(trace->output, "(         ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		event_format__fprintf(evsel->tp_format, sample->cpu,
				      sample->raw_data, sample->raw_size,
				      trace->output);
	}

	fprintf(trace->output, ")\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}
1971
1972 static void print_location(FILE *f, struct perf_sample *sample,
1973                            struct addr_location *al,
1974                            bool print_dso, bool print_sym)
1975 {
1976
1977         if ((verbose > 0 || print_dso) && al->map)
1978                 fprintf(f, "%s@", al->map->dso->long_name);
1979
1980         if ((verbose > 0 || print_sym) && al->sym)
1981                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1982                         al->addr - al->sym->start);
1983         else if (al->map)
1984                 fprintf(f, "0x%" PRIx64, al->addr);
1985         else
1986                 fprintf(f, "0x%" PRIx64, sample->addr);
1987 }
1988
/*
 * Software page-fault event handler (PERF_COUNT_SW_PAGE_FAULTS_{MAJ,MIN}):
 * bump the per-thread maj/min counters and, unless --summary-only, print
 * where the fault happened (sample->ip) and which address was accessed
 * (sample->addr), each resolved to symbol/DSO when possible.
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';	/* default; becomes 'x' or '?' below when the data map lookup fails */
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Below --min-stack: suppress the line entirely. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	/* Resolve the faulting instruction address. */
	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
			      sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* Resolve the accessed address, trying data maps first. */
	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		/* Not in a data map: retry in the code maps. */
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
2064
2065 static void trace__set_base_time(struct trace *trace,
2066                                  struct perf_evsel *evsel,
2067                                  struct perf_sample *sample)
2068 {
2069         /*
2070          * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2071          * and don't use sample->time unconditionally, we may end up having
2072          * some other event in the future without PERF_SAMPLE_TIME for good
2073          * reason, i.e. we may not be interested in its timestamps, just in
2074          * it taking place, picking some piece of information when it
2075          * appears in our event stream (vfs_getname comes to mind).
2076          */
2077         if (trace->base_time == 0 && !trace->full_time &&
2078             (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2079                 trace->base_time = sample->time;
2080 }
2081
2082 static int trace__process_sample(struct perf_tool *tool,
2083                                  union perf_event *event,
2084                                  struct perf_sample *sample,
2085                                  struct perf_evsel *evsel,
2086                                  struct machine *machine __maybe_unused)
2087 {
2088         struct trace *trace = container_of(tool, struct trace, tool);
2089         struct thread *thread;
2090         int err = 0;
2091
2092         tracepoint_handler handler = evsel->handler;
2093
2094         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2095         if (thread && thread__is_filtered(thread))
2096                 goto out;
2097
2098         trace__set_base_time(trace, evsel, sample);
2099
2100         if (handler) {
2101                 ++trace->nr_events;
2102                 handler(trace, evsel, event, sample);
2103         }
2104 out:
2105         thread__put(thread);
2106         return err;
2107 }
2108
2109 static int trace__record(struct trace *trace, int argc, const char **argv)
2110 {
2111         unsigned int rec_argc, i, j;
2112         const char **rec_argv;
2113         const char * const record_args[] = {
2114                 "record",
2115                 "-R",
2116                 "-m", "1024",
2117                 "-c", "1",
2118         };
2119
2120         const char * const sc_args[] = { "-e", };
2121         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2122         const char * const majpf_args[] = { "-e", "major-faults" };
2123         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2124         const char * const minpf_args[] = { "-e", "minor-faults" };
2125         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2126
2127         /* +1 is for the event string below */
2128         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2129                 majpf_args_nr + minpf_args_nr + argc;
2130         rec_argv = calloc(rec_argc + 1, sizeof(char *));
2131
2132         if (rec_argv == NULL)
2133                 return -ENOMEM;
2134
2135         j = 0;
2136         for (i = 0; i < ARRAY_SIZE(record_args); i++)
2137                 rec_argv[j++] = record_args[i];
2138
2139         if (trace->trace_syscalls) {
2140                 for (i = 0; i < sc_args_nr; i++)
2141                         rec_argv[j++] = sc_args[i];
2142
2143                 /* event string may be different for older kernels - e.g., RHEL6 */
2144                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2145                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2146                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2147                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2148                 else {
2149                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2150                         free(rec_argv);
2151                         return -1;
2152                 }
2153         }
2154
2155         if (trace->trace_pgfaults & TRACE_PFMAJ)
2156                 for (i = 0; i < majpf_args_nr; i++)
2157                         rec_argv[j++] = majpf_args[i];
2158
2159         if (trace->trace_pgfaults & TRACE_PFMIN)
2160                 for (i = 0; i < minpf_args_nr; i++)
2161                         rec_argv[j++] = minpf_args[i];
2162
2163         for (i = 0; i < (unsigned int)argc; i++)
2164                 rec_argv[j++] = argv[i];
2165
2166         return cmd_record(j, rec_argv);
2167 }
2168
2169 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2170
2171 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2172 {
2173         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2174
2175         if (IS_ERR(evsel))
2176                 return false;
2177
2178         if (perf_evsel__field(evsel, "pathname") == NULL) {
2179                 perf_evsel__delete(evsel);
2180                 return false;
2181         }
2182
2183         evsel->handler = trace__vfs_getname;
2184         perf_evlist__add(evlist, evsel);
2185         return true;
2186 }
2187
2188 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2189 {
2190         struct perf_evsel *evsel;
2191         struct perf_event_attr attr = {
2192                 .type = PERF_TYPE_SOFTWARE,
2193                 .mmap_data = 1,
2194         };
2195
2196         attr.config = config;
2197         attr.sample_period = 1;
2198
2199         event_attr_init(&attr);
2200
2201         evsel = perf_evsel__new(&attr);
2202         if (evsel)
2203                 evsel->handler = trace__pgfault;
2204
2205         return evsel;
2206 }
2207
2208 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2209 {
2210         const u32 type = event->header.type;
2211         struct perf_evsel *evsel;
2212
2213         if (type != PERF_RECORD_SAMPLE) {
2214                 trace__process_event(trace, trace->host, event, sample);
2215                 return;
2216         }
2217
2218         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2219         if (evsel == NULL) {
2220                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2221                 return;
2222         }
2223
2224         trace__set_base_time(trace, evsel, sample);
2225
2226         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2227             sample->raw_data == NULL) {
2228                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2229                        perf_evsel__name(evsel), sample->tid,
2230                        sample->cpu, sample->raw_size);
2231         } else {
2232                 tracepoint_handler handler = evsel->handler;
2233                 handler(trace, evsel, event, sample);
2234         }
2235 }
2236
/*
 * Create the raw_syscalls:sys_enter/sys_exit tracepoint events that drive
 * the strace-like output: attach their handlers, cache the tracepoint
 * field offsets (args pointer, ret value) and configure callchains.
 * Returns 0 on success, -1 on failure (partially created events deleted).
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
	perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit  = sys_exit;

	ret = 0;
out:
	return ret;

	/* Error unwind: delete whatever was created, newest first. */
out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2285
2286 static int trace__set_ev_qualifier_filter(struct trace *trace)
2287 {
2288         int err = -1;
2289         struct perf_evsel *sys_exit;
2290         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2291                                                 trace->ev_qualifier_ids.nr,
2292                                                 trace->ev_qualifier_ids.entries);
2293
2294         if (filter == NULL)
2295                 goto out_enomem;
2296
2297         if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2298                                           filter)) {
2299                 sys_exit = trace->syscalls.events.sys_exit;
2300                 err = perf_evsel__append_tp_filter(sys_exit, filter);
2301         }
2302
2303         free(filter);
2304 out:
2305         return err;
2306 out_enomem:
2307         errno = ENOMEM;
2308         goto out;
2309 }
2310
/*
 * Filter out our own pid and, when running over ssh, the sshd ancestor
 * that relays our output, to avoid a feedback loop where writing the
 * trace generates the very events being traced.
 */
static int trace__set_filter_loop_pids(struct trace *trace)
{
	unsigned int nr = 1;	/* pids[0] is always our own pid */
	pid_t pids[32] = {
		getpid(),
	};
	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);

	/* Walk up the parent chain looking for an sshd ancestor. */
	while (thread && nr < ARRAY_SIZE(pids)) {
		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);

		if (parent == NULL)
			break;

		/* NOTE(review): if machine__find_thread() returns a new
		 * reference, the threads found here are never put - verify. */
		if (!strcmp(thread__comm_str(parent), "sshd")) {
			pids[nr++] = parent->tid;
			break;
		}
		thread = parent;
	}

	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
}
2334
2335 static int trace__run(struct trace *trace, int argc, const char **argv)
2336 {
2337         struct perf_evlist *evlist = trace->evlist;
2338         struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2339         int err = -1, i;
2340         unsigned long before;
2341         const bool forks = argc > 0;
2342         bool draining = false;
2343
2344         trace->live = true;
2345
2346         if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2347                 goto out_error_raw_syscalls;
2348
2349         if (trace->trace_syscalls)
2350                 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2351
2352         if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2353                 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2354                 if (pgfault_maj == NULL)
2355                         goto out_error_mem;
2356                 perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2357                 perf_evlist__add(evlist, pgfault_maj);
2358         }
2359
2360         if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2361                 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2362                 if (pgfault_min == NULL)
2363                         goto out_error_mem;
2364                 perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2365                 perf_evlist__add(evlist, pgfault_min);
2366         }
2367
2368         if (trace->sched &&
2369             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2370                                    trace__sched_stat_runtime))
2371                 goto out_error_sched_stat_runtime;
2372
2373         err = perf_evlist__create_maps(evlist, &trace->opts.target);
2374         if (err < 0) {
2375                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2376                 goto out_delete_evlist;
2377         }
2378
2379         err = trace__symbols_init(trace, evlist);
2380         if (err < 0) {
2381                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2382                 goto out_delete_evlist;
2383         }
2384
2385         perf_evlist__config(evlist, &trace->opts, &callchain_param);
2386
2387         signal(SIGCHLD, sig_handler);
2388         signal(SIGINT, sig_handler);
2389
2390         if (forks) {
2391                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2392                                                     argv, false, NULL);
2393                 if (err < 0) {
2394                         fprintf(trace->output, "Couldn't run the workload!\n");
2395                         goto out_delete_evlist;
2396                 }
2397         }
2398
2399         err = perf_evlist__open(evlist);
2400         if (err < 0)
2401                 goto out_error_open;
2402
2403         err = bpf__apply_obj_config();
2404         if (err) {
2405                 char errbuf[BUFSIZ];
2406
2407                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2408                 pr_err("ERROR: Apply config to BPF failed: %s\n",
2409                          errbuf);
2410                 goto out_error_open;
2411         }
2412
2413         /*
2414          * Better not use !target__has_task() here because we need to cover the
2415          * case where no threads were specified in the command line, but a
2416          * workload was, and in that case we will fill in the thread_map when
2417          * we fork the workload in perf_evlist__prepare_workload.
2418          */
2419         if (trace->filter_pids.nr > 0)
2420                 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2421         else if (thread_map__pid(evlist->threads, 0) == -1)
2422                 err = trace__set_filter_loop_pids(trace);
2423
2424         if (err < 0)
2425                 goto out_error_mem;
2426
2427         if (trace->ev_qualifier_ids.nr > 0) {
2428                 err = trace__set_ev_qualifier_filter(trace);
2429                 if (err < 0)
2430                         goto out_errno;
2431
2432                 pr_debug("event qualifier tracepoint filter: %s\n",
2433                          trace->syscalls.events.sys_exit->filter);
2434         }
2435
2436         err = perf_evlist__apply_filters(evlist, &evsel);
2437         if (err < 0)
2438                 goto out_error_apply_filters;
2439
2440         err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
2441         if (err < 0)
2442                 goto out_error_mmap;
2443
2444         if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2445                 perf_evlist__enable(evlist);
2446
2447         if (forks)
2448                 perf_evlist__start_workload(evlist);
2449
2450         if (trace->opts.initial_delay) {
2451                 usleep(trace->opts.initial_delay * 1000);
2452                 perf_evlist__enable(evlist);
2453         }
2454
2455         trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2456                                   evlist->threads->nr > 1 ||
2457                                   perf_evlist__first(evlist)->attr.inherit;
2458
2459         /*
2460          * Now that we already used evsel->attr to ask the kernel to setup the
2461          * events, lets reuse evsel->attr.sample_max_stack as the limit in
2462          * trace__resolve_callchain(), allowing per-event max-stack settings
2463          * to override an explicitely set --max-stack global setting.
2464          */
2465         evlist__for_each_entry(evlist, evsel) {
2466                 if ((evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
2467                     evsel->attr.sample_max_stack == 0)
2468                         evsel->attr.sample_max_stack = trace->max_stack;
2469         }
2470 again:
2471         before = trace->nr_events;
2472
2473         for (i = 0; i < evlist->nr_mmaps; i++) {
2474                 union perf_event *event;
2475
2476                 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2477                         struct perf_sample sample;
2478
2479                         ++trace->nr_events;
2480
2481                         err = perf_evlist__parse_sample(evlist, event, &sample);
2482                         if (err) {
2483                                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2484                                 goto next_event;
2485                         }
2486
2487                         trace__handle_event(trace, event, &sample);
2488 next_event:
2489                         perf_evlist__mmap_consume(evlist, i);
2490
2491                         if (interrupted)
2492                                 goto out_disable;
2493
2494                         if (done && !draining) {
2495                                 perf_evlist__disable(evlist);
2496                                 draining = true;
2497                         }
2498                 }
2499         }
2500
2501         if (trace->nr_events == before) {
2502                 int timeout = done ? 100 : -1;
2503
2504                 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2505                         if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2506                                 draining = true;
2507
2508                         goto again;
2509                 }
2510         } else {
2511                 goto again;
2512         }
2513
2514 out_disable:
2515         thread__zput(trace->current);
2516
2517         perf_evlist__disable(evlist);
2518
2519         if (!err) {
2520                 if (trace->summary)
2521                         trace__fprintf_thread_summary(trace, trace->output);
2522
2523                 if (trace->show_tool_stats) {
2524                         fprintf(trace->output, "Stats:\n "
2525                                                " vfs_getname : %" PRIu64 "\n"
2526                                                " proc_getname: %" PRIu64 "\n",
2527                                 trace->stats.vfs_getname,
2528                                 trace->stats.proc_getname);
2529                 }
2530         }
2531
2532 out_delete_evlist:
2533         trace__symbols__exit(trace);
2534
2535         perf_evlist__delete(evlist);
2536         trace->evlist = NULL;
2537         trace->live = false;
2538         return err;
2539 {
2540         char errbuf[BUFSIZ];
2541
2542 out_error_sched_stat_runtime:
2543         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2544         goto out_error;
2545
2546 out_error_raw_syscalls:
2547         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2548         goto out_error;
2549
2550 out_error_mmap:
2551         perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2552         goto out_error;
2553
2554 out_error_open:
2555         perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2556
2557 out_error:
2558         fprintf(trace->output, "%s\n", errbuf);
2559         goto out_delete_evlist;
2560
2561 out_error_apply_filters:
2562         fprintf(trace->output,
2563                 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2564                 evsel->filter, perf_evsel__name(evsel), errno,
2565                 str_error_r(errno, errbuf, sizeof(errbuf)));
2566         goto out_delete_evlist;
2567 }
2568 out_error_mem:
2569         fprintf(trace->output, "Not enough memory to run!\n");
2570         goto out_delete_evlist;
2571
2572 out_errno:
2573         fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2574         goto out_delete_evlist;
2575 }
2576
/*
 * 'perf trace -i <file>' mode: replay a previously recorded session,
 * decoding raw_syscalls/syscalls tracepoints and page fault events from
 * the perf.data file instead of tracing live.
 *
 * Returns 0 on success, negative on error.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",       trace__vfs_getname, },
	};
	struct perf_data data = {
		.file      = {
			.path = input_name,
		},
		.mode      = PERF_DATA_MODE_READ,
		.force     = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	/* Wire up the event handlers used while reconstructing the trace */
	trace->tool.sample        = trace__process_sample;
	trace->tool.mmap          = perf_event__process_mmap;
	trace->tool.mmap2         = perf_event__process_mmap2;
	trace->tool.comm          = perf_event__process_comm;
	trace->tool.exit          = perf_event__process_exit;
	trace->tool.fork          = perf_event__process_fork;
	trace->tool.attr          = perf_event__process_attr;
	trace->tool.tracing_data  = perf_event__process_tracing_data;
	trace->tool.build_id      = perf_event__process_build_id;
	trace->tool.namespaces    = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&data, false, &trace->tool);
	if (session == NULL)
		return -1;

	/*
	 * NOTE(review): strdup() results are not checked here; on ENOMEM the
	 * pid/tid symbol filter would be silently dropped.
	 */
	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Route any recorded page fault software events to trace__pgfault() */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2677
/*
 * Print the banner that precedes the per-thread event summary.
 * Returns the number of characters written, fprintf() style.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	/* No need for an intermediate variable: return the count directly */
	return fprintf(fp, "\n Summary of events:\n\n");
}
2686
/*
 * Helper for thread__dump_stats(): DEFINE_RESORT_RB() (see rb_resort.h)
 * generates the machinery to re-sort an existing rb-tree -- here the
 * intlist of per-syscall stats -- into a new tree ordered by the first
 * argument's comparison (larger total msecs first).  The braced body
 * below is the per-node constructor the macro expects: 'nd' is the
 * source rb_node, 'entry' the destination slot being filled.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total time spent: nr of calls * average, scaled from ns to msec */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2700
/*
 * Print one thread's per-syscall statistics table (calls, total, min,
 * avg, max duration and relative stddev), most expensive syscall first.
 * Returns the number of characters printed, 0 if there is nothing to show.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	/* Re-sorts ttrace->syscall_stats using DEFINE_RESORT_RB(syscall_stats) above */
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* Durations are accumulated in nanoseconds, print as msec */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* stddev as a percentage of the mean (guard div-by-zero) */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2743
2744 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2745 {
2746         size_t printed = 0;
2747         struct thread_trace *ttrace = thread__priv(thread);
2748         double ratio;
2749
2750         if (ttrace == NULL)
2751                 return 0;
2752
2753         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2754
2755         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2756         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2757         printed += fprintf(fp, "%.1f%%", ratio);
2758         if (ttrace->pfmaj)
2759                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2760         if (ttrace->pfmin)
2761                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2762         if (trace->sched)
2763                 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2764         else if (fputc('\n', fp) != EOF)
2765                 ++printed;
2766
2767         printed += thread__dump_stats(ttrace, trace, fp);
2768
2769         return printed;
2770 }
2771
2772 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2773 {
2774         return ttrace ? ttrace->nr_events : 0;
2775 }
2776
/*
 * Re-sort machinery for the summary (see rb_resort.h): orders the
 * machine's threads by their event count so the summary lists the
 * busiest threads in a deterministic order.  The braced body is the
 * per-node constructor: 'nd' is the source rb_node, 'entry' the slot
 * being filled in the resorted tree.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2783
/*
 * Print the " Summary of events:" section: one entry per thread, sorted
 * by event count (see DEFINE_RESORT_RB(threads) above), each followed by
 * its per-syscall stats table.  Returns the number of chars printed, or
 * 0 if re-sorting any of the thread tables failed.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;
	int i;

	/* Threads live in THREADS__TABLE_SIZE hashed rb-trees; walk them all */
	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);

		if (threads == NULL) {
			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
			return 0;
		}

		resort_rb__for_each_entry(nd, threads)
			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

		resort_rb__delete(threads);
	}
	return printed;
}
2805
2806 static int trace__set_duration(const struct option *opt, const char *str,
2807                                int unset __maybe_unused)
2808 {
2809         struct trace *trace = opt->value;
2810
2811         trace->duration_filter = atof(str);
2812         return 0;
2813 }
2814
2815 static int trace__set_filter_pids(const struct option *opt, const char *str,
2816                                   int unset __maybe_unused)
2817 {
2818         int ret = -1;
2819         size_t i;
2820         struct trace *trace = opt->value;
2821         /*
2822          * FIXME: introduce a intarray class, plain parse csv and create a
2823          * { int nr, int entries[] } struct...
2824          */
2825         struct intlist *list = intlist__new(str);
2826
2827         if (list == NULL)
2828                 return -1;
2829
2830         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2831         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2832
2833         if (trace->filter_pids.entries == NULL)
2834                 goto out;
2835
2836         trace->filter_pids.entries[0] = getpid();
2837
2838         for (i = 1; i < trace->filter_pids.nr; ++i)
2839                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2840
2841         intlist__delete(list);
2842         ret = 0;
2843 out:
2844         return ret;
2845 }
2846
2847 static int trace__open_output(struct trace *trace, const char *filename)
2848 {
2849         struct stat st;
2850
2851         if (!stat(filename, &st) && st.st_size) {
2852                 char oldname[PATH_MAX];
2853
2854                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2855                 unlink(oldname);
2856                 rename(filename, oldname);
2857         }
2858
2859         trace->output = fopen(filename, "w");
2860
2861         return trace->output == NULL ? -errno : 0;
2862 }
2863
2864 static int parse_pagefaults(const struct option *opt, const char *str,
2865                             int unset __maybe_unused)
2866 {
2867         int *trace_pgfaults = opt->value;
2868
2869         if (strcmp(str, "all") == 0)
2870                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2871         else if (strcmp(str, "maj") == 0)
2872                 *trace_pgfaults |= TRACE_PFMAJ;
2873         else if (strcmp(str, "min") == 0)
2874                 *trace_pgfaults |= TRACE_PFMIN;
2875         else
2876                 return -1;
2877
2878         return 0;
2879 }
2880
2881 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2882 {
2883         struct perf_evsel *evsel;
2884
2885         evlist__for_each_entry(evlist, evsel)
2886                 evsel->handler = handler;
2887 }
2888
2889 /*
2890  * XXX: Hackish, just splitting the combined -e+--event (syscalls
2891  * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2892  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2893  *
2894  * It'd be better to introduce a parse_options() variant that would return a
2895  * list with the terms it didn't match to an event...
2896  */
2897 static int trace__parse_events_option(const struct option *opt, const char *str,
2898                                       int unset __maybe_unused)
2899 {
2900         struct trace *trace = (struct trace *)opt->value;
2901         const char *s = str;
2902         char *sep = NULL, *lists[2] = { NULL, NULL, };
2903         int len = strlen(str) + 1, err = -1, list, idx;
2904         char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2905         char group_name[PATH_MAX];
2906
2907         if (strace_groups_dir == NULL)
2908                 return -1;
2909
2910         if (*s == '!') {
2911                 ++s;
2912                 trace->not_ev_qualifier = true;
2913         }
2914
2915         while (1) {
2916                 if ((sep = strchr(s, ',')) != NULL)
2917                         *sep = '\0';
2918
2919                 list = 0;
2920                 if (syscalltbl__id(trace->sctbl, s) >= 0 ||
2921                     syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
2922                         list = 1;
2923                 } else {
2924                         path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2925                         if (access(group_name, R_OK) == 0)
2926                                 list = 1;
2927                 }
2928
2929                 if (lists[list]) {
2930                         sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2931                 } else {
2932                         lists[list] = malloc(len);
2933                         if (lists[list] == NULL)
2934                                 goto out;
2935                         strcpy(lists[list], s);
2936                 }
2937
2938                 if (!sep)
2939                         break;
2940
2941                 *sep = ',';
2942                 s = sep + 1;
2943         }
2944
2945         if (lists[1] != NULL) {
2946                 struct strlist_config slist_config = {
2947                         .dirname = strace_groups_dir,
2948                 };
2949
2950                 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2951                 if (trace->ev_qualifier == NULL) {
2952                         fputs("Not enough memory to parse event qualifier", trace->output);
2953                         goto out;
2954                 }
2955
2956                 if (trace__validate_ev_qualifier(trace))
2957                         goto out;
2958         }
2959
2960         err = 0;
2961
2962         if (lists[0]) {
2963                 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2964                                                "event selector. use 'perf list' to list available events",
2965                                                parse_events_option);
2966                 err = parse_events_option(&o, lists[0], 0);
2967         }
2968 out:
2969         if (sep)
2970                 *sep = ',';
2971
2972         return err;
2973 }
2974
/*
 * Entry point for 'perf trace': parse the command line, set up the
 * evlist and syscall table, then dispatch to 'record', replay (-i) or
 * live tracing mode.  Returns 0 on success, negative on error.
 */
int cmd_trace(int argc, const char **argv)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	struct trace trace = {
		.syscalls = {
			. max = -1,
		},
		.opts = {
			.target = {
				.uid       = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX,
			.proc_map_timeout  = 500,
		},
		.output = stderr,
		.show_comm = true,
		.trace_syscalls = true,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,
	};
	const char *output_name = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK('e', "event", &trace, "event",
		     "event/syscall selector. use 'perf list' to list available events",
		     trace__parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
		     trace__parse_events_option),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
			"print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];

	/* Dump a backtrace if perf itself crashes, to ease debugging */
	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	/* Set up the bpf-output channel used by BPF scripts (see bpf-loader) */
	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;

	/* Page fault samples need the address and a timestamp to be useful */
	if (trace.trace_pgfaults) {
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	/* UINT_MAX sentinels mean "the user didn't ask", pick defaults below */
	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/* min/max-stack imply callchains; default to DWARF unwinding if available */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
	}
#endif

	if (callchain_param.enabled) {
		/* Callchains need bigger buffers; only root can mlock that much */
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	/* NOTE(review): returns directly, skipping the out: cleanup below */
	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		pr_err("Please specify something to trace.\n");
		return -1;
	}

	if (!trace.trace_syscalls && trace.ev_qualifier) {
		pr_err("The -e option can't be used with --no-syscalls.\n");
		goto out;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* No workload and no target specified: trace the whole system */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}