perf trace: Use the fd->name beautifier as default for "fd" args
linux-2.6-block.git: tools/perf/builtin-trace.c
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace-like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace-event.h"
35 #include "util/parse-events.h"
36 #include "util/bpf-loader.h"
37 #include "callchain.h"
38 #include "syscalltbl.h"
39 #include "rb_resort.h"
40
41 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
42 #include <stdlib.h>
43 #include <linux/err.h>
44 #include <linux/filter.h>
45 #include <linux/audit.h>
46 #include <sys/ptrace.h>
47 #include <linux/random.h>
48 #include <linux/stringify.h>
49
50 #ifndef O_CLOEXEC
51 # define O_CLOEXEC              02000000
52 #endif
53
54 struct trace {
55         struct perf_tool        tool;
56         struct syscalltbl       *sctbl;
57         struct {
58                 int             max;
59                 struct syscall  *table;
60                 struct {
61                         struct perf_evsel *sys_enter,
62                                           *sys_exit;
63                 }               events;
64         } syscalls;
65         struct record_opts      opts;
66         struct perf_evlist      *evlist;
67         struct machine          *host;
68         struct thread           *current;
69         u64                     base_time;
70         FILE                    *output;
71         unsigned long           nr_events;
72         struct strlist          *ev_qualifier;
73         struct {
74                 size_t          nr;
75                 int             *entries;
76         }                       ev_qualifier_ids;
77         struct intlist          *tid_list;
78         struct intlist          *pid_list;
79         struct {
80                 size_t          nr;
81                 pid_t           *entries;
82         }                       filter_pids;
83         double                  duration_filter;
84         double                  runtime_ms;
85         struct {
86                 u64             vfs_getname,
87                                 proc_getname;
88         } stats;
89         unsigned int            max_stack;
90         unsigned int            min_stack;
91         bool                    not_ev_qualifier;
92         bool                    live;
93         bool                    full_time;
94         bool                    sched;
95         bool                    multiple_threads;
96         bool                    summary;
97         bool                    summary_only;
98         bool                    show_comm;
99         bool                    show_tool_stats;
100         bool                    trace_syscalls;
101         bool                    kernel_syscallchains;
102         bool                    force;
103         bool                    vfs_getname;
104         int                     trace_pgfaults;
105         int                     open_id;
106 };
107
108 struct tp_field {
109         int offset;
110         union {
111                 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
112                 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
113         };
114 };
115
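/*
 * The TP_UINT_FIELD*() macros below generate tp_field__u{8,16,32,64}()
 * accessors that memcpy() a fixed-width unsigned integer out of
 * sample->raw_data at the field's offset, avoiding unaligned loads; the
 * __SWAPPED variants additionally byte-swap the value for cross-endian
 * samples.
 */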
116 #define TP_UINT_FIELD(bits) \
117 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
118 { \
119         u##bits value; \
120         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
121         return value;  \
122 }
123
124 TP_UINT_FIELD(8);
125 TP_UINT_FIELD(16);
126 TP_UINT_FIELD(32);
127 TP_UINT_FIELD(64);
128
129 #define TP_UINT_FIELD__SWAPPED(bits) \
130 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
131 { \
132         u##bits value; \
133         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
134         return bswap_##bits(value);\
135 }
136
137 TP_UINT_FIELD__SWAPPED(16);
138 TP_UINT_FIELD__SWAPPED(32);
139 TP_UINT_FIELD__SWAPPED(64);
140
141 static int tp_field__init_uint(struct tp_field *field,
142                                struct format_field *format_field,
143                                bool needs_swap)
144 {
145         field->offset = format_field->offset;
146
147         switch (format_field->size) {
148         case 1:
149                 field->integer = tp_field__u8;
150                 break;
151         case 2:
152                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
153                 break;
154         case 4:
155                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
156                 break;
157         case 8:
158                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
159                 break;
160         default:
161                 return -1;
162         }
163
164         return 0;
165 }
166
167 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
168 {
169         return sample->raw_data + field->offset;
170 }
171
172 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
173 {
174         field->offset = format_field->offset;
175         field->pointer = tp_field__ptr;
176         return 0;
177 }
178
179 struct syscall_tp {
180         struct tp_field id;
181         union {
182                 struct tp_field args, ret;
183         };
184 };
185
186 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
187                                           struct tp_field *field,
188                                           const char *name)
189 {
190         struct format_field *format_field = perf_evsel__field(evsel, name);
191
192         if (format_field == NULL)
193                 return -1;
194
195         return tp_field__init_uint(field, format_field, evsel->needs_swap);
196 }
197
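/*
 * Wire up a struct syscall_tp member from the tracepoint field of the same
 * name: the macro stringifies the member name and initializes the matching
 * tp_field in the evsel's private syscall_tp.
 */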
198 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \
199         ({ struct syscall_tp *sc = evsel->priv;\
200            perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
201
202 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
203                                          struct tp_field *field,
204                                          const char *name)
205 {
206         struct format_field *format_field = perf_evsel__field(evsel, name);
207
208         if (format_field == NULL)
209                 return -1;
210
211         return tp_field__init_ptr(field, format_field);
212 }
213
214 #define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
215         ({ struct syscall_tp *sc = evsel->priv;\
216            perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
217
218 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
219 {
220         zfree(&evsel->priv);
221         perf_evsel__delete(evsel);
222 }
223
224 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
225 {
226         evsel->priv = malloc(sizeof(struct syscall_tp));
227         if (evsel->priv != NULL) {
228                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
229                         goto out_delete;
230
231                 evsel->handler = handler;
232                 return 0;
233         }
234
235         return -ENOMEM;
236
237 out_delete:
238         zfree(&evsel->priv);
239         return -ENOENT;
240 }
241
242 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
243 {
244         struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
245
246         /* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
247         if (IS_ERR(evsel))
248                 evsel = perf_evsel__newtp("syscalls", direction);
249
250         if (IS_ERR(evsel))
251                 return NULL;
252
253         if (perf_evsel__init_syscall_tp(evsel, handler))
254                 goto out_delete;
255
256         return evsel;
257
258 out_delete:
259         perf_evsel__delete_priv(evsel);
260         return NULL;
261 }
262
263 #define perf_evsel__sc_tp_uint(evsel, name, sample) \
264         ({ struct syscall_tp *fields = evsel->priv; \
265            fields->name.integer(&fields->name, sample); })
266
267 #define perf_evsel__sc_tp_ptr(evsel, name, sample) \
268         ({ struct syscall_tp *fields = evsel->priv; \
269            fields->name.pointer(&fields->name, sample); })
270
271 struct syscall_arg {
272         unsigned long val;
273         struct thread *thread;
274         struct trace  *trace;
275         void          *parm;
276         u8            idx;
277         u8            mask;
278 };
279
280 struct strarray {
281         int         offset;
282         int         nr_entries;
283         const char **entries;
284 };
285
286 #define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
287         .nr_entries = ARRAY_SIZE(array), \
288         .entries = array, \
289 }
290
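/*
 * The offset variant is for arrays whose first valid value isn't 0, e.g. the
 * epoll_ctl ops start at 1 and the tty ioctls at 0x5401: the offset is
 * subtracted from the raw argument before indexing the entries.
 */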
291 #define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
292         .offset     = off, \
293         .nr_entries = ARRAY_SIZE(array), \
294         .entries = array, \
295 }
296
297 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
298                                                 const char *intfmt,
299                                                 struct syscall_arg *arg)
300 {
301         struct strarray *sa = arg->parm;
302         int idx = arg->val - sa->offset;
303
304         if (idx < 0 || idx >= sa->nr_entries)
305                 return scnprintf(bf, size, intfmt, arg->val);
306
307         return scnprintf(bf, size, "%s", sa->entries[idx]);
308 }
309
310 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
311                                               struct syscall_arg *arg)
312 {
313         return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
314 }
315
316 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
317
318 #if defined(__i386__) || defined(__x86_64__)
319 /*
320  * FIXME: Make this available to all arches as soon as the ioctl beautifier
321  *        gets rewritten to support all arches.
322  */
323 static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
324                                                  struct syscall_arg *arg)
325 {
326         return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
327 }
328
329 #define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
330 #endif /* defined(__i386__) || defined(__x86_64__) */
331
332 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
333                                         struct syscall_arg *arg);
334
335 #define SCA_FD syscall_arg__scnprintf_fd
336
337 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
338                                            struct syscall_arg *arg)
339 {
340         int fd = arg->val;
341
342         if (fd == AT_FDCWD)
343                 return scnprintf(bf, size, "CWD");
344
345         return syscall_arg__scnprintf_fd(bf, size, arg);
346 }
347
348 #define SCA_FDAT syscall_arg__scnprintf_fd_at
349
350 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
351                                               struct syscall_arg *arg);
352
353 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
354
355 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
356                                          struct syscall_arg *arg)
357 {
358         return scnprintf(bf, size, "%#lx", arg->val);
359 }
360
361 #define SCA_HEX syscall_arg__scnprintf_hex
362
363 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
364                                          struct syscall_arg *arg)
365 {
366         return scnprintf(bf, size, "%d", arg->val);
367 }
368
369 #define SCA_INT syscall_arg__scnprintf_int
370
371 static const char *bpf_cmd[] = {
372         "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
373         "MAP_GET_NEXT_KEY", "PROG_LOAD",
374 };
375 static DEFINE_STRARRAY(bpf_cmd);
376
377 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
378 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
379
380 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
381 static DEFINE_STRARRAY(itimers);
382
383 static const char *keyctl_options[] = {
384         "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
385         "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
386         "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
387         "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
388         "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
389 };
390 static DEFINE_STRARRAY(keyctl_options);
391
392 static const char *whences[] = { "SET", "CUR", "END",
393 #ifdef SEEK_DATA
394 "DATA",
395 #endif
396 #ifdef SEEK_HOLE
397 "HOLE",
398 #endif
399 };
400 static DEFINE_STRARRAY(whences);
401
402 static const char *fcntl_cmds[] = {
403         "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
404         "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
405         "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
406         "F_GETOWNER_UIDS",
407 };
408 static DEFINE_STRARRAY(fcntl_cmds);
409
410 static const char *rlimit_resources[] = {
411         "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
412         "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
413         "RTTIME",
414 };
415 static DEFINE_STRARRAY(rlimit_resources);
416
417 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
418 static DEFINE_STRARRAY(sighow);
419
420 static const char *clockid[] = {
421         "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
422         "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
423         "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
424 };
425 static DEFINE_STRARRAY(clockid);
426
427 static const char *socket_families[] = {
428         "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
429         "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
430         "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
431         "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
432         "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
433         "ALG", "NFC", "VSOCK",
434 };
435 static DEFINE_STRARRAY(socket_families);
436
437 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
438                                                  struct syscall_arg *arg)
439 {
440         size_t printed = 0;
441         int mode = arg->val;
442
443         if (mode == F_OK) /* 0 */
444                 return scnprintf(bf, size, "F");
445 #define P_MODE(n) \
446         if (mode & n##_OK) { \
447                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
448                 mode &= ~n##_OK; \
449         }
450
451         P_MODE(R);
452         P_MODE(W);
453         P_MODE(X);
454 #undef P_MODE
455
456         if (mode)
457                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
458
459         return printed;
460 }
461
462 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
463
464 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
465                                               struct syscall_arg *arg);
466
467 #define SCA_FILENAME syscall_arg__scnprintf_filename
468
469 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
470                                                 struct syscall_arg *arg)
471 {
472         int printed = 0, flags = arg->val;
473
474 #define P_FLAG(n) \
475         if (flags & O_##n) { \
476                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
477                 flags &= ~O_##n; \
478         }
479
480         P_FLAG(CLOEXEC);
481         P_FLAG(NONBLOCK);
482 #undef P_FLAG
483
484         if (flags)
485                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
486
487         return printed;
488 }
489
490 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
491
492 #if defined(__i386__) || defined(__x86_64__)
493 /*
494  * FIXME: Make this available to all arches.
495  */
496 #define TCGETS          0x5401
497
498 static const char *tioctls[] = {
499         "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
500         "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
501         "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
502         "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
503         "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
504         "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
505         "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
506         "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
507         "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
508         "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
509         "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
510         [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
511         "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
512         "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
513         "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
514 };
515
516 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
517 #endif /* defined(__i386__) || defined(__x86_64__) */
518
519 #ifndef GRND_NONBLOCK
520 #define GRND_NONBLOCK   0x0001
521 #endif
522 #ifndef GRND_RANDOM
523 #define GRND_RANDOM     0x0002
524 #endif
525
526 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
527                                                    struct syscall_arg *arg)
528 {
529         int printed = 0, flags = arg->val;
530
531 #define P_FLAG(n) \
532         if (flags & GRND_##n) { \
533                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
534                 flags &= ~GRND_##n; \
535         }
536
537         P_FLAG(RANDOM);
538         P_FLAG(NONBLOCK);
539 #undef P_FLAG
540
541         if (flags)
542                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
543
544         return printed;
545 }
546
547 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
548
549 #define STRARRAY(arg, name, array) \
550           .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
551           .arg_parm      = { [arg] = &strarray__##array, }
552
553 #include "trace/beauty/eventfd.c"
554 #include "trace/beauty/flock.c"
555 #include "trace/beauty/futex_op.c"
556 #include "trace/beauty/mmap.c"
557 #include "trace/beauty/mode_t.c"
558 #include "trace/beauty/msg_flags.c"
559 #include "trace/beauty/open_flags.c"
560 #include "trace/beauty/perf_event_open.c"
561 #include "trace/beauty/pid.c"
562 #include "trace/beauty/sched_policy.c"
563 #include "trace/beauty/seccomp.c"
564 #include "trace/beauty/signum.c"
565 #include "trace/beauty/socket_type.c"
566 #include "trace/beauty/waitid_options.c"
567
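/*
 * Per-syscall formatting overrides: argument beautifiers, aliases and how to
 * interpret the return value. This table must be kept sorted by ->name, as
 * syscall_fmt__find() looks entries up with bsearch().
 */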
568 static struct syscall_fmt {
569         const char *name;
570         const char *alias;
571         size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
572         void       *arg_parm[6];
573         bool       errmsg;
574         bool       errpid;
575         bool       timeout;
576         bool       hexret;
577 } syscall_fmts[] = {
578         { .name     = "access",     .errmsg = true,
579           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */
580                              [1] = SCA_ACCMODE,  /* mode */ }, },
581         { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
582         { .name     = "bpf",        .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
583         { .name     = "brk",        .hexret = true,
584           .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
585         { .name     = "chdir",      .errmsg = true,
586           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
587         { .name     = "chmod",      .errmsg = true,
588           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
589         { .name     = "chroot",     .errmsg = true,
590           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
591         { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
592         { .name     = "clone",      .errpid = true, },
593         { .name     = "close",      .errmsg = true,
594           .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
595         { .name     = "connect",    .errmsg = true, },
596         { .name     = "creat",      .errmsg = true,
597           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
598         { .name     = "dup",        .errmsg = true, },
599         { .name     = "dup2",       .errmsg = true, },
600         { .name     = "dup3",       .errmsg = true, },
601         { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
602         { .name     = "eventfd2",   .errmsg = true,
603           .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
604         { .name     = "faccessat",  .errmsg = true,
605           .arg_scnprintf = { [1] = SCA_FILENAME, /* filename */ }, },
606         { .name     = "fadvise64",  .errmsg = true, },
607         { .name     = "fallocate",  .errmsg = true, },
608         { .name     = "fchdir",     .errmsg = true, },
609         { .name     = "fchmod",     .errmsg = true, },
610         { .name     = "fchmodat",   .errmsg = true,
611           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
612                              [1] = SCA_FILENAME, /* filename */ }, },
613         { .name     = "fchown",     .errmsg = true, },
614         { .name     = "fchownat",   .errmsg = true,
615           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
616                              [1] = SCA_FILENAME, /* filename */ }, },
617         { .name     = "fcntl",      .errmsg = true,
618           .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
619           .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
620         { .name     = "fdatasync",  .errmsg = true, },
621         { .name     = "flock",      .errmsg = true,
622           .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
623         { .name     = "fsetxattr",  .errmsg = true, },
624         { .name     = "fstat",      .errmsg = true, .alias = "newfstat", },
625         { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat",
626           .arg_scnprintf = { [1] = SCA_FILENAME, /* filename */ }, },
627         { .name     = "fstatfs",    .errmsg = true, },
628         { .name     = "fsync",    .errmsg = true, },
629         { .name     = "ftruncate", .errmsg = true, },
630         { .name     = "futex",      .errmsg = true,
631           .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
632         { .name     = "futimesat", .errmsg = true,
633           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
634                              [1] = SCA_FILENAME, /* filename */ }, },
635         { .name     = "getdents",   .errmsg = true, },
636         { .name     = "getdents64", .errmsg = true, },
637         { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
638         { .name     = "getpid",     .errpid = true, },
639         { .name     = "getpgid",    .errpid = true, },
640         { .name     = "getppid",    .errpid = true, },
641         { .name     = "getrandom",  .errmsg = true,
642           .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
643         { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
644         { .name     = "getxattr",    .errmsg = true,
645           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
646         { .name     = "inotify_add_watch",          .errmsg = true,
647           .arg_scnprintf = { [1] = SCA_FILENAME, /* pathname */ }, },
648         { .name     = "ioctl",      .errmsg = true,
649           .arg_scnprintf = {
650 #if defined(__i386__) || defined(__x86_64__)
651 /*
652  * FIXME: Make this available to all arches.
653  */
654                              [1] = SCA_STRHEXARRAY, /* cmd */
655                              [2] = SCA_HEX, /* arg */ },
656           .arg_parm      = { [1] = &strarray__tioctls, /* cmd */ }, },
657 #else
658                              [2] = SCA_HEX, /* arg */ }, },
659 #endif
660         { .name     = "keyctl",     .errmsg = true, STRARRAY(0, option, keyctl_options), },
661         { .name     = "kill",       .errmsg = true,
662           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
663         { .name     = "lchown",    .errmsg = true,
664           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
665         { .name     = "lgetxattr",  .errmsg = true,
666           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
667         { .name     = "linkat",     .errmsg = true,
668           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
669         { .name     = "listxattr",  .errmsg = true,
670           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
671         { .name     = "llistxattr", .errmsg = true,
672           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
673         { .name     = "lremovexattr",  .errmsg = true,
674           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
675         { .name     = "lseek",      .errmsg = true,
676           .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
677           .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
678         { .name     = "lsetxattr",  .errmsg = true,
679           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
680         { .name     = "lstat",      .errmsg = true, .alias = "newlstat",
681           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
682         { .name     = "lsxattr",    .errmsg = true,
683           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
684         { .name     = "madvise",    .errmsg = true,
685           .arg_scnprintf = { [0] = SCA_HEX,      /* start */
686                              [2] = SCA_MADV_BHV, /* behavior */ }, },
687         { .name     = "mkdir",    .errmsg = true,
688           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
689         { .name     = "mkdirat",    .errmsg = true,
690           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
691                              [1] = SCA_FILENAME, /* pathname */ }, },
692         { .name     = "mknod",      .errmsg = true,
693           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
694         { .name     = "mknodat",    .errmsg = true,
695           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
696                              [1] = SCA_FILENAME, /* filename */ }, },
697         { .name     = "mlock",      .errmsg = true,
698           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
699         { .name     = "mlockall",   .errmsg = true,
700           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
701         { .name     = "mmap",       .hexret = true,
702           .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
703                              [2] = SCA_MMAP_PROT, /* prot */
704                              [3] = SCA_MMAP_FLAGS, /* flags */ }, },
705         { .name     = "mprotect",   .errmsg = true,
706           .arg_scnprintf = { [0] = SCA_HEX, /* start */
707                              [2] = SCA_MMAP_PROT, /* prot */ }, },
708         { .name     = "mq_unlink", .errmsg = true,
709           .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
710         { .name     = "mremap",     .hexret = true,
711           .arg_scnprintf = { [0] = SCA_HEX, /* addr */
712                              [3] = SCA_MREMAP_FLAGS, /* flags */
713                              [4] = SCA_HEX, /* new_addr */ }, },
714         { .name     = "munlock",    .errmsg = true,
715           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
716         { .name     = "munmap",     .errmsg = true,
717           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
718         { .name     = "name_to_handle_at", .errmsg = true,
719           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
720         { .name     = "newfstatat", .errmsg = true,
721           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
722                              [1] = SCA_FILENAME, /* filename */ }, },
723         { .name     = "open",       .errmsg = true,
724           .arg_scnprintf = { [0] = SCA_FILENAME,   /* filename */
725                              [1] = SCA_OPEN_FLAGS, /* flags */ }, },
726         { .name     = "open_by_handle_at", .errmsg = true,
727           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
728                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
729         { .name     = "openat",     .errmsg = true,
730           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
731                              [1] = SCA_FILENAME, /* filename */
732                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
733         { .name     = "perf_event_open", .errmsg = true,
734           .arg_scnprintf = { [2] = SCA_INT, /* cpu */
735                              [3] = SCA_FD,  /* group_fd */
736                              [4] = SCA_PERF_FLAGS,  /* flags */ }, },
737         { .name     = "pipe2",      .errmsg = true,
738           .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
739         { .name     = "poll",       .errmsg = true, .timeout = true, },
740         { .name     = "ppoll",      .errmsg = true, .timeout = true, },
741         { .name     = "pread",      .errmsg = true, .alias = "pread64", },
742         { .name     = "preadv",     .errmsg = true, .alias = "pread", },
743         { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
744         { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64", },
745         { .name     = "pwritev",    .errmsg = true, },
746         { .name     = "read",       .errmsg = true, },
747         { .name     = "readlink",   .errmsg = true,
748           .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
749         { .name     = "readlinkat", .errmsg = true,
750           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
751                              [1] = SCA_FILENAME, /* pathname */ }, },
752         { .name     = "readv",      .errmsg = true, },
753         { .name     = "recvfrom",   .errmsg = true,
754           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
755         { .name     = "recvmmsg",   .errmsg = true,
756           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
757         { .name     = "recvmsg",    .errmsg = true,
758           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
759         { .name     = "removexattr", .errmsg = true,
760           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
761         { .name     = "renameat",   .errmsg = true,
762           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
763         { .name     = "rmdir",    .errmsg = true,
764           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
765         { .name     = "rt_sigaction", .errmsg = true,
766           .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
767         { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
768         { .name     = "rt_sigqueueinfo", .errmsg = true,
769           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
770         { .name     = "rt_tgsigqueueinfo", .errmsg = true,
771           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
772         { .name     = "sched_setscheduler",   .errmsg = true,
773           .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
774         { .name     = "seccomp", .errmsg = true,
775           .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
776                              [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
777         { .name     = "select",     .errmsg = true, .timeout = true, },
778         { .name     = "sendmmsg",    .errmsg = true,
779           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
780         { .name     = "sendmsg",    .errmsg = true,
781           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
782         { .name     = "sendto",     .errmsg = true,
783           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
784         { .name     = "set_tid_address", .errpid = true, },
785         { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
786         { .name     = "setpgid",    .errmsg = true, },
787         { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
788         { .name     = "setxattr",   .errmsg = true,
789           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
790         { .name     = "shutdown",   .errmsg = true, },
791         { .name     = "socket",     .errmsg = true,
792           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
793                              [1] = SCA_SK_TYPE, /* type */ },
794           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
795         { .name     = "socketpair", .errmsg = true,
796           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
797                              [1] = SCA_SK_TYPE, /* type */ },
798           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
799         { .name     = "stat",       .errmsg = true, .alias = "newstat",
800           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
801         { .name     = "statfs",     .errmsg = true,
802           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
803         { .name     = "swapoff",    .errmsg = true,
804           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
805         { .name     = "swapon",     .errmsg = true,
806           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
807         { .name     = "symlinkat",  .errmsg = true,
808           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
809         { .name     = "tgkill",     .errmsg = true,
810           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
811         { .name     = "tkill",      .errmsg = true,
812           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
813         { .name     = "truncate",   .errmsg = true,
814           .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
815         { .name     = "uname",      .errmsg = true, .alias = "newuname", },
816         { .name     = "unlinkat",   .errmsg = true,
817           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
818                              [1] = SCA_FILENAME, /* pathname */ }, },
819         { .name     = "utime",  .errmsg = true,
820           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
821         { .name     = "utimensat",  .errmsg = true,
822           .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */
823                              [1] = SCA_FILENAME, /* filename */ }, },
824         { .name     = "utimes",  .errmsg = true,
825           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
826         { .name     = "vmsplice",  .errmsg = true, },
827         { .name     = "wait4",      .errpid = true,
828           .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
829         { .name     = "waitid",     .errpid = true,
830           .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
831         { .name     = "write",      .errmsg = true, },
832         { .name     = "writev",     .errmsg = true, },
833 };
834
835 static int syscall_fmt__cmp(const void *name, const void *fmtp)
836 {
837         const struct syscall_fmt *fmt = fmtp;
838         return strcmp(name, fmt->name);
839 }
840
841 static struct syscall_fmt *syscall_fmt__find(const char *name)
842 {
843         const int nmemb = ARRAY_SIZE(syscall_fmts);
844         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
845 }
846
847 struct syscall {
848         struct event_format *tp_format;
849         int                 nr_args;
850         struct format_field *args;
851         const char          *name;
852         bool                is_exit;
853         struct syscall_fmt  *fmt;
854         size_t              (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
855         void                **arg_parm;
856 };
857
858 static size_t fprintf_duration(unsigned long t, FILE *fp)
859 {
860         double duration = (double)t / NSEC_PER_MSEC;
861         size_t printed = fprintf(fp, "(");
862
863         if (duration >= 1.0)
864                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
865         else if (duration >= 0.01)
866                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
867         else
868                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
869         return printed + fprintf(fp, "): ");
870 }
871
872 /**
873  * filename.ptr: The filename char pointer that will be vfs_getname'd
874  * filename.entry_str_pos: Where to insert the string translated from
875  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
876  */
877 struct thread_trace {
878         u64               entry_time;
879         u64               exit_time;
880         bool              entry_pending;
881         unsigned long     nr_events;
882         unsigned long     pfmaj, pfmin;
883         char              *entry_str;
884         double            runtime_ms;
885         struct {
886                 unsigned long ptr;
887                 short int     entry_str_pos;
888                 bool          pending_open;
889                 unsigned int  namelen;
890                 char          *name;
891         } filename;
892         struct {
893                 int       max;
894                 char      **table;
895         } paths;
896
897         struct intlist *syscall_stats;
898 };
899
900 static struct thread_trace *thread_trace__new(void)
901 {
902         struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
903
904         if (ttrace) {
905                 ttrace->paths.max = -1;
906                 ttrace->syscall_stats = intlist__new(NULL);
907         }
908
909         return ttrace;
910 }
911
912 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
913 {
914         struct thread_trace *ttrace;
915
916         if (thread == NULL)
917                 goto fail;
918
919         if (thread__priv(thread) == NULL)
920                 thread__set_priv(thread, thread_trace__new());
921
922         if (thread__priv(thread) == NULL)
923                 goto fail;
924
925         ttrace = thread__priv(thread);
926         ++ttrace->nr_events;
927
928         return ttrace;
929 fail:
930         color_fprintf(fp, PERF_COLOR_RED,
931                       "WARNING: not enough memory, dropping samples!\n");
932         return NULL;
933 }
934
935 #define TRACE_PFMAJ             (1 << 0)
936 #define TRACE_PFMIN             (1 << 1)
937
938 static const size_t trace__entry_str_size = 2048;
939
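/*
 * Grow the per-thread fd -> pathname table on demand, zeroing any new slots,
 * and store a copy of the path for this fd.
 */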
940 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
941 {
942         struct thread_trace *ttrace = thread__priv(thread);
943
944         if (fd > ttrace->paths.max) {
945                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
946
947                 if (npath == NULL)
948                         return -1;
949
950                 if (ttrace->paths.max != -1) {
951                         memset(npath + ttrace->paths.max + 1, 0,
952                                (fd - ttrace->paths.max) * sizeof(char *));
953                 } else {
954                         memset(npath, 0, (fd + 1) * sizeof(char *));
955                 }
956
957                 ttrace->paths.table = npath;
958                 ttrace->paths.max   = fd;
959         }
960
961         ttrace->paths.table[fd] = strdup(pathname);
962
963         return ttrace->paths.table[fd] != NULL ? 0 : -1;
964 }
965
966 static int thread__read_fd_path(struct thread *thread, int fd)
967 {
968         char linkname[PATH_MAX], pathname[PATH_MAX];
969         struct stat st;
970         int ret;
971
972         if (thread->pid_ == thread->tid) {
973                 scnprintf(linkname, sizeof(linkname),
974                           "/proc/%d/fd/%d", thread->pid_, fd);
975         } else {
976                 scnprintf(linkname, sizeof(linkname),
977                           "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
978         }
979
980         if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
981                 return -1;
982
983         ret = readlink(linkname, pathname, sizeof(pathname));
984
985         if (ret < 0 || ret > st.st_size)
986                 return -1;
987
988         pathname[ret] = '\0';
989         return trace__set_fd_pathname(thread, fd, pathname);
990 }
991
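/*
 * Resolve an fd to its path: use the cached table when possible and, only in
 * live mode, fall back to reading the /proc/<pid>/fd/<fd> symlink, accounting
 * it in the proc_getname stat.
 */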
992 static const char *thread__fd_path(struct thread *thread, int fd,
993                                    struct trace *trace)
994 {
995         struct thread_trace *ttrace = thread__priv(thread);
996
997         if (ttrace == NULL)
998                 return NULL;
999
1000         if (fd < 0)
1001                 return NULL;
1002
1003         if (fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL) {
1004                 if (!trace->live)
1005                         return NULL;
1006                 ++trace->stats.proc_getname;
1007                 if (thread__read_fd_path(thread, fd))
1008                         return NULL;
1009         }
1010
1011         return ttrace->paths.table[fd];
1012 }
1013
1014 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1015                                         struct syscall_arg *arg)
1016 {
1017         int fd = arg->val;
1018         size_t printed = scnprintf(bf, size, "%d", fd);
1019         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1020
1021         if (path)
1022                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1023
1024         return printed;
1025 }
1026
1027 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1028                                               struct syscall_arg *arg)
1029 {
1030         int fd = arg->val;
1031         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1032         struct thread_trace *ttrace = thread__priv(arg->thread);
1033
1034         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1035                 zfree(&ttrace->paths.table[fd]);
1036
1037         return printed;
1038 }
1039
1040 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1041                                      unsigned long ptr)
1042 {
1043         struct thread_trace *ttrace = thread__priv(thread);
1044
1045         ttrace->filename.ptr = ptr;
1046         ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1047 }
1048
1049 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1050                                               struct syscall_arg *arg)
1051 {
1052         unsigned long ptr = arg->val;
1053
1054         if (!arg->trace->vfs_getname)
1055                 return scnprintf(bf, size, "%#lx", ptr);
1056
1057         thread__set_filename_pos(arg->thread, bf, ptr);
1058         return 0;
1059 }
1060
1061 static bool trace__filter_duration(struct trace *trace, double t)
1062 {
1063         return t < (trace->duration_filter * NSEC_PER_MSEC);
1064 }
1065
1066 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1067 {
1068         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1069
1070         return fprintf(fp, "%10.3f ", ts);
1071 }
1072
1073 static bool done = false;
1074 static bool interrupted = false;
1075
1076 static void sig_handler(int sig)
1077 {
1078         done = true;
1079         interrupted = sig == SIGINT;
1080 }
1081
1082 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1083                                         u64 duration, u64 tstamp, FILE *fp)
1084 {
1085         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1086         printed += fprintf_duration(duration, fp);
1087
1088         if (trace->multiple_threads) {
1089                 if (trace->show_comm)
1090                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1091                 printed += fprintf(fp, "%d ", thread->tid);
1092         }
1093
1094         return printed;
1095 }
1096
1097 static int trace__process_event(struct trace *trace, struct machine *machine,
1098                                 union perf_event *event, struct perf_sample *sample)
1099 {
1100         int ret = 0;
1101
1102         switch (event->header.type) {
1103         case PERF_RECORD_LOST:
1104                 color_fprintf(trace->output, PERF_COLOR_RED,
1105                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1106                 ret = machine__process_lost_event(machine, event, sample);
1107                 break;
1108         default:
1109                 ret = machine__process_event(machine, event, sample);
1110                 break;
1111         }
1112
1113         return ret;
1114 }
1115
1116 static int trace__tool_process(struct perf_tool *tool,
1117                                union perf_event *event,
1118                                struct perf_sample *sample,
1119                                struct machine *machine)
1120 {
1121         struct trace *trace = container_of(tool, struct trace, tool);
1122         return trace__process_event(trace, machine, event, sample);
1123 }
1124
1125 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1126 {
1127         struct machine *machine = vmachine;
1128
1129         if (machine->kptr_restrict_warned)
1130                 return NULL;
1131
1132         if (symbol_conf.kptr_restrict) {
1133                 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1134                            "Check /proc/sys/kernel/kptr_restrict.\n\n"
1135                            "Kernel samples will not be resolved.\n");
1136                 machine->kptr_restrict_warned = true;
1137                 return NULL;
1138         }
1139
1140         return machine__resolve_kernel_addr(vmachine, addrp, modp);
1141 }
1142
1143 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1144 {
1145         int err = symbol__init(NULL);
1146
1147         if (err)
1148                 return err;
1149
1150         trace->host = machine__new_host();
1151         if (trace->host == NULL)
1152                 return -ENOMEM;
1153
1154         if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
1155                 return -errno;
1156
1157         err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1158                                             evlist->threads, trace__tool_process, false,
1159                                             trace->opts.proc_map_timeout);
1160         if (err)
1161                 symbol__exit();
1162
1163         return err;
1164 }
1165
1166 static int syscall__set_arg_fmts(struct syscall *sc)
1167 {
1168         struct format_field *field;
1169         int idx = 0, len;
1170
1171         sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1172         if (sc->arg_scnprintf == NULL)
1173                 return -1;
1174
1175         if (sc->fmt)
1176                 sc->arg_parm = sc->fmt->arg_parm;
1177
1178         for (field = sc->args; field; field = field->next) {
1179                 if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1180                         sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1181                 else if (field->flags & FIELD_IS_POINTER)
1182                         sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1183                 else if (strcmp(field->type, "pid_t") == 0)
1184                         sc->arg_scnprintf[idx] = SCA_PID;
1185                 else if (strcmp(field->type, "umode_t") == 0)
1186                         sc->arg_scnprintf[idx] = SCA_MODE_T;
1187                 else if ((strcmp(field->type, "int") == 0 ||
1188                           strcmp(field->type, "unsigned int") == 0 ||
1189                           strcmp(field->type, "long") == 0) &&
1190                          (len = strlen(field->name)) >= 2 &&
1191                          strcmp(field->name + len - 2, "fd") == 0) {
1192                         /*
1193                          * /sys/kernel/tracing/events/syscalls/sys_enter*
1194                          * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1195                          * 65 int
1196                          * 23 unsigned int
1197                          * 7 unsigned long
1198                          */
1199                         sc->arg_scnprintf[idx] = SCA_FD;
1200                 }
1201                 ++idx;
1202         }
1203
1204         return 0;
1205 }
1206
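/*
 * Lazily fill in the table entry for this syscall id: grow the table if
 * needed, resolve the name via the syscall table, look up its formatting
 * overrides and hook up the syscalls:sys_enter_* tracepoint format and
 * per-arg printers.
 */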
1207 static int trace__read_syscall_info(struct trace *trace, int id)
1208 {
1209         char tp_name[128];
1210         struct syscall *sc;
1211         const char *name = syscalltbl__name(trace->sctbl, id);
1212
1213         if (name == NULL)
1214                 return -1;
1215
1216         if (id > trace->syscalls.max) {
1217                 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1218
1219                 if (nsyscalls == NULL)
1220                         return -1;
1221
1222                 if (trace->syscalls.max != -1) {
1223                         memset(nsyscalls + trace->syscalls.max + 1, 0,
1224                                (id - trace->syscalls.max) * sizeof(*sc));
1225                 } else {
1226                         memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1227                 }
1228
1229                 trace->syscalls.table = nsyscalls;
1230                 trace->syscalls.max   = id;
1231         }
1232
1233         sc = trace->syscalls.table + id;
1234         sc->name = name;
1235
1236         sc->fmt  = syscall_fmt__find(sc->name);
1237
1238         snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1239         sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1240
1241         if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1242                 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1243                 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1244         }
1245
1246         if (IS_ERR(sc->tp_format))
1247                 return -1;
1248
1249         sc->args = sc->tp_format->format.fields;
1250         sc->nr_args = sc->tp_format->format.nr_fields;
1251         /*
1252          * The tracepoint format may start with a '__syscall_nr' or 'nr'
1253          * field holding the syscall number, which we don't need here, so
1254          * skip it. Older kernels don't have this field at all.
1255          */
1256         if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1257                 sc->args = sc->args->next;
1258                 --sc->nr_args;
1259         }
1260
1261         sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1262
1263         return syscall__set_arg_fmts(sc);
1264 }
1265
1266 static int trace__validate_ev_qualifier(struct trace *trace)
1267 {
1268         int err = 0, i;
1269         struct str_node *pos;
1270
1271         trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1272         trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1273                                                  sizeof(trace->ev_qualifier_ids.entries[0]));
1274
1275         if (trace->ev_qualifier_ids.entries == NULL) {
1276                 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1277                        trace->output);
1278                 err = -EINVAL;
1279                 goto out;
1280         }
1281
1282         i = 0;
1283
1284         strlist__for_each(pos, trace->ev_qualifier) {
1285                 const char *sc = pos->s;
1286                 int id = syscalltbl__id(trace->sctbl, sc);
1287
1288                 if (id < 0) {
1289                         if (err == 0) {
1290                                 fputs("Error:\tInvalid syscall ", trace->output);
1291                                 err = -EINVAL;
1292                         } else {
1293                                 fputs(", ", trace->output);
1294                         }
1295
1296                         fputs(sc, trace->output);
1297                 }
1298
1299                 trace->ev_qualifier_ids.entries[i++] = id;
1300         }
1301
1302         if (err < 0) {
1303                 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1304                       "\nHint:\tand: 'man syscalls'\n", trace->output);
1305                 zfree(&trace->ev_qualifier_ids.entries);
1306                 trace->ev_qualifier_ids.nr = 0;
1307         }
1308 out:
1309         return err;
1310 }
1311
1312 /*
1313  * args is to be interpreted as a series of longs, but we need to handle
1314  * 8-byte unaligned accesses: args points to raw_data within the event,
1315  * and raw_data is guaranteed not to be 8-byte aligned because it is
1316  * preceded by raw_size, which is a u32. So we copy args to a temporary
1317  * variable before reading it, which notably avoids extended load
1318  * instructions on unaligned addresses.
1319  */
1320
1321 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1322                                       unsigned char *args, struct trace *trace,
1323                                       struct thread *thread)
1324 {
1325         size_t printed = 0;
1326         unsigned char *p;
1327         unsigned long val;
1328
1329         if (sc->args != NULL) {
1330                 struct format_field *field;
1331                 u8 bit = 1;
1332                 struct syscall_arg arg = {
1333                         .idx    = 0,
1334                         .mask   = 0,
1335                         .trace  = trace,
1336                         .thread = thread,
1337                 };
1338
1339                 for (field = sc->args; field;
1340                      field = field->next, ++arg.idx, bit <<= 1) {
1341                         if (arg.mask & bit)
1342                                 continue;
1343
1344                         /* special care for unaligned accesses */
1345                         p = args + sizeof(unsigned long) * arg.idx;
1346                         memcpy(&val, p, sizeof(val));
1347
1348                         /*
1349                          * Suppress this argument if its value is zero and
1350                          * we don't have a string associated with it in a
1351                          * strarray.
1352                          */
1353                         if (val == 0 &&
1354                             !(sc->arg_scnprintf &&
1355                               sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1356                               sc->arg_parm[arg.idx]))
1357                                 continue;
1358
1359                         printed += scnprintf(bf + printed, size - printed,
1360                                              "%s%s: ", printed ? ", " : "", field->name);
1361                         if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1362                                 arg.val = val;
1363                                 if (sc->arg_parm)
1364                                         arg.parm = sc->arg_parm[arg.idx];
1365                                 printed += sc->arg_scnprintf[arg.idx](bf + printed,
1366                                                                       size - printed, &arg);
1367                         } else {
1368                                 printed += scnprintf(bf + printed, size - printed,
1369                                                      "%ld", val);
1370                         }
1371                 }
1372         } else if (IS_ERR(sc->tp_format)) {
1373                 /*
1374                  * If we managed to read the tracepoint /format file, then we
1375                  * may end up not having any args, like with gettid(), so only
1376                  * print the raw args when we didn't manage to read it.
1377                  */
1378                 int i = 0;
1379
1380                 while (i < 6) {
1381                         /* special care for unaligned accesses */
1382                         p = args + sizeof(unsigned long) * i;
1383                         memcpy(&val, p, sizeof(val));
1384                         printed += scnprintf(bf + printed, size - printed,
1385                                              "%sarg%d: %ld",
1386                                              printed ? ", " : "", i, val);
1387                         ++i;
1388                 }
1389         }
1390
1391         return printed;
1392 }
1393
1394 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1395                                   union perf_event *event,
1396                                   struct perf_sample *sample);
1397
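     /*
      * Return the syscall table entry for 'id', reading its info (name, format,
      * argument beautifiers) on first use.  Negative ids and entries whose info
      * can't be read result in NULL, diagnosed at higher verbosity levels.
      */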
1398 static struct syscall *trace__syscall_info(struct trace *trace,
1399                                            struct perf_evsel *evsel, int id)
1400 {
1401
1402         if (id < 0) {
1403
1404                 /*
1405                  * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1406                  * before that, leaving at a higher verbosity level till that is
1407                  * explained. Reproduced with plain ftrace with:
1408                  *
1409                  * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1410                  * grep "NR -1 " /t/trace_pipe
1411                  *
1412                  * After generating some load on the machine.
1413                  */
1414                 if (verbose > 1) {
1415                         static u64 n;
1416                         fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1417                                 id, perf_evsel__name(evsel), ++n);
1418                 }
1419                 return NULL;
1420         }
1421
1422         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1423             trace__read_syscall_info(trace, id))
1424                 goto out_cant_read;
1425
1426         if (id > trace->syscalls.max || trace->syscalls.table[id].name == NULL)
1427                 goto out_cant_read;
1428
1429         return &trace->syscalls.table[id];
1430
1431 out_cant_read:
1432         if (verbose) {
1433                 fprintf(trace->output, "Problems reading syscall %d", id);
1434                 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1435                         fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1436                 fputs(" information\n", trace->output);
1437         }
1438         return NULL;
1439 }
1440
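     /*
      * Accumulate per thread, per syscall statistics (number of calls and
      * duration) in ttrace->syscall_stats, used when a summary (-s/-S) was
      * requested.
      */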
1441 static void thread__update_stats(struct thread_trace *ttrace,
1442                                  int id, struct perf_sample *sample)
1443 {
1444         struct int_node *inode;
1445         struct stats *stats;
1446         u64 duration = 0;
1447
1448         inode = intlist__findnew(ttrace->syscall_stats, id);
1449         if (inode == NULL)
1450                 return;
1451
1452         stats = inode->priv;
1453         if (stats == NULL) {
1454                 stats = malloc(sizeof(struct stats));
1455                 if (stats == NULL)
1456                         return;
1457                 init_stats(stats);
1458                 inode->priv = stats;
1459         }
1460
1461         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1462                 duration = sample->time - ttrace->entry_time;
1463
1464         update_stats(stats, duration);
1465 }
1466
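     /*
      * If the current thread still has a sys_enter line pending, i.e. its
      * sys_exit hasn't been seen yet, flush it with a "...\n" continuation
      * marker so the event that interrupted it gets its own line.
      */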
1467 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1468 {
1469         struct thread_trace *ttrace;
1470         u64 duration;
1471         size_t printed;
1472
1473         if (trace->current == NULL)
1474                 return 0;
1475
1476         ttrace = thread__priv(trace->current);
1477
1478         if (!ttrace->entry_pending)
1479                 return 0;
1480
1481         duration = sample->time - ttrace->entry_time;
1482
1483         printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1484         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1485         ttrace->entry_pending = false;
1486
1487         return printed;
1488 }
1489
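     /*
      * raw_syscalls:sys_enter handler: format "name(args" into
      * ttrace->entry_str.  Syscalls marked is_exit never return, so their line
      * is printed right away, otherwise printing is left pending until the
      * matching sys_exit arrives.
      */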
1490 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1491                             union perf_event *event __maybe_unused,
1492                             struct perf_sample *sample)
1493 {
1494         char *msg;
1495         void *args;
1496         size_t printed = 0;
1497         struct thread *thread;
1498         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1499         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1500         struct thread_trace *ttrace;
1501
1502         if (sc == NULL)
1503                 return -1;
1504
1505         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1506         ttrace = thread__trace(thread, trace->output);
1507         if (ttrace == NULL)
1508                 goto out_put;
1509
1510         args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1511
1512         if (ttrace->entry_str == NULL) {
1513                 ttrace->entry_str = malloc(trace__entry_str_size);
1514                 if (!ttrace->entry_str)
1515                         goto out_put;
1516         }
1517
1518         if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1519                 trace__printf_interrupted_entry(trace, sample);
1520
1521         ttrace->entry_time = sample->time;
1522         msg = ttrace->entry_str;
1523         printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1524
1525         printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1526                                            args, trace, thread);
1527
1528         if (sc->is_exit) {
1529                 if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
1530                         trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
1531                         fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1532                 }
1533         } else {
1534                 ttrace->entry_pending = true;
1535                 /* See trace__vfs_getname & trace__sys_exit */
1536                 ttrace->filename.pending_open = false;
1537         }
1538
1539         if (trace->current != thread) {
1540                 thread__put(trace->current);
1541                 trace->current = thread__get(thread);
1542         }
1543         err = 0;
1544 out_put:
1545         thread__put(thread);
1546         return err;
1547 }
1548
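     /*
      * Resolve the sample's callchain into the cursor, limited to
      * trace->max_stack entries.
      */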
1549 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1550                                     struct perf_sample *sample,
1551                                     struct callchain_cursor *cursor)
1552 {
1553         struct addr_location al;
1554
1555         if (machine__resolve(trace->host, &al, sample) < 0 ||
1556             thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1557                 return -1;
1558
1559         return 0;
1560 }
1561
1562 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1563 {
1564         /* TODO: user-configurable print_opts */
1565         const unsigned int print_opts = EVSEL__PRINT_SYM |
1566                                         EVSEL__PRINT_DSO |
1567                                         EVSEL__PRINT_UNKNOWN_AS_ADDR;
1568
1569         return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1570 }
1571
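     /*
      * raw_syscalls:sys_exit handler: compute the syscall duration, apply the
      * --duration filter, associate the fd returned by open() with the pathname
      * cached by the vfs_getname probe, and print the return value, beautified
      * as an errno name, "Timeout", hex value or child comm according to
      * sc->fmt.
      */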
1572 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1573                            union perf_event *event __maybe_unused,
1574                            struct perf_sample *sample)
1575 {
1576         long ret;
1577         u64 duration = 0;
1578         struct thread *thread;
1579         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1580         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1581         struct thread_trace *ttrace;
1582
1583         if (sc == NULL)
1584                 return -1;
1585
1586         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1587         ttrace = thread__trace(thread, trace->output);
1588         if (ttrace == NULL)
1589                 goto out_put;
1590
1591         if (trace->summary)
1592                 thread__update_stats(ttrace, id, sample);
1593
1594         ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1595
1596         if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1597                 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1598                 ttrace->filename.pending_open = false;
1599                 ++trace->stats.vfs_getname;
1600         }
1601
1602         ttrace->exit_time = sample->time;
1603
1604         if (ttrace->entry_time) {
1605                 duration = sample->time - ttrace->entry_time;
1606                 if (trace__filter_duration(trace, duration))
1607                         goto out;
1608         } else if (trace->duration_filter)
1609                 goto out;
1610
1611         if (sample->callchain) {
1612                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1613                 if (callchain_ret == 0) {
1614                         if (callchain_cursor.nr < trace->min_stack)
1615                                 goto out;
1616                         callchain_ret = 1;
1617                 }
1618         }
1619
1620         if (trace->summary_only)
1621                 goto out;
1622
1623         trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
1624
1625         if (ttrace->entry_pending) {
1626                 fprintf(trace->output, "%-70s", ttrace->entry_str);
1627         } else {
1628                 fprintf(trace->output, " ... [");
1629                 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1630                 fprintf(trace->output, "]: %s()", sc->name);
1631         }
1632
1633         if (sc->fmt == NULL) {
1634 signed_print:
1635                 fprintf(trace->output, ") = %ld", ret);
1636         } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
1637                 char bf[STRERR_BUFSIZE];
1638                 const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
1639                            *e = audit_errno_to_name(-ret);
1640
1641                 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1642         } else if (ret == 0 && sc->fmt->timeout)
1643                 fprintf(trace->output, ") = 0 Timeout");
1644         else if (sc->fmt->hexret)
1645                 fprintf(trace->output, ") = %#lx", ret);
1646         else if (sc->fmt->errpid) {
1647                 struct thread *child = machine__find_thread(trace->host, ret, ret);
1648
1649                 if (child != NULL) {
1650                         fprintf(trace->output, ") = %ld", ret);
1651                         if (child->comm_set)
1652                                 fprintf(trace->output, " (%s)", thread__comm_str(child));
1653                         thread__put(child);
1654                 }
1655         } else
1656                 goto signed_print;
1657
1658         fputc('\n', trace->output);
1659
1660         if (callchain_ret > 0)
1661                 trace__fprintf_callchain(trace, sample);
1662         else if (callchain_ret < 0)
1663                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1664 out:
1665         ttrace->entry_pending = false;
1666         err = 0;
1667 out_put:
1668         thread__put(thread);
1669         return err;
1670 }
1671
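     /*
      * probe:vfs_getname handler: cache the pathname being looked up so the fd
      * returned by the in-flight open() can be mapped back to it in
      * trace__sys_exit, and splice the string into the pending entry_str at the
      * position recorded for the pathname pointer argument.
      */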
1672 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1673                               union perf_event *event __maybe_unused,
1674                               struct perf_sample *sample)
1675 {
1676         struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1677         struct thread_trace *ttrace;
1678         size_t filename_len, entry_str_len, to_move;
1679         ssize_t remaining_space;
1680         char *pos;
1681         const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1682
1683         if (!thread)
1684                 goto out;
1685
1686         ttrace = thread__priv(thread);
1687         if (!ttrace)
1688                 goto out;
1689
1690         filename_len = strlen(filename);
1691
1692         if (ttrace->filename.namelen < filename_len) {
1693                 char *f = realloc(ttrace->filename.name, filename_len + 1);
1694
1695                 if (f == NULL)
1696                         goto out;
1697
1698                 ttrace->filename.namelen = filename_len;
1699                 ttrace->filename.name = f;
1700         }
1701
1702         strcpy(ttrace->filename.name, filename);
1703         ttrace->filename.pending_open = true;
1704
1705         if (!ttrace->filename.ptr)
1706                 goto out;
1707
1708         entry_str_len = strlen(ttrace->entry_str);
1709         remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1710         if (remaining_space <= 0)
1711                 goto out;
1712
1713         if (filename_len > (size_t)remaining_space) {
1714                 filename += filename_len - remaining_space;
1715                 filename_len = remaining_space;
1716         }
1717
1718         to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1719         pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1720         memmove(pos + filename_len, pos, to_move);
1721         memcpy(pos, filename, filename_len);
1722
1723         ttrace->filename.ptr = 0;
1724         ttrace->filename.entry_str_pos = 0;
1725 out:
1726         return 0;
1727 }
1728
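     /*
      * sched:sched_stat_runtime handler (--sched): accumulate the runtime, in
      * milliseconds, both per thread and for the whole session.
      */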
1729 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1730                                      union perf_event *event __maybe_unused,
1731                                      struct perf_sample *sample)
1732 {
1733         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1734         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1735         struct thread *thread = machine__findnew_thread(trace->host,
1736                                                         sample->pid,
1737                                                         sample->tid);
1738         struct thread_trace *ttrace = thread__trace(thread, trace->output);
1739
1740         if (ttrace == NULL)
1741                 goto out_dump;
1742
1743         ttrace->runtime_ms += runtime_ms;
1744         trace->runtime_ms += runtime_ms;
1745         thread__put(thread);
1746         return 0;
1747
1748 out_dump:
1749         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1750                evsel->name,
1751                perf_evsel__strval(evsel, sample, "comm"),
1752                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1753                runtime,
1754                perf_evsel__intval(evsel, sample, "vruntime"));
1755         thread__put(thread);
1756         return 0;
1757 }
1758
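     /*
      * print_binary() callback for BPF output events: emit only the printable
      * character column, replacing non printable bytes with '.'.
      */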
1759 static void bpf_output__printer(enum binary_printer_ops op,
1760                                 unsigned int val, void *extra)
1761 {
1762         FILE *output = extra;
1763         unsigned char ch = (unsigned char)val;
1764
1765         switch (op) {
1766         case BINARY_PRINT_CHAR_DATA:
1767                 fprintf(output, "%c", isprint(ch) ? ch : '.');
1768                 break;
1769         case BINARY_PRINT_DATA_BEGIN:
1770         case BINARY_PRINT_LINE_BEGIN:
1771         case BINARY_PRINT_ADDR:
1772         case BINARY_PRINT_NUM_DATA:
1773         case BINARY_PRINT_NUM_PAD:
1774         case BINARY_PRINT_SEP:
1775         case BINARY_PRINT_CHAR_PAD:
1776         case BINARY_PRINT_LINE_END:
1777         case BINARY_PRINT_DATA_END:
1778         default:
1779                 break;
1780         }
1781 }
1782
1783 static void bpf_output__fprintf(struct trace *trace,
1784                                 struct perf_sample *sample)
1785 {
1786         print_binary(sample->raw_data, sample->raw_size, 8,
1787                      bpf_output__printer, trace->output);
1788 }
1789
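     /*
      * Handler for the other events selected with --event: print a timestamp,
      * the event name and either its tracepoint fields or, for BPF output
      * events, its raw payload.
      */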
1790 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1791                                 union perf_event *event __maybe_unused,
1792                                 struct perf_sample *sample)
1793 {
1794         int callchain_ret = 0;
1795
1796         if (sample->callchain) {
1797                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1798                 if (callchain_ret == 0) {
1799                         if (callchain_cursor.nr < trace->min_stack)
1800                                 goto out;
1801                         callchain_ret = 1;
1802                 }
1803         }
1804
1805         trace__printf_interrupted_entry(trace, sample);
1806         trace__fprintf_tstamp(trace, sample->time, trace->output);
1807
1808         if (trace->trace_syscalls)
1809                 fprintf(trace->output, "(         ): ");
1810
1811         fprintf(trace->output, "%s:", evsel->name);
1812
1813         if (perf_evsel__is_bpf_output(evsel)) {
1814                 bpf_output__fprintf(trace, sample);
1815         } else if (evsel->tp_format) {
1816                 event_format__fprintf(evsel->tp_format, sample->cpu,
1817                                       sample->raw_data, sample->raw_size,
1818                                       trace->output);
1819         }
1820
1821         fprintf(trace->output, ")\n");
1822
1823         if (callchain_ret > 0)
1824                 trace__fprintf_callchain(trace, sample);
1825         else if (callchain_ret < 0)
1826                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1827 out:
1828         return 0;
1829 }
1830
1831 static void print_location(FILE *f, struct perf_sample *sample,
1832                            struct addr_location *al,
1833                            bool print_dso, bool print_sym)
1834 {
1835
1836         if ((verbose || print_dso) && al->map)
1837                 fprintf(f, "%s@", al->map->dso->long_name);
1838
1839         if ((verbose || print_sym) && al->sym)
1840                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1841                         al->addr - al->sym->start);
1842         else if (al->map)
1843                 fprintf(f, "0x%" PRIx64, al->addr);
1844         else
1845                 fprintf(f, "0x%" PRIx64, sample->addr);
1846 }
1847
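     /*
      * Page fault handler: account major/minor faults per thread and, unless
      * only the summary was asked for, print where the fault happened (ip) and
      * the address being accessed, both resolved to map/symbol when possible.
      */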
1848 static int trace__pgfault(struct trace *trace,
1849                           struct perf_evsel *evsel,
1850                           union perf_event *event __maybe_unused,
1851                           struct perf_sample *sample)
1852 {
1853         struct thread *thread;
1854         struct addr_location al;
1855         char map_type = 'd';
1856         struct thread_trace *ttrace;
1857         int err = -1;
1858         int callchain_ret = 0;
1859
1860         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1861
1862         if (sample->callchain) {
1863                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1864                 if (callchain_ret == 0) {
1865                         if (callchain_cursor.nr < trace->min_stack)
1866                                 goto out_put;
1867                         callchain_ret = 1;
1868                 }
1869         }
1870
1871         ttrace = thread__trace(thread, trace->output);
1872         if (ttrace == NULL)
1873                 goto out_put;
1874
1875         if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
1876                 ttrace->pfmaj++;
1877         else
1878                 ttrace->pfmin++;
1879
1880         if (trace->summary_only)
1881                 goto out;
1882
1883         thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
1884                               sample->ip, &al);
1885
1886         trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
1887
1888         fprintf(trace->output, "%sfault [",
1889                 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
1890                 "maj" : "min");
1891
1892         print_location(trace->output, sample, &al, false, true);
1893
1894         fprintf(trace->output, "] => ");
1895
1896         thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
1897                                    sample->addr, &al);
1898
1899         if (!al.map) {
1900                 thread__find_addr_location(thread, sample->cpumode,
1901                                            MAP__FUNCTION, sample->addr, &al);
1902
1903                 if (al.map)
1904                         map_type = 'x';
1905                 else
1906                         map_type = '?';
1907         }
1908
1909         print_location(trace->output, sample, &al, true, false);
1910
1911         fprintf(trace->output, " (%c%c)\n", map_type, al.level);
1912
1913         if (callchain_ret > 0)
1914                 trace__fprintf_callchain(trace, sample);
1915         else if (callchain_ret < 0)
1916                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1917 out:
1918         err = 0;
1919 out_put:
1920         thread__put(thread);
1921         return err;
1922 }
1923
1924 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
1925 {
1926         if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
1927             (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
1928                 return false;
1929
1930         if (trace->pid_list || trace->tid_list)
1931                 return true;
1932
1933         return false;
1934 }
1935
1936 static void trace__set_base_time(struct trace *trace,
1937                                  struct perf_evsel *evsel,
1938                                  struct perf_sample *sample)
1939 {
1940         /*
1941          * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1942          * and don't use sample->time unconditionally, we may end up having
1943          * some other event in the future without PERF_SAMPLE_TIME for good
1944          * reason, i.e. we may not be interested in its timestamps, just in
1945          * it taking place, picking some piece of information when it
1946          * appears in our event stream (vfs_getname comes to mind).
1947          */
1948         if (trace->base_time == 0 && !trace->full_time &&
1949             (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1950                 trace->base_time = sample->time;
1951 }
1952
1953 static int trace__process_sample(struct perf_tool *tool,
1954                                  union perf_event *event,
1955                                  struct perf_sample *sample,
1956                                  struct perf_evsel *evsel,
1957                                  struct machine *machine __maybe_unused)
1958 {
1959         struct trace *trace = container_of(tool, struct trace, tool);
1960         int err = 0;
1961
1962         tracepoint_handler handler = evsel->handler;
1963
1964         if (skip_sample(trace, sample))
1965                 return 0;
1966
1967         trace__set_base_time(trace, evsel, sample);
1968
1969         if (handler) {
1970                 ++trace->nr_events;
1971                 handler(trace, evsel, event, sample);
1972         }
1973
1974         return err;
1975 }
1976
1977 static int parse_target_str(struct trace *trace)
1978 {
1979         if (trace->opts.target.pid) {
1980                 trace->pid_list = intlist__new(trace->opts.target.pid);
1981                 if (trace->pid_list == NULL) {
1982                         pr_err("Error parsing process id string\n");
1983                         return -EINVAL;
1984                 }
1985         }
1986
1987         if (trace->opts.target.tid) {
1988                 trace->tid_list = intlist__new(trace->opts.target.tid);
1989                 if (trace->tid_list == NULL) {
1990                         pr_err("Error parsing thread id string\n");
1991                         return -EINVAL;
1992                 }
1993         }
1994
1995         return 0;
1996 }
1997
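     /*
      * 'perf trace record': build a 'perf record' command line using the raw
      * syscall tracepoints (or the older syscalls:* ones), plus major/minor
      * fault events when requested, and hand it over to cmd_record().
      */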
1998 static int trace__record(struct trace *trace, int argc, const char **argv)
1999 {
2000         unsigned int rec_argc, i, j;
2001         const char **rec_argv;
2002         const char * const record_args[] = {
2003                 "record",
2004                 "-R",
2005                 "-m", "1024",
2006                 "-c", "1",
2007         };
2008
2009         const char * const sc_args[] = { "-e", };
2010         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2011         const char * const majpf_args[] = { "-e", "major-faults" };
2012         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2013         const char * const minpf_args[] = { "-e", "minor-faults" };
2014         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2015
2016         /* +1 is for the event string below */
2017         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2018                 majpf_args_nr + minpf_args_nr + argc;
2019         rec_argv = calloc(rec_argc + 1, sizeof(char *));
2020
2021         if (rec_argv == NULL)
2022                 return -ENOMEM;
2023
2024         j = 0;
2025         for (i = 0; i < ARRAY_SIZE(record_args); i++)
2026                 rec_argv[j++] = record_args[i];
2027
2028         if (trace->trace_syscalls) {
2029                 for (i = 0; i < sc_args_nr; i++)
2030                         rec_argv[j++] = sc_args[i];
2031
2032                 /* event string may be different for older kernels - e.g., RHEL6 */
2033                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2034                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2035                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2036                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2037                 else {
2038                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2039                         return -1;
2040                 }
2041         }
2042
2043         if (trace->trace_pgfaults & TRACE_PFMAJ)
2044                 for (i = 0; i < majpf_args_nr; i++)
2045                         rec_argv[j++] = majpf_args[i];
2046
2047         if (trace->trace_pgfaults & TRACE_PFMIN)
2048                 for (i = 0; i < minpf_args_nr; i++)
2049                         rec_argv[j++] = minpf_args[i];
2050
2051         for (i = 0; i < (unsigned int)argc; i++)
2052                 rec_argv[j++] = argv[i];
2053
2054         return cmd_record(j, rec_argv, NULL);
2055 }
2056
2057 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2058
2059 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2060 {
2061         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2062
2063         if (IS_ERR(evsel))
2064                 return false;
2065
2066         if (perf_evsel__field(evsel, "pathname") == NULL) {
2067                 perf_evsel__delete(evsel);
2068                 return false;
2069         }
2070
2071         evsel->handler = trace__vfs_getname;
2072         perf_evlist__add(evlist, evsel);
2073         return true;
2074 }
2075
2076 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2077 {
2078         struct perf_evsel *evsel;
2079         struct perf_event_attr attr = {
2080                 .type = PERF_TYPE_SOFTWARE,
2081                 .mmap_data = 1,
2082         };
2083
2084         attr.config = config;
2085         attr.sample_period = 1;
2086
2087         event_attr_init(&attr);
2088
2089         evsel = perf_evsel__new(&attr);
2090         if (evsel)
2091                 evsel->handler = trace__pgfault;
2092
2093         return evsel;
2094 }
2095
2096 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2097 {
2098         const u32 type = event->header.type;
2099         struct perf_evsel *evsel;
2100
2101         if (type != PERF_RECORD_SAMPLE) {
2102                 trace__process_event(trace, trace->host, event, sample);
2103                 return;
2104         }
2105
2106         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2107         if (evsel == NULL) {
2108                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2109                 return;
2110         }
2111
2112         trace__set_base_time(trace, evsel, sample);
2113
2114         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2115             sample->raw_data == NULL) {
2116                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2117                        perf_evsel__name(evsel), sample->tid,
2118                        sample->cpu, sample->raw_size);
2119         } else {
2120                 tracepoint_handler handler = evsel->handler;
2121                 handler(trace, evsel, event, sample);
2122         }
2123 }
2124
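     /*
      * Create the raw_syscalls:sys_enter/sys_exit evsels, resolve the
      * tracepoint fields accessed directly (args, ret) and, when callchains are
      * enabled, restrict the sys_exit ones to user space unless kernel syscall
      * callchains were explicitly requested.
      */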
2125 static int trace__add_syscall_newtp(struct trace *trace)
2126 {
2127         int ret = -1;
2128         struct perf_evlist *evlist = trace->evlist;
2129         struct perf_evsel *sys_enter, *sys_exit;
2130
2131         sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2132         if (sys_enter == NULL)
2133                 goto out;
2134
2135         if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2136                 goto out_delete_sys_enter;
2137
2138         sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2139         if (sys_exit == NULL)
2140                 goto out_delete_sys_enter;
2141
2142         if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2143                 goto out_delete_sys_exit;
2144
2145         perf_evlist__add(evlist, sys_enter);
2146         perf_evlist__add(evlist, sys_exit);
2147
2148         if (callchain_param.enabled && !trace->kernel_syscallchains) {
2149                 /*
2150                  * We're interested only in the user space callchain
2151                  * leading to the syscall, allow overriding that for
2152                  * debugging reasons using --kernel_syscall_callchains
2153                  */
2154                 sys_exit->attr.exclude_callchain_kernel = 1;
2155         }
2156
2157         trace->syscalls.events.sys_enter = sys_enter;
2158         trace->syscalls.events.sys_exit  = sys_exit;
2159
2160         ret = 0;
2161 out:
2162         return ret;
2163
2164 out_delete_sys_exit:
2165         perf_evsel__delete_priv(sys_exit);
2166 out_delete_sys_enter:
2167         perf_evsel__delete_priv(sys_enter);
2168         goto out;
2169 }
2170
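     /*
      * Turn the validated syscall id list into an "id" in/not-in expression and
      * append it to the filters of both the sys_enter and sys_exit tracepoints.
      */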
2171 static int trace__set_ev_qualifier_filter(struct trace *trace)
2172 {
2173         int err = -1;
2174         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2175                                                 trace->ev_qualifier_ids.nr,
2176                                                 trace->ev_qualifier_ids.entries);
2177
2178         if (filter == NULL)
2179                 goto out_enomem;
2180
2181         if (!perf_evsel__append_filter(trace->syscalls.events.sys_enter, "&&", filter))
2182                 err = perf_evsel__append_filter(trace->syscalls.events.sys_exit, "&&", filter);
2183
2184         free(filter);
2185 out:
2186         return err;
2187 out_enomem:
2188         errno = ENOMEM;
2189         goto out;
2190 }
2191
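     /*
      * Live mode: set up the evsels (syscalls, vfs_getname, page faults,
      * sched_stat_runtime), start the workload if one was given, then consume
      * the ring buffers until interrupted or until the workload finishes,
      * printing each event as it is read.
      */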
2192 static int trace__run(struct trace *trace, int argc, const char **argv)
2193 {
2194         struct perf_evlist *evlist = trace->evlist;
2195         struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2196         int err = -1, i;
2197         unsigned long before;
2198         const bool forks = argc > 0;
2199         bool draining = false;
2200
2201         trace->live = true;
2202
2203         if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2204                 goto out_error_raw_syscalls;
2205
2206         if (trace->trace_syscalls)
2207                 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2208
2209         if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2210                 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2211                 if (pgfault_maj == NULL)
2212                         goto out_error_mem;
2213                 perf_evlist__add(evlist, pgfault_maj);
2214         }
2215
2216         if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2217                 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2218                 if (pgfault_min == NULL)
2219                         goto out_error_mem;
2220                 perf_evlist__add(evlist, pgfault_min);
2221         }
2222
2223         if (trace->sched &&
2224             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2225                                    trace__sched_stat_runtime))
2226                 goto out_error_sched_stat_runtime;
2227
2228         err = perf_evlist__create_maps(evlist, &trace->opts.target);
2229         if (err < 0) {
2230                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2231                 goto out_delete_evlist;
2232         }
2233
2234         err = trace__symbols_init(trace, evlist);
2235         if (err < 0) {
2236                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2237                 goto out_delete_evlist;
2238         }
2239
2240         perf_evlist__config(evlist, &trace->opts, NULL);
2241
2242         if (callchain_param.enabled) {
2243                 bool use_identifier = false;
2244
2245                 if (trace->syscalls.events.sys_exit) {
2246                         perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2247                                                      &trace->opts, &callchain_param);
2248                         use_identifier = true;
2249                 }
2250
2251                 if (pgfault_maj) {
2252                         perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2253                         use_identifier = true;
2254                 }
2255
2256                 if (pgfault_min) {
2257                         perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2258                         use_identifier = true;
2259                 }
2260
2261                 if (use_identifier) {
2262                        /*
2263                         * Now we have evsels with different sample_ids, use
2264                         * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2265                         * from a fixed position in each ring buffer record.
2266                         *
2267                         * As of the changeset introducing this comment, this
2268                         * isn't strictly needed, as the fields that can come before
2269                         * PERF_SAMPLE_ID are all used, but we'll probably disable
2270                         * some of those for things like copying the payload of
2271                         * pointer syscall arguments, and for vfs_getname we don't
2272                         * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2273                         * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
2274                         */
2275                         perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2276                         perf_evlist__reset_sample_bit(evlist, ID);
2277                 }
2278         }
2279
2280         signal(SIGCHLD, sig_handler);
2281         signal(SIGINT, sig_handler);
2282
2283         if (forks) {
2284                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2285                                                     argv, false, NULL);
2286                 if (err < 0) {
2287                         fprintf(trace->output, "Couldn't run the workload!\n");
2288                         goto out_delete_evlist;
2289                 }
2290         }
2291
2292         err = perf_evlist__open(evlist);
2293         if (err < 0)
2294                 goto out_error_open;
2295
2296         err = bpf__apply_obj_config();
2297         if (err) {
2298                 char errbuf[BUFSIZ];
2299
2300                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2301                 pr_err("ERROR: Applying config to BPF failed: %s\n",
2302                        errbuf);
2303                 goto out_error_open;
2304         }
2305
2306         /*
2307          * Better not use !target__has_task() here because we need to cover the
2308          * case where no threads were specified in the command line, but a
2309          * workload was, and in that case we will fill in the thread_map when
2310          * we fork the workload in perf_evlist__prepare_workload.
2311          */
2312         if (trace->filter_pids.nr > 0)
2313                 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2314         else if (thread_map__pid(evlist->threads, 0) == -1)
2315                 err = perf_evlist__set_filter_pid(evlist, getpid());
2316
2317         if (err < 0)
2318                 goto out_error_mem;
2319
2320         if (trace->ev_qualifier_ids.nr > 0) {
2321                 err = trace__set_ev_qualifier_filter(trace);
2322                 if (err < 0)
2323                         goto out_errno;
2324
2325                 pr_debug("event qualifier tracepoint filter: %s\n",
2326                          trace->syscalls.events.sys_exit->filter);
2327         }
2328
2329         err = perf_evlist__apply_filters(evlist, &evsel);
2330         if (err < 0)
2331                 goto out_error_apply_filters;
2332
2333         err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2334         if (err < 0)
2335                 goto out_error_mmap;
2336
2337         if (!target__none(&trace->opts.target))
2338                 perf_evlist__enable(evlist);
2339
2340         if (forks)
2341                 perf_evlist__start_workload(evlist);
2342
2343         trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2344                                   evlist->threads->nr > 1 ||
2345                                   perf_evlist__first(evlist)->attr.inherit;
2346 again:
2347         before = trace->nr_events;
2348
2349         for (i = 0; i < evlist->nr_mmaps; i++) {
2350                 union perf_event *event;
2351
2352                 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2353                         struct perf_sample sample;
2354
2355                         ++trace->nr_events;
2356
2357                         err = perf_evlist__parse_sample(evlist, event, &sample);
2358                         if (err) {
2359                                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2360                                 goto next_event;
2361                         }
2362
2363                         trace__handle_event(trace, event, &sample);
2364 next_event:
2365                         perf_evlist__mmap_consume(evlist, i);
2366
2367                         if (interrupted)
2368                                 goto out_disable;
2369
2370                         if (done && !draining) {
2371                                 perf_evlist__disable(evlist);
2372                                 draining = true;
2373                         }
2374                 }
2375         }
2376
2377         if (trace->nr_events == before) {
2378                 int timeout = done ? 100 : -1;
2379
2380                 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2381                         if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2382                                 draining = true;
2383
2384                         goto again;
2385                 }
2386         } else {
2387                 goto again;
2388         }
2389
2390 out_disable:
2391         thread__zput(trace->current);
2392
2393         perf_evlist__disable(evlist);
2394
2395         if (!err) {
2396                 if (trace->summary)
2397                         trace__fprintf_thread_summary(trace, trace->output);
2398
2399                 if (trace->show_tool_stats) {
2400                         fprintf(trace->output, "Stats:\n "
2401                                                " vfs_getname : %" PRIu64 "\n"
2402                                                " proc_getname: %" PRIu64 "\n",
2403                                 trace->stats.vfs_getname,
2404                                 trace->stats.proc_getname);
2405                 }
2406         }
2407
2408 out_delete_evlist:
2409         perf_evlist__delete(evlist);
2410         trace->evlist = NULL;
2411         trace->live = false;
2412         return err;
2413 {
2414         char errbuf[BUFSIZ];
2415
2416 out_error_sched_stat_runtime:
2417         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2418         goto out_error;
2419
2420 out_error_raw_syscalls:
2421         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2422         goto out_error;
2423
2424 out_error_mmap:
2425         perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2426         goto out_error;
2427
2428 out_error_open:
2429         perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2430
2431 out_error:
2432         fprintf(trace->output, "%s\n", errbuf);
2433         goto out_delete_evlist;
2434
2435 out_error_apply_filters:
2436         fprintf(trace->output,
2437                 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2438                 evsel->filter, perf_evsel__name(evsel), errno,
2439                 strerror_r(errno, errbuf, sizeof(errbuf)));
2440         goto out_delete_evlist;
2441 }
2442 out_error_mem:
2443         fprintf(trace->output, "Not enough memory to run!\n");
2444         goto out_delete_evlist;
2445
2446 out_errno:
2447         fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2448         goto out_delete_evlist;
2449 }
2450
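     /*
      * 'perf trace -i perf.data': process a previously recorded session,
      * wiring up the same handlers used in live mode to the events found in
      * the file.
      */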
2451 static int trace__replay(struct trace *trace)
2452 {
2453         const struct perf_evsel_str_handler handlers[] = {
2454                 { "probe:vfs_getname",       trace__vfs_getname, },
2455         };
2456         struct perf_data_file file = {
2457                 .path  = input_name,
2458                 .mode  = PERF_DATA_MODE_READ,
2459                 .force = trace->force,
2460         };
2461         struct perf_session *session;
2462         struct perf_evsel *evsel;
2463         int err = -1;
2464
2465         trace->tool.sample        = trace__process_sample;
2466         trace->tool.mmap          = perf_event__process_mmap;
2467         trace->tool.mmap2         = perf_event__process_mmap2;
2468         trace->tool.comm          = perf_event__process_comm;
2469         trace->tool.exit          = perf_event__process_exit;
2470         trace->tool.fork          = perf_event__process_fork;
2471         trace->tool.attr          = perf_event__process_attr;
2472         trace->tool.tracing_data  = perf_event__process_tracing_data;
2473         trace->tool.build_id      = perf_event__process_build_id;
2474
2475         trace->tool.ordered_events = true;
2476         trace->tool.ordering_requires_timestamps = true;
2477
2478         /* add tid to output */
2479         trace->multiple_threads = true;
2480
2481         session = perf_session__new(&file, false, &trace->tool);
2482         if (session == NULL)
2483                 return -1;
2484
2485         if (symbol__init(&session->header.env) < 0)
2486                 goto out;
2487
2488         trace->host = &session->machines.host;
2489
2490         err = perf_session__set_tracepoints_handlers(session, handlers);
2491         if (err)
2492                 goto out;
2493
2494         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2495                                                      "raw_syscalls:sys_enter");
2496         /* older kernels have syscalls tp versus raw_syscalls */
2497         if (evsel == NULL)
2498                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2499                                                              "syscalls:sys_enter");
2500
2501         if (evsel &&
2502             (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2503             perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2504                 pr_err("Error initializing raw_syscalls:sys_enter event\n");
2505                 goto out;
2506         }
2507
2508         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2509                                                      "raw_syscalls:sys_exit");
2510         if (evsel == NULL)
2511                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2512                                                              "syscalls:sys_exit");
2513         if (evsel &&
2514             (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2515             perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2516                 pr_err("Error initializing raw_syscalls:sys_exit event\n");
2517                 goto out;
2518         }
2519
2520         evlist__for_each(session->evlist, evsel) {
2521                 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2522                     (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2523                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2524                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2525                         evsel->handler = trace__pgfault;
2526         }
2527
2528         err = parse_target_str(trace);
2529         if (err != 0)
2530                 goto out;
2531
2532         setup_pager();
2533
2534         err = perf_session__process_events(session);
2535         if (err)
2536                 pr_err("Failed to process events, error %d", err);
2537
2538         else if (trace->summary)
2539                 trace__fprintf_thread_summary(trace, trace->output);
2540
2541 out:
2542         perf_session__delete(session);
2543
2544         return err;
2545 }
2546
2547 static size_t trace__fprintf_threads_header(FILE *fp)
2548 {
2549         size_t printed;
2550
2551         printed  = fprintf(fp, "\n Summary of events:\n\n");
2552
2553         return printed;
2554 }
2555
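     /*
      * Resort the per syscall stats rbtree by the total time spent in each
      * syscall (msecs), for use in the per thread summary table.
      */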
2556 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2557         struct stats    *stats;
2558         double          msecs;
2559         int             syscall;
2560 )
2561 {
2562         struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2563         struct stats *stats = source->priv;
2564
2565         entry->syscall = source->i;
2566         entry->stats   = stats;
2567         entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2568 }
2569
2570 static size_t thread__dump_stats(struct thread_trace *ttrace,
2571                                  struct trace *trace, FILE *fp)
2572 {
2573         size_t printed = 0;
2574         struct syscall *sc;
2575         struct rb_node *nd;
2576         DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2577
2578         if (syscall_stats == NULL)
2579                 return 0;
2580
2581         printed += fprintf(fp, "\n");
2582
2583         printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2584         printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2585         printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2586
2587         resort_rb__for_each(nd, syscall_stats) {
2588                 struct stats *stats = syscall_stats_entry->stats;
2589                 if (stats) {
2590                         double min = (double)(stats->min) / NSEC_PER_MSEC;
2591                         double max = (double)(stats->max) / NSEC_PER_MSEC;
2592                         double avg = avg_stats(stats);
2593                         double pct;
2594                         u64 n = (u64) stats->n;
2595
2596                         pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2597                         avg /= NSEC_PER_MSEC;
2598
2599                         sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2600                         printed += fprintf(fp, "   %-15s", sc->name);
2601                         printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2602                                            n, syscall_stats_entry->msecs, min, avg);
2603                         printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2604                 }
2605         }
2606
2607         resort_rb__delete(syscall_stats);
2608         printed += fprintf(fp, "\n\n");
2609
2610         return printed;
2611 }
2612
2613 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2614 {
2615         size_t printed = 0;
2616         struct thread_trace *ttrace = thread__priv(thread);
2617         double ratio;
2618
2619         if (ttrace == NULL)
2620                 return 0;
2621
2622         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2623
2624         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2625         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2626         printed += fprintf(fp, "%.1f%%", ratio);
2627         if (ttrace->pfmaj)
2628                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2629         if (ttrace->pfmin)
2630                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2631         if (trace->sched)
2632                 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2633         else if (fputc('\n', fp) != EOF)
2634                 ++printed;
2635
2636         printed += thread__dump_stats(ttrace, trace, fp);
2637
2638         return printed;
2639 }
2640
2641 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2642 {
2643         return ttrace ? ttrace->nr_events : 0;
2644 }
2645
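     /*
      * Resort the machine's threads by the number of events each generated, so
      * the per thread summaries come out in that order.
      */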
2646 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2647         struct thread *thread;
2648 )
2649 {
2650         entry->thread = rb_entry(nd, struct thread, rb_node);
2651 }
2652
2653 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2654 {
2655         DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
2656         size_t printed = trace__fprintf_threads_header(fp);
2657         struct rb_node *nd;
2658
2659         if (threads == NULL) {
2660                 fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2661                 return 0;
2662         }
2663
2664         resort_rb__for_each(nd, threads)
2665                 printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2666
2667         resort_rb__delete(threads);
2668
2669         return printed;
2670 }
2671
2672 static int trace__set_duration(const struct option *opt, const char *str,
2673                                int unset __maybe_unused)
2674 {
2675         struct trace *trace = opt->value;
2676
2677         trace->duration_filter = atof(str);
2678         return 0;
2679 }
2680
2681 static int trace__set_filter_pids(const struct option *opt, const char *str,
2682                                   int unset __maybe_unused)
2683 {
2684         int ret = -1;
2685         size_t i;
2686         struct trace *trace = opt->value;
2687         /*
2688          * FIXME: introduce a intarray class, plain parse csv and create a
2689          * { int nr, int entries[] } struct...
2690          */
2691         struct intlist *list = intlist__new(str);
2692
2693         if (list == NULL)
2694                 return -1;
2695
2696         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2697         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2698
2699         if (trace->filter_pids.entries == NULL)
2700                 goto out;
2701
2702         trace->filter_pids.entries[0] = getpid();
2703
2704         for (i = 1; i < trace->filter_pids.nr; ++i)
2705                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2706
2707         intlist__delete(list);
2708         ret = 0;
2709 out:
2710         return ret;
2711 }
2712
2713 static int trace__open_output(struct trace *trace, const char *filename)
2714 {
2715         struct stat st;
2716
2717         if (!stat(filename, &st) && st.st_size) {
2718                 char oldname[PATH_MAX];
2719
2720                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2721                 unlink(oldname);
2722                 rename(filename, oldname);
2723         }
2724
2725         trace->output = fopen(filename, "w");
2726
2727         return trace->output == NULL ? -errno : 0;
2728 }
2729
2730 static int parse_pagefaults(const struct option *opt, const char *str,
2731                             int unset __maybe_unused)
2732 {
2733         int *trace_pgfaults = opt->value;
2734
2735         if (strcmp(str, "all") == 0)
2736                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2737         else if (strcmp(str, "maj") == 0)
2738                 *trace_pgfaults |= TRACE_PFMAJ;
2739         else if (strcmp(str, "min") == 0)
2740                 *trace_pgfaults |= TRACE_PFMIN;
2741         else
2742                 return -1;
2743
2744         return 0;
2745 }
2746
2747 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2748 {
2749         struct perf_evsel *evsel;
2750
2751         evlist__for_each(evlist, evsel)
2752                 evsel->handler = handler;
2753 }
2754
2755 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2756 {
2757         const char *trace_usage[] = {
2758                 "perf trace [<options>] [<command>]",
2759                 "perf trace [<options>] -- <command> [<options>]",
2760                 "perf trace record [<options>] [<command>]",
2761                 "perf trace record [<options>] -- <command> [<options>]",
2762                 NULL
2763         };
2764         struct trace trace = {
2765                 .syscalls = {
2766                         .max = -1,
2767                 },
2768                 .opts = {
2769                         .target = {
2770                                 .uid       = UINT_MAX,
2771                                 .uses_mmap = true,
2772                         },
2773                         .user_freq     = UINT_MAX,
2774                         .user_interval = ULLONG_MAX,
2775                         .no_buffering  = true,
2776                         .mmap_pages    = UINT_MAX,
2777                         .proc_map_timeout  = 500,
2778                 },
2779                 .output = stderr,
2780                 .show_comm = true,
2781                 .trace_syscalls = true,
2782                 .kernel_syscallchains = false,
2783                 .max_stack = UINT_MAX,
2784         };
2785         const char *output_name = NULL;
2786         const char *ev_qualifier_str = NULL;
2787         const struct option trace_options[] = {
2788         OPT_CALLBACK(0, "event", &trace.evlist, "event",
2789                      "event selector. use 'perf list' to list available events",
2790                      parse_events_option),
2791         OPT_BOOLEAN(0, "comm", &trace.show_comm,
2792                     "show the thread COMM next to its id"),
2793         OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2794         OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
2795         OPT_STRING('o', "output", &output_name, "file", "output file name"),
2796         OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2797         OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2798                     "trace events on existing process id"),
2799         OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2800                     "trace events on existing thread id"),
2801         OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2802                      "pids to filter (by the kernel)", trace__set_filter_pids),
2803         OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2804                     "system-wide collection from all CPUs"),
2805         OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2806                     "list of cpus to monitor"),
2807         OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2808                     "child tasks do not inherit counters"),
2809         OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2810                      "number of mmap data pages",
2811                      perf_evlist__parse_mmap_pages),
2812         OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2813                    "user to profile"),
2814         OPT_CALLBACK(0, "duration", &trace, "float",
2815                      "show only events with duration > N.M ms",
2816                      trace__set_duration),
2817         OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2818         OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2819         OPT_BOOLEAN('T', "time", &trace.full_time,
2820                     "Show full timestamp, not time relative to first start"),
2821         OPT_BOOLEAN('s', "summary", &trace.summary_only,
2822                     "Show only syscall summary with statistics"),
2823         OPT_BOOLEAN('S', "with-summary", &trace.summary,
2824                     "Show all syscalls and summary with statistics"),
2825         OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2826                      "Trace pagefaults", parse_pagefaults, "maj"),
2827         OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2828         OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2829         OPT_CALLBACK(0, "call-graph", &trace.opts,
2830                      "record_mode[,record_size]", record_callchain_help,
2831                      &record_parse_callchain_opt),
2832         OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
2833                     "Show the kernel callchains on the syscall exit path"),
2834         OPT_UINTEGER(0, "min-stack", &trace.min_stack,
2835                      "Set the minimum stack depth when parsing the callchain; "
2836                      "anything below the specified depth will be ignored."),
2837         OPT_UINTEGER(0, "max-stack", &trace.max_stack,
2838                      "Set the maximum stack depth when parsing the callchain; "
2839                      "anything beyond the specified depth will be ignored. "
2840                      "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
2841         OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2842                         "per thread proc mmap processing timeout in ms"),
2843         OPT_END()
2844         };
2845         bool __maybe_unused max_stack_user_set = true;
2846         bool mmap_pages_user_set = true;
2847         const char * const trace_subcommands[] = { "record", NULL };
2848         int err;
2849         char bf[BUFSIZ];
2850
2851         signal(SIGSEGV, sighandler_dump_stack);
2852         signal(SIGFPE, sighandler_dump_stack);
2853
2854         trace.evlist = perf_evlist__new();
2855         trace.sctbl = syscalltbl__new();
2856
2857         if (trace.evlist == NULL || trace.sctbl == NULL) {
2858                 pr_err("Not enough memory to run!\n");
2859                 err = -ENOMEM;
2860                 goto out;
2861         }
2862
2863         argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2864                                  trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2865
2866         err = bpf__setup_stdout(trace.evlist);
2867         if (err) {
2868                 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
2869                 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
2870                 goto out;
2871         }
2872
2873         err = -1;
2874
2875         if (trace.trace_pgfaults) {
2876                 trace.opts.sample_address = true;
2877                 trace.opts.sample_time = true;
2878         }
2879
2880         if (trace.opts.mmap_pages == UINT_MAX)
2881                 mmap_pages_user_set = false;
2882
2883         if (trace.max_stack == UINT_MAX) {
2884                 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
2885                 max_stack_user_set = false;
2886         }
2887
2888 #ifdef HAVE_DWARF_UNWIND_SUPPORT
2889         if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
2890                 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
2891 #endif
2892
2893         if (callchain_param.enabled) {
2894                 if (!mmap_pages_user_set && geteuid() == 0)
2895                         trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
2896
2897                 symbol_conf.use_callchain = true;
2898         }
2899
2900         if (trace.evlist->nr_entries > 0)
2901                 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2902
2903         if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2904                 return trace__record(&trace, argc-1, &argv[1]);
2905
2906         /* summary_only implies summary option, but don't overwrite summary if set */
2907         if (trace.summary_only)
2908                 trace.summary = trace.summary_only;
2909
2910         if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2911             trace.evlist->nr_entries == 0 /* Was --event used? */) {
2912                 pr_err("Please specify something to trace.\n");
2913                 return -1;
2914         }
2915
2916         if (!trace.trace_syscalls && ev_qualifier_str) {
2917                 pr_err("The -e option can't be used with --no-syscalls.\n");
2918                 goto out;
2919         }
2920
2921         if (output_name != NULL) {
2922                 err = trace__open_output(&trace, output_name);
2923                 if (err < 0) {
2924                         perror("failed to create output file");
2925                         goto out;
2926                 }
2927         }
2928
2929         trace.open_id = syscalltbl__id(trace.sctbl, "open");
2930
2931         if (ev_qualifier_str != NULL) {
2932                 const char *s = ev_qualifier_str;
2933                 struct strlist_config slist_config = {
2934                         .dirname = system_path(STRACE_GROUPS_DIR),
2935                 };
2936
2937                 trace.not_ev_qualifier = *s == '!';
2938                 if (trace.not_ev_qualifier)
2939                         ++s;
2940                 trace.ev_qualifier = strlist__new(s, &slist_config);
2941                 if (trace.ev_qualifier == NULL) {
2942                 fputs("Not enough memory to parse event qualifier\n",
2943                               trace.output);
2944                         err = -ENOMEM;
2945                         goto out_close;
2946                 }
2947
2948                 err = trace__validate_ev_qualifier(&trace);
2949                 if (err)
2950                         goto out_close;
2951         }
2952
2953         err = target__validate(&trace.opts.target);
2954         if (err) {
2955                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2956                 fprintf(trace.output, "%s", bf);
2957                 goto out_close;
2958         }
2959
2960         err = target__parse_uid(&trace.opts.target);
2961         if (err) {
2962                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2963                 fprintf(trace.output, "%s", bf);
2964                 goto out_close;
2965         }
2966
2967         if (!argc && target__none(&trace.opts.target))
2968                 trace.opts.target.system_wide = true;
2969
2970         if (input_name)
2971                 err = trace__replay(&trace);
2972         else
2973                 err = trace__run(&trace, argc, argv);
2974
2975 out_close:
2976         if (output_name != NULL)
2977                 fclose(trace.output);
2978 out:
2979         return err;
2980 }