perf trace: Add getrandom beautifier related defines for older systems
[linux-2.6-block.git] / tools / perf / builtin-trace.c
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace-event.h"
35 #include "util/parse-events.h"
36 #include "util/bpf-loader.h"
37 #include "callchain.h"
38 #include "syscalltbl.h"
39
40 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
41 #include <stdlib.h>
42 #include <sys/mman.h>
43 #include <linux/futex.h>
44 #include <linux/err.h>
45 #include <linux/seccomp.h>
46 #include <linux/filter.h>
47 #include <linux/audit.h>
48 #include <sys/ptrace.h>
49 #include <linux/random.h>
50
/*
 * Fallback definitions for older distros whose headers lack some of the
 * constants used by the beautifiers below.  Each value is only supplied
 * when the system headers did not already define it.
 */

/* mmap() / madvise() */
#ifndef MAP_STACK
#define MAP_STACK               0x20000
#endif
#ifndef MADV_HWPOISON
#define MADV_HWPOISON           100
#endif
#ifndef MADV_MERGEABLE
#define MADV_MERGEABLE          12
#endif
#ifndef MADV_UNMERGEABLE
#define MADV_UNMERGEABLE        13
#endif

/* eventfd() */
#ifndef EFD_SEMAPHORE
#define EFD_SEMAPHORE           1
#endif
#ifndef EFD_NONBLOCK
#define EFD_NONBLOCK            00004000
#endif
#ifndef EFD_CLOEXEC
#define EFD_CLOEXEC             02000000
#endif

/* open() */
#ifndef O_CLOEXEC
#define O_CLOEXEC               02000000
#endif

/* socket() / recvmsg() */
#ifndef SOCK_DCCP
#define SOCK_DCCP               6
#endif
#ifndef SOCK_CLOEXEC
#define SOCK_CLOEXEC            02000000
#endif
#ifndef SOCK_NONBLOCK
#define SOCK_NONBLOCK           00004000
#endif
#ifndef MSG_CMSG_CLOEXEC
#define MSG_CMSG_CLOEXEC        0x40000000
#endif

/* perf_event_open() */
#ifndef PERF_FLAG_FD_NO_GROUP
#define PERF_FLAG_FD_NO_GROUP   (1UL << 0)
#endif
#ifndef PERF_FLAG_FD_OUTPUT
#define PERF_FLAG_FD_OUTPUT     (1UL << 1)
#endif
#ifndef PERF_FLAG_PID_CGROUP
#define PERF_FLAG_PID_CGROUP    (1UL << 2) /* pid=cgroup id, per-cpu mode only */
#endif
#ifndef PERF_FLAG_FD_CLOEXEC
#define PERF_FLAG_FD_CLOEXEC    (1UL << 3) /* O_CLOEXEC */
#endif
116
117 struct trace {
118         struct perf_tool        tool;
119         struct syscalltbl       *sctbl;
120         struct {
121                 int             max;
122                 struct syscall  *table;
123                 struct {
124                         struct perf_evsel *sys_enter,
125                                           *sys_exit;
126                 }               events;
127         } syscalls;
128         struct record_opts      opts;
129         struct perf_evlist      *evlist;
130         struct machine          *host;
131         struct thread           *current;
132         u64                     base_time;
133         FILE                    *output;
134         unsigned long           nr_events;
135         struct strlist          *ev_qualifier;
136         struct {
137                 size_t          nr;
138                 int             *entries;
139         }                       ev_qualifier_ids;
140         struct intlist          *tid_list;
141         struct intlist          *pid_list;
142         struct {
143                 size_t          nr;
144                 pid_t           *entries;
145         }                       filter_pids;
146         double                  duration_filter;
147         double                  runtime_ms;
148         struct {
149                 u64             vfs_getname,
150                                 proc_getname;
151         } stats;
152         bool                    not_ev_qualifier;
153         bool                    live;
154         bool                    full_time;
155         bool                    sched;
156         bool                    multiple_threads;
157         bool                    summary;
158         bool                    summary_only;
159         bool                    show_comm;
160         bool                    show_tool_stats;
161         bool                    trace_syscalls;
162         bool                    kernel_syscallchains;
163         bool                    force;
164         bool                    vfs_getname;
165         int                     trace_pgfaults;
166         int                     open_id;
167 };
168
169 struct tp_field {
170         int offset;
171         union {
172                 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
173                 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
174         };
175 };
176
177 #define TP_UINT_FIELD(bits) \
178 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
179 { \
180         u##bits value; \
181         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
182         return value;  \
183 }
184
185 TP_UINT_FIELD(8);
186 TP_UINT_FIELD(16);
187 TP_UINT_FIELD(32);
188 TP_UINT_FIELD(64);
189
190 #define TP_UINT_FIELD__SWAPPED(bits) \
191 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
192 { \
193         u##bits value; \
194         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
195         return bswap_##bits(value);\
196 }
197
198 TP_UINT_FIELD__SWAPPED(16);
199 TP_UINT_FIELD__SWAPPED(32);
200 TP_UINT_FIELD__SWAPPED(64);
201
202 static int tp_field__init_uint(struct tp_field *field,
203                                struct format_field *format_field,
204                                bool needs_swap)
205 {
206         field->offset = format_field->offset;
207
208         switch (format_field->size) {
209         case 1:
210                 field->integer = tp_field__u8;
211                 break;
212         case 2:
213                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
214                 break;
215         case 4:
216                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
217                 break;
218         case 8:
219                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
220                 break;
221         default:
222                 return -1;
223         }
224
225         return 0;
226 }
227
228 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
229 {
230         return sample->raw_data + field->offset;
231 }
232
233 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
234 {
235         field->offset = format_field->offset;
236         field->pointer = tp_field__ptr;
237         return 0;
238 }
239
240 struct syscall_tp {
241         struct tp_field id;
242         union {
243                 struct tp_field args, ret;
244         };
245 };
246
247 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
248                                           struct tp_field *field,
249                                           const char *name)
250 {
251         struct format_field *format_field = perf_evsel__field(evsel, name);
252
253         if (format_field == NULL)
254                 return -1;
255
256         return tp_field__init_uint(field, format_field, evsel->needs_swap);
257 }
258
/*
 * Initialise the integer field @name of the struct syscall_tp stored in
 * evsel->priv from the tracepoint field with the same name.
 *
 * @evsel is captured once in a local, so an arbitrary expression may be
 * passed without precedence surprises or double evaluation.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
        ({ struct perf_evsel *evsel_ = (evsel);\
           struct syscall_tp *sc = evsel_->priv;\
           perf_evsel__init_tp_uint_field(evsel_, &sc->name, #name); })
262
/*
 * Look up the tracepoint field called @name in @evsel and initialise
 * @field as a pointer reader for it.
 *
 * Returns -1 when the field does not exist, otherwise whatever
 * tp_field__init_ptr() returns.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
                                         struct tp_field *field,
                                         const char *name)
{
        struct format_field *fmt = perf_evsel__field(evsel, name);

        if (!fmt)
                return -1;

        return tp_field__init_ptr(field, fmt);
}
274
/*
 * Initialise the pointer field @name of the struct syscall_tp stored in
 * evsel->priv from the tracepoint field with the same name.
 *
 * @evsel is captured once in a local, so an arbitrary expression may be
 * passed without precedence surprises or double evaluation.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
        ({ struct perf_evsel *evsel_ = (evsel);\
           struct syscall_tp *sc = evsel_->priv;\
           perf_evsel__init_tp_ptr_field(evsel_, &sc->name, #name); })
278
279 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
280 {
281         zfree(&evsel->priv);
282         perf_evsel__delete(evsel);
283 }
284
285 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
286 {
287         evsel->priv = malloc(sizeof(struct syscall_tp));
288         if (evsel->priv != NULL) {
289                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
290                         goto out_delete;
291
292                 evsel->handler = handler;
293                 return 0;
294         }
295
296         return -ENOMEM;
297
298 out_delete:
299         zfree(&evsel->priv);
300         return -ENOENT;
301 }
302
303 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
304 {
305         struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
306
307         /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
308         if (IS_ERR(evsel))
309                 evsel = perf_evsel__newtp("syscalls", direction);
310
311         if (IS_ERR(evsel))
312                 return NULL;
313
314         if (perf_evsel__init_syscall_tp(evsel, handler))
315                 goto out_delete;
316
317         return evsel;
318
319 out_delete:
320         perf_evsel__delete_priv(evsel);
321         return NULL;
322 }
323
/*
 * Read field @name of the struct syscall_tp stored in evsel->priv from
 * @sample, as an integer.  The evsel argument is parenthesized so that
 * any pointer expression can be passed.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
        ({ struct syscall_tp *fields = (evsel)->priv; \
           fields->name.integer(&fields->name, (sample)); })

/* Same as perf_evsel__sc_tp_uint(), but through the pointer accessor. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
        ({ struct syscall_tp *fields = (evsel)->priv; \
           fields->name.pointer(&fields->name, (sample)); })
331
332 struct syscall_arg {
333         unsigned long val;
334         struct thread *thread;
335         struct trace  *trace;
336         void          *parm;
337         u8            idx;
338         u8            mask;
339 };
340
/*
 * Table translating small integers into names: entries[i] is the name of
 * the value (offset + i), for i in [0, nr_entries).
 */
struct strarray {
        int         offset;
        int         nr_entries;
        const char **entries;
};

/* Define 'strarray__<array>' wrapping @array, first entry is value 0. */
#define DEFINE_STRARRAY(array)                          \
        struct strarray strarray__##array = {           \
                .nr_entries = ARRAY_SIZE(array),        \
                .entries    = array,                    \
        }

/* Same as DEFINE_STRARRAY(), but the first entry is value @off. */
#define DEFINE_STRARRAY_OFFSET(array, off)              \
        struct strarray strarray__##array = {           \
                .offset     = off,                      \
                .nr_entries = ARRAY_SIZE(array),        \
                .entries    = array,                    \
        }
357
358 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
359                                                 const char *intfmt,
360                                                 struct syscall_arg *arg)
361 {
362         struct strarray *sa = arg->parm;
363         int idx = arg->val - sa->offset;
364
365         if (idx < 0 || idx >= sa->nr_entries)
366                 return scnprintf(bf, size, intfmt, arg->val);
367
368         return scnprintf(bf, size, "%s", sa->entries[idx]);
369 }
370
/* String table lookup; values outside the table are shown in decimal. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size, struct syscall_arg *arg)
{
        size_t printed = __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);

        return printed;
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
378
#if defined(__i386__) || defined(__x86_64__)
/*
 * String table lookup; values outside the table are shown in hex.
 *
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 *        gets rewritten to support all arches.
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size, struct syscall_arg *arg)
{
        size_t printed = __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);

        return printed;
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
392
/* File descriptor formatter: forward declaration only, no body in view. */
static size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg);

#define SCA_FD syscall_arg__scnprintf_fd
397
398 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
399                                            struct syscall_arg *arg)
400 {
401         int fd = arg->val;
402
403         if (fd == AT_FDCWD)
404                 return scnprintf(bf, size, "CWD");
405
406         return syscall_arg__scnprintf_fd(bf, size, arg);
407 }
408
409 #define SCA_FDAT syscall_arg__scnprintf_fd_at
410
/* Formatter for the fd passed to close(): forward declaration only, no body in view. */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size, struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
415
416 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
417                                          struct syscall_arg *arg)
418 {
419         return scnprintf(bf, size, "%#lx", arg->val);
420 }
421
422 #define SCA_HEX syscall_arg__scnprintf_hex
423
424 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
425                                          struct syscall_arg *arg)
426 {
427         return scnprintf(bf, size, "%d", arg->val);
428 }
429
430 #define SCA_INT syscall_arg__scnprintf_int
431
432 static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
433                                                struct syscall_arg *arg)
434 {
435         int printed = 0, prot = arg->val;
436
437         if (prot == PROT_NONE)
438                 return scnprintf(bf, size, "NONE");
439 #define P_MMAP_PROT(n) \
440         if (prot & PROT_##n) { \
441                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
442                 prot &= ~PROT_##n; \
443         }
444
445         P_MMAP_PROT(EXEC);
446         P_MMAP_PROT(READ);
447         P_MMAP_PROT(WRITE);
448 #ifdef PROT_SEM
449         P_MMAP_PROT(SEM);
450 #endif
451         P_MMAP_PROT(GROWSDOWN);
452         P_MMAP_PROT(GROWSUP);
453 #undef P_MMAP_PROT
454
455         if (prot)
456                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);
457
458         return printed;
459 }
460
461 #define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
462
463 static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
464                                                 struct syscall_arg *arg)
465 {
466         int printed = 0, flags = arg->val;
467
468 #define P_MMAP_FLAG(n) \
469         if (flags & MAP_##n) { \
470                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
471                 flags &= ~MAP_##n; \
472         }
473
474         P_MMAP_FLAG(SHARED);
475         P_MMAP_FLAG(PRIVATE);
476 #ifdef MAP_32BIT
477         P_MMAP_FLAG(32BIT);
478 #endif
479         P_MMAP_FLAG(ANONYMOUS);
480         P_MMAP_FLAG(DENYWRITE);
481         P_MMAP_FLAG(EXECUTABLE);
482         P_MMAP_FLAG(FILE);
483         P_MMAP_FLAG(FIXED);
484         P_MMAP_FLAG(GROWSDOWN);
485 #ifdef MAP_HUGETLB
486         P_MMAP_FLAG(HUGETLB);
487 #endif
488         P_MMAP_FLAG(LOCKED);
489         P_MMAP_FLAG(NONBLOCK);
490         P_MMAP_FLAG(NORESERVE);
491         P_MMAP_FLAG(POPULATE);
492         P_MMAP_FLAG(STACK);
493 #ifdef MAP_UNINITIALIZED
494         P_MMAP_FLAG(UNINITIALIZED);
495 #endif
496 #undef P_MMAP_FLAG
497
498         if (flags)
499                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
500
501         return printed;
502 }
503
504 #define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
505
506 static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size,
507                                                   struct syscall_arg *arg)
508 {
509         int printed = 0, flags = arg->val;
510
511 #define P_MREMAP_FLAG(n) \
512         if (flags & MREMAP_##n) { \
513                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
514                 flags &= ~MREMAP_##n; \
515         }
516
517         P_MREMAP_FLAG(MAYMOVE);
518 #ifdef MREMAP_FIXED
519         P_MREMAP_FLAG(FIXED);
520 #endif
521 #undef P_MREMAP_FLAG
522
523         if (flags)
524                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
525
526         return printed;
527 }
528
529 #define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags
530
531 static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
532                                                       struct syscall_arg *arg)
533 {
534         int behavior = arg->val;
535
536         switch (behavior) {
537 #define P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
538         P_MADV_BHV(NORMAL);
539         P_MADV_BHV(RANDOM);
540         P_MADV_BHV(SEQUENTIAL);
541         P_MADV_BHV(WILLNEED);
542         P_MADV_BHV(DONTNEED);
543         P_MADV_BHV(REMOVE);
544         P_MADV_BHV(DONTFORK);
545         P_MADV_BHV(DOFORK);
546         P_MADV_BHV(HWPOISON);
547 #ifdef MADV_SOFT_OFFLINE
548         P_MADV_BHV(SOFT_OFFLINE);
549 #endif
550         P_MADV_BHV(MERGEABLE);
551         P_MADV_BHV(UNMERGEABLE);
552 #ifdef MADV_HUGEPAGE
553         P_MADV_BHV(HUGEPAGE);
554 #endif
555 #ifdef MADV_NOHUGEPAGE
556         P_MADV_BHV(NOHUGEPAGE);
557 #endif
558 #ifdef MADV_DONTDUMP
559         P_MADV_BHV(DONTDUMP);
560 #endif
561 #ifdef MADV_DODUMP
562         P_MADV_BHV(DODUMP);
563 #endif
564 #undef P_MADV_PHV
565         default: break;
566         }
567
568         return scnprintf(bf, size, "%#x", behavior);
569 }
570
571 #define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
572
573 static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
574                                            struct syscall_arg *arg)
575 {
576         int printed = 0, op = arg->val;
577
578         if (op == 0)
579                 return scnprintf(bf, size, "NONE");
580 #define P_CMD(cmd) \
581         if ((op & LOCK_##cmd) == LOCK_##cmd) { \
582                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
583                 op &= ~LOCK_##cmd; \
584         }
585
586         P_CMD(SH);
587         P_CMD(EX);
588         P_CMD(NB);
589         P_CMD(UN);
590         P_CMD(MAND);
591         P_CMD(RW);
592         P_CMD(READ);
593         P_CMD(WRITE);
594 #undef P_OP
595
596         if (op)
597                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
598
599         return printed;
600 }
601
602 #define SCA_FLOCK syscall_arg__scnprintf_flock
603
604 static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
605 {
606         enum syscall_futex_args {
607                 SCF_UADDR   = (1 << 0),
608                 SCF_OP      = (1 << 1),
609                 SCF_VAL     = (1 << 2),
610                 SCF_TIMEOUT = (1 << 3),
611                 SCF_UADDR2  = (1 << 4),
612                 SCF_VAL3    = (1 << 5),
613         };
614         int op = arg->val;
615         int cmd = op & FUTEX_CMD_MASK;
616         size_t printed = 0;
617
618         switch (cmd) {
619 #define P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
620         P_FUTEX_OP(WAIT);           arg->mask |= SCF_VAL3|SCF_UADDR2;             break;
621         P_FUTEX_OP(WAKE);           arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
622         P_FUTEX_OP(FD);             arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
623         P_FUTEX_OP(REQUEUE);        arg->mask |= SCF_VAL3|SCF_TIMEOUT;            break;
624         P_FUTEX_OP(CMP_REQUEUE);    arg->mask |= SCF_TIMEOUT;                     break;
625         P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT;                     break;
626         P_FUTEX_OP(WAKE_OP);                                                      break;
627         P_FUTEX_OP(LOCK_PI);        arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
628         P_FUTEX_OP(UNLOCK_PI);      arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
629         P_FUTEX_OP(TRYLOCK_PI);     arg->mask |= SCF_VAL3|SCF_UADDR2;             break;
630         P_FUTEX_OP(WAIT_BITSET);    arg->mask |= SCF_UADDR2;                      break;
631         P_FUTEX_OP(WAKE_BITSET);    arg->mask |= SCF_UADDR2;                      break;
632         P_FUTEX_OP(WAIT_REQUEUE_PI);                                              break;
633         default: printed = scnprintf(bf, size, "%#x", cmd);                       break;
634         }
635
636         if (op & FUTEX_PRIVATE_FLAG)
637                 printed += scnprintf(bf + printed, size - printed, "|PRIV");
638
639         if (op & FUTEX_CLOCK_REALTIME)
640                 printed += scnprintf(bf + printed, size - printed, "|CLKRT");
641
642         return printed;
643 }
644
645 #define SCA_FUTEX_OP  syscall_arg__scnprintf_futex_op
646
/* Names for the bpf() 'cmd' argument, indexed by command number from 0. */
static const char *bpf_cmd[] = {
        "MAP_CREATE",
        "MAP_LOOKUP_ELEM",
        "MAP_UPDATE_ELEM",
        "MAP_DELETE_ELEM",
        "MAP_GET_NEXT_KEY",
        "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);
652
653 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
654 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
655
/* Names for the interval timer 'which' argument, indexed from 0. */
static const char *itimers[] = {
        "REAL",
        "VIRTUAL",
        "PROF",
};
static DEFINE_STRARRAY(itimers);
658
/* Names for the keyctl() option argument, indexed by option number from 0. */
static const char *keyctl_options[] = {
        "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE",   /*  0 -  3 */
        "CHOWN", "SETPERM", "DESCRIBE", "CLEAR",                        /*  4 -  7 */
        "LINK", "UNLINK", "SEARCH", "READ",                             /*  8 - 11 */
        "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",   /* 12 - 15 */
        "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT",        /* 16 - 18 */
        "REJECT", "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",    /* 19 - 22 */
};
static DEFINE_STRARRAY(keyctl_options);
667
/*
 * Names for the lseek() 'whence' argument, indexed from 0.  "DATA" and
 * "HOLE" are only present when the headers define SEEK_DATA / SEEK_HOLE.
 */
static const char *whences[] = {
        "SET",
        "CUR",
        "END",
#ifdef SEEK_DATA
        "DATA",
#endif
#ifdef SEEK_HOLE
        "HOLE",
#endif
};
static DEFINE_STRARRAY(whences);
677
/*
 * Names for the fcntl() 'cmd' argument, indexed by command number from 0.
 *
 * All names are spelled without the "F_" prefix, so that the later
 * commands read the same way as the earlier ones ("GETLK" / "GETLK64").
 */
static const char *fcntl_cmds[] = {
        "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
        "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
        "SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
        "GETOWNER_UIDS",
};
684 static DEFINE_STRARRAY(fcntl_cmds);
685
/* Names for the rlimit 'resource' argument, indexed from 0. */
static const char *rlimit_resources[] = {
        "CPU", "FSIZE", "DATA", "STACK",                /*  0 -  3 */
        "CORE", "RSS", "NPROC", "NOFILE",               /*  4 -  7 */
        "MEMLOCK", "AS", "LOCKS", "SIGPENDING",         /*  8 - 11 */
        "MSGQUEUE", "NICE", "RTPRIO", "RTTIME",         /* 12 - 15 */
};
static DEFINE_STRARRAY(rlimit_resources);
692
/* Names for the 'how' argument of the signal mask syscalls, indexed from 0. */
static const char *sighow[] = {
        "BLOCK",
        "UNBLOCK",
        "SETMASK",
};
static DEFINE_STRARRAY(sighow);
695
/* Names for clock ids, indexed by clock number from 0. */
static const char *clockid[] = {
        "REALTIME", "MONOTONIC",                        /*  0 -  1 */
        "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",      /*  2 -  3 */
        "MONOTONIC_RAW",                                /*  4      */
        "REALTIME_COARSE", "MONOTONIC_COARSE",          /*  5 -  6 */
        "BOOTTIME",                                     /*  7      */
        "REALTIME_ALARM", "BOOTTIME_ALARM",             /*  8 -  9 */
        "SGI_CYCLE", "TAI"                              /* 10 - 11 */
};
static DEFINE_STRARRAY(clockid);
702
/*
 * Names for socket address families, indexed by the AF_* number from 0.
 *
 * The table is positional, so every number must have a slot: AF_MPLS (28)
 * sits between AF_IB (27) and AF_CAN (29).  Without it, CAN and every
 * family after it would be printed with the name of its successor.
 */
static const char *socket_families[] = {
        "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM", /*  0 -  6 */
        "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI", /*  7 - 13 */
        "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET",         /* 14 - 19 */
        "ATMSVC", "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC",       /* 20 - 26 */
        "IB", "MPLS", "CAN", "TIPC", "BLUETOOTH", "IUCV", "RXRPC",       /* 27 - 33 */
        "ISDN", "PHONET", "IEEE802154", "CAIF", "ALG", "NFC", "VSOCK",   /* 34 - 40 */
};
711 static DEFINE_STRARRAY(socket_families);
712
713 #ifndef SOCK_TYPE_MASK
714 #define SOCK_TYPE_MASK 0xf
715 #endif
716
717 static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
718                                                       struct syscall_arg *arg)
719 {
720         size_t printed;
721         int type = arg->val,
722             flags = type & ~SOCK_TYPE_MASK;
723
724         type &= SOCK_TYPE_MASK;
725         /*
726          * Can't use a strarray, MIPS may override for ABI reasons.
727          */
728         switch (type) {
729 #define P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
730         P_SK_TYPE(STREAM);
731         P_SK_TYPE(DGRAM);
732         P_SK_TYPE(RAW);
733         P_SK_TYPE(RDM);
734         P_SK_TYPE(SEQPACKET);
735         P_SK_TYPE(DCCP);
736         P_SK_TYPE(PACKET);
737 #undef P_SK_TYPE
738         default:
739                 printed = scnprintf(bf, size, "%#x", type);
740         }
741
742 #define P_SK_FLAG(n) \
743         if (flags & SOCK_##n) { \
744                 printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
745                 flags &= ~SOCK_##n; \
746         }
747
748         P_SK_FLAG(CLOEXEC);
749         P_SK_FLAG(NONBLOCK);
750 #undef P_SK_FLAG
751
752         if (flags)
753                 printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
754
755         return printed;
756 }
757
758 #define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
759
760 #ifndef MSG_PROBE
761 #define MSG_PROBE            0x10
762 #endif
763 #ifndef MSG_WAITFORONE
764 #define MSG_WAITFORONE  0x10000
765 #endif
766 #ifndef MSG_SENDPAGE_NOTLAST
767 #define MSG_SENDPAGE_NOTLAST 0x20000
768 #endif
769 #ifndef MSG_FASTOPEN
770 #define MSG_FASTOPEN         0x20000000
771 #endif
772
773 static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
774                                                struct syscall_arg *arg)
775 {
776         int printed = 0, flags = arg->val;
777
778         if (flags == 0)
779                 return scnprintf(bf, size, "NONE");
780 #define P_MSG_FLAG(n) \
781         if (flags & MSG_##n) { \
782                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
783                 flags &= ~MSG_##n; \
784         }
785
786         P_MSG_FLAG(OOB);
787         P_MSG_FLAG(PEEK);
788         P_MSG_FLAG(DONTROUTE);
789         P_MSG_FLAG(TRYHARD);
790         P_MSG_FLAG(CTRUNC);
791         P_MSG_FLAG(PROBE);
792         P_MSG_FLAG(TRUNC);
793         P_MSG_FLAG(DONTWAIT);
794         P_MSG_FLAG(EOR);
795         P_MSG_FLAG(WAITALL);
796         P_MSG_FLAG(FIN);
797         P_MSG_FLAG(SYN);
798         P_MSG_FLAG(CONFIRM);
799         P_MSG_FLAG(RST);
800         P_MSG_FLAG(ERRQUEUE);
801         P_MSG_FLAG(NOSIGNAL);
802         P_MSG_FLAG(MORE);
803         P_MSG_FLAG(WAITFORONE);
804         P_MSG_FLAG(SENDPAGE_NOTLAST);
805         P_MSG_FLAG(FASTOPEN);
806         P_MSG_FLAG(CMSG_CLOEXEC);
807 #undef P_MSG_FLAG
808
809         if (flags)
810                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
811
812         return printed;
813 }
814
815 #define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
816
817 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
818                                                  struct syscall_arg *arg)
819 {
820         size_t printed = 0;
821         int mode = arg->val;
822
823         if (mode == F_OK) /* 0 */
824                 return scnprintf(bf, size, "F");
825 #define P_MODE(n) \
826         if (mode & n##_OK) { \
827                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
828                 mode &= ~n##_OK; \
829         }
830
831         P_MODE(R);
832         P_MODE(W);
833         P_MODE(X);
834 #undef P_MODE
835
836         if (mode)
837                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
838
839         return printed;
840 }
841
842 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
843
/* File name formatter: forward declaration only, no body in view. */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
848
849 static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
850                                                struct syscall_arg *arg)
851 {
852         int printed = 0, flags = arg->val;
853
854         if (!(flags & O_CREAT))
855                 arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
856
857         if (flags == 0)
858                 return scnprintf(bf, size, "RDONLY");
859 #define P_FLAG(n) \
860         if (flags & O_##n) { \
861                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
862                 flags &= ~O_##n; \
863         }
864
865         P_FLAG(APPEND);
866         P_FLAG(ASYNC);
867         P_FLAG(CLOEXEC);
868         P_FLAG(CREAT);
869         P_FLAG(DIRECT);
870         P_FLAG(DIRECTORY);
871         P_FLAG(EXCL);
872         P_FLAG(LARGEFILE);
873         P_FLAG(NOATIME);
874         P_FLAG(NOCTTY);
875 #ifdef O_NONBLOCK
876         P_FLAG(NONBLOCK);
877 #elif O_NDELAY
878         P_FLAG(NDELAY);
879 #endif
880 #ifdef O_PATH
881         P_FLAG(PATH);
882 #endif
883         P_FLAG(RDWR);
884 #ifdef O_DSYNC
885         if ((flags & O_SYNC) == O_SYNC)
886                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
887         else {
888                 P_FLAG(DSYNC);
889         }
890 #else
891         P_FLAG(SYNC);
892 #endif
893         P_FLAG(TRUNC);
894         P_FLAG(WRONLY);
895 #undef P_FLAG
896
897         if (flags)
898                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
899
900         return printed;
901 }
902
903 #define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
904
905 static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size,
906                                                 struct syscall_arg *arg)
907 {
908         int printed = 0, flags = arg->val;
909
910         if (flags == 0)
911                 return 0;
912
913 #define P_FLAG(n) \
914         if (flags & PERF_FLAG_##n) { \
915                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
916                 flags &= ~PERF_FLAG_##n; \
917         }
918
919         P_FLAG(FD_NO_GROUP);
920         P_FLAG(FD_OUTPUT);
921         P_FLAG(PID_CGROUP);
922         P_FLAG(FD_CLOEXEC);
923 #undef P_FLAG
924
925         if (flags)
926                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
927
928         return printed;
929 }
930
931 #define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags
932
933 static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
934                                                    struct syscall_arg *arg)
935 {
936         int printed = 0, flags = arg->val;
937
938         if (flags == 0)
939                 return scnprintf(bf, size, "NONE");
940 #define P_FLAG(n) \
941         if (flags & EFD_##n) { \
942                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
943                 flags &= ~EFD_##n; \
944         }
945
946         P_FLAG(SEMAPHORE);
947         P_FLAG(CLOEXEC);
948         P_FLAG(NONBLOCK);
949 #undef P_FLAG
950
951         if (flags)
952                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
953
954         return printed;
955 }
956
957 #define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
958
959 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
960                                                 struct syscall_arg *arg)
961 {
962         int printed = 0, flags = arg->val;
963
964 #define P_FLAG(n) \
965         if (flags & O_##n) { \
966                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
967                 flags &= ~O_##n; \
968         }
969
970         P_FLAG(CLOEXEC);
971         P_FLAG(NONBLOCK);
972 #undef P_FLAG
973
974         if (flags)
975                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
976
977         return printed;
978 }
979
980 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
981
982 static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
983 {
984         int sig = arg->val;
985
986         switch (sig) {
987 #define P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
988         P_SIGNUM(HUP);
989         P_SIGNUM(INT);
990         P_SIGNUM(QUIT);
991         P_SIGNUM(ILL);
992         P_SIGNUM(TRAP);
993         P_SIGNUM(ABRT);
994         P_SIGNUM(BUS);
995         P_SIGNUM(FPE);
996         P_SIGNUM(KILL);
997         P_SIGNUM(USR1);
998         P_SIGNUM(SEGV);
999         P_SIGNUM(USR2);
1000         P_SIGNUM(PIPE);
1001         P_SIGNUM(ALRM);
1002         P_SIGNUM(TERM);
1003         P_SIGNUM(CHLD);
1004         P_SIGNUM(CONT);
1005         P_SIGNUM(STOP);
1006         P_SIGNUM(TSTP);
1007         P_SIGNUM(TTIN);
1008         P_SIGNUM(TTOU);
1009         P_SIGNUM(URG);
1010         P_SIGNUM(XCPU);
1011         P_SIGNUM(XFSZ);
1012         P_SIGNUM(VTALRM);
1013         P_SIGNUM(PROF);
1014         P_SIGNUM(WINCH);
1015         P_SIGNUM(IO);
1016         P_SIGNUM(PWR);
1017         P_SIGNUM(SYS);
1018 #ifdef SIGEMT
1019         P_SIGNUM(EMT);
1020 #endif
1021 #ifdef SIGSTKFLT
1022         P_SIGNUM(STKFLT);
1023 #endif
1024 #ifdef SIGSWI
1025         P_SIGNUM(SWI);
1026 #endif
1027         default: break;
1028         }
1029
1030         return scnprintf(bf, size, "%#x", sig);
1031 }
1032
1033 #define SCA_SIGNUM syscall_arg__scnprintf_signum
1034
1035 #if defined(__i386__) || defined(__x86_64__)
1036 /*
1037  * FIXME: Make this available to all arches.
1038  */
1039 #define TCGETS          0x5401
1040
1041 static const char *tioctls[] = {
1042         "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
1043         "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
1044         "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
1045         "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
1046         "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
1047         "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
1048         "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
1049         "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
1050         "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
1051         "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
1052         "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
1053         [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
1054         "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
1055         "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
1056         "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
1057 };
1058
1059 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
1060 #endif /* defined(__i386__) || defined(__x86_64__) */
1061
1062 #ifndef SECCOMP_SET_MODE_STRICT
1063 #define SECCOMP_SET_MODE_STRICT 0
1064 #endif
1065 #ifndef SECCOMP_SET_MODE_FILTER
1066 #define SECCOMP_SET_MODE_FILTER 1
1067 #endif
1068
1069 static size_t syscall_arg__scnprintf_seccomp_op(char *bf, size_t size, struct syscall_arg *arg)
1070 {
1071         int op = arg->val;
1072         size_t printed = 0;
1073
1074         switch (op) {
1075 #define P_SECCOMP_SET_MODE_OP(n) case SECCOMP_SET_MODE_##n: printed = scnprintf(bf, size, #n); break
1076         P_SECCOMP_SET_MODE_OP(STRICT);
1077         P_SECCOMP_SET_MODE_OP(FILTER);
1078 #undef P_SECCOMP_SET_MODE_OP
1079         default: printed = scnprintf(bf, size, "%#x", op);                        break;
1080         }
1081
1082         return printed;
1083 }
1084
1085 #define SCA_SECCOMP_OP  syscall_arg__scnprintf_seccomp_op
1086
1087 #ifndef SECCOMP_FILTER_FLAG_TSYNC
1088 #define SECCOMP_FILTER_FLAG_TSYNC 1
1089 #endif
1090
1091 static size_t syscall_arg__scnprintf_seccomp_flags(char *bf, size_t size,
1092                                                    struct syscall_arg *arg)
1093 {
1094         int printed = 0, flags = arg->val;
1095
1096 #define P_FLAG(n) \
1097         if (flags & SECCOMP_FILTER_FLAG_##n) { \
1098                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
1099                 flags &= ~SECCOMP_FILTER_FLAG_##n; \
1100         }
1101
1102         P_FLAG(TSYNC);
1103 #undef P_FLAG
1104
1105         if (flags)
1106                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
1107
1108         return printed;
1109 }
1110
1111 #define SCA_SECCOMP_FLAGS syscall_arg__scnprintf_seccomp_flags
1112
1113 #ifndef GRND_NONBLOCK
1114 #define GRND_NONBLOCK   0x0001
1115 #endif
1116 #ifndef GRND_RANDOM
1117 #define GRND_RANDOM     0x0002
1118 #endif
1119
1120 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
1121                                                    struct syscall_arg *arg)
1122 {
1123         int printed = 0, flags = arg->val;
1124
1125 #define P_FLAG(n) \
1126         if (flags & GRND_##n) { \
1127                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
1128                 flags &= ~GRND_##n; \
1129         }
1130
1131         P_FLAG(RANDOM);
1132         P_FLAG(NONBLOCK);
1133 #undef P_FLAG
1134
1135         if (flags)
1136                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
1137
1138         return printed;
1139 }
1140
1141 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
1142
/*
 * Initializer fragment for a syscall_fmts[] entry: argument number 'arg' is
 * formatted by SCA_STRARRAY with &strarray__<array> as its parameter.
 * 'name' is not expanded; it only labels the argument at the use site.
 */
#define STRARRAY(arg, name, array)                        \
          .arg_scnprintf = { [arg] = SCA_STRARRAY, },     \
          .arg_parm      = { [arg] = &strarray__##array, }
1146
1147 #include "trace/beauty/pid.c"
1148 #include "trace/beauty/mode_t.c"
1149 #include "trace/beauty/sched_policy.c"
1150 #include "trace/beauty/waitid_options.c"
1151
1152 static struct syscall_fmt {
1153         const char *name;
1154         const char *alias;
1155         size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
1156         void       *arg_parm[6];
1157         bool       errmsg;
1158         bool       errpid;
1159         bool       timeout;
1160         bool       hexret;
1161 } syscall_fmts[] = {
1162         { .name     = "access",     .errmsg = true,
1163           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */
1164                              [1] = SCA_ACCMODE,  /* mode */ }, },
1165         { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
1166         { .name     = "bpf",        .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
1167         { .name     = "brk",        .hexret = true,
1168           .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
1169         { .name     = "chdir",      .errmsg = true,
1170           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1171         { .name     = "chmod",      .errmsg = true,
1172           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1173         { .name     = "chroot",     .errmsg = true,
1174           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1175         { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
1176         { .name     = "clone",      .errpid = true, },
1177         { .name     = "close",      .errmsg = true,
1178           .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
1179         { .name     = "connect",    .errmsg = true, },
1180         { .name     = "creat",      .errmsg = true,
1181           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1182         { .name     = "dup",        .errmsg = true,
1183           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1184         { .name     = "dup2",       .errmsg = true,
1185           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1186         { .name     = "dup3",       .errmsg = true,
1187           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1188         { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
1189         { .name     = "eventfd2",   .errmsg = true,
1190           .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
1191         { .name     = "faccessat",  .errmsg = true,
1192           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1193                              [1] = SCA_FILENAME, /* filename */ }, },
1194         { .name     = "fadvise64",  .errmsg = true,
1195           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1196         { .name     = "fallocate",  .errmsg = true,
1197           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1198         { .name     = "fchdir",     .errmsg = true,
1199           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1200         { .name     = "fchmod",     .errmsg = true,
1201           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1202         { .name     = "fchmodat",   .errmsg = true,
1203           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1204                              [1] = SCA_FILENAME, /* filename */ }, },
1205         { .name     = "fchown",     .errmsg = true,
1206           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1207         { .name     = "fchownat",   .errmsg = true,
1208           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1209                              [1] = SCA_FILENAME, /* filename */ }, },
1210         { .name     = "fcntl",      .errmsg = true,
1211           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1212                              [1] = SCA_STRARRAY, /* cmd */ },
1213           .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
1214         { .name     = "fdatasync",  .errmsg = true,
1215           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1216         { .name     = "flock",      .errmsg = true,
1217           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1218                              [1] = SCA_FLOCK, /* cmd */ }, },
1219         { .name     = "fsetxattr",  .errmsg = true,
1220           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1221         { .name     = "fstat",      .errmsg = true, .alias = "newfstat",
1222           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1223         { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat",
1224           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1225                              [1] = SCA_FILENAME, /* filename */ }, },
1226         { .name     = "fstatfs",    .errmsg = true,
1227           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1228         { .name     = "fsync",    .errmsg = true,
1229           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1230         { .name     = "ftruncate", .errmsg = true,
1231           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1232         { .name     = "futex",      .errmsg = true,
1233           .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
1234         { .name     = "futimesat", .errmsg = true,
1235           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1236                              [1] = SCA_FILENAME, /* filename */ }, },
1237         { .name     = "getdents",   .errmsg = true,
1238           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1239         { .name     = "getdents64", .errmsg = true,
1240           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1241         { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1242         { .name     = "getpid",     .errpid = true, },
1243         { .name     = "getpgid",    .errpid = true, },
1244         { .name     = "getppid",    .errpid = true, },
1245         { .name     = "getrandom",  .errmsg = true,
1246           .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
1247         { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1248         { .name     = "getxattr",    .errmsg = true,
1249           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1250         { .name     = "inotify_add_watch",          .errmsg = true,
1251           .arg_scnprintf = { [1] = SCA_FILENAME, /* pathname */ }, },
1252         { .name     = "ioctl",      .errmsg = true,
1253           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1254 #if defined(__i386__) || defined(__x86_64__)
1255 /*
1256  * FIXME: Make this available to all arches.
1257  */
1258                              [1] = SCA_STRHEXARRAY, /* cmd */
1259                              [2] = SCA_HEX, /* arg */ },
1260           .arg_parm      = { [1] = &strarray__tioctls, /* cmd */ }, },
1261 #else
1262                              [2] = SCA_HEX, /* arg */ }, },
1263 #endif
1264         { .name     = "keyctl",     .errmsg = true, STRARRAY(0, option, keyctl_options), },
1265         { .name     = "kill",       .errmsg = true,
1266           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1267         { .name     = "lchown",    .errmsg = true,
1268           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1269         { .name     = "lgetxattr",  .errmsg = true,
1270           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1271         { .name     = "linkat",     .errmsg = true,
1272           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1273         { .name     = "listxattr",  .errmsg = true,
1274           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1275         { .name     = "llistxattr", .errmsg = true,
1276           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1277         { .name     = "lremovexattr",  .errmsg = true,
1278           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1279         { .name     = "lseek",      .errmsg = true,
1280           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1281                              [2] = SCA_STRARRAY, /* whence */ },
1282           .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
1283         { .name     = "lsetxattr",  .errmsg = true,
1284           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1285         { .name     = "lstat",      .errmsg = true, .alias = "newlstat",
1286           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1287         { .name     = "lsxattr",    .errmsg = true,
1288           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1289         { .name     = "madvise",    .errmsg = true,
1290           .arg_scnprintf = { [0] = SCA_HEX,      /* start */
1291                              [2] = SCA_MADV_BHV, /* behavior */ }, },
1292         { .name     = "mkdir",    .errmsg = true,
1293           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1294         { .name     = "mkdirat",    .errmsg = true,
1295           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1296                              [1] = SCA_FILENAME, /* pathname */ }, },
1297         { .name     = "mknod",      .errmsg = true,
1298           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1299         { .name     = "mknodat",    .errmsg = true,
1300           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1301                              [1] = SCA_FILENAME, /* filename */ }, },
1302         { .name     = "mlock",      .errmsg = true,
1303           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1304         { .name     = "mlockall",   .errmsg = true,
1305           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1306         { .name     = "mmap",       .hexret = true,
1307           .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
1308                              [2] = SCA_MMAP_PROT, /* prot */
1309                              [3] = SCA_MMAP_FLAGS, /* flags */
1310                              [4] = SCA_FD,        /* fd */ }, },
1311         { .name     = "mprotect",   .errmsg = true,
1312           .arg_scnprintf = { [0] = SCA_HEX, /* start */
1313                              [2] = SCA_MMAP_PROT, /* prot */ }, },
1314         { .name     = "mq_unlink", .errmsg = true,
1315           .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
1316         { .name     = "mremap",     .hexret = true,
1317           .arg_scnprintf = { [0] = SCA_HEX, /* addr */
1318                              [3] = SCA_MREMAP_FLAGS, /* flags */
1319                              [4] = SCA_HEX, /* new_addr */ }, },
1320         { .name     = "munlock",    .errmsg = true,
1321           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1322         { .name     = "munmap",     .errmsg = true,
1323           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1324         { .name     = "name_to_handle_at", .errmsg = true,
1325           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1326         { .name     = "newfstatat", .errmsg = true,
1327           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1328                              [1] = SCA_FILENAME, /* filename */ }, },
1329         { .name     = "open",       .errmsg = true,
1330           .arg_scnprintf = { [0] = SCA_FILENAME,   /* filename */
1331                              [1] = SCA_OPEN_FLAGS, /* flags */ }, },
1332         { .name     = "open_by_handle_at", .errmsg = true,
1333           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1334                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1335         { .name     = "openat",     .errmsg = true,
1336           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1337                              [1] = SCA_FILENAME, /* filename */
1338                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1339         { .name     = "perf_event_open", .errmsg = true,
1340           .arg_scnprintf = { [1] = SCA_INT, /* pid */
1341                              [2] = SCA_INT, /* cpu */
1342                              [3] = SCA_FD,  /* group_fd */
1343                              [4] = SCA_PERF_FLAGS,  /* flags */ }, },
1344         { .name     = "pipe2",      .errmsg = true,
1345           .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
1346         { .name     = "poll",       .errmsg = true, .timeout = true, },
1347         { .name     = "ppoll",      .errmsg = true, .timeout = true, },
1348         { .name     = "pread",      .errmsg = true, .alias = "pread64",
1349           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1350         { .name     = "preadv",     .errmsg = true, .alias = "pread",
1351           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1352         { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
1353         { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64",
1354           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1355         { .name     = "pwritev",    .errmsg = true,
1356           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1357         { .name     = "read",       .errmsg = true,
1358           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1359         { .name     = "readlink",   .errmsg = true,
1360           .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
1361         { .name     = "readlinkat", .errmsg = true,
1362           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1363                              [1] = SCA_FILENAME, /* pathname */ }, },
1364         { .name     = "readv",      .errmsg = true,
1365           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1366         { .name     = "recvfrom",   .errmsg = true,
1367           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1368                              [3] = SCA_MSG_FLAGS, /* flags */ }, },
1369         { .name     = "recvmmsg",   .errmsg = true,
1370           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1371                              [3] = SCA_MSG_FLAGS, /* flags */ }, },
1372         { .name     = "recvmsg",    .errmsg = true,
1373           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1374                              [2] = SCA_MSG_FLAGS, /* flags */ }, },
1375         { .name     = "removexattr", .errmsg = true,
1376           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1377         { .name     = "renameat",   .errmsg = true,
1378           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1379         { .name     = "rmdir",    .errmsg = true,
1380           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1381         { .name     = "rt_sigaction", .errmsg = true,
1382           .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
1383         { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
1384         { .name     = "rt_sigqueueinfo", .errmsg = true,
1385           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1386         { .name     = "rt_tgsigqueueinfo", .errmsg = true,
1387           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1388         { .name     = "sched_setscheduler",   .errmsg = true,
1389           .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
1390         { .name     = "seccomp", .errmsg = true,
1391           .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
1392                              [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
1393         { .name     = "select",     .errmsg = true, .timeout = true, },
1394         { .name     = "sendmmsg",    .errmsg = true,
1395           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1396                              [3] = SCA_MSG_FLAGS, /* flags */ }, },
1397         { .name     = "sendmsg",    .errmsg = true,
1398           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1399                              [2] = SCA_MSG_FLAGS, /* flags */ }, },
1400         { .name     = "sendto",     .errmsg = true,
1401           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1402                              [3] = SCA_MSG_FLAGS, /* flags */ }, },
1403         { .name     = "set_tid_address", .errpid = true, },
1404         { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1405         { .name     = "setpgid",    .errmsg = true, },
1406         { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1407         { .name     = "setxattr",   .errmsg = true,
1408           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1409         { .name     = "shutdown",   .errmsg = true,
1410           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1411         { .name     = "socket",     .errmsg = true,
1412           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1413                              [1] = SCA_SK_TYPE, /* type */ },
1414           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
1415         { .name     = "socketpair", .errmsg = true,
1416           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1417                              [1] = SCA_SK_TYPE, /* type */ },
1418           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
1419         { .name     = "stat",       .errmsg = true, .alias = "newstat",
1420           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1421         { .name     = "statfs",     .errmsg = true,
1422           .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1423         { .name     = "swapoff",    .errmsg = true,
1424           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
1425         { .name     = "swapon",     .errmsg = true,
1426           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
1427         { .name     = "symlinkat",  .errmsg = true,
1428           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1429         { .name     = "tgkill",     .errmsg = true,
1430           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1431         { .name     = "tkill",      .errmsg = true,
1432           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1433         { .name     = "truncate",   .errmsg = true,
1434           .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
1435         { .name     = "uname",      .errmsg = true, .alias = "newuname", },
1436         { .name     = "unlinkat",   .errmsg = true,
1437           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1438                              [1] = SCA_FILENAME, /* pathname */ }, },
1439         { .name     = "utime",  .errmsg = true,
1440           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1441         { .name     = "utimensat",  .errmsg = true,
1442           .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */
1443                              [1] = SCA_FILENAME, /* filename */ }, },
1444         { .name     = "utimes",  .errmsg = true,
1445           .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1446         { .name     = "vmsplice",  .errmsg = true,
1447           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1448         { .name     = "wait4",      .errpid = true,
1449           .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
1450         { .name     = "waitid",     .errpid = true,
1451           .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
1452         { .name     = "write",      .errmsg = true,
1453           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1454         { .name     = "writev",     .errmsg = true,
1455           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1456 };
1457
1458 static int syscall_fmt__cmp(const void *name, const void *fmtp)
1459 {
1460         const struct syscall_fmt *fmt = fmtp;
1461         return strcmp(name, fmt->name);
1462 }
1463
1464 static struct syscall_fmt *syscall_fmt__find(const char *name)
1465 {
1466         const int nmemb = ARRAY_SIZE(syscall_fmts);
1467         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1468 }
1469
/*
 * What is known about one syscall.
 * NOTE(review): the field notes below are read off the names and types
 * only - confirm against the code that fills this struct in.
 */
struct syscall {
        struct event_format *tp_format;
        int                 nr_args;    /* presumably the number of entries in 'args' */
        struct format_field *args;
        const char          *name;
        bool                is_exit;
        struct syscall_fmt  *fmt;       /* a 'struct syscall_fmt', the type of the syscall_fmts[] entries */
        /* Arrays of per-argument formatter callbacks and their parameters */
        size_t              (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
        void                **arg_parm;
};
1480
1481 static size_t fprintf_duration(unsigned long t, FILE *fp)
1482 {
1483         double duration = (double)t / NSEC_PER_MSEC;
1484         size_t printed = fprintf(fp, "(");
1485
1486         if (duration >= 1.0)
1487                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1488         else if (duration >= 0.01)
1489                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1490         else
1491                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1492         return printed + fprintf(fp, "): ");
1493 }
1494
1495 /**
1496  * filename.ptr: The filename char pointer that will be vfs_getname'd
1497  * filename.entry_str_pos: Where to insert the string translated from
1498  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
1499  */
1500 struct thread_trace {
1501         u64               entry_time;
1502         u64               exit_time;
1503         bool              entry_pending;
1504         unsigned long     nr_events;
1505         unsigned long     pfmaj, pfmin;
1506         char              *entry_str;
1507         double            runtime_ms;
1508         struct {
1509                 unsigned long ptr;
1510                 short int     entry_str_pos;
1511                 bool          pending_open;
1512                 unsigned int  namelen;
1513                 char          *name;
1514         } filename;
1515         struct {
1516                 int       max;
1517                 char      **table;
1518         } paths;
1519
1520         struct intlist *syscall_stats;
1521 };
1522
1523 static struct thread_trace *thread_trace__new(void)
1524 {
1525         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
1526
1527         if (ttrace)
1528                 ttrace->paths.max = -1;
1529
1530         ttrace->syscall_stats = intlist__new(NULL);
1531
1532         return ttrace;
1533 }
1534
1535 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1536 {
1537         struct thread_trace *ttrace;
1538
1539         if (thread == NULL)
1540                 goto fail;
1541
1542         if (thread__priv(thread) == NULL)
1543                 thread__set_priv(thread, thread_trace__new());
1544
1545         if (thread__priv(thread) == NULL)
1546                 goto fail;
1547
1548         ttrace = thread__priv(thread);
1549         ++ttrace->nr_events;
1550
1551         return ttrace;
1552 fail:
1553         color_fprintf(fp, PERF_COLOR_RED,
1554                       "WARNING: not enough memory, dropping samples!\n");
1555         return NULL;
1556 }
1557
/* Bit flags, one per page fault kind */
#define TRACE_PFMAJ             0x1
#define TRACE_PFMIN             0x2

/* Size used for the entry string buffer */
static const size_t trace__entry_str_size = 2048;
1562
1563 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1564 {
1565         struct thread_trace *ttrace = thread__priv(thread);
1566
1567         if (fd > ttrace->paths.max) {
1568                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
1569
1570                 if (npath == NULL)
1571                         return -1;
1572
1573                 if (ttrace->paths.max != -1) {
1574                         memset(npath + ttrace->paths.max + 1, 0,
1575                                (fd - ttrace->paths.max) * sizeof(char *));
1576                 } else {
1577                         memset(npath, 0, (fd + 1) * sizeof(char *));
1578                 }
1579
1580                 ttrace->paths.table = npath;
1581                 ttrace->paths.max   = fd;
1582         }
1583
1584         ttrace->paths.table[fd] = strdup(pathname);
1585
1586         return ttrace->paths.table[fd] != NULL ? 0 : -1;
1587 }
1588
1589 static int thread__read_fd_path(struct thread *thread, int fd)
1590 {
1591         char linkname[PATH_MAX], pathname[PATH_MAX];
1592         struct stat st;
1593         int ret;
1594
1595         if (thread->pid_ == thread->tid) {
1596                 scnprintf(linkname, sizeof(linkname),
1597                           "/proc/%d/fd/%d", thread->pid_, fd);
1598         } else {
1599                 scnprintf(linkname, sizeof(linkname),
1600                           "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1601         }
1602
1603         if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1604                 return -1;
1605
1606         ret = readlink(linkname, pathname, sizeof(pathname));
1607
1608         if (ret < 0 || ret > st.st_size)
1609                 return -1;
1610
1611         pathname[ret] = '\0';
1612         return trace__set_fd_pathname(thread, fd, pathname);
1613 }
1614
1615 static const char *thread__fd_path(struct thread *thread, int fd,
1616                                    struct trace *trace)
1617 {
1618         struct thread_trace *ttrace = thread__priv(thread);
1619
1620         if (ttrace == NULL)
1621                 return NULL;
1622
1623         if (fd < 0)
1624                 return NULL;
1625
1626         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1627                 if (!trace->live)
1628                         return NULL;
1629                 ++trace->stats.proc_getname;
1630                 if (thread__read_fd_path(thread, fd))
1631                         return NULL;
1632         }
1633
1634         return ttrace->paths.table[fd];
1635 }
1636
1637 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1638                                         struct syscall_arg *arg)
1639 {
1640         int fd = arg->val;
1641         size_t printed = scnprintf(bf, size, "%d", fd);
1642         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1643
1644         if (path)
1645                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1646
1647         return printed;
1648 }
1649
1650 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1651                                               struct syscall_arg *arg)
1652 {
1653         int fd = arg->val;
1654         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1655         struct thread_trace *ttrace = thread__priv(arg->thread);
1656
1657         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1658                 zfree(&ttrace->paths.table[fd]);
1659
1660         return printed;
1661 }
1662
1663 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1664                                      unsigned long ptr)
1665 {
1666         struct thread_trace *ttrace = thread__priv(thread);
1667
1668         ttrace->filename.ptr = ptr;
1669         ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1670 }
1671
1672 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1673                                               struct syscall_arg *arg)
1674 {
1675         unsigned long ptr = arg->val;
1676
1677         if (!arg->trace->vfs_getname)
1678                 return scnprintf(bf, size, "%#x", ptr);
1679
1680         thread__set_filename_pos(arg->thread, bf, ptr);
1681         return 0;
1682 }
1683
1684 static bool trace__filter_duration(struct trace *trace, double t)
1685 {
1686         return t < (trace->duration_filter * NSEC_PER_MSEC);
1687 }
1688
1689 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1690 {
1691         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1692
1693         return fprintf(fp, "%10.3f ", ts);
1694 }
1695
/*
 * Flags written from the signal handler and read by the rest of the tool.
 * They are volatile sig_atomic_t so that stores made in the handler are
 * well-defined and are not cached away by the compiler in the readers.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

/* Request termination; remember whether the signal was SIGINT. */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1704
1705 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1706                                         u64 duration, u64 tstamp, FILE *fp)
1707 {
1708         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1709         printed += fprintf_duration(duration, fp);
1710
1711         if (trace->multiple_threads) {
1712                 if (trace->show_comm)
1713                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1714                 printed += fprintf(fp, "%d ", thread->tid);
1715         }
1716
1717         return printed;
1718 }
1719
1720 static int trace__process_event(struct trace *trace, struct machine *machine,
1721                                 union perf_event *event, struct perf_sample *sample)
1722 {
1723         int ret = 0;
1724
1725         switch (event->header.type) {
1726         case PERF_RECORD_LOST:
1727                 color_fprintf(trace->output, PERF_COLOR_RED,
1728                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1729                 ret = machine__process_lost_event(machine, event, sample);
1730                 break;
1731         default:
1732                 ret = machine__process_event(machine, event, sample);
1733                 break;
1734         }
1735
1736         return ret;
1737 }
1738
1739 static int trace__tool_process(struct perf_tool *tool,
1740                                union perf_event *event,
1741                                struct perf_sample *sample,
1742                                struct machine *machine)
1743 {
1744         struct trace *trace = container_of(tool, struct trace, tool);
1745         return trace__process_event(trace, machine, event, sample);
1746 }
1747
1748 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1749 {
1750         int err = symbol__init(NULL);
1751
1752         if (err)
1753                 return err;
1754
1755         trace->host = machine__new_host();
1756         if (trace->host == NULL)
1757                 return -ENOMEM;
1758
1759         if (trace_event__register_resolver(trace->host, machine__resolve_kernel_addr) < 0)
1760                 return -errno;
1761
1762         err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1763                                             evlist->threads, trace__tool_process, false,
1764                                             trace->opts.proc_map_timeout);
1765         if (err)
1766                 symbol__exit();
1767
1768         return err;
1769 }
1770
1771 static int syscall__set_arg_fmts(struct syscall *sc)
1772 {
1773         struct format_field *field;
1774         int idx = 0;
1775
1776         sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1777         if (sc->arg_scnprintf == NULL)
1778                 return -1;
1779
1780         if (sc->fmt)
1781                 sc->arg_parm = sc->fmt->arg_parm;
1782
1783         for (field = sc->args; field; field = field->next) {
1784                 if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1785                         sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1786                 else if (field->flags & FIELD_IS_POINTER)
1787                         sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1788                 else if (strcmp(field->type, "pid_t") == 0)
1789                         sc->arg_scnprintf[idx] = SCA_PID;
1790                 else if (strcmp(field->type, "umode_t") == 0)
1791                         sc->arg_scnprintf[idx] = SCA_MODE_T;
1792                 ++idx;
1793         }
1794
1795         return 0;
1796 }
1797
1798 static int trace__read_syscall_info(struct trace *trace, int id)
1799 {
1800         char tp_name[128];
1801         struct syscall *sc;
1802         const char *name = syscalltbl__name(trace->sctbl, id);
1803
1804         if (name == NULL)
1805                 return -1;
1806
1807         if (id > trace->syscalls.max) {
1808                 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1809
1810                 if (nsyscalls == NULL)
1811                         return -1;
1812
1813                 if (trace->syscalls.max != -1) {
1814                         memset(nsyscalls + trace->syscalls.max + 1, 0,
1815                                (id - trace->syscalls.max) * sizeof(*sc));
1816                 } else {
1817                         memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1818                 }
1819
1820                 trace->syscalls.table = nsyscalls;
1821                 trace->syscalls.max   = id;
1822         }
1823
1824         sc = trace->syscalls.table + id;
1825         sc->name = name;
1826
1827         sc->fmt  = syscall_fmt__find(sc->name);
1828
1829         snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1830         sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1831
1832         if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1833                 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1834                 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1835         }
1836
1837         if (IS_ERR(sc->tp_format))
1838                 return -1;
1839
1840         sc->args = sc->tp_format->format.fields;
1841         sc->nr_args = sc->tp_format->format.nr_fields;
1842         /*
1843          * We need to check and discard the first variable '__syscall_nr'
1844          * or 'nr' that mean the syscall number. It is needless here.
1845          * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1846          */
1847         if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1848                 sc->args = sc->args->next;
1849                 --sc->nr_args;
1850         }
1851
1852         sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1853
1854         return syscall__set_arg_fmts(sc);
1855 }
1856
1857 static int trace__validate_ev_qualifier(struct trace *trace)
1858 {
1859         int err = 0, i;
1860         struct str_node *pos;
1861
1862         trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1863         trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1864                                                  sizeof(trace->ev_qualifier_ids.entries[0]));
1865
1866         if (trace->ev_qualifier_ids.entries == NULL) {
1867                 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1868                        trace->output);
1869                 err = -EINVAL;
1870                 goto out;
1871         }
1872
1873         i = 0;
1874
1875         strlist__for_each(pos, trace->ev_qualifier) {
1876                 const char *sc = pos->s;
1877                 int id = syscalltbl__id(trace->sctbl, sc);
1878
1879                 if (id < 0) {
1880                         if (err == 0) {
1881                                 fputs("Error:\tInvalid syscall ", trace->output);
1882                                 err = -EINVAL;
1883                         } else {
1884                                 fputs(", ", trace->output);
1885                         }
1886
1887                         fputs(sc, trace->output);
1888                 }
1889
1890                 trace->ev_qualifier_ids.entries[i++] = id;
1891         }
1892
1893         if (err < 0) {
1894                 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1895                       "\nHint:\tand: 'man syscalls'\n", trace->output);
1896                 zfree(&trace->ev_qualifier_ids.entries);
1897                 trace->ev_qualifier_ids.nr = 0;
1898         }
1899 out:
1900         return err;
1901 }
1902
1903 /*
1904  * args is to be interpreted as a series of longs but we need to handle
1905  * 8-byte unaligned accesses. args points to raw_data within the event
1906  * and raw_data is guaranteed to be 8-byte unaligned because it is
1907  * preceded by raw_size which is a u32. So we need to copy args to a temp
1908  * variable to read it. Most notably this avoids extended load instructions
1909  * on unaligned addresses
1910  */
1911
1912 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1913                                       unsigned char *args, struct trace *trace,
1914                                       struct thread *thread)
1915 {
1916         size_t printed = 0;
1917         unsigned char *p;
1918         unsigned long val;
1919
1920         if (sc->args != NULL) {
1921                 struct format_field *field;
1922                 u8 bit = 1;
1923                 struct syscall_arg arg = {
1924                         .idx    = 0,
1925                         .mask   = 0,
1926                         .trace  = trace,
1927                         .thread = thread,
1928                 };
1929
1930                 for (field = sc->args; field;
1931                      field = field->next, ++arg.idx, bit <<= 1) {
1932                         if (arg.mask & bit)
1933                                 continue;
1934
1935                         /* special care for unaligned accesses */
1936                         p = args + sizeof(unsigned long) * arg.idx;
1937                         memcpy(&val, p, sizeof(val));
1938
1939                         /*
1940                          * Suppress this argument if its value is zero and
1941                          * and we don't have a string associated in an
1942                          * strarray for it.
1943                          */
1944                         if (val == 0 &&
1945                             !(sc->arg_scnprintf &&
1946                               sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1947                               sc->arg_parm[arg.idx]))
1948                                 continue;
1949
1950                         printed += scnprintf(bf + printed, size - printed,
1951                                              "%s%s: ", printed ? ", " : "", field->name);
1952                         if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1953                                 arg.val = val;
1954                                 if (sc->arg_parm)
1955                                         arg.parm = sc->arg_parm[arg.idx];
1956                                 printed += sc->arg_scnprintf[arg.idx](bf + printed,
1957                                                                       size - printed, &arg);
1958                         } else {
1959                                 printed += scnprintf(bf + printed, size - printed,
1960                                                      "%ld", val);
1961                         }
1962                 }
1963         } else {
1964                 int i = 0;
1965
1966                 while (i < 6) {
1967                         /* special care for unaligned accesses */
1968                         p = args + sizeof(unsigned long) * i;
1969                         memcpy(&val, p, sizeof(val));
1970                         printed += scnprintf(bf + printed, size - printed,
1971                                              "%sarg%d: %ld",
1972                                              printed ? ", " : "", i, val);
1973                         ++i;
1974                 }
1975         }
1976
1977         return printed;
1978 }
1979
/* Signature shared by the per-tracepoint sample handlers in this file. */
typedef int (*tracepoint_handler)(struct trace *trace,
				  struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1983
1984 static struct syscall *trace__syscall_info(struct trace *trace,
1985                                            struct perf_evsel *evsel, int id)
1986 {
1987
1988         if (id < 0) {
1989
1990                 /*
1991                  * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1992                  * before that, leaving at a higher verbosity level till that is
1993                  * explained. Reproduced with plain ftrace with:
1994                  *
1995                  * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1996                  * grep "NR -1 " /t/trace_pipe
1997                  *
1998                  * After generating some load on the machine.
1999                  */
2000                 if (verbose > 1) {
2001                         static u64 n;
2002                         fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
2003                                 id, perf_evsel__name(evsel), ++n);
2004                 }
2005                 return NULL;
2006         }
2007
2008         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
2009             trace__read_syscall_info(trace, id))
2010                 goto out_cant_read;
2011
2012         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
2013                 goto out_cant_read;
2014
2015         return &trace->syscalls.table[id];
2016
2017 out_cant_read:
2018         if (verbose) {
2019                 fprintf(trace->output, "Problems reading syscall %d", id);
2020                 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
2021                         fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
2022                 fputs(" information\n", trace->output);
2023         }
2024         return NULL;
2025 }
2026
2027 static void thread__update_stats(struct thread_trace *ttrace,
2028                                  int id, struct perf_sample *sample)
2029 {
2030         struct int_node *inode;
2031         struct stats *stats;
2032         u64 duration = 0;
2033
2034         inode = intlist__findnew(ttrace->syscall_stats, id);
2035         if (inode == NULL)
2036                 return;
2037
2038         stats = inode->priv;
2039         if (stats == NULL) {
2040                 stats = malloc(sizeof(struct stats));
2041                 if (stats == NULL)
2042                         return;
2043                 init_stats(stats);
2044                 inode->priv = stats;
2045         }
2046
2047         if (ttrace->entry_time && sample->time > ttrace->entry_time)
2048                 duration = sample->time - ttrace->entry_time;
2049
2050         update_stats(stats, duration);
2051 }
2052
2053 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
2054 {
2055         struct thread_trace *ttrace;
2056         u64 duration;
2057         size_t printed;
2058
2059         if (trace->current == NULL)
2060                 return 0;
2061
2062         ttrace = thread__priv(trace->current);
2063
2064         if (!ttrace->entry_pending)
2065                 return 0;
2066
2067         duration = sample->time - ttrace->entry_time;
2068
2069         printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
2070         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
2071         ttrace->entry_pending = false;
2072
2073         return printed;
2074 }
2075
2076 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
2077                             union perf_event *event __maybe_unused,
2078                             struct perf_sample *sample)
2079 {
2080         char *msg;
2081         void *args;
2082         size_t printed = 0;
2083         struct thread *thread;
2084         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
2085         struct syscall *sc = trace__syscall_info(trace, evsel, id);
2086         struct thread_trace *ttrace;
2087
2088         if (sc == NULL)
2089                 return -1;
2090
2091         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2092         ttrace = thread__trace(thread, trace->output);
2093         if (ttrace == NULL)
2094                 goto out_put;
2095
2096         args = perf_evsel__sc_tp_ptr(evsel, args, sample);
2097
2098         if (ttrace->entry_str == NULL) {
2099                 ttrace->entry_str = malloc(trace__entry_str_size);
2100                 if (!ttrace->entry_str)
2101                         goto out_put;
2102         }
2103
2104         if (!trace->summary_only)
2105                 trace__printf_interrupted_entry(trace, sample);
2106
2107         ttrace->entry_time = sample->time;
2108         msg = ttrace->entry_str;
2109         printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
2110
2111         printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
2112                                            args, trace, thread);
2113
2114         if (sc->is_exit) {
2115                 if (!trace->duration_filter && !trace->summary_only) {
2116                         trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
2117                         fprintf(trace->output, "%-70s\n", ttrace->entry_str);
2118                 }
2119         } else {
2120                 ttrace->entry_pending = true;
2121                 /* See trace__vfs_getname & trace__sys_exit */
2122                 ttrace->filename.pending_open = false;
2123         }
2124
2125         if (trace->current != thread) {
2126                 thread__put(trace->current);
2127                 trace->current = thread__get(thread);
2128         }
2129         err = 0;
2130 out_put:
2131         thread__put(thread);
2132         return err;
2133 }
2134
2135 static int trace__fprintf_callchain(struct trace *trace, struct perf_evsel *evsel,
2136                                     struct perf_sample *sample)
2137 {
2138         struct addr_location al;
2139         /* TODO: user-configurable print_opts */
2140         const unsigned int print_opts = EVSEL__PRINT_SYM |
2141                                         EVSEL__PRINT_DSO |
2142                                         EVSEL__PRINT_UNKNOWN_AS_ADDR;
2143
2144         if (sample->callchain == NULL)
2145                 return 0;
2146
2147         if (machine__resolve(trace->host, &al, sample) < 0) {
2148                 pr_err("Problem processing %s callchain, skipping...\n",
2149                         perf_evsel__name(evsel));
2150                 return 0;
2151         }
2152
2153         return perf_evsel__fprintf_callchain(evsel, sample, &al, 38, print_opts,
2154                                              scripting_max_stack, trace->output);
2155 }
2156
2157 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
2158                            union perf_event *event __maybe_unused,
2159                            struct perf_sample *sample)
2160 {
2161         long ret;
2162         u64 duration = 0;
2163         struct thread *thread;
2164         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
2165         struct syscall *sc = trace__syscall_info(trace, evsel, id);
2166         struct thread_trace *ttrace;
2167
2168         if (sc == NULL)
2169                 return -1;
2170
2171         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2172         ttrace = thread__trace(thread, trace->output);
2173         if (ttrace == NULL)
2174                 goto out_put;
2175
2176         if (trace->summary)
2177                 thread__update_stats(ttrace, id, sample);
2178
2179         ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
2180
2181         if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
2182                 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
2183                 ttrace->filename.pending_open = false;
2184                 ++trace->stats.vfs_getname;
2185         }
2186
2187         ttrace->exit_time = sample->time;
2188
2189         if (ttrace->entry_time) {
2190                 duration = sample->time - ttrace->entry_time;
2191                 if (trace__filter_duration(trace, duration))
2192                         goto out;
2193         } else if (trace->duration_filter)
2194                 goto out;
2195
2196         if (trace->summary_only)
2197                 goto out;
2198
2199         trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
2200
2201         if (ttrace->entry_pending) {
2202                 fprintf(trace->output, "%-70s", ttrace->entry_str);
2203         } else {
2204                 fprintf(trace->output, " ... [");
2205                 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
2206                 fprintf(trace->output, "]: %s()", sc->name);
2207         }
2208
2209         if (sc->fmt == NULL) {
2210 signed_print:
2211                 fprintf(trace->output, ") = %ld", ret);
2212         } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
2213                 char bf[STRERR_BUFSIZE];
2214                 const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
2215                            *e = audit_errno_to_name(-ret);
2216
2217                 fprintf(trace->output, ") = -1 %s %s", e, emsg);
2218         } else if (ret == 0 && sc->fmt->timeout)
2219                 fprintf(trace->output, ") = 0 Timeout");
2220         else if (sc->fmt->hexret)
2221                 fprintf(trace->output, ") = %#lx", ret);
2222         else if (sc->fmt->errpid) {
2223                 struct thread *child = machine__find_thread(trace->host, ret, ret);
2224
2225                 if (child != NULL) {
2226                         fprintf(trace->output, ") = %ld", ret);
2227                         if (child->comm_set)
2228                                 fprintf(trace->output, " (%s)", thread__comm_str(child));
2229                         thread__put(child);
2230                 }
2231         } else
2232                 goto signed_print;
2233
2234         fputc('\n', trace->output);
2235
2236         trace__fprintf_callchain(trace, evsel, sample);
2237 out:
2238         ttrace->entry_pending = false;
2239         err = 0;
2240 out_put:
2241         thread__put(thread);
2242         return err;
2243 }
2244
2245 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
2246                               union perf_event *event __maybe_unused,
2247                               struct perf_sample *sample)
2248 {
2249         struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2250         struct thread_trace *ttrace;
2251         size_t filename_len, entry_str_len, to_move;
2252         ssize_t remaining_space;
2253         char *pos;
2254         const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
2255
2256         if (!thread)
2257                 goto out;
2258
2259         ttrace = thread__priv(thread);
2260         if (!ttrace)
2261                 goto out;
2262
2263         filename_len = strlen(filename);
2264
2265         if (ttrace->filename.namelen < filename_len) {
2266                 char *f = realloc(ttrace->filename.name, filename_len + 1);
2267
2268                 if (f == NULL)
2269                                 goto out;
2270
2271                 ttrace->filename.namelen = filename_len;
2272                 ttrace->filename.name = f;
2273         }
2274
2275         strcpy(ttrace->filename.name, filename);
2276         ttrace->filename.pending_open = true;
2277
2278         if (!ttrace->filename.ptr)
2279                 goto out;
2280
2281         entry_str_len = strlen(ttrace->entry_str);
2282         remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
2283         if (remaining_space <= 0)
2284                 goto out;
2285
2286         if (filename_len > (size_t)remaining_space) {
2287                 filename += filename_len - remaining_space;
2288                 filename_len = remaining_space;
2289         }
2290
2291         to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
2292         pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
2293         memmove(pos + filename_len, pos, to_move);
2294         memcpy(pos, filename, filename_len);
2295
2296         ttrace->filename.ptr = 0;
2297         ttrace->filename.entry_str_pos = 0;
2298 out:
2299         return 0;
2300 }
2301
2302 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
2303                                      union perf_event *event __maybe_unused,
2304                                      struct perf_sample *sample)
2305 {
2306         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
2307         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
2308         struct thread *thread = machine__findnew_thread(trace->host,
2309                                                         sample->pid,
2310                                                         sample->tid);
2311         struct thread_trace *ttrace = thread__trace(thread, trace->output);
2312
2313         if (ttrace == NULL)
2314                 goto out_dump;
2315
2316         ttrace->runtime_ms += runtime_ms;
2317         trace->runtime_ms += runtime_ms;
2318         thread__put(thread);
2319         return 0;
2320
2321 out_dump:
2322         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
2323                evsel->name,
2324                perf_evsel__strval(evsel, sample, "comm"),
2325                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
2326                runtime,
2327                perf_evsel__intval(evsel, sample, "vruntime"));
2328         thread__put(thread);
2329         return 0;
2330 }
2331
2332 static void bpf_output__printer(enum binary_printer_ops op,
2333                                 unsigned int val, void *extra)
2334 {
2335         FILE *output = extra;
2336         unsigned char ch = (unsigned char)val;
2337
2338         switch (op) {
2339         case BINARY_PRINT_CHAR_DATA:
2340                 fprintf(output, "%c", isprint(ch) ? ch : '.');
2341                 break;
2342         case BINARY_PRINT_DATA_BEGIN:
2343         case BINARY_PRINT_LINE_BEGIN:
2344         case BINARY_PRINT_ADDR:
2345         case BINARY_PRINT_NUM_DATA:
2346         case BINARY_PRINT_NUM_PAD:
2347         case BINARY_PRINT_SEP:
2348         case BINARY_PRINT_CHAR_PAD:
2349         case BINARY_PRINT_LINE_END:
2350         case BINARY_PRINT_DATA_END:
2351         default:
2352                 break;
2353         }
2354 }
2355
2356 static void bpf_output__fprintf(struct trace *trace,
2357                                 struct perf_sample *sample)
2358 {
2359         print_binary(sample->raw_data, sample->raw_size, 8,
2360                      bpf_output__printer, trace->output);
2361 }
2362
2363 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
2364                                 union perf_event *event __maybe_unused,
2365                                 struct perf_sample *sample)
2366 {
2367         trace__printf_interrupted_entry(trace, sample);
2368         trace__fprintf_tstamp(trace, sample->time, trace->output);
2369
2370         if (trace->trace_syscalls)
2371                 fprintf(trace->output, "(         ): ");
2372
2373         fprintf(trace->output, "%s:", evsel->name);
2374
2375         if (perf_evsel__is_bpf_output(evsel)) {
2376                 bpf_output__fprintf(trace, sample);
2377         } else if (evsel->tp_format) {
2378                 event_format__fprintf(evsel->tp_format, sample->cpu,
2379                                       sample->raw_data, sample->raw_size,
2380                                       trace->output);
2381         }
2382
2383         fprintf(trace->output, ")\n");
2384
2385         trace__fprintf_callchain(trace, evsel, sample);
2386
2387         return 0;
2388 }
2389
2390 static void print_location(FILE *f, struct perf_sample *sample,
2391                            struct addr_location *al,
2392                            bool print_dso, bool print_sym)
2393 {
2394
2395         if ((verbose || print_dso) && al->map)
2396                 fprintf(f, "%s@", al->map->dso->long_name);
2397
2398         if ((verbose || print_sym) && al->sym)
2399                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2400                         al->addr - al->sym->start);
2401         else if (al->map)
2402                 fprintf(f, "0x%" PRIx64, al->addr);
2403         else
2404                 fprintf(f, "0x%" PRIx64, sample->addr);
2405 }
2406
2407 static int trace__pgfault(struct trace *trace,
2408                           struct perf_evsel *evsel,
2409                           union perf_event *event __maybe_unused,
2410                           struct perf_sample *sample)
2411 {
2412         struct thread *thread;
2413         struct addr_location al;
2414         char map_type = 'd';
2415         struct thread_trace *ttrace;
2416         int err = -1;
2417
2418         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2419         ttrace = thread__trace(thread, trace->output);
2420         if (ttrace == NULL)
2421                 goto out_put;
2422
2423         if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2424                 ttrace->pfmaj++;
2425         else
2426                 ttrace->pfmin++;
2427
2428         if (trace->summary_only)
2429                 goto out;
2430
2431         thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
2432                               sample->ip, &al);
2433
2434         trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
2435
2436         fprintf(trace->output, "%sfault [",
2437                 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2438                 "maj" : "min");
2439
2440         print_location(trace->output, sample, &al, false, true);
2441
2442         fprintf(trace->output, "] => ");
2443
2444         thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
2445                                    sample->addr, &al);
2446
2447         if (!al.map) {
2448                 thread__find_addr_location(thread, sample->cpumode,
2449                                            MAP__FUNCTION, sample->addr, &al);
2450
2451                 if (al.map)
2452                         map_type = 'x';
2453                 else
2454                         map_type = '?';
2455         }
2456
2457         print_location(trace->output, sample, &al, true, false);
2458
2459         fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2460 out:
2461         err = 0;
2462 out_put:
2463         thread__put(thread);
2464         return err;
2465 }
2466
2467 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
2468 {
2469         if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
2470             (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
2471                 return false;
2472
2473         if (trace->pid_list || trace->tid_list)
2474                 return true;
2475
2476         return false;
2477 }
2478
2479 static void trace__set_base_time(struct trace *trace,
2480                                  struct perf_evsel *evsel,
2481                                  struct perf_sample *sample)
2482 {
2483         /*
2484          * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2485          * and don't use sample->time unconditionally, we may end up having
2486          * some other event in the future without PERF_SAMPLE_TIME for good
2487          * reason, i.e. we may not be interested in its timestamps, just in
2488          * it taking place, picking some piece of information when it
2489          * appears in our event stream (vfs_getname comes to mind).
2490          */
2491         if (trace->base_time == 0 && !trace->full_time &&
2492             (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2493                 trace->base_time = sample->time;
2494 }
2495
2496 static int trace__process_sample(struct perf_tool *tool,
2497                                  union perf_event *event,
2498                                  struct perf_sample *sample,
2499                                  struct perf_evsel *evsel,
2500                                  struct machine *machine __maybe_unused)
2501 {
2502         struct trace *trace = container_of(tool, struct trace, tool);
2503         int err = 0;
2504
2505         tracepoint_handler handler = evsel->handler;
2506
2507         if (skip_sample(trace, sample))
2508                 return 0;
2509
2510         trace__set_base_time(trace, evsel, sample);
2511
2512         if (handler) {
2513                 ++trace->nr_events;
2514                 handler(trace, evsel, event, sample);
2515         }
2516
2517         return err;
2518 }
2519
2520 static int parse_target_str(struct trace *trace)
2521 {
2522         if (trace->opts.target.pid) {
2523                 trace->pid_list = intlist__new(trace->opts.target.pid);
2524                 if (trace->pid_list == NULL) {
2525                         pr_err("Error parsing process id string\n");
2526                         return -EINVAL;
2527                 }
2528         }
2529
2530         if (trace->opts.target.tid) {
2531                 trace->tid_list = intlist__new(trace->opts.target.tid);
2532                 if (trace->tid_list == NULL) {
2533                         pr_err("Error parsing thread id string\n");
2534                         return -EINVAL;
2535                 }
2536         }
2537
2538         return 0;
2539 }
2540
2541 static int trace__record(struct trace *trace, int argc, const char **argv)
2542 {
2543         unsigned int rec_argc, i, j;
2544         const char **rec_argv;
2545         const char * const record_args[] = {
2546                 "record",
2547                 "-R",
2548                 "-m", "1024",
2549                 "-c", "1",
2550         };
2551
2552         const char * const sc_args[] = { "-e", };
2553         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2554         const char * const majpf_args[] = { "-e", "major-faults" };
2555         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2556         const char * const minpf_args[] = { "-e", "minor-faults" };
2557         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2558
2559         /* +1 is for the event string below */
2560         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2561                 majpf_args_nr + minpf_args_nr + argc;
2562         rec_argv = calloc(rec_argc + 1, sizeof(char *));
2563
2564         if (rec_argv == NULL)
2565                 return -ENOMEM;
2566
2567         j = 0;
2568         for (i = 0; i < ARRAY_SIZE(record_args); i++)
2569                 rec_argv[j++] = record_args[i];
2570
2571         if (trace->trace_syscalls) {
2572                 for (i = 0; i < sc_args_nr; i++)
2573                         rec_argv[j++] = sc_args[i];
2574
2575                 /* event string may be different for older kernels - e.g., RHEL6 */
2576                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2577                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2578                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2579                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2580                 else {
2581                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2582                         return -1;
2583                 }
2584         }
2585
2586         if (trace->trace_pgfaults & TRACE_PFMAJ)
2587                 for (i = 0; i < majpf_args_nr; i++)
2588                         rec_argv[j++] = majpf_args[i];
2589
2590         if (trace->trace_pgfaults & TRACE_PFMIN)
2591                 for (i = 0; i < minpf_args_nr; i++)
2592                         rec_argv[j++] = minpf_args[i];
2593
2594         for (i = 0; i < (unsigned int)argc; i++)
2595                 rec_argv[j++] = argv[i];
2596
2597         return cmd_record(j, rec_argv, NULL);
2598 }
2599
2600 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2601
2602 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2603 {
2604         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2605
2606         if (IS_ERR(evsel))
2607                 return false;
2608
2609         if (perf_evsel__field(evsel, "pathname") == NULL) {
2610                 perf_evsel__delete(evsel);
2611                 return false;
2612         }
2613
2614         evsel->handler = trace__vfs_getname;
2615         perf_evlist__add(evlist, evsel);
2616         return true;
2617 }
2618
2619 static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
2620                                     u64 config)
2621 {
2622         struct perf_evsel *evsel;
2623         struct perf_event_attr attr = {
2624                 .type = PERF_TYPE_SOFTWARE,
2625                 .mmap_data = 1,
2626         };
2627
2628         attr.config = config;
2629         attr.sample_period = 1;
2630
2631         event_attr_init(&attr);
2632
2633         evsel = perf_evsel__new(&attr);
2634         if (!evsel)
2635                 return -ENOMEM;
2636
2637         evsel->handler = trace__pgfault;
2638         perf_evlist__add(evlist, evsel);
2639
2640         return 0;
2641 }
2642
2643 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2644 {
2645         const u32 type = event->header.type;
2646         struct perf_evsel *evsel;
2647
2648         if (type != PERF_RECORD_SAMPLE) {
2649                 trace__process_event(trace, trace->host, event, sample);
2650                 return;
2651         }
2652
2653         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2654         if (evsel == NULL) {
2655                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2656                 return;
2657         }
2658
2659         trace__set_base_time(trace, evsel, sample);
2660
2661         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2662             sample->raw_data == NULL) {
2663                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2664                        perf_evsel__name(evsel), sample->tid,
2665                        sample->cpu, sample->raw_size);
2666         } else {
2667                 tracepoint_handler handler = evsel->handler;
2668                 handler(trace, evsel, event, sample);
2669         }
2670 }
2671
2672 static int trace__add_syscall_newtp(struct trace *trace)
2673 {
2674         int ret = -1;
2675         struct perf_evlist *evlist = trace->evlist;
2676         struct perf_evsel *sys_enter, *sys_exit;
2677
2678         sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2679         if (sys_enter == NULL)
2680                 goto out;
2681
2682         if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2683                 goto out_delete_sys_enter;
2684
2685         sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2686         if (sys_exit == NULL)
2687                 goto out_delete_sys_enter;
2688
2689         if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2690                 goto out_delete_sys_exit;
2691
2692         perf_evlist__add(evlist, sys_enter);
2693         perf_evlist__add(evlist, sys_exit);
2694
2695         if (trace->opts.callgraph_set && !trace->kernel_syscallchains) {
2696                 /*
2697                  * We're interested only in the user space callchain
2698                  * leading to the syscall, allow overriding that for
2699                  * debugging reasons using --kernel_syscall_callchains
2700                  */
2701                 sys_exit->attr.exclude_callchain_kernel = 1;
2702         }
2703
2704         trace->syscalls.events.sys_enter = sys_enter;
2705         trace->syscalls.events.sys_exit  = sys_exit;
2706
2707         ret = 0;
2708 out:
2709         return ret;
2710
2711 out_delete_sys_exit:
2712         perf_evsel__delete_priv(sys_exit);
2713 out_delete_sys_enter:
2714         perf_evsel__delete_priv(sys_enter);
2715         goto out;
2716 }
2717
2718 static int trace__set_ev_qualifier_filter(struct trace *trace)
2719 {
2720         int err = -1;
2721         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2722                                                 trace->ev_qualifier_ids.nr,
2723                                                 trace->ev_qualifier_ids.entries);
2724
2725         if (filter == NULL)
2726                 goto out_enomem;
2727
2728         if (!perf_evsel__append_filter(trace->syscalls.events.sys_enter, "&&", filter))
2729                 err = perf_evsel__append_filter(trace->syscalls.events.sys_exit, "&&", filter);
2730
2731         free(filter);
2732 out:
2733         return err;
2734 out_enomem:
2735         errno = ENOMEM;
2736         goto out;
2737 }
2738
/*
 * Live mode: set up the events selected in @trace, optionally fork the
 * workload given by @argc/@argv, then read and handle events from the
 * ring buffers until there is nothing left to do.
 *
 * The evlist is deleted, and trace->evlist cleared, on every exit path.
 *
 * Returns the last value stored in err: 0 when everything went fine, the
 * failing call's result on the setup error paths, or -1 when adding one
 * of the events failed (error message printed on trace->output).
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
        struct perf_evlist *evlist = trace->evlist;
        struct perf_evsel *evsel;
        int err = -1, i;
        unsigned long before;
        const bool forks = argc > 0;
        bool draining = false;

        trace->live = true;

        /* Add the events the trace settings ask for. */
        if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
                goto out_error_raw_syscalls;

        if (trace->trace_syscalls)
                trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

        if ((trace->trace_pgfaults & TRACE_PFMAJ) &&
            perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) {
                goto out_error_mem;
        }

        if ((trace->trace_pgfaults & TRACE_PFMIN) &&
            perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
                goto out_error_mem;

        if (trace->sched &&
            perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
                                   trace__sched_stat_runtime))
                goto out_error_sched_stat_runtime;

        err = perf_evlist__create_maps(evlist, &trace->opts.target);
        if (err < 0) {
                fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
                goto out_delete_evlist;
        }

        err = trace__symbols_init(trace, evlist);
        if (err < 0) {
                fprintf(trace->output, "Problems initializing symbol libraries!\n");
                goto out_delete_evlist;
        }

        perf_evlist__config(evlist, &trace->opts, NULL);

        if (trace->opts.callgraph_set && trace->syscalls.events.sys_exit) {
                perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
                                             &trace->opts, &callchain_param);
               /*
                * Now we have evsels with different sample_ids, use
                * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
                * from a fixed position in each ring buffer record.
                *
                * As of the changeset introducing this comment, this
                * isn't strictly needed, as the fields that can come before
                * PERF_SAMPLE_ID are all used, but we'll probably disable
                * some of those for things like copying the payload of
                * pointer syscall arguments, and for vfs_getname we don't
                * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
                * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
                */
                perf_evlist__set_sample_bit(evlist, IDENTIFIER);
                perf_evlist__reset_sample_bit(evlist, ID);
        }

        signal(SIGCHLD, sig_handler);
        signal(SIGINT, sig_handler);

        if (forks) {
                err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
                                                    argv, false, NULL);
                if (err < 0) {
                        fprintf(trace->output, "Couldn't run the workload!\n");
                        goto out_delete_evlist;
                }
        }

        err = perf_evlist__open(evlist);
        if (err < 0)
                goto out_error_open;

        err = bpf__apply_obj_config();
        if (err) {
                char errbuf[BUFSIZ];

                bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
                pr_err("ERROR: Apply config to BPF failed: %s\n",
                         errbuf);
                /*
                 * NOTE(review): out_error_open goes on to print a second
                 * message, built by perf_evlist__strerror_open() from
                 * errno - confirm that is intended for this failure.
                 */
                goto out_error_open;
        }

        /*
         * Better not use !target__has_task() here because we need to cover the
         * case where no threads were specified in the command line, but a
         * workload was, and in that case we will fill in the thread_map when
         * we fork the workload in perf_evlist__prepare_workload.
         */
        if (trace->filter_pids.nr > 0)
                err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
        else if (thread_map__pid(evlist->threads, 0) == -1)
                err = perf_evlist__set_filter_pid(evlist, getpid());

        /*
         * NOTE(review): any failure setting the pid filter is reported as
         * "Not enough memory to run!" by out_error_mem.
         */
        if (err < 0)
                goto out_error_mem;

        if (trace->ev_qualifier_ids.nr > 0) {
                err = trace__set_ev_qualifier_filter(trace);
                if (err < 0)
                        goto out_errno;

                pr_debug("event qualifier tracepoint filter: %s\n",
                         trace->syscalls.events.sys_exit->filter);
        }

        /* On failure evsel is used below to name the offending event. */
        err = perf_evlist__apply_filters(evlist, &evsel);
        if (err < 0)
                goto out_error_apply_filters;

        err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
        if (err < 0)
                goto out_error_mmap;

        if (!target__none(&trace->opts.target))
                perf_evlist__enable(evlist);

        if (forks)
                perf_evlist__start_workload(evlist);

        trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
                                  evlist->threads->nr > 1 ||
                                  perf_evlist__first(evlist)->attr.inherit;
        /* Main loop: one pass over all the ring buffers per iteration. */
again:
        before = trace->nr_events;

        for (i = 0; i < evlist->nr_mmaps; i++) {
                union perf_event *event;

                while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
                        struct perf_sample sample;

                        ++trace->nr_events;

                        err = perf_evlist__parse_sample(evlist, event, &sample);
                        if (err) {
                                fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
                                goto next_event;
                        }

                        trace__handle_event(trace, event, &sample);
next_event:
                        perf_evlist__mmap_consume(evlist, i);

                        if (interrupted)
                                goto out_disable;

                        /* Asked to finish: stop the events, keep reading what is left. */
                        if (done && !draining) {
                                perf_evlist__disable(evlist);
                                draining = true;
                        }
                }
        }

        /* Nothing was read in this pass: wait for more, unless draining. */
        if (trace->nr_events == before) {
                int timeout = done ? 100 : -1;

                if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
                        if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
                                draining = true;

                        goto again;
                }
        } else {
                goto again;
        }

out_disable:
        thread__zput(trace->current);

        perf_evlist__disable(evlist);

        /* err holds the result of the last perf_evlist__parse_sample() call, if any. */
        if (!err) {
                if (trace->summary)
                        trace__fprintf_thread_summary(trace, trace->output);

                if (trace->show_tool_stats) {
                        fprintf(trace->output, "Stats:\n "
                                               " vfs_getname : %" PRIu64 "\n"
                                               " proc_getname: %" PRIu64 "\n",
                                trace->stats.vfs_getname,
                                trace->stats.proc_getname);
                }
        }

        /* Common exit: every path, successful or not, ends up here. */
out_delete_evlist:
        perf_evlist__delete(evlist);
        trace->evlist = NULL;
        trace->live = false;
        return err;
        /*
         * Error reporting.  The labels in the block below are only reached
         * via goto; the block exists to give them a scoped errbuf.
         */
{
        char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
        tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
        goto out_error;

out_error_raw_syscalls:
        tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
        goto out_error;

out_error_mmap:
        perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
        goto out_error;

out_error_open:
        perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

        /* Print whatever message was formatted into errbuf above. */
out_error:
        fprintf(trace->output, "%s\n", errbuf);
        goto out_delete_evlist;

out_error_apply_filters:
        fprintf(trace->output,
                "Failed to set filter \"%s\" on event %s with %d (%s)\n",
                evsel->filter, perf_evsel__name(evsel), errno,
                strerror_r(errno, errbuf, sizeof(errbuf)));
        goto out_delete_evlist;
}
out_error_mem:
        fprintf(trace->output, "Not enough memory to run!\n");
        goto out_delete_evlist;

out_errno:
        fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
        goto out_delete_evlist;
}
2974
2975 static int trace__replay(struct trace *trace)
2976 {
2977         const struct perf_evsel_str_handler handlers[] = {
2978                 { "probe:vfs_getname",       trace__vfs_getname, },
2979         };
2980         struct perf_data_file file = {
2981                 .path  = input_name,
2982                 .mode  = PERF_DATA_MODE_READ,
2983                 .force = trace->force,
2984         };
2985         struct perf_session *session;
2986         struct perf_evsel *evsel;
2987         int err = -1;
2988
2989         trace->tool.sample        = trace__process_sample;
2990         trace->tool.mmap          = perf_event__process_mmap;
2991         trace->tool.mmap2         = perf_event__process_mmap2;
2992         trace->tool.comm          = perf_event__process_comm;
2993         trace->tool.exit          = perf_event__process_exit;
2994         trace->tool.fork          = perf_event__process_fork;
2995         trace->tool.attr          = perf_event__process_attr;
2996         trace->tool.tracing_data = perf_event__process_tracing_data;
2997         trace->tool.build_id      = perf_event__process_build_id;
2998
2999         trace->tool.ordered_events = true;
3000         trace->tool.ordering_requires_timestamps = true;
3001
3002         /* add tid to output */
3003         trace->multiple_threads = true;
3004
3005         session = perf_session__new(&file, false, &trace->tool);
3006         if (session == NULL)
3007                 return -1;
3008
3009         if (symbol__init(&session->header.env) < 0)
3010                 goto out;
3011
3012         trace->host = &session->machines.host;
3013
3014         err = perf_session__set_tracepoints_handlers(session, handlers);
3015         if (err)
3016                 goto out;
3017
3018         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
3019                                                      "raw_syscalls:sys_enter");
3020         /* older kernels have syscalls tp versus raw_syscalls */
3021         if (evsel == NULL)
3022                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
3023                                                              "syscalls:sys_enter");
3024
3025         if (evsel &&
3026             (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
3027             perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
3028                 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
3029                 goto out;
3030         }
3031
3032         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
3033                                                      "raw_syscalls:sys_exit");
3034         if (evsel == NULL)
3035                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
3036                                                              "syscalls:sys_exit");
3037         if (evsel &&
3038             (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
3039             perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
3040                 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
3041                 goto out;
3042         }
3043
3044         evlist__for_each(session->evlist, evsel) {
3045                 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
3046                     (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
3047                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
3048                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
3049                         evsel->handler = trace__pgfault;
3050         }
3051
3052         err = parse_target_str(trace);
3053         if (err != 0)
3054                 goto out;
3055
3056         setup_pager();
3057
3058         err = perf_session__process_events(session);
3059         if (err)
3060                 pr_err("Failed to process events, error %d", err);
3061
3062         else if (trace->summary)
3063                 trace__fprintf_thread_summary(trace, trace->output);
3064
3065 out:
3066         perf_session__delete(session);
3067
3068         return err;
3069 }
3070
/*
 * Print the heading of the per-thread summary to @fp.
 *
 * Returns what fprintf() returned, converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
        return fprintf(fp, "\n Summary of events:\n\n");
}
3079
3080 static size_t thread__dump_stats(struct thread_trace *ttrace,
3081                                  struct trace *trace, FILE *fp)
3082 {
3083         struct stats *stats;
3084         size_t printed = 0;
3085         struct syscall *sc;
3086         struct int_node *inode = intlist__first(ttrace->syscall_stats);
3087
3088         if (inode == NULL)
3089                 return 0;
3090
3091         printed += fprintf(fp, "\n");
3092
3093         printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
3094         printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
3095         printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
3096
3097         /* each int_node is a syscall */
3098         while (inode) {
3099                 stats = inode->priv;
3100                 if (stats) {
3101                         double min = (double)(stats->min) / NSEC_PER_MSEC;
3102                         double max = (double)(stats->max) / NSEC_PER_MSEC;
3103                         double avg = avg_stats(stats);
3104                         double pct;
3105                         u64 n = (u64) stats->n;
3106
3107                         pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
3108                         avg /= NSEC_PER_MSEC;
3109
3110                         sc = &trace->syscalls.table[inode->i];
3111                         printed += fprintf(fp, "   %-15s", sc->name);
3112                         printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
3113                                            n, avg * n, min, avg);
3114                         printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
3115                 }
3116
3117                 inode = intlist__next(inode);
3118         }
3119
3120         printed += fprintf(fp, "\n\n");
3121
3122         return printed;
3123 }
3124
/*
 * struct used to pass data to the per-thread summary function
 * (trace__fprintf_one_thread), through its opaque priv argument
 */
struct summary_data {
        FILE *fp;               /* where the summary is printed */
        struct trace *trace;    /* the trace session being summarized */
        size_t printed;         /* running count of characters printed */
};
3131
3132 static int trace__fprintf_one_thread(struct thread *thread, void *priv)
3133 {
3134         struct summary_data *data = priv;
3135         FILE *fp = data->fp;
3136         size_t printed = data->printed;
3137         struct trace *trace = data->trace;
3138         struct thread_trace *ttrace = thread__priv(thread);
3139         double ratio;
3140
3141         if (ttrace == NULL)
3142                 return 0;
3143
3144         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
3145
3146         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
3147         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
3148         printed += fprintf(fp, "%.1f%%", ratio);
3149         if (ttrace->pfmaj)
3150                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
3151         if (ttrace->pfmin)
3152                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
3153         printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
3154         printed += thread__dump_stats(ttrace, trace, fp);
3155
3156         data->printed += printed;
3157
3158         return 0;
3159 }
3160
3161 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
3162 {
3163         struct summary_data data = {
3164                 .fp = fp,
3165                 .trace = trace
3166         };
3167         data.printed = trace__fprintf_threads_header(fp);
3168
3169         machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
3170
3171         return data.printed;
3172 }
3173
3174 static int trace__set_duration(const struct option *opt, const char *str,
3175                                int unset __maybe_unused)
3176 {
3177         struct trace *trace = opt->value;
3178
3179         trace->duration_filter = atof(str);
3180         return 0;
3181 }
3182
3183 static int trace__set_filter_pids(const struct option *opt, const char *str,
3184                                   int unset __maybe_unused)
3185 {
3186         int ret = -1;
3187         size_t i;
3188         struct trace *trace = opt->value;
3189         /*
3190          * FIXME: introduce a intarray class, plain parse csv and create a
3191          * { int nr, int entries[] } struct...
3192          */
3193         struct intlist *list = intlist__new(str);
3194
3195         if (list == NULL)
3196                 return -1;
3197
3198         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
3199         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
3200
3201         if (trace->filter_pids.entries == NULL)
3202                 goto out;
3203
3204         trace->filter_pids.entries[0] = getpid();
3205
3206         for (i = 1; i < trace->filter_pids.nr; ++i)
3207                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
3208
3209         intlist__delete(list);
3210         ret = 0;
3211 out:
3212         return ret;
3213 }
3214
3215 static int trace__open_output(struct trace *trace, const char *filename)
3216 {
3217         struct stat st;
3218
3219         if (!stat(filename, &st) && st.st_size) {
3220                 char oldname[PATH_MAX];
3221
3222                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
3223                 unlink(oldname);
3224                 rename(filename, oldname);
3225         }
3226
3227         trace->output = fopen(filename, "w");
3228
3229         return trace->output == NULL ? -errno : 0;
3230 }
3231
3232 static int parse_pagefaults(const struct option *opt, const char *str,
3233                             int unset __maybe_unused)
3234 {
3235         int *trace_pgfaults = opt->value;
3236
3237         if (strcmp(str, "all") == 0)
3238                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
3239         else if (strcmp(str, "maj") == 0)
3240                 *trace_pgfaults |= TRACE_PFMAJ;
3241         else if (strcmp(str, "min") == 0)
3242                 *trace_pgfaults |= TRACE_PFMIN;
3243         else
3244                 return -1;
3245
3246         return 0;
3247 }
3248
3249 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
3250 {
3251         struct perf_evsel *evsel;
3252
3253         evlist__for_each(evlist, evsel)
3254                 evsel->handler = handler;
3255 }
3256
/*
 * cmd_trace - entry point of the 'perf trace' builtin.
 *
 * Fills in the default tracing state, parses the command line and then
 * either hands over to trace__record() (for the "record" subcommand),
 * replays a file with trace__replay() (when input_name is set, see -i) or
 * traces live with trace__run().
 *
 * @argc/@argv: command line; after option parsing they describe the
 *              remaining non-option arguments
 * @prefix:     unused
 *
 * Returns the result of trace__record(), trace__replay() or trace__run(),
 * or a non-zero error value when setup or option validation fails.
 */
3257 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
3258 {
3259         const char *trace_usage[] = {
3260                 "perf trace [<options>] [<command>]",
3261                 "perf trace [<options>] -- <command> [<options>]",
3262                 "perf trace record [<options>] [<command>]",
3263                 "perf trace record [<options>] -- <command> [<options>]",
3264                 NULL
3265         };
             /*
              * Defaults: syscalls are traced, output goes to stderr and the
              * thread comm is shown; fields not listed start out as zero.
              */
3266         struct trace trace = {
3267                 .syscalls = {
3268                         . max = -1,
3269                 },
3270                 .opts = {
3271                         .target = {
3272                                 .uid       = UINT_MAX,
3273                                 .uses_mmap = true,
3274                         },
3275                         .user_freq     = UINT_MAX,
3276                         .user_interval = ULLONG_MAX,
3277                         .no_buffering  = true,
3278                         .mmap_pages    = UINT_MAX,
3279                         .proc_map_timeout  = 500,
3280                 },
3281                 .output = stderr,
3282                 .show_comm = true,
3283                 .trace_syscalls = true,
3284                 .kernel_syscallchains = false,
3285         };
3286         const char *output_name = NULL;
3287         const char *ev_qualifier_str = NULL;
             /* Command line options; most of them write straight into 'trace'. */
3288         const struct option trace_options[] = {
3289         OPT_CALLBACK(0, "event", &trace.evlist, "event",
3290                      "event selector. use 'perf list' to list available events",
3291                      parse_events_option),
3292         OPT_BOOLEAN(0, "comm", &trace.show_comm,
3293                     "show the thread COMM next to its id"),
3294         OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3295         OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
3296         OPT_STRING('o', "output", &output_name, "file", "output file name"),
3297         OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3298         OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3299                     "trace events on existing process id"),
3300         OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3301                     "trace events on existing thread id"),
3302         OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3303                      "pids to filter (by the kernel)", trace__set_filter_pids),
3304         OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3305                     "system-wide collection from all CPUs"),
3306         OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3307                     "list of cpus to monitor"),
3308         OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3309                     "child tasks do not inherit counters"),
3310         OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3311                      "number of mmap data pages",
3312                      perf_evlist__parse_mmap_pages),
3313         OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3314                    "user to profile"),
3315         OPT_CALLBACK(0, "duration", &trace, "float",
3316                      "show only events with duration > N.M ms",
3317                      trace__set_duration),
3318         OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3319         OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3320         OPT_BOOLEAN('T', "time", &trace.full_time,
3321                     "Show full timestamp, not time relative to first start"),
3322         OPT_BOOLEAN('s', "summary", &trace.summary_only,
3323                     "Show only syscall summary with statistics"),
3324         OPT_BOOLEAN('S', "with-summary", &trace.summary,
3325                     "Show all syscalls and summary with statistics"),
3326         OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3327                      "Trace pagefaults", parse_pagefaults, "maj"),
3328         OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3329         OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3330         OPT_CALLBACK(0, "call-graph", &trace.opts,
3331                      "record_mode[,record_size]", record_callchain_help,
3332                      &record_parse_callchain_opt),
3333         OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3334                     "Show the kernel callchains on the syscall exit path"),
3335         OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3336                         "per thread proc mmap processing timeout in ms"),
3337         OPT_END()
3338         };
3339         const char * const trace_subcommands[] = { "record", NULL };
3340         int err;
             /* Scratch buffer for error messages built by the *__strerror helpers. */
3341         char bf[BUFSIZ];
3342
             /* Route SIGSEGV and SIGFPE to sighandler_dump_stack. */
3343         signal(SIGSEGV, sighandler_dump_stack);
3344         signal(SIGFPE, sighandler_dump_stack);
3345
             /*
              * Both objects must exist before option parsing: --event stores
              * into trace.evlist through parse_events_option.
              */
3346         trace.evlist = perf_evlist__new();
3347         trace.sctbl = syscalltbl__new();
3348
3349         if (trace.evlist == NULL || trace.sctbl == NULL) {
3350                 pr_err("Not enough memory to run!\n");
3351                 err = -ENOMEM;
3352                 goto out;
3353         }
3354
             /* Parsing stops at the first non-option, i.e. the workload or "record". */
3355         argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3356                                  trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3357
3358         err = bpf__setup_stdout(trace.evlist);
3359         if (err) {
3360                 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3361                 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3362                 goto out;
3363         }
3364
             /* Default result for the validation failures below that jump to 'out'. */
3365         err = -1;
3366
             /* Page fault tracing turns on address and time sampling. */
3367         if (trace.trace_pgfaults) {
3368                 trace.opts.sample_address = true;
3369                 trace.opts.sample_time = true;
3370         }
3371
3372         if (trace.opts.callgraph_set)
3373                 symbol_conf.use_callchain = true;
3374
             /* Events added with --event are all handled by trace__event_handler. */
3375         if (trace.evlist->nr_entries > 0)
3376                 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3377
             /* "perf trace record ...": pass the remaining arguments on. */
3378         if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3379                 return trace__record(&trace, argc-1, &argv[1]);
3380
3381         /* summary_only implies summary option, but don't overwrite summary if set */
3382         if (trace.summary_only)
3383                 trace.summary = trace.summary_only;
3384
3385         if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3386             trace.evlist->nr_entries == 0 /* Was --events used? */) {
3387                 pr_err("Please specify something to trace.\n");
3388                 return -1;
3389         }
3390
3391         if (!trace.trace_syscalls && ev_qualifier_str) {
3392                 pr_err("The -e option can't be used with --no-syscalls.\n");
3393                 goto out;
3394         }
3395
             /* -o: from here on trace.output is the file, not stderr. */
3396         if (output_name != NULL) {
3397                 err = trace__open_output(&trace, output_name);
3398                 if (err < 0) {
3399                         perror("failed to create output file");
3400                         goto out;
3401                 }
3402         }
3403
3404         trace.open_id = syscalltbl__id(trace.sctbl, "open");
3405
             /* -e: build the syscall qualifier list; a leading '!' negates it. */
3406         if (ev_qualifier_str != NULL) {
3407                 const char *s = ev_qualifier_str;
3408                 struct strlist_config slist_config = {
3409                         .dirname = system_path(STRACE_GROUPS_DIR),
3410                 };
3411
3412                 trace.not_ev_qualifier = *s == '!';
3413                 if (trace.not_ev_qualifier)
3414                         ++s;
3415                 trace.ev_qualifier = strlist__new(s, &slist_config);
3416                 if (trace.ev_qualifier == NULL) {
3417                         fputs("Not enough memory to parse event qualifier",
3418                               trace.output);
3419                         err = -ENOMEM;
3420                         goto out_close;
3421                 }
3422
3423                 err = trace__validate_ev_qualifier(&trace);
3424                 if (err)
3425                         goto out_close;
3426         }
3427
3428         err = target__validate(&trace.opts.target);
3429         if (err) {
3430                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3431                 fprintf(trace.output, "%s", bf);
3432                 goto out_close;
3433         }
3434
3435         err = target__parse_uid(&trace.opts.target);
3436         if (err) {
3437                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3438                 fprintf(trace.output, "%s", bf);
3439                 goto out_close;
3440         }
3441
             /* No workload and no explicit target: trace system wide. */
3442         if (!argc && target__none(&trace.opts.target))
3443                 trace.opts.target.system_wide = true;
3444
             /* Replay the input file if one was named, otherwise trace live. */
3445         if (input_name)
3446                 err = trace__replay(&trace);
3447         else
3448                 err = trace__run(&trace, argc, argv);
3449
3450 out_close:
             /* Only a stream opened for -o is closed; the default is stderr. */
3451         if (output_name != NULL)
3452                 fclose(trace.output);
3453 out:
3454         return err;
3455 }