u64 base_time;
FILE *output;
unsigned long nr_events;
+ unsigned long nr_events_printed;
+ unsigned long max_events;
struct strlist *ev_qualifier;
struct {
size_t nr;
} stats;
unsigned int max_stack;
unsigned int min_stack;
+ bool raw_augmented_syscalls;
bool not_ev_qualifier;
bool live;
bool full_time;
struct syscall_arg_fmt {
size_t (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
+ unsigned long (*mask_val)(struct syscall_arg *arg, unsigned long val);
void *parm;
const char *name;
bool show_zero;
.arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ },
[2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ },
[3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, },
+ { .name = "mount",
+ .arg = { [0] = { .scnprintf = SCA_FILENAME, /* dev_name */ },
+ [3] = { .scnprintf = SCA_MOUNT_FLAGS, /* flags */
+ .mask_val = SCAMV_MOUNT_FLAGS, /* flags */ }, }, },
{ .name = "mprotect",
.arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
[2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, }, },
.arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
{ .name = "tkill",
.arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
- { .name = "umount2", .alias = "umount", },
+ { .name = "umount2", .alias = "umount",
+ .arg = { [0] = { .scnprintf = SCA_FILENAME, /* name */ }, }, },
{ .name = "uname", .alias = "newuname", },
{ .name = "unlinkat",
.arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
}
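+/*
+ * The syscall_fmts table is sorted by ->name, which is what the bsearch()
+ * in syscall_fmt__find() above uses, so looking up an entry by its alias
+ * has to be done with a linear search.
+ */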
+static struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias)
+{
+ int i, nmemb = ARRAY_SIZE(syscall_fmts);
+
+ for (i = 0; i < nmemb; ++i) {
+ if (syscall_fmts[i].alias && strcmp(syscall_fmts[i].alias, alias) == 0)
+ return &syscall_fmts[i];
+ }
+
+ return NULL;
+}
+
/*
* is_exit: is this "exit" or "exit_group"?
* is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
return scnprintf(bf, size, "arg%d: ", arg->idx);
}
+/*
+ * Mask whatever needs masking before checking if the value is in fact zero,
+ * e.g. the 'mount' syscall 'flags' argument, where a magic flag has to be
+ * ignored; see the comment in tools/perf/trace/beauty/mount_flags.c.
+ */
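+/*
+ * For instance (a sketch; the actual beautifier lives in
+ * tools/perf/trace/beauty/mount_flags.c), mount's ->mask_val strips the
+ * old MS_MGC_VAL magic from the top 16 bits of 'flags':
+ *
+ *	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
+ *		flags &= ~MS_MGC_MSK;
+ */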
+static unsigned long syscall__mask_val(struct syscall *sc, struct syscall_arg *arg, unsigned long val)
+{
+ if (sc->arg_fmt && sc->arg_fmt[arg->idx].mask_val)
+ return sc->arg_fmt[arg->idx].mask_val(arg, val);
+
+ return val;
+}
+
static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
struct syscall_arg *arg, unsigned long val)
{
continue;
val = syscall_arg__val(&arg, arg.idx);
+ /*
+ * Some syscall args need a mask applied first; most don't,
+ * and syscall__mask_val() returns val untouched.
+ */
+ val = syscall__mask_val(sc, &arg, val);
/*
* Suppress this argument if its value is zero and
printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
ttrace->entry_pending = false;
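+ /* This interrupted entry was printed on its own, count it towards --max-events */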
+ ++trace->nr_events_printed;
+
return printed;
}
return printed;
}
-static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size)
+static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, bool raw_augmented)
{
void *augmented_args = NULL;
+ /*
+ * For now, with BPF raw_augmented, we hook into raw_syscalls:sys_enter,
+ * where we get all 6 syscall args plus the tracepoint common fields
+ * (sizeof(long)) and the syscall_nr (another long). So we check if that
+ * is the case and, if so, look for the augmented args not just past
+ * sc->args_size, but past the full, fixed-size raw_syscalls:sys_enter
+ * payload.
+ *
+ * We'll revisit this later, passing sc->args_size to the BPF augmenter
+ * (now tools/perf/examples/bpf/augmented_raw_syscalls.c) so that it
+ * copies only what we need for each syscall, like what happens when we
+ * use syscalls:sys_enter_NAME, reducing the kernel/userspace traffic to
+ * just what is needed for each syscall.
+ */
+ int args_size = raw_augmented ? (8 * (int)sizeof(long)) : sc->args_size;
- *augmented_args_size = sample->raw_size - sc->args_size;
+ *augmented_args_size = sample->raw_size - args_size;
if (*augmented_args_size > 0)
- augmented_args = sample->raw_data + sc->args_size;
+ augmented_args = sample->raw_data + args_size;
return augmented_args;
}
* here and avoid using augmented syscalls when the evsel is the raw_syscalls one.
*/
if (evsel != trace->syscalls.events.sys_enter)
- augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size);
+ augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls);
ttrace->entry_time = sample->time;
msg = ttrace->entry_str;
printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
goto out_put;
args = perf_evsel__sc_tp_ptr(evsel, args, sample);
- augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size);
+ augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls);
syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread);
fprintf(trace->output, "%s", msg);
err = 0;
int max_stack = evsel->attr.sample_max_stack ?
evsel->attr.sample_max_stack :
trace->max_stack;
+ int err;
- if (machine__resolve(trace->host, &al, sample) < 0 ||
- thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
+ if (machine__resolve(trace->host, &al, sample) < 0)
return -1;
- return 0;
+ err = thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack);
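+ /* drop the thread reference obtained in machine__resolve() */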
+ addr_location__put(&al);
+ return err;
}
static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
fputc('\n', trace->output);
+ /*
+ * For the sake of --max-events, an 'event' is a non-filtered
+ * sys_enter + sys_exit pair, or any other tracepoint event.
+ */
+ if (++trace->nr_events_printed == trace->max_events && trace->max_events != ULONG_MAX)
+ interrupted = true;
+
if (callchain_ret > 0)
trace__fprintf_callchain(trace, sample);
else if (callchain_ret < 0)
{
binary__fprintf(sample->raw_data, sample->raw_size, 8,
bpf_output__printer, NULL, trace->output);
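+ /* The raw dump of a bpf-output event counts towards --max-events too */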
+ ++trace->nr_events_printed;
}
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
union perf_event *event __maybe_unused,
struct perf_sample *sample)
{
- struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
+ struct thread *thread;
int callchain_ret = 0;
+ /*
+ * Check if we called perf_evsel__disable(evsel), for instance because
+ * this event's max_events was hit: if so, this is a leftover entry in
+ * the ring buffer that we should discard, since the maximum number of
+ * events has already been considered/printed.
+ */
+ if (evsel->disabled)
+ return 0;
+
+ thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
if (sample->callchain) {
callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
event_format__fprintf(evsel->tp_format, sample->cpu,
sample->raw_data, sample->raw_size,
trace->output);
+ ++trace->nr_events_printed;
+
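+ /*
+ * This evsel reached its --max-events quota: disable it, so that the
+ * kernel stops generating this event, and close it.
+ */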
+ if (evsel->max_events != ULONG_MAX && ++evsel->nr_events_printed == evsel->max_events) {
+ perf_evsel__disable(evsel);
+ perf_evsel__close(evsel);
+ }
}
}
trace__fprintf_callchain(trace, sample);
else if (callchain_ret < 0)
pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
- thread__put(thread);
out:
+ thread__put(thread);
return 0;
}
trace__fprintf_callchain(trace, sample);
else if (callchain_ret < 0)
pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
+
+ ++trace->nr_events_printed;
out:
err = 0;
out_put:
tracepoint_handler handler = evsel->handler;
handler(trace, evsel, event, sample);
}
+
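+ /* The --max-events total was reached, tell the main loop to stop */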
+ if (trace->nr_events_printed >= trace->max_events && trace->max_events != ULONG_MAX)
+ interrupted = true;
}
static int trace__add_syscall_newtp(struct trace *trace)
int timeout = done ? 100 : -1;
if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
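+ /* fds for evsels closed on the fly (e.g. on hitting a --max-events quota) poll as POLLNVAL */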
- if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
+ if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP | POLLNVAL) == 0)
draining = true;
goto again;
int len = strlen(str) + 1, err = -1, list, idx;
char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
char group_name[PATH_MAX];
+ struct syscall_fmt *fmt;
if (strace_groups_dir == NULL)
return -1;
if (syscalltbl__id(trace->sctbl, s) >= 0 ||
syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
list = 1;
+ goto do_concat;
+ }
+
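+ /* Not a syscall name? Maybe it is an alias, e.g. "umount" for "umount2" */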
+ fmt = syscall_fmt__find_by_alias(s);
+ if (fmt != NULL) {
+ list = 1;
+ s = fmt->name;
} else {
path__join(group_name, sizeof(group_name), strace_groups_dir, s);
if (access(group_name, R_OK) == 0)
list = 1;
}
-
+do_concat:
if (lists[list]) {
sprintf(lists[list] + strlen(lists[list]), ",%s", s);
} else {
.trace_syscalls = false,
.kernel_syscallchains = false,
.max_stack = UINT_MAX,
+ .max_events = ULONG_MAX,
};
const char *output_name = NULL;
const struct option trace_options[] = {
&record_parse_callchain_opt),
OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
"Show the kernel callchains on the syscall exit path"),
+ OPT_ULONG(0, "max-events", &trace.max_events,
+ "Set the maximum number of events to print, exit after that is reached. "),
OPT_UINTEGER(0, "min-stack", &trace.min_stack,
"Set the minimum stack depth when parsing the callchain, "
"anything below the specified depth will be ignored."),
evsel->handler = trace__sys_enter;
evlist__for_each_entry(trace.evlist, evsel) {
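+ /*
+ * raw_syscalls:sys_exit in the evlist means the BPF raw augmenter is in
+ * use, where the sys_enter payload layout is fixed; take note, so that
+ * syscall__augmented_args() knows where the augmented args start.
+ */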
+ bool raw_syscalls_sys_exit = strcmp(perf_evsel__name(evsel), "raw_syscalls:sys_exit") == 0;
+
+ if (raw_syscalls_sys_exit) {
+ trace.raw_augmented_syscalls = true;
+ goto init_augmented_syscall_tp;
+ }
+
if (strstarts(perf_evsel__name(evsel), "syscalls:sys_exit_")) {
+init_augmented_syscall_tp:
perf_evsel__init_augmented_syscall_tp(evsel);
perf_evsel__init_augmented_syscall_tp_ret(evsel);
evsel->handler = trace__sys_exit;