perf trace: Support --summary-mode=cgroup

author Namhyung Kim <namhyung@kernel.org>

Thu, 1 May 2025 22:53:37 +0000 (15:53 -0700)

committer Arnaldo Carvalho de Melo <acme@redhat.com>

Tue, 13 May 2025 21:20:46 +0000 (18:20 -0300)
author Namhyung Kim <namhyung@kernel.org>
Thu, 1 May 2025 22:53:37 +0000 (15:53 -0700)
committer Arnaldo Carvalho de Melo <acme@redhat.com>
Tue, 13 May 2025 21:20:46 +0000 (18:20 -0300)
diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt

index a8a0d8c33438fef7c203ef7e523b63bd49bf2ed3..c1fb6056a0d36ddacca98c2e63104662e1fd162e 100644 (file)
--- a/tools/perf/Documentation/perf-trace.txt
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -152,7 +152,8 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
  
  --summary-mode=mode::
         To be used with -s or -S, to select how to show summary.  By default it'll
-       show the syscall summary by thread.  Possible values are: thread, total.
+       show the syscall summary by thread.  Possible values are: thread, total,
+       cgroup.
  
  --tool_stats::
         Show tool stats such as number of times fd->pathname was discovered thru
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c

index edab0ff60b3c7f10333bf96770eda5278b461572..07eddd5c0baaa2ee43da549db4de135bbcd0688b 100644 (file)
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -5302,6 +5302,8 @@ static int trace__parse_summary_mode(const struct option *opt, const char *str,
                 trace->summary_mode = SUMMARY__BY_THREAD;
         } else if (!strcmp(str, "total")) {
                 trace->summary_mode = SUMMARY__BY_TOTAL;
+       } else if (!strcmp(str, "cgroup")) {
+               trace->summary_mode = SUMMARY__BY_CGROUP;
         } else {
                 pr_err("Unknown summary mode: %s\n", str);
                 return -1;
@@ -5461,7 +5463,7 @@ int cmd_trace(int argc, const char **argv)
         OPT_BOOLEAN(0, "errno-summary", &trace.errno_summary,
                     "Show errno stats per syscall, use with -s or -S"),
         OPT_CALLBACK(0, "summary-mode", &trace, "mode",
-                    "How to show summary: select thread (default) or total",
+                    "How to show summary: select thread (default), total or cgroup",
                      trace__parse_summary_mode),
         OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
                      "Trace pagefaults", parse_pagefaults, "maj"),
@@ -5775,6 +5777,12 @@ init_augmented_syscall_tp:
                 symbol_conf.keep_exited_threads = true;
                 if (trace.summary_mode == SUMMARY__NONE)
                         trace.summary_mode = SUMMARY__BY_THREAD;
+
+               if (!trace.summary_bpf && trace.summary_mode == SUMMARY__BY_CGROUP) {
+                       pr_err("Error: --summary-mode=cgroup only works with --bpf-summary\n");
+                       err = -EINVAL;
+                       goto out;
+               }
         }
  
         if (output_name != NULL) {
diff --git a/tools/perf/util/bpf-trace-summary.c b/tools/perf/util/bpf-trace-summary.c

index 114d8d9ed9b2d3f319b2922984d65eeae60becd5..69fb165da206b01fc4fb4ceadf7788551933188e 100644 (file)
--- a/tools/perf/util/bpf-trace-summary.c
+++ b/tools/perf/util/bpf-trace-summary.c
@@ -6,10 +6,12 @@
  
  #include "dwarf-regs.h" /* for EM_HOST */
  #include "syscalltbl.h"
+#include "util/cgroup.h"
  #include "util/hashmap.h"
  #include "util/trace.h"
  #include "util/util.h"
  #include <bpf/bpf.h>
+#include <linux/rbtree.h>
  #include <linux/time64.h>
  #include <tools/libc_compat.h> /* reallocarray */
  
@@ -18,6 +20,7 @@
  
  
  static struct syscall_summary_bpf *skel;
+static struct rb_root cgroups = RB_ROOT;
  
  int trace_prepare_bpf_summary(enum trace_summary_mode mode)
  {
@@ -29,9 +32,14 @@ int trace_prepare_bpf_summary(enum trace_summary_mode mode)
  
         if (mode == SUMMARY__BY_THREAD)
                 skel->rodata->aggr_mode = SYSCALL_AGGR_THREAD;
+       else if (mode == SUMMARY__BY_CGROUP)
+               skel->rodata->aggr_mode = SYSCALL_AGGR_CGROUP;
         else
                 skel->rodata->aggr_mode = SYSCALL_AGGR_CPU;
  
+       if (cgroup_is_v2("perf_event") > 0)
+               skel->rodata->use_cgroup_v2 = 1;
+
         if (syscall_summary_bpf__load(skel) < 0) {
                 fprintf(stderr, "failed to load syscall summary bpf skeleton\n");
                 return -1;
@@ -42,6 +50,9 @@ int trace_prepare_bpf_summary(enum trace_summary_mode mode)
                 return -1;
         }
  
+       if (mode == SUMMARY__BY_CGROUP)
+               read_all_cgroups(&cgroups);
+
         return 0;
  }
  
@@ -88,9 +99,13 @@ static double rel_stddev(struct syscall_stats *stat)
   * per-cpu analysis so it's keyed by the syscall number to combine stats
   * from different CPUs.  And syscall_data always has a syscall_node so
   * it can effectively work as flat hierarchy.
+ *
+ * For per-cgroup stats, it uses a two-level data structure like the thread
+ * case: syscall_data is keyed by cgroup id and has an array of nodes, each
+ * of which represents a syscall for the cgroup.
   */
  struct syscall_data {
-       int key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU */
+       u64 key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU, cgroup if AGGR_CGROUP */
         int nr_events;
         int nr_nodes;
         u64 total_time;
@@ -191,7 +206,7 @@ static int print_thread_stat(struct syscall_data *data, FILE *fp)
  
         qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);
  
-       printed += fprintf(fp, " thread (%d), ", data->key);
+       printed += fprintf(fp, " thread (%d), ", (int)data->key);
         printed += fprintf(fp, "%d events\n\n", data->nr_events);
  
         printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
@@ -283,6 +298,75 @@ static int print_total_stats(struct syscall_data **data, int nr_data, FILE *fp)
         return printed;
  }
  
+/*
+ * Fold one BPF map entry into the per-cgroup summary.
+ *
+ * @hash:     table of syscall_data, keyed by cgroup id
+ * @map_key:  key of the BPF map entry (its cgroup and nr fields are used)
+ * @map_data: stats stored in the BPF map for that key
+ *
+ * Looks up (or creates) the syscall_data for map_key->cgroup and appends a
+ * node holding a copy of @map_data for syscall map_key->nr.
+ *
+ * Returns 0 on success, -ENOMEM if an allocation or the hashmap insertion
+ * fails.
+ */
+static int update_cgroup_stats(struct hashmap *hash, struct syscall_key *map_key,
+                              struct syscall_stats *map_data)
+{
+       struct syscall_data *data;
+       struct syscall_node *nodes;
+
+       if (!hashmap__find(hash, map_key->cgroup, &data)) {
+               data = zalloc(sizeof(*data));
+               if (data == NULL)
+                       return -ENOMEM;
+
+               data->key = map_key->cgroup;
+               if (hashmap__add(hash, data->key, data) < 0) {
+                       free(data);
+                       return -ENOMEM;
+               }
+       }
+
+       /*
+        * Grow the node array before touching the totals, so that a failed
+        * allocation does not leave the cgroup totals counting an entry
+        * that has no node.
+        */
+       nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes));
+       if (nodes == NULL)
+               return -ENOMEM;
+
+       data->nodes = nodes;
+       nodes = &data->nodes[data->nr_nodes++];
+       nodes->syscall_nr = map_key->nr;
+
+       /* each cgroup has an entry for each syscall, just use the stat */
+       memcpy(&nodes->stats, map_data, sizeof(*map_data));
+
+       /* update cgroup total stats */
+       data->nr_events += map_data->count;
+       data->total_time += map_data->total_time;
+       return 0;
+}
+
+/*
+ * Print the syscall summary table of one cgroup to @fp.
+ *
+ * The heading shows the cgroup name when data->key is found in the cgroups
+ * tree, and the numeric cgroup id otherwise.  The nodes are sorted with
+ * nodecmp before the per-syscall lines are printed.
+ *
+ * Returns the accumulated return values of the print calls.
+ */
+static int print_cgroup_stat(struct syscall_data *data, FILE *fp)
+{
+       int printed = 0;
+       struct cgroup *cgrp = __cgroup__find(&cgroups, data->key);
+
+       qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);
+
+       if (cgrp)
+               printed += fprintf(fp, " cgroup %s,", cgrp->name);
+       else    /* key is u64: do not truncate it where long is 32-bit */
+               printed += fprintf(fp, " cgroup id:%llu,", (unsigned long long)data->key);
+
+       printed += fprintf(fp, " %d events\n\n", data->nr_events);
+
+       printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
+       printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
+       printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");
+
+       printed += print_common_stats(data, fp);
+       printed += fprintf(fp, "\n\n");
+
+       return printed;
+}
+
+/* Print the summary of each of the @nr_data cgroups in @data to @fp. */
+static int print_cgroup_stats(struct syscall_data **data, int nr_data, FILE *fp)
+{
+       int total = 0;
+       int idx = 0;
+
+       while (idx < nr_data) {
+               total += print_cgroup_stat(data[idx], fp);
+               idx++;
+       }
+
+       return total;
+}
+
  int trace_print_bpf_summary(FILE *fp)
  {
         struct bpf_map *map = skel->maps.syscall_stats_map;
@@ -305,10 +389,19 @@ int trace_print_bpf_summary(FILE *fp)
                 struct syscall_stats stat;
  
                 if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) {
-                       if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
+                       switch (skel->rodata->aggr_mode) {
+                       case SYSCALL_AGGR_THREAD:
                                 update_thread_stats(&schash, &key, &stat);
-                       else
+                               break;
+                       case SYSCALL_AGGR_CPU:
                                 update_total_stats(&schash, &key, &stat);
+                               break;
+                       case SYSCALL_AGGR_CGROUP:
+                               update_cgroup_stats(&schash, &key, &stat);
+                               break;
+                       default:
+                               break;
+                       }
                 }
  
                 prev_key = &key;
@@ -325,10 +418,19 @@ int trace_print_bpf_summary(FILE *fp)
  
         qsort(data, nr_data, sizeof(*data), datacmp);
  
-       if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
+       switch (skel->rodata->aggr_mode) {
+       case SYSCALL_AGGR_THREAD:
                 printed += print_thread_stats(data, nr_data, fp);
-       else
+               break;
+       case SYSCALL_AGGR_CPU:
                 printed += print_total_stats(data, nr_data, fp);
+               break;
+       case SYSCALL_AGGR_CGROUP:
+               printed += print_cgroup_stats(data, nr_data, fp);
+               break;
+       default:
+               break;
+       }
  
         for (i = 0; i < nr_data && data; i++) {
                 free(data[i]->nodes);
@@ -343,5 +445,14 @@ out:
  
  void trace_cleanup_bpf_summary(void)
  {
+       /*
+        * The cgroups tree is only filled by read_all_cgroups() when
+        * trace_prepare_bpf_summary() runs in SUMMARY__BY_CGROUP mode, so
+        * skip the walk when it is empty.
+        */
+       if (!RB_EMPTY_ROOT(&cgroups)) {
+               struct cgroup *cgrp, *tmp;
+
+               /* cgroup__put() every entry, then reset the tree to empty */
+               rbtree_postorder_for_each_entry_safe(cgrp, tmp, &cgroups, node)
+                       cgroup__put(cgrp);
+
+               cgroups = RB_ROOT;
+       }
+
         syscall_summary_bpf__destroy(skel);
  }
diff --git a/tools/perf/util/bpf_skel/syscall_summary.bpf.c b/tools/perf/util/bpf_skel/syscall_summary.bpf.c

index b25f53b3c1351392a42de9f05cf026ef4c4e6eb4..1bcd066a5199a4768ca9a044d2531020bb0a83bc 100644 (file)
--- a/tools/perf/util/bpf_skel/syscall_summary.bpf.c
+++ b/tools/perf/util/bpf_skel/syscall_summary.bpf.c
@@ -8,6 +8,7 @@
  
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
  
  /* This is to calculate a delta between sys-enter and sys-exit for each thread */
  struct syscall_trace {
@@ -35,10 +36,41 @@ struct syscall_stats_map {
  int enabled; /* controlled from userspace */
  
  const volatile enum syscall_aggr_mode aggr_mode;
+const volatile int use_cgroup_v2;
  
-static void update_stats(int cpu_or_tid, int nr, s64 duration, long ret)
+int perf_subsys_id = -1;
+
+/*
+ * Return the cgroup id to account the current task's syscall to.
+ *
+ * When use_cgroup_v2 is set (user space sets it before loading if
+ * cgroup_is_v2("perf_event") is positive), the bpf_get_current_cgroup_id()
+ * helper is used.  Otherwise the id is read from the task's cgroup for the
+ * perf_event subsystem, i.e.
+ * task->cgroups->subsys[perf_subsys_id]->cgroup->kn->id.
+ */
+static inline __u64 get_current_cgroup_id(void)
+{
+       struct task_struct *task;
+       struct cgroup *cgrp;
+
+       if (use_cgroup_v2)
+               return bpf_get_current_cgroup_id();
+
+       task = bpf_get_current_task_btf();
+
+       /* resolve the perf_event subsystem index once, cache it in the global */
+       if (perf_subsys_id == -1) {
+#if __has_builtin(__builtin_preserve_enum_value)
+               perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
+                                                    perf_event_cgrp_id);
+#else
+               /* no CO-RE enum relocation: use the compile-time enum value */
+               perf_subsys_id = perf_event_cgrp_id;
+#endif
+       }
+
+       cgrp = BPF_CORE_READ(task, cgroups, subsys[perf_subsys_id], cgroup);
+       return BPF_CORE_READ(cgrp, kn, id);
+}
+
+static void update_stats(int cpu_or_tid, u64 cgroup_id, int nr, s64 duration,
+                        long ret)
  {
-       struct syscall_key key = { .cpu_or_tid = cpu_or_tid, .nr = nr, };
+       struct syscall_key key = {
+               .cpu_or_tid = cpu_or_tid,
+               .cgroup = cgroup_id,
+               .nr = nr,
+       };
         struct syscall_stats *stats;
  
         stats = bpf_map_lookup_elem(&syscall_stats_map, &key);
@@ -90,7 +122,8 @@ SEC("tp_btf/sys_exit")
  int sys_exit(u64 *ctx)
  {
         int tid;
-       int key;
+       int key = 0;
+       u64 cgroup = 0;
         long ret = ctx[1]; /* return value of the syscall */
         struct syscall_trace *st;
         s64 delta;
@@ -105,11 +138,13 @@ int sys_exit(u64 *ctx)
  
         if (aggr_mode == SYSCALL_AGGR_THREAD)
                 key = tid;
+       else if (aggr_mode == SYSCALL_AGGR_CGROUP)
+               cgroup = get_current_cgroup_id();
         else
                 key = bpf_get_smp_processor_id();
  
         delta = bpf_ktime_get_ns() - st->timestamp;
-       update_stats(key, st->nr, delta, ret);
+       update_stats(key, cgroup, st->nr, delta, ret);
  
         bpf_map_delete_elem(&syscall_trace_map, &tid);
         return 0;
diff --git a/tools/perf/util/bpf_skel/syscall_summary.h b/tools/perf/util/bpf_skel/syscall_summary.h

index 17f9ecba657088aa2f0cc30d1d5e9a4d422c44a6..72ccccb45925cd102f6413d3c8fd1adc96f9fcfd 100644 (file)
--- a/tools/perf/util/bpf_skel/syscall_summary.h
+++ b/tools/perf/util/bpf_skel/syscall_summary.h
@@ -6,9 +6,11 @@
  enum syscall_aggr_mode {
         SYSCALL_AGGR_THREAD,
         SYSCALL_AGGR_CPU,
+       SYSCALL_AGGR_CGROUP, /* stats keyed by the current task's cgroup id */
  };
  
  struct syscall_key {
+       u64 cgroup; /* cgroup id in SYSCALL_AGGR_CGROUP mode, 0 otherwise */
         int cpu_or_tid;
         int nr;
  };
diff --git a/tools/perf/util/trace.h b/tools/perf/util/trace.h

index ef8361ed12c4edc11f78cd64a0ae805cb1468588..fa8d480527a22cefce89ca0c1da529338894def0 100644 (file)
--- a/tools/perf/util/trace.h
+++ b/tools/perf/util/trace.h
@@ -8,6 +8,7 @@ enum trace_summary_mode {
         SUMMARY__NONE = 0,
         SUMMARY__BY_TOTAL,
         SUMMARY__BY_THREAD,
+       SUMMARY__BY_CGROUP,
  };
  
  #ifdef HAVE_BPF_SKEL
author	Namhyung Kim <namhyung@kernel.org>
	Thu, 1 May 2025 22:53:37 +0000 (15:53 -0700)
committer	Arnaldo Carvalho de Melo <acme@redhat.com>
	Tue, 13 May 2025 21:20:46 +0000 (18:20 -0300)
tools/perf/Documentation/perf-trace.txt		patch \| blob \| blame \| history
tools/perf/builtin-trace.c		patch \| blob \| blame \| history
tools/perf/util/bpf-trace-summary.c		patch \| blob \| blame \| history
tools/perf/util/bpf_skel/syscall_summary.bpf.c		patch \| blob \| blame \| history
tools/perf/util/bpf_skel/syscall_summary.h		patch \| blob \| blame \| history
tools/perf/util/trace.h		patch \| blob \| blame \| history