trace->summary_mode = SUMMARY__BY_THREAD;
} else if (!strcmp(str, "total")) {
trace->summary_mode = SUMMARY__BY_TOTAL;
+ } else if (!strcmp(str, "cgroup")) {
+ trace->summary_mode = SUMMARY__BY_CGROUP;
} else {
pr_err("Unknown summary mode: %s\n", str);
return -1;
OPT_BOOLEAN(0, "errno-summary", &trace.errno_summary,
"Show errno stats per syscall, use with -s or -S"),
OPT_CALLBACK(0, "summary-mode", &trace, "mode",
- "How to show summary: select thread (default) or total",
+ "How to show summary: select thread (default), total or cgroup",
trace__parse_summary_mode),
OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
"Trace pagefaults", parse_pagefaults, "maj"),
symbol_conf.keep_exited_threads = true;
if (trace.summary_mode == SUMMARY__NONE)
trace.summary_mode = SUMMARY__BY_THREAD;
+
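+	/* per-cgroup aggregation is only implemented in the BPF-based summary */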
+ if (!trace.summary_bpf && trace.summary_mode == SUMMARY__BY_CGROUP) {
+ pr_err("Error: --summary-mode=cgroup only works with --bpf-summary\n");
+ err = -EINVAL;
+ goto out;
+ }
}
if (output_name != NULL) {
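A minimal usage sketch (the workload is illustrative; both options exist once
this change sits on top of the existing --bpf-summary support):

  # perf trace -a --bpf-summary --summary-mode=cgroup sleep 1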
#include "dwarf-regs.h" /* for EM_HOST */
#include "syscalltbl.h"
+#include "util/cgroup.h"
#include "util/hashmap.h"
#include "util/trace.h"
#include "util/util.h"
#include <bpf/bpf.h>
+#include <linux/rbtree.h>
#include <linux/time64.h>
#include <tools/libc_compat.h> /* reallocarray */
static struct syscall_summary_bpf *skel;
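+/* cache of struct cgroup entries indexed by cgroup id, filled by read_all_cgroups() */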
+static struct rb_root cgroups = RB_ROOT;
int trace_prepare_bpf_summary(enum trace_summary_mode mode)
{
if (mode == SUMMARY__BY_THREAD)
skel->rodata->aggr_mode = SYSCALL_AGGR_THREAD;
+ else if (mode == SUMMARY__BY_CGROUP)
+ skel->rodata->aggr_mode = SYSCALL_AGGR_CGROUP;
else
skel->rodata->aggr_mode = SYSCALL_AGGR_CPU;
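+	/* record whether the perf_event controller is on cgroup v2 for the BPF side */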
+ if (cgroup_is_v2("perf_event") > 0)
+ skel->rodata->use_cgroup_v2 = 1;
+
if (syscall_summary_bpf__load(skel) < 0) {
fprintf(stderr, "failed to load syscall summary bpf skeleton\n");
return -1;
}
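+	/* cache cgroup names up front so the summary can print them instead of raw ids */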
+ if (mode == SUMMARY__BY_CGROUP)
+ read_all_cgroups(&cgroups);
+
return 0;
}
* per-cpu analysis so it's keyed by the syscall number to combine stats
* from different CPUs. And syscall_data always has a syscall_node so
* it can effectively work as flat hierarchy.
+ *
+ * For per-cgroup stats, it uses a two-level data structure like the
+ * per-thread case: syscall_data is keyed by the cgroup id and has an
+ * array of nodes, each of which represents one syscall in that cgroup.
*/
struct syscall_data {
- int key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU */
+ u64 key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU, cgroup if AGGR_CGROUP */
int nr_events;
int nr_nodes;
u64 total_time;
qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);
- printed += fprintf(fp, " thread (%d), ", data->key);
+ printed += fprintf(fp, " thread (%d), ", (int)data->key);
printed += fprintf(fp, "%d events\n\n", data->nr_events);
printed += fprintf(fp, " syscall calls errors total min avg max stddev\n");
return printed;
}
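+/* fold one BPF map entry into the per-cgroup syscall_data keyed by cgroup id */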
+static int update_cgroup_stats(struct hashmap *hash, struct syscall_key *map_key,
+ struct syscall_stats *map_data)
+{
+ struct syscall_data *data;
+ struct syscall_node *nodes;
+
+ if (!hashmap__find(hash, map_key->cgroup, &data)) {
+ data = zalloc(sizeof(*data));
+ if (data == NULL)
+ return -ENOMEM;
+
+ data->key = map_key->cgroup;
+ if (hashmap__add(hash, data->key, data) < 0) {
+ free(data);
+ return -ENOMEM;
+ }
+ }
+
+	/* update per-cgroup total stats */
+ data->nr_events += map_data->count;
+ data->total_time += map_data->total_time;
+
+ nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes));
+ if (nodes == NULL)
+ return -ENOMEM;
+
+ data->nodes = nodes;
+ nodes = &data->nodes[data->nr_nodes++];
+ nodes->syscall_nr = map_key->nr;
+
+	/* each cgroup has an entry for each syscall, just use the stat */
+ memcpy(&nodes->stats, map_data, sizeof(*map_data));
+ return 0;
+}
+
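+/* print one cgroup's syscall summary table, resolving the id to a name when possible */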
+static int print_cgroup_stat(struct syscall_data *data, FILE *fp)
+{
+ int printed = 0;
+ struct cgroup *cgrp = __cgroup__find(&cgroups, data->key);
+
+ qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);
+
+ if (cgrp)
+ printed += fprintf(fp, " cgroup %s,", cgrp->name);
+ else
+ printed += fprintf(fp, " cgroup id:%lu,", (unsigned long)data->key);
+
+ printed += fprintf(fp, " %d events\n\n", data->nr_events);
+
+ printed += fprintf(fp, " syscall calls errors total min avg max stddev\n");
+ printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
+ printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n");
+
+ printed += print_common_stats(data, fp);
+ printed += fprintf(fp, "\n\n");
+
+ return printed;
+}
+
+static int print_cgroup_stats(struct syscall_data **data, int nr_data, FILE *fp)
+{
+ int printed = 0;
+
+ for (int i = 0; i < nr_data; i++)
+ printed += print_cgroup_stat(data[i], fp);
+
+ return printed;
+}
+
int trace_print_bpf_summary(FILE *fp)
{
struct bpf_map *map = skel->maps.syscall_stats_map;
struct syscall_stats stat;
if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) {
- if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
+ switch (skel->rodata->aggr_mode) {
+ case SYSCALL_AGGR_THREAD:
update_thread_stats(&schash, &key, &stat);
- else
+ break;
+ case SYSCALL_AGGR_CPU:
update_total_stats(&schash, &key, &stat);
+ break;
+ case SYSCALL_AGGR_CGROUP:
+ update_cgroup_stats(&schash, &key, &stat);
+ break;
+ default:
+ break;
+ }
}
prev_key = &key;
qsort(data, nr_data, sizeof(*data), datacmp);
- if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
+ switch (skel->rodata->aggr_mode) {
+ case SYSCALL_AGGR_THREAD:
printed += print_thread_stats(data, nr_data, fp);
- else
+ break;
+ case SYSCALL_AGGR_CPU:
printed += print_total_stats(data, nr_data, fp);
+ break;
+ case SYSCALL_AGGR_CGROUP:
+ printed += print_cgroup_stats(data, nr_data, fp);
+ break;
+ default:
+ break;
+ }
for (i = 0; i < nr_data && data; i++) {
free(data[i]->nodes);
void trace_cleanup_bpf_summary(void)
{
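+	/* drop the cgroup name cache built by read_all_cgroups() */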
+ if (!RB_EMPTY_ROOT(&cgroups)) {
+ struct cgroup *cgrp, *tmp;
+
+ rbtree_postorder_for_each_entry_safe(cgrp, tmp, &cgroups, node)
+ cgroup__put(cgrp);
+
+ cgroups = RB_ROOT;
+ }
+
syscall_summary_bpf__destroy(skel);
}
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
/* This is to calculate a delta between sys-enter and sys-exit for each thread */
struct syscall_trace {
int enabled; /* controlled from userspace */
const volatile enum syscall_aggr_mode aggr_mode;
+const volatile int use_cgroup_v2;
-static void update_stats(int cpu_or_tid, int nr, s64 duration, long ret)
+int perf_subsys_id = -1;
+
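+/*
+ * Resolve the current task's cgroup id.  On cgroup v2 the unified
+ * hierarchy id is enough; on v1, fall back to the kernfs node id of
+ * the cgroup attached to the perf_event subsystem, read via CO-RE.
+ */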
+static inline __u64 get_current_cgroup_id(void)
+{
+ struct task_struct *task;
+ struct cgroup *cgrp;
+
+ if (use_cgroup_v2)
+ return bpf_get_current_cgroup_id();
+
+ task = bpf_get_current_task_btf();
+
+ if (perf_subsys_id == -1) {
+#if __has_builtin(__builtin_preserve_enum_value)
+ perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
+ perf_event_cgrp_id);
+#else
+ perf_subsys_id = perf_event_cgrp_id;
+#endif
+ }
+
+ cgrp = BPF_CORE_READ(task, cgroups, subsys[perf_subsys_id], cgroup);
+ return BPF_CORE_READ(cgrp, kn, id);
+}
+
+static void update_stats(int cpu_or_tid, u64 cgroup_id, int nr, s64 duration,
+ long ret)
{
- struct syscall_key key = { .cpu_or_tid = cpu_or_tid, .nr = nr, };
+ struct syscall_key key = {
+ .cpu_or_tid = cpu_or_tid,
+ .cgroup = cgroup_id,
+ .nr = nr,
+ };
struct syscall_stats *stats;
stats = bpf_map_lookup_elem(&syscall_stats_map, &key);
int sys_exit(u64 *ctx)
{
int tid;
- int key;
+ int key = 0;
+ u64 cgroup = 0;
long ret = ctx[1]; /* return value of the syscall */
struct syscall_trace *st;
s64 delta;
if (aggr_mode == SYSCALL_AGGR_THREAD)
key = tid;
+ else if (aggr_mode == SYSCALL_AGGR_CGROUP)
+ cgroup = get_current_cgroup_id();
else
key = bpf_get_smp_processor_id();
delta = bpf_ktime_get_ns() - st->timestamp;
- update_stats(key, st->nr, delta, ret);
+ update_stats(key, cgroup, st->nr, delta, ret);
bpf_map_delete_elem(&syscall_trace_map, &tid);
return 0;