perf record --off-cpu: Parse off-cpu event
authorHoward Chu <howardchu95@gmail.com>
Thu, 1 May 2025 02:28:00 +0000 (19:28 -0700)
committerArnaldo Carvalho de Melo <acme@redhat.com>
Tue, 6 May 2025 00:48:02 +0000 (21:48 -0300)
Parse the off-cpu event using parse_event(), as a bpf-output event.
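
The string handed to parse_event() uses the same syntax as 'perf record -e';
with OFFCPU_EVENT being the "offcpu-time" name used by the off-cpu code, it
expands to something like:

	bpf-output/name=offcpu-time/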

Call evlist__enable_evsel() on the off-cpu event. This fixes the inability
to collect direct off-cpu samples when recording a workload, as reported
by Arnaldo Carvalho de Melo <acme@redhat.com>.

The reason is that recording a workload sets enable_on_exec instead of
calling evlist__enable(), but the off-cpu event does not attach to an
executable and execve() will not be called for it, so the fds from
perf_event_open() are never enabled.
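
Enabling them manually is what evlist__enable_evsel() does; at the fd level
it boils down to roughly the following (a minimal sketch, not the
perf-internal code):

#include <sys/ioctl.h>
#include <linux/perf_event.h>

/*
 * enable_on_exec only arms the event when the task calls execve(); an fd
 * that is not tied to the workload's exec has to be enabled explicitly.
 */
static int enable_perf_fd(int perf_fd)
{
	return ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, 0);
}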

no-inherit should be set to 1, for the following reason:

We update the BPF perf_event map for direct off-cpu sample dumping (in
the following patches); the update executes as follows:

bpf_map_update_value()
 bpf_fd_array_map_update_elem()
  perf_event_fd_array_get_ptr()
   perf_event_read_local()

In perf_event_read_local(), there is:

int perf_event_read_local(struct perf_event *event, u64 *value,
			  u64 *enabled, u64 *running)
{
...
	/*
	 * It must not be an event with inherit set, we cannot read
	 * all child counters from atomic context.
	 */
	if (event->attr.inherit) {
		ret = -EOPNOTSUPP;
		goto out;
	}

This means no-inherit has to be true (inherit disabled) for the BPF
perf_event map update to succeed.
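
For illustration, the user-space side of such an update looks roughly like
this (a sketch only; the helper name is made up, the libbpf call and
perf_event_open() usage are the standard ones):

#include <unistd.h>
#include <sys/syscall.h>
#include <bpf/bpf.h>
#include <linux/perf_event.h>

/*
 * Open a software BPF_OUTPUT event with inherit disabled and install its
 * fd into a BPF_MAP_TYPE_PERF_EVENT_ARRAY slot for the given CPU.  With
 * attr.inherit set, perf_event_read_local() rejects the event and the map
 * update fails with -EOPNOTSUPP.
 */
static int install_offcpu_fd(int map_fd, int cpu)
{
	struct perf_event_attr attr = {
		.type    = PERF_TYPE_SOFTWARE,
		.config  = PERF_COUNT_SW_BPF_OUTPUT,
		.size    = sizeof(attr),
		.inherit = 0,			/* no-inherit, see above */
	};
	int fd = syscall(SYS_perf_event_open, &attr, -1 /* pid */, cpu,
			 -1 /* group_fd */, 0 /* flags */);

	if (fd < 0)
		return -1;

	return bpf_map_update_elem(map_fd, &cpu, &fd, BPF_ANY);
}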

Moreover, for bpf-output events, we primarily want a system-wide event
instead of a per-task event.

The reason is that bpf_perf_event_output() uses the CPU index to retrieve,
from the perf_event array map, the perf_event file descriptor it outputs to.

Making the bpf-output event system-wide naturally satisfies this
requirement, since a system-wide event has a file descriptor for every
CPU, which maps directly onto the CPU index.
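
On the BPF side this indexing is done with BPF_F_CURRENT_CPU; a minimal
sketch (the map, struct and attach point below are illustrative, not the
actual off-cpu skeleton):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u32));
} offcpu_output SEC(".maps");

struct offcpu_data {
	__u64 tstamp;
	__u64 offcpu_ns;
};

SEC("tracepoint/sched/sched_switch")
int dump_offcpu(void *ctx)
{
	struct offcpu_data data = {};

	/*
	 * BPF_F_CURRENT_CPU makes the helper look up the perf_event fd at
	 * the index of the CPU we are running on; a system-wide event
	 * guarantees that every CPU index has an fd installed in the map.
	 */
	bpf_perf_event_output(ctx, &offcpu_output, BPF_F_CURRENT_CPU,
			      &data, sizeof(data));
	return 0;
}

char LICENSE[] SEC("license") = "GPL";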

Suggested-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Howard Chu <howardchu95@gmail.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Tested-by: Gautam Menghani <gautam@linux.ibm.com>
Tested-by: Ian Rogers <irogers@google.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20241108204137.2444151-4-howardchu95@gmail.com
Link: https://lore.kernel.org/r/20250501022809.449767-3-howardchu95@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
tools/perf/builtin-record.c
tools/perf/util/bpf_off_cpu.c
tools/perf/util/evsel.c

index 6637a3acb1f1295f0aa92f2961c9bf38cdab5b45..4194ea5ac72999a14b94a861184a7a5a0ccda6ad 100644 (file)
@@ -2568,6 +2568,13 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
        if (!target__none(&opts->target) && !opts->target.initial_delay)
                evlist__enable(rec->evlist);
 
+       /*
+        * offcpu-time does not call execve, so enable_on_exec wouldn't work
+        * when recording a workload, do it manually
+        */
+       if (rec->off_cpu)
+               evlist__enable_evsel(rec->evlist, (char *)OFFCPU_EVENT);
+
        /*
         * Let the child rip
         */
index 4269b41d1771dfff0ce4f96fc60faed13077633e..2101aa2b7c4270f85cba456d6f672c01a56eaf6d 100644 (file)
@@ -38,32 +38,21 @@ union off_cpu_data {
 
 static int off_cpu_config(struct evlist *evlist)
 {
+       char off_cpu_event[64];
        struct evsel *evsel;
-       struct perf_event_attr attr = {
-               .type   = PERF_TYPE_SOFTWARE,
-               .config = PERF_COUNT_SW_BPF_OUTPUT,
-               .size   = sizeof(attr), /* to capture ABI version */
-       };
-       char *evname = strdup(OFFCPU_EVENT);
-
-       if (evname == NULL)
-               return -ENOMEM;
 
-       evsel = evsel__new(&attr);
-       if (!evsel) {
-               free(evname);
-               return -ENOMEM;
+       scnprintf(off_cpu_event, sizeof(off_cpu_event), "bpf-output/name=%s/", OFFCPU_EVENT);
+       if (parse_event(evlist, off_cpu_event)) {
+               pr_err("Failed to open off-cpu event\n");
+               return -1;
        }
 
-       evsel->core.attr.freq = 1;
-       evsel->core.attr.sample_period = 1;
-       /* off-cpu analysis depends on stack trace */
-       evsel->core.attr.sample_type = PERF_SAMPLE_CALLCHAIN;
-
-       evlist__add(evlist, evsel);
-
-       free(evsel->name);
-       evsel->name = evname;
+       evlist__for_each_entry(evlist, evsel) {
+               if (evsel__is_offcpu_event(evsel)) {
+                       evsel->core.system_wide = true;
+                       break;
+               }
+       }
 
        return 0;
 }
index 99eee3a2c76258b2818deb42b45cf2138eecd7a9..2415e067b8926b4077f0b1a8b5ccd2776e81c24f 100644 (file)
@@ -1555,8 +1555,10 @@ void evsel__config(struct evsel *evsel, struct record_opts *opts,
        if (evsel__is_dummy_event(evsel))
                evsel__reset_sample_bit(evsel, BRANCH_STACK);
 
-       if (evsel__is_offcpu_event(evsel))
+       if (evsel__is_offcpu_event(evsel)) {
                evsel->core.attr.sample_type &= OFFCPU_SAMPLE_TYPES;
+               attr->inherit = 0;
+       }
 
        arch__post_evsel_config(evsel, attr);
 }