perf report: Add --latency flag
author Dmitry Vyukov <dvyukov@google.com>
Thu, 13 Feb 2025 09:08:19 +0000 (10:08 +0100)
committer Namhyung Kim <namhyung@kernel.org>
Tue, 18 Feb 2025 22:04:32 +0000 (14:04 -0800)
Add record/report --latency flag that allows to capture and show
latency-centric profiles rather than the default CPU-consumption-centric
profiles. For latency profiles record captures context switch events,
and report shows Latency as the first column.

Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Link: https://lore.kernel.org/r/e9640464bcbc47dde2cb557003f421052ebc9eec.1739437531.git.dvyukov@google.com
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
tools/perf/Documentation/perf-record.txt
tools/perf/Documentation/perf-report.txt
tools/perf/builtin-record.c
tools/perf/builtin-report.c
tools/perf/ui/hist.c
tools/perf/util/hist.h
tools/perf/util/sort.c
tools/perf/util/sort.h
tools/perf/util/symbol_conf.h

index 80686d590de24a2d1fceefcf795bf280af68441e..c7fc1ba265e2755d339c4b04c9ea5ea79d2eae86 100644 (file)
@@ -227,6 +227,10 @@ OPTIONS
        '--filter' exists, the new filter expression will be combined with
        them by '&&'.
 
+--latency::
+       Enable data collection for latency profiling.
+       Use perf report --latency for latency-centric profile.
+
 -a::
 --all-cpus::
         System-wide collection from all CPUs (default if no target is specified).
index 87f86451940623f39de8e4b3b01683958ad961df..66794131aec4889280d205c398f9fb0e6557d5c1 100644 (file)
@@ -68,6 +68,11 @@ OPTIONS
 --hide-unresolved::
         Only display entries resolved to a symbol.
 
+--latency::
+        Show latency-centric profile rather than the default
+        CPU-consumption-centric profile
+        (requires perf record --latency flag).
+
 -s::
 --sort=::
        Sort histogram entries by given key(s) - multiple keys can be specified
index cda7e6a7b45d03d111de839009c73e36faf24d5a..9af3f21fd0153327f920b6790fc796179dc6e0bb 100644 (file)
@@ -161,6 +161,7 @@ struct record {
        struct evlist           *sb_evlist;
        pthread_t               thread_id;
        int                     realtime_prio;
+       bool                    latency;
        bool                    switch_output_event_set;
        bool                    no_buildid;
        bool                    no_buildid_set;
@@ -3373,6 +3374,9 @@ static struct option __record_options[] = {
                     parse_events_option),
        OPT_CALLBACK(0, "filter", &record.evlist, "filter",
                     "event filter", parse_filter),
+       OPT_BOOLEAN(0, "latency", &record.latency,
+                   "Enable data collection for latency profiling.\n"
+                   "\t\t\t  Use perf report --latency for latency-centric profile."),
        OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
                           NULL, "don't record events from perf itself",
                           exclude_perf),
@@ -4019,6 +4023,22 @@ int cmd_record(int argc, const char **argv)
 
        }
 
+       if (record.latency) {
+               /*
+                * There is no fundamental reason why latency profiling
+                * can't work for system-wide mode, but exact semantics
+                * and details are to be defined.
+                * See the following thread for details:
+                * https://lore.kernel.org/all/Z4XDJyvjiie3howF@google.com/
+                */
+               if (record.opts.target.system_wide) {
+                       pr_err("Failed: latency profiling is not supported with system-wide collection.\n");
+                       err = -EINVAL;
+                       goto out_opts;
+               }
+               record.opts.record_switch_events = true;
+       }
+
        if (rec->buildid_mmap) {
                if (!perf_can_record_build_id()) {
                        pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
index 2a19abdc869a179c21c577e8cfb33e29390bcf2d..8e064b8bd589dc8dffbdfec0b934429963c4a5e5 100644 (file)
@@ -112,6 +112,8 @@ struct report {
        u64                     nr_entries;
        u64                     queue_size;
        u64                     total_cycles;
+       u64                     total_samples;
+       u64                     singlethreaded_samples;
        int                     socket_filter;
        DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
        struct branch_type_stat brtype_stat;
@@ -331,6 +333,10 @@ static int process_sample_event(const struct perf_tool *tool,
                                     &rep->total_cycles, evsel);
        }
 
+       rep->total_samples++;
+       if (al.parallelism == 1)
+               rep->singlethreaded_samples++;
+
        ret = hist_entry_iter__add(&iter, &al, rep->max_stack, rep);
        if (ret < 0)
                pr_debug("problem adding hist entry, skipping event\n");
@@ -1079,6 +1085,11 @@ static int __cmd_report(struct report *rep)
                return ret;
        }
 
+       /* Don't show Latency column for non-parallel profiles by default. */
+       if (!symbol_conf.prefer_latency && rep->total_samples &&
+               rep->singlethreaded_samples * 100 / rep->total_samples >= 99)
+               perf_hpp__cancel_latency();
+
        evlist__check_mem_load_aux(session->evlist);
 
        if (rep->stats_mode)
@@ -1468,6 +1479,10 @@ int cmd_report(int argc, const char **argv)
                    "Disable raw trace ordering"),
        OPT_BOOLEAN(0, "skip-empty", &report.skip_empty,
                    "Do not display empty (or dummy) events in the output"),
+       OPT_BOOLEAN(0, "latency", &symbol_conf.prefer_latency,
+                   "Show latency-centric profile rather than the default\n"
+                   "\t\t\t  CPU-consumption-centric profile\n"
+                   "\t\t\t  (requires perf record --latency flag)."),
        OPT_END()
        };
        struct perf_data data = {
@@ -1722,16 +1737,25 @@ repeat:
                symbol_conf.annotate_data_sample = true;
        }
 
+       symbol_conf.enable_latency = true;
        if (report.disable_order || !perf_session__has_switch_events(session)) {
                if (symbol_conf.parallelism_list_str ||
-                               (sort_order && strstr(sort_order, "parallelism")) ||
-                               (field_order && strstr(field_order, "parallelism"))) {
+                       symbol_conf.prefer_latency ||
+                       (sort_order && (strstr(sort_order, "latency") ||
+                               strstr(sort_order, "parallelism"))) ||
+                       (field_order && (strstr(field_order, "latency") ||
+                               strstr(field_order, "parallelism")))) {
                        if (report.disable_order)
-                               ui__error("Use of parallelism is incompatible with --disable-order.\n");
+                               ui__error("Use of latency profile or parallelism is incompatible with --disable-order.\n");
                        else
-                               ui__error("Use of parallelism requires --switch-events during record.\n");
+                               ui__error("Use of latency profile or parallelism requires --latency flag during record.\n");
                        return -1;
                }
+               /*
+                * If user did not ask for anything related to
+                * latency/parallelism explicitly, just don't show it.
+                */
+               symbol_conf.enable_latency = false;
        }
 
        if (sort_order && strstr(sort_order, "ipc")) {
index 6de6309595f9eb8896ea657fde6959bb2b76ceb4..ae3b7fe1dadc8f22b34a0c0708880217e701c0a9 100644 (file)
@@ -631,28 +631,48 @@ void perf_hpp__init(void)
        if (is_strict_order(field_order))
                return;
 
+       /*
+        * Overhead and latency columns are added in setup_overhead(),
+        * so they are added implicitly here only if they were added
+        * by setup_overhead() before (have was_taken flag set).
+        * This is required because setup_overhead() has more complex
+        * logic, in particular it does not add "overhead" if user
+        * specified "latency" in sort order, and vice versa.
+        */
        if (symbol_conf.cumulate_callchain) {
-               hpp_dimension__add_output(PERF_HPP__OVERHEAD_ACC);
+               /*
+                * Addition of fields is idempotent, so we add latency
+                * column twice to get desired order with simpler logic.
+                */
+               if (symbol_conf.prefer_latency)
+                       hpp_dimension__add_output(PERF_HPP__LATENCY_ACC, true);
+               hpp_dimension__add_output(PERF_HPP__OVERHEAD_ACC, true);
+               if (symbol_conf.enable_latency)
+                       hpp_dimension__add_output(PERF_HPP__LATENCY_ACC, true);
                perf_hpp__format[PERF_HPP__OVERHEAD].name = "Self";
        }
 
-       hpp_dimension__add_output(PERF_HPP__OVERHEAD);
+       if (symbol_conf.prefer_latency)
+               hpp_dimension__add_output(PERF_HPP__LATENCY, true);
+       hpp_dimension__add_output(PERF_HPP__OVERHEAD, true);
+       if (symbol_conf.enable_latency)
+               hpp_dimension__add_output(PERF_HPP__LATENCY, true);
 
        if (symbol_conf.show_cpu_utilization) {
-               hpp_dimension__add_output(PERF_HPP__OVERHEAD_SYS);
-               hpp_dimension__add_output(PERF_HPP__OVERHEAD_US);
+               hpp_dimension__add_output(PERF_HPP__OVERHEAD_SYS, false);
+               hpp_dimension__add_output(PERF_HPP__OVERHEAD_US, false);
 
                if (perf_guest) {
-                       hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_SYS);
-                       hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_US);
+                       hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_SYS, false);
+                       hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_US, false);
                }
        }
 
        if (symbol_conf.show_nr_samples)
-               hpp_dimension__add_output(PERF_HPP__SAMPLES);
+               hpp_dimension__add_output(PERF_HPP__SAMPLES, false);
 
        if (symbol_conf.show_total_period)
-               hpp_dimension__add_output(PERF_HPP__PERIOD);
+               hpp_dimension__add_output(PERF_HPP__PERIOD, false);
 }
 
 void perf_hpp_list__column_register(struct perf_hpp_list *list,
@@ -701,6 +721,24 @@ void perf_hpp__cancel_cumulate(void)
        }
 }
 
+void perf_hpp__cancel_latency(void)
+{
+       struct perf_hpp_fmt *fmt, *lat, *acc, *tmp;
+
+       if (is_strict_order(field_order))
+               return;
+       if (sort_order && strstr(sort_order, "latency"))
+               return;
+
+       lat = &perf_hpp__format[PERF_HPP__LATENCY];
+       acc = &perf_hpp__format[PERF_HPP__LATENCY_ACC];
+
+       perf_hpp_list__for_each_format_safe(&perf_hpp_list, fmt, tmp) {
+               if (fmt_equal(lat, fmt) || fmt_equal(acc, fmt))
+                       perf_hpp__column_unregister(fmt);
+       }
+}
+
 void perf_hpp__setup_output_field(struct perf_hpp_list *list)
 {
        struct perf_hpp_fmt *fmt;
index 91159f16c60b24a29007be7927b838917adf0523..29d4c7a3d174736a4c780d750e5db2be2c3855c1 100644 (file)
@@ -582,6 +582,7 @@ enum {
 
 void perf_hpp__init(void);
 void perf_hpp__cancel_cumulate(void);
+void perf_hpp__cancel_latency(void);
 void perf_hpp__setup_output_field(struct perf_hpp_list *list);
 void perf_hpp__reset_output_field(struct perf_hpp_list *list);
 void perf_hpp__append_sort_keys(struct perf_hpp_list *list);
index bc4c3acfe7552dd067493c80b13cb522d0ea8f68..2b6023de7a53ae2ee04745f90d93d8bfab61f11b 100644 (file)
@@ -2622,6 +2622,7 @@ struct hpp_dimension {
        const char              *name;
        struct perf_hpp_fmt     *fmt;
        int                     taken;
+       int                     was_taken;
 };
 
 #define DIM(d, n) { .name = n, .fmt = &perf_hpp__format[d], }
@@ -3513,6 +3514,7 @@ static int __hpp_dimension__add(struct hpp_dimension *hd,
                return -1;
 
        hd->taken = 1;
+       hd->was_taken = 1;
        perf_hpp_list__register_sort_field(list, fmt);
        return 0;
 }
@@ -3547,10 +3549,15 @@ static int __hpp_dimension__add_output(struct perf_hpp_list *list,
        return 0;
 }
 
-int hpp_dimension__add_output(unsigned col)
+int hpp_dimension__add_output(unsigned col, bool implicit)
 {
+       struct hpp_dimension *hd;
+
        BUG_ON(col >= PERF_HPP__MAX_INDEX);
-       return __hpp_dimension__add_output(&perf_hpp_list, &hpp_sort_dimensions[col]);
+       hd = &hpp_sort_dimensions[col];
+       if (implicit && !hd->was_taken)
+               return 0;
+       return __hpp_dimension__add_output(&perf_hpp_list, hd);
 }
 
 int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
@@ -3809,10 +3816,24 @@ static char *setup_overhead(char *keys)
        if (sort__mode == SORT_MODE__DIFF)
                return keys;
 
-       keys = prefix_if_not_in("overhead", keys);
-
-       if (symbol_conf.cumulate_callchain)
-               keys = prefix_if_not_in("overhead_children", keys);
+       if (symbol_conf.prefer_latency) {
+               keys = prefix_if_not_in("overhead", keys);
+               keys = prefix_if_not_in("latency", keys);
+               if (symbol_conf.cumulate_callchain) {
+                       keys = prefix_if_not_in("overhead_children", keys);
+                       keys = prefix_if_not_in("latency_children", keys);
+               }
+       } else if (!keys || (!strstr(keys, "overhead") &&
+                       !strstr(keys, "latency"))) {
+               if (symbol_conf.enable_latency)
+                       keys = prefix_if_not_in("latency", keys);
+               keys = prefix_if_not_in("overhead", keys);
+               if (symbol_conf.cumulate_callchain) {
+                       if (symbol_conf.enable_latency)
+                               keys = prefix_if_not_in("latency_children", keys);
+                       keys = prefix_if_not_in("overhead_children", keys);
+               }
+       }
 
        return keys;
 }
index 11fb15f91409358296e19e28a6f52c2fd13b91ee..180d36a2bea355dab05d0ce0d2178f1dbd37923d 100644 (file)
@@ -141,7 +141,7 @@ int report_parse_ignore_callees_opt(const struct option *opt, const char *arg, i
 
 bool is_strict_order(const char *order);
 
-int hpp_dimension__add_output(unsigned col);
+int hpp_dimension__add_output(unsigned col, bool implicit);
 void reset_dimensions(void);
 int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
                        struct evlist *evlist,
index c5b2e56127e228d80368f3f88c33ed00bb3c2e47..cd9aa82c7d5ad941d74407c35e033010cf1c9cab 100644 (file)
@@ -49,7 +49,9 @@ struct symbol_conf {
                        keep_exited_threads,
                        annotate_data_member,
                        annotate_data_sample,
-                       skip_empty;
+                       skip_empty,
+                       enable_latency,
+                       prefer_latency;
        const char      *vmlinux_name,
                        *kallsyms_name,
                        *source_prefix,