perf report: Add --latency flag
author Dmitry Vyukov <dvyukov@google.com>
Thu, 13 Feb 2025 09:08:19 +0000 (10:08 +0100)
committer Namhyung Kim <namhyung@kernel.org>
Tue, 18 Feb 2025 22:04:32 +0000 (14:04 -0800)
Add record/report --latency flag that allows to capture and show
latency-centric profiles rather than the default CPU-consumption-centric
profiles. For latency profiles record captures context switch events,
and report shows Latency as the first column.

Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Link: https://lore.kernel.org/r/e9640464bcbc47dde2cb557003f421052ebc9eec.1739437531.git.dvyukov@google.com
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
tools/perf/Documentation/perf-record.txt
tools/perf/Documentation/perf-report.txt
tools/perf/builtin-record.c
tools/perf/builtin-report.c
tools/perf/ui/hist.c
tools/perf/util/hist.h
tools/perf/util/sort.c
tools/perf/util/sort.h
tools/perf/util/symbol_conf.h

index 80686d590de24a2d1fceefcf795bf280af68441e..c7fc1ba265e2755d339c4b04c9ea5ea79d2eae86 100644 (file)
@@ -227,6 +227,10 @@ OPTIONS
        '--filter' exists, the new filter expression will be combined with
        them by '&&'.
 
+--latency::
+       Enable data collection for latency profiling.
+       Use perf report --latency for latency-centric profile.
+
 -a::
 --all-cpus::
         System-wide collection from all CPUs (default if no target is specified).
index 87f86451940623f39de8e4b3b01683958ad961df..66794131aec4889280d205c398f9fb0e6557d5c1 100644 (file)
@@ -68,6 +68,11 @@ OPTIONS
 --hide-unresolved::
         Only display entries resolved to a symbol.
 
+--latency::
+        Show latency-centric profile rather than the default
+        CPU-consumption-centric profile
+        (requires perf record --latency flag).
+
 -s::
 --sort=::
        Sort histogram entries by given key(s) - multiple keys can be specified
index cda7e6a7b45d03d111de839009c73e36faf24d5a..9af3f21fd0153327f920b6790fc796179dc6e0bb 100644 (file)
@@ -161,6 +161,7 @@ struct record {
        struct evlist           *sb_evlist;
        pthread_t               thread_id;
        int                     realtime_prio;
+       bool                    latency;
        bool                    switch_output_event_set;
        bool                    no_buildid;
        bool                    no_buildid_set;
@@ -3373,6 +3374,9 @@ static struct option __record_options[] = {
                     parse_events_option),
        OPT_CALLBACK(0, "filter", &record.evlist, "filter",
                     "event filter", parse_filter),
+       OPT_BOOLEAN(0, "latency", &record.latency,
+                   "Enable data collection for latency profiling.\n"
+                   "\t\t\t  Use perf report --latency for latency-centric profile."),
        OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
                           NULL, "don't record events from perf itself",
                           exclude_perf),
@@ -4019,6 +4023,22 @@ int cmd_record(int argc, const char **argv)
 
        }
 
+       if (record.latency) {
+               /*
+                * There is no fundamental reason why latency profiling
+                * can't work for system-wide mode, but exact semantics
+                * and details are to be defined.
+                * See the following thread for details:
+                * https://lore.kernel.org/all/Z4XDJyvjiie3howF@google.com/
+                */
+               if (record.opts.target.system_wide) {
+                       pr_err("Failed: latency profiling is not supported with system-wide collection.\n");
+                       err = -EINVAL;
+                       goto out_opts;
+               }
+               record.opts.record_switch_events = true;
+       }
+
        if (rec->buildid_mmap) {
                if (!perf_can_record_build_id()) {
                        pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
index 2a19abdc869a179c21c577e8cfb33e29390bcf2d..8e064b8bd589dc8dffbdfec0b934429963c4a5e5 100644 (file)
@@ -112,6 +112,8 @@ struct report {
        u64                     nr_entries;
        u64                     queue_size;
        u64                     total_cycles;
+       u64                     total_samples;
+       u64                     singlethreaded_samples;
        int                     socket_filter;
        DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
        struct branch_type_stat brtype_stat;
@@ -331,6 +333,10 @@ static int process_sample_event(const struct perf_tool *tool,
                                     &rep->total_cycles, evsel);
        }
 
+       rep->total_samples++;
+       if (al.parallelism == 1)
+               rep->singlethreaded_samples++;
+
        ret = hist_entry_iter__add(&iter, &al, rep->max_stack, rep);
        if (ret < 0)
                pr_debug("problem adding hist entry, skipping event\n");
@@ -1079,6 +1085,11 @@ static int __cmd_report(struct report *rep)
                return ret;
        }
 
+       /* Don't show Latency column for non-parallel profiles by default. */
+       if (!symbol_conf.prefer_latency && rep->total_samples &&
+               rep->singlethreaded_samples * 100 / rep->total_samples >= 99)
+               perf_hpp__cancel_latency();
+
        evlist__check_mem_load_aux(session->evlist);
 
        if (rep->stats_mode)
@@ -1468,6 +1479,10 @@ int cmd_report(int argc, const char **argv)
                    "Disable raw trace ordering"),
        OPT_BOOLEAN(0, "skip-empty", &report.skip_empty,
                    "Do not display empty (or dummy) events in the output"),
+       OPT_BOOLEAN(0, "latency", &symbol_conf.prefer_latency,
+                   "Show latency-centric profile rather than the default\n"
+                   "\t\t\t  CPU-consumption-centric profile\n"
+                   "\t\t\t  (requires perf record --latency flag)."),
        OPT_END()
        };
        struct perf_data data = {
@@ -1722,16 +1737,25 @@ repeat:
                symbol_conf.annotate_data_sample = true;
        }
 
+       symbol_conf.enable_latency = true;
        if (report.disable_order || !perf_session__has_switch_events(session)) {
                if (symbol_conf.parallelism_list_str ||
-                               (sort_order && strstr(sort_order, "parallelism")) ||
-                               (field_order && strstr(field_order, "parallelism"))) {
+                       symbol_conf.prefer_latency ||
+                       (sort_order && (strstr(sort_order, "latency") ||
+                               strstr(sort_order, "parallelism"))) ||
+                       (field_order && (strstr(field_order, "latency") ||
+                               strstr(field_order, "parallelism")))) {
                        if (report.disable_order)
-                               ui__error("Use of parallelism is incompatible with --disable-order.\n");
+                               ui__error("Use of latency profile or parallelism is incompatible with --disable-order.\n");
                        else
-                               ui__error("Use of parallelism requires --switch-events during record.\n");
+                               ui__error("Use of latency profile or parallelism requires --latency flag during record.\n");
                        return -1;
                }
+               /*
+                * If user did not ask for anything related to
+                * latency/parallelism explicitly, just don't show it.
+                */
+               symbol_conf.enable_latency = false;
        }
 
        if (sort_order && strstr(sort_order, "ipc")) {
index 6de6309595f9eb8896ea657fde6959bb2b76ceb4..ae3b7fe1dadc8f22b34a0c0708880217e701c0a9 100644 (file)
@@ -631,28 +631,48 @@ void perf_hpp__init(void)
        if (is_strict_order(field_order))
                return;
 
+       /*
+        * Overhead and latency columns are added in setup_overhead(),
+        * so they are added implicitly here only if they were added
+        * by setup_overhead() before (have was_taken flag set).
+        * This is required because setup_overhead() has more complex
+        * logic, in particular it does not add "overhead" if user
+        * specified "latency" in sort order, and vice versa.
+        */
        if (symbol_conf.cumulate_callchain) {
-               hpp_dimension__add_output(PERF_HPP__OVERHEAD_ACC);
+               /*
+                * Addition of fields is idempotent, so we add latency
+                * column twice to get desired order with simpler logic.
+                */
+               if (symbol_conf.prefer_latency)
+                       hpp_dimension__add_output(PERF_HPP__LATENCY_ACC, true);
+               hpp_dimension__add_output(PERF_HPP__OVERHEAD_ACC, true);
+               if (symbol_conf.enable_latency)
+                       hpp_dimension__add_output(PERF_HPP__LATENCY_ACC, true);
                perf_hpp__format[PERF_HPP__OVERHEAD].name = "Self";
        }
 
-       hpp_dimension__add_output(PERF_HPP__OVERHEAD);
+       if (symbol_conf.prefer_latency)
+               hpp_dimension__add_output(PERF_HPP__LATENCY, true);
+       hpp_dimension__add_output(PERF_HPP__OVERHEAD, true);
+       if (symbol_conf.enable_latency)
+               hpp_dimension__add_output(PERF_HPP__LATENCY, true);
 
        if (symbol_conf.show_cpu_utilization) {
-               hpp_dimension__add_output(PERF_HPP__OVERHEAD_SYS);
-               hpp_dimension__add_output(PERF_HPP__OVERHEAD_US);
+               hpp_dimension__add_output(PERF_HPP__OVERHEAD_SYS, false);
+               hpp_dimension__add_output(PERF_HPP__OVERHEAD_US, false);
 
                if (perf_guest) {
-                       hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_SYS);
-                       hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_US);
+                       hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_SYS, false);
+                       hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_US, false);
                }
        }
 
        if (symbol_conf.show_nr_samples)
-               hpp_dimension__add_output(PERF_HPP__SAMPLES);
+               hpp_dimension__add_output(PERF_HPP__SAMPLES, false);
 
        if (symbol_conf.show_total_period)
-               hpp_dimension__add_output(PERF_HPP__PERIOD);
+               hpp_dimension__add_output(PERF_HPP__PERIOD, false);
 }
 
 void perf_hpp_list__column_register(struct perf_hpp_list *list,
@@ -701,6 +721,24 @@ void perf_hpp__cancel_cumulate(void)
        }
 }
 
+void perf_hpp__cancel_latency(void)
+{
+       struct perf_hpp_fmt *fmt, *lat, *acc, *tmp;
+
+       if (is_strict_order(field_order))
+               return;
+       if (sort_order && strstr(sort_order, "latency"))
+               return;
+
+       lat = &perf_hpp__format[PERF_HPP__LATENCY];
+       acc = &perf_hpp__format[PERF_HPP__LATENCY_ACC];
+
+       perf_hpp_list__for_each_format_safe(&perf_hpp_list, fmt, tmp) {
+               if (fmt_equal(lat, fmt) || fmt_equal(acc, fmt))
+                       perf_hpp__column_unregister(fmt);
+       }
+}
+
 void perf_hpp__setup_output_field(struct perf_hpp_list *list)
 {
        struct perf_hpp_fmt *fmt;
index 91159f16c60b24a29007be7927b838917adf0523..29d4c7a3d174736a4c780d750e5db2be2c3855c1 100644 (file)
@@ -582,6 +582,7 @@ enum {
 
 void perf_hpp__init(void);
 void perf_hpp__cancel_cumulate(void);
+void perf_hpp__cancel_latency(void);
 void perf_hpp__setup_output_field(struct perf_hpp_list *list);
 void perf_hpp__reset_output_field(struct perf_hpp_list *list);
 void perf_hpp__append_sort_keys(struct perf_hpp_list *list);
index bc4c3acfe7552dd067493c80b13cb522d0ea8f68..2b6023de7a53ae2ee04745f90d93d8bfab61f11b 100644 (file)
@@ -2622,6 +2622,7 @@ struct hpp_dimension {
        const char              *name;
        struct perf_hpp_fmt     *fmt;
        int                     taken;
+       int                     was_taken;
 };
 
 #define DIM(d, n) { .name = n, .fmt = &perf_hpp__format[d], }
@@ -3513,6 +3514,7 @@ static int __hpp_dimension__add(struct hpp_dimension *hd,
                return -1;
 
        hd->taken = 1;
+       hd->was_taken = 1;
        perf_hpp_list__register_sort_field(list, fmt);
        return 0;
 }
@@ -3547,10 +3549,15 @@ static int __hpp_dimension__add_output(struct perf_hpp_list *list,
        return 0;
 }
 
-int hpp_dimension__add_output(unsigned col)
+int hpp_dimension__add_output(unsigned col, bool implicit)
 {
+       struct hpp_dimension *hd;
+
        BUG_ON(col >= PERF_HPP__MAX_INDEX);
-       return __hpp_dimension__add_output(&perf_hpp_list, &hpp_sort_dimensions[col]);
+       hd = &hpp_sort_dimensions[col];
+       if (implicit && !hd->was_taken)
+               return 0;
+       return __hpp_dimension__add_output(&perf_hpp_list, hd);
 }
 
 int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
@@ -3809,10 +3816,24 @@ static char *setup_overhead(char *keys)
        if (sort__mode == SORT_MODE__DIFF)
                return keys;
 
-       keys = prefix_if_not_in("overhead", keys);
-
-       if (symbol_conf.cumulate_callchain)
-               keys = prefix_if_not_in("overhead_children", keys);
+       if (symbol_conf.prefer_latency) {
+               keys = prefix_if_not_in("overhead", keys);
+               keys = prefix_if_not_in("latency", keys);
+               if (symbol_conf.cumulate_callchain) {
+                       keys = prefix_if_not_in("overhead_children", keys);
+                       keys = prefix_if_not_in("latency_children", keys);
+               }
+       } else if (!keys || (!strstr(keys, "overhead") &&
+                       !strstr(keys, "latency"))) {
+               if (symbol_conf.enable_latency)
+                       keys = prefix_if_not_in("latency", keys);
+               keys = prefix_if_not_in("overhead", keys);
+               if (symbol_conf.cumulate_callchain) {
+                       if (symbol_conf.enable_latency)
+                               keys = prefix_if_not_in("latency_children", keys);
+                       keys = prefix_if_not_in("overhead_children", keys);
+               }
+       }
 
        return keys;
 }
index 11fb15f91409358296e19e28a6f52c2fd13b91ee..180d36a2bea355dab05d0ce0d2178f1dbd37923d 100644 (file)
@@ -141,7 +141,7 @@ int report_parse_ignore_callees_opt(const struct option *opt, const char *arg, i
 
 bool is_strict_order(const char *order);
 
-int hpp_dimension__add_output(unsigned col);
+int hpp_dimension__add_output(unsigned col, bool implicit);
 void reset_dimensions(void);
 int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
                        struct evlist *evlist,
index c5b2e56127e228d80368f3f88c33ed00bb3c2e47..cd9aa82c7d5ad941d74407c35e033010cf1c9cab 100644 (file)
@@ -49,7 +49,9 @@ struct symbol_conf {
                        keep_exited_threads,
                        annotate_data_member,
                        annotate_data_sample,
-                       skip_empty;
+                       skip_empty,
+                       enable_latency,
+                       prefer_latency;
        const char      *vmlinux_name,
                        *kallsyms_name,
                        *source_prefix,