perf_counter: Fix/complete ftrace event records sampling
author Frederic Weisbecker <fweisbec@gmail.com>
Thu, 6 Aug 2009 23:25:54 +0000 (01:25 +0200)
committer Ingo Molnar <mingo@elte.hu>
Sun, 9 Aug 2009 10:53:48 +0000 (12:53 +0200)
This patch implements the kernel side support for ftrace event
record sampling.

A new counter sampling attribute is added:

   PERF_SAMPLE_TP_RECORD

which requests ftrace event record sampling. In this case,
if a PERF_TYPE_TRACEPOINT counter is active and a tracepoint
fires, we emit the tracepoint's binary record to the
perf counter event buffer as a sample.

Result, after setting PERF_SAMPLE_TP_RECORD attribute from perf
record:

 perf record -f -F 1 -a -e workqueue:workqueue_execution
 perf report -D

 0x21e18 [0x48]: event: 9
 .
 . ... raw event: size 72 bytes
 .  0000:  09 00 00 00 01 00 48 00 d0 c7 00 81 ff ff ff ff  ......H........
 .  0010:  0a 00 00 00 0a 00 00 00 21 00 00 00 00 00 00 00  ........!......
 .  0020:  2b 00 01 02 0a 00 00 00 0a 00 00 00 65 76 65 6e  +...........eve
 .  0030:  74 73 2f 31 00 00 00 00 00 00 00 00 0a 00 00 00  ts/1...........
 .  0040:  e0 b1 31 81 ff ff ff ff                          .......
.
0x21e18 [0x48]: PERF_EVENT_SAMPLE (IP, 1): 10: 0xffffffff8100c7d0 period: 33

The raw ftrace binary record starts at offset 0020.

Translation:

 struct trace_entry {
        type = 0x2b = 43;
        flags = 1;
        preempt_count = 2;
        pid = 0xa = 10;
        tgid = 0xa = 10;
 }

 thread_comm = "events/1"
 thread_pid  = 0xa = 10;
 func     = 0xffffffff8131b1e0 = flush_to_ldisc()

What will come next?

 - Userspace support ('perf trace'), 'flight data recorder' mode
   for perf trace, etc.

 - The unconditional copy in the profiling callback has a cost
   even when nobody has requested this kind of sampling, and
   needs to be fixed in the future. For that we need instant
   access to the perf counter attribute, which is a matter of
   adding a flag to struct ftrace_event.

 - Take care of event recursion! Don't ever try to record a
   lock event, for example: it seems some locking is used in
   the profiling fast path, which leads to tracing recursion.
   That will be fixed using raw spinlocks or recursion
   protection.

 - [...]

 - Profit! :-)

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
include/linux/ftrace_event.h
include/linux/perf_counter.h
include/trace/ftrace.h
kernel/perf_counter.c
kernel/trace/trace.c
kernel/trace/trace.h
tools/perf/builtin-record.c

index d7cd193c2277a1191e16f5d5fcd93d5ee9a2b6d5..a81170de7f6bf4907faf644495831aad0c8fc30a 100644 (file)
@@ -89,7 +89,9 @@ enum print_line_t {
        TRACE_TYPE_NO_CONSUME   = 3     /* Handled but ask to not consume */
 };
 
-
+void tracing_generic_entry_update(struct trace_entry *entry,
+                                 unsigned long flags,
+                                 int pc);
 struct ring_buffer_event *
 trace_current_buffer_lock_reserve(int type, unsigned long len,
                                  unsigned long flags, int pc);
index e604e6ef72dd5af13b6265d688451cef47fd56fa..a67dd5c5b6d3b72db134d4d81ed9a0c0aa6d9be9 100644 (file)
@@ -121,8 +121,9 @@ enum perf_counter_sample_format {
        PERF_SAMPLE_CPU                         = 1U << 7,
        PERF_SAMPLE_PERIOD                      = 1U << 8,
        PERF_SAMPLE_STREAM_ID                   = 1U << 9,
+       PERF_SAMPLE_TP_RECORD                   = 1U << 10,
 
-       PERF_SAMPLE_MAX = 1U << 10,             /* non-ABI */
+       PERF_SAMPLE_MAX = 1U << 11,             /* non-ABI */
 };
 
 /*
@@ -413,6 +414,11 @@ struct perf_callchain_entry {
        __u64                           ip[PERF_MAX_STACK_DEPTH];
 };
 
+struct perf_tracepoint_record {
+       int                             size;
+       char                            *record;
+};
+
 struct task_struct;
 
 /**
@@ -681,6 +687,7 @@ struct perf_sample_data {
        struct pt_regs                  *regs;
        u64                             addr;
        u64                             period;
+       void                            *private;
 };
 
 extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
index fec71f8dbc4804ff3ec9d467fcc95bd8077a9e88..7fb16d90e7b1334ba431e0f7952c78931a58f567 100644 (file)
@@ -353,15 +353,7 @@ static inline int ftrace_get_offsets_##call(                               \
 /*
  * Generate the functions needed for tracepoint perf_counter support.
  *
- * static void ftrace_profile_<call>(proto)
- * {
- *     extern void perf_tpcounter_event(int, u64, u64);
- *     u64 __addr = 0, __count = 1;
- *
- *     <assign>   <-- here we expand the TP_perf_assign() macro
- *
- *     perf_tpcounter_event(event_<call>.id, __addr, __count);
- * }
+ * NOTE: The insertion profile callback (ftrace_profile_<call>) is defined later
  *
  * static int ftrace_profile_enable_<call>(struct ftrace_event_call *event_call)
  * {
@@ -381,28 +373,10 @@ static inline int ftrace_get_offsets_##call(                              \
  *
  */
 
-#undef TP_fast_assign
-#define TP_fast_assign(args...)
-
-#undef TP_perf_assign
-#define TP_perf_assign(args...) args
-
-#undef __perf_addr
-#define __perf_addr(a) __addr = (a)
-
-#undef __perf_count
-#define __perf_count(c) __count = (c)
-
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, assign, print)         \
                                                                        \
-static void ftrace_profile_##call(proto)                               \
-{                                                                      \
-       extern void perf_tpcounter_event(int, u64, u64);                \
-       u64 __addr = 0, __count = 1;                                    \
-       { assign; }                                                     \
-       perf_tpcounter_event(event_##call.id, __addr, __count);         \
-}                                                                      \
+static void ftrace_profile_##call(proto);                              \
                                                                        \
 static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \
 {                                                                      \
@@ -422,12 +396,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
-#undef TP_fast_assign
-#define TP_fast_assign(args...) args
-
-#undef TP_perf_assign
-#define TP_perf_assign(args...)
-
 #endif
 
 /*
@@ -647,5 +615,99 @@ __attribute__((section("_ftrace_events"))) event_##call = {                \
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
+/*
+ * Define the insertion callback to profile events
+ *
+ * The job is very similar to ftrace_raw_event_<call> except that we don't
+ * insert in the ring buffer but in a perf counter.
+ *
+ * static void ftrace_profile_<call>(proto)
+ * {
+ *     struct ftrace_data_offsets_<call> __maybe_unused __data_offsets;
+ *     struct ftrace_event_call *event_call = &event_<call>;
+ *     extern void perf_tpcounter_event(int, u64, u64, void *, int);
+ *     struct ftrace_raw_##call *entry;
+ *     u64 __addr = 0, __count = 1;
+ *     unsigned long irq_flags;
+ *     int __entry_size;
+ *     int __data_size;
+ *     int pc;
+ *
+ *     local_save_flags(irq_flags);
+ *     pc = preempt_count();
+ *
+ *     __data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
+ *     __entry_size = ALIGN(__data_size + sizeof(*entry), sizeof(u64));
+ *
+ *     do {
+ *             char raw_data[__entry_size]; <- allocate our sample on the stack
+ *             struct trace_entry *ent;
+ *
+ *             entry = (struct ftrace_raw_<call> *)raw_data;
+ *             ent = &entry->ent;
+ *             tracing_generic_entry_update(ent, irq_flags, pc);
+ *             ent->type = event_call->id;
+ *
+ *             <tstruct> <- do some jobs with dynamic arrays
+ *
+ *             <assign>  <- assign our values
+ *
+ *             perf_tpcounter_event(event_call->id, __addr, __count, entry,
+ *                          __entry_size);  <- submit them to perf counter
+ *     } while (0);
+ *
+ * }
+ */
+
+#ifdef CONFIG_EVENT_PROFILE
+
+#undef __perf_addr
+#define __perf_addr(a) __addr = (a)
+
+#undef __perf_count
+#define __perf_count(c) __count = (c)
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)         \
+static void ftrace_profile_##call(proto)                               \
+{                                                                      \
+       struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
+       struct ftrace_event_call *event_call = &event_##call;           \
+       extern void perf_tpcounter_event(int, u64, u64, void *, int);   \
+       struct ftrace_raw_##call *entry;                                \
+       u64 __addr = 0, __count = 1;                                    \
+       unsigned long irq_flags;                                        \
+       int __entry_size;                                               \
+       int __data_size;                                                \
+       int pc;                                                         \
+                                                                       \
+       local_save_flags(irq_flags);                                    \
+       pc = preempt_count();                                           \
+                                                                       \
+       __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
+       __entry_size = ALIGN(__data_size + sizeof(*entry), sizeof(u64));\
+                                                                       \
+       do {                                                            \
+               char raw_data[__entry_size];                            \
+               struct trace_entry *ent;                                \
+                                                                       \
+               entry = (struct ftrace_raw_##call *)raw_data;           \
+               ent = &entry->ent;                                      \
+               tracing_generic_entry_update(ent, irq_flags, pc);       \
+               ent->type = event_call->id;                             \
+                                                                       \
+               tstruct                                                 \
+                                                                       \
+               { assign; }                                             \
+                                                                       \
+               perf_tpcounter_event(event_call->id, __addr, __count, entry,\
+                            __entry_size);                             \
+       } while (0);                                                    \
+                                                                       \
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+#endif /* CONFIG_EVENT_PROFILE */
+
 #undef _TRACE_PROFILE_INIT
 
index 52eb4b68d34f9c1fbaa45dbb1df060ef21e89407..868102172aa4da45b29d543c833ab5b9ab2475a3 100644 (file)
@@ -2646,6 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
                u64 counter;
        } group_entry;
        struct perf_callchain_entry *callchain = NULL;
+       struct perf_tracepoint_record *tp;
        int callchain_size = 0;
        u64 time;
        struct {
@@ -2714,6 +2715,11 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
                        header.size += sizeof(u64);
        }
 
+       if (sample_type & PERF_SAMPLE_TP_RECORD) {
+               tp = data->private;
+               header.size += tp->size;
+       }
+
        ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
        if (ret)
                return;
@@ -2777,6 +2783,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
                }
        }
 
+       if (sample_type & PERF_SAMPLE_TP_RECORD)
+               perf_output_copy(&handle, tp->record, tp->size);
+
        perf_output_end(&handle);
 }
 
@@ -3703,11 +3712,18 @@ static const struct pmu perf_ops_task_clock = {
 };
 
 #ifdef CONFIG_EVENT_PROFILE
-void perf_tpcounter_event(int event_id, u64 addr, u64 count)
+void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
+                         int entry_size)
 {
+       struct perf_tracepoint_record tp = {
+               .size = entry_size,
+               .record = record,
+       };
+
        struct perf_sample_data data = {
                .regs = get_irq_regs(),
                .addr = addr,
+               .private = &tp,
        };
 
        if (!data.regs)
index 8930e39b9d8ca4780b57b49638c13d1bc97dcd86..c22b40f8f576c19e7c00dec5e8f1983667e7b084 100644 (file)
@@ -848,6 +848,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
                ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
                (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
 }
+EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
 
 struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
                                                    int type,
index 3548ae5cc7801e131621dff70037f26ba8d7ee01..8b9f4f6e9559a6d152e874f24f75dae235638fc9 100644 (file)
@@ -438,10 +438,6 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
 struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
                                          int *ent_cpu, u64 *ent_ts);
 
-void tracing_generic_entry_update(struct trace_entry *entry,
-                                 unsigned long flags,
-                                 int pc);
-
 void default_wait_pipe(struct trace_iterator *iter);
 void poll_wait_pipe(struct trace_iterator *iter);
 
index 6da09928130f8b726446e21e55b4a114a02926dd..90c98082af106da85261a8995224386fcd228c4d 100644 (file)
@@ -412,6 +412,7 @@ static void create_counter(int counter, int cpu, pid_t pid)
        if (call_graph)
                attr->sample_type       |= PERF_SAMPLE_CALLCHAIN;
 
+
        attr->mmap              = track;
        attr->comm              = track;
        attr->inherit           = (cpu < 0) && inherit;