perf/core: Fix perf_event_read()
author Peter Zijlstra <peterz@infradead.org>
Tue, 5 Sep 2017 14:26:44 +0000 (16:26 +0200)
committer Ingo Molnar <mingo@kernel.org>
Fri, 27 Oct 2017 08:31:59 +0000 (10:31 +0200)
perf_event_read() has a number of issues regarding the timekeeping bits.

 - The IPI didn't update group times when it found the event INACTIVE.

 - The direct call would not re-check ->state after taking ctx->lock,
   which can result in ->count and timestamps getting out of sync
   (the re-check-and-retry fix is sketched after the patch below).

We can also make use of the ordering introduced for perf_event_stop()
to make reads of ACTIVE events more accurate, as sketched below.
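
The ACTIVE side of this relies on a barrier pair: event_sched_in()
writes ->oncpu, issues smp_wmb(), and only then marks the event ACTIVE,
while perf_event_read() issues smp_rmb() between loading ->state and
loading ->oncpu. The following is only a rough userspace analogue of
that pairing, using C11 fences and made-up names (none of this is
kernel code):

  /*
   * Minimal userspace C11 analogue of the event_sched_in() /
   * perf_event_read() barrier pairing. All names here are
   * illustrative, not the kernel's.
   *
   * Build: cc -std=c11 -pthread pairing.c
   */
  #include <stdatomic.h>
  #include <pthread.h>
  #include <stdio.h>

  enum { STATE_INACTIVE, STATE_ACTIVE };

  static _Atomic int state = STATE_INACTIVE;
  static _Atomic int oncpu = -1;

  /* Publisher side: mirrors event_sched_in(). */
  static void *publisher(void *arg)
  {
          (void)arg;
          atomic_store_explicit(&oncpu, 3, memory_order_relaxed);
          /* Stands in for smp_wmb(): order the oncpu write before
           * ACTIVE becomes visible. */
          atomic_thread_fence(memory_order_release);
          atomic_store_explicit(&state, STATE_ACTIVE, memory_order_relaxed);
          return NULL;
  }

  /* Reader side: mirrors the fixed perf_event_read(). */
  static void *observer(void *arg)
  {
          (void)arg;
          if (atomic_load_explicit(&state, memory_order_relaxed) == STATE_ACTIVE) {
                  /* Stands in for smp_rmb(): having seen ACTIVE, the
                   * oncpu load below cannot return the pre-publication
                   * value (-1 here). */
                  atomic_thread_fence(memory_order_acquire);
                  printf("oncpu = %d\n",
                         atomic_load_explicit(&oncpu, memory_order_relaxed));
          }
          return NULL;
  }

  int main(void)
  {
          pthread_t p, o;

          pthread_create(&p, NULL, publisher, NULL);
          pthread_create(&o, NULL, observer, NULL);
          pthread_join(p, NULL);
          pthread_join(o, NULL);
          return 0;
  }

With both fences in place, an observer that sees STATE_ACTIVE cannot
read the stale oncpu value; drop either fence and that guarantee
disappears.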

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
kernel/events/core.c

index 205a4321f6787b5315017dc1ab0af5839759ba6e..6f74f9c35490beccd46f1b3bb5a5d5638ab97939 100644
@@ -2081,8 +2081,9 @@ event_sched_in(struct perf_event *event,
 
        WRITE_ONCE(event->oncpu, smp_processor_id());
        /*
-        * Order event::oncpu write to happen before the ACTIVE state
-        * is visible.
+        * Order event::oncpu write to happen before the ACTIVE state is
+        * visible. This allows perf_event_{stop,read}() to observe the correct
+        * ->oncpu if it sees ACTIVE.
         */
        smp_wmb();
        WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
@@ -3638,12 +3639,16 @@ static void __perf_event_read(void *info)
                return;
 
        raw_spin_lock(&ctx->lock);
-       if (ctx->is_active) {
+       if (ctx->is_active & EVENT_TIME) {
                update_context_time(ctx);
                update_cgrp_time_from_event(event);
        }
 
-       update_event_times(event);
+       if (!data->group)
+               update_event_times(event);
+       else
+               update_group_times(event);
+
        if (event->state != PERF_EVENT_STATE_ACTIVE)
                goto unlock;
 
@@ -3658,7 +3663,6 @@ static void __perf_event_read(void *info)
        pmu->read(event);
 
        list_for_each_entry(sub, &event->sibling_list, group_entry) {
-               update_event_times(sub);
                if (sub->state == PERF_EVENT_STATE_ACTIVE) {
                        /*
                         * Use sibling's PMU rather than @event's since
@@ -3748,23 +3752,35 @@ out:
 
 static int perf_event_read(struct perf_event *event, bool group)
 {
+       enum perf_event_state state = READ_ONCE(event->state);
        int event_cpu, ret = 0;
 
        /*
         * If event is enabled and currently active on a CPU, update the
         * value in the event structure:
         */
-       if (event->state == PERF_EVENT_STATE_ACTIVE) {
-               struct perf_read_data data = {
-                       .event = event,
-                       .group = group,
-                       .ret = 0,
-               };
+again:
+       if (state == PERF_EVENT_STATE_ACTIVE) {
+               struct perf_read_data data;
+
+               /*
+                * Orders the ->state and ->oncpu loads such that if we see
+                * ACTIVE we must also see the right ->oncpu.
+                *
+                * Matches the smp_wmb() from event_sched_in().
+                */
+               smp_rmb();
 
                event_cpu = READ_ONCE(event->oncpu);
                if ((unsigned)event_cpu >= nr_cpu_ids)
                        return 0;
 
+               data = (struct perf_read_data){
+                       .event = event,
+                       .group = group,
+                       .ret = 0,
+               };
+
                preempt_disable();
                event_cpu = __perf_event_read_cpu(event, event_cpu);
 
@@ -3781,20 +3797,27 @@ static int perf_event_read(struct perf_event *event, bool group)
                (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
                preempt_enable();
                ret = data.ret;
-       } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
+
+       } else if (state == PERF_EVENT_STATE_INACTIVE) {
                struct perf_event_context *ctx = event->ctx;
                unsigned long flags;
 
                raw_spin_lock_irqsave(&ctx->lock, flags);
+               state = event->state;
+               if (state != PERF_EVENT_STATE_INACTIVE) {
+                       raw_spin_unlock_irqrestore(&ctx->lock, flags);
+                       goto again;
+               }
+
                /*
-                * may read while context is not active
-                * (e.g., thread is blocked), in that case
-                * we cannot update context time
+                * May read while context is not active (e.g., thread is
+                * blocked); in that case we cannot update context time.
                 */
-               if (ctx->is_active) {
+               if (ctx->is_active & EVENT_TIME) {
                        update_context_time(ctx);
                        update_cgrp_time_from_event(event);
                }
+
                if (group)
                        update_group_times(event);
                else
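
For the INACTIVE side, the point of the new re-check is that the
lockless READ_ONCE(event->state) can be stale by the time ctx->lock is
taken; acting on it anyway would pair an old ->count with freshly
updated timestamps. Below is a rough userspace sketch of that
check-again-under-the-lock discipline, with made-up names and a pthread
mutex standing in for ctx->lock (again, not kernel code):

  /*
   * Sketch of the "re-check ->state under the lock, retry on
   * change" pattern from the INACTIVE path above. Illustrative
   * names only; nothing here is the kernel's API.
   *
   * Build: cc -std=c11 -pthread recheck.c
   */
  #include <stdatomic.h>
  #include <pthread.h>
  #include <stdio.h>

  enum { STATE_INACTIVE, STATE_ACTIVE };

  static _Atomic int state = STATE_INACTIVE;
  static pthread_mutex_t ctx_lock = PTHREAD_MUTEX_INITIALIZER;

  static void read_inactive(void)
  {
          /* The real code updates times and reads ->count here. */
          printf("read while INACTIVE\n");
  }

  static void read_active(void)
  {
          /* The real code IPIs the CPU owning the event here. */
          printf("read while ACTIVE\n");
  }

  static void event_read(void)
  {
          int s = atomic_load_explicit(&state, memory_order_relaxed);
  again:
          if (s == STATE_ACTIVE) {
                  read_active();
          } else if (s == STATE_INACTIVE) {
                  pthread_mutex_lock(&ctx_lock);
                  /* The lockless load above may be stale: the event can
                   * have become ACTIVE meanwhile. Re-check now that the
                   * lock excludes concurrent state changes, and retry
                   * with the fresh value if it moved. */
                  s = atomic_load_explicit(&state, memory_order_relaxed);
                  if (s != STATE_INACTIVE) {
                          pthread_mutex_unlock(&ctx_lock);
                          goto again;
                  }
                  read_inactive();
                  pthread_mutex_unlock(&ctx_lock);
          }
  }

  int main(void)
  {
          event_read();
          return 0;
  }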