sched/tracing: Don't re-read p->state when emitting sched_switch event
authorValentin Schneider <valentin.schneider@arm.com>
Thu, 20 Jan 2022 16:25:19 +0000 (16:25 +0000)
committerPeter Zijlstra <peterz@infradead.org>
Tue, 1 Mar 2022 15:18:39 +0000 (16:18 +0100)
As of commit

  c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")

the following sequence becomes possible:

      p->__state = TASK_INTERRUPTIBLE;
      __schedule()
deactivate_task(p);
  ttwu()
    READ !p->on_rq
    p->__state=TASK_WAKING
trace_sched_switch()
  __trace_sched_switch_state()
    task_state_index()
      return 0;

TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in
the trace event.

Prevent this by pushing the value read from __schedule() down the trace
event.

Reported-by: Abhijeet Dharmapurikar <adharmap@quicinc.com>
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lore.kernel.org/r/20220120162520.570782-2-valentin.schneider@arm.com
include/linux/sched.h
include/trace/events/sched.h
kernel/sched/core.c
kernel/trace/fgraph.c
kernel/trace/ftrace.c
kernel/trace/trace_events.c
kernel/trace/trace_osnoise.c
kernel/trace/trace_sched_switch.c
kernel/trace/trace_sched_wakeup.c

index f00132e7ef6eda42b26adf84f58d76252f4cb6ef..457c8a058b7701d236fe451ff185ae78b367276f 100644 (file)
@@ -1620,10 +1620,10 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
 #define TASK_REPORT_IDLE       (TASK_REPORT + 1)
 #define TASK_REPORT_MAX                (TASK_REPORT_IDLE << 1)
 
-static inline unsigned int task_state_index(struct task_struct *tsk)
+static inline unsigned int __task_state_index(unsigned int tsk_state,
+                                             unsigned int tsk_exit_state)
 {
-       unsigned int tsk_state = READ_ONCE(tsk->__state);
-       unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT;
+       unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT;
 
        BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX);
 
@@ -1633,6 +1633,11 @@ static inline unsigned int task_state_index(struct task_struct *tsk)
        return fls(state);
 }
 
+static inline unsigned int task_state_index(struct task_struct *tsk)
+{
+       return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state);
+}
+
 static inline char task_index_to_char(unsigned int state)
 {
        static const char state_char[] = "RSDTtXZPI";
index 94640482cfe7a3a988f6afe9082dc35fabc60d70..65e7867563214270ec80bdbad1da5d8a824a4f79 100644 (file)
@@ -187,7 +187,9 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
             TP_ARGS(p));
 
 #ifdef CREATE_TRACE_POINTS
-static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p)
+static inline long __trace_sched_switch_state(bool preempt,
+                                             unsigned int prev_state,
+                                             struct task_struct *p)
 {
        unsigned int state;
 
@@ -208,7 +210,7 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *
         * it for left shift operation to get the correct task->state
         * mapping.
         */
-       state = task_state_index(p);
+       state = __task_state_index(prev_state, p->exit_state);
 
        return state ? (1 << (state - 1)) : state;
 }
@@ -220,10 +222,11 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *
 TRACE_EVENT(sched_switch,
 
        TP_PROTO(bool preempt,
+                unsigned int prev_state,
                 struct task_struct *prev,
                 struct task_struct *next),
 
-       TP_ARGS(preempt, prev, next),
+       TP_ARGS(preempt, prev_state, prev, next),
 
        TP_STRUCT__entry(
                __array(        char,   prev_comm,      TASK_COMM_LEN   )
@@ -239,7 +242,7 @@ TRACE_EVENT(sched_switch,
                memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
                __entry->prev_pid       = prev->pid;
                __entry->prev_prio      = prev->prio;
-               __entry->prev_state     = __trace_sched_switch_state(preempt, prev);
+               __entry->prev_state     = __trace_sched_switch_state(preempt, prev_state, prev);
                memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
                __entry->next_pid       = next->pid;
                __entry->next_prio      = next->prio;
index ef946123e9af42f834c8a20c8c2bc9f6504ace24..3aafc15da24a821c4caaa955a9281ede945951b0 100644 (file)
@@ -4836,7 +4836,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 {
        struct rq *rq = this_rq();
        struct mm_struct *mm = rq->prev_mm;
-       long prev_state;
+       unsigned int prev_state;
 
        /*
         * The previous task will have left us with a preempt_count of 2
@@ -6300,7 +6300,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
                migrate_disable_switch(rq, prev);
                psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
-               trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next);
+               trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next);
 
                /* Also unlocks the rq: */
                rq = context_switch(rq, prev, next, &rf);
index 22061d38fc00fd65661a824099dba1eebfa9fe7d..19028e072cdb6a84d42777449c34f24f77f5bdee 100644 (file)
@@ -415,7 +415,9 @@ free:
 
 static void
 ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
-                       struct task_struct *prev, struct task_struct *next)
+                               unsigned int prev_state,
+                               struct task_struct *prev,
+                               struct task_struct *next)
 {
        unsigned long long timestamp;
        int index;
index f9feb197b2daaf348622665924795a3ae88e4fda..6762ae029fdd2c8232b93050419e9742c1c1d70a 100644 (file)
@@ -7347,7 +7347,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
 
 static void
 ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
-                   struct task_struct *prev, struct task_struct *next)
+                                    unsigned int prev_state,
+                                    struct task_struct *prev,
+                                    struct task_struct *next)
 {
        struct trace_array *tr = data;
        struct trace_pid_list *pid_list;
index 3147614c1812a16293b0abd47dca1f8f00113b49..2a19ea747ff4a9fc7c1909842eddd5cb4cec4add 100644 (file)
@@ -759,7 +759,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)
 
 static void
 event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
-                   struct task_struct *prev, struct task_struct *next)
+                                       unsigned int prev_state,
+                                       struct task_struct *prev,
+                                       struct task_struct *next)
 {
        struct trace_array *tr = data;
        struct trace_pid_list *no_pid_list;
@@ -783,7 +785,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
 
 static void
 event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
-                   struct task_struct *prev, struct task_struct *next)
+                                        unsigned int prev_state,
+                                        struct task_struct *prev,
+                                        struct task_struct *next)
 {
        struct trace_array *tr = data;
        struct trace_pid_list *no_pid_list;
index 870a08da5b489596f3a69e40bad173d2e8124100..1829b4cb8cc17c88b762cb2a1bb1bd60ce69a111 100644 (file)
@@ -1167,7 +1167,9 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
  * used to record the beginning and to report the end of a thread noise window.
  */
 static void
-trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p,
+trace_sched_switch_callback(void *data, bool preempt,
+                           unsigned int prev_state,
+                           struct task_struct *p,
                            struct task_struct *n)
 {
        struct osnoise_variables *osn_var = this_cpu_osn_var();
index e304196d7c2853ae0056c06c78d74819468265ca..993b0ed10d8cf1a4db349319148e3d5417a1f25b 100644 (file)
@@ -22,6 +22,7 @@ static DEFINE_MUTEX(sched_register_mutex);
 
 static void
 probe_sched_switch(void *ignore, bool preempt,
+                  unsigned int prev_state,
                   struct task_struct *prev, struct task_struct *next)
 {
        int flags;
index 2402de520eca71331b324eef1dfe6528de7c055e..46429f9a96fafd176cab549a5bdc49030601d844 100644 (file)
@@ -426,6 +426,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 
 static void notrace
 probe_wakeup_sched_switch(void *ignore, bool preempt,
+                         unsigned int prev_state,
                          struct task_struct *prev, struct task_struct *next)
 {
        struct trace_array_cpu *data;