timers/nohz: Protect idle/iowait sleep time under seqcount
authorFrederic Weisbecker <frederic@kernel.org>
Wed, 22 Feb 2023 14:46:44 +0000 (15:46 +0100)
committerThomas Gleixner <tglx@linutronix.de>
Tue, 18 Apr 2023 14:35:12 +0000 (16:35 +0200)
Reading idle/IO sleep time (e.g. from /proc/stat) can race with idle exit
updates because the state machine handling the stats is not atomic and
requires a coherent read snapshot.

As a result reading the sleep time may report irrelevant or backward
values.

Fix this by protecting the simple state machine within a seqcount.
This is expected to be cheap enough not to add measurable performance
impact on the idle path.

Note this only partially fixes the reader vs. writer race condition. A
race remains that involves remote updates of the CPU iowait task counter.
It can hardly be fixed.

Reported-by: Yu Liao <liaoyu15@huawei.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230222144649.624380-4-frederic@kernel.org
kernel/time/tick-sched.c
kernel/time/tick-sched.h

index 9058b9eb8bc1f5a2f2c7bd6da80ff065d567dd20..90d9b7b29875f4c592db9f5ee8e67ecc2400fa30 100644 (file)
@@ -646,6 +646,7 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
 
        delta = ktime_sub(now, ts->idle_entrytime);
 
+       write_seqcount_begin(&ts->idle_sleeptime_seq);
        if (nr_iowait_cpu(smp_processor_id()) > 0)
                ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
        else
@@ -653,14 +654,18 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
 
        ts->idle_entrytime = now;
        ts->idle_active = 0;
+       write_seqcount_end(&ts->idle_sleeptime_seq);
 
        sched_clock_idle_wakeup_event();
 }
 
 static void tick_nohz_start_idle(struct tick_sched *ts)
 {
+       write_seqcount_begin(&ts->idle_sleeptime_seq);
        ts->idle_entrytime = ktime_get();
        ts->idle_active = 1;
+       write_seqcount_end(&ts->idle_sleeptime_seq);
+
        sched_clock_idle_sleep_event();
 }
 
@@ -668,6 +673,7 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
                                 bool compute_delta, u64 *last_update_time)
 {
        ktime_t now, idle;
+       unsigned int seq;
 
        if (!tick_nohz_active)
                return -1;
@@ -676,13 +682,17 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
        if (last_update_time)
                *last_update_time = ktime_to_us(now);
 
-       if (ts->idle_active && compute_delta) {
-               ktime_t delta = ktime_sub(now, ts->idle_entrytime);
+       do {
+               seq = read_seqcount_begin(&ts->idle_sleeptime_seq);
 
-               idle = ktime_add(*sleeptime, delta);
-       } else {
-               idle = *sleeptime;
-       }
+               if (ts->idle_active && compute_delta) {
+                       ktime_t delta = ktime_sub(now, ts->idle_entrytime);
+
+                       idle = ktime_add(*sleeptime, delta);
+               } else {
+                       idle = *sleeptime;
+               }
+       } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq));
 
        return ktime_to_us(idle);
 
index c6663254d17d862a1ac2121b268969c98eb80d1d..5ed5a9d41d5a7a9489f954635b64a5bf04c0cf6e 100644 (file)
@@ -75,6 +75,7 @@ struct tick_sched {
        ktime_t                         idle_waketime;
 
        /* Idle entry */
+       seqcount_t                      idle_sleeptime_seq;
        ktime_t                         idle_entrytime;
 
        /* Tick stop */