sched/psi: Fix periodic aggregation shut off
authorChengming Zhou <zhouchengming@bytedance.com>
Thu, 25 Aug 2022 16:41:02 +0000 (00:41 +0800)
committerPeter Zijlstra <peterz@infradead.org>
Fri, 9 Sep 2022 09:08:30 +0000 (11:08 +0200)
We don't want to wake periodic aggregation work back up if the
task change is the aggregation worker itself going to sleep, or
we'll ping-pong forever.

Previously, we would use psi_task_change() in psi_dequeue() when
task going to sleep, so this check was put in psi_task_change().

But commit 4117cebf1a9f ("psi: Optimize task switch inside shared cgroups")
defer task sleep handling to psi_task_switch(), won't go through
psi_task_change() anymore.

So this patch move this check to psi_task_switch().

Fixes: 4117cebf1a9f ("psi: Optimize task switch inside shared cgroups")
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lore.kernel.org/r/20220825164111.29534-2-zhouchengming@bytedance.com
kernel/sched/psi.c

index ecb4b4ff4ce0aba2e4f85c3eaba0c1e1a3f37d43..39463dcc16bb61a5cf08bd6080810ffdc96c27a6 100644 (file)
@@ -796,7 +796,6 @@ void psi_task_change(struct task_struct *task, int clear, int set)
 {
        int cpu = task_cpu(task);
        struct psi_group *group;
-       bool wake_clock = true;
        void *iter = NULL;
        u64 now;
 
@@ -806,19 +805,9 @@ void psi_task_change(struct task_struct *task, int clear, int set)
        psi_flags_change(task, clear, set);
 
        now = cpu_clock(cpu);
-       /*
-        * Periodic aggregation shuts off if there is a period of no
-        * task changes, so we wake it back up if necessary. However,
-        * don't do this if the task change is the aggregation worker
-        * itself going to sleep, or we'll ping-pong forever.
-        */
-       if (unlikely((clear & TSK_RUNNING) &&
-                    (task->flags & PF_WQ_WORKER) &&
-                    wq_worker_last_func(task) == psi_avgs_work))
-               wake_clock = false;
 
        while ((group = iterate_groups(task, &iter)))
-               psi_group_change(group, cpu, clear, set, now, wake_clock);
+               psi_group_change(group, cpu, clear, set, now, true);
 }
 
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
@@ -854,6 +843,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 
        if (prev->pid) {
                int clear = TSK_ONCPU, set = 0;
+               bool wake_clock = true;
 
                /*
                 * When we're going to sleep, psi_dequeue() lets us
@@ -867,13 +857,23 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
                                clear |= TSK_MEMSTALL_RUNNING;
                        if (prev->in_iowait)
                                set |= TSK_IOWAIT;
+
+                       /*
+                        * Periodic aggregation shuts off if there is a period of no
+                        * task changes, so we wake it back up if necessary. However,
+                        * don't do this if the task change is the aggregation worker
+                        * itself going to sleep, or we'll ping-pong forever.
+                        */
+                       if (unlikely((prev->flags & PF_WQ_WORKER) &&
+                                    wq_worker_last_func(prev) == psi_avgs_work))
+                               wake_clock = false;
                }
 
                psi_flags_change(prev, clear, set);
 
                iter = NULL;
                while ((group = iterate_groups(prev, &iter)) && group != common)
-                       psi_group_change(group, cpu, clear, set, now, true);
+                       psi_group_change(group, cpu, clear, set, now, wake_clock);
 
                /*
                 * TSK_ONCPU is handled up to the common ancestor. If we're tasked
@@ -882,7 +882,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
                if (sleep) {
                        clear &= ~TSK_ONCPU;
                        for (; group; group = iterate_groups(prev, &iter))
-                               psi_group_change(group, cpu, clear, set, now, true);
+                               psi_group_change(group, cpu, clear, set, now, wake_clock);
                }
        }
 }