sched_ext: idle: Honor idle flags in the built-in idle selection policy
author Andrea Righi <arighi@nvidia.com>
Fri, 14 Mar 2025 09:45:33 +0000 (10:45 +0100)
committer Tejun Heo <tj@kernel.org>
Fri, 14 Mar 2025 18:17:01 +0000 (08:17 -1000)
Enable passing idle flags (%SCX_PICK_IDLE_*) to scx_select_cpu_dfl(),
to enforce strict selection criteria, such as selecting an idle CPU
strictly within @prev_cpu's node or choosing only a fully idle SMT core.

This functionality will be exposed through a dedicated kfunc in a
separate patch.
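
For illustration, a sketch of the widened in-kernel interface (the
surrounding variables are assumed to be in scope as in ops.select_cpu();
BPF schedulers will only reach this through the dedicated kfunc from the
separate patch):

  bool found;
  s32 cpu;

  /*
   * Pick an idle CPU for @p, but never outside @prev_cpu's node;
   * if none is available, @prev_cpu is returned and *found stays
   * false.
   */
  cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags,
                           SCX_PICK_IDLE_IN_NODE, &found);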

Signed-off-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
kernel/sched/ext.c
kernel/sched/ext_idle.c
kernel/sched/ext_idle.h

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index db5bc4d57dba4ddb45f281d333e178258a00e0ba..1756fbb8a668f5dcab83907801d054c4af6f3fca 100644 (file)
@@ -3396,7 +3396,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
                bool found;
                s32 cpu;
 
-               cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found);
+               cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0, &found);
                p->scx.selected_cpu = cpu;
                if (found) {
                        p->scx.slice = SCX_SLICE_DFL;
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 15e9d1c8b2815381b99bc84e5ecb51a269501012..16981456ec1edd7c23f3f08986a793583b97430b 100644 (file)
@@ -418,7 +418,7 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
  * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
  * we never call ops.select_cpu() for them, see select_task_rq().
  */
-s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *found)
+s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags, bool *found)
 {
        const struct cpumask *llc_cpus = NULL;
        const struct cpumask *numa_cpus = NULL;
@@ -455,12 +455,13 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool
         * If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
         */
        if (wake_flags & SCX_WAKE_SYNC) {
-               cpu = smp_processor_id();
+               int waker_node;
 
                /*
                 * If the waker's CPU is cache affine and prev_cpu is idle,
                 * then avoid a migration.
                 */
+               cpu = smp_processor_id();
                if (cpus_share_cache(cpu, prev_cpu) &&
                    scx_idle_test_and_clear_cpu(prev_cpu)) {
                        cpu = prev_cpu;
@@ -480,9 +481,11 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool
                 * piled up on it even if there is an idle core elsewhere on
                 * the system.
                 */
+               waker_node = cpu_to_node(cpu);
                if (!(current->flags & PF_EXITING) &&
                    cpu_rq(cpu)->scx.local_dsq.nr == 0 &&
-                   !cpumask_empty(idle_cpumask(cpu_to_node(cpu))->cpu)) {
+                   (!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) &&
+                   !cpumask_empty(idle_cpumask(waker_node)->cpu)) {
                        if (cpumask_test_cpu(cpu, p->cpus_ptr))
                                goto cpu_found;
                }
@@ -521,15 +524,25 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool
                }
 
                /*
-                * Search for any full idle core usable by the task.
+                * Search for any full-idle core usable by the task.
                 *
-                * If NUMA aware idle selection is enabled, the search will
+                * If the node-aware idle CPU selection policy is enabled
+                * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always
                 * begin in prev_cpu's node and proceed to other nodes in
                 * order of increasing distance.
                 */
-               cpu = scx_pick_idle_cpu(p->cpus_ptr, node, SCX_PICK_IDLE_CORE);
+               cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags | SCX_PICK_IDLE_CORE);
                if (cpu >= 0)
                        goto cpu_found;
+
+               /*
+                * Give up if we're strictly looking for a full-idle SMT
+                * core.
+                */
+               if (flags & SCX_PICK_IDLE_CORE) {
+                       cpu = prev_cpu;
+                       goto out_unlock;
+               }
        }
 
        /*
@@ -560,18 +573,24 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool
 
        /*
         * Search for any idle CPU usable by the task.
+        *
+        * If the node-aware idle CPU selection policy is enabled
+        * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always begin
+        * in prev_cpu's node and proceed to other nodes in order of
+        * increasing distance.
         */
-       cpu = scx_pick_idle_cpu(p->cpus_ptr, node, 0);
+       cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags);
        if (cpu >= 0)
                goto cpu_found;
 
-       rcu_read_unlock();
-       return prev_cpu;
+       cpu = prev_cpu;
+       goto out_unlock;
 
 cpu_found:
+       *found = true;
+out_unlock:
        rcu_read_unlock();
 
-       *found = true;
        return cpu;
 }
 
@@ -810,7 +829,7 @@ __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
                goto prev_cpu;
 
 #ifdef CONFIG_SMP
-       return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle);
+       return scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0, is_idle);
 #endif
 
 prev_cpu:
diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h
index 68c4307ce4f6f2fae0d9b88295649e35abfa5ee3..5c1db6b315f7ab0a96fb424958c0470d1d2ef06c 100644 (file)
@@ -27,7 +27,7 @@ static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node
 }
 #endif /* CONFIG_SMP */
 
-s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *found);
+s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags, bool *found);
 void scx_idle_enable(struct sched_ext_ops *ops);
 void scx_idle_disable(void);
 int scx_idle_init(void);
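
The strict give-up path also lets a caller layer its own fallback on top
of the built-in policy, since a strict pass that fails now bails out and
reports back through *found instead of silently settling for a weaker
CPU within the same call. A minimal sketch (the two-pass policy is an
illustration, not part of this patch):

  bool found;
  s32 cpu;

  /* First pass: insist on a fully idle SMT core. */
  cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags,
                           SCX_PICK_IDLE_CORE, &found);
  if (!found)
          /* Second pass: settle for any idle CPU. */
          cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0, &found);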