* %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the
* %SCX_ENQ_LAST flag set.
*
- * If the BPF scheduler wants to continue executing the task,
- * ops.enqueue() should dispatch the task to %SCX_DSQ_LOCAL immediately.
- * If the task gets queued on a different dsq or the BPF side, the BPF
- * scheduler is responsible for triggering a follow-up scheduling event.
- * Otherwise, Execution may stall.
+ * The BPF scheduler is responsible for triggering a follow-up
+ * scheduling event. Otherwise, execution may stall.
*/
SCX_ENQ_LAST = 1LLU << 41,
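As a concrete illustration of the contract above, an ops.enqueue() in a scheduler that sets %SCX_OPS_ENQ_LAST could satisfy the follow-up requirement by keeping the last task local. This is a sketch, not part of the patch; it assumes the usual scx common helpers (BPF_STRUCT_OPS, scx_bpf_dispatch(), SCX_SLICE_DFL), and example_enqueue is a hypothetical name:

void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
{
        if (enq_flags & SCX_ENQ_LAST) {
                /*
                 * @p is the last runnable task on this CPU. Dispatching it
                 * to the local DSQ guarantees the next pick finds it, so no
                 * separate follow-up scheduling event is needed.
                 */
                scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
                return;
        }

        scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}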
if (!scx_rq_online(rq))
goto local;
- if (scx_ops_bypassing()) {
- if (enq_flags & SCX_ENQ_LAST)
- goto local;
- else
- goto global;
- }
+ if (scx_ops_bypassing())
+ goto global;
if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
goto direct;
unlikely(p->flags & PF_EXITING))
goto local;
- /* see %SCX_OPS_ENQ_LAST */
- if (!static_branch_unlikely(&scx_ops_enq_last) &&
- (enq_flags & SCX_ENQ_LAST))
- goto local;
-
if (!SCX_HAS_OP(enqueue))
goto global;
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
bool prev_on_scx = prev->sched_class == &ext_sched_class;
int nr_loops = SCX_DSP_MAX_LOOPS;
- bool has_tasks = false;
lockdep_assert_rq_held(rq);
rq->scx.flags |= SCX_RQ_IN_BALANCE;
/*
* If @prev is runnable & has slice left, it has priority and
* fetching more just increases latency for the fetched tasks.
- * Tell put_prev_task_scx() to put @prev on local_dsq. If the
- * BPF scheduler wants to handle this explicitly, it should
- * implement ->cpu_released().
+ * Tell pick_next_task_scx() to keep running @prev. If the BPF
+ * scheduler wants to handle this explicitly, it should
+ * implement ->cpu_release().
*
* See scx_ops_disable_workfn() for the explanation on the
* bypassing test.
goto has_tasks;
if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing() || !scx_rq_online(rq))
- goto out;
+ goto no_tasks;
dspc->rq = rq;
}
} while (dspc->nr_tasks);
- goto out;
+no_tasks:
+ /*
+ * Didn't find another task to run. Keep running @prev unless
+ * %SCX_OPS_ENQ_LAST is in effect.
+ */
+ if ((prev->scx.flags & SCX_TASK_QUEUED) &&
+ (!static_branch_unlikely(&scx_ops_enq_last) || scx_ops_bypassing())) {
+ if (local)
+ prev->scx.flags |= SCX_TASK_BAL_KEEP;
+ goto has_tasks;
+ }
+ rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
+ return false;
has_tasks:
- has_tasks = true;
-out:
rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
- return has_tasks;
+ return true;
}
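The ->cpu_release() callback referenced in the comment above is the hook for handling the CPU being taken away explicitly. A minimal sketch, assuming the scx_bpf_reenqueue_local() kfunc; example_cpu_release is a hypothetical name, not from this patch:

void BPF_STRUCT_OPS(example_cpu_release, s32 cpu,
                    struct scx_cpu_release_args *args)
{
        /*
         * A higher priority sched_class is taking over this CPU. Re-enqueue
         * the tasks already sitting on the local DSQ so they can be
         * redistributed instead of waiting for this CPU to come back.
         */
        scx_bpf_reenqueue_local();
}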
static int balance_scx(struct rq *rq, struct task_struct *prev,
if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED))
SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true);
- /*
- * If we're being called from put_prev_task_balance(), balance_scx() may
- * have decided that @p should keep running.
- */
- if (p->scx.flags & SCX_TASK_BAL_KEEP) {
+ if (p->scx.flags & SCX_TASK_QUEUED) {
p->scx.flags &= ~SCX_TASK_BAL_KEEP;
- set_task_runnable(rq, p);
- dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
- return;
- }
- if (p->scx.flags & SCX_TASK_QUEUED) {
set_task_runnable(rq, p);
/*
- * If @p has slice left and balance_scx() didn't tag it for
- * keeping, @p is getting preempted by a higher priority
- * scheduler class or core-sched forcing a different task. Leave
- * it at the head of the local DSQ.
+ * If @p has slice left and is being put, @p is getting
+ * preempted by a higher priority scheduler class or core-sched
+ * forcing a different task. Leave it at the head of the local
+ * DSQ.
*/
        if (p->scx.slice && !scx_ops_bypassing()) {
                dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
                return;
        }
/*
- * If we're in the pick_next_task path, balance_scx() should
- * have already populated the local DSQ if there are any other
- * available tasks. If empty, tell ops.enqueue() that @p is the
- * only one available for this cpu. ops.enqueue() should put it
- * on the local DSQ so that the subsequent pick_next_task_scx()
- * can find the task unless it wants to trigger a separate
- * follow-up scheduling event.
+ * If @p is runnable but we're about to enter a lower
+ * sched_class, %SCX_OPS_ENQ_LAST must be set. Tell
+ * ops.enqueue() that @p is the only one available for this cpu,
+ * which should trigger an explicit follow-up scheduling event.
*/
- if (list_empty(&rq->scx.local_dsq.list))
+ if (sched_class_above(&ext_sched_class, next->sched_class)) {
+ WARN_ON_ONCE(!static_branch_unlikely(&scx_ops_enq_last));
do_enqueue_task(rq, p, SCX_ENQ_LAST, -1);
- else
+ } else {
do_enqueue_task(rq, p, 0, -1);
+ }
}
}
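When %SCX_OPS_ENQ_LAST is in effect and the last task is queued somewhere other than the local DSQ, the explicit follow-up scheduling event mentioned in the comment above can be produced by kicking the CPU. A sketch under those assumptions, using the scx_bpf_kick_cpu() and scx_bpf_task_cpu() kfuncs; MY_DSQ stands in for whatever custom DSQ the scheduler created and is hypothetical:

void BPF_STRUCT_OPS(example_enqueue_last, struct task_struct *p, u64 enq_flags)
{
        /* MY_DSQ: the scheduler's own DSQ (hypothetical) */
        scx_bpf_dispatch(p, MY_DSQ, SCX_SLICE_DFL, enq_flags);

        if (enq_flags & SCX_ENQ_LAST) {
                /*
                 * Nothing guarantees this CPU will look at MY_DSQ again on
                 * its own. Kick it to force a scheduling event and avoid
                 * the stall described in the %SCX_ENQ_LAST documentation.
                 */
                scx_bpf_kick_cpu(scx_bpf_task_cpu(p), 0);
        }
}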
{
struct task_struct *p;
- if (prev->sched_class == &ext_sched_class)
- put_prev_task_scx(rq, prev, NULL);
-
- p = first_local_task(rq);
- if (!p)
- return NULL;
-
- if (prev->sched_class != &ext_sched_class)
- prev->sched_class->put_prev_task(rq, prev, p);
-
- set_next_task_scx(rq, p, true);
+ /*
+ * If balance_scx() is telling us to keep running @prev, replenish its
+ * slice if necessary. Otherwise, pop the first task from the local
+ * DSQ.
+ */
+ if (prev->scx.flags & SCX_TASK_BAL_KEEP) {
+ prev->scx.flags &= ~SCX_TASK_BAL_KEEP;
+ p = prev;
+ if (!p->scx.slice)
+ p->scx.slice = SCX_SLICE_DFL;
+ } else {
+ p = first_local_task(rq);
+ if (!p)
+ return NULL;
- if (unlikely(!p->scx.slice)) {
- if (!scx_ops_bypassing() && !scx_warned_zero_slice) {
- printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n",
- p->comm, p->pid);
- scx_warned_zero_slice = true;
+ if (unlikely(!p->scx.slice)) {
+ if (!scx_ops_bypassing() && !scx_warned_zero_slice) {
+ printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n",
+ p->comm, p->pid);
+ scx_warned_zero_slice = true;
+ }
+ p->scx.slice = SCX_SLICE_DFL;
}
- p->scx.slice = SCX_SLICE_DFL;
}
+ put_prev_set_next_task(rq, prev, p);
+
return p;
}
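Seen from the BPF side, the new keep-running path means an ops.dispatch() that consumes nothing simply leaves a still-runnable @prev on the CPU with a refilled default slice. A sketch, not from this patch; SHARED_DSQ is a hypothetical DSQ created by the scheduler:

void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
{
        /*
         * Try to move a task from the shared DSQ to this CPU's local DSQ.
         * If nothing is consumed and @prev is still runnable, balance sets
         * %SCX_TASK_BAL_KEEP and pick_next_task_scx() keeps running @prev,
         * replenishing its slice if it ran out.
         */
        scx_bpf_consume(SHARED_DSQ);
}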
* to force global FIFO scheduling.
*
* a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
+ * %SCX_OPS_ENQ_LAST is also ignored.
*
* b. ops.dispatch() is ignored.
*
- * c. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value can't be
- * trusted. Whenever a tick triggers, the running task is rotated to the tail
- * of the queue with core_sched_at touched.
+ * c. balance_scx() does not set %SCX_TASK_BAL_KEEP on non-zero slice as the
+ * slice value can't be trusted. Whenever a tick triggers, the running task
+ * is rotated to the tail of the queue with core_sched_at touched.
*
* d. pick_next_task() suppresses zero slice warning.
*
/*
* All enqueued tasks must have their core_sched_seq updated for correct
- * core-sched ordering, which is why %SCX_OPS_ENQ_LAST is specified in
- * qmap_ops.flags.
+ * core-sched ordering. Also, take a look at the end of qmap_dispatch().
*/
tctx->core_sched_seq = core_sched_tail_seqs[idx]++;
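The seq assigned here is what a core-sched comparator orders on. For illustration only (not part of this patch), an ops.core_sched_before() built on the same task_ctx_stor storage could look like the following; example_core_sched_before is a hypothetical name:

bool BPF_STRUCT_OPS(example_core_sched_before,
                    struct task_struct *a, struct task_struct *b)
{
        struct task_ctx *tctx_a, *tctx_b;

        tctx_a = bpf_task_storage_get(&task_ctx_stor, a, 0, 0);
        tctx_b = bpf_task_storage_get(&task_ctx_stor, b, 0, 0);
        if (!tctx_a || !tctx_b)
                return false;

        /* the task enqueued earlier (lower seq) should run first */
        return (s64)(tctx_a->core_sched_seq - tctx_b->core_sched_seq) < 0;
}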
-	 * If qmap_select_cpu() is telling us to or this is the last runnable
-	 * task on the CPU, enqueue locally.
+	 * If qmap_select_cpu() is telling us to, enqueue the task locally.
*/
- if (tctx->force_local || (enq_flags & SCX_ENQ_LAST)) {
+ if (tctx->force_local) {
tctx->force_local = false;
scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
return;
{
struct task_struct *p;
struct cpu_ctx *cpuc;
+ struct task_ctx *tctx;
u32 zero = 0, batch = dsp_batch ?: 1;
void *fifo;
s32 i, pid;
cpuc->dsp_cnt = 0;
}
+
+ /*
+ * No other tasks. @prev will keep running. Update its core_sched_seq as
+ * if the task were enqueued and dispatched immediately.
+ */
+ if (prev) {
+ tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
+ if (!tctx) {
+ scx_bpf_error("task_ctx lookup failed");
+ return;
+ }
+
+ tctx->core_sched_seq =
+ core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
+ }
}
void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
.cpu_offline = (void *)qmap_cpu_offline,
.init = (void *)qmap_init,
.exit = (void *)qmap_exit,
- .flags = SCX_OPS_ENQ_LAST,
.timeout_ms = 5000U,
.name = "qmap");
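For contrast, a scheduler that does want the last-task callbacks would keep the flag in its ops definition. A hypothetical sketch in the plain struct_ops form (the file above wraps this in a macro, but the fields are the same); example_ops and example_enqueue_last are assumed names:

SEC(".struct_ops.link")
struct sched_ext_ops example_ops = {
        .enqueue        = (void *)example_enqueue_last,
        /* opt into ops.enqueue() with %SCX_ENQ_LAST for the last runnable task */
        .flags          = SCX_OPS_ENQ_LAST,
        .name           = "example",
};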