Merge tag 'sched-psi-2022-10-14' of git://git.kernel.org/pub/scm/linux/kernel/git...
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7d1ea9240af08f627c376108609a6a5b2390e2d3..5800b0623ff30687cf60b24e5109fc40e5ee9229 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -73,6 +73,7 @@
 
 #include <uapi/linux/sched/types.h>
 
+#include <asm/irq_regs.h>
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
 
@@ -142,11 +143,7 @@ __read_mostly int sysctl_resched_latency_warn_once = 1;
  * Number of tasks to iterate in a single balance run.
  * Limited because this is done with IRQs disabled.
  */
-#ifdef CONFIG_PREEMPT_RT
-const_debug unsigned int sysctl_sched_nr_migrate = 8;
-#else
-const_debug unsigned int sysctl_sched_nr_migrate = 32;
-#endif
+const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
 
 __read_mostly int scheduler_running;
 
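The two per-config defaults above collapse into a single define. A minimal standalone sketch of the idea, assuming SCHED_NR_MIGRATE_BREAK keeps the historical values of 8 under CONFIG_PREEMPT_RT and 32 otherwise (the defining header is not part of this hunk):

#include <stdio.h>

/* Assumed values: the pre-existing defaults from the removed #ifdef. */
#ifdef CONFIG_PREEMPT_RT
# define SCHED_NR_MIGRATE_BREAK 8
#else
# define SCHED_NR_MIGRATE_BREAK 32
#endif

static unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;

int main(void)
{
	/* Build with -DCONFIG_PREEMPT_RT to get the RT default of 8. */
	printf("sched_nr_migrate default: %u\n", sysctl_sched_nr_migrate);
	return 0;
}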
@@ -360,10 +357,7 @@ static void __sched_core_flip(bool enabled)
        /*
         * Toggle the offline CPUs.
         */
-       cpumask_copy(&sched_core_mask, cpu_possible_mask);
-       cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask);
-
-       for_each_cpu(cpu, &sched_core_mask)
+       for_each_cpu_andnot(cpu, cpu_possible_mask, cpu_online_mask)
                cpu_rq(cpu)->core_enabled = enabled;
 
        cpus_read_unlock();
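The new iterator walks the set bits of (possible & ~online) directly, so the temporary sched_core_mask copy-and-andnot is no longer needed. A userspace sketch of the same pattern, using a plain unsigned long in place of struct cpumask (the macro name and masks below are illustrative, not the kernel helper):

#include <stdio.h>

/* Illustrative stand-in for for_each_cpu_andnot(): visit every bit set
 * in 'a' but clear in 'b', i.e. the bits of (a & ~b), with no scratch mask. */
#define for_each_bit_andnot(bit, a, b)                                     \
	for ((bit) = 0; (bit) < (int)(8 * sizeof(unsigned long)); (bit)++) \
		if (((a) & ~(b)) & (1UL << (bit)))

int main(void)
{
	unsigned long possible = 0x0fUL;	/* CPUs 0-3 exist      */
	unsigned long online   = 0x05UL;	/* CPUs 0 and 2 are up */
	int cpu;

	for_each_bit_andnot(cpu, possible, online)
		printf("offline cpu %d\n", cpu);	/* prints 1 and 3 */
	return 0;
}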
@@ -481,8 +475,7 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
  *                             p->se.load, p->rt_priority,
  *                             p->dl.dl_{runtime, deadline, period, flags, bw, density}
  *  - sched_setnuma():         p->numa_preferred_nid
- *  - sched_move_task()/
- *    cpu_cgroup_fork():       p->sched_task_group
+ *  - sched_move_task():       p->sched_task_group
  *  - uclamp_update_active()   p->uclamp*
  *
  * p->state <- TASK_*:
@@ -2329,7 +2322,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
        rq = cpu_rq(new_cpu);
 
        rq_lock(rq, rf);
-       BUG_ON(task_cpu(p) != new_cpu);
+       WARN_ON_ONCE(task_cpu(p) != new_cpu);
        activate_task(rq, p, 0);
        check_preempt_curr(rq, p, 0);
 
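Switching from BUG_ON() to WARN_ON_ONCE() trades an immediate crash for a single diagnostic while letting the system continue. A rough userspace analogue of the warn-once behaviour (a GNU C statement expression, not the kernel macro):

#include <stdio.h>

/* One-shot warning: report the first violation, stay silent afterwards,
 * and keep running instead of aborting the way assert()/BUG_ON() would. */
#define WARN_ONCE_SKETCH(cond, msg) ({				\
	static int warned;					\
	int c = !!(cond);					\
	if (c && !warned) {					\
		warned = 1;					\
		fprintf(stderr, "warning: %s\n", (msg));	\
	}							\
	c;							\
})

int main(void)
{
	for (int i = 0; i < 3; i++)
		WARN_ONCE_SKETCH(i != 1, "condition violated");	/* warns once */
	return 0;
}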
@@ -2779,7 +2772,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
                return -EINVAL;
        }
 
-       if (task_running(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
+       if (task_on_cpu(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
                /*
                 * MIGRATE_ENABLE gets here because 'p == current', but for
                 * anything else we cannot do is_migration_disabled(), punt
@@ -3255,12 +3248,12 @@ out:
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
- * If @match_state is nonzero, it's the @p->state value just checked and
- * not expected to change.  If it changes, i.e. @p might have woken up,
- * then return zero.  When we succeed in waiting for @p to be off its CPU,
- * we return a positive number (its total switch count).  If a second call
- * a short while later returns the same number, the caller can be sure that
- * @p has remained unscheduled the whole time.
+ * Wait for the thread to block in any of the states set in @match_state.
+ * If it changes, i.e. @p might have woken up, then return zero.  When we
+ * succeed in waiting for @p to be off its CPU, we return a positive number
+ * (its total switch count).  If a second call a short while later returns the
+ * same number, the caller can be sure that @p has remained unscheduled the
+ * whole time.
  *
  * The caller must ensure that the task *will* unschedule sometime soon,
  * else this function might spin for a *long* time. This function can't
@@ -3291,12 +3284,12 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
                 *
                 * NOTE! Since we don't hold any locks, it's not
                 * even sure that "rq" stays as the right runqueue!
-                * But we don't care, since "task_running()" will
+                * But we don't care, since "task_on_cpu()" will
                 * return false if the runqueue has changed and p
                 * is actually now running somewhere else!
                 */
-               while (task_running(rq, p)) {
-                       if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
+               while (task_on_cpu(rq, p)) {
+                       if (!(READ_ONCE(p->__state) & match_state))
                                return 0;
                        cpu_relax();
                }
@@ -3308,10 +3301,10 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
                 */
                rq = task_rq_lock(p, &rf);
                trace_sched_wait_task(p);
-               running = task_running(rq, p);
+               running = task_on_cpu(rq, p);
                queued = task_on_rq_queued(p);
                ncsw = 0;
-               if (!match_state || READ_ONCE(p->__state) == match_state)
+               if (READ_ONCE(p->__state) & match_state)
                        ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
                task_rq_unlock(rq, p, &rf);
 
@@ -4397,6 +4390,17 @@ void set_numabalancing_state(bool enabled)
 }
 
 #ifdef CONFIG_PROC_SYSCTL
+static void reset_memory_tiering(void)
+{
+       struct pglist_data *pgdat;
+
+       for_each_online_pgdat(pgdat) {
+               pgdat->nbp_threshold = 0;
+               pgdat->nbp_th_nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+               pgdat->nbp_th_start = jiffies_to_msecs(jiffies);
+       }
+}
+
 int sysctl_numa_balancing(struct ctl_table *table, int write,
                          void *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -4413,6 +4417,9 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
        if (err < 0)
                return err;
        if (write) {
+               if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+                   (state & NUMA_BALANCING_MEMORY_TIERING))
+                       reset_memory_tiering();
                sysctl_numa_balancing_mode = state;
                __set_numabalancing_state(state);
        }
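reset_memory_tiering() runs only on the clear-to-set transition of the tiering bit, so rewriting the same mode does not clobber the per-node thresholds. A sketch of that edge detection, with an illustrative flag value:

#include <stdio.h>

#define MODE_MEMORY_TIERING	0x2	/* illustrative value */

static unsigned int mode;

static void set_mode(unsigned int state)
{
	/* Reset only when the bit goes from clear to set. */
	if (!(mode & MODE_MEMORY_TIERING) && (state & MODE_MEMORY_TIERING))
		printf("tiering newly enabled: reset per-node thresholds\n");
	mode = state;
}

int main(void)
{
	set_mode(MODE_MEMORY_TIERING);	/* triggers the reset */
	set_mode(MODE_MEMORY_TIERING);	/* no reset this time */
	return 0;
}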
@@ -5167,6 +5174,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
                 * finish_task_switch()'s mmdrop().
                 */
                switch_mm_irqs_off(prev->active_mm, next->mm, next);
+               lru_gen_use_mm(next->mm);
 
                if (!prev->mm) {                        // from kernel
                        /* will mmdrop() in finish_task_switch(). */
@@ -6430,7 +6438,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
                        prev->sched_contributes_to_load =
                                (prev_state & TASK_UNINTERRUPTIBLE) &&
                                !(prev_state & TASK_NOLOAD) &&
-                               !(prev->flags & PF_FROZEN);
+                               !(prev_state & TASK_FROZEN);
 
                        if (prev->sched_contributes_to_load)
                                rq->nr_uninterruptible++;
@@ -8650,7 +8658,7 @@ again:
        if (curr->sched_class != p->sched_class)
                goto out_unlock;
 
-       if (task_running(p_rq, p) || !task_is_running(p))
+       if (task_on_cpu(p_rq, p) || !task_is_running(p))
                goto out_unlock;
 
        yielded = curr->sched_class->yield_to_task(rq, p);
@@ -8862,7 +8870,7 @@ void sched_show_task(struct task_struct *p)
        if (pid_alive(p))
                ppid = task_pid_nr(rcu_dereference(p->real_parent));
        rcu_read_unlock();
-       pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
+       pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n",
                free, task_pid_nr(p), ppid,
                read_task_thread_flags(p));
 
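The format change only flips the padding: "%5lu" right-aligns the value within its field, "%-5lu" left-aligns it, which keeps the columns of the task dump from starting with stray spaces. For instance:

#include <stdio.h>

int main(void)
{
	printf("[%5d]\n", 42);	/* "[   42]" - right-aligned, padded on the left  */
	printf("[%-5d]\n", 42);	/* "[42   ]" - left-aligned, padded on the right  */
	return 0;
}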
@@ -8890,7 +8898,7 @@ state_filter_match(unsigned long state_filter, struct task_struct *p)
         * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
         * TASK_KILLABLE).
         */
-       if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE)
+       if (state_filter == TASK_UNINTERRUPTIBLE && (state & TASK_NOLOAD))
                return false;
 
        return true;
@@ -9602,9 +9610,6 @@ LIST_HEAD(task_groups);
 static struct kmem_cache *task_group_cache __read_mostly;
 #endif
 
-DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
-DECLARE_PER_CPU(cpumask_var_t, select_rq_mask);
-
 void __init sched_init(void)
 {
        unsigned long ptr = 0;
@@ -9648,14 +9653,6 @@ void __init sched_init(void)
 
 #endif /* CONFIG_RT_GROUP_SCHED */
        }
-#ifdef CONFIG_CPUMASK_OFFSTACK
-       for_each_possible_cpu(i) {
-               per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
-                       cpumask_size(), GFP_KERNEL, cpu_to_node(i));
-               per_cpu(select_rq_mask, i) = (cpumask_var_t)kzalloc_node(
-                       cpumask_size(), GFP_KERNEL, cpu_to_node(i));
-       }
-#endif /* CONFIG_CPUMASK_OFFSTACK */
 
        init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
 
@@ -10164,7 +10161,7 @@ void sched_release_group(struct task_group *tg)
        spin_unlock_irqrestore(&task_group_lock, flags);
 }
 
-static void sched_change_group(struct task_struct *tsk, int type)
+static void sched_change_group(struct task_struct *tsk)
 {
        struct task_group *tg;
 
@@ -10180,7 +10177,7 @@ static void sched_change_group(struct task_struct *tsk, int type)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
        if (tsk->sched_class->task_change_group)
-               tsk->sched_class->task_change_group(tsk, type);
+               tsk->sched_class->task_change_group(tsk);
        else
 #endif
                set_task_rq(tsk, task_cpu(tsk));
@@ -10211,7 +10208,7 @@ void sched_move_task(struct task_struct *tsk)
        if (running)
                put_prev_task(rq, tsk);
 
-       sched_change_group(tsk, TASK_MOVE_GROUP);
+       sched_change_group(tsk);
 
        if (queued)
                enqueue_task(rq, tsk, queue_flags);
@@ -10289,53 +10286,19 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
        sched_unregister_group(tg);
 }
 
-/*
- * This is called before wake_up_new_task(), therefore we really only
- * have to set its group bits, all the other stuff does not apply.
- */
-static void cpu_cgroup_fork(struct task_struct *task)
-{
-       struct rq_flags rf;
-       struct rq *rq;
-
-       rq = task_rq_lock(task, &rf);
-
-       update_rq_clock(rq);
-       sched_change_group(task, TASK_SET_GROUP);
-
-       task_rq_unlock(rq, task, &rf);
-}
-
+#ifdef CONFIG_RT_GROUP_SCHED
 static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
 {
        struct task_struct *task;
        struct cgroup_subsys_state *css;
-       int ret = 0;
 
        cgroup_taskset_for_each(task, css, tset) {
-#ifdef CONFIG_RT_GROUP_SCHED
                if (!sched_rt_can_attach(css_tg(css), task))
                        return -EINVAL;
-#endif
-               /*
-                * Serialize against wake_up_new_task() such that if it's
-                * running, we're sure to observe its full state.
-                */
-               raw_spin_lock_irq(&task->pi_lock);
-               /*
-                * Avoid calling sched_move_task() before wake_up_new_task()
-                * has happened. This would lead to problems with PELT, due to
-                * move wanting to detach+attach while we're not attached yet.
-                */
-               if (READ_ONCE(task->__state) == TASK_NEW)
-                       ret = -EINVAL;
-               raw_spin_unlock_irq(&task->pi_lock);
-
-               if (ret)
-                       break;
        }
-       return ret;
+       return 0;
 }
+#endif
 
 static void cpu_cgroup_attach(struct cgroup_taskset *tset)
 {
@@ -11171,8 +11134,9 @@ struct cgroup_subsys cpu_cgrp_subsys = {
        .css_released   = cpu_cgroup_css_released,
        .css_free       = cpu_cgroup_css_free,
        .css_extra_stat_show = cpu_extra_stat_show,
-       .fork           = cpu_cgroup_fork,
+#ifdef CONFIG_RT_GROUP_SCHED
        .can_attach     = cpu_cgroup_can_attach,
+#endif
        .attach         = cpu_cgroup_attach,
        .legacy_cftypes = cpu_legacy_files,
        .dfl_cftypes    = cpu_files,
@@ -11184,6 +11148,19 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 
 void dump_cpu_task(int cpu)
 {
+       if (cpu == smp_processor_id() && in_hardirq()) {
+               struct pt_regs *regs;
+
+               regs = get_irq_regs();
+               if (regs) {
+                       show_regs(regs);
+                       return;
+               }
+       }
+
+       if (trigger_single_cpu_backtrace(cpu))
+               return;
+
        pr_info("Task dump for CPU %d:\n", cpu);
        sched_show_task(cpu_curr(cpu));
 }