Merge tag 'sched-psi-2022-10-14' of git://git.kernel.org/pub/scm/linux/kernel/git...
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7d1ea9240af08f627c376108609a6a5b2390e2d3..5800b0623ff30687cf60b24e5109fc40e5ee9229 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -73,6 +73,7 @@
 
 #include <uapi/linux/sched/types.h>
 
+#include <asm/irq_regs.h>
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
 
@@ -142,11 +143,7 @@ __read_mostly int sysctl_resched_latency_warn_once = 1;
  * Number of tasks to iterate in a single balance run.
  * Limited because this is done with IRQs disabled.
  */
-#ifdef CONFIG_PREEMPT_RT
-const_debug unsigned int sysctl_sched_nr_migrate = 8;
-#else
-const_debug unsigned int sysctl_sched_nr_migrate = 32;
-#endif
+const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
 
 __read_mostly int scheduler_running;
 
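The two per-config defaults above collapse into a single define. A minimal standalone sketch of the idea, assuming SCHED_NR_MIGRATE_BREAK keeps the historical values of 8 under CONFIG_PREEMPT_RT and 32 otherwise (the defining header is not part of this hunk):

#include <stdio.h>

/* Assumed values: the pre-existing defaults from the removed #ifdef. */
#ifdef CONFIG_PREEMPT_RT
# define SCHED_NR_MIGRATE_BREAK 8
#else
# define SCHED_NR_MIGRATE_BREAK 32
#endif

static unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;

int main(void)
{
	/* Build with -DCONFIG_PREEMPT_RT to get the RT default of 8. */
	printf("sched_nr_migrate default: %u\n", sysctl_sched_nr_migrate);
	return 0;
}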
@@ -360,10 +357,7 @@ static void __sched_core_flip(bool enabled)
        /*
         * Toggle the offline CPUs.
         */
-       cpumask_copy(&sched_core_mask, cpu_possible_mask);
-       cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask);
-
-       for_each_cpu(cpu, &sched_core_mask)
+       for_each_cpu_andnot(cpu, cpu_possible_mask, cpu_online_mask)
                cpu_rq(cpu)->core_enabled = enabled;
 
        cpus_read_unlock();
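The new iterator walks the set bits of (possible & ~online) directly, so the temporary sched_core_mask copy-and-andnot is no longer needed. A userspace sketch of the same pattern, using a plain unsigned long in place of struct cpumask (the macro name and masks below are illustrative, not the kernel helper):

#include <stdio.h>

/* Illustrative stand-in for for_each_cpu_andnot(): visit every bit set
 * in 'a' but clear in 'b', i.e. the bits of (a & ~b), with no scratch mask. */
#define for_each_bit_andnot(bit, a, b)                                     \
	for ((bit) = 0; (bit) < (int)(8 * sizeof(unsigned long)); (bit)++) \
		if (((a) & ~(b)) & (1UL << (bit)))

int main(void)
{
	unsigned long possible = 0x0fUL;	/* CPUs 0-3 exist      */
	unsigned long online   = 0x05UL;	/* CPUs 0 and 2 are up */
	int cpu;

	for_each_bit_andnot(cpu, possible, online)
		printf("offline cpu %d\n", cpu);	/* prints 1 and 3 */
	return 0;
}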
@@ -481,8 +475,7 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
  *                             p->se.load, p->rt_priority,
  *                             p->dl.dl_{runtime, deadline, period, flags, bw, density}
  *  - sched_setnuma():         p->numa_preferred_nid
- *  - sched_move_task()/
- *    cpu_cgroup_fork():       p->sched_task_group
+ *  - sched_move_task():       p->sched_task_group
  *  - uclamp_update_active()   p->uclamp*
  *
  * p->state <- TASK_*:
@@ -2329,7 +2322,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
        rq = cpu_rq(new_cpu);
 
        rq_lock(rq, rf);
-       BUG_ON(task_cpu(p) != new_cpu);
+       WARN_ON_ONCE(task_cpu(p) != new_cpu);
        activate_task(rq, p, 0);
        check_preempt_curr(rq, p, 0);
 
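Switching from BUG_ON() to WARN_ON_ONCE() trades an immediate crash for a single diagnostic while letting the system continue. A rough userspace analogue of the warn-once behaviour (a GNU C statement expression, not the kernel macro):

#include <stdio.h>

/* One-shot warning: report the first violation, stay silent afterwards,
 * and keep running instead of aborting the way assert()/BUG_ON() would. */
#define WARN_ONCE_SKETCH(cond, msg) ({				\
	static int warned;					\
	int c = !!(cond);					\
	if (c && !warned) {					\
		warned = 1;					\
		fprintf(stderr, "warning: %s\n", (msg));	\
	}							\
	c;							\
})

int main(void)
{
	for (int i = 0; i < 3; i++)
		WARN_ONCE_SKETCH(i != 1, "condition violated");	/* warns once */
	return 0;
}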
@@ -2779,7 +2772,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
                return -EINVAL;
        }
 
-       if (task_running(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
+       if (task_on_cpu(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
                /*
                 * MIGRATE_ENABLE gets here because 'p == current', but for
                 * anything else we cannot do is_migration_disabled(), punt
@@ -3255,12 +3248,12 @@ out:
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
- * If @match_state is nonzero, it's the @p->state value just checked and
- * not expected to change.  If it changes, i.e. @p might have woken up,
- * then return zero.  When we succeed in waiting for @p to be off its CPU,
- * we return a positive number (its total switch count).  If a second call
- * a short while later returns the same number, the caller can be sure that
- * @p has remained unscheduled the whole time.
+ * Wait for the thread to block in any of the states set in @match_state.
+ * If it changes, i.e. @p might have woken up, then return zero.  When we
+ * succeed in waiting for @p to be off its CPU, we return a positive number
+ * (its total switch count).  If a second call a short while later returns the
+ * same number, the caller can be sure that @p has remained unscheduled the
+ * whole time.
  *
  * The caller must ensure that the task *will* unschedule sometime soon,
  * else this function might spin for a *long* time. This function can't
@@ -3291,12 +3284,12 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
                 *
                 * NOTE! Since we don't hold any locks, it's not
                 * even sure that "rq" stays as the right runqueue!
-                * But we don't care, since "task_running()" will
+                * But we don't care, since "task_on_cpu()" will
                 * return false if the runqueue has changed and p
                 * is actually now running somewhere else!
                 */
-               while (task_running(rq, p)) {
-                       if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
+               while (task_on_cpu(rq, p)) {
+                       if (!(READ_ONCE(p->__state) & match_state))
                                return 0;
                        cpu_relax();
                }
@@ -3308,10 +3301,10 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
                 */
                rq = task_rq_lock(p, &rf);
                trace_sched_wait_task(p);
-               running = task_running(rq, p);
+               running = task_on_cpu(rq, p);
                queued = task_on_rq_queued(p);
                ncsw = 0;
-               if (!match_state || READ_ONCE(p->__state) == match_state)
+               if (READ_ONCE(p->__state) & match_state)
                        ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
                task_rq_unlock(rq, p, &rf);
 
@@ -4397,6 +4390,17 @@ void set_numabalancing_state(bool enabled)
 }
 
 #ifdef CONFIG_PROC_SYSCTL
+static void reset_memory_tiering(void)
+{
+       struct pglist_data *pgdat;
+
+       for_each_online_pgdat(pgdat) {
+               pgdat->nbp_threshold = 0;
+               pgdat->nbp_th_nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+               pgdat->nbp_th_start = jiffies_to_msecs(jiffies);
+       }
+}
+
 int sysctl_numa_balancing(struct ctl_table *table, int write,
                          void *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -4413,6 +4417,9 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
        if (err < 0)
                return err;
        if (write) {
+               if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+                   (state & NUMA_BALANCING_MEMORY_TIERING))
+                       reset_memory_tiering();
                sysctl_numa_balancing_mode = state;
                __set_numabalancing_state(state);
        }
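reset_memory_tiering() runs only on the clear-to-set transition of the tiering bit, so rewriting the same mode does not clobber the per-node thresholds. A sketch of that edge detection, with an illustrative flag value:

#include <stdio.h>

#define MODE_MEMORY_TIERING	0x2	/* illustrative value */

static unsigned int mode;

static void set_mode(unsigned int state)
{
	/* Reset only when the bit goes from clear to set. */
	if (!(mode & MODE_MEMORY_TIERING) && (state & MODE_MEMORY_TIERING))
		printf("tiering newly enabled: reset per-node thresholds\n");
	mode = state;
}

int main(void)
{
	set_mode(MODE_MEMORY_TIERING);	/* triggers the reset */
	set_mode(MODE_MEMORY_TIERING);	/* no reset this time */
	return 0;
}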
@@ -5167,6 +5174,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
                 * finish_task_switch()'s mmdrop().
                 */
                switch_mm_irqs_off(prev->active_mm, next->mm, next);
+               lru_gen_use_mm(next->mm);
 
                if (!prev->mm) {                        // from kernel
                        /* will mmdrop() in finish_task_switch(). */
@@ -6430,7 +6438,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
                        prev->sched_contributes_to_load =
                                (prev_state & TASK_UNINTERRUPTIBLE) &&
                                !(prev_state & TASK_NOLOAD) &&
-                               !(prev->flags & PF_FROZEN);
+                               !(prev_state & TASK_FROZEN);
 
                        if (prev->sched_contributes_to_load)
                                rq->nr_uninterruptible++;
@@ -8650,7 +8658,7 @@ again:
        if (curr->sched_class != p->sched_class)
                goto out_unlock;
 
-       if (task_running(p_rq, p) || !task_is_running(p))
+       if (task_on_cpu(p_rq, p) || !task_is_running(p))
                goto out_unlock;
 
        yielded = curr->sched_class->yield_to_task(rq, p);
@@ -8862,7 +8870,7 @@ void sched_show_task(struct task_struct *p)
        if (pid_alive(p))
                ppid = task_pid_nr(rcu_dereference(p->real_parent));
        rcu_read_unlock();
-       pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
+       pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n",
                free, task_pid_nr(p), ppid,
                read_task_thread_flags(p));
 
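The format change only flips the padding: "%5lu" right-aligns the value within its field, "%-5lu" left-aligns it, which keeps the columns of the task dump from starting with stray spaces. For instance:

#include <stdio.h>

int main(void)
{
	printf("[%5d]\n", 42);	/* "[   42]" - right-aligned, padded on the left  */
	printf("[%-5d]\n", 42);	/* "[42   ]" - left-aligned, padded on the right  */
	return 0;
}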
@@ -8890,7 +8898,7 @@ state_filter_match(unsigned long state_filter, struct task_struct *p)
         * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
         * TASK_KILLABLE).
         */
-       if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE)
+       if (state_filter == TASK_UNINTERRUPTIBLE && (state & TASK_NOLOAD))
                return false;
 
        return true;
@@ -9602,9 +9610,6 @@ LIST_HEAD(task_groups);
 static struct kmem_cache *task_group_cache __read_mostly;
 #endif
 
-DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
-DECLARE_PER_CPU(cpumask_var_t, select_rq_mask);
-
 void __init sched_init(void)
 {
        unsigned long ptr = 0;
@@ -9648,14 +9653,6 @@ void __init sched_init(void)
 
 #endif /* CONFIG_RT_GROUP_SCHED */
        }
-#ifdef CONFIG_CPUMASK_OFFSTACK
-       for_each_possible_cpu(i) {
-               per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
-                       cpumask_size(), GFP_KERNEL, cpu_to_node(i));
-               per_cpu(select_rq_mask, i) = (cpumask_var_t)kzalloc_node(
-                       cpumask_size(), GFP_KERNEL, cpu_to_node(i));
-       }
-#endif /* CONFIG_CPUMASK_OFFSTACK */
 
        init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
 
@@ -10164,7 +10161,7 @@ void sched_release_group(struct task_group *tg)
        spin_unlock_irqrestore(&task_group_lock, flags);
 }
 
-static void sched_change_group(struct task_struct *tsk, int type)
+static void sched_change_group(struct task_struct *tsk)
 {
        struct task_group *tg;
 
@@ -10180,7 +10177,7 @@ static void sched_change_group(struct task_struct *tsk, int type)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
        if (tsk->sched_class->task_change_group)
-               tsk->sched_class->task_change_group(tsk, type);
+               tsk->sched_class->task_change_group(tsk);
        else
 #endif
                set_task_rq(tsk, task_cpu(tsk));
@@ -10211,7 +10208,7 @@ void sched_move_task(struct task_struct *tsk)
        if (running)
                put_prev_task(rq, tsk);
 
-       sched_change_group(tsk, TASK_MOVE_GROUP);
+       sched_change_group(tsk);
 
        if (queued)
                enqueue_task(rq, tsk, queue_flags);
@@ -10289,53 +10286,19 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
        sched_unregister_group(tg);
 }
 
-/*
- * This is called before wake_up_new_task(), therefore we really only
- * have to set its group bits, all the other stuff does not apply.
- */
-static void cpu_cgroup_fork(struct task_struct *task)
-{
-       struct rq_flags rf;
-       struct rq *rq;
-
-       rq = task_rq_lock(task, &rf);
-
-       update_rq_clock(rq);
-       sched_change_group(task, TASK_SET_GROUP);
-
-       task_rq_unlock(rq, task, &rf);
-}
-
+#ifdef CONFIG_RT_GROUP_SCHED
 static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
 {
        struct task_struct *task;
        struct cgroup_subsys_state *css;
-       int ret = 0;
 
        cgroup_taskset_for_each(task, css, tset) {
-#ifdef CONFIG_RT_GROUP_SCHED
                if (!sched_rt_can_attach(css_tg(css), task))
                        return -EINVAL;
-#endif
-               /*
-                * Serialize against wake_up_new_task() such that if it's
-                * running, we're sure to observe its full state.
-                */
-               raw_spin_lock_irq(&task->pi_lock);
-               /*
-                * Avoid calling sched_move_task() before wake_up_new_task()
-                * has happened. This would lead to problems with PELT, due to
-                * move wanting to detach+attach while we're not attached yet.
-                */
-               if (READ_ONCE(task->__state) == TASK_NEW)
-                       ret = -EINVAL;
-               raw_spin_unlock_irq(&task->pi_lock);
-
-               if (ret)
-                       break;
        }
-       return ret;
+       return 0;
 }
+#endif
 
 static void cpu_cgroup_attach(struct cgroup_taskset *tset)
 {
@@ -11171,8 +11134,9 @@ struct cgroup_subsys cpu_cgrp_subsys = {
        .css_released   = cpu_cgroup_css_released,
        .css_free       = cpu_cgroup_css_free,
        .css_extra_stat_show = cpu_extra_stat_show,
-       .fork           = cpu_cgroup_fork,
+#ifdef CONFIG_RT_GROUP_SCHED
        .can_attach     = cpu_cgroup_can_attach,
+#endif
        .attach         = cpu_cgroup_attach,
        .legacy_cftypes = cpu_legacy_files,
        .dfl_cftypes    = cpu_files,
@@ -11184,6 +11148,19 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 
 void dump_cpu_task(int cpu)
 {
+       if (cpu == smp_processor_id() && in_hardirq()) {
+               struct pt_regs *regs;
+
+               regs = get_irq_regs();
+               if (regs) {
+                       show_regs(regs);
+                       return;
+               }
+       }
+
+       if (trigger_single_cpu_backtrace(cpu))
+               return;
+
        pr_info("Task dump for CPU %d:\n", cpu);
        sched_show_task(cpu_curr(cpu));
 }