Merge tag 'sched-psi-2022-10-14' of git://git.kernel.org/pub/scm/linux/kernel/git...
[linux-block.git] kernel/sched/psi.c
index 7f6030091aeee5aca070b5b77bfb5f3291ac302b..ee2ecc081422eab3244ec6f5b6b1f67e13203f03 100644
@@ -181,6 +181,7 @@ static void group_init(struct psi_group *group)
 {
        int cpu;
 
+       group->enabled = true;
        for_each_possible_cpu(cpu)
                seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
        group->avg_last_update = sched_clock();
@@ -201,6 +202,7 @@ void __init psi_init(void)
 {
        if (!psi_enable) {
                static_branch_enable(&psi_disabled);
+               static_branch_disable(&psi_cgroups_enabled);
                return;
        }
 
@@ -211,7 +213,7 @@ void __init psi_init(void)
        group_init(&psi_system);
 }
 
-static bool test_state(unsigned int *tasks, enum psi_states state)
+static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu)
 {
        switch (state) {
        case PSI_IO_SOME:
@@ -224,9 +226,9 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
                return unlikely(tasks[NR_MEMSTALL] &&
                        tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
        case PSI_CPU_SOME:
-               return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
+               return unlikely(tasks[NR_RUNNING] > oncpu);
        case PSI_CPU_FULL:
-               return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
+               return unlikely(tasks[NR_RUNNING] && !oncpu);
        case PSI_NONIDLE:
                return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
                        tasks[NR_RUNNING];
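
A single CPU runs at most one task at a time, so the old tasks[NR_ONCPU] counter could only ever be 0 or 1; passing it as a bool and relying on the usual promotion of bool to 0/1 keeps both comparisons equivalent. A standalone sketch of the two CPU states with the boolean (illustrative values only, not kernel code):

    #include <stdbool.h>
    #include <stdio.h>

    /* some = runnable tasks are waiting while at most one of them runs */
    static bool cpu_some(unsigned int nr_running, bool oncpu)
    {
            return nr_running > oncpu;      /* bool promotes to 0 or 1 */
    }

    /* full = the group has runnable tasks but none of them is on the CPU */
    static bool cpu_full(unsigned int nr_running, bool oncpu)
    {
            return nr_running && !oncpu;
    }

    int main(void)
    {
            printf("%d %d\n", cpu_some(2, true), cpu_full(1, false));  /* 1 1 */
            return 0;
    }
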
@@ -688,35 +690,53 @@ static void psi_group_change(struct psi_group *group, int cpu,
                             bool wake_clock)
 {
        struct psi_group_cpu *groupc;
-       u32 state_mask = 0;
        unsigned int t, m;
        enum psi_states s;
+       u32 state_mask;
 
        groupc = per_cpu_ptr(group->pcpu, cpu);
 
        /*
-        * First we assess the aggregate resource states this CPU's
-        * tasks have been in since the last change, and account any
-        * SOME and FULL time these may have resulted in.
-        *
-        * Then we update the task counts according to the state
+        * First we update the task counts according to the state
         * change requested through the @clear and @set bits.
+        *
+        * Then, if cgroup PSI stats accounting is enabled, we
+        * assess the aggregate resource states this CPU's tasks
+        * have been in since the last change, and account any
+        * SOME and FULL time these may have resulted in.
         */
        write_seqcount_begin(&groupc->seq);
 
-       record_times(groupc, now);
+       /*
+        * Start with TSK_ONCPU, which doesn't have a corresponding
+        * task count - it's just a boolean flag directly encoded in
+        * the state mask. Clear, set, or carry the current state if
+        * no changes are requested.
+        */
+       if (unlikely(clear & TSK_ONCPU)) {
+               state_mask = 0;
+               clear &= ~TSK_ONCPU;
+       } else if (unlikely(set & TSK_ONCPU)) {
+               state_mask = PSI_ONCPU;
+               set &= ~TSK_ONCPU;
+       } else {
+               state_mask = groupc->state_mask & PSI_ONCPU;
+       }
 
+       /*
+        * The rest of the state mask is calculated based on the task
+        * counts. Update those first, then construct the mask.
+        */
        for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
                if (!(m & (1 << t)))
                        continue;
                if (groupc->tasks[t]) {
                        groupc->tasks[t]--;
                } else if (!psi_bug) {
-                       printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
+                       printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
                                        cpu, t, groupc->tasks[0],
                                        groupc->tasks[1], groupc->tasks[2],
-                                       groupc->tasks[3], groupc->tasks[4],
-                                       clear, set);
+                                       groupc->tasks[3], clear, set);
                        psi_bug = 1;
                }
        }
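
With NR_ONCPU gone from tasks[], the "which group currently owns the CPU" information lives only in the PSI_ONCPU bit of groupc->state_mask. The clear/set/carry decision above, restated in isolation (the flag values below are stand-ins, not the kernel's encodings):

    #include <stdio.h>

    #define TSK_ONCPU  (1 << 0)     /* stand-in for the request bit in clear/set */
    #define PSI_ONCPU  (1 << 7)     /* stand-in for the bit kept in state_mask  */

    static unsigned int oncpu_bit(unsigned int prev_mask, int clear, int set)
    {
            if (clear & TSK_ONCPU)
                    return 0;                       /* task left the CPU */
            if (set & TSK_ONCPU)
                    return PSI_ONCPU;               /* task took the CPU */
            return prev_mask & PSI_ONCPU;           /* no change: carry forward */
    }

    int main(void)
    {
            printf("%#x\n", oncpu_bit(PSI_ONCPU, 0, 0));          /* carried: 0x80 */
            printf("%#x\n", oncpu_bit(PSI_ONCPU, TSK_ONCPU, 0));  /* cleared: 0    */
            return 0;
    }
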
@@ -725,9 +745,25 @@ static void psi_group_change(struct psi_group *group, int cpu,
                if (set & (1 << t))
                        groupc->tasks[t]++;
 
-       /* Calculate state mask representing active states */
+       if (!group->enabled) {
+               /*
+                * On the first group change after disabling PSI, conclude
+                * the current state and flush its time. This is unlikely
+                * to matter to the user, but aggregation (get_recent_times)
+                * may have already incorporated the live state into times_prev;
+                * avoid a delta sample underflow when PSI is later re-enabled.
+                */
+               if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
+                       record_times(groupc, now);
+
+               groupc->state_mask = state_mask;
+
+               write_seqcount_end(&groupc->seq);
+               return;
+       }
+
        for (s = 0; s < NR_PSI_STATES; s++) {
-               if (test_state(groupc->tasks, s))
+               if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
                        state_mask |= (1 << s);
        }
 
@@ -739,9 +775,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
         * task in a cgroup is in_memstall, the corresponding groupc
         * on that cpu is in PSI_MEM_FULL state.
         */
-       if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
+       if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
                state_mask |= (1 << PSI_MEM_FULL);
 
+       record_times(groupc, now);
+
        groupc->state_mask = state_mask;
 
        write_seqcount_end(&groupc->seq);
@@ -753,27 +791,12 @@ static void psi_group_change(struct psi_group *group, int cpu,
                schedule_delayed_work(&group->avgs_work, PSI_FREQ);
 }
 
-static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
+static inline struct psi_group *task_psi_group(struct task_struct *task)
 {
-       if (*iter == &psi_system)
-               return NULL;
-
 #ifdef CONFIG_CGROUPS
-       if (static_branch_likely(&psi_cgroups_enabled)) {
-               struct cgroup *cgroup = NULL;
-
-               if (!*iter)
-                       cgroup = task->cgroups->dfl_cgrp;
-               else
-                       cgroup = cgroup_parent(*iter);
-
-               if (cgroup && cgroup_parent(cgroup)) {
-                       *iter = cgroup;
-                       return cgroup_psi(cgroup);
-               }
-       }
+       if (static_branch_likely(&psi_cgroups_enabled))
+               return cgroup_psi(task_dfl_cgroup(task));
 #endif
-       *iter = &psi_system;
        return &psi_system;
 }
 
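
task_psi_group() plus the new psi_group->parent pointer (assigned in psi_cgroup_alloc() further down) replace the iterate_groups() cursor: callers start at the task's own cgroup and chase parents until psi_system, whose NULL parent ends the do/while loop. The walk pattern on its own, with a simplified stand-in struct:

    #include <stdio.h>

    struct group {
            const char *name;
            struct group *parent;           /* NULL at the root (psi_system) */
    };

    static void change_all(struct group *g)
    {
            do {
                    printf("change %s\n", g->name);   /* psi_group_change() goes here */
            } while ((g = g->parent));
    }

    int main(void)
    {
            struct group root   = { "psi_system", NULL };
            struct group parent = { "parent-cgroup", &root };
            struct group leaf   = { "leaf-cgroup", &parent };

            change_all(&leaf);   /* leaf-cgroup, parent-cgroup, psi_system */
            return 0;
    }
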
@@ -796,8 +819,6 @@ void psi_task_change(struct task_struct *task, int clear, int set)
 {
        int cpu = task_cpu(task);
        struct psi_group *group;
-       bool wake_clock = true;
-       void *iter = NULL;
        u64 now;
 
        if (!task->pid)
@@ -806,19 +827,11 @@ void psi_task_change(struct task_struct *task, int clear, int set)
        psi_flags_change(task, clear, set);
 
        now = cpu_clock(cpu);
-       /*
-        * Periodic aggregation shuts off if there is a period of no
-        * task changes, so we wake it back up if necessary. However,
-        * don't do this if the task change is the aggregation worker
-        * itself going to sleep, or we'll ping-pong forever.
-        */
-       if (unlikely((clear & TSK_RUNNING) &&
-                    (task->flags & PF_WQ_WORKER) &&
-                    wq_worker_last_func(task) == psi_avgs_work))
-               wake_clock = false;
 
-       while ((group = iterate_groups(task, &iter)))
-               psi_group_change(group, cpu, clear, set, now, wake_clock);
+       group = task_psi_group(task);
+       do {
+               psi_group_change(group, cpu, clear, set, now, true);
+       } while ((group = group->parent));
 }
 
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
@@ -826,34 +839,30 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 {
        struct psi_group *group, *common = NULL;
        int cpu = task_cpu(prev);
-       void *iter;
        u64 now = cpu_clock(cpu);
 
        if (next->pid) {
-               bool identical_state;
-
                psi_flags_change(next, 0, TSK_ONCPU);
                /*
-                * When switching between tasks that have an identical
-                * runtime state, the cgroup containing both does not change:
-                * we can stop once we reach the first common ancestor. Iterate
-                * @next's ancestors only until we encounter @prev's ONCPU.
+                * Set TSK_ONCPU on @next's cgroups. If @next shares any
+                * ancestors with @prev, those will already have @prev's
+                * TSK_ONCPU bit set, and we can stop the iteration there.
                 */
-               identical_state = prev->psi_flags == next->psi_flags;
-               iter = NULL;
-               while ((group = iterate_groups(next, &iter))) {
-                       if (identical_state &&
-                           per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+               group = task_psi_group(next);
+               do {
+                       if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
+                           PSI_ONCPU) {
                                common = group;
                                break;
                        }
 
                        psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
-               }
+               } while ((group = group->parent));
        }
 
        if (prev->pid) {
                int clear = TSK_ONCPU, set = 0;
+               bool wake_clock = true;
 
                /*
                 * When we're going to sleep, psi_dequeue() lets us
@@ -867,26 +876,74 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
                                clear |= TSK_MEMSTALL_RUNNING;
                        if (prev->in_iowait)
                                set |= TSK_IOWAIT;
+
+                       /*
+                        * Periodic aggregation shuts off if there is a period of no
+                        * task changes, so we wake it back up if necessary. However,
+                        * don't do this if the task change is the aggregation worker
+                        * itself going to sleep, or we'll ping-pong forever.
+                        */
+                       if (unlikely((prev->flags & PF_WQ_WORKER) &&
+                                    wq_worker_last_func(prev) == psi_avgs_work))
+                               wake_clock = false;
                }
 
                psi_flags_change(prev, clear, set);
 
-               iter = NULL;
-               while ((group = iterate_groups(prev, &iter)) && group != common)
-                       psi_group_change(group, cpu, clear, set, now, true);
+               group = task_psi_group(prev);
+               do {
+                       if (group == common)
+                               break;
+                       psi_group_change(group, cpu, clear, set, now, wake_clock);
+               } while ((group = group->parent));
 
                /*
-                * TSK_ONCPU is handled up to the common ancestor. If we're tasked
-                * with dequeuing too, finish that for the rest of the hierarchy.
+                * TSK_ONCPU is handled up to the common ancestor. If there are
+                * any other differences between the two tasks (e.g. prev goes
+                * to sleep, or only one of them is in memstall), finish propagating
+                * those differences all the way up to the root.
                 */
-               if (sleep) {
+               if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
                        clear &= ~TSK_ONCPU;
-                       for (; group; group = iterate_groups(prev, &iter))
-                               psi_group_change(group, cpu, clear, set, now, true);
+                       for (; group; group = group->parent)
+                               psi_group_change(group, cpu, clear, set, now, wake_clock);
                }
        }
 }
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void psi_account_irqtime(struct task_struct *task, u32 delta)
+{
+       int cpu = task_cpu(task);
+       struct psi_group *group;
+       struct psi_group_cpu *groupc;
+       u64 now;
+
+       if (!task->pid)
+               return;
+
+       now = cpu_clock(cpu);
+
+       group = task_psi_group(task);
+       do {
+               if (!group->enabled)
+                       continue;
+
+               groupc = per_cpu_ptr(group->pcpu, cpu);
+
+               write_seqcount_begin(&groupc->seq);
+
+               record_times(groupc, now);
+               groupc->times[PSI_IRQ_FULL] += delta;
+
+               write_seqcount_end(&groupc->seq);
+
+               if (group->poll_states & (1 << PSI_IRQ_FULL))
+                       psi_schedule_poll_work(group, 1);
+       } while ((group = group->parent));
+}
+#endif
+
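
psi_account_irqtime() charges the whole delta into PSI_IRQ_FULL at every level of the task's hierarchy; the continue taken for a disabled group falls through to the while condition, so the walk still advances to the parent. A standalone sketch of just the charging walk (seqcount, record_times() and the poll hook omitted):

    #include <stdint.h>
    #include <stdio.h>

    struct group {
            struct group *parent;
            uint64_t irq_full_ns;
    };

    static void account_irqtime(struct group *g, uint32_t delta_ns)
    {
            do {
                    g->irq_full_ns += delta_ns;     /* every ancestor gets the full delta */
            } while ((g = g->parent));
    }

    int main(void)
    {
            struct group root = { NULL, 0 }, leaf = { &root, 0 };

            account_irqtime(&leaf, 1000);
            printf("%llu %llu\n",
                   (unsigned long long)leaf.irq_full_ns,
                   (unsigned long long)root.irq_full_ns);   /* 1000 1000 */
            return 0;
    }
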
 /**
  * psi_memstall_enter - mark the beginning of a memory stall section
  * @flags: flags to handle nested sections
@@ -952,7 +1009,7 @@ EXPORT_SYMBOL_GPL(psi_memstall_leave);
 #ifdef CONFIG_CGROUPS
 int psi_cgroup_alloc(struct cgroup *cgroup)
 {
-       if (static_branch_likely(&psi_disabled))
+       if (!static_branch_likely(&psi_cgroups_enabled))
                return 0;
 
        cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL);
@@ -965,12 +1022,13 @@ int psi_cgroup_alloc(struct cgroup *cgroup)
                return -ENOMEM;
        }
        group_init(cgroup->psi);
+       cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup));
        return 0;
 }
 
 void psi_cgroup_free(struct cgroup *cgroup)
 {
-       if (static_branch_likely(&psi_disabled))
+       if (!static_branch_likely(&psi_cgroups_enabled))
                return;
 
        cancel_delayed_work_sync(&cgroup->psi->avgs_work);
@@ -998,7 +1056,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
        struct rq_flags rf;
        struct rq *rq;
 
-       if (static_branch_likely(&psi_disabled)) {
+       if (!static_branch_likely(&psi_cgroups_enabled)) {
                /*
                 * Lame to do this here, but the scheduler cannot be locked
                 * from the outside, so we move cgroups from inside sched/.
@@ -1046,10 +1104,45 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
 
        task_rq_unlock(rq, task, &rf);
 }
+
+void psi_cgroup_restart(struct psi_group *group)
+{
+       int cpu;
+
+       /*
+        * After psi_group->enabled is cleared, we don't actually stop
+        * the percpu task accounting in each psi_group_cpu; we only stop
+        * the test_state() loop, record_times() and the averaging worker,
+        * see psi_group_change() for details.
+        *
+        * When cgroup PSI is disabled, this function has nothing to sync
+        * since the cgroup pressure files are hidden and each percpu
+        * psi_group_cpu sees !psi_group->enabled and only does task accounting.
+        *
+        * When cgroup PSI is re-enabled, this function uses psi_group_change()
+        * to get the correct state mask from the test_state() loop over tasks[],
+        * and restarts groupc->state_start from now; use .clear = .set = 0
+        * here since no task state has really changed.
+        */
+       if (!group->enabled)
+               return;
+
+       for_each_possible_cpu(cpu) {
+               struct rq *rq = cpu_rq(cpu);
+               struct rq_flags rf;
+               u64 now;
+
+               rq_lock_irq(rq, &rf);
+               now = cpu_clock(cpu);
+               psi_group_change(group, cpu, 0, 0, now, true);
+               rq_unlock_irq(rq, &rf);
+       }
+}
 #endif /* CONFIG_CGROUPS */
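
psi_cgroup_restart() only acts on the disable-to-enable transition: the .clear = .set = 0 call lets psi_group_change() rebuild state_mask from the still-maintained tasks[] counters and restart state_start, so time spent disabled is never charged. The intended toggle flow, as a standalone stand-in (the write-handler shape here is an assumption, not the kernel's cgroup code):

    #include <stdbool.h>
    #include <stdio.h>

    struct group_stub { bool enabled; };

    static void cgroup_restart_stub(struct group_stub *g)
    {
            if (!g->enabled)
                    return;                         /* nothing to sync while disabled */
            printf("resync state_mask/state_start on every CPU\n");
    }

    static void pressure_toggle(struct group_stub *g, bool enable)
    {
            if (g->enabled == enable)
                    return;
            g->enabled = enable;                    /* percpu task counting continues either way */
            cgroup_restart_stub(g);                 /* acts only when re-enabling */
    }

    int main(void)
    {
            struct group_stub g = { .enabled = true };

            pressure_toggle(&g, false);             /* hide pressure files, stop aggregation */
            pressure_toggle(&g, true);              /* resync state and resume */
            return 0;
    }
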
 
 int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 {
+       bool only_full = false;
        int full;
        u64 now;
 
@@ -1064,7 +1157,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
                group->avg_next_update = update_averages(group, now);
        mutex_unlock(&group->avgs_lock);
 
-       for (full = 0; full < 2; full++) {
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+       only_full = res == PSI_IRQ;
+#endif
+
+       for (full = 0; full < 2 - only_full; full++) {
                unsigned long avg[3] = { 0, };
                u64 total = 0;
                int w;
@@ -1078,7 +1175,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
                }
 
                seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
-                          full ? "full" : "some",
+                          full || only_full ? "full" : "some",
                           LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
                           LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
                           LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
@@ -1106,6 +1203,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
        else
                return ERR_PTR(-EINVAL);
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+       if (res == PSI_IRQ && --state != PSI_IRQ_FULL)
+               return ERR_PTR(-EINVAL);
+#endif
+
        if (state >= PSI_NONIDLE)
                return ERR_PTR(-EINVAL);
 
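
The --state check compensates for the some/full parsing earlier in psi_trigger_create() (above this hunk), which maps each resource to a consecutive SOME/FULL pair of enum slots. Assuming the usual enum layout (SOME/FULL pairs for io, mem and cpu followed by PSI_IRQ_FULL, with PSI_IRQ == 3), the arithmetic works out as:

    "full" on pressure/irq: state = PSI_IO_FULL + PSI_IRQ * 2 = PSI_IRQ_FULL + 1, then --state == PSI_IRQ_FULL, accepted.
    "some" on pressure/irq: state = PSI_IO_SOME + PSI_IRQ * 2 = PSI_IRQ_FULL,     then --state != PSI_IRQ_FULL, rejected with -EINVAL.
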
@@ -1390,6 +1492,33 @@ static const struct proc_ops psi_cpu_proc_ops = {
        .proc_release   = psi_fop_release,
 };
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int psi_irq_show(struct seq_file *m, void *v)
+{
+       return psi_show(m, &psi_system, PSI_IRQ);
+}
+
+static int psi_irq_open(struct inode *inode, struct file *file)
+{
+       return psi_open(file, psi_irq_show);
+}
+
+static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
+                            size_t nbytes, loff_t *ppos)
+{
+       return psi_write(file, user_buf, nbytes, PSI_IRQ);
+}
+
+static const struct proc_ops psi_irq_proc_ops = {
+       .proc_open      = psi_irq_open,
+       .proc_read      = seq_read,
+       .proc_lseek     = seq_lseek,
+       .proc_write     = psi_irq_write,
+       .proc_poll      = psi_fop_poll,
+       .proc_release   = psi_fop_release,
+};
+#endif
+
 static int __init psi_proc_init(void)
 {
        if (psi_enable) {
@@ -1397,6 +1526,9 @@ static int __init psi_proc_init(void)
                proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
                proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
                proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+               proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops);
+#endif
        }
        return 0;
 }