Merge tag 'sched-psi-2022-10-14' of git://git.kernel.org/pub/scm/linux/kernel/git...

author Linus Torvalds <torvalds@linux-foundation.org>

Fri, 14 Oct 2022 20:03:00 +0000 (13:03 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 14 Oct 2022 20:03:00 +0000 (13:03 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 14 Oct 2022 20:03:00 +0000 (13:03 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 14 Oct 2022 20:03:00 +0000 (13:03 -0700)
diff --combined Documentation/admin-guide/cgroup-v2.rst

index 7bcfb38498c69187da4ea3305423370fd4e333f3,4cad4e2b31ec87d829833623e4b3e63c80849368..dc254a3cb95686e67a7335bad3101313e97eedd9
--- 1/Documentation/admin-guide/cgroup-v2.rst
--- 2/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@@ -976,6 -976,29 +976,29 @@@ All cgroup core files are prefixed wit
         killing cgroups is a process directed operation, i.e. it affects
         the whole thread-group.
   
+   cgroup.pressure
+       A read-write single value file whose allowed values are "0" and "1".
+       The default is "1".
+ 
+       Writing "0" to the file will disable the cgroup PSI accounting.
+       Writing "1" to the file will re-enable the cgroup PSI accounting.
+ 
+       This control attribute is not hierarchical, so disabling or enabling PSI
+       accounting in a cgroup does not affect PSI accounting in descendants
+       and doesn't need to pass enablement via ancestors from root.
+ 
+       The reason this control attribute exists is that PSI accounts stalls for
+       each cgroup separately and aggregates it at each level of the hierarchy.
+       This may cause non-negligible overhead for some workloads when under
+       deep level of the hierarchy, in which case this control attribute can
+       be used to disable PSI accounting in the non-leaf cgroups.
+ 
+   irq.pressure
+       A read-write nested-keyed file.
+ 
+       Shows pressure stall information for IRQ/SOFTIRQ. See
+       :ref:`Documentation/accounting/psi.rst <psi>` for details.
+ 
   Controllers
   ===========
   
@@@ -1355,11 -1378,6 +1378,11 @@@ PAGE_SIZE multiple when read back
           pagetables
                   Amount of memory allocated for page tables.
   
+ +        sec_pagetables
+ +              Amount of memory allocated for secondary page tables,
+ +              this currently includes KVM mmu allocations on x86
+ +              and arm64.
+ +
           percpu (npn)
                 Amount of memory used for storing per-cpu kernel
                 data structures.
@@@ -2190,93 -2208,75 +2213,93 @@@ Cpuset Interface File
   
         It accepts only the following input values when written to.
   
- -        ========      ================================
- -        "root"        a partition root
- -        "member"      a non-root member of a partition
- -        ========      ================================
- -
- -      When set to be a partition root, the current cgroup is the
- -      root of a new partition or scheduling domain that comprises
- -      itself and all its descendants except those that are separate
- -      partition roots themselves and their descendants.  The root
- -      cgroup is always a partition root.
- -
- -      There are constraints on where a partition root can be set.
- -      It can only be set in a cgroup if all the following conditions
- -      are true.
- -
- -      1) The "cpuset.cpus" is not empty and the list of CPUs are
- -         exclusive, i.e. they are not shared by any of its siblings.
- -      2) The parent cgroup is a partition root.
- -      3) The "cpuset.cpus" is also a proper subset of the parent's
- -         "cpuset.cpus.effective".
- -      4) There is no child cgroups with cpuset enabled.  This is for
- -         eliminating corner cases that have to be handled if such a
- -         condition is allowed.
- -
- -      Setting it to partition root will take the CPUs away from the
- -      effective CPUs of the parent cgroup.  Once it is set, this
- -      file cannot be reverted back to "member" if there are any child
- -      cgroups with cpuset enabled.
- -
- -      A parent partition cannot distribute all its CPUs to its
- -      child partitions.  There must be at least one cpu left in the
- -      parent partition.
- -
- -      Once becoming a partition root, changes to "cpuset.cpus" is
- -      generally allowed as long as the first condition above is true,
- -      the change will not take away all the CPUs from the parent
- -      partition and the new "cpuset.cpus" value is a superset of its
- -      children's "cpuset.cpus" values.
- -
- -      Sometimes, external factors like changes to ancestors'
- -      "cpuset.cpus" or cpu hotplug can cause the state of the partition
- -      root to change.  On read, the "cpuset.sched.partition" file
- -      can show the following values.
- -
- -        ==============        ==============================
- -        "member"              Non-root member of a partition
- -        "root"                Partition root
- -        "root invalid"        Invalid partition root
- -        ==============        ==============================
- -
- -      It is a partition root if the first 2 partition root conditions
- -      above are true and at least one CPU from "cpuset.cpus" is
- -      granted by the parent cgroup.
- -
- -      A partition root can become invalid if none of CPUs requested
- -      in "cpuset.cpus" can be granted by the parent cgroup or the
- -      parent cgroup is no longer a partition root itself.  In this
- -      case, it is not a real partition even though the restriction
- -      of the first partition root condition above will still apply.
- -      The cpu affinity of all the tasks in the cgroup will then be
- -      associated with CPUs in the nearest ancestor partition.
- -
- -      An invalid partition root can be transitioned back to a
- -      real partition root if at least one of the requested CPUs
- -      can now be granted by its parent.  In this case, the cpu
- -      affinity of all the tasks in the formerly invalid partition
- -      will be associated to the CPUs of the newly formed partition.
- -      Changing the partition state of an invalid partition root to
- -      "member" is always allowed even if child cpusets are present.
+ +        ==========    =====================================
+ +        "member"      Non-root member of a partition
+ +        "root"        Partition root
+ +        "isolated"    Partition root without load balancing
+ +        ==========    =====================================
+ +
+ +      The root cgroup is always a partition root and its state
+ +      cannot be changed.  All other non-root cgroups start out as
+ +      "member".
+ +
+ +      When set to "root", the current cgroup is the root of a new
+ +      partition or scheduling domain that comprises itself and all
+ +      its descendants except those that are separate partition roots
+ +      themselves and their descendants.
+ +
+ +      When set to "isolated", the CPUs in that partition root will
+ +      be in an isolated state without any load balancing from the
+ +      scheduler.  Tasks placed in such a partition with multiple
+ +      CPUs should be carefully distributed and bound to each of the
+ +      individual CPUs for optimal performance.
+ +
+ +      The value shown in "cpuset.cpus.effective" of a partition root
+ +      is the CPUs that the partition root can dedicate to a potential
+ +      new child partition root. The new child subtracts available
+ +      CPUs from its parent "cpuset.cpus.effective".
+ +
+ +      A partition root ("root" or "isolated") can be in one of the
+ +      two possible states - valid or invalid.  An invalid partition
+ +      root is in a degraded state where some state information may
+ +      be retained, but behaves more like a "member".
+ +
+ +      All possible state transitions among "member", "root" and
+ +      "isolated" are allowed.
+ +
+ +      On read, the "cpuset.cpus.partition" file can show the following
+ +      values.
+ +
+ +        ============================= =====================================
+ +        "member"                      Non-root member of a partition
+ +        "root"                        Partition root
+ +        "isolated"                    Partition root without load balancing
+ +        "root invalid (<reason>)"     Invalid partition root
+ +        "isolated invalid (<reason>)" Invalid isolated partition root
+ +        ============================= =====================================
+ +
+ +      In the case of an invalid partition root, a descriptive string on
+ +      why the partition is invalid is included within parentheses.
+ +
+ +      For a partition root to become valid, the following conditions
+ +      must be met.
+ +
+ +      1) The "cpuset.cpus" is exclusive with its siblings, i.e. they
+ +         are not shared by any of its siblings (exclusivity rule).
+ +      2) The parent cgroup is a valid partition root.
+ +      3) The "cpuset.cpus" is not empty and must contain at least
+ +         one of the CPUs from parent's "cpuset.cpus", i.e. they overlap.
+ +      4) The "cpuset.cpus.effective" cannot be empty unless there is
+ +         no task associated with this partition.
+ +
+ +      External events like hotplug or changes to "cpuset.cpus" can
+ +      cause a valid partition root to become invalid and vice versa.
+ +      Note that a task cannot be moved to a cgroup with empty
+ +      "cpuset.cpus.effective".
+ +
+ +      For a valid partition root with the sibling cpu exclusivity
+ +      rule enabled, changes made to "cpuset.cpus" that violate the
+ +      exclusivity rule will invalidate the partition as well as its
+ +      sibling partitions with conflicting cpuset.cpus values. So
+ +      care must be taken in changing "cpuset.cpus".
+ +
+ +      A valid non-root parent partition may distribute out all its CPUs
+ +      to its child partitions when there is no task associated with it.
+ +
+ +      Care must be taken when changing a valid partition root to
+ +      "member" as all its child partitions, if present, will become
+ +      invalid causing disruption to tasks running in those child
+ +      partitions. These inactivated partitions could be recovered if
+ +      their parent is switched back to a partition root with a proper
+ +      set of "cpuset.cpus".
+ +
+ +      Poll and inotify events are triggered whenever the state of
+ +      "cpuset.cpus.partition" changes.  That includes changes caused
+ +      by write to "cpuset.cpus.partition", cpu hotplug or other
+ +      changes that modify the validity status of the partition.
+ +      This will allow user space agents to monitor unexpected changes
+ +      to "cpuset.cpus.partition" without the need to do continuous
+ +      polling.
   
   
   Device controller
diff --combined include/linux/cgroup-defs.h

index 8f481d1b159af1b04aa93001cf797e70d987f275,7df76b318245f0e104f2bdddfb65ae135632e460..6e01f10f0d88999216270d54031ecb4d7be4d9b5
--- 1/include/linux/cgroup-defs.h
--- 2/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@@ -126,11 -126,11 +126,11 @@@ enum 
         CFTYPE_NO_PREFIX        = (1 << 3),     /* (DON'T USE FOR NEW FILES) no subsys prefix */
         CFTYPE_WORLD_WRITABLE   = (1 << 4),     /* (DON'T USE FOR NEW FILES) S_IWUGO */
         CFTYPE_DEBUG            = (1 << 5),     /* create when cgroup_debug */
- -      CFTYPE_PRESSURE         = (1 << 6),     /* only if pressure feature is enabled */
   
         /* internal flags, do not use outside cgroup core proper */
         __CFTYPE_ONLY_ON_DFL    = (1 << 16),    /* only on default hierarchy */
         __CFTYPE_NOT_ON_DFL     = (1 << 17),    /* not on default hierarchy */
+ +      __CFTYPE_ADDED          = (1 << 18),
   };
   
   /*
@@@ -384,7 -384,7 +384,7 @@@ struct cgroup 
         /*
          * The depth this cgroup is at.  The root is at depth zero and each
          * step down the hierarchy increments the level.  This along with
- -       * ancestor_ids[] can determine whether a given cgroup is a
+ +       * ancestors[] can determine whether a given cgroup is a
          * descendant of another without traversing the hierarchy.
          */
         int level;
@@@ -428,6 -428,9 +428,9 @@@
         struct cgroup_file procs_file;  /* handle for "cgroup.procs" */
         struct cgroup_file events_file; /* handle for "cgroup.events" */
   
+       /* handles for "{cpu,memory,io,irq}.pressure" */
+       struct cgroup_file psi_files[NR_PSI_RESOURCES];
+ 
         /*
          * The bitmask of subsystems enabled on the child cgroups.
          * ->subtree_control is the one configured through
@@@ -504,8 -507,8 +507,8 @@@
         /* Used to store internal freezer state */
         struct cgroup_freezer_state freezer;
   
- -      /* ids of the ancestors at each level including self */
- -      u64 ancestor_ids[];
+ +      /* All ancestors including self */
+ +      struct cgroup *ancestors[];
   };
   
   /*
@@@ -522,15 -525,11 +525,15 @@@ struct cgroup_root 
         /* Unique id for this hierarchy. */
         int hierarchy_id;
   
- -      /* The root cgroup.  Root is destroyed on its release. */
+ +      /*
+ +       * The root cgroup. The containing cgroup_root will be destroyed on its
+ +       * release. cgrp->ancestors[0] will be used overflowing into the
+ +       * following field. cgrp_ancestor_storage must immediately follow.
+ +       */
         struct cgroup cgrp;
   
- -      /* for cgrp->ancestor_ids[0] */
- -      u64 cgrp_ancestor_id_storage;
+ +      /* must follow cgrp for cgrp->ancestors[0], see above */
+ +      struct cgroup *cgrp_ancestor_storage;
   
         /* Number of cgroups in the hierarchy, used only for /proc/cgroups */
         atomic_t nr_cgrps;
diff --combined include/linux/cgroup.h

index 23b102b4349e3b59b66cf370d7c31e4398807f19,80cb970257be9ff89c0de684b26876081e69b514..f2a9f2274c3bb22b78f3e2190b13ee4af0bec1ac
--- 1/include/linux/cgroup.h
--- 2/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@@ -433,18 -433,6 +433,18 @@@ static inline void cgroup_put(struct cg
         css_put(&cgrp->self);
   }
   
+ +extern struct mutex cgroup_mutex;
+ +
+ +static inline void cgroup_lock(void)
+ +{
+ +      mutex_lock(&cgroup_mutex);
+ +}
+ +
+ +static inline void cgroup_unlock(void)
+ +{
+ +      mutex_unlock(&cgroup_mutex);
+ +}
+ +
   /**
    * task_css_set_check - obtain a task's css_set with extra access conditions
    * @task: the task to obtain css_set for
@@@ -459,6 -447,7 +459,6 @@@
    * as locks used during the cgroup_subsys::attach() methods.
    */
   #ifdef CONFIG_PROVE_RCU
- -extern struct mutex cgroup_mutex;
   extern spinlock_t css_set_lock;
   #define task_css_set_check(task, __c)                                 \
         rcu_dereference_check((task)->cgroups,                          \
@@@ -586,7 -575,7 +586,7 @@@ static inline bool cgroup_is_descendant
   {
         if (cgrp->root != ancestor->root || cgrp->level < ancestor->level)
                 return false;
- -      return cgrp->ancestor_ids[ancestor->level] == cgroup_id(ancestor);
+ +      return cgrp->ancestors[ancestor->level] == ancestor;
   }
   
   /**
@@@ -603,9 -592,11 +603,9 @@@
   static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp,
                                              int ancestor_level)
   {
- -      if (cgrp->level < ancestor_level)
+ +      if (ancestor_level < 0 || ancestor_level > cgrp->level)
                 return NULL;
- -      while (cgrp && cgrp->level > ancestor_level)
- -              cgrp = cgroup_parent(cgrp);
- -      return cgrp;
+ +      return cgrp->ancestors[ancestor_level];
   }
   
   /**
@@@ -682,11 -673,6 +682,6 @@@ static inline void pr_cont_cgroup_path(
         pr_cont_kernfs_path(cgrp->kn);
   }
   
- static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
- {
-       return cgrp->psi;
- }
- 
   bool cgroup_psi_enabled(void);
   
   static inline void cgroup_init_kthreadd(void)
@@@ -718,8 -704,6 +713,8 @@@ struct cgroup
   static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
   static inline void css_get(struct cgroup_subsys_state *css) {}
   static inline void css_put(struct cgroup_subsys_state *css) {}
+ +static inline void cgroup_lock(void) {}
+ +static inline void cgroup_unlock(void) {}
   static inline int cgroup_attach_task_all(struct task_struct *from,
                                          struct task_struct *t) { return 0; }
   static inline int cgroupstats_build(struct cgroupstats *stats,
@@@ -759,6 -743,11 +754,6 @@@ static inline bool task_under_cgroup_hi
   
   static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
   {}
- -
- -static inline struct cgroup *cgroup_get_from_id(u64 id)
- -{
- -      return NULL;
- -}
   #endif /* !CONFIG_CGROUPS */
   
   #ifdef CONFIG_CGROUPS
diff --combined kernel/cgroup/cgroup.c

index 764bdd5fd8d14ecd24287715983800ee621a8871,fa1cf836b66a36e939d249142ebce87eee79f695..7f486677ab1febcf064735ed47c87075b5118db7
--- 1/kernel/cgroup/cgroup.c
--- 2/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@@ -217,7 -217,6 +217,7 @@@ struct cgroup_namespace init_cgroup_ns 
   
   static struct file_system_type cgroup2_fs_type;
   static struct cftype cgroup_base_files[];
+ +static struct cftype cgroup_psi_files[];
   
   /* cgroup optional features */
   enum cgroup_opt_features {
@@@ -1690,16 -1689,12 +1690,16 @@@ static void css_clear_dir(struct cgroup
         css->flags &= ~CSS_VISIBLE;
   
         if (!css->ss) {
- -              if (cgroup_on_dfl(cgrp))
- -                      cfts = cgroup_base_files;
- -              else
- -                      cfts = cgroup1_base_files;
- -
- -              cgroup_addrm_files(css, cgrp, cfts, false);
+ +              if (cgroup_on_dfl(cgrp)) {
+ +                      cgroup_addrm_files(css, cgrp,
+ +                                         cgroup_base_files, false);
+ +                      if (cgroup_psi_enabled())
+ +                              cgroup_addrm_files(css, cgrp,
+ +                                                 cgroup_psi_files, false);
+ +              } else {
+ +                      cgroup_addrm_files(css, cgrp,
+ +                                         cgroup1_base_files, false);
+ +              }
         } else {
                 list_for_each_entry(cfts, &css->ss->cfts, node)
                         cgroup_addrm_files(css, cgrp, cfts, false);
@@@ -1722,22 -1717,14 +1722,22 @@@ static int css_populate_dir(struct cgro
                 return 0;
   
         if (!css->ss) {
- -              if (cgroup_on_dfl(cgrp))
- -                      cfts = cgroup_base_files;
- -              else
- -                      cfts = cgroup1_base_files;
- -
- -              ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
- -              if (ret < 0)
- -                      return ret;
+ +              if (cgroup_on_dfl(cgrp)) {
+ +                      ret = cgroup_addrm_files(&cgrp->self, cgrp,
+ +                                               cgroup_base_files, true);
+ +                      if (ret < 0)
+ +                              return ret;
+ +
+ +                      if (cgroup_psi_enabled()) {
+ +                              ret = cgroup_addrm_files(&cgrp->self, cgrp,
+ +                                                       cgroup_psi_files, true);
+ +                              if (ret < 0)
+ +                                      return ret;
+ +                      }
+ +              } else {
+ +                      cgroup_addrm_files(css, cgrp,
+ +                                         cgroup1_base_files, true);
+ +              }
         } else {
                 list_for_each_entry(cfts, &css->ss->cfts, node) {
                         ret = cgroup_addrm_files(css, cgrp, cfts, true);
@@@ -2063,7 -2050,7 +2063,7 @@@ int cgroup_setup_root(struct cgroup_roo
         }
         root_cgrp->kn = kernfs_root_to_node(root->kf_root);
         WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
- -      root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
+ +      root_cgrp->ancestors[0] = root_cgrp;
   
         ret = css_populate_dir(&root_cgrp->self);
         if (ret)
@@@ -2186,7 -2173,7 +2186,7 @@@ static int cgroup_get_tree(struct fs_co
         struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
         int ret;
   
- -      cgrp_dfl_visible = true;
+ +      WRITE_ONCE(cgrp_dfl_visible, true);
         cgroup_get_live(&cgrp_dfl_root.cgrp);
         ctx->root = &cgrp_dfl_root;
   
@@@ -2374,7 -2361,7 +2374,7 @@@ int task_cgroup_path(struct task_struc
                 ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
         } else {
                 /* if no hierarchy exists, everyone is in "/" */
- -              ret = strlcpy(buf, "/", buflen);
+ +              ret = strscpy(buf, "/", buflen);
         }
   
         spin_unlock_irq(&css_set_lock);
@@@ -2406,7 -2393,7 +2406,7 @@@ EXPORT_SYMBOL_GPL(task_cgroup_path)
    * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
    * CPU hotplug is disabled on entry.
    */
- -static void cgroup_attach_lock(bool lock_threadgroup)
+ +void cgroup_attach_lock(bool lock_threadgroup)
   {
         cpus_read_lock();
         if (lock_threadgroup)
@@@ -2417,7 -2404,7 +2417,7 @@@
    * cgroup_attach_unlock - Undo cgroup_attach_lock()
    * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
    */
- -static void cgroup_attach_unlock(bool lock_threadgroup)
+ +void cgroup_attach_unlock(bool lock_threadgroup)
   {
         if (lock_threadgroup)
                 percpu_up_write(&cgroup_threadgroup_rwsem);
@@@ -3305,7 -3292,11 +3305,7 @@@ static int cgroup_apply_control(struct 
          * making the following cgroup_update_dfl_csses() properly update
          * css associations of all tasks in the subtree.
          */
- -      ret = cgroup_update_dfl_csses(cgrp);
- -      if (ret)
- -              return ret;
- -
- -      return 0;
+ +      return cgroup_update_dfl_csses(cgrp);
   }
   
   /**
@@@ -3698,27 -3689,27 +3698,27 @@@ static int cpu_stat_show(struct seq_fil
   static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
   {
         struct cgroup *cgrp = seq_css(seq)->cgroup;
-       struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+       struct psi_group *psi = cgroup_psi(cgrp);
   
         return psi_show(seq, psi, PSI_IO);
   }
   static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
   {
         struct cgroup *cgrp = seq_css(seq)->cgroup;
-       struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+       struct psi_group *psi = cgroup_psi(cgrp);
   
         return psi_show(seq, psi, PSI_MEM);
   }
   static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
   {
         struct cgroup *cgrp = seq_css(seq)->cgroup;
-       struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+       struct psi_group *psi = cgroup_psi(cgrp);
   
         return psi_show(seq, psi, PSI_CPU);
   }
   
- static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
-                                         size_t nbytes, enum psi_res res)
+ static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
+                             size_t nbytes, enum psi_res res)
   {
         struct cgroup_file_ctx *ctx = of->priv;
         struct psi_trigger *new;
@@@ -3738,7 -3729,7 +3738,7 @@@
                 return -EBUSY;
         }
   
-       psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+       psi = cgroup_psi(cgrp);
         new = psi_trigger_create(psi, buf, res);
         if (IS_ERR(new)) {
                 cgroup_put(cgrp);
@@@ -3755,21 -3746,86 +3755,86 @@@ static ssize_t cgroup_io_pressure_write
                                           char *buf, size_t nbytes,
                                           loff_t off)
   {
-       return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
+       return pressure_write(of, buf, nbytes, PSI_IO);
   }
   
   static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
                                           char *buf, size_t nbytes,
                                           loff_t off)
   {
-       return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
+       return pressure_write(of, buf, nbytes, PSI_MEM);
   }
   
   static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
                                           char *buf, size_t nbytes,
                                           loff_t off)
   {
-       return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
+       return pressure_write(of, buf, nbytes, PSI_CPU);
+ }
+ 
+ #ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
+ {
+       struct cgroup *cgrp = seq_css(seq)->cgroup;
+       struct psi_group *psi = cgroup_psi(cgrp);
+ 
+       return psi_show(seq, psi, PSI_IRQ);
+ }
+ 
+ static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
+                                        char *buf, size_t nbytes,
+                                        loff_t off)
+ {
+       return pressure_write(of, buf, nbytes, PSI_IRQ);
+ }
+ #endif
+ 
+ static int cgroup_pressure_show(struct seq_file *seq, void *v)
+ {
+       struct cgroup *cgrp = seq_css(seq)->cgroup;
+       struct psi_group *psi = cgroup_psi(cgrp);
+ 
+       seq_printf(seq, "%d\n", psi->enabled);
+ 
+       return 0;
+ }
+ 
+ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
+                                    char *buf, size_t nbytes,
+                                    loff_t off)
+ {
+       ssize_t ret;
+       int enable;
+       struct cgroup *cgrp;
+       struct psi_group *psi;
+ 
+       ret = kstrtoint(strstrip(buf), 0, &enable);
+       if (ret)
+               return ret;
+ 
+       if (enable < 0 || enable > 1)
+               return -ERANGE;
+ 
+       cgrp = cgroup_kn_lock_live(of->kn, false);
+       if (!cgrp)
+               return -ENOENT;
+ 
+       psi = cgroup_psi(cgrp);
+       if (psi->enabled != enable) {
+               int i;
+ 
+               /* show or hide {cpu,memory,io,irq}.pressure files */
+               for (i = 0; i < NR_PSI_RESOURCES; i++)
+                       cgroup_file_show(&cgrp->psi_files[i], enable);
+ 
+               psi->enabled = enable;
+               if (enable)
+                       psi_cgroup_restart(psi);
+       }
+ 
+       cgroup_kn_unlock(of->kn);
+ 
+       return nbytes;
   }
   
   static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
@@@ -3789,6 -3845,9 +3854,9 @@@ static void cgroup_pressure_release(str
   
   bool cgroup_psi_enabled(void)
   {
+       if (static_branch_likely(&psi_disabled))
+               return false;
+ 
         return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
   }
   
@@@ -4141,6 -4200,8 +4209,6 @@@ static int cgroup_addrm_files(struct cg
   restart:
         for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
                 /* does cft->flags tell us to skip this file on @cgrp? */
- -              if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
- -                      continue;
                 if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
                         continue;
                 if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
@@@ -4205,25 -4266,21 +4273,25 @@@ static void cgroup_exit_cftypes(struct 
                 cft->ss = NULL;
   
                 /* revert flags set by cgroup core while adding @cfts */
- -              cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
+ +              cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL |
+ +                              __CFTYPE_ADDED);
         }
   }
   
   static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
   {
         struct cftype *cft;
+ +      int ret = 0;
   
         for (cft = cfts; cft->name[0] != '\0'; cft++) {
                 struct kernfs_ops *kf_ops;
   
                 WARN_ON(cft->ss || cft->kf_ops);
   
- -              if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
- -                      continue;
+ +              if (cft->flags & __CFTYPE_ADDED) {
+ +                      ret = -EBUSY;
+ +                      break;
+ +              }
   
                 if (cft->seq_start)
                         kf_ops = &cgroup_kf_ops;
@@@ -4237,26 -4294,26 +4305,26 @@@
                 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
                         kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
                         if (!kf_ops) {
- -                              cgroup_exit_cftypes(cfts);
- -                              return -ENOMEM;
+ +                              ret = -ENOMEM;
+ +                              break;
                         }
                         kf_ops->atomic_write_len = cft->max_write_len;
                 }
   
                 cft->kf_ops = kf_ops;
                 cft->ss = ss;
+ +              cft->flags |= __CFTYPE_ADDED;
         }
   
- -      return 0;
+ +      if (ret)
+ +              cgroup_exit_cftypes(cfts);
+ +      return ret;
   }
   
   static int cgroup_rm_cftypes_locked(struct cftype *cfts)
   {
         lockdep_assert_held(&cgroup_mutex);
   
- -      if (!cfts || !cfts[0].ss)
- -              return -ENOENT;
- -
         list_del(&cfts->node);
         cgroup_apply_cftypes(cfts, false);
         cgroup_exit_cftypes(cfts);
@@@ -4278,12 -4335,6 +4346,12 @@@ int cgroup_rm_cftypes(struct cftype *cf
   {
         int ret;
   
+ +      if (!cfts || cfts[0].name[0] == '\0')
+ +              return 0;
+ +
+ +      if (!(cfts[0].flags & __CFTYPE_ADDED))
+ +              return -ENOENT;
+ +
         mutex_lock(&cgroup_mutex);
         ret = cgroup_rm_cftypes_locked(cfts);
         mutex_unlock(&cgroup_mutex);
@@@ -5168,13 -5219,11 +5236,14 @@@ static struct cftype cgroup_base_files[
                 .name = "cpu.stat",
                 .seq_show = cpu_stat_show,
         },
+ +      { }     /* terminate */
+ +};
+ +
+ +static struct cftype cgroup_psi_files[] = {
   #ifdef CONFIG_PSI
         {
                 .name = "io.pressure",
- -              .flags = CFTYPE_PRESSURE,
+               .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
                 .seq_show = cgroup_io_pressure_show,
                 .write = cgroup_io_pressure_write,
                 .poll = cgroup_pressure_poll,
@@@ -5182,6 -5231,8 +5251,7 @@@
         },
         {
                 .name = "memory.pressure",
- -              .flags = CFTYPE_PRESSURE,
+               .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
                 .seq_show = cgroup_memory_pressure_show,
                 .write = cgroup_memory_pressure_write,
                 .poll = cgroup_pressure_poll,
@@@ -5189,11 -5240,30 +5259,27 @@@
         },
         {
                 .name = "cpu.pressure",
- -              .flags = CFTYPE_PRESSURE,
+               .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
                 .seq_show = cgroup_cpu_pressure_show,
                 .write = cgroup_cpu_pressure_write,
                 .poll = cgroup_pressure_poll,
                 .release = cgroup_pressure_release,
         },
- -              .flags = CFTYPE_PRESSURE,
+ #ifdef CONFIG_IRQ_TIME_ACCOUNTING
+       {
+               .name = "irq.pressure",
- -              .flags = CFTYPE_PRESSURE,
+               .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
+               .seq_show = cgroup_irq_pressure_show,
+               .write = cgroup_irq_pressure_write,
+               .poll = cgroup_pressure_poll,
+               .release = cgroup_pressure_release,
+       },
+ #endif
+       {
+               .name = "cgroup.pressure",
+               .seq_show = cgroup_pressure_show,
+               .write = cgroup_pressure_write,
+       },
   #endif /* CONFIG_PSI */
         { }     /* terminate */
   };
@@@ -5470,7 -5540,8 +5556,7 @@@ static struct cgroup *cgroup_create(str
         int ret;
   
         /* allocate the cgroup and its ID, 0 is reserved for the root */
- -      cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
- -                     GFP_KERNEL);
+ +      cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL);
         if (!cgrp)
                 return ERR_PTR(-ENOMEM);
   
@@@ -5522,7 -5593,7 +5608,7 @@@
   
         spin_lock_irq(&css_set_lock);
         for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
- -              cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
+ +              cgrp->ancestors[tcgrp->level] = tcgrp;
   
                 if (tcgrp != cgrp) {
                         tcgrp->nr_descendants++;
@@@ -5955,7 -6026,6 +6041,7 @@@ int __init cgroup_init(void
   
         BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
         BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
+ +      BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files));
         BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
   
         cgroup_rstat_boot();
@@@ -6076,22 -6146,16 +6162,22 @@@ void cgroup_path_from_kernfs_id(u64 id
   /*
    * cgroup_get_from_id : get the cgroup associated with cgroup id
    * @id: cgroup id
- - * On success return the cgrp, on failure return NULL
+ + * Return the cgrp on success, or an ERR_PTR() on failure.
+ + * Only cgroups within the current task's cgroup NS are valid.
    */
   struct cgroup *cgroup_get_from_id(u64 id)
   {
         struct kernfs_node *kn;
- -      struct cgroup *cgrp = NULL;
+ +      struct cgroup *cgrp, *root_cgrp;
   
         kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
         if (!kn)
- -              goto out;
+ +              return ERR_PTR(-ENOENT);
+ +
+ +      if (kernfs_type(kn) != KERNFS_DIR) {
+ +              kernfs_put(kn);
+ +              return ERR_PTR(-ENOENT);
+ +      }
   
         rcu_read_lock();
   
@@@ -6100,19 -6164,9 +6186,19 @@@
                 cgrp = NULL;
   
         rcu_read_unlock();
- -
         kernfs_put(kn);
- -out:
+ +
+ +      if (!cgrp)
+ +              return ERR_PTR(-ENOENT);
+ +
+ +      spin_lock_irq(&css_set_lock);
+ +      root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root);
+ +      spin_unlock_irq(&css_set_lock);
+ +      if (!cgroup_is_descendant(cgrp, root_cgrp)) {
+ +              cgroup_put(cgrp);
+ +              return ERR_PTR(-ENOENT);
+ +      }
+ +
         return cgrp;
   }
   EXPORT_SYMBOL_GPL(cgroup_get_from_id);
@@@ -6142,7 -6196,7 +6228,7 @@@ int proc_cgroup_show(struct seq_file *m
                 struct cgroup *cgrp;
                 int ssid, count = 0;
   
- -              if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
+ +              if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible))
                         continue;
   
                 seq_printf(m, "%d:", root->hierarchy_id);
@@@ -6218,6 -6272,11 +6304,6 @@@ static struct cgroup *cgroup_get_from_f
                 return ERR_CAST(css);
   
         cgrp = css->cgroup;
- -      if (!cgroup_on_dfl(cgrp)) {
- -              cgroup_put(cgrp);
- -              return ERR_PTR(-EBADF);
- -      }
- -
         return cgrp;
   }
   
@@@ -6684,12 -6743,8 +6770,12 @@@ struct cgroup *cgroup_get_from_path(con
   {
         struct kernfs_node *kn;
         struct cgroup *cgrp = ERR_PTR(-ENOENT);
+ +      struct cgroup *root_cgrp;
   
- -      kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
+ +      spin_lock_irq(&css_set_lock);
+ +      root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root);
+ +      kn = kernfs_walk_and_get(root_cgrp->kn, path);
+ +      spin_unlock_irq(&css_set_lock);
         if (!kn)
                 goto out;
   
@@@ -6847,6 -6902,9 +6933,6 @@@ static ssize_t show_delegatable_files(s
                 if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
                         continue;
   
- -              if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
- -                      continue;
- -
                 if (prefix)
                         ret += snprintf(buf + ret, size - ret, "%s.", prefix);
   
@@@ -6866,11 -6924,8 +6952,11 @@@ static ssize_t delegate_show(struct kob
         int ssid;
         ssize_t ret = 0;
   
- -      ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
- -                                   NULL);
+ +      ret = show_delegatable_files(cgroup_base_files, buf + ret,
+ +                                   PAGE_SIZE - ret, NULL);
+ +      if (cgroup_psi_enabled())
+ +              ret += show_delegatable_files(cgroup_psi_files, buf + ret,
+ +                                            PAGE_SIZE - ret, NULL);
   
         for_each_subsys(ss, ssid)
                 ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
diff --combined kernel/sched/core.c

index e4ce124ec70163afbe44894573fd96fac7961d2e,7d1ea9240af08f627c376108609a6a5b2390e2d3..5800b0623ff30687cf60b24e5109fc40e5ee9229
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -73,7 -73,6 +73,7 @@@
   
   #include <uapi/linux/sched/types.h>
   
+ +#include <asm/irq_regs.h>
   #include <asm/switch_to.h>
   #include <asm/tlb.h>
   
@@@ -143,7 -142,11 +143,7 @@@ __read_mostly int sysctl_resched_latenc
    * Number of tasks to iterate in a single balance run.
    * Limited because this is done with IRQs disabled.
    */
- -#ifdef CONFIG_PREEMPT_RT
- -const_debug unsigned int sysctl_sched_nr_migrate = 8;
- -#else
- -const_debug unsigned int sysctl_sched_nr_migrate = 32;
- -#endif
+ +const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
   
   __read_mostly int scheduler_running;
   
@@@ -357,7 -360,10 +357,7 @@@ static void __sched_core_flip(bool enab
         /*
          * Toggle the offline CPUs.
          */
- -      cpumask_copy(&sched_core_mask, cpu_possible_mask);
- -      cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask);
- -
- -      for_each_cpu(cpu, &sched_core_mask)
+ +      for_each_cpu_andnot(cpu, cpu_possible_mask, cpu_online_mask)
                 cpu_rq(cpu)->core_enabled = enabled;
   
         cpus_read_unlock();
@@@ -475,7 -481,8 +475,7 @@@ sched_core_dequeue(struct rq *rq, struc
    *                            p->se.load, p->rt_priority,
    *                            p->dl.dl_{runtime, deadline, period, flags, bw, density}
    *  - sched_setnuma():                p->numa_preferred_nid
- - *  - sched_move_task()/
- - *    cpu_cgroup_fork():      p->sched_task_group
+ + *  - sched_move_task():      p->sched_task_group
    *  - uclamp_update_active()  p->uclamp*
    *
    * p->state <- TASK_*:
@@@ -701,6 -708,7 +701,7 @@@ static void update_rq_clock_task(struc
   
         rq->prev_irq_time += irq_delta;
         delta -= irq_delta;
+       psi_account_irqtime(rq->curr, irq_delta);
   #endif
   #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
         if (static_key_false((&paravirt_steal_rq_enabled))) {
@@@ -2321,7 -2329,7 +2322,7 @@@ static struct rq *move_queued_task(stru
         rq = cpu_rq(new_cpu);
   
         rq_lock(rq, rf);
- -      BUG_ON(task_cpu(p) != new_cpu);
+ +      WARN_ON_ONCE(task_cpu(p) != new_cpu);
         activate_task(rq, p, 0);
         check_preempt_curr(rq, p, 0);
   
@@@ -2771,7 -2779,7 +2772,7 @@@ static int affine_move_task(struct rq *
                 return -EINVAL;
         }
   
- -      if (task_running(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
+ +      if (task_on_cpu(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
                 /*
                  * MIGRATE_ENABLE gets here because 'p == current', but for
                  * anything else we cannot do is_migration_disabled(), punt
@@@ -3247,12 -3255,12 +3248,12 @@@ out
   /*
    * wait_task_inactive - wait for a thread to unschedule.
    *
- - * If @match_state is nonzero, it's the @p->state value just checked and
- - * not expected to change.  If it changes, i.e. @p might have woken up,
- - * then return zero.  When we succeed in waiting for @p to be off its CPU,
- - * we return a positive number (its total switch count).  If a second call
- - * a short while later returns the same number, the caller can be sure that
- - * @p has remained unscheduled the whole time.
+ + * Wait for the thread to block in any of the states set in @match_state.
+ + * If its state changes, i.e. @p might have woken up, return zero.  When we
+ + * succeed in waiting for @p to be off its CPU, we return a positive number
+ + * (its total switch count).  If a second call a short while later returns the
+ + * same number, the caller can be sure that @p has remained unscheduled the
+ + * whole time.
    *
    * The caller must ensure that the task *will* unschedule sometime soon,
    * else this function might spin for a *long* time. This function can't
@@@ -3283,12 -3291,12 +3284,12 @@@ unsigned long wait_task_inactive(struc
                  *
                  * NOTE! Since we don't hold any locks, it's not
                  * even sure that "rq" stays as the right runqueue!
- -               * But we don't care, since "task_running()" will
+ +               * But we don't care, since "task_on_cpu()" will
                  * return false if the runqueue has changed and p
                  * is actually now running somewhere else!
                  */
- -              while (task_running(rq, p)) {
- -                      if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
+ +              while (task_on_cpu(rq, p)) {
+ +                      if (!(READ_ONCE(p->__state) & match_state))
                                 return 0;
                         cpu_relax();
                 }
@@@ -3300,10 -3308,10 +3301,10 @@@
                  */
                 rq = task_rq_lock(p, &rf);
                 trace_sched_wait_task(p);
- -              running = task_running(rq, p);
+ +              running = task_on_cpu(rq, p);
                 queued = task_on_rq_queued(p);
                 ncsw = 0;
- -              if (!match_state || READ_ONCE(p->__state) == match_state)
+ +              if (READ_ONCE(p->__state) & match_state)
                         ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
                 task_rq_unlock(rq, p, &rf);
   
@@@ -4389,17 -4397,6 +4390,17 @@@ void set_numabalancing_state(bool enabl
   }
   
   #ifdef CONFIG_PROC_SYSCTL
+ +static void reset_memory_tiering(void)
+ +{
+ +      struct pglist_data *pgdat;
+ +
+ +      for_each_online_pgdat(pgdat) {
+ +              pgdat->nbp_threshold = 0;
+ +              pgdat->nbp_th_nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ +              pgdat->nbp_th_start = jiffies_to_msecs(jiffies);
+ +      }
+ +}
+ +
   int sysctl_numa_balancing(struct ctl_table *table, int write,
                           void *buffer, size_t *lenp, loff_t *ppos)
   {
@@@ -4416,9 -4413,6 +4417,9 @@@
         if (err < 0)
                 return err;
         if (write) {
+ +              if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ +                  (state & NUMA_BALANCING_MEMORY_TIERING))
+ +                      reset_memory_tiering();
                 sysctl_numa_balancing_mode = state;
                 __set_numabalancing_state(state);
         }
@@@ -5173,7 -5167,6 +5174,7 @@@ context_switch(struct rq *rq, struct ta
                  * finish_task_switch()'s mmdrop().
                  */
                 switch_mm_irqs_off(prev->active_mm, next->mm, next);
+ +              lru_gen_use_mm(next->mm);
   
                 if (!prev->mm) {                        // from kernel
                         /* will mmdrop() in finish_task_switch(). */
@@@ -6437,7 -6430,7 +6438,7 @@@ static void __sched notrace __schedule(
                         prev->sched_contributes_to_load =
                                 (prev_state & TASK_UNINTERRUPTIBLE) &&
                                 !(prev_state & TASK_NOLOAD) &&
- -                              !(prev->flags & PF_FROZEN);
+ +                              !(prev_state & TASK_FROZEN);
   
                         if (prev->sched_contributes_to_load)
                                 rq->nr_uninterruptible++;
@@@ -8657,7 -8650,7 +8658,7 @@@ again
         if (curr->sched_class != p->sched_class)
                 goto out_unlock;
   
- -      if (task_running(p_rq, p) || !task_is_running(p))
+ +      if (task_on_cpu(p_rq, p) || !task_is_running(p))
                 goto out_unlock;
   
         yielded = curr->sched_class->yield_to_task(rq, p);
@@@ -8869,7 -8862,7 +8870,7 @@@ void sched_show_task(struct task_struc
         if (pid_alive(p))
                 ppid = task_pid_nr(rcu_dereference(p->real_parent));
         rcu_read_unlock();
- -      pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
+ +      pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n",
                 free, task_pid_nr(p), ppid,
                 read_task_thread_flags(p));
   
@@@ -8897,7 -8890,7 +8898,7 @@@ state_filter_match(unsigned long state_
          * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
          * TASK_KILLABLE).
          */
- -      if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE)
+ +      if (state_filter == TASK_UNINTERRUPTIBLE && (state & TASK_NOLOAD))
                 return false;
   
         return true;
@@@ -9609,6 -9602,9 +9610,6 @@@ LIST_HEAD(task_groups)
   static struct kmem_cache *task_group_cache __read_mostly;
   #endif
   
- -DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
- -DECLARE_PER_CPU(cpumask_var_t, select_rq_mask);
- -
   void __init sched_init(void)
   {
         unsigned long ptr = 0;
@@@ -9652,6 -9648,14 +9653,6 @@@
   
   #endif /* CONFIG_RT_GROUP_SCHED */
         }
- -#ifdef CONFIG_CPUMASK_OFFSTACK
- -      for_each_possible_cpu(i) {
- -              per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
- -                      cpumask_size(), GFP_KERNEL, cpu_to_node(i));
- -              per_cpu(select_rq_mask, i) = (cpumask_var_t)kzalloc_node(
- -                      cpumask_size(), GFP_KERNEL, cpu_to_node(i));
- -      }
- -#endif /* CONFIG_CPUMASK_OFFSTACK */
   
         init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
   
@@@ -10160,7 -10164,7 +10161,7 @@@ void sched_release_group(struct task_gr
         spin_unlock_irqrestore(&task_group_lock, flags);
   }
   
- -static void sched_change_group(struct task_struct *tsk, int type)
+ +static void sched_change_group(struct task_struct *tsk)
   {
         struct task_group *tg;
   
@@@ -10176,7 -10180,7 +10177,7 @@@
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
         if (tsk->sched_class->task_change_group)
- -              tsk->sched_class->task_change_group(tsk, type);
+ +              tsk->sched_class->task_change_group(tsk);
         else
   #endif
                 set_task_rq(tsk, task_cpu(tsk));
@@@ -10207,7 -10211,7 +10208,7 @@@ void sched_move_task(struct task_struc
         if (running)
                 put_prev_task(rq, tsk);
   
- -      sched_change_group(tsk, TASK_MOVE_GROUP);
+ +      sched_change_group(tsk);
   
         if (queued)
                 enqueue_task(rq, tsk, queue_flags);
@@@ -10285,19 -10289,53 +10286,19 @@@ static void cpu_cgroup_css_free(struct 
         sched_unregister_group(tg);
   }
   
- -/*
- - * This is called before wake_up_new_task(), therefore we really only
- - * have to set its group bits, all the other stuff does not apply.
- - */
- -static void cpu_cgroup_fork(struct task_struct *task)
- -{
- -      struct rq_flags rf;
- -      struct rq *rq;
- -
- -      rq = task_rq_lock(task, &rf);
- -
- -      update_rq_clock(rq);
- -      sched_change_group(task, TASK_SET_GROUP);
- -
- -      task_rq_unlock(rq, task, &rf);
- -}
- -
+ +#ifdef CONFIG_RT_GROUP_SCHED
   static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
   {
         struct task_struct *task;
         struct cgroup_subsys_state *css;
- -      int ret = 0;
   
         cgroup_taskset_for_each(task, css, tset) {
- -#ifdef CONFIG_RT_GROUP_SCHED
                 if (!sched_rt_can_attach(css_tg(css), task))
                         return -EINVAL;
- -#endif
- -              /*
- -               * Serialize against wake_up_new_task() such that if it's
- -               * running, we're sure to observe its full state.
- -               */
- -              raw_spin_lock_irq(&task->pi_lock);
- -              /*
- -               * Avoid calling sched_move_task() before wake_up_new_task()
- -               * has happened. This would lead to problems with PELT, due to
- -               * move wanting to detach+attach while we're not attached yet.
- -               */
- -              if (READ_ONCE(task->__state) == TASK_NEW)
- -                      ret = -EINVAL;
- -              raw_spin_unlock_irq(&task->pi_lock);
- -
- -              if (ret)
- -                      break;
         }
- -      return ret;
+ +      return 0;
   }
+ +#endif
   
   static void cpu_cgroup_attach(struct cgroup_taskset *tset)
   {
@@@ -11133,9 -11171,8 +11134,9 @@@ struct cgroup_subsys cpu_cgrp_subsys = 
         .css_released   = cpu_cgroup_css_released,
         .css_free       = cpu_cgroup_css_free,
         .css_extra_stat_show = cpu_extra_stat_show,
- -      .fork           = cpu_cgroup_fork,
+ +#ifdef CONFIG_RT_GROUP_SCHED
         .can_attach     = cpu_cgroup_can_attach,
+ +#endif
         .attach         = cpu_cgroup_attach,
         .legacy_cftypes = cpu_legacy_files,
         .dfl_cftypes    = cpu_files,
@@@ -11147,19 -11184,6 +11148,19 @@@
   
   void dump_cpu_task(int cpu)
   {
+ +      if (cpu == smp_processor_id() && in_hardirq()) {
+ +              struct pt_regs *regs;
+ +
+ +              regs = get_irq_regs();
+ +              if (regs) {
+ +                      show_regs(regs);
+ +                      return;
+ +              }
+ +      }
+ +
+ +      if (trigger_single_cpu_backtrace(cpu))
+ +              return;
+ +
         pr_info("Task dump for CPU %d:\n", cpu);
         sched_show_task(cpu_curr(cpu));
   }
diff --combined kernel/sched/psi.c

index 7f6030091aeee5aca070b5b77bfb5f3291ac302b,9711827e31e5946ee0279c1520d6921e0634cf64..ee2ecc081422eab3244ec6f5b6b1f67e13203f03
--- 1/kernel/sched/psi.c
--- 2/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@@ -181,6 -181,7 +181,7 @@@ static void group_init(struct psi_grou
   {
         int cpu;
   
+       group->enabled = true;
         for_each_possible_cpu(cpu)
                 seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
         group->avg_last_update = sched_clock();
@@@ -201,6 -202,7 +202,7 @@@ void __init psi_init(void
   {
         if (!psi_enable) {
                 static_branch_enable(&psi_disabled);
+               static_branch_disable(&psi_cgroups_enabled);
                 return;
         }
   
@@@ -211,7 -213,7 +213,7 @@@
         group_init(&psi_system);
   }
   
- static bool test_state(unsigned int *tasks, enum psi_states state)
+ static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu)
   {
         switch (state) {
         case PSI_IO_SOME:
@@@ -224,9 -226,9 +226,9 @@@
                 return unlikely(tasks[NR_MEMSTALL] &&
                         tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
         case PSI_CPU_SOME:
-               return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
+               return unlikely(tasks[NR_RUNNING] > oncpu);
         case PSI_CPU_FULL:
-               return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
+               return unlikely(tasks[NR_RUNNING] && !oncpu);
         case PSI_NONIDLE:
                 return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
                         tasks[NR_RUNNING];
@@@ -688,35 -690,53 +690,53 @@@ static void psi_group_change(struct psi
                              bool wake_clock)
   {
         struct psi_group_cpu *groupc;
-       u32 state_mask = 0;
         unsigned int t, m;
         enum psi_states s;
+       u32 state_mask;
   
         groupc = per_cpu_ptr(group->pcpu, cpu);
   
         /*
-        * First we assess the aggregate resource states this CPU's
-        * tasks have been in since the last change, and account any
-        * SOME and FULL time these may have resulted in.
-        *
-        * Then we update the task counts according to the state
+        * First we update the task counts according to the state
          * change requested through the @clear and @set bits.
+        *
+        * Then, if cgroup PSI stats accounting is enabled, we
+        * assess the aggregate resource states this CPU's tasks
+        * have been in since the last change, and account any
+        * SOME and FULL time these may have resulted in.
          */
         write_seqcount_begin(&groupc->seq);
   
-       record_times(groupc, now);
+       /*
+        * Start with TSK_ONCPU, which doesn't have a corresponding
+        * task count - it's just a boolean flag directly encoded in
+        * the state mask. Clear, set, or carry the current state if
+        * no changes are requested.
+        */
+       if (unlikely(clear & TSK_ONCPU)) {
+               state_mask = 0;
+               clear &= ~TSK_ONCPU;
+       } else if (unlikely(set & TSK_ONCPU)) {
+               state_mask = PSI_ONCPU;
+               set &= ~TSK_ONCPU;
+       } else {
+               state_mask = groupc->state_mask & PSI_ONCPU;
+       }
   
+       /*
+        * The rest of the state mask is calculated based on the task
+        * counts. Update those first, then construct the mask.
+        */
         for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
                 if (!(m & (1 << t)))
                         continue;
                 if (groupc->tasks[t]) {
                         groupc->tasks[t]--;
                 } else if (!psi_bug) {
-                       printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
+                       printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
                                         cpu, t, groupc->tasks[0],
                                         groupc->tasks[1], groupc->tasks[2],
-                                       groupc->tasks[3], groupc->tasks[4],
-                                       clear, set);
+                                       groupc->tasks[3], clear, set);
                         psi_bug = 1;
                 }
         }
@@@ -725,9 -745,25 +745,25 @@@
                 if (set & (1 << t))
                         groupc->tasks[t]++;
   
-       /* Calculate state mask representing active states */
+       if (!group->enabled) {
+               /*
+                * On the first group change after disabling PSI, conclude
+                * the current state and flush its time. This is unlikely
+                * to matter to the user, but aggregation (get_recent_times)
+                * may have already incorporated the live state into times_prev;
+                * avoid a delta sample underflow when PSI is later re-enabled.
+                */
+               if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
+                       record_times(groupc, now);
+ 
+               groupc->state_mask = state_mask;
+ 
+               write_seqcount_end(&groupc->seq);
+               return;
+       }
+ 
         for (s = 0; s < NR_PSI_STATES; s++) {
-               if (test_state(groupc->tasks, s))
+               if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
                         state_mask |= (1 << s);
         }
   
@@@ -739,9 -775,11 +775,11 @@@
          * task in a cgroup is in_memstall, the corresponding groupc
          * on that cpu is in PSI_MEM_FULL state.
          */
-       if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
+       if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
                 state_mask |= (1 << PSI_MEM_FULL);
   
+       record_times(groupc, now);
+ 
         groupc->state_mask = state_mask;
   
         write_seqcount_end(&groupc->seq);
@@@ -753,27 -791,12 +791,12 @@@
                 schedule_delayed_work(&group->avgs_work, PSI_FREQ);
   }
   
- static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
+ static inline struct psi_group *task_psi_group(struct task_struct *task)
   {
-       if (*iter == &psi_system)
-               return NULL;
- 
   #ifdef CONFIG_CGROUPS
-       if (static_branch_likely(&psi_cgroups_enabled)) {
-               struct cgroup *cgroup = NULL;
- 
-               if (!*iter)
-                       cgroup = task->cgroups->dfl_cgrp;
-               else
-                       cgroup = cgroup_parent(*iter);
- 
-               if (cgroup && cgroup_parent(cgroup)) {
-                       *iter = cgroup;
-                       return cgroup_psi(cgroup);
-               }
-       }
+       if (static_branch_likely(&psi_cgroups_enabled))
+               return cgroup_psi(task_dfl_cgroup(task));
   #endif
-       *iter = &psi_system;
         return &psi_system;
   }
   
@@@ -796,8 -819,6 +819,6 @@@ void psi_task_change(struct task_struc
   {
         int cpu = task_cpu(task);
         struct psi_group *group;
-       bool wake_clock = true;
-       void *iter = NULL;
         u64 now;
   
         if (!task->pid)
@@@ -806,19 -827,11 +827,11 @@@
         psi_flags_change(task, clear, set);
   
         now = cpu_clock(cpu);
-       /*
-        * Periodic aggregation shuts off if there is a period of no
-        * task changes, so we wake it back up if necessary. However,
-        * don't do this if the task change is the aggregation worker
-        * itself going to sleep, or we'll ping-pong forever.
-        */
-       if (unlikely((clear & TSK_RUNNING) &&
-                    (task->flags & PF_WQ_WORKER) &&
-                    wq_worker_last_func(task) == psi_avgs_work))
-               wake_clock = false;
   
-       while ((group = iterate_groups(task, &iter)))
-               psi_group_change(group, cpu, clear, set, now, wake_clock);
+       group = task_psi_group(task);
+       do {
+               psi_group_change(group, cpu, clear, set, now, true);
+       } while ((group = group->parent));
   }
   
   void psi_task_switch(struct task_struct *prev, struct task_struct *next,
@@@ -826,34 -839,30 +839,30 @@@
   {
         struct psi_group *group, *common = NULL;
         int cpu = task_cpu(prev);
-       void *iter;
         u64 now = cpu_clock(cpu);
   
         if (next->pid) {
-               bool identical_state;
- 
                 psi_flags_change(next, 0, TSK_ONCPU);
                 /*
-                * When switching between tasks that have an identical
-                * runtime state, the cgroup that contains both tasks
-                * we reach the first common ancestor. Iterate @next's
-                * ancestors only until we encounter @prev's ONCPU.
+                * Set TSK_ONCPU on @next's cgroups. If @next shares any
+                * ancestors with @prev, those will already have @prev's
+                * TSK_ONCPU bit set, and we can stop the iteration there.
                  */
-               identical_state = prev->psi_flags == next->psi_flags;
-               iter = NULL;
-               while ((group = iterate_groups(next, &iter))) {
-                       if (identical_state &&
-                           per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+               group = task_psi_group(next);
+               do {
+                       if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
+                           PSI_ONCPU) {
                                 common = group;
                                 break;
                         }
   
                         psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
-               }
+               } while ((group = group->parent));
         }
   
         if (prev->pid) {
                 int clear = TSK_ONCPU, set = 0;
+               bool wake_clock = true;
   
                 /*
                  * When we're going to sleep, psi_dequeue() lets us
@@@ -867,26 -876,74 +876,74 @@@
                                 clear |= TSK_MEMSTALL_RUNNING;
                         if (prev->in_iowait)
                                 set |= TSK_IOWAIT;
+ 
+                       /*
+                        * Periodic aggregation shuts off if there is a period of no
+                        * task changes, so we wake it back up if necessary. However,
+                        * don't do this if the task change is the aggregation worker
+                        * itself going to sleep, or we'll ping-pong forever.
+                        */
+                       if (unlikely((prev->flags & PF_WQ_WORKER) &&
+                                    wq_worker_last_func(prev) == psi_avgs_work))
+                               wake_clock = false;
                 }
   
                 psi_flags_change(prev, clear, set);
   
-               iter = NULL;
-               while ((group = iterate_groups(prev, &iter)) && group != common)
-                       psi_group_change(group, cpu, clear, set, now, true);
+               group = task_psi_group(prev);
+               do {
+                       if (group == common)
+                               break;
+                       psi_group_change(group, cpu, clear, set, now, wake_clock);
+               } while ((group = group->parent));
   
                 /*
-                * TSK_ONCPU is handled up to the common ancestor. If we're tasked
-                * with dequeuing too, finish that for the rest of the hierarchy.
+                * TSK_ONCPU is handled up to the common ancestor. If there are
+                * any other differences between the two tasks (e.g. prev goes
+                * to sleep, or only one task is memstall), finish propagating
+                * those differences all the way up to the root.
                  */
-               if (sleep) {
+               if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
                         clear &= ~TSK_ONCPU;
-                       for (; group; group = iterate_groups(prev, &iter))
-                               psi_group_change(group, cpu, clear, set, now, true);
+                       for (; group; group = group->parent)
+                               psi_group_change(group, cpu, clear, set, now, wake_clock);
                 }
         }
   }
   
+ #ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ void psi_account_irqtime(struct task_struct *task, u32 delta)
+ {
+       int cpu = task_cpu(task);
+       struct psi_group *group;
+       struct psi_group_cpu *groupc;
+       u64 now;
+ 
+       if (!task->pid)
+               return;
+ 
+       now = cpu_clock(cpu);
+ 
+       group = task_psi_group(task);
+       do {
+               if (!group->enabled)
+                       continue;
+ 
+               groupc = per_cpu_ptr(group->pcpu, cpu);
+ 
+               write_seqcount_begin(&groupc->seq);
+ 
+               record_times(groupc, now);
+               groupc->times[PSI_IRQ_FULL] += delta;
+ 
+               write_seqcount_end(&groupc->seq);
+ 
+               if (group->poll_states & (1 << PSI_IRQ_FULL))
+                       psi_schedule_poll_work(group, 1);
+       } while ((group = group->parent));
+ }
+ #endif
+ 
   /**
    * psi_memstall_enter - mark the beginning of a memory stall section
    * @flags: flags to handle nested sections
@@@ -917,7 -974,6 +974,7 @@@ void psi_memstall_enter(unsigned long *
   
         rq_unlock_irq(rq, &rf);
   }
+ +EXPORT_SYMBOL_GPL(psi_memstall_enter);
   
   /**
    * psi_memstall_leave - mark the end of an memory stall section
@@@ -947,12 -1003,11 +1004,12 @@@ void psi_memstall_leave(unsigned long *
   
         rq_unlock_irq(rq, &rf);
   }
+ +EXPORT_SYMBOL_GPL(psi_memstall_leave);
   
   #ifdef CONFIG_CGROUPS
   int psi_cgroup_alloc(struct cgroup *cgroup)
   {
-       if (static_branch_likely(&psi_disabled))
+       if (!static_branch_likely(&psi_cgroups_enabled))
                 return 0;
   
         cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL);
@@@ -965,12 -1020,13 +1022,13 @@@
                 return -ENOMEM;
         }
         group_init(cgroup->psi);
+       cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup));
         return 0;
   }
   
   void psi_cgroup_free(struct cgroup *cgroup)
   {
-       if (static_branch_likely(&psi_disabled))
+       if (!static_branch_likely(&psi_cgroups_enabled))
                 return;
   
         cancel_delayed_work_sync(&cgroup->psi->avgs_work);
@@@ -998,7 -1054,7 +1056,7 @@@ void cgroup_move_task(struct task_struc
         struct rq_flags rf;
         struct rq *rq;
   
-       if (static_branch_likely(&psi_disabled)) {
+       if (!static_branch_likely(&psi_cgroups_enabled)) {
                 /*
                  * Lame to do this here, but the scheduler cannot be locked
                  * from the outside, so we move cgroups from inside sched/.
@@@ -1046,10 -1102,45 +1104,45 @@@
   
         task_rq_unlock(rq, task, &rf);
   }
+ 
+ void psi_cgroup_restart(struct psi_group *group)
+ {
+       int cpu;
+ 
+       /*
+        * Clearing psi_group->enabled does not actually stop the per-CPU
+        * task accounting in each psi_group_cpu. It only stops the
+        * test_state() loop, record_times() and the averaging worker;
+        * see psi_group_change() for details.
+        *
+        * When cgroup PSI is being disabled, this function has nothing
+        * to sync: the cgroup pressure files are hidden, and each per-CPU
+        * psi_group_cpu sees !psi_group->enabled and only does task
+        * accounting.
+        *
+        * When cgroup PSI is being re-enabled, this function uses
+        * psi_group_change() on every possible CPU to rebuild the correct
+        * state mask from the test_state() loop over tasks[], and to
+        * restart groupc->state_start from now. It passes
+        * .clear = .set = 0 because no task state has really changed.
+        */
+       if (!group->enabled)
+               return;
+ 
+       for_each_possible_cpu(cpu) {
+               struct rq *rq = cpu_rq(cpu);
+               struct rq_flags rf;
+               u64 now;
+ 
+               rq_lock_irq(rq, &rf);   /* psi_group_change() below runs under this CPU's rq lock */
+               now = cpu_clock(cpu);
+               psi_group_change(group, cpu, 0, 0, now, true);
+               rq_unlock_irq(rq, &rf);
+       }
+ }
   #endif /* CONFIG_CGROUPS */
   
   int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
   {
+       bool only_full = false;
         int full;
         u64 now;
   
@@@ -1064,7 -1155,11 +1157,11 @@@
                 group->avg_next_update = update_averages(group, now);
         mutex_unlock(&group->avgs_lock);
   
-       for (full = 0; full < 2; full++) {
+ #ifdef CONFIG_IRQ_TIME_ACCOUNTING
+       only_full = res == PSI_IRQ;
+ #endif
+ 
+       for (full = 0; full < 2 - only_full; full++) {
                 unsigned long avg[3] = { 0, };
                 u64 total = 0;
                 int w;
@@@ -1078,7 -1173,7 +1175,7 @@@
                 }
   
                 seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
-                          full ? "full" : "some",
+                          full || only_full ? "full" : "some",
                            LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
                            LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
                            LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
@@@ -1106,6 -1201,11 +1203,11 @@@ struct psi_trigger *psi_trigger_create(
         else
                 return ERR_PTR(-EINVAL);
   
+ #ifdef CONFIG_IRQ_TIME_ACCOUNTING
+       if (res == PSI_IRQ && --state != PSI_IRQ_FULL)
+               return ERR_PTR(-EINVAL);
+ #endif
+ 
         if (state >= PSI_NONIDLE)
                 return ERR_PTR(-EINVAL);
   
@@@ -1390,6 -1490,33 +1492,33 @@@ static const struct proc_ops psi_cpu_pr
         .proc_release   = psi_fop_release,
   };
   
+ #ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ static int psi_irq_show(struct seq_file *m, void *v)
+ {     /* seq_file show callback: prints PSI_IRQ pressure of &psi_system via psi_show(); @v is unused */
+       return psi_show(m, &psi_system, PSI_IRQ);
+ }
+ 
+ static int psi_irq_open(struct inode *inode, struct file *file)
+ {     /* .proc_open handler: hands @file to psi_open() with psi_irq_show as the show callback; @inode is unused */
+       return psi_open(file, psi_irq_show);
+ }
+ 
+ static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
+                            size_t nbytes, loff_t *ppos)
+ {     /* .proc_write handler: forwards the user buffer to psi_write() for the PSI_IRQ resource; @ppos is unused */
+       return psi_write(file, user_buf, nbytes, PSI_IRQ);
+ }
+ 
+ static const struct proc_ops psi_irq_proc_ops = {     /* file operations registered for "pressure/irq" in psi_proc_init() */
+       .proc_open      = psi_irq_open,
+       .proc_read      = seq_read,
+       .proc_lseek     = seq_lseek,
+       .proc_write     = psi_irq_write,
+       .proc_poll      = psi_fop_poll,
+       .proc_release   = psi_fop_release,
+ };
+ #endif
+ 
   static int __init psi_proc_init(void)
   {
         if (psi_enable) {
@@@ -1397,6 -1524,9 +1526,9 @@@
                 proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
                 proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
                 proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
+ #ifdef CONFIG_IRQ_TIME_ACCOUNTING
+               proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops);
+ #endif
         }
         return 0;
   }
author	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 14 Oct 2022 20:03:00 +0000 (13:03 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 14 Oct 2022 20:03:00 +0000 (13:03 -0700)
		1	2
Documentation/admin-guide/cgroup-v2.rst	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/cgroup-defs.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/cgroup.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/cgroup/cgroup.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/psi.c	patch \|	diff1 \|	diff2 \|	blob \| history