Merge branch 'for-4.21' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

author Linus Torvalds <torvalds@linux-foundation.org>

Sat, 29 Dec 2018 18:57:20 +0000 (10:57 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sat, 29 Dec 2018 18:57:20 +0000 (10:57 -0800)
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 29 Dec 2018 18:57:20 +0000 (10:57 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 29 Dec 2018 18:57:20 +0000 (10:57 -0800)
diff --combined Documentation/admin-guide/cgroup-v2.rst

index baf19bf2838587dd97a52fc90ab039656b6cc1f4,07e06136a550b90b49b4163356f13b7a6ae00882..7bf3f129c68bdc75943c58764b78aba097671e25
--- 1/Documentation/admin-guide/cgroup-v2.rst
--- 2/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@@ -56,11 -56,13 +56,13 @@@ v1 is available under Documentation/cgr
            5-3-3-2. IO Latency Interface Files
        5-4. PID
          5-4-1. PID Interface Files
-      5-5. Device
-      5-6. RDMA
-        5-6-1. RDMA Interface Files
-      5-7. Misc
-        5-7-1. perf_event
+      5-5. Cpuset
+        5-5-1. Cpuset Interface Files
+      5-6. Device
+      5-7. RDMA
+        5-7-1. RDMA Interface Files
+      5-8. Misc
+        5-8-1. perf_event
        5-N. Non-normative information
          5-N-1. CPU controller root cgroup process behaviour
          5-N-2. IO controller root cgroup process behaviour
@@@ -1610,6 -1612,176 +1612,176 @@@ through fork() or clone(). These will r
   of a new process would cause a cgroup policy to be violated.
   
   
+ Cpuset
+ ------
+ 
+ The "cpuset" controller provides a mechanism for constraining
+ the CPU and memory node placement of tasks to only the resources
+ specified in the cpuset interface files in a task's current cgroup.
+ This is especially valuable on large NUMA systems where placing jobs
+ on properly sized subsets of the systems with careful processor and
+ memory placement to reduce cross-node memory access and contention
+ can improve overall system performance.
+ 
+ The "cpuset" controller is hierarchical.  That means the controller
+ cannot use CPUs or memory nodes not allowed in its parent.
+ 
+ 
+ Cpuset Interface Files
+ ~~~~~~~~~~~~~~~~~~~~~~
+ 
+   cpuset.cpus
+       A read-write multiple values file which exists on non-root
+       cpuset-enabled cgroups.
+ 
+       It lists the requested CPUs to be used by tasks within this
+       cgroup.  The actual list of CPUs to be granted, however, is
+       subjected to constraints imposed by its parent and can differ
+       from the requested CPUs.
+ 
+       The CPU numbers are comma-separated numbers or ranges.
+       For example:
+ 
+         # cat cpuset.cpus
+         0-4,6,8-10
+ 
+       An empty value indicates that the cgroup is using the same
+       setting as the nearest cgroup ancestor with a non-empty
+       "cpuset.cpus" or all the available CPUs if none is found.
+ 
+       The value of "cpuset.cpus" stays constant until the next update
+       and won't be affected by any CPU hotplug events.
+ 
+   cpuset.cpus.effective
+       A read-only multiple values file which exists on all
+       cpuset-enabled cgroups.
+ 
+       It lists the onlined CPUs that are actually granted to this
+       cgroup by its parent.  These CPUs are allowed to be used by
+       tasks within the current cgroup.
+ 
+       If "cpuset.cpus" is empty, the "cpuset.cpus.effective" file shows
+       all the CPUs from the parent cgroup that can be available to
+       be used by this cgroup.  Otherwise, it should be a subset of
+       "cpuset.cpus" unless none of the CPUs listed in "cpuset.cpus"
+       can be granted.  In this case, it will be treated just like an
+       empty "cpuset.cpus".
+ 
+       Its value will be affected by CPU hotplug events.
+ 
+   cpuset.mems
+       A read-write multiple values file which exists on non-root
+       cpuset-enabled cgroups.
+ 
+       It lists the requested memory nodes to be used by tasks within
+       this cgroup.  The actual list of memory nodes granted, however,
+       is subjected to constraints imposed by its parent and can differ
+       from the requested memory nodes.
+ 
+       The memory node numbers are comma-separated numbers or ranges.
+       For example:
+ 
+         # cat cpuset.mems
+         0-1,3
+ 
+       An empty value indicates that the cgroup is using the same
+       setting as the nearest cgroup ancestor with a non-empty
+       "cpuset.mems" or all the available memory nodes if none
+       is found.
+ 
+       The value of "cpuset.mems" stays constant until the next update
+       and won't be affected by any memory nodes hotplug events.
+ 
+   cpuset.mems.effective
+       A read-only multiple values file which exists on all
+       cpuset-enabled cgroups.
+ 
+       It lists the onlined memory nodes that are actually granted to
+       this cgroup by its parent. These memory nodes are allowed to
+       be used by tasks within the current cgroup.
+ 
+       If "cpuset.mems" is empty, it shows all the memory nodes from the
+       parent cgroup that will be available to be used by this cgroup.
+       Otherwise, it should be a subset of "cpuset.mems" unless none of
+       the memory nodes listed in "cpuset.mems" can be granted.  In this
+       case, it will be treated just like an empty "cpuset.mems".
+ 
+       Its value will be affected by memory nodes hotplug events.
+ 
+   cpuset.cpus.partition
+       A read-write single value file which exists on non-root
+       cpuset-enabled cgroups.  This flag is owned by the parent cgroup
+       and is not delegatable.
+ 
+         It accepts only the following input values when written to.
+ 
+         "root"   - a partition root
+         "member" - a non-root member of a partition
+ 
+       When set to be a partition root, the current cgroup is the
+       root of a new partition or scheduling domain that comprises
+       itself and all its descendants except those that are separate
+       partition roots themselves and their descendants.  The root
+       cgroup is always a partition root.
+ 
+       There are constraints on where a partition root can be set.
+       It can only be set in a cgroup if all the following conditions
+       are true.
+ 
+       1) The "cpuset.cpus" is not empty and the list of CPUs is
+          exclusive, i.e. they are not shared by any of its siblings.
+       2) The parent cgroup is a partition root.
+       3) The "cpuset.cpus" is also a proper subset of the parent's
+          "cpuset.cpus.effective".
+       4) There are no child cgroups with cpuset enabled.  This is for
+          eliminating corner cases that have to be handled if such a
+          condition is allowed.
+ 
+       Setting it to partition root will take the CPUs away from the
+       effective CPUs of the parent cgroup.  Once it is set, this
+       file cannot be reverted back to "member" if there are any child
+       cgroups with cpuset enabled.
+ 
+       A parent partition cannot distribute all its CPUs to its
+       child partitions.  There must be at least one cpu left in the
+       parent partition.
+ 
+       Once becoming a partition root, changes to "cpuset.cpus" are
+       generally allowed as long as the first condition above is true,
+       the change will not take away all the CPUs from the parent
+       partition and the new "cpuset.cpus" value is a superset of its
+       children's "cpuset.cpus" values.
+ 
+       Sometimes, external factors like changes to ancestors'
+       "cpuset.cpus" or cpu hotplug can cause the state of the partition
+       root to change.  On read, the "cpuset.cpus.partition" file
+       can show the following values.
+ 
+       "member"       Non-root member of a partition
+       "root"         Partition root
+       "root invalid" Invalid partition root
+ 
+       It is a partition root if the first 2 partition root conditions
+       above are true and at least one CPU from "cpuset.cpus" is
+       granted by the parent cgroup.
+ 
+       A partition root can become invalid if none of CPUs requested
+       in "cpuset.cpus" can be granted by the parent cgroup or the
+       parent cgroup is no longer a partition root itself.  In this
+       case, it is not a real partition even though the restriction
+       of the first partition root condition above will still apply.
+       The cpu affinity of all the tasks in the cgroup will then be
+       associated with CPUs in the nearest ancestor partition.
+ 
+       An invalid partition root can be transitioned back to a
+       real partition root if at least one of the requested CPUs
+       can now be granted by its parent.  In this case, the cpu
+       affinity of all the tasks in the formerly invalid partition
+       will be associated to the CPUs of the newly formed partition.
+       Changing the partition state of an invalid partition root to
+       "member" is always allowed even if child cpusets are present.
+ 
+ 
   Device controller
   -----------------
   
@@@ -1879,10 -2051,8 +2051,10 @@@ following two functions
   
     wbc_init_bio(@wbc, @bio)
         Should be called for each bio carrying writeback data and
- -      associates the bio with the inode's owner cgroup.  Can be
- -      called anytime between bio allocation and submission.
+ +      associates the bio with the inode's owner cgroup and the
+ +      corresponding request queue.  This must be called after
+ +      a queue (device) has been associated with the bio and
+ +      before submission.
   
     wbc_account_io(@wbc, @page, @bytes)
         Should be called for each data segment being written out.
@@@ -1901,7 -2071,7 +2073,7 @@@ the configuration, the bio may be execu
   the writeback session is holding shared resources, e.g. a journal
   entry, may lead to priority inversion.  There is no one easy solution
   for the problem.  Filesystems can try to work around specific problem
- -cases by skipping wbc_init_bio() or using bio_associate_blkcg()
+ +cases by skipping wbc_init_bio() and using bio_associate_blkg()
   directly.
   
   
diff --combined Documentation/admin-guide/kernel-parameters.txt

index ff4daa780ae8b1653b146bdc1bbf669741c09ec7,8b765244922891d2a712a2f87210b68c7762356c..b7c9040f547e28c46654a592210e5c8d3b918b92
--- 1/Documentation/admin-guide/kernel-parameters.txt
--- 2/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@@ -486,10 -486,14 +486,14 @@@
                         cut the overhead, others just disable the usage. So
                         only cgroup_disable=memory is actually worthy}
   
-       cgroup_no_v1=   [KNL] Disable one, multiple, all cgroup controllers in v1
-                       Format: { controller[,controller...] | "all" }
+       cgroup_no_v1=   [KNL] Disable cgroup controllers and named hierarchies in v1
+                       Format: { { controller | "all" | "named" }
+                                 [,{ controller | "all" | "named" }...] }
                         Like cgroup_disable, but only applies to cgroup v1;
                         the blacklisted controllers remain available in cgroup2.
+                       "all" blacklists all controllers and "named" disables
+                       named mounts. Specifying both "all" and "named" disables
+                       all v1 hierarchies.
   
         cgroup.memory=  [KNL] Pass options to the cgroup memory controller.
                         Format: <string>
@@@ -674,9 -678,6 +678,9 @@@
         cpuidle.off=1   [CPU_IDLE]
                         disable the cpuidle sub-system
   
+ +      cpuidle.governor=
+ +                      [CPU_IDLE] Name of the cpuidle governor to use.
+ +
         cpufreq.off=1   [CPU_FREQ]
                         disable the cpufreq sub-system
   
@@@ -859,8 -860,7 +863,8 @@@
                         causing system reset or hang due to sending
                         INIT from AP to BSP.
   
- -      disable_counter_freezing [HW]
+ +      perf_v4_pmi=    [X86,INTEL]
+ +                      Format: <bool>
                         Disable Intel PMU counter freezing feature.
                         The feature only exists starting from
                         Arch Perfmon v4 (Skylake and newer).
@@@ -2099,9 -2099,6 +2103,9 @@@
                         off
                                 Disables hypervisor mitigations and doesn't
                                 emit any warnings.
+ +                              It also drops the swap size and available
+ +                              RAM limit restriction on both hypervisor and
+ +                              bare metal.
   
                         Default is 'flush'.
   
@@@ -2833,7 -2830,7 +2837,7 @@@
                         check bypass). With this option data leaks are possible
                         in the system.
   
- -      nospectre_v2    [X86] Disable all mitigations for the Spectre variant 2
+ +      nospectre_v2    [X86,PPC_FSL_BOOK3E] Disable all mitigations for the Spectre variant 2
                         (indirect branch prediction) vulnerability. System may
                         allow data leaks with this option, which is equivalent
                         to spectre_v2=off.
@@@ -3511,10 -3508,6 +3515,10 @@@
                         before loading.
                         See Documentation/blockdev/ramdisk.txt.
   
+ +      psi=            [KNL] Enable or disable pressure stall information
+ +                      tracking.
+ +                      Format: <bool>
+ +
         psmouse.proto=  [HW,MOUSE] Highest PS2 mouse protocol extension to
                         probe for; one of (bare|imps|exps|lifebook|any).
         psmouse.rate=   [HW,MOUSE] Set desired mouse report rate, in reports
@@@ -3754,6 -3747,24 +3758,6 @@@
                         in microseconds.  The default of zero says
                         no holdoff.
   
- -      rcutorture.cbflood_inter_holdoff= [KNL]
- -                      Set holdoff time (jiffies) between successive
- -                      callback-flood tests.
- -
- -      rcutorture.cbflood_intra_holdoff= [KNL]
- -                      Set holdoff time (jiffies) between successive
- -                      bursts of callbacks within a given callback-flood
- -                      test.
- -
- -      rcutorture.cbflood_n_burst= [KNL]
- -                      Set the number of bursts making up a given
- -                      callback-flood test.  Set this to zero to
- -                      disable callback-flood testing.
- -
- -      rcutorture.cbflood_n_per_burst= [KNL]
- -                      Set the number of callbacks to be registered
- -                      in a given burst of a callback-flood test.
- -
         rcutorture.fqs_duration= [KNL]
                         Set duration of force_quiescent_state bursts
                         in microseconds.
@@@ -3766,23 -3777,6 +3770,23 @@@
                         Set wait time between force_quiescent_state bursts
                         in seconds.
   
+ +      rcutorture.fwd_progress= [KNL]
+ +                      Enable RCU grace-period forward-progress testing
+ +                      for the types of RCU supporting this notion.
+ +
+ +      rcutorture.fwd_progress_div= [KNL]
+ +                      Specify the fraction of a CPU-stall-warning
+ +                      period to do tight-loop forward-progress testing.
+ +
+ +      rcutorture.fwd_progress_holdoff= [KNL]
+ +                      Number of seconds to wait between successive
+ +                      forward-progress tests.
+ +
+ +      rcutorture.fwd_progress_need_resched= [KNL]
+ +                      Enclose cond_resched() calls within checks for
+ +                      need_resched() during tight-loop forward-progress
+ +                      testing.
+ +
         rcutorture.gp_cond= [KNL]
                         Use conditional/asynchronous update-side
                         primitives, if available.
@@@ -4204,13 -4198,9 +4208,13 @@@
   
         spectre_v2=     [X86] Control mitigation of Spectre variant 2
                         (indirect branch speculation) vulnerability.
+ +                      The default operation protects the kernel from
+ +                      user space attacks.
   
- -                      on   - unconditionally enable
- -                      off  - unconditionally disable
+ +                      on   - unconditionally enable, implies
+ +                             spectre_v2_user=on
+ +                      off  - unconditionally disable, implies
+ +                             spectre_v2_user=off
                         auto - kernel detects whether your CPU model is
                                vulnerable
   
@@@ -4220,12 -4210,6 +4224,12 @@@
                         CONFIG_RETPOLINE configuration option, and the
                         compiler with which the kernel was built.
   
+ +                      Selecting 'on' will also enable the mitigation
+ +                      against user space to user space task attacks.
+ +
+ +                      Selecting 'off' will disable both the kernel and
+ +                      the user space protections.
+ +
                         Specific mitigations can also be selected manually:
   
                         retpoline         - replace indirect branches
@@@ -4235,48 -4219,6 +4239,48 @@@
                         Not specifying this option is equivalent to
                         spectre_v2=auto.
   
+ +      spectre_v2_user=
+ +                      [X86] Control mitigation of Spectre variant 2
+ +                      (indirect branch speculation) vulnerability between
+ +                      user space tasks
+ +
+ +                      on      - Unconditionally enable mitigations. Is
+ +                                enforced by spectre_v2=on
+ +
+ +                      off     - Unconditionally disable mitigations. Is
+ +                                enforced by spectre_v2=off
+ +
+ +                      prctl   - Indirect branch speculation is enabled,
+ +                                but mitigation can be enabled via prctl
+ +                                per thread.  The mitigation control state
+ +                                is inherited on fork.
+ +
+ +                      prctl,ibpb
+ +                              - Like "prctl" above, but only STIBP is
+ +                                controlled per thread. IBPB is issued
+ +                                always when switching between different user
+ +                                space processes.
+ +
+ +                      seccomp
+ +                              - Same as "prctl" above, but all seccomp
+ +                                threads will enable the mitigation unless
+ +                                they explicitly opt out.
+ +
+ +                      seccomp,ibpb
+ +                              - Like "seccomp" above, but only STIBP is
+ +                                controlled per thread. IBPB is issued
+ +                                always when switching between different
+ +                                user space processes.
+ +
+ +                      auto    - Kernel selects the mitigation depending on
+ +                                the available CPU features and vulnerability.
+ +
+ +                      Default mitigation:
+ +                      If CONFIG_SECCOMP=y then "seccomp", otherwise "prctl"
+ +
+ +                      Not specifying this option is equivalent to
+ +                      spectre_v2_user=auto.
+ +
         spec_store_bypass_disable=
                         [HW] Control Speculative Store Bypass (SSB) Disable mitigation
                         (Speculative Store Bypass vulnerability)
@@@ -4775,8 -4717,6 +4779,8 @@@
                                         prevent spurious wakeup);
                                 n = USB_QUIRK_DELAY_CTRL_MSG (Device needs a
                                         pause after every control message);
+ +                              o = USB_QUIRK_HUB_SLOW_RESET (Hub needs extra
+ +                                      delay after resetting its port);
                         Example: quirks=0781:5580:bk,0a5c:5834:gij
   
         usbhid.mousepoll=
diff --combined kernel/cgroup/cgroup.c

index 39eb36ba36ad2837d091c61352597459d67b4dc3,879c9f191f6625caff120351666aa3423fe6c3b0..f31bd61c946645a30c9f43814bec588c9034677c
--- 1/kernel/cgroup/cgroup.c
--- 2/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@@ -86,6 -86,7 +86,7 @@@ EXPORT_SYMBOL_GPL(css_set_lock)
   
   DEFINE_SPINLOCK(trace_cgroup_path_lock);
   char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
+ bool cgroup_debug __read_mostly;
   
   /*
    * Protects cgroup_idr and css_idr so that IDs can be released without
@@@ -493,7 -494,7 +494,7 @@@ static struct cgroup_subsys_state *cgro
   }
   
   /**
- - * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ + * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
    * @cgrp: the cgroup of interest
    * @ss: the subsystem of interest (%NULL returns @cgrp->self)
    *
@@@ -502,8 -503,8 +503,8 @@@
    * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
    * function is guaranteed to return non-NULL css.
    */
- -static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
- -                                              struct cgroup_subsys *ss)
+ +static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
+ +                                                      struct cgroup_subsys *ss)
   {
         lockdep_assert_held(&cgroup_mutex);
   
@@@ -523,35 -524,6 +524,35 @@@
         return cgroup_css(cgrp, ss);
   }
   
+ +/**
+ + * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ + * @cgrp: the cgroup of interest
+ + * @ss: the subsystem of interest
+ + *
+ + * Find and get the effective css of @cgrp for @ss.  The effective css is
+ + * defined as the matching css of the nearest ancestor including self which
+ + * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
+ + * the root css is returned, so this function always returns a valid css.
+ + *
+ + * The returned css is not guaranteed to be online, and therefore it is the
+ + * caller's responsibility to tryget a reference for it.
+ + */
+ +struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
+ +                                       struct cgroup_subsys *ss)
+ +{
+ +      struct cgroup_subsys_state *css;
+ +
+ +      do {
+ +              css = cgroup_css(cgrp, ss);
+ +
+ +              if (css)
+ +                      return css;
+ +              cgrp = cgroup_parent(cgrp);
+ +      } while (cgrp);
+ +
+ +      return init_css_set.subsys[ss->id];
+ +}
+ +
   /**
    * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
    * @cgrp: the cgroup of interest
@@@ -634,11 -606,10 +635,11 @@@ EXPORT_SYMBOL_GPL(of_css)
    *
    * Should be called under cgroup_[tree_]mutex.
    */
- -#define for_each_e_css(css, ssid, cgrp)                                       \
- -      for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)        \
- -              if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
- -                      ;                                               \
+ +#define for_each_e_css(css, ssid, cgrp)                                           \
+ +      for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)            \
+ +              if (!((css) = cgroup_e_css_by_mask(cgrp,                    \
+ +                                                 cgroup_subsys[(ssid)]))) \
+ +                      ;                                                   \
                 else
   
   /**
@@@ -1037,7 -1008,7 +1038,7 @@@ static struct css_set *find_existing_cs
                          * @ss is in this hierarchy, so we want the
                          * effective css from @cgrp.
                          */
- -                      template[i] = cgroup_e_css(cgrp, ss);
+ +                      template[i] = cgroup_e_css_by_mask(cgrp, ss);
                 } else {
                         /*
                          * @ss is not in this hierarchy, so we don't want
@@@ -1429,12 -1400,15 +1430,15 @@@ static char *cgroup_file_name(struct cg
         struct cgroup_subsys *ss = cft->ss;
   
         if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
-           !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
-               snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
-                        cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
+           !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
+               const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";
+ 
+               snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
+                        dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
                          cft->name);
-       else
+       } else {
                 strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+       }
         return buf;
   }
   
@@@ -1774,7 -1748,7 +1778,7 @@@ static int parse_cgroup_root_flags(cha
   
         *root_flags = 0;
   
-       if (!data)
+       if (!data || *data == '\0')
                 return 0;
   
         while ((token = strsep(&data, ",")) != NULL) {
@@@ -3054,7 -3028,7 +3058,7 @@@ static int cgroup_apply_control(struct 
                 return ret;
   
         /*
- -       * At this point, cgroup_e_css() results reflect the new csses
+ +       * At this point, cgroup_e_css_by_mask() results reflect the new csses
          * making the following cgroup_update_dfl_csses() properly update
          * css associations of all tasks in the subtree.
          */
@@@ -3669,7 -3643,8 +3673,8 @@@ restart
                         continue;
                 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
                         continue;
- 
+               if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
+                       continue;
                 if (is_add) {
                         ret = cgroup_add_file(css, cgrp, cft);
                         if (ret) {
@@@ -4232,20 -4207,25 +4237,25 @@@ static void css_task_iter_advance(struc
   
         lockdep_assert_held(&css_set_lock);
   repeat:
-       /*
-        * Advance iterator to find next entry.  cset->tasks is consumed
-        * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
-        * next cset.
-        */
-       next = it->task_pos->next;
+       if (it->task_pos) {
+               /*
+                * Advance iterator to find next entry.  cset->tasks is
+                * consumed first and then ->mg_tasks.  After ->mg_tasks,
+                * we move onto the next cset.
+                */
+               next = it->task_pos->next;
   
-       if (next == it->tasks_head)
-               next = it->mg_tasks_head->next;
+               if (next == it->tasks_head)
+                       next = it->mg_tasks_head->next;
   
-       if (next == it->mg_tasks_head)
+               if (next == it->mg_tasks_head)
+                       css_task_iter_advance_css_set(it);
+               else
+                       it->task_pos = next;
+       } else {
+               /* called from start, proceed to the first cset */
                 css_task_iter_advance_css_set(it);
-       else
-               it->task_pos = next;
+       }
   
         /* if PROCS, skip over tasks which aren't group leaders */
         if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
@@@ -4285,7 -4265,7 +4295,7 @@@ void css_task_iter_start(struct cgroup_
   
         it->cset_head = it->cset_pos;
   
-       css_task_iter_advance_css_set(it);
+       css_task_iter_advance(it);
   
         spin_unlock_irq(&css_set_lock);
   }
@@@ -5373,7 -5353,7 +5383,7 @@@ int __init cgroup_init(void
         cgroup_rstat_boot();
   
         /*
- -       * The latency of the synchronize_sched() is too high for cgroups,
+ +       * The latency of the synchronize_rcu() is too high for cgroups,
          * avoid it at the cost of forcing all readers into the slow path.
          */
         rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
@@@ -5773,6 -5753,16 +5783,16 @@@ static int __init cgroup_disable(char *
   }
   __setup("cgroup_disable=", cgroup_disable);
   
+ void __init __weak enable_debug_cgroup(void) { }
+ 
+ static int __init enable_cgroup_debug(char *str)
+ {
+       cgroup_debug = true;
+       enable_debug_cgroup();
+       return 1;
+ }
+ __setup("cgroup_debug", enable_cgroup_debug);
+ 
   /**
    * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
    * @dentry: directory dentry of interest
@@@ -6008,10 -5998,8 +6028,8 @@@ static ssize_t show_delegatable_files(s
   
                 ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
   
-               if (unlikely(ret >= size)) {
-                       WARN_ON(1);
+               if (WARN_ON(ret >= size))
                         break;
-               }
         }
   
         return ret;
diff --combined kernel/cgroup/cpuset.c

index 9510a5b32eaf6be16dd2817028235410f673d7dc,f0decd8165e7927c42a6d5989d5819d47e263cd7..479743db6c37093f0f6d4e6eebb9d127ab011050
--- 1/kernel/cgroup/cpuset.c
--- 2/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@@ -109,6 -109,16 +109,16 @@@ struct cpuset 
         cpumask_var_t effective_cpus;
         nodemask_t effective_mems;
   
+       /*
+        * CPUs allocated to child sub-partitions (default hierarchy only)
+        * - CPUs granted by the parent = effective_cpus U subparts_cpus
+        * - effective_cpus and subparts_cpus are mutually exclusive.
+        *
+        * effective_cpus contains only onlined CPUs, but subparts_cpus
+        * may have offlined ones.
+        */
+       cpumask_var_t subparts_cpus;
+ 
         /*
          * This is old Memory Nodes tasks took on.
          *
@@@ -134,6 -144,47 +144,47 @@@
   
         /* for custom sched domain */
         int relax_domain_level;
+ 
+       /* number of CPUs in subparts_cpus */
+       int nr_subparts_cpus;
+ 
+       /* partition root state */
+       int partition_root_state;
+ 
+       /*
+        * Default hierarchy only:
+        * use_parent_ecpus - set if using parent's effective_cpus
+        * child_ecpus_count - # of children with use_parent_ecpus set
+        */
+       int use_parent_ecpus;
+       int child_ecpus_count;
+ };
+ 
+ /*
+  * Partition root states:
+  *
+  *   0 - not a partition root
+  *
+  *   1 - partition root
+  *
+  *  -1 - invalid partition root
+  *       None of the cpus in cpus_allowed can be put into the parent's
+  *       subparts_cpus. In this case, the cpuset is not a real partition
+  *       root anymore.  However, the CPU_EXCLUSIVE bit will still be set
+  *       and the cpuset can be restored back to a partition root if the
+  *       parent cpuset can give more CPUs back to this child cpuset.
+  */
+ #define PRS_DISABLED          0
+ #define PRS_ENABLED           1
+ #define PRS_ERROR             -1
+ 
+ /*
+  * Temporary cpumasks for working with partitions that are passed among
+  * functions to avoid memory allocation in inner functions.
+  */
+ struct tmpmasks {
+       cpumask_var_t addmask, delmask; /* For partition root */
+       cpumask_var_t new_cpus;         /* For update_cpumasks_hier() */
   };
   
   static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
@@@ -218,9 -269,15 +269,15 @@@ static inline int is_spread_slab(const 
         return test_bit(CS_SPREAD_SLAB, &cs->flags);
   }
   
+ static inline int is_partition_root(const struct cpuset *cs)
+ {
+       return cs->partition_root_state > 0;
+ }
+ 
   static struct cpuset top_cpuset = {
         .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
                   (1 << CS_MEM_EXCLUSIVE)),
+       .partition_root_state = PRS_ENABLED,
   };
   
   /**
@@@ -418,6 -475,65 +475,65 @@@ static int is_cpuset_subset(const struc
                 is_mem_exclusive(p) <= is_mem_exclusive(q);
   }
   
+ /**
+  * alloc_cpumasks - allocate three cpumasks for cpuset
+  * @cs:  the cpuset that has cpumasks to be allocated.
+  * @tmp: the tmpmasks structure pointer
+  * Return: 0 if successful, -ENOMEM otherwise.
+  *
+  * Only one of the two input arguments should be non-NULL.
+  */
+ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
+ {
+       cpumask_var_t *pmask1, *pmask2, *pmask3;
+ 
+       if (cs) {
+               pmask1 = &cs->cpus_allowed;
+               pmask2 = &cs->effective_cpus;
+               pmask3 = &cs->subparts_cpus;
+       } else {
+               pmask1 = &tmp->new_cpus;
+               pmask2 = &tmp->addmask;
+               pmask3 = &tmp->delmask;
+       }
+ 
+       if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
+               return -ENOMEM;
+ 
+       if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
+               goto free_one;
+ 
+       if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
+               goto free_two;
+ 
+       return 0;
+ 
+ free_two:
+       free_cpumask_var(*pmask2);
+ free_one:
+       free_cpumask_var(*pmask1);
+       return -ENOMEM;
+ }
+ 
+ /**
+  * free_cpumasks - free cpumasks in a cpuset and/or a tmpmasks structure
+  * @cs:  the cpuset that has cpumasks to be freed.
+  * @tmp: the tmpmasks structure pointer
+  */
+ static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
+ {
+       if (cs) {
+               free_cpumask_var(cs->cpus_allowed);
+               free_cpumask_var(cs->effective_cpus);
+               free_cpumask_var(cs->subparts_cpus);
+       }
+       if (tmp) {
+               free_cpumask_var(tmp->new_cpus);
+               free_cpumask_var(tmp->addmask);
+               free_cpumask_var(tmp->delmask);
+       }
+ }
+ 
   /**
    * alloc_trial_cpuset - allocate a trial cpuset
    * @cs: the cpuset that the trial cpuset duplicates
@@@ -430,31 -546,24 +546,24 @@@ static struct cpuset *alloc_trial_cpuse
         if (!trial)
                 return NULL;
   
-       if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
-               goto free_cs;
-       if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
-               goto free_cpus;
+       if (alloc_cpumasks(trial, NULL)) {
+               kfree(trial);
+               return NULL;
+       }
   
         cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
         cpumask_copy(trial->effective_cpus, cs->effective_cpus);
         return trial;
- 
- free_cpus:
-       free_cpumask_var(trial->cpus_allowed);
- free_cs:
-       kfree(trial);
-       return NULL;
   }
   
   /**
-  * free_trial_cpuset - free the trial cpuset
-  * @trial: the trial cpuset to be freed
+  * free_cpuset - free the cpuset
+  * @cs: the cpuset to be freed
    */
- static void free_trial_cpuset(struct cpuset *trial)
+ static inline void free_cpuset(struct cpuset *cs)
   {
-       free_cpumask_var(trial->effective_cpus);
-       free_cpumask_var(trial->cpus_allowed);
-       kfree(trial);
+       free_cpumasks(cs, NULL);
+       kfree(cs);
   }
   
   /*
@@@ -660,13 -769,14 +769,14 @@@ static int generate_sched_domains(cpuma
         int ndoms = 0;          /* number of sched domains in result */
         int nslot;              /* next empty doms[] struct cpumask slot */
         struct cgroup_subsys_state *pos_css;
+       bool root_load_balance = is_sched_load_balance(&top_cpuset);
   
         doms = NULL;
         dattr = NULL;
         csa = NULL;
   
         /* Special case for the 99% of systems with one, full, sched domain */
-       if (is_sched_load_balance(&top_cpuset)) {
+       if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
                 ndoms = 1;
                 doms = alloc_sched_domains(ndoms);
                 if (!doms)
@@@ -689,6 -799,8 +799,8 @@@
         csn = 0;
   
         rcu_read_lock();
+       if (root_load_balance)
+               csa[csn++] = &top_cpuset;
         cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
                 if (cp == &top_cpuset)
                         continue;
@@@ -699,6 -811,9 +811,9 @@@
                  * parent's cpus, so just skip them, and then we call
                  * update_domain_attr_tree() to calc relax_domain_level of
                  * the corresponding sched domain.
+                *
+                * If root is load-balancing, we can skip @cp if it
+                * is a subset of the root's effective_cpus.
                  */
                 if (!cpumask_empty(cp->cpus_allowed) &&
                     !(is_sched_load_balance(cp) &&
@@@ -706,11 -821,16 +821,16 @@@
                                          housekeeping_cpumask(HK_FLAG_DOMAIN))))
                         continue;
   
+               if (root_load_balance &&
+                   cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
+                       continue;
+ 
                 if (is_sched_load_balance(cp))
                         csa[csn++] = cp;
   
-               /* skip @cp's subtree */
-               pos_css = css_rightmost_descendant(pos_css);
+               /* skip @cp's subtree if not a partition root */
+               if (!is_partition_root(cp))
+                       pos_css = css_rightmost_descendant(pos_css);
         }
         rcu_read_unlock();
   
@@@ -838,7 -958,12 +958,12 @@@ static void rebuild_sched_domains_locke
          * passing doms with offlined cpu to partition_sched_domains().
          * Anyways, hotplug work item will rebuild sched domains.
          */
-       if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
+       if (!top_cpuset.nr_subparts_cpus &&
+           !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
+               goto out;
+ 
+       if (top_cpuset.nr_subparts_cpus &&
+          !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
                 goto out;
   
         /* Generate domain masks and attrs */
@@@ -881,10 -1006,248 +1006,248 @@@ static void update_tasks_cpumask(struc
         css_task_iter_end(&it);
   }
   
+ /**
+  * compute_effective_cpumask - Compute the effective cpumask of the cpuset
+  * @new_cpus: the temp variable for the new effective_cpus mask
+  * @cs: the cpuset that needs its new effective_cpus mask recomputed
+  * @parent: the parent cpuset
+  *
+  * If the parent has subpartition CPUs, include them in the list of
+  * allowable CPUs in computing the new effective_cpus mask. Since offlined
+  * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
+  * to mask those out.
+  */
+ static void compute_effective_cpumask(struct cpumask *new_cpus,
+                                     struct cpuset *cs, struct cpuset *parent)
+ {
+       if (parent->nr_subparts_cpus) {
+               cpumask_or(new_cpus, parent->effective_cpus,
+                          parent->subparts_cpus);
+               cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
+               cpumask_and(new_cpus, new_cpus, cpu_active_mask);
+       } else {
+               cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
+       }
+ }
+ 
+ /*
+  * Commands for update_parent_subparts_cpumask
+  */
+ enum subparts_cmd {
+       partcmd_enable,         /* Enable partition root         */
+       partcmd_disable,        /* Disable partition root        */
+       partcmd_update,         /* Update parent's subparts_cpus */
+ };
+ 
+ /**
+  * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
+  * @cpuset:  The cpuset that requests change in partition root state
+  * @cmd:     Partition root state change command
+  * @newmask: Optional new cpumask for partcmd_update
+  * @tmp:     Temporary addmask and delmask
+  * Return:   0, 1 or an error code
+  *
+  * For partcmd_enable, the cpuset is being transformed from a non-partition
+  * root to a partition root. The cpus_allowed mask of the given cpuset will
+  * be put into parent's subparts_cpus and taken away from parent's
+  * effective_cpus. The function will return 0 if all the CPUs listed in
+  * cpus_allowed can be granted or an error code will be returned.
+  *
+  * For partcmd_disable, the cpuset is being transformed from a partition
+  * root back to a non-partition root. Any CPUs in cpus_allowed that are in
+  * parent's subparts_cpus will be taken away from that cpumask and put back
+  * into parent's effective_cpus. 0 should always be returned.
+  *
+  * For partcmd_update, if the optional newmask is specified, the cpu
+  * list is to be changed from cpus_allowed to newmask. Otherwise,
+  * cpus_allowed is assumed to remain the same. The cpuset should either
+  * be a partition root or an invalid partition root. The partition root
+  * state may change if newmask is NULL and none of the requested CPUs can
+  * be granted by the parent. The function will return 1 if changes to
+  * parent's subparts_cpus and effective_cpus happen or 0 otherwise.
+  * Error code should only be returned when newmask is non-NULL.
+  *
+  * The partcmd_enable and partcmd_disable commands are used by
+  * update_prstate(). The partcmd_update command is used by
+  * update_cpumasks_hier() with newmask NULL and update_cpumask() with
+  * newmask set.
+  *
+  * The checking is more strict when enabling partition root than the
+  * other two commands.
+  *
+  * Because of the implicit cpu exclusive nature of a partition root,
+  * cpumask changes that violate the cpu exclusivity rule will not be
+  * permitted when checked by validate_change(). The validate_change()
+  * function will also prevent any changes to the cpu list if it is not
+  * a superset of children's cpu lists.
+  */
+ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
+                                         struct cpumask *newmask,
+                                         struct tmpmasks *tmp)
+ {
+       struct cpuset *parent = parent_cs(cpuset);
+       int adding;     /* Moving cpus from effective_cpus to subparts_cpus */
+       int deleting;   /* Moving cpus from subparts_cpus to effective_cpus */
+       bool part_error = false;        /* Partition error? */
+ 
+       lockdep_assert_held(&cpuset_mutex);
+ 
+       /*
+        * The parent must be a partition root.
+        * The new cpumask, if present, or the current cpus_allowed must
+        * not be empty.
+        */
+       if (!is_partition_root(parent) ||
+          (newmask && cpumask_empty(newmask)) ||
+          (!newmask && cpumask_empty(cpuset->cpus_allowed)))
+               return -EINVAL;
+ 
+       /*
+        * Enabling/disabling partition root is not allowed if there are
+        * online children.
+        */
+       if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
+               return -EBUSY;
+ 
+       /*
+        * Enabling partition root is not allowed if not all the CPUs
+        * can be granted from parent's effective_cpus or if no CPU
+        * would be left in parent's effective_cpus after that.
+        */
+       if ((cmd == partcmd_enable) &&
+          (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
+            cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
+               return -EINVAL;
+ 
+       /*
+        * A cpumask update cannot make parent's effective_cpus become empty.
+        */
+       adding = deleting = false;
+       if (cmd == partcmd_enable) {
+               cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
+               adding = true;
+       } else if (cmd == partcmd_disable) {
+               deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
+                                      parent->subparts_cpus);
+       } else if (newmask) {
+               /*
+                * partcmd_update with newmask:
+                *
+                * delmask = cpus_allowed & ~newmask & parent->subparts_cpus
+                * addmask = newmask & parent->effective_cpus
+                *                   & ~parent->subparts_cpus
+                */
+               cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
+               deleting = cpumask_and(tmp->delmask, tmp->delmask,
+                                      parent->subparts_cpus);
+ 
+               cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
+               adding = cpumask_andnot(tmp->addmask, tmp->addmask,
+                                       parent->subparts_cpus);
+               /*
+                * Return error if the new effective_cpus could become empty.
+                */
+               if (adding &&
+                   cpumask_equal(parent->effective_cpus, tmp->addmask)) {
+                       if (!deleting)
+                               return -EINVAL;
+                       /*
+                        * As some of the CPUs in subparts_cpus might have
+                        * been offlined, we need to compute the real delmask
+                        * to confirm that.
+                        */
+                       if (!cpumask_and(tmp->addmask, tmp->delmask,
+                                        cpu_active_mask))
+                               return -EINVAL;
+                       cpumask_copy(tmp->addmask, parent->effective_cpus);
+               }
+       } else {
+               /*
+                * partcmd_update w/o newmask:
+                *
+                * addmask = cpus_allowed & parent->effective_cpus
+                *
+                * Note that parent's subparts_cpus may have been
+                * pre-shrunk in case there is a change in the cpu list.
+                * So no deletion is needed.
+                */
+               adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
+                                    parent->effective_cpus);
+               part_error = cpumask_equal(tmp->addmask,
+                                          parent->effective_cpus);
+       }
+ 
+       if (cmd == partcmd_update) {
+               int prev_prs = cpuset->partition_root_state;
+ 
+               /*
+                * Check for possible transition between PRS_ENABLED
+                * and PRS_ERROR.
+                */
+               switch (cpuset->partition_root_state) {
+               case PRS_ENABLED:
+                       if (part_error)
+                               cpuset->partition_root_state = PRS_ERROR;
+                       break;
+               case PRS_ERROR:
+                       if (!part_error)
+                               cpuset->partition_root_state = PRS_ENABLED;
+                       break;
+               }
+               /*
+                * Set part_error if previously in invalid state.
+                */
+               part_error = (prev_prs == PRS_ERROR);
+       }
+ 
+       if (!part_error && (cpuset->partition_root_state == PRS_ERROR))
+               return 0;       /* Nothing need to be done */
+ 
+       if (cpuset->partition_root_state == PRS_ERROR) {
+               /*
+                * Remove all its cpus from parent's subparts_cpus.
+                */
+               adding = false;
+               deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
+                                      parent->subparts_cpus);
+       }
+ 
+       if (!adding && !deleting)
+               return 0;
+ 
+       /*
+        * Change the parent's subparts_cpus.
+        * Newly added CPUs will be removed from effective_cpus and
+        * newly deleted ones will be added back to effective_cpus.
+        */
+       spin_lock_irq(&callback_lock);
+       if (adding) {
+               cpumask_or(parent->subparts_cpus,
+                          parent->subparts_cpus, tmp->addmask);
+               cpumask_andnot(parent->effective_cpus,
+                              parent->effective_cpus, tmp->addmask);
+       }
+       if (deleting) {
+               cpumask_andnot(parent->subparts_cpus,
+                              parent->subparts_cpus, tmp->delmask);
+               /*
+                * Some of the CPUs in subparts_cpus might have been offlined.
+                */
+               cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
+               cpumask_or(parent->effective_cpus,
+                          parent->effective_cpus, tmp->delmask);
+       }
+ 
+       parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
+       spin_unlock_irq(&callback_lock);
+ 
+       return cmd == partcmd_update;
+ }
+ 
   /*
    * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
-  * @cs: the cpuset to consider
-  * @new_cpus: temp variable for calculating new effective_cpus
+  * @cs:  the cpuset to consider
+  * @tmp: temp variables for calculating effective_cpus & partition setup
    *
    * When congifured cpumask is changed, the effective cpumasks of this cpuset
    * and all its descendants need to be updated.
@@@ -893,7 -1256,7 +1256,7 @@@
    *
    * Called with cpuset_mutex held
    */
- static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
+ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
   {
         struct cpuset *cp;
         struct cgroup_subsys_state *pos_css;
@@@ -903,27 -1266,115 +1266,115 @@@
         cpuset_for_each_descendant_pre(cp, pos_css, cs) {
                 struct cpuset *parent = parent_cs(cp);
   
-               cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
+               compute_effective_cpumask(tmp->new_cpus, cp, parent);
   
                 /*
                  * If it becomes empty, inherit the effective mask of the
                  * parent, which is guaranteed to have some CPUs.
                  */
-               if (is_in_v2_mode() && cpumask_empty(new_cpus))
-                       cpumask_copy(new_cpus, parent->effective_cpus);
+               if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
+                       cpumask_copy(tmp->new_cpus, parent->effective_cpus);
+                       if (!cp->use_parent_ecpus) {
+                               cp->use_parent_ecpus = true;
+                               parent->child_ecpus_count++;
+                       }
+               } else if (cp->use_parent_ecpus) {
+                       cp->use_parent_ecpus = false;
+                       WARN_ON_ONCE(!parent->child_ecpus_count);
+                       parent->child_ecpus_count--;
+               }
   
-               /* Skip the whole subtree if the cpumask remains the same. */
-               if (cpumask_equal(new_cpus, cp->effective_cpus)) {
+               /*
+                * Skip the whole subtree if the cpumask remains the same
+                * and has no partition root state.
+                */
+               if (!cp->partition_root_state &&
+                   cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
                         pos_css = css_rightmost_descendant(pos_css);
                         continue;
                 }
   
+               /*
+                * update_parent_subparts_cpumask() should have been called
+                * for cs already in update_cpumask(). We should also call
+                * update_tasks_cpumask() again for tasks in the parent
+                * cpuset if the parent's subparts_cpus changes.
+                */
+               if ((cp != cs) && cp->partition_root_state) {
+                       switch (parent->partition_root_state) {
+                       case PRS_DISABLED:
+                               /*
+                                * If parent is not a partition root or an
+                                * invalid partition root, clear the partition root
+                                * state and the CS_CPU_EXCLUSIVE flag.
+                                */
+                               WARN_ON_ONCE(cp->partition_root_state
+                                            != PRS_ERROR);
+                               cp->partition_root_state = 0;
+ 
+                               /*
+                                * clear_bit() is an atomic operation and
+                                * readers aren't interested in the state
+                                * of CS_CPU_EXCLUSIVE anyway. So we can
+                                * just update the flag without holding
+                                * the callback_lock.
+                                */
+                               clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
+                               break;
+ 
+                       case PRS_ENABLED:
+                               if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
+                                       update_tasks_cpumask(parent);
+                               break;
+ 
+                       case PRS_ERROR:
+                               /*
+                                * When parent is invalid, the child has to be invalid too.
+                                */
+                               cp->partition_root_state = PRS_ERROR;
+                               if (cp->nr_subparts_cpus) {
+                                       cp->nr_subparts_cpus = 0;
+                                       cpumask_clear(cp->subparts_cpus);
+                               }
+                               break;
+                       }
+               }
+ 
                 if (!css_tryget_online(&cp->css))
                         continue;
                 rcu_read_unlock();
   
                 spin_lock_irq(&callback_lock);
-               cpumask_copy(cp->effective_cpus, new_cpus);
+ 
+               cpumask_copy(cp->effective_cpus, tmp->new_cpus);
+               if (cp->nr_subparts_cpus &&
+                  (cp->partition_root_state != PRS_ENABLED)) {
+                       cp->nr_subparts_cpus = 0;
+                       cpumask_clear(cp->subparts_cpus);
+               } else if (cp->nr_subparts_cpus) {
+                       /*
+                        * Make sure that effective_cpus & subparts_cpus
+                        * are mutually exclusive.
+                        *
+                        * In the unlikely event that effective_cpus
+                        * becomes empty, we clear cp->nr_subparts_cpus and
+                        * let its child partition roots compete for
+                        * CPUs again.
+                        */
+                       cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
+                                      cp->subparts_cpus);
+                       if (cpumask_empty(cp->effective_cpus)) {
+                               cpumask_copy(cp->effective_cpus, tmp->new_cpus);
+                               cpumask_clear(cp->subparts_cpus);
+                               cp->nr_subparts_cpus = 0;
+                       } else if (!cpumask_subset(cp->subparts_cpus,
+                                                  tmp->new_cpus)) {
+                               cpumask_andnot(cp->subparts_cpus,
+                                       cp->subparts_cpus, tmp->new_cpus);
+                               cp->nr_subparts_cpus
+                                       = cpumask_weight(cp->subparts_cpus);
+                       }
+               }
                 spin_unlock_irq(&callback_lock);
   
                 WARN_ON(!is_in_v2_mode() &&
@@@ -932,11 -1383,15 +1383,15 @@@
                 update_tasks_cpumask(cp);
   
                 /*
-                * If the effective cpumask of any non-empty cpuset is changed,
-                * we need to rebuild sched domains.
+                * On legacy hierarchy, if the effective cpumask of any non-
+                * empty cpuset is changed, we need to rebuild sched domains.
+                * On default hierarchy, the cpuset needs to be a partition
+                * root as well.
                  */
                 if (!cpumask_empty(cp->cpus_allowed) &&
-                   is_sched_load_balance(cp))
+                   is_sched_load_balance(cp) &&
+                  (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+                   is_partition_root(cp)))
                         need_rebuild_sched_domains = true;
   
                 rcu_read_lock();
@@@ -948,6 -1403,35 +1403,35 @@@
                 rebuild_sched_domains_locked();
   }
   
+ /**
+  * update_sibling_cpumasks - Update siblings cpumasks
+  * @parent:  Parent cpuset
+  * @cs:      Current cpuset
+  * @tmp:     Temp variables
+  */
+ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
+                                   struct tmpmasks *tmp)
+ {
+       struct cpuset *sibling;
+       struct cgroup_subsys_state *pos_css;
+ 
+       /*
+        * Check all its siblings and call update_cpumasks_hier()
+        * if their use_parent_ecpus flag is set in order for them
+        * to use the right effective_cpus value.
+        */
+       rcu_read_lock();
+       cpuset_for_each_child(sibling, pos_css, parent) {
+               if (sibling == cs)
+                       continue;
+               if (!sibling->use_parent_ecpus)
+                       continue;
+ 
+               update_cpumasks_hier(sibling, tmp);
+       }
+       rcu_read_unlock();
+ }
+ 
   /**
    * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
    * @cs: the cpuset to consider
@@@ -958,6 -1442,7 +1442,7 @@@ static int update_cpumask(struct cpuse
                           const char *buf)
   {
         int retval;
+       struct tmpmasks tmp;
   
         /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
         if (cs == &top_cpuset)
@@@ -989,12 -1474,50 +1474,50 @@@
         if (retval < 0)
                 return retval;
   
+ #ifdef CONFIG_CPUMASK_OFFSTACK
+       /*
+        * Use the cpumasks in trialcs for tmpmasks when they are pointers
+        * to allocated cpumasks.
+        */
+       tmp.addmask  = trialcs->subparts_cpus;
+       tmp.delmask  = trialcs->effective_cpus;
+       tmp.new_cpus = trialcs->cpus_allowed;
+ #endif
+ 
+       if (cs->partition_root_state) {
+               /* Cpumask of a partition root cannot be empty */
+               if (cpumask_empty(trialcs->cpus_allowed))
+                       return -EINVAL;
+               if (update_parent_subparts_cpumask(cs, partcmd_update,
+                                       trialcs->cpus_allowed, &tmp) < 0)
+                       return -EINVAL;
+       }
+ 
         spin_lock_irq(&callback_lock);
         cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
+ 
+       /*
+        * Make sure that subparts_cpus is a subset of cpus_allowed.
+        */
+       if (cs->nr_subparts_cpus) {
+               cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus,
+                              cs->cpus_allowed);
+               cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
+       }
         spin_unlock_irq(&callback_lock);
   
-       /* use trialcs->cpus_allowed as a temp variable */
-       update_cpumasks_hier(cs, trialcs->cpus_allowed);
+       update_cpumasks_hier(cs, &tmp);
+ 
+       if (cs->partition_root_state) {
+               struct cpuset *parent = parent_cs(cs);
+ 
+               /*
+                * For partition root, update the cpumasks of sibling
+                * cpusets if they use parent's effective_cpus.
+                */
+               if (parent->child_ecpus_count)
+                       update_sibling_cpumasks(parent, cs, &tmp);
+       }
         return 0;
   }
   
@@@ -1348,7 -1871,95 +1871,95 @@@ static int update_flag(cpuset_flagbits_
         if (spread_flag_changed)
                 update_tasks_flags(cs);
   out:
-       free_trial_cpuset(trialcs);
+       free_cpuset(trialcs);
+       return err;
+ }
+ 
+ /*
+  * update_prstate - update partition_root_state
+  * cs:        the cpuset to update
+  * val: 0 - disabled, 1 - enabled
+  *
+  * Call with cpuset_mutex held.
+  */
+ static int update_prstate(struct cpuset *cs, int val)
+ {
+       int err;
+       struct cpuset *parent = parent_cs(cs);
+       struct tmpmasks tmp;
+ 
+       if ((val != 0) && (val != 1))
+               return -EINVAL;
+       if (val == cs->partition_root_state)
+               return 0;
+ 
+       /*
+        * Cannot force a partial or invalid partition root to a full
+        * partition root.
+        */
+       if (val && cs->partition_root_state)
+               return -EINVAL;
+ 
+       if (alloc_cpumasks(NULL, &tmp))
+               return -ENOMEM;
+ 
+       err = -EINVAL;
+       if (!cs->partition_root_state) {
+               /*
+                * Turning on partition root requires setting the
+                * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
+                * cannot be empty.
+                */
+               if (cpumask_empty(cs->cpus_allowed))
+                       goto out;
+ 
+               err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
+               if (err)
+                       goto out;
+ 
+               err = update_parent_subparts_cpumask(cs, partcmd_enable,
+                                                    NULL, &tmp);
+               if (err) {
+                       update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+                       goto out;
+               }
+               cs->partition_root_state = PRS_ENABLED;
+       } else {
+               /*
+                * Turning off partition root will clear the
+                * CS_CPU_EXCLUSIVE bit.
+                */
+               if (cs->partition_root_state == PRS_ERROR) {
+                       cs->partition_root_state = 0;
+                       update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+                       err = 0;
+                       goto out;
+               }
+ 
+               err = update_parent_subparts_cpumask(cs, partcmd_disable,
+                                                    NULL, &tmp);
+               if (err)
+                       goto out;
+ 
+               cs->partition_root_state = 0;
+ 
+               /* Turning off CS_CPU_EXCLUSIVE will not return error */
+               update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+       }
+ 
+       /*
+        * Update cpumask of parent's tasks except when it is the top
+        * cpuset as some system daemons cannot be mapped to other CPUs.
+        */
+       if (parent != &top_cpuset)
+               update_tasks_cpumask(parent);
+ 
+       if (parent->child_ecpus_count)
+               update_sibling_cpumasks(parent, cs, &tmp);
+ 
+       rebuild_sched_domains_locked();
+ out:
+       free_cpumasks(NULL, &tmp);
         return err;
   }
   
@@@ -1498,10 -2109,8 +2109,8 @@@ out_unlock
   static void cpuset_cancel_attach(struct cgroup_taskset *tset)
   {
         struct cgroup_subsys_state *css;
-       struct cpuset *cs;
   
         cgroup_taskset_first(tset, &css);
-       cs = css_cs(css);
   
         mutex_lock(&cpuset_mutex);
         css_cs(css)->attach_in_progress--;
@@@ -1593,10 -2202,12 +2202,12 @@@ typedef enum 
         FILE_MEMLIST,
         FILE_EFFECTIVE_CPULIST,
         FILE_EFFECTIVE_MEMLIST,
+       FILE_SUBPARTS_CPULIST,
         FILE_CPU_EXCLUSIVE,
         FILE_MEM_EXCLUSIVE,
         FILE_MEM_HARDWALL,
         FILE_SCHED_LOAD_BALANCE,
+       FILE_PARTITION_ROOT,
         FILE_SCHED_RELAX_DOMAIN_LEVEL,
         FILE_MEMORY_PRESSURE_ENABLED,
         FILE_MEMORY_PRESSURE,
@@@ -1732,7 -2343,7 +2343,7 @@@ static ssize_t cpuset_write_resmask(str
                 break;
         }
   
-       free_trial_cpuset(trialcs);
+       free_cpuset(trialcs);
   out_unlock:
         mutex_unlock(&cpuset_mutex);
         kernfs_unbreak_active_protection(of->kn);
@@@ -1770,6 -2381,9 +2381,9 @@@ static int cpuset_common_seq_show(struc
         case FILE_EFFECTIVE_MEMLIST:
                 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
                 break;
+       case FILE_SUBPARTS_CPULIST:
+               seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
+               break;
         default:
                 ret = -EINVAL;
         }
@@@ -1824,12 -2438,60 +2438,60 @@@ static s64 cpuset_read_s64(struct cgrou
         return 0;
   }
   
+ /*
+  * sched_partition_show - seq_file show handler for "cpus.partition"
+  *
+  * Prints "root", "member" or "root invalid" depending on the
+  * partition_root_state of the cpuset behind @seq. Any other state
+  * value prints nothing. Always returns 0.
+  */
+ static int sched_partition_show(struct seq_file *seq, void *v)
+ {
+       struct cpuset *cs = css_cs(seq_css(seq));
+       int prs = cs->partition_root_state;
+       const char *text = NULL;
+ 
+       if (prs == PRS_ENABLED)
+               text = "root\n";
+       else if (prs == PRS_DISABLED)
+               text = "member\n";
+       else if (prs == PRS_ERROR)
+               text = "root invalid\n";
+ 
+       if (text)
+               seq_puts(seq, text);
+       return 0;
+ }
+ 
+ /*
+  * sched_partition_write - write handler for "cpus.partition"
+  *
+  * After stripping surrounding whitespace, "root" selects PRS_ENABLED and
+  * "member" selects PRS_DISABLED; any other input fails with -EINVAL.
+  * With cpuset_mutex held, the request fails with -ENODEV if the cpuset
+  * is not online, otherwise the new state is handed to update_prstate().
+  *
+  * Returns @nbytes on success, or the non-zero error value otherwise.
+  */
+ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
+                                    size_t nbytes, loff_t off)
+ {
+       struct cpuset *cs = css_cs(of_css(of));
+       int new_prs;
+       int ret;
+ 
+       buf = strstrip(buf);
+ 
+       if (strcmp(buf, "root") == 0)
+               new_prs = PRS_ENABLED;
+       else if (strcmp(buf, "member") == 0)
+               new_prs = PRS_DISABLED;
+       else
+               return -EINVAL;
+ 
+       css_get(&cs->css);
+       mutex_lock(&cpuset_mutex);
+ 
+       if (is_cpuset_online(cs))
+               ret = update_prstate(cs, new_prs);
+       else
+               ret = -ENODEV;
+ 
+       mutex_unlock(&cpuset_mutex);
+       css_put(&cs->css);
+ 
+       if (ret)
+               return ret;
+       return nbytes;
+ }
   
   /*
    * for the common functions, 'private' gives the type of file
    */
   
- static struct cftype files[] = {
+ static struct cftype legacy_files[] = {
         {
                 .name = "cpus",
                 .seq_show = cpuset_common_seq_show,
@@@ -1931,6 -2593,60 +2593,60 @@@
         { }     /* terminate */
   };
   
+ /*
+  * Control files for the default hierarchy. This is deliberately a small
+  * set for now; further features and control files can be migrated over
+  * from v1 later on.
+  */
+ static struct cftype dfl_files[] = {
+       {
+               /* writable CPU list, handled as FILE_CPULIST */
+               .name = "cpus",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .private = FILE_CPULIST,
+               .max_write_len = (100U + 6 * NR_CPUS),
+               .seq_show = cpuset_common_seq_show,
+               .write = cpuset_write_resmask,
+       },
+ 
+       {
+               /* writable memory node list, handled as FILE_MEMLIST */
+               .name = "mems",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .private = FILE_MEMLIST,
+               .max_write_len = (100U + 6 * MAX_NUMNODES),
+               .seq_show = cpuset_common_seq_show,
+               .write = cpuset_write_resmask,
+       },
+ 
+       {
+               /* read-only: no write handler */
+               .name = "cpus.effective",
+               .private = FILE_EFFECTIVE_CPULIST,
+               .seq_show = cpuset_common_seq_show,
+       },
+ 
+       {
+               /* read-only: no write handler */
+               .name = "mems.effective",
+               .private = FILE_EFFECTIVE_MEMLIST,
+               .seq_show = cpuset_common_seq_show,
+       },
+ 
+       {
+               /* partition state, with its own show/write handlers */
+               .name = "cpus.partition",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .private = FILE_PARTITION_ROOT,
+               .seq_show = sched_partition_show,
+               .write = sched_partition_write,
+       },
+ 
+       {
+               /* read-only view of subparts_cpus, flagged CFTYPE_DEBUG */
+               .name = "cpus.subpartitions",
+               .flags = CFTYPE_DEBUG,
+               .private = FILE_SUBPARTS_CPULIST,
+               .seq_show = cpuset_common_seq_show,
+       },
+ 
+       { }     /* terminate */
+ };
+ 
+ 
   /*
    *    cpuset_css_alloc - allocate a cpuset css
    *    cgrp:   control group that the new cpuset will be part of
@@@ -1947,26 -2663,19 +2663,19 @@@ cpuset_css_alloc(struct cgroup_subsys_s
         cs = kzalloc(sizeof(*cs), GFP_KERNEL);
         if (!cs)
                 return ERR_PTR(-ENOMEM);
-       if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
-               goto free_cs;
-       if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
-               goto free_cpus;
+ 
+       if (alloc_cpumasks(cs, NULL)) {
+               kfree(cs);
+               return ERR_PTR(-ENOMEM);
+       }
   
         set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
-       cpumask_clear(cs->cpus_allowed);
         nodes_clear(cs->mems_allowed);
-       cpumask_clear(cs->effective_cpus);
         nodes_clear(cs->effective_mems);
         fmeter_init(&cs->fmeter);
         cs->relax_domain_level = -1;
   
         return &cs->css;
- 
- free_cpus:
-       free_cpumask_var(cs->cpus_allowed);
- free_cs:
-       kfree(cs);
-       return ERR_PTR(-ENOMEM);
   }
   
   static int cpuset_css_online(struct cgroup_subsys_state *css)
@@@ -1993,6 -2702,8 +2702,8 @@@
         if (is_in_v2_mode()) {
                 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
                 cs->effective_mems = parent->effective_mems;
+               cs->use_parent_ecpus = true;
+               parent->child_ecpus_count++;
         }
         spin_unlock_irq(&callback_lock);
   
@@@ -2035,7 -2746,12 +2746,12 @@@ out_unlock
   /*
    * If the cpuset being removed has its flag 'sched_load_balance'
    * enabled, then simulate turning sched_load_balance off, which
-  * will call rebuild_sched_domains_locked().
+  * will call rebuild_sched_domains_locked(). That is not needed
+  * in the default hierarchy where only changes in partition
+  * will cause repartitioning.
+  *
+  * If the cpuset has the 'cpus.partition' flag enabled, simulate
+  * turning 'cpus.partition' off.
    */
   
   static void cpuset_css_offline(struct cgroup_subsys_state *css)
@@@ -2044,9 -2760,20 +2760,20 @@@
   
         mutex_lock(&cpuset_mutex);
   
-       if (is_sched_load_balance(cs))
+       if (is_partition_root(cs))
+               update_prstate(cs, 0);
+ 
+       if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+           is_sched_load_balance(cs))
                 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
   
+       if (cs->use_parent_ecpus) {
+               struct cpuset *parent = parent_cs(cs);
+ 
+               cs->use_parent_ecpus = false;
+               parent->child_ecpus_count--;
+       }
+ 
         cpuset_dec();
         clear_bit(CS_ONLINE, &cs->flags);
   
@@@ -2057,9 -2784,7 +2784,7 @@@ static void cpuset_css_free(struct cgro
   {
         struct cpuset *cs = css_cs(css);
   
-       free_cpumask_var(cs->effective_cpus);
-       free_cpumask_var(cs->cpus_allowed);
-       kfree(cs);
+       free_cpuset(cs);
   }
   
   static void cpuset_bind(struct cgroup_subsys_state *root_css)
@@@ -2105,8 -2830,10 +2830,10 @@@ struct cgroup_subsys cpuset_cgrp_subsy
         .post_attach    = cpuset_post_attach,
         .bind           = cpuset_bind,
         .fork           = cpuset_fork,
-       .legacy_cftypes = files,
+       .legacy_cftypes = legacy_files,
+       .dfl_cftypes    = dfl_files,
         .early_init     = true,
+       .threaded       = true,
   };
   
   /**
@@@ -2121,6 -2848,7 +2848,7 @@@ int __init cpuset_init(void
   
         BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
         BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
+       BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
   
         cpumask_setall(top_cpuset.cpus_allowed);
         nodes_setall(top_cpuset.mems_allowed);
@@@ -2227,20 -2955,29 +2955,29 @@@ hotplug_update_tasks(struct cpuset *cs
                 update_tasks_nodemask(cs);
   }
   
+ static bool force_rebuild;    /* reset by the hotplug work function right before it calls rebuild_sched_domains() */
+ 
+ void cpuset_force_rebuild(void)       /* request a sched domain rebuild by setting force_rebuild */
+ {
+       force_rebuild = true;
+ }
+ 
   /**
    * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
    * @cs: cpuset in interest
+  * @tmp: the tmpmasks structure pointer
    *
    * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
    * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
    * all its tasks are moved to the nearest ancestor with both resources.
    */
- static void cpuset_hotplug_update_tasks(struct cpuset *cs)
+ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
   {
         static cpumask_t new_cpus;
         static nodemask_t new_mems;
         bool cpus_updated;
         bool mems_updated;
+       struct cpuset *parent;
   retry:
         wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
   
@@@ -2255,9 -2992,60 +2992,60 @@@
                 goto retry;
         }
   
-       cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
-       nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
+       parent =  parent_cs(cs);
+       compute_effective_cpumask(&new_cpus, cs, parent);
+       nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
+ 
+       if (cs->nr_subparts_cpus)
+               /*
+                * Make sure that CPUs allocated to child partitions
+                * do not show up in effective_cpus.
+                */
+               cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);
+ 
+       if (!tmp || !cs->partition_root_state)
+               goto update_tasks;
   
+       /*
+        * In the unlikely event that a partition root has empty
+        * effective_cpus or its parent becomes erroneous, we have to
+        * transition it to the erroneous state.
+        */
+       if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
+          (parent->partition_root_state == PRS_ERROR))) {
+               if (cs->nr_subparts_cpus) {
+                       cs->nr_subparts_cpus = 0;
+                       cpumask_clear(cs->subparts_cpus);
+                       compute_effective_cpumask(&new_cpus, cs, parent);
+               }
+ 
+               /*
+                * If the effective_cpus is empty because the child
+                * partitions take away all the CPUs, we can keep
+                * the current partition and let the child partitions
+                * fight for available CPUs.
+                */
+               if ((parent->partition_root_state == PRS_ERROR) ||
+                    cpumask_empty(&new_cpus)) {
+                       update_parent_subparts_cpumask(cs, partcmd_disable,
+                                                      NULL, tmp);
+                       cs->partition_root_state = PRS_ERROR;
+               }
+               cpuset_force_rebuild();
+       }
+ 
+       /*
+        * On the other hand, an erroneous partition root may be transitioned
+        * back to a regular one or a partition root with no CPU allocated
+        * from the parent may change to erroneous.
+        */
+       if (is_partition_root(parent) &&
+          ((cs->partition_root_state == PRS_ERROR) ||
+           !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
+            update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
+               cpuset_force_rebuild();
+ 
+ update_tasks:
         cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
         mems_updated = !nodes_equal(new_mems, cs->effective_mems);
   
@@@ -2271,13 -3059,6 +3059,6 @@@
         mutex_unlock(&cpuset_mutex);
   }
   
- static bool force_rebuild;
- 
- void cpuset_force_rebuild(void)
- {
-       force_rebuild = true;
- }
- 
   /**
    * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
    *
@@@ -2300,6 -3081,10 +3081,10 @@@ static void cpuset_hotplug_workfn(struc
         static nodemask_t new_mems;
         bool cpus_updated, mems_updated;
         bool on_dfl = is_in_v2_mode();
+       struct tmpmasks tmp, *ptmp = NULL;
+ 
+       if (on_dfl && !alloc_cpumasks(NULL, &tmp))
+               ptmp = &tmp;
   
         mutex_lock(&cpuset_mutex);
   
@@@ -2307,6 -3092,11 +3092,11 @@@
         cpumask_copy(&new_cpus, cpu_active_mask);
         new_mems = node_states[N_MEMORY];
   
+       /*
+        * If subparts_cpus is populated, it is likely that the check below
+        * will produce a false positive on cpus_updated when the cpu list
+        * isn't changed. It is extra work, but it is better to be safe.
+        */
         cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
         mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
   
@@@ -2315,6 -3105,22 +3105,22 @@@
                 spin_lock_irq(&callback_lock);
                 if (!on_dfl)
                         cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+               /*
+                * Make sure that CPUs allocated to child partitions
+                * do not show up in effective_cpus. If no CPU is left,
+                * we clear the subparts_cpus & let the child partitions
+                * fight for the CPUs again.
+                */
+               if (top_cpuset.nr_subparts_cpus) {
+                       if (cpumask_subset(&new_cpus,
+                                          top_cpuset.subparts_cpus)) {
+                               top_cpuset.nr_subparts_cpus = 0;
+                               cpumask_clear(top_cpuset.subparts_cpus);
+                       } else {
+                               cpumask_andnot(&new_cpus, &new_cpus,
+                                              top_cpuset.subparts_cpus);
+                       }
+               }
                 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
                 spin_unlock_irq(&callback_lock);
                 /* we don't mess with cpumasks of tasks in top_cpuset */
@@@ -2343,7 -3149,7 +3149,7 @@@
                                 continue;
                         rcu_read_unlock();
   
-                       cpuset_hotplug_update_tasks(cs);
+                       cpuset_hotplug_update_tasks(cs, ptmp);
   
                         rcu_read_lock();
                         css_put(&cs->css);
@@@ -2356,6 -3162,8 +3162,8 @@@
                 force_rebuild = false;
                 rebuild_sched_domains();
         }
+ 
+       free_cpumasks(NULL, ptmp);
   }
   
   void cpuset_update_active_cpus(void)
@@@ -2666,9 -3474,9 +3474,9 @@@ void cpuset_print_current_mems_allowed(
         rcu_read_lock();
   
         cgrp = task_cs(current)->css.cgroup;
- -      pr_info("%s cpuset=", current->comm);
+ +      pr_cont(",cpuset=");
         pr_cont_cgroup_name(cgrp);
- -      pr_cont(" mems_allowed=%*pbl\n",
+ +      pr_cont(",mems_allowed=%*pbl",
                 nodemask_pr_args(&current->mems_allowed));
   
         rcu_read_unlock();
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 29 Dec 2018 18:57:20 +0000 (10:57 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 29 Dec 2018 18:57:20 +0000 (10:57 -0800)
		1	2
Documentation/admin-guide/cgroup-v2.rst	patch \|	diff1 \|	diff2 \|	blob \| history
Documentation/admin-guide/kernel-parameters.txt	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/cgroup/cgroup.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/cgroup/cpuset.c	patch \|	diff1 \|	diff2 \|	blob \| history