sched, cpuset: customize sched domains, core

[linux-2.6-block.git] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index 070eefdd90f56cfd8ad31b981d8ffcc57b38d05e..62d7481caca53314d45f139440c33d138e74b5cd 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -163,10 +163,11 @@ struct rt_prio_array {
  };
  
  struct rt_bandwidth {
-       ktime_t rt_period;
-       u64 rt_runtime;
-       spinlock_t rt_runtime_lock;
-       struct hrtimer rt_period_timer;
+       /* nests inside the rq lock: */
+       spinlock_t              rt_runtime_lock;
+       ktime_t                 rt_period;
+       u64                     rt_runtime;
+       struct hrtimer          rt_period_timer;
  };
  
  static struct rt_bandwidth def_rt_bandwidth;
@@ -270,8 +271,21 @@ struct task_group {
  
         struct rcu_head rcu;
         struct list_head list;
+
+       struct task_group *parent;
+       struct list_head siblings;
+       struct list_head children;
  };
  
+#ifdef CONFIG_USER_SCHED
+
+/*
+ * Root task group.
+ *     Every UID task group (including init_task_group aka UID-0) will
+ *     be a child to this group.
+ */
+struct task_group root_task_group;
+
  #ifdef CONFIG_FAIR_GROUP_SCHED
  /* Default task group's sched entity on each cpu */
  static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
@@ -283,6 +297,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
  static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
  static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
  #endif
+#else
+#define root_task_group init_task_group
+#endif
  
  /* task_group_lock serializes add/remove of task groups and also changes to
   * a task group's cpu shares.
@@ -403,6 +420,7 @@ struct rt_rq {
         int rt_throttled;
         u64 rt_time;
         u64 rt_runtime;
+       /* Nests inside the rq lock: */
         spinlock_t rt_runtime_lock;
  
  #ifdef CONFIG_RT_GROUP_SCHED
@@ -6753,6 +6771,7 @@ static noinline void sd_init_##type(struct sched_domain *sd)      \
  {                                                              \
         memset(sd, 0, sizeof(*sd));                             \
         *sd = SD_##type##_INIT;                                 \
+       sd->level = SD_LV_##type;                               \
  }
  
  SD_INIT_FUNC(CPU)
@@ -6801,11 +6820,42 @@ struct allmasks {
  #define        SCHED_CPUMASK_VAR(v, a)         cpumask_t *v = (cpumask_t *) \
                         ((unsigned long)(a) + offsetof(struct allmasks, v))
  
+static int default_relax_domain_level = -1;
+
+static int __init setup_relax_domain_level(char *str)
+{
+       default_relax_domain_level = simple_strtoul(str, NULL, 0);
+       return 1;
+}
+__setup("relax_domain_level=", setup_relax_domain_level);
+
+static void set_domain_attribute(struct sched_domain *sd,
+                                struct sched_domain_attr *attr)
+{
+       int request;
+
+       if (!attr || attr->relax_domain_level < 0) {
+               if (default_relax_domain_level < 0)
+                       return;
+               else
+                       request = default_relax_domain_level;
+       } else
+               request = attr->relax_domain_level;
+       if (request < sd->level) {
+               /* turn off idle balance on this domain */
+               sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
+       } else {
+               /* turn on idle balance on this domain */
+               sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
+       }
+}
+
  /*
   * Build sched domains for a given set of cpus and attach the sched domains
   * to the individual cpus
   */
-static int build_sched_domains(const cpumask_t *cpu_map)
+static int __build_sched_domains(const cpumask_t *cpu_map,
+                                struct sched_domain_attr *attr)
  {
         int i;
         struct root_domain *rd;
@@ -6869,6 +6919,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
                                 SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
                         sd = &per_cpu(allnodes_domains, i);
                         SD_INIT(sd, ALLNODES);
+                       set_domain_attribute(sd, attr);
                         sd->span = *cpu_map;
                         cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
                         p = sd;
@@ -6878,6 +6929,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
  
                 sd = &per_cpu(node_domains, i);
                 SD_INIT(sd, NODE);
+               set_domain_attribute(sd, attr);
                 sched_domain_node_span(cpu_to_node(i), &sd->span);
                 sd->parent = p;
                 if (p)
@@ -6888,6 +6940,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
                 p = sd;
                 sd = &per_cpu(phys_domains, i);
                 SD_INIT(sd, CPU);
+               set_domain_attribute(sd, attr);
                 sd->span = *nodemask;
                 sd->parent = p;
                 if (p)
@@ -6898,6 +6951,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
                 p = sd;
                 sd = &per_cpu(core_domains, i);
                 SD_INIT(sd, MC);
+               set_domain_attribute(sd, attr);
                 sd->span = cpu_coregroup_map(i);
                 cpus_and(sd->span, sd->span, *cpu_map);
                 sd->parent = p;
@@ -6909,6 +6963,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
                 p = sd;
                 sd = &per_cpu(cpu_domains, i);
                 SD_INIT(sd, SIBLING);
+               set_domain_attribute(sd, attr);
                 sd->span = per_cpu(cpu_sibling_map, i);
                 cpus_and(sd->span, sd->span, *cpu_map);
                 sd->parent = p;
@@ -7106,8 +7161,15 @@ error:
  #endif
  }
  
+static int build_sched_domains(const cpumask_t *cpu_map)
+{
+       return __build_sched_domains(cpu_map, NULL);
+}
+
  static cpumask_t *doms_cur;    /* current sched domains */
  static int ndoms_cur;          /* number of sched domains in 'doms_cur' */
+static struct sched_domain_attr *dattr_cur;    /* attributes of custom domains
+                                                  in 'doms_cur' */
  
  /*
   * Special case: If a kmalloc of a doms_cur partition (array of
@@ -7135,6 +7197,7 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
         if (!doms_cur)
                 doms_cur = &fallback_doms;
         cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
+       dattr_cur = NULL;
         err = build_sched_domains(doms_cur);
         register_sched_domain_sysctl();
  
@@ -7164,6 +7227,22 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
         arch_destroy_sched_domains(cpu_map, &tmpmask);
  }
  
+/* handle null as "default" */
+static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
+                       struct sched_domain_attr *new, int idx_new)
+{
+       struct sched_domain_attr tmp;
+
+       /* fast path */
+       if (!new && !cur)
+               return 1;
+
+       tmp = SD_ATTR_INIT;
+       return !memcmp(cur ? (cur + idx_cur) : &tmp,
+                       new ? (new + idx_new) : &tmp,
+                       sizeof(struct sched_domain_attr));
+}
+
  /*
   * Partition sched domains as specified by the 'ndoms_new'
   * cpumasks in the array doms_new[] of cpumasks. This compares
@@ -7185,7 +7264,8 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
   *
   * Call with hotplug lock held
   */
-void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
+void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+                            struct sched_domain_attr *dattr_new)
  {
         int i, j;
  
@@ -7198,12 +7278,14 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
                 ndoms_new = 1;
                 doms_new = &fallback_doms;
                 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
+               dattr_new = NULL;
         }
  
         /* Destroy deleted domains */
         for (i = 0; i < ndoms_cur; i++) {
                 for (j = 0; j < ndoms_new; j++) {
-                       if (cpus_equal(doms_cur[i], doms_new[j]))
+                       if (cpus_equal(doms_cur[i], doms_new[j])
+                           && dattrs_equal(dattr_cur, i, dattr_new, j))
                                 goto match1;
                 }
                 /* no match - a current sched domain not in new doms_new[] */
@@ -7215,11 +7297,13 @@ match1:
         /* Build new domains */
         for (i = 0; i < ndoms_new; i++) {
                 for (j = 0; j < ndoms_cur; j++) {
-                       if (cpus_equal(doms_new[i], doms_cur[j]))
+                       if (cpus_equal(doms_new[i], doms_cur[j])
+                           && dattrs_equal(dattr_new, i, dattr_cur, j))
                                 goto match2;
                 }
                 /* no match - add a new doms_new */
-               build_sched_domains(doms_new + i);
+               __build_sched_domains(doms_new + i,
+                                       dattr_new ? dattr_new + i : NULL);
  match2:
                 ;
         }
@@ -7227,7 +7311,9 @@ match2:
         /* Remember the new sched domains */
         if (doms_cur != &fallback_doms)
                 kfree(doms_cur);
+       kfree(dattr_cur);       /* kfree(NULL) is safe */
         doms_cur = doms_new;
+       dattr_cur = dattr_new;
         ndoms_cur = ndoms_new;
  
         register_sched_domain_sysctl();
@@ -7434,10 +7520,11 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
  }
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
-               struct cfs_rq *cfs_rq, struct sched_entity *se,
-               int cpu, int add)
+static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
+                               struct sched_entity *se, int cpu, int add,
+                               struct sched_entity *parent)
  {
+       struct rq *rq = cpu_rq(cpu);
         tg->cfs_rq[cpu] = cfs_rq;
         init_cfs_rq(cfs_rq, rq);
         cfs_rq->tg = tg;
@@ -7445,19 +7532,29 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
                 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
  
         tg->se[cpu] = se;
-       se->cfs_rq = &rq->cfs;
+       /* se could be NULL for init_task_group */
+       if (!se)
+               return;
+
+       if (!parent)
+               se->cfs_rq = &rq->cfs;
+       else
+               se->cfs_rq = parent->my_q;
+
         se->my_q = cfs_rq;
         se->load.weight = tg->shares;
         se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
-       se->parent = NULL;
+       se->parent = parent;
  }
  #endif
  
  #ifdef CONFIG_RT_GROUP_SCHED
-static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
-               struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
-               int cpu, int add)
+static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
+               struct sched_rt_entity *rt_se, int cpu, int add,
+               struct sched_rt_entity *parent)
  {
+       struct rq *rq = cpu_rq(cpu);
+
         tg->rt_rq[cpu] = rt_rq;
         init_rt_rq(rt_rq, rq);
         rt_rq->tg = tg;
@@ -7467,9 +7564,17 @@ static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
                 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
  
         tg->rt_se[cpu] = rt_se;
+       if (!rt_se)
+               return;
+
+       /* no parent: queue on rq->rt; otherwise on the parent's rt_rq */
+       if (!parent)
+               rt_se->rt_rq = &rq->rt;
+       else
+               rt_se->rt_rq = parent->my_q;
+
-       rt_se->rt_rq = &rq->rt;
         rt_se->my_q = rt_rq;
-       rt_se->parent = NULL;
+       rt_se->parent = parent;
         INIT_LIST_HEAD(&rt_se->run_list);
  }
  #endif
@@ -7484,6 +7589,9 @@ void __init sched_init(void)
  #endif
  #ifdef CONFIG_RT_GROUP_SCHED
         alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+#endif
+#ifdef CONFIG_USER_SCHED
+       alloc_size *= 2;
  #endif
         /*
          * As sched_init() is called before page_alloc is setup,
@@ -7498,12 +7606,29 @@ void __init sched_init(void)
  
                 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
                 ptr += nr_cpu_ids * sizeof(void **);
+
+#ifdef CONFIG_USER_SCHED
+               root_task_group.se = (struct sched_entity **)ptr;
+               ptr += nr_cpu_ids * sizeof(void **);
+
+               root_task_group.cfs_rq = (struct cfs_rq **)ptr;
+               ptr += nr_cpu_ids * sizeof(void **);
+#endif
  #endif
  #ifdef CONFIG_RT_GROUP_SCHED
                 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
                 ptr += nr_cpu_ids * sizeof(void **);
  
                 init_task_group.rt_rq = (struct rt_rq **)ptr;
+               ptr += nr_cpu_ids * sizeof(void **);
+
+#ifdef CONFIG_USER_SCHED
+               root_task_group.rt_se = (struct sched_rt_entity **)ptr;
+               ptr += nr_cpu_ids * sizeof(void **);
+
+               root_task_group.rt_rq = (struct rt_rq **)ptr;
+               ptr += nr_cpu_ids * sizeof(void **);
+#endif
  #endif
         }
  
@@ -7517,10 +7642,21 @@ void __init sched_init(void)
  #ifdef CONFIG_RT_GROUP_SCHED
         init_rt_bandwidth(&init_task_group.rt_bandwidth,
                         global_rt_period(), global_rt_runtime());
+#ifdef CONFIG_USER_SCHED
+       init_rt_bandwidth(&root_task_group.rt_bandwidth,
+                       global_rt_period(), RUNTIME_INF);
+#endif
  #endif
  
  #ifdef CONFIG_GROUP_SCHED
         list_add(&init_task_group.list, &task_groups);
+       INIT_LIST_HEAD(&init_task_group.children);
+
+#ifdef CONFIG_USER_SCHED
+       INIT_LIST_HEAD(&root_task_group.children);
+       init_task_group.parent = &root_task_group;
+       list_add(&init_task_group.siblings, &root_task_group.children);
+#endif
  #endif
  
         for_each_possible_cpu(i) {
@@ -7537,18 +7673,61 @@ void __init sched_init(void)
  #ifdef CONFIG_FAIR_GROUP_SCHED
                 init_task_group.shares = init_task_group_load;
                 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
-               init_tg_cfs_entry(rq, &init_task_group,
+#ifdef CONFIG_CGROUP_SCHED
+               /*
+                * How much cpu bandwidth does init_task_group get?
+                *
+                * In case of task-groups formed thr' the cgroup filesystem, it
+                * gets 100% of the cpu resources in the system. This overall
+                * system cpu resource is divided among the tasks of
+                * init_task_group and its child task-groups in a fair manner,
+                * based on each entity's (task or task-group's) weight
+                * (se->load.weight).
+                *
+                * In other words, if init_task_group has 10 tasks (of weight
+                * 1024) and two child groups A0 and A1 (of weight 1024 each),
+                * then A0's share of the cpu resource is:
+                *
+                *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
+                *
+                * We achieve this by letting init_task_group's tasks sit
+                * directly in rq->cfs (i.e init_task_group->se[] = NULL).
+                */
+               init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
+#elif defined CONFIG_USER_SCHED
+               root_task_group.shares = NICE_0_LOAD;
+               init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
+               /*
+                * In case of task-groups formed thr' the user id of tasks,
+                * init_task_group represents tasks belonging to root user.
+                * Hence it forms a sibling of all subsequent groups formed.
+                * In this case, init_task_group gets only a fraction of overall
+                * system cpu resource, based on the weight assigned to root
+                * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
+                * by letting tasks of init_task_group sit in a separate cfs_rq
+                * (init_cfs_rq) and having one entity represent this group of
+                * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
+                */
+               init_tg_cfs_entry(&init_task_group,
                                 &per_cpu(init_cfs_rq, i),
-                               &per_cpu(init_sched_entity, i), i, 1);
+                               &per_cpu(init_sched_entity, i), i, 1,
+                               root_task_group.se[i]);
  
  #endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+               rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
  #ifdef CONFIG_RT_GROUP_SCHED
                 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
-               init_tg_rt_entry(rq, &init_task_group,
+#ifdef CONFIG_CGROUP_SCHED
+               init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
+#elif defined CONFIG_USER_SCHED
+               init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
+               init_tg_rt_entry(&init_task_group,
                                 &per_cpu(init_rt_rq, i),
-                               &per_cpu(init_sched_rt_entity, i), i, 1);
-#else
-               rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
+                               &per_cpu(init_sched_rt_entity, i), i, 1,
+                               root_task_group.rt_se[i]);
+#endif
  #endif
  
                 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -7749,10 +7928,11 @@ static void free_fair_sched_group(struct task_group *tg)
         kfree(tg->se);
  }
  
-static int alloc_fair_sched_group(struct task_group *tg)
+static
+int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
  {
         struct cfs_rq *cfs_rq;
-       struct sched_entity *se;
+       struct sched_entity *se, *parent_se;
         struct rq *rq;
         int i;
  
@@ -7778,7 +7958,8 @@ static int alloc_fair_sched_group(struct task_group *tg)
                 if (!se)
                         goto err;
  
-               init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
+               parent_se = parent ? parent->se[i] : NULL;
+               init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
         }
  
         return 1;
@@ -7802,7 +7983,8 @@ static inline void free_fair_sched_group(struct task_group *tg)
  {
  }
  
-static inline int alloc_fair_sched_group(struct task_group *tg)
+static inline
+int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
  {
         return 1;
  }
@@ -7834,10 +8016,11 @@ static void free_rt_sched_group(struct task_group *tg)
         kfree(tg->rt_se);
  }
  
-static int alloc_rt_sched_group(struct task_group *tg)
+static
+int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
  {
         struct rt_rq *rt_rq;
-       struct sched_rt_entity *rt_se;
+       struct sched_rt_entity *rt_se, *parent_se;
         struct rq *rq;
         int i;
  
@@ -7864,7 +8047,8 @@ static int alloc_rt_sched_group(struct task_group *tg)
                 if (!rt_se)
                         goto err;
  
-               init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
+               parent_se = parent ? parent->rt_se[i] : NULL;
+               init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
         }
  
         return 1;
@@ -7888,7 +8072,8 @@ static inline void free_rt_sched_group(struct task_group *tg)
  {
  }
  
-static inline int alloc_rt_sched_group(struct task_group *tg)
+static inline
+int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
  {
         return 1;
  }
@@ -7911,7 +8096,7 @@ static void free_sched_group(struct task_group *tg)
  }
  
  /* allocate runqueue etc for a new task group */
-struct task_group *sched_create_group(void)
+struct task_group *sched_create_group(struct task_group *parent)
  {
         struct task_group *tg;
         unsigned long flags;
@@ -7921,10 +8106,10 @@ struct task_group *sched_create_group(void)
         if (!tg)
                 return ERR_PTR(-ENOMEM);
  
-       if (!alloc_fair_sched_group(tg))
+       if (!alloc_fair_sched_group(tg, parent))
                 goto err;
  
-       if (!alloc_rt_sched_group(tg))
+       if (!alloc_rt_sched_group(tg, parent))
                 goto err;
  
         spin_lock_irqsave(&task_group_lock, flags);
@@ -7933,6 +8118,12 @@ struct task_group *sched_create_group(void)
                 register_rt_sched_group(tg, i);
         }
         list_add_rcu(&tg->list, &task_groups);
+
+       WARN_ON(!parent); /* root should already exist */
+
+       tg->parent = parent;
+       list_add_rcu(&tg->siblings, &parent->children);
+       INIT_LIST_HEAD(&tg->children);
         spin_unlock_irqrestore(&task_group_lock, flags);
  
         return tg;
@@ -7961,6 +8152,7 @@ void sched_destroy_group(struct task_group *tg)
                 unregister_rt_sched_group(tg, i);
         }
         list_del_rcu(&tg->list);
+       list_del_rcu(&tg->siblings);
         spin_unlock_irqrestore(&task_group_lock, flags);
  
         /* wait for possible concurrent references to cfs_rqs complete */
@@ -8035,6 +8227,12 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
         int i;
         unsigned long flags;
  
+       /*
+        * We can't change the weight of the root cgroup.
+        */
+       if (!tg->se[0])
+               return -EINVAL;
+
         /*
          * A weight of 0 or 1 can cause arithmetics problems.
          * (The default weight is 1024 - so there's no practical
@@ -8050,6 +8248,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
         spin_lock_irqsave(&task_group_lock, flags);
         for_each_possible_cpu(i)
                 unregister_fair_sched_group(tg, i);
+       list_del_rcu(&tg->siblings);
         spin_unlock_irqrestore(&task_group_lock, flags);
  
         /* wait for any ongoing reference to this group to finish */
@@ -8070,6 +8269,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
         spin_lock_irqsave(&task_group_lock, flags);
         for_each_possible_cpu(i)
                 register_fair_sched_group(tg, i);
+       list_add_rcu(&tg->siblings, &tg->parent->children);
         spin_unlock_irqrestore(&task_group_lock, flags);
  done:
         mutex_unlock(&shares_mutex);
@@ -8096,6 +8296,38 @@ static unsigned long to_ratio(u64 period, u64 runtime)
         return div64_64(runtime << 16, period);
  }
  
+#ifdef CONFIG_CGROUP_SCHED
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+{
+       struct task_group *tgi, *parent = tg->parent;
+       unsigned long total = 0;
+
+       if (!parent) {
+               if (global_rt_period() < period)
+                       return 0;
+
+               return to_ratio(period, runtime) <
+                       to_ratio(global_rt_period(), global_rt_runtime());
+       }
+
+       if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
+               return 0;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(tgi, &parent->children, siblings) {
+               if (tgi == tg)
+                       continue;
+
+               total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
+                               tgi->rt_bandwidth.rt_runtime);
+       }
+       rcu_read_unlock();
+
+       return total + to_ratio(period, runtime) <
+               to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
+                               parent->rt_bandwidth.rt_runtime);
+}
+#elif defined CONFIG_USER_SCHED
  static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
  {
         struct task_group *tgi;
@@ -8115,6 +8347,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
  
         return total + to_ratio(period, runtime) < global_ratio;
  }
+#endif
  
  /* Must be called with tasklist_lock held */
  static inline int tg_has_rt_tasks(struct task_group *tg)
@@ -8278,7 +8511,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
  static struct cgroup_subsys_state *
  cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
  {
-       struct task_group *tg;
+       struct task_group *tg, *parent;
  
         if (!cgrp->parent) {
                 /* This is early initialization for the top cgroup */
@@ -8286,11 +8519,8 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
                 return &init_task_group.css;
         }
  
-       /* we support only 1-level deep hierarchical scheduler atm */
-       if (cgrp->parent->parent)
-               return ERR_PTR(-EINVAL);
-
-       tg = sched_create_group();
+       parent = cgroup_tg(cgrp->parent);
+       tg = sched_create_group(parent);
         if (IS_ERR(tg))
                 return ERR_PTR(-ENOMEM);