Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...

[linux-2.6-block.git] / kernel / sched / core.c
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 7fa8e74ad2ab4003457d266df57373f41f0e0d2a..06961b997ed6d8c13ced5558520f75b07c85aedc 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -773,6 +773,18 @@ static void set_load_weight(struct task_struct *p, bool update_load)
  }
  
  #ifdef CONFIG_UCLAMP_TASK
+/*
+ * Serializes updates of utilization clamp values
+ *
+ * The (slow-path) user-space triggers utilization clamp value updates which
+ * can require updates on (fast-path) scheduler's data structures used to
+ * support enqueue/dequeue operations.
+ * While the per-CPU rq lock protects fast-path update operations, user-space
+ * requests are serialized using a mutex to reduce the risk of conflicting
+ * updates or API abuses.
+ *
+ * In this file the mutex is taken by sysctl_sched_uclamp_handler() and by
+ * cpu_uclamp_write(), i.e. by the sysctl and the cgroup attribute write
+ * paths.
+ */
+static DEFINE_MUTEX(uclamp_mutex);
+
  /* Max allowed minimum utilization */
  unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
  
@@ -798,7 +810,7 @@ static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
         return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
  }
  
-static inline unsigned int uclamp_none(int clamp_id)
+static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id)
  {
         if (clamp_id == UCLAMP_MIN)
                 return 0;
@@ -814,7 +826,7 @@ static inline void uclamp_se_set(struct uclamp_se *uc_se,
  }
  
  static inline unsigned int
-uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
+uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
                   unsigned int clamp_value)
  {
         /*
@@ -830,7 +842,7 @@ uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
         return uclamp_none(UCLAMP_MIN);
  }
  
-static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
+static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
                                      unsigned int clamp_value)
  {
         /* Reset max-clamp retention only on idle exit */
@@ -841,8 +853,8 @@ static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
  }
  
  static inline
-unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
-                                unsigned int clamp_value)
+enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
+                                  unsigned int clamp_value)
  {
         struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
         int bucket_id = UCLAMP_BUCKETS - 1;
@@ -861,16 +873,42 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
         return uclamp_idle_value(rq, clamp_id, clamp_value);
  }
  
+/*
+ * Return the clamp request of @p for @clamp_id after applying the
+ * restriction of the task group @p belongs to.
+ *
+ * Without CONFIG_UCLAMP_TASK_GROUP the task's own request is returned
+ * unchanged.
+ */
+static inline struct uclamp_se
+uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
+{
+       struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+       struct task_group *tg = task_group(p);
+       struct uclamp_se uc_max;
+
+       /*
+        * Autogroups and the root task group impose no group restriction:
+        * such tasks are only subject to the system defaults.
+        */
+       if (task_group_is_autogroup(tg) || tg == &root_task_group)
+               return uc_req;
+
+       /*
+        * The group's effective clamp replaces the task's request when
+        * the task never asked for a value, or asked for more than the
+        * group allows.
+        */
+       uc_max = tg->uclamp[clamp_id];
+       if (!uc_req.user_defined || uc_req.value > uc_max.value)
+               return uc_max;
+#endif
+
+       return uc_req;
+}
+
  /*
   * The effective clamp bucket index of a task depends on, by increasing
   * priority:
   * - the task specific clamp value, when explicitly requested from userspace
+ * - the task group effective clamp value, for tasks that are in neither the
+ *   root group nor an autogroup
   * - the system default clamp value, defined by the sysadmin
   */
  static inline struct uclamp_se
-uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
+uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
  {
-       struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+       struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
         struct uclamp_se uc_max = uclamp_default[clamp_id];
  
         /* System default restrictions always apply */
@@ -880,7 +918,7 @@ uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
         return uc_req;
  }
  
-unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
+enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
  {
         struct uclamp_se uc_eff;
  
@@ -904,7 +942,7 @@ unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
   * for each bucket when all its RUNNABLE tasks require the same clamp.
   */
  static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
-                                   unsigned int clamp_id)
+                                   enum uclamp_id clamp_id)
  {
         struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
         struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@ -942,7 +980,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
   * enforce the expected state and warn.
   */
  static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
-                                   unsigned int clamp_id)
+                                   enum uclamp_id clamp_id)
  {
         struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
         struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@ -981,7 +1019,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
  
  static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
  {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
  
         if (unlikely(!p->sched_class->uclamp_enabled))
                 return;
@@ -996,7 +1034,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
  
  static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
  {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
  
         if (unlikely(!p->sched_class->uclamp_enabled))
                 return;
@@ -1005,15 +1043,82 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
                 uclamp_rq_dec_id(rq, p, clamp_id);
  }
  
+/*
+ * uclamp_update_active - refresh the rq clamp accounting of one task
+ * @p: task whose clamp value for @clamp_id may have changed
+ * @clamp_id: clamp index to refresh
+ *
+ * When @p is flagged as active for @clamp_id, its rq accounting is dropped
+ * and re-taken with uclamp_rq_dec_id()/uclamp_rq_inc_id(), under the task's
+ * rq lock. Tasks that are not active are left alone.
+ */
+static inline void
+uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
+{
+       struct rq_flags rf;
+       struct rq *rq;
+
+       /*
+        * Lock the task and the rq where the task is (or was) queued.
+        *
+        * We might lock the (previous) rq of a !RUNNABLE task, but that's the
+        * price to pay to safely serialize util_{min,max} updates with
+        * enqueues, dequeues and migration operations.
+        * This is the same locking schema used by __set_cpus_allowed_ptr().
+        */
+       rq = task_rq_lock(p, &rf);
+
+       /*
+        * Setting the clamp bucket is serialized by task_rq_lock().
+        * If the task is not yet RUNNABLE and its task_struct is not
+        * affecting a valid clamp bucket, the next time it's enqueued,
+        * it will already see the updated clamp bucket value.
+        *
+        * Hence only tasks currently flagged as active are refreshed here;
+        * a task that is not active has nothing accounted on the rq to
+        * release and must not go through uclamp_rq_dec_id().
+        */
+       if (p->uclamp[clamp_id].active) {
+               uclamp_rq_dec_id(rq, p, clamp_id);
+               uclamp_rq_inc_id(rq, p, clamp_id);
+       }
+
+       task_rq_unlock(rq, p, &rf);
+}
+
+/*
+ * Call uclamp_update_active() on every task returned by the css task
+ * iterator for @css, once for each clamp index whose bit is set in @clamps
+ * (bit N corresponds to clamp index N).
+ */
+static inline void
+uclamp_update_active_tasks(struct cgroup_subsys_state *css,
+                          unsigned int clamps)
+{
+       struct css_task_iter it;
+       struct task_struct *p;
+       enum uclamp_id clamp_id;
+
+       css_task_iter_start(css, 0, &it);
+       for (p = css_task_iter_next(&it); p; p = css_task_iter_next(&it)) {
+               for_each_clamp_id(clamp_id) {
+                       /* Skip clamp indexes the caller did not select */
+                       if (!(clamps & (0x1 << clamp_id)))
+                               continue;
+                       uclamp_update_active(p, clamp_id);
+               }
+       }
+       css_task_iter_end(&it);
+}
+
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+static void cpu_util_update_eff(struct cgroup_subsys_state *css);
+
+/*
+ * Copy the current sysctl min/max clamp values into the root task group's
+ * requested clamps, then recompute the effective clamps starting from the
+ * root group.
+ */
+static void uclamp_update_root_tg(void)
+{
+       struct uclamp_se *root_req = root_task_group.uclamp_req;
+
+       uclamp_se_set(&root_req[UCLAMP_MIN],
+                     sysctl_sched_uclamp_util_min, false);
+       uclamp_se_set(&root_req[UCLAMP_MAX],
+                     sysctl_sched_uclamp_util_max, false);
+
+       /* The effective-clamp update is run inside an RCU read section */
+       rcu_read_lock();
+       cpu_util_update_eff(&root_task_group.css);
+       rcu_read_unlock();
+}
+#else
+/* Nothing to propagate when task group clamping is not configured */
+static void uclamp_update_root_tg(void) { }
+#endif
+
  int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
                                 void __user *buffer, size_t *lenp,
                                 loff_t *ppos)
  {
+       bool update_root_tg = false;
         int old_min, old_max;
-       static DEFINE_MUTEX(mutex);
         int result;
  
-       mutex_lock(&mutex);
+       mutex_lock(&uclamp_mutex);
         old_min = sysctl_sched_uclamp_util_min;
         old_max = sysctl_sched_uclamp_util_max;
  
@@ -1032,23 +1137,30 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
         if (old_min != sysctl_sched_uclamp_util_min) {
                 uclamp_se_set(&uclamp_default[UCLAMP_MIN],
                               sysctl_sched_uclamp_util_min, false);
+               update_root_tg = true;
         }
         if (old_max != sysctl_sched_uclamp_util_max) {
                 uclamp_se_set(&uclamp_default[UCLAMP_MAX],
                               sysctl_sched_uclamp_util_max, false);
+               update_root_tg = true;
         }
  
+       if (update_root_tg)
+               uclamp_update_root_tg();
+
         /*
-        * Updating all the RUNNABLE task is expensive, keep it simple and do
-        * just a lazy update at each next enqueue time.
+        * We update all RUNNABLE tasks only when task groups are in use.
+        * Otherwise, keep it simple and do just a lazy update at each next
+        * task enqueue time.
          */
+
         goto done;
  
  undo:
         sysctl_sched_uclamp_util_min = old_min;
         sysctl_sched_uclamp_util_max = old_max;
  done:
-       mutex_unlock(&mutex);
+       mutex_unlock(&uclamp_mutex);
  
         return result;
  }
@@ -1075,7 +1187,7 @@ static int uclamp_validate(struct task_struct *p,
  static void __setscheduler_uclamp(struct task_struct *p,
                                   const struct sched_attr *attr)
  {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
  
         /*
          * On scheduling class change, reset to default clamps for tasks
@@ -1112,7 +1224,7 @@ static void __setscheduler_uclamp(struct task_struct *p,
  
  static void uclamp_fork(struct task_struct *p)
  {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
  
         for_each_clamp_id(clamp_id)
                 p->uclamp[clamp_id].active = false;
@@ -1134,9 +1246,11 @@ static void uclamp_fork(struct task_struct *p)
  static void __init init_uclamp(void)
  {
         struct uclamp_se uc_max = {};
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
         int cpu;
  
+       mutex_init(&uclamp_mutex);
+
         for_each_possible_cpu(cpu) {
                 memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
                 cpu_rq(cpu)->uclamp_flags = 0;
@@ -1149,8 +1263,13 @@ static void __init init_uclamp(void)
  
         /* System defaults allow max clamp values for both indexes */
         uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
-       for_each_clamp_id(clamp_id)
+       for_each_clamp_id(clamp_id) {
                 uclamp_default[clamp_id] = uc_max;
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+               root_task_group.uclamp_req[clamp_id] = uc_max;
+               root_task_group.uclamp[clamp_id] = uc_max;
+#endif
+       }
  }
  
  #else /* CONFIG_UCLAMP_TASK */
@@ -1494,7 +1613,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
         if (queued)
                 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
         if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
  }
  
  /*
@@ -3214,12 +3333,8 @@ static __always_inline struct rq *
  context_switch(struct rq *rq, struct task_struct *prev,
                struct task_struct *next, struct rq_flags *rf)
  {
-       struct mm_struct *mm, *oldmm;
-
         prepare_task_switch(rq, prev, next);
  
-       mm = next->mm;
-       oldmm = prev->active_mm;
         /*
          * For paravirt, this is coupled with an exit in switch_to to
          * combine the page table reload and the switch backend into
@@ -3228,22 +3343,37 @@ context_switch(struct rq *rq, struct task_struct *prev,
         arch_start_context_switch(prev);
  
         /*
-        * If mm is non-NULL, we pass through switch_mm(). If mm is
-        * NULL, we will pass through mmdrop() in finish_task_switch().
-        * Both of these contain the full memory barrier required by
-        * membarrier after storing to rq->curr, before returning to
-        * user-space.
+        * kernel -> kernel   lazy + transfer active
+        *   user -> kernel   lazy + mmgrab() active
+        *
+        * kernel ->   user   switch + mmdrop() active
+        *   user ->   user   switch
          */
-       if (!mm) {
-               next->active_mm = oldmm;
-               mmgrab(oldmm);
-               enter_lazy_tlb(oldmm, next);
-       } else
-               switch_mm_irqs_off(oldmm, mm, next);
+       if (!next->mm) {                                // to kernel
+               enter_lazy_tlb(prev->active_mm, next);
+
+               next->active_mm = prev->active_mm;
+               if (prev->mm)                           // from user
+                       mmgrab(prev->active_mm);
+               else
+                       prev->active_mm = NULL;
+       } else {                                        // to user
+               /*
+                * sys_membarrier() requires an smp_mb() between setting
+                * rq->curr and returning to userspace.
+                *
+                * The below provides this either through switch_mm(), or in
+                * case 'prev->active_mm == next->mm' through
+                * finish_task_switch()'s mmdrop().
+                */
+
+               switch_mm_irqs_off(prev->active_mm, next->mm, next);
  
-       if (!prev->mm) {
-               prev->active_mm = NULL;
-               rq->prev_mm = oldmm;
+               if (!prev->mm) {                        // from kernel
+                       /* will mmdrop() in finish_task_switch(). */
+                       rq->prev_mm = prev->active_mm;
+                       prev->active_mm = NULL;
+               }
         }
  
         rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
@@ -3622,7 +3752,7 @@ static inline void sched_tick_start(int cpu) { }
  static inline void sched_tick_stop(int cpu) { }
  #endif
  
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
                                 defined(CONFIG_TRACE_PREEMPT_TOGGLE))
  /*
   * If the value passed in is equal to the current preempt count
@@ -3780,7 +3910,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  
                 p = fair_sched_class.pick_next_task(rq, prev, rf);
                 if (unlikely(p == RETRY_TASK))
-                       goto again;
+                       goto restart;
  
                 /* Assumes fair_sched_class->next == idle_sched_class */
                 if (unlikely(!p))
@@ -3789,14 +3919,19 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
                 return p;
         }
  
-again:
+restart:
+       /*
+        * Ensure that we put DL/RT tasks before the pick loop, such that they
+        * can PULL higher prio tasks when we lower the RQ 'priority'.
+        */
+       prev->sched_class->put_prev_task(rq, prev, rf);
+       if (!rq->nr_running)
+               newidle_balance(rq, rf);
+
         for_each_class(class) {
-               p = class->pick_next_task(rq, prev, rf);
-               if (p) {
-                       if (unlikely(p == RETRY_TASK))
-                               goto again;
+               p = class->pick_next_task(rq, NULL, NULL);
+               if (p)
                         return p;
-               }
         }
  
         /* The idle class should always have a runnable task: */
@@ -3823,7 +3958,7 @@ again:
   *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
   *      called on the nearest possible occasion:
   *
- *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
+ *       - If the kernel is preemptible (CONFIG_PREEMPTION=y):
   *
   *         - in syscall or exception context, at the next outmost
   *           preempt_enable(). (this might be as soon as the wake_up()'s
@@ -3832,7 +3967,7 @@ again:
   *         - in IRQ context, return from interrupt-handler to
   *           preemptible context
   *
- *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+ *       - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
   *         then at the next:
   *
   *          - cond_resched() call
@@ -4077,7 +4212,7 @@ static void __sched notrace preempt_schedule_common(void)
         } while (need_resched());
  }
  
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
  /*
   * this is the entry point to schedule() from in-kernel preemption
   * off of preempt_enable. Kernel preemptions off return from interrupt
@@ -4149,7 +4284,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
  }
  EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
  
-#endif /* CONFIG_PREEMPT */
+#endif /* CONFIG_PREEMPTION */
  
  /*
   * this is the entry point to schedule() from kernel preemption
@@ -4317,7 +4452,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
         if (queued)
                 enqueue_task(rq, p, queue_flag);
         if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
  
         check_class_changed(rq, p, prev_class, oldprio);
  out_unlock:
@@ -4384,7 +4519,7 @@ void set_user_nice(struct task_struct *p, long nice)
                         resched_curr(rq);
         }
         if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
  out_unlock:
         task_rq_unlock(rq, p, &rf);
  }
@@ -4701,6 +4836,9 @@ recheck:
                         return retval;
         }
  
+       if (pi)
+               cpuset_read_lock();
+
         /*
          * Make sure no PI-waiters arrive (or leave) while we are
          * changing the priority of the task:
@@ -4715,8 +4853,8 @@ recheck:
          * Changing the policy of the stop threads is a very bad idea:
          */
         if (p == rq->stop) {
-               task_rq_unlock(rq, p, &rf);
-               return -EINVAL;
+               retval = -EINVAL;
+               goto unlock;
         }
  
         /*
@@ -4734,8 +4872,8 @@ recheck:
                         goto change;
  
                 p->sched_reset_on_fork = reset_on_fork;
-               task_rq_unlock(rq, p, &rf);
-               return 0;
+               retval = 0;
+               goto unlock;
         }
  change:
  
@@ -4748,8 +4886,8 @@ change:
                 if (rt_bandwidth_enabled() && rt_policy(policy) &&
                                 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
                                 !task_group_is_autogroup(task_group(p))) {
-                       task_rq_unlock(rq, p, &rf);
-                       return -EPERM;
+                       retval = -EPERM;
+                       goto unlock;
                 }
  #endif
  #ifdef CONFIG_SMP
@@ -4764,8 +4902,8 @@ change:
                          */
                         if (!cpumask_subset(span, p->cpus_ptr) ||
                             rq->rd->dl_bw.bw == 0) {
-                               task_rq_unlock(rq, p, &rf);
-                               return -EPERM;
+                               retval = -EPERM;
+                               goto unlock;
                         }
                 }
  #endif
@@ -4775,6 +4913,8 @@ change:
         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                 policy = oldpolicy = -1;
                 task_rq_unlock(rq, p, &rf);
+               if (pi)
+                       cpuset_read_unlock();
                 goto recheck;
         }
  
@@ -4784,8 +4924,8 @@ change:
          * is available.
          */
         if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
-               task_rq_unlock(rq, p, &rf);
-               return -EBUSY;
+               retval = -EBUSY;
+               goto unlock;
         }
  
         p->sched_reset_on_fork = reset_on_fork;
@@ -4827,7 +4967,7 @@ change:
                 enqueue_task(rq, p, queue_flags);
         }
         if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
  
         check_class_changed(rq, p, prev_class, oldprio);
  
@@ -4835,14 +4975,22 @@ change:
         preempt_disable();
         task_rq_unlock(rq, p, &rf);
  
-       if (pi)
+       if (pi) {
+               cpuset_read_unlock();
                 rt_mutex_adjust_pi(p);
+       }
  
         /* Run balance callbacks after we've adjusted the PI chain: */
         balance_callback(rq);
         preempt_enable();
  
         return 0;
+
+unlock:
+       task_rq_unlock(rq, p, &rf);
+       if (pi)
+               cpuset_read_unlock();
+       return retval;
  }
  
  static int _sched_setscheduler(struct task_struct *p, int policy,
@@ -4926,10 +5074,15 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
         rcu_read_lock();
         retval = -ESRCH;
         p = find_process_by_pid(pid);
-       if (p != NULL)
-               retval = sched_setscheduler(p, policy, &lparam);
+       if (likely(p))
+               get_task_struct(p);
         rcu_read_unlock();
  
+       if (likely(p)) {
+               retval = sched_setscheduler(p, policy, &lparam);
+               put_task_struct(p);
+       }
+
         return retval;
  }
  
@@ -5460,7 +5613,7 @@ SYSCALL_DEFINE0(sched_yield)
         return 0;
  }
  
-#ifndef CONFIG_PREEMPT
+#ifndef CONFIG_PREEMPTION
  int __sched _cond_resched(void)
  {
         if (should_resched(0)) {
@@ -5477,7 +5630,7 @@ EXPORT_SYMBOL(_cond_resched);
   * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
   * call schedule, and on return reacquire the lock.
   *
- * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
   * operations here to prevent schedule() from being called twice (once via
   * spin_unlock(), once by hand).
   */
@@ -6016,7 +6169,7 @@ void sched_setnuma(struct task_struct *p, int nid)
         if (queued)
                 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
         if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
         task_rq_unlock(rq, p, &rf);
  }
  #endif /* CONFIG_NUMA_BALANCING */
@@ -6056,21 +6209,22 @@ static void calc_load_migrate(struct rq *rq)
                 atomic_long_add(delta, &calc_load_tasks);
  }
  
-static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+/*
+ * Pick the next task of @rq for migrate_tasks().
+ *
+ * Scheduling classes are walked with for_each_class() and each one is asked
+ * for a task through ->pick_next_task(rq, NULL, NULL). The first task found
+ * is handed straight back to its class with ->put_prev_task() and returned
+ * to the caller. Not finding any task is treated as a fatal error (BUG()).
+ */
+static struct task_struct *__pick_migrate_task(struct rq *rq)
  {
-}
+       const struct sched_class *class;
+       struct task_struct *next;
  
-static const struct sched_class fake_sched_class = {
-       .put_prev_task = put_prev_task_fake,
-};
+       for_each_class(class) {
+               next = class->pick_next_task(rq, NULL, NULL);
+               if (next) {
+                       next->sched_class->put_prev_task(rq, next, NULL);
+                       return next;
+               }
+       }
  
-static struct task_struct fake_task = {
-       /*
-        * Avoid pull_{rt,dl}_task()
-        */
-       .prio = MAX_PRIO + 1,
-       .sched_class = &fake_sched_class,
-};
+       /* The idle class should always have a runnable task */
+       BUG();
+}
  
  /*
   * Migrate all tasks from the rq, sleeping tasks will be migrated by
@@ -6113,12 +6267,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
                 if (rq->nr_running == 1)
                         break;
  
-               /*
-                * pick_next_task() assumes pinned rq->lock:
-                */
-               next = pick_next_task(rq, &fake_task, rf);
-               BUG_ON(!next);
-               put_prev_task(rq, next);
+               next = __pick_migrate_task(rq);
  
                 /*
                  * Rules for changing task_struct::cpus_mask are holding
@@ -6415,19 +6564,19 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
  
  void __init sched_init(void)
  {
-       unsigned long alloc_size = 0, ptr;
+       unsigned long ptr = 0;
         int i;
  
         wait_bit_init();
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-       alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+       ptr += 2 * nr_cpu_ids * sizeof(void **);
  #endif
  #ifdef CONFIG_RT_GROUP_SCHED
-       alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+       ptr += 2 * nr_cpu_ids * sizeof(void **);
  #endif
-       if (alloc_size) {
-               ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
+       if (ptr) {
+               ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
                 root_task_group.se = (struct sched_entity **)ptr;
@@ -6746,7 +6895,7 @@ struct task_struct *curr_task(int cpu)
  
  #ifdef CONFIG_IA64
  /**
- * set_curr_task - set the current task for a given CPU.
+ * ia64_set_curr_task - set the current task for a given CPU.
   * @cpu: the processor in question.
   * @p: the task pointer to set.
   *
@@ -6771,6 +6920,20 @@ void ia64_set_curr_task(int cpu, struct task_struct *p)
  /* task_group_lock serializes the addition/removal of task groups */
  static DEFINE_SPINLOCK(task_group_lock);
  
+/*
+ * Initialize the utilization clamps of the new task group @tg, created
+ * as a child of @parent. Does nothing without CONFIG_UCLAMP_TASK_GROUP.
+ */
+static inline void alloc_uclamp_sched_group(struct task_group *tg,
+                                           struct task_group *parent)
+{
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+       enum uclamp_id id;
+
+       for_each_clamp_id(id) {
+               struct uclamp_se *req = &tg->uclamp_req[id];
+
+               /* Effective clamps start as a copy of the parent's ones */
+               tg->uclamp[id] = parent->uclamp[id];
+
+               /* Requested clamps start at the uclamp_none() value */
+               uclamp_se_set(req, uclamp_none(id), false);
+       }
+#endif
+}
+
  static void sched_free_group(struct task_group *tg)
  {
         free_fair_sched_group(tg);
@@ -6794,6 +6957,8 @@ struct task_group *sched_create_group(struct task_group *parent)
         if (!alloc_rt_sched_group(tg, parent))
                 goto err;
  
+       alloc_uclamp_sched_group(tg, parent);
+
         return tg;
  
  err:
@@ -6897,7 +7062,7 @@ void sched_move_task(struct task_struct *tsk)
         if (queued)
                 enqueue_task(rq, tsk, queue_flags);
         if (running)
-               set_curr_task(rq, tsk);
+               set_next_task(rq, tsk);
  
         task_rq_unlock(rq, tsk, &rf);
  }
@@ -6980,10 +7145,6 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
  #ifdef CONFIG_RT_GROUP_SCHED
                 if (!sched_rt_can_attach(css_tg(css), task))
                         return -EINVAL;
-#else
-               /* We don't support RT-tasks being in separate groups */
-               if (task->sched_class != &fair_sched_class)
-                       return -EINVAL;
  #endif
                 /*
                  * Serialize against wake_up_new_task() such that if its
@@ -7014,6 +7175,178 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
                 sched_move_task(task);
  }
  
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+/*
+ * cpu_util_update_eff - recompute effective clamps below a cgroup
+ * @css: css of the task group where the update starts
+ *
+ * Walks @css and its descendants with css_for_each_descendant_pre(). For
+ * each visited group the effective value of every clamp index is computed
+ * as the group's requested value, capped by the parent's effective value
+ * when there is a parent; the effective minimum is then capped by the
+ * effective maximum.
+ *
+ * Clamp indexes whose effective value changed are stored back into the
+ * group and passed, as a bitmask, to uclamp_update_active_tasks().
+ *
+ * The callers visible in this file invoke this function inside an
+ * rcu_read_lock() section and with uclamp_mutex held.
+ */
+static void cpu_util_update_eff(struct cgroup_subsys_state *css)
+{
+       struct cgroup_subsys_state *top_css = css;
+       struct uclamp_se *uc_parent = NULL;
+       struct uclamp_se *uc_se = NULL;
+       unsigned int eff[UCLAMP_CNT];
+       enum uclamp_id clamp_id;
+       unsigned int clamps;
+
+       css_for_each_descendant_pre(css, top_css) {
+               uc_parent = css_tg(css)->parent
+                       ? css_tg(css)->parent->uclamp : NULL;
+
+               for_each_clamp_id(clamp_id) {
+                       /* Assume effective clamps matches requested clamps */
+                       eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
+                       /* Cap effective clamps with parent's effective clamps */
+                       if (uc_parent &&
+                           eff[clamp_id] > uc_parent[clamp_id].value) {
+                               eff[clamp_id] = uc_parent[clamp_id].value;
+                       }
+               }
+               /* Ensure protection is always capped by limit */
+               eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
+
+               /* Propagate most restrictive effective clamps */
+               clamps = 0x0;
+               uc_se = css_tg(css)->uclamp;
+               for_each_clamp_id(clamp_id) {
+                       if (eff[clamp_id] == uc_se[clamp_id].value)
+                               continue;
+                       uc_se[clamp_id].value = eff[clamp_id];
+                       uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
+                       clamps |= (0x1 << clamp_id);
+               }
+               /*
+                * Nothing changed for this group: move the iteration cursor
+                * to its rightmost descendant before continuing.
+                * NOTE(review): presumably this makes the pre-order walk
+                * skip the whole subtree below this group - confirm against
+                * css_rightmost_descendant().
+                */
+               if (!clamps) {
+                       css = css_rightmost_descendant(css);
+                       continue;
+               }
+
+               /* Immediately update descendants RUNNABLE tasks */
+               uclamp_update_active_tasks(css, clamps);
+       }
+}
+
+/*
+ * POW10(N) evaluates to the integer 10^N: the exponent is token-pasted onto
+ * "1e" to form the floating-point literal "1eN", which is then cast to
+ * unsigned int.
+ *
+ * Two macro levels are needed so that an argument which is itself a macro
+ * (e.g. UCLAMP_PERCENT_SHIFT) is expanded to its value before being pasted.
+ */
+#define _POW10(exp) ((unsigned int)1e##exp)
+#define POW10(exp) _POW10(exp)
+
+/*
+ * Result of converting a cgroup uclamp attribute string, as returned by
+ * capacity_from_percent().
+ */
+struct uclamp_request {
+/* Number of decimal digits kept in the percentage value */
+#define UCLAMP_PERCENT_SHIFT   2
+/* Value of @percent that represents 100% */
+#define UCLAMP_PERCENT_SCALE   (100 * POW10(UCLAMP_PERCENT_SHIFT))
+       /* Requested percentage, scaled by 10^UCLAMP_PERCENT_SHIFT */
+       s64 percent;
+       /* The same request converted to a utilization value */
+       u64 util;
+       /* 0 on success, otherwise a negative error code */
+       int ret;
+};
+
+/*
+ * capacity_from_percent - convert a cgroup uclamp attribute string
+ * @buf: NUL-terminated string written by the user; passed through strim()
+ *
+ * @buf is either the keyword "max" or a percentage, which is parsed by
+ * cgroup_parse_float() with UCLAMP_PERCENT_SHIFT decimal digits.
+ *
+ * Return: a struct uclamp_request. When .ret is 0, .percent holds the
+ * scaled percentage and .util the corresponding utilization value; "max"
+ * yields UCLAMP_PERCENT_SCALE and SCHED_CAPACITY_SCALE. Otherwise .ret is
+ * the error from cgroup_parse_float(), or -ERANGE when the percentage is
+ * negative or above 100%.
+ */
+static inline struct uclamp_request
+capacity_from_percent(char *buf)
+{
+       struct uclamp_request req = {
+               .percent = UCLAMP_PERCENT_SCALE,
+               .util = SCHED_CAPACITY_SCALE,
+               .ret = 0,
+       };
+
+       buf = strim(buf);
+       if (strcmp(buf, "max")) {
+               req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
+                                            &req.percent);
+               if (req.ret)
+                       return req;
+               /*
+                * req.percent is signed: compare it as u64 so that negative
+                * values are rejected too, instead of being shifted and
+                * stored as a bogus huge utilization below.
+                */
+               if ((u64)req.percent > UCLAMP_PERCENT_SCALE) {
+                       req.ret = -ERANGE;
+                       return req;
+               }
+
+               /* Scale percent to utilization, rounding to nearest */
+               req.util = req.percent << SCHED_CAPACITY_SHIFT;
+               req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
+       }
+
+       return req;
+}
+
+/*
+ * Common write handler behind the cgroup "uclamp.min" and "uclamp.max"
+ * attributes; @clamp_id selects which of the two clamps is written.
+ *
+ * Returns @nbytes on success, or the error reported by
+ * capacity_from_percent() when @buf cannot be converted.
+ */
+static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
+                               size_t nbytes, loff_t off,
+                               enum uclamp_id clamp_id)
+{
+       struct uclamp_request req = capacity_from_percent(buf);
+       struct uclamp_se *uc_req;
+       struct task_group *tg;
+
+       if (req.ret)
+               return req.ret;
+
+       mutex_lock(&uclamp_mutex);
+       rcu_read_lock();
+
+       tg = css_tg(of_css(of));
+       uc_req = &tg->uclamp_req[clamp_id];
+
+       /* Only rewrite the requested clamp when its value really changes */
+       if (uc_req->value != req.util)
+               uclamp_se_set(uc_req, req.util, false);
+
+       /*
+        * The percent to utilization conversion rounds and cannot be
+        * reversed, so remember the exact percentage that was written.
+        */
+       tg->uclamp_pct[clamp_id] = req.percent;
+
+       /* Re-derive the effective clamps, tracking the most restrictive */
+       cpu_util_update_eff(of_css(of));
+
+       rcu_read_unlock();
+       mutex_unlock(&uclamp_mutex);
+
+       return nbytes;
+}
+
+/* cftype .write handler for "uclamp.min": writes the UCLAMP_MIN clamp */
+static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
+                                   char *buf, size_t nbytes,
+                                   loff_t off)
+{
+       return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
+}
+
+/* cftype .write handler for "uclamp.max": writes the UCLAMP_MAX clamp */
+static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
+                                   char *buf, size_t nbytes,
+                                   loff_t off)
+{
+       return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
+}
+
+/*
+ * Print the requested clamp @clamp_id of the task group behind @sf.
+ *
+ * A request equal to SCHED_CAPACITY_SCALE is shown as "max"; any other
+ * request is shown as the stored percentage with UCLAMP_PERCENT_SHIFT
+ * decimal digits.
+ */
+static inline void cpu_uclamp_print(struct seq_file *sf,
+                                   enum uclamp_id clamp_id)
+{
+       struct task_group *tg;
+       u64 util_clamp, percent;
+       u32 rem;
+
+       rcu_read_lock();
+       tg = css_tg(seq_css(sf));
+       util_clamp = tg->uclamp_req[clamp_id].value;
+       rcu_read_unlock();
+
+       if (util_clamp != SCHED_CAPACITY_SCALE) {
+               /* Split the scaled percentage into integer and decimals */
+               percent = div_u64_rem(tg->uclamp_pct[clamp_id],
+                                     POW10(UCLAMP_PERCENT_SHIFT), &rem);
+               seq_printf(sf, "%llu.%0*u\n", percent,
+                          UCLAMP_PERCENT_SHIFT, rem);
+               return;
+       }
+
+       seq_puts(sf, "max\n");
+}
+
+/* cftype .seq_show handler for "uclamp.min"; always returns 0 */
+static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
+{
+       cpu_uclamp_print(sf, UCLAMP_MIN);
+       return 0;
+}
+
+/* cftype .seq_show handler for "uclamp.max"; always returns 0 */
+static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
+{
+       cpu_uclamp_print(sf, UCLAMP_MAX);
+       return 0;
+}
+#endif /* CONFIG_UCLAMP_TASK_GROUP */
+
  #ifdef CONFIG_FAIR_GROUP_SCHED
  static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
                                 struct cftype *cftype, u64 shareval)
@@ -7358,6 +7691,20 @@ static struct cftype cpu_legacy_files[] = {
                 .read_u64 = cpu_rt_period_read_uint,
                 .write_u64 = cpu_rt_period_write_uint,
         },
+#endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+       {
+               .name = "uclamp.min",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_min_show,
+               .write = cpu_uclamp_min_write,
+       },
+       {
+               .name = "uclamp.max",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_max_show,
+               .write = cpu_uclamp_max_write,
+       },
  #endif
         { }     /* Terminate */
  };
@@ -7525,6 +7872,20 @@ static struct cftype cpu_files[] = {
                 .seq_show = cpu_max_show,
                 .write = cpu_max_write,
         },
+#endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+       {
+               .name = "uclamp.min",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_min_show,
+               .write = cpu_uclamp_min_write,
+       },
+       {
+               .name = "uclamp.max",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_max_show,
+               .write = cpu_uclamp_max_write,
+       },
  #endif
         { }     /* terminate */
  };