Merge branch 'sched/rt' into sched/core, to pick up -rt changes
[linux-2.6-block.git] / kernel / sched / core.c
index 604a5e137efeef856a62be9fdccafccae84f721a..87b84a726db448c76edd1fd46a387e392de9255c 100644
@@ -773,6 +773,18 @@ static void set_load_weight(struct task_struct *p, bool update_load)
 }
 
 #ifdef CONFIG_UCLAMP_TASK
+/*
+ * Serializes updates of utilization clamp values
+ *
+ * The (slow-path) user-space triggers utilization clamp value updates which
+ * can require updates of the (fast-path) scheduler data structures used to
+ * support enqueue/dequeue operations.
+ * While the per-CPU rq lock protects fast-path update operations, user-space
+ * requests are serialized using a mutex to reduce the risk of conflicting
+ * updates or API abuses.
+ */
+static DEFINE_MUTEX(uclamp_mutex);
+
 /* Max allowed minimum utilization */
 unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
 
@@ -798,7 +810,7 @@ static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
        return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
 }
 
-static inline unsigned int uclamp_none(int clamp_id)
+static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id)
 {
        if (clamp_id == UCLAMP_MIN)
                return 0;
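
For reference, uclamp_bucket_id() and uclamp_bucket_base_value() split the clamp range [0..SCHED_CAPACITY_SCALE] into UCLAMP_BUCKETS equal slots. A minimal user-space sketch of that arithmetic, assuming the default SCHED_CAPACITY_SCALE of 1024 and CONFIG_UCLAMP_BUCKETS_COUNT of 5 (so UCLAMP_BUCKET_DELTA is 205):

#include <stdio.h>

/* User-space model of the bucket math; constants assume the defaults. */
#define SCHED_CAPACITY_SCALE    1024
#define UCLAMP_BUCKETS          5
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))
#define UCLAMP_BUCKET_DELTA     DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)

static unsigned int bucket_id(unsigned int clamp_value)
{
        unsigned int id = clamp_value / UCLAMP_BUCKET_DELTA;

        /* The last bucket also absorbs the rounding remainder at the top. */
        return id < UCLAMP_BUCKETS - 1 ? id : UCLAMP_BUCKETS - 1;
}

int main(void)
{
        unsigned int v;

        /* e.g. 300 -> bucket 1 (base 205), 1024 -> bucket 4 (base 820) */
        for (v = 0; v <= SCHED_CAPACITY_SCALE; v += 256)
                printf("clamp %4u -> bucket %u (base value %u)\n",
                       v, bucket_id(v), UCLAMP_BUCKET_DELTA * bucket_id(v));
        return 0;
}
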
@@ -814,7 +826,7 @@ static inline void uclamp_se_set(struct uclamp_se *uc_se,
 }
 
 static inline unsigned int
-uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
+uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
                  unsigned int clamp_value)
 {
        /*
@@ -830,7 +842,7 @@ uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
        return uclamp_none(UCLAMP_MIN);
 }
 
-static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
+static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
                                     unsigned int clamp_value)
 {
        /* Reset max-clamp retention only on idle exit */
@@ -841,8 +853,8 @@ static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
 }
 
 static inline
-unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
-                                unsigned int clamp_value)
+enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
+                                  unsigned int clamp_value)
 {
        struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
        int bucket_id = UCLAMP_BUCKETS - 1;
@@ -861,16 +873,42 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
        return uclamp_idle_value(rq, clamp_id, clamp_value);
 }
 
+static inline struct uclamp_se
+uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
+{
+       struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+       struct uclamp_se uc_max;
+
+       /*
+        * Tasks in autogroups or in the root task group are
+        * restricted by the system defaults.
+        */
+       if (task_group_is_autogroup(task_group(p)))
+               return uc_req;
+       if (task_group(p) == &root_task_group)
+               return uc_req;
+
+       uc_max = task_group(p)->uclamp[clamp_id];
+       if (uc_req.value > uc_max.value || !uc_req.user_defined)
+               return uc_max;
+#endif
+
+       return uc_req;
+}
+
 /*
  * The effective clamp bucket index of a task depends on, by increasing
  * priority:
  * - the task specific clamp value, when explicitly requested from userspace
+ * - the task group effective clamp value, for tasks neither in the root
+ *   group nor in an autogroup
  * - the system default clamp value, defined by the sysadmin
  */
 static inline struct uclamp_se
-uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
+uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
 {
-       struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+       struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
        struct uclamp_se uc_max = uclamp_default[clamp_id];
 
        /* System default restrictions always apply */
@@ -880,7 +918,7 @@ uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
        return uc_req;
 }
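
A small user-space model of the resolution order described above, with illustrative numbers only: a task asking for a minimum boost of 800 inside a group whose effective uclamp.min is 512, under a permissive system default of 1024, ends up clamped to 512:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative model of uclamp_tg_restrict() followed by uclamp_eff_get(). */
struct se {
        unsigned int value;
        bool user_defined;
};

static unsigned int effective_clamp(struct se req, struct se tg_eff,
                                    struct se sys_default)
{
        /* Task group restriction: cap the request, and replace it entirely
         * when the task never set a value of its own. */
        if (req.value > tg_eff.value || !req.user_defined)
                req = tg_eff;

        /* System default restrictions always apply to the value. */
        if (req.value > sys_default.value)
                req = sys_default;

        return req.value;
}

int main(void)
{
        struct se task = { .value = 800, .user_defined = true };
        struct se tg   = { .value = 512 };      /* group's effective uclamp.min */
        struct se def  = { .value = 1024 };     /* system default: no restriction */

        printf("effective: %u\n", effective_clamp(task, tg, def)); /* 512 */
        return 0;
}
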
 
-unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
+enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
 {
        struct uclamp_se uc_eff;
 
@@ -904,7 +942,7 @@ unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
  * for each bucket when all its RUNNABLE tasks require the same clamp.
  */
 static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
-                                   unsigned int clamp_id)
+                                   enum uclamp_id clamp_id)
 {
        struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
        struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@ -942,7 +980,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
  * enforce the expected state and warn.
  */
 static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
-                                   unsigned int clamp_id)
+                                   enum uclamp_id clamp_id)
 {
        struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
        struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@ -981,7 +1019,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
 
 static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
 {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
 
        if (unlikely(!p->sched_class->uclamp_enabled))
                return;
@@ -996,7 +1034,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
 
 static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
 {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
 
        if (unlikely(!p->sched_class->uclamp_enabled))
                return;
@@ -1005,15 +1043,82 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
                uclamp_rq_dec_id(rq, p, clamp_id);
 }
 
+static inline void
+uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
+{
+       struct rq_flags rf;
+       struct rq *rq;
+
+       /*
+        * Lock the task and the rq where the task is (or was) queued.
+        *
+        * We might lock the (previous) rq of a !RUNNABLE task, but that's the
+        * price to pay to safely serialize util_{min,max} updates with
+        * enqueues, dequeues and migration operations.
+        * This is the same locking schema used by __set_cpus_allowed_ptr().
+        */
+       rq = task_rq_lock(p, &rf);
+
+       /*
+        * Setting the clamp bucket is serialized by task_rq_lock().
+        * If the task is not yet RUNNABLE and its task_struct is not
+        * affecting a valid clamp bucket, the next time it's enqueued,
+        * it will already see the updated clamp bucket value.
+        */
+       if (p->uclamp[clamp_id].active) {
+               uclamp_rq_dec_id(rq, p, clamp_id);
+               uclamp_rq_inc_id(rq, p, clamp_id);
+       }
+
+       task_rq_unlock(rq, p, &rf);
+}
+
+static inline void
+uclamp_update_active_tasks(struct cgroup_subsys_state *css,
+                          unsigned int clamps)
+{
+       enum uclamp_id clamp_id;
+       struct css_task_iter it;
+       struct task_struct *p;
+
+       css_task_iter_start(css, 0, &it);
+       while ((p = css_task_iter_next(&it))) {
+               for_each_clamp_id(clamp_id) {
+                       if ((0x1 << clamp_id) & clamps)
+                               uclamp_update_active(p, clamp_id);
+               }
+       }
+       css_task_iter_end(&it);
+}
+
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+static void cpu_util_update_eff(struct cgroup_subsys_state *css);
+static void uclamp_update_root_tg(void)
+{
+       struct task_group *tg = &root_task_group;
+
+       uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
+                     sysctl_sched_uclamp_util_min, false);
+       uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
+                     sysctl_sched_uclamp_util_max, false);
+
+       rcu_read_lock();
+       cpu_util_update_eff(&root_task_group.css);
+       rcu_read_unlock();
+}
+#else
+static void uclamp_update_root_tg(void) { }
+#endif
+
 int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
                                void __user *buffer, size_t *lenp,
                                loff_t *ppos)
 {
+       bool update_root_tg = false;
        int old_min, old_max;
-       static DEFINE_MUTEX(mutex);
        int result;
 
-       mutex_lock(&mutex);
+       mutex_lock(&uclamp_mutex);
        old_min = sysctl_sched_uclamp_util_min;
        old_max = sysctl_sched_uclamp_util_max;
 
@@ -1032,23 +1137,30 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
        if (old_min != sysctl_sched_uclamp_util_min) {
                uclamp_se_set(&uclamp_default[UCLAMP_MIN],
                              sysctl_sched_uclamp_util_min, false);
+               update_root_tg = true;
        }
        if (old_max != sysctl_sched_uclamp_util_max) {
                uclamp_se_set(&uclamp_default[UCLAMP_MAX],
                              sysctl_sched_uclamp_util_max, false);
+               update_root_tg = true;
        }
 
+       if (update_root_tg)
+               uclamp_update_root_tg();
+
        /*
-        * Updating all the RUNNABLE task is expensive, keep it simple and do
-        * just a lazy update at each next enqueue time.
+        * We update all RUNNABLE tasks only when task groups are in use.
+        * Otherwise, keep it simple and do just a lazy update at each next
+        * task enqueue time.
         */
+
        goto done;
 
 undo:
        sysctl_sched_uclamp_util_min = old_min;
        sysctl_sched_uclamp_util_max = old_max;
 done:
-       mutex_unlock(&mutex);
+       mutex_unlock(&uclamp_mutex);
 
        return result;
 }
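
The slow path above is what a plain write to the procfs knobs ends up in. A hedged user-space sketch, assuming the defaults are exposed as /proc/sys/kernel/sched_util_clamp_{min,max} and take values in [0..1024]:

#include <stdio.h>

int main(void)
{
        /* Path and value are assumptions for illustration only. */
        FILE *f = fopen("/proc/sys/kernel/sched_util_clamp_min", "w");

        if (!f) {
                perror("sched_util_clamp_min");
                return 1;
        }

        /* Cap every task's requested minimum utilization at 512 (~50%). */
        fprintf(f, "512\n");
        return fclose(f) ? 1 : 0;
}

With CONFIG_UCLAMP_TASK_GROUP enabled such a write now also refreshes the root task group and, via cpu_util_update_eff(), any RUNNABLE tasks whose effective clamps change; without it the update stays lazy and is applied at the next enqueue.
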
@@ -1075,7 +1187,7 @@ static int uclamp_validate(struct task_struct *p,
 static void __setscheduler_uclamp(struct task_struct *p,
                                  const struct sched_attr *attr)
 {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
 
        /*
         * On scheduling class change, reset to default clamps for tasks
@@ -1112,7 +1224,7 @@ static void __setscheduler_uclamp(struct task_struct *p,
 
 static void uclamp_fork(struct task_struct *p)
 {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
 
        for_each_clamp_id(clamp_id)
                p->uclamp[clamp_id].active = false;
@@ -1134,9 +1246,11 @@ static void uclamp_fork(struct task_struct *p)
 static void __init init_uclamp(void)
 {
        struct uclamp_se uc_max = {};
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
        int cpu;
 
+       mutex_init(&uclamp_mutex);
+
        for_each_possible_cpu(cpu) {
                memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
                cpu_rq(cpu)->uclamp_flags = 0;
@@ -1149,8 +1263,13 @@ static void __init init_uclamp(void)
 
        /* System defaults allow max clamp values for both indexes */
        uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
-       for_each_clamp_id(clamp_id)
+       for_each_clamp_id(clamp_id) {
                uclamp_default[clamp_id] = uc_max;
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+               root_task_group.uclamp_req[clamp_id] = uc_max;
+               root_task_group.uclamp[clamp_id] = uc_max;
+#endif
+       }
 }
 
 #else /* CONFIG_UCLAMP_TASK */
@@ -1494,7 +1613,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
        if (queued)
                enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
        if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
 }
 
 /*
@@ -3214,12 +3333,8 @@ static __always_inline struct rq *
 context_switch(struct rq *rq, struct task_struct *prev,
               struct task_struct *next, struct rq_flags *rf)
 {
-       struct mm_struct *mm, *oldmm;
-
        prepare_task_switch(rq, prev, next);
 
-       mm = next->mm;
-       oldmm = prev->active_mm;
        /*
         * For paravirt, this is coupled with an exit in switch_to to
         * combine the page table reload and the switch backend into
@@ -3228,22 +3343,37 @@ context_switch(struct rq *rq, struct task_struct *prev,
        arch_start_context_switch(prev);
 
        /*
-        * If mm is non-NULL, we pass through switch_mm(). If mm is
-        * NULL, we will pass through mmdrop() in finish_task_switch().
-        * Both of these contain the full memory barrier required by
-        * membarrier after storing to rq->curr, before returning to
-        * user-space.
+        * kernel -> kernel   lazy + transfer active
+        *   user -> kernel   lazy + mmgrab() active
+        *
+        * kernel ->   user   switch + mmdrop() active
+        *   user ->   user   switch
         */
-       if (!mm) {
-               next->active_mm = oldmm;
-               mmgrab(oldmm);
-               enter_lazy_tlb(oldmm, next);
-       } else
-               switch_mm_irqs_off(oldmm, mm, next);
+       if (!next->mm) {                                // to kernel
+               enter_lazy_tlb(prev->active_mm, next);
+
+               next->active_mm = prev->active_mm;
+               if (prev->mm)                           // from user
+                       mmgrab(prev->active_mm);
+               else
+                       prev->active_mm = NULL;
+       } else {                                        // to user
+               /*
+                * sys_membarrier() requires an smp_mb() between setting
+                * rq->curr and returning to userspace.
+                *
+                * The below provides this either through switch_mm(), or in
+                * case 'prev->active_mm == next->mm' through
+                * finish_task_switch()'s mmdrop().
+                */
+
+               switch_mm_irqs_off(prev->active_mm, next->mm, next);
 
-       if (!prev->mm) {
-               prev->active_mm = NULL;
-               rq->prev_mm = oldmm;
+               if (!prev->mm) {                        // from kernel
+                       /* will mmdrop() in finish_task_switch(). */
+                       rq->prev_mm = prev->active_mm;
+                       prev->active_mm = NULL;
+               }
        }
 
        rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
@@ -3486,8 +3616,36 @@ void scheduler_tick(void)
 
 struct tick_work {
        int                     cpu;
+       atomic_t                state;
        struct delayed_work     work;
 };
+/* Values for ->state, see diagram below. */
+#define TICK_SCHED_REMOTE_OFFLINE      0
+#define TICK_SCHED_REMOTE_OFFLINING    1
+#define TICK_SCHED_REMOTE_RUNNING      2
+
+/*
+ * State diagram for ->state:
+ *
+ *
+ *          TICK_SCHED_REMOTE_OFFLINE
+ *                    |   ^
+ *                    |   |
+ *                    |   | sched_tick_remote()
+ *                    |   |
+ *                    |   |
+ *                    +--TICK_SCHED_REMOTE_OFFLINING
+ *                    |   ^
+ *                    |   |
+ * sched_tick_start() |   | sched_tick_stop()
+ *                    |   |
+ *                    V   |
+ *          TICK_SCHED_REMOTE_RUNNING
+ *
+ *
+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
+ * and sched_tick_start() are happy to leave the state in RUNNING.
+ */
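
A compact user-space model of those transitions, using C11 atomics in place of the kernel atomic_t API (atomic_fetch_add_unless() is emulated with a CAS loop); it only illustrates why a pending sched_tick_remote() stops requeueing itself once sched_tick_stop() has moved the state to OFFLINING:

#include <stdatomic.h>
#include <stdio.h>

enum { OFFLINE, OFFLINING, RUNNING };   /* mirrors TICK_SCHED_REMOTE_* */

/* Emulation of the kernel's atomic_fetch_add_unless(v, a, u). */
static int fetch_add_unless(atomic_int *v, int a, int u)
{
        int c = atomic_load(v);

        while (c != u && !atomic_compare_exchange_weak(v, &c, c + a))
                ;
        return c;
}

int main(void)
{
        atomic_int state = ATOMIC_VAR_INIT(OFFLINE);
        int os;

        /* sched_tick_start(): OFFLINE -> RUNNING, (re)arms the work. */
        os = atomic_exchange(&state, RUNNING);
        printf("start: old=%d, queue work: %d\n", os, os == OFFLINE);

        /* sched_tick_stop(): RUNNING -> OFFLINING, does not cancel the work. */
        os = atomic_exchange(&state, OFFLINING);
        printf("stop:  old=%d (expected RUNNING=%d)\n", os, RUNNING);

        /* sched_tick_remote(): decrement unless still RUNNING, so
         * OFFLINING -> OFFLINE and the work is not requeued. */
        os = fetch_add_unless(&state, -1, RUNNING);
        printf("tick:  old=%d, requeue: %d, state now %d\n",
               os, os == RUNNING, atomic_load(&state));
        return 0;
}
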
 
 static struct tick_work __percpu *tick_work_cpu;
 
@@ -3500,6 +3658,7 @@ static void sched_tick_remote(struct work_struct *work)
        struct task_struct *curr;
        struct rq_flags rf;
        u64 delta;
+       int os;
 
        /*
         * Handle the tick only if it appears the remote CPU is running in full
@@ -3513,7 +3672,7 @@ static void sched_tick_remote(struct work_struct *work)
 
        rq_lock_irq(rq, &rf);
        curr = rq->curr;
-       if (is_idle_task(curr))
+       if (is_idle_task(curr) || cpu_is_offline(cpu))
                goto out_unlock;
 
        update_rq_clock(rq);
@@ -3533,13 +3692,18 @@ out_requeue:
        /*
         * Run the remote tick once per second (1Hz). This arbitrary
         * frequency is large enough to avoid overload but short enough
-        * to keep scheduler internal stats reasonably up to date.
+        * to keep scheduler internal stats reasonably up to date.  But
+        * first update state to reflect hotplug activity if required.
         */
-       queue_delayed_work(system_unbound_wq, dwork, HZ);
+       os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
+       WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
+       if (os == TICK_SCHED_REMOTE_RUNNING)
+               queue_delayed_work(system_unbound_wq, dwork, HZ);
 }
 
 static void sched_tick_start(int cpu)
 {
+       int os;
        struct tick_work *twork;
 
        if (housekeeping_cpu(cpu, HK_FLAG_TICK))
@@ -3548,15 +3712,20 @@ static void sched_tick_start(int cpu)
        WARN_ON_ONCE(!tick_work_cpu);
 
        twork = per_cpu_ptr(tick_work_cpu, cpu);
-       twork->cpu = cpu;
-       INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
-       queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+       os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
+       WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
+       if (os == TICK_SCHED_REMOTE_OFFLINE) {
+               twork->cpu = cpu;
+               INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
+               queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+       }
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
 static void sched_tick_stop(int cpu)
 {
        struct tick_work *twork;
+       int os;
 
        if (housekeeping_cpu(cpu, HK_FLAG_TICK))
                return;
@@ -3564,7 +3733,10 @@ static void sched_tick_stop(int cpu)
        WARN_ON_ONCE(!tick_work_cpu);
 
        twork = per_cpu_ptr(tick_work_cpu, cpu);
-       cancel_delayed_work_sync(&twork->work);
+       /* There cannot be competing actions, but don't rely on stop-machine. */
+       os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
+       WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
+       /* Don't cancel, as this would mess up the state machine. */
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
@@ -3572,7 +3744,6 @@ int __init sched_tick_offload_init(void)
 {
        tick_work_cpu = alloc_percpu(struct tick_work);
        BUG_ON(!tick_work_cpu);
-
        return 0;
 }
 
@@ -3739,7 +3910,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 
                p = fair_sched_class.pick_next_task(rq, prev, rf);
                if (unlikely(p == RETRY_TASK))
-                       goto again;
+                       goto restart;
 
                /* Assumes fair_sched_class->next == idle_sched_class */
                if (unlikely(!p))
@@ -3748,14 +3919,19 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
                return p;
        }
 
-again:
+restart:
+       /*
+        * Ensure that we put DL/RT tasks before the pick loop, such that they
+        * can PULL higher prio tasks when we lower the RQ 'priority'.
+        */
+       prev->sched_class->put_prev_task(rq, prev, rf);
+       if (!rq->nr_running)
+               newidle_balance(rq, rf);
+
        for_each_class(class) {
-               p = class->pick_next_task(rq, prev, rf);
-               if (p) {
-                       if (unlikely(p == RETRY_TASK))
-                               goto again;
+               p = class->pick_next_task(rq, NULL, NULL);
+               if (p)
                        return p;
-               }
        }
 
        /* The idle class should always have a runnable task: */
@@ -4273,7 +4449,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
        if (queued)
                enqueue_task(rq, p, queue_flag);
        if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
 
        check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
@@ -4340,7 +4516,7 @@ void set_user_nice(struct task_struct *p, long nice)
                        resched_curr(rq);
        }
        if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
 out_unlock:
        task_rq_unlock(rq, p, &rf);
 }
@@ -4657,6 +4833,9 @@ recheck:
                        return retval;
        }
 
+       if (pi)
+               cpuset_read_lock();
+
        /*
         * Make sure no PI-waiters arrive (or leave) while we are
         * changing the priority of the task:
@@ -4671,8 +4850,8 @@ recheck:
         * Changing the policy of the stop threads is a very bad idea:
         */
        if (p == rq->stop) {
-               task_rq_unlock(rq, p, &rf);
-               return -EINVAL;
+               retval = -EINVAL;
+               goto unlock;
        }
 
        /*
@@ -4690,8 +4869,8 @@ recheck:
                        goto change;
 
                p->sched_reset_on_fork = reset_on_fork;
-               task_rq_unlock(rq, p, &rf);
-               return 0;
+               retval = 0;
+               goto unlock;
        }
 change:
 
@@ -4704,8 +4883,8 @@ change:
                if (rt_bandwidth_enabled() && rt_policy(policy) &&
                                task_group(p)->rt_bandwidth.rt_runtime == 0 &&
                                !task_group_is_autogroup(task_group(p))) {
-                       task_rq_unlock(rq, p, &rf);
-                       return -EPERM;
+                       retval = -EPERM;
+                       goto unlock;
                }
 #endif
 #ifdef CONFIG_SMP
@@ -4720,8 +4899,8 @@ change:
                         */
                        if (!cpumask_subset(span, p->cpus_ptr) ||
                            rq->rd->dl_bw.bw == 0) {
-                               task_rq_unlock(rq, p, &rf);
-                               return -EPERM;
+                               retval = -EPERM;
+                               goto unlock;
                        }
                }
 #endif
@@ -4731,6 +4910,8 @@ change:
        if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                policy = oldpolicy = -1;
                task_rq_unlock(rq, p, &rf);
+               if (pi)
+                       cpuset_read_unlock();
                goto recheck;
        }
 
@@ -4740,8 +4921,8 @@ change:
         * is available.
         */
        if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
-               task_rq_unlock(rq, p, &rf);
-               return -EBUSY;
+               retval = -EBUSY;
+               goto unlock;
        }
 
        p->sched_reset_on_fork = reset_on_fork;
@@ -4783,7 +4964,7 @@ change:
                enqueue_task(rq, p, queue_flags);
        }
        if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
 
        check_class_changed(rq, p, prev_class, oldprio);
 
@@ -4791,14 +4972,22 @@ change:
        preempt_disable();
        task_rq_unlock(rq, p, &rf);
 
-       if (pi)
+       if (pi) {
+               cpuset_read_unlock();
                rt_mutex_adjust_pi(p);
+       }
 
        /* Run balance callbacks after we've adjusted the PI chain: */
        balance_callback(rq);
        preempt_enable();
 
        return 0;
+
+unlock:
+       task_rq_unlock(rq, p, &rf);
+       if (pi)
+               cpuset_read_unlock();
+       return retval;
 }
 
 static int _sched_setscheduler(struct task_struct *p, int policy,
@@ -4882,10 +5071,15 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
        rcu_read_lock();
        retval = -ESRCH;
        p = find_process_by_pid(pid);
-       if (p != NULL)
-               retval = sched_setscheduler(p, policy, &lparam);
+       if (likely(p))
+               get_task_struct(p);
        rcu_read_unlock();
 
+       if (likely(p)) {
+               retval = sched_setscheduler(p, policy, &lparam);
+               put_task_struct(p);
+       }
+
        return retval;
 }
 
@@ -5972,7 +6166,7 @@ void sched_setnuma(struct task_struct *p, int nid)
        if (queued)
                enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
        if (running)
-               set_curr_task(rq, p);
+               set_next_task(rq, p);
        task_rq_unlock(rq, p, &rf);
 }
 #endif /* CONFIG_NUMA_BALANCING */
@@ -6012,21 +6206,22 @@ static void calc_load_migrate(struct rq *rq)
                atomic_long_add(delta, &calc_load_tasks);
 }
 
-static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+static struct task_struct *__pick_migrate_task(struct rq *rq)
 {
-}
+       const struct sched_class *class;
+       struct task_struct *next;
 
-static const struct sched_class fake_sched_class = {
-       .put_prev_task = put_prev_task_fake,
-};
+       for_each_class(class) {
+               next = class->pick_next_task(rq, NULL, NULL);
+               if (next) {
+                       next->sched_class->put_prev_task(rq, next, NULL);
+                       return next;
+               }
+       }
 
-static struct task_struct fake_task = {
-       /*
-        * Avoid pull_{rt,dl}_task()
-        */
-       .prio = MAX_PRIO + 1,
-       .sched_class = &fake_sched_class,
-};
+       /* The idle class should always have a runnable task */
+       BUG();
+}
 
 /*
  * Migrate all tasks from the rq, sleeping tasks will be migrated by
@@ -6069,12 +6264,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
                if (rq->nr_running == 1)
                        break;
 
-               /*
-                * pick_next_task() assumes pinned rq->lock:
-                */
-               next = pick_next_task(rq, &fake_task, rf);
-               BUG_ON(!next);
-               put_prev_task(rq, next);
+               next = __pick_migrate_task(rq);
 
                /*
                 * Rules for changing task_struct::cpus_mask are holding
@@ -6371,19 +6561,19 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
 
 void __init sched_init(void)
 {
-       unsigned long alloc_size = 0, ptr;
+       unsigned long ptr = 0;
        int i;
 
        wait_bit_init();
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-       alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+       ptr += 2 * nr_cpu_ids * sizeof(void **);
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
-       alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+       ptr += 2 * nr_cpu_ids * sizeof(void **);
 #endif
-       if (alloc_size) {
-               ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
+       if (ptr) {
+               ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
                root_task_group.se = (struct sched_entity **)ptr;
@@ -6702,7 +6892,7 @@ struct task_struct *curr_task(int cpu)
 
 #ifdef CONFIG_IA64
 /**
- * set_curr_task - set the current task for a given CPU.
+ * ia64_set_curr_task - set the current task for a given CPU.
  * @cpu: the processor in question.
  * @p: the task pointer to set.
  *
@@ -6727,6 +6917,20 @@ void ia64_set_curr_task(int cpu, struct task_struct *p)
 /* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
 
+static inline void alloc_uclamp_sched_group(struct task_group *tg,
+                                           struct task_group *parent)
+{
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+       enum uclamp_id clamp_id;
+
+       for_each_clamp_id(clamp_id) {
+               uclamp_se_set(&tg->uclamp_req[clamp_id],
+                             uclamp_none(clamp_id), false);
+               tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
+       }
+#endif
+}
+
 static void sched_free_group(struct task_group *tg)
 {
        free_fair_sched_group(tg);
@@ -6750,6 +6954,8 @@ struct task_group *sched_create_group(struct task_group *parent)
        if (!alloc_rt_sched_group(tg, parent))
                goto err;
 
+       alloc_uclamp_sched_group(tg, parent);
+
        return tg;
 
 err:
@@ -6853,7 +7059,7 @@ void sched_move_task(struct task_struct *tsk)
        if (queued)
                enqueue_task(rq, tsk, queue_flags);
        if (running)
-               set_curr_task(rq, tsk);
+               set_next_task(rq, tsk);
 
        task_rq_unlock(rq, tsk, &rf);
 }
@@ -6936,10 +7142,6 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
 #ifdef CONFIG_RT_GROUP_SCHED
                if (!sched_rt_can_attach(css_tg(css), task))
                        return -EINVAL;
-#else
-               /* We don't support RT-tasks being in separate groups */
-               if (task->sched_class != &fair_sched_class)
-                       return -EINVAL;
 #endif
                /*
                 * Serialize against wake_up_new_task() such that if its
@@ -6970,6 +7172,178 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
                sched_move_task(task);
 }
 
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+static void cpu_util_update_eff(struct cgroup_subsys_state *css)
+{
+       struct cgroup_subsys_state *top_css = css;
+       struct uclamp_se *uc_parent = NULL;
+       struct uclamp_se *uc_se = NULL;
+       unsigned int eff[UCLAMP_CNT];
+       enum uclamp_id clamp_id;
+       unsigned int clamps;
+
+       css_for_each_descendant_pre(css, top_css) {
+               uc_parent = css_tg(css)->parent
+                       ? css_tg(css)->parent->uclamp : NULL;
+
+               for_each_clamp_id(clamp_id) {
+                       /* Assume effective clamps match requested clamps */
+                       eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
+                       /* Cap effective clamps with parent's effective clamps */
+                       if (uc_parent &&
+                           eff[clamp_id] > uc_parent[clamp_id].value) {
+                               eff[clamp_id] = uc_parent[clamp_id].value;
+                       }
+               }
+               /* Ensure protection is always capped by limit */
+               eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
+
+               /* Propagate most restrictive effective clamps */
+               clamps = 0x0;
+               uc_se = css_tg(css)->uclamp;
+               for_each_clamp_id(clamp_id) {
+                       if (eff[clamp_id] == uc_se[clamp_id].value)
+                               continue;
+                       uc_se[clamp_id].value = eff[clamp_id];
+                       uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
+                       clamps |= (0x1 << clamp_id);
+               }
+               if (!clamps) {
+                       css = css_rightmost_descendant(css);
+                       continue;
+               }
+
+               /* Immediately update descendants RUNNABLE tasks */
+               uclamp_update_active_tasks(css, clamps);
+       }
+}
+
+/*
+ * Integer 10^N with a given N exponent by casting to integer the literal "1eN"
+ * C expression. Since there is no way to convert a macro argument (N) into a
+ * character constant, use two levels of macros.
+ */
+#define _POW10(exp) ((unsigned int)1e##exp)
+#define POW10(exp) _POW10(exp)
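
A standalone check of the expansion (the constants match UCLAMP_PERCENT_SHIFT and UCLAMP_PERCENT_SCALE defined just below): the outer macro expands the argument before the inner one pastes it onto "1e".

#include <stdio.h>

#define _POW10(exp) ((unsigned int)1e##exp)
#define POW10(exp) _POW10(exp)
#define UCLAMP_PERCENT_SHIFT    2
#define UCLAMP_PERCENT_SCALE    (100 * POW10(UCLAMP_PERCENT_SHIFT))

int main(void)
{
        /* POW10(UCLAMP_PERCENT_SHIFT) -> _POW10(2) -> (unsigned int)1e2 == 100 */
        printf("POW10(2) = %u, UCLAMP_PERCENT_SCALE = %u\n",
               POW10(UCLAMP_PERCENT_SHIFT), UCLAMP_PERCENT_SCALE);
        return 0;
}
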
+
+struct uclamp_request {
+#define UCLAMP_PERCENT_SHIFT   2
+#define UCLAMP_PERCENT_SCALE   (100 * POW10(UCLAMP_PERCENT_SHIFT))
+       s64 percent;
+       u64 util;
+       int ret;
+};
+
+static inline struct uclamp_request
+capacity_from_percent(char *buf)
+{
+       struct uclamp_request req = {
+               .percent = UCLAMP_PERCENT_SCALE,
+               .util = SCHED_CAPACITY_SCALE,
+               .ret = 0,
+       };
+
+       buf = strim(buf);
+       if (strcmp(buf, "max")) {
+               req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
+                                            &req.percent);
+               if (req.ret)
+                       return req;
+               if (req.percent > UCLAMP_PERCENT_SCALE) {
+                       req.ret = -ERANGE;
+                       return req;
+               }
+
+               req.util = req.percent << SCHED_CAPACITY_SHIFT;
+               req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
+       }
+
+       return req;
+}
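
Worked example of the conversion: writing "50.0" makes cgroup_parse_float() return 5000 (two fixed fractional digits), so util becomes DIV_ROUND_CLOSEST_ULL(5000 << 10, 10000) == 512. A user-space sketch of just that rounding step:

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT    10
#define UCLAMP_PERCENT_SCALE    10000ULL        /* 100 * 10^2, two fractional digits */
#define DIV_ROUND_CLOSEST_ULL(x, d) (((x) + (d) / 2) / (d))

/* percent is the fixed-point value cgroup_parse_float() would produce,
 * e.g. "50.0" -> 5000, "12.34" -> 1234. */
static unsigned long long percent_to_util(unsigned long long percent)
{
        return DIV_ROUND_CLOSEST_ULL(percent << SCHED_CAPACITY_SHIFT,
                                     UCLAMP_PERCENT_SCALE);
}

int main(void)
{
        printf("50.0%%  -> %llu\n", percent_to_util(5000));  /* 512 */
        printf("20.0%%  -> %llu\n", percent_to_util(2000));  /* 205 */
        printf("100.0%% -> %llu\n", percent_to_util(10000)); /* 1024 */
        return 0;
}

The 20.0% case lands on 205/1024, which would read back as roughly 20.02%; that rounding loss is why the write path below also stores the exact request in tg->uclamp_pct.
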
+
+static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
+                               size_t nbytes, loff_t off,
+                               enum uclamp_id clamp_id)
+{
+       struct uclamp_request req;
+       struct task_group *tg;
+
+       req = capacity_from_percent(buf);
+       if (req.ret)
+               return req.ret;
+
+       mutex_lock(&uclamp_mutex);
+       rcu_read_lock();
+
+       tg = css_tg(of_css(of));
+       if (tg->uclamp_req[clamp_id].value != req.util)
+               uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
+
+       /*
+        * Because the conversion rounding is not recoverable, we keep track
+        * of the exact requested value.
+        */
+       tg->uclamp_pct[clamp_id] = req.percent;
+
+       /* Update effective clamps to track the most restrictive value */
+       cpu_util_update_eff(of_css(of));
+
+       rcu_read_unlock();
+       mutex_unlock(&uclamp_mutex);
+
+       return nbytes;
+}
+
+static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
+                                   char *buf, size_t nbytes,
+                                   loff_t off)
+{
+       return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
+}
+
+static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
+                                   char *buf, size_t nbytes,
+                                   loff_t off)
+{
+       return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
+}
+
+static inline void cpu_uclamp_print(struct seq_file *sf,
+                                   enum uclamp_id clamp_id)
+{
+       struct task_group *tg;
+       u64 util_clamp;
+       u64 percent;
+       u32 rem;
+
+       rcu_read_lock();
+       tg = css_tg(seq_css(sf));
+       util_clamp = tg->uclamp_req[clamp_id].value;
+       rcu_read_unlock();
+
+       if (util_clamp == SCHED_CAPACITY_SCALE) {
+               seq_puts(sf, "max\n");
+               return;
+       }
+
+       percent = tg->uclamp_pct[clamp_id];
+       percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
+       seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
+}
+
+static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
+{
+       cpu_uclamp_print(sf, UCLAMP_MIN);
+       return 0;
+}
+
+static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
+{
+       cpu_uclamp_print(sf, UCLAMP_MAX);
+       return 0;
+}
+#endif /* CONFIG_UCLAMP_TASK_GROUP */
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
                                struct cftype *cftype, u64 shareval)
@@ -7314,6 +7688,20 @@ static struct cftype cpu_legacy_files[] = {
                .read_u64 = cpu_rt_period_read_uint,
                .write_u64 = cpu_rt_period_write_uint,
        },
+#endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+       {
+               .name = "uclamp.min",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_min_show,
+               .write = cpu_uclamp_min_write,
+       },
+       {
+               .name = "uclamp.max",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_max_show,
+               .write = cpu_uclamp_max_write,
+       },
 #endif
        { }     /* Terminate */
 };
@@ -7481,6 +7869,20 @@ static struct cftype cpu_files[] = {
                .seq_show = cpu_max_show,
                .write = cpu_max_write,
        },
+#endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+       {
+               .name = "uclamp.min",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_min_show,
+               .write = cpu_uclamp_min_write,
+       },
+       {
+               .name = "uclamp.max",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_max_show,
+               .write = cpu_uclamp_max_write,
+       },
 #endif
        { }     /* terminate */
 };
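
With the controller files in place, the attributes accept a percentage with up to two decimal places, or "max". A sketch of driving them from user space; the cgroup mount point and group name ("/sys/fs/cgroup/example") are assumptions for illustration:

#include <stdio.h>

static int write_attr(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f)
                return -1;
        fprintf(f, "%s\n", val);
        return fclose(f);
}

int main(void)
{
        /* Boost the group's tasks to at least ~25% capacity ... */
        write_attr("/sys/fs/cgroup/example/cpu.uclamp.min", "25.0");
        /* ... and cap their requested maximum utilization at 75.5%. */
        write_attr("/sys/fs/cgroup/example/cpu.uclamp.max", "75.5");
        return 0;
}
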