Merge branch 'smp/hotplug' into sched/core, to resolve conflicts
[linux-2.6-block.git] / kernel / sched / core.c
index 9c710ad0ac22bde2ffc479dddc8e47a46b7da57c..1e622f254df41d04a2b1a6c28c056c9cbb4ba3cf 100644 (file)
@@ -33,7 +33,7 @@
 #include <linux/init.h>
 #include <linux/uaccess.h>
 #include <linux/highmem.h>
-#include <asm/mmu_context.h>
+#include <linux/mmu_context.h>
 #include <linux/interrupt.h>
 #include <linux/capability.h>
 #include <linux/completion.h>
@@ -170,6 +170,71 @@ static struct rq *this_rq_lock(void)
        return rq;
 }
 
+/*
+ * __task_rq_lock - lock the rq @p resides on; caller must hold p->pi_lock.
+ */
+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+       __acquires(rq->lock)
+{
+       struct rq *rq;
+
+       lockdep_assert_held(&p->pi_lock);
+
+       for (;;) {
+               rq = task_rq(p);
+               raw_spin_lock(&rq->lock);
+               if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+                       rf->cookie = lockdep_pin_lock(&rq->lock);
+                       return rq;
+               }
+               raw_spin_unlock(&rq->lock);
+
+               while (unlikely(task_on_rq_migrating(p)))
+                       cpu_relax();
+       }
+}
+
+/*
+ * task_rq_lock - lock p->pi_lock (irqsave into @rf) and the rq @p resides on.
+ */
+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+       __acquires(p->pi_lock)
+       __acquires(rq->lock)
+{
+       struct rq *rq;
+
+       for (;;) {
+               raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
+               rq = task_rq(p);
+               raw_spin_lock(&rq->lock);
+               /*
+                *      move_queued_task()              task_rq_lock()
+                *
+                *      ACQUIRE (rq->lock)
+                *      [S] ->on_rq = MIGRATING         [L] rq = task_rq()
+                *      WMB (__set_task_cpu())          ACQUIRE (rq->lock);
+                *      [S] ->cpu = new_cpu             [L] task_rq()
+                *                                      [L] ->on_rq
+                *      RELEASE (rq->lock)
+                *
+                * If we observe the old cpu in task_rq_lock, the acquire of
+                * the old rq->lock will fully serialize against the stores.
+                *
+                * If we observe the new cpu in task_rq_lock, the acquire will
+                * pair with the WMB to ensure we must then also see migrating.
+                */
+               if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+                       rf->cookie = lockdep_pin_lock(&rq->lock);
+                       return rq;
+               }
+               raw_spin_unlock(&rq->lock);
+               raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
+
+               while (unlikely(task_on_rq_migrating(p)))
+                       cpu_relax();
+       }
+}
+
 #ifdef CONFIG_SCHED_HRTICK
 /*
  * Use HR-timers to deliver accurate preemption points.
@@ -369,7 +434,7 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
         * wakeup due to that.
         *
         * This cmpxchg() implies a full barrier, which pairs with the write
-        * barrier implied by the wakeup in wake_up_list().
+        * barrier implied by the wakeup in wake_up_q().
         */
        if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
                return;
@@ -565,17 +630,8 @@ bool sched_can_stop_tick(struct rq *rq)
                return false;
 
        /*
-        * FIFO realtime policy runs the highest priority task (after DEADLINE).
-        * Other runnable tasks are of a lower priority. The scheduler tick
-        * isn't needed.
-        */
-       fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
-       if (fifo_nr_running)
-               return true;
-
-       /*
-        * Round-robin realtime tasks time slice with other tasks at the same
-        * realtime priority.
+        * If there is more than one RR task, we need the tick to effect the
+        * actual RR behaviour.
         */
        if (rq->rt.rr_nr_running) {
                if (rq->rt.rr_nr_running == 1)
@@ -584,8 +640,20 @@ bool sched_can_stop_tick(struct rq *rq)
                        return false;
        }
 
-       /* Normal multitasking need periodic preemption checks */
-       if (rq->cfs.nr_running > 1)
+       /*
+        * If there are no RR tasks, but there are FIFO tasks, we can skip the
+        * tick: there is no forced preemption between FIFO tasks.
+        */
+       fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
+       if (fifo_nr_running)
+               return true;
+
+       /*
+        * If there are no DL, RR or FIFO tasks, there must only be CFS tasks left;
+        * if there's more than one we need the tick for involuntary
+        * preemption.
+        */
+       if (rq->nr_running > 1)
                return false;
 
        return true;
@@ -1053,11 +1121,11 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 {
        const struct cpumask *cpu_valid_mask = cpu_active_mask;
        unsigned int dest_cpu;
-       unsigned long flags;
+       struct rq_flags rf;
        struct rq *rq;
        int ret = 0;
 
-       rq = task_rq_lock(p, &flags);
+       rq = task_rq_lock(p, &rf);
 
        if (p->flags & PF_KTHREAD) {
                /*
@@ -1103,7 +1171,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
        if (task_running(rq, p) || p->state == TASK_WAKING) {
                struct migration_arg arg = { p, dest_cpu };
                /* Need help from migration thread: drop lock and wait. */
-               task_rq_unlock(rq, p, &flags);
+               task_rq_unlock(rq, p, &rf);
                stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
                tlb_migrate_finish(p->mm);
                return 0;
@@ -1112,12 +1180,12 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
                 * OK, since we're going to drop the lock immediately
                 * afterwards anyway.
                 */
-               lockdep_unpin_lock(&rq->lock);
+               lockdep_unpin_lock(&rq->lock, rf.cookie);
                rq = move_queued_task(rq, p, dest_cpu);
-               lockdep_pin_lock(&rq->lock);
+               lockdep_repin_lock(&rq->lock, rf.cookie);
        }
 out:
-       task_rq_unlock(rq, p, &flags);
+       task_rq_unlock(rq, p, &rf);
 
        return ret;
 }
@@ -1301,8 +1369,8 @@ out:
  */
 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 {
-       unsigned long flags;
        int running, queued;
+       struct rq_flags rf;
        unsigned long ncsw;
        struct rq *rq;
 
@@ -1337,14 +1405,14 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
                 * lock now, to be *sure*. If we're wrong, we'll
                 * just go back and repeat.
                 */
-               rq = task_rq_lock(p, &flags);
+               rq = task_rq_lock(p, &rf);
                trace_sched_wait_task(p);
                running = task_running(rq, p);
                queued = task_on_rq_queued(p);
                ncsw = 0;
                if (!match_state || p->state == match_state)
                        ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
-               task_rq_unlock(rq, p, &flags);
+               task_rq_unlock(rq, p, &rf);
 
                /*
                 * If it changed from the expected state, bail out now.
@@ -1605,8 +1673,8 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
 /*
  * Mark the task runnable and perform wakeup-preemption.
  */
-static void
-ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
+                          struct pin_cookie cookie)
 {
        check_preempt_curr(rq, p, wake_flags);
        p->state = TASK_RUNNING;
@@ -1618,9 +1686,9 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
                 * Our task @p is fully woken up and running; so its safe to
                 * drop the rq->lock, hereafter rq is only used for statistics.
                 */
-               lockdep_unpin_lock(&rq->lock);
+               lockdep_unpin_lock(&rq->lock, cookie);
                p->sched_class->task_woken(rq, p);
-               lockdep_pin_lock(&rq->lock);
+               lockdep_repin_lock(&rq->lock, cookie);
        }
 
        if (rq->idle_stamp) {
@@ -1638,7 +1706,8 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 }
 
 static void
-ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
+                struct pin_cookie cookie)
 {
        lockdep_assert_held(&rq->lock);
 
@@ -1648,7 +1717,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
 #endif
 
        ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
-       ttwu_do_wakeup(rq, p, wake_flags);
+       ttwu_do_wakeup(rq, p, wake_flags, cookie);
 }
 
 /*
@@ -1659,17 +1728,18 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
  */
 static int ttwu_remote(struct task_struct *p, int wake_flags)
 {
+       struct rq_flags rf;
        struct rq *rq;
        int ret = 0;
 
-       rq = __task_rq_lock(p);
+       rq = __task_rq_lock(p, &rf);
        if (task_on_rq_queued(p)) {
                /* check_preempt_curr() may use rq clock */
                update_rq_clock(rq);
-               ttwu_do_wakeup(rq, p, wake_flags);
+               ttwu_do_wakeup(rq, p, wake_flags, rf.cookie);
                ret = 1;
        }
-       __task_rq_unlock(rq);
+       __task_rq_unlock(rq, &rf);
 
        return ret;
 }
@@ -1679,6 +1749,7 @@ void sched_ttwu_pending(void)
 {
        struct rq *rq = this_rq();
        struct llist_node *llist = llist_del_all(&rq->wake_list);
+       struct pin_cookie cookie;
        struct task_struct *p;
        unsigned long flags;
 
@@ -1686,15 +1757,15 @@ void sched_ttwu_pending(void)
                return;
 
        raw_spin_lock_irqsave(&rq->lock, flags);
-       lockdep_pin_lock(&rq->lock);
+       cookie = lockdep_pin_lock(&rq->lock);
 
        while (llist) {
                p = llist_entry(llist, struct task_struct, wake_entry);
                llist = llist_next(llist);
-               ttwu_do_activate(rq, p, 0);
+               ttwu_do_activate(rq, p, 0, cookie);
        }
 
-       lockdep_unpin_lock(&rq->lock);
+       lockdep_unpin_lock(&rq->lock, cookie);
        raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
@@ -1781,6 +1852,7 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
 static void ttwu_queue(struct task_struct *p, int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
+       struct pin_cookie cookie;
 
 #if defined(CONFIG_SMP)
        if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
@@ -1791,9 +1863,9 @@ static void ttwu_queue(struct task_struct *p, int cpu)
 #endif
 
        raw_spin_lock(&rq->lock);
-       lockdep_pin_lock(&rq->lock);
-       ttwu_do_activate(rq, p, 0);
-       lockdep_unpin_lock(&rq->lock);
+       cookie = lockdep_pin_lock(&rq->lock);
+       ttwu_do_activate(rq, p, 0, cookie);
+       lockdep_unpin_lock(&rq->lock, cookie);
        raw_spin_unlock(&rq->lock);
 }
 
@@ -1990,7 +2062,7 @@ out:
  * ensure that this_rq() is locked, @p is bound to this_rq() and not
  * the current task.
  */
-static void try_to_wake_up_local(struct task_struct *p)
+static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie)
 {
        struct rq *rq = task_rq(p);
 
@@ -2007,11 +2079,11 @@ static void try_to_wake_up_local(struct task_struct *p)
                 * disabled avoiding further scheduler activity on it and we've
                 * not yet picked a replacement task.
                 */
-               lockdep_unpin_lock(&rq->lock);
+               lockdep_unpin_lock(&rq->lock, cookie);
                raw_spin_unlock(&rq->lock);
                raw_spin_lock(&p->pi_lock);
                raw_spin_lock(&rq->lock);
-               lockdep_pin_lock(&rq->lock);
+               lockdep_repin_lock(&rq->lock, cookie);
        }
 
        if (!(p->state & TASK_NORMAL))
@@ -2022,7 +2094,7 @@ static void try_to_wake_up_local(struct task_struct *p)
        if (!task_on_rq_queued(p))
                ttwu_activate(rq, p, ENQUEUE_WAKEUP);
 
-       ttwu_do_wakeup(rq, p, 0);
+       ttwu_do_wakeup(rq, p, 0, cookie);
        if (schedstat_enabled())
                ttwu_stat(p, smp_processor_id(), 0);
 out:
@@ -2382,7 +2454,8 @@ static int dl_overflow(struct task_struct *p, int policy,
        u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
        int cpus, err = -1;
 
-       if (new_bw == p->dl.dl_bw)
+       /* !deadline task may carry old deadline bandwidth */
+       if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
                return 0;
 
        /*
@@ -2421,12 +2494,12 @@ extern void init_dl_bw(struct dl_bw *dl_b);
  */
 void wake_up_new_task(struct task_struct *p)
 {
-       unsigned long flags;
+       struct rq_flags rf;
        struct rq *rq;
 
-       raw_spin_lock_irqsave(&p->pi_lock, flags);
        /* Initialize new task's runnable average */
        init_entity_runnable_average(&p->se);
+       raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
 #ifdef CONFIG_SMP
        /*
         * Fork balancing, do it here and not earlier because:
@@ -2435,8 +2508,10 @@ void wake_up_new_task(struct task_struct *p)
         */
        set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
+       /* Post initialize new task's util average when its cfs_rq is set */
+       post_init_entity_util_avg(&p->se);
 
-       rq = __task_rq_lock(p);
+       rq = __task_rq_lock(p, &rf);
        activate_task(rq, p, 0);
        p->on_rq = TASK_ON_RQ_QUEUED;
        trace_sched_wakeup_new(p);
@@ -2447,12 +2522,12 @@ void wake_up_new_task(struct task_struct *p)
                 * Nothing relies on rq->lock after this, so its fine to
                 * drop it.
                 */
-               lockdep_unpin_lock(&rq->lock);
+               lockdep_unpin_lock(&rq->lock, rf.cookie);
                p->sched_class->task_woken(rq, p);
-               lockdep_pin_lock(&rq->lock);
+               lockdep_repin_lock(&rq->lock, rf.cookie);
        }
 #endif
-       task_rq_unlock(rq, p, &flags);
+       task_rq_unlock(rq, p, &rf);
 }
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2714,7 +2789,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
  */
 static __always_inline struct rq *
 context_switch(struct rq *rq, struct task_struct *prev,
-              struct task_struct *next)
+              struct task_struct *next, struct pin_cookie cookie)
 {
        struct mm_struct *mm, *oldmm;
 
@@ -2734,7 +2809,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
                atomic_inc(&oldmm->mm_count);
                enter_lazy_tlb(oldmm, next);
        } else
-               switch_mm(oldmm, mm, next);
+               switch_mm_irqs_off(oldmm, mm, next);
 
        if (!prev->mm) {
                prev->active_mm = NULL;
@@ -2746,7 +2821,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
         * of the scheduler it's an obvious special-case), so we
         * do an early lockdep release here:
         */
-       lockdep_unpin_lock(&rq->lock);
+       lockdep_unpin_lock(&rq->lock, cookie);
        spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 
        /* Here we just switch the register state and the stack. */
@@ -2868,7 +2943,7 @@ EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
  */
 unsigned long long task_sched_runtime(struct task_struct *p)
 {
-       unsigned long flags;
+       struct rq_flags rf;
        struct rq *rq;
        u64 ns;
 
@@ -2888,7 +2963,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
                return p->se.sum_exec_runtime;
 #endif
 
-       rq = task_rq_lock(p, &flags);
+       rq = task_rq_lock(p, &rf);
        /*
         * Must be ->curr _and_ ->on_rq.  If dequeued, we would
         * project cycles that may never be accounted to this
@@ -2899,7 +2974,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
                p->sched_class->update_curr(rq);
        }
        ns = p->se.sum_exec_runtime;
-       task_rq_unlock(rq, p, &flags);
+       task_rq_unlock(rq, p, &rf);
 
        return ns;
 }
@@ -2919,7 +2994,7 @@ void scheduler_tick(void)
        raw_spin_lock(&rq->lock);
        update_rq_clock(rq);
        curr->sched_class->task_tick(rq, curr, 0);
-       update_cpu_load_active(rq);
+       cpu_load_update_active(rq);
        calc_global_load_tick(rq);
        raw_spin_unlock(&rq->lock);
 
@@ -2962,6 +3037,20 @@ u64 scheduler_tick_max_deferment(void)
 
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
                                defined(CONFIG_PREEMPT_TRACER))
+/*
+ * If @val is equal to the current preempt count then we just disabled
+ * preemption: note the disabling IP and start timing the latency.
+ */
+static inline void preempt_latency_start(int val)
+{
+       if (preempt_count() == val) {
+               unsigned long ip = get_lock_parent_ip();
+#ifdef CONFIG_DEBUG_PREEMPT
+               current->preempt_disable_ip = ip;
+#endif
+               trace_preempt_off(CALLER_ADDR0, ip);
+       }
+}
 
 void preempt_count_add(int val)
 {
@@ -2980,17 +3069,21 @@ void preempt_count_add(int val)
        DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
                                PREEMPT_MASK - 10);
 #endif
-       if (preempt_count() == val) {
-               unsigned long ip = get_lock_parent_ip();
-#ifdef CONFIG_DEBUG_PREEMPT
-               current->preempt_disable_ip = ip;
-#endif
-               trace_preempt_off(CALLER_ADDR0, ip);
-       }
+       preempt_latency_start(val);
 }
 EXPORT_SYMBOL(preempt_count_add);
 NOKPROBE_SYMBOL(preempt_count_add);
 
+/*
+ * If the value passed in is equal to the current preempt count
+ * then we are about to enable preemption. Stop timing the latency.
+ */
+static inline void preempt_latency_stop(int val)
+{
+       if (preempt_count() == val)
+               trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
+}
+
 void preempt_count_sub(int val)
 {
 #ifdef CONFIG_DEBUG_PREEMPT
@@ -3007,13 +3100,15 @@ void preempt_count_sub(int val)
                return;
 #endif
 
-       if (preempt_count() == val)
-               trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
+       preempt_latency_stop(val);
        __preempt_count_sub(val);
 }
 EXPORT_SYMBOL(preempt_count_sub);
 NOKPROBE_SYMBOL(preempt_count_sub);
 
+#else
+static inline void preempt_latency_start(int val) { }
+static inline void preempt_latency_stop(int val) { }
 #endif
 
 /*
@@ -3066,7 +3161,7 @@ static inline void schedule_debug(struct task_struct *prev)
  * Pick up the highest-prio task:
  */
 static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev)
+pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
 {
        const struct sched_class *class = &fair_sched_class;
        struct task_struct *p;
@@ -3077,20 +3172,20 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
         */
        if (likely(prev->sched_class == class &&
                   rq->nr_running == rq->cfs.h_nr_running)) {
-               p = fair_sched_class.pick_next_task(rq, prev);
+               p = fair_sched_class.pick_next_task(rq, prev, cookie);
                if (unlikely(p == RETRY_TASK))
                        goto again;
 
                /* assumes fair_sched_class->next == idle_sched_class */
                if (unlikely(!p))
-                       p = idle_sched_class.pick_next_task(rq, prev);
+                       p = idle_sched_class.pick_next_task(rq, prev, cookie);
 
                return p;
        }
 
 again:
        for_each_class(class) {
-               p = class->pick_next_task(rq, prev);
+               p = class->pick_next_task(rq, prev, cookie);
                if (p) {
                        if (unlikely(p == RETRY_TASK))
                                goto again;
@@ -3144,6 +3239,7 @@ static void __sched notrace __schedule(bool preempt)
 {
        struct task_struct *prev, *next;
        unsigned long *switch_count;
+       struct pin_cookie cookie;
        struct rq *rq;
        int cpu;
 
@@ -3177,7 +3273,7 @@ static void __sched notrace __schedule(bool preempt)
         */
        smp_mb__before_spinlock();
        raw_spin_lock(&rq->lock);
-       lockdep_pin_lock(&rq->lock);
+       cookie = lockdep_pin_lock(&rq->lock);
 
        rq->clock_skip_update <<= 1; /* promote REQ to ACT */
 
@@ -3199,7 +3295,7 @@ static void __sched notrace __schedule(bool preempt)
 
                                to_wakeup = wq_worker_sleeping(prev);
                                if (to_wakeup)
-                                       try_to_wake_up_local(to_wakeup);
+                                       try_to_wake_up_local(to_wakeup, cookie);
                        }
                }
                switch_count = &prev->nvcsw;
@@ -3208,7 +3304,7 @@ static void __sched notrace __schedule(bool preempt)
        if (task_on_rq_queued(prev))
                update_rq_clock(rq);
 
-       next = pick_next_task(rq, prev);
+       next = pick_next_task(rq, prev, cookie);
        clear_tsk_need_resched(prev);
        clear_preempt_need_resched();
        rq->clock_skip_update = 0;
@@ -3219,9 +3315,9 @@ static void __sched notrace __schedule(bool preempt)
                ++*switch_count;
 
                trace_sched_switch(preempt, prev, next);
-               rq = context_switch(rq, prev, next); /* unlocks the rq */
+               rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */
        } else {
-               lockdep_unpin_lock(&rq->lock);
+               lockdep_unpin_lock(&rq->lock, cookie);
                raw_spin_unlock_irq(&rq->lock);
        }
 
@@ -3288,8 +3384,23 @@ void __sched schedule_preempt_disabled(void)
 static void __sched notrace preempt_schedule_common(void)
 {
        do {
+               /*
+                * Because the function tracer can trace preempt_count_sub()
+                * and it also uses preempt_enable/disable_notrace(), if
+                * NEED_RESCHED is set, the preempt_enable_notrace() called
+                * by the function tracer will call this function again and
+                * cause infinite recursion.
+                *
+                * Preemption must be disabled here before the function
+                * tracer can trace. Break up preempt_disable() into two
+                * calls. One to disable preemption without fear of being
+                * traced. The other to still record the preemption latency,
+                * which can also be traced by the function tracer.
+                */
                preempt_disable_notrace();
+               preempt_latency_start(1);
                __schedule(true);
+               preempt_latency_stop(1);
                preempt_enable_no_resched_notrace();
 
                /*
@@ -3341,7 +3452,21 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
                return;
 
        do {
+               /*
+                * Because the function tracer can trace preempt_count_sub()
+                * and it also uses preempt_enable/disable_notrace(), if
+                * NEED_RESCHED is set, the preempt_enable_notrace() called
+                * by the function tracer will call this function again and
+                * cause infinite recursion.
+                *
+                * Preemption must be disabled here before the function
+                * tracer can trace. Break up preempt_disable() into two
+                * calls. One to disable preemption without fear of being
+                * traced. The other to still record the preemption latency,
+                * which can also be traced by the function tracer.
+                */
                preempt_disable_notrace();
+               preempt_latency_start(1);
                /*
                 * Needs preempt disabled in case user_exit() is traced
                 * and the tracer calls preempt_enable_notrace() causing
@@ -3351,6 +3476,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
                __schedule(true);
                exception_exit(prev_ctx);
 
+               preempt_latency_stop(1);
                preempt_enable_no_resched_notrace();
        } while (need_resched());
 }
@@ -3407,12 +3533,13 @@ EXPORT_SYMBOL(default_wake_function);
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
        int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
-       struct rq *rq;
        const struct sched_class *prev_class;
+       struct rq_flags rf;
+       struct rq *rq;
 
        BUG_ON(prio > MAX_PRIO);
 
-       rq = __task_rq_lock(p);
+       rq = __task_rq_lock(p, &rf);
 
        /*
         * Idle task boosting is a nono in general. There is one
@@ -3488,7 +3615,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
        check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
        preempt_disable(); /* avoid rq from going away on us */
-       __task_rq_unlock(rq);
+       __task_rq_unlock(rq, &rf);
 
        balance_callback(rq);
        preempt_enable();
@@ -3498,7 +3625,7 @@ out_unlock:
 void set_user_nice(struct task_struct *p, long nice)
 {
        int old_prio, delta, queued;
-       unsigned long flags;
+       struct rq_flags rf;
        struct rq *rq;
 
        if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
@@ -3507,7 +3634,7 @@ void set_user_nice(struct task_struct *p, long nice)
         * We have to be careful, if called from sys_setpriority(),
         * the task might be in the middle of scheduling on another CPU.
         */
-       rq = task_rq_lock(p, &flags);
+       rq = task_rq_lock(p, &rf);
        /*
         * The RT priorities are set via sched_setscheduler(), but we still
         * allow the 'normal' nice value to be set - but as expected
@@ -3538,7 +3665,7 @@ void set_user_nice(struct task_struct *p, long nice)
                        resched_curr(rq);
        }
 out_unlock:
-       task_rq_unlock(rq, p, &flags);
+       task_rq_unlock(rq, p, &rf);
 }
 EXPORT_SYMBOL(set_user_nice);
 
@@ -3835,11 +3962,11 @@ static int __sched_setscheduler(struct task_struct *p,
                      MAX_RT_PRIO - 1 - attr->sched_priority;
        int retval, oldprio, oldpolicy = -1, queued, running;
        int new_effective_prio, policy = attr->sched_policy;
-       unsigned long flags;
        const struct sched_class *prev_class;
-       struct rq *rq;
+       struct rq_flags rf;
        int reset_on_fork;
        int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
+       struct rq *rq;
 
        /* may grab non-irq protected spin_locks */
        BUG_ON(in_interrupt());
@@ -3934,13 +4061,13 @@ recheck:
         * To be able to change p->policy safely, the appropriate
         * runqueue lock must be held.
         */
-       rq = task_rq_lock(p, &flags);
+       rq = task_rq_lock(p, &rf);
 
        /*
         * Changing the policy of the stop threads its a very bad idea
         */
        if (p == rq->stop) {
-               task_rq_unlock(rq, p, &flags);
+               task_rq_unlock(rq, p, &rf);
                return -EINVAL;
        }
 
@@ -3957,7 +4084,7 @@ recheck:
                        goto change;
 
                p->sched_reset_on_fork = reset_on_fork;
-               task_rq_unlock(rq, p, &flags);
+               task_rq_unlock(rq, p, &rf);
                return 0;
        }
 change:
@@ -3971,7 +4098,7 @@ change:
                if (rt_bandwidth_enabled() && rt_policy(policy) &&
                                task_group(p)->rt_bandwidth.rt_runtime == 0 &&
                                !task_group_is_autogroup(task_group(p))) {
-                       task_rq_unlock(rq, p, &flags);
+                       task_rq_unlock(rq, p, &rf);
                        return -EPERM;
                }
 #endif
@@ -3986,7 +4113,7 @@ change:
                         */
                        if (!cpumask_subset(span, &p->cpus_allowed) ||
                            rq->rd->dl_bw.bw == 0) {
-                               task_rq_unlock(rq, p, &flags);
+                               task_rq_unlock(rq, p, &rf);
                                return -EPERM;
                        }
                }
@@ -3996,7 +4123,7 @@ change:
        /* recheck policy now with rq lock held */
        if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                policy = oldpolicy = -1;
-               task_rq_unlock(rq, p, &flags);
+               task_rq_unlock(rq, p, &rf);
                goto recheck;
        }
 
@@ -4006,7 +4133,7 @@ change:
         * is available.
         */
        if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
-               task_rq_unlock(rq, p, &flags);
+               task_rq_unlock(rq, p, &rf);
                return -EBUSY;
        }
 
@@ -4051,7 +4178,7 @@ change:
 
        check_class_changed(rq, p, prev_class, oldprio);
        preempt_disable(); /* avoid rq from going away on us */
-       task_rq_unlock(rq, p, &flags);
+       task_rq_unlock(rq, p, &rf);
 
        if (pi)
                rt_mutex_adjust_pi(p);
@@ -4904,10 +5031,10 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
 {
        struct task_struct *p;
        unsigned int time_slice;
-       unsigned long flags;
+       struct rq_flags rf;
+       struct timespec t;
        struct rq *rq;
        int retval;
-       struct timespec t;
 
        if (pid < 0)
                return -EINVAL;
@@ -4922,11 +5049,11 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
        if (retval)
                goto out_unlock;
 
-       rq = task_rq_lock(p, &flags);
+       rq = task_rq_lock(p, &rf);
        time_slice = 0;
        if (p->sched_class->get_rr_interval)
                time_slice = p->sched_class->get_rr_interval(rq, p);
-       task_rq_unlock(rq, p, &flags);
+       task_rq_unlock(rq, p, &rf);
 
        rcu_read_unlock();
        jiffies_to_timespec(time_slice, &t);
@@ -5002,7 +5129,8 @@ void show_state_filter(unsigned long state_filter)
        touch_all_softlockup_watchdogs();
 
 #ifdef CONFIG_SCHED_DEBUG
-       sysrq_sched_debug_show();
+       if (!state_filter)
+               sysrq_sched_debug_show();
 #endif
        rcu_read_unlock();
        /*
@@ -5191,11 +5319,11 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
  */
 void sched_setnuma(struct task_struct *p, int nid)
 {
-       struct rq *rq;
-       unsigned long flags;
        bool queued, running;
+       struct rq_flags rf;
+       struct rq *rq;
 
-       rq = task_rq_lock(p, &flags);
+       rq = task_rq_lock(p, &rf);
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
 
@@ -5210,7 +5338,7 @@ void sched_setnuma(struct task_struct *p, int nid)
                p->sched_class->set_curr_task(rq);
        if (queued)
                enqueue_task(rq, p, ENQUEUE_RESTORE);
-       task_rq_unlock(rq, p, &flags);
+       task_rq_unlock(rq, p, &rf);
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
@@ -5226,7 +5354,7 @@ void idle_task_exit(void)
        BUG_ON(cpu_online(smp_processor_id()));
 
        if (mm != &init_mm) {
-               switch_mm(mm, &init_mm, current);
+               switch_mm_irqs_off(mm, &init_mm, current);
                finish_arch_post_lock_switch();
        }
        mmdrop(mm);
@@ -5274,6 +5402,7 @@ static void migrate_tasks(struct rq *dead_rq)
 {
        struct rq *rq = dead_rq;
        struct task_struct *next, *stop = rq->stop;
+       struct pin_cookie cookie;
        int dest_cpu;
 
        /*
@@ -5305,8 +5434,8 @@ static void migrate_tasks(struct rq *dead_rq)
                /*
                 * pick_next_task assumes pinned rq->lock.
                 */
-               lockdep_pin_lock(&rq->lock);
-               next = pick_next_task(rq, &fake_task);
+               cookie = lockdep_pin_lock(&rq->lock);
+               next = pick_next_task(rq, &fake_task, cookie);
                BUG_ON(!next);
                next->sched_class->put_prev_task(rq, next);
 
@@ -5319,7 +5448,7 @@ static void migrate_tasks(struct rq *dead_rq)
                 * because !cpu_active at this point, which means load-balance
                 * will not interfere. Also, stop-machine.
                 */
-               lockdep_unpin_lock(&rq->lock);
+               lockdep_unpin_lock(&rq->lock, cookie);
                raw_spin_unlock(&rq->lock);
                raw_spin_lock(&next->pi_lock);
                raw_spin_lock(&rq->lock);
@@ -7279,8 +7408,6 @@ void __init sched_init(void)
                for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
                        rq->cpu_load[j] = 0;
 
-               rq->last_load_update_tick = jiffies;
-
 #ifdef CONFIG_SMP
                rq->sd = NULL;
                rq->rd = NULL;
@@ -7299,12 +7426,13 @@ void __init sched_init(void)
 
                rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ_COMMON
+               rq->last_load_update_tick = jiffies;
                rq->nohz_flags = 0;
 #endif
 #ifdef CONFIG_NO_HZ_FULL
                rq->last_sched_tick = 0;
 #endif
-#endif
+#endif /* CONFIG_SMP */
                init_rq_hrtick(rq);
                atomic_set(&rq->nr_iowait, 0);
        }
@@ -7587,10 +7715,10 @@ void sched_move_task(struct task_struct *tsk)
 {
        struct task_group *tg;
        int queued, running;
-       unsigned long flags;
+       struct rq_flags rf;
        struct rq *rq;
 
-       rq = task_rq_lock(tsk, &flags);
+       rq = task_rq_lock(tsk, &rf);
 
        running = task_current(rq, tsk);
        queued = task_on_rq_queued(tsk);
@@ -7622,7 +7750,7 @@ void sched_move_task(struct task_struct *tsk)
        if (queued)
                enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
 
-       task_rq_unlock(rq, tsk, &flags);
+       task_rq_unlock(rq, tsk, &rf);
 }
 #endif /* CONFIG_CGROUP_SCHED */
 
@@ -7842,7 +7970,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
 static int sched_rt_global_constraints(void)
 {
        unsigned long flags;
-       int i, ret = 0;
+       int i;
 
        raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
        for_each_possible_cpu(i) {
@@ -7854,7 +7982,7 @@ static int sched_rt_global_constraints(void)
        }
        raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
 
-       return ret;
+       return 0;
 }
 #endif /* CONFIG_RT_GROUP_SCHED */