Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
author     Linus Torvalds <torvalds@linux-foundation.org>
           Tue, 7 Mar 2017 22:42:34 +0000 (14:42 -0800)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Tue, 7 Mar 2017 22:42:34 +0000 (14:42 -0800)
Pull scheduler fixes from Ingo Molnar:
 "A fix for KVM's scheduler clock which (erroneously) was always marked
  unstable, a fix for RT/DL load balancing, plus latency fixes"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/clock, x86/tsc: Rework the x86 'unstable' sched_clock() interface
  sched/core: Fix pick_next_task() for RT,DL
  sched/fair: Make select_idle_cpu() more aggressive
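
The first entry above is the headline fix: a guest's sched_clock() may be a paravirt clock (e.g. kvm_clock) that remains usable even when the raw TSC is declared unstable, so clearing the sched_clock stable flag from the TSC watchdog paths was wrong in that case. Below is a minimal standalone model of the guard introduced in the arch/x86/kernel/tsc.c hunks further down -- the function names mirror the kernel, everything else is mocked up for illustration:

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long long (*sched_clock_fn)(void);

/* Stand-ins for the real backends; in the kernel these are rdtsc- and
 * pvclock-based respectively. */
static unsigned long long native_sched_clock(void) { return 0; }
static unsigned long long kvm_sched_clock(void)    { return 0; }

/* A KVM guest installs its paravirt clock as the sched_clock() backend. */
static sched_clock_fn active_sched_clock = kvm_sched_clock;
static bool sched_clock_stable_flag = true;

static bool using_native_sched_clock(void)
{
        return active_sched_clock == native_sched_clock;
}

static void mark_tsc_unstable(const char *reason)
{
        /* Before the fix, the stable flag was cleared unconditionally here. */
        if (using_native_sched_clock())
                sched_clock_stable_flag = false;
        printf("Marking TSC unstable due to %s; sched_clock still stable: %d\n",
               reason, (int)sched_clock_stable_flag);
}

int main(void)
{
        mark_tsc_unstable("clocksource watchdog"); /* kvm_clock keeps its stability */
        return 0;
}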

arch/x86/kernel/cpu/amd.c
arch/x86/kernel/cpu/centaur.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/cyrix.c
arch/x86/kernel/cpu/intel.c
arch/x86/kernel/cpu/transmeta.c
arch/x86/kernel/tsc.c
kernel/sched/core.c
kernel/sched/fair.c

index 35a5d5dca2fae5fb68d522658824440f3b736d8b,30d924ae5c3465b5dc974e66c74c95c2fd47bb2a..c36140d788fe215aadb3a8f27a8de040f2c44c06
@@@ -5,7 -5,6 +5,7 @@@
  
  #include <linux/io.h>
  #include <linux/sched.h>
 +#include <linux/sched/clock.h>
  #include <linux/random.h>
  #include <asm/processor.h>
  #include <asm/apic.h>
@@@ -556,10 -555,6 +556,6 @@@ static void early_init_amd(struct cpuin
        if (c->x86_power & (1 << 8)) {
                set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
                set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
-               if (check_tsc_unstable())
-                       clear_sched_clock_stable();
-       } else {
-               clear_sched_clock_stable();
        }
  
        /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */
index adc0ebd8bed0e17be1716f3fb7c3eab51b7fa0c4,bad8ff078a214f752b8377f2d61e4f323bbeeeb1..43955ee6715b1876b89ebd615b8eb171bbde1dd0
@@@ -1,6 -1,5 +1,6 @@@
  
  #include <linux/sched.h>
 +#include <linux/sched/clock.h>
  
  #include <asm/cpufeature.h>
  #include <asm/e820.h>
@@@ -105,8 -104,6 +105,6 @@@ static void early_init_centaur(struct c
  #ifdef CONFIG_X86_64
        set_cpu_cap(c, X86_FEATURE_SYSENTER32);
  #endif
-       clear_sched_clock_stable();
  }
  
  static void init_centaur(struct cpuinfo_x86 *c)
index b11b38c3b0bde194b9139ebfa0d3bc251b96f803,9d98e2e15d54d381dc00bcf6cec6899f2ff3591e..58094a1f9e9d301e11d2c93a1ecc126e1715002e
@@@ -7,9 -7,7 +7,9 @@@
  #include <linux/string.h>
  #include <linux/ctype.h>
  #include <linux/delay.h>
 -#include <linux/sched.h>
 +#include <linux/sched/mm.h>
 +#include <linux/sched/clock.h>
 +#include <linux/sched/task.h>
  #include <linux/init.h>
  #include <linux/kprobes.h>
  #include <linux/kgdb.h>
@@@ -88,7 -86,6 +88,6 @@@ static void default_init(struct cpuinfo
                        strcpy(c->x86_model_id, "386");
        }
  #endif
-       clear_sched_clock_stable();
  }
  
  static const struct cpu_dev default_cpu = {
@@@ -1077,8 -1074,6 +1076,6 @@@ static void identify_cpu(struct cpuinfo
         */
        if (this_cpu->c_init)
                this_cpu->c_init(c);
-       else
-               clear_sched_clock_stable();
  
        /* Disable the PN if appropriate */
        squash_the_stupid_serial_number(c);
index 0a3bc19de0177e93f81ae24c58264e7205406fd6,31e679238e8d0bc94db07415e69dd466bbc09445..a70fd61095f8a73baa5eb7c486afd6ff19cd4fd1
@@@ -10,7 -10,6 +10,7 @@@
  #include <asm/tsc.h>
  #include <asm/cpufeature.h>
  #include <linux/sched.h>
 +#include <linux/sched/clock.h>
  
  #include "cpu.h"
  
@@@ -185,7 -184,6 +185,6 @@@ static void early_init_cyrix(struct cpu
                set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
                break;
        }
-       clear_sched_clock_stable();
  }
  
  static void init_cyrix(struct cpuinfo_x86 *c)
index fe0a615a051b19a99f9388a7b245b326cee86e11,2388bafe5c37e2e809e5c7f318989debf35b45ca..063197771b8d7ba08f2eafe474cacb0efe9e79d3
@@@ -4,7 -4,6 +4,7 @@@
  #include <linux/bitops.h>
  #include <linux/smp.h>
  #include <linux/sched.h>
 +#include <linux/sched/clock.h>
  #include <linux/thread_info.h>
  #include <linux/init.h>
  #include <linux/uaccess.h>
@@@ -162,10 -161,6 +162,6 @@@ static void early_init_intel(struct cpu
        if (c->x86_power & (1 << 8)) {
                set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
                set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
-               if (check_tsc_unstable())
-                       clear_sched_clock_stable();
-       } else {
-               clear_sched_clock_stable();
        }
  
        /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
index 8457b49786686f74a737429ec47d04c2f92bab35,6a9ad73a2c5468d34136b4189b956c5864685287..d77d07ab310b4317d33e44de10896297bb2a6654
@@@ -1,6 -1,5 +1,6 @@@
  #include <linux/kernel.h>
  #include <linux/sched.h>
 +#include <linux/sched/clock.h>
  #include <linux/mm.h>
  #include <asm/cpufeature.h>
  #include <asm/msr.h>
@@@ -16,8 -15,6 +16,6 @@@ static void early_init_transmeta(struc
                if (xlvl >= 0x80860001)
                        c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001);
        }
-       clear_sched_clock_stable();
  }
  
  static void init_transmeta(struct cpuinfo_x86 *c)
diff --combined arch/x86/kernel/tsc.c
index 46bcda4cb1c2f84762bc409bf4c89b1528ba1c52,911129fda2f988f9141f9726616b7fce1166cf97..4f7a9833d8e51f2e023c3a5c0f6b54813c70c4a0
@@@ -2,7 -2,6 +2,7 @@@
  
  #include <linux/kernel.h>
  #include <linux/sched.h>
 +#include <linux/sched/clock.h>
  #include <linux/init.h>
  #include <linux/export.h>
  #include <linux/timer.h>
@@@ -327,9 -326,16 +327,16 @@@ unsigned long long sched_clock(void
  {
        return paravirt_sched_clock();
  }
+ static inline bool using_native_sched_clock(void)
+ {
+       return pv_time_ops.sched_clock == native_sched_clock;
+ }
  #else
  unsigned long long
  sched_clock(void) __attribute__((alias("native_sched_clock")));
+ static inline bool using_native_sched_clock(void) { return true; }
  #endif
  
  int check_tsc_unstable(void)
@@@ -1112,8 -1118,10 +1119,10 @@@ static void tsc_cs_mark_unstable(struc
  {
        if (tsc_unstable)
                return;
        tsc_unstable = 1;
-       clear_sched_clock_stable();
+       if (using_native_sched_clock())
+               clear_sched_clock_stable();
        disable_sched_clock_irqtime();
        pr_info("Marking TSC unstable due to clocksource watchdog\n");
  }
@@@ -1135,18 -1143,20 +1144,20 @@@ static struct clocksource clocksource_t
  
  void mark_tsc_unstable(char *reason)
  {
-       if (!tsc_unstable) {
-               tsc_unstable = 1;
+       if (tsc_unstable)
+               return;
+       tsc_unstable = 1;
+       if (using_native_sched_clock())
                clear_sched_clock_stable();
-               disable_sched_clock_irqtime();
-               pr_info("Marking TSC unstable due to %s\n", reason);
-               /* Change only the rating, when not registered */
-               if (clocksource_tsc.mult)
-                       clocksource_mark_unstable(&clocksource_tsc);
-               else {
-                       clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE;
-                       clocksource_tsc.rating = 0;
-               }
+       disable_sched_clock_irqtime();
+       pr_info("Marking TSC unstable due to %s\n", reason);
+       /* Change only the rating, when not registered */
+       if (clocksource_tsc.mult) {
+               clocksource_mark_unstable(&clocksource_tsc);
+       } else {
+               clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE;
+               clocksource_tsc.rating = 0;
        }
  }
  
diff --combined kernel/sched/core.c
index 956383844116ab456f8552abd4b5dcc00e09f347,6699d43a88430a27abeda42e65e6f7d93cddc359..3b31fc05a0f1e45be5985b860a5fde95ee969832
@@@ -6,15 -6,10 +6,15 @@@
   *  Copyright (C) 1991-2002  Linus Torvalds
   */
  #include <linux/sched.h>
 +#include <linux/sched/clock.h>
 +#include <uapi/linux/sched/types.h>
 +#include <linux/sched/loadavg.h>
 +#include <linux/sched/hotplug.h>
  #include <linux/cpuset.h>
  #include <linux/delayacct.h>
  #include <linux/init_task.h>
  #include <linux/context_tracking.h>
 +#include <linux/rcupdate_wait.h>
  
  #include <linux/blkdev.h>
  #include <linux/kprobes.h>
@@@ -986,7 -981,7 +986,7 @@@ static struct rq *__migrate_task(struc
                return rq;
  
        /* Affinity changed (again). */
 -      if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
 +      if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
                return rq;
  
        rq = move_queued_task(rq, p, dest_cpu);
@@@ -1264,10 -1259,10 +1264,10 @@@ static int migrate_swap_stop(void *data
        if (task_cpu(arg->src_task) != arg->src_cpu)
                goto unlock;
  
 -      if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
 +      if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
                goto unlock;
  
 -      if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
 +      if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
                goto unlock;
  
        __migrate_swap_task(arg->src_task, arg->dst_cpu);
@@@ -1308,10 -1303,10 +1308,10 @@@ int migrate_swap(struct task_struct *cu
        if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
                goto out;
  
 -      if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
 +      if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
                goto out;
  
 -      if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
 +      if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
                goto out;
  
        trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
@@@ -1495,14 -1490,14 +1495,14 @@@ static int select_fallback_rq(int cpu, 
                for_each_cpu(dest_cpu, nodemask) {
                        if (!cpu_active(dest_cpu))
                                continue;
 -                      if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
 +                      if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
                                return dest_cpu;
                }
        }
  
        for (;;) {
                /* Any allowed, online CPU? */
 -              for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
 +              for_each_cpu(dest_cpu, &p->cpus_allowed) {
                        if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
                                continue;
                        if (!cpu_online(dest_cpu))
@@@ -1554,10 -1549,10 +1554,10 @@@ int select_task_rq(struct task_struct *
  {
        lockdep_assert_held(&p->pi_lock);
  
 -      if (tsk_nr_cpus_allowed(p) > 1)
 +      if (p->nr_cpus_allowed > 1)
                cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
        else
 -              cpu = cpumask_any(tsk_cpus_allowed(p));
 +              cpu = cpumask_any(&p->cpus_allowed);
  
        /*
         * In order not to call set_task_cpu() on a blocking task we need
         * [ this allows ->select_task() to simply return task_cpu(p) and
         *   not worry about this generic constraint ]
         */
 -      if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
 +      if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
                     !cpu_online(cpu)))
                cpu = select_fallback_rq(task_cpu(p), p);
  
@@@ -3216,15 -3211,6 +3216,15 @@@ static inline void preempt_latency_star
  static inline void preempt_latency_stop(int val) { }
  #endif
  
 +static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
 +{
 +#ifdef CONFIG_DEBUG_PREEMPT
 +      return p->preempt_disable_ip;
 +#else
 +      return 0;
 +#endif
 +}
 +
  /*
   * Print scheduling while atomic bug:
   */
@@@ -3287,10 -3273,15 +3287,15 @@@ pick_next_task(struct rq *rq, struct ta
        struct task_struct *p;
  
        /*
-        * Optimization: we know that if all tasks are in
-        * the fair class we can call that function directly:
+        * Optimization: we know that if all tasks are in the fair class we can
+        * call that function directly, but only if the @prev task wasn't of a
+        * higher scheduling class, because otherwise those loose the
+        * opportunity to pull in more work from other CPUs.
         */
-       if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
+       if (likely((prev->sched_class == &idle_sched_class ||
+                   prev->sched_class == &fair_sched_class) &&
+                  rq->nr_running == rq->cfs.h_nr_running)) {
                p = fair_sched_class.pick_next_task(rq, prev, rf);
                if (unlikely(p == RETRY_TASK))
                        goto again;
@@@ -5247,9 -5238,6 +5252,9 @@@ void sched_show_task(struct task_struc
        int ppid;
        unsigned long state = p->state;
  
 +      /* Make sure the string lines up properly with the number of task states: */
 +      BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1);
 +
        if (!try_get_task_stack(p))
                return;
        if (state)
@@@ -5478,7 -5466,7 +5483,7 @@@ int migrate_task_to(struct task_struct 
        if (curr_cpu == target_cpu)
                return 0;
  
 -      if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
 +      if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
                return -EINVAL;
  
        /* TODO: This is not properly updating schedstats */
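
On the pick_next_task() hunk in kernel/sched/core.c above: the old fast path skipped the full scheduling-class walk whenever every runnable task was in CFS, but if the task being switched out was RT or DL, that walk (via put_prev_task()) is what gives those classes their chance to pull work from other runqueues. A throwaway model of the two checks, with kernel-like names but otherwise hypothetical:

#include <stdbool.h>
#include <stdio.h>

enum sched_class_id { SC_STOP, SC_DL, SC_RT, SC_FAIR, SC_IDLE };

/* Old condition: take the CFS fast path whenever all runnable tasks are CFS. */
static bool old_fast_path(enum sched_class_id prev, int nr_running, int cfs_h_nr_running)
{
        (void)prev; /* the old check never looked at prev's class */
        return nr_running == cfs_h_nr_running;
}

/* New condition: also require that prev was fair or idle, because an RT/DL
 * prev needs the full class walk (and its put_prev_task()) to pull work. */
static bool new_fast_path(enum sched_class_id prev, int nr_running, int cfs_h_nr_running)
{
        return (prev == SC_FAIR || prev == SC_IDLE) &&
               nr_running == cfs_h_nr_running;
}

int main(void)
{
        /* An RT task just blocked and only two CFS tasks remain runnable: */
        printf("old fast path taken: %d (RT/DL pull opportunity lost)\n",
               (int)old_fast_path(SC_RT, 2, 2));
        printf("new fast path taken: %d (slow path runs instead)\n",
               (int)new_fast_path(SC_RT, 2, 2));
        return 0;
}
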
diff --combined kernel/sched/fair.c
index 3e88b35ac1571cd2dc1719378902ca75c08cfbb9,b3ee10dd3e85e49dbaf8a24f506d3e391dcfb300..dea138964b9107b3e22542a8b80f5cf1d43c1dee
@@@ -20,9 -20,7 +20,9 @@@
   *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
   */
  
 -#include <linux/sched.h>
 +#include <linux/sched/mm.h>
 +#include <linux/sched/topology.h>
 +
  #include <linux/latencytop.h>
  #include <linux/cpumask.h>
  #include <linux/cpuidle.h>
@@@ -1553,7 -1551,7 +1553,7 @@@ static void task_numa_compare(struct ta
         */
        if (cur) {
                /* Skip this swap candidate if cannot move to the source cpu */
 -              if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
 +              if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
                        goto unlock;
  
                /*
@@@ -1663,7 -1661,7 +1663,7 @@@ static void task_numa_find_cpu(struct t
  
        for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
                /* Skip this CPU if the source task cannot migrate */
 -              if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
 +              if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
                        continue;
  
                env->dst_cpu = cpu;
@@@ -5460,7 -5458,7 +5460,7 @@@ find_idlest_group(struct sched_domain *
  
                /* Skip over this group if it has no CPUs allowed */
                if (!cpumask_intersects(sched_group_cpus(group),
 -                                      tsk_cpus_allowed(p)))
 +                                      &p->cpus_allowed))
                        continue;
  
                local_group = cpumask_test_cpu(this_cpu,
@@@ -5580,7 -5578,7 +5580,7 @@@ find_idlest_cpu(struct sched_group *gro
                return cpumask_first(sched_group_cpus(group));
  
        /* Traverse only the allowed CPUs */
 -      for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
 +      for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
                if (idle_cpu(i)) {
                        struct rq *rq = cpu_rq(i);
                        struct cpuidle_state *idle = idle_get_state(rq);
@@@ -5719,7 -5717,7 +5719,7 @@@ static int select_idle_core(struct task
        if (!test_idle_cores(target, false))
                return -1;
  
 -      cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
 +      cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
  
        for_each_cpu_wrap(core, cpus, target, wrap) {
                bool idle = true;
@@@ -5753,7 -5751,7 +5753,7 @@@ static int select_idle_smt(struct task_
                return -1;
  
        for_each_cpu(cpu, cpu_smt_mask(target)) {
 -              if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
 +              if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
                        continue;
                if (idle_cpu(cpu))
                        return cpu;
@@@ -5799,13 -5797,13 +5799,13 @@@ static int select_idle_cpu(struct task_
         * Due to large variance we need a large fuzz factor; hackbench in
         * particularly is sensitive here.
         */
-       if ((avg_idle / 512) < avg_cost)
+       if (sched_feat(SIS_AVG_CPU) && (avg_idle / 512) < avg_cost)
                return -1;
  
        time = local_clock();
  
        for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
 -              if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
 +              if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
                        continue;
                if (idle_cpu(cpu))
                        break;
@@@ -5960,7 -5958,7 +5960,7 @@@ select_task_rq_fair(struct task_struct 
        if (sd_flag & SD_BALANCE_WAKE) {
                record_wakee(p);
                want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
 -                            && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
 +                            && cpumask_test_cpu(cpu, &p->cpus_allowed);
        }
  
        rcu_read_lock();
@@@ -6700,7 -6698,7 +6700,7 @@@ int can_migrate_task(struct task_struc
        if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
                return 0;
  
 -      if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
 +      if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
                int cpu;
  
                schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
  
                /* Prevent to re-select dst_cpu via env's cpus */
                for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
 -                      if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
 +                      if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
                                env->flags |= LBF_DST_PINNED;
                                env->new_dst_cpu = cpu;
                                break;
@@@ -7254,7 -7252,7 +7254,7 @@@ check_cpu_capacity(struct rq *rq, struc
  
  /*
   * Group imbalance indicates (and tries to solve) the problem where balancing
 - * groups is inadequate due to tsk_cpus_allowed() constraints.
 + * groups is inadequate due to ->cpus_allowed constraints.
   *
   * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
   * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
@@@ -8213,7 -8211,8 +8213,7 @@@ more_balance
                         * if the curr task on busiest cpu can't be
                         * moved to this_cpu
                         */
 -                      if (!cpumask_test_cpu(this_cpu,
 -                                      tsk_cpus_allowed(busiest->curr))) {
 +                      if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
                                raw_spin_unlock_irqrestore(&busiest->lock,
                                                            flags);
                                env.flags |= LBF_ALL_PINNED;
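
On the select_idle_cpu() hunk above: the avg_idle/avg_cost cutoff that could make the idle-CPU scan bail out early is now gated behind sched_feat(SIS_AVG_CPU), so the scan runs unconditionally unless that feature is enabled. The SCHED_FEAT definition itself lives in kernel/sched/features.h, which this combined diff does not show; presumably it defaults the feature off, along these lines:

/* Assumed kernel/sched/features.h addition (not part of the hunks shown
 * here): keep the average-idle cutoff available, but off by default. */
SCHED_FEAT(SIS_AVG_CPU, false)

With CONFIG_SCHED_DEBUG, the two behaviours can then be compared at runtime by writing SIS_AVG_CPU or NO_SIS_AVG_CPU to the sched_features debugfs file.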