Merge branch 'timers-nohz-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...

author Linus Torvalds <torvalds@linux-foundation.org>

Tue, 1 Apr 2014 17:16:10 +0000 (10:16 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 1 Apr 2014 17:16:10 +0000 (10:16 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 1 Apr 2014 17:16:10 +0000 (10:16 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 1 Apr 2014 17:16:10 +0000 (10:16 -0700)
diff --combined drivers/s390/cio/cio.c

index 97c48b38d67d48641befbcd2c2d06804b114871f,5154513de112b99b1c4113bbb9e6fcb3ff6b5b94..d691e6a13aae934622376d60140409475aa39783
--- 1/drivers/s390/cio/cio.c
--- 2/drivers/s390/cio/cio.c
+++ b/drivers/s390/cio/cio.c
@@@ -28,7 -28,7 +28,7 @@@
   #include <asm/chpid.h>
   #include <asm/airq.h>
   #include <asm/isc.h>
- #include <asm/cputime.h>
+ #include <linux/cputime.h>
   #include <asm/fcx.h>
   #include <asm/nmi.h>
   #include <asm/crw.h>
@@@ -54,7 -54,7 +54,7 @@@ debug_info_t *cio_debug_crw_id
    */
   static int __init cio_debug_init(void)
   {
- -      cio_debug_msg_id = debug_register("cio_msg", 16, 1, 16 * sizeof(long));
+ +      cio_debug_msg_id = debug_register("cio_msg", 16, 1, 11 * sizeof(long));
         if (!cio_debug_msg_id)
                 goto out_unregister;
         debug_register_view(cio_debug_msg_id, &debug_sprintf_view);
@@@ -64,7 -64,7 +64,7 @@@
                 goto out_unregister;
         debug_register_view(cio_debug_trace_id, &debug_hex_ascii_view);
         debug_set_level(cio_debug_trace_id, 2);
- -      cio_debug_crw_id = debug_register("cio_crw", 16, 1, 16 * sizeof(long));
+ +      cio_debug_crw_id = debug_register("cio_crw", 8, 1, 8 * sizeof(long));
         if (!cio_debug_crw_id)
                 goto out_unregister;
         debug_register_view(cio_debug_crw_id, &debug_sprintf_view);
@@@ -342,9 -342,8 +342,9 @@@ static int cio_check_config(struct subc
    */
   int cio_commit_config(struct subchannel *sch)
   {
- -      struct schib schib;
         int ccode, retry, ret = 0;
+ +      struct schib schib;
+ +      struct irb irb;
   
         if (stsch_err(sch->schid, &schib) || !css_sch_is_valid(&schib))
                 return -ENODEV;
@@@ -368,10 -367,7 +368,10 @@@
                         ret = -EAGAIN;
                         break;
                 case 1: /* status pending */
- -                      return -EBUSY;
+ +                      ret = -EBUSY;
+ +                      if (tsch(sch->schid, &irb))
+ +                              return ret;
+ +                      break;
                 case 2: /* busy */
                         udelay(100); /* allow for recovery */
                         ret = -EBUSY;
@@@ -407,6 -403,7 +407,6 @@@ EXPORT_SYMBOL_GPL(cio_update_schib)
    */
   int cio_enable_subchannel(struct subchannel *sch, u32 intparm)
   {
- -      int retry;
         int ret;
   
         CIO_TRACE_EVENT(2, "ensch");
@@@ -421,14 -418,20 +421,14 @@@
         sch->config.isc = sch->isc;
         sch->config.intparm = intparm;
   
- -      for (retry = 0; retry < 3; retry++) {
+ +      ret = cio_commit_config(sch);
+ +      if (ret == -EIO) {
+ +              /*
+ +               * Got a program check in msch. Try without
+ +               * the concurrent sense bit the next time.
+ +               */
+ +              sch->config.csense = 0;
                 ret = cio_commit_config(sch);
- -              if (ret == -EIO) {
- -                      /*
- -                       * Got a program check in msch. Try without
- -                       * the concurrent sense bit the next time.
- -                       */
- -                      sch->config.csense = 0;
- -              } else if (ret == -EBUSY) {
- -                      struct irb irb;
- -                      if (tsch(sch->schid, &irb) != 0)
- -                              break;
- -              } else
- -                      break;
         }
         CIO_HEX_EVENT(2, &ret, sizeof(ret));
         return ret;
@@@ -441,6 -444,7 +441,6 @@@ EXPORT_SYMBOL_GPL(cio_enable_subchannel
    */
   int cio_disable_subchannel(struct subchannel *sch)
   {
- -      int retry;
         int ret;
   
         CIO_TRACE_EVENT(2, "dissch");
@@@ -452,8 -456,16 +452,8 @@@
                 return -ENODEV;
   
         sch->config.ena = 0;
+ +      ret = cio_commit_config(sch);
   
- -      for (retry = 0; retry < 3; retry++) {
- -              ret = cio_commit_config(sch);
- -              if (ret == -EBUSY) {
- -                      struct irb irb;
- -                      if (tsch(sch->schid, &irb) != 0)
- -                              break;
- -              } else
- -                      break;
- -      }
         CIO_HEX_EVENT(2, &ret, sizeof(ret));
         return ret;
   }
diff --combined include/linux/sched.h

index 825ed838d4b967d91242fcdc3f06e932d7582499,1ac566c48d3ddb4649f0a191485213a49df12de5..c399ed826648a33f1292f00ea5d004cb4b45ddff
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -3,8 -3,6 +3,8 @@@
   
   #include <uapi/linux/sched.h>
   
+ +#include <linux/sched/prio.h>
+ +
   
   struct sched_param {
         int sched_priority;
@@@ -29,7 -27,7 +29,7 @@@
   
   #include <asm/page.h>
   #include <asm/ptrace.h>
- #include <asm/cputime.h>
+ #include <linux/cputime.h>
   
   #include <linux/smp.h>
   #include <linux/sem.h>
@@@ -130,7 -128,6 +130,7 @@@ struct bio_list
   struct fs_struct;
   struct perf_event_context;
   struct blk_plug;
+ +struct filename;
   
   /*
    * List of flags we want to share for kernel threads,
@@@ -1079,7 -1076,6 +1079,7 @@@ struct sched_entity 
   #endif
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
+ +      int                     depth;
         struct sched_entity     *parent;
         /* rq on which this entity is (to be) queued: */
         struct cfs_rq           *cfs_rq;
@@@ -1463,9 -1459,6 +1463,9 @@@ struct task_struct 
         struct mutex perf_event_mutex;
         struct list_head perf_event_list;
   #endif
+ +#ifdef CONFIG_DEBUG_PREEMPT
+ +      unsigned long preempt_disable_ip;
+ +#endif
   #ifdef CONFIG_NUMA
         struct mempolicy *mempolicy;    /* Protected by alloc_lock */
         short il_next;
@@@ -1476,10 -1469,9 +1476,10 @@@
         unsigned int numa_scan_period;
         unsigned int numa_scan_period_max;
         int numa_preferred_nid;
- -      int numa_migrate_deferred;
         unsigned long numa_migrate_retry;
         u64 node_stamp;                 /* migration stamp  */
+ +      u64 last_task_numa_placement;
+ +      u64 last_sum_exec_runtime;
         struct callback_head numa_work;
   
         struct list_head numa_entry;
@@@ -1490,22 -1482,15 +1490,22 @@@
          * Scheduling placement decisions are made based on the these counts.
          * The values remain static for the duration of a PTE scan
          */
- -      unsigned long *numa_faults;
+ +      unsigned long *numa_faults_memory;
         unsigned long total_numa_faults;
   
         /*
          * numa_faults_buffer records faults per node during the current
- -       * scan window. When the scan completes, the counts in numa_faults
- -       * decay and these values are copied.
+ +       * scan window. When the scan completes, the counts in
+ +       * numa_faults_memory decay and these values are copied.
+ +       */
+ +      unsigned long *numa_faults_buffer_memory;
+ +
+ +      /*
+ +       * Track the nodes the process was running on when a NUMA hinting
+ +       * fault was incurred.
          */
- -      unsigned long *numa_faults_buffer;
+ +      unsigned long *numa_faults_cpu;
+ +      unsigned long *numa_faults_buffer_cpu;
   
         /*
          * numa_faults_locality tracks if faults recorded during the last
@@@ -1610,8 -1595,8 +1610,8 @@@ extern void task_numa_fault(int last_no
   extern pid_t task_numa_group_id(struct task_struct *p);
   extern void set_numabalancing_state(bool enabled);
   extern void task_numa_free(struct task_struct *p);
- -
- -extern unsigned int sysctl_numa_balancing_migrate_deferred;
+ +extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
+ +                                      int src_nid, int dst_cpu);
   #else
   static inline void task_numa_fault(int last_node, int node, int pages,
                                    int flags)
@@@ -1627,11 -1612,6 +1627,11 @@@ static inline void set_numabalancing_st
   static inline void task_numa_free(struct task_struct *p)
   {
   }
+ +static inline bool should_numa_migrate_memory(struct task_struct *p,
+ +                              struct page *page, int src_nid, int dst_cpu)
+ +{
+ +      return true;
+ +}
   #endif
   
   static inline struct pid *task_pid(struct task_struct *task)
@@@ -2099,16 -2079,7 +2099,16 @@@ static inline void sched_autogroup_exit
   extern bool yield_to(struct task_struct *p, bool preempt);
   extern void set_user_nice(struct task_struct *p, long nice);
   extern int task_prio(const struct task_struct *p);
- -extern int task_nice(const struct task_struct *p);
+ +/**
+ + * task_nice - return the nice value of a given task.
+ + * @p: the task in question.
+ + *
+ + * Return: The nice value [ -20 ... 0 ... 19 ].
+ + */
+ +static inline int task_nice(const struct task_struct *p)
+ +{
+ +      return PRIO_TO_NICE((p)->static_prio);
+ +}
   extern int can_nice(const struct task_struct *p, const int nice);
   extern int task_curr(const struct task_struct *p);
   extern int idle_cpu(int cpu);
@@@ -2340,7 -2311,7 +2340,7 @@@ extern void do_group_exit(int)
   extern int allow_signal(int);
   extern int disallow_signal(int);
   
- -extern int do_execve(const char *,
+ +extern int do_execve(struct filename *,
                      const char __user * const __user *,
                      const char __user * const __user *);
   extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
diff --combined kernel/sched/core.c

index a47902c687ae74e2261fb44e0cec07d3ef8180f6,b14a188af898def4e2ba0f00d65c25458f96da6d..d11a1768357d6f7ca158c93d15690d734e75386c
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -823,19 -823,13 +823,13 @@@ static void update_rq_clock_task(struc
   #endif
   #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
         if (static_key_false((&paravirt_steal_rq_enabled))) {
-               u64 st;
- 
                 steal = paravirt_steal_clock(cpu_of(rq));
                 steal -= rq->prev_steal_time_rq;
   
                 if (unlikely(steal > delta))
                         steal = delta;
   
-               st = steal_ticks(steal);
-               steal = st * TICK_NSEC;
- 
                 rq->prev_steal_time_rq += steal;
- 
                 delta -= steal;
         }
   #endif
@@@ -1745,10 -1739,8 +1739,10 @@@ static void __sched_fork(unsigned long 
         p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
         p->numa_scan_period = sysctl_numa_balancing_scan_delay;
         p->numa_work.next = &p->numa_work;
- -      p->numa_faults = NULL;
- -      p->numa_faults_buffer = NULL;
+ +      p->numa_faults_memory = NULL;
+ +      p->numa_faults_buffer_memory = NULL;
+ +      p->last_task_numa_placement = 0;
+ +      p->last_sum_exec_runtime = 0;
   
         INIT_LIST_HEAD(&p->numa_entry);
         p->numa_group = NULL;
@@@ -1954,7 -1946,7 +1948,7 @@@ static int dl_overflow(struct task_stru
   {
   
         struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
- -      u64 period = attr->sched_period;
+ +      u64 period = attr->sched_period ?: attr->sched_deadline;
         u64 runtime = attr->sched_runtime;
         u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
         int cpus, err = -1;
@@@ -2151,6 -2143,8 +2145,6 @@@ static void finish_task_switch(struct r
         if (mm)
                 mmdrop(mm);
         if (unlikely(prev_state == TASK_DEAD)) {
- -              task_numa_free(prev);
- -
                 if (prev->sched_class->task_dead)
                         prev->sched_class->task_dead(prev);
   
@@@ -2167,6 -2161,13 +2161,6 @@@
   
   #ifdef CONFIG_SMP
   
- -/* assumes rq->lock is held */
- -static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
- -{
- -      if (prev->sched_class->pre_schedule)
- -              prev->sched_class->pre_schedule(rq, prev);
- -}
- -
   /* rq->lock is NOT held, but preemption is disabled */
   static inline void post_schedule(struct rq *rq)
   {
@@@ -2184,6 -2185,10 +2178,6 @@@
   
   #else
   
- -static inline void pre_schedule(struct rq *rq, struct task_struct *p)
- -{
- -}
- -
   static inline void post_schedule(struct rq *rq)
   {
   }
@@@ -2499,13 -2504,8 +2493,13 @@@ void __kprobes preempt_count_add(int va
         DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
                                 PREEMPT_MASK - 10);
   #endif
- -      if (preempt_count() == val)
- -              trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+ +      if (preempt_count() == val) {
+ +              unsigned long ip = get_parent_ip(CALLER_ADDR1);
+ +#ifdef CONFIG_DEBUG_PREEMPT
+ +              current->preempt_disable_ip = ip;
+ +#endif
+ +              trace_preempt_off(CALLER_ADDR0, ip);
+ +      }
   }
   EXPORT_SYMBOL(preempt_count_add);
   
@@@ -2548,13 -2548,6 +2542,13 @@@ static noinline void __schedule_bug(str
         print_modules();
         if (irqs_disabled())
                 print_irqtrace_events(prev);
+ +#ifdef CONFIG_DEBUG_PREEMPT
+ +      if (in_atomic_preempt_off()) {
+ +              pr_err("Preemption disabled at:");
+ +              print_ip_sym(current->preempt_disable_ip);
+ +              pr_cont("\n");
+ +      }
+ +#endif
         dump_stack();
         add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
   }
@@@ -2578,34 -2571,36 +2572,34 @@@ static inline void schedule_debug(struc
         schedstat_inc(this_rq(), sched_count);
   }
   
- -static void put_prev_task(struct rq *rq, struct task_struct *prev)
- -{
- -      if (prev->on_rq || rq->skip_clock_update < 0)
- -              update_rq_clock(rq);
- -      prev->sched_class->put_prev_task(rq, prev);
- -}
- -
   /*
    * Pick up the highest-prio task:
    */
   static inline struct task_struct *
- -pick_next_task(struct rq *rq)
+ +pick_next_task(struct rq *rq, struct task_struct *prev)
   {
- -      const struct sched_class *class;
+ +      const struct sched_class *class = &fair_sched_class;
         struct task_struct *p;
   
         /*
          * Optimization: we know that if all tasks are in
          * the fair class we can call that function directly:
          */
- -      if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
- -              p = fair_sched_class.pick_next_task(rq);
- -              if (likely(p))
+ +      if (likely(prev->sched_class == class &&
+ +                 rq->nr_running == rq->cfs.h_nr_running)) {
+ +              p = fair_sched_class.pick_next_task(rq, prev);
+ +              if (likely(p && p != RETRY_TASK))
                         return p;
         }
   
+ +again:
         for_each_class(class) {
- -              p = class->pick_next_task(rq);
- -              if (p)
+ +              p = class->pick_next_task(rq, prev);
+ +              if (p) {
+ +                      if (unlikely(p == RETRY_TASK))
+ +                              goto again;
                         return p;
+ +              }
         }
   
         BUG(); /* the idle class will always have a runnable task */
@@@ -2699,10 -2694,13 +2693,10 @@@ need_resched
                 switch_count = &prev->nvcsw;
         }
   
- -      pre_schedule(rq, prev);
- -
- -      if (unlikely(!rq->nr_running))
- -              idle_balance(cpu, rq);
+ +      if (prev->on_rq || rq->skip_clock_update < 0)
+ +              update_rq_clock(rq);
   
- -      put_prev_task(rq, prev);
- -      next = pick_next_task(rq);
+ +      next = pick_next_task(rq, prev);
         clear_tsk_need_resched(prev);
         clear_preempt_need_resched();
         rq->skip_clock_update = 0;
@@@ -2904,8 -2902,7 +2898,8 @@@ EXPORT_SYMBOL(sleep_on_timeout)
    * This function changes the 'effective' priority of a task. It does
    * not touch ->normal_prio like __setscheduler().
    *
- - * Used by the rt_mutex code to implement priority inheritance logic.
+ + * Used by the rt_mutex code to implement priority inheritance
+ + * logic. Call site only calls if the priority of the task changed.
    */
   void rt_mutex_setprio(struct task_struct *p, int prio)
   {
@@@ -2995,7 -2992,7 +2989,7 @@@ void set_user_nice(struct task_struct *
         unsigned long flags;
         struct rq *rq;
   
- -      if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
+ +      if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
                 return;
         /*
          * We have to be careful, if called from sys_setpriority(),
@@@ -3073,11 -3070,11 +3067,11 @@@ SYSCALL_DEFINE1(nice, int, increment
         if (increment > 40)
                 increment = 40;
   
- -      nice = TASK_NICE(current) + increment;
- -      if (nice < -20)
- -              nice = -20;
- -      if (nice > 19)
- -              nice = 19;
+ +      nice = task_nice(current) + increment;
+ +      if (nice < MIN_NICE)
+ +              nice = MIN_NICE;
+ +      if (nice > MAX_NICE)
+ +              nice = MAX_NICE;
   
         if (increment < 0 && !can_nice(current, nice))
                 return -EPERM;
@@@ -3105,6 -3102,18 +3099,6 @@@ int task_prio(const struct task_struct 
         return p->prio - MAX_RT_PRIO;
   }
   
- -/**
- - * task_nice - return the nice value of a given task.
- - * @p: the task in question.
- - *
- - * Return: The nice value [ -20 ... 0 ... 19 ].
- - */
- -int task_nice(const struct task_struct *p)
- -{
- -      return TASK_NICE(p);
- -}
- -EXPORT_SYMBOL(task_nice);
- -
   /**
    * idle_cpu - is a given cpu idle currently?
    * @cpu: the processor in question.
@@@ -3174,8 -3183,9 +3168,8 @@@ __setparam_dl(struct task_struct *p, co
         dl_se->dl_new = 1;
   }
   
- -/* Actually do priority change: must hold pi & rq lock. */
- -static void __setscheduler(struct rq *rq, struct task_struct *p,
- -                         const struct sched_attr *attr)
+ +static void __setscheduler_params(struct task_struct *p,
+ +              const struct sched_attr *attr)
   {
         int policy = attr->sched_policy;
   
@@@ -3195,21 -3205,9 +3189,21 @@@
          * getparam()/getattr() don't report silly values for !rt tasks.
          */
         p->rt_priority = attr->sched_priority;
- -
         p->normal_prio = normal_prio(p);
- -      p->prio = rt_mutex_getprio(p);
+ +      set_load_weight(p);
+ +}
+ +
+ +/* Actually do priority change: must hold pi & rq lock. */
+ +static void __setscheduler(struct rq *rq, struct task_struct *p,
+ +                         const struct sched_attr *attr)
+ +{
+ +      __setscheduler_params(p, attr);
+ +
+ +      /*
+ +       * If we get here, there was no pi waiters boosting the
+ +       * task. It is safe to use the normal prio.
+ +       */
+ +      p->prio = normal_prio(p);
   
         if (dl_prio(p->prio))
                 p->sched_class = &dl_sched_class;
@@@ -3217,6 -3215,8 +3211,6 @@@
                 p->sched_class = &rt_sched_class;
         else
                 p->sched_class = &fair_sched_class;
- -
- -      set_load_weight(p);
   }
   
   static void
@@@ -3269,8 -3269,6 +3263,8 @@@ static int __sched_setscheduler(struct 
                                 const struct sched_attr *attr,
                                 bool user)
   {
+ +      int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
+ +                    MAX_RT_PRIO - 1 - attr->sched_priority;
         int retval, oldprio, oldpolicy = -1, on_rq, running;
         int policy = attr->sched_policy;
         unsigned long flags;
@@@ -3315,7 -3313,7 +3309,7 @@@ recheck
          */
         if (user && !capable(CAP_SYS_NICE)) {
                 if (fair_policy(policy)) {
- -                      if (attr->sched_nice < TASK_NICE(p) &&
+ +                      if (attr->sched_nice < task_nice(p) &&
                             !can_nice(p, attr->sched_nice))
                                 return -EPERM;
                 }
@@@ -3334,21 -3332,12 +3328,21 @@@
                                 return -EPERM;
                 }
   
+ +               /*
+ +                * Can't set/change SCHED_DEADLINE policy at all for now
+ +                * (safest behavior); in the future we would like to allow
+ +                * unprivileged DL tasks to increase their relative deadline
+ +                * or reduce their runtime (both ways reducing utilization)
+ +                */
+ +              if (dl_policy(policy))
+ +                      return -EPERM;
+ +
                 /*
                  * Treat SCHED_IDLE as nice 20. Only allow a switch to
                  * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
                  */
                 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
- -                      if (!can_nice(p, TASK_NICE(p)))
+ +                      if (!can_nice(p, task_nice(p)))
                                 return -EPERM;
                 }
   
@@@ -3385,18 -3374,16 +3379,18 @@@
         }
   
         /*
- -       * If not changing anything there's no need to proceed further:
+ +       * If not changing anything there's no need to proceed further,
+ +       * but store a possible modification of reset_on_fork.
          */
         if (unlikely(policy == p->policy)) {
- -              if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p))
+ +              if (fair_policy(policy) && attr->sched_nice != task_nice(p))
                         goto change;
                 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
                         goto change;
                 if (dl_policy(policy))
                         goto change;
   
+ +              p->sched_reset_on_fork = reset_on_fork;
                 task_rq_unlock(rq, p, &flags);
                 return 0;
         }
@@@ -3450,24 -3437,6 +3444,24 @@@ change
                 return -EBUSY;
         }
   
+ +      p->sched_reset_on_fork = reset_on_fork;
+ +      oldprio = p->prio;
+ +
+ +      /*
+ +       * Special case for priority boosted tasks.
+ +       *
+ +       * If the new priority is lower or equal (user space view)
+ +       * than the current (boosted) priority, we just store the new
+ +       * normal parameters and do not touch the scheduler class and
+ +       * the runqueue. This will be done when the task deboost
+ +       * itself.
+ +       */
+ +      if (rt_mutex_check_prio(p, newprio)) {
+ +              __setscheduler_params(p, attr);
+ +              task_rq_unlock(rq, p, &flags);
+ +              return 0;
+ +      }
+ +
         on_rq = p->on_rq;
         running = task_current(rq, p);
         if (on_rq)
@@@ -3475,18 -3444,16 +3469,18 @@@
         if (running)
                 p->sched_class->put_prev_task(rq, p);
   
- -      p->sched_reset_on_fork = reset_on_fork;
- -
- -      oldprio = p->prio;
         prev_class = p->sched_class;
         __setscheduler(rq, p, attr);
   
         if (running)
                 p->sched_class->set_curr_task(rq);
- -      if (on_rq)
- -              enqueue_task(rq, p, 0);
+ +      if (on_rq) {
+ +              /*
+ +               * We enqueue to tail when the priority of a task is
+ +               * increased (user space view).
+ +               */
+ +              enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+ +      }
   
         check_class_changed(rq, p, prev_class, oldprio);
         task_rq_unlock(rq, p, &flags);
@@@ -3642,7 -3609,7 +3636,7 @@@ static int sched_copy_attr(struct sched
          * XXX: do we want to be lenient like existing syscalls; or do we want
          * to be strict and return an error on out-of-bounds values?
          */
- -      attr->sched_nice = clamp(attr->sched_nice, -20, 19);
+ +      attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
   
   out:
         return ret;
@@@ -3688,14 -3655,13 +3682,14 @@@ SYSCALL_DEFINE2(sched_setparam, pid_t, 
    * @pid: the pid in question.
    * @uattr: structure containing the extended parameters.
    */
- -SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr)
+ +SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
+ +                             unsigned int, flags)
   {
         struct sched_attr attr;
         struct task_struct *p;
         int retval;
   
- -      if (!uattr || pid < 0)
+ +      if (!uattr || pid < 0 || flags)
                 return -EINVAL;
   
         if (sched_copy_attr(uattr, &attr))
@@@ -3814,7 -3780,7 +3808,7 @@@ static int sched_read_attr(struct sched
                 attr->size = usize;
         }
   
- -      ret = copy_to_user(uattr, attr, usize);
+ +      ret = copy_to_user(uattr, attr, attr->size);
         if (ret)
                 return -EFAULT;
   
@@@ -3832,8 -3798,8 +3826,8 @@@ err_size
    * @uattr: structure containing the extended parameters.
    * @size: sizeof(attr) for fwd/bwd comp.
    */
- -SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
- -              unsigned int, size)
+ +SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+ +              unsigned int, size, unsigned int, flags)
   {
         struct sched_attr attr = {
                 .size = sizeof(struct sched_attr),
@@@ -3842,7 -3808,7 +3836,7 @@@
         int retval;
   
         if (!uattr || pid < 0 || size > PAGE_SIZE ||
- -          size < SCHED_ATTR_SIZE_VER0)
+ +          size < SCHED_ATTR_SIZE_VER0 || flags)
                 return -EINVAL;
   
         rcu_read_lock();
@@@ -3863,7 -3829,7 +3857,7 @@@
         else if (task_has_rt_policy(p))
                 attr.sched_priority = p->rt_priority;
         else
- -              attr.sched_nice = TASK_NICE(p);
+ +              attr.sched_nice = task_nice(p);
   
         rcu_read_unlock();
   
@@@ -4501,7 -4467,6 +4495,7 @@@ void init_idle(struct task_struct *idle
         rcu_read_unlock();
   
         rq->curr = rq->idle = idle;
+ +      idle->on_rq = 1;
   #if defined(CONFIG_SMP)
         idle->on_cpu = 1;
   #endif
@@@ -4721,10 -4686,8 +4715,10 @@@ void idle_task_exit(void
   
         BUG_ON(cpu_online(smp_processor_id()));
   
- -      if (mm != &init_mm)
+ +      if (mm != &init_mm) {
                 switch_mm(mm, &init_mm, current);
+ +              finish_arch_post_lock_switch();
+ +      }
         mmdrop(mm);
   }
   
@@@ -4742,22 -4705,6 +4736,22 @@@ static void calc_load_migrate(struct r
                 atomic_long_add(delta, &calc_load_tasks);
   }
   
+ +static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+ +{
+ +}
+ +
+ +static const struct sched_class fake_sched_class = {
+ +      .put_prev_task = put_prev_task_fake,
+ +};
+ +
+ +static struct task_struct fake_task = {
+ +      /*
+ +       * Avoid pull_{rt,dl}_task()
+ +       */
+ +      .prio = MAX_PRIO + 1,
+ +      .sched_class = &fake_sched_class,
+ +};
+ +
   /*
    * Migrate all tasks from the rq, sleeping tasks will be migrated by
    * try_to_wake_up()->select_task_rq().
@@@ -4798,7 -4745,7 +4792,7 @@@ static void migrate_tasks(unsigned int 
                 if (rq->nr_running == 1)
                         break;
   
- -              next = pick_next_task(rq);
+ +              next = pick_next_task(rq, &fake_task);
                 BUG_ON(!next);
                 next->sched_class->put_prev_task(rq, next);
   
@@@ -4888,7 -4835,7 +4882,7 @@@ set_table_entry(struct ctl_table *entry
   static struct ctl_table *
   sd_alloc_ctl_domain_table(struct sched_domain *sd)
   {
- -      struct ctl_table *table = sd_alloc_ctl_entry(13);
+ +      struct ctl_table *table = sd_alloc_ctl_entry(14);
   
         if (table == NULL)
                 return NULL;
@@@ -4916,12 -4863,9 +4910,12 @@@
                 sizeof(int), 0644, proc_dointvec_minmax, false);
         set_table_entry(&table[10], "flags", &sd->flags,
                 sizeof(int), 0644, proc_dointvec_minmax, false);
- -      set_table_entry(&table[11], "name", sd->name,
+ +      set_table_entry(&table[11], "max_newidle_lb_cost",
+ +              &sd->max_newidle_lb_cost,
+ +              sizeof(long), 0644, proc_doulongvec_minmax, false);
+ +      set_table_entry(&table[12], "name", sd->name,
                 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
- -      /* &table[12] is terminator */
+ +      /* &table[13] is terminator */
   
         return table;
   }
@@@ -6898,6 -6842,7 +6892,6 @@@ void __init sched_init(void
   
                 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
   #ifdef CONFIG_RT_GROUP_SCHED
- -              INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
                 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
   #endif
   
@@@ -6986,8 -6931,7 +6980,8 @@@ void __might_sleep(const char *file, in
         static unsigned long prev_jiffy;        /* ratelimiting */
   
         rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
- -      if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
+ +      if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
+ +           !is_idle_task(current)) ||
             system_state != SYSTEM_RUNNING || oops_in_progress)
                 return;
         if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
@@@ -7005,13 -6949,6 +6999,13 @@@
         debug_show_held_locks(current);
         if (irqs_disabled())
                 print_irqtrace_events(current);
+ +#ifdef CONFIG_DEBUG_PREEMPT
+ +      if (!preempt_count_equals(preempt_offset)) {
+ +              pr_err("Preemption disabled at:");
+ +              print_ip_sym(current->preempt_disable_ip);
+ +              pr_cont("\n");
+ +      }
+ +#endif
         dump_stack();
   }
   EXPORT_SYMBOL(__might_sleep);
@@@ -7065,7 -7002,7 +7059,7 @@@ void normalize_rt_tasks(void
                          * Renice negative nice level userspace
                          * tasks back to 0:
                          */
- -                      if (TASK_NICE(p) < 0 && p->mm)
+ +                      if (task_nice(p) < 0 && p->mm)
                                 set_user_nice(p, 0);
                         continue;
                 }
@@@ -7479,7 -7416,6 +7473,7 @@@ static int sched_dl_global_constraints(
         u64 period = global_rt_period();
         u64 new_bw = to_ratio(period, runtime);
         int cpu, ret = 0;
+ +      unsigned long flags;
   
         /*
          * Here we want to check the bandwidth not being set to some
@@@ -7493,10 -7429,10 +7487,10 @@@
         for_each_possible_cpu(cpu) {
                 struct dl_bw *dl_b = dl_bw_of(cpu);
   
- -              raw_spin_lock(&dl_b->lock);
+ +              raw_spin_lock_irqsave(&dl_b->lock, flags);
                 if (new_bw < dl_b->total_bw)
                         ret = -EBUSY;
- -              raw_spin_unlock(&dl_b->lock);
+ +              raw_spin_unlock_irqrestore(&dl_b->lock, flags);
   
                 if (ret)
                         break;
@@@ -7509,7 -7445,6 +7503,7 @@@ static void sched_dl_do_global(void
   {
         u64 new_bw = -1;
         int cpu;
+ +      unsigned long flags;
   
         def_dl_bandwidth.dl_period = global_rt_period();
         def_dl_bandwidth.dl_runtime = global_rt_runtime();
@@@ -7523,9 -7458,9 +7517,9 @@@
         for_each_possible_cpu(cpu) {
                 struct dl_bw *dl_b = dl_bw_of(cpu);
   
- -              raw_spin_lock(&dl_b->lock);
+ +              raw_spin_lock_irqsave(&dl_b->lock, flags);
                 dl_b->bw = new_bw;
- -              raw_spin_unlock(&dl_b->lock);
+ +              raw_spin_unlock_irqrestore(&dl_b->lock, flags);
         }
   }
   
@@@ -7534,8 -7469,7 +7528,8 @@@ static int sched_rt_global_validate(voi
         if (sysctl_sched_rt_period <= 0)
                 return -EINVAL;
   
- -      if (sysctl_sched_rt_runtime > sysctl_sched_rt_period)
+ +      if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
+ +              (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
                 return -EINVAL;
   
         return 0;
diff --combined kernel/sched/cputime.c

index 58624a65f124b673911be1f9861d9cfeeefadbee,c91b09770ebd089b6cc7432ad5e0af637b212021..a95097cb4591b5bfa2466adb5600895e782fc661
--- 1/kernel/sched/cputime.c
--- 2/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@@ -142,7 -142,7 +142,7 @@@ void account_user_time(struct task_stru
         p->utimescaled += cputime_scaled;
         account_group_user_time(p, cputime);
   
- -      index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+ +      index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
   
         /* Add user time to cpustat. */
         task_group_account_field(p, index, (__force u64) cputime);
@@@ -169,7 -169,7 +169,7 @@@ static void account_guest_time(struct t
         p->gtime += cputime;
   
         /* Add guest time to cpustat. */
- -      if (TASK_NICE(p) > 0) {
+ +      if (task_nice(p) > 0) {
                 cpustat[CPUTIME_NICE] += (__force u64) cputime;
                 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
         } else {
@@@ -258,16 -258,22 +258,22 @@@ static __always_inline bool steal_accou
   {
   #ifdef CONFIG_PARAVIRT
         if (static_key_false(&paravirt_steal_enabled)) {
-               u64 steal, st = 0;
+               u64 steal;
+               cputime_t steal_ct;
   
                 steal = paravirt_steal_clock(smp_processor_id());
                 steal -= this_rq()->prev_steal_time;
   
-               st = steal_ticks(steal);
-               this_rq()->prev_steal_time += st * TICK_NSEC;
+               /*
+                * cputime_t may be less precise than nsecs (eg: if it's
+                * based on jiffies). Lets cast the result to cputime
+                * granularity and account the rest on the next rounds.
+                */
+               steal_ct = nsecs_to_cputime(steal);
+               this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
   
-               account_steal_time(st);
-               return st;
+               account_steal_time(steal_ct);
+               return steal_ct;
         }
   #endif
         return false;
diff --combined kernel/sched/sched.h

index f2de7a17562053b30f9e72b298ae46752a0d5d72,5ec99101012272eafbcc55a63d6a845e5ca63564..c9007f28d3a222ca97b5fb98210b2ecc1e756b7a
--- 1/kernel/sched/sched.h
--- 2/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@@ -23,6 -23,24 +23,6 @@@ extern atomic_long_t calc_load_tasks
   extern long calc_load_fold_active(struct rq *this_rq);
   extern void update_cpu_load_active(struct rq *this_rq);
   
- -/*
- - * Convert user-nice values [ -20 ... 0 ... 19 ]
- - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
- - * and back.
- - */
- -#define NICE_TO_PRIO(nice)    (MAX_RT_PRIO + (nice) + 20)
- -#define PRIO_TO_NICE(prio)    ((prio) - MAX_RT_PRIO - 20)
- -#define TASK_NICE(p)          PRIO_TO_NICE((p)->static_prio)
- -
- -/*
- - * 'User priority' is the nice value converted to something we
- - * can work with better when scaling various scheduler parameters,
- - * it's a [ 0 ... 39 ] range.
- - */
- -#define USER_PRIO(p)          ((p)-MAX_RT_PRIO)
- -#define TASK_USER_PRIO(p)     USER_PRIO((p)->static_prio)
- -#define MAX_USER_PRIO         (USER_PRIO(MAX_PRIO))
- -
   /*
    * Helpers for converting nanosecond timing to jiffy resolution
    */
@@@ -423,18 -441,6 +423,18 @@@ struct rt_rq 
   #endif
   };
   
+ +#ifdef CONFIG_RT_GROUP_SCHED
+ +static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+ +{
+ +      return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
+ +}
+ +#else
+ +static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+ +{
+ +      return rt_rq->rt_throttled;
+ +}
+ +#endif
+ +
   /* Deadline class' related fields in a runqueue */
   struct dl_rq {
         /* runqueue is an rbtree, ordered by deadline */
@@@ -456,6 -462,7 +456,6 @@@
         } earliest_dl;
   
         unsigned long dl_nr_migratory;
- -      unsigned long dl_nr_total;
         int overloaded;
   
         /*
@@@ -552,9 -559,11 +552,9 @@@ struct rq 
   #ifdef CONFIG_FAIR_GROUP_SCHED
         /* list of leaf cfs_rq on this cpu: */
         struct list_head leaf_cfs_rq_list;
- -#endif /* CONFIG_FAIR_GROUP_SCHED */
   
- -#ifdef CONFIG_RT_GROUP_SCHED
- -      struct list_head leaf_rt_rq_list;
- -#endif
+ +      struct sched_avg avg;
+ +#endif /* CONFIG_FAIR_GROUP_SCHED */
   
         /*
          * This is part of a global counter where only the total sum
@@@ -643,6 -652,8 +643,6 @@@
   #ifdef CONFIG_SMP
         struct llist_head wake_list;
   #endif
- -
- -      struct sched_avg avg;
   };
   
   static inline int cpu_of(struct rq *rq)
@@@ -1102,8 -1113,6 +1102,8 @@@ static const u32 prio_to_wmult[40] = 
   
   #define DEQUEUE_SLEEP         1
   
+ +#define RETRY_TASK            ((void *)-1UL)
+ +
   struct sched_class {
         const struct sched_class *next;
   
@@@ -1114,22 -1123,14 +1114,22 @@@
   
         void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
   
- -      struct task_struct * (*pick_next_task) (struct rq *rq);
+ +      /*
+ +       * It is the responsibility of the pick_next_task() method that will
+ +       * return the next task to call put_prev_task() on the @prev task or
+ +       * something equivalent.
+ +       *
+ +       * May return RETRY_TASK when it finds a higher prio class has runnable
+ +       * tasks.
+ +       */
+ +      struct task_struct * (*pick_next_task) (struct rq *rq,
+ +                                              struct task_struct *prev);
         void (*put_prev_task) (struct rq *rq, struct task_struct *p);
   
   #ifdef CONFIG_SMP
         int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
         void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
   
- -      void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
         void (*post_schedule) (struct rq *this_rq);
         void (*task_waking) (struct task_struct *task);
         void (*task_woken) (struct rq *this_rq, struct task_struct *task);
@@@ -1159,11 -1160,6 +1159,11 @@@
   #endif
   };
   
+ +static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
+ +{
+ +      prev->sched_class->put_prev_task(rq, prev);
+ +}
+ +
   #define sched_class_highest (&stop_sched_class)
   #define for_each_class(class) \
      for (class = sched_class_highest; class; class = class->next)
@@@ -1180,14 -1176,16 +1180,14 @@@ extern const struct sched_class idle_sc
   extern void update_group_power(struct sched_domain *sd, int cpu);
   
   extern void trigger_load_balance(struct rq *rq);
- -extern void idle_balance(int this_cpu, struct rq *this_rq);
   
   extern void idle_enter_fair(struct rq *this_rq);
   extern void idle_exit_fair(struct rq *this_rq);
   
- -#else /* CONFIG_SMP */
+ +#else
   
- -static inline void idle_balance(int cpu, struct rq *rq)
- -{
- -}
+ +static inline void idle_enter_fair(struct rq *rq) { }
+ +static inline void idle_exit_fair(struct rq *rq) { }
   
   #endif
   
@@@ -1216,16 -1214,6 +1216,6 @@@ extern void update_idle_cpu_load(struc
   
   extern void init_task_runnable_average(struct task_struct *p);
   
- #ifdef CONFIG_PARAVIRT
- static inline u64 steal_ticks(u64 steal)
- {
-       if (unlikely(steal > NSEC_PER_SEC))
-               return div_u64(steal, TICK_NSEC);
- 
-       return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
- }
- #endif
- 
   static inline void inc_nr_running(struct rq *rq)
   {
         rq->nr_running++;
author	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 1 Apr 2014 17:16:10 +0000 (10:16 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 1 Apr 2014 17:16:10 +0000 (10:16 -0700)
		1	2
drivers/s390/cio/cio.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/cputime.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history