Merge branch 'sched/urgent' into sched/core to pick up fixes

author Ingo Molnar <mingo@kernel.org>

Thu, 12 May 2016 07:18:13 +0000 (09:18 +0200)

committer Ingo Molnar <mingo@kernel.org>

Thu, 12 May 2016 07:18:13 +0000 (09:18 +0200)
author Ingo Molnar <mingo@kernel.org>
Thu, 12 May 2016 07:18:13 +0000 (09:18 +0200)
committer Ingo Molnar <mingo@kernel.org>
Thu, 12 May 2016 07:18:13 +0000 (09:18 +0200)
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt

index f52f297cb40627a7d5855f04399977f304272e51..9857606dd7b7118c23885d039919e4265ea55775 100644 (file)
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -1562,12 +1562,12 @@ Doing the same with chrt -r 5 and function-trace set.
    <idle>-0       3dN.1   12us : menu_hrtimer_cancel <-tick_nohz_idle_exit
    <idle>-0       3dN.1   12us : ktime_get <-tick_nohz_idle_exit
    <idle>-0       3dN.1   12us : tick_do_update_jiffies64 <-tick_nohz_idle_exit
-  <idle>-0       3dN.1   13us : update_cpu_load_nohz <-tick_nohz_idle_exit
-  <idle>-0       3dN.1   13us : _raw_spin_lock <-update_cpu_load_nohz
+  <idle>-0       3dN.1   13us : cpu_load_update_nohz <-tick_nohz_idle_exit
+  <idle>-0       3dN.1   13us : _raw_spin_lock <-cpu_load_update_nohz
    <idle>-0       3dN.1   13us : add_preempt_count <-_raw_spin_lock
-  <idle>-0       3dN.2   13us : __update_cpu_load <-update_cpu_load_nohz
-  <idle>-0       3dN.2   14us : sched_avg_update <-__update_cpu_load
-  <idle>-0       3dN.2   14us : _raw_spin_unlock <-update_cpu_load_nohz
+  <idle>-0       3dN.2   13us : __cpu_load_update <-cpu_load_update_nohz
+  <idle>-0       3dN.2   14us : sched_avg_update <-__cpu_load_update
+  <idle>-0       3dN.2   14us : _raw_spin_unlock <-cpu_load_update_nohz
    <idle>-0       3dN.2   14us : sub_preempt_count <-_raw_spin_unlock
    <idle>-0       3dN.1   15us : calc_load_exit_idle <-tick_nohz_idle_exit
    <idle>-0       3dN.1   15us : touch_softlockup_watchdog <-tick_nohz_idle_exit
diff --git a/arch/arm/include/asm/mmu_context.h b/arch/arm/include/asm/mmu_context.h

index fa5b42d44985fbda00595f891450c6a60c8fd48b..ed73babc0dc915c06f4282696a9cf2e34e28d814 100644 (file)
--- a/arch/arm/include/asm/mmu_context.h
+++ b/arch/arm/include/asm/mmu_context.h
@@ -15,6 +15,7 @@
  
  #include <linux/compiler.h>
  #include <linux/sched.h>
+#include <linux/preempt.h>
  #include <asm/cacheflush.h>
  #include <asm/cachetype.h>
  #include <asm/proc-fns.h>
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c

index 041e442a3e2806ed884584758cb8e62abd809e36..dd39fde66b54b62f754d8b472b9edba7b2ffe6e0 100644 (file)
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2177,7 +2177,7 @@ void arch_perf_update_userpage(struct perf_event *event,
          * cap_user_time_zero doesn't make sense when we're using a different
          * time base for the records.
          */
-       if (event->clock == &local_clock) {
+       if (!event->attr.use_clockid) {
                 userpg->cap_user_time_zero = 1;
                 userpg->time_zero = data->cyc2ns_offset;
         }
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h

index 84280029cafd73a83e64efd3133a8f9d9575bcb3..396348196aa779aeb55bea2b1d9b0782475e3558 100644 (file)
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -115,103 +115,12 @@ static inline void destroy_context(struct mm_struct *mm)
         destroy_context_ldt(mm);
  }
  
-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
-                            struct task_struct *tsk)
-{
-       unsigned cpu = smp_processor_id();
+extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+                     struct task_struct *tsk);
  
-       if (likely(prev != next)) {
-#ifdef CONFIG_SMP
-               this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
-               this_cpu_write(cpu_tlbstate.active_mm, next);
-#endif
-               cpumask_set_cpu(cpu, mm_cpumask(next));
-
-               /*
-                * Re-load page tables.
-                *
-                * This logic has an ordering constraint:
-                *
-                *  CPU 0: Write to a PTE for 'next'
-                *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
-                *  CPU 1: set bit 1 in next's mm_cpumask
-                *  CPU 1: load from the PTE that CPU 0 writes (implicit)
-                *
-                * We need to prevent an outcome in which CPU 1 observes
-                * the new PTE value and CPU 0 observes bit 1 clear in
-                * mm_cpumask.  (If that occurs, then the IPI will never
-                * be sent, and CPU 0's TLB will contain a stale entry.)
-                *
-                * The bad outcome can occur if either CPU's load is
-                * reordered before that CPU's store, so both CPUs must
-                * execute full barriers to prevent this from happening.
-                *
-                * Thus, switch_mm needs a full barrier between the
-                * store to mm_cpumask and any operation that could load
-                * from next->pgd.  TLB fills are special and can happen
-                * due to instruction fetches or for no reason at all,
-                * and neither LOCK nor MFENCE orders them.
-                * Fortunately, load_cr3() is serializing and gives the
-                * ordering guarantee we need.
-                *
-                */
-               load_cr3(next->pgd);
-
-               trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
-
-               /* Stop flush ipis for the previous mm */
-               cpumask_clear_cpu(cpu, mm_cpumask(prev));
-
-               /* Load per-mm CR4 state */
-               load_mm_cr4(next);
-
-#ifdef CONFIG_MODIFY_LDT_SYSCALL
-               /*
-                * Load the LDT, if the LDT is different.
-                *
-                * It's possible that prev->context.ldt doesn't match
-                * the LDT register.  This can happen if leave_mm(prev)
-                * was called and then modify_ldt changed
-                * prev->context.ldt but suppressed an IPI to this CPU.
-                * In this case, prev->context.ldt != NULL, because we
-                * never set context.ldt to NULL while the mm still
-                * exists.  That means that next->context.ldt !=
-                * prev->context.ldt, because mms never share an LDT.
-                */
-               if (unlikely(prev->context.ldt != next->context.ldt))
-                       load_mm_ldt(next);
-#endif
-       }
-#ifdef CONFIG_SMP
-         else {
-               this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
-               BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
-
-               if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
-                       /*
-                        * On established mms, the mm_cpumask is only changed
-                        * from irq context, from ptep_clear_flush() while in
-                        * lazy tlb mode, and here. Irqs are blocked during
-                        * schedule, protecting us from simultaneous changes.
-                        */
-                       cpumask_set_cpu(cpu, mm_cpumask(next));
-
-                       /*
-                        * We were in lazy tlb mode and leave_mm disabled
-                        * tlb flush IPI delivery. We must reload CR3
-                        * to make sure to use no freed page tables.
-                        *
-                        * As above, load_cr3() is serializing and orders TLB
-                        * fills with respect to the mm_cpumask write.
-                        */
-                       load_cr3(next->pgd);
-                       trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
-                       load_mm_cr4(next);
-                       load_mm_ldt(next);
-               }
-       }
-#endif
-}
+extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+                              struct task_struct *tsk);
+#define switch_mm_irqs_off switch_mm_irqs_off
  
  #define activate_mm(prev, next)                        \
  do {                                           \
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile

index f98913258c639dac160fa8f01304486c48fb391d..62c0043a5fd545f09a584e2c1f991923c9b16afb 100644 (file)
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -2,7 +2,7 @@
  KCOV_INSTRUMENT_tlb.o  := n
  
  obj-y  :=  init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
-           pat.o pgtable.o physaddr.o gup.o setup_nx.o
+           pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o
  
  # Make sure __phys_addr has no stackprotector
  nostackp := $(call cc-option, -fno-stack-protector)
@@ -12,7 +12,6 @@ CFLAGS_setup_nx.o             := $(nostackp)
  CFLAGS_fault.o := -I$(src)/../include/asm/trace
  
  obj-$(CONFIG_X86_PAT)          += pat_rbtree.o
-obj-$(CONFIG_SMP)              += tlb.o
  
  obj-$(CONFIG_X86_32)           += pgtable_32.o iomap_32.o
  
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c

index fe9b9f77636168752f989b9d007634396864bf13..5643fd0b1a7d271da14dee848589d99437b25b49 100644 (file)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -28,6 +28,8 @@
   *     Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
   */
  
+#ifdef CONFIG_SMP
+
  struct flush_tlb_info {
         struct mm_struct *flush_mm;
         unsigned long flush_start;
@@ -57,6 +59,118 @@ void leave_mm(int cpu)
  }
  EXPORT_SYMBOL_GPL(leave_mm);
  
+#endif /* CONFIG_SMP */
+
+void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+              struct task_struct *tsk)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       switch_mm_irqs_off(prev, next, tsk);
+       local_irq_restore(flags);
+}
+
+void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+                       struct task_struct *tsk)
+{
+       unsigned cpu = smp_processor_id();
+
+       if (likely(prev != next)) {
+#ifdef CONFIG_SMP
+               this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+               this_cpu_write(cpu_tlbstate.active_mm, next);
+#endif
+               cpumask_set_cpu(cpu, mm_cpumask(next));
+
+               /*
+                * Re-load page tables.
+                *
+                * This logic has an ordering constraint:
+                *
+                *  CPU 0: Write to a PTE for 'next'
+                *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
+                *  CPU 1: set bit 1 in next's mm_cpumask
+                *  CPU 1: load from the PTE that CPU 0 writes (implicit)
+                *
+                * We need to prevent an outcome in which CPU 1 observes
+                * the new PTE value and CPU 0 observes bit 1 clear in
+                * mm_cpumask.  (If that occurs, then the IPI will never
+                * be sent, and CPU 0's TLB will contain a stale entry.)
+                *
+                * The bad outcome can occur if either CPU's load is
+                * reordered before that CPU's store, so both CPUs must
+                * execute full barriers to prevent this from happening.
+                *
+                * Thus, switch_mm needs a full barrier between the
+                * store to mm_cpumask and any operation that could load
+                * from next->pgd.  TLB fills are special and can happen
+                * due to instruction fetches or for no reason at all,
+                * and neither LOCK nor MFENCE orders them.
+                * Fortunately, load_cr3() is serializing and gives the
+                * ordering guarantee we need.
+                *
+                */
+               load_cr3(next->pgd);
+
+               trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+
+               /* Stop flush ipis for the previous mm */
+               cpumask_clear_cpu(cpu, mm_cpumask(prev));
+
+               /* Load per-mm CR4 state */
+               load_mm_cr4(next);
+
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+               /*
+                * Load the LDT, if the LDT is different.
+                *
+                * It's possible that prev->context.ldt doesn't match
+                * the LDT register.  This can happen if leave_mm(prev)
+                * was called and then modify_ldt changed
+                * prev->context.ldt but suppressed an IPI to this CPU.
+                * In this case, prev->context.ldt != NULL, because we
+                * never set context.ldt to NULL while the mm still
+                * exists.  That means that next->context.ldt !=
+                * prev->context.ldt, because mms never share an LDT.
+                */
+               if (unlikely(prev->context.ldt != next->context.ldt))
+                       load_mm_ldt(next);
+#endif
+       }
+#ifdef CONFIG_SMP
+         else {
+               this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+               BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
+
+               if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
+                       /*
+                        * On established mms, the mm_cpumask is only changed
+                        * from irq context, from ptep_clear_flush() while in
+                        * lazy tlb mode, and here. Irqs are blocked during
+                        * schedule, protecting us from simultaneous changes.
+                        */
+                       cpumask_set_cpu(cpu, mm_cpumask(next));
+
+                       /*
+                        * We were in lazy tlb mode and leave_mm disabled
+                        * tlb flush IPI delivery. We must reload CR3
+                        * to make sure to use no freed page tables.
+                        *
+                        * As above, load_cr3() is serializing and orders TLB
+                        * fills with respect to the mm_cpumask write.
+                        */
+                       load_cr3(next->pgd);
+                       trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+                       load_mm_cr4(next);
+                       load_mm_ldt(next);
+               }
+       }
+#endif
+}
+
+#ifdef CONFIG_SMP
+
  /*
   * The flush IPI assumes that a thread switch happens in this order:
   * [cpu0: the cpu that switches]
@@ -353,3 +467,5 @@ static int __init create_tlb_single_page_flush_ceiling(void)
         return 0;
  }
  late_initcall(create_tlb_single_page_flush_ceiling);
+
+#endif /* CONFIG_SMP */
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h

index d10ef06971b57d8783934333612cb196421716ab..fb7d87e45fbe6cae331df4fb34276f6c7510374c 100644 (file)
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -356,8 +356,13 @@ extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask);
  extern void lockdep_clear_current_reclaim_state(void);
  extern void lockdep_trace_alloc(gfp_t mask);
  
-extern void lock_pin_lock(struct lockdep_map *lock);
-extern void lock_unpin_lock(struct lockdep_map *lock);
+struct pin_cookie { unsigned int val; };
+
+#define NIL_COOKIE (struct pin_cookie){ .val = 0U, }
+
+extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock);
+extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie);
+extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);
  
  # define INIT_LOCKDEP                          .lockdep_recursion = 0, .lockdep_reclaim_gfp = 0,
  
@@ -373,8 +378,9 @@ extern void lock_unpin_lock(struct lockdep_map *lock);
  
  #define lockdep_recursing(tsk) ((tsk)->lockdep_recursion)
  
-#define lockdep_pin_lock(l)            lock_pin_lock(&(l)->dep_map)
-#define lockdep_unpin_lock(l)  lock_unpin_lock(&(l)->dep_map)
+#define lockdep_pin_lock(l)    lock_pin_lock(&(l)->dep_map)
+#define lockdep_repin_lock(l,c)        lock_repin_lock(&(l)->dep_map, (c))
+#define lockdep_unpin_lock(l,c)        lock_unpin_lock(&(l)->dep_map, (c))
  
  #else /* !CONFIG_LOCKDEP */
  
@@ -427,8 +433,13 @@ struct lock_class_key { };
  
  #define lockdep_recursing(tsk)                 (0)
  
-#define lockdep_pin_lock(l)                            do { (void)(l); } while (0)
-#define lockdep_unpin_lock(l)                  do { (void)(l); } while (0)
+struct pin_cookie { };
+
+#define NIL_COOKIE (struct pin_cookie){ }
+
+#define lockdep_pin_lock(l)                    ({ struct pin_cookie cookie; cookie; })
+#define lockdep_repin_lock(l, c)               do { (void)(l); (void)(c); } while (0)
+#define lockdep_unpin_lock(l, c)               do { (void)(l); (void)(c); } while (0)
  
  #endif /* !LOCKDEP */
  
diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h

index 70fffeba7495802c4f0edf83738f2b95952e7e3d..a4441784503b5bcade00e20c5b67ac496a9f1143 100644 (file)
--- a/include/linux/mmu_context.h
+++ b/include/linux/mmu_context.h
@@ -1,9 +1,16 @@
  #ifndef _LINUX_MMU_CONTEXT_H
  #define _LINUX_MMU_CONTEXT_H
  
+#include <asm/mmu_context.h>
+
  struct mm_struct;
  
  void use_mm(struct mm_struct *mm);
  void unuse_mm(struct mm_struct *mm);
  
+/* Architectures that care about IRQ state in switch_mm can override this. */
+#ifndef switch_mm_irqs_off
+# define switch_mm_irqs_off switch_mm
+#endif
+
  #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 52c4847b05e2882a72d04c3c75fc4d55c2b4a6b9..57faf789c88f8d86576138053f6e1b85b672159b 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -178,9 +178,11 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
  extern void calc_global_load(unsigned long ticks);
  
  #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
-extern void update_cpu_load_nohz(int active);
+extern void cpu_load_update_nohz_start(void);
+extern void cpu_load_update_nohz_stop(void);
  #else
-static inline void update_cpu_load_nohz(int active) { }
+static inline void cpu_load_update_nohz_start(void) { }
+static inline void cpu_load_update_nohz_stop(void) { }
  #endif
  
  extern void dump_cpu_task(int cpu);
@@ -934,10 +936,20 @@ enum cpu_idle_type {
         CPU_MAX_IDLE_TYPES
  };
  
+/*
+ * Integer metrics need fixed point arithmetic, e.g., sched/fair
+ * has a few: load, load_avg, util_avg, freq, and capacity.
+ *
+ * We define a basic fixed point arithmetic range, and then formalize
+ * all these metrics based on that basic range.
+ */
+# define SCHED_FIXEDPOINT_SHIFT        10
+# define SCHED_FIXEDPOINT_SCALE        (1L << SCHED_FIXEDPOINT_SHIFT)
+
  /*
   * Increase resolution of cpu_capacity calculations
   */
-#define SCHED_CAPACITY_SHIFT   10
+#define SCHED_CAPACITY_SHIFT   SCHED_FIXEDPOINT_SHIFT
  #define SCHED_CAPACITY_SCALE   (1L << SCHED_CAPACITY_SHIFT)
  
  /*
@@ -1199,18 +1211,56 @@ struct load_weight {
  };
  
  /*
- * The load_avg/util_avg accumulates an infinite geometric series.
- * 1) load_avg factors frequency scaling into the amount of time that a
- * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the
- * aggregated such weights of all runnable and blocked sched_entities.
- * 2) util_avg factors frequency and cpu scaling into the amount of time
- * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
- * For cfs_rq, it is the aggregated such times of all runnable and
+ * The load_avg/util_avg accumulates an infinite geometric series
+ * (see __update_load_avg() in kernel/sched/fair.c).
+ *
+ * [load_avg definition]
+ *
+ *   load_avg = runnable% * scale_load_down(load)
+ *
+ * where runnable% is the time ratio that a sched_entity is runnable.
+ * For cfs_rq, it is the aggregated load_avg of all runnable and
   * blocked sched_entities.
- * The 64 bit load_sum can:
- * 1) for cfs_rq, afford 4353082796 (=2^64/47742/88761) entities with
- * the highest weight (=88761) always runnable, we should not overflow
- * 2) for entity, support any load.weight always runnable
+ *
+ * load_avg may also take frequency scaling into account:
+ *
+ *   load_avg = runnable% * scale_load_down(load) * freq%
+ *
+ * where freq% is the CPU frequency normalized to the highest frequency.
+ *
+ * [util_avg definition]
+ *
+ *   util_avg = running% * SCHED_CAPACITY_SCALE
+ *
+ * where running% is the time ratio that a sched_entity is running on
+ * a CPU. For cfs_rq, it is the aggregated util_avg of all runnable
+ * and blocked sched_entities.
+ *
+ * util_avg may also factor frequency scaling and CPU capacity scaling:
+ *
+ *   util_avg = running% * SCHED_CAPACITY_SCALE * freq% * capacity%
+ *
+ * where freq% is the same as above, and capacity% is the CPU capacity
+ * normalized to the greatest capacity (due to uarch differences, etc).
+ *
+ * N.B., the above ratios (runnable%, running%, freq%, and capacity%)
+ * themselves are in the range of [0, 1]. To do fixed point arithmetic,
+ * we therefore scale them to as large a range as necessary. This is for
+ * example reflected by util_avg's SCHED_CAPACITY_SCALE.
+ *
+ * [Overflow issue]
+ *
+ * The 64-bit load_sum can hold 4353082796 (=2^64/47742/88761) entities
+ * with the highest load (=88761), always runnable on a single cfs_rq,
+ * without overflowing; that number is already beyond PID_MAX_LIMIT.
+ *
+ * For all other cases (including 32-bit kernels), struct load_weight's
+ * weight will overflow before we do, because:
+ *
+ *    Max(load_avg) <= Max(load.weight)
+ *
+ * Then it is the load_weight's responsibility to consider overflow
+ * issues.
   */
  struct sched_avg {
         u64 last_update_time, load_sum;
@@ -2303,8 +2353,6 @@ extern unsigned long long notrace sched_clock(void);
  /*
   * See the comment in kernel/sched/clock.c
   */
-extern u64 cpu_clock(int cpu);
-extern u64 local_clock(void);
  extern u64 running_clock(void);
  extern u64 sched_clock_cpu(int cpu);
  
@@ -2323,6 +2371,16 @@ static inline void sched_clock_idle_sleep_event(void)
  static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
  {
  }
+
+static inline u64 cpu_clock(int cpu)
+{
+       return sched_clock();
+}
+
+static inline u64 local_clock(void)
+{
+       return sched_clock();
+}
  #else
  /*
   * Architectures can set this to 1 if they have specified
@@ -2337,6 +2395,26 @@ extern void clear_sched_clock_stable(void);
  extern void sched_clock_tick(void);
  extern void sched_clock_idle_sleep_event(void);
  extern void sched_clock_idle_wakeup_event(u64 delta_ns);
+
+/*
+ * As outlined in clock.c, provides a fast, high resolution, nanosecond
+ * time source that is monotonic per cpu argument and has bounded drift
+ * between cpus.
+ *
+ * ######################### BIG FAT WARNING ##########################
+ * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
+ * # go backwards !!                                                  #
+ * ####################################################################
+ */
+static inline u64 cpu_clock(int cpu)
+{
+       return sched_clock_cpu(cpu);
+}
+
+static inline u64 local_clock(void)
+{
+       return sched_clock_cpu(raw_smp_processor_id());
+}
  #endif
  
  #ifdef CONFIG_IRQ_TIME_ACCOUNTING
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c

index 78c1c0ee6dc1256904e1afb90611818813fc031c..68bc6a654ca309429012130015f71b0c87009a5a 100644 (file)
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -45,6 +45,7 @@
  #include <linux/bitops.h>
  #include <linux/gfp.h>
  #include <linux/kmemcheck.h>
+#include <linux/random.h>
  
  #include <asm/sections.h>
  
@@ -3585,7 +3586,35 @@ static int __lock_is_held(struct lockdep_map *lock)
         return 0;
  }
  
-static void __lock_pin_lock(struct lockdep_map *lock)
+static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock)
+{
+       struct pin_cookie cookie = NIL_COOKIE;
+       struct task_struct *curr = current;
+       int i;
+
+       if (unlikely(!debug_locks))
+               return cookie;
+
+       for (i = 0; i < curr->lockdep_depth; i++) {
+               struct held_lock *hlock = curr->held_locks + i;
+
+               if (match_held_lock(hlock, lock)) {
+                       /*
+                        * Grab 16 bits of randomness; this is sufficient to not
+                        * be guessable and still allows some pin nesting in
+                        * our u32 pin_count.
+                        */
+                       cookie.val = 1 + (prandom_u32() >> 16);
+                       hlock->pin_count += cookie.val;
+                       return cookie;
+               }
+       }
+
+       WARN(1, "pinning an unheld lock\n");
+       return cookie;
+}
+
+static void __lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
  {
         struct task_struct *curr = current;
         int i;
@@ -3597,7 +3626,7 @@ static void __lock_pin_lock(struct lockdep_map *lock)
                 struct held_lock *hlock = curr->held_locks + i;
  
                 if (match_held_lock(hlock, lock)) {
-                       hlock->pin_count++;
+                       hlock->pin_count += cookie.val;
                         return;
                 }
         }
@@ -3605,7 +3634,7 @@ static void __lock_pin_lock(struct lockdep_map *lock)
         WARN(1, "pinning an unheld lock\n");
  }
  
-static void __lock_unpin_lock(struct lockdep_map *lock)
+static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
  {
         struct task_struct *curr = current;
         int i;
@@ -3620,7 +3649,11 @@ static void __lock_unpin_lock(struct lockdep_map *lock)
                         if (WARN(!hlock->pin_count, "unpinning an unpinned lock\n"))
                                 return;
  
-                       hlock->pin_count--;
+                       hlock->pin_count -= cookie.val;
+
+                       if (WARN((int)hlock->pin_count < 0, "pin count corrupted\n"))
+                               hlock->pin_count = 0;
+
                         return;
                 }
         }
@@ -3751,24 +3784,44 @@ int lock_is_held(struct lockdep_map *lock)
  }
  EXPORT_SYMBOL_GPL(lock_is_held);
  
-void lock_pin_lock(struct lockdep_map *lock)
+struct pin_cookie lock_pin_lock(struct lockdep_map *lock)
  {
+       struct pin_cookie cookie = NIL_COOKIE;
         unsigned long flags;
  
         if (unlikely(current->lockdep_recursion))
-               return;
+               return cookie;
  
         raw_local_irq_save(flags);
         check_flags(flags);
  
         current->lockdep_recursion = 1;
-       __lock_pin_lock(lock);
+       cookie = __lock_pin_lock(lock);
         current->lockdep_recursion = 0;
         raw_local_irq_restore(flags);
+
+       return cookie;
  }
  EXPORT_SYMBOL_GPL(lock_pin_lock);
  
-void lock_unpin_lock(struct lockdep_map *lock)
+void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
+{
+       unsigned long flags;
+
+       if (unlikely(current->lockdep_recursion))
+               return;
+
+       raw_local_irq_save(flags);
+       check_flags(flags);
+
+       current->lockdep_recursion = 1;
+       __lock_repin_lock(lock, cookie);
+       current->lockdep_recursion = 0;
+       raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_repin_lock);
+
+void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
  {
         unsigned long flags;
  
@@ -3779,7 +3832,7 @@ void lock_unpin_lock(struct lockdep_map *lock)
         check_flags(flags);
  
         current->lockdep_recursion = 1;
-       __lock_unpin_lock(lock);
+       __lock_unpin_lock(lock, cookie);
         current->lockdep_recursion = 0;
         raw_local_irq_restore(flags);
  }
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c

index fedb967a98419c14348e8d79becdcb7abf62bfa2..e85a725e5c3496687cccffa196372011f75ef2ad 100644 (file)
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -318,6 +318,7 @@ u64 sched_clock_cpu(int cpu)
  
         return clock;
  }
+EXPORT_SYMBOL_GPL(sched_clock_cpu);
  
  void sched_clock_tick(void)
  {
@@ -363,39 +364,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
  }
  EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
  
-/*
- * As outlined at the top, provides a fast, high resolution, nanosecond
- * time source that is monotonic per cpu argument and has bounded drift
- * between cpus.
- *
- * ######################### BIG FAT WARNING ##########################
- * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
- * # go backwards !!                                                  #
- * ####################################################################
- */
-u64 cpu_clock(int cpu)
-{
-       if (!sched_clock_stable())
-               return sched_clock_cpu(cpu);
-
-       return sched_clock();
-}
-
-/*
- * Similar to cpu_clock() for the current cpu. Time will only be observed
- * to be monotonic if care is taken to only compare timestampt taken on the
- * same CPU.
- *
- * See cpu_clock().
- */
-u64 local_clock(void)
-{
-       if (!sched_clock_stable())
-               return sched_clock_cpu(raw_smp_processor_id());
-
-       return sched_clock();
-}
-
  #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
  
  void sched_clock_init(void)
@@ -410,22 +378,8 @@ u64 sched_clock_cpu(int cpu)
  
         return sched_clock();
  }
-
-u64 cpu_clock(int cpu)
-{
-       return sched_clock();
-}
-
-u64 local_clock(void)
-{
-       return sched_clock();
-}
-
  #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
  
-EXPORT_SYMBOL_GPL(cpu_clock);
-EXPORT_SYMBOL_GPL(local_clock);
-
  /*
   * Running clock - returns the time that has elapsed while a guest has been
   * running.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index d1f7149f870439d65b9cfcfbc27d4160bbb1672f..e09f92c3a096d0eefeb005da0d92582a65c62ee9 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -33,7 +33,7 @@
  #include <linux/init.h>
  #include <linux/uaccess.h>
  #include <linux/highmem.h>
-#include <asm/mmu_context.h>
+#include <linux/mmu_context.h>
  #include <linux/interrupt.h>
  #include <linux/capability.h>
  #include <linux/completion.h>
@@ -170,6 +170,71 @@ static struct rq *this_rq_lock(void)
         return rq;
  }
  
+/*
+ * __task_rq_lock - lock the rq @p resides on.
+ */
+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+       __acquires(rq->lock)
+{
+       struct rq *rq;
+
+       lockdep_assert_held(&p->pi_lock);
+
+       for (;;) {
+               rq = task_rq(p);
+               raw_spin_lock(&rq->lock);
+               if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+                       rf->cookie = lockdep_pin_lock(&rq->lock);
+                       return rq;
+               }
+               raw_spin_unlock(&rq->lock);
+
+               while (unlikely(task_on_rq_migrating(p)))
+                       cpu_relax();
+       }
+}
+
+/*
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
+ */
+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+       __acquires(p->pi_lock)
+       __acquires(rq->lock)
+{
+       struct rq *rq;
+
+       for (;;) {
+               raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
+               rq = task_rq(p);
+               raw_spin_lock(&rq->lock);
+               /*
+                *      move_queued_task()              task_rq_lock()
+                *
+                *      ACQUIRE (rq->lock)
+                *      [S] ->on_rq = MIGRATING         [L] rq = task_rq()
+                *      WMB (__set_task_cpu())          ACQUIRE (rq->lock);
+                *      [S] ->cpu = new_cpu             [L] task_rq()
+                *                                      [L] ->on_rq
+                *      RELEASE (rq->lock)
+                *
+                * If we observe the old cpu in task_rq_lock, the acquire of
+                * the old rq->lock will fully serialize against the stores.
+                *
+                * If we observe the new cpu in task_rq_lock, the acquire will
+                * pair with the WMB to ensure we must then also see migrating.
+                */
+               if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+                       rf->cookie = lockdep_pin_lock(&rq->lock);
+                       return rq;
+               }
+               raw_spin_unlock(&rq->lock);
+               raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
+
+               while (unlikely(task_on_rq_migrating(p)))
+                       cpu_relax();
+       }
+}
+
  #ifdef CONFIG_SCHED_HRTICK
  /*
   * Use HR-timers to deliver accurate preemption points.
@@ -400,7 +465,7 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
          * wakeup due to that.
          *
          * This cmpxchg() implies a full barrier, which pairs with the write
-        * barrier implied by the wakeup in wake_up_list().
+        * barrier implied by the wakeup in wake_up_q().
          */
         if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
                 return;
@@ -1085,12 +1150,12 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
  static int __set_cpus_allowed_ptr(struct task_struct *p,
                                   const struct cpumask *new_mask, bool check)
  {
-       unsigned long flags;
-       struct rq *rq;
         unsigned int dest_cpu;
+       struct rq_flags rf;
+       struct rq *rq;
         int ret = 0;
  
-       rq = task_rq_lock(p, &flags);
+       rq = task_rq_lock(p, &rf);
  
         /*
          * Must re-check here, to close a race against __kthread_bind(),
@@ -1119,7 +1184,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
         if (task_running(rq, p) || p->state == TASK_WAKING) {
                 struct migration_arg arg = { p, dest_cpu };
                 /* Need help from migration thread: drop lock and wait. */
-               task_rq_unlock(rq, p, &flags);
+               task_rq_unlock(rq, p, &rf);
                 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
                 tlb_migrate_finish(p->mm);
                 return 0;
@@ -1128,12 +1193,12 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
                  * OK, since we're going to drop the lock immediately
                  * afterwards anyway.
                  */
-               lockdep_unpin_lock(&rq->lock);
+               lockdep_unpin_lock(&rq->lock, rf.cookie);
                 rq = move_queued_task(rq, p, dest_cpu);
-               lockdep_pin_lock(&rq->lock);
+               lockdep_repin_lock(&rq->lock, rf.cookie);
         }
  out:
-       task_rq_unlock(rq, p, &flags);
+       task_rq_unlock(rq, p, &rf);
  
         return ret;
  }
@@ -1317,8 +1382,8 @@ out:
   */
  unsigned long wait_task_inactive(struct task_struct *p, long match_state)
  {
-       unsigned long flags;
         int running, queued;
+       struct rq_flags rf;
         unsigned long ncsw;
         struct rq *rq;
  
@@ -1353,14 +1418,14 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
                  * lock now, to be *sure*. If we're wrong, we'll
                  * just go back and repeat.
                  */
-               rq = task_rq_lock(p, &flags);
+               rq = task_rq_lock(p, &rf);
                 trace_sched_wait_task(p);
                 running = task_running(rq, p);
                 queued = task_on_rq_queued(p);
                 ncsw = 0;
                 if (!match_state || p->state == match_state)
                         ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
-               task_rq_unlock(rq, p, &flags);
+               task_rq_unlock(rq, p, &rf);
  
                 /*
                  * If it changed from the expected state, bail out now.
@@ -1604,8 +1669,8 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
  /*
   * Mark the task runnable and perform wakeup-preemption.
   */
-static void
-ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
+                          struct pin_cookie cookie)
  {
         check_preempt_curr(rq, p, wake_flags);
         p->state = TASK_RUNNING;
@@ -1617,9 +1682,9 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
                  * Our task @p is fully woken up and running; so its safe to
                  * drop the rq->lock, hereafter rq is only used for statistics.
                  */
-               lockdep_unpin_lock(&rq->lock);
+               lockdep_unpin_lock(&rq->lock, cookie);
                 p->sched_class->task_woken(rq, p);
-               lockdep_pin_lock(&rq->lock);
+               lockdep_repin_lock(&rq->lock, cookie);
         }
  
         if (rq->idle_stamp) {
@@ -1637,7 +1702,8 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
  }
  
  static void
-ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
+                struct pin_cookie cookie)
  {
         lockdep_assert_held(&rq->lock);
  
@@ -1647,7 +1713,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
  #endif
  
         ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
-       ttwu_do_wakeup(rq, p, wake_flags);
+       ttwu_do_wakeup(rq, p, wake_flags, cookie);
  }
  
  /*
@@ -1658,17 +1724,18 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
   */
  static int ttwu_remote(struct task_struct *p, int wake_flags)
  {
+       struct rq_flags rf;
         struct rq *rq;
         int ret = 0;
  
-       rq = __task_rq_lock(p);
+       rq = __task_rq_lock(p, &rf);
         if (task_on_rq_queued(p)) {
                 /* check_preempt_curr() may use rq clock */
                 update_rq_clock(rq);
-               ttwu_do_wakeup(rq, p, wake_flags);
+               ttwu_do_wakeup(rq, p, wake_flags, rf.cookie);
                 ret = 1;
         }
-       __task_rq_unlock(rq);
+       __task_rq_unlock(rq, &rf);
  
         return ret;
  }
@@ -1678,6 +1745,7 @@ void sched_ttwu_pending(void)
  {
         struct rq *rq = this_rq();
         struct llist_node *llist = llist_del_all(&rq->wake_list);
+       struct pin_cookie cookie;
         struct task_struct *p;
         unsigned long flags;
  
@@ -1685,15 +1753,15 @@ void sched_ttwu_pending(void)
                 return;
  
         raw_spin_lock_irqsave(&rq->lock, flags);
-       lockdep_pin_lock(&rq->lock);
+       cookie = lockdep_pin_lock(&rq->lock);
  
         while (llist) {
                 p = llist_entry(llist, struct task_struct, wake_entry);
                 llist = llist_next(llist);
-               ttwu_do_activate(rq, p, 0);
+               ttwu_do_activate(rq, p, 0, cookie);
         }
  
-       lockdep_unpin_lock(&rq->lock);
+       lockdep_unpin_lock(&rq->lock, cookie);
         raw_spin_unlock_irqrestore(&rq->lock, flags);
  }
  
@@ -1780,6 +1848,7 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
  static void ttwu_queue(struct task_struct *p, int cpu)
  {
         struct rq *rq = cpu_rq(cpu);
+       struct pin_cookie cookie;
  
  #if defined(CONFIG_SMP)
         if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
@@ -1790,9 +1859,9 @@ static void ttwu_queue(struct task_struct *p, int cpu)
  #endif
  
         raw_spin_lock(&rq->lock);
-       lockdep_pin_lock(&rq->lock);
-       ttwu_do_activate(rq, p, 0);
-       lockdep_unpin_lock(&rq->lock);
+       cookie = lockdep_pin_lock(&rq->lock);
+       ttwu_do_activate(rq, p, 0, cookie);
+       lockdep_unpin_lock(&rq->lock, cookie);
         raw_spin_unlock(&rq->lock);
  }
  
@@ -1989,7 +2058,7 @@ out:
   * ensure that this_rq() is locked, @p is bound to this_rq() and not
   * the current task.
   */
-static void try_to_wake_up_local(struct task_struct *p)
+static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie)
  {
         struct rq *rq = task_rq(p);
  
@@ -2006,11 +2075,11 @@ static void try_to_wake_up_local(struct task_struct *p)
                  * disabled avoiding further scheduler activity on it and we've
                  * not yet picked a replacement task.
                  */
-               lockdep_unpin_lock(&rq->lock);
+               lockdep_unpin_lock(&rq->lock, cookie);
                 raw_spin_unlock(&rq->lock);
                 raw_spin_lock(&p->pi_lock);
                 raw_spin_lock(&rq->lock);
-               lockdep_pin_lock(&rq->lock);
+               lockdep_repin_lock(&rq->lock, cookie);
         }
  
         if (!(p->state & TASK_NORMAL))
@@ -2021,7 +2090,7 @@ static void try_to_wake_up_local(struct task_struct *p)
         if (!task_on_rq_queued(p))
                 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
  
-       ttwu_do_wakeup(rq, p, 0);
+       ttwu_do_wakeup(rq, p, 0, cookie);
         if (schedstat_enabled())
                 ttwu_stat(p, smp_processor_id(), 0);
  out:
@@ -2381,7 +2450,8 @@ static int dl_overflow(struct task_struct *p, int policy,
         u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
         int cpus, err = -1;
  
-       if (new_bw == p->dl.dl_bw)
+       /* !deadline task may carry old deadline bandwidth */
+       if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
                 return 0;
  
         /*
@@ -2420,12 +2490,12 @@ extern void init_dl_bw(struct dl_bw *dl_b);
   */
  void wake_up_new_task(struct task_struct *p)
  {
-       unsigned long flags;
+       struct rq_flags rf;
         struct rq *rq;
  
-       raw_spin_lock_irqsave(&p->pi_lock, flags);
         /* Initialize new task's runnable average */
         init_entity_runnable_average(&p->se);
+       raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
  #ifdef CONFIG_SMP
         /*
          * Fork balancing, do it here and not earlier because:
@@ -2434,8 +2504,10 @@ void wake_up_new_task(struct task_struct *p)
          */
         set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
  #endif
+       /* Post initialize new task's util average when its cfs_rq is set */
+       post_init_entity_util_avg(&p->se);
  
-       rq = __task_rq_lock(p);
+       rq = __task_rq_lock(p, &rf);
         activate_task(rq, p, 0);
         p->on_rq = TASK_ON_RQ_QUEUED;
         trace_sched_wakeup_new(p);
@@ -2446,12 +2518,12 @@ void wake_up_new_task(struct task_struct *p)
                  * Nothing relies on rq->lock after this, so its fine to
                  * drop it.
                  */
-               lockdep_unpin_lock(&rq->lock);
+               lockdep_unpin_lock(&rq->lock, rf.cookie);
                 p->sched_class->task_woken(rq, p);
-               lockdep_pin_lock(&rq->lock);
+               lockdep_repin_lock(&rq->lock, rf.cookie);
         }
  #endif
-       task_rq_unlock(rq, p, &flags);
+       task_rq_unlock(rq, p, &rf);
  }
  
  #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2713,7 +2785,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
   */
  static __always_inline struct rq *
  context_switch(struct rq *rq, struct task_struct *prev,
-              struct task_struct *next)
+              struct task_struct *next, struct pin_cookie cookie)
  {
         struct mm_struct *mm, *oldmm;
  
@@ -2733,7 +2805,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
                 atomic_inc(&oldmm->mm_count);
                 enter_lazy_tlb(oldmm, next);
         } else
-               switch_mm(oldmm, mm, next);
+               switch_mm_irqs_off(oldmm, mm, next);
  
         if (!prev->mm) {
                 prev->active_mm = NULL;
@@ -2745,7 +2817,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
          * of the scheduler it's an obvious special-case), so we
          * do an early lockdep release here:
          */
-       lockdep_unpin_lock(&rq->lock);
+       lockdep_unpin_lock(&rq->lock, cookie);
         spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
  
         /* Here we just switch the register state and the stack. */
@@ -2867,7 +2939,7 @@ EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
   */
  unsigned long long task_sched_runtime(struct task_struct *p)
  {
-       unsigned long flags;
+       struct rq_flags rf;
         struct rq *rq;
         u64 ns;
  
@@ -2887,7 +2959,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
                 return p->se.sum_exec_runtime;
  #endif
  
-       rq = task_rq_lock(p, &flags);
+       rq = task_rq_lock(p, &rf);
         /*
          * Must be ->curr _and_ ->on_rq.  If dequeued, we would
          * project cycles that may never be accounted to this
@@ -2898,7 +2970,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
                 p->sched_class->update_curr(rq);
         }
         ns = p->se.sum_exec_runtime;
-       task_rq_unlock(rq, p, &flags);
+       task_rq_unlock(rq, p, &rf);
  
         return ns;
  }
@@ -2918,7 +2990,7 @@ void scheduler_tick(void)
         raw_spin_lock(&rq->lock);
         update_rq_clock(rq);
         curr->sched_class->task_tick(rq, curr, 0);
-       update_cpu_load_active(rq);
+       cpu_load_update_active(rq);
         calc_global_load_tick(rq);
         raw_spin_unlock(&rq->lock);
  
@@ -2961,6 +3033,20 @@ u64 scheduler_tick_max_deferment(void)
  
  #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
                                 defined(CONFIG_PREEMPT_TRACER))
+/*
+ * If the value passed in is equal to the current preempt count
+ * then we just disabled preemption. Start timing the latency.
+ *
+ * @val: the amount the caller has added to the preempt count before
+ *       calling this helper.
+ *
+ * On a match the parent IP is handed to trace_preempt_off(); with
+ * CONFIG_DEBUG_PREEMPT it is also saved in current->preempt_disable_ip.
+ */
+static inline void preempt_latency_start(int val)
+{
+       if (preempt_count() == val) {
+               unsigned long ip = get_lock_parent_ip();
+#ifdef CONFIG_DEBUG_PREEMPT
+               current->preempt_disable_ip = ip;
+#endif
+               trace_preempt_off(CALLER_ADDR0, ip);
+       }
+}
  
  void preempt_count_add(int val)
  {
@@ -2979,17 +3065,21 @@ void preempt_count_add(int val)
         DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
                                 PREEMPT_MASK - 10);
  #endif
-       if (preempt_count() == val) {
-               unsigned long ip = get_lock_parent_ip();
-#ifdef CONFIG_DEBUG_PREEMPT
-               current->preempt_disable_ip = ip;
-#endif
-               trace_preempt_off(CALLER_ADDR0, ip);
-       }
+       preempt_latency_start(val);
  }
  EXPORT_SYMBOL(preempt_count_add);
  NOKPROBE_SYMBOL(preempt_count_add);
  
+/*
+ * If the value passed in is equal to the current preempt count
+ * then removing it re-enables preemption. Stop timing the latency.
+ *
+ * @val: the amount the caller is about to subtract from the preempt
+ *       count; callers invoke this helper before dropping the count.
+ */
+static inline void preempt_latency_stop(int val)
+{
+       if (preempt_count() == val)
+               trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
+}
+
  void preempt_count_sub(int val)
  {
  #ifdef CONFIG_DEBUG_PREEMPT
@@ -3006,13 +3096,15 @@ void preempt_count_sub(int val)
                 return;
  #endif
  
-       if (preempt_count() == val)
-               trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
+       preempt_latency_stop(val);
         __preempt_count_sub(val);
  }
  EXPORT_SYMBOL(preempt_count_sub);
  NOKPROBE_SYMBOL(preempt_count_sub);
  
+#else
+static inline void preempt_latency_start(int val) { } /* no-op stub */
+static inline void preempt_latency_stop(int val) { } /* no-op stub */
  #endif
  
  /*
@@ -3065,7 +3157,7 @@ static inline void schedule_debug(struct task_struct *prev)
   * Pick up the highest-prio task:
   */
  static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev)
+pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
  {
         const struct sched_class *class = &fair_sched_class;
         struct task_struct *p;
@@ -3076,20 +3168,20 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
          */
         if (likely(prev->sched_class == class &&
                    rq->nr_running == rq->cfs.h_nr_running)) {
-               p = fair_sched_class.pick_next_task(rq, prev);
+               p = fair_sched_class.pick_next_task(rq, prev, cookie);
                 if (unlikely(p == RETRY_TASK))
                         goto again;
  
                 /* assumes fair_sched_class->next == idle_sched_class */
                 if (unlikely(!p))
-                       p = idle_sched_class.pick_next_task(rq, prev);
+                       p = idle_sched_class.pick_next_task(rq, prev, cookie);
  
                 return p;
         }
  
  again:
         for_each_class(class) {
-               p = class->pick_next_task(rq, prev);
+               p = class->pick_next_task(rq, prev, cookie);
                 if (p) {
                         if (unlikely(p == RETRY_TASK))
                                 goto again;
@@ -3143,6 +3235,7 @@ static void __sched notrace __schedule(bool preempt)
  {
         struct task_struct *prev, *next;
         unsigned long *switch_count;
+       struct pin_cookie cookie;
         struct rq *rq;
         int cpu;
  
@@ -3176,7 +3269,7 @@ static void __sched notrace __schedule(bool preempt)
          */
         smp_mb__before_spinlock();
         raw_spin_lock(&rq->lock);
-       lockdep_pin_lock(&rq->lock);
+       cookie = lockdep_pin_lock(&rq->lock);
  
         rq->clock_skip_update <<= 1; /* promote REQ to ACT */
  
@@ -3198,7 +3291,7 @@ static void __sched notrace __schedule(bool preempt)
  
                                 to_wakeup = wq_worker_sleeping(prev);
                                 if (to_wakeup)
-                                       try_to_wake_up_local(to_wakeup);
+                                       try_to_wake_up_local(to_wakeup, cookie);
                         }
                 }
                 switch_count = &prev->nvcsw;
@@ -3207,7 +3300,7 @@ static void __sched notrace __schedule(bool preempt)
         if (task_on_rq_queued(prev))
                 update_rq_clock(rq);
  
-       next = pick_next_task(rq, prev);
+       next = pick_next_task(rq, prev, cookie);
         clear_tsk_need_resched(prev);
         clear_preempt_need_resched();
         rq->clock_skip_update = 0;
@@ -3218,9 +3311,9 @@ static void __sched notrace __schedule(bool preempt)
                 ++*switch_count;
  
                 trace_sched_switch(preempt, prev, next);
-               rq = context_switch(rq, prev, next); /* unlocks the rq */
+               rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */
         } else {
-               lockdep_unpin_lock(&rq->lock);
+               lockdep_unpin_lock(&rq->lock, cookie);
                 raw_spin_unlock_irq(&rq->lock);
         }
  
@@ -3287,8 +3380,23 @@ void __sched schedule_preempt_disabled(void)
  static void __sched notrace preempt_schedule_common(void)
  {
         do {
+               /*
+                * Because the function tracer can trace preempt_count_sub()
+                * and it also uses preempt_enable/disable_notrace(), if
+                * NEED_RESCHED is set, the preempt_enable_notrace() called
+                * by the function tracer will call this function again and
+                * cause infinite recursion.
+                *
+                * Preemption must be disabled here before the function
+                * tracer can trace. Break up preempt_disable() into two
+                * calls. One to disable preemption without fear of being
+                * traced. The other to still record the preemption latency,
+                * which can also be traced by the function tracer.
+                */
                 preempt_disable_notrace();
+               preempt_latency_start(1);
                 __schedule(true);
+               preempt_latency_stop(1);
                 preempt_enable_no_resched_notrace();
  
                 /*
@@ -3340,7 +3448,21 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
                 return;
  
         do {
+               /*
+                * Because the function tracer can trace preempt_count_sub()
+                * and it also uses preempt_enable/disable_notrace(), if
+                * NEED_RESCHED is set, the preempt_enable_notrace() called
+                * by the function tracer will call this function again and
+                * cause infinite recursion.
+                *
+                * Preemption must be disabled here before the function
+                * tracer can trace. Break up preempt_disable() into two
+                * calls. One to disable preemption without fear of being
+                * traced. The other to still record the preemption latency,
+                * which can also be traced by the function tracer.
+                */
                 preempt_disable_notrace();
+               preempt_latency_start(1);
                 /*
                  * Needs preempt disabled in case user_exit() is traced
                  * and the tracer calls preempt_enable_notrace() causing
@@ -3350,6 +3472,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
                 __schedule(true);
                 exception_exit(prev_ctx);
  
+               preempt_latency_stop(1);
                 preempt_enable_no_resched_notrace();
         } while (need_resched());
  }
@@ -3406,12 +3529,13 @@ EXPORT_SYMBOL(default_wake_function);
  void rt_mutex_setprio(struct task_struct *p, int prio)
  {
         int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
-       struct rq *rq;
         const struct sched_class *prev_class;
+       struct rq_flags rf;
+       struct rq *rq;
  
         BUG_ON(prio > MAX_PRIO);
  
-       rq = __task_rq_lock(p);
+       rq = __task_rq_lock(p, &rf);
  
         /*
          * Idle task boosting is a nono in general. There is one
@@ -3487,7 +3611,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
         check_class_changed(rq, p, prev_class, oldprio);
  out_unlock:
         preempt_disable(); /* avoid rq from going away on us */
-       __task_rq_unlock(rq);
+       __task_rq_unlock(rq, &rf);
  
         balance_callback(rq);
         preempt_enable();
@@ -3497,7 +3621,7 @@ out_unlock:
  void set_user_nice(struct task_struct *p, long nice)
  {
         int old_prio, delta, queued;
-       unsigned long flags;
+       struct rq_flags rf;
         struct rq *rq;
  
         if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
@@ -3506,7 +3630,7 @@ void set_user_nice(struct task_struct *p, long nice)
          * We have to be careful, if called from sys_setpriority(),
          * the task might be in the middle of scheduling on another CPU.
          */
-       rq = task_rq_lock(p, &flags);
+       rq = task_rq_lock(p, &rf);
         /*
          * The RT priorities are set via sched_setscheduler(), but we still
          * allow the 'normal' nice value to be set - but as expected
@@ -3537,7 +3661,7 @@ void set_user_nice(struct task_struct *p, long nice)
                         resched_curr(rq);
         }
  out_unlock:
-       task_rq_unlock(rq, p, &flags);
+       task_rq_unlock(rq, p, &rf);
  }
  EXPORT_SYMBOL(set_user_nice);
  
@@ -3834,11 +3958,11 @@ static int __sched_setscheduler(struct task_struct *p,
                       MAX_RT_PRIO - 1 - attr->sched_priority;
         int retval, oldprio, oldpolicy = -1, queued, running;
         int new_effective_prio, policy = attr->sched_policy;
-       unsigned long flags;
         const struct sched_class *prev_class;
-       struct rq *rq;
+       struct rq_flags rf;
         int reset_on_fork;
         int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
+       struct rq *rq;
  
         /* may grab non-irq protected spin_locks */
         BUG_ON(in_interrupt());
@@ -3933,13 +4057,13 @@ recheck:
          * To be able to change p->policy safely, the appropriate
          * runqueue lock must be held.
          */
-       rq = task_rq_lock(p, &flags);
+       rq = task_rq_lock(p, &rf);
  
         /*
          * Changing the policy of the stop threads its a very bad idea
          */
         if (p == rq->stop) {
-               task_rq_unlock(rq, p, &flags);
+               task_rq_unlock(rq, p, &rf);
                 return -EINVAL;
         }
  
@@ -3956,7 +4080,7 @@ recheck:
                         goto change;
  
                 p->sched_reset_on_fork = reset_on_fork;
-               task_rq_unlock(rq, p, &flags);
+               task_rq_unlock(rq, p, &rf);
                 return 0;
         }
  change:
@@ -3970,7 +4094,7 @@ change:
                 if (rt_bandwidth_enabled() && rt_policy(policy) &&
                                 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
                                 !task_group_is_autogroup(task_group(p))) {
-                       task_rq_unlock(rq, p, &flags);
+                       task_rq_unlock(rq, p, &rf);
                         return -EPERM;
                 }
  #endif
@@ -3985,7 +4109,7 @@ change:
                          */
                         if (!cpumask_subset(span, &p->cpus_allowed) ||
                             rq->rd->dl_bw.bw == 0) {
-                               task_rq_unlock(rq, p, &flags);
+                               task_rq_unlock(rq, p, &rf);
                                 return -EPERM;
                         }
                 }
@@ -3995,7 +4119,7 @@ change:
         /* recheck policy now with rq lock held */
         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                 policy = oldpolicy = -1;
-               task_rq_unlock(rq, p, &flags);
+               task_rq_unlock(rq, p, &rf);
                 goto recheck;
         }
  
@@ -4005,7 +4129,7 @@ change:
          * is available.
          */
         if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
-               task_rq_unlock(rq, p, &flags);
+               task_rq_unlock(rq, p, &rf);
                 return -EBUSY;
         }
  
@@ -4050,7 +4174,7 @@ change:
  
         check_class_changed(rq, p, prev_class, oldprio);
         preempt_disable(); /* avoid rq from going away on us */
-       task_rq_unlock(rq, p, &flags);
+       task_rq_unlock(rq, p, &rf);
  
         if (pi)
                 rt_mutex_adjust_pi(p);
@@ -4903,10 +5027,10 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
  {
         struct task_struct *p;
         unsigned int time_slice;
-       unsigned long flags;
+       struct rq_flags rf;
+       struct timespec t;
         struct rq *rq;
         int retval;
-       struct timespec t;
  
         if (pid < 0)
                 return -EINVAL;
@@ -4921,11 +5045,11 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
         if (retval)
                 goto out_unlock;
  
-       rq = task_rq_lock(p, &flags);
+       rq = task_rq_lock(p, &rf);
         time_slice = 0;
         if (p->sched_class->get_rr_interval)
                 time_slice = p->sched_class->get_rr_interval(rq, p);
-       task_rq_unlock(rq, p, &flags);
+       task_rq_unlock(rq, p, &rf);
  
         rcu_read_unlock();
         jiffies_to_timespec(time_slice, &t);
@@ -5001,7 +5125,8 @@ void show_state_filter(unsigned long state_filter)
         touch_all_softlockup_watchdogs();
  
  #ifdef CONFIG_SCHED_DEBUG
-       sysrq_sched_debug_show();
+       if (!state_filter)
+               sysrq_sched_debug_show();
  #endif
         rcu_read_unlock();
         /*
@@ -5188,11 +5313,11 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
   */
  void sched_setnuma(struct task_struct *p, int nid)
  {
-       struct rq *rq;
-       unsigned long flags;
         bool queued, running;
+       struct rq_flags rf;
+       struct rq *rq;
  
-       rq = task_rq_lock(p, &flags);
+       rq = task_rq_lock(p, &rf);
         queued = task_on_rq_queued(p);
         running = task_current(rq, p);
  
@@ -5207,7 +5332,7 @@ void sched_setnuma(struct task_struct *p, int nid)
                 p->sched_class->set_curr_task(rq);
         if (queued)
                 enqueue_task(rq, p, ENQUEUE_RESTORE);
-       task_rq_unlock(rq, p, &flags);
+       task_rq_unlock(rq, p, &rf);
  }
  #endif /* CONFIG_NUMA_BALANCING */
  
@@ -5223,7 +5348,7 @@ void idle_task_exit(void)
         BUG_ON(cpu_online(smp_processor_id()));
  
         if (mm != &init_mm) {
-               switch_mm(mm, &init_mm, current);
+               switch_mm_irqs_off(mm, &init_mm, current);
                 finish_arch_post_lock_switch();
         }
         mmdrop(mm);
@@ -5271,6 +5396,7 @@ static void migrate_tasks(struct rq *dead_rq)
  {
         struct rq *rq = dead_rq;
         struct task_struct *next, *stop = rq->stop;
+       struct pin_cookie cookie;
         int dest_cpu;
  
         /*
@@ -5302,8 +5428,8 @@ static void migrate_tasks(struct rq *dead_rq)
                 /*
                  * pick_next_task assumes pinned rq->lock.
                  */
-               lockdep_pin_lock(&rq->lock);
-               next = pick_next_task(rq, &fake_task);
+               cookie = lockdep_pin_lock(&rq->lock);
+               next = pick_next_task(rq, &fake_task, cookie);
                 BUG_ON(!next);
                 next->sched_class->put_prev_task(rq, next);
  
@@ -5316,7 +5442,7 @@ static void migrate_tasks(struct rq *dead_rq)
                  * because !cpu_active at this point, which means load-balance
                  * will not interfere. Also, stop-machine.
                  */
-               lockdep_unpin_lock(&rq->lock);
+               lockdep_unpin_lock(&rq->lock, cookie);
                 raw_spin_unlock(&rq->lock);
                 raw_spin_lock(&next->pi_lock);
                 raw_spin_lock(&rq->lock);
@@ -7331,8 +7457,6 @@ void __init sched_init(void)
                 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
                         rq->cpu_load[j] = 0;
  
-               rq->last_load_update_tick = jiffies;
-
  #ifdef CONFIG_SMP
                 rq->sd = NULL;
                 rq->rd = NULL;
@@ -7351,12 +7475,13 @@ void __init sched_init(void)
  
                 rq_attach_root(rq, &def_root_domain);
  #ifdef CONFIG_NO_HZ_COMMON
+               rq->last_load_update_tick = jiffies;
                 rq->nohz_flags = 0;
  #endif
  #ifdef CONFIG_NO_HZ_FULL
                 rq->last_sched_tick = 0;
  #endif
-#endif
+#endif /* CONFIG_SMP */
                 init_rq_hrtick(rq);
                 atomic_set(&rq->nr_iowait, 0);
         }
@@ -7639,10 +7764,10 @@ void sched_move_task(struct task_struct *tsk)
  {
         struct task_group *tg;
         int queued, running;
-       unsigned long flags;
+       struct rq_flags rf;
         struct rq *rq;
  
-       rq = task_rq_lock(tsk, &flags);
+       rq = task_rq_lock(tsk, &rf);
  
         running = task_current(rq, tsk);
         queued = task_on_rq_queued(tsk);
@@ -7674,7 +7799,7 @@ void sched_move_task(struct task_struct *tsk)
         if (queued)
                 enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
  
-       task_rq_unlock(rq, tsk, &flags);
+       task_rq_unlock(rq, tsk, &rf);
  }
  #endif /* CONFIG_CGROUP_SCHED */
  
@@ -7894,7 +8019,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
  static int sched_rt_global_constraints(void)
  {
         unsigned long flags;
-       int i, ret = 0;
+       int i;
  
         raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
         for_each_possible_cpu(i) {
@@ -7906,7 +8031,7 @@ static int sched_rt_global_constraints(void)
         }
         raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
  
-       return ret;
+       return 0;
  }
  #endif /* CONFIG_RT_GROUP_SCHED */
  
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c

index 4a811203c04a462478c85ab31d20ec2492e0f4b9..41f85c4d09387a8bd03299ef00f6e79482f68b40 100644 (file)
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -25,11 +25,22 @@ enum cpuacct_stat_index {
         CPUACCT_STAT_NSTATS,
  };
  
+enum cpuacct_usage_index {
+       CPUACCT_USAGE_USER,     /* ... user mode */
+       CPUACCT_USAGE_SYSTEM,   /* ... kernel mode */
+
+       CPUACCT_USAGE_NRUSAGE,  /* slot count; also "sum of all" on read */
+};
+
+struct cpuacct_usage {
+       u64     usages[CPUACCT_USAGE_NRUSAGE];  /* by cpuacct_usage_index */
+};
+
  /* track cpu usage of a group of tasks and its child groups */
  struct cpuacct {
         struct cgroup_subsys_state css;
         /* cpuusage holds pointer to a u64-type object on every cpu */
-       u64 __percpu *cpuusage;
+       struct cpuacct_usage __percpu *cpuusage;
         struct kernel_cpustat __percpu *cpustat;
  };
  
@@ -49,7 +60,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca)
         return css_ca(ca->css.parent);
  }
  
-static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
+static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage);
  static struct cpuacct root_cpuacct = {
         .cpustat        = &kernel_cpustat,
         .cpuusage       = &root_cpuacct_cpuusage,
@@ -68,7 +79,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
         if (!ca)
                 goto out;
  
-       ca->cpuusage = alloc_percpu(u64);
+       ca->cpuusage = alloc_percpu(struct cpuacct_usage);
         if (!ca->cpuusage)
                 goto out_free_ca;
  
@@ -96,20 +107,37 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
         kfree(ca);
  }
  
-static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
+                                enum cpuacct_usage_index index)
  {
-       u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+       struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
         u64 data;
  
+       /*
+        * We allow index == CPUACCT_USAGE_NRUSAGE here to read
+        * the sum of usages.
+        */
+       BUG_ON(index > CPUACCT_USAGE_NRUSAGE);
+
  #ifndef CONFIG_64BIT
         /*
          * Take rq->lock to make 64-bit read safe on 32-bit platforms.
          */
         raw_spin_lock_irq(&cpu_rq(cpu)->lock);
-       data = *cpuusage;
+#endif
+
+       if (index == CPUACCT_USAGE_NRUSAGE) {
+               int i = 0;
+
+               data = 0;
+               for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++)
+                       data += cpuusage->usages[i];
+       } else {
+               data = cpuusage->usages[index];
+       }
+
+#ifndef CONFIG_64BIT
         raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
-       data = *cpuusage;
  #endif
  
         return data;
@@ -117,69 +145,103 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
  
  static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
  {
-       u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+       struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+       int i;
  
  #ifndef CONFIG_64BIT
         /*
          * Take rq->lock to make 64-bit write safe on 32-bit platforms.
          */
         raw_spin_lock_irq(&cpu_rq(cpu)->lock);
-       *cpuusage = val;
+#endif
+
+       for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++)
+               cpuusage->usages[i] = val;
+
+#ifndef CONFIG_64BIT
         raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
-       *cpuusage = val;
  #endif
  }
  
  /* return total cpu usage (in nanoseconds) of a group */
-static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
+static u64 __cpuusage_read(struct cgroup_subsys_state *css,
+                          enum cpuacct_usage_index index)
  {
         struct cpuacct *ca = css_ca(css);
         u64 totalcpuusage = 0;
         int i;
  
-       for_each_present_cpu(i)
-               totalcpuusage += cpuacct_cpuusage_read(ca, i);
+       for_each_possible_cpu(i)
+               totalcpuusage += cpuacct_cpuusage_read(ca, i, index);
  
         return totalcpuusage;
  }
  
+static u64 cpuusage_user_read(struct cgroup_subsys_state *css,
+                             struct cftype *cft)
+{
+       return __cpuusage_read(css, CPUACCT_USAGE_USER);
+}
+
+static u64 cpuusage_sys_read(struct cgroup_subsys_state *css,
+                            struct cftype *cft)
+{
+       return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM);
+}
+
+static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+       return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE);
+}
+
  static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
                           u64 val)
  {
         struct cpuacct *ca = css_ca(css);
-       int err = 0;
-       int i;
+       int cpu;
  
         /*
          * Only allow '0' here to do a reset.
          */
-       if (val) {
-               err = -EINVAL;
-               goto out;
-       }
+       if (val)
+               return -EINVAL;
  
-       for_each_present_cpu(i)
-               cpuacct_cpuusage_write(ca, i, 0);
+       for_each_possible_cpu(cpu)
+               cpuacct_cpuusage_write(ca, cpu, 0);
  
-out:
-       return err;
+       return 0;
  }
  
-static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
+static int __cpuacct_percpu_seq_show(struct seq_file *m,
+                                    enum cpuacct_usage_index index)
  {
         struct cpuacct *ca = css_ca(seq_css(m));
         u64 percpu;
         int i;
  
-       for_each_present_cpu(i) {
-               percpu = cpuacct_cpuusage_read(ca, i);
+       for_each_possible_cpu(i) {
+               percpu = cpuacct_cpuusage_read(ca, i, index);
                 seq_printf(m, "%llu ", (unsigned long long) percpu);
         }
         seq_printf(m, "\n");
         return 0;
  }
  
+static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V)
+{
+       return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER);
+}
+
+static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V)
+{
+       return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM);
+}
+
+static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
+{
+       return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE);
+}
+
  static const char * const cpuacct_stat_desc[] = {
         [CPUACCT_STAT_USER] = "user",
         [CPUACCT_STAT_SYSTEM] = "system",
@@ -191,7 +253,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v)
         int cpu;
         s64 val = 0;
  
-       for_each_online_cpu(cpu) {
+       for_each_possible_cpu(cpu) {
                 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
                 val += kcpustat->cpustat[CPUTIME_USER];
                 val += kcpustat->cpustat[CPUTIME_NICE];
@@ -200,7 +262,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v)
         seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
  
         val = 0;
-       for_each_online_cpu(cpu) {
+       for_each_possible_cpu(cpu) {
                 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
                 val += kcpustat->cpustat[CPUTIME_SYSTEM];
                 val += kcpustat->cpustat[CPUTIME_IRQ];
@@ -219,10 +281,26 @@ static struct cftype files[] = {
                 .read_u64 = cpuusage_read,
                 .write_u64 = cpuusage_write,
         },
+       {
+               .name = "usage_user",
+               .read_u64 = cpuusage_user_read,
+       },
+       {
+               .name = "usage_sys",
+               .read_u64 = cpuusage_sys_read,
+       },
         {
                 .name = "usage_percpu",
                 .seq_show = cpuacct_percpu_seq_show,
         },
+       {
+               .name = "usage_percpu_user",
+               .seq_show = cpuacct_percpu_user_seq_show,
+       },
+       {
+               .name = "usage_percpu_sys",
+               .seq_show = cpuacct_percpu_sys_seq_show,
+       },
         {
                 .name = "stat",
                 .seq_show = cpuacct_stats_show,
@@ -238,10 +316,17 @@ static struct cftype files[] = {
  void cpuacct_charge(struct task_struct *tsk, u64 cputime)
  {
         struct cpuacct *ca;
+       int index = CPUACCT_USAGE_SYSTEM;
+       struct pt_regs *regs = task_pt_regs(tsk);
+
+       if (regs && user_mode(regs))
+               index = CPUACCT_USAGE_USER;
  
         rcu_read_lock();
+
         for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
-               *this_cpu_ptr(ca->cpuusage) += cputime;
+               this_cpu_ptr(ca->cpuusage)->usages[index] += cputime;
+
         rcu_read_unlock();
  }
  
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c

index 686ec8adf952fbd3767c652515089d92b3827ae0..0ac6c84f33719813a31c9c906b564fef15307d93 100644 (file)
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -591,10 +591,10 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
                                                      struct sched_dl_entity,
                                                      dl_timer);
         struct task_struct *p = dl_task_of(dl_se);
-       unsigned long flags;
+       struct rq_flags rf;
         struct rq *rq;
  
-       rq = task_rq_lock(p, &flags);
+       rq = task_rq_lock(p, &rf);
  
         /*
          * The task might have changed its scheduling policy to something
@@ -670,14 +670,14 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
                  * Nothing relies on rq->lock after this, so its safe to drop
                  * rq->lock.
                  */
-               lockdep_unpin_lock(&rq->lock);
+               lockdep_unpin_lock(&rq->lock, rf.cookie);
                 push_dl_task(rq);
-               lockdep_pin_lock(&rq->lock);
+               lockdep_repin_lock(&rq->lock, rf.cookie);
         }
  #endif
  
  unlock:
-       task_rq_unlock(rq, p, &flags);
+       task_rq_unlock(rq, p, &rf);
  
         /*
          * This can free the task_struct, including this hrtimer, do not touch
@@ -717,10 +717,6 @@ static void update_curr_dl(struct rq *rq)
         if (!dl_task(curr) || !on_dl_rq(dl_se))
                 return;
  
-       /* Kick cpufreq (see the comment in linux/cpufreq.h). */
-       if (cpu_of(rq) == smp_processor_id())
-               cpufreq_trigger_update(rq_clock(rq));
-
         /*
          * Consumed budget is computed considering the time as
          * observed by schedulable tasks (excluding time spent
@@ -736,6 +732,10 @@ static void update_curr_dl(struct rq *rq)
                 return;
         }
  
+       /* kick cpufreq (see the comment in linux/cpufreq.h). */
+       if (cpu_of(rq) == smp_processor_id())
+               cpufreq_trigger_update(rq_clock(rq));
+
         schedstat_set(curr->se.statistics.exec_max,
                       max(curr->se.statistics.exec_max, delta_exec));
  
@@ -1125,7 +1125,8 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
         return rb_entry(left, struct sched_dl_entity, rb_node);
  }
  
-struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
+struct task_struct *
+pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
  {
         struct sched_dl_entity *dl_se;
         struct task_struct *p;
@@ -1140,9 +1141,9 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
                  * disabled avoiding further scheduler activity on it and we're
                  * being very careful to re-start the picking loop.
                  */
-               lockdep_unpin_lock(&rq->lock);
+               lockdep_unpin_lock(&rq->lock, cookie);
                 pull_dl_task(rq);
-               lockdep_pin_lock(&rq->lock);
+               lockdep_repin_lock(&rq->lock, cookie);
                 /*
                  * pull_rt_task() can drop (and re-acquire) rq->lock; this
                  * means a stop task can slip in, in which case we need to
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c

index 4fbc3bd5ff6067dfe184295fc262987c912b669e..cf905f655ba120fd91d2dd023c6a5a93ad699d4b 100644 (file)
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -626,15 +626,16 @@ do {                                                                      \
  #undef P
  #undef PN
  
-#ifdef CONFIG_SCHEDSTATS
-#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
-#define P64(n) SEQ_printf(m, "  .%-30s: %Ld\n", #n, rq->n);
-
  #ifdef CONFIG_SMP
+#define P64(n) SEQ_printf(m, "  .%-30s: %Ld\n", #n, rq->n);
         P64(avg_idle);
         P64(max_idle_balance_cost);
+#undef P64
  #endif
  
+#ifdef CONFIG_SCHEDSTATS
+#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
+
         if (schedstat_enabled()) {
                 P(yld_count);
                 P(sched_count);
@@ -644,7 +645,6 @@ do {                                                                        \
         }
  
  #undef P
-#undef P64
  #endif
         spin_lock_irqsave(&sched_debug_lock, flags);
         print_cfs_stats(m, cpu);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index e7dd0ec169bea82c630e2b8d897d2aee0cc9571b..39fde3660f9793be55accc4d95596def00da0b7d 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -204,7 +204,7 @@ static void __update_inv_weight(struct load_weight *lw)
   *   OR
   * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
   *
- * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
+ * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
   * we're guaranteed shift stays positive because inv_weight is guaranteed to
   * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
   *
@@ -682,17 +682,68 @@ void init_entity_runnable_average(struct sched_entity *se)
         sa->period_contrib = 1023;
         sa->load_avg = scale_load_down(se->load.weight);
         sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
-       sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
-       sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+       /*
+        * At this point, util_avg won't be used in select_task_rq_fair anyway
+        */
+       sa->util_avg = 0;
+       sa->util_sum = 0;
         /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
  }
  
+/*
+ * With new tasks being created, their initial util_avgs are extrapolated
+ * based on the cfs_rq's current util_avg:
+ *
+ *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
+ *
+ * However, in many cases, the above util_avg does not give a desired
+ * value. Moreover, the sum of the util_avgs may be divergent, such
+ * as when the series is a harmonic series.
+ *
+ * To solve this problem, we also cap the util_avg of successive tasks to
+ * only 1/2 of the left utilization budget:
+ *
+ *   util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
+ *
+ * where n denotes the nth task.
+ *
+ * For example, a simplest series from the beginning would be like:
+ *
+ *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
+ * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
+ *
+ * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
+ * if util_avg > util_avg_cap.
+ */
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+       struct sched_avg *sa = &se->avg;
+       long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+
+       if (cap > 0) {
+               if (cfs_rq->avg.util_avg != 0) {
+                       sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
+                       sa->util_avg /= (cfs_rq->avg.load_avg + 1);
+
+                       if (sa->util_avg > cap)
+                               sa->util_avg = cap;
+               } else {
+                       sa->util_avg = cap;
+               }
+               sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+       }
+}
+
  static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
  static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
  #else
  void init_entity_runnable_average(struct sched_entity *se)
  {
  }
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+}
  #endif
  
  /*
@@ -2437,10 +2488,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
         update_load_sub(&cfs_rq->load, se->load.weight);
         if (!parent_entity(se))
                 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
+#ifdef CONFIG_SMP
         if (entity_is_task(se)) {
                 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
                 list_del_init(&se->group_node);
         }
+#endif
         cfs_rq->nr_running--;
  }
  
@@ -2549,6 +2602,16 @@ static const u32 runnable_avg_yN_sum[] = {
         17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
  };
  
+/*
+ * Precomputed \Sum y^k { 1<=k<=n, where n%32=0 }. Values are rolled down to
+ * lower integers. See Documentation/scheduler/sched-avg.txt how these
+ * were generated:
+ */
+static const u32 __accumulated_sum_N32[] = {
+           0, 23371, 35056, 40899, 43820, 45281,
+       46011, 46376, 46559, 46650, 46696, 46719,
+};
+
  /*
   * Approximate:
   *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
@@ -2597,22 +2660,13 @@ static u32 __compute_runnable_contrib(u64 n)
         else if (unlikely(n >= LOAD_AVG_MAX_N))
                 return LOAD_AVG_MAX;
  
-       /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
-       do {
-               contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
-               contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
-
-               n -= LOAD_AVG_PERIOD;
-       } while (n > LOAD_AVG_PERIOD);
-
+       /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
+       contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
+       n %= LOAD_AVG_PERIOD;
         contrib = decay_load(contrib, n);
         return contrib + runnable_avg_yN_sum[n];
  }
  
-#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
-#error "load tracking assumes 2^10 as unit"
-#endif
-
  #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
  
  /*
@@ -2821,23 +2875,54 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
  
  static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
  
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+       struct rq *rq = rq_of(cfs_rq);
+       int cpu = cpu_of(rq);
+
+       if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
+               unsigned long max = rq->cpu_capacity_orig;
+
+               /*
+                * There are a few boundary cases this might miss but it should
+                * get called often enough that that should (hopefully) not be
+                * a real problem -- added to that it only calls on the local
+                * CPU, so if we enqueue remotely we'll miss an update, but
+                * the next tick/schedule should update.
+                *
+                * It will not get called when we go idle, because the idle
+                * thread is a different class (!fair), nor will the utilization
+                * number include things like RT tasks.
+                *
+                * As is, the util number is not freq-invariant (we'd have to
+                * implement arch_scale_freq_capacity() for that).
+                *
+                * See cpu_util().
+                */
+               cpufreq_update_util(rq_clock(rq),
+                                   min(cfs_rq->avg.util_avg, max), max);
+       }
+}
+
  /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
-static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
  {
         struct sched_avg *sa = &cfs_rq->avg;
-       int decayed, removed = 0;
+       int decayed, removed_load = 0, removed_util = 0;
  
         if (atomic_long_read(&cfs_rq->removed_load_avg)) {
                 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
                 sa->load_avg = max_t(long, sa->load_avg - r, 0);
                 sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
-               removed = 1;
+               removed_load = 1;
         }
  
         if (atomic_long_read(&cfs_rq->removed_util_avg)) {
                 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
                 sa->util_avg = max_t(long, sa->util_avg - r, 0);
                 sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+               removed_util = 1;
         }
  
         decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -2848,7 +2933,10 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
         cfs_rq->load_last_update_time_copy = sa->last_update_time;
  #endif
  
-       return decayed || removed;
+       if (update_freq && (decayed || removed_util))
+               cfs_rq_util_change(cfs_rq);
+
+       return decayed || removed_load;
  }
  
  /* Update task and its cfs_rq load average */
@@ -2867,31 +2955,8 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
                           se->on_rq * scale_load_down(se->load.weight),
                           cfs_rq->curr == se, NULL);
  
-       if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
+       if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
                 update_tg_load_avg(cfs_rq, 0);
-
-       if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
-               unsigned long max = rq->cpu_capacity_orig;
-
-               /*
-                * There are a few boundary cases this might miss but it should
-                * get called often enough that that should (hopefully) not be
-                * a real problem -- added to that it only calls on the local
-                * CPU, so if we enqueue remotely we'll miss an update, but
-                * the next tick/schedule should update.
-                *
-                * It will not get called when we go idle, because the idle
-                * thread is a different class (!fair), nor will the utilization
-                * number include things like RT tasks.
-                *
-                * As is, the util number is not freq-invariant (we'd have to
-                * implement arch_scale_freq_capacity() for that).
-                *
-                * See cpu_util().
-                */
-               cpufreq_update_util(rq_clock(rq),
-                                   min(cfs_rq->avg.util_avg, max), max);
-       }
  }
  
  static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -2919,6 +2984,8 @@ skip_aging:
         cfs_rq->avg.load_sum += se->avg.load_sum;
         cfs_rq->avg.util_avg += se->avg.util_avg;
         cfs_rq->avg.util_sum += se->avg.util_sum;
+
+       cfs_rq_util_change(cfs_rq);
  }
  
  static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -2931,6 +2998,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
         cfs_rq->avg.load_sum = max_t(s64,  cfs_rq->avg.load_sum - se->avg.load_sum, 0);
         cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
         cfs_rq->avg.util_sum = max_t(s32,  cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+
+       cfs_rq_util_change(cfs_rq);
  }
  
  /* Add the load generated by se into cfs_rq's load average */
@@ -2948,7 +3017,7 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
                         cfs_rq->curr == se, NULL);
         }
  
-       decayed = update_cfs_rq_load_avg(now, cfs_rq);
+       decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);
  
         cfs_rq->runnable_load_avg += sa->load_avg;
         cfs_rq->runnable_load_sum += sa->load_sum;
@@ -4422,7 +4491,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
  }
  
  #ifdef CONFIG_SMP
-
+#ifdef CONFIG_NO_HZ_COMMON
  /*
   * per rq 'load' arrray crap; XXX kill this.
   */
@@ -4488,13 +4557,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
         }
         return load;
  }
+#endif /* CONFIG_NO_HZ_COMMON */
  
  /**
- * __update_cpu_load - update the rq->cpu_load[] statistics
+ * __cpu_load_update - update the rq->cpu_load[] statistics
   * @this_rq: The rq to update statistics for
   * @this_load: The current load
   * @pending_updates: The number of missed updates
- * @active: !0 for NOHZ_FULL
   *
   * Update rq->cpu_load[] statistics. This function is usually called every
   * scheduler tick (TICK_NSEC).
@@ -4523,12 +4592,12 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
   *   load[i]_n = (1 - 1/2^i)^n * load[i]_0
   *
   * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
- * term. See the @active paramter.
+ * term.
   */
-static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
-                             unsigned long pending_updates, int active)
+static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
+                           unsigned long pending_updates)
  {
-       unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0;
+       unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
         int i, scale;
  
         this_rq->nr_load_updates++;
@@ -4541,6 +4610,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
                 /* scale is effectively 1 << i now, and >> i divides by scale */
  
                 old_load = this_rq->cpu_load[i];
+#ifdef CONFIG_NO_HZ_COMMON
                 old_load = decay_load_missed(old_load, pending_updates - 1, i);
                 if (tickless_load) {
                         old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
@@ -4551,6 +4621,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
                          */
                         old_load += tickless_load;
                 }
+#endif
                 new_load = this_load;
                 /*
                  * Round up the averaging division if load is increasing. This
@@ -4573,10 +4644,23 @@ static unsigned long weighted_cpuload(const int cpu)
  }
  
  #ifdef CONFIG_NO_HZ_COMMON
-static void __update_cpu_load_nohz(struct rq *this_rq,
-                                  unsigned long curr_jiffies,
-                                  unsigned long load,
-                                  int active)
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we need to avoid the delta approach from the regular tick when
+ * possible since that would seriously skew the load calculation. This is why we
+ * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
+ * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
+ * loop exit, nohz_idle_balance, nohz full exit...)
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+static void cpu_load_update_nohz(struct rq *this_rq,
+                                unsigned long curr_jiffies,
+                                unsigned long load)
  {
         unsigned long pending_updates;
  
@@ -4588,28 +4672,15 @@ static void __update_cpu_load_nohz(struct rq *this_rq,
                  * In the NOHZ_FULL case, we were non-idle, we should consider
                  * its weighted load.
                  */
-               __update_cpu_load(this_rq, load, pending_updates, active);
+               cpu_load_update(this_rq, load, pending_updates);
         }
  }
  
-/*
- * There is no sane way to deal with nohz on smp when using jiffies because the
- * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
- *
- * Therefore we cannot use the delta approach from the regular tick since that
- * would seriously skew the load calculation. However we'll make do for those
- * updates happening while idle (nohz_idle_balance) or coming out of idle
- * (tick_nohz_idle_exit).
- *
- * This means we might still be one tick off for nohz periods.
- */
-
  /*
   * Called from nohz_idle_balance() to update the load ratings before doing the
   * idle balance.
   */
-static void update_cpu_load_idle(struct rq *this_rq)
+static void cpu_load_update_idle(struct rq *this_rq)
  {
         /*
          * bail if there's load or we're actually up-to-date.
@@ -4617,38 +4688,71 @@ static void update_cpu_load_idle(struct rq *this_rq)
         if (weighted_cpuload(cpu_of(this_rq)))
                 return;
  
-       __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0);
+       cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
  }
  
  /*
- * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ * Record CPU load on nohz entry so we know the tickless load to account
+ * on nohz exit. cpu_load[0] happens then to be updated more frequently
+ * than other cpu_load[idx] but it should be fine as cpu_load readers
+ * shouldn't rely on synchronized cpu_load[*] updates.
   */
-void update_cpu_load_nohz(int active)
+void cpu_load_update_nohz_start(void)
  {
         struct rq *this_rq = this_rq();
+
+       /*
+        * This is all lockless but should be fine. If weighted_cpuload changes
+        * concurrently we'll exit nohz. And cpu_load write can race with
+        * cpu_load_update_idle() but both updaters would be writing the same value.
+        */
+       this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
+}
+
+/*
+ * Account the tickless load at the end of a nohz frame.
+ */
+void cpu_load_update_nohz_stop(void)
+{
         unsigned long curr_jiffies = READ_ONCE(jiffies);
-       unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
+       struct rq *this_rq = this_rq();
+       unsigned long load;
  
         if (curr_jiffies == this_rq->last_load_update_tick)
                 return;
  
+       load = weighted_cpuload(cpu_of(this_rq));
         raw_spin_lock(&this_rq->lock);
-       __update_cpu_load_nohz(this_rq, curr_jiffies, load, active);
+       update_rq_clock(this_rq);
+       cpu_load_update_nohz(this_rq, curr_jiffies, load);
         raw_spin_unlock(&this_rq->lock);
  }
-#endif /* CONFIG_NO_HZ */
+#else /* !CONFIG_NO_HZ_COMMON */
+static inline void cpu_load_update_nohz(struct rq *this_rq,
+                                       unsigned long curr_jiffies,
+                                       unsigned long load) { }
+#endif /* CONFIG_NO_HZ_COMMON */
+
+static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
+{
+#ifdef CONFIG_NO_HZ_COMMON
+       /* See the mess around cpu_load_update_nohz(). */
+       this_rq->last_load_update_tick = READ_ONCE(jiffies);
+#endif
+       cpu_load_update(this_rq, load, 1);
+}
  
  /*
   * Called from scheduler_tick()
   */
-void update_cpu_load_active(struct rq *this_rq)
+void cpu_load_update_active(struct rq *this_rq)
  {
         unsigned long load = weighted_cpuload(cpu_of(this_rq));
-       /*
-        * See the mess around update_cpu_load_idle() / update_cpu_load_nohz().
-        */
-       this_rq->last_load_update_tick = jiffies;
-       __update_cpu_load(this_rq, load, 1, 1);
+
+       if (tick_nohz_tick_stopped())
+               cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
+       else
+               cpu_load_update_periodic(this_rq, load);
  }
  
  /*
@@ -5439,7 +5543,7 @@ preempt:
  }
  
  static struct task_struct *
-pick_next_task_fair(struct rq *rq, struct task_struct *prev)
+pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
  {
         struct cfs_rq *cfs_rq = &rq->cfs;
         struct sched_entity *se;
@@ -5552,9 +5656,9 @@ idle:
          * further scheduler activity on it and we're being very careful to
          * re-start the picking loop.
          */
-       lockdep_unpin_lock(&rq->lock);
+       lockdep_unpin_lock(&rq->lock, cookie);
         new_tasks = idle_balance(rq);
-       lockdep_pin_lock(&rq->lock);
+       lockdep_repin_lock(&rq->lock, cookie);
         /*
          * Because idle_balance() releases (and re-acquires) rq->lock, it is
          * possible for any higher priority task to appear. In that case we
@@ -5653,7 +5757,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
   *   W_i,0 = \Sum_j w_i,j                                             (2)
   *
   * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
- * is derived from the nice value as per prio_to_weight[].
+ * is derived from the nice value as per sched_prio_to_weight[].
   *
   * The weight average is an exponential decay average of the instantaneous
   * weight:
@@ -6155,7 +6259,7 @@ static void update_blocked_averages(int cpu)
                 if (throttled_hierarchy(cfs_rq))
                         continue;
  
-               if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+               if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
                         update_tg_load_avg(cfs_rq, 0);
         }
         raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -6216,7 +6320,7 @@ static inline void update_blocked_averages(int cpu)
  
         raw_spin_lock_irqsave(&rq->lock, flags);
         update_rq_clock(rq);
-       update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+       update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
         raw_spin_unlock_irqrestore(&rq->lock, flags);
  }
  
@@ -6625,6 +6729,9 @@ static bool update_sd_pick_busiest(struct lb_env *env,
         if (!(env->sd->flags & SD_ASYM_PACKING))
                 return true;
  
+       /* No ASYM_PACKING if target cpu is already busy */
+       if (env->idle == CPU_NOT_IDLE)
+               return true;
         /*
          * ASYM_PACKING needs to move all the work to the lowest
          * numbered CPUs in the group, therefore mark all groups
@@ -6634,7 +6741,8 @@ static bool update_sd_pick_busiest(struct lb_env *env,
                 if (!sds->busiest)
                         return true;
  
-               if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
+               /* Prefer to move work from the highest-numbered cpu */
+               if (group_first_cpu(sds->busiest) < group_first_cpu(sg))
                         return true;
         }
  
@@ -6780,6 +6888,9 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
         if (!(env->sd->flags & SD_ASYM_PACKING))
                 return 0;
  
+       if (env->idle == CPU_NOT_IDLE)
+               return 0;
+
         if (!sds->busiest)
                 return 0;
  
@@ -6888,9 +6999,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
         }
  
         /*
-        * In the presence of smp nice balancing, certain scenarios can have
-        * max load less than avg load(as we skip the groups at or below
-        * its cpu_capacity, while calculating max_load..)
+        * Avg load of busiest sg can be less and avg load of local sg can
+        * be greater than avg load across all sgs of sd because avg load
+        * factors in sg capacity and sgs with smaller group_type are
+        * skipped when updating the busiest sg:
          */
         if (busiest->avg_load <= sds->avg_load ||
             local->avg_load >= sds->avg_load) {
@@ -6904,7 +7016,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
         if (busiest->group_type == group_overloaded &&
             local->group_type   == group_overloaded) {
                 load_above_capacity = busiest->sum_nr_running *
-                                       SCHED_LOAD_SCALE;
+                                     scale_load_down(NICE_0_LOAD);
                 if (load_above_capacity > busiest->group_capacity)
                         load_above_capacity -= busiest->group_capacity;
                 else
@@ -6915,9 +7027,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
          * We're trying to get all the cpus to the average_load, so we don't
          * want to push ourselves above the average load, nor do we wish to
          * reduce the max loaded cpu below the average load. At the same time,
-        * we also don't want to reduce the group load below the group capacity
-        * (so that we can implement power-savings policies etc). Thus we look
-        * for the minimum possible imbalance.
+        * we also don't want to reduce the group load below the group
+        * capacity. Thus we look for the minimum possible imbalance.
          */
         max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
  
@@ -6941,10 +7052,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
  
  /**
   * find_busiest_group - Returns the busiest group within the sched_domain
- * if there is an imbalance. If there isn't an imbalance, and
- * the user has opted for power-savings, it returns a group whose
- * CPUs can be put to idle by rebalancing those tasks elsewhere, if
- * such a group exists.
+ * if there is an imbalance.
   *
   * Also calculates the amount of weighted load which should be moved
   * to restore balance.
@@ -6952,9 +7060,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
   * @env: The load balancing environment.
   *
   * Return:     - The busiest group if imbalance exists.
- *             - If no imbalance and user has opted for power-savings balance,
- *                return the least loaded group whose CPUs can be
- *                put to idle by rebalancing its tasks onto our group.
   */
  static struct sched_group *find_busiest_group(struct lb_env *env)
  {
@@ -6972,8 +7077,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
         busiest = &sds.busiest_stat;
  
         /* ASYM feature bypasses nice load balance check */
-       if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
-           check_asym_packing(env, &sds))
+       if (check_asym_packing(env, &sds))
                 return sds.busiest;
  
         /* There is no busy sibling group to pull tasks from */
@@ -7398,10 +7502,7 @@ more_balance:
                                         &busiest->active_balance_work);
                         }
  
-                       /*
-                        * We've kicked active balancing, reset the failure
-                        * counter.
-                        */
+                       /* We've kicked active balancing, force task migration. */
                         sd->nr_balance_failed = sd->cache_nice_tries+1;
                 }
         } else
@@ -7636,10 +7737,13 @@ static int active_load_balance_cpu_stop(void *data)
                 schedstat_inc(sd, alb_count);
  
                 p = detach_one_task(&env);
-               if (p)
+               if (p) {
                         schedstat_inc(sd, alb_pushed);
-               else
+                       /* Active balancing done, reset the failure counter. */
+                       sd->nr_balance_failed = 0;
+               } else {
                         schedstat_inc(sd, alb_failed);
+               }
         }
         rcu_read_unlock();
  out_unlock:
@@ -7956,7 +8060,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
                 if (time_after_eq(jiffies, rq->next_balance)) {
                         raw_spin_lock_irq(&rq->lock);
                         update_rq_clock(rq);
-                       update_cpu_load_idle(rq);
+                       cpu_load_update_idle(rq);
                         raw_spin_unlock_irq(&rq->lock);
                         rebalance_domains(rq, CPU_IDLE);
                 }
@@ -8381,6 +8485,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
                 init_cfs_rq(cfs_rq);
                 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
                 init_entity_runnable_average(se);
+               post_init_entity_util_avg(se);
         }
  
         return 1;
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c

index 47ce94931f1b612a589151e3e64210450a006174..2ce5458bbe1d1ad82c16f7cbe38d79c23f91ebb0 100644 (file)
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -24,7 +24,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
  }
  
  static struct task_struct *
-pick_next_task_idle(struct rq *rq, struct task_struct *prev)
+pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
  {
         put_prev_task(rq, prev);
  
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c

index ec4f538d4396beb20e2656923995438d5093f667..67afa06cc8bc210460b47df17119fe7256f3a624 100644 (file)
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -953,14 +953,14 @@ static void update_curr_rt(struct rq *rq)
         if (curr->sched_class != &rt_sched_class)
                 return;
  
-       /* Kick cpufreq (see the comment in linux/cpufreq.h). */
-       if (cpu_of(rq) == smp_processor_id())
-               cpufreq_trigger_update(rq_clock(rq));
-
         delta_exec = rq_clock_task(rq) - curr->se.exec_start;
         if (unlikely((s64)delta_exec <= 0))
                 return;
  
+       /* Kick cpufreq (see the comment in linux/cpufreq.h). */
+       if (cpu_of(rq) == smp_processor_id())
+               cpufreq_trigger_update(rq_clock(rq));
+
         schedstat_set(curr->se.statistics.exec_max,
                       max(curr->se.statistics.exec_max, delta_exec));
  
@@ -1524,7 +1524,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
  }
  
  static struct task_struct *
-pick_next_task_rt(struct rq *rq, struct task_struct *prev)
+pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
  {
         struct task_struct *p;
         struct rt_rq *rt_rq = &rq->rt;
@@ -1536,9 +1536,9 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
                  * disabled avoiding further scheduler activity on it and we're
                  * being very careful to re-start the picking loop.
                  */
-               lockdep_unpin_lock(&rq->lock);
+               lockdep_unpin_lock(&rq->lock, cookie);
                 pull_rt_task(rq);
-               lockdep_pin_lock(&rq->lock);
+               lockdep_repin_lock(&rq->lock, cookie);
                 /*
                  * pull_rt_task() can drop (and re-acquire) rq->lock; this
                  * means a dl or stop task can slip in, in which case we need
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index ec2e8d23527e6c92a4fe1b5ef45dfb9ac1e242a8..d24e91b0a722cc21fcc501383bbec8a4844614ef 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -31,9 +31,9 @@ extern void calc_global_load_tick(struct rq *this_rq);
  extern long calc_load_fold_active(struct rq *this_rq);
  
  #ifdef CONFIG_SMP
-extern void update_cpu_load_active(struct rq *this_rq);
+extern void cpu_load_update_active(struct rq *this_rq);
  #else
-static inline void update_cpu_load_active(struct rq *this_rq) { }
+static inline void cpu_load_update_active(struct rq *this_rq) { }
  #endif
  
  /*
@@ -49,25 +49,32 @@ static inline void update_cpu_load_active(struct rq *this_rq) { }
   * and does not change the user-interface for setting shares/weights.
   *
   * We increase resolution only if we have enough bits to allow this increased
- * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
- * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
- * increased costs.
+ * resolution (i.e. 64bit). The costs for increasing resolution on 32bit are
+ * pretty high and the returns do not justify the increased costs.
+ *
+ * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to
+ * increase coverage and consistency always enable it on 64bit platforms.
   */
-#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load  */
-# define SCHED_LOAD_RESOLUTION 10
-# define scale_load(w)         ((w) << SCHED_LOAD_RESOLUTION)
-# define scale_load_down(w)    ((w) >> SCHED_LOAD_RESOLUTION)
+#ifdef CONFIG_64BIT
+# define NICE_0_LOAD_SHIFT     (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
+# define scale_load(w)         ((w) << SCHED_FIXEDPOINT_SHIFT)
+# define scale_load_down(w)    ((w) >> SCHED_FIXEDPOINT_SHIFT)
  #else
-# define SCHED_LOAD_RESOLUTION 0
+# define NICE_0_LOAD_SHIFT     (SCHED_FIXEDPOINT_SHIFT)
  # define scale_load(w)         (w)
  # define scale_load_down(w)    (w)
  #endif
  
-#define SCHED_LOAD_SHIFT       (10 + SCHED_LOAD_RESOLUTION)
-#define SCHED_LOAD_SCALE       (1L << SCHED_LOAD_SHIFT)
-
-#define NICE_0_LOAD            SCHED_LOAD_SCALE
-#define NICE_0_SHIFT           SCHED_LOAD_SHIFT
+/*
+ * Task weight (visible to users) and its load (invisible to users) have
+ * independent resolution, but they should be well calibrated. We use
+ * scale_load(w) and scale_load_down(w) to convert between them. The
+ * following must be true:
+ *
+ *  scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD
+ *
+ */
+#define NICE_0_LOAD            (1L << NICE_0_LOAD_SHIFT)
  
  /*
   * Single value that decides SCHED_DEADLINE internal math precision.
@@ -585,11 +592,13 @@ struct rq {
  #endif
         #define CPU_LOAD_IDX_MAX 5
         unsigned long cpu_load[CPU_LOAD_IDX_MAX];
-       unsigned long last_load_update_tick;
  #ifdef CONFIG_NO_HZ_COMMON
+#ifdef CONFIG_SMP
+       unsigned long last_load_update_tick;
+#endif /* CONFIG_SMP */
         u64 nohz_stamp;
         unsigned long nohz_flags;
-#endif
+#endif /* CONFIG_NO_HZ_COMMON */
  #ifdef CONFIG_NO_HZ_FULL
         unsigned long last_sched_tick;
  #endif
@@ -854,7 +863,7 @@ DECLARE_PER_CPU(struct sched_domain *, sd_asym);
  struct sched_group_capacity {
         atomic_t ref;
         /*
-        * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
+        * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
          * for a single CPU.
          */
         unsigned int capacity;
@@ -1200,7 +1209,8 @@ struct sched_class {
          * tasks.
          */
         struct task_struct * (*pick_next_task) (struct rq *rq,
-                                               struct task_struct *prev);
+                                               struct task_struct *prev,
+                                               struct pin_cookie cookie);
         void (*put_prev_task) (struct rq *rq, struct task_struct *p);
  
  #ifdef CONFIG_SMP
@@ -1313,6 +1323,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
  unsigned long to_ratio(u64 period, u64 runtime);
  
  extern void init_entity_runnable_average(struct sched_entity *se);
+extern void post_init_entity_util_avg(struct sched_entity *se);
  
  #ifdef CONFIG_NO_HZ_FULL
  extern bool sched_can_stop_tick(struct rq *rq);
@@ -1448,86 +1459,32 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
  static inline void sched_avg_update(struct rq *rq) { }
  #endif
  
-/*
- * __task_rq_lock - lock the rq @p resides on.
- */
-static inline struct rq *__task_rq_lock(struct task_struct *p)
-       __acquires(rq->lock)
-{
-       struct rq *rq;
-
-       lockdep_assert_held(&p->pi_lock);
-
-       for (;;) {
-               rq = task_rq(p);
-               raw_spin_lock(&rq->lock);
-               if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
-                       lockdep_pin_lock(&rq->lock);
-                       return rq;
-               }
-               raw_spin_unlock(&rq->lock);
-
-               while (unlikely(task_on_rq_migrating(p)))
-                       cpu_relax();
-       }
-}
+struct rq_flags {
+       unsigned long flags;
+       struct pin_cookie cookie;
+};
  
-/*
- * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
- */
-static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+       __acquires(rq->lock);
+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
         __acquires(p->pi_lock)
-       __acquires(rq->lock)
-{
-       struct rq *rq;
-
-       for (;;) {
-               raw_spin_lock_irqsave(&p->pi_lock, *flags);
-               rq = task_rq(p);
-               raw_spin_lock(&rq->lock);
-               /*
-                *      move_queued_task()              task_rq_lock()
-                *
-                *      ACQUIRE (rq->lock)
-                *      [S] ->on_rq = MIGRATING         [L] rq = task_rq()
-                *      WMB (__set_task_cpu())          ACQUIRE (rq->lock);
-                *      [S] ->cpu = new_cpu             [L] task_rq()
-                *                                      [L] ->on_rq
-                *      RELEASE (rq->lock)
-                *
-                * If we observe the old cpu in task_rq_lock, the acquire of
-                * the old rq->lock will fully serialize against the stores.
-                *
-                * If we observe the new cpu in task_rq_lock, the acquire will
-                * pair with the WMB to ensure we must then also see migrating.
-                */
-               if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
-                       lockdep_pin_lock(&rq->lock);
-                       return rq;
-               }
-               raw_spin_unlock(&rq->lock);
-               raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-
-               while (unlikely(task_on_rq_migrating(p)))
-                       cpu_relax();
-       }
-}
+       __acquires(rq->lock);
  
-static inline void __task_rq_unlock(struct rq *rq)
+static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
         __releases(rq->lock)
  {
-       lockdep_unpin_lock(&rq->lock);
+       lockdep_unpin_lock(&rq->lock, rf->cookie);
         raw_spin_unlock(&rq->lock);
  }
  
  static inline void
-task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
+task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
         __releases(rq->lock)
         __releases(p->pi_lock)
  {
-       lockdep_unpin_lock(&rq->lock);
+       lockdep_unpin_lock(&rq->lock, rf->cookie);
         raw_spin_unlock(&rq->lock);
-       raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+       raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
  }
  
  #ifdef CONFIG_SMP
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c

index cbc67da109544c4f0841b609e44d7337650aa81c..604297a08b3ae3064f990ec3c0b3a38384bb4f00 100644 (file)
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -24,7 +24,7 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
  }
  
  static struct task_struct *
-pick_next_task_stop(struct rq *rq, struct task_struct *prev)
+pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
  {
         struct task_struct *stop = rq->stop;
  
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c

index 58e3310c9b213617210fb19c10b6eb9b22a405a9..31872bc53bc4522267169eda985cd06e5bdb432a 100644 (file)
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -776,6 +776,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
         if (!ts->tick_stopped) {
                 nohz_balance_enter_idle(cpu);
                 calc_load_enter_idle();
+               cpu_load_update_nohz_start();
  
                 ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
                 ts->tick_stopped = 1;
@@ -802,11 +803,11 @@ out:
         return tick;
  }
  
-static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int active)
+static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
  {
         /* Update jiffies first */
         tick_do_update_jiffies64(now);
-       update_cpu_load_nohz(active);
+       cpu_load_update_nohz_stop();
  
         calc_load_exit_idle();
         touch_softlockup_watchdog_sched();
@@ -833,7 +834,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
         if (can_stop_full_tick(ts))
                 tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
         else if (ts->tick_stopped)
-               tick_nohz_restart_sched_tick(ts, ktime_get(), 1);
+               tick_nohz_restart_sched_tick(ts, ktime_get());
  #endif
  }
  
@@ -1024,7 +1025,7 @@ void tick_nohz_idle_exit(void)
                 tick_nohz_stop_idle(ts, now);
  
         if (ts->tick_stopped) {
-               tick_nohz_restart_sched_tick(ts, now, 0);
+               tick_nohz_restart_sched_tick(ts, now);
                 tick_nohz_account_idle_ticks(ts);
         }
  
diff --git a/mm/mmu_context.c b/mm/mmu_context.c

index f802c2d216a7d28bf76c5c911d83d10d213fd474..6f4d27c5bb325f6468461b17cd8800cc1e473308 100644 (file)
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -4,9 +4,9 @@
   */
  
  #include <linux/mm.h>
+#include <linux/sched.h>
  #include <linux/mmu_context.h>
  #include <linux/export.h>
-#include <linux/sched.h>
  
  #include <asm/mmu_context.h>
author	Ingo Molnar <mingo@kernel.org>
	Thu, 12 May 2016 07:18:13 +0000 (09:18 +0200)
committer	Ingo Molnar <mingo@kernel.org>
	Thu, 12 May 2016 07:18:13 +0000 (09:18 +0200)
Documentation/trace/ftrace.txt		patch \| blob \| blame \| history
arch/arm/include/asm/mmu_context.h		patch \| blob \| blame \| history
arch/x86/events/core.c		patch \| blob \| blame \| history
arch/x86/include/asm/mmu_context.h		patch \| blob \| blame \| history
arch/x86/mm/Makefile		patch \| blob \| blame \| history
arch/x86/mm/tlb.c		patch \| blob \| blame \| history
include/linux/lockdep.h		patch \| blob \| blame \| history
include/linux/mmu_context.h		patch \| blob \| blame \| history
include/linux/sched.h		patch \| blob \| blame \| history
kernel/locking/lockdep.c		patch \| blob \| blame \| history
kernel/sched/clock.c		patch \| blob \| blame \| history
kernel/sched/core.c		patch \| blob \| blame \| history
kernel/sched/cpuacct.c		patch \| blob \| blame \| history
kernel/sched/deadline.c		patch \| blob \| blame \| history
kernel/sched/debug.c		patch \| blob \| blame \| history
kernel/sched/fair.c		patch \| blob \| blame \| history
kernel/sched/idle_task.c		patch \| blob \| blame \| history
kernel/sched/rt.c		patch \| blob \| blame \| history
kernel/sched/sched.h		patch \| blob \| blame \| history
kernel/sched/stop_task.c		patch \| blob \| blame \| history
kernel/time/tick-sched.c		patch \| blob \| blame \| history
mm/mmu_context.c		patch \| blob \| blame \| history