Merge branch 'timers-for-linus-migration' of git://git.kernel.org/pub/scm/linux/kerne...

author Linus Torvalds <torvalds@linux-foundation.org>

Mon, 15 Jun 2009 17:06:19 +0000 (10:06 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Mon, 15 Jun 2009 17:06:19 +0000 (10:06 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 15 Jun 2009 17:06:19 +0000 (10:06 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 15 Jun 2009 17:06:19 +0000 (10:06 -0700)
diff --combined arch/x86/kernel/apic/x2apic_uv_x.c

index ef0ae207a7c82cb64f7a0a7758a91dc22b0f8fee,a9cad1b00d6bd9e06a9541e07eb3fc45ddbabf09..096d19aea2f7180b40be33870a0225ac822f71d3
--- 1/arch/x86/kernel/apic/x2apic_uv_x.c
--- 2/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@@ -105,7 -105,7 +105,7 @@@ static void uv_vector_allocation_domain
         cpumask_set_cpu(cpu, retmask);
   }
   
- -static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+ +static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
   {
   #ifdef CONFIG_SMP
         unsigned long val;
@@@ -463,7 -463,7 +463,7 @@@ static void uv_heartbeat(unsigned long 
         uv_set_scir_bits(bits);
   
         /* enable next timer period */
-       mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL);
+       mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL);
   }
   
   static void __cpuinit uv_heartbeat_enable(int cpu)
@@@ -562,7 -562,7 +562,7 @@@ void __init uv_system_init(void
         union uvh_node_id_u node_id;
         unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
         int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
- -      int max_pnode = 0;
+ +      int gnode_extra, max_pnode = 0;
         unsigned long mmr_base, present, paddr;
         unsigned short pnode_mask;
   
@@@ -574,13 -574,6 +574,13 @@@
         mmr_base =
             uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
             ~UV_MMR_ENABLE;
+ +      pnode_mask = (1 << n_val) - 1;
+ +      node_id.v = uv_read_local_mmr(UVH_NODE_ID);
+ +      gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
+ +      gnode_upper = ((unsigned long)gnode_extra  << m_val);
+ +      printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n",
+ +                      n_val, m_val, gnode_upper, gnode_extra);
+ +
         printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
   
         for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
@@@ -590,18 -583,15 +590,18 @@@
   
         bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
         uv_blade_info = kmalloc(bytes, GFP_KERNEL);
+ +      BUG_ON(!uv_blade_info);
   
         get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
   
         bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes();
         uv_node_to_blade = kmalloc(bytes, GFP_KERNEL);
+ +      BUG_ON(!uv_node_to_blade);
         memset(uv_node_to_blade, 255, bytes);
   
         bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus();
         uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL);
+ +      BUG_ON(!uv_cpu_to_blade);
         memset(uv_cpu_to_blade, 255, bytes);
   
         blade = 0;
@@@ -617,6 -607,11 +617,6 @@@
                 }
         }
   
- -      pnode_mask = (1 << n_val) - 1;
- -      node_id.v = uv_read_local_mmr(UVH_NODE_ID);
- -      gnode_upper = (((unsigned long)node_id.s.node_id) &
- -                     ~((1 << n_val) - 1)) << m_val;
- -
         uv_bios_init();
         uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
                             &sn_coherency_id, &sn_region_size);
@@@ -639,7 -634,6 +639,7 @@@
                 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
                 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
                 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
+ +              uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
                 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
                 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
                 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
diff --combined include/linux/sched.h

index fea9d188dbfff7ad1002e541143ab2866f5dcc52,311dec12397465f297f88ff1d8c3a29bcb56ada0..c900aa530070d7c08ad4a0851e684ac6931c98b9
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -77,7 -77,6 +77,7 @@@ struct sched_param 
   #include <linux/proportions.h>
   #include <linux/seccomp.h>
   #include <linux/rcupdate.h>
+ +#include <linux/rculist.h>
   #include <linux/rtmutex.h>
   
   #include <linux/time.h>
@@@ -97,9 -96,8 +97,9 @@@ struct exec_domain
   struct futex_pi_state;
   struct robust_list_head;
   struct bio;
- -struct bts_tracer;
   struct fs_struct;
+ +struct bts_context;
+ +struct perf_counter_context;
   
   /*
    * List of flags we want to share for kernel threads,
@@@ -118,7 -116,6 +118,7 @@@
    *    11 bit fractions.
    */
   extern unsigned long avenrun[];               /* Load averages */
+ +extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
   
   #define FSHIFT                11              /* nr of bits of precision */
   #define FIXED_1               (1<<FSHIFT)     /* 1.0 as fixed-point */
@@@ -138,9 -135,8 +138,9 @@@ DECLARE_PER_CPU(unsigned long, process_
   extern int nr_processes(void);
   extern unsigned long nr_running(void);
   extern unsigned long nr_uninterruptible(void);
- -extern unsigned long nr_active(void);
   extern unsigned long nr_iowait(void);
+ +extern void calc_global_load(void);
+ +extern u64 cpu_nr_migrations(int cpu);
   
   extern unsigned long get_parent_ip(unsigned long addr);
   
@@@ -261,6 -257,7 +261,7 @@@ extern void task_rq_unlock_wait(struct 
   extern cpumask_var_t nohz_cpu_mask;
   #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
   extern int select_nohz_load_balancer(int cpu);
+ extern int get_nohz_load_balancer(void);
   #else
   static inline int select_nohz_load_balancer(int cpu)
   {
@@@ -676,10 -673,6 +677,10 @@@ struct user_struct 
         struct work_struct work;
   #endif
   #endif
+ +
+ +#ifdef CONFIG_PERF_COUNTERS
+ +      atomic_long_t locked_vm;
+ +#endif
   };
   
   extern int uids_sysfs_init(void);
@@@ -846,17 -839,7 +847,17 @@@ struct sched_group 
          */
         u32 reciprocal_cpu_power;
   
- -      unsigned long cpumask[];
+ +      /*
+ +       * The CPUs this group covers.
+ +       *
+ +       * NOTE: this field is variable length. (Allocated dynamically
+ +       * by attaching extra space to the end of the structure,
+ +       * depending on how many CPUs the kernel has booted up with)
+ +       *
+ +       * It is also embedded into static data structures at build
+ +       * time. (See 'struct static_sched_group' in kernel/sched.c)
+ +       */
+ +      unsigned long cpumask[0];
   };
   
   static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
@@@ -942,17 -925,8 +943,17 @@@ struct sched_domain 
         char *name;
   #endif
   
- -      /* span of all CPUs in this domain */
- -      unsigned long span[];
+ +      /*
+ +       * Span of all CPUs in this domain.
+ +       *
+ +       * NOTE: this field is variable length. (Allocated dynamically
+ +       * by attaching extra space to the end of the structure,
+ +       * depending on how many CPUs the kernel has booted up with)
+ +       *
+ +       * It is also embedded into static data structures at build
+ +       * time. (See 'struct static_sched_domain' in kernel/sched.c)
+ +       */
+ +      unsigned long span[0];
   };
   
   static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
@@@ -1079,10 -1053,9 +1080,10 @@@ struct sched_entity 
         u64                     last_wakeup;
         u64                     avg_overlap;
   
+ +      u64                     nr_migrations;
+ +
         u64                     start_runtime;
         u64                     avg_wakeup;
- -      u64                     nr_migrations;
   
   #ifdef CONFIG_SCHEDSTATS
         u64                     wait_start;
@@@ -1237,11 -1210,18 +1238,11 @@@ struct task_struct 
         struct list_head ptraced;
         struct list_head ptrace_entry;
   
- -#ifdef CONFIG_X86_PTRACE_BTS
         /*
          * This is the tracer handle for the ptrace BTS extension.
          * This field actually belongs to the ptracer task.
          */
- -      struct bts_tracer *bts;
- -      /*
- -       * The buffer to hold the BTS data.
- -       */
- -      void *bts_buffer;
- -      size_t bts_size;
- -#endif /* CONFIG_X86_PTRACE_BTS */
+ +      struct bts_context *bts;
   
         /* PID/PID hash table linkage. */
         struct pid_link pids[PIDTYPE_MAX];
@@@ -1268,9 -1248,7 +1269,9 @@@
                                          * credentials (COW) */
         const struct cred *cred;        /* effective (overridable) subjective task
                                          * credentials (COW) */
- -      struct mutex cred_exec_mutex;   /* execve vs ptrace cred calculation mutex */
+ +      struct mutex cred_guard_mutex;  /* guard against foreign influences on
+ +                                       * credential calculations
+ +                                       * (notably, ptrace) */
   
         char comm[TASK_COMM_LEN]; /* executable name excluding path
                                      - access with [gs]et_task_comm (which lock
@@@ -1403,11 -1381,6 +1404,11 @@@
         struct list_head pi_state_list;
         struct futex_pi_state *pi_state_cache;
   #endif
+ +#ifdef CONFIG_PERF_COUNTERS
+ +      struct perf_counter_context *perf_counter_ctxp;
+ +      struct mutex perf_counter_mutex;
+ +      struct list_head perf_counter_list;
+ +#endif
   #ifdef CONFIG_NUMA
         struct mempolicy *mempolicy;
         short il_next;
@@@ -1456,9 -1429,7 +1457,9 @@@
   #ifdef CONFIG_TRACING
         /* state flags for use by tracers */
         unsigned long trace;
- -#endif
+ +      /* bitmask of trace recursion */
+ +      unsigned long trace_recursion;
+ +#endif /* CONFIG_TRACING */
   };
   
   /* Future-safe accessor for struct task_struct's cpus_allowed. */
@@@ -1796,11 -1767,23 +1797,23 @@@ extern unsigned int sysctl_sched_child_
   extern unsigned int sysctl_sched_features;
   extern unsigned int sysctl_sched_migration_cost;
   extern unsigned int sysctl_sched_nr_migrate;
+ extern unsigned int sysctl_timer_migration;
   
   int sched_nr_latency_handler(struct ctl_table *table, int write,
                 struct file *file, void __user *buffer, size_t *length,
                 loff_t *ppos);
   #endif
+ #ifdef CONFIG_SCHED_DEBUG
+ static inline unsigned int get_sysctl_timer_migration(void)
+ {
+       return sysctl_timer_migration;
+ }
+ #else
+ static inline unsigned int get_sysctl_timer_migration(void)
+ {
+       return 1;
+ }
+ #endif
   extern unsigned int sysctl_sched_rt_period;
   extern int sysctl_sched_rt_runtime;
   
@@@ -1915,7 -1898,6 +1928,7 @@@ extern void sched_dead(struct task_stru
   
   extern void proc_caches_init(void);
   extern void flush_signals(struct task_struct *);
+ +extern void __flush_signals(struct task_struct *);
   extern void ignore_signals(struct task_struct *);
   extern void flush_signal_handlers(struct task_struct *, int force_default);
   extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
@@@ -2032,10 -2014,8 +2045,10 @@@ extern void set_task_comm(struct task_s
   extern char *get_task_comm(char *to, struct task_struct *tsk);
   
   #ifdef CONFIG_SMP
+ +extern void wait_task_context_switch(struct task_struct *p);
   extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
   #else
+ +static inline void wait_task_context_switch(struct task_struct *p) {}
   static inline unsigned long wait_task_inactive(struct task_struct *p,
                                                long match_state)
   {
@@@ -2043,8 -2023,7 +2056,8 @@@
   }
   #endif
   
- -#define next_task(p)  list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks)
+ +#define next_task(p) \
+ +      list_entry_rcu((p)->tasks.next, struct task_struct, tasks)
   
   #define for_each_process(p) \
         for (p = &init_task ; (p = next_task(p)) != &init_task ; )
@@@ -2083,8 -2062,8 +2096,8 @@@ int same_thread_group(struct task_struc
   
   static inline struct task_struct *next_thread(const struct task_struct *p)
   {
- -      return list_entry(rcu_dereference(p->thread_group.next),
- -                        struct task_struct, thread_group);
+ +      return list_entry_rcu(p->thread_group.next,
+ +                            struct task_struct, thread_group);
   }
   
   static inline int thread_group_empty(struct task_struct *p)
@@@ -2212,12 -2191,6 +2225,12 @@@ static inline int test_tsk_need_resched
         return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
   }
   
+ +static inline int restart_syscall(void)
+ +{
+ +      set_tsk_thread_flag(current, TIF_SIGPENDING);
+ +      return -ERESTARTNOINTR;
+ +}
+ +
   static inline int signal_pending(struct task_struct *p)
   {
         return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
@@@ -2428,13 -2401,6 +2441,13 @@@ static inline void inc_syscw(struct tas
   #define TASK_SIZE_OF(tsk)     TASK_SIZE
   #endif
   
+ +/*
+ + * Call the function if the target task is executing on a CPU right now:
+ + */
+ +extern void task_oncpu_function_call(struct task_struct *p,
+ +                                   void (*func) (void *info), void *info);
+ +
+ +
   #ifdef CONFIG_MM_OWNER
   extern void mm_update_next_owner(struct mm_struct *mm);
   extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
diff --combined kernel/sched.c

index 8ec9d13140be832cd88d349248c1cd155fd8da61,9fe3774a0fd308fa7c34664586e1c87c6757a533..8fb88a906aaa21983bd5f61d913971788a4b9444
--- 1/kernel/sched.c
--- 2/kernel/sched.c
+++ b/kernel/sched.c
@@@ -39,7 -39,6 +39,7 @@@
   #include <linux/completion.h>
   #include <linux/kernel_stat.h>
   #include <linux/debug_locks.h>
+ +#include <linux/perf_counter.h>
   #include <linux/security.h>
   #include <linux/notifier.h>
   #include <linux/profile.h>
@@@ -69,18 -68,17 +69,18 @@@
   #include <linux/pagemap.h>
   #include <linux/hrtimer.h>
   #include <linux/tick.h>
- -#include <linux/bootmem.h>
   #include <linux/debugfs.h>
   #include <linux/ctype.h>
   #include <linux/ftrace.h>
- -#include <trace/sched.h>
   
   #include <asm/tlb.h>
   #include <asm/irq_regs.h>
   
   #include "sched_cpupri.h"
   
+ +#define CREATE_TRACE_POINTS
+ +#include <trace/events/sched.h>
+ +
   /*
    * Convert user-nice values [ -20 ... 0 ... 19 ]
    * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@@ -120,6 -118,12 +120,6 @@@
    */
   #define RUNTIME_INF   ((u64)~0ULL)
   
- -DEFINE_TRACE(sched_wait_task);
- -DEFINE_TRACE(sched_wakeup);
- -DEFINE_TRACE(sched_wakeup_new);
- -DEFINE_TRACE(sched_switch);
- -DEFINE_TRACE(sched_migrate_task);
- -
   #ifdef CONFIG_SMP
   
   static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@@ -240,7 -244,7 +240,7 @@@ static void start_rt_bandwidth(struct r
                 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
                 delta = ktime_to_ns(ktime_sub(hard, soft));
                 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
-                               HRTIMER_MODE_ABS, 0);
+                               HRTIMER_MODE_ABS_PINNED, 0);
         }
         spin_unlock(&rt_b->rt_runtime_lock);
   }
@@@ -580,7 -584,6 +580,7 @@@ struct rq 
         struct load_weight load;
         unsigned long nr_load_updates;
         u64 nr_switches;
+ +      u64 nr_migrations_in;
   
         struct cfs_rq cfs;
         struct rt_rq rt;
@@@ -627,10 -630,6 +627,10 @@@
         struct list_head migration_queue;
   #endif
   
+ +      /* calc_load related fields */
+ +      unsigned long calc_load_update;
+ +      long calc_load_active;
+ +
   #ifdef CONFIG_SCHED_HRTICK
   #ifdef CONFIG_SMP
         int hrtick_csd_pending;
@@@ -693,7 -692,7 +693,7 @@@ static inline int cpu_of(struct rq *rq
   #define task_rq(p)            cpu_rq(task_cpu(p))
   #define cpu_curr(cpu)         (cpu_rq(cpu)->curr)
   
- -static inline void update_rq_clock(struct rq *rq)
+ +inline void update_rq_clock(struct rq *rq)
   {
         rq->clock = sched_clock_cpu(cpu_of(rq));
   }
@@@ -1155,7 -1154,7 +1155,7 @@@ static __init void init_hrtick(void
   static void hrtick_start(struct rq *rq, u64 delay)
   {
         __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
-                       HRTIMER_MODE_REL, 0);
+                       HRTIMER_MODE_REL_PINNED, 0);
   }
   
   static inline void init_hrtick(void)
@@@ -1729,8 -1728,6 +1729,8 @@@ static void cfs_rq_set_shares(struct cf
   }
   #endif
   
+ +static void calc_load_account_active(struct rq *this_rq);
+ +
   #include "sched_stats.h"
   #include "sched_idletask.c"
   #include "sched_fair.c"
@@@ -1961,7 -1958,7 +1961,7 @@@ void set_task_cpu(struct task_struct *p
   
         clock_offset = old_rq->clock - new_rq->clock;
   
- -      trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+ +      trace_sched_migrate_task(p, new_cpu);
   
   #ifdef CONFIG_SCHEDSTATS
         if (p->se.wait_start)
@@@ -1970,16 -1967,12 +1970,16 @@@
                 p->se.sleep_start -= clock_offset;
         if (p->se.block_start)
                 p->se.block_start -= clock_offset;
+ +#endif
         if (old_cpu != new_cpu) {
- -              schedstat_inc(p, se.nr_migrations);
+ +              p->se.nr_migrations++;
+ +              new_rq->nr_migrations_in++;
+ +#ifdef CONFIG_SCHEDSTATS
                 if (task_hot(p, old_rq->clock, NULL))
                         schedstat_inc(p, se.nr_forced2_migrations);
- -      }
   #endif
+ +              perf_counter_task_migration(p, new_cpu);
+ +      }
         p->se.vruntime -= old_cfsrq->min_vruntime -
                                          new_cfsrq->min_vruntime;
   
@@@ -2021,49 -2014,6 +2021,49 @@@ migrate_task(struct task_struct *p, in
         return 1;
   }
   
+ +/*
+ + * wait_task_context_switch - wait for a thread to complete at least one
+ + *                            context switch.
+ + *
+ + * @p must not be current.
+ + */
+ +void wait_task_context_switch(struct task_struct *p)
+ +{
+ +      unsigned long nvcsw, nivcsw, flags;
+ +      int running;
+ +      struct rq *rq;
+ +
+ +      nvcsw   = p->nvcsw;
+ +      nivcsw  = p->nivcsw;
+ +      for (;;) {
+ +              /*
+ +               * The runqueue is assigned before the actual context
+ +               * switch. We need to take the runqueue lock.
+ +               *
+ +               * We could check initially without the lock but it is
+ +               * very likely that we need to take the lock in every
+ +               * iteration.
+ +               */
+ +              rq = task_rq_lock(p, &flags);
+ +              running = task_running(rq, p);
+ +              task_rq_unlock(rq, &flags);
+ +
+ +              if (likely(!running))
+ +                      break;
+ +              /*
+ +               * The switch count is incremented before the actual
+ +               * context switch. We thus wait for two switches to be
+ +               * sure at least one completed.
+ +               */
+ +              if ((p->nvcsw - nvcsw) > 1)
+ +                      break;
+ +              if ((p->nivcsw - nivcsw) > 1)
+ +                      break;
+ +
+ +              cpu_relax();
+ +      }
+ +}
+ +
   /*
    * wait_task_inactive - wait for a thread to unschedule.
    *
@@@ -2192,7 -2142,6 +2192,7 @@@ void kick_process(struct task_struct *p
                 smp_send_reschedule(cpu);
         preempt_enable();
   }
+ +EXPORT_SYMBOL_GPL(kick_process);
   
   /*
    * Return a low guess at the load of a migration-source cpu weighted
@@@ -2375,27 -2324,6 +2375,27 @@@ static int sched_balance_self(int cpu, 
   
   #endif /* CONFIG_SMP */
   
+ +/**
+ + * task_oncpu_function_call - call a function on the cpu on which a task runs
+ + * @p:                the task to evaluate
+ + * @func:     the function to be called
+ + * @info:     the function call argument
+ + *
+ + * Calls the function @func when the task is currently running. This might
+ + * be on the current CPU, which just calls the function directly
+ + */
+ +void task_oncpu_function_call(struct task_struct *p,
+ +                            void (*func) (void *info), void *info)
+ +{
+ +      int cpu;
+ +
+ +      preempt_disable();
+ +      cpu = task_cpu(p);
+ +      if (task_curr(p))
+ +              smp_call_function_single(cpu, func, info, 1);
+ +      preempt_enable();
+ +}
+ +
   /***
    * try_to_wake_up - wake up a thread
    * @p: the to-be-woken-up thread
@@@ -2530,17 -2458,6 +2530,17 @@@ out
         return success;
   }
   
+ +/**
+ + * wake_up_process - Wake up a specific process
+ + * @p: The process to be woken up.
+ + *
+ + * Attempt to wake up the nominated process and move it to the set of runnable
+ + * processes.  Returns 1 if the process was woken up, 0 if it was already
+ + * running.
+ + *
+ + * It may be assumed that this function implies a write memory barrier before
+ + * changing the task state if and only if any tasks are woken up.
+ + */
   int wake_up_process(struct task_struct *p)
   {
         return try_to_wake_up(p, TASK_ALL, 0);
@@@ -2563,7 -2480,6 +2563,7 @@@ static void __sched_fork(struct task_st
         p->se.exec_start                = 0;
         p->se.sum_exec_runtime          = 0;
         p->se.prev_sum_exec_runtime     = 0;
+ +      p->se.nr_migrations             = 0;
         p->se.last_wakeup               = 0;
         p->se.avg_overlap               = 0;
         p->se.start_runtime             = 0;
@@@ -2794,7 -2710,6 +2794,7 @@@ static void finish_task_switch(struct r
          */
         prev_state = prev->state;
         finish_arch_switch(prev);
+ +      perf_counter_task_sched_in(current, cpu_of(rq));
         finish_lock_switch(rq, prev);
   #ifdef CONFIG_SMP
         if (post_schedule)
@@@ -2851,7 -2766,7 +2851,7 @@@ context_switch(struct rq *rq, struct ta
          * combine the page table reload and the switch backend into
          * one hypercall.
          */
- -      arch_enter_lazy_cpu_mode();
+ +      arch_start_context_switch(prev);
   
         if (unlikely(!mm)) {
                 next->active_mm = oldmm;
@@@ -2941,81 -2856,19 +2941,81 @@@ unsigned long nr_iowait(void
         return sum;
   }
   
- -unsigned long nr_active(void)
+ +/* Variables and functions for calc_load */
+ +static atomic_long_t calc_load_tasks;
+ +static unsigned long calc_load_update;
+ +unsigned long avenrun[3];
+ +EXPORT_SYMBOL(avenrun);
+ +
+ +/**
+ + * get_avenrun - get the load average array
+ + * @loads:    pointer to dest load array
+ + * @offset:   offset to add
+ + * @shift:    shift count to shift the result left
+ + *
+ + * These values are estimates at best, so no need for locking.
+ + */
+ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
   {
- -      unsigned long i, running = 0, uninterruptible = 0;
+ +      loads[0] = (avenrun[0] + offset) << shift;
+ +      loads[1] = (avenrun[1] + offset) << shift;
+ +      loads[2] = (avenrun[2] + offset) << shift;
+ +}
   
- -      for_each_online_cpu(i) {
- -              running += cpu_rq(i)->nr_running;
- -              uninterruptible += cpu_rq(i)->nr_uninterruptible;
- -      }
+ +static unsigned long
+ +calc_load(unsigned long load, unsigned long exp, unsigned long active)
+ +{
+ +      load *= exp;
+ +      load += active * (FIXED_1 - exp);
+ +      return load >> FSHIFT;
+ +}
   
- -      if (unlikely((long)uninterruptible < 0))
- -              uninterruptible = 0;
+ +/*
+ + * calc_global_load - update the avenrun load estimates 10 ticks after the
+ + * CPUs have updated calc_load_tasks.
+ + */
+ +void calc_global_load(void)
+ +{
+ +      unsigned long upd = calc_load_update + 10;
+ +      long active;
   
- -      return running + uninterruptible;
+ +      if (time_before(jiffies, upd))
+ +              return;
+ +
+ +      active = atomic_long_read(&calc_load_tasks);
+ +      active = active > 0 ? active * FIXED_1 : 0;
+ +
+ +      avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+ +      avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+ +      avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+ +
+ +      calc_load_update += LOAD_FREQ;
+ +}
+ +
+ +/*
+ + * Either called from update_cpu_load() or from a cpu going idle
+ + */
+ +static void calc_load_account_active(struct rq *this_rq)
+ +{
+ +      long nr_active, delta;
+ +
+ +      nr_active = this_rq->nr_running;
+ +      nr_active += (long) this_rq->nr_uninterruptible;
+ +
+ +      if (nr_active != this_rq->calc_load_active) {
+ +              delta = nr_active - this_rq->calc_load_active;
+ +              this_rq->calc_load_active = nr_active;
+ +              atomic_long_add(delta, &calc_load_tasks);
+ +      }
+ +}
+ +
+ +/*
+ + * Externally visible per-cpu scheduler statistics:
+ + * cpu_nr_migrations(cpu) - number of migrations into that cpu
+ + */
+ +u64 cpu_nr_migrations(int cpu)
+ +{
+ +      return cpu_rq(cpu)->nr_migrations_in;
   }
   
   /*
@@@ -3046,11 -2899,6 +3046,11 @@@ static void update_cpu_load(struct rq *
                         new_load += scale-1;
                 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
         }
+ +
+ +      if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+ +              this_rq->calc_load_update += LOAD_FREQ;
+ +              calc_load_account_active(this_rq);
+ +      }
   }
   
   #ifdef CONFIG_SMP
@@@ -4392,126 -4240,15 +4392,131 @@@ static void active_load_balance(struct 
   static struct {
         atomic_t load_balancer;
         cpumask_var_t cpu_mask;
+ +      cpumask_var_t ilb_grp_nohz_mask;
   } nohz ____cacheline_aligned = {
         .load_balancer = ATOMIC_INIT(-1),
   };
   
+ int get_nohz_load_balancer(void)
+ {
+       return atomic_read(&nohz.load_balancer);
+ }
+ 
+ +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+ +/**
+ + * lowest_flag_domain - Return lowest sched_domain containing flag.
+ + * @cpu:      The cpu whose lowest level of sched domain is to
+ + *            be returned.
+ + * @flag:     The flag to check for the lowest sched_domain
+ + *            for the given cpu.
+ + *
+ + * Returns the lowest sched_domain of a cpu which contains the given flag.
+ + */
+ +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+ +{
+ +      struct sched_domain *sd;
+ +
+ +      for_each_domain(cpu, sd)
+ +              if (sd && (sd->flags & flag))
+ +                      break;
+ +
+ +      return sd;
+ +}
+ +
+ +/**
+ + * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ + * @cpu:      The cpu whose domains we're iterating over.
+ + * @sd:               variable holding the value of the power_savings_sd
+ + *            for cpu.
+ + * @flag:     The flag to filter the sched_domains to be iterated.
+ + *
+ + * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ + * set, starting from the lowest sched_domain to the highest.
+ + */
+ +#define for_each_flag_domain(cpu, sd, flag) \
+ +      for (sd = lowest_flag_domain(cpu, flag); \
+ +              (sd && (sd->flags & flag)); sd = sd->parent)
+ +
+ +/**
+ + * is_semi_idle_group - Checks if the given sched_group is semi-idle.
+ + * @ilb_group:        group to be checked for semi-idleness
+ + *
+ + * Returns:   1 if the group is semi-idle. 0 otherwise.
+ + *
+ + * We define a sched_group to be semi idle if it has at least one idle-CPU
+ + * and at least one non-idle CPU. This helper function checks if the given
+ + * sched_group is semi-idle or not.
+ + */
+ +static inline int is_semi_idle_group(struct sched_group *ilb_group)
+ +{
+ +      cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+ +                                      sched_group_cpus(ilb_group));
+ +
+ +      /*
+ +       * A sched_group is semi-idle when it has at least one busy cpu
+ +       * and at least one idle cpu.
+ +       */
+ +      if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+ +              return 0;
+ +
+ +      if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+ +              return 0;
+ +
+ +      return 1;
+ +}
+ +/**
+ + * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ + * @cpu:      The cpu which is nominating a new idle_load_balancer.
+ + *
+ + * Returns:   Returns the id of the idle load balancer if it exists,
+ + *            Else, returns >= nr_cpu_ids.
+ + *
+ + * This algorithm picks the idle load balancer such that it belongs to a
+ + * semi-idle powersavings sched_domain. The idea is to try and avoid
+ + * completely idle packages/cores just for the purpose of idle load balancing
+ + * when there are other idle cpu's which are better suited for that job.
+ + */
+ +static int find_new_ilb(int cpu)
+ +{
+ +      struct sched_domain *sd;
+ +      struct sched_group *ilb_group;
+ +
+ +      /*
+ +       * Have idle load balancer selection from semi-idle packages only
+ +       * when power-aware load balancing is enabled
+ +       */
+ +      if (!(sched_smt_power_savings || sched_mc_power_savings))
+ +              goto out_done;
+ +
+ +      /*
+ +       * Optimize for the case when we have no idle CPUs or only one
+ +       * idle CPU. Don't walk the sched_domain hierarchy in such cases
+ +       */
+ +      if (cpumask_weight(nohz.cpu_mask) < 2)
+ +              goto out_done;
+ +
+ +      for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+ +              ilb_group = sd->groups;
+ +
+ +              do {
+ +                      if (is_semi_idle_group(ilb_group))
+ +                              return cpumask_first(nohz.ilb_grp_nohz_mask);
+ +
+ +                      ilb_group = ilb_group->next;
+ +
+ +              } while (ilb_group != sd->groups);
+ +      }
+ +
+ +out_done:
+ +      return cpumask_first(nohz.cpu_mask);
+ +}
+ +#else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+ +static inline int find_new_ilb(int call_cpu)
+ +{
+ +      return cpumask_first(nohz.cpu_mask);
+ +}
+ +#endif
+ +
   /*
    * This routine will try to nominate the ilb (idle load balancing)
    * owner among the cpus whose ticks are stopped. ilb owner will do the idle
@@@ -4566,24 -4303,8 +4571,24 @@@ int select_nohz_load_balancer(int stop_
                         /* make me the ilb owner */
                         if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
                                 return 1;
- -              } else if (atomic_read(&nohz.load_balancer) == cpu)
+ +              } else if (atomic_read(&nohz.load_balancer) == cpu) {
+ +                      int new_ilb;
+ +
+ +                      if (!(sched_smt_power_savings ||
+ +                                              sched_mc_power_savings))
+ +                              return 1;
+ +                      /*
+ +                       * Check to see if there is a more power-efficient
+ +                       * ilb.
+ +                       */
+ +                      new_ilb = find_new_ilb(cpu);
+ +                      if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
+ +                              atomic_set(&nohz.load_balancer, -1);
+ +                              resched_cpu(new_ilb);
+ +                              return 0;
+ +                      }
                         return 1;
+ +              }
         } else {
                 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
                         return 0;
@@@ -4752,7 -4473,15 +4757,7 @@@ static inline void trigger_load_balance
                 }
   
                 if (atomic_read(&nohz.load_balancer) == -1) {
- -                      /*
- -                       * simple selection for now: Nominate the
- -                       * first cpu in the nohz list to be the next
- -                       * ilb owner.
- -                       *
- -                       * TBD: Traverse the sched domains and nominate
- -                       * the nearest cpu in the nohz.cpu_mask.
- -                       */
- -                      int ilb = cpumask_first(nohz.cpu_mask);
+ +                      int ilb = find_new_ilb(cpu);
   
                         if (ilb < nr_cpu_ids)
                                 resched_cpu(ilb);
@@@ -5008,7 -4737,7 +5013,7 @@@ void account_process_tick(struct task_s
   
         if (user_tick)
                 account_user_time(p, one_jiffy, one_jiffy_scaled);
- -      else if (p != rq->idle)
+ +      else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
                 account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
                                     one_jiffy_scaled);
         else
@@@ -5116,8 -4845,6 +5121,8 @@@ void scheduler_tick(void
         curr->sched_class->task_tick(rq, curr, 0);
         spin_unlock(&rq->lock);
   
+ +      perf_counter_task_tick(curr, cpu);
+ +
   #ifdef CONFIG_SMP
         rq->idle_at_tick = idle_cpu(cpu);
         trigger_load_balance(rq, cpu);
@@@ -5285,15 -5012,13 +5290,15 @@@ pick_next_task(struct rq *rq
   /*
    * schedule() is the main scheduler function.
    */
- -asmlinkage void __sched __schedule(void)
+ +asmlinkage void __sched schedule(void)
   {
         struct task_struct *prev, *next;
         unsigned long *switch_count;
         struct rq *rq;
         int cpu;
   
+ +need_resched:
+ +      preempt_disable();
         cpu = smp_processor_id();
         rq = cpu_rq(cpu);
         rcu_qsctr_inc(cpu);
@@@ -5333,7 -5058,6 +5338,7 @@@ need_resched_nonpreemptible
   
         if (likely(prev != next)) {
                 sched_info_switch(prev, next);
+ +              perf_counter_task_sched_out(prev, next, cpu);
   
                 rq->nr_switches++;
                 rq->curr = next;
@@@ -5351,9 -5075,15 +5356,9 @@@
   
         if (unlikely(reacquire_kernel_lock(current) < 0))
                 goto need_resched_nonpreemptible;
- -}
   
- -asmlinkage void __sched schedule(void)
- -{
- -need_resched:
- -      preempt_disable();
- -      __schedule();
         preempt_enable_no_resched();
- -      if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+ +      if (need_resched())
                 goto need_resched;
   }
   EXPORT_SYMBOL(schedule);
@@@ -5496,7 -5226,7 +5501,7 @@@ EXPORT_SYMBOL(default_wake_function)
    * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
    * zero in this (rare) case, and we handle it by continuing to scan the queue.
    */
- -void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+ +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
                         int nr_exclusive, int sync, void *key)
   {
         wait_queue_t *curr, *next;
@@@ -5516,9 -5246,6 +5521,9 @@@
    * @mode: which threads
    * @nr_exclusive: how many wake-one or wake-many threads to wake up
    * @key: is directly passed to the wakeup function
+ + *
+ + * It may be assumed that this function implies a write memory barrier before
+ + * changing the task state if and only if any tasks are woken up.
    */
   void __wake_up(wait_queue_head_t *q, unsigned int mode,
                         int nr_exclusive, void *key)
@@@ -5557,9 -5284,6 +5562,9 @@@ void __wake_up_locked_key(wait_queue_he
    * with each other. This can prevent needless bouncing between CPUs.
    *
    * On UP it can prevent extra preemption.
+ + *
+ + * It may be assumed that this function implies a write memory barrier before
+ + * changing the task state if and only if any tasks are woken up.
    */
   void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
                         int nr_exclusive, void *key)
@@@ -5596,9 -5320,6 +5601,9 @@@ EXPORT_SYMBOL_GPL(__wake_up_sync);      /* F
    * awakened in the same order in which they were queued.
    *
    * See also complete_all(), wait_for_completion() and related routines.
+ + *
+ + * It may be assumed that this function implies a write memory barrier before
+ + * changing the task state if and only if any tasks are woken up.
    */
   void complete(struct completion *x)
   {
@@@ -5616,9 -5337,6 +5621,9 @@@ EXPORT_SYMBOL(complete)
    * @x:  holds the state of this particular completion
    *
    * This will wake up all threads waiting on this particular completion event.
+ + *
+ + * It may be assumed that this function implies a write memory barrier before
+ + * changing the task state if and only if any tasks are woken up.
    */
   void complete_all(struct completion *x)
   {
@@@ -6777,9 -6495,8 +6782,9 @@@ void sched_show_task(struct task_struc
   #ifdef CONFIG_DEBUG_STACK_USAGE
         free = stack_not_used(p);
   #endif
- -      printk(KERN_CONT "%5lu %5d %6d\n", free,
- -              task_pid_nr(p), task_pid_nr(p->real_parent));
+ +      printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+ +              task_pid_nr(p), task_pid_nr(p->real_parent),
+ +              (unsigned long)task_thread_info(p)->flags);
   
         show_stack(p, NULL);
   }
@@@ -7258,14 -6975,6 +7263,14 @@@ static void migrate_dead_tasks(unsigne
   
         }
   }
+ +
+ +/*
+ + * remove the tasks which were accounted by rq from calc_load_tasks.
+ + */
+ +static void calc_global_load_remove(struct rq *rq)
+ +{
+ +      atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+ +}
   #endif /* CONFIG_HOTPLUG_CPU */
   
   #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@@ -7500,8 -7209,6 +7505,8 @@@ migration_call(struct notifier_block *n
                 /* Update our root-domain */
                 rq = cpu_rq(cpu);
                 spin_lock_irqsave(&rq->lock, flags);
+ +              rq->calc_load_update = calc_load_update;
+ +              rq->calc_load_active = 0;
                 if (rq->rd) {
                         BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
   
@@@ -7541,7 -7248,7 +7546,7 @@@
                 cpuset_unlock();
                 migrate_nr_uninterruptible(rq);
                 BUG_ON(rq->nr_running != 0);
- -
+ +              calc_global_load_remove(rq);
                 /*
                  * No need to migrate the tasks: it was best-effort if
                  * they didn't take sched_hotcpu_mutex. Just wake up
@@@ -7577,10 -7284,8 +7582,10 @@@
         return NOTIFY_OK;
   }
   
- -/* Register at highest priority so that task migration (migrate_all_tasks)
- - * happens before everything else.
+ +/*
+ + * Register at high priority so that task migration (migrate_all_tasks)
+ + * happens before everything else.  This has to be lower priority than
+ + * the notifier in the perf_counter subsystem, though.
    */
   static struct notifier_block __cpuinitdata migration_notifier = {
         .notifier_call = migration_call,
@@@ -7825,21 -7530,24 +7830,21 @@@ static void rq_attach_root(struct rq *r
   
   static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
   {
+ +      gfp_t gfp = GFP_KERNEL;
+ +
         memset(rd, 0, sizeof(*rd));
   
- -      if (bootmem) {
- -              alloc_bootmem_cpumask_var(&def_root_domain.span);
- -              alloc_bootmem_cpumask_var(&def_root_domain.online);
- -              alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
- -              cpupri_init(&rd->cpupri, true);
- -              return 0;
- -      }
+ +      if (bootmem)
+ +              gfp = GFP_NOWAIT;
   
- -      if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+ +      if (!alloc_cpumask_var(&rd->span, gfp))
                 goto out;
- -      if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+ +      if (!alloc_cpumask_var(&rd->online, gfp))
                 goto free_span;
- -      if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ +      if (!alloc_cpumask_var(&rd->rto_mask, gfp))
                 goto free_online;
   
- -      if (cpupri_init(&rd->cpupri, false) != 0)
+ +      if (cpupri_init(&rd->cpupri, bootmem) != 0)
                 goto free_rto_mask;
         return 0;
   
@@@ -8050,9 -7758,8 +8055,9 @@@ int sched_smt_power_savings = 0, sched_
   
   /*
    * The cpus mask in sched_group and sched_domain hangs off the end.
- - * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- - * for nr_cpu_ids < CONFIG_NR_CPUS.
+ + *
+ + * ( See the comments in include/linux/sched.h:struct sched_group
+ + *   and struct sched_domain. )
    */
   struct static_sched_group {
         struct sched_group sg;
@@@ -8173,7 -7880,7 +8178,7 @@@ static void init_numa_sched_groups_powe
                         struct sched_domain *sd;
   
                         sd = &per_cpu(phys_domains, j).sd;
- -                      if (j != cpumask_first(sched_group_cpus(sd->groups))) {
+ +                      if (j != group_first_cpu(sd->groups)) {
                                 /*
                                  * Only add "power" once for each
                                  * physical package.
@@@ -8251,7 -7958,7 +8256,7 @@@ static void init_sched_groups_power(in
   
         WARN_ON(!sd || !sd->groups);
   
- -      if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
+ +      if (cpu != group_first_cpu(sd->groups))
                 return;
   
         child = sd->child;
@@@ -9029,6 -8736,8 +9034,8 @@@ void __init sched_init_smp(void
   }
   #endif /* CONFIG_SMP */
   
+ const_debug unsigned int sysctl_timer_migration = 1;
+ 
   int in_sched_functions(unsigned long addr)
   {
         return in_lock_functions(addr) ||
@@@ -9163,7 -8872,7 +9170,7 @@@ void __init sched_init(void
          * we use alloc_bootmem().
          */
         if (alloc_size) {
- -              ptr = (unsigned long)alloc_bootmem(alloc_size);
+ +              ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
                 init_task_group.se = (struct sched_entity **)ptr;
@@@ -9236,8 -8945,6 +9243,8 @@@
                 rq = cpu_rq(i);
                 spin_lock_init(&rq->lock);
                 rq->nr_running = 0;
+ +              rq->calc_load_active = 0;
+ +              rq->calc_load_update = jiffies + LOAD_FREQ;
                 init_cfs_rq(&rq->cfs, rq);
                 init_rt_rq(&rq->rt, rq);
   #ifdef CONFIG_FAIR_GROUP_SCHED
@@@ -9258,7 -8965,7 +9265,7 @@@
                  * 1024) and two child groups A0 and A1 (of weight 1024 each),
                  * then A0's share of the cpu resource is:
                  *
- -               *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
+ +               *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
                  *
                  * We achieve this by letting init_task_group's tasks sit
                  * directly in rq->cfs (i.e init_task_group->se[] = NULL).
@@@ -9345,26 -9052,20 +9352,26 @@@
          * when this runqueue becomes "idle".
          */
         init_idle(current, smp_processor_id());
+ +
+ +      calc_load_update = jiffies + LOAD_FREQ;
+ +
         /*
          * During early bootup we pretend to be a normal task:
          */
         current->sched_class = &fair_sched_class;
   
         /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
- -      alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+ +      alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
   #ifdef CONFIG_SMP
   #ifdef CONFIG_NO_HZ
- -      alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+ +      alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
+ +      alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
   #endif
- -      alloc_bootmem_cpumask_var(&cpu_isolated_map);
+ +      alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
   #endif /* SMP */
   
+ +      perf_counter_init();
+ +
         scheduler_running = 1;
   }
   
@@@ -10106,13 -9807,6 +10113,13 @@@ static int sched_rt_global_constraints(
         if (sysctl_sched_rt_period <= 0)
                 return -EINVAL;
   
+ +      /*
+ +       * There are always some RT tasks in the root group
+ +       * -- migration, kstopmachine, etc.
+ +       */
+ +      if (sysctl_sched_rt_runtime == 0)
+ +              return -EBUSY;
+ +
         spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
         for_each_possible_cpu(i) {
                 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
diff --combined kernel/sysctl.c

index ce664f98e3fb67d7741182b9bce3fd29f4b8b88f,b3ce581373030db501d83ea08c4a771f0a02aeae..0e51a35a44869425aa3f4e7cf9fca498787fd0fb
--- 1/kernel/sysctl.c
--- 2/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@@ -49,7 -49,6 +49,7 @@@
   #include <linux/reboot.h>
   #include <linux/ftrace.h>
   #include <linux/slow-work.h>
+ +#include <linux/perf_counter.h>
   
   #include <asm/uaccess.h>
   #include <asm/processor.h>
@@@ -102,9 -101,7 +102,9 @@@ static int __maybe_unused one = 1
   static int __maybe_unused two = 2;
   static unsigned long one_ul = 1;
   static int one_hundred = 100;
- -static int one_thousand = 1000;
+ +
+ +/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
+ +static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
   
   /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
   static int maxolduid = 65535;
@@@ -115,7 -112,6 +115,7 @@@ static int ngroups_max = NGROUPS_MAX
   
   #ifdef CONFIG_MODULES
   extern char modprobe_path[];
+ +extern int modules_disabled;
   #endif
   #ifdef CONFIG_CHR_DEV_SG
   extern int sg_big_buff;
@@@ -327,6 -323,14 +327,14 @@@ static struct ctl_table kern_table[] = 
                 .maxlen         = sizeof(unsigned int),
                 .mode           = 0644,
                 .proc_handler   = &proc_dointvec,
+       },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "timer_migration",
+               .data           = &sysctl_timer_migration,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
         },
   #endif
         {
@@@ -536,17 -540,6 +544,17 @@@
                 .proc_handler   = &proc_dostring,
                 .strategy       = &sysctl_string,
         },
+ +      {
+ +              .ctl_name       = CTL_UNNUMBERED,
+ +              .procname       = "modules_disabled",
+ +              .data           = &modules_disabled,
+ +              .maxlen         = sizeof(int),
+ +              .mode           = 0644,
+ +              /* only handle a transition from default "0" to "1" */
+ +              .proc_handler   = &proc_dointvec_minmax,
+ +              .extra1         = &one,
+ +              .extra2         = &one,
+ +      },
   #endif
   #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
         {
@@@ -742,14 -735,6 +750,14 @@@
                 .mode           = 0444,
                 .proc_handler   = &proc_dointvec,
         },
+ +      {
+ +              .ctl_name       = CTL_UNNUMBERED,
+ +              .procname       = "bootloader_version",
+ +              .data           = &bootloader_version,
+ +              .maxlen         = sizeof (int),
+ +              .mode           = 0444,
+ +              .proc_handler   = &proc_dointvec,
+ +      },
         {
                 .ctl_name       = CTL_UNNUMBERED,
                 .procname       = "kstack_depth_to_print",
@@@ -933,32 -918,6 +941,32 @@@
                 .child          = slow_work_sysctls,
         },
   #endif
+ +#ifdef CONFIG_PERF_COUNTERS
+ +      {
+ +              .ctl_name       = CTL_UNNUMBERED,
+ +              .procname       = "perf_counter_paranoid",
+ +              .data           = &sysctl_perf_counter_paranoid,
+ +              .maxlen         = sizeof(sysctl_perf_counter_paranoid),
+ +              .mode           = 0644,
+ +              .proc_handler   = &proc_dointvec,
+ +      },
+ +      {
+ +              .ctl_name       = CTL_UNNUMBERED,
+ +              .procname       = "perf_counter_mlock_kb",
+ +              .data           = &sysctl_perf_counter_mlock,
+ +              .maxlen         = sizeof(sysctl_perf_counter_mlock),
+ +              .mode           = 0644,
+ +              .proc_handler   = &proc_dointvec,
+ +      },
+ +      {
+ +              .ctl_name       = CTL_UNNUMBERED,
+ +              .procname       = "perf_counter_max_sample_rate",
+ +              .data           = &sysctl_perf_counter_sample_rate,
+ +              .maxlen         = sizeof(sysctl_perf_counter_sample_rate),
+ +              .mode           = 0644,
+ +              .proc_handler   = &proc_dointvec,
+ +      },
+ +#endif
   /*
    * NOTE: do not add new entries to this table unless you have read
    * Documentation/sysctl/ctl_unnumbered.txt
@@@ -1055,7 -1014,7 +1063,7 @@@ static struct ctl_table vm_table[] = 
                 .mode           = 0644,
                 .proc_handler   = &dirty_bytes_handler,
                 .strategy       = &sysctl_intvec,
- -              .extra1         = &one_ul,
+ +              .extra1         = &dirty_bytes_min,
         },
         {
                 .procname       = "dirty_writeback_centisecs",
@@@ -1079,6 -1038,28 +1087,6 @@@
                 .mode           = 0444 /* read-only*/,
                 .proc_handler   = &proc_dointvec,
         },
- -      {
- -              .ctl_name       = CTL_UNNUMBERED,
- -              .procname       = "nr_pdflush_threads_min",
- -              .data           = &nr_pdflush_threads_min,
- -              .maxlen         = sizeof nr_pdflush_threads_min,
- -              .mode           = 0644 /* read-write */,
- -              .proc_handler   = &proc_dointvec_minmax,
- -              .strategy       = &sysctl_intvec,
- -              .extra1         = &one,
- -              .extra2         = &nr_pdflush_threads_max,
- -      },
- -      {
- -              .ctl_name       = CTL_UNNUMBERED,
- -              .procname       = "nr_pdflush_threads_max",
- -              .data           = &nr_pdflush_threads_max,
- -              .maxlen         = sizeof nr_pdflush_threads_max,
- -              .mode           = 0644 /* read-write */,
- -              .proc_handler   = &proc_dointvec_minmax,
- -              .strategy       = &sysctl_intvec,
- -              .extra1         = &nr_pdflush_threads_min,
- -              .extra2         = &one_thousand,
- -      },
         {
                 .ctl_name       = VM_SWAPPINESS,
                 .procname       = "swappiness",
@@@ -1272,6 -1253,7 +1280,6 @@@
                 .strategy       = &sysctl_jiffies,
         },
   #endif
- -#ifdef CONFIG_SECURITY
         {
                 .ctl_name       = CTL_UNNUMBERED,
                 .procname       = "mmap_min_addr",
@@@ -1280,6 -1262,7 +1288,6 @@@
                 .mode           = 0644,
                 .proc_handler   = &proc_doulongvec_minmax,
         },
- -#endif
   #ifdef CONFIG_NUMA
         {
                 .ctl_name       = CTL_UNNUMBERED,
diff --combined kernel/time/clockevents.c

index 3948fa644a2db6e42072f48828d9e1f199ce90e9,ab20ded013bd5621c807f868f296ab3e003df4da..1ad6dd46111920d10c5d83f2f90c1aeeb81ca66d
--- 1/kernel/time/clockevents.c
--- 2/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@@ -18,6 -18,7 +18,7 @@@
   #include <linux/notifier.h>
   #include <linux/smp.h>
   #include <linux/sysdev.h>
+ #include <linux/tick.h>
   
   /* The registered clock event devices */
   static LIST_HEAD(clockevent_devices);
@@@ -54,7 -55,6 +55,7 @@@ unsigned long clockevent_delta2ns(unsig
   
         return (unsigned long) clc;
   }
+ +EXPORT_SYMBOL_GPL(clockevent_delta2ns);
   
   /**
    * clockevents_set_mode - set the operating mode of a clock event device
@@@ -188,7 -188,6 +189,7 @@@ void clockevents_register_device(struc
   
         spin_unlock(&clockevents_lock);
   }
+ +EXPORT_SYMBOL_GPL(clockevents_register_device);
   
   /*
    * Noop handler when we shut down an event device
@@@ -253,4 -252,15 +254,15 @@@ void clockevents_notify(unsigned long r
         spin_unlock(&clockevents_lock);
   }
   EXPORT_SYMBOL_GPL(clockevents_notify);
+ 
+ ktime_t clockevents_get_next_event(int cpu)
+ {
+       struct tick_device *td;
+       struct clock_event_device *dev;
+ 
+       td = &per_cpu(tick_cpu_device, cpu);
+       dev = td->evtdev;
+ 
+       return dev->next_event;
+ }
   #endif
diff --combined kernel/timer.c

index faf2db897de4b5ef6b82ed81fcef40c6c4452aa9,3f841db5edf9df4b5e39ef290199a000eb3fc5b8..54d3912f8cadd497d09546ad8b9b1ba42369634d
--- 1/kernel/timer.c
--- 2/kernel/timer.c
+++ b/kernel/timer.c
@@@ -37,7 -37,7 +37,8 @@@
   #include <linux/delay.h>
   #include <linux/tick.h>
   #include <linux/kallsyms.h>
+ +#include <linux/perf_counter.h>
+ #include <linux/sched.h>
   
   #include <asm/uaccess.h>
   #include <asm/unistd.h>
@@@ -605,13 -605,12 +606,12 @@@ static struct tvec_base *lock_timer_bas
   }
   
   static inline int
- __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
+ __mod_timer(struct timer_list *timer, unsigned long expires,
+                                               bool pending_only, int pinned)
   {
         struct tvec_base *base, *new_base;
         unsigned long flags;
-       int ret;
- 
-       ret = 0;
+       int ret = 0 , cpu;
   
         timer_stats_timer_set_start_info(timer);
         BUG_ON(!timer->function);
@@@ -630,6 -629,18 +630,18 @@@
   
         new_base = __get_cpu_var(tvec_bases);
   
+       cpu = smp_processor_id();
+ 
+ #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
+       if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
+               int preferred_cpu = get_nohz_load_balancer();
+ 
+               if (preferred_cpu >= 0)
+                       cpu = preferred_cpu;
+       }
+ #endif
+       new_base = per_cpu(tvec_bases, cpu);
+ 
         if (base != new_base) {
                 /*
                  * We are trying to schedule the timer on the local CPU.
@@@ -669,7 -680,7 +681,7 @@@ out_unlock
    */
   int mod_timer_pending(struct timer_list *timer, unsigned long expires)
   {
-       return __mod_timer(timer, expires, true);
+       return __mod_timer(timer, expires, true, TIMER_NOT_PINNED);
   }
   EXPORT_SYMBOL(mod_timer_pending);
   
@@@ -703,10 -714,32 +715,32 @@@ int mod_timer(struct timer_list *timer
         if (timer->expires == expires && timer_pending(timer))
                 return 1;
   
-       return __mod_timer(timer, expires, false);
+       return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
   }
   EXPORT_SYMBOL(mod_timer);
   
+ /**
+  * mod_timer_pinned - modify a timer's timeout
+  * @timer: the timer to be modified
+  * @expires: new timeout in jiffies
+  *
+  * mod_timer_pinned() is a way to update the expire field of an
+  * active timer (if the timer is inactive it will be activated)
+  * and not allow the timer to be migrated to a different CPU.
+  *
+  * mod_timer_pinned(timer, expires) is equivalent to:
+  *
+  *     del_timer(timer); timer->expires = expires; add_timer(timer);
+  */
+ int mod_timer_pinned(struct timer_list *timer, unsigned long expires)
+ {
+       if (timer->expires == expires && timer_pending(timer))
+               return 1;
+ 
+       return __mod_timer(timer, expires, false, TIMER_PINNED);
+ }
+ EXPORT_SYMBOL(mod_timer_pinned);
+ 
   /**
    * add_timer - start a timer
    * @timer: the timer to be added
@@@ -757,7 -790,6 +791,7 @@@ void add_timer_on(struct timer_list *ti
         wake_up_idle_cpu(cpu);
         spin_unlock_irqrestore(&base->lock, flags);
   }
+ +EXPORT_SYMBOL_GPL(add_timer_on);
   
   /**
    * del_timer - deactivate a timer.
@@@ -1017,6 -1049,9 +1051,9 @@@ cascade
                 index = slot = timer_jiffies & TVN_MASK;
                 do {
                         list_for_each_entry(nte, varp->vec + slot, entry) {
+                               if (tbase_get_deferrable(nte->base))
+                                       continue;
+ 
                                 found = 1;
                                 if (time_before(nte->expires, expires))
                                         expires = nte->expires;
@@@ -1124,6 -1159,47 +1161,6 @@@ void update_process_times(int user_tick
         run_posix_cpu_timers(p);
   }
   
- -/*
- - * Nr of active tasks - counted in fixed-point numbers
- - */
- -static unsigned long count_active_tasks(void)
- -{
- -      return nr_active() * FIXED_1;
- -}
- -
- -/*
- - * Hmm.. Changed this, as the GNU make sources (load.c) seems to
- - * imply that avenrun[] is the standard name for this kind of thing.
- - * Nothing else seems to be standardized: the fractional size etc
- - * all seem to differ on different machines.
- - *
- - * Requires xtime_lock to access.
- - */
- -unsigned long avenrun[3];
- -
- -EXPORT_SYMBOL(avenrun);
- -
- -/*
- - * calc_load - given tick count, update the avenrun load estimates.
- - * This is called while holding a write_lock on xtime_lock.
- - */
- -static inline void calc_load(unsigned long ticks)
- -{
- -      unsigned long active_tasks; /* fixed-point */
- -      static int count = LOAD_FREQ;
- -
- -      count -= ticks;
- -      if (unlikely(count < 0)) {
- -              active_tasks = count_active_tasks();
- -              do {
- -                      CALC_LOAD(avenrun[0], EXP_1, active_tasks);
- -                      CALC_LOAD(avenrun[1], EXP_5, active_tasks);
- -                      CALC_LOAD(avenrun[2], EXP_15, active_tasks);
- -                      count += LOAD_FREQ;
- -              } while (count < 0);
- -      }
- -}
- -
   /*
    * This function runs timers and the timer-tq in bottom half context.
    */
@@@ -1131,8 -1207,6 +1168,8 @@@ static void run_timer_softirq(struct so
   {
         struct tvec_base *base = __get_cpu_var(tvec_bases);
   
+ +      perf_counter_do_pending();
+ +
         hrtimer_run_pending();
   
         if (time_after_eq(jiffies, base->timer_jiffies))
@@@ -1149,6 -1223,16 +1186,6 @@@ void run_local_timers(void
         softlockup_tick();
   }
   
- -/*
- - * Called by the timer interrupt. xtime_lock must already be taken
- - * by the timer IRQ!
- - */
- -static inline void update_times(unsigned long ticks)
- -{
- -      update_wall_time();
- -      calc_load(ticks);
- -}
- -
   /*
    * The 64-bit jiffies value is not atomic - you MUST NOT read it
    * without sampling the sequence number in xtime_lock.
@@@ -1158,8 -1242,7 +1195,8 @@@
   void do_timer(unsigned long ticks)
   {
         jiffies_64 += ticks;
- -      update_times(ticks);
+ +      update_wall_time();
+ +      calc_global_load();
   }
   
   #ifdef __ARCH_WANT_SYS_ALARM
@@@ -1307,7 -1390,7 +1344,7 @@@ signed long __sched schedule_timeout(si
         expire = timeout + jiffies;
   
         setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
-       __mod_timer(&timer, expire, false);
+       __mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
         schedule();
         del_singleshot_timer_sync(&timer);
   
@@@ -1360,17 -1443,37 +1397,17 @@@ int do_sysinfo(struct sysinfo *info
   {
         unsigned long mem_total, sav_total;
         unsigned int mem_unit, bitcount;
- -      unsigned long seq;
+ +      struct timespec tp;
   
         memset(info, 0, sizeof(struct sysinfo));
   
- -      do {
- -              struct timespec tp;
- -              seq = read_seqbegin(&xtime_lock);
- -
- -              /*
- -               * This is annoying.  The below is the same thing
- -               * posix_get_clock_monotonic() does, but it wants to
- -               * take the lock which we want to cover the loads stuff
- -               * too.
- -               */
- -
- -              getnstimeofday(&tp);
- -              tp.tv_sec += wall_to_monotonic.tv_sec;
- -              tp.tv_nsec += wall_to_monotonic.tv_nsec;
- -              monotonic_to_bootbased(&tp);
- -              if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
- -                      tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
- -                      tp.tv_sec++;
- -              }
- -              info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
+ +      ktime_get_ts(&tp);
+ +      monotonic_to_bootbased(&tp);
+ +      info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
   
- -              info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
- -              info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
- -              info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+ +      get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
   
- -              info->procs = nr_threads;
- -      } while (read_seqretry(&xtime_lock, seq));
+ +      info->procs = nr_threads;
   
         si_meminfo(info);
         si_swapinfo(info);
diff --combined kernel/trace/trace_sysprof.c

index e04b76cc238a69816cd96b57146afbc97cc36da1,d180554bc935ee245100f8cf67bfe6def2172f06..f6693969287d83e716733f0d21c6356f70ec8ff5
--- 1/kernel/trace/trace_sysprof.c
--- 2/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@@ -203,7 -203,8 +203,8 @@@ static void start_stack_timer(void *unu
         hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         hrtimer->function = stack_trace_timer_fn;
   
-       hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
+       hrtimer_start(hrtimer, ns_to_ktime(sample_period),
+                     HRTIMER_MODE_REL_PINNED);
   }
   
   static void start_stack_timers(void)
@@@ -321,7 -322,11 +322,7 @@@ static const struct file_operations sys
   
   void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
   {
- -      struct dentry *entry;
   
- -      entry = debugfs_create_file("sysprof_sample_period", 0644,
+ +      trace_create_file("sysprof_sample_period", 0644,
                         d_tracer, NULL, &sysprof_sample_fops);
- -      if (entry)
- -              return;
- -      pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n");
   }
author	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 15 Jun 2009 17:06:19 +0000 (10:06 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 15 Jun 2009 17:06:19 +0000 (10:06 -0700)
		1	2
arch/x86/kernel/apic/x2apic_uv_x.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sysctl.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/time/clockevents.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/timer.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/trace/trace_sysprof.c	patch \|	diff1 \|	diff2 \|	blob \| history