4 * Kernel scheduler and related syscalls
6 * Copyright (C) 1991-2002 Linus Torvalds
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin
22 #include <linux/module.h>
23 #include <linux/nmi.h>
24 #include <linux/init.h>
25 #include <linux/uaccess.h>
26 #include <linux/highmem.h>
27 #include <linux/smp_lock.h>
28 #include <asm/mmu_context.h>
29 #include <linux/interrupt.h>
30 #include <linux/capability.h>
31 #include <linux/completion.h>
32 #include <linux/kernel_stat.h>
33 #include <linux/debug_locks.h>
34 #include <linux/security.h>
35 #include <linux/notifier.h>
36 #include <linux/profile.h>
37 #include <linux/freezer.h>
38 #include <linux/vmalloc.h>
39 #include <linux/blkdev.h>
40 #include <linux/delay.h>
41 #include <linux/smp.h>
42 #include <linux/threads.h>
43 #include <linux/timer.h>
44 #include <linux/rcupdate.h>
45 #include <linux/cpu.h>
46 #include <linux/cpuset.h>
47 #include <linux/percpu.h>
48 #include <linux/kthread.h>
49 #include <linux/seq_file.h>
50 #include <linux/syscalls.h>
51 #include <linux/times.h>
52 #include <linux/tsacct_kern.h>
53 #include <linux/kprobes.h>
54 #include <linux/delayacct.h>
55 #include <linux/reciprocal_div.h>
56 #include <linux/unistd.h>
61 * Scheduler clock - returns current time in nanosec units.
62 * This is the default implementation.
63 * Architectures and sub-architectures can override this.
65 unsigned long long __attribute__((weak)) sched_clock(void)
67 return (unsigned long long)jiffies * (1000000000 / HZ);
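/*
 * Illustrative note (editorial, with an assumed HZ): with HZ == 1000 this
 * default advances in 1000000 ns (1 ms) steps, one per jiffy; with HZ == 250
 * the step is 4 ms. Architectures with a finer cycle counter are expected
 * to override this weak symbol with something higher-resolution.
 */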
71 * Convert user-nice values [ -20 ... 0 ... 19 ]
72 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
75 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
76 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
77 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
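/*
 * Worked example (editorial; assumes the usual MAX_RT_PRIO == 100 and
 * MAX_PRIO == 140 from sched.h):
 *
 *	NICE_TO_PRIO(-20) == 100	NICE_TO_PRIO(0) == 120
 *	NICE_TO_PRIO(+19) == 139	PRIO_TO_NICE(120) == 0
 *
 * so TASK_NICE() of a task with static_prio 120 is 0.
 */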
80 * 'User priority' is the nice value converted to something we
81 * can work with better when scaling various scheduler parameters;
82 * it's in the [ 0 ... 39 ] range.
84 #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
85 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
86 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
89 * Some helpers for converting nanosecond timing to jiffy resolution
91 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
92 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
94 #define NICE_0_LOAD SCHED_LOAD_SCALE
95 #define NICE_0_SHIFT SCHED_LOAD_SHIFT
98 * These are the 'tuning knobs' of the scheduler:
100 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
101 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
102 * Timeslices get refilled after they expire.
104 #define MIN_TIMESLICE max(5 * HZ / 1000, 1)
105 #define DEF_TIMESLICE (100 * HZ / 1000)
106 #define ON_RUNQUEUE_WEIGHT 30
107 #define CHILD_PENALTY 95
108 #define PARENT_PENALTY 100
109 #define EXIT_WEIGHT 3
110 #define PRIO_BONUS_RATIO 25
111 #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
112 #define INTERACTIVE_DELTA 2
113 #define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
114 #define STARVATION_LIMIT (MAX_SLEEP_AVG)
115 #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
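/*
 * Worked values (editorial sketch, assuming HZ == 1000 and the definitions
 * above): MIN_TIMESLICE == 5 jiffies (5 ms), DEF_TIMESLICE == 100 jiffies,
 * MAX_BONUS == 40 * 25 / 100 == 10, MAX_SLEEP_AVG == 100 * 10 == 1000
 * jiffies (1 s) and NS_MAX_SLEEP_AVG == 1000000000 ns.
 */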
118 * If a task is 'interactive' then we reinsert it in the active
119 * array after it has expired its current timeslice. (it will not
120 * continue to run immediately, it will still round-robin with
121 * other interactive tasks.)
123 * This part scales the interactivity limit depending on niceness.
125 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
126 * Here are a few examples of different nice levels:
128 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
129 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
130 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
131 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
132 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
134 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
135 * priority range a task can explore, a value of '1' means the
136 * task is rated interactive.)
138 * I.e. nice +19 tasks can never get 'interactive' enough to be
139 * reinserted into the active array, and only heavy CPU-hog nice -20
140 * tasks will be expired. Default nice 0 tasks are somewhere in between:
141 * it takes some effort for them to get interactive, but it's not
142 * too hard.
145 #define CURRENT_BONUS(p) \
146 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
147 MAX_SLEEP_AVG)
149 #define GRANULARITY (10 * HZ / 1000 ? : 1)
151 #ifdef CONFIG_SMP
152 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
153 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
154 num_online_cpus())
155 #else
156 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
157 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
158 #endif
160 #define SCALE(v1,v1_max,v2_max) \
161 (v1) * (v2_max) / (v1_max)
163 #define DELTA(p) \
164 (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
165 INTERACTIVE_DELTA)
167 #define TASK_INTERACTIVE(p) \
168 ((p)->prio <= (p)->static_prio - DELTA(p))
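/*
 * Worked example (editorial, assuming MAX_BONUS == 10 as above): for a
 * nice 0 task DELTA(p) == SCALE(20, 40, 10) - 5 + INTERACTIVE_DELTA
 * == 5 - 5 + 2 == 2, so TASK_INTERACTIVE() requires a dynamic-priority
 * bonus of at least 2 - the four leading 1s in the TASK_INTERACTIVE(0)
 * row above. For nice +19, DELTA(p) == 6, which exceeds the maximum
 * bonus of 5, so such tasks are never rated interactive.
 */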
170 #define INTERACTIVE_SLEEP(p) \
171 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
172 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
174 #define TASK_PREEMPTS_CURR(p, rq) \
175 ((p)->prio < (rq)->curr->prio)
177 #define SCALE_PRIO(x, prio) \
178 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
180 static unsigned int static_prio_timeslice(int static_prio)
182 if (static_prio < NICE_TO_PRIO(0))
183 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
185 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
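/*
 * Worked example (editorial, assuming HZ == 1000, MAX_PRIO == 140 and
 * MAX_USER_PRIO == 40): nice -20 (prio 100) takes the first branch and
 * gets max(400 * 40 / 20, 5) == 800 jiffies; nice 0 (prio 120) gets
 * max(100 * 20 / 20, 5) == 100 jiffies; nice +19 (prio 139) gets
 * max(100 * 1 / 20, 5) == 5 jiffies - the 800ms/100ms/5ms range quoted
 * in the task_timeslice() comment below.
 */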
190 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
191 * Since cpu_power is a 'constant', we can use a reciprocal divide.
193 static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
195 return reciprocal_divide(load, sg->reciprocal_cpu_power);
199 * Each time a sched group cpu_power is changed,
200 * we must compute its reciprocal value
202 static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
204 sg->__cpu_power += val;
205 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
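/*
 * Editorial sketch of the arithmetic (reciprocal_div.h semantics assumed):
 * reciprocal_value(x) precomputes roughly 2^32 / x, and reciprocal_divide()
 * is then a multiply plus a shift, e.g.:
 *
 *	sg->__cpu_power == 1024  ->  reciprocal_cpu_power ~= 4194304
 *	sg_div_cpu_power(sg, 2048) == (2048ULL * 4194304) >> 32 == 2
 *
 * i.e. group-power divisions on the balancing fast path avoid a real
 * hardware divide.
 */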
210 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
211 * to time slice values: [800ms ... 100ms ... 5ms]
213 * The higher a thread's priority, the bigger timeslices
214 * it gets during one round of execution. But even the lowest
215 * priority thread gets MIN_TIMESLICE worth of execution time.
218 static inline unsigned int task_timeslice(struct task_struct *p)
220 return static_prio_timeslice(p->static_prio);
223 static inline int rt_policy(int policy)
225 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
230 static inline int task_has_rt_policy(struct task_struct *p)
232 return rt_policy(p->policy);
236 * This is the priority-queue data structure of the RT scheduling class:
238 struct rt_prio_array {
239 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
240 struct list_head queue[MAX_RT_PRIO];
243 struct load_stat {
244 struct load_weight load;
245 u64 load_update_start, load_update_last;
246 unsigned long delta_fair, delta_exec, delta_stat;
249 /* CFS-related fields in a runqueue */
250 struct cfs_rq {
251 struct load_weight load;
252 unsigned long nr_running;
258 unsigned long wait_runtime_overruns, wait_runtime_underruns;
260 struct rb_root tasks_timeline;
261 struct rb_node *rb_leftmost;
262 struct rb_node *rb_load_balance_curr;
263 #ifdef CONFIG_FAIR_GROUP_SCHED
264 /* 'curr' points to currently running entity on this cfs_rq.
265 * It is set to NULL otherwise (i.e when none are currently running).
267 struct sched_entity *curr;
268 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
270 /* leaf cfs_rqs are those that hold tasks (the lowest schedulable entities
271 * in the hierarchy). Non-leaf cfs_rqs hold other, higher-level schedulable
272 * entities (like users, containers etc.)
274 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
275 * list is used during load balance.
277 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
281 /* Real-Time classes' related field in a runqueue: */
282 struct rt_rq {
283 struct rt_prio_array active;
284 int rt_load_balance_idx;
285 struct list_head *rt_load_balance_head, *rt_load_balance_curr;
289 * The prio-array type of the old scheduler:
291 struct prio_array {
292 unsigned int nr_active;
293 DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
294 struct list_head queue[MAX_PRIO];
298 * This is the main, per-CPU runqueue data structure.
300 * Locking rule: code that wants to lock multiple runqueues (such as
301 * the load balancing or the thread migration code) must acquire the
302 * locks in ascending &runqueue order.
304 struct rq {
305 spinlock_t lock; /* runqueue lock */
308 * nr_running and cpu_load should be in the same cacheline because
309 * remote CPUs use both these fields when doing load calculation.
311 unsigned long nr_running;
312 unsigned long raw_weighted_load;
313 #define CPU_LOAD_IDX_MAX 5
314 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
315 unsigned char idle_at_tick;
317 unsigned char in_nohz_recently;
319 struct load_stat ls; /* capture load from *all* tasks on this cpu */
320 unsigned long nr_load_updates;
324 #ifdef CONFIG_FAIR_GROUP_SCHED
325 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
330 * This is part of a global counter where only the total sum
331 * over all CPUs matters. A task can increase this counter on
332 * one CPU and if it got migrated afterwards it may decrease
333 * it on another CPU. Always updated under the runqueue lock:
335 unsigned long nr_uninterruptible;
337 unsigned long expired_timestamp;
338 unsigned long long most_recent_timestamp;
340 struct task_struct *curr, *idle;
341 unsigned long next_balance;
342 struct mm_struct *prev_mm;
344 struct prio_array *active, *expired, arrays[2];
345 int best_expired_prio;
347 u64 clock, prev_clock_raw;
350 unsigned int clock_warps, clock_overflows;
351 unsigned int clock_unstable_events;
353 struct sched_class *load_balance_class;
358 struct sched_domain *sd;
360 /* For active balancing */
363 int cpu; /* cpu of this runqueue */
365 struct task_struct *migration_thread;
366 struct list_head migration_queue;
369 #ifdef CONFIG_SCHEDSTATS
371 struct sched_info rq_sched_info;
373 /* sys_sched_yield() stats */
374 unsigned long yld_exp_empty;
375 unsigned long yld_act_empty;
376 unsigned long yld_both_empty;
377 unsigned long yld_cnt;
379 /* schedule() stats */
380 unsigned long sched_switch;
381 unsigned long sched_cnt;
382 unsigned long sched_goidle;
384 /* try_to_wake_up() stats */
385 unsigned long ttwu_cnt;
386 unsigned long ttwu_local;
388 struct lock_class_key rq_lock_key;
391 static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
392 static DEFINE_MUTEX(sched_hotcpu_mutex);
394 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
396 rq->curr->sched_class->check_preempt_curr(rq, p);
399 static inline int cpu_of(struct rq *rq)
409 * Per-runqueue clock, as fine-grained as the platform can give us:
411 static unsigned long long __rq_clock(struct rq *rq)
413 u64 prev_raw = rq->prev_clock_raw;
414 u64 now = sched_clock();
415 s64 delta = now - prev_raw;
416 u64 clock = rq->clock;
419 * Protect against sched_clock() occasionally going backwards:
421 if (unlikely(delta < 0)) {
426 * Catch too large forward jumps too:
428 if (unlikely(delta > 2*TICK_NSEC)) {
430 rq->clock_overflows++;
432 if (unlikely(delta > rq->clock_max_delta))
433 rq->clock_max_delta = delta;
438 rq->prev_clock_raw = now;
444 static inline unsigned long long rq_clock(struct rq *rq)
446 int this_cpu = smp_processor_id();
448 if (this_cpu == cpu_of(rq))
449 return __rq_clock(rq);
455 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
456 * See detach_destroy_domains: synchronize_sched for details.
458 * The domain tree of any CPU may only be accessed from within
459 * preempt-disabled sections.
461 #define for_each_domain(cpu, __sd) \
462 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
464 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
465 #define this_rq() (&__get_cpu_var(runqueues))
466 #define task_rq(p) cpu_rq(task_cpu(p))
467 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
469 #ifdef CONFIG_FAIR_GROUP_SCHED
470 /* Change a task's ->cfs_rq if it moves across CPUs */
471 static inline void set_task_cfs_rq(struct task_struct *p)
473 p->se.cfs_rq = &task_rq(p)->cfs;
476 static inline void set_task_cfs_rq(struct task_struct *p)
481 #ifndef prepare_arch_switch
482 # define prepare_arch_switch(next) do { } while (0)
484 #ifndef finish_arch_switch
485 # define finish_arch_switch(prev) do { } while (0)
488 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
489 static inline int task_running(struct rq *rq, struct task_struct *p)
491 return rq->curr == p;
494 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
498 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
500 #ifdef CONFIG_DEBUG_SPINLOCK
501 /* this is a valid case when another task releases the spinlock */
502 rq->lock.owner = current;
505 * If we are tracking spinlock dependencies then we have to
506 * fix up the runqueue lock - which gets 'carried over' from
507 * prev into current:
509 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
511 spin_unlock_irq(&rq->lock);
514 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
515 static inline int task_running(struct rq *rq, struct task_struct *p)
520 return rq->curr == p;
524 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
528 * We can optimise this out completely for !SMP, because the
529 * SMP rebalancing from interrupt is the only thing that cares
534 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
535 spin_unlock_irq(&rq->lock);
537 spin_unlock(&rq->lock);
541 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
545 * After ->oncpu is cleared, the task can be moved to a different CPU.
546 * We must ensure this doesn't happen until the switch is completely
547 * finished.
552 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
556 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
559 * __task_rq_lock - lock the runqueue a given task resides on.
560 * Must be called with interrupts disabled.
562 static inline struct rq *__task_rq_lock(struct task_struct *p)
569 spin_lock(&rq->lock);
570 if (unlikely(rq != task_rq(p))) {
571 spin_unlock(&rq->lock);
572 goto repeat_lock_task;
578 * task_rq_lock - lock the runqueue a given task resides on and disable
579 * interrupts. Note the ordering: we can safely lookup the task_rq without
580 * explicitly disabling preemption.
582 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
588 local_irq_save(*flags);
590 spin_lock(&rq->lock);
591 if (unlikely(rq != task_rq(p))) {
592 spin_unlock_irqrestore(&rq->lock, *flags);
593 goto repeat_lock_task;
598 static inline void __task_rq_unlock(struct rq *rq)
601 spin_unlock(&rq->lock);
604 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
607 spin_unlock_irqrestore(&rq->lock, *flags);
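/*
 * Typical usage (editorial sketch, not from the original source):
 *
 *	unsigned long flags;
 *	struct rq *rq = task_rq_lock(p, &flags);
 *	... examine or modify p's scheduling state ...
 *	task_rq_unlock(rq, &flags);
 *
 * The repeat_lock_task loop above is needed because p can migrate to
 * another CPU between the task_rq(p) lookup and taking the lock;
 * re-checking task_rq(p) under the lock closes that race.
 */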
611 * this_rq_lock - lock this runqueue and disable interrupts.
613 static inline struct rq *this_rq_lock(void)
620 spin_lock(&rq->lock);
626 * CPU frequency is/was unstable - start anew by setting prev_clock_raw:
628 void sched_clock_unstable_event(void)
633 rq = task_rq_lock(current, &flags);
634 rq->prev_clock_raw = sched_clock();
635 rq->clock_unstable_events++;
636 task_rq_unlock(rq, &flags);
640 * resched_task - mark a task 'to be rescheduled now'.
642 * On UP this means the setting of the need_resched flag, on SMP it
643 * might also involve a cross-CPU call to trigger the scheduler on
644 * the target CPU.
648 #ifndef tsk_is_polling
649 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
652 static void resched_task(struct task_struct *p)
656 assert_spin_locked(&task_rq(p)->lock);
658 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
661 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
664 if (cpu == smp_processor_id())
667 /* NEED_RESCHED must be visible before we test polling */
669 if (!tsk_is_polling(p))
670 smp_send_reschedule(cpu);
673 static void resched_cpu(int cpu)
675 struct rq *rq = cpu_rq(cpu);
678 if (!spin_trylock_irqsave(&rq->lock, flags))
680 resched_task(cpu_curr(cpu));
681 spin_unlock_irqrestore(&rq->lock, flags);
684 static inline void resched_task(struct task_struct *p)
686 assert_spin_locked(&task_rq(p)->lock);
687 set_tsk_need_resched(p);
691 static u64 div64_likely32(u64 dividend, unsigned long divisor)
693 #if BITS_PER_LONG == 32
694 if (likely(dividend <= 0xffffffffULL))
695 return (u32)dividend / divisor;
696 do_div(dividend, divisor);
698 return dividend;
699 #else
700 return dividend / divisor;
704 #if BITS_PER_LONG == 32
705 # define WMULT_CONST (~0UL)
707 # define WMULT_CONST (1UL << 32)
710 #define WMULT_SHIFT 32
712 static inline unsigned long
713 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
714 struct load_weight *lw)
718 if (unlikely(!lw->inv_weight))
719 lw->inv_weight = WMULT_CONST / lw->weight;
721 tmp = (u64)delta_exec * weight;
723 * Check whether we'd overflow the 64-bit multiplication:
725 if (unlikely(tmp > WMULT_CONST)) {
726 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
727 >> (WMULT_SHIFT/2);
728 } else
729 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
732 return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
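/*
 * Worked example (editorial, assuming NICE_0_LOAD == 1024 and a runqueue
 * load weight of 336, roughly one nice +5 task): inv_weight ~= 2^32 / 336
 * ~= 12782640, so calc_delta_fair(1000000, lw) returns about
 * (1000000 * 1024 * 12782640) >> 32 ~= 3048000 ns - the fair clock runs
 * about 1024/336 ~= 3 times faster than wall-clock under that load
 * (subject to the sysctl_sched_runtime_limit clamp).
 */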
735 static inline unsigned long
736 calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
738 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
741 static void update_load_add(struct load_weight *lw, unsigned long inc)
747 static void update_load_sub(struct load_weight *lw, unsigned long dec)
753 static void __update_curr_load(struct rq *rq, struct load_stat *ls)
755 if (rq->curr != rq->idle && ls->load.weight) {
756 ls->delta_exec += ls->delta_stat;
757 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
763 * Update delta_exec, delta_fair fields for rq.
765 * delta_fair clock advances at a rate inversely proportional to
766 * total load (rq->ls.load.weight) on the runqueue, while
767 * delta_exec advances at the same rate as wall-clock (provided
770 * delta_exec / delta_fair is a measure of the (smoothed) load on this
771 * runqueue over any given interval. This (smoothed) load is used
772 * during load balance.
774 * This function is called /before/ updating rq->ls.load
775 * and when switching tasks.
777 static void update_curr_load(struct rq *rq, u64 now)
779 struct load_stat *ls = &rq->ls;
782 start = ls->load_update_start;
783 ls->load_update_start = now;
784 ls->delta_stat += now - start;
786 * Stagger updates to ls->delta_fair. Very frequent updates
789 if (ls->delta_stat >= sysctl_sched_stat_granularity)
790 __update_curr_load(rq, ls);
794 * To aid in avoiding the subversion of "niceness" due to uneven distribution
795 * of tasks with abnormal "nice" values across CPUs the contribution that
796 * each task makes to its run queue's load is weighted according to its
797 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
798 * scaled version of the new time slice allocation that they receive on time
799 * slice expiry etc.
803 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
804 * If static_prio_timeslice() is ever changed to break this assumption then
805 * this code will need modification
807 #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
808 #define load_weight(lp) \
809 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
810 #define PRIO_TO_LOAD_WEIGHT(prio) \
811 load_weight(static_prio_timeslice(prio))
812 #define RTPRIO_TO_LOAD_WEIGHT(rp) \
813 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
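/*
 * Worked example (editorial, assuming HZ == 1000 so that
 * static_prio_timeslice(NICE_TO_PRIO(0)) == 100): load_weight(100) ==
 * 100 * SCHED_LOAD_SCALE / 100 == SCHED_LOAD_SCALE, i.e. a nice 0 task
 * contributes exactly one unit of load, while RT tasks weigh in above
 * even the largest nice -20 weight.
 */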
815 #define WEIGHT_IDLEPRIO 2
816 #define WMULT_IDLEPRIO (1 << 31)
819 * Nice levels are multiplicative, with a gentle 10% change for every
820 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
821 * nice 1, it will get ~10% less CPU time than another CPU-bound task
822 * that remained on nice 0.
824 * The "10% effect" is relative and cumulative: from _any_ nice level,
825 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
826 * it's +10% CPU usage.
828 static const int prio_to_weight[40] = {
829 /* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
830 /* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280,
831 /* 0 */ NICE_0_LOAD /* 1024 */,
832 /* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137,
833 /* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
836 static const u32 prio_to_wmult[40] = {
837 48356, 60446, 75558, 94446, 118058, 147573,
838 184467, 230589, 288233, 360285, 450347,
839 562979, 703746, 879575, 1099582, 1374389,
840 1717986, 2147483, 2684354, 3355443, 4194304,
841 5244160, 6557201, 8196502, 10250518, 12782640,
842 16025997, 19976592, 24970740, 31350126, 39045157,
843 49367440, 61356675, 76695844, 95443717, 119304647,
844 148102320, 186737708, 238609294, 286331153,
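/*
 * Editorial worked example of the "10% effect" above: a nice 0 task
 * (weight 1024) and a nice +1 task (weight 819) sharing one CPU get about
 * 1024/1843 ~= 55% and 819/1843 ~= 45% of it respectively. prio_to_wmult[]
 * simply caches 2^32 / prio_to_weight[] so that calc_delta_mine() can use
 * a multiply and shift instead of a 64-bit division.
 */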
848 inc_load(struct rq *rq, const struct task_struct *p, u64 now)
850 update_curr_load(rq, now);
851 update_load_add(&rq->ls.load, p->se.load.weight);
855 dec_load(struct rq *rq, const struct task_struct *p, u64 now)
857 update_curr_load(rq, now);
858 update_load_sub(&rq->ls.load, p->se.load.weight);
861 static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
864 inc_load(rq, p, now);
867 static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
870 dec_load(rq, p, now);
873 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
876 * runqueue iterator, to support SMP load-balancing between different
877 * scheduling classes, without having to expose their internal data
878 * structures to the load-balancing proper:
880 struct rq_iterator {
881 void *arg;
882 struct task_struct *(*start)(void *);
883 struct task_struct *(*next)(void *);
886 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
887 unsigned long max_nr_move, unsigned long max_load_move,
888 struct sched_domain *sd, enum cpu_idle_type idle,
889 int *all_pinned, unsigned long *load_moved,
890 int this_best_prio, int best_prio, int best_prio_seen,
891 struct rq_iterator *iterator);
893 #include "sched_stats.h"
894 #include "sched_rt.c"
895 #include "sched_fair.c"
896 #include "sched_idletask.c"
897 #ifdef CONFIG_SCHED_DEBUG
898 # include "sched_debug.c"
901 #define sched_class_highest (&rt_sched_class)
903 static void set_load_weight(struct task_struct *p)
905 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
906 p->se.wait_runtime = 0;
908 if (task_has_rt_policy(p)) {
909 p->se.load.weight = prio_to_weight[0] * 2;
910 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
915 * SCHED_IDLE tasks get minimal weight:
917 if (p->policy == SCHED_IDLE) {
918 p->se.load.weight = WEIGHT_IDLEPRIO;
919 p->se.load.inv_weight = WMULT_IDLEPRIO;
923 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
924 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
928 enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
930 sched_info_queued(p);
931 p->sched_class->enqueue_task(rq, p, wakeup, now);
936 dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
938 p->sched_class->dequeue_task(rq, p, sleep, now);
943 * __normal_prio - return the priority that is based on the static prio
945 static inline int __normal_prio(struct task_struct *p)
947 return p->static_prio;
951 * Calculate the expected normal priority: i.e. priority
952 * without taking RT-inheritance into account. Might be
953 * boosted by interactivity modifiers. Changes upon fork,
954 * setprio syscalls, and whenever the interactivity
955 * estimator recalculates.
957 static inline int normal_prio(struct task_struct *p)
961 if (task_has_rt_policy(p))
962 prio = MAX_RT_PRIO-1 - p->rt_priority;
964 prio = __normal_prio(p);
969 * Calculate the current priority, i.e. the priority
970 * taken into account by the scheduler. This value might
971 * be boosted by RT tasks, or might be boosted by
972 * interactivity modifiers. Will be RT if the task got
973 * RT-boosted. If not then it returns p->normal_prio.
975 static int effective_prio(struct task_struct *p)
977 p->normal_prio = normal_prio(p);
979 * If we are RT tasks or we were boosted to RT priority,
980 * keep the priority unchanged. Otherwise, update priority
981 * to the normal priority:
983 if (!rt_prio(p->prio))
984 return p->normal_prio;
989 * activate_task - move a task to the runqueue.
991 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
993 u64 now = rq_clock(rq);
995 if (p->state == TASK_UNINTERRUPTIBLE)
996 rq->nr_uninterruptible--;
998 enqueue_task(rq, p, wakeup, now);
999 inc_nr_running(p, rq, now);
1003 * activate_idle_task - move idle task to the _front_ of runqueue.
1005 static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
1007 u64 now = rq_clock(rq);
1009 if (p->state == TASK_UNINTERRUPTIBLE)
1010 rq->nr_uninterruptible--;
1012 enqueue_task(rq, p, 0, now);
1013 inc_nr_running(p, rq, now);
1017 * deactivate_task - remove a task from the runqueue.
1019 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1021 u64 now = rq_clock(rq);
1023 if (p->state == TASK_UNINTERRUPTIBLE)
1024 rq->nr_uninterruptible++;
1026 dequeue_task(rq, p, sleep, now);
1027 dec_nr_running(p, rq, now);
1031 * task_curr - is this task currently executing on a CPU?
1032 * @p: the task in question.
1034 inline int task_curr(const struct task_struct *p)
1036 return cpu_curr(task_cpu(p)) == p;
1039 /* Used instead of source_load when we know the type == 0 */
1040 unsigned long weighted_cpuload(const int cpu)
1042 return cpu_rq(cpu)->ls.load.weight;
1045 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1048 task_thread_info(p)->cpu = cpu;
1055 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1057 int old_cpu = task_cpu(p);
1058 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
1059 u64 clock_offset, fair_clock_offset;
1061 clock_offset = old_rq->clock - new_rq->clock;
1062 fair_clock_offset = old_rq->cfs.fair_clock -
1063 new_rq->cfs.fair_clock;
1064 if (p->se.wait_start)
1065 p->se.wait_start -= clock_offset;
1066 if (p->se.wait_start_fair)
1067 p->se.wait_start_fair -= fair_clock_offset;
1068 if (p->se.sleep_start)
1069 p->se.sleep_start -= clock_offset;
1070 if (p->se.block_start)
1071 p->se.block_start -= clock_offset;
1072 if (p->se.sleep_start_fair)
1073 p->se.sleep_start_fair -= fair_clock_offset;
1075 __set_task_cpu(p, new_cpu);
1078 struct migration_req {
1079 struct list_head list;
1081 struct task_struct *task;
1084 struct completion done;
1088 * The task's runqueue lock must be held.
1089 * Returns true if you have to wait for migration thread.
1092 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1094 struct rq *rq = task_rq(p);
1097 * If the task is not on a runqueue (and not running), then
1098 * it is sufficient to simply update the task's cpu field.
1100 if (!p->se.on_rq && !task_running(rq, p)) {
1101 set_task_cpu(p, dest_cpu);
1105 init_completion(&req->done);
1107 req->dest_cpu = dest_cpu;
1108 list_add(&req->list, &rq->migration_queue);
1114 * wait_task_inactive - wait for a thread to unschedule.
1116 * The caller must ensure that the task *will* unschedule sometime soon,
1117 * else this function might spin for a *long* time. This function can't
1118 * be called with interrupts off, or it may introduce deadlock with
1119 * smp_call_function() if an IPI is sent by the same process we are
1120 * waiting to become inactive.
1122 void wait_task_inactive(struct task_struct *p)
1124 unsigned long flags;
1130 * We do the initial early heuristics without holding
1131 * any task-queue locks at all. We'll only try to get
1132 * the runqueue lock when things look like they will
1138 * If the task is actively running on another CPU
1139 * still, just relax and busy-wait without holding
1142 * NOTE! Since we don't hold any locks, it's not
1143 * even sure that "rq" stays as the right runqueue!
1144 * But we don't care, since "task_running()" will
1145 * return false if the runqueue has changed and p
1146 * is actually now running somewhere else!
1148 while (task_running(rq, p))
1152 * Ok, time to look more closely! We need the rq
1153 * lock now, to be *sure*. If we're wrong, we'll
1154 * just go back and repeat.
1156 rq = task_rq_lock(p, &flags);
1157 running = task_running(rq, p);
1158 on_rq = p->se.on_rq;
1159 task_rq_unlock(rq, &flags);
1162 * Was it really running after all now that we
1163 * checked with the proper locks actually held?
1165 * Oops. Go back and try again..
1167 if (unlikely(running)) {
1173 * It's not enough that it's not actively running,
1174 * it must be off the runqueue _entirely_, and not
1177 * So if it was still runnable (but just not actively
1178 * running right now), it's preempted, and we should
1179 * yield - it could be a while.
1181 if (unlikely(on_rq)) {
1187 * Ahh, all good. It wasn't running, and it wasn't
1188 * runnable, which means that it will never become
1189 * running in the future either. We're all done!
1194 * kick_process - kick a running thread to enter/exit the kernel
1195 * @p: the to-be-kicked thread
1197 * Cause a process which is running on another CPU to enter
1198 * kernel-mode, without any delay. (to get signals handled.)
1200 * NOTE: this function doesn't have to take the runqueue lock,
1201 * because all it wants to ensure is that the remote task enters
1202 * the kernel. If the IPI races and the task has been migrated
1203 * to another CPU then no harm is done and the purpose has been
1204 * achieved as well.
1206 void kick_process(struct task_struct *p)
1212 if ((cpu != smp_processor_id()) && task_curr(p))
1213 smp_send_reschedule(cpu);
1218 * Return a low guess at the load of a migration-source cpu weighted
1219 * according to the scheduling class and "nice" value.
1221 * We want to under-estimate the load of migration sources, to
1222 * balance conservatively.
1224 static inline unsigned long source_load(int cpu, int type)
1226 struct rq *rq = cpu_rq(cpu);
1227 unsigned long total = weighted_cpuload(cpu);
1232 return min(rq->cpu_load[type-1], total);
1236 * Return a high guess at the load of a migration-target cpu weighted
1237 * according to the scheduling class and "nice" value.
1239 static inline unsigned long target_load(int cpu, int type)
1241 struct rq *rq = cpu_rq(cpu);
1242 unsigned long total = weighted_cpuload(cpu);
1247 return max(rq->cpu_load[type-1], total);
1251 * Return the average load per task on the cpu's run queue
1253 static inline unsigned long cpu_avg_load_per_task(int cpu)
1255 struct rq *rq = cpu_rq(cpu);
1256 unsigned long total = weighted_cpuload(cpu);
1257 unsigned long n = rq->nr_running;
1259 return n ? total / n : SCHED_LOAD_SCALE;
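/*
 * Editorial example of the source/target asymmetry: if cpu_load[1] == 2048
 * but the instantaneous weighted_cpuload() is 1024 (a task just left),
 * source_load(cpu, 1) returns 1024 while target_load(cpu, 1) returns 2048,
 * so the CPU looks lightly loaded as a pull source and heavily loaded as a
 * push target - both estimates err on the side of not moving tasks.
 */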
1263 * find_idlest_group finds and returns the least busy CPU group within the
1264 * domain.
1266 static struct sched_group *
1267 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1269 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1270 unsigned long min_load = ULONG_MAX, this_load = 0;
1271 int load_idx = sd->forkexec_idx;
1272 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1275 unsigned long load, avg_load;
1279 /* Skip over this group if it has no CPUs allowed */
1280 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1283 local_group = cpu_isset(this_cpu, group->cpumask);
1285 /* Tally up the load of all CPUs in the group */
1288 for_each_cpu_mask(i, group->cpumask) {
1289 /* Bias balancing toward cpus of our domain */
1291 load = source_load(i, load_idx);
1293 load = target_load(i, load_idx);
1298 /* Adjust by relative CPU power of the group */
1299 avg_load = sg_div_cpu_power(group,
1300 avg_load * SCHED_LOAD_SCALE);
1303 this_load = avg_load;
1305 } else if (avg_load < min_load) {
1306 min_load = avg_load;
1310 group = group->next;
1311 } while (group != sd->groups);
1313 if (!idlest || 100*this_load < imbalance*min_load)
1319 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1322 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1325 unsigned long load, min_load = ULONG_MAX;
1329 /* Traverse only the allowed CPUs */
1330 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1332 for_each_cpu_mask(i, tmp) {
1333 load = weighted_cpuload(i);
1335 if (load < min_load || (load == min_load && i == this_cpu)) {
1345 * sched_balance_self: balance the current task (running on cpu) in domains
1346 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1347 * SD_BALANCE_EXEC.
1349 * Balance, i.e. select the least loaded group.
1351 * Returns the target CPU number, or the same CPU if no balancing is needed.
1353 * preempt must be disabled.
1355 static int sched_balance_self(int cpu, int flag)
1357 struct task_struct *t = current;
1358 struct sched_domain *tmp, *sd = NULL;
1360 for_each_domain(cpu, tmp) {
1362 * If power savings logic is enabled for a domain, stop there.
1364 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1366 if (tmp->flags & flag)
1372 struct sched_group *group;
1373 int new_cpu, weight;
1375 if (!(sd->flags & flag)) {
1381 group = find_idlest_group(sd, t, cpu);
1387 new_cpu = find_idlest_cpu(group, t, cpu);
1388 if (new_cpu == -1 || new_cpu == cpu) {
1389 /* Now try balancing at a lower domain level of cpu */
1394 /* Now try balancing at a lower domain level of new_cpu */
1397 weight = cpus_weight(span);
1398 for_each_domain(cpu, tmp) {
1399 if (weight <= cpus_weight(tmp->span))
1401 if (tmp->flags & flag)
1404 /* while loop will break here if sd == NULL */
1410 #endif /* CONFIG_SMP */
1413 * wake_idle() will wake a task on an idle cpu if task->cpu is
1414 * not idle and an idle cpu is available. The span of cpus to
1415 * search starts with cpus closest then further out as needed,
1416 * so we always favor a closer, idle cpu.
1418 * Returns the CPU we should wake onto.
1420 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1421 static int wake_idle(int cpu, struct task_struct *p)
1424 struct sched_domain *sd;
1428 * If it is idle, then it is the best cpu to run this task.
1430 * This cpu is also the best, if it has more than one task already.
1431 * Siblings must also be busy (in most cases) as they didn't already
1432 * pick up the extra load from this cpu, hence we need not check
1433 * sibling runqueue info. This avoids the checks and the cache miss
1434 * penalties associated with them.
1436 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
1439 for_each_domain(cpu, sd) {
1440 if (sd->flags & SD_WAKE_IDLE) {
1441 cpus_and(tmp, sd->span, p->cpus_allowed);
1442 for_each_cpu_mask(i, tmp) {
1453 static inline int wake_idle(int cpu, struct task_struct *p)
1460 * try_to_wake_up - wake up a thread
1461 * @p: the to-be-woken-up thread
1462 * @state: the mask of task states that can be woken
1463 * @sync: do a synchronous wakeup?
1465 * Put it on the run-queue if it's not already there. The "current"
1466 * thread is always on the run-queue (except when the actual
1467 * re-schedule is in progress), and as such you're allowed to do
1468 * the simpler "current->state = TASK_RUNNING" to mark yourself
1469 * runnable without the overhead of this.
1471 * returns failure only if the task is already active.
1473 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1475 int cpu, this_cpu, success = 0;
1476 unsigned long flags;
1480 struct sched_domain *sd, *this_sd = NULL;
1481 unsigned long load, this_load;
1485 rq = task_rq_lock(p, &flags);
1486 old_state = p->state;
1487 if (!(old_state & state))
1494 this_cpu = smp_processor_id();
1497 if (unlikely(task_running(rq, p)))
1502 schedstat_inc(rq, ttwu_cnt);
1503 if (cpu == this_cpu) {
1504 schedstat_inc(rq, ttwu_local);
1508 for_each_domain(this_cpu, sd) {
1509 if (cpu_isset(cpu, sd->span)) {
1510 schedstat_inc(sd, ttwu_wake_remote);
1516 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1520 * Check for affine wakeup and passive balancing possibilities.
1523 int idx = this_sd->wake_idx;
1524 unsigned int imbalance;
1526 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1528 load = source_load(cpu, idx);
1529 this_load = target_load(this_cpu, idx);
1531 new_cpu = this_cpu; /* Wake to this CPU if we can */
1533 if (this_sd->flags & SD_WAKE_AFFINE) {
1534 unsigned long tl = this_load;
1535 unsigned long tl_per_task;
1537 tl_per_task = cpu_avg_load_per_task(this_cpu);
1540 * If sync wakeup then subtract the (maximum possible)
1541 * effect of the currently running task from the load
1542 * of the current CPU:
1545 tl -= current->se.load.weight;
1548 tl + target_load(cpu, idx) <= tl_per_task) ||
1549 100*(tl + p->se.load.weight) <= imbalance*load) {
1551 * This domain has SD_WAKE_AFFINE and
1552 * p is cache cold in this domain, and
1553 * there is no bad imbalance.
1555 schedstat_inc(this_sd, ttwu_move_affine);
1561 * Start passive balancing when half the imbalance_pct
1564 if (this_sd->flags & SD_WAKE_BALANCE) {
1565 if (imbalance*this_load <= 100*load) {
1566 schedstat_inc(this_sd, ttwu_move_balance);
1572 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1574 new_cpu = wake_idle(new_cpu, p);
1575 if (new_cpu != cpu) {
1576 set_task_cpu(p, new_cpu);
1577 task_rq_unlock(rq, &flags);
1578 /* might preempt at this point */
1579 rq = task_rq_lock(p, &flags);
1580 old_state = p->state;
1581 if (!(old_state & state))
1586 this_cpu = smp_processor_id();
1591 #endif /* CONFIG_SMP */
1592 activate_task(rq, p, 1);
1594 * Sync wakeups (i.e. those types of wakeups where the waker
1595 * has indicated that it will leave the CPU in short order)
1596 * don't trigger a preemption, if the woken up task will run on
1597 * this cpu. (in this case the 'I will reschedule' promise of
1598 * the waker guarantees that the freshly woken up task is going
1599 * to be considered on this CPU.)
1601 if (!sync || cpu != this_cpu)
1602 check_preempt_curr(rq, p);
1606 p->state = TASK_RUNNING;
1608 task_rq_unlock(rq, &flags);
1613 int fastcall wake_up_process(struct task_struct *p)
1615 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1616 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1618 EXPORT_SYMBOL(wake_up_process);
1620 int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1622 return try_to_wake_up(p, state, 0);
1626 * Perform scheduler related setup for a newly forked process p.
1627 * p is forked by current.
1629 * __sched_fork() is basic setup used by init_idle() too:
1631 static void __sched_fork(struct task_struct *p)
1633 p->se.wait_start_fair = 0;
1634 p->se.wait_start = 0;
1635 p->se.exec_start = 0;
1636 p->se.sum_exec_runtime = 0;
1637 p->se.delta_exec = 0;
1638 p->se.delta_fair_run = 0;
1639 p->se.delta_fair_sleep = 0;
1640 p->se.wait_runtime = 0;
1641 p->se.sum_wait_runtime = 0;
1642 p->se.sum_sleep_runtime = 0;
1643 p->se.sleep_start = 0;
1644 p->se.sleep_start_fair = 0;
1645 p->se.block_start = 0;
1646 p->se.sleep_max = 0;
1647 p->se.block_max = 0;
1650 p->se.wait_runtime_overruns = 0;
1651 p->se.wait_runtime_underruns = 0;
1653 INIT_LIST_HEAD(&p->run_list);
1657 * We mark the process as running here, but have not actually
1658 * inserted it onto the runqueue yet. This guarantees that
1659 * nobody will actually run it, and a signal or other external
1660 * event cannot wake it up and insert it on the runqueue either.
1662 p->state = TASK_RUNNING;
1666 * fork()/clone()-time setup:
1668 void sched_fork(struct task_struct *p, int clone_flags)
1670 int cpu = get_cpu();
1675 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1677 __set_task_cpu(p, cpu);
1680 * Make sure we do not leak PI boosting priority to the child:
1682 p->prio = current->normal_prio;
1684 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1685 if (likely(sched_info_on()))
1686 memset(&p->sched_info, 0, sizeof(p->sched_info));
1688 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1691 #ifdef CONFIG_PREEMPT
1692 /* Want to start with kernel preemption disabled. */
1693 task_thread_info(p)->preempt_count = 1;
1699 * After fork, the child runs first (default). If set to 0 then the
1700 * parent will (try to) run first.
1702 unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
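/*
 * Editorial usage note (assumes the knob is wired into the sysctl table,
 * as on typical configs): it can then be flipped at runtime, e.g.
 *
 *	echo 0 > /proc/sys/kernel/sched_child_runs_first
 *
 * to let the parent (try to) run before the freshly forked child.
 */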
1705 * wake_up_new_task - wake up a newly created task for the first time.
1707 * This function will do some initial scheduler statistics housekeeping
1708 * that must be done for every newly created context, then puts the task
1709 * on the runqueue and wakes it.
1711 void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1713 unsigned long flags;
1717 rq = task_rq_lock(p, &flags);
1718 BUG_ON(p->state != TASK_RUNNING);
1719 this_cpu = smp_processor_id(); /* parent's CPU */
1721 p->prio = effective_prio(p);
1723 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1724 task_cpu(p) != this_cpu || !current->se.on_rq) {
1725 activate_task(rq, p, 0);
1728 * Let the scheduling class do new task startup
1729 * management (if any):
1731 p->sched_class->task_new(rq, p);
1733 check_preempt_curr(rq, p);
1734 task_rq_unlock(rq, &flags);
1738 * prepare_task_switch - prepare to switch tasks
1739 * @rq: the runqueue preparing to switch
1740 * @next: the task we are going to switch to.
1742 * This is called with the rq lock held and interrupts off. It must
1743 * be paired with a subsequent finish_task_switch after the context
1746 * prepare_task_switch sets up locking and calls architecture specific
1749 static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
1751 prepare_lock_switch(rq, next);
1752 prepare_arch_switch(next);
1756 * finish_task_switch - clean up after a task-switch
1757 * @rq: runqueue associated with task-switch
1758 * @prev: the thread we just switched away from.
1760 * finish_task_switch must be called after the context switch, paired
1761 * with a prepare_task_switch call before the context switch.
1762 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1763 * and do any other architecture-specific cleanup actions.
1765 * Note that we may have delayed dropping an mm in context_switch(). If
1766 * so, we finish that here outside of the runqueue lock. (Doing it
1767 * with the lock held can cause deadlocks; see schedule() for
1768 * details.)
1770 static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1771 __releases(rq->lock)
1773 struct mm_struct *mm = rq->prev_mm;
1779 * A task struct has one reference for the use as "current".
1780 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
1781 * schedule one last time. The schedule call will never return, and
1782 * the scheduled task must drop that reference.
1783 * The test for TASK_DEAD must occur while the runqueue locks are
1784 * still held, otherwise prev could be scheduled on another cpu, die
1785 * there before we look at prev->state, and then the reference would
1786 * be dropped twice.
1787 * Manfred Spraul <manfred@colorfullife.com>
1789 prev_state = prev->state;
1790 finish_arch_switch(prev);
1791 finish_lock_switch(rq, prev);
1794 if (unlikely(prev_state == TASK_DEAD)) {
1796 * Remove function-return probe instances associated with this
1797 * task and put them back on the free list.
1799 kprobe_flush_task(prev);
1800 put_task_struct(prev);
1805 * schedule_tail - first thing a freshly forked thread must call.
1806 * @prev: the thread we just switched away from.
1808 asmlinkage void schedule_tail(struct task_struct *prev)
1809 __releases(rq->lock)
1811 struct rq *rq = this_rq();
1813 finish_task_switch(rq, prev);
1814 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
1815 /* In this case, finish_task_switch does not reenable preemption */
1818 if (current->set_child_tid)
1819 put_user(current->pid, current->set_child_tid);
1823 * context_switch - switch to the new MM and the new
1824 * thread's register state.
1827 context_switch(struct rq *rq, struct task_struct *prev,
1828 struct task_struct *next)
1830 struct mm_struct *mm, *oldmm;
1832 prepare_task_switch(rq, next);
1834 oldmm = prev->active_mm;
1836 * For paravirt, this is coupled with an exit in switch_to to
1837 * combine the page table reload and the switch backend into
1838 * one hypercall.
1840 arch_enter_lazy_cpu_mode();
1842 if (unlikely(!mm)) {
1843 next->active_mm = oldmm;
1844 atomic_inc(&oldmm->mm_count);
1845 enter_lazy_tlb(oldmm, next);
1847 switch_mm(oldmm, mm, next);
1849 if (unlikely(!prev->mm)) {
1850 prev->active_mm = NULL;
1851 rq->prev_mm = oldmm;
1854 * The runqueue lock will be released by the next task (which is
1855 * an invalid locking op, but in the case of the scheduler it's an
1856 * obvious special-case), so we do an early lockdep release here:
1859 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
1860 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1863 /* Here we just switch the register state and the stack. */
1864 switch_to(prev, next, prev);
1868 * this_rq must be evaluated again because prev may have moved
1869 * CPUs since it called schedule(), thus the 'rq' on its stack
1870 * frame will be invalid.
1872 finish_task_switch(this_rq(), prev);
1876 * nr_running, nr_uninterruptible and nr_context_switches:
1878 * externally visible scheduler statistics: current number of runnable
1879 * threads, current number of uninterruptible-sleeping threads, total
1880 * number of context switches performed since bootup.
1882 unsigned long nr_running(void)
1884 unsigned long i, sum = 0;
1886 for_each_online_cpu(i)
1887 sum += cpu_rq(i)->nr_running;
1892 unsigned long nr_uninterruptible(void)
1894 unsigned long i, sum = 0;
1896 for_each_possible_cpu(i)
1897 sum += cpu_rq(i)->nr_uninterruptible;
1900 * Since we read the counters lockless, it might be slightly
1901 * inaccurate. Do not allow it to go below zero though:
1903 if (unlikely((long)sum < 0))
1909 unsigned long long nr_context_switches(void)
1912 unsigned long long sum = 0;
1914 for_each_possible_cpu(i)
1915 sum += cpu_rq(i)->nr_switches;
1920 unsigned long nr_iowait(void)
1922 unsigned long i, sum = 0;
1924 for_each_possible_cpu(i)
1925 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1930 unsigned long nr_active(void)
1932 unsigned long i, running = 0, uninterruptible = 0;
1934 for_each_online_cpu(i) {
1935 running += cpu_rq(i)->nr_running;
1936 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1939 if (unlikely((long)uninterruptible < 0))
1940 uninterruptible = 0;
1942 return running + uninterruptible;
1946 * Update rq->cpu_load[] statistics. This function is usually called every
1947 * scheduler tick (TICK_NSEC).
1949 static void update_cpu_load(struct rq *this_rq)
1951 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1952 unsigned long total_load = this_rq->ls.load.weight;
1953 unsigned long this_load = total_load;
1954 struct load_stat *ls = &this_rq->ls;
1955 u64 now = __rq_clock(this_rq);
1958 this_rq->nr_load_updates++;
1959 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1962 /* Update delta_fair/delta_exec fields first */
1963 update_curr_load(this_rq, now);
1965 fair_delta64 = ls->delta_fair + 1;
1968 exec_delta64 = ls->delta_exec + 1;
1971 sample_interval64 = now - ls->load_update_last;
1972 ls->load_update_last = now;
1974 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1975 sample_interval64 = TICK_NSEC;
1977 if (exec_delta64 > sample_interval64)
1978 exec_delta64 = sample_interval64;
1980 idle_delta64 = sample_interval64 - exec_delta64;
1982 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1983 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1985 this_load = (unsigned long)tmp64;
1989 /* Update our load: */
1990 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1991 unsigned long old_load, new_load;
1993 /* scale is effectively 1 << i now, and >> i divides by scale */
1995 old_load = this_rq->cpu_load[i];
1996 new_load = this_load;
1998 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
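/*
 * Editorial worked example: for i == 2 (scale == 4) this is
 * (3 * old_load + new_load) / 4, so with old_load == 4096 and
 * this_load == 0 the entry decays to 3072 on this tick, while
 * cpu_load[0] drops straight to 0 - higher indices react more
 * slowly and back the more conservative balancing decisions.
 */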
2005 * double_rq_lock - safely lock two runqueues
2007 * Note this does not disable interrupts like task_rq_lock,
2008 * you need to do so manually before calling.
2010 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
2011 __acquires(rq1->lock)
2012 __acquires(rq2->lock)
2014 BUG_ON(!irqs_disabled());
2016 spin_lock(&rq1->lock);
2017 __acquire(rq2->lock); /* Fake it out ;) */
2020 spin_lock(&rq1->lock);
2021 spin_lock(&rq2->lock);
2023 spin_lock(&rq2->lock);
2024 spin_lock(&rq1->lock);
2030 * double_rq_unlock - safely unlock two runqueues
2032 * Note this does not restore interrupts like task_rq_unlock,
2033 * you need to do so manually after calling.
2035 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2036 __releases(rq1->lock)
2037 __releases(rq2->lock)
2039 spin_unlock(&rq1->lock);
2041 spin_unlock(&rq2->lock);
2043 __release(rq2->lock);
2047 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2049 static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
2050 __releases(this_rq->lock)
2051 __acquires(busiest->lock)
2052 __acquires(this_rq->lock)
2054 if (unlikely(!irqs_disabled())) {
2055 /* printk() doesn't work well under rq->lock */
2056 spin_unlock(&this_rq->lock);
2059 if (unlikely(!spin_trylock(&busiest->lock))) {
2060 if (busiest < this_rq) {
2061 spin_unlock(&this_rq->lock);
2062 spin_lock(&busiest->lock);
2063 spin_lock(&this_rq->lock);
2065 spin_lock(&busiest->lock);
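/*
 * Editorial note on the ordering above: both CPUs end up taking the two
 * locks in ascending address order. If CPU0 holds rq0 and wants rq1 while
 * CPU1 holds rq1 and wants rq0, whichever holds the higher-addressed lock
 * drops it and re-acquires in the right order, so the AB-BA deadlock
 * cannot happen.
 */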
2070 * If dest_cpu is allowed for this process, migrate the task to it.
2071 * This is accomplished by forcing the cpu_allowed mask to only
2072 * allow dest_cpu, which will force the task onto dest_cpu. Then
2073 * the cpu_allowed mask is restored.
2075 static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2077 struct migration_req req;
2078 unsigned long flags;
2081 rq = task_rq_lock(p, &flags);
2082 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2083 || unlikely(cpu_is_offline(dest_cpu)))
2086 /* force the process onto the specified CPU */
2087 if (migrate_task(p, dest_cpu, &req)) {
2088 /* Need to wait for migration thread (might exit: take ref). */
2089 struct task_struct *mt = rq->migration_thread;
2091 get_task_struct(mt);
2092 task_rq_unlock(rq, &flags);
2093 wake_up_process(mt);
2094 put_task_struct(mt);
2095 wait_for_completion(&req.done);
2100 task_rq_unlock(rq, &flags);
2104 * sched_exec - execve() is a valuable balancing opportunity, because at
2105 * this point the task has the smallest effective memory and cache footprint.
2107 void sched_exec(void)
2109 int new_cpu, this_cpu = get_cpu();
2110 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2112 if (new_cpu != this_cpu)
2113 sched_migrate_task(current, new_cpu);
2117 * pull_task - move a task from a remote runqueue to the local runqueue.
2118 * Both runqueues must be locked.
2120 static void pull_task(struct rq *src_rq, struct task_struct *p,
2121 struct rq *this_rq, int this_cpu)
2123 deactivate_task(src_rq, p, 0);
2124 set_task_cpu(p, this_cpu);
2125 activate_task(this_rq, p, 0);
2127 * Note that idle threads have a prio of MAX_PRIO, so this test
2128 * will always be true for them.
2130 check_preempt_curr(this_rq, p);
2134 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2137 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2138 struct sched_domain *sd, enum cpu_idle_type idle,
2142 * We do not migrate tasks that are:
2143 * 1) running (obviously), or
2144 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2145 * 3) are cache-hot on their current CPU.
2147 if (!cpu_isset(this_cpu, p->cpus_allowed))
2151 if (task_running(rq, p))
2155 * Aggressive migration if too many balance attempts have failed:
2157 if (sd->nr_balance_failed > sd->cache_nice_tries)
2163 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2164 unsigned long max_nr_move, unsigned long max_load_move,
2165 struct sched_domain *sd, enum cpu_idle_type idle,
2166 int *all_pinned, unsigned long *load_moved,
2167 int this_best_prio, int best_prio, int best_prio_seen,
2168 struct rq_iterator *iterator)
2170 int pulled = 0, pinned = 0, skip_for_load;
2171 struct task_struct *p;
2172 long rem_load_move = max_load_move;
2174 if (max_nr_move == 0 || max_load_move == 0)
2180 * Start the load-balancing iterator:
2182 p = iterator->start(iterator->arg);
2187 * To help distribute high priority tasks across CPUs we don't
2188 * skip a task if it will be the highest priority task (i.e. smallest
2189 * prio value) on its new queue regardless of its load weight
2191 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2192 SCHED_LOAD_SCALE_FUZZ;
2193 if (skip_for_load && p->prio < this_best_prio)
2194 skip_for_load = !best_prio_seen && p->prio == best_prio;
2195 if (skip_for_load ||
2196 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2198 best_prio_seen |= p->prio == best_prio;
2199 p = iterator->next(iterator->arg);
2203 pull_task(busiest, p, this_rq, this_cpu);
2205 rem_load_move -= p->se.load.weight;
2208 * We only want to steal up to the prescribed number of tasks
2209 * and the prescribed amount of weighted load.
2211 if (pulled < max_nr_move && rem_load_move > 0) {
2212 if (p->prio < this_best_prio)
2213 this_best_prio = p->prio;
2214 p = iterator->next(iterator->arg);
2219 * Right now, this is the only place pull_task() is called,
2220 * so we can safely collect pull_task() stats here rather than
2221 * inside pull_task().
2223 schedstat_add(sd, lb_gained[idle], pulled);
2226 *all_pinned = pinned;
2227 *load_moved = max_load_move - rem_load_move;
2232 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2233 * load from busiest to this_rq, as part of a balancing operation within
2234 * "domain". Returns the number of tasks moved.
2236 * Called with both runqueues locked.
2238 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2239 unsigned long max_nr_move, unsigned long max_load_move,
2240 struct sched_domain *sd, enum cpu_idle_type idle,
2243 struct sched_class *class = sched_class_highest;
2244 unsigned long load_moved, total_nr_moved = 0, nr_moved;
2245 long rem_load_move = max_load_move;
2248 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2249 max_nr_move, (unsigned long)rem_load_move,
2250 sd, idle, all_pinned, &load_moved);
2251 total_nr_moved += nr_moved;
2252 max_nr_move -= nr_moved;
2253 rem_load_move -= load_moved;
2254 class = class->next;
2255 } while (class && max_nr_move && rem_load_move > 0);
2257 return total_nr_moved;
2261 * find_busiest_group finds and returns the busiest CPU group within the
2262 * domain. It calculates and returns the amount of weighted load which
2263 * should be moved to restore balance via the imbalance parameter.
2265 static struct sched_group *
2266 find_busiest_group(struct sched_domain *sd, int this_cpu,
2267 unsigned long *imbalance, enum cpu_idle_type idle,
2268 int *sd_idle, cpumask_t *cpus, int *balance)
2270 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2271 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
2272 unsigned long max_pull;
2273 unsigned long busiest_load_per_task, busiest_nr_running;
2274 unsigned long this_load_per_task, this_nr_running;
2276 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2277 int power_savings_balance = 1;
2278 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2279 unsigned long min_nr_running = ULONG_MAX;
2280 struct sched_group *group_min = NULL, *group_leader = NULL;
2283 max_load = this_load = total_load = total_pwr = 0;
2284 busiest_load_per_task = busiest_nr_running = 0;
2285 this_load_per_task = this_nr_running = 0;
2286 if (idle == CPU_NOT_IDLE)
2287 load_idx = sd->busy_idx;
2288 else if (idle == CPU_NEWLY_IDLE)
2289 load_idx = sd->newidle_idx;
2291 load_idx = sd->idle_idx;
2294 unsigned long load, group_capacity;
2297 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2298 unsigned long sum_nr_running, sum_weighted_load;
2300 local_group = cpu_isset(this_cpu, group->cpumask);
2303 balance_cpu = first_cpu(group->cpumask);
2305 /* Tally up the load of all CPUs in the group */
2306 sum_weighted_load = sum_nr_running = avg_load = 0;
2308 for_each_cpu_mask(i, group->cpumask) {
2311 if (!cpu_isset(i, *cpus))
2316 if (*sd_idle && !idle_cpu(i))
2319 /* Bias balancing toward cpus of our domain */
2321 if (idle_cpu(i) && !first_idle_cpu) {
2326 load = target_load(i, load_idx);
2328 load = source_load(i, load_idx);
2331 sum_nr_running += rq->nr_running;
2332 sum_weighted_load += weighted_cpuload(i);
2336 * First idle cpu or the first cpu (busiest) in this sched group
2337 * is eligible for doing load balancing at this and above
2338 * domains.
2340 if (local_group && balance_cpu != this_cpu && balance) {
2345 total_load += avg_load;
2346 total_pwr += group->__cpu_power;
2348 /* Adjust by relative CPU power of the group */
2349 avg_load = sg_div_cpu_power(group,
2350 avg_load * SCHED_LOAD_SCALE);
2352 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
2355 this_load = avg_load;
2357 this_nr_running = sum_nr_running;
2358 this_load_per_task = sum_weighted_load;
2359 } else if (avg_load > max_load &&
2360 sum_nr_running > group_capacity) {
2361 max_load = avg_load;
2363 busiest_nr_running = sum_nr_running;
2364 busiest_load_per_task = sum_weighted_load;
2367 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2369 * Busy processors will not participate in power savings
2372 if (idle == CPU_NOT_IDLE ||
2373 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2377 * If the local group is idle or completely loaded
2378 * no need to do power savings balance at this domain
2380 if (local_group && (this_nr_running >= group_capacity ||
2382 power_savings_balance = 0;
2385 * If a group is already running at full capacity or idle,
2386 * don't include that group in power savings calculations
2388 if (!power_savings_balance || sum_nr_running >= group_capacity
2393 * Calculate the group which has the least non-idle load.
2394 * This is the group from where we need to pick up the load
2397 if ((sum_nr_running < min_nr_running) ||
2398 (sum_nr_running == min_nr_running &&
2399 first_cpu(group->cpumask) <
2400 first_cpu(group_min->cpumask))) {
2402 min_nr_running = sum_nr_running;
2403 min_load_per_task = sum_weighted_load /
2408 * Calculate the group which is almost near its
2409 * capacity but still has some space to pick up some load
2410 * from other group and save more power
2412 if (sum_nr_running <= group_capacity - 1) {
2413 if (sum_nr_running > leader_nr_running ||
2414 (sum_nr_running == leader_nr_running &&
2415 first_cpu(group->cpumask) >
2416 first_cpu(group_leader->cpumask))) {
2417 group_leader = group;
2418 leader_nr_running = sum_nr_running;
2423 group = group->next;
2424 } while (group != sd->groups);
2426 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
2429 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2431 if (this_load >= avg_load ||
2432 100*max_load <= sd->imbalance_pct*this_load)
2435 busiest_load_per_task /= busiest_nr_running;
2437 * We're trying to get all the cpus to the average_load, so we don't
2438 * want to push ourselves above the average load, nor do we wish to
2439 * reduce the max loaded cpu below the average load, as either of these
2440 * actions would just result in more rebalancing later, and ping-pong
2441 * tasks around. Thus we look for the minimum possible imbalance.
2442 * Negative imbalances (*we* are more loaded than anyone else) will
2443 * be counted as no imbalance for these purposes -- we can't fix that
2444 * by pulling tasks to us. Be careful of negative numbers as they'll
2445 * appear as very large values with unsigned longs.
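 *
 * As a worked illustration (made-up but internally consistent numbers):
 * take two single-CPU groups of equal __cpu_power == SCHED_LOAD_SCALE,
 * the busiest one running two nice-0 tasks and this one running none.
 * Then max_load == 2*SCHED_LOAD_SCALE, this_load == 0, avg_load ==
 * SCHED_LOAD_SCALE and busiest_load_per_task == SCHED_LOAD_SCALE, so
 * max_pull == SCHED_LOAD_SCALE and *imbalance works out to one nice-0
 * task's worth of weighted load, which is what move_tasks() should pull.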
2447 if (max_load <= busiest_load_per_task)
2451 * In the presence of smp nice balancing, certain scenarios can have
2452 * max load less than avg load (as we skip the groups at or below
2453 * its cpu_power while calculating max_load...)
2455 if (max_load < avg_load) {
2457 goto small_imbalance;
2460 /* Don't want to pull so many tasks that a group would go idle */
2461 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2463 /* How much load to actually move to equalise the imbalance */
2464 *imbalance = min(max_pull * busiest->__cpu_power,
2465 (avg_load - this_load) * this->__cpu_power)
2469 * if *imbalance is less than the average load per runnable task
2470 * there is no guarantee that any tasks will be moved, so we'll have
2471 * a think about bumping its value to force at least one task to be
2474 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
2475 unsigned long tmp, pwr_now, pwr_move;
2479 pwr_move = pwr_now = 0;
2481 if (this_nr_running) {
2482 this_load_per_task /= this_nr_running;
2483 if (busiest_load_per_task > this_load_per_task)
2486 this_load_per_task = SCHED_LOAD_SCALE;
2488 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2489 busiest_load_per_task * imbn) {
2490 *imbalance = busiest_load_per_task;
2495 * OK, we don't have enough imbalance to justify moving tasks,
2496 * however we may be able to increase total CPU power used by
2500 pwr_now += busiest->__cpu_power *
2501 min(busiest_load_per_task, max_load);
2502 pwr_now += this->__cpu_power *
2503 min(this_load_per_task, this_load);
2504 pwr_now /= SCHED_LOAD_SCALE;
2506 /* Amount of load we'd subtract */
2507 tmp = sg_div_cpu_power(busiest,
2508 busiest_load_per_task * SCHED_LOAD_SCALE);
2510 pwr_move += busiest->__cpu_power *
2511 min(busiest_load_per_task, max_load - tmp);
2513 /* Amount of load we'd add */
2514 if (max_load * busiest->__cpu_power <
2515 busiest_load_per_task * SCHED_LOAD_SCALE)
2516 tmp = sg_div_cpu_power(this,
2517 max_load * busiest->__cpu_power);
2519 tmp = sg_div_cpu_power(this,
2520 busiest_load_per_task * SCHED_LOAD_SCALE);
2521 pwr_move += this->__cpu_power *
2522 min(this_load_per_task, this_load + tmp);
2523 pwr_move /= SCHED_LOAD_SCALE;
2525 /* Move if we gain throughput */
2526 if (pwr_move <= pwr_now)
2529 *imbalance = busiest_load_per_task;
2535 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2536 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2539 if (this == group_leader && group_leader != group_min) {
2540 *imbalance = min_load_per_task;
2550 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2553 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2554 unsigned long imbalance, cpumask_t *cpus)
2556 struct rq *busiest = NULL, *rq;
2557 unsigned long max_load = 0;
2560 for_each_cpu_mask(i, group->cpumask) {
2563 if (!cpu_isset(i, *cpus))
2567 wl = weighted_cpuload(i);
2569 if (rq->nr_running == 1 && wl > imbalance)
2572 if (wl > max_load) {
2582 * Max backoff if we encounter pinned tasks. Pretty arbitrary value;
2583 * any sufficiently large value will do.
2585 #define MAX_PINNED_INTERVAL 512
2587 static inline unsigned long minus_1_or_zero(unsigned long n)
2589 return n > 0 ? n - 1 : 0;
2593 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2594 * tasks if there is an imbalance.
2596 static int load_balance(int this_cpu, struct rq *this_rq,
2597 struct sched_domain *sd, enum cpu_idle_type idle,
2600 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2601 struct sched_group *group;
2602 unsigned long imbalance;
2604 cpumask_t cpus = CPU_MASK_ALL;
2605 unsigned long flags;
2608 * When power savings policy is enabled for the parent domain, idle
2609 * sibling can pick up load irrespective of busy siblings. In this case,
2610 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2611 * portraying it as CPU_NOT_IDLE.
2613 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2614 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2617 schedstat_inc(sd, lb_cnt[idle]);
2620 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2627 schedstat_inc(sd, lb_nobusyg[idle]);
2631 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
2633 schedstat_inc(sd, lb_nobusyq[idle]);
2637 BUG_ON(busiest == this_rq);
2639 schedstat_add(sd, lb_imbalance[idle], imbalance);
2642 if (busiest->nr_running > 1) {
2644 * Attempt to move tasks. If find_busiest_group has found
2645 * an imbalance but busiest->nr_running <= 1, the group is
2646 * still unbalanced. nr_moved simply stays zero, so it is
2647 * correctly treated as an imbalance.
2649 local_irq_save(flags);
2650 double_rq_lock(this_rq, busiest);
2651 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2652 minus_1_or_zero(busiest->nr_running),
2653 imbalance, sd, idle, &all_pinned);
2654 double_rq_unlock(this_rq, busiest);
2655 local_irq_restore(flags);
2658 * some other cpu did the load balance for us.
2660 if (nr_moved && this_cpu != smp_processor_id())
2661 resched_cpu(this_cpu);
2663 /* All tasks on this runqueue were pinned by CPU affinity */
2664 if (unlikely(all_pinned)) {
2665 cpu_clear(cpu_of(busiest), cpus);
2666 if (!cpus_empty(cpus))
2673 schedstat_inc(sd, lb_failed[idle]);
2674 sd->nr_balance_failed++;
2676 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2678 spin_lock_irqsave(&busiest->lock, flags);
2680 /* don't kick the migration_thread, if the curr
2681 * task on busiest cpu can't be moved to this_cpu
2683 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2684 spin_unlock_irqrestore(&busiest->lock, flags);
2686 goto out_one_pinned;
2689 if (!busiest->active_balance) {
2690 busiest->active_balance = 1;
2691 busiest->push_cpu = this_cpu;
2694 spin_unlock_irqrestore(&busiest->lock, flags);
2696 wake_up_process(busiest->migration_thread);
2699 * We've kicked active balancing, reset the failure
2702 sd->nr_balance_failed = sd->cache_nice_tries+1;
2705 sd->nr_balance_failed = 0;
2707 if (likely(!active_balance)) {
2708 /* We were unbalanced, so reset the balancing interval */
2709 sd->balance_interval = sd->min_interval;
2712 * If we've begun active balancing, start to back off. This
2713 * case may not be covered by the all_pinned logic if there
2714 * is only 1 task on the busy runqueue (because we don't call
2717 if (sd->balance_interval < sd->max_interval)
2718 sd->balance_interval *= 2;
2721 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2722 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2727 schedstat_inc(sd, lb_balanced[idle]);
2729 sd->nr_balance_failed = 0;
2732 /* tune up the balancing interval */
2733 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2734 (sd->balance_interval < sd->max_interval))
2735 sd->balance_interval *= 2;
2737 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2738 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2744 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2745 * tasks if there is an imbalance.
2747 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
2748 * this_rq is locked.
2751 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2753 struct sched_group *group;
2754 struct rq *busiest = NULL;
2755 unsigned long imbalance;
2758 cpumask_t cpus = CPU_MASK_ALL;
2761 * When power savings policy is enabled for the parent domain, idle
2762 * sibling can pick up load irrespective of busy siblings. In this case,
2763 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2764 * portraying it as CPU_NOT_IDLE.
2766 if (sd->flags & SD_SHARE_CPUPOWER &&
2767 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2770 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
2772 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2773 &sd_idle, &cpus, NULL);
2775 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
2779 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
2782 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
2786 BUG_ON(busiest == this_rq);
2788 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
2791 if (busiest->nr_running > 1) {
2792 /* Attempt to move tasks */
2793 double_lock_balance(this_rq, busiest);
2794 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2795 minus_1_or_zero(busiest->nr_running),
2796 imbalance, sd, CPU_NEWLY_IDLE, NULL);
2797 spin_unlock(&busiest->lock);
2800 cpu_clear(cpu_of(busiest), cpus);
2801 if (!cpus_empty(cpus))
2807 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
2808 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2809 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2812 sd->nr_balance_failed = 0;
2817 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
2818 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2819 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2821 sd->nr_balance_failed = 0;
2827 * idle_balance is called by schedule() if this_cpu is about to become
2828 * idle. Attempts to pull tasks from other CPUs.
2830 static void idle_balance(int this_cpu, struct rq *this_rq)
2832 struct sched_domain *sd;
2833 int pulled_task = -1;
2834 unsigned long next_balance = jiffies + HZ;
2836 for_each_domain(this_cpu, sd) {
2837 unsigned long interval;
2839 if (!(sd->flags & SD_LOAD_BALANCE))
2842 if (sd->flags & SD_BALANCE_NEWIDLE)
2843 /* If we've pulled tasks over stop searching: */
2844 pulled_task = load_balance_newidle(this_cpu,
2847 interval = msecs_to_jiffies(sd->balance_interval);
2848 if (time_after(next_balance, sd->last_balance + interval))
2849 next_balance = sd->last_balance + interval;
2853 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
2855 * We are going idle. next_balance may be set based on
2856 * a busy processor. So reset next_balance.
2858 this_rq->next_balance = next_balance;
2863 * active_load_balance is run by migration threads. It pushes running tasks
2864 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2865 * running on each physical CPU where possible, and avoids physical /
2866 * logical imbalances.
2868 * Called with busiest_rq locked.
2870 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2872 int target_cpu = busiest_rq->push_cpu;
2873 struct sched_domain *sd;
2874 struct rq *target_rq;
2876 /* Is there any task to move? */
2877 if (busiest_rq->nr_running <= 1)
2880 target_rq = cpu_rq(target_cpu);
2883 * This condition is "impossible", if it occurs
2884 * we need to fix it. Originally reported by
2885 * Bjorn Helgaas on a 128-cpu setup.
2887 BUG_ON(busiest_rq == target_rq);
2889 /* move a task from busiest_rq to target_rq */
2890 double_lock_balance(busiest_rq, target_rq);
2892 /* Search for an sd spanning us and the target CPU. */
2893 for_each_domain(target_cpu, sd) {
2894 if ((sd->flags & SD_LOAD_BALANCE) &&
2895 cpu_isset(busiest_cpu, sd->span))
2900 schedstat_inc(sd, alb_cnt);
2902 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
2903 RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
2905 schedstat_inc(sd, alb_pushed);
2907 schedstat_inc(sd, alb_failed);
2909 spin_unlock(&target_rq->lock);
2914 atomic_t load_balancer;
2916 } nohz ____cacheline_aligned = {
2917 .load_balancer = ATOMIC_INIT(-1),
2918 .cpu_mask = CPU_MASK_NONE,
2922 * This routine will try to nominate the ilb (idle load balancing)
2923 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
2924 * load balancing on behalf of all those cpus. If all the cpus in the system
2925 * go into this tickless mode, then there will be no ilb owner (as there is
2926 * no need for one) and all the cpus will sleep till the next wakeup event
2929 * For the ilb owner, tick is not stopped. And this tick will be used
2930 * for idle load balancing. ilb owner will still be part of
2933 * While stopping the tick, this cpu will become the ilb owner if there
2934 * is no other owner. And will be the owner till that cpu becomes busy
2935 * or if all cpus in the system stop their ticks at which point
2936 * there is no need for ilb owner.
2938 * When the ilb owner becomes busy, it nominates another owner, during the
2939 * next busy scheduler_tick()
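 *
 * A usage sketch (an assumption about the tick code, which is not part of
 * this file): the nohz path is expected to call select_nohz_load_balancer(1)
 * when it stops the tick on an idling cpu and select_nohz_load_balancer(0)
 * when that cpu goes busy again; a non-zero return tells the caller that
 * this cpu is now the ilb owner and should keep its tick running.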
2941 int select_nohz_load_balancer(int stop_tick)
2943 int cpu = smp_processor_id();
2946 cpu_set(cpu, nohz.cpu_mask);
2947 cpu_rq(cpu)->in_nohz_recently = 1;
2950 * If we are going offline and still the leader, give up!
2952 if (cpu_is_offline(cpu) &&
2953 atomic_read(&nohz.load_balancer) == cpu) {
2954 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2959 /* time for ilb owner also to sleep */
2960 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
2961 if (atomic_read(&nohz.load_balancer) == cpu)
2962 atomic_set(&nohz.load_balancer, -1);
2966 if (atomic_read(&nohz.load_balancer) == -1) {
2967 /* make me the ilb owner */
2968 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
2970 } else if (atomic_read(&nohz.load_balancer) == cpu)
2973 if (!cpu_isset(cpu, nohz.cpu_mask))
2976 cpu_clear(cpu, nohz.cpu_mask);
2978 if (atomic_read(&nohz.load_balancer) == cpu)
2979 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2986 static DEFINE_SPINLOCK(balancing);
2989 * It checks each scheduling domain to see if it is due to be balanced,
2990 * and initiates a balancing operation if so.
2992 * Balancing parameters are set up in arch_init_sched_domains.
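 *
 * A rough worked example of the interval handling below (illustrative
 * values, not the defaults of any particular domain setup): with
 * balance_interval == 64 (ms) and busy_factor == 32, a busy cpu looks at
 * this domain roughly every two seconds; the product is converted with
 * msecs_to_jiffies() and clamped to HZ*NR_CPUS/10 jiffies.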
2994 static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
2997 struct rq *rq = cpu_rq(cpu);
2998 unsigned long interval;
2999 struct sched_domain *sd;
3000 /* Earliest time when we have to do rebalance again */
3001 unsigned long next_balance = jiffies + 60*HZ;
3003 for_each_domain(cpu, sd) {
3004 if (!(sd->flags & SD_LOAD_BALANCE))
3007 interval = sd->balance_interval;
3008 if (idle != CPU_IDLE)
3009 interval *= sd->busy_factor;
3011 /* scale ms to jiffies */
3012 interval = msecs_to_jiffies(interval);
3013 if (unlikely(!interval))
3015 if (interval > HZ*NR_CPUS/10)
3016 interval = HZ*NR_CPUS/10;
3019 if (sd->flags & SD_SERIALIZE) {
3020 if (!spin_trylock(&balancing))
3024 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3025 if (load_balance(cpu, rq, sd, idle, &balance)) {
3027 * We've pulled tasks over so either we're no
3028 * longer idle, or one of our SMT siblings is
3031 idle = CPU_NOT_IDLE;
3033 sd->last_balance = jiffies;
3035 if (sd->flags & SD_SERIALIZE)
3036 spin_unlock(&balancing);
3038 if (time_after(next_balance, sd->last_balance + interval))
3039 next_balance = sd->last_balance + interval;
3042 * Stop the load balance at this level. There is another
3043 * CPU in our sched group which is doing load balancing more
3049 rq->next_balance = next_balance;
3053 * run_rebalance_domains is triggered when needed from the scheduler tick.
3054 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3055 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3057 static void run_rebalance_domains(struct softirq_action *h)
3059 int this_cpu = smp_processor_id();
3060 struct rq *this_rq = cpu_rq(this_cpu);
3061 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3062 CPU_IDLE : CPU_NOT_IDLE;
3064 rebalance_domains(this_cpu, idle);
3068 * If this cpu is the owner for idle load balancing, then do the
3069 * balancing on behalf of the other idle cpus whose ticks are
3072 if (this_rq->idle_at_tick &&
3073 atomic_read(&nohz.load_balancer) == this_cpu) {
3074 cpumask_t cpus = nohz.cpu_mask;
3078 cpu_clear(this_cpu, cpus);
3079 for_each_cpu_mask(balance_cpu, cpus) {
3081 * If this cpu gets work to do, stop the load balancing
3082 * work being done for other cpus. Next load
3083 * balancing owner will pick it up.
3088 rebalance_domains(balance_cpu, CPU_IDLE);
3090 rq = cpu_rq(balance_cpu);
3091 if (time_after(this_rq->next_balance, rq->next_balance))
3092 this_rq->next_balance = rq->next_balance;
3099 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3101 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3102 * idle load balancing owner or decide to stop the periodic load balancing,
3103 * if the whole system is idle.
3105 static inline void trigger_load_balance(struct rq *rq, int cpu)
3109 * If we were in the nohz mode recently and busy at the current
3110 * scheduler tick, then check if we need to nominate new idle
3113 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3114 rq->in_nohz_recently = 0;
3116 if (atomic_read(&nohz.load_balancer) == cpu) {
3117 cpu_clear(cpu, nohz.cpu_mask);
3118 atomic_set(&nohz.load_balancer, -1);
3121 if (atomic_read(&nohz.load_balancer) == -1) {
3123 * simple selection for now: Nominate the
3124 * first cpu in the nohz list to be the next
3127 * TBD: Traverse the sched domains and nominate
3128 * the nearest cpu in the nohz.cpu_mask.
3130 int ilb = first_cpu(nohz.cpu_mask);
3138 * If this cpu is idle and doing idle load balancing for all the
3139 * cpus with ticks stopped, is it time for that to stop?
3141 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3142 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3148 * If this cpu is idle and the idle load balancing is done by
3149 * someone else, then there is no need to raise the SCHED_SOFTIRQ
3151 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3152 cpu_isset(cpu, nohz.cpu_mask))
3155 if (time_after_eq(jiffies, rq->next_balance))
3156 raise_softirq(SCHED_SOFTIRQ);
3159 #else /* CONFIG_SMP */
3162 * on UP we do not need to balance between CPUs:
3164 static inline void idle_balance(int cpu, struct rq *rq)
3168 /* Avoid "used but not defined" warning on UP */
3169 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3170 unsigned long max_nr_move, unsigned long max_load_move,
3171 struct sched_domain *sd, enum cpu_idle_type idle,
3172 int *all_pinned, unsigned long *load_moved,
3173 int this_best_prio, int best_prio, int best_prio_seen,
3174 struct rq_iterator *iterator)
3183 DEFINE_PER_CPU(struct kernel_stat, kstat);
3185 EXPORT_PER_CPU_SYMBOL(kstat);
3188 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3189 * that have not yet been banked in case the task is currently running.
3191 unsigned long long task_sched_runtime(struct task_struct *p)
3193 unsigned long flags;
3197 rq = task_rq_lock(p, &flags);
3198 ns = p->se.sum_exec_runtime;
3199 if (rq->curr == p) {
3200 delta_exec = rq_clock(rq) - p->se.exec_start;
3201 if ((s64)delta_exec > 0)
3204 task_rq_unlock(rq, &flags);
3210 * Account user cpu time to a process.
3211 * @p: the process that the cpu time gets accounted to
3212 * @hardirq_offset: the offset to subtract from hardirq_count()
3213 * @cputime: the cpu time spent in user space since the last update
3215 void account_user_time(struct task_struct *p, cputime_t cputime)
3217 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3220 p->utime = cputime_add(p->utime, cputime);
3222 /* Add user time to cpustat. */
3223 tmp = cputime_to_cputime64(cputime);
3224 if (TASK_NICE(p) > 0)
3225 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3227 cpustat->user = cputime64_add(cpustat->user, tmp);
3231 * Account system cpu time to a process.
3232 * @p: the process that the cpu time gets accounted to
3233 * @hardirq_offset: the offset to subtract from hardirq_count()
3234 * @cputime: the cpu time spent in kernel space since the last update
3236 void account_system_time(struct task_struct *p, int hardirq_offset,
3239 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3240 struct rq *rq = this_rq();
3243 p->stime = cputime_add(p->stime, cputime);
3245 /* Add system time to cpustat. */
3246 tmp = cputime_to_cputime64(cputime);
3247 if (hardirq_count() - hardirq_offset)
3248 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3249 else if (softirq_count())
3250 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3251 else if (p != rq->idle)
3252 cpustat->system = cputime64_add(cpustat->system, tmp);
3253 else if (atomic_read(&rq->nr_iowait) > 0)
3254 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3256 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3257 /* Account for system time used */
3258 acct_update_integrals(p);
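/*
 * Illustration of the hardirq_offset handling above: when this is called
 * for the periodic tick, the caller typically passes HARDIRQ_OFFSET so the
 * interrupt context of the tick itself is cancelled out and the time lands
 * in the task's system time; only deeper hardirq nesting ends up in
 * cpustat->irq (and softirq context in cpustat->softirq).
 */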
3262 * Account for involuntary wait time.
3263 * @p: the process from which the cpu time has been stolen
3264 * @steal: the cpu time spent in involuntary wait
3266 void account_steal_time(struct task_struct *p, cputime_t steal)
3268 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3269 cputime64_t tmp = cputime_to_cputime64(steal);
3270 struct rq *rq = this_rq();
3272 if (p == rq->idle) {
3273 p->stime = cputime_add(p->stime, steal);
3274 if (atomic_read(&rq->nr_iowait) > 0)
3275 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3277 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3279 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3283 * This function gets called by the timer code, with HZ frequency.
3284 * We call it with interrupts disabled.
3286 * It also gets called by the fork code, when changing the parent's
3289 void scheduler_tick(void)
3291 int cpu = smp_processor_id();
3292 struct rq *rq = cpu_rq(cpu);
3293 struct task_struct *curr = rq->curr;
3295 spin_lock(&rq->lock);
3296 if (curr != rq->idle) /* FIXME: needed? */
3297 curr->sched_class->task_tick(rq, curr);
3298 update_cpu_load(rq);
3299 spin_unlock(&rq->lock);
3302 rq->idle_at_tick = idle_cpu(cpu);
3303 trigger_load_balance(rq, cpu);
3307 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3309 void fastcall add_preempt_count(int val)
3314 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3316 preempt_count() += val;
3318 * Spinlock count overflowing soon?
3320 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3323 EXPORT_SYMBOL(add_preempt_count);
3325 void fastcall sub_preempt_count(int val)
3330 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3333 * Is the spinlock portion underflowing?
3335 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3336 !(preempt_count() & PREEMPT_MASK)))
3339 preempt_count() -= val;
3341 EXPORT_SYMBOL(sub_preempt_count);
3346 * Print scheduling while atomic bug:
3348 static noinline void __schedule_bug(struct task_struct *prev)
3350 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3351 prev->comm, preempt_count(), prev->pid);
3352 debug_show_held_locks(prev);
3353 if (irqs_disabled())
3354 print_irqtrace_events(prev);
3359 * Various schedule()-time debugging checks and statistics:
3361 static inline void schedule_debug(struct task_struct *prev)
3364 * Test if we are atomic. Since do_exit() needs to call into
3365 * schedule() atomically, we ignore that path for now.
3366 * Otherwise, whine if we are scheduling when we should not be.
3368 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3369 __schedule_bug(prev);
3371 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3373 schedstat_inc(this_rq(), sched_cnt);
3377 * Pick up the highest-prio task:
3379 static inline struct task_struct *
3380 pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3382 struct sched_class *class;
3383 struct task_struct *p;
3386 * Optimization: we know that if all tasks are in
3387 * the fair class we can call that function directly:
3389 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3390 p = fair_sched_class.pick_next_task(rq, now);
3395 class = sched_class_highest;
3397 p = class->pick_next_task(rq, now);
3401 * Will never be NULL as the idle class always
3402 * returns a non-NULL p:
3404 class = class->next;
3409 * schedule() is the main scheduler function.
3411 asmlinkage void __sched schedule(void)
3413 struct task_struct *prev, *next;
3421 cpu = smp_processor_id();
3425 switch_count = &prev->nivcsw;
3427 release_kernel_lock(prev);
3428 need_resched_nonpreemptible:
3430 schedule_debug(prev);
3432 spin_lock_irq(&rq->lock);
3433 clear_tsk_need_resched(prev);
3435 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3436 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3437 unlikely(signal_pending(prev)))) {
3438 prev->state = TASK_RUNNING;
3440 deactivate_task(rq, prev, 1);
3442 switch_count = &prev->nvcsw;
3445 if (unlikely(!rq->nr_running))
3446 idle_balance(cpu, rq);
3448 now = __rq_clock(rq);
3449 prev->sched_class->put_prev_task(rq, prev, now);
3450 next = pick_next_task(rq, prev, now);
3452 sched_info_switch(prev, next);
3454 if (likely(prev != next)) {
3459 context_switch(rq, prev, next); /* unlocks the rq */
3461 spin_unlock_irq(&rq->lock);
3463 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3464 cpu = smp_processor_id();
3466 goto need_resched_nonpreemptible;
3468 preempt_enable_no_resched();
3469 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3472 EXPORT_SYMBOL(schedule);
3474 #ifdef CONFIG_PREEMPT
3476 * this is the entry point to schedule() from in-kernel preemption
3477 * off of preempt_enable(). Kernel preemptions off a return from interrupt
3478 * go through preempt_schedule_irq() below and call schedule() directly.
3480 asmlinkage void __sched preempt_schedule(void)
3482 struct thread_info *ti = current_thread_info();
3483 #ifdef CONFIG_PREEMPT_BKL
3484 struct task_struct *task = current;
3485 int saved_lock_depth;
3488 * If there is a non-zero preempt_count or interrupts are disabled,
3489 * we do not want to preempt the current task. Just return..
3491 if (likely(ti->preempt_count || irqs_disabled()))
3495 add_preempt_count(PREEMPT_ACTIVE);
3497 * We keep the big kernel semaphore locked, but we
3498 * clear ->lock_depth so that schedule() doesn't
3499 * auto-release the semaphore:
3501 #ifdef CONFIG_PREEMPT_BKL
3502 saved_lock_depth = task->lock_depth;
3503 task->lock_depth = -1;
3506 #ifdef CONFIG_PREEMPT_BKL
3507 task->lock_depth = saved_lock_depth;
3509 sub_preempt_count(PREEMPT_ACTIVE);
3511 /* we could miss a preemption opportunity between schedule and now */
3513 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3516 EXPORT_SYMBOL(preempt_schedule);
3519 * this is the entry point to schedule() from kernel preemption
3520 * off of irq context.
3521 * Note that this is called and returns with irqs disabled. This will
3522 * protect us against recursive calling from irq.
3524 asmlinkage void __sched preempt_schedule_irq(void)
3526 struct thread_info *ti = current_thread_info();
3527 #ifdef CONFIG_PREEMPT_BKL
3528 struct task_struct *task = current;
3529 int saved_lock_depth;
3531 /* Catch callers which need to be fixed */
3532 BUG_ON(ti->preempt_count || !irqs_disabled());
3535 add_preempt_count(PREEMPT_ACTIVE);
3537 * We keep the big kernel semaphore locked, but we
3538 * clear ->lock_depth so that schedule() doesn't
3539 * auto-release the semaphore:
3541 #ifdef CONFIG_PREEMPT_BKL
3542 saved_lock_depth = task->lock_depth;
3543 task->lock_depth = -1;
3547 local_irq_disable();
3548 #ifdef CONFIG_PREEMPT_BKL
3549 task->lock_depth = saved_lock_depth;
3551 sub_preempt_count(PREEMPT_ACTIVE);
3553 /* we could miss a preemption opportunity between schedule and now */
3555 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3559 #endif /* CONFIG_PREEMPT */
3561 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3564 return try_to_wake_up(curr->private, mode, sync);
3566 EXPORT_SYMBOL(default_wake_function);
3569 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3570 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3571 * number) then we wake all the non-exclusive tasks and one exclusive task.
3573 * There are circumstances in which we can try to wake a task which has already
3574 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3575 * zero in this (rare) case, and we handle it by continuing to scan the queue.
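 *
 * Concretely: waiters flagged WQ_FLAG_EXCLUSIVE (wait_for_completion()
 * below sets this and queues itself at the tail) are only woken until
 * nr_exclusive reaches zero, while plain waiters are always woken; so a
 * wake-one call wakes every non-exclusive waiter plus a single exclusive
 * one.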
3577 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3578 int nr_exclusive, int sync, void *key)
3580 struct list_head *tmp, *next;
3582 list_for_each_safe(tmp, next, &q->task_list) {
3583 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3584 unsigned flags = curr->flags;
3586 if (curr->func(curr, mode, sync, key) &&
3587 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3593 * __wake_up - wake up threads blocked on a waitqueue.
3595 * @mode: which threads
3596 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3597 * @key: is directly passed to the wakeup function
3599 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3600 int nr_exclusive, void *key)
3602 unsigned long flags;
3604 spin_lock_irqsave(&q->lock, flags);
3605 __wake_up_common(q, mode, nr_exclusive, 0, key);
3606 spin_unlock_irqrestore(&q->lock, flags);
3608 EXPORT_SYMBOL(__wake_up);
3611 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3613 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3615 __wake_up_common(q, mode, 1, 0, NULL);
3619 * __wake_up_sync - wake up threads blocked on a waitqueue.
3621 * @mode: which threads
3622 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3624 * The sync wakeup differs in that the waker knows that it will schedule
3625 * away soon, so while the target thread will be woken up, it will not
3626 * be migrated to another CPU - ie. the two threads are 'synchronized'
3627 * with each other. This can prevent needless bouncing between CPUs.
3629 * On UP it can prevent extra preemption.
3632 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3634 unsigned long flags;
3640 if (unlikely(!nr_exclusive))
3643 spin_lock_irqsave(&q->lock, flags);
3644 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3645 spin_unlock_irqrestore(&q->lock, flags);
3647 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3649 void fastcall complete(struct completion *x)
3651 unsigned long flags;
3653 spin_lock_irqsave(&x->wait.lock, flags);
3655 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3657 spin_unlock_irqrestore(&x->wait.lock, flags);
3659 EXPORT_SYMBOL(complete);
3661 void fastcall complete_all(struct completion *x)
3663 unsigned long flags;
3665 spin_lock_irqsave(&x->wait.lock, flags);
3666 x->done += UINT_MAX/2;
3667 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3669 spin_unlock_irqrestore(&x->wait.lock, flags);
3671 EXPORT_SYMBOL(complete_all);
3673 void fastcall __sched wait_for_completion(struct completion *x)
3677 spin_lock_irq(&x->wait.lock);
3679 DECLARE_WAITQUEUE(wait, current);
3681 wait.flags |= WQ_FLAG_EXCLUSIVE;
3682 __add_wait_queue_tail(&x->wait, &wait);
3684 __set_current_state(TASK_UNINTERRUPTIBLE);
3685 spin_unlock_irq(&x->wait.lock);
3687 spin_lock_irq(&x->wait.lock);
3689 __remove_wait_queue(&x->wait, &wait);
3692 spin_unlock_irq(&x->wait.lock);
3694 EXPORT_SYMBOL(wait_for_completion);
3696 unsigned long fastcall __sched
3697 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3701 spin_lock_irq(&x->wait.lock);
3703 DECLARE_WAITQUEUE(wait, current);
3705 wait.flags |= WQ_FLAG_EXCLUSIVE;
3706 __add_wait_queue_tail(&x->wait, &wait);
3708 __set_current_state(TASK_UNINTERRUPTIBLE);
3709 spin_unlock_irq(&x->wait.lock);
3710 timeout = schedule_timeout(timeout);
3711 spin_lock_irq(&x->wait.lock);
3713 __remove_wait_queue(&x->wait, &wait);
3717 __remove_wait_queue(&x->wait, &wait);
3721 spin_unlock_irq(&x->wait.lock);
3724 EXPORT_SYMBOL(wait_for_completion_timeout);
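/*
 * A minimal usage sketch for the timeout variant above (hypothetical
 * caller, not part of this file; start_hw_op() and the interrupt handler
 * that eventually calls complete() are assumptions):
 *
 *	struct completion done;
 *
 *	init_completion(&done);
 *	start_hw_op(&done);
 *	if (!wait_for_completion_timeout(&done, HZ))
 *		printk(KERN_WARNING "hw op timed out\n");
 *
 * A return of 0 means the timeout expired; otherwise the remaining number
 * of jiffies is returned.
 */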
3726 int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3732 spin_lock_irq(&x->wait.lock);
3734 DECLARE_WAITQUEUE(wait, current);
3736 wait.flags |= WQ_FLAG_EXCLUSIVE;
3737 __add_wait_queue_tail(&x->wait, &wait);
3739 if (signal_pending(current)) {
3741 __remove_wait_queue(&x->wait, &wait);
3744 __set_current_state(TASK_INTERRUPTIBLE);
3745 spin_unlock_irq(&x->wait.lock);
3747 spin_lock_irq(&x->wait.lock);
3749 __remove_wait_queue(&x->wait, &wait);
3753 spin_unlock_irq(&x->wait.lock);
3757 EXPORT_SYMBOL(wait_for_completion_interruptible);
3759 unsigned long fastcall __sched
3760 wait_for_completion_interruptible_timeout(struct completion *x,
3761 unsigned long timeout)
3765 spin_lock_irq(&x->wait.lock);
3767 DECLARE_WAITQUEUE(wait, current);
3769 wait.flags |= WQ_FLAG_EXCLUSIVE;
3770 __add_wait_queue_tail(&x->wait, &wait);
3772 if (signal_pending(current)) {
3773 timeout = -ERESTARTSYS;
3774 __remove_wait_queue(&x->wait, &wait);
3777 __set_current_state(TASK_INTERRUPTIBLE);
3778 spin_unlock_irq(&x->wait.lock);
3779 timeout = schedule_timeout(timeout);
3780 spin_lock_irq(&x->wait.lock);
3782 __remove_wait_queue(&x->wait, &wait);
3786 __remove_wait_queue(&x->wait, &wait);
3790 spin_unlock_irq(&x->wait.lock);
3793 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3796 #define SLEEP_ON_VAR \
3797 unsigned long flags; \
3798 wait_queue_t wait; \
3799 init_waitqueue_entry(&wait, current);
3801 #define SLEEP_ON_HEAD \
3802 spin_lock_irqsave(&q->lock,flags); \
3803 __add_wait_queue(q, &wait); \
3804 spin_unlock(&q->lock);
3806 #define SLEEP_ON_TAIL \
3807 spin_lock_irq(&q->lock); \
3808 __remove_wait_queue(q, &wait); \
3809 spin_unlock_irqrestore(&q->lock, flags);
3811 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3815 current->state = TASK_INTERRUPTIBLE;
3821 EXPORT_SYMBOL(interruptible_sleep_on);
3823 long fastcall __sched
3824 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3828 current->state = TASK_INTERRUPTIBLE;
3831 timeout = schedule_timeout(timeout);
3836 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3838 void fastcall __sched sleep_on(wait_queue_head_t *q)
3842 current->state = TASK_UNINTERRUPTIBLE;
3848 EXPORT_SYMBOL(sleep_on);
3850 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3854 current->state = TASK_UNINTERRUPTIBLE;
3857 timeout = schedule_timeout(timeout);
3863 EXPORT_SYMBOL(sleep_on_timeout);
3865 #ifdef CONFIG_RT_MUTEXES
3868 * rt_mutex_setprio - set the current priority of a task
3870 * @prio: prio value (kernel-internal form)
3872 * This function changes the 'effective' priority of a task. It does
3873 * not touch ->normal_prio like __setscheduler().
3875 * Used by the rt_mutex code to implement priority inheritance logic.
3877 void rt_mutex_setprio(struct task_struct *p, int prio)
3879 unsigned long flags;
3884 BUG_ON(prio < 0 || prio > MAX_PRIO);
3886 rq = task_rq_lock(p, &flags);
3890 on_rq = p->se.on_rq;
3892 dequeue_task(rq, p, 0, now);
3895 p->sched_class = &rt_sched_class;
3897 p->sched_class = &fair_sched_class;
3902 enqueue_task(rq, p, 0, now);
3904 * Reschedule if we are currently running on this runqueue and
3905 * our priority decreased, or if we are not currently running on
3906 * this runqueue and our priority is higher than the current's
3908 if (task_running(rq, p)) {
3909 if (p->prio > oldprio)
3910 resched_task(rq->curr);
3912 check_preempt_curr(rq, p);
3915 task_rq_unlock(rq, &flags);
3920 void set_user_nice(struct task_struct *p, long nice)
3922 int old_prio, delta, on_rq;
3923 unsigned long flags;
3927 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3930 * We have to be careful, if called from sys_setpriority(),
3931 * the task might be in the middle of scheduling on another CPU.
3933 rq = task_rq_lock(p, &flags);
3936 * The RT priorities are set via sched_setscheduler(), but we still
3937 * allow the 'normal' nice value to be set - but as expected
3938 * it won't have any effect on scheduling until the task is
3939 * SCHED_FIFO/SCHED_RR:
3941 if (task_has_rt_policy(p)) {
3942 p->static_prio = NICE_TO_PRIO(nice);
3945 on_rq = p->se.on_rq;
3947 dequeue_task(rq, p, 0, now);
3948 dec_load(rq, p, now);
3951 p->static_prio = NICE_TO_PRIO(nice);
3954 p->prio = effective_prio(p);
3955 delta = p->prio - old_prio;
3958 enqueue_task(rq, p, 0, now);
3959 inc_load(rq, p, now);
3961 * If the task increased its priority or is running and
3962 * lowered its priority, then reschedule its CPU:
3964 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3965 resched_task(rq->curr);
3968 task_rq_unlock(rq, &flags);
3970 EXPORT_SYMBOL(set_user_nice);
3973 * can_nice - check if a task can reduce its nice value
3977 int can_nice(const struct task_struct *p, const int nice)
3979 /* convert nice value [19,-20] to rlimit style value [1,40] */
3980 int nice_rlim = 20 - nice;
3982 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3983 capable(CAP_SYS_NICE));
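/*
 * Worked example for can_nice(): a request for nice -10 maps to
 * nice_rlim == 30 and is allowed only if the RLIMIT_NICE soft limit is at
 * least 30 (or the caller has CAP_SYS_NICE), while a milder nice +5
 * request (nice_rlim == 15) only needs a limit of 15.
 */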
3986 #ifdef __ARCH_WANT_SYS_NICE
3989 * sys_nice - change the priority of the current process.
3990 * @increment: priority increment
3992 * sys_setpriority is a more generic, but much slower function that
3993 * does similar things.
3995 asmlinkage long sys_nice(int increment)
4000 * Setpriority might change our priority at the same moment.
4001 * We don't have to worry. Conceptually one call occurs first
4002 * and we have a single winner.
4004 if (increment < -40)
4009 nice = PRIO_TO_NICE(current->static_prio) + increment;
4015 if (increment < 0 && !can_nice(current, nice))
4018 retval = security_task_setnice(current, nice);
4022 set_user_nice(current, nice);
4029 * task_prio - return the priority value of a given task.
4030 * @p: the task in question.
4032 * This is the priority value as seen by users in /proc.
4033 * RT tasks are offset by -200. Normal tasks are centered
4034 * around 0, value goes from -16 to +15.
4036 int task_prio(const struct task_struct *p)
4038 return p->prio - MAX_RT_PRIO;
4042 * task_nice - return the nice value of a given task.
4043 * @p: the task in question.
4045 int task_nice(const struct task_struct *p)
4047 return TASK_NICE(p);
4049 EXPORT_SYMBOL_GPL(task_nice);
4052 * idle_cpu - is a given cpu idle currently?
4053 * @cpu: the processor in question.
4055 int idle_cpu(int cpu)
4057 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4061 * idle_task - return the idle task for a given cpu.
4062 * @cpu: the processor in question.
4064 struct task_struct *idle_task(int cpu)
4066 return cpu_rq(cpu)->idle;
4070 * find_process_by_pid - find a process with a matching PID value.
4071 * @pid: the pid in question.
4073 static inline struct task_struct *find_process_by_pid(pid_t pid)
4075 return pid ? find_task_by_pid(pid) : current;
4078 /* Actually do priority change: must hold rq lock. */
4080 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4082 BUG_ON(p->se.on_rq);
4085 switch (p->policy) {
4089 p->sched_class = &fair_sched_class;
4093 p->sched_class = &rt_sched_class;
4097 p->rt_priority = prio;
4098 p->normal_prio = normal_prio(p);
4099 /* we are holding p->pi_lock already */
4100 p->prio = rt_mutex_getprio(p);
4105 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4106 * @p: the task in question.
4107 * @policy: new policy.
4108 * @param: structure containing the new RT priority.
4110 * NOTE that the task may already be dead.
4112 int sched_setscheduler(struct task_struct *p, int policy,
4113 struct sched_param *param)
4115 int retval, oldprio, oldpolicy = -1, on_rq;
4116 unsigned long flags;
4119 /* may grab non-irq protected spin_locks */
4120 BUG_ON(in_interrupt());
4122 /* double check policy once rq lock held */
4124 policy = oldpolicy = p->policy;
4125 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4126 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4127 policy != SCHED_IDLE)
4130 * Valid priorities for SCHED_FIFO and SCHED_RR are
4131 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4132 * SCHED_BATCH and SCHED_IDLE is 0.
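 *
 * For example, a user task asking for SCHED_FIFO with sched_priority 50
 * passes the checks below, while SCHED_NORMAL with a non-zero
 * sched_priority, or an RT policy with sched_priority 0, is rejected.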
4134 if (param->sched_priority < 0 ||
4135 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4136 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4138 if (rt_policy(policy) != (param->sched_priority != 0))
4142 * Allow unprivileged RT tasks to decrease priority:
4144 if (!capable(CAP_SYS_NICE)) {
4145 if (rt_policy(policy)) {
4146 unsigned long rlim_rtprio;
4148 if (!lock_task_sighand(p, &flags))
4150 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4151 unlock_task_sighand(p, &flags);
4153 /* can't set/change the rt policy */
4154 if (policy != p->policy && !rlim_rtprio)
4157 /* can't increase priority */
4158 if (param->sched_priority > p->rt_priority &&
4159 param->sched_priority > rlim_rtprio)
4163 * Like positive nice levels, don't allow tasks to
4164 * move out of SCHED_IDLE either:
4166 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4169 /* can't change other user's priorities */
4170 if ((current->euid != p->euid) &&
4171 (current->euid != p->uid))
4175 retval = security_task_setscheduler(p, policy, param);
4179 * make sure no PI-waiters arrive (or leave) while we are
4180 * changing the priority of the task:
4182 spin_lock_irqsave(&p->pi_lock, flags);
4184 * To be able to change p->policy safely, the appropriate
4185 * runqueue lock must be held.
4187 rq = __task_rq_lock(p);
4188 /* recheck policy now with rq lock held */
4189 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4190 policy = oldpolicy = -1;
4191 __task_rq_unlock(rq);
4192 spin_unlock_irqrestore(&p->pi_lock, flags);
4195 on_rq = p->se.on_rq;
4197 deactivate_task(rq, p, 0);
4199 __setscheduler(rq, p, policy, param->sched_priority);
4201 activate_task(rq, p, 0);
4203 * Reschedule if we are currently running on this runqueue and
4204 * our priority decreased, or if we are not currently running on
4205 * this runqueue and our priority is higher than the current's
4207 if (task_running(rq, p)) {
4208 if (p->prio > oldprio)
4209 resched_task(rq->curr);
4211 check_preempt_curr(rq, p);
4214 __task_rq_unlock(rq);
4215 spin_unlock_irqrestore(&p->pi_lock, flags);
4217 rt_mutex_adjust_pi(p);
4221 EXPORT_SYMBOL_GPL(sched_setscheduler);
4224 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4226 struct sched_param lparam;
4227 struct task_struct *p;
4230 if (!param || pid < 0)
4232 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4237 p = find_process_by_pid(pid);
4239 retval = sched_setscheduler(p, policy, &lparam);
4246 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4247 * @pid: the pid in question.
4248 * @policy: new policy.
4249 * @param: structure containing the new RT priority.
4251 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4252 struct sched_param __user *param)
4254 /* negative values for policy are not valid */
4258 return do_sched_setscheduler(pid, policy, param);
4262 * sys_sched_setparam - set/change the RT priority of a thread
4263 * @pid: the pid in question.
4264 * @param: structure containing the new RT priority.
4266 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4268 return do_sched_setscheduler(pid, -1, param);
4272 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4273 * @pid: the pid in question.
4275 asmlinkage long sys_sched_getscheduler(pid_t pid)
4277 struct task_struct *p;
4278 int retval = -EINVAL;
4284 read_lock(&tasklist_lock);
4285 p = find_process_by_pid(pid);
4287 retval = security_task_getscheduler(p);
4291 read_unlock(&tasklist_lock);
4298 * sys_sched_getparam - get the RT priority of a thread
4299 * @pid: the pid in question.
4300 * @param: structure containing the RT priority.
4302 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4304 struct sched_param lp;
4305 struct task_struct *p;
4306 int retval = -EINVAL;
4308 if (!param || pid < 0)
4311 read_lock(&tasklist_lock);
4312 p = find_process_by_pid(pid);
4317 retval = security_task_getscheduler(p);
4321 lp.sched_priority = p->rt_priority;
4322 read_unlock(&tasklist_lock);
4325 * This one might sleep, we cannot do it with a spinlock held ...
4327 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4333 read_unlock(&tasklist_lock);
4337 long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4339 cpumask_t cpus_allowed;
4340 struct task_struct *p;
4343 mutex_lock(&sched_hotcpu_mutex);
4344 read_lock(&tasklist_lock);
4346 p = find_process_by_pid(pid);
4348 read_unlock(&tasklist_lock);
4349 mutex_unlock(&sched_hotcpu_mutex);
4354 * It is not safe to call set_cpus_allowed with the
4355 * tasklist_lock held. We will bump the task_struct's
4356 * usage count and then drop tasklist_lock.
4359 read_unlock(&tasklist_lock);
4362 if ((current->euid != p->euid) && (current->euid != p->uid) &&
4363 !capable(CAP_SYS_NICE))
4366 retval = security_task_setscheduler(p, 0, NULL);
4370 cpus_allowed = cpuset_cpus_allowed(p);
4371 cpus_and(new_mask, new_mask, cpus_allowed);
4372 retval = set_cpus_allowed(p, new_mask);
4376 mutex_unlock(&sched_hotcpu_mutex);
4380 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4381 cpumask_t *new_mask)
4383 if (len < sizeof(cpumask_t)) {
4384 memset(new_mask, 0, sizeof(cpumask_t));
4385 } else if (len > sizeof(cpumask_t)) {
4386 len = sizeof(cpumask_t);
4388 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4392 * sys_sched_setaffinity - set the cpu affinity of a process
4393 * @pid: pid of the process
4394 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4395 * @user_mask_ptr: user-space pointer to the new cpu mask
4397 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4398 unsigned long __user *user_mask_ptr)
4403 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4407 return sched_setaffinity(pid, new_mask);
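/*
 * From user space this is reached via the sched_setaffinity(2) wrapper; a
 * rough sketch of a caller pinning itself to CPU 0 (assuming the glibc
 * cpu_set_t helpers):
 *
 *	cpu_set_t set;
 *
 *	CPU_ZERO(&set);
 *	CPU_SET(0, &set);
 *	sched_setaffinity(0, sizeof(set), &set);
 *
 * A pid of 0 means the calling process, since find_process_by_pid() maps
 * a zero pid to current.
 */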
4411 * Represents all CPUs present in the system.
4412 * In systems capable of hotplug, this map could dynamically grow
4413 * as new CPUs are detected in the system via any platform-specific
4414 * method, e.g. ACPI.
4417 cpumask_t cpu_present_map __read_mostly;
4418 EXPORT_SYMBOL(cpu_present_map);
4421 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
4422 EXPORT_SYMBOL(cpu_online_map);
4424 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
4425 EXPORT_SYMBOL(cpu_possible_map);
4428 long sched_getaffinity(pid_t pid, cpumask_t *mask)
4430 struct task_struct *p;
4433 mutex_lock(&sched_hotcpu_mutex);
4434 read_lock(&tasklist_lock);
4437 p = find_process_by_pid(pid);
4441 retval = security_task_getscheduler(p);
4445 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
4448 read_unlock(&tasklist_lock);
4449 mutex_unlock(&sched_hotcpu_mutex);
4457 * sys_sched_getaffinity - get the cpu affinity of a process
4458 * @pid: pid of the process
4459 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4460 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4462 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4463 unsigned long __user *user_mask_ptr)
4468 if (len < sizeof(cpumask_t))
4471 ret = sched_getaffinity(pid, &mask);
4475 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4478 return sizeof(cpumask_t);
4482 * sys_sched_yield - yield the current processor to other threads.
4484 * This function yields the current CPU to other tasks. If there are no
4485 * other threads running on this CPU then this function will return.
4487 asmlinkage long sys_sched_yield(void)
4489 struct rq *rq = this_rq_lock();
4491 schedstat_inc(rq, yld_cnt);
4492 if (unlikely(rq->nr_running == 1))
4493 schedstat_inc(rq, yld_act_empty);
4495 current->sched_class->yield_task(rq, current);
4498 * Since we are going to call schedule() anyway, there's
4499 * no need to preempt or enable interrupts:
4501 __release(rq->lock);
4502 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4503 _raw_spin_unlock(&rq->lock);
4504 preempt_enable_no_resched();
4511 static void __cond_resched(void)
4513 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4514 __might_sleep(__FILE__, __LINE__);
4517 * The BKS might be reacquired before we have dropped
4518 * PREEMPT_ACTIVE, which could trigger a second
4519 * cond_resched() call.
4522 add_preempt_count(PREEMPT_ACTIVE);
4524 sub_preempt_count(PREEMPT_ACTIVE);
4525 } while (need_resched());
4528 int __sched cond_resched(void)
4530 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4531 system_state == SYSTEM_RUNNING) {
4537 EXPORT_SYMBOL(cond_resched);
4540 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4541 * call schedule, and on return reacquire the lock.
4543 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4544 * operations here to prevent schedule() from being called twice (once via
4545 * spin_unlock(), once by hand).
4547 int cond_resched_lock(spinlock_t *lock)
4551 if (need_lockbreak(lock)) {
4557 if (need_resched() && system_state == SYSTEM_RUNNING) {
4558 spin_release(&lock->dep_map, 1, _THIS_IP_);
4559 _raw_spin_unlock(lock);
4560 preempt_enable_no_resched();
4567 EXPORT_SYMBOL(cond_resched_lock);
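/*
 * Typical use of cond_resched_lock() (a sketch with hypothetical helpers
 * mylock, more_work() and do_one_chunk()): a long scan under a spinlock
 * periodically gives the lock and the CPU away and then carries on:
 *
 *	spin_lock(&mylock);
 *	while (more_work()) {
 *		do_one_chunk();
 *		cond_resched_lock(&mylock);
 *	}
 *	spin_unlock(&mylock);
 *
 * A non-zero return means the lock was dropped and retaken, so any state
 * that depended on the lock being held continuously must be revalidated.
 */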
4569 int __sched cond_resched_softirq(void)
4571 BUG_ON(!in_softirq());
4573 if (need_resched() && system_state == SYSTEM_RUNNING) {
4581 EXPORT_SYMBOL(cond_resched_softirq);
4584 * yield - yield the current processor to other threads.
4586 * This is a shortcut for kernel-space yielding - it marks the
4587 * thread runnable and calls sys_sched_yield().
4589 void __sched yield(void)
4591 set_current_state(TASK_RUNNING);
4594 EXPORT_SYMBOL(yield);
4597 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4598 * that process accounting knows that this is a task in IO wait state.
4600 * But don't do that if it is a deliberate, throttling IO wait (this task
4601 * has set its backing_dev_info: the queue against which it should throttle)
4603 void __sched io_schedule(void)
4605 struct rq *rq = &__raw_get_cpu_var(runqueues);
4607 delayacct_blkio_start();
4608 atomic_inc(&rq->nr_iowait);
4610 atomic_dec(&rq->nr_iowait);
4611 delayacct_blkio_end();
4613 EXPORT_SYMBOL(io_schedule);
4615 long __sched io_schedule_timeout(long timeout)
4617 struct rq *rq = &__raw_get_cpu_var(runqueues);
4620 delayacct_blkio_start();
4621 atomic_inc(&rq->nr_iowait);
4622 ret = schedule_timeout(timeout);
4623 atomic_dec(&rq->nr_iowait);
4624 delayacct_blkio_end();
4629 * sys_sched_get_priority_max - return maximum RT priority.
4630 * @policy: scheduling class.
4632 * this syscall returns the maximum rt_priority that can be used
4633 * by a given scheduling class.
4635 asmlinkage long sys_sched_get_priority_max(int policy)
4642 ret = MAX_USER_RT_PRIO-1;
4654 * sys_sched_get_priority_min - return minimum RT priority.
4655 * @policy: scheduling class.
4657 * this syscall returns the minimum rt_priority that can be used
4658 * by a given scheduling class.
4660 asmlinkage long sys_sched_get_priority_min(int policy)
4678 * sys_sched_rr_get_interval - return the default timeslice of a process.
4679 * @pid: pid of the process.
4680 * @interval: userspace pointer to the timeslice value.
4682 * this syscall writes the default timeslice value of a given process
4683 * into the user-space timespec buffer. A value of '0' means infinity.
4686 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4688 struct task_struct *p;
4689 int retval = -EINVAL;
4696 read_lock(&tasklist_lock);
4697 p = find_process_by_pid(pid);
4701 retval = security_task_getscheduler(p);
4705 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4706 0 : static_prio_timeslice(p->static_prio), &t);
4707 read_unlock(&tasklist_lock);
4708 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4712 read_unlock(&tasklist_lock);
4716 static const char stat_nam[] = "RSDTtZX";
4718 static void show_task(struct task_struct *p)
4720 unsigned long free = 0;
4723 state = p->state ? __ffs(p->state) + 1 : 0;
4724 printk("%-13.13s %c", p->comm,
4725 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4726 #if (BITS_PER_LONG == 32)
4727 if (state == TASK_RUNNING)
4728 printk(" running ");
4730 printk(" %08lX ", thread_saved_pc(p));
4732 if (state == TASK_RUNNING)
4733 printk(" running task ");
4735 printk(" %016lx ", thread_saved_pc(p));
4737 #ifdef CONFIG_DEBUG_STACK_USAGE
4739 unsigned long *n = end_of_stack(p);
4742 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4745 printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
4747 printk(" (L-TLB)\n");
4749 printk(" (NOTLB)\n");
4751 if (state != TASK_RUNNING)
4752 show_stack(p, NULL);
4755 void show_state_filter(unsigned long state_filter)
4757 struct task_struct *g, *p;
4759 #if (BITS_PER_LONG == 32)
4762 printk(" task PC stack pid father child younger older\n");
4766 printk(" task PC stack pid father child younger older\n");
4768 read_lock(&tasklist_lock);
4769 do_each_thread(g, p) {
4771 * reset the NMI-timeout, listing all files on a slow
4772 * console might take a lot of time:
4774 touch_nmi_watchdog();
4775 if (!state_filter || (p->state & state_filter))
4777 } while_each_thread(g, p);
4779 touch_all_softlockup_watchdogs();
4781 #ifdef CONFIG_SCHED_DEBUG
4782 sysrq_sched_debug_show();
4784 read_unlock(&tasklist_lock);
4786 * Only show locks if all tasks are dumped:
4788 if (state_filter == -1)
4789 debug_show_all_locks();
4792 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4794 idle->sched_class = &idle_sched_class;
4798 * init_idle - set up an idle thread for a given CPU
4799 * @idle: task in question
4800 * @cpu: cpu the idle task belongs to
4802 * NOTE: this function does not set the idle thread's NEED_RESCHED
4803 * flag, to make booting more robust.
4805 void __cpuinit init_idle(struct task_struct *idle, int cpu)
4807 struct rq *rq = cpu_rq(cpu);
4808 unsigned long flags;
4811 idle->se.exec_start = sched_clock();
4813 idle->prio = idle->normal_prio = MAX_PRIO;
4814 idle->cpus_allowed = cpumask_of_cpu(cpu);
4815 __set_task_cpu(idle, cpu);
4817 spin_lock_irqsave(&rq->lock, flags);
4818 rq->curr = rq->idle = idle;
4819 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4822 spin_unlock_irqrestore(&rq->lock, flags);
4824 /* Set the preempt count _outside_ the spinlocks! */
4825 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4826 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4828 task_thread_info(idle)->preempt_count = 0;
4831 * The idle tasks have their own, simple scheduling class:
4833 idle->sched_class = &idle_sched_class;
4837 * In a system that switches off the HZ timer nohz_cpu_mask
4838 * indicates which cpus entered this state. This is used
4839 * in the rcu update to wait only for active cpus. For systems
4840 * that do not switch off the HZ timer, nohz_cpu_mask should
4841 * always be CPU_MASK_NONE.
4843 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4846 * Increase the granularity value when there are more CPUs,
4847 * because with more CPUs the 'effective latency' as visible
4848 * to users decreases. But the relationship is not linear,
4849 * so pick a second-best guess by going with the log2 of the
4850 * number of CPUs.
4852 * This idea comes from the SD scheduler of Con Kolivas:
4854 static inline void sched_init_granularity(void)
4856 unsigned int factor = 1 + ilog2(num_online_cpus());
4857 const unsigned long gran_limit = 10000000;
4859 sysctl_sched_granularity *= factor;
4860 if (sysctl_sched_granularity > gran_limit)
4861 sysctl_sched_granularity = gran_limit;
4863 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4864 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
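/*
 * Worked example (illustrative only): with factor = 1 + ilog2(nr_cpus),
 * 1 CPU gives 1, 2 CPUs give 2, 4 CPUs give 3, 8 CPUs give 4 and 64 CPUs
 * give 7, so the granularity grows with the log of the CPU count and is
 * then clamped to gran_limit (10000000, i.e. 10 ms if the unit is ns).
 */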
4869 * This is how migration works:
4871 * 1) we queue a struct migration_req structure in the source CPU's
4872 * runqueue and wake up that CPU's migration thread.
4873 * 2) we down() the locked semaphore => thread blocks.
4874 * 3) migration thread wakes up (implicitly it forces the migrated
4875 * thread off the CPU)
4876 * 4) it gets the migration request and checks whether the migrated
4877 * task is still in the wrong runqueue.
4878 * 5) if it's in the wrong runqueue then the migration thread removes
4879 * it and puts it into the right queue.
4880 * 6) migration thread up()s the semaphore.
4881 * 7) we wake up and the migration is done (an illustrative sketch of this handshake follows below).
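/*
 * A minimal userspace analogy of the handshake above (illustrative only,
 * not kernel code; the pthread objects stand in for the runqueue lock and
 * struct completion): the requester queues one item and blocks until the
 * worker thread marks it done.
 */
#include <pthread.h>

struct fake_req {
	int done;
	pthread_mutex_t lock;
	pthread_cond_t cond;
};

static void *fake_migration_thread(void *arg)
{
	struct fake_req *req = arg;

	/* steps 3)-5): the worker would perform the actual migration here */
	pthread_mutex_lock(&req->lock);
	req->done = 1;				/* step 6): signal completion */
	pthread_cond_signal(&req->cond);
	pthread_mutex_unlock(&req->lock);
	return NULL;
}

int fake_migrate(void)
{
	struct fake_req req = { 0, PTHREAD_MUTEX_INITIALIZER,
				PTHREAD_COND_INITIALIZER };
	pthread_t tid;

	/* step 1): hand the request to the worker (here: spawn it) */
	if (pthread_create(&tid, NULL, fake_migration_thread, &req))
		return -1;
	pthread_mutex_lock(&req.lock);
	while (!req.done)			/* step 2): block until done */
		pthread_cond_wait(&req.cond, &req.lock);
	pthread_mutex_unlock(&req.lock);
	pthread_join(tid, NULL);
	return 0;				/* step 7): migration finished */
}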
4885 * Change a given task's CPU affinity. Migrate the thread to a
4886 * proper CPU and schedule it away if the CPU it's executing on
4887 * is removed from the allowed bitmask.
4889 * NOTE: the caller must have a valid reference to the task, the
4890 * task must not exit() & deallocate itself prematurely. The
4891 * call is not atomic; no spinlocks may be held.
4893 int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
4895 struct migration_req req;
4896 unsigned long flags;
4900 rq = task_rq_lock(p, &flags);
4901 if (!cpus_intersects(new_mask, cpu_online_map)) {
4906 p->cpus_allowed = new_mask;
4907 /* Can the task run on the task's current CPU? If so, we're done */
4908 if (cpu_isset(task_cpu(p), new_mask))
4911 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4912 /* Need help from migration thread: drop lock and wait. */
4913 task_rq_unlock(rq, &flags);
4914 wake_up_process(rq->migration_thread);
4915 wait_for_completion(&req.done);
4916 tlb_migrate_finish(p->mm);
4920 task_rq_unlock(rq, &flags);
4924 EXPORT_SYMBOL_GPL(set_cpus_allowed);
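/*
 * Illustrative userspace counterpart (not part of this file): pinning the
 * calling thread to CPU 0 via sched_setaffinity(2); the syscall path ends
 * up applying the new mask through set_cpus_allowed() above.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int pin_to_cpu0_example(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);
	/* pid 0 == the calling thread */
	if (sched_setaffinity(0, sizeof(set), &set) != 0) {
		perror("sched_setaffinity");
		return -1;
	}
	return 0;
}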
4927 * Move (not current) task off this cpu, onto dest cpu. We're doing
4928 * this because either it can't run here any more (set_cpus_allowed()
4929 * away from this CPU, or CPU going down), or because we're
4930 * attempting to rebalance this task on exec (sched_exec).
4932 * So we race with normal scheduler movements, but that's OK, as long
4933 * as the task is no longer on this CPU.
4935 * Returns non-zero if task was successfully migrated.
4937 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4939 struct rq *rq_dest, *rq_src;
4942 if (unlikely(cpu_is_offline(dest_cpu)))
4945 rq_src = cpu_rq(src_cpu);
4946 rq_dest = cpu_rq(dest_cpu);
4948 double_rq_lock(rq_src, rq_dest);
4949 /* Already moved. */
4950 if (task_cpu(p) != src_cpu)
4952 /* Affinity changed (again). */
4953 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4956 on_rq = p->se.on_rq;
4958 deactivate_task(rq_src, p, 0);
4959 set_task_cpu(p, dest_cpu);
4961 activate_task(rq_dest, p, 0);
4962 check_preempt_curr(rq_dest, p);
4966 double_rq_unlock(rq_src, rq_dest);
4971 * migration_thread - this is a highprio system thread that performs
4972 * thread migration by bumping thread off CPU then 'pushing' onto
4973 * another runqueue.
4975 static int migration_thread(void *data)
4977 int cpu = (long)data;
4981 BUG_ON(rq->migration_thread != current);
4983 set_current_state(TASK_INTERRUPTIBLE);
4984 while (!kthread_should_stop()) {
4985 struct migration_req *req;
4986 struct list_head *head;
4990 spin_lock_irq(&rq->lock);
4992 if (cpu_is_offline(cpu)) {
4993 spin_unlock_irq(&rq->lock);
4997 if (rq->active_balance) {
4998 active_load_balance(rq, cpu);
4999 rq->active_balance = 0;
5002 head = &rq->migration_queue;
5004 if (list_empty(head)) {
5005 spin_unlock_irq(&rq->lock);
5007 set_current_state(TASK_INTERRUPTIBLE);
5010 req = list_entry(head->next, struct migration_req, list);
5011 list_del_init(head->next);
5013 spin_unlock(&rq->lock);
5014 __migrate_task(req->task, cpu, req->dest_cpu);
5017 complete(&req->done);
5019 __set_current_state(TASK_RUNNING);
5023 /* Wait for kthread_stop */
5024 set_current_state(TASK_INTERRUPTIBLE);
5025 while (!kthread_should_stop()) {
5027 set_current_state(TASK_INTERRUPTIBLE);
5029 __set_current_state(TASK_RUNNING);
5033 #ifdef CONFIG_HOTPLUG_CPU
5035 * Figure out where a task on the dead CPU should go; use force if necessary.
5036 * NOTE: interrupts should be disabled by the caller
5038 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5040 unsigned long flags;
5047 mask = node_to_cpumask(cpu_to_node(dead_cpu));
5048 cpus_and(mask, mask, p->cpus_allowed);
5049 dest_cpu = any_online_cpu(mask);
5051 /* On any allowed CPU? */
5052 if (dest_cpu == NR_CPUS)
5053 dest_cpu = any_online_cpu(p->cpus_allowed);
5055 /* No more Mr. Nice Guy. */
5056 if (dest_cpu == NR_CPUS) {
5057 rq = task_rq_lock(p, &flags);
5058 cpus_setall(p->cpus_allowed);
5059 dest_cpu = any_online_cpu(p->cpus_allowed);
5060 task_rq_unlock(rq, &flags);
5063 * Don't tell them about moving exiting tasks or
5064 * kernel threads (both mm NULL), since they never
5065 * leave kernel.
5067 if (p->mm && printk_ratelimit())
5068 printk(KERN_INFO "process %d (%s) no "
5069 "longer affine to cpu%d\n",
5070 p->pid, p->comm, dead_cpu);
5072 if (!__migrate_task(p, dead_cpu, dest_cpu))
5077 * While a dead CPU has no uninterruptible tasks queued at this point,
5078 * it might still have a nonzero ->nr_uninterruptible counter, because
5079 * for performance reasons the counter is not strictly tracking tasks to
5080 * their home CPUs. So we just add the counter to another CPU's counter,
5081 * to keep the global sum constant after CPU-down:
5083 static void migrate_nr_uninterruptible(struct rq *rq_src)
5085 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
5086 unsigned long flags;
5088 local_irq_save(flags);
5089 double_rq_lock(rq_src, rq_dest);
5090 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5091 rq_src->nr_uninterruptible = 0;
5092 double_rq_unlock(rq_src, rq_dest);
5093 local_irq_restore(flags);
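/*
 * Illustrative example: if the dead CPU's counter drifted to -3 and the
 * chosen online CPU currently reads +5, the online CPU ends up at +2 and
 * the dead CPU at 0, so the system-wide sum is unchanged.
 */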
5096 /* Run through task list and migrate tasks from the dead cpu. */
5097 static void migrate_live_tasks(int src_cpu)
5099 struct task_struct *p, *t;
5101 write_lock_irq(&tasklist_lock);
5103 do_each_thread(t, p) {
5107 if (task_cpu(p) == src_cpu)
5108 move_task_off_dead_cpu(src_cpu, p);
5109 } while_each_thread(t, p);
5111 write_unlock_irq(&tasklist_lock);
5115 * Schedules idle task to be the next runnable task on current CPU.
5116 * It does so by boosting its priority to highest possible and adding it to
5117 * the _front_ of the runqueue. Used by CPU offline code.
5119 void sched_idle_next(void)
5121 int this_cpu = smp_processor_id();
5122 struct rq *rq = cpu_rq(this_cpu);
5123 struct task_struct *p = rq->idle;
5124 unsigned long flags;
5126 /* cpu has to be offline */
5127 BUG_ON(cpu_online(this_cpu));
5130 * Strictly not necessary since the rest of the CPUs are stopped by now
5131 * and interrupts are disabled on the current cpu.
5133 spin_lock_irqsave(&rq->lock, flags);
5135 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5137 /* Add idle task to the _front_ of its priority queue: */
5138 activate_idle_task(p, rq);
5140 spin_unlock_irqrestore(&rq->lock, flags);
5144 * Ensures that the idle task is using init_mm right before its cpu goes
5145 * offline.
5147 void idle_task_exit(void)
5149 struct mm_struct *mm = current->active_mm;
5151 BUG_ON(cpu_online(smp_processor_id()));
5154 switch_mm(mm, &init_mm, current);
5158 /* called under rq->lock with disabled interrupts */
5159 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5161 struct rq *rq = cpu_rq(dead_cpu);
5163 /* Must be exiting, otherwise would be on tasklist. */
5164 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
5166 /* Cannot have done final schedule yet: would have vanished. */
5167 BUG_ON(p->state == TASK_DEAD);
5172 * Drop lock around migration; if someone else moves it,
5173 * that's OK. No task can be added to this CPU, so iteration is
5174 * safe.
5175 * NOTE: interrupts should be left disabled --dev@
5177 spin_unlock(&rq->lock);
5178 move_task_off_dead_cpu(dead_cpu, p);
5179 spin_lock(&rq->lock);
5184 /* release_task() removes task from tasklist, so we won't find dead tasks. */
5185 static void migrate_dead_tasks(unsigned int dead_cpu)
5187 struct rq *rq = cpu_rq(dead_cpu);
5188 struct task_struct *next;
5191 if (!rq->nr_running)
5193 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5196 migrate_dead(dead_cpu, next);
5199 #endif /* CONFIG_HOTPLUG_CPU */
5202 * migration_call - callback that gets triggered when a CPU is added.
5203 * Here we can start up the necessary migration thread for the new CPU.
5205 static int __cpuinit
5206 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5208 struct task_struct *p;
5209 int cpu = (long)hcpu;
5210 unsigned long flags;
5214 case CPU_LOCK_ACQUIRE:
5215 mutex_lock(&sched_hotcpu_mutex);
5218 case CPU_UP_PREPARE:
5219 case CPU_UP_PREPARE_FROZEN:
5220 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5223 p->flags |= PF_NOFREEZE;
5224 kthread_bind(p, cpu);
5225 /* Must be high prio: stop_machine expects to yield to it. */
5226 rq = task_rq_lock(p, &flags);
5227 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5228 task_rq_unlock(rq, &flags);
5229 cpu_rq(cpu)->migration_thread = p;
5233 case CPU_ONLINE_FROZEN:
5234 /* Strictly unnecessary, as first user will wake it. */
5235 wake_up_process(cpu_rq(cpu)->migration_thread);
5238 #ifdef CONFIG_HOTPLUG_CPU
5239 case CPU_UP_CANCELED:
5240 case CPU_UP_CANCELED_FROZEN:
5241 if (!cpu_rq(cpu)->migration_thread)
5243 /* Unbind it from offline cpu so it can run. Fall thru. */
5244 kthread_bind(cpu_rq(cpu)->migration_thread,
5245 any_online_cpu(cpu_online_map));
5246 kthread_stop(cpu_rq(cpu)->migration_thread);
5247 cpu_rq(cpu)->migration_thread = NULL;
5251 case CPU_DEAD_FROZEN:
5252 migrate_live_tasks(cpu);
5254 kthread_stop(rq->migration_thread);
5255 rq->migration_thread = NULL;
5256 /* Idle task back to normal (off runqueue, low prio) */
5257 rq = task_rq_lock(rq->idle, &flags);
5258 deactivate_task(rq, rq->idle, 0);
5259 rq->idle->static_prio = MAX_PRIO;
5260 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5261 rq->idle->sched_class = &idle_sched_class;
5262 migrate_dead_tasks(cpu);
5263 task_rq_unlock(rq, &flags);
5264 migrate_nr_uninterruptible(rq);
5265 BUG_ON(rq->nr_running != 0);
5267 /* No need to migrate the tasks: it was best-effort if
5268 * they didn't take sched_hotcpu_mutex. Just wake up
5269 * the requestors. */
5270 spin_lock_irq(&rq->lock);
5271 while (!list_empty(&rq->migration_queue)) {
5272 struct migration_req *req;
5274 req = list_entry(rq->migration_queue.next,
5275 struct migration_req, list);
5276 list_del_init(&req->list);
5277 complete(&req->done);
5279 spin_unlock_irq(&rq->lock);
5282 case CPU_LOCK_RELEASE:
5283 mutex_unlock(&sched_hotcpu_mutex);
5289 /* Register at highest priority so that task migration (migrate_all_tasks)
5290 * happens before everything else.
5292 static struct notifier_block __cpuinitdata migration_notifier = {
5293 .notifier_call = migration_call,
5297 int __init migration_init(void)
5299 void *cpu = (void *)(long)smp_processor_id();
5302 /* Start one for the boot CPU: */
5303 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5304 BUG_ON(err == NOTIFY_BAD);
5305 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5306 register_cpu_notifier(&migration_notifier);
5314 /* Number of possible processor ids */
5315 int nr_cpu_ids __read_mostly = NR_CPUS;
5316 EXPORT_SYMBOL(nr_cpu_ids);
5318 #undef SCHED_DOMAIN_DEBUG
5319 #ifdef SCHED_DOMAIN_DEBUG
5320 static void sched_domain_debug(struct sched_domain *sd, int cpu)
5325 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5329 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5334 struct sched_group *group = sd->groups;
5335 cpumask_t groupmask;
5337 cpumask_scnprintf(str, NR_CPUS, sd->span);
5338 cpus_clear(groupmask);
5341 for (i = 0; i < level + 1; i++)
5343 printk("domain %d: ", level);
5345 if (!(sd->flags & SD_LOAD_BALANCE)) {
5346 printk("does not load-balance\n");
5348 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5353 printk("span %s\n", str);
5355 if (!cpu_isset(cpu, sd->span))
5356 printk(KERN_ERR "ERROR: domain->span does not contain "
5358 if (!cpu_isset(cpu, group->cpumask))
5359 printk(KERN_ERR "ERROR: domain->groups does not contain"
5363 for (i = 0; i < level + 2; i++)
5369 printk(KERN_ERR "ERROR: group is NULL\n");
5373 if (!group->__cpu_power) {
5375 printk(KERN_ERR "ERROR: domain->cpu_power not "
5379 if (!cpus_weight(group->cpumask)) {
5381 printk(KERN_ERR "ERROR: empty group\n");
5384 if (cpus_intersects(groupmask, group->cpumask)) {
5386 printk(KERN_ERR "ERROR: repeated CPUs\n");
5389 cpus_or(groupmask, groupmask, group->cpumask);
5391 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5394 group = group->next;
5395 } while (group != sd->groups);
5398 if (!cpus_equal(sd->span, groupmask))
5399 printk(KERN_ERR "ERROR: groups don't span "
5407 if (!cpus_subset(groupmask, sd->span))
5408 printk(KERN_ERR "ERROR: parent span is not a superset "
5409 "of domain->span\n");
5414 # define sched_domain_debug(sd, cpu) do { } while (0)
5417 static int sd_degenerate(struct sched_domain *sd)
5419 if (cpus_weight(sd->span) == 1)
5422 /* Following flags need at least 2 groups */
5423 if (sd->flags & (SD_LOAD_BALANCE |
5424 SD_BALANCE_NEWIDLE |
5428 SD_SHARE_PKG_RESOURCES)) {
5429 if (sd->groups != sd->groups->next)
5433 /* Following flags don't use groups */
5434 if (sd->flags & (SD_WAKE_IDLE |
5443 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5445 unsigned long cflags = sd->flags, pflags = parent->flags;
5447 if (sd_degenerate(parent))
5450 if (!cpus_equal(sd->span, parent->span))
5453 /* Does parent contain flags not in child? */
5454 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5455 if (cflags & SD_WAKE_AFFINE)
5456 pflags &= ~SD_WAKE_BALANCE;
5457 /* Flags needing groups don't count if only 1 group in parent */
5458 if (parent->groups == parent->groups->next) {
5459 pflags &= ~(SD_LOAD_BALANCE |
5460 SD_BALANCE_NEWIDLE |
5464 SD_SHARE_PKG_RESOURCES);
5466 if (~cflags & pflags)
5473 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5474 * hold the hotplug lock.
5476 static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5478 struct rq *rq = cpu_rq(cpu);
5479 struct sched_domain *tmp;
5481 /* Remove the sched domains which do not contribute to scheduling. */
5482 for (tmp = sd; tmp; tmp = tmp->parent) {
5483 struct sched_domain *parent = tmp->parent;
5486 if (sd_parent_degenerate(tmp, parent)) {
5487 tmp->parent = parent->parent;
5489 parent->parent->child = tmp;
5493 if (sd && sd_degenerate(sd)) {
5499 sched_domain_debug(sd, cpu);
5501 rcu_assign_pointer(rq->sd, sd);
5504 /* cpus with isolated domains */
5505 static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
5507 /* Setup the mask of cpus configured for isolated domains */
5508 static int __init isolated_cpu_setup(char *str)
5510 int ints[NR_CPUS], i;
5512 str = get_options(str, ARRAY_SIZE(ints), ints);
5513 cpus_clear(cpu_isolated_map);
5514 for (i = 1; i <= ints[0]; i++)
5515 if (ints[i] < NR_CPUS)
5516 cpu_set(ints[i], cpu_isolated_map);
5520 __setup ("isolcpus=", isolated_cpu_setup);
5523 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5524 * to a function which identifies what group (along with the sched group) a CPU
5525 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
5526 * (due to the fact that we keep track of groups covered with a cpumask_t).
5528 * init_sched_build_groups will build a circular linked list of the groups
5529 * covered by the given span, and will set each group's ->cpumask correctly,
5530 * and ->cpu_power to 0.
5533 init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5534 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5535 struct sched_group **sg))
5537 struct sched_group *first = NULL, *last = NULL;
5538 cpumask_t covered = CPU_MASK_NONE;
5541 for_each_cpu_mask(i, span) {
5542 struct sched_group *sg;
5543 int group = group_fn(i, cpu_map, &sg);
5546 if (cpu_isset(i, covered))
5549 sg->cpumask = CPU_MASK_NONE;
5550 sg->__cpu_power = 0;
5552 for_each_cpu_mask(j, span) {
5553 if (group_fn(j, cpu_map, NULL) != group)
5556 cpu_set(j, covered);
5557 cpu_set(j, sg->cpumask);
5568 #define SD_NODES_PER_DOMAIN 16
5573 * find_next_best_node - find the next node to include in a sched_domain
5574 * @node: node whose sched_domain we're building
5575 * @used_nodes: nodes already in the sched_domain
5577 * Find the next node to include in a given scheduling domain. Simply
5578 * finds the closest node not already in the @used_nodes map.
5580 * Should use nodemask_t.
5582 static int find_next_best_node(int node, unsigned long *used_nodes)
5584 int i, n, val, min_val, best_node = 0;
5588 for (i = 0; i < MAX_NUMNODES; i++) {
5589 /* Start at @node */
5590 n = (node + i) % MAX_NUMNODES;
5592 if (!nr_cpus_node(n))
5595 /* Skip already used nodes */
5596 if (test_bit(n, used_nodes))
5599 /* Simple min distance search */
5600 val = node_distance(node, n);
5602 if (val < min_val) {
5608 set_bit(best_node, used_nodes);
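/*
 * Illustrative sketch with a made-up 4-node distance table (the ex_* names
 * are hypothetical, not kernel symbols): repeating the "closest unused
 * node" selection above visits nodes in order of increasing distance from
 * the starting node.
 */
#include <limits.h>
#include <stdio.h>

#define EX_NR_NODES 4

static const int ex_node_distance[EX_NR_NODES][EX_NR_NODES] = {
	{ 10, 20, 40, 30 },
	{ 20, 10, 30, 40 },
	{ 40, 30, 10, 20 },
	{ 30, 40, 20, 10 },
};

static int ex_find_next_best_node(int node, int *used)
{
	int n, best_node = 0, min_val = INT_MAX;

	for (n = 0; n < EX_NR_NODES; n++) {
		if (used[n])
			continue;
		if (ex_node_distance[node][n] < min_val) {
			min_val = ex_node_distance[node][n];
			best_node = n;
		}
	}
	used[best_node] = 1;
	return best_node;
}

int ex_span_order_demo(void)
{
	int used[EX_NR_NODES] = { 0 };
	int i;

	used[0] = 1;		/* the starting node is already in the span */
	for (i = 1; i < EX_NR_NODES; i++)
		printf("next node: %d\n", ex_find_next_best_node(0, used));
	return 0;		/* prints 1, then 3, then 2 for this table */
}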
5613 * sched_domain_node_span - get a cpumask for a node's sched_domain
5614 * @node: node whose cpumask we're constructing
5615 * @size: number of nodes to include in this span
5617 * Given a node, construct a good cpumask for its sched_domain to span. It
5618 * should be one that prevents unnecessary balancing, but also spreads tasks
5619 * out optimally.
5621 static cpumask_t sched_domain_node_span(int node)
5623 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
5624 cpumask_t span, nodemask;
5628 bitmap_zero(used_nodes, MAX_NUMNODES);
5630 nodemask = node_to_cpumask(node);
5631 cpus_or(span, span, nodemask);
5632 set_bit(node, used_nodes);
5634 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5635 int next_node = find_next_best_node(node, used_nodes);
5637 nodemask = node_to_cpumask(next_node);
5638 cpus_or(span, span, nodemask);
5645 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5648 * SMT sched-domains:
5650 #ifdef CONFIG_SCHED_SMT
5651 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
5652 static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
5654 static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
5655 struct sched_group **sg)
5658 *sg = &per_cpu(sched_group_cpus, cpu);
5664 * multi-core sched-domains:
5666 #ifdef CONFIG_SCHED_MC
5667 static DEFINE_PER_CPU(struct sched_domain, core_domains);
5668 static DEFINE_PER_CPU(struct sched_group, sched_group_core);
5671 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
5672 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5673 struct sched_group **sg)
5676 cpumask_t mask = cpu_sibling_map[cpu];
5677 cpus_and(mask, mask, *cpu_map);
5678 group = first_cpu(mask);
5680 *sg = &per_cpu(sched_group_core, group);
5683 #elif defined(CONFIG_SCHED_MC)
5684 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5685 struct sched_group **sg)
5688 *sg = &per_cpu(sched_group_core, cpu);
5693 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5694 static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
5696 static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
5697 struct sched_group **sg)
5700 #ifdef CONFIG_SCHED_MC
5701 cpumask_t mask = cpu_coregroup_map(cpu);
5702 cpus_and(mask, mask, *cpu_map);
5703 group = first_cpu(mask);
5704 #elif defined(CONFIG_SCHED_SMT)
5705 cpumask_t mask = cpu_sibling_map[cpu];
5706 cpus_and(mask, mask, *cpu_map);
5707 group = first_cpu(mask);
5712 *sg = &per_cpu(sched_group_phys, group);
5718 * The init_sched_build_groups can't handle what we want to do with node
5719 * groups, so roll our own. Now each node has its own list of groups which
5720 * gets dynamically allocated.
5722 static DEFINE_PER_CPU(struct sched_domain, node_domains);
5723 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
5725 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
5726 static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
5728 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
5729 struct sched_group **sg)
5731 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
5734 cpus_and(nodemask, nodemask, *cpu_map);
5735 group = first_cpu(nodemask);
5738 *sg = &per_cpu(sched_group_allnodes, group);
5742 static void init_numa_sched_groups_power(struct sched_group *group_head)
5744 struct sched_group *sg = group_head;
5750 for_each_cpu_mask(j, sg->cpumask) {
5751 struct sched_domain *sd;
5753 sd = &per_cpu(phys_domains, j);
5754 if (j != first_cpu(sd->groups->cpumask)) {
5756 * Only add "power" once for each
5757 * physical package.
5762 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
5765 if (sg != group_head)
5771 /* Free memory allocated for various sched_group structures */
5772 static void free_sched_groups(const cpumask_t *cpu_map)
5776 for_each_cpu_mask(cpu, *cpu_map) {
5777 struct sched_group **sched_group_nodes
5778 = sched_group_nodes_bycpu[cpu];
5780 if (!sched_group_nodes)
5783 for (i = 0; i < MAX_NUMNODES; i++) {
5784 cpumask_t nodemask = node_to_cpumask(i);
5785 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5787 cpus_and(nodemask, nodemask, *cpu_map);
5788 if (cpus_empty(nodemask))
5798 if (oldsg != sched_group_nodes[i])
5801 kfree(sched_group_nodes);
5802 sched_group_nodes_bycpu[cpu] = NULL;
5806 static void free_sched_groups(const cpumask_t *cpu_map)
5812 * Initialize sched groups cpu_power.
5814 * cpu_power indicates the capacity of a sched group, which is used while
5815 * distributing the load between different sched groups in a sched domain.
5816 * Typically cpu_power for all the groups in a sched domain will be the same unless
5817 * there are asymmetries in the topology. If there are asymmetries, the group
5818 * having more cpu_power will pick up more load compared to the group having
5819 * less cpu_power.
5821 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
5822 * the maximum number of tasks a group can handle in the presence of other idle
5823 * or lightly loaded groups in the same sched domain.
5825 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5827 struct sched_domain *child;
5828 struct sched_group *group;
5830 WARN_ON(!sd || !sd->groups);
5832 if (cpu != first_cpu(sd->groups->cpumask))
5837 sd->groups->__cpu_power = 0;
5840 * For perf policy, if the groups in the child domain share resources
5841 * (for example cores sharing some portions of the cache hierarchy
5842 * or SMT), then set this domain's groups' cpu_power such that each group
5843 * can handle only one task, when there are other idle groups in the
5844 * same sched domain.
5846 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
5848 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
5849 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
5854 * add cpu_power of each child group to this group's cpu_power
5856 group = child->groups;
5858 sg_inc_cpu_power(sd->groups, group->__cpu_power);
5859 group = group->next;
5860 } while (group != child->groups);
5864 * Build sched domains for a given set of cpus and attach the sched domains
5865 * to the individual cpus
5867 static int build_sched_domains(const cpumask_t *cpu_map)
5871 struct sched_group **sched_group_nodes = NULL;
5872 int sd_allnodes = 0;
5875 * Allocate the per-node list of sched groups
5877 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
5879 if (!sched_group_nodes) {
5880 printk(KERN_WARNING "Can not alloc sched group node list\n");
5883 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5887 * Set up domains for cpus specified by the cpu_map.
5889 for_each_cpu_mask(i, *cpu_map) {
5890 struct sched_domain *sd = NULL, *p;
5891 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
5893 cpus_and(nodemask, nodemask, *cpu_map);
5896 if (cpus_weight(*cpu_map) >
5897 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
5898 sd = &per_cpu(allnodes_domains, i);
5899 *sd = SD_ALLNODES_INIT;
5900 sd->span = *cpu_map;
5901 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
5907 sd = &per_cpu(node_domains, i);
5909 sd->span = sched_domain_node_span(cpu_to_node(i));
5913 cpus_and(sd->span, sd->span, *cpu_map);
5917 sd = &per_cpu(phys_domains, i);
5919 sd->span = nodemask;
5923 cpu_to_phys_group(i, cpu_map, &sd->groups);
5925 #ifdef CONFIG_SCHED_MC
5927 sd = &per_cpu(core_domains, i);
5929 sd->span = cpu_coregroup_map(i);
5930 cpus_and(sd->span, sd->span, *cpu_map);
5933 cpu_to_core_group(i, cpu_map, &sd->groups);
5936 #ifdef CONFIG_SCHED_SMT
5938 sd = &per_cpu(cpu_domains, i);
5939 *sd = SD_SIBLING_INIT;
5940 sd->span = cpu_sibling_map[i];
5941 cpus_and(sd->span, sd->span, *cpu_map);
5944 cpu_to_cpu_group(i, cpu_map, &sd->groups);
5948 #ifdef CONFIG_SCHED_SMT
5949 /* Set up CPU (sibling) groups */
5950 for_each_cpu_mask(i, *cpu_map) {
5951 cpumask_t this_sibling_map = cpu_sibling_map[i];
5952 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
5953 if (i != first_cpu(this_sibling_map))
5956 init_sched_build_groups(this_sibling_map, cpu_map,
5961 #ifdef CONFIG_SCHED_MC
5962 /* Set up multi-core groups */
5963 for_each_cpu_mask(i, *cpu_map) {
5964 cpumask_t this_core_map = cpu_coregroup_map(i);
5965 cpus_and(this_core_map, this_core_map, *cpu_map);
5966 if (i != first_cpu(this_core_map))
5968 init_sched_build_groups(this_core_map, cpu_map,
5969 &cpu_to_core_group);
5973 /* Set up physical groups */
5974 for (i = 0; i < MAX_NUMNODES; i++) {
5975 cpumask_t nodemask = node_to_cpumask(i);
5977 cpus_and(nodemask, nodemask, *cpu_map);
5978 if (cpus_empty(nodemask))
5981 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
5985 /* Set up node groups */
5987 init_sched_build_groups(*cpu_map, cpu_map,
5988 &cpu_to_allnodes_group);
5990 for (i = 0; i < MAX_NUMNODES; i++) {
5991 /* Set up node groups */
5992 struct sched_group *sg, *prev;
5993 cpumask_t nodemask = node_to_cpumask(i);
5994 cpumask_t domainspan;
5995 cpumask_t covered = CPU_MASK_NONE;
5998 cpus_and(nodemask, nodemask, *cpu_map);
5999 if (cpus_empty(nodemask)) {
6000 sched_group_nodes[i] = NULL;
6004 domainspan = sched_domain_node_span(i);
6005 cpus_and(domainspan, domainspan, *cpu_map);
6007 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
6009 printk(KERN_WARNING "Can not alloc domain group for "
6013 sched_group_nodes[i] = sg;
6014 for_each_cpu_mask(j, nodemask) {
6015 struct sched_domain *sd;
6016 sd = &per_cpu(node_domains, j);
6019 sg->__cpu_power = 0;
6020 sg->cpumask = nodemask;
6022 cpus_or(covered, covered, nodemask);
6025 for (j = 0; j < MAX_NUMNODES; j++) {
6026 cpumask_t tmp, notcovered;
6027 int n = (i + j) % MAX_NUMNODES;
6029 cpus_complement(notcovered, covered);
6030 cpus_and(tmp, notcovered, *cpu_map);
6031 cpus_and(tmp, tmp, domainspan);
6032 if (cpus_empty(tmp))
6035 nodemask = node_to_cpumask(n);
6036 cpus_and(tmp, tmp, nodemask);
6037 if (cpus_empty(tmp))
6040 sg = kmalloc_node(sizeof(struct sched_group),
6044 "Can not alloc domain group for node %d\n", j);
6047 sg->__cpu_power = 0;
6049 sg->next = prev->next;
6050 cpus_or(covered, covered, tmp);
6057 /* Calculate CPU power for physical packages and nodes */
6058 #ifdef CONFIG_SCHED_SMT
6059 for_each_cpu_mask(i, *cpu_map) {
6060 struct sched_domain *sd = &per_cpu(cpu_domains, i);
6062 init_sched_groups_power(i, sd);
6065 #ifdef CONFIG_SCHED_MC
6066 for_each_cpu_mask(i, *cpu_map) {
6067 struct sched_domain *sd = &per_cpu(core_domains, i);
6069 init_sched_groups_power(i, sd);
6073 for_each_cpu_mask(i, *cpu_map) {
6074 struct sched_domain *sd = &per_cpu(phys_domains, i);
6076 init_sched_groups_power(i, sd);
6080 for (i = 0; i < MAX_NUMNODES; i++)
6081 init_numa_sched_groups_power(sched_group_nodes[i]);
6084 struct sched_group *sg;
6086 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6087 init_numa_sched_groups_power(sg);
6091 /* Attach the domains */
6092 for_each_cpu_mask(i, *cpu_map) {
6093 struct sched_domain *sd;
6094 #ifdef CONFIG_SCHED_SMT
6095 sd = &per_cpu(cpu_domains, i);
6096 #elif defined(CONFIG_SCHED_MC)
6097 sd = &per_cpu(core_domains, i);
6099 sd = &per_cpu(phys_domains, i);
6101 cpu_attach_domain(sd, i);
6108 free_sched_groups(cpu_map);
6113 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6115 static int arch_init_sched_domains(const cpumask_t *cpu_map)
6117 cpumask_t cpu_default_map;
6121 * Setup mask for cpus without special case scheduling requirements.
6122 * For now this just excludes isolated cpus, but could be used to
6123 * exclude other special cases in the future.
6125 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6127 err = build_sched_domains(&cpu_default_map);
6132 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
6134 free_sched_groups(cpu_map);
6138 * Detach sched domains from a group of cpus specified in cpu_map
6139 * These cpus will now be attached to the NULL domain
6141 static void detach_destroy_domains(const cpumask_t *cpu_map)
6145 for_each_cpu_mask(i, *cpu_map)
6146 cpu_attach_domain(NULL, i);
6147 synchronize_sched();
6148 arch_destroy_sched_domains(cpu_map);
6152 * Partition sched domains as specified by the cpumasks below.
6153 * This attaches all cpus from the cpumasks to the NULL domain,
6154 * waits for an RCU quiescent period, recalculates sched
6155 * domain information and then attaches them back to the
6156 * correct sched domains.
6157 * Call with hotplug lock held.
6159 int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6161 cpumask_t change_map;
6164 cpus_and(*partition1, *partition1, cpu_online_map);
6165 cpus_and(*partition2, *partition2, cpu_online_map);
6166 cpus_or(change_map, *partition1, *partition2);
6168 /* Detach sched domains from all of the affected cpus */
6169 detach_destroy_domains(&change_map);
6170 if (!cpus_empty(*partition1))
6171 err = build_sched_domains(partition1);
6172 if (!err && !cpus_empty(*partition2))
6173 err = build_sched_domains(partition2);
6178 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6179 int arch_reinit_sched_domains(void)
6183 mutex_lock(&sched_hotcpu_mutex);
6184 detach_destroy_domains(&cpu_online_map);
6185 err = arch_init_sched_domains(&cpu_online_map);
6186 mutex_unlock(&sched_hotcpu_mutex);
6191 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6195 if (buf[0] != '0' && buf[0] != '1')
6199 sched_smt_power_savings = (buf[0] == '1');
6201 sched_mc_power_savings = (buf[0] == '1');
6203 ret = arch_reinit_sched_domains();
6205 return ret ? ret : count;
6208 int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6212 #ifdef CONFIG_SCHED_SMT
6214 err = sysfs_create_file(&cls->kset.kobj,
6215 &attr_sched_smt_power_savings.attr);
6217 #ifdef CONFIG_SCHED_MC
6218 if (!err && mc_capable())
6219 err = sysfs_create_file(&cls->kset.kobj,
6220 &attr_sched_mc_power_savings.attr);
6226 #ifdef CONFIG_SCHED_MC
6227 static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6229 return sprintf(page, "%u\n", sched_mc_power_savings);
6231 static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6232 const char *buf, size_t count)
6234 return sched_power_savings_store(buf, count, 0);
6236 SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6237 sched_mc_power_savings_store);
6240 #ifdef CONFIG_SCHED_SMT
6241 static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6243 return sprintf(page, "%u\n", sched_smt_power_savings);
6245 static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6246 const char *buf, size_t count)
6248 return sched_power_savings_store(buf, count, 1);
6250 SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6251 sched_smt_power_savings_store);
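/*
 * Usage note (the path is the conventional sysfs location, stated as an
 * assumption): the attributes above normally appear under
 * /sys/devices/system/cpu/, so e.g.
 *
 *	echo 1 > /sys/devices/system/cpu/sched_mc_power_savings
 *
 * flips the multi-core power-savings policy and triggers
 * arch_reinit_sched_domains().
 */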
6255 * Force a reinitialization of the sched domains hierarchy. The domains
6256 * and groups cannot be updated in place without racing with the balancing
6257 * code, so we temporarily attach all running cpus to the NULL domain
6258 * which will prevent rebalancing while the sched domains are recalculated.
6260 static int update_sched_domains(struct notifier_block *nfb,
6261 unsigned long action, void *hcpu)
6264 case CPU_UP_PREPARE:
6265 case CPU_UP_PREPARE_FROZEN:
6266 case CPU_DOWN_PREPARE:
6267 case CPU_DOWN_PREPARE_FROZEN:
6268 detach_destroy_domains(&cpu_online_map);
6271 case CPU_UP_CANCELED:
6272 case CPU_UP_CANCELED_FROZEN:
6273 case CPU_DOWN_FAILED:
6274 case CPU_DOWN_FAILED_FROZEN:
6276 case CPU_ONLINE_FROZEN:
6278 case CPU_DEAD_FROZEN:
6280 * Fall through and re-initialise the domains.
6287 /* The hotplug lock is already held by cpu_up/cpu_down */
6288 arch_init_sched_domains(&cpu_online_map);
6293 void __init sched_init_smp(void)
6295 cpumask_t non_isolated_cpus;
6297 mutex_lock(&sched_hotcpu_mutex);
6298 arch_init_sched_domains(&cpu_online_map);
6299 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6300 if (cpus_empty(non_isolated_cpus))
6301 cpu_set(smp_processor_id(), non_isolated_cpus);
6302 mutex_unlock(&sched_hotcpu_mutex);
6303 /* XXX: Theoretical race here - CPU may be hotplugged now */
6304 hotcpu_notifier(update_sched_domains, 0);
6306 /* Move init over to a non-isolated CPU */
6307 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6309 sched_init_granularity();
6312 void __init sched_init_smp(void)
6314 sched_init_granularity();
6316 #endif /* CONFIG_SMP */
6318 int in_sched_functions(unsigned long addr)
6320 /* Linker adds these: start and end of __sched functions */
6321 extern char __sched_text_start[], __sched_text_end[];
6323 return in_lock_functions(addr) ||
6324 (addr >= (unsigned long)__sched_text_start
6325 && addr < (unsigned long)__sched_text_end);
6328 static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6330 cfs_rq->tasks_timeline = RB_ROOT;
6331 cfs_rq->fair_clock = 1;
6332 #ifdef CONFIG_FAIR_GROUP_SCHED
6337 void __init sched_init(void)
6339 u64 now = sched_clock();
6340 int highest_cpu = 0;
6344 * Link up the scheduling class hierarchy:
6346 rt_sched_class.next = &fair_sched_class;
6347 fair_sched_class.next = &idle_sched_class;
6348 idle_sched_class.next = NULL;
6350 for_each_possible_cpu(i) {
6351 struct rt_prio_array *array;
6355 spin_lock_init(&rq->lock);
6356 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
6359 init_cfs_rq(&rq->cfs, rq);
6360 #ifdef CONFIG_FAIR_GROUP_SCHED
6361 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6362 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6364 rq->ls.load_update_last = now;
6365 rq->ls.load_update_start = now;
6367 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6368 rq->cpu_load[j] = 0;
6371 rq->active_balance = 0;
6372 rq->next_balance = jiffies;
6375 rq->migration_thread = NULL;
6376 INIT_LIST_HEAD(&rq->migration_queue);
6378 atomic_set(&rq->nr_iowait, 0);
6380 array = &rq->rt.active;
6381 for (j = 0; j < MAX_RT_PRIO; j++) {
6382 INIT_LIST_HEAD(array->queue + j);
6383 __clear_bit(j, array->bitmap);
6386 /* delimiter for bitsearch: */
6387 __set_bit(MAX_RT_PRIO, array->bitmap);
6390 set_load_weight(&init_task);
6393 nr_cpu_ids = highest_cpu + 1;
6394 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6397 #ifdef CONFIG_RT_MUTEXES
6398 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6402 * The boot idle thread does lazy MMU switching as well:
6404 atomic_inc(&init_mm.mm_count);
6405 enter_lazy_tlb(&init_mm, current);
6408 * Make us the idle thread. Technically, schedule() should not be
6409 * called from this thread, however somewhere below it might be,
6410 * but because we are the idle thread, we just pick up running again
6411 * when this runqueue becomes "idle".
6413 init_idle(current, smp_processor_id());
6415 * During early bootup we pretend to be a normal task:
6417 current->sched_class = &fair_sched_class;
6420 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6421 void __might_sleep(char *file, int line)
6424 static unsigned long prev_jiffy; /* ratelimiting */
6426 if ((in_atomic() || irqs_disabled()) &&
6427 system_state == SYSTEM_RUNNING && !oops_in_progress) {
6428 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6430 prev_jiffy = jiffies;
6431 printk(KERN_ERR "BUG: sleeping function called from invalid"
6432 " context at %s:%d\n", file, line);
6433 printk("in_atomic():%d, irqs_disabled():%d\n",
6434 in_atomic(), irqs_disabled());
6435 debug_show_held_locks(current);
6436 if (irqs_disabled())
6437 print_irqtrace_events(current);
6442 EXPORT_SYMBOL(__might_sleep);
6445 #ifdef CONFIG_MAGIC_SYSRQ
6446 void normalize_rt_tasks(void)
6448 struct task_struct *g, *p;
6449 unsigned long flags;
6453 read_lock_irq(&tasklist_lock);
6454 do_each_thread(g, p) {
6456 p->se.wait_runtime = 0;
6457 p->se.wait_start_fair = 0;
6458 p->se.wait_start = 0;
6459 p->se.exec_start = 0;
6460 p->se.sleep_start = 0;
6461 p->se.sleep_start_fair = 0;
6462 p->se.block_start = 0;
6463 task_rq(p)->cfs.fair_clock = 0;
6464 task_rq(p)->clock = 0;
6468 * Renice negative nice level userspace
6471 if (TASK_NICE(p) < 0 && p->mm)
6472 set_user_nice(p, 0);
6476 spin_lock_irqsave(&p->pi_lock, flags);
6477 rq = __task_rq_lock(p);
6480 * Do not touch the migration thread:
6482 if (p == rq->migration_thread)
6486 on_rq = p->se.on_rq;
6488 deactivate_task(task_rq(p), p, 0);
6489 __setscheduler(rq, p, SCHED_NORMAL, 0);
6491 activate_task(task_rq(p), p, 0);
6492 resched_task(rq->curr);
6497 __task_rq_unlock(rq);
6498 spin_unlock_irqrestore(&p->pi_lock, flags);
6499 } while_each_thread(g, p);
6501 read_unlock_irq(&tasklist_lock);
6504 #endif /* CONFIG_MAGIC_SYSRQ */
6508 * These functions are only useful for the IA64 MCA handling.
6510 * They can only be called when the whole system has been
6511 * stopped - every CPU needs to be quiescent, and no scheduling
6512 * activity can take place. Using them for anything else would
6513 * be a serious bug, and as a result, they aren't even visible
6514 * under any other configuration.
6518 * curr_task - return the current task for a given cpu.
6519 * @cpu: the processor in question.
6521 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6523 struct task_struct *curr_task(int cpu)
6525 return cpu_curr(cpu);
6529 * set_curr_task - set the current task for a given cpu.
6530 * @cpu: the processor in question.
6531 * @p: the task pointer to set.
6533 * Description: This function must only be used when non-maskable interrupts
6534 * are serviced on a separate stack. It allows the architecture to switch the
6535 * notion of the current task on a cpu in a non-blocking manner. This function
6536 * must be called with all CPUs synchronized and interrupts disabled; the
6537 * caller must save the original value of the current task (see
6538 * curr_task() above) and restore that value before reenabling interrupts and
6539 * re-starting the system.
6541 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6543 void set_curr_task(int cpu, struct task_struct *p)