[PATCH] sched: improve weight-array comments
[linux-block.git] / kernel / sched.c
1da177e4
LT
1/*
2 * kernel/sched.c
3 *
4 * Kernel scheduler and related syscalls
5 *
6 * Copyright (C) 1991-2002 Linus Torvalds
7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin
c31f2e8a
IM
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
1da177e4
LT
25 */
26
27#include <linux/mm.h>
28#include <linux/module.h>
29#include <linux/nmi.h>
30#include <linux/init.h>
dff06c15 31#include <linux/uaccess.h>
1da177e4
LT
32#include <linux/highmem.h>
33#include <linux/smp_lock.h>
34#include <asm/mmu_context.h>
35#include <linux/interrupt.h>
c59ede7b 36#include <linux/capability.h>
1da177e4
LT
37#include <linux/completion.h>
38#include <linux/kernel_stat.h>
9a11b49a 39#include <linux/debug_locks.h>
1da177e4
LT
40#include <linux/security.h>
41#include <linux/notifier.h>
42#include <linux/profile.h>
7dfb7103 43#include <linux/freezer.h>
198e2f18 44#include <linux/vmalloc.h>
1da177e4
LT
45#include <linux/blkdev.h>
46#include <linux/delay.h>
47#include <linux/smp.h>
48#include <linux/threads.h>
49#include <linux/timer.h>
50#include <linux/rcupdate.h>
51#include <linux/cpu.h>
52#include <linux/cpuset.h>
53#include <linux/percpu.h>
54#include <linux/kthread.h>
55#include <linux/seq_file.h>
56#include <linux/syscalls.h>
57#include <linux/times.h>
8f0ab514 58#include <linux/tsacct_kern.h>
c6fd91f0 59#include <linux/kprobes.h>
0ff92245 60#include <linux/delayacct.h>
5517d86b 61#include <linux/reciprocal_div.h>
dff06c15 62#include <linux/unistd.h>
1da177e4 63
5517d86b 64#include <asm/tlb.h>
1da177e4 65
b035b6de
AD
66/*
67 * Scheduler clock - returns current time in nanosec units.
68 * This is the default implementation.
69 * Architectures and sub-architectures can override this.
70 */
71unsigned long long __attribute__((weak)) sched_clock(void)
72{
73 return (unsigned long long)jiffies * (1000000000 / HZ);
74}
75
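A note for orientation: with this fallback, time only advances in whole jiffies, so the clock granularity is 1e9/HZ nanoseconds. A trivial standalone sketch of that arithmetic (userspace C; HZ=250 is just an assumed example, the real value comes from the kernel config):

#include <stdio.h>

#define HZ 250	/* assumed for illustration; the real value is a config option */

int main(void)
{
	unsigned long long jiffies = 100000;	/* arbitrary sample tick count */
	unsigned long long now = jiffies * (1000000000ULL / HZ);

	/* each jiffy is worth 1e9/HZ ns, i.e. 4,000,000 ns at HZ=250 */
	printf("resolution: %llu ns/jiffy, now = %llu ns\n",
	       1000000000ULL / HZ, now);
	return 0;
}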
1da177e4
LT
76/*
77 * Convert user-nice values [ -20 ... 0 ... 19 ]
78 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
79 * and back.
80 */
81#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
82#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
83#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
84
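For illustration, here is how these macros lay the 40 nice levels out above the RT priority range (a standalone userspace sketch; MAX_RT_PRIO = 100 and MAX_PRIO = 140 are assumed to match the usual kernel values):

#include <stdio.h>

#define MAX_RT_PRIO	100			/* assumed: usual kernel value */
#define MAX_PRIO	(MAX_RT_PRIO + 40)

#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)

int main(void)
{
	int nice;

	/* nice -20 -> prio 100, nice 0 -> prio 120, nice +19 -> prio 139 */
	for (nice = -20; nice <= 19; nice++)
		printf("nice %3d -> static prio %3d -> nice %3d\n",
		       nice, NICE_TO_PRIO(nice),
		       PRIO_TO_NICE(NICE_TO_PRIO(nice)));
	return 0;
}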
85/*
86 * 'User priority' is the nice value converted to something we
87 * can work with better when scaling various scheduler parameters;
88 * it's a [ 0 ... 39 ] range.
89 */
90#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
91#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
92#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
93
94/*
95 * Some helpers for converting nanosecond timing to jiffy resolution
96 */
97#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
98#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
99
6aa645ea
IM
100#define NICE_0_LOAD SCHED_LOAD_SCALE
101#define NICE_0_SHIFT SCHED_LOAD_SHIFT
102
1da177e4
LT
103/*
104 * These are the 'tuning knobs' of the scheduler:
105 *
106 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
107 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
108 * Timeslices get refilled after they expire.
109 */
110#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
111#define DEF_TIMESLICE (100 * HZ / 1000)
2dd73a4f 112
5517d86b
ED
113#ifdef CONFIG_SMP
114/*
115 * Divide a load by a sched group's cpu_power: (load / sg->__cpu_power)
116 * Since cpu_power is a 'constant', we can use a reciprocal divide.
117 */
118static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
119{
120 return reciprocal_divide(load, sg->reciprocal_cpu_power);
121}
122
123/*
124 * Each time a sched group cpu_power is changed,
125 * we must compute its reciprocal value
126 */
127static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
128{
129 sg->__cpu_power += val;
130 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
131}
132#endif
133
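The point of the reciprocal is to turn the per-balance division into a multiply and a shift, which is much cheaper on most CPUs. A minimal userspace sketch of the trick, conceptually mirroring the reciprocal_div helpers included above (the two functions below are local stand-ins written for the example, not the kernel's):

#include <stdio.h>
#include <stdint.h>

/* Precompute ceil(2^32 / d) once, whenever the divisor changes. */
static uint32_t recip_value(uint32_t d)
{
	return (uint32_t)(((1ULL << 32) + d - 1) / d);
}

/* a / d becomes a 32x32->64 multiply and a shift. */
static uint32_t recip_divide(uint32_t a, uint32_t r)
{
	return (uint32_t)(((uint64_t)a * r) >> 32);
}

int main(void)
{
	uint32_t cpu_power = 1178;	/* arbitrary sample __cpu_power */
	uint32_t r = recip_value(cpu_power);
	uint32_t load = 4096;

	printf("%u / %u = %u (plain) vs %u (reciprocal)\n",
	       load, cpu_power, load / cpu_power, recip_divide(load, r));
	return 0;
}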
634fa8c9
IM
134#define SCALE_PRIO(x, prio) \
135 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
136
91fcdd4e 137/*
634fa8c9 138 * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
91fcdd4e 139 * to time slice values: [800ms ... 100ms ... 5ms]
91fcdd4e 140 */
634fa8c9 141static unsigned int static_prio_timeslice(int static_prio)
2dd73a4f 142{
634fa8c9
IM
143 if (static_prio == NICE_TO_PRIO(19))
144 return 1;
145
146 if (static_prio < NICE_TO_PRIO(0))
147 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
148 else
149 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
2dd73a4f
PW
150}
151
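Worked numbers make the mapping above concrete. A standalone sketch (userspace C; HZ = 1000, MAX_RT_PRIO = 100 and MAX_PRIO = 140 are assumed), which prints 800/100/1 ticks for nice -20/0/+19:

#include <stdio.h>

#define HZ		1000			/* assumed config value */
#define MAX_RT_PRIO	100			/* assumed: usual kernel value */
#define MAX_PRIO	(MAX_RT_PRIO + 40)
#define MAX_USER_PRIO	(MAX_PRIO - MAX_RT_PRIO)
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)

#define MIN_TIMESLICE	(5 * HZ / 1000 > 1 ? 5 * HZ / 1000 : 1)
#define DEF_TIMESLICE	(100 * HZ / 1000)

static unsigned int scale_prio(unsigned int x, int prio)
{
	unsigned int ts = x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2);

	return ts > MIN_TIMESLICE ? ts : MIN_TIMESLICE;
}

static unsigned int static_prio_timeslice(int static_prio)
{
	if (static_prio == NICE_TO_PRIO(19))
		return 1;
	if (static_prio < NICE_TO_PRIO(0))
		return scale_prio(DEF_TIMESLICE * 4, static_prio);
	return scale_prio(DEF_TIMESLICE, static_prio);
}

int main(void)
{
	/* at HZ=1000: nice -20 -> 800 ticks, nice 0 -> 100, nice +19 -> 1 */
	printf("nice -20: %u  nice 0: %u  nice +19: %u\n",
	       static_prio_timeslice(NICE_TO_PRIO(-20)),
	       static_prio_timeslice(NICE_TO_PRIO(0)),
	       static_prio_timeslice(NICE_TO_PRIO(19)));
	return 0;
}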
e05606d3
IM
152static inline int rt_policy(int policy)
153{
154 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
155 return 1;
156 return 0;
157}
158
159static inline int task_has_rt_policy(struct task_struct *p)
160{
161 return rt_policy(p->policy);
162}
163
1da177e4 164/*
6aa645ea 165 * This is the priority-queue data structure of the RT scheduling class:
1da177e4 166 */
6aa645ea
IM
167struct rt_prio_array {
168 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
169 struct list_head queue[MAX_RT_PRIO];
170};
171
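The O(1) RT pick falls straight out of this layout: find the first set bit in the bitmap, then take the head of queue[prio]; the extra delimiter bit guarantees the search always terminates. A simplified userspace sketch of that lookup (set_prio_bit() and first_queued_prio() are made up for the example; the real code in sched_rt.c uses the kernel's bitmap primitives):

#include <stdio.h>
#include <string.h>

#define MAX_RT_PRIO	100		/* assumed: usual kernel value */
#define BITS_PER_LONG	(8 * (int)sizeof(unsigned long))
#define BITMAP_LONGS	((MAX_RT_PRIO + 1 + BITS_PER_LONG - 1) / BITS_PER_LONG)

static unsigned long bitmap[BITMAP_LONGS];

static void set_prio_bit(int nr)
{
	bitmap[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

/* first queued priority; the delimiter makes this MAX_RT_PRIO when idle */
static int first_queued_prio(void)
{
	int i, b;

	for (i = 0; i < BITMAP_LONGS; i++)
		if (bitmap[i])
			for (b = 0; b < BITS_PER_LONG; b++)
				if (bitmap[i] & (1UL << b))
					return i * BITS_PER_LONG + b;
	return MAX_RT_PRIO;
}

int main(void)
{
	memset(bitmap, 0, sizeof(bitmap));
	set_prio_bit(MAX_RT_PRIO);	/* the always-set delimiter bit */
	set_prio_bit(77);		/* pretend RT tasks are queued ... */
	set_prio_bit(42);		/* ... at priorities 77 and 42 */

	/* the next task to run comes from the head of queue[42] */
	printf("next RT prio to run: %d\n", first_queued_prio());
	return 0;
}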
172struct load_stat {
173 struct load_weight load;
174 u64 load_update_start, load_update_last;
175 unsigned long delta_fair, delta_exec, delta_stat;
176};
177
178/* CFS-related fields in a runqueue */
179struct cfs_rq {
180 struct load_weight load;
181 unsigned long nr_running;
182
183 s64 fair_clock;
184 u64 exec_clock;
185 s64 wait_runtime;
186 u64 sleeper_bonus;
187 unsigned long wait_runtime_overruns, wait_runtime_underruns;
188
189 struct rb_root tasks_timeline;
190 struct rb_node *rb_leftmost;
191 struct rb_node *rb_load_balance_curr;
192#ifdef CONFIG_FAIR_GROUP_SCHED
193 /* 'curr' points to the currently running entity on this cfs_rq.
194 * It is set to NULL otherwise (i.e. when none are currently running).
195 */
196 struct sched_entity *curr;
197 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
198
199 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
200 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
201 * (like users, containers etc.)
202 *
203 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
204 * list is used during load balance.
205 */
206 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
207#endif
208};
1da177e4 209
6aa645ea
IM
210/* Real-Time classes' related field in a runqueue: */
211struct rt_rq {
212 struct rt_prio_array active;
213 int rt_load_balance_idx;
214 struct list_head *rt_load_balance_head, *rt_load_balance_curr;
215};
216
1da177e4
LT
217/*
218 * This is the main, per-CPU runqueue data structure.
219 *
220 * Locking rule: code that wants to lock multiple runqueues (such as
221 * the load-balancing or the thread-migration code) must acquire the
222 * locks in ascending &runqueue order.
223 */
70b97a7f 224struct rq {
6aa645ea 225 spinlock_t lock; /* runqueue lock */
1da177e4
LT
226
227 /*
228 * nr_running and cpu_load should be in the same cacheline because
229 * remote CPUs use both these fields when doing load calculation.
230 */
231 unsigned long nr_running;
6aa645ea
IM
232 #define CPU_LOAD_IDX_MAX 5
233 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
bdecea3a 234 unsigned char idle_at_tick;
46cb4b7c
SS
235#ifdef CONFIG_NO_HZ
236 unsigned char in_nohz_recently;
237#endif
6aa645ea
IM
238 struct load_stat ls; /* capture load from *all* tasks on this cpu */
239 unsigned long nr_load_updates;
240 u64 nr_switches;
241
242 struct cfs_rq cfs;
243#ifdef CONFIG_FAIR_GROUP_SCHED
244 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
1da177e4 245#endif
6aa645ea 246 struct rt_rq rt;
1da177e4
LT
247
248 /*
249 * This is part of a global counter where only the total sum
250 * over all CPUs matters. A task can increase this counter on
251 * one CPU and if it got migrated afterwards it may decrease
252 * it on another CPU. Always updated under the runqueue lock:
253 */
254 unsigned long nr_uninterruptible;
255
36c8b586 256 struct task_struct *curr, *idle;
c9819f45 257 unsigned long next_balance;
1da177e4 258 struct mm_struct *prev_mm;
6aa645ea 259
6aa645ea
IM
260 u64 clock, prev_clock_raw;
261 s64 clock_max_delta;
262
263 unsigned int clock_warps, clock_overflows;
264 unsigned int clock_unstable_events;
265
266 struct sched_class *load_balance_class;
267
1da177e4
LT
268 atomic_t nr_iowait;
269
270#ifdef CONFIG_SMP
271 struct sched_domain *sd;
272
273 /* For active balancing */
274 int active_balance;
275 int push_cpu;
0a2966b4 276 int cpu; /* cpu of this runqueue */
1da177e4 277
36c8b586 278 struct task_struct *migration_thread;
1da177e4
LT
279 struct list_head migration_queue;
280#endif
281
282#ifdef CONFIG_SCHEDSTATS
283 /* latency stats */
284 struct sched_info rq_sched_info;
285
286 /* sys_sched_yield() stats */
287 unsigned long yld_exp_empty;
288 unsigned long yld_act_empty;
289 unsigned long yld_both_empty;
290 unsigned long yld_cnt;
291
292 /* schedule() stats */
293 unsigned long sched_switch;
294 unsigned long sched_cnt;
295 unsigned long sched_goidle;
296
297 /* try_to_wake_up() stats */
298 unsigned long ttwu_cnt;
299 unsigned long ttwu_local;
300#endif
fcb99371 301 struct lock_class_key rq_lock_key;
1da177e4
LT
302};
303
c3396620 304static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
5be9361c 305static DEFINE_MUTEX(sched_hotcpu_mutex);
1da177e4 306
dd41f596
IM
307static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
308{
309 rq->curr->sched_class->check_preempt_curr(rq, p);
310}
311
0a2966b4
CL
312static inline int cpu_of(struct rq *rq)
313{
314#ifdef CONFIG_SMP
315 return rq->cpu;
316#else
317 return 0;
318#endif
319}
320
20d315d4
IM
321/*
322 * Per-runqueue clock, as fine-grained as the platform can give us:
323 */
324static unsigned long long __rq_clock(struct rq *rq)
325{
326 u64 prev_raw = rq->prev_clock_raw;
327 u64 now = sched_clock();
328 s64 delta = now - prev_raw;
329 u64 clock = rq->clock;
330
331 /*
332 * Protect against sched_clock() occasionally going backwards:
333 */
334 if (unlikely(delta < 0)) {
335 clock++;
336 rq->clock_warps++;
337 } else {
338 /*
339 * Catch too large forward jumps too:
340 */
341 if (unlikely(delta > 2*TICK_NSEC)) {
342 clock++;
343 rq->clock_overflows++;
344 } else {
345 if (unlikely(delta > rq->clock_max_delta))
346 rq->clock_max_delta = delta;
347 clock += delta;
348 }
349 }
350
351 rq->prev_clock_raw = now;
352 rq->clock = clock;
353
354 return clock;
355}
356
357static inline unsigned long long rq_clock(struct rq *rq)
358{
359 int this_cpu = smp_processor_id();
360
361 if (this_cpu == cpu_of(rq))
362 return __rq_clock(rq);
363
364 return rq->clock;
365}
366
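The clamping above is the whole point: the per-runqueue clock never moves backwards and never jumps forward by more than about two ticks in one step, no matter what sched_clock() reports. A small userspace simulation of that filter (rq_clock_update() and the raw samples are made up for the example):

#include <stdio.h>
#include <stdint.h>

#define TICK_NSEC 1000000ULL	/* assumed 1ms tick for the demo */

static uint64_t clk, prev_raw, warps, overflows;

static uint64_t rq_clock_update(uint64_t now)
{
	int64_t delta = (int64_t)(now - prev_raw);

	if (delta < 0) {				/* raw clock went backwards */
		clk++;
		warps++;
	} else if (delta > 2 * (int64_t)TICK_NSEC) {	/* absurd jump forward */
		clk++;
		overflows++;
	} else {
		clk += delta;				/* normal, trusted advance */
	}
	prev_raw = now;
	return clk;
}

int main(void)
{
	/* made-up raw samples: steady, then backwards, then a huge jump */
	uint64_t samples[] = { 1000000, 2000000, 1500000, 900000000 };
	unsigned int i;

	prev_raw = samples[0];
	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("raw=%9llu -> clock=%llu\n",
		       (unsigned long long)samples[i],
		       (unsigned long long)rq_clock_update(samples[i]));
	printf("warps=%llu overflows=%llu\n",
	       (unsigned long long)warps, (unsigned long long)overflows);
	return 0;
}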
674311d5
NP
367/*
368 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
1a20ff27 369 * See detach_destroy_domains: synchronize_sched for details.
674311d5
NP
370 *
371 * The domain tree of any CPU may only be accessed from within
372 * preempt-disabled sections.
373 */
48f24c4d
IM
374#define for_each_domain(cpu, __sd) \
375 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
1da177e4
LT
376
377#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
378#define this_rq() (&__get_cpu_var(runqueues))
379#define task_rq(p) cpu_rq(task_cpu(p))
380#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
381
138a8aeb
IM
382#ifdef CONFIG_FAIR_GROUP_SCHED
383/* Change a task's ->cfs_rq if it moves across CPUs */
384static inline void set_task_cfs_rq(struct task_struct *p)
385{
386 p->se.cfs_rq = &task_rq(p)->cfs;
387}
388#else
389static inline void set_task_cfs_rq(struct task_struct *p)
390{
391}
392#endif
393
1da177e4 394#ifndef prepare_arch_switch
4866cde0
NP
395# define prepare_arch_switch(next) do { } while (0)
396#endif
397#ifndef finish_arch_switch
398# define finish_arch_switch(prev) do { } while (0)
399#endif
400
401#ifndef __ARCH_WANT_UNLOCKED_CTXSW
70b97a7f 402static inline int task_running(struct rq *rq, struct task_struct *p)
4866cde0
NP
403{
404 return rq->curr == p;
405}
406
70b97a7f 407static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
4866cde0
NP
408{
409}
410
70b97a7f 411static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
4866cde0 412{
da04c035
IM
413#ifdef CONFIG_DEBUG_SPINLOCK
414 /* this is a valid case when another task releases the spinlock */
415 rq->lock.owner = current;
416#endif
8a25d5de
IM
417 /*
418 * If we are tracking spinlock dependencies then we have to
419 * fix up the runqueue lock - which gets 'carried over' from
420 * prev into current:
421 */
422 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
423
4866cde0
NP
424 spin_unlock_irq(&rq->lock);
425}
426
427#else /* __ARCH_WANT_UNLOCKED_CTXSW */
70b97a7f 428static inline int task_running(struct rq *rq, struct task_struct *p)
4866cde0
NP
429{
430#ifdef CONFIG_SMP
431 return p->oncpu;
432#else
433 return rq->curr == p;
434#endif
435}
436
70b97a7f 437static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
4866cde0
NP
438{
439#ifdef CONFIG_SMP
440 /*
441 * We can optimise this out completely for !SMP, because the
442 * SMP rebalancing from interrupt is the only thing that cares
443 * here.
444 */
445 next->oncpu = 1;
446#endif
447#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
448 spin_unlock_irq(&rq->lock);
449#else
450 spin_unlock(&rq->lock);
451#endif
452}
453
70b97a7f 454static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
4866cde0
NP
455{
456#ifdef CONFIG_SMP
457 /*
458 * After ->oncpu is cleared, the task can be moved to a different CPU.
459 * We must ensure this doesn't happen until the switch is completely
460 * finished.
461 */
462 smp_wmb();
463 prev->oncpu = 0;
464#endif
465#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
466 local_irq_enable();
1da177e4 467#endif
4866cde0
NP
468}
469#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
1da177e4 470
b29739f9
IM
471/*
472 * __task_rq_lock - lock the runqueue a given task resides on.
473 * Must be called with interrupts disabled.
474 */
70b97a7f 475static inline struct rq *__task_rq_lock(struct task_struct *p)
b29739f9
IM
476 __acquires(rq->lock)
477{
70b97a7f 478 struct rq *rq;
b29739f9
IM
479
480repeat_lock_task:
481 rq = task_rq(p);
482 spin_lock(&rq->lock);
483 if (unlikely(rq != task_rq(p))) {
484 spin_unlock(&rq->lock);
485 goto repeat_lock_task;
486 }
487 return rq;
488}
489
1da177e4
LT
490/*
491 * task_rq_lock - lock the runqueue a given task resides on and disable
492 * interrupts. Note the ordering: we can safely lookup the task_rq without
493 * explicitly disabling preemption.
494 */
70b97a7f 495static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
1da177e4
LT
496 __acquires(rq->lock)
497{
70b97a7f 498 struct rq *rq;
1da177e4
LT
499
500repeat_lock_task:
501 local_irq_save(*flags);
502 rq = task_rq(p);
503 spin_lock(&rq->lock);
504 if (unlikely(rq != task_rq(p))) {
505 spin_unlock_irqrestore(&rq->lock, *flags);
506 goto repeat_lock_task;
507 }
508 return rq;
509}
510
70b97a7f 511static inline void __task_rq_unlock(struct rq *rq)
b29739f9
IM
512 __releases(rq->lock)
513{
514 spin_unlock(&rq->lock);
515}
516
70b97a7f 517static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
1da177e4
LT
518 __releases(rq->lock)
519{
520 spin_unlock_irqrestore(&rq->lock, *flags);
521}
522
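The lock/recheck/retry loop is needed because the task may migrate to another runqueue between reading task_rq(p) and actually acquiring that runqueue's lock. The same pattern applies to any object that can move between lock domains; a hedged userspace analogue with pthreads (all names here are invented for the example):

#include <pthread.h>
#include <stdio.h>

struct queue {
	pthread_mutex_t lock;
};

struct task {
	struct queue *q;	/* which queue the task currently lives on */
};

/* Lock the queue a task resides on; retry if it migrated underneath us. */
static struct queue *task_queue_lock(struct task *t)
{
	struct queue *q;

	for (;;) {
		q = t->q;			/* racy read ... */
		pthread_mutex_lock(&q->lock);
		if (q == t->q)			/* ... confirmed under the lock */
			return q;
		pthread_mutex_unlock(&q->lock);	/* it moved: drop and retry */
	}
}

int main(void)
{
	struct queue q0 = { PTHREAD_MUTEX_INITIALIZER };
	struct task t = { &q0 };
	struct queue *locked = task_queue_lock(&t);

	printf("locked the right queue: %s\n", locked == t.q ? "yes" : "no");
	pthread_mutex_unlock(&locked->lock);
	return 0;
}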
1da177e4 523/*
cc2a73b5 524 * this_rq_lock - lock this runqueue and disable interrupts.
1da177e4 525 */
70b97a7f 526static inline struct rq *this_rq_lock(void)
1da177e4
LT
527 __acquires(rq->lock)
528{
70b97a7f 529 struct rq *rq;
1da177e4
LT
530
531 local_irq_disable();
532 rq = this_rq();
533 spin_lock(&rq->lock);
534
535 return rq;
536}
537
1b9f19c2
IM
538/*
539 * CPU frequency is/was unstable - start anew by resetting prev_clock_raw:
540 */
541void sched_clock_unstable_event(void)
542{
543 unsigned long flags;
544 struct rq *rq;
545
546 rq = task_rq_lock(current, &flags);
547 rq->prev_clock_raw = sched_clock();
548 rq->clock_unstable_events++;
549 task_rq_unlock(rq, &flags);
550}
551
c24d20db
IM
552/*
553 * resched_task - mark a task 'to be rescheduled now'.
554 *
555 * On UP this means the setting of the need_resched flag, on SMP it
556 * might also involve a cross-CPU call to trigger the scheduler on
557 * the target CPU.
558 */
559#ifdef CONFIG_SMP
560
561#ifndef tsk_is_polling
562#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
563#endif
564
565static void resched_task(struct task_struct *p)
566{
567 int cpu;
568
569 assert_spin_locked(&task_rq(p)->lock);
570
571 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
572 return;
573
574 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
575
576 cpu = task_cpu(p);
577 if (cpu == smp_processor_id())
578 return;
579
580 /* NEED_RESCHED must be visible before we test polling */
581 smp_mb();
582 if (!tsk_is_polling(p))
583 smp_send_reschedule(cpu);
584}
585
586static void resched_cpu(int cpu)
587{
588 struct rq *rq = cpu_rq(cpu);
589 unsigned long flags;
590
591 if (!spin_trylock_irqsave(&rq->lock, flags))
592 return;
593 resched_task(cpu_curr(cpu));
594 spin_unlock_irqrestore(&rq->lock, flags);
595}
596#else
597static inline void resched_task(struct task_struct *p)
598{
599 assert_spin_locked(&task_rq(p)->lock);
600 set_tsk_need_resched(p);
601}
602#endif
603
45bf76df
IM
604static u64 div64_likely32(u64 dividend, unsigned long divisor)
605{
606#if BITS_PER_LONG == 32
607 if (likely(dividend <= 0xffffffffULL))
608 return (u32)dividend / divisor;
609 do_div(dividend, divisor);
610
611 return dividend;
612#else
613 return dividend / divisor;
614#endif
615}
616
617#if BITS_PER_LONG == 32
618# define WMULT_CONST (~0UL)
619#else
620# define WMULT_CONST (1UL << 32)
621#endif
622
623#define WMULT_SHIFT 32
624
625static inline unsigned long
626calc_delta_mine(unsigned long delta_exec, unsigned long weight,
627 struct load_weight *lw)
628{
629 u64 tmp;
630
631 if (unlikely(!lw->inv_weight))
632 lw->inv_weight = WMULT_CONST / lw->weight;
633
634 tmp = (u64)delta_exec * weight;
635 /*
636 * Check whether we'd overflow the 64-bit multiplication:
637 */
638 if (unlikely(tmp > WMULT_CONST)) {
639 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
640 >> (WMULT_SHIFT/2);
641 } else {
642 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
643 }
644
645 return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
646}
647
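Put differently, calc_delta_mine() computes delta_exec * weight / lw->weight in 32.32 fixed point, caching the inverse of lw->weight so the expensive division only happens when the load changes (update_load_add/sub below clear inv_weight). A standalone sketch of the fast path (sample values are arbitrary; the overflow check and the sysctl_sched_runtime_limit clamp are left out):

#include <stdio.h>
#include <stdint.h>

#define WMULT_CONST	(1ULL << 32)
#define WMULT_SHIFT	32
#define NICE_0_LOAD	1024		/* assumed SCHED_LOAD_SCALE */

struct load_weight {
	unsigned long weight;
	uint32_t inv_weight;
};

static unsigned long
calc_delta(unsigned long delta_exec, unsigned long weight, struct load_weight *lw)
{
	uint64_t tmp;

	if (!lw->inv_weight)			/* cache 2^32 / weight */
		lw->inv_weight = (uint32_t)(WMULT_CONST / lw->weight);

	/* delta * weight / lw->weight, as a multiply and a shift */
	tmp = (uint64_t)delta_exec * weight;
	return (unsigned long)((tmp * lw->inv_weight) >> WMULT_SHIFT);
}

int main(void)
{
	struct load_weight lw = { .weight = 3072 };	/* e.g. three nice-0 tasks */
	unsigned long delta = 6000000;			/* 6ms of wall-clock, in ns */

	/* one nice-0 task on a 3072-weight queue earns ~1/3 of the delta */
	printf("fair share: %lu ns (expected ~%lu)\n",
	       calc_delta(delta, NICE_0_LOAD, &lw), delta / 3);
	return 0;
}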
648static inline unsigned long
649calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
650{
651 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
652}
653
654static void update_load_add(struct load_weight *lw, unsigned long inc)
655{
656 lw->weight += inc;
657 lw->inv_weight = 0;
658}
659
660static void update_load_sub(struct load_weight *lw, unsigned long dec)
661{
662 lw->weight -= dec;
663 lw->inv_weight = 0;
664}
665
666static void __update_curr_load(struct rq *rq, struct load_stat *ls)
667{
668 if (rq->curr != rq->idle && ls->load.weight) {
669 ls->delta_exec += ls->delta_stat;
670 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
671 ls->delta_stat = 0;
672 }
673}
674
675/*
676 * Update delta_exec, delta_fair fields for rq.
677 *
678 * delta_fair clock advances at a rate inversely proportional to
679 * total load (rq->ls.load.weight) on the runqueue, while
680 * delta_exec advances at the same rate as wall-clock (provided
681 * cpu is not idle).
682 *
683 * delta_exec / delta_fair is a measure of the (smoothed) load on this
684 * runqueue over any given interval. This (smoothed) load is used
685 * during load balancing.
686 *
687 * This function is called /before/ updating rq->ls.load
688 * and when switching tasks.
689 */
690static void update_curr_load(struct rq *rq, u64 now)
691{
692 struct load_stat *ls = &rq->ls;
693 u64 start;
694
695 start = ls->load_update_start;
696 ls->load_update_start = now;
697 ls->delta_stat += now - start;
698 /*
699 * Stagger updates to ls->delta_fair. Very frequent updates
700 * can be expensive.
701 */
702 if (ls->delta_stat >= sysctl_sched_stat_granularity)
703 __update_curr_load(rq, ls);
704}
705
2dd73a4f
PW
706/*
707 * To keep "niceness" from being subverted by an uneven distribution of
708 * tasks with abnormal "nice" values across CPUs, the contribution that
709 * each task makes to its run queue's load is weighted according to its
710 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
711 * scaled version of the new time slice allocation that they receive on
712 * time slice expiry etc.
713 */
714
715/*
716 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
717 * If static_prio_timeslice() is ever changed to break this assumption then
718 * this code will need modification.
719 */
720#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
dd41f596 721#define load_weight(lp) \
2dd73a4f
PW
722 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
723#define PRIO_TO_LOAD_WEIGHT(prio) \
dd41f596 724 load_weight(static_prio_timeslice(prio))
2dd73a4f 725#define RTPRIO_TO_LOAD_WEIGHT(rp) \
dd41f596
IM
726 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
727
728#define WEIGHT_IDLEPRIO 2
729#define WMULT_IDLEPRIO (1 << 31)
730
731/*
732 * Nice levels are multiplicative, with a gentle 10% change for every
733 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
734 * nice 1, it will get ~10% less CPU time than another CPU-bound task
735 * that remained on nice 0.
736 *
737 * The "10% effect" is relative and cumulative: from _any_ nice level,
738 * if you go up 1 level, it's -10% CPU usage; if you go down 1 level
739 * it's +10% CPU usage. (To achieve that we use a multiplier of 1.25:
740 * if a task goes up by ~10% and another task goes down by ~10% then
741 * the relative distance between them is ~25%.)
dd41f596
IM
742 */
743static const int prio_to_weight[40] = {
744/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
745/* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280,
746/* 0 */ NICE_0_LOAD /* 1024 */,
747/* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137,
748/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
749};
750
751static const u32 prio_to_wmult[40] = {
752 48356, 60446, 75558, 94446, 118058, 147573,
753 184467, 230589, 288233, 360285, 450347,
754 562979, 703746, 879575, 1099582, 1374389,
4fd88517 755 1717986, 2147483, 2684354, 3355443, 4194304,
e127031f 756 5244160, 6557201, 8196502, 10250518, 12782640,
dd41f596
IM
757 16025997, 19976592, 24970740, 31350126, 39045157,
758 49367440, 61356675, 76695844, 95443717, 119304647,
759 148102320, 186737708, 238609294, 286331153,
760};
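Two invariants tie these tables together: stepping one nice level changes the weight by roughly a factor of 1.25 (which is where the ~10% CPU effect described above comes from), and prio_to_wmult[i] is approximately 2^32 / prio_to_weight[i], i.e. the precomputed inverse that set_load_weight() later feeds into se.load.inv_weight so calc_delta_mine() can multiply instead of divide. A quick standalone check using a few entries copied from the tables:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* sample (weight, wmult) pairs copied from the arrays above */
	static const struct { int nice; unsigned long w; uint32_t wm; } s[] = {
		{ -20, 88818,      48356 },
		{  -1,  1280,    3355443 },
		{   0,  1024,    4194304 },
		{   1,   819,    5244160 },
		{  19,    15,  286331153 },
	};
	unsigned int i;

	for (i = 0; i < sizeof(s) / sizeof(s[0]); i++) {
		/* each product should be close to 2^32 = 4294967296 */
		printf("nice %3d: weight*wmult = %llu\n",
		       s[i].nice, (unsigned long long)s[i].w * s[i].wm);
	}

	/* adjacent nice levels differ by ~25% in weight: 1280/1024, 1024/819 */
	printf("ratios: %.3f %.3f\n", 1280.0 / 1024.0, 1024.0 / 819.0);
	return 0;
}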
2dd73a4f 761
36c8b586 762static inline void
dd41f596 763inc_load(struct rq *rq, const struct task_struct *p, u64 now)
2dd73a4f 764{
dd41f596
IM
765 update_curr_load(rq, now);
766 update_load_add(&rq->ls.load, p->se.load.weight);
2dd73a4f
PW
767}
768
36c8b586 769static inline void
dd41f596 770dec_load(struct rq *rq, const struct task_struct *p, u64 now)
2dd73a4f 771{
dd41f596
IM
772 update_curr_load(rq, now);
773 update_load_sub(&rq->ls.load, p->se.load.weight);
2dd73a4f
PW
774}
775
dd41f596 776static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
2dd73a4f
PW
777{
778 rq->nr_running++;
dd41f596 779 inc_load(rq, p, now);
2dd73a4f
PW
780}
781
dd41f596 782static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
2dd73a4f
PW
783{
784 rq->nr_running--;
dd41f596 785 dec_load(rq, p, now);
2dd73a4f
PW
786}
787
dd41f596
IM
788static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
789
790/*
791 * runqueue iterator, to support SMP load-balancing between different
792 * scheduling classes, without having to expose their internal data
793 * structures to the load-balancing proper:
794 */
795struct rq_iterator {
796 void *arg;
797 struct task_struct *(*start)(void *);
798 struct task_struct *(*next)(void *);
799};
800
801static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
802 unsigned long max_nr_move, unsigned long max_load_move,
803 struct sched_domain *sd, enum cpu_idle_type idle,
804 int *all_pinned, unsigned long *load_moved,
805 int this_best_prio, int best_prio, int best_prio_seen,
806 struct rq_iterator *iterator);
807
808#include "sched_stats.h"
809#include "sched_rt.c"
810#include "sched_fair.c"
811#include "sched_idletask.c"
812#ifdef CONFIG_SCHED_DEBUG
813# include "sched_debug.c"
814#endif
815
816#define sched_class_highest (&rt_sched_class)
817
45bf76df
IM
818static void set_load_weight(struct task_struct *p)
819{
dd41f596
IM
820 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
821 p->se.wait_runtime = 0;
822
45bf76df 823 if (task_has_rt_policy(p)) {
dd41f596
IM
824 p->se.load.weight = prio_to_weight[0] * 2;
825 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
826 return;
827 }
45bf76df 828
dd41f596
IM
829 /*
830 * SCHED_IDLE tasks get minimal weight:
831 */
832 if (p->policy == SCHED_IDLE) {
833 p->se.load.weight = WEIGHT_IDLEPRIO;
834 p->se.load.inv_weight = WMULT_IDLEPRIO;
835 return;
836 }
71f8bd46 837
dd41f596
IM
838 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
839 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
71f8bd46
IM
840}
841
dd41f596
IM
842static void
843enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
71f8bd46 844{
dd41f596
IM
845 sched_info_queued(p);
846 p->sched_class->enqueue_task(rq, p, wakeup, now);
847 p->se.on_rq = 1;
71f8bd46
IM
848}
849
dd41f596
IM
850static void
851dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
71f8bd46 852{
dd41f596
IM
853 p->sched_class->dequeue_task(rq, p, sleep, now);
854 p->se.on_rq = 0;
71f8bd46
IM
855}
856
14531189 857/*
dd41f596 858 * __normal_prio - return the priority that is based on the static prio
14531189 859 */
14531189
IM
860static inline int __normal_prio(struct task_struct *p)
861{
dd41f596 862 return p->static_prio;
14531189
IM
863}
864
b29739f9
IM
865/*
866 * Calculate the expected normal priority: i.e. priority
867 * without taking RT-inheritance into account. Might be
868 * boosted by interactivity modifiers. Changes upon fork,
869 * setprio syscalls, and whenever the interactivity
870 * estimator recalculates.
871 */
36c8b586 872static inline int normal_prio(struct task_struct *p)
b29739f9
IM
873{
874 int prio;
875
e05606d3 876 if (task_has_rt_policy(p))
b29739f9
IM
877 prio = MAX_RT_PRIO-1 - p->rt_priority;
878 else
879 prio = __normal_prio(p);
880 return prio;
881}
882
883/*
884 * Calculate the current priority, i.e. the priority
885 * taken into account by the scheduler. This value might
886 * be boosted by RT tasks, or might be boosted by
887 * interactivity modifiers. Will be RT if the task got
888 * RT-boosted. If not then it returns p->normal_prio.
889 */
36c8b586 890static int effective_prio(struct task_struct *p)
b29739f9
IM
891{
892 p->normal_prio = normal_prio(p);
893 /*
894 * If we are RT tasks or we were boosted to RT priority,
895 * keep the priority unchanged. Otherwise, update priority
896 * to the normal priority:
897 */
898 if (!rt_prio(p->prio))
899 return p->normal_prio;
900 return p->prio;
901}
902
1da177e4 903/*
dd41f596 904 * activate_task - move a task to the runqueue.
1da177e4 905 */
dd41f596 906static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1da177e4 907{
dd41f596 908 u64 now = rq_clock(rq);
d425b274 909
dd41f596
IM
910 if (p->state == TASK_UNINTERRUPTIBLE)
911 rq->nr_uninterruptible--;
1da177e4 912
dd41f596
IM
913 enqueue_task(rq, p, wakeup, now);
914 inc_nr_running(p, rq, now);
1da177e4
LT
915}
916
917/*
dd41f596 918 * activate_idle_task - move idle task to the _front_ of runqueue.
1da177e4 919 */
dd41f596 920static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
1da177e4 921{
dd41f596 922 u64 now = rq_clock(rq);
1da177e4 923
dd41f596
IM
924 if (p->state == TASK_UNINTERRUPTIBLE)
925 rq->nr_uninterruptible--;
ece8a684 926
dd41f596
IM
927 enqueue_task(rq, p, 0, now);
928 inc_nr_running(p, rq, now);
1da177e4
LT
929}
930
931/*
932 * deactivate_task - remove a task from the runqueue.
933 */
dd41f596 934static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1da177e4 935{
dd41f596
IM
936 u64 now = rq_clock(rq);
937
938 if (p->state == TASK_UNINTERRUPTIBLE)
939 rq->nr_uninterruptible++;
940
941 dequeue_task(rq, p, sleep, now);
942 dec_nr_running(p, rq, now);
1da177e4
LT
943}
944
1da177e4
LT
945/**
946 * task_curr - is this task currently executing on a CPU?
947 * @p: the task in question.
948 */
36c8b586 949inline int task_curr(const struct task_struct *p)
1da177e4
LT
950{
951 return cpu_curr(task_cpu(p)) == p;
952}
953
2dd73a4f
PW
954/* Used instead of source_load when we know the type == 0 */
955unsigned long weighted_cpuload(const int cpu)
956{
dd41f596
IM
957 return cpu_rq(cpu)->ls.load.weight;
958}
959
960static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
961{
962#ifdef CONFIG_SMP
963 task_thread_info(p)->cpu = cpu;
964 set_task_cfs_rq(p);
965#endif
2dd73a4f
PW
966}
967
1da177e4 968#ifdef CONFIG_SMP
c65cc870 969
dd41f596 970void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
c65cc870 971{
dd41f596
IM
972 int old_cpu = task_cpu(p);
973 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
974 u64 clock_offset, fair_clock_offset;
975
976 clock_offset = old_rq->clock - new_rq->clock;
977 fair_clock_offset = old_rq->cfs.fair_clock -
978 new_rq->cfs.fair_clock;
979 if (p->se.wait_start)
980 p->se.wait_start -= clock_offset;
981 if (p->se.wait_start_fair)
982 p->se.wait_start_fair -= fair_clock_offset;
983 if (p->se.sleep_start)
984 p->se.sleep_start -= clock_offset;
985 if (p->se.block_start)
986 p->se.block_start -= clock_offset;
987 if (p->se.sleep_start_fair)
988 p->se.sleep_start_fair -= fair_clock_offset;
989
990 __set_task_cpu(p, new_cpu);
c65cc870
IM
991}
992
70b97a7f 993struct migration_req {
1da177e4 994 struct list_head list;
1da177e4 995
36c8b586 996 struct task_struct *task;
1da177e4
LT
997 int dest_cpu;
998
1da177e4 999 struct completion done;
70b97a7f 1000};
1da177e4
LT
1001
1002/*
1003 * The task's runqueue lock must be held.
1004 * Returns true if you have to wait for migration thread.
1005 */
36c8b586 1006static int
70b97a7f 1007migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1da177e4 1008{
70b97a7f 1009 struct rq *rq = task_rq(p);
1da177e4
LT
1010
1011 /*
1012 * If the task is not on a runqueue (and not running), then
1013 * it is sufficient to simply update the task's cpu field.
1014 */
dd41f596 1015 if (!p->se.on_rq && !task_running(rq, p)) {
1da177e4
LT
1016 set_task_cpu(p, dest_cpu);
1017 return 0;
1018 }
1019
1020 init_completion(&req->done);
1da177e4
LT
1021 req->task = p;
1022 req->dest_cpu = dest_cpu;
1023 list_add(&req->list, &rq->migration_queue);
48f24c4d 1024
1da177e4
LT
1025 return 1;
1026}
1027
1028/*
1029 * wait_task_inactive - wait for a thread to unschedule.
1030 *
1031 * The caller must ensure that the task *will* unschedule sometime soon,
1032 * else this function might spin for a *long* time. This function can't
1033 * be called with interrupts off, or it may introduce deadlock with
1034 * smp_call_function() if an IPI is sent by the same process we are
1035 * waiting to become inactive.
1036 */
36c8b586 1037void wait_task_inactive(struct task_struct *p)
1da177e4
LT
1038{
1039 unsigned long flags;
dd41f596 1040 int running, on_rq;
70b97a7f 1041 struct rq *rq;
1da177e4
LT
1042
1043repeat:
fa490cfd
LT
1044 /*
1045 * We do the initial early heuristics without holding
1046 * any task-queue locks at all. We'll only try to get
1047 * the runqueue lock when things look like they will
1048 * work out!
1049 */
1050 rq = task_rq(p);
1051
1052 /*
1053 * If the task is actively running on another CPU
1054 * still, just relax and busy-wait without holding
1055 * any locks.
1056 *
1057 * NOTE! Since we don't hold any locks, it's not
1058 * even sure that "rq" stays as the right runqueue!
1059 * But we don't care, since "task_running()" will
1060 * return false if the runqueue has changed and p
1061 * is actually now running somewhere else!
1062 */
1063 while (task_running(rq, p))
1064 cpu_relax();
1065
1066 /*
1067 * Ok, time to look more closely! We need the rq
1068 * lock now, to be *sure*. If we're wrong, we'll
1069 * just go back and repeat.
1070 */
1da177e4 1071 rq = task_rq_lock(p, &flags);
fa490cfd 1072 running = task_running(rq, p);
dd41f596 1073 on_rq = p->se.on_rq;
fa490cfd
LT
1074 task_rq_unlock(rq, &flags);
1075
1076 /*
1077 * Was it really running after all now that we
1078 * checked with the proper locks actually held?
1079 *
1080 * Oops. Go back and try again..
1081 */
1082 if (unlikely(running)) {
1da177e4 1083 cpu_relax();
1da177e4
LT
1084 goto repeat;
1085 }
fa490cfd
LT
1086
1087 /*
1088 * It's not enough that it's not actively running,
1089 * it must be off the runqueue _entirely_, and not
1090 * preempted!
1091 *
1092 * So if it was still runnable (but just not actively
1093 * running right now), it's preempted, and we should
1094 * yield - it could be a while.
1095 */
dd41f596 1096 if (unlikely(on_rq)) {
fa490cfd
LT
1097 yield();
1098 goto repeat;
1099 }
1100
1101 /*
1102 * Ahh, all good. It wasn't running, and it wasn't
1103 * runnable, which means that it will never become
1104 * running in the future either. We're all done!
1105 */
1da177e4
LT
1106}
1107
1108/***
1109 * kick_process - kick a running thread to enter/exit the kernel
1110 * @p: the to-be-kicked thread
1111 *
1112 * Cause a process which is running on another CPU to enter
1113 * kernel-mode, without any delay. (to get signals handled.)
1114 *
1115 * NOTE: this function doesn't have to take the runqueue lock,
1116 * because all it wants to ensure is that the remote task enters
1117 * the kernel. If the IPI races and the task has been migrated
1118 * to another CPU then no harm is done and the purpose has been
1119 * achieved as well.
1120 */
36c8b586 1121void kick_process(struct task_struct *p)
1da177e4
LT
1122{
1123 int cpu;
1124
1125 preempt_disable();
1126 cpu = task_cpu(p);
1127 if ((cpu != smp_processor_id()) && task_curr(p))
1128 smp_send_reschedule(cpu);
1129 preempt_enable();
1130}
1131
1132/*
2dd73a4f
PW
1133 * Return a low guess at the load of a migration-source cpu weighted
1134 * according to the scheduling class and "nice" value.
1da177e4
LT
1135 *
1136 * We want to under-estimate the load of migration sources, to
1137 * balance conservatively.
1138 */
a2000572 1139static inline unsigned long source_load(int cpu, int type)
1da177e4 1140{
70b97a7f 1141 struct rq *rq = cpu_rq(cpu);
dd41f596 1142 unsigned long total = weighted_cpuload(cpu);
2dd73a4f 1143
3b0bd9bc 1144 if (type == 0)
dd41f596 1145 return total;
b910472d 1146
dd41f596 1147 return min(rq->cpu_load[type-1], total);
1da177e4
LT
1148}
1149
1150/*
2dd73a4f
PW
1151 * Return a high guess at the load of a migration-target cpu weighted
1152 * according to the scheduling class and "nice" value.
1da177e4 1153 */
a2000572 1154static inline unsigned long target_load(int cpu, int type)
1da177e4 1155{
70b97a7f 1156 struct rq *rq = cpu_rq(cpu);
dd41f596 1157 unsigned long total = weighted_cpuload(cpu);
2dd73a4f 1158
7897986b 1159 if (type == 0)
dd41f596 1160 return total;
3b0bd9bc 1161
dd41f596 1162 return max(rq->cpu_load[type-1], total);
2dd73a4f
PW
1163}
1164
1165/*
1166 * Return the average load per task on the cpu's run queue
1167 */
1168static inline unsigned long cpu_avg_load_per_task(int cpu)
1169{
70b97a7f 1170 struct rq *rq = cpu_rq(cpu);
dd41f596 1171 unsigned long total = weighted_cpuload(cpu);
2dd73a4f
PW
1172 unsigned long n = rq->nr_running;
1173
dd41f596 1174 return n ? total / n : SCHED_LOAD_SCALE;
1da177e4
LT
1175}
1176
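So the low and high guesses are simply a min or max of the current weighted load against the decayed cpu_load[] history: a migration source is assumed to be no busier than its history suggests, a migration target no idler. A small standalone illustration with made-up numbers (the functions are local sketches of the three helpers above):

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL		/* assumed, as elsewhere */

/* made-up per-cpu state: current weighted load plus one history slot */
static unsigned long weighted_load[2] = { 3072, 512 };
static unsigned long cpu_load_hist[2] = { 2048, 1536 };
static unsigned long nr_running[2]    = { 3, 1 };

static unsigned long source_load(int cpu)	/* low guess */
{
	return weighted_load[cpu] < cpu_load_hist[cpu] ?
	       weighted_load[cpu] : cpu_load_hist[cpu];
}

static unsigned long target_load(int cpu)	/* high guess */
{
	return weighted_load[cpu] > cpu_load_hist[cpu] ?
	       weighted_load[cpu] : cpu_load_hist[cpu];
}

static unsigned long avg_load_per_task(int cpu)
{
	return nr_running[cpu] ? weighted_load[cpu] / nr_running[cpu]
			       : SCHED_LOAD_SCALE;
}

int main(void)
{
	printf("cpu0: src=%lu tgt=%lu avg=%lu\n",
	       source_load(0), target_load(0), avg_load_per_task(0));
	printf("cpu1: src=%lu tgt=%lu avg=%lu\n",
	       source_load(1), target_load(1), avg_load_per_task(1));
	return 0;
}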
147cbb4b
NP
1177/*
1178 * find_idlest_group finds and returns the least busy CPU group within the
1179 * domain.
1180 */
1181static struct sched_group *
1182find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1183{
1184 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1185 unsigned long min_load = ULONG_MAX, this_load = 0;
1186 int load_idx = sd->forkexec_idx;
1187 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1188
1189 do {
1190 unsigned long load, avg_load;
1191 int local_group;
1192 int i;
1193
da5a5522
BD
1194 /* Skip over this group if it has no CPUs allowed */
1195 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1196 goto nextgroup;
1197
147cbb4b 1198 local_group = cpu_isset(this_cpu, group->cpumask);
147cbb4b
NP
1199
1200 /* Tally up the load of all CPUs in the group */
1201 avg_load = 0;
1202
1203 for_each_cpu_mask(i, group->cpumask) {
1204 /* Bias balancing toward cpus of our domain */
1205 if (local_group)
1206 load = source_load(i, load_idx);
1207 else
1208 load = target_load(i, load_idx);
1209
1210 avg_load += load;
1211 }
1212
1213 /* Adjust by relative CPU power of the group */
5517d86b
ED
1214 avg_load = sg_div_cpu_power(group,
1215 avg_load * SCHED_LOAD_SCALE);
147cbb4b
NP
1216
1217 if (local_group) {
1218 this_load = avg_load;
1219 this = group;
1220 } else if (avg_load < min_load) {
1221 min_load = avg_load;
1222 idlest = group;
1223 }
da5a5522 1224nextgroup:
147cbb4b
NP
1225 group = group->next;
1226 } while (group != sd->groups);
1227
1228 if (!idlest || 100*this_load < imbalance*min_load)
1229 return NULL;
1230 return idlest;
1231}
1232
1233/*
0feaece9 1234 * find_idlest_cpu - find the idlest cpu among the cpus in group.
147cbb4b 1235 */
95cdf3b7
IM
1236static int
1237find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
147cbb4b 1238{
da5a5522 1239 cpumask_t tmp;
147cbb4b
NP
1240 unsigned long load, min_load = ULONG_MAX;
1241 int idlest = -1;
1242 int i;
1243
da5a5522
BD
1244 /* Traverse only the allowed CPUs */
1245 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1246
1247 for_each_cpu_mask(i, tmp) {
2dd73a4f 1248 load = weighted_cpuload(i);
147cbb4b
NP
1249
1250 if (load < min_load || (load == min_load && i == this_cpu)) {
1251 min_load = load;
1252 idlest = i;
1253 }
1254 }
1255
1256 return idlest;
1257}
1258
476d139c
NP
1259/*
1260 * sched_balance_self: balance the current task (running on cpu) in domains
1261 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1262 * SD_BALANCE_EXEC.
1263 *
1265 * Balance, i.e. select the least loaded group.
1265 *
1266 * Returns the target CPU number, or the same CPU if no balancing is needed.
1267 *
1268 * preempt must be disabled.
1269 */
1270static int sched_balance_self(int cpu, int flag)
1271{
1272 struct task_struct *t = current;
1273 struct sched_domain *tmp, *sd = NULL;
147cbb4b 1274
c96d145e 1275 for_each_domain(cpu, tmp) {
9761eea8
IM
1276 /*
1277 * If power savings logic is enabled for a domain, stop there.
1278 */
5c45bf27
SS
1279 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1280 break;
476d139c
NP
1281 if (tmp->flags & flag)
1282 sd = tmp;
c96d145e 1283 }
476d139c
NP
1284
1285 while (sd) {
1286 cpumask_t span;
1287 struct sched_group *group;
1a848870
SS
1288 int new_cpu, weight;
1289
1290 if (!(sd->flags & flag)) {
1291 sd = sd->child;
1292 continue;
1293 }
476d139c
NP
1294
1295 span = sd->span;
1296 group = find_idlest_group(sd, t, cpu);
1a848870
SS
1297 if (!group) {
1298 sd = sd->child;
1299 continue;
1300 }
476d139c 1301
da5a5522 1302 new_cpu = find_idlest_cpu(group, t, cpu);
1a848870
SS
1303 if (new_cpu == -1 || new_cpu == cpu) {
1304 /* Now try balancing at a lower domain level of cpu */
1305 sd = sd->child;
1306 continue;
1307 }
476d139c 1308
1a848870 1309 /* Now try balancing at a lower domain level of new_cpu */
476d139c 1310 cpu = new_cpu;
476d139c
NP
1311 sd = NULL;
1312 weight = cpus_weight(span);
1313 for_each_domain(cpu, tmp) {
1314 if (weight <= cpus_weight(tmp->span))
1315 break;
1316 if (tmp->flags & flag)
1317 sd = tmp;
1318 }
1319 /* while loop will break here if sd == NULL */
1320 }
1321
1322 return cpu;
1323}
1324
1325#endif /* CONFIG_SMP */
1da177e4
LT
1326
1327/*
1328 * wake_idle() will wake a task on an idle cpu if task->cpu is
1329 * not idle and an idle cpu is available. The span of cpus to
1330 * search starts with cpus closest then further out as needed,
1331 * so we always favor a closer, idle cpu.
1332 *
1333 * Returns the CPU we should wake onto.
1334 */
1335#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
36c8b586 1336static int wake_idle(int cpu, struct task_struct *p)
1da177e4
LT
1337{
1338 cpumask_t tmp;
1339 struct sched_domain *sd;
1340 int i;
1341
4953198b
SS
1342 /*
1343 * If it is idle, then it is the best cpu to run this task.
1344 *
1345 * This cpu is also the best if it already has more than one task.
1346 * Siblings must also be busy (in most cases) as they didn't already
1347 * pick up the extra load from this cpu, and hence we need not check
1348 * sibling runqueue info. This avoids the checks and the cache miss
1349 * penalties associated with them.
1350 */
1351 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
1da177e4
LT
1352 return cpu;
1353
1354 for_each_domain(cpu, sd) {
1355 if (sd->flags & SD_WAKE_IDLE) {
e0f364f4 1356 cpus_and(tmp, sd->span, p->cpus_allowed);
1da177e4
LT
1357 for_each_cpu_mask(i, tmp) {
1358 if (idle_cpu(i))
1359 return i;
1360 }
9761eea8 1361 } else {
e0f364f4 1362 break;
9761eea8 1363 }
1da177e4
LT
1364 }
1365 return cpu;
1366}
1367#else
36c8b586 1368static inline int wake_idle(int cpu, struct task_struct *p)
1da177e4
LT
1369{
1370 return cpu;
1371}
1372#endif
1373
1374/***
1375 * try_to_wake_up - wake up a thread
1376 * @p: the to-be-woken-up thread
1377 * @state: the mask of task states that can be woken
1378 * @sync: do a synchronous wakeup?
1379 *
1380 * Put it on the run-queue if it's not already there. The "current"
1381 * thread is always on the run-queue (except when the actual
1382 * re-schedule is in progress), and as such you're allowed to do
1383 * the simpler "current->state = TASK_RUNNING" to mark yourself
1384 * runnable without the overhead of this.
1385 *
1386 * returns failure only if the task is already active.
1387 */
36c8b586 1388static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1da177e4
LT
1389{
1390 int cpu, this_cpu, success = 0;
1391 unsigned long flags;
1392 long old_state;
70b97a7f 1393 struct rq *rq;
1da177e4 1394#ifdef CONFIG_SMP
7897986b 1395 struct sched_domain *sd, *this_sd = NULL;
70b97a7f 1396 unsigned long load, this_load;
1da177e4
LT
1397 int new_cpu;
1398#endif
1399
1400 rq = task_rq_lock(p, &flags);
1401 old_state = p->state;
1402 if (!(old_state & state))
1403 goto out;
1404
dd41f596 1405 if (p->se.on_rq)
1da177e4
LT
1406 goto out_running;
1407
1408 cpu = task_cpu(p);
1409 this_cpu = smp_processor_id();
1410
1411#ifdef CONFIG_SMP
1412 if (unlikely(task_running(rq, p)))
1413 goto out_activate;
1414
7897986b
NP
1415 new_cpu = cpu;
1416
1da177e4
LT
1417 schedstat_inc(rq, ttwu_cnt);
1418 if (cpu == this_cpu) {
1419 schedstat_inc(rq, ttwu_local);
7897986b
NP
1420 goto out_set_cpu;
1421 }
1422
1423 for_each_domain(this_cpu, sd) {
1424 if (cpu_isset(cpu, sd->span)) {
1425 schedstat_inc(sd, ttwu_wake_remote);
1426 this_sd = sd;
1427 break;
1da177e4
LT
1428 }
1429 }
1da177e4 1430
7897986b 1431 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1da177e4
LT
1432 goto out_set_cpu;
1433
1da177e4 1434 /*
7897986b 1435 * Check for affine wakeup and passive balancing possibilities.
1da177e4 1436 */
7897986b
NP
1437 if (this_sd) {
1438 int idx = this_sd->wake_idx;
1439 unsigned int imbalance;
1da177e4 1440
a3f21bce
NP
1441 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1442
7897986b
NP
1443 load = source_load(cpu, idx);
1444 this_load = target_load(this_cpu, idx);
1da177e4 1445
7897986b
NP
1446 new_cpu = this_cpu; /* Wake to this CPU if we can */
1447
a3f21bce
NP
1448 if (this_sd->flags & SD_WAKE_AFFINE) {
1449 unsigned long tl = this_load;
33859f7f
MOS
1450 unsigned long tl_per_task;
1451
1452 tl_per_task = cpu_avg_load_per_task(this_cpu);
2dd73a4f 1453
1da177e4 1454 /*
a3f21bce
NP
1455 * If sync wakeup then subtract the (maximum possible)
1456 * effect of the currently running task from the load
1457 * of the current CPU:
1da177e4 1458 */
a3f21bce 1459 if (sync)
dd41f596 1460 tl -= current->se.load.weight;
a3f21bce
NP
1461
1462 if ((tl <= load &&
2dd73a4f 1463 tl + target_load(cpu, idx) <= tl_per_task) ||
dd41f596 1464 100*(tl + p->se.load.weight) <= imbalance*load) {
a3f21bce
NP
1465 /*
1466 * This domain has SD_WAKE_AFFINE and
1467 * p is cache cold in this domain, and
1468 * there is no bad imbalance.
1469 */
1470 schedstat_inc(this_sd, ttwu_move_affine);
1471 goto out_set_cpu;
1472 }
1473 }
1474
1475 /*
1476 * Start passive balancing when half the imbalance_pct
1477 * limit is reached.
1478 */
1479 if (this_sd->flags & SD_WAKE_BALANCE) {
1480 if (imbalance*this_load <= 100*load) {
1481 schedstat_inc(this_sd, ttwu_move_balance);
1482 goto out_set_cpu;
1483 }
1da177e4
LT
1484 }
1485 }
1486
1487 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1488out_set_cpu:
1489 new_cpu = wake_idle(new_cpu, p);
1490 if (new_cpu != cpu) {
1491 set_task_cpu(p, new_cpu);
1492 task_rq_unlock(rq, &flags);
1493 /* might preempt at this point */
1494 rq = task_rq_lock(p, &flags);
1495 old_state = p->state;
1496 if (!(old_state & state))
1497 goto out;
dd41f596 1498 if (p->se.on_rq)
1da177e4
LT
1499 goto out_running;
1500
1501 this_cpu = smp_processor_id();
1502 cpu = task_cpu(p);
1503 }
1504
1505out_activate:
1506#endif /* CONFIG_SMP */
dd41f596 1507 activate_task(rq, p, 1);
1da177e4
LT
1508 /*
1509 * Sync wakeups (i.e. those types of wakeups where the waker
1510 * has indicated that it will leave the CPU in short order)
1511 * don't trigger a preemption, if the woken up task will run on
1512 * this cpu. (in this case the 'I will reschedule' promise of
1513 * the waker guarantees that the freshly woken up task is going
1514 * to be considered on this CPU.)
1515 */
dd41f596
IM
1516 if (!sync || cpu != this_cpu)
1517 check_preempt_curr(rq, p);
1da177e4
LT
1518 success = 1;
1519
1520out_running:
1521 p->state = TASK_RUNNING;
1522out:
1523 task_rq_unlock(rq, &flags);
1524
1525 return success;
1526}
1527
36c8b586 1528int fastcall wake_up_process(struct task_struct *p)
1da177e4
LT
1529{
1530 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1531 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1532}
1da177e4
LT
1533EXPORT_SYMBOL(wake_up_process);
1534
36c8b586 1535int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1da177e4
LT
1536{
1537 return try_to_wake_up(p, state, 0);
1538}
1539
1da177e4
LT
1540/*
1541 * Perform scheduler related setup for a newly forked process p.
1542 * p is forked by current.
dd41f596
IM
1543 *
1544 * __sched_fork() is basic setup used by init_idle() too:
1545 */
1546static void __sched_fork(struct task_struct *p)
1547{
1548 p->se.wait_start_fair = 0;
1549 p->se.wait_start = 0;
1550 p->se.exec_start = 0;
1551 p->se.sum_exec_runtime = 0;
1552 p->se.delta_exec = 0;
1553 p->se.delta_fair_run = 0;
1554 p->se.delta_fair_sleep = 0;
1555 p->se.wait_runtime = 0;
1556 p->se.sum_wait_runtime = 0;
1557 p->se.sum_sleep_runtime = 0;
1558 p->se.sleep_start = 0;
1559 p->se.sleep_start_fair = 0;
1560 p->se.block_start = 0;
1561 p->se.sleep_max = 0;
1562 p->se.block_max = 0;
1563 p->se.exec_max = 0;
1564 p->se.wait_max = 0;
1565 p->se.wait_runtime_overruns = 0;
1566 p->se.wait_runtime_underruns = 0;
476d139c 1567
dd41f596
IM
1568 INIT_LIST_HEAD(&p->run_list);
1569 p->se.on_rq = 0;
476d139c 1570
1da177e4
LT
1571 /*
1572 * We mark the process as running here, but have not actually
1573 * inserted it onto the runqueue yet. This guarantees that
1574 * nobody will actually run it, and a signal or other external
1575 * event cannot wake it up and insert it on the runqueue either.
1576 */
1577 p->state = TASK_RUNNING;
dd41f596
IM
1578}
1579
1580/*
1581 * fork()/clone()-time setup:
1582 */
1583void sched_fork(struct task_struct *p, int clone_flags)
1584{
1585 int cpu = get_cpu();
1586
1587 __sched_fork(p);
1588
1589#ifdef CONFIG_SMP
1590 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1591#endif
1592 __set_task_cpu(p, cpu);
b29739f9
IM
1593
1594 /*
1595 * Make sure we do not leak PI boosting priority to the child:
1596 */
1597 p->prio = current->normal_prio;
1598
52f17b6c 1599#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
dd41f596 1600 if (likely(sched_info_on()))
52f17b6c 1601 memset(&p->sched_info, 0, sizeof(p->sched_info));
1da177e4 1602#endif
d6077cb8 1603#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4866cde0
NP
1604 p->oncpu = 0;
1605#endif
1da177e4 1606#ifdef CONFIG_PREEMPT
4866cde0 1607 /* Want to start with kernel preemption disabled. */
a1261f54 1608 task_thread_info(p)->preempt_count = 1;
1da177e4 1609#endif
476d139c 1610 put_cpu();
1da177e4
LT
1611}
1612
dd41f596
IM
1613/*
1614 * After fork, the child runs first by default. If set to 0 then
1615 * the parent will (try to) run first.
1616 */
1617unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1618
1da177e4
LT
1619/*
1620 * wake_up_new_task - wake up a newly created task for the first time.
1621 *
1622 * This function will do some initial scheduler statistics housekeeping
1623 * that must be done for every newly created context, then puts the task
1624 * on the runqueue and wakes it.
1625 */
36c8b586 1626void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1da177e4
LT
1627{
1628 unsigned long flags;
dd41f596
IM
1629 struct rq *rq;
1630 int this_cpu;
1da177e4
LT
1631
1632 rq = task_rq_lock(p, &flags);
147cbb4b 1633 BUG_ON(p->state != TASK_RUNNING);
dd41f596 1634 this_cpu = smp_processor_id(); /* parent's CPU */
1da177e4
LT
1635
1636 p->prio = effective_prio(p);
1637
dd41f596
IM
1638 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1639 task_cpu(p) != this_cpu || !current->se.on_rq) {
1640 activate_task(rq, p, 0);
1da177e4 1641 } else {
1da177e4 1642 /*
dd41f596
IM
1643 * Let the scheduling class do new task startup
1644 * management (if any):
1da177e4 1645 */
dd41f596 1646 p->sched_class->task_new(rq, p);
1da177e4 1647 }
dd41f596
IM
1648 check_preempt_curr(rq, p);
1649 task_rq_unlock(rq, &flags);
1da177e4
LT
1650}
1651
4866cde0
NP
1652/**
1653 * prepare_task_switch - prepare to switch tasks
1654 * @rq: the runqueue preparing to switch
1655 * @next: the task we are going to switch to.
1656 *
1657 * This is called with the rq lock held and interrupts off. It must
1658 * be paired with a subsequent finish_task_switch after the context
1659 * switch.
1660 *
1661 * prepare_task_switch sets up locking and calls architecture specific
1662 * hooks.
1663 */
70b97a7f 1664static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
4866cde0
NP
1665{
1666 prepare_lock_switch(rq, next);
1667 prepare_arch_switch(next);
1668}
1669
1da177e4
LT
1670/**
1671 * finish_task_switch - clean up after a task-switch
344babaa 1672 * @rq: runqueue associated with task-switch
1da177e4
LT
1673 * @prev: the thread we just switched away from.
1674 *
4866cde0
NP
1675 * finish_task_switch must be called after the context switch, paired
1676 * with a prepare_task_switch call before the context switch.
1677 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1678 * and do any other architecture-specific cleanup actions.
1da177e4
LT
1679 *
1680 * Note that we may have delayed dropping an mm in context_switch(). If
1681 * so, we finish that here outside of the runqueue lock. (Doing it
1682 * with the lock held can cause deadlocks; see schedule() for
1683 * details.)
1684 */
70b97a7f 1685static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1da177e4
LT
1686 __releases(rq->lock)
1687{
1da177e4 1688 struct mm_struct *mm = rq->prev_mm;
55a101f8 1689 long prev_state;
1da177e4
LT
1690
1691 rq->prev_mm = NULL;
1692
1693 /*
1694 * A task struct has one reference for the use as "current".
c394cc9f 1695 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
55a101f8
ON
1696 * schedule one last time. The schedule call will never return, and
1697 * the scheduled task must drop that reference.
c394cc9f 1698 * The test for TASK_DEAD must occur while the runqueue locks are
1da177e4
LT
1699 * still held, otherwise prev could be scheduled on another cpu, die
1700 * there before we look at prev->state, and then the reference would
1701 * be dropped twice.
1702 * Manfred Spraul <manfred@colorfullife.com>
1703 */
55a101f8 1704 prev_state = prev->state;
4866cde0
NP
1705 finish_arch_switch(prev);
1706 finish_lock_switch(rq, prev);
1da177e4
LT
1707 if (mm)
1708 mmdrop(mm);
c394cc9f 1709 if (unlikely(prev_state == TASK_DEAD)) {
c6fd91f0 1710 /*
1711 * Remove function-return probe instances associated with this
1712 * task and put them back on the free list.
9761eea8 1713 */
c6fd91f0 1714 kprobe_flush_task(prev);
1da177e4 1715 put_task_struct(prev);
c6fd91f0 1716 }
1da177e4
LT
1717}
1718
1719/**
1720 * schedule_tail - first thing a freshly forked thread must call.
1721 * @prev: the thread we just switched away from.
1722 */
36c8b586 1723asmlinkage void schedule_tail(struct task_struct *prev)
1da177e4
LT
1724 __releases(rq->lock)
1725{
70b97a7f
IM
1726 struct rq *rq = this_rq();
1727
4866cde0
NP
1728 finish_task_switch(rq, prev);
1729#ifdef __ARCH_WANT_UNLOCKED_CTXSW
1730 /* In this case, finish_task_switch does not reenable preemption */
1731 preempt_enable();
1732#endif
1da177e4
LT
1733 if (current->set_child_tid)
1734 put_user(current->pid, current->set_child_tid);
1735}
1736
1737/*
1738 * context_switch - switch to the new MM and the new
1739 * thread's register state.
1740 */
dd41f596 1741static inline void
70b97a7f 1742context_switch(struct rq *rq, struct task_struct *prev,
36c8b586 1743 struct task_struct *next)
1da177e4 1744{
dd41f596 1745 struct mm_struct *mm, *oldmm;
1da177e4 1746
dd41f596
IM
1747 prepare_task_switch(rq, next);
1748 mm = next->mm;
1749 oldmm = prev->active_mm;
9226d125
ZA
1750 /*
1751 * For paravirt, this is coupled with an exit in switch_to to
1752 * combine the page table reload and the switch backend into
1753 * one hypercall.
1754 */
1755 arch_enter_lazy_cpu_mode();
1756
dd41f596 1757 if (unlikely(!mm)) {
1da177e4
LT
1758 next->active_mm = oldmm;
1759 atomic_inc(&oldmm->mm_count);
1760 enter_lazy_tlb(oldmm, next);
1761 } else
1762 switch_mm(oldmm, mm, next);
1763
dd41f596 1764 if (unlikely(!prev->mm)) {
1da177e4 1765 prev->active_mm = NULL;
1da177e4
LT
1766 rq->prev_mm = oldmm;
1767 }
3a5f5e48
IM
1768 /*
1769 * The runqueue lock will be released by the next
1770 * task (which is an invalid locking op but in the case
1771 * of the scheduler it's an obvious special-case), so we
1772 * do an early lockdep release here:
1773 */
1774#ifndef __ARCH_WANT_UNLOCKED_CTXSW
8a25d5de 1775 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
3a5f5e48 1776#endif
1da177e4
LT
1777
1778 /* Here we just switch the register state and the stack. */
1779 switch_to(prev, next, prev);
1780
dd41f596
IM
1781 barrier();
1782 /*
1783 * this_rq must be evaluated again because prev may have moved
1784 * CPUs since it called schedule(), thus the 'rq' on its stack
1785 * frame will be invalid.
1786 */
1787 finish_task_switch(this_rq(), prev);
1da177e4
LT
1788}
1789
1790/*
1791 * nr_running, nr_uninterruptible and nr_context_switches:
1792 *
1793 * externally visible scheduler statistics: current number of runnable
1794 * threads, current number of uninterruptible-sleeping threads, total
1795 * number of context switches performed since bootup.
1796 */
1797unsigned long nr_running(void)
1798{
1799 unsigned long i, sum = 0;
1800
1801 for_each_online_cpu(i)
1802 sum += cpu_rq(i)->nr_running;
1803
1804 return sum;
1805}
1806
1807unsigned long nr_uninterruptible(void)
1808{
1809 unsigned long i, sum = 0;
1810
0a945022 1811 for_each_possible_cpu(i)
1da177e4
LT
1812 sum += cpu_rq(i)->nr_uninterruptible;
1813
1814 /*
1815 * Since we read the counters lockless, it might be slightly
1816 * inaccurate. Do not allow it to go below zero though:
1817 */
1818 if (unlikely((long)sum < 0))
1819 sum = 0;
1820
1821 return sum;
1822}
1823
1824unsigned long long nr_context_switches(void)
1825{
cc94abfc
SR
1826 int i;
1827 unsigned long long sum = 0;
1da177e4 1828
0a945022 1829 for_each_possible_cpu(i)
1da177e4
LT
1830 sum += cpu_rq(i)->nr_switches;
1831
1832 return sum;
1833}
1834
1835unsigned long nr_iowait(void)
1836{
1837 unsigned long i, sum = 0;
1838
0a945022 1839 for_each_possible_cpu(i)
1da177e4
LT
1840 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1841
1842 return sum;
1843}
1844
db1b1fef
JS
1845unsigned long nr_active(void)
1846{
1847 unsigned long i, running = 0, uninterruptible = 0;
1848
1849 for_each_online_cpu(i) {
1850 running += cpu_rq(i)->nr_running;
1851 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1852 }
1853
1854 if (unlikely((long)uninterruptible < 0))
1855 uninterruptible = 0;
1856
1857 return running + uninterruptible;
1858}
1859
48f24c4d 1860/*
dd41f596
IM
1861 * Update rq->cpu_load[] statistics. This function is usually called every
1862 * scheduler tick (TICK_NSEC).
48f24c4d 1863 */
dd41f596 1864static void update_cpu_load(struct rq *this_rq)
48f24c4d 1865{
dd41f596
IM
1866 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1867 unsigned long total_load = this_rq->ls.load.weight;
1868 unsigned long this_load = total_load;
1869 struct load_stat *ls = &this_rq->ls;
1870 u64 now = __rq_clock(this_rq);
1871 int i, scale;
1872
1873 this_rq->nr_load_updates++;
1874 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1875 goto do_avg;
1876
1877 /* Update delta_fair/delta_exec fields first */
1878 update_curr_load(this_rq, now);
1879
1880 fair_delta64 = ls->delta_fair + 1;
1881 ls->delta_fair = 0;
1882
1883 exec_delta64 = ls->delta_exec + 1;
1884 ls->delta_exec = 0;
1885
1886 sample_interval64 = now - ls->load_update_last;
1887 ls->load_update_last = now;
1888
1889 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1890 sample_interval64 = TICK_NSEC;
1891
1892 if (exec_delta64 > sample_interval64)
1893 exec_delta64 = sample_interval64;
1894
1895 idle_delta64 = sample_interval64 - exec_delta64;
1896
1897 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1898 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1899
1900 this_load = (unsigned long)tmp64;
1901
1902do_avg:
1903
1904 /* Update our load: */
1905 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1906 unsigned long old_load, new_load;
1907
1908 /* scale is effectively 1 << i now, and >> i divides by scale */
1909
1910 old_load = this_rq->cpu_load[i];
1911 new_load = this_load;
1912
1913 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
1914 }
48f24c4d
IM
1915}
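The decay loop above is easier to follow with concrete numbers. Below is a minimal, standalone user-space sketch, not part of sched.c (CPU_LOAD_IDX_MAX and the synthetic sample series are invented for illustration), that applies the same recurrence cpu_load[i] = (old * (2^i - 1) + new) >> i; the higher the index, the more slowly the estimate reacts to spikes.

#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

int main(void)
{
	unsigned long cpu_load[CPU_LOAD_IDX_MAX] = { 0 };
	unsigned long samples[] = { 1024, 1024, 0, 0, 2048, 0, 0, 0 };
	int i, scale, t;

	for (t = 0; t < 8; t++) {
		unsigned long this_load = samples[t];

		for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
			unsigned long old_load = cpu_load[i];

			/* scale is 1 << i, so >> i divides by scale */
			cpu_load[i] = (old_load * (scale - 1) + this_load) >> i;
		}
		printf("tick %d: load[0]=%4lu load[2]=%4lu load[4]=%4lu\n",
		       t, cpu_load[0], cpu_load[2], cpu_load[4]);
	}
	return 0;
}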
1916
dd41f596
IM
1917#ifdef CONFIG_SMP
1918
1da177e4
LT
1919/*
1920 * double_rq_lock - safely lock two runqueues
1921 *
1922 * Note this does not disable interrupts like task_rq_lock;
1923 * you need to do so manually before calling.
1924 */
70b97a7f 1925static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1da177e4
LT
1926 __acquires(rq1->lock)
1927 __acquires(rq2->lock)
1928{
054b9108 1929 BUG_ON(!irqs_disabled());
1da177e4
LT
1930 if (rq1 == rq2) {
1931 spin_lock(&rq1->lock);
1932 __acquire(rq2->lock); /* Fake it out ;) */
1933 } else {
c96d145e 1934 if (rq1 < rq2) {
1da177e4
LT
1935 spin_lock(&rq1->lock);
1936 spin_lock(&rq2->lock);
1937 } else {
1938 spin_lock(&rq2->lock);
1939 spin_lock(&rq1->lock);
1940 }
1941 }
1942}
1943
1944/*
1945 * double_rq_unlock - safely unlock two runqueues
1946 *
1947 * Note this does not restore interrupts like task_rq_unlock;
1948 * you need to do so manually after calling.
1949 */
70b97a7f 1950static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1da177e4
LT
1951 __releases(rq1->lock)
1952 __releases(rq2->lock)
1953{
1954 spin_unlock(&rq1->lock);
1955 if (rq1 != rq2)
1956 spin_unlock(&rq2->lock);
1957 else
1958 __release(rq2->lock);
1959}
1960
1961/*
1962 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1963 */
70b97a7f 1964static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
1da177e4
LT
1965 __releases(this_rq->lock)
1966 __acquires(busiest->lock)
1967 __acquires(this_rq->lock)
1968{
054b9108
KK
1969 if (unlikely(!irqs_disabled())) {
1970 /* printk() doesn't work well under rq->lock */
1971 spin_unlock(&this_rq->lock);
1972 BUG_ON(1);
1973 }
1da177e4 1974 if (unlikely(!spin_trylock(&busiest->lock))) {
c96d145e 1975 if (busiest < this_rq) {
1da177e4
LT
1976 spin_unlock(&this_rq->lock);
1977 spin_lock(&busiest->lock);
1978 spin_lock(&this_rq->lock);
1979 } else
1980 spin_lock(&busiest->lock);
1981 }
1982}
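Both helpers above avoid AB-BA deadlock by always taking the lower-addressed lock first. The same discipline is shown in this self-contained user-space sketch, where pthread mutexes stand in for rq->lock and struct rq is a toy stand-in rather than the kernel's structure (compile with -pthread):

#include <pthread.h>
#include <stdio.h>

struct rq {
	pthread_mutex_t lock;
	int nr_running;
};

static void toy_double_rq_lock(struct rq *rq1, struct rq *rq2)
{
	if (rq1 == rq2) {
		pthread_mutex_lock(&rq1->lock);
	} else if (rq1 < rq2) {
		/* same ordering trick as above: lower address first */
		pthread_mutex_lock(&rq1->lock);
		pthread_mutex_lock(&rq2->lock);
	} else {
		pthread_mutex_lock(&rq2->lock);
		pthread_mutex_lock(&rq1->lock);
	}
}

static void toy_double_rq_unlock(struct rq *rq1, struct rq *rq2)
{
	pthread_mutex_unlock(&rq1->lock);
	if (rq1 != rq2)
		pthread_mutex_unlock(&rq2->lock);
}

int main(void)
{
	struct rq a = { PTHREAD_MUTEX_INITIALIZER, 3 };
	struct rq b = { PTHREAD_MUTEX_INITIALIZER, 1 };

	/* whichever order callers pass them in, the lock order is fixed */
	toy_double_rq_lock(&a, &b);
	printf("a=%d b=%d\n", a.nr_running, b.nr_running);
	toy_double_rq_unlock(&a, &b);
	return 0;
}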
1983
1da177e4
LT
1984/*
1985 * If dest_cpu is allowed for this process, migrate the task to it.
1986 * This is accomplished by forcing the cpu_allowed mask to only
1987 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
1988 * the cpu_allowed mask is restored.
1989 */
36c8b586 1990static void sched_migrate_task(struct task_struct *p, int dest_cpu)
1da177e4 1991{
70b97a7f 1992 struct migration_req req;
1da177e4 1993 unsigned long flags;
70b97a7f 1994 struct rq *rq;
1da177e4
LT
1995
1996 rq = task_rq_lock(p, &flags);
1997 if (!cpu_isset(dest_cpu, p->cpus_allowed)
1998 || unlikely(cpu_is_offline(dest_cpu)))
1999 goto out;
2000
2001 /* force the process onto the specified CPU */
2002 if (migrate_task(p, dest_cpu, &req)) {
2003 /* Need to wait for migration thread (might exit: take ref). */
2004 struct task_struct *mt = rq->migration_thread;
36c8b586 2005
1da177e4
LT
2006 get_task_struct(mt);
2007 task_rq_unlock(rq, &flags);
2008 wake_up_process(mt);
2009 put_task_struct(mt);
2010 wait_for_completion(&req.done);
36c8b586 2011
1da177e4
LT
2012 return;
2013 }
2014out:
2015 task_rq_unlock(rq, &flags);
2016}
2017
2018/*
476d139c
NP
2019 * sched_exec - execve() is a valuable balancing opportunity, because at
2020 * this point the task has the smallest effective memory and cache footprint.
1da177e4
LT
2021 */
2022void sched_exec(void)
2023{
1da177e4 2024 int new_cpu, this_cpu = get_cpu();
476d139c 2025 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
1da177e4 2026 put_cpu();
476d139c
NP
2027 if (new_cpu != this_cpu)
2028 sched_migrate_task(current, new_cpu);
1da177e4
LT
2029}
2030
2031/*
2032 * pull_task - move a task from a remote runqueue to the local runqueue.
2033 * Both runqueues must be locked.
2034 */
dd41f596
IM
2035static void pull_task(struct rq *src_rq, struct task_struct *p,
2036 struct rq *this_rq, int this_cpu)
1da177e4 2037{
dd41f596 2038 deactivate_task(src_rq, p, 0);
1da177e4 2039 set_task_cpu(p, this_cpu);
dd41f596 2040 activate_task(this_rq, p, 0);
1da177e4
LT
2041 /*
2042 * Note that idle threads have a prio of MAX_PRIO, so this test
2043 * is always true for them.
2044 */
dd41f596 2045 check_preempt_curr(this_rq, p);
1da177e4
LT
2046}
2047
2048/*
2049 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2050 */
858119e1 2051static
70b97a7f 2052int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
d15bcfdb 2053 struct sched_domain *sd, enum cpu_idle_type idle,
95cdf3b7 2054 int *all_pinned)
1da177e4
LT
2055{
2056 /*
2057 * We do not migrate tasks that are:
2058 * 1) running (obviously), or
2059 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2060 * 3) are cache-hot on their current CPU.
2061 */
1da177e4
LT
2062 if (!cpu_isset(this_cpu, p->cpus_allowed))
2063 return 0;
81026794
NP
2064 *all_pinned = 0;
2065
2066 if (task_running(rq, p))
2067 return 0;
1da177e4
LT
2068
2069 /*
dd41f596 2070 * Aggressive migration if too many balance attempts have failed:
1da177e4 2071 */
dd41f596 2072 if (sd->nr_balance_failed > sd->cache_nice_tries)
1da177e4
LT
2073 return 1;
2074
1da177e4
LT
2075 return 1;
2076}
2077
dd41f596 2078static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2dd73a4f 2079 unsigned long max_nr_move, unsigned long max_load_move,
d15bcfdb 2080 struct sched_domain *sd, enum cpu_idle_type idle,
dd41f596
IM
2081 int *all_pinned, unsigned long *load_moved,
2082 int this_best_prio, int best_prio, int best_prio_seen,
2083 struct rq_iterator *iterator)
1da177e4 2084{
dd41f596
IM
2085 int pulled = 0, pinned = 0, skip_for_load;
2086 struct task_struct *p;
2087 long rem_load_move = max_load_move;
1da177e4 2088
2dd73a4f 2089 if (max_nr_move == 0 || max_load_move == 0)
1da177e4
LT
2090 goto out;
2091
81026794
NP
2092 pinned = 1;
2093
1da177e4 2094 /*
dd41f596 2095 * Start the load-balancing iterator:
1da177e4 2096 */
dd41f596
IM
2097 p = iterator->start(iterator->arg);
2098next:
2099 if (!p)
1da177e4 2100 goto out;
50ddd969
PW
2101 /*
2102 * To help distribute high-priority tasks across CPUs, we don't
2103 * skip a task if it will be the highest-priority task (i.e. smallest
2104 * prio value) on its new queue, regardless of its load weight.
2105 */
dd41f596
IM
2106 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2107 SCHED_LOAD_SCALE_FUZZ;
2108 if (skip_for_load && p->prio < this_best_prio)
2109 skip_for_load = !best_prio_seen && p->prio == best_prio;
615052dc 2110 if (skip_for_load ||
dd41f596 2111 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
48f24c4d 2112
dd41f596
IM
2113 best_prio_seen |= p->prio == best_prio;
2114 p = iterator->next(iterator->arg);
2115 goto next;
1da177e4
LT
2116 }
2117
dd41f596 2118 pull_task(busiest, p, this_rq, this_cpu);
1da177e4 2119 pulled++;
dd41f596 2120 rem_load_move -= p->se.load.weight;
1da177e4 2121
2dd73a4f
PW
2122 /*
2123 * We only want to steal up to the prescribed number of tasks
2124 * and the prescribed amount of weighted load.
2125 */
2126 if (pulled < max_nr_move && rem_load_move > 0) {
dd41f596
IM
2127 if (p->prio < this_best_prio)
2128 this_best_prio = p->prio;
2129 p = iterator->next(iterator->arg);
2130 goto next;
1da177e4
LT
2131 }
2132out:
2133 /*
2134 * Right now, this is the only place pull_task() is called,
2135 * so we can safely collect pull_task() stats here rather than
2136 * inside pull_task().
2137 */
2138 schedstat_add(sd, lb_gained[idle], pulled);
81026794
NP
2139
2140 if (all_pinned)
2141 *all_pinned = pinned;
dd41f596 2142 *load_moved = max_load_move - rem_load_move;
1da177e4
LT
2143 return pulled;
2144}
2145
dd41f596
IM
2146/*
2147 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2148 * load from busiest to this_rq, as part of a balancing operation within
2149 * "domain". Returns the number of tasks moved.
2150 *
2151 * Called with both runqueues locked.
2152 */
2153static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2154 unsigned long max_nr_move, unsigned long max_load_move,
2155 struct sched_domain *sd, enum cpu_idle_type idle,
2156 int *all_pinned)
2157{
2158 struct sched_class *class = sched_class_highest;
2159 unsigned long load_moved, total_nr_moved = 0, nr_moved;
2160 long rem_load_move = max_load_move;
2161
2162 do {
2163 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2164 max_nr_move, (unsigned long)rem_load_move,
2165 sd, idle, all_pinned, &load_moved);
2166 total_nr_moved += nr_moved;
2167 max_nr_move -= nr_moved;
2168 rem_load_move -= load_moved;
2169 class = class->next;
2170 } while (class && max_nr_move && rem_load_move > 0);
2171
2172 return total_nr_moved;
2173}
2174
1da177e4
LT
2175/*
2176 * find_busiest_group finds and returns the busiest CPU group within the
48f24c4d
IM
2177 * domain. It calculates and returns the amount of weighted load which
2178 * should be moved to restore balance via the imbalance parameter.
1da177e4
LT
2179 */
2180static struct sched_group *
2181find_busiest_group(struct sched_domain *sd, int this_cpu,
dd41f596
IM
2182 unsigned long *imbalance, enum cpu_idle_type idle,
2183 int *sd_idle, cpumask_t *cpus, int *balance)
1da177e4
LT
2184{
2185 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2186 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
0c117f1b 2187 unsigned long max_pull;
2dd73a4f
PW
2188 unsigned long busiest_load_per_task, busiest_nr_running;
2189 unsigned long this_load_per_task, this_nr_running;
7897986b 2190 int load_idx;
5c45bf27
SS
2191#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2192 int power_savings_balance = 1;
2193 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2194 unsigned long min_nr_running = ULONG_MAX;
2195 struct sched_group *group_min = NULL, *group_leader = NULL;
2196#endif
1da177e4
LT
2197
2198 max_load = this_load = total_load = total_pwr = 0;
2dd73a4f
PW
2199 busiest_load_per_task = busiest_nr_running = 0;
2200 this_load_per_task = this_nr_running = 0;
d15bcfdb 2201 if (idle == CPU_NOT_IDLE)
7897986b 2202 load_idx = sd->busy_idx;
d15bcfdb 2203 else if (idle == CPU_NEWLY_IDLE)
7897986b
NP
2204 load_idx = sd->newidle_idx;
2205 else
2206 load_idx = sd->idle_idx;
1da177e4
LT
2207
2208 do {
5c45bf27 2209 unsigned long load, group_capacity;
1da177e4
LT
2210 int local_group;
2211 int i;
783609c6 2212 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2dd73a4f 2213 unsigned long sum_nr_running, sum_weighted_load;
1da177e4
LT
2214
2215 local_group = cpu_isset(this_cpu, group->cpumask);
2216
783609c6
SS
2217 if (local_group)
2218 balance_cpu = first_cpu(group->cpumask);
2219
1da177e4 2220 /* Tally up the load of all CPUs in the group */
2dd73a4f 2221 sum_weighted_load = sum_nr_running = avg_load = 0;
1da177e4
LT
2222
2223 for_each_cpu_mask(i, group->cpumask) {
0a2966b4
CL
2224 struct rq *rq;
2225
2226 if (!cpu_isset(i, *cpus))
2227 continue;
2228
2229 rq = cpu_rq(i);
2dd73a4f 2230
5969fe06
NP
2231 if (*sd_idle && !idle_cpu(i))
2232 *sd_idle = 0;
2233
1da177e4 2234 /* Bias balancing toward cpus of our domain */
783609c6
SS
2235 if (local_group) {
2236 if (idle_cpu(i) && !first_idle_cpu) {
2237 first_idle_cpu = 1;
2238 balance_cpu = i;
2239 }
2240
a2000572 2241 load = target_load(i, load_idx);
783609c6 2242 } else
a2000572 2243 load = source_load(i, load_idx);
1da177e4
LT
2244
2245 avg_load += load;
2dd73a4f 2246 sum_nr_running += rq->nr_running;
dd41f596 2247 sum_weighted_load += weighted_cpuload(i);
1da177e4
LT
2248 }
2249
783609c6
SS
2250 /*
2251 * The first idle cpu or the first cpu (busiest) in this sched group
2252 * is eligible for doing load balancing at this and higher
2253 * domains.
2254 */
2255 if (local_group && balance_cpu != this_cpu && balance) {
2256 *balance = 0;
2257 goto ret;
2258 }
2259
1da177e4 2260 total_load += avg_load;
5517d86b 2261 total_pwr += group->__cpu_power;
1da177e4
LT
2262
2263 /* Adjust by relative CPU power of the group */
5517d86b
ED
2264 avg_load = sg_div_cpu_power(group,
2265 avg_load * SCHED_LOAD_SCALE);
1da177e4 2266
5517d86b 2267 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
5c45bf27 2268
1da177e4
LT
2269 if (local_group) {
2270 this_load = avg_load;
2271 this = group;
2dd73a4f
PW
2272 this_nr_running = sum_nr_running;
2273 this_load_per_task = sum_weighted_load;
2274 } else if (avg_load > max_load &&
5c45bf27 2275 sum_nr_running > group_capacity) {
1da177e4
LT
2276 max_load = avg_load;
2277 busiest = group;
2dd73a4f
PW
2278 busiest_nr_running = sum_nr_running;
2279 busiest_load_per_task = sum_weighted_load;
1da177e4 2280 }
5c45bf27
SS
2281
2282#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2283 /*
2284 * Busy processors will not participate in power savings
2285 * balance.
2286 */
dd41f596
IM
2287 if (idle == CPU_NOT_IDLE ||
2288 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2289 goto group_next;
5c45bf27
SS
2290
2291 /*
2292 * If the local group is idle or completely loaded, there is
2293 * no need to do power savings balance at this domain.
2294 */
2295 if (local_group && (this_nr_running >= group_capacity ||
2296 !this_nr_running))
2297 power_savings_balance = 0;
2298
dd41f596 2299 /*
5c45bf27
SS
2300 * If a group is already running at full capacity or idle,
2301 * don't include that group in power savings calculations
dd41f596
IM
2302 */
2303 if (!power_savings_balance || sum_nr_running >= group_capacity
5c45bf27 2304 || !sum_nr_running)
dd41f596 2305 goto group_next;
5c45bf27 2306
dd41f596 2307 /*
5c45bf27 2308 * Calculate the group which has the least non-idle load.
dd41f596
IM
2309 * This is the group from where we need to pick up the load
2310 * for saving power
2311 */
2312 if ((sum_nr_running < min_nr_running) ||
2313 (sum_nr_running == min_nr_running &&
5c45bf27
SS
2314 first_cpu(group->cpumask) <
2315 first_cpu(group_min->cpumask))) {
dd41f596
IM
2316 group_min = group;
2317 min_nr_running = sum_nr_running;
5c45bf27
SS
2318 min_load_per_task = sum_weighted_load /
2319 sum_nr_running;
dd41f596 2320 }
5c45bf27 2321
dd41f596 2322 /*
5c45bf27 2323 * Calculate the group which is nearly at its
dd41f596
IM
2324 * capacity but still has some space to pick up some load
2325 * from another group and save more power.
2326 */
2327 if (sum_nr_running <= group_capacity - 1) {
2328 if (sum_nr_running > leader_nr_running ||
2329 (sum_nr_running == leader_nr_running &&
2330 first_cpu(group->cpumask) >
2331 first_cpu(group_leader->cpumask))) {
2332 group_leader = group;
2333 leader_nr_running = sum_nr_running;
2334 }
48f24c4d 2335 }
5c45bf27
SS
2336group_next:
2337#endif
1da177e4
LT
2338 group = group->next;
2339 } while (group != sd->groups);
2340
2dd73a4f 2341 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
1da177e4
LT
2342 goto out_balanced;
2343
2344 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2345
2346 if (this_load >= avg_load ||
2347 100*max_load <= sd->imbalance_pct*this_load)
2348 goto out_balanced;
2349
2dd73a4f 2350 busiest_load_per_task /= busiest_nr_running;
1da177e4
LT
2351 /*
2352 * We're trying to get all the cpus to the average_load, so we don't
2353 * want to push ourselves above the average load, nor do we wish to
2354 * reduce the max loaded cpu below the average load, as either of these
2355 * actions would just result in more rebalancing later, and ping-pong
2356 * tasks around. Thus we look for the minimum possible imbalance.
2357 * Negative imbalances (*we* are more loaded than anyone else) will
2358 * be counted as no imbalance for these purposes -- we can't fix that
2359 * by pulling tasks to us. Be careful of negative numbers as they'll
2360 * appear as very large values with unsigned longs.
2361 */
2dd73a4f
PW
2362 if (max_load <= busiest_load_per_task)
2363 goto out_balanced;
2364
2365 /*
2366 * In the presence of smp nice balancing, certain scenarios can have
2367 * max load less than avg load(as we skip the groups at or below
2368 * its cpu_power, while calculating max_load..)
2369 */
2370 if (max_load < avg_load) {
2371 *imbalance = 0;
2372 goto small_imbalance;
2373 }
0c117f1b
SS
2374
2375 /* Don't want to pull so many tasks that a group would go idle */
2dd73a4f 2376 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
0c117f1b 2377
1da177e4 2378 /* How much load to actually move to equalise the imbalance */
5517d86b
ED
2379 *imbalance = min(max_pull * busiest->__cpu_power,
2380 (avg_load - this_load) * this->__cpu_power)
1da177e4
LT
2381 / SCHED_LOAD_SCALE;
2382
2dd73a4f
PW
2383 /*
2384 * If *imbalance is less than the average load per runnable task,
2385 * there is no guarantee that any tasks will be moved, so we'll have
2386 * a think about bumping its value to force at least one task to be
2387 * moved.
2388 */
dd41f596 2389 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
48f24c4d 2390 unsigned long tmp, pwr_now, pwr_move;
2dd73a4f
PW
2391 unsigned int imbn;
2392
2393small_imbalance:
2394 pwr_move = pwr_now = 0;
2395 imbn = 2;
2396 if (this_nr_running) {
2397 this_load_per_task /= this_nr_running;
2398 if (busiest_load_per_task > this_load_per_task)
2399 imbn = 1;
2400 } else
2401 this_load_per_task = SCHED_LOAD_SCALE;
1da177e4 2402
dd41f596
IM
2403 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2404 busiest_load_per_task * imbn) {
2dd73a4f 2405 *imbalance = busiest_load_per_task;
1da177e4
LT
2406 return busiest;
2407 }
2408
2409 /*
2410 * OK, we don't have enough imbalance to justify moving tasks,
2411 * however we may be able to increase total CPU power used by
2412 * moving them.
2413 */
2414
5517d86b
ED
2415 pwr_now += busiest->__cpu_power *
2416 min(busiest_load_per_task, max_load);
2417 pwr_now += this->__cpu_power *
2418 min(this_load_per_task, this_load);
1da177e4
LT
2419 pwr_now /= SCHED_LOAD_SCALE;
2420
2421 /* Amount of load we'd subtract */
5517d86b
ED
2422 tmp = sg_div_cpu_power(busiest,
2423 busiest_load_per_task * SCHED_LOAD_SCALE);
1da177e4 2424 if (max_load > tmp)
5517d86b 2425 pwr_move += busiest->__cpu_power *
2dd73a4f 2426 min(busiest_load_per_task, max_load - tmp);
1da177e4
LT
2427
2428 /* Amount of load we'd add */
5517d86b 2429 if (max_load * busiest->__cpu_power <
33859f7f 2430 busiest_load_per_task * SCHED_LOAD_SCALE)
5517d86b
ED
2431 tmp = sg_div_cpu_power(this,
2432 max_load * busiest->__cpu_power);
1da177e4 2433 else
5517d86b
ED
2434 tmp = sg_div_cpu_power(this,
2435 busiest_load_per_task * SCHED_LOAD_SCALE);
2436 pwr_move += this->__cpu_power *
2437 min(this_load_per_task, this_load + tmp);
1da177e4
LT
2438 pwr_move /= SCHED_LOAD_SCALE;
2439
2440 /* Move if we gain throughput */
2441 if (pwr_move <= pwr_now)
2442 goto out_balanced;
2443
2dd73a4f 2444 *imbalance = busiest_load_per_task;
1da177e4
LT
2445 }
2446
1da177e4
LT
2447 return busiest;
2448
2449out_balanced:
5c45bf27 2450#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
d15bcfdb 2451 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
5c45bf27 2452 goto ret;
1da177e4 2453
5c45bf27
SS
2454 if (this == group_leader && group_leader != group_min) {
2455 *imbalance = min_load_per_task;
2456 return group_min;
2457 }
5c45bf27 2458#endif
783609c6 2459ret:
1da177e4
LT
2460 *imbalance = 0;
2461 return NULL;
2462}
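To see what the imbalance formula above actually yields, here is a toy calculation in standalone C. The group loads and powers are invented for illustration, and SCHED_LOAD_SCALE is assumed to be 1024:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long max_load = 3072, this_load = 1024, avg_load = 2048;
	unsigned long busiest_load_per_task = 1024;
	unsigned long busiest_power = SCHED_LOAD_SCALE;	/* one plain CPU */
	unsigned long this_power = SCHED_LOAD_SCALE;
	unsigned long max_pull, imbalance;

	/* don't pull so much that the busiest group would go idle */
	max_pull = min_ul(max_load - avg_load, max_load - busiest_load_per_task);

	/* how much load to move to equalise the imbalance */
	imbalance = min_ul(max_pull * busiest_power,
			   (avg_load - this_load) * this_power) / SCHED_LOAD_SCALE;

	printf("max_pull=%lu imbalance=%lu\n", max_pull, imbalance);
	return 0;
}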
2463
2464/*
2465 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2466 */
70b97a7f 2467static struct rq *
d15bcfdb 2468find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
0a2966b4 2469 unsigned long imbalance, cpumask_t *cpus)
1da177e4 2470{
70b97a7f 2471 struct rq *busiest = NULL, *rq;
2dd73a4f 2472 unsigned long max_load = 0;
1da177e4
LT
2473 int i;
2474
2475 for_each_cpu_mask(i, group->cpumask) {
dd41f596 2476 unsigned long wl;
0a2966b4
CL
2477
2478 if (!cpu_isset(i, *cpus))
2479 continue;
2480
48f24c4d 2481 rq = cpu_rq(i);
dd41f596 2482 wl = weighted_cpuload(i);
2dd73a4f 2483
dd41f596 2484 if (rq->nr_running == 1 && wl > imbalance)
2dd73a4f 2485 continue;
1da177e4 2486
dd41f596
IM
2487 if (wl > max_load) {
2488 max_load = wl;
48f24c4d 2489 busiest = rq;
1da177e4
LT
2490 }
2491 }
2492
2493 return busiest;
2494}
2495
77391d71
NP
2496/*
2497 * Max backoff if we encounter pinned tasks. The value is pretty
2498 * arbitrary; it just needs to be large enough.
2499 */
2500#define MAX_PINNED_INTERVAL 512
2501
48f24c4d
IM
2502static inline unsigned long minus_1_or_zero(unsigned long n)
2503{
2504 return n > 0 ? n - 1 : 0;
2505}
2506
1da177e4
LT
2507/*
2508 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2509 * tasks if there is an imbalance.
1da177e4 2510 */
70b97a7f 2511static int load_balance(int this_cpu, struct rq *this_rq,
d15bcfdb 2512 struct sched_domain *sd, enum cpu_idle_type idle,
783609c6 2513 int *balance)
1da177e4 2514{
48f24c4d 2515 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
1da177e4 2516 struct sched_group *group;
1da177e4 2517 unsigned long imbalance;
70b97a7f 2518 struct rq *busiest;
0a2966b4 2519 cpumask_t cpus = CPU_MASK_ALL;
fe2eea3f 2520 unsigned long flags;
5969fe06 2521
89c4710e
SS
2522 /*
2523 * When power savings policy is enabled for the parent domain, idle
2524 * sibling can pick up load irrespective of busy siblings. In this case,
dd41f596 2525 * let the state of idle sibling percolate up as CPU_IDLE, instead of
d15bcfdb 2526 * portraying it as CPU_NOT_IDLE.
89c4710e 2527 */
d15bcfdb 2528 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
89c4710e 2529 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
5969fe06 2530 sd_idle = 1;
1da177e4 2531
1da177e4
LT
2532 schedstat_inc(sd, lb_cnt[idle]);
2533
0a2966b4
CL
2534redo:
2535 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
783609c6
SS
2536 &cpus, balance);
2537
06066714 2538 if (*balance == 0)
783609c6 2539 goto out_balanced;
783609c6 2540
1da177e4
LT
2541 if (!group) {
2542 schedstat_inc(sd, lb_nobusyg[idle]);
2543 goto out_balanced;
2544 }
2545
0a2966b4 2546 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
1da177e4
LT
2547 if (!busiest) {
2548 schedstat_inc(sd, lb_nobusyq[idle]);
2549 goto out_balanced;
2550 }
2551
db935dbd 2552 BUG_ON(busiest == this_rq);
1da177e4
LT
2553
2554 schedstat_add(sd, lb_imbalance[idle], imbalance);
2555
2556 nr_moved = 0;
2557 if (busiest->nr_running > 1) {
2558 /*
2559 * Attempt to move tasks. If find_busiest_group has found
2560 * an imbalance but busiest->nr_running <= 1, the group is
2561 * still unbalanced. nr_moved simply stays zero, so it is
2562 * correctly treated as an imbalance.
2563 */
fe2eea3f 2564 local_irq_save(flags);
e17224bf 2565 double_rq_lock(this_rq, busiest);
1da177e4 2566 nr_moved = move_tasks(this_rq, this_cpu, busiest,
48f24c4d
IM
2567 minus_1_or_zero(busiest->nr_running),
2568 imbalance, sd, idle, &all_pinned);
e17224bf 2569 double_rq_unlock(this_rq, busiest);
fe2eea3f 2570 local_irq_restore(flags);
81026794 2571
46cb4b7c
SS
2572 /*
2573 * some other cpu did the load balance for us.
2574 */
2575 if (nr_moved && this_cpu != smp_processor_id())
2576 resched_cpu(this_cpu);
2577
81026794 2578 /* All tasks on this runqueue were pinned by CPU affinity */
0a2966b4
CL
2579 if (unlikely(all_pinned)) {
2580 cpu_clear(cpu_of(busiest), cpus);
2581 if (!cpus_empty(cpus))
2582 goto redo;
81026794 2583 goto out_balanced;
0a2966b4 2584 }
1da177e4 2585 }
81026794 2586
1da177e4
LT
2587 if (!nr_moved) {
2588 schedstat_inc(sd, lb_failed[idle]);
2589 sd->nr_balance_failed++;
2590
2591 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
1da177e4 2592
fe2eea3f 2593 spin_lock_irqsave(&busiest->lock, flags);
fa3b6ddc
SS
2594
2595 /* Don't kick the migration_thread if the curr
2596 * task on the busiest cpu can't be moved to this_cpu.
2597 */
2598 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
fe2eea3f 2599 spin_unlock_irqrestore(&busiest->lock, flags);
fa3b6ddc
SS
2600 all_pinned = 1;
2601 goto out_one_pinned;
2602 }
2603
1da177e4
LT
2604 if (!busiest->active_balance) {
2605 busiest->active_balance = 1;
2606 busiest->push_cpu = this_cpu;
81026794 2607 active_balance = 1;
1da177e4 2608 }
fe2eea3f 2609 spin_unlock_irqrestore(&busiest->lock, flags);
81026794 2610 if (active_balance)
1da177e4
LT
2611 wake_up_process(busiest->migration_thread);
2612
2613 /*
2614 * We've kicked active balancing, reset the failure
2615 * counter.
2616 */
39507451 2617 sd->nr_balance_failed = sd->cache_nice_tries+1;
1da177e4 2618 }
81026794 2619 } else
1da177e4
LT
2620 sd->nr_balance_failed = 0;
2621
81026794 2622 if (likely(!active_balance)) {
1da177e4
LT
2623 /* We were unbalanced, so reset the balancing interval */
2624 sd->balance_interval = sd->min_interval;
81026794
NP
2625 } else {
2626 /*
2627 * If we've begun active balancing, start to back off. This
2628 * case may not be covered by the all_pinned logic if there
2629 * is only 1 task on the busy runqueue (because we don't call
2630 * move_tasks).
2631 */
2632 if (sd->balance_interval < sd->max_interval)
2633 sd->balance_interval *= 2;
1da177e4
LT
2634 }
2635
5c45bf27 2636 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
89c4710e 2637 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
5969fe06 2638 return -1;
1da177e4
LT
2639 return nr_moved;
2640
2641out_balanced:
1da177e4
LT
2642 schedstat_inc(sd, lb_balanced[idle]);
2643
16cfb1c0 2644 sd->nr_balance_failed = 0;
fa3b6ddc
SS
2645
2646out_one_pinned:
1da177e4 2647 /* tune up the balancing interval */
77391d71
NP
2648 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2649 (sd->balance_interval < sd->max_interval))
1da177e4
LT
2650 sd->balance_interval *= 2;
2651
48f24c4d 2652 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
89c4710e 2653 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
5969fe06 2654 return -1;
1da177e4
LT
2655 return 0;
2656}
2657
2658/*
2659 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2660 * tasks if there is an imbalance.
2661 *
d15bcfdb 2662 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
1da177e4
LT
2663 * this_rq is locked.
2664 */
48f24c4d 2665static int
70b97a7f 2666load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
1da177e4
LT
2667{
2668 struct sched_group *group;
70b97a7f 2669 struct rq *busiest = NULL;
1da177e4
LT
2670 unsigned long imbalance;
2671 int nr_moved = 0;
5969fe06 2672 int sd_idle = 0;
0a2966b4 2673 cpumask_t cpus = CPU_MASK_ALL;
5969fe06 2674
89c4710e
SS
2675 /*
2676 * When power savings policy is enabled for the parent domain, idle
2677 * sibling can pick up load irrespective of busy siblings. In this case,
2678 * let the state of idle sibling percolate up as IDLE, instead of
d15bcfdb 2679 * portraying it as CPU_NOT_IDLE.
89c4710e
SS
2680 */
2681 if (sd->flags & SD_SHARE_CPUPOWER &&
2682 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
5969fe06 2683 sd_idle = 1;
1da177e4 2684
d15bcfdb 2685 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
0a2966b4 2686redo:
d15bcfdb 2687 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
783609c6 2688 &sd_idle, &cpus, NULL);
1da177e4 2689 if (!group) {
d15bcfdb 2690 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
16cfb1c0 2691 goto out_balanced;
1da177e4
LT
2692 }
2693
d15bcfdb 2694 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
0a2966b4 2695 &cpus);
db935dbd 2696 if (!busiest) {
d15bcfdb 2697 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
16cfb1c0 2698 goto out_balanced;
1da177e4
LT
2699 }
2700
db935dbd
NP
2701 BUG_ON(busiest == this_rq);
2702
d15bcfdb 2703 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
d6d5cfaf
NP
2704
2705 nr_moved = 0;
2706 if (busiest->nr_running > 1) {
2707 /* Attempt to move tasks */
2708 double_lock_balance(this_rq, busiest);
2709 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2dd73a4f 2710 minus_1_or_zero(busiest->nr_running),
d15bcfdb 2711 imbalance, sd, CPU_NEWLY_IDLE, NULL);
d6d5cfaf 2712 spin_unlock(&busiest->lock);
0a2966b4
CL
2713
2714 if (!nr_moved) {
2715 cpu_clear(cpu_of(busiest), cpus);
2716 if (!cpus_empty(cpus))
2717 goto redo;
2718 }
d6d5cfaf
NP
2719 }
2720
5969fe06 2721 if (!nr_moved) {
d15bcfdb 2722 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
89c4710e
SS
2723 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2724 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
5969fe06
NP
2725 return -1;
2726 } else
16cfb1c0 2727 sd->nr_balance_failed = 0;
1da177e4 2728
1da177e4 2729 return nr_moved;
16cfb1c0
NP
2730
2731out_balanced:
d15bcfdb 2732 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
48f24c4d 2733 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
89c4710e 2734 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
5969fe06 2735 return -1;
16cfb1c0 2736 sd->nr_balance_failed = 0;
48f24c4d 2737
16cfb1c0 2738 return 0;
1da177e4
LT
2739}
2740
2741/*
2742 * idle_balance is called by schedule() if this_cpu is about to become
2743 * idle. Attempts to pull tasks from other CPUs.
2744 */
70b97a7f 2745static void idle_balance(int this_cpu, struct rq *this_rq)
1da177e4
LT
2746{
2747 struct sched_domain *sd;
dd41f596
IM
2748 int pulled_task = -1;
2749 unsigned long next_balance = jiffies + HZ;
1da177e4
LT
2750
2751 for_each_domain(this_cpu, sd) {
92c4ca5c
CL
2752 unsigned long interval;
2753
2754 if (!(sd->flags & SD_LOAD_BALANCE))
2755 continue;
2756
2757 if (sd->flags & SD_BALANCE_NEWIDLE)
48f24c4d 2758 /* If we've pulled tasks over stop searching: */
1bd77f2d 2759 pulled_task = load_balance_newidle(this_cpu,
92c4ca5c
CL
2760 this_rq, sd);
2761
2762 interval = msecs_to_jiffies(sd->balance_interval);
2763 if (time_after(next_balance, sd->last_balance + interval))
2764 next_balance = sd->last_balance + interval;
2765 if (pulled_task)
2766 break;
1da177e4 2767 }
dd41f596 2768 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
1bd77f2d
CL
2769 /*
2770 * We are going idle. next_balance may be set based on
2771 * a busy processor. So reset next_balance.
2772 */
2773 this_rq->next_balance = next_balance;
dd41f596 2774 }
1da177e4
LT
2775}
2776
2777/*
2778 * active_load_balance is run by migration threads. It pushes running tasks
2779 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2780 * running on each physical CPU where possible, and avoids physical /
2781 * logical imbalances.
2782 *
2783 * Called with busiest_rq locked.
2784 */
70b97a7f 2785static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
1da177e4 2786{
39507451 2787 int target_cpu = busiest_rq->push_cpu;
70b97a7f
IM
2788 struct sched_domain *sd;
2789 struct rq *target_rq;
39507451 2790
48f24c4d 2791 /* Is there any task to move? */
39507451 2792 if (busiest_rq->nr_running <= 1)
39507451
NP
2793 return;
2794
2795 target_rq = cpu_rq(target_cpu);
1da177e4
LT
2796
2797 /*
39507451
NP
2798 * This condition is "impossible"; if it occurs,
2799 * we need to fix it. Originally reported by
2800 * Bjorn Helgaas on a 128-cpu setup.
1da177e4 2801 */
39507451 2802 BUG_ON(busiest_rq == target_rq);
1da177e4 2803
39507451
NP
2804 /* move a task from busiest_rq to target_rq */
2805 double_lock_balance(busiest_rq, target_rq);
2806
2807 /* Search for an sd spanning us and the target CPU. */
c96d145e 2808 for_each_domain(target_cpu, sd) {
39507451 2809 if ((sd->flags & SD_LOAD_BALANCE) &&
48f24c4d 2810 cpu_isset(busiest_cpu, sd->span))
39507451 2811 break;
c96d145e 2812 }
39507451 2813
48f24c4d
IM
2814 if (likely(sd)) {
2815 schedstat_inc(sd, alb_cnt);
39507451 2816
48f24c4d 2817 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
d15bcfdb 2818 RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
48f24c4d
IM
2819 NULL))
2820 schedstat_inc(sd, alb_pushed);
2821 else
2822 schedstat_inc(sd, alb_failed);
2823 }
39507451 2824 spin_unlock(&target_rq->lock);
1da177e4
LT
2825}
2826
46cb4b7c
SS
2827#ifdef CONFIG_NO_HZ
2828static struct {
2829 atomic_t load_balancer;
2830 cpumask_t cpu_mask;
2831} nohz ____cacheline_aligned = {
2832 .load_balancer = ATOMIC_INIT(-1),
2833 .cpu_mask = CPU_MASK_NONE,
2834};
2835
7835b98b 2836/*
46cb4b7c
SS
2837 * This routine will try to nominate the ilb (idle load balancing)
2838 * owner among the cpus whose ticks are stopped. The ilb owner will do the idle
2839 * load balancing on behalf of all those cpus. If all the cpus in the system
2840 * go into this tickless mode, then there will be no ilb owner (as there is
2841 * no need for one) and all the cpus will sleep till the next wakeup event
2842 * arrives...
2843 *
2844 * For the ilb owner, the tick is not stopped, and it will be used
2845 * for idle load balancing. The ilb owner will still be part of
2846 * nohz.cpu_mask.
7835b98b 2847 *
46cb4b7c
SS
2848 * While stopping the tick, this cpu will become the ilb owner if there
2849 * is no other owner, and it will remain the owner until that cpu becomes
2850 * busy or until all cpus in the system stop their ticks, at which point
2851 * there is no need for an ilb owner.
2852 *
2853 * When the ilb owner becomes busy, it nominates another owner during the
2854 * next busy scheduler_tick().
2855 */
2856int select_nohz_load_balancer(int stop_tick)
2857{
2858 int cpu = smp_processor_id();
2859
2860 if (stop_tick) {
2861 cpu_set(cpu, nohz.cpu_mask);
2862 cpu_rq(cpu)->in_nohz_recently = 1;
2863
2864 /*
2865 * If we are going offline and still the leader, give up!
2866 */
2867 if (cpu_is_offline(cpu) &&
2868 atomic_read(&nohz.load_balancer) == cpu) {
2869 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2870 BUG();
2871 return 0;
2872 }
2873
2874 /* time for ilb owner also to sleep */
2875 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
2876 if (atomic_read(&nohz.load_balancer) == cpu)
2877 atomic_set(&nohz.load_balancer, -1);
2878 return 0;
2879 }
2880
2881 if (atomic_read(&nohz.load_balancer) == -1) {
2882 /* make me the ilb owner */
2883 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
2884 return 1;
2885 } else if (atomic_read(&nohz.load_balancer) == cpu)
2886 return 1;
2887 } else {
2888 if (!cpu_isset(cpu, nohz.cpu_mask))
2889 return 0;
2890
2891 cpu_clear(cpu, nohz.cpu_mask);
2892
2893 if (atomic_read(&nohz.load_balancer) == cpu)
2894 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2895 BUG();
2896 }
2897 return 0;
2898}
2899#endif
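The hand-off above relies on atomic_cmpxchg() so that exactly one cpu can hold the ilb role at a time. A rough user-space analogue of the election, using C11 atomics and plain ints in place of real CPUs:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int load_balancer = ATOMIC_VAR_INIT(-1);	/* -1 means: no ilb owner */

static int try_become_ilb_owner(int cpu)
{
	int expected = -1;

	/* claim ownership only if nobody owns it yet */
	if (atomic_compare_exchange_strong(&load_balancer, &expected, cpu))
		return 1;
	return atomic_load(&load_balancer) == cpu;
}

static void give_up_ilb_ownership(int cpu)
{
	int expected = cpu;

	/* release only if we are still the owner */
	atomic_compare_exchange_strong(&load_balancer, &expected, -1);
}

int main(void)
{
	printf("cpu1 owner? %d\n", try_become_ilb_owner(1));	/* 1: wins */
	printf("cpu2 owner? %d\n", try_become_ilb_owner(2));	/* 0: loses */
	give_up_ilb_ownership(1);
	printf("cpu2 owner? %d\n", try_become_ilb_owner(2));	/* 1: wins now */
	return 0;
}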
2900
2901static DEFINE_SPINLOCK(balancing);
2902
2903/*
7835b98b
CL
2904 * It checks each scheduling domain to see if it is due to be balanced,
2905 * and initiates a balancing operation if so.
2906 *
2907 * Balancing parameters are set up in arch_init_sched_domains.
2908 */
d15bcfdb 2909static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
7835b98b 2910{
46cb4b7c
SS
2911 int balance = 1;
2912 struct rq *rq = cpu_rq(cpu);
7835b98b
CL
2913 unsigned long interval;
2914 struct sched_domain *sd;
46cb4b7c 2915 /* Earliest time when we have to do rebalance again */
c9819f45 2916 unsigned long next_balance = jiffies + 60*HZ;
1da177e4 2917
46cb4b7c 2918 for_each_domain(cpu, sd) {
1da177e4
LT
2919 if (!(sd->flags & SD_LOAD_BALANCE))
2920 continue;
2921
2922 interval = sd->balance_interval;
d15bcfdb 2923 if (idle != CPU_IDLE)
1da177e4
LT
2924 interval *= sd->busy_factor;
2925
2926 /* scale ms to jiffies */
2927 interval = msecs_to_jiffies(interval);
2928 if (unlikely(!interval))
2929 interval = 1;
dd41f596
IM
2930 if (interval > HZ*NR_CPUS/10)
2931 interval = HZ*NR_CPUS/10;
2932
1da177e4 2933
08c183f3
CL
2934 if (sd->flags & SD_SERIALIZE) {
2935 if (!spin_trylock(&balancing))
2936 goto out;
2937 }
2938
c9819f45 2939 if (time_after_eq(jiffies, sd->last_balance + interval)) {
46cb4b7c 2940 if (load_balance(cpu, rq, sd, idle, &balance)) {
fa3b6ddc
SS
2941 /*
2942 * We've pulled tasks over so either we're no
5969fe06
NP
2943 * longer idle, or one of our SMT siblings is
2944 * not idle.
2945 */
d15bcfdb 2946 idle = CPU_NOT_IDLE;
1da177e4 2947 }
1bd77f2d 2948 sd->last_balance = jiffies;
1da177e4 2949 }
08c183f3
CL
2950 if (sd->flags & SD_SERIALIZE)
2951 spin_unlock(&balancing);
2952out:
c9819f45
CL
2953 if (time_after(next_balance, sd->last_balance + interval))
2954 next_balance = sd->last_balance + interval;
783609c6
SS
2955
2956 /*
2957 * Stop the load balance at this level. There is another
2958 * CPU in our sched group which is doing load balancing more
2959 * actively.
2960 */
2961 if (!balance)
2962 break;
1da177e4 2963 }
46cb4b7c
SS
2964 rq->next_balance = next_balance;
2965}
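The interval handling above scales a per-domain period by busy_factor, converts it to jiffies, and clamps it. A toy run of the same arithmetic in plain C; HZ, the domain parameters, and the simplified msecs_to_jiffies() below are assumptions for illustration, not the kernel's versions:

#include <stdio.h>

#define HZ 250UL

static unsigned long toy_msecs_to_jiffies(unsigned long ms)
{
	return (ms * HZ + 999) / 1000;	/* round up, roughly like the kernel */
}

int main(void)
{
	unsigned long balance_interval = 64;	/* pretend sd->balance_interval, in ms */
	unsigned long busy_factor = 32;		/* pretend sd->busy_factor */
	int cpu_is_idle = 0;			/* pretend CPU_NOT_IDLE */
	unsigned long interval = balance_interval;

	if (!cpu_is_idle)
		interval *= busy_factor;	/* balance less often when busy */

	interval = toy_msecs_to_jiffies(interval);
	if (!interval)
		interval = 1;

	printf("rebalance roughly every %lu jiffies (~%lu ms)\n",
	       interval, interval * 1000 / HZ);
	return 0;
}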
2966
2967/*
2968 * run_rebalance_domains is triggered when needed from the scheduler tick.
2969 * In CONFIG_NO_HZ case, the idle load balance owner will do the
2970 * rebalancing for all the cpus for whom scheduler ticks are stopped.
2971 */
2972static void run_rebalance_domains(struct softirq_action *h)
2973{
dd41f596
IM
2974 int this_cpu = smp_processor_id();
2975 struct rq *this_rq = cpu_rq(this_cpu);
2976 enum cpu_idle_type idle = this_rq->idle_at_tick ?
2977 CPU_IDLE : CPU_NOT_IDLE;
46cb4b7c 2978
dd41f596 2979 rebalance_domains(this_cpu, idle);
46cb4b7c
SS
2980
2981#ifdef CONFIG_NO_HZ
2982 /*
2983 * If this cpu is the owner for idle load balancing, then do the
2984 * balancing on behalf of the other idle cpus whose ticks are
2985 * stopped.
2986 */
dd41f596
IM
2987 if (this_rq->idle_at_tick &&
2988 atomic_read(&nohz.load_balancer) == this_cpu) {
46cb4b7c
SS
2989 cpumask_t cpus = nohz.cpu_mask;
2990 struct rq *rq;
2991 int balance_cpu;
2992
dd41f596 2993 cpu_clear(this_cpu, cpus);
46cb4b7c
SS
2994 for_each_cpu_mask(balance_cpu, cpus) {
2995 /*
2996 * If this cpu gets work to do, stop the load balancing
2997 * work being done for other cpus. Next load
2998 * balancing owner will pick it up.
2999 */
3000 if (need_resched())
3001 break;
3002
dd41f596 3003 rebalance_domains(balance_cpu, CPU_IDLE);
46cb4b7c
SS
3004
3005 rq = cpu_rq(balance_cpu);
dd41f596
IM
3006 if (time_after(this_rq->next_balance, rq->next_balance))
3007 this_rq->next_balance = rq->next_balance;
46cb4b7c
SS
3008 }
3009 }
3010#endif
3011}
3012
3013/*
3014 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3015 *
3016 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3017 * idle load balancing owner or decide to stop the periodic load balancing,
3018 * if the whole system is idle.
3019 */
dd41f596 3020static inline void trigger_load_balance(struct rq *rq, int cpu)
46cb4b7c 3021{
46cb4b7c
SS
3022#ifdef CONFIG_NO_HZ
3023 /*
3024 * If we were in the nohz mode recently and busy at the current
3025 * scheduler tick, then check if we need to nominate a new idle
3026 * load balancer.
3027 */
3028 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3029 rq->in_nohz_recently = 0;
3030
3031 if (atomic_read(&nohz.load_balancer) == cpu) {
3032 cpu_clear(cpu, nohz.cpu_mask);
3033 atomic_set(&nohz.load_balancer, -1);
3034 }
3035
3036 if (atomic_read(&nohz.load_balancer) == -1) {
3037 /*
3038 * simple selection for now: Nominate the
3039 * first cpu in the nohz list to be the next
3040 * ilb owner.
3041 *
3042 * TBD: Traverse the sched domains and nominate
3043 * the nearest cpu in the nohz.cpu_mask.
3044 */
3045 int ilb = first_cpu(nohz.cpu_mask);
3046
3047 if (ilb != NR_CPUS)
3048 resched_cpu(ilb);
3049 }
3050 }
3051
3052 /*
3053 * If this cpu is idle and doing idle load balancing for all the
3054 * cpus with ticks stopped, is it time for that to stop?
3055 */
3056 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3057 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3058 resched_cpu(cpu);
3059 return;
3060 }
3061
3062 /*
3063 * If this cpu is idle and the idle load balancing is done by
3064 * someone else, then there is no need to raise SCHED_SOFTIRQ.
3065 */
3066 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3067 cpu_isset(cpu, nohz.cpu_mask))
3068 return;
3069#endif
3070 if (time_after_eq(jiffies, rq->next_balance))
3071 raise_softirq(SCHED_SOFTIRQ);
1da177e4 3072}
dd41f596
IM
3073
3074#else /* CONFIG_SMP */
3075
1da177e4
LT
3076/*
3077 * on UP we do not need to balance between CPUs:
3078 */
70b97a7f 3079static inline void idle_balance(int cpu, struct rq *rq)
1da177e4
LT
3080{
3081}
dd41f596
IM
3082
3083/* Avoid "used but not defined" warning on UP */
3084static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3085 unsigned long max_nr_move, unsigned long max_load_move,
3086 struct sched_domain *sd, enum cpu_idle_type idle,
3087 int *all_pinned, unsigned long *load_moved,
3088 int this_best_prio, int best_prio, int best_prio_seen,
3089 struct rq_iterator *iterator)
3090{
3091 *load_moved = 0;
3092
3093 return 0;
3094}
3095
1da177e4
LT
3096#endif
3097
1da177e4
LT
3098DEFINE_PER_CPU(struct kernel_stat, kstat);
3099
3100EXPORT_PER_CPU_SYMBOL(kstat);
3101
3102/*
41b86e9c
IM
3103 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3104 * that have not yet been banked in case the task is currently running.
1da177e4 3105 */
41b86e9c 3106unsigned long long task_sched_runtime(struct task_struct *p)
1da177e4 3107{
1da177e4 3108 unsigned long flags;
41b86e9c
IM
3109 u64 ns, delta_exec;
3110 struct rq *rq;
48f24c4d 3111
41b86e9c
IM
3112 rq = task_rq_lock(p, &flags);
3113 ns = p->se.sum_exec_runtime;
3114 if (rq->curr == p) {
3115 delta_exec = rq_clock(rq) - p->se.exec_start;
3116 if ((s64)delta_exec > 0)
3117 ns += delta_exec;
3118 }
3119 task_rq_unlock(rq, &flags);
48f24c4d 3120
1da177e4
LT
3121 return ns;
3122}
3123
1da177e4
LT
3124/*
3125 * Account user cpu time to a process.
3126 * @p: the process that the cpu time gets accounted to
3128 * @cputime: the cpu time spent in user space since the last update
3129 */
3130void account_user_time(struct task_struct *p, cputime_t cputime)
3131{
3132 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3133 cputime64_t tmp;
3134
3135 p->utime = cputime_add(p->utime, cputime);
3136
3137 /* Add user time to cpustat. */
3138 tmp = cputime_to_cputime64(cputime);
3139 if (TASK_NICE(p) > 0)
3140 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3141 else
3142 cpustat->user = cputime64_add(cpustat->user, tmp);
3143}
3144
3145/*
3146 * Account system cpu time to a process.
3147 * @p: the process that the cpu time gets accounted to
3148 * @hardirq_offset: the offset to subtract from hardirq_count()
3149 * @cputime: the cpu time spent in kernel space since the last update
3150 */
3151void account_system_time(struct task_struct *p, int hardirq_offset,
3152 cputime_t cputime)
3153{
3154 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
70b97a7f 3155 struct rq *rq = this_rq();
1da177e4
LT
3156 cputime64_t tmp;
3157
3158 p->stime = cputime_add(p->stime, cputime);
3159
3160 /* Add system time to cpustat. */
3161 tmp = cputime_to_cputime64(cputime);
3162 if (hardirq_count() - hardirq_offset)
3163 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3164 else if (softirq_count())
3165 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3166 else if (p != rq->idle)
3167 cpustat->system = cputime64_add(cpustat->system, tmp);
3168 else if (atomic_read(&rq->nr_iowait) > 0)
3169 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3170 else
3171 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3172 /* Account for system time used */
3173 acct_update_integrals(p);
1da177e4
LT
3174}
3175
3176/*
3177 * Account for involuntary wait time.
3178 * @p: the process from which the cpu time has been stolen
3179 * @steal: the cpu time spent in involuntary wait
3180 */
3181void account_steal_time(struct task_struct *p, cputime_t steal)
3182{
3183 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3184 cputime64_t tmp = cputime_to_cputime64(steal);
70b97a7f 3185 struct rq *rq = this_rq();
1da177e4
LT
3186
3187 if (p == rq->idle) {
3188 p->stime = cputime_add(p->stime, steal);
3189 if (atomic_read(&rq->nr_iowait) > 0)
3190 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3191 else
3192 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3193 } else
3194 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3195}
3196
7835b98b
CL
3197/*
3198 * This function gets called by the timer code, with HZ frequency.
3199 * We call it with interrupts disabled.
3200 *
3201 * It also gets called by the fork code, when changing the parent's
3202 * timeslices.
3203 */
3204void scheduler_tick(void)
3205{
7835b98b
CL
3206 int cpu = smp_processor_id();
3207 struct rq *rq = cpu_rq(cpu);
dd41f596
IM
3208 struct task_struct *curr = rq->curr;
3209
3210 spin_lock(&rq->lock);
3211 if (curr != rq->idle) /* FIXME: needed? */
3212 curr->sched_class->task_tick(rq, curr);
3213 update_cpu_load(rq);
3214 spin_unlock(&rq->lock);
7835b98b 3215
e418e1c2 3216#ifdef CONFIG_SMP
dd41f596
IM
3217 rq->idle_at_tick = idle_cpu(cpu);
3218 trigger_load_balance(rq, cpu);
e418e1c2 3219#endif
1da177e4
LT
3220}
3221
1da177e4
LT
3222#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3223
3224void fastcall add_preempt_count(int val)
3225{
3226 /*
3227 * Underflow?
3228 */
9a11b49a
IM
3229 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3230 return;
1da177e4
LT
3231 preempt_count() += val;
3232 /*
3233 * Spinlock count overflowing soon?
3234 */
33859f7f
MOS
3235 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3236 PREEMPT_MASK - 10);
1da177e4
LT
3237}
3238EXPORT_SYMBOL(add_preempt_count);
3239
3240void fastcall sub_preempt_count(int val)
3241{
3242 /*
3243 * Underflow?
3244 */
9a11b49a
IM
3245 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3246 return;
1da177e4
LT
3247 /*
3248 * Is the spinlock portion underflowing?
3249 */
9a11b49a
IM
3250 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3251 !(preempt_count() & PREEMPT_MASK)))
3252 return;
3253
1da177e4
LT
3254 preempt_count() -= val;
3255}
3256EXPORT_SYMBOL(sub_preempt_count);
3257
3258#endif
3259
3260/*
dd41f596 3261 * Print scheduling while atomic bug:
1da177e4 3262 */
dd41f596 3263static noinline void __schedule_bug(struct task_struct *prev)
1da177e4 3264{
dd41f596
IM
3265 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3266 prev->comm, preempt_count(), prev->pid);
3267 debug_show_held_locks(prev);
3268 if (irqs_disabled())
3269 print_irqtrace_events(prev);
3270 dump_stack();
3271}
1da177e4 3272
dd41f596
IM
3273/*
3274 * Various schedule()-time debugging checks and statistics:
3275 */
3276static inline void schedule_debug(struct task_struct *prev)
3277{
1da177e4
LT
3278 /*
3279 * Test if we are atomic. Since do_exit() needs to call into
3280 * schedule() atomically, we ignore that path for now.
3281 * Otherwise, whine if we are scheduling when we should not be.
3282 */
dd41f596
IM
3283 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3284 __schedule_bug(prev);
3285
1da177e4
LT
3286 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3287
dd41f596
IM
3288 schedstat_inc(this_rq(), sched_cnt);
3289}
3290
3291/*
3292 * Pick up the highest-prio task:
3293 */
3294static inline struct task_struct *
3295pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3296{
3297 struct sched_class *class;
3298 struct task_struct *p;
1da177e4
LT
3299
3300 /*
dd41f596
IM
3301 * Optimization: we know that if all tasks are in
3302 * the fair class we can call that function directly:
1da177e4 3303 */
dd41f596
IM
3304 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3305 p = fair_sched_class.pick_next_task(rq, now);
3306 if (likely(p))
3307 return p;
1da177e4
LT
3308 }
3309
dd41f596
IM
3310 class = sched_class_highest;
3311 for ( ; ; ) {
3312 p = class->pick_next_task(rq, now);
3313 if (p)
3314 return p;
3315 /*
3316 * Will never be NULL as the idle class always
3317 * returns a non-NULL p:
3318 */
3319 class = class->next;
3320 }
3321}
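The fallback loop above walks a singly linked list of scheduler classes, highest priority first, and returns the first task offered; the idle class at the tail never returns NULL. A compact user-space sketch of that pattern, with two invented classes standing in for the kernel's sched_class instances:

#include <stdio.h>
#include <stddef.h>

struct task {
	const char *name;
};

struct toy_sched_class {
	const struct toy_sched_class *next;
	struct task *(*pick_next_task)(void);
};

static struct task idle_task = { "idle" };

static struct task *fair_pick(void)
{
	return NULL;			/* pretend no fair task is runnable */
}

static struct task *idle_pick(void)
{
	return &idle_task;		/* the idle class never returns NULL */
}

static const struct toy_sched_class idle_class = { NULL, idle_pick };
static const struct toy_sched_class fair_class = { &idle_class, fair_pick };
static const struct toy_sched_class *toy_class_highest = &fair_class;

static struct task *toy_pick_next_task(void)
{
	const struct toy_sched_class *class = toy_class_highest;
	struct task *p;

	for ( ; ; ) {
		p = class->pick_next_task();
		if (p)
			return p;
		class = class->next;
	}
}

int main(void)
{
	printf("next: %s\n", toy_pick_next_task()->name);
	return 0;
}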
1da177e4 3322
dd41f596
IM
3323/*
3324 * schedule() is the main scheduler function.
3325 */
3326asmlinkage void __sched schedule(void)
3327{
3328 struct task_struct *prev, *next;
3329 long *switch_count;
3330 struct rq *rq;
3331 u64 now;
3332 int cpu;
3333
3334need_resched:
3335 preempt_disable();
3336 cpu = smp_processor_id();
3337 rq = cpu_rq(cpu);
3338 rcu_qsctr_inc(cpu);
3339 prev = rq->curr;
3340 switch_count = &prev->nivcsw;
3341
3342 release_kernel_lock(prev);
3343need_resched_nonpreemptible:
3344
3345 schedule_debug(prev);
1da177e4
LT
3346
3347 spin_lock_irq(&rq->lock);
dd41f596 3348 clear_tsk_need_resched(prev);
1da177e4 3349
1da177e4 3350 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
1da177e4 3351 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
dd41f596 3352 unlikely(signal_pending(prev)))) {
1da177e4 3353 prev->state = TASK_RUNNING;
dd41f596
IM
3354 } else {
3355 deactivate_task(rq, prev, 1);
1da177e4 3356 }
dd41f596 3357 switch_count = &prev->nvcsw;
1da177e4
LT
3358 }
3359
dd41f596 3360 if (unlikely(!rq->nr_running))
1da177e4 3361 idle_balance(cpu, rq);
1da177e4 3362
dd41f596
IM
3363 now = __rq_clock(rq);
3364 prev->sched_class->put_prev_task(rq, prev, now);
3365 next = pick_next_task(rq, prev, now);
1da177e4
LT
3366
3367 sched_info_switch(prev, next);
dd41f596 3368
1da177e4 3369 if (likely(prev != next)) {
1da177e4
LT
3370 rq->nr_switches++;
3371 rq->curr = next;
3372 ++*switch_count;
3373
dd41f596 3374 context_switch(rq, prev, next); /* unlocks the rq */
1da177e4
LT
3375 } else
3376 spin_unlock_irq(&rq->lock);
3377
dd41f596
IM
3378 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3379 cpu = smp_processor_id();
3380 rq = cpu_rq(cpu);
1da177e4 3381 goto need_resched_nonpreemptible;
dd41f596 3382 }
1da177e4
LT
3383 preempt_enable_no_resched();
3384 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3385 goto need_resched;
3386}
1da177e4
LT
3387EXPORT_SYMBOL(schedule);
3388
3389#ifdef CONFIG_PREEMPT
3390/*
2ed6e34f 3391 * this is the entry point to schedule() from in-kernel preemption
1da177e4
LT
3392 * off of preempt_enable. Kernel preemptions off return from interrupt
3393 * occur there and call schedule directly.
3394 */
3395asmlinkage void __sched preempt_schedule(void)
3396{
3397 struct thread_info *ti = current_thread_info();
3398#ifdef CONFIG_PREEMPT_BKL
3399 struct task_struct *task = current;
3400 int saved_lock_depth;
3401#endif
3402 /*
3403 * If there is a non-zero preempt_count or interrupts are disabled,
3404 * we do not want to preempt the current task. Just return.
3405 */
beed33a8 3406 if (likely(ti->preempt_count || irqs_disabled()))
1da177e4
LT
3407 return;
3408
3409need_resched:
3410 add_preempt_count(PREEMPT_ACTIVE);
3411 /*
3412 * We keep the big kernel semaphore locked, but we
3413 * clear ->lock_depth so that schedule() doesn't
3414 * auto-release the semaphore:
3415 */
3416#ifdef CONFIG_PREEMPT_BKL
3417 saved_lock_depth = task->lock_depth;
3418 task->lock_depth = -1;
3419#endif
3420 schedule();
3421#ifdef CONFIG_PREEMPT_BKL
3422 task->lock_depth = saved_lock_depth;
3423#endif
3424 sub_preempt_count(PREEMPT_ACTIVE);
3425
3426 /* we could miss a preemption opportunity between schedule and now */
3427 barrier();
3428 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3429 goto need_resched;
3430}
1da177e4
LT
3431EXPORT_SYMBOL(preempt_schedule);
3432
3433/*
2ed6e34f 3434 * this is the entry point to schedule() from kernel preemption
1da177e4
LT
3435 * off of irq context.
3436 * Note that this is called and returns with irqs disabled. This will
3437 * protect us against recursive calling from irq.
3438 */
3439asmlinkage void __sched preempt_schedule_irq(void)
3440{
3441 struct thread_info *ti = current_thread_info();
3442#ifdef CONFIG_PREEMPT_BKL
3443 struct task_struct *task = current;
3444 int saved_lock_depth;
3445#endif
2ed6e34f 3446 /* Catch callers which need to be fixed */
1da177e4
LT
3447 BUG_ON(ti->preempt_count || !irqs_disabled());
3448
3449need_resched:
3450 add_preempt_count(PREEMPT_ACTIVE);
3451 /*
3452 * We keep the big kernel semaphore locked, but we
3453 * clear ->lock_depth so that schedule() doesn't
3454 * auto-release the semaphore:
3455 */
3456#ifdef CONFIG_PREEMPT_BKL
3457 saved_lock_depth = task->lock_depth;
3458 task->lock_depth = -1;
3459#endif
3460 local_irq_enable();
3461 schedule();
3462 local_irq_disable();
3463#ifdef CONFIG_PREEMPT_BKL
3464 task->lock_depth = saved_lock_depth;
3465#endif
3466 sub_preempt_count(PREEMPT_ACTIVE);
3467
3468 /* we could miss a preemption opportunity between schedule and now */
3469 barrier();
3470 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3471 goto need_resched;
3472}
3473
3474#endif /* CONFIG_PREEMPT */
3475
95cdf3b7
IM
3476int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3477 void *key)
1da177e4 3478{
48f24c4d 3479 return try_to_wake_up(curr->private, mode, sync);
1da177e4 3480}
1da177e4
LT
3481EXPORT_SYMBOL(default_wake_function);
3482
3483/*
3484 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3485 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3486 * number) then we wake all the non-exclusive tasks and one exclusive task.
3487 *
3488 * There are circumstances in which we can try to wake a task which has already
3489 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3490 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3491 */
3492static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3493 int nr_exclusive, int sync, void *key)
3494{
3495 struct list_head *tmp, *next;
3496
3497 list_for_each_safe(tmp, next, &q->task_list) {
48f24c4d
IM
3498 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3499 unsigned flags = curr->flags;
3500
1da177e4 3501 if (curr->func(curr, mode, sync, key) &&
48f24c4d 3502 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
1da177e4
LT
3503 break;
3504 }
3505}
3506
3507/**
3508 * __wake_up - wake up threads blocked on a waitqueue.
3509 * @q: the waitqueue
3510 * @mode: which threads
3511 * @nr_exclusive: how many wake-one or wake-many threads to wake up
67be2dd1 3512 * @key: is directly passed to the wakeup function
1da177e4
LT
3513 */
3514void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
95cdf3b7 3515 int nr_exclusive, void *key)
1da177e4
LT
3516{
3517 unsigned long flags;
3518
3519 spin_lock_irqsave(&q->lock, flags);
3520 __wake_up_common(q, mode, nr_exclusive, 0, key);
3521 spin_unlock_irqrestore(&q->lock, flags);
3522}
1da177e4
LT
3523EXPORT_SYMBOL(__wake_up);
3524
3525/*
3526 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3527 */
3528void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3529{
3530 __wake_up_common(q, mode, 1, 0, NULL);
3531}
3532
3533/**
67be2dd1 3534 * __wake_up_sync - wake up threads blocked on a waitqueue.
1da177e4
LT
3535 * @q: the waitqueue
3536 * @mode: which threads
3537 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3538 *
3539 * The sync wakeup differs in that the waker knows that it will schedule
3540 * away soon, so while the target thread will be woken up, it will not
3541 * be migrated to another CPU - ie. the two threads are 'synchronized'
3542 * with each other. This can prevent needless bouncing between CPUs.
3543 *
3544 * On UP it can prevent extra preemption.
3545 */
95cdf3b7
IM
3546void fastcall
3547__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
1da177e4
LT
3548{
3549 unsigned long flags;
3550 int sync = 1;
3551
3552 if (unlikely(!q))
3553 return;
3554
3555 if (unlikely(!nr_exclusive))
3556 sync = 0;
3557
3558 spin_lock_irqsave(&q->lock, flags);
3559 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3560 spin_unlock_irqrestore(&q->lock, flags);
3561}
3562EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3563
3564void fastcall complete(struct completion *x)
3565{
3566 unsigned long flags;
3567
3568 spin_lock_irqsave(&x->wait.lock, flags);
3569 x->done++;
3570 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3571 1, 0, NULL);
3572 spin_unlock_irqrestore(&x->wait.lock, flags);
3573}
3574EXPORT_SYMBOL(complete);
3575
3576void fastcall complete_all(struct completion *x)
3577{
3578 unsigned long flags;
3579
3580 spin_lock_irqsave(&x->wait.lock, flags);
3581 x->done += UINT_MAX/2;
3582 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3583 0, 0, NULL);
3584 spin_unlock_irqrestore(&x->wait.lock, flags);
3585}
3586EXPORT_SYMBOL(complete_all);
3587
3588void fastcall __sched wait_for_completion(struct completion *x)
3589{
3590 might_sleep();
48f24c4d 3591
1da177e4
LT
3592 spin_lock_irq(&x->wait.lock);
3593 if (!x->done) {
3594 DECLARE_WAITQUEUE(wait, current);
3595
3596 wait.flags |= WQ_FLAG_EXCLUSIVE;
3597 __add_wait_queue_tail(&x->wait, &wait);
3598 do {
3599 __set_current_state(TASK_UNINTERRUPTIBLE);
3600 spin_unlock_irq(&x->wait.lock);
3601 schedule();
3602 spin_lock_irq(&x->wait.lock);
3603 } while (!x->done);
3604 __remove_wait_queue(&x->wait, &wait);
3605 }
3606 x->done--;
3607 spin_unlock_irq(&x->wait.lock);
3608}
3609EXPORT_SYMBOL(wait_for_completion);
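/*
 * Illustrative sketch, not part of this file: the typical completion
 * handshake built on complete()/wait_for_completion() above. The names
 * my_done, my_irq_handler and my_wait_for_hw are hypothetical.
 */
#if 0
#include <linux/completion.h>
#include <linux/interrupt.h>

static DECLARE_COMPLETION(my_done);

static irqreturn_t my_irq_handler(int irq, void *dev_id)
{
	/* Waker side: bumps ->done and wakes one exclusive waiter. */
	complete(&my_done);
	return IRQ_HANDLED;
}

static void my_wait_for_hw(void)
{
	/* Sleeps in TASK_UNINTERRUPTIBLE until complete() is called. */
	wait_for_completion(&my_done);
}
#endif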
3610
3611unsigned long fastcall __sched
3612wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3613{
3614 might_sleep();
3615
3616 spin_lock_irq(&x->wait.lock);
3617 if (!x->done) {
3618 DECLARE_WAITQUEUE(wait, current);
3619
3620 wait.flags |= WQ_FLAG_EXCLUSIVE;
3621 __add_wait_queue_tail(&x->wait, &wait);
3622 do {
3623 __set_current_state(TASK_UNINTERRUPTIBLE);
3624 spin_unlock_irq(&x->wait.lock);
3625 timeout = schedule_timeout(timeout);
3626 spin_lock_irq(&x->wait.lock);
3627 if (!timeout) {
3628 __remove_wait_queue(&x->wait, &wait);
3629 goto out;
3630 }
3631 } while (!x->done);
3632 __remove_wait_queue(&x->wait, &wait);
3633 }
3634 x->done--;
3635out:
3636 spin_unlock_irq(&x->wait.lock);
3637 return timeout;
3638}
3639EXPORT_SYMBOL(wait_for_completion_timeout);
3640
3641int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3642{
3643 int ret = 0;
3644
3645 might_sleep();
3646
3647 spin_lock_irq(&x->wait.lock);
3648 if (!x->done) {
3649 DECLARE_WAITQUEUE(wait, current);
3650
3651 wait.flags |= WQ_FLAG_EXCLUSIVE;
3652 __add_wait_queue_tail(&x->wait, &wait);
3653 do {
3654 if (signal_pending(current)) {
3655 ret = -ERESTARTSYS;
3656 __remove_wait_queue(&x->wait, &wait);
3657 goto out;
3658 }
3659 __set_current_state(TASK_INTERRUPTIBLE);
3660 spin_unlock_irq(&x->wait.lock);
3661 schedule();
3662 spin_lock_irq(&x->wait.lock);
3663 } while (!x->done);
3664 __remove_wait_queue(&x->wait, &wait);
3665 }
3666 x->done--;
3667out:
3668 spin_unlock_irq(&x->wait.lock);
3669
3670 return ret;
3671}
3672EXPORT_SYMBOL(wait_for_completion_interruptible);
3673
3674unsigned long fastcall __sched
3675wait_for_completion_interruptible_timeout(struct completion *x,
3676 unsigned long timeout)
3677{
3678 might_sleep();
3679
3680 spin_lock_irq(&x->wait.lock);
3681 if (!x->done) {
3682 DECLARE_WAITQUEUE(wait, current);
3683
3684 wait.flags |= WQ_FLAG_EXCLUSIVE;
3685 __add_wait_queue_tail(&x->wait, &wait);
3686 do {
3687 if (signal_pending(current)) {
3688 timeout = -ERESTARTSYS;
3689 __remove_wait_queue(&x->wait, &wait);
3690 goto out;
3691 }
3692 __set_current_state(TASK_INTERRUPTIBLE);
3693 spin_unlock_irq(&x->wait.lock);
3694 timeout = schedule_timeout(timeout);
3695 spin_lock_irq(&x->wait.lock);
3696 if (!timeout) {
3697 __remove_wait_queue(&x->wait, &wait);
3698 goto out;
3699 }
3700 } while (!x->done);
3701 __remove_wait_queue(&x->wait, &wait);
3702 }
3703 x->done--;
3704out:
3705 spin_unlock_irq(&x->wait.lock);
3706 return timeout;
3707}
3708EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3709
0fec171c
IM
3710static inline void
3711sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
3712{
3713 spin_lock_irqsave(&q->lock, *flags);
3714 __add_wait_queue(q, wait);
1da177e4 3715 spin_unlock(&q->lock);
0fec171c 3716}
1da177e4 3717
0fec171c
IM
3718static inline void
3719sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
3720{
3721 spin_lock_irq(&q->lock);
3722 __remove_wait_queue(q, wait);
3723 spin_unlock_irqrestore(&q->lock, *flags);
3724}
1da177e4 3725
0fec171c 3726void __sched interruptible_sleep_on(wait_queue_head_t *q)
1da177e4 3727{
0fec171c
IM
3728 unsigned long flags;
3729 wait_queue_t wait;
3730
3731 init_waitqueue_entry(&wait, current);
1da177e4
LT
3732
3733 current->state = TASK_INTERRUPTIBLE;
3734
0fec171c 3735 sleep_on_head(q, &wait, &flags);
1da177e4 3736 schedule();
0fec171c 3737 sleep_on_tail(q, &wait, &flags);
1da177e4 3738}
1da177e4
LT
3739EXPORT_SYMBOL(interruptible_sleep_on);
3740
0fec171c 3741long __sched
95cdf3b7 3742interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
1da177e4 3743{
0fec171c
IM
3744 unsigned long flags;
3745 wait_queue_t wait;
3746
3747 init_waitqueue_entry(&wait, current);
1da177e4
LT
3748
3749 current->state = TASK_INTERRUPTIBLE;
3750
0fec171c 3751 sleep_on_head(q, &wait, &flags);
1da177e4 3752 timeout = schedule_timeout(timeout);
0fec171c 3753 sleep_on_tail(q, &wait, &flags);
1da177e4
LT
3754
3755 return timeout;
3756}
1da177e4
LT
3757EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3758
0fec171c 3759void __sched sleep_on(wait_queue_head_t *q)
1da177e4 3760{
0fec171c
IM
3761 unsigned long flags;
3762 wait_queue_t wait;
3763
3764 init_waitqueue_entry(&wait, current);
1da177e4
LT
3765
3766 current->state = TASK_UNINTERRUPTIBLE;
3767
0fec171c 3768 sleep_on_head(q, &wait, &flags);
1da177e4 3769 schedule();
0fec171c 3770 sleep_on_tail(q, &wait, &flags);
1da177e4 3771}
1da177e4
LT
3772EXPORT_SYMBOL(sleep_on);
3773
0fec171c 3774long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
1da177e4 3775{
0fec171c
IM
3776 unsigned long flags;
3777 wait_queue_t wait;
3778
3779 init_waitqueue_entry(&wait, current);
1da177e4
LT
3780
3781 current->state = TASK_UNINTERRUPTIBLE;
3782
0fec171c 3783 sleep_on_head(q, &wait, &flags);
1da177e4 3784 timeout = schedule_timeout(timeout);
0fec171c 3785 sleep_on_tail(q, &wait, &flags);
1da177e4
LT
3786
3787 return timeout;
3788}
1da177e4
LT
3789EXPORT_SYMBOL(sleep_on_timeout);
3790
b29739f9
IM
3791#ifdef CONFIG_RT_MUTEXES
3792
3793/*
3794 * rt_mutex_setprio - set the current priority of a task
3795 * @p: task
3796 * @prio: prio value (kernel-internal form)
3797 *
3798 * This function changes the 'effective' priority of a task. It does
3799 * not touch ->normal_prio like __setscheduler().
3800 *
3801 * Used by the rt_mutex code to implement priority inheritance logic.
3802 */
36c8b586 3803void rt_mutex_setprio(struct task_struct *p, int prio)
b29739f9
IM
3804{
3805 unsigned long flags;
dd41f596 3806 int oldprio, on_rq;
70b97a7f 3807 struct rq *rq;
dd41f596 3808 u64 now;
b29739f9
IM
3809
3810 BUG_ON(prio < 0 || prio > MAX_PRIO);
3811
3812 rq = task_rq_lock(p, &flags);
dd41f596 3813 now = rq_clock(rq);
b29739f9 3814
d5f9f942 3815 oldprio = p->prio;
dd41f596
IM
3816 on_rq = p->se.on_rq;
3817 if (on_rq)
3818 dequeue_task(rq, p, 0, now);
3819
3820 if (rt_prio(prio))
3821 p->sched_class = &rt_sched_class;
3822 else
3823 p->sched_class = &fair_sched_class;
3824
b29739f9
IM
3825 p->prio = prio;
3826
dd41f596
IM
3827 if (on_rq) {
3828 enqueue_task(rq, p, 0, now);
b29739f9
IM
3829 /*
3830 * Reschedule if we are currently running on this runqueue and
d5f9f942
AM
3831 * our priority decreased, or if we are not currently running on
3832 * this runqueue and our priority is higher than the current's
b29739f9 3833 */
d5f9f942
AM
3834 if (task_running(rq, p)) {
3835 if (p->prio > oldprio)
3836 resched_task(rq->curr);
dd41f596
IM
3837 } else {
3838 check_preempt_curr(rq, p);
3839 }
b29739f9
IM
3840 }
3841 task_rq_unlock(rq, &flags);
3842}
3843
3844#endif
3845
36c8b586 3846void set_user_nice(struct task_struct *p, long nice)
1da177e4 3847{
dd41f596 3848 int old_prio, delta, on_rq;
1da177e4 3849 unsigned long flags;
70b97a7f 3850 struct rq *rq;
dd41f596 3851 u64 now;
1da177e4
LT
3852
3853 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3854 return;
3855 /*
3856 * We have to be careful: if called from sys_setpriority(),
3857 * the task might be in the middle of scheduling on another CPU.
3858 */
3859 rq = task_rq_lock(p, &flags);
dd41f596 3860 now = rq_clock(rq);
1da177e4
LT
3861 /*
3862 * The RT priorities are set via sched_setscheduler(), but we still
3863 * allow the 'normal' nice value to be set - but as expected
3864 * it won't have any effect on scheduling until the task is
dd41f596 3865 * SCHED_FIFO/SCHED_RR:
1da177e4 3866 */
e05606d3 3867 if (task_has_rt_policy(p)) {
1da177e4
LT
3868 p->static_prio = NICE_TO_PRIO(nice);
3869 goto out_unlock;
3870 }
dd41f596
IM
3871 on_rq = p->se.on_rq;
3872 if (on_rq) {
3873 dequeue_task(rq, p, 0, now);
3874 dec_load(rq, p, now);
2dd73a4f 3875 }
1da177e4 3876
1da177e4 3877 p->static_prio = NICE_TO_PRIO(nice);
2dd73a4f 3878 set_load_weight(p);
b29739f9
IM
3879 old_prio = p->prio;
3880 p->prio = effective_prio(p);
3881 delta = p->prio - old_prio;
1da177e4 3882
dd41f596
IM
3883 if (on_rq) {
3884 enqueue_task(rq, p, 0, now);
3885 inc_load(rq, p, now);
1da177e4 3886 /*
d5f9f942
AM
3887 * If the task increased its priority or is running and
3888 * lowered its priority, then reschedule its CPU:
1da177e4 3889 */
d5f9f942 3890 if (delta < 0 || (delta > 0 && task_running(rq, p)))
1da177e4
LT
3891 resched_task(rq->curr);
3892 }
3893out_unlock:
3894 task_rq_unlock(rq, &flags);
3895}
1da177e4
LT
3896EXPORT_SYMBOL(set_user_nice);
3897
e43379f1
MM
3898/*
3899 * can_nice - check if a task can reduce its nice value
3900 * @p: task
3901 * @nice: nice value
3902 */
36c8b586 3903int can_nice(const struct task_struct *p, const int nice)
e43379f1 3904{
024f4747
MM
3905 /* convert nice value [19,-20] to rlimit style value [1,40] */
3906 int nice_rlim = 20 - nice;
48f24c4d 3907
e43379f1
MM
3908 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3909 capable(CAP_SYS_NICE));
3910}
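/*
 * Worked example of the mapping above (illustrative): nice 19 maps to
 * rlimit value 1, nice 0 to 20, nice -20 to 40. So a task whose
 * RLIMIT_NICE soft limit is 25 may lower its nice value down to
 * 20 - 25 = -5 without CAP_SYS_NICE.
 */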
3911
1da177e4
LT
3912#ifdef __ARCH_WANT_SYS_NICE
3913
3914/*
3915 * sys_nice - change the priority of the current process.
3916 * @increment: priority increment
3917 *
3918 * sys_setpriority is a more generic, but much slower function that
3919 * does similar things.
3920 */
3921asmlinkage long sys_nice(int increment)
3922{
48f24c4d 3923 long nice, retval;
1da177e4
LT
3924
3925 /*
3926 * Setpriority might change our priority at the same moment.
3927 * We don't have to worry. Conceptually one call occurs first
3928 * and we have a single winner.
3929 */
e43379f1
MM
3930 if (increment < -40)
3931 increment = -40;
1da177e4
LT
3932 if (increment > 40)
3933 increment = 40;
3934
3935 nice = PRIO_TO_NICE(current->static_prio) + increment;
3936 if (nice < -20)
3937 nice = -20;
3938 if (nice > 19)
3939 nice = 19;
3940
e43379f1
MM
3941 if (increment < 0 && !can_nice(current, nice))
3942 return -EPERM;
3943
1da177e4
LT
3944 retval = security_task_setnice(current, nice);
3945 if (retval)
3946 return retval;
3947
3948 set_user_nice(current, nice);
3949 return 0;
3950}
3951
3952#endif
3953
3954/**
3955 * task_prio - return the priority value of a given task.
3956 * @p: the task in question.
3957 *
3958 * This is the priority value as seen by users in /proc.
3959 * RT tasks are offset by -200. Normal tasks are centered
3960 * around 0, value goes from -16 to +15.
3961 */
36c8b586 3962int task_prio(const struct task_struct *p)
1da177e4
LT
3963{
3964 return p->prio - MAX_RT_PRIO;
3965}
3966
3967/**
3968 * task_nice - return the nice value of a given task.
3969 * @p: the task in question.
3970 */
36c8b586 3971int task_nice(const struct task_struct *p)
1da177e4
LT
3972{
3973 return TASK_NICE(p);
3974}
1da177e4 3975EXPORT_SYMBOL_GPL(task_nice);
1da177e4
LT
3976
3977/**
3978 * idle_cpu - is a given cpu idle currently?
3979 * @cpu: the processor in question.
3980 */
3981int idle_cpu(int cpu)
3982{
3983 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
3984}
3985
1da177e4
LT
3986/**
3987 * idle_task - return the idle task for a given cpu.
3988 * @cpu: the processor in question.
3989 */
36c8b586 3990struct task_struct *idle_task(int cpu)
1da177e4
LT
3991{
3992 return cpu_rq(cpu)->idle;
3993}
3994
3995/**
3996 * find_process_by_pid - find a process with a matching PID value.
3997 * @pid: the pid in question.
3998 */
36c8b586 3999static inline struct task_struct *find_process_by_pid(pid_t pid)
1da177e4
LT
4000{
4001 return pid ? find_task_by_pid(pid) : current;
4002}
4003
4004/* Actually do priority change: must hold rq lock. */
dd41f596
IM
4005static void
4006__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
1da177e4 4007{
dd41f596 4008 BUG_ON(p->se.on_rq);
48f24c4d 4009
1da177e4 4010 p->policy = policy;
dd41f596
IM
4011 switch (p->policy) {
4012 case SCHED_NORMAL:
4013 case SCHED_BATCH:
4014 case SCHED_IDLE:
4015 p->sched_class = &fair_sched_class;
4016 break;
4017 case SCHED_FIFO:
4018 case SCHED_RR:
4019 p->sched_class = &rt_sched_class;
4020 break;
4021 }
4022
1da177e4 4023 p->rt_priority = prio;
b29739f9
IM
4024 p->normal_prio = normal_prio(p);
4025 /* we are holding p->pi_lock already */
4026 p->prio = rt_mutex_getprio(p);
2dd73a4f 4027 set_load_weight(p);
1da177e4
LT
4028}
4029
4030/**
72fd4a35 4031 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
1da177e4
LT
4032 * @p: the task in question.
4033 * @policy: new policy.
4034 * @param: structure containing the new RT priority.
5fe1d75f 4035 *
72fd4a35 4036 * NOTE that the task may already be dead.
1da177e4 4037 */
95cdf3b7
IM
4038int sched_setscheduler(struct task_struct *p, int policy,
4039 struct sched_param *param)
1da177e4 4040{
dd41f596 4041 int retval, oldprio, oldpolicy = -1, on_rq;
1da177e4 4042 unsigned long flags;
70b97a7f 4043 struct rq *rq;
1da177e4 4044
66e5393a
SR
4045 /* may grab non-irq protected spin_locks */
4046 BUG_ON(in_interrupt());
1da177e4
LT
4047recheck:
4048 /* double check policy once rq lock held */
4049 if (policy < 0)
4050 policy = oldpolicy = p->policy;
4051 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
dd41f596
IM
4052 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4053 policy != SCHED_IDLE)
b0a9499c 4054 return -EINVAL;
1da177e4
LT
4055 /*
4056 * Valid priorities for SCHED_FIFO and SCHED_RR are
dd41f596
IM
4057 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4058 * SCHED_BATCH and SCHED_IDLE is 0.
1da177e4
LT
4059 */
4060 if (param->sched_priority < 0 ||
95cdf3b7 4061 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
d46523ea 4062 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
1da177e4 4063 return -EINVAL;
e05606d3 4064 if (rt_policy(policy) != (param->sched_priority != 0))
1da177e4
LT
4065 return -EINVAL;
4066
37e4ab3f
OC
4067 /*
4068 * Allow unprivileged RT tasks to decrease priority:
4069 */
4070 if (!capable(CAP_SYS_NICE)) {
e05606d3 4071 if (rt_policy(policy)) {
8dc3e909 4072 unsigned long rlim_rtprio;
8dc3e909
ON
4073
4074 if (!lock_task_sighand(p, &flags))
4075 return -ESRCH;
4076 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4077 unlock_task_sighand(p, &flags);
4078
4079 /* can't set/change the rt policy */
4080 if (policy != p->policy && !rlim_rtprio)
4081 return -EPERM;
4082
4083 /* can't increase priority */
4084 if (param->sched_priority > p->rt_priority &&
4085 param->sched_priority > rlim_rtprio)
4086 return -EPERM;
4087 }
dd41f596
IM
4088 /*
4089 * Like positive nice levels, don't allow tasks to
4090 * move out of SCHED_IDLE either:
4091 */
4092 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4093 return -EPERM;
5fe1d75f 4094
37e4ab3f
OC
4095 /* can't change other user's priorities */
4096 if ((current->euid != p->euid) &&
4097 (current->euid != p->uid))
4098 return -EPERM;
4099 }
1da177e4
LT
4100
4101 retval = security_task_setscheduler(p, policy, param);
4102 if (retval)
4103 return retval;
b29739f9
IM
4104 /*
4105 * make sure no PI-waiters arrive (or leave) while we are
4106 * changing the priority of the task:
4107 */
4108 spin_lock_irqsave(&p->pi_lock, flags);
1da177e4
LT
4109 /*
4110 * To be able to change p->policy safely, the appropriate
4111 * runqueue lock must be held.
4112 */
b29739f9 4113 rq = __task_rq_lock(p);
1da177e4
LT
4114 /* recheck policy now with rq lock held */
4115 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4116 policy = oldpolicy = -1;
b29739f9
IM
4117 __task_rq_unlock(rq);
4118 spin_unlock_irqrestore(&p->pi_lock, flags);
1da177e4
LT
4119 goto recheck;
4120 }
dd41f596
IM
4121 on_rq = p->se.on_rq;
4122 if (on_rq)
4123 deactivate_task(rq, p, 0);
1da177e4 4124 oldprio = p->prio;
dd41f596
IM
4125 __setscheduler(rq, p, policy, param->sched_priority);
4126 if (on_rq) {
4127 activate_task(rq, p, 0);
1da177e4
LT
4128 /*
4129 * Reschedule if we are currently running on this runqueue and
d5f9f942
AM
4130 * our priority decreased, or if we are not currently running on
4131 * this runqueue and our priority is higher than the current's
1da177e4 4132 */
d5f9f942
AM
4133 if (task_running(rq, p)) {
4134 if (p->prio > oldprio)
4135 resched_task(rq->curr);
dd41f596
IM
4136 } else {
4137 check_preempt_curr(rq, p);
4138 }
1da177e4 4139 }
b29739f9
IM
4140 __task_rq_unlock(rq);
4141 spin_unlock_irqrestore(&p->pi_lock, flags);
4142
95e02ca9
TG
4143 rt_mutex_adjust_pi(p);
4144
1da177e4
LT
4145 return 0;
4146}
4147EXPORT_SYMBOL_GPL(sched_setscheduler);
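/*
 * Illustrative sketch, not part of this file: how kernel code can use
 * sched_setscheduler() above to turn a kernel thread into a
 * high-priority SCHED_FIFO task. The name my_make_fifo is hypothetical.
 */
#if 0
#include <linux/sched.h>

static int my_make_fifo(struct task_struct *p)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };

	/* Returns 0 on success or a negative errno such as -EPERM. */
	return sched_setscheduler(p, SCHED_FIFO, &param);
}
#endif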
4148
95cdf3b7
IM
4149static int
4150do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
1da177e4 4151{
1da177e4
LT
4152 struct sched_param lparam;
4153 struct task_struct *p;
36c8b586 4154 int retval;
1da177e4
LT
4155
4156 if (!param || pid < 0)
4157 return -EINVAL;
4158 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4159 return -EFAULT;
5fe1d75f
ON
4160
4161 rcu_read_lock();
4162 retval = -ESRCH;
1da177e4 4163 p = find_process_by_pid(pid);
5fe1d75f
ON
4164 if (p != NULL)
4165 retval = sched_setscheduler(p, policy, &lparam);
4166 rcu_read_unlock();
36c8b586 4167
1da177e4
LT
4168 return retval;
4169}
4170
4171/**
4172 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4173 * @pid: the pid in question.
4174 * @policy: new policy.
4175 * @param: structure containing the new RT priority.
4176 */
4177asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4178 struct sched_param __user *param)
4179{
c21761f1
JB
4180 /* negative values for policy are not valid */
4181 if (policy < 0)
4182 return -EINVAL;
4183
1da177e4
LT
4184 return do_sched_setscheduler(pid, policy, param);
4185}
4186
4187/**
4188 * sys_sched_setparam - set/change the RT priority of a thread
4189 * @pid: the pid in question.
4190 * @param: structure containing the new RT priority.
4191 */
4192asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4193{
4194 return do_sched_setscheduler(pid, -1, param);
4195}
4196
4197/**
4198 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4199 * @pid: the pid in question.
4200 */
4201asmlinkage long sys_sched_getscheduler(pid_t pid)
4202{
36c8b586 4203 struct task_struct *p;
1da177e4 4204 int retval = -EINVAL;
1da177e4
LT
4205
4206 if (pid < 0)
4207 goto out_nounlock;
4208
4209 retval = -ESRCH;
4210 read_lock(&tasklist_lock);
4211 p = find_process_by_pid(pid);
4212 if (p) {
4213 retval = security_task_getscheduler(p);
4214 if (!retval)
4215 retval = p->policy;
4216 }
4217 read_unlock(&tasklist_lock);
4218
4219out_nounlock:
4220 return retval;
4221}
4222
4223/**
4224 * sys_sched_getparam - get the RT priority of a thread
4225 * @pid: the pid in question.
4226 * @param: structure containing the RT priority.
4227 */
4228asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4229{
4230 struct sched_param lp;
36c8b586 4231 struct task_struct *p;
1da177e4 4232 int retval = -EINVAL;
1da177e4
LT
4233
4234 if (!param || pid < 0)
4235 goto out_nounlock;
4236
4237 read_lock(&tasklist_lock);
4238 p = find_process_by_pid(pid);
4239 retval = -ESRCH;
4240 if (!p)
4241 goto out_unlock;
4242
4243 retval = security_task_getscheduler(p);
4244 if (retval)
4245 goto out_unlock;
4246
4247 lp.sched_priority = p->rt_priority;
4248 read_unlock(&tasklist_lock);
4249
4250 /*
4251 * This one might sleep, we cannot do it with a spinlock held ...
4252 */
4253 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4254
4255out_nounlock:
4256 return retval;
4257
4258out_unlock:
4259 read_unlock(&tasklist_lock);
4260 return retval;
4261}
4262
4263long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4264{
1da177e4 4265 cpumask_t cpus_allowed;
36c8b586
IM
4266 struct task_struct *p;
4267 int retval;
1da177e4 4268
5be9361c 4269 mutex_lock(&sched_hotcpu_mutex);
1da177e4
LT
4270 read_lock(&tasklist_lock);
4271
4272 p = find_process_by_pid(pid);
4273 if (!p) {
4274 read_unlock(&tasklist_lock);
5be9361c 4275 mutex_unlock(&sched_hotcpu_mutex);
1da177e4
LT
4276 return -ESRCH;
4277 }
4278
4279 /*
4280 * It is not safe to call set_cpus_allowed with the
4281 * tasklist_lock held. We will bump the task_struct's
4282 * usage count and then drop tasklist_lock.
4283 */
4284 get_task_struct(p);
4285 read_unlock(&tasklist_lock);
4286
4287 retval = -EPERM;
4288 if ((current->euid != p->euid) && (current->euid != p->uid) &&
4289 !capable(CAP_SYS_NICE))
4290 goto out_unlock;
4291
e7834f8f
DQ
4292 retval = security_task_setscheduler(p, 0, NULL);
4293 if (retval)
4294 goto out_unlock;
4295
1da177e4
LT
4296 cpus_allowed = cpuset_cpus_allowed(p);
4297 cpus_and(new_mask, new_mask, cpus_allowed);
4298 retval = set_cpus_allowed(p, new_mask);
4299
4300out_unlock:
4301 put_task_struct(p);
5be9361c 4302 mutex_unlock(&sched_hotcpu_mutex);
1da177e4
LT
4303 return retval;
4304}
4305
4306static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4307 cpumask_t *new_mask)
4308{
4309 if (len < sizeof(cpumask_t)) {
4310 memset(new_mask, 0, sizeof(cpumask_t));
4311 } else if (len > sizeof(cpumask_t)) {
4312 len = sizeof(cpumask_t);
4313 }
4314 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4315}
4316
4317/**
4318 * sys_sched_setaffinity - set the cpu affinity of a process
4319 * @pid: pid of the process
4320 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4321 * @user_mask_ptr: user-space pointer to the new cpu mask
4322 */
4323asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4324 unsigned long __user *user_mask_ptr)
4325{
4326 cpumask_t new_mask;
4327 int retval;
4328
4329 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4330 if (retval)
4331 return retval;
4332
4333 return sched_setaffinity(pid, new_mask);
4334}
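/*
 * Illustrative sketch, not part of this file: pinning a task to a
 * single CPU from kernel code uses the same set_cpus_allowed() path
 * that sched_setaffinity() above ends in. The name my_pin_to_cpu is
 * hypothetical.
 */
#if 0
static int my_pin_to_cpu(struct task_struct *p, int cpu)
{
	cpumask_t mask = cpumask_of_cpu(cpu);

	/* Fails with -EINVAL if 'cpu' is not in cpu_online_map. */
	return set_cpus_allowed(p, mask);
}
#endif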
4335
4336/*
4337 * Represents all CPUs present in the system.
4338 * In systems capable of hotplug, this map could dynamically grow
4339 * as new CPUs are detected in the system via any platform-specific
4340 * method, such as ACPI.
4341 */
4342
4cef0c61 4343cpumask_t cpu_present_map __read_mostly;
1da177e4
LT
4344EXPORT_SYMBOL(cpu_present_map);
4345
4346#ifndef CONFIG_SMP
4cef0c61 4347cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
e16b38f7
GB
4348EXPORT_SYMBOL(cpu_online_map);
4349
4cef0c61 4350cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
e16b38f7 4351EXPORT_SYMBOL(cpu_possible_map);
1da177e4
LT
4352#endif
4353
4354long sched_getaffinity(pid_t pid, cpumask_t *mask)
4355{
36c8b586 4356 struct task_struct *p;
1da177e4 4357 int retval;
1da177e4 4358
5be9361c 4359 mutex_lock(&sched_hotcpu_mutex);
1da177e4
LT
4360 read_lock(&tasklist_lock);
4361
4362 retval = -ESRCH;
4363 p = find_process_by_pid(pid);
4364 if (!p)
4365 goto out_unlock;
4366
e7834f8f
DQ
4367 retval = security_task_getscheduler(p);
4368 if (retval)
4369 goto out_unlock;
4370
2f7016d9 4371 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
1da177e4
LT
4372
4373out_unlock:
4374 read_unlock(&tasklist_lock);
5be9361c 4375 mutex_unlock(&sched_hotcpu_mutex);
1da177e4
LT
4376 if (retval)
4377 return retval;
4378
4379 return 0;
4380}
4381
4382/**
4383 * sys_sched_getaffinity - get the cpu affinity of a process
4384 * @pid: pid of the process
4385 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4386 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4387 */
4388asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4389 unsigned long __user *user_mask_ptr)
4390{
4391 int ret;
4392 cpumask_t mask;
4393
4394 if (len < sizeof(cpumask_t))
4395 return -EINVAL;
4396
4397 ret = sched_getaffinity(pid, &mask);
4398 if (ret < 0)
4399 return ret;
4400
4401 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4402 return -EFAULT;
4403
4404 return sizeof(cpumask_t);
4405}
4406
4407/**
4408 * sys_sched_yield - yield the current processor to other threads.
4409 *
dd41f596
IM
4410 * This function yields the current CPU to other tasks. If there are no
4411 * other threads running on this CPU then this function will return.
1da177e4
LT
4412 */
4413asmlinkage long sys_sched_yield(void)
4414{
70b97a7f 4415 struct rq *rq = this_rq_lock();
1da177e4
LT
4416
4417 schedstat_inc(rq, yld_cnt);
dd41f596 4418 if (unlikely(rq->nr_running == 1))
1da177e4 4419 schedstat_inc(rq, yld_act_empty);
dd41f596
IM
4420 else
4421 current->sched_class->yield_task(rq, current);
1da177e4
LT
4422
4423 /*
4424 * Since we are going to call schedule() anyway, there's
4425 * no need to preempt or enable interrupts:
4426 */
4427 __release(rq->lock);
8a25d5de 4428 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1da177e4
LT
4429 _raw_spin_unlock(&rq->lock);
4430 preempt_enable_no_resched();
4431
4432 schedule();
4433
4434 return 0;
4435}
4436
e7b38404 4437static void __cond_resched(void)
1da177e4 4438{
8e0a43d8
IM
4439#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4440 __might_sleep(__FILE__, __LINE__);
4441#endif
5bbcfd90
IM
4442 /*
4443 * The BKS might be reacquired before we have dropped
4444 * PREEMPT_ACTIVE, which could trigger a second
4445 * cond_resched() call.
4446 */
1da177e4
LT
4447 do {
4448 add_preempt_count(PREEMPT_ACTIVE);
4449 schedule();
4450 sub_preempt_count(PREEMPT_ACTIVE);
4451 } while (need_resched());
4452}
4453
4454int __sched cond_resched(void)
4455{
9414232f
IM
4456 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4457 system_state == SYSTEM_RUNNING) {
1da177e4
LT
4458 __cond_resched();
4459 return 1;
4460 }
4461 return 0;
4462}
1da177e4
LT
4463EXPORT_SYMBOL(cond_resched);
4464
4465/*
4466 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4467 * call schedule, and on return reacquire the lock.
4468 *
4469 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4470 * operations here to prevent schedule() from being called twice (once via
4471 * spin_unlock(), once by hand).
4472 */
95cdf3b7 4473int cond_resched_lock(spinlock_t *lock)
1da177e4 4474{
6df3cecb
JK
4475 int ret = 0;
4476
1da177e4
LT
4477 if (need_lockbreak(lock)) {
4478 spin_unlock(lock);
4479 cpu_relax();
6df3cecb 4480 ret = 1;
1da177e4
LT
4481 spin_lock(lock);
4482 }
9414232f 4483 if (need_resched() && system_state == SYSTEM_RUNNING) {
8a25d5de 4484 spin_release(&lock->dep_map, 1, _THIS_IP_);
1da177e4
LT
4485 _raw_spin_unlock(lock);
4486 preempt_enable_no_resched();
4487 __cond_resched();
6df3cecb 4488 ret = 1;
1da177e4 4489 spin_lock(lock);
1da177e4 4490 }
6df3cecb 4491 return ret;
1da177e4 4492}
1da177e4
LT
4493EXPORT_SYMBOL(cond_resched_lock);
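/*
 * Illustrative sketch, not part of this file: a long scan done under a
 * spinlock that uses cond_resched_lock() above to avoid hogging the CPU
 * or starving other lock waiters. The names my_scan, my_lock and
 * my_table are hypothetical.
 */
#if 0
static void my_scan(spinlock_t *my_lock, int *my_table, int my_count)
{
	int i;

	spin_lock(my_lock);
	for (i = 0; i < my_count; i++) {
		my_table[i] = 0;	/* work done under the lock */

		/*
		 * If the lock is contended or a reschedule is pending,
		 * drop the lock, possibly schedule, and re-take it.
		 */
		cond_resched_lock(my_lock);
	}
	spin_unlock(my_lock);
}
#endif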
4494
4495int __sched cond_resched_softirq(void)
4496{
4497 BUG_ON(!in_softirq());
4498
9414232f 4499 if (need_resched() && system_state == SYSTEM_RUNNING) {
98d82567 4500 local_bh_enable();
1da177e4
LT
4501 __cond_resched();
4502 local_bh_disable();
4503 return 1;
4504 }
4505 return 0;
4506}
1da177e4
LT
4507EXPORT_SYMBOL(cond_resched_softirq);
4508
1da177e4
LT
4509/**
4510 * yield - yield the current processor to other threads.
4511 *
72fd4a35 4512 * This is a shortcut for kernel-space yielding - it marks the
1da177e4
LT
4513 * thread runnable and calls sys_sched_yield().
4514 */
4515void __sched yield(void)
4516{
4517 set_current_state(TASK_RUNNING);
4518 sys_sched_yield();
4519}
1da177e4
LT
4520EXPORT_SYMBOL(yield);
4521
4522/*
4523 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4524 * that process accounting knows that this is a task in IO wait state.
4525 *
4526 * But don't do that if it is a deliberate, throttling IO wait (this task
4527 * has set its backing_dev_info: the queue against which it should throttle)
4528 */
4529void __sched io_schedule(void)
4530{
70b97a7f 4531 struct rq *rq = &__raw_get_cpu_var(runqueues);
1da177e4 4532
0ff92245 4533 delayacct_blkio_start();
1da177e4
LT
4534 atomic_inc(&rq->nr_iowait);
4535 schedule();
4536 atomic_dec(&rq->nr_iowait);
0ff92245 4537 delayacct_blkio_end();
1da177e4 4538}
1da177e4
LT
4539EXPORT_SYMBOL(io_schedule);
4540
4541long __sched io_schedule_timeout(long timeout)
4542{
70b97a7f 4543 struct rq *rq = &__raw_get_cpu_var(runqueues);
1da177e4
LT
4544 long ret;
4545
0ff92245 4546 delayacct_blkio_start();
1da177e4
LT
4547 atomic_inc(&rq->nr_iowait);
4548 ret = schedule_timeout(timeout);
4549 atomic_dec(&rq->nr_iowait);
0ff92245 4550 delayacct_blkio_end();
1da177e4
LT
4551 return ret;
4552}
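/*
 * Illustrative sketch, not part of this file: a wait loop that accounts
 * its sleep as iowait by calling io_schedule() above instead of plain
 * schedule(). The names my_wq, my_io_done and my_wait_on_io are
 * hypothetical.
 */
#if 0
static DECLARE_WAIT_QUEUE_HEAD(my_wq);
static int my_io_done;

static void my_wait_on_io(void)
{
	DEFINE_WAIT(wait);

	while (!my_io_done) {
		prepare_to_wait(&my_wq, &wait, TASK_UNINTERRUPTIBLE);
		if (!my_io_done)
			io_schedule();
		finish_wait(&my_wq, &wait);
	}
}
#endif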
4553
4554/**
4555 * sys_sched_get_priority_max - return maximum RT priority.
4556 * @policy: scheduling class.
4557 *
4558 * this syscall returns the maximum rt_priority that can be used
4559 * by a given scheduling class.
4560 */
4561asmlinkage long sys_sched_get_priority_max(int policy)
4562{
4563 int ret = -EINVAL;
4564
4565 switch (policy) {
4566 case SCHED_FIFO:
4567 case SCHED_RR:
4568 ret = MAX_USER_RT_PRIO-1;
4569 break;
4570 case SCHED_NORMAL:
b0a9499c 4571 case SCHED_BATCH:
dd41f596 4572 case SCHED_IDLE:
1da177e4
LT
4573 ret = 0;
4574 break;
4575 }
4576 return ret;
4577}
4578
4579/**
4580 * sys_sched_get_priority_min - return minimum RT priority.
4581 * @policy: scheduling class.
4582 *
4583 * this syscall returns the minimum rt_priority that can be used
4584 * by a given scheduling class.
4585 */
4586asmlinkage long sys_sched_get_priority_min(int policy)
4587{
4588 int ret = -EINVAL;
4589
4590 switch (policy) {
4591 case SCHED_FIFO:
4592 case SCHED_RR:
4593 ret = 1;
4594 break;
4595 case SCHED_NORMAL:
b0a9499c 4596 case SCHED_BATCH:
dd41f596 4597 case SCHED_IDLE:
1da177e4
LT
4598 ret = 0;
4599 }
4600 return ret;
4601}
4602
4603/**
4604 * sys_sched_rr_get_interval - return the default timeslice of a process.
4605 * @pid: pid of the process.
4606 * @interval: userspace pointer to the timeslice value.
4607 *
4608 * this syscall writes the default timeslice value of a given process
4609 * into the user-space timespec buffer. A value of '0' means infinity.
4610 */
4611asmlinkage
4612long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4613{
36c8b586 4614 struct task_struct *p;
1da177e4
LT
4615 int retval = -EINVAL;
4616 struct timespec t;
1da177e4
LT
4617
4618 if (pid < 0)
4619 goto out_nounlock;
4620
4621 retval = -ESRCH;
4622 read_lock(&tasklist_lock);
4623 p = find_process_by_pid(pid);
4624 if (!p)
4625 goto out_unlock;
4626
4627 retval = security_task_getscheduler(p);
4628 if (retval)
4629 goto out_unlock;
4630
b78709cf 4631 jiffies_to_timespec(p->policy == SCHED_FIFO ?
dd41f596 4632 0 : static_prio_timeslice(p->static_prio), &t);
1da177e4
LT
4633 read_unlock(&tasklist_lock);
4634 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4635out_nounlock:
4636 return retval;
4637out_unlock:
4638 read_unlock(&tasklist_lock);
4639 return retval;
4640}
4641
2ed6e34f 4642static const char stat_nam[] = "RSDTtZX";
36c8b586
IM
4643
4644static void show_task(struct task_struct *p)
1da177e4 4645{
1da177e4 4646 unsigned long free = 0;
36c8b586 4647 unsigned state;
1da177e4 4648
1da177e4 4649 state = p->state ? __ffs(p->state) + 1 : 0;
2ed6e34f
AM
4650 printk("%-13.13s %c", p->comm,
4651 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4bd77321 4652#if BITS_PER_LONG == 32
1da177e4 4653 if (state == TASK_RUNNING)
4bd77321 4654 printk(" running ");
1da177e4 4655 else
4bd77321 4656 printk(" %08lx ", thread_saved_pc(p));
1da177e4
LT
4657#else
4658 if (state == TASK_RUNNING)
4bd77321 4659 printk(" running task ");
1da177e4
LT
4660 else
4661 printk(" %016lx ", thread_saved_pc(p));
4662#endif
4663#ifdef CONFIG_DEBUG_STACK_USAGE
4664 {
10ebffde 4665 unsigned long *n = end_of_stack(p);
1da177e4
LT
4666 while (!*n)
4667 n++;
10ebffde 4668 free = (unsigned long)n - (unsigned long)end_of_stack(p);
1da177e4
LT
4669 }
4670#endif
4bd77321 4671 printk("%5lu %5d %6d\n", free, p->pid, p->parent->pid);
1da177e4
LT
4672
4673 if (state != TASK_RUNNING)
4674 show_stack(p, NULL);
4675}
4676
e59e2ae2 4677void show_state_filter(unsigned long state_filter)
1da177e4 4678{
36c8b586 4679 struct task_struct *g, *p;
1da177e4 4680
4bd77321
IM
4681#if BITS_PER_LONG == 32
4682 printk(KERN_INFO
4683 " task PC stack pid father\n");
1da177e4 4684#else
4bd77321
IM
4685 printk(KERN_INFO
4686 " task PC stack pid father\n");
1da177e4
LT
4687#endif
4688 read_lock(&tasklist_lock);
4689 do_each_thread(g, p) {
4690 /*
4691 * reset the NMI-timeout, listing all tasks on a slow
4692 * console might take a lot of time:
4693 */
4694 touch_nmi_watchdog();
39bc89fd 4695 if (!state_filter || (p->state & state_filter))
e59e2ae2 4696 show_task(p);
1da177e4
LT
4697 } while_each_thread(g, p);
4698
04c9167f
JF
4699 touch_all_softlockup_watchdogs();
4700
dd41f596
IM
4701#ifdef CONFIG_SCHED_DEBUG
4702 sysrq_sched_debug_show();
4703#endif
1da177e4 4704 read_unlock(&tasklist_lock);
e59e2ae2
IM
4705 /*
4706 * Only show locks if all tasks are dumped:
4707 */
4708 if (state_filter == -1)
4709 debug_show_all_locks();
1da177e4
LT
4710}
4711
1df21055
IM
4712void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4713{
dd41f596 4714 idle->sched_class = &idle_sched_class;
1df21055
IM
4715}
4716
f340c0d1
IM
4717/**
4718 * init_idle - set up an idle thread for a given CPU
4719 * @idle: task in question
4720 * @cpu: cpu the idle task belongs to
4721 *
4722 * NOTE: this function does not set the idle thread's NEED_RESCHED
4723 * flag, to make booting more robust.
4724 */
5c1e1767 4725void __cpuinit init_idle(struct task_struct *idle, int cpu)
1da177e4 4726{
70b97a7f 4727 struct rq *rq = cpu_rq(cpu);
1da177e4
LT
4728 unsigned long flags;
4729
dd41f596
IM
4730 __sched_fork(idle);
4731 idle->se.exec_start = sched_clock();
4732
b29739f9 4733 idle->prio = idle->normal_prio = MAX_PRIO;
1da177e4 4734 idle->cpus_allowed = cpumask_of_cpu(cpu);
dd41f596 4735 __set_task_cpu(idle, cpu);
1da177e4
LT
4736
4737 spin_lock_irqsave(&rq->lock, flags);
4738 rq->curr = rq->idle = idle;
4866cde0
NP
4739#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4740 idle->oncpu = 1;
4741#endif
1da177e4
LT
4742 spin_unlock_irqrestore(&rq->lock, flags);
4743
4744 /* Set the preempt count _outside_ the spinlocks! */
4745#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
a1261f54 4746 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
1da177e4 4747#else
a1261f54 4748 task_thread_info(idle)->preempt_count = 0;
1da177e4 4749#endif
dd41f596
IM
4750 /*
4751 * The idle tasks have their own, simple scheduling class:
4752 */
4753 idle->sched_class = &idle_sched_class;
1da177e4
LT
4754}
4755
4756/*
4757 * In a system that switches off the HZ timer nohz_cpu_mask
4758 * indicates which cpus entered this state. This is used
4759 * in the rcu update to wait only for active cpus. For systems
4760 * which do not switch off the HZ timer nohz_cpu_mask should
4761 * always be CPU_MASK_NONE.
4762 */
4763cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4764
dd41f596
IM
4765/*
4766 * Increase the granularity value when there are more CPUs,
4767 * because with more CPUs the 'effective latency' as visible
4768 * to users decreases. But the relationship is not linear,
4769 * so pick a second-best guess by going with the log2 of the
4770 * number of CPUs.
4771 *
4772 * This idea comes from the SD scheduler of Con Kolivas:
4773 */
4774static inline void sched_init_granularity(void)
4775{
4776 unsigned int factor = 1 + ilog2(num_online_cpus());
a5968df8 4777 const unsigned long gran_limit = 100000000;
dd41f596
IM
4778
4779 sysctl_sched_granularity *= factor;
4780 if (sysctl_sched_granularity > gran_limit)
4781 sysctl_sched_granularity = gran_limit;
4782
4783 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4784 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4785}
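/*
 * Worked example of the scaling above (illustrative): with 2 online
 * CPUs factor == 2, with 8 CPUs factor == 4, with 64 CPUs factor == 7;
 * the result is always capped so the granularity never exceeds
 * gran_limit (100 msec).
 */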
4786
1da177e4
LT
4787#ifdef CONFIG_SMP
4788/*
4789 * This is how migration works:
4790 *
70b97a7f 4791 * 1) we queue a struct migration_req structure in the source CPU's
1da177e4
LT
4792 * runqueue and wake up that CPU's migration thread.
4793 * 2) we down() the locked semaphore => thread blocks.
4794 * 3) migration thread wakes up (implicitly it forces the migrated
4795 * thread off the CPU)
4796 * 4) it gets the migration request and checks whether the migrated
4797 * task is still in the wrong runqueue.
4798 * 5) if it's in the wrong runqueue then the migration thread removes
4799 * it and puts it into the right queue.
4800 * 6) migration thread up()s the semaphore.
4801 * 7) we wake up and the migration is done.
4802 */
4803
4804/*
4805 * Change a given task's CPU affinity. Migrate the thread to a
4806 * proper CPU and schedule it away if the CPU it's executing on
4807 * is removed from the allowed bitmask.
4808 *
4809 * NOTE: the caller must have a valid reference to the task, the
4810 * task must not exit() & deallocate itself prematurely. The
4811 * call is not atomic; no spinlocks may be held.
4812 */
36c8b586 4813int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
1da177e4 4814{
70b97a7f 4815 struct migration_req req;
1da177e4 4816 unsigned long flags;
70b97a7f 4817 struct rq *rq;
48f24c4d 4818 int ret = 0;
1da177e4
LT
4819
4820 rq = task_rq_lock(p, &flags);
4821 if (!cpus_intersects(new_mask, cpu_online_map)) {
4822 ret = -EINVAL;
4823 goto out;
4824 }
4825
4826 p->cpus_allowed = new_mask;
4827 /* Can the task run on the task's current CPU? If so, we're done */
4828 if (cpu_isset(task_cpu(p), new_mask))
4829 goto out;
4830
4831 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4832 /* Need help from migration thread: drop lock and wait. */
4833 task_rq_unlock(rq, &flags);
4834 wake_up_process(rq->migration_thread);
4835 wait_for_completion(&req.done);
4836 tlb_migrate_finish(p->mm);
4837 return 0;
4838 }
4839out:
4840 task_rq_unlock(rq, &flags);
48f24c4d 4841
1da177e4
LT
4842 return ret;
4843}
1da177e4
LT
4844EXPORT_SYMBOL_GPL(set_cpus_allowed);
4845
4846/*
4847 * Move (not current) task off this cpu, onto dest cpu. We're doing
4848 * this either because it can't run here any more (it was set_cpus_allowed()
4849 * away from this CPU, or its CPU is going down), or because we're
4850 * attempting to rebalance this task on exec (sched_exec).
4851 *
4852 * So we race with normal scheduler movements, but that's OK, as long
4853 * as the task is no longer on this CPU.
efc30814
KK
4854 *
4855 * Returns non-zero if task was successfully migrated.
1da177e4 4856 */
efc30814 4857static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
1da177e4 4858{
70b97a7f 4859 struct rq *rq_dest, *rq_src;
dd41f596 4860 int ret = 0, on_rq;
1da177e4
LT
4861
4862 if (unlikely(cpu_is_offline(dest_cpu)))
efc30814 4863 return ret;
1da177e4
LT
4864
4865 rq_src = cpu_rq(src_cpu);
4866 rq_dest = cpu_rq(dest_cpu);
4867
4868 double_rq_lock(rq_src, rq_dest);
4869 /* Already moved. */
4870 if (task_cpu(p) != src_cpu)
4871 goto out;
4872 /* Affinity changed (again). */
4873 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4874 goto out;
4875
dd41f596
IM
4876 on_rq = p->se.on_rq;
4877 if (on_rq)
4878 deactivate_task(rq_src, p, 0);
1da177e4 4879 set_task_cpu(p, dest_cpu);
dd41f596
IM
4880 if (on_rq) {
4881 activate_task(rq_dest, p, 0);
4882 check_preempt_curr(rq_dest, p);
1da177e4 4883 }
efc30814 4884 ret = 1;
1da177e4
LT
4885out:
4886 double_rq_unlock(rq_src, rq_dest);
efc30814 4887 return ret;
1da177e4
LT
4888}
4889
4890/*
4891 * migration_thread - this is a highprio system thread that performs
4892 * thread migration by bumping thread off CPU then 'pushing' onto
4893 * another runqueue.
4894 */
95cdf3b7 4895static int migration_thread(void *data)
1da177e4 4896{
1da177e4 4897 int cpu = (long)data;
70b97a7f 4898 struct rq *rq;
1da177e4
LT
4899
4900 rq = cpu_rq(cpu);
4901 BUG_ON(rq->migration_thread != current);
4902
4903 set_current_state(TASK_INTERRUPTIBLE);
4904 while (!kthread_should_stop()) {
70b97a7f 4905 struct migration_req *req;
1da177e4 4906 struct list_head *head;
1da177e4 4907
3e1d1d28 4908 try_to_freeze();
1da177e4
LT
4909
4910 spin_lock_irq(&rq->lock);
4911
4912 if (cpu_is_offline(cpu)) {
4913 spin_unlock_irq(&rq->lock);
4914 goto wait_to_die;
4915 }
4916
4917 if (rq->active_balance) {
4918 active_load_balance(rq, cpu);
4919 rq->active_balance = 0;
4920 }
4921
4922 head = &rq->migration_queue;
4923
4924 if (list_empty(head)) {
4925 spin_unlock_irq(&rq->lock);
4926 schedule();
4927 set_current_state(TASK_INTERRUPTIBLE);
4928 continue;
4929 }
70b97a7f 4930 req = list_entry(head->next, struct migration_req, list);
1da177e4
LT
4931 list_del_init(head->next);
4932
674311d5
NP
4933 spin_unlock(&rq->lock);
4934 __migrate_task(req->task, cpu, req->dest_cpu);
4935 local_irq_enable();
1da177e4
LT
4936
4937 complete(&req->done);
4938 }
4939 __set_current_state(TASK_RUNNING);
4940 return 0;
4941
4942wait_to_die:
4943 /* Wait for kthread_stop */
4944 set_current_state(TASK_INTERRUPTIBLE);
4945 while (!kthread_should_stop()) {
4946 schedule();
4947 set_current_state(TASK_INTERRUPTIBLE);
4948 }
4949 __set_current_state(TASK_RUNNING);
4950 return 0;
4951}
4952
4953#ifdef CONFIG_HOTPLUG_CPU
054b9108
KK
4954/*
4955 * Figure out where a task on a dead CPU should go; use force if necessary.
4956 * NOTE: interrupts should be disabled by the caller
4957 */
48f24c4d 4958static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
1da177e4 4959{
efc30814 4960 unsigned long flags;
1da177e4 4961 cpumask_t mask;
70b97a7f
IM
4962 struct rq *rq;
4963 int dest_cpu;
1da177e4 4964
efc30814 4965restart:
1da177e4
LT
4966 /* On same node? */
4967 mask = node_to_cpumask(cpu_to_node(dead_cpu));
48f24c4d 4968 cpus_and(mask, mask, p->cpus_allowed);
1da177e4
LT
4969 dest_cpu = any_online_cpu(mask);
4970
4971 /* On any allowed CPU? */
4972 if (dest_cpu == NR_CPUS)
48f24c4d 4973 dest_cpu = any_online_cpu(p->cpus_allowed);
1da177e4
LT
4974
4975 /* No more Mr. Nice Guy. */
4976 if (dest_cpu == NR_CPUS) {
48f24c4d
IM
4977 rq = task_rq_lock(p, &flags);
4978 cpus_setall(p->cpus_allowed);
4979 dest_cpu = any_online_cpu(p->cpus_allowed);
efc30814 4980 task_rq_unlock(rq, &flags);
1da177e4
LT
4981
4982 /*
4983 * Don't tell them about moving exiting tasks or
4984 * kernel threads (both mm NULL), since they never
4985 * leave the kernel.
4986 */
48f24c4d 4987 if (p->mm && printk_ratelimit())
1da177e4
LT
4988 printk(KERN_INFO "process %d (%s) no "
4989 "longer affine to cpu%d\n",
48f24c4d 4990 p->pid, p->comm, dead_cpu);
1da177e4 4991 }
48f24c4d 4992 if (!__migrate_task(p, dead_cpu, dest_cpu))
efc30814 4993 goto restart;
1da177e4
LT
4994}
4995
4996/*
4997 * While a dead CPU has no uninterruptible tasks queued at this point,
4998 * it might still have a nonzero ->nr_uninterruptible counter, because
4999 * for performance reasons the counter is not strictly tracking tasks to
5000 * their home CPUs. So we just add the counter to another CPU's counter,
5001 * to keep the global sum constant after CPU-down:
5002 */
70b97a7f 5003static void migrate_nr_uninterruptible(struct rq *rq_src)
1da177e4 5004{
70b97a7f 5005 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
1da177e4
LT
5006 unsigned long flags;
5007
5008 local_irq_save(flags);
5009 double_rq_lock(rq_src, rq_dest);
5010 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5011 rq_src->nr_uninterruptible = 0;
5012 double_rq_unlock(rq_src, rq_dest);
5013 local_irq_restore(flags);
5014}
5015
5016/* Run through task list and migrate tasks from the dead cpu. */
5017static void migrate_live_tasks(int src_cpu)
5018{
48f24c4d 5019 struct task_struct *p, *t;
1da177e4
LT
5020
5021 write_lock_irq(&tasklist_lock);
5022
48f24c4d
IM
5023 do_each_thread(t, p) {
5024 if (p == current)
1da177e4
LT
5025 continue;
5026
48f24c4d
IM
5027 if (task_cpu(p) == src_cpu)
5028 move_task_off_dead_cpu(src_cpu, p);
5029 } while_each_thread(t, p);
1da177e4
LT
5030
5031 write_unlock_irq(&tasklist_lock);
5032}
5033
dd41f596
IM
5034/*
5035 * Schedules idle task to be the next runnable task on current CPU.
1da177e4 5036 * It does so by boosting its priority to highest possible and adding it to
48f24c4d 5037 * the _front_ of the runqueue. Used by CPU offline code.
1da177e4
LT
5038 */
5039void sched_idle_next(void)
5040{
48f24c4d 5041 int this_cpu = smp_processor_id();
70b97a7f 5042 struct rq *rq = cpu_rq(this_cpu);
1da177e4
LT
5043 struct task_struct *p = rq->idle;
5044 unsigned long flags;
5045
5046 /* cpu has to be offline */
48f24c4d 5047 BUG_ON(cpu_online(this_cpu));
1da177e4 5048
48f24c4d
IM
5049 /*
5050 * Strictly not necessary since the rest of the CPUs are stopped by now
5051 * and interrupts are disabled on the current cpu.
1da177e4
LT
5052 */
5053 spin_lock_irqsave(&rq->lock, flags);
5054
dd41f596 5055 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
48f24c4d
IM
5056
5057 /* Add idle task to the _front_ of its priority queue: */
dd41f596 5058 activate_idle_task(p, rq);
1da177e4
LT
5059
5060 spin_unlock_irqrestore(&rq->lock, flags);
5061}
5062
48f24c4d
IM
5063/*
5064 * Ensures that the idle task is using init_mm right before its cpu goes
1da177e4
LT
5065 * offline.
5066 */
5067void idle_task_exit(void)
5068{
5069 struct mm_struct *mm = current->active_mm;
5070
5071 BUG_ON(cpu_online(smp_processor_id()));
5072
5073 if (mm != &init_mm)
5074 switch_mm(mm, &init_mm, current);
5075 mmdrop(mm);
5076}
5077
054b9108 5078/* called under rq->lock with disabled interrupts */
36c8b586 5079static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
1da177e4 5080{
70b97a7f 5081 struct rq *rq = cpu_rq(dead_cpu);
1da177e4
LT
5082
5083 /* Must be exiting, otherwise would be on tasklist. */
48f24c4d 5084 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
1da177e4
LT
5085
5086 /* Cannot have done final schedule yet: would have vanished. */
c394cc9f 5087 BUG_ON(p->state == TASK_DEAD);
1da177e4 5088
48f24c4d 5089 get_task_struct(p);
1da177e4
LT
5090
5091 /*
5092 * Drop lock around migration; if someone else moves it,
5093 * that's OK. No task can be added to this CPU, so iteration is
5094 * fine.
054b9108 5095 * NOTE: interrupts should be left disabled --dev@
1da177e4 5096 */
054b9108 5097 spin_unlock(&rq->lock);
48f24c4d 5098 move_task_off_dead_cpu(dead_cpu, p);
054b9108 5099 spin_lock(&rq->lock);
1da177e4 5100
48f24c4d 5101 put_task_struct(p);
1da177e4
LT
5102}
5103
5104/* release_task() removes task from tasklist, so we won't find dead tasks. */
5105static void migrate_dead_tasks(unsigned int dead_cpu)
5106{
70b97a7f 5107 struct rq *rq = cpu_rq(dead_cpu);
dd41f596 5108 struct task_struct *next;
48f24c4d 5109
dd41f596
IM
5110 for ( ; ; ) {
5111 if (!rq->nr_running)
5112 break;
5113 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5114 if (!next)
5115 break;
5116 migrate_dead(dead_cpu, next);
1da177e4
LT
5117 }
5118}
5119#endif /* CONFIG_HOTPLUG_CPU */
5120
5121/*
5122 * migration_call - callback that gets triggered when a CPU is added.
5123 * Here we can start up the necessary migration thread for the new CPU.
5124 */
48f24c4d
IM
5125static int __cpuinit
5126migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
1da177e4 5127{
1da177e4 5128 struct task_struct *p;
48f24c4d 5129 int cpu = (long)hcpu;
1da177e4 5130 unsigned long flags;
70b97a7f 5131 struct rq *rq;
1da177e4
LT
5132
5133 switch (action) {
5be9361c
GS
5134 case CPU_LOCK_ACQUIRE:
5135 mutex_lock(&sched_hotcpu_mutex);
5136 break;
5137
1da177e4 5138 case CPU_UP_PREPARE:
8bb78442 5139 case CPU_UP_PREPARE_FROZEN:
dd41f596 5140 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
1da177e4
LT
5141 if (IS_ERR(p))
5142 return NOTIFY_BAD;
5143 p->flags |= PF_NOFREEZE;
5144 kthread_bind(p, cpu);
5145 /* Must be high prio: stop_machine expects to yield to it. */
5146 rq = task_rq_lock(p, &flags);
dd41f596 5147 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
1da177e4
LT
5148 task_rq_unlock(rq, &flags);
5149 cpu_rq(cpu)->migration_thread = p;
5150 break;
48f24c4d 5151
1da177e4 5152 case CPU_ONLINE:
8bb78442 5153 case CPU_ONLINE_FROZEN:
1da177e4
LT
5154 /* Strictly unnecessary, as the first user will wake it. */
5155 wake_up_process(cpu_rq(cpu)->migration_thread);
5156 break;
48f24c4d 5157
1da177e4
LT
5158#ifdef CONFIG_HOTPLUG_CPU
5159 case CPU_UP_CANCELED:
8bb78442 5160 case CPU_UP_CANCELED_FROZEN:
fc75cdfa
HC
5161 if (!cpu_rq(cpu)->migration_thread)
5162 break;
1da177e4 5163 /* Unbind it from offline cpu so it can run. Fall thru. */
a4c4af7c
HC
5164 kthread_bind(cpu_rq(cpu)->migration_thread,
5165 any_online_cpu(cpu_online_map));
1da177e4
LT
5166 kthread_stop(cpu_rq(cpu)->migration_thread);
5167 cpu_rq(cpu)->migration_thread = NULL;
5168 break;
48f24c4d 5169
1da177e4 5170 case CPU_DEAD:
8bb78442 5171 case CPU_DEAD_FROZEN:
1da177e4
LT
5172 migrate_live_tasks(cpu);
5173 rq = cpu_rq(cpu);
5174 kthread_stop(rq->migration_thread);
5175 rq->migration_thread = NULL;
5176 /* Idle task back to normal (off runqueue, low prio) */
5177 rq = task_rq_lock(rq->idle, &flags);
dd41f596 5178 deactivate_task(rq, rq->idle, 0);
1da177e4 5179 rq->idle->static_prio = MAX_PRIO;
dd41f596
IM
5180 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5181 rq->idle->sched_class = &idle_sched_class;
1da177e4
LT
5182 migrate_dead_tasks(cpu);
5183 task_rq_unlock(rq, &flags);
5184 migrate_nr_uninterruptible(rq);
5185 BUG_ON(rq->nr_running != 0);
5186
5187 /* No need to migrate the tasks: it was best-effort if
5be9361c 5188 * they didn't take sched_hotcpu_mutex. Just wake up
1da177e4
LT
5189 * the requestors. */
5190 spin_lock_irq(&rq->lock);
5191 while (!list_empty(&rq->migration_queue)) {
70b97a7f
IM
5192 struct migration_req *req;
5193
1da177e4 5194 req = list_entry(rq->migration_queue.next,
70b97a7f 5195 struct migration_req, list);
1da177e4
LT
5196 list_del_init(&req->list);
5197 complete(&req->done);
5198 }
5199 spin_unlock_irq(&rq->lock);
5200 break;
5201#endif
5be9361c
GS
5202 case CPU_LOCK_RELEASE:
5203 mutex_unlock(&sched_hotcpu_mutex);
5204 break;
1da177e4
LT
5205 }
5206 return NOTIFY_OK;
5207}
5208
5209/* Register at highest priority so that task migration (migrate_all_tasks)
5210 * happens before everything else.
5211 */
26c2143b 5212static struct notifier_block __cpuinitdata migration_notifier = {
1da177e4
LT
5213 .notifier_call = migration_call,
5214 .priority = 10
5215};
5216
5217int __init migration_init(void)
5218{
5219 void *cpu = (void *)(long)smp_processor_id();
07dccf33 5220 int err;
48f24c4d
IM
5221
5222 /* Start one for the boot CPU: */
07dccf33
AM
5223 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5224 BUG_ON(err == NOTIFY_BAD);
1da177e4
LT
5225 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5226 register_cpu_notifier(&migration_notifier);
48f24c4d 5227
1da177e4
LT
5228 return 0;
5229}
5230#endif
5231
5232#ifdef CONFIG_SMP
476f3534
CL
5233
5234/* Number of possible processor ids */
5235int nr_cpu_ids __read_mostly = NR_CPUS;
5236EXPORT_SYMBOL(nr_cpu_ids);
5237
1a20ff27 5238#undef SCHED_DOMAIN_DEBUG
1da177e4
LT
5239#ifdef SCHED_DOMAIN_DEBUG
5240static void sched_domain_debug(struct sched_domain *sd, int cpu)
5241{
5242 int level = 0;
5243
41c7ce9a
NP
5244 if (!sd) {
5245 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5246 return;
5247 }
5248
1da177e4
LT
5249 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5250
5251 do {
5252 int i;
5253 char str[NR_CPUS];
5254 struct sched_group *group = sd->groups;
5255 cpumask_t groupmask;
5256
5257 cpumask_scnprintf(str, NR_CPUS, sd->span);
5258 cpus_clear(groupmask);
5259
5260 printk(KERN_DEBUG);
5261 for (i = 0; i < level + 1; i++)
5262 printk(" ");
5263 printk("domain %d: ", level);
5264
5265 if (!(sd->flags & SD_LOAD_BALANCE)) {
5266 printk("does not load-balance\n");
5267 if (sd->parent)
33859f7f
MOS
5268 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5269 " has parent");
1da177e4
LT
5270 break;
5271 }
5272
5273 printk("span %s\n", str);
5274
5275 if (!cpu_isset(cpu, sd->span))
33859f7f
MOS
5276 printk(KERN_ERR "ERROR: domain->span does not contain "
5277 "CPU%d\n", cpu);
1da177e4 5278 if (!cpu_isset(cpu, group->cpumask))
33859f7f
MOS
5279 printk(KERN_ERR "ERROR: domain->groups does not contain"
5280 " CPU%d\n", cpu);
1da177e4
LT
5281
5282 printk(KERN_DEBUG);
5283 for (i = 0; i < level + 2; i++)
5284 printk(" ");
5285 printk("groups:");
5286 do {
5287 if (!group) {
5288 printk("\n");
5289 printk(KERN_ERR "ERROR: group is NULL\n");
5290 break;
5291 }
5292
5517d86b 5293 if (!group->__cpu_power) {
1da177e4 5294 printk("\n");
33859f7f
MOS
5295 printk(KERN_ERR "ERROR: domain->cpu_power not "
5296 "set\n");
1da177e4
LT
5297 }
5298
5299 if (!cpus_weight(group->cpumask)) {
5300 printk("\n");
5301 printk(KERN_ERR "ERROR: empty group\n");
5302 }
5303
5304 if (cpus_intersects(groupmask, group->cpumask)) {
5305 printk("\n");
5306 printk(KERN_ERR "ERROR: repeated CPUs\n");
5307 }
5308
5309 cpus_or(groupmask, groupmask, group->cpumask);
5310
5311 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5312 printk(" %s", str);
5313
5314 group = group->next;
5315 } while (group != sd->groups);
5316 printk("\n");
5317
5318 if (!cpus_equal(sd->span, groupmask))
33859f7f
MOS
5319 printk(KERN_ERR "ERROR: groups don't span "
5320 "domain->span\n");
1da177e4
LT
5321
5322 level++;
5323 sd = sd->parent;
33859f7f
MOS
5324 if (!sd)
5325 continue;
1da177e4 5326
33859f7f
MOS
5327 if (!cpus_subset(groupmask, sd->span))
5328 printk(KERN_ERR "ERROR: parent span is not a superset "
5329 "of domain->span\n");
1da177e4
LT
5330
5331 } while (sd);
5332}
5333#else
48f24c4d 5334# define sched_domain_debug(sd, cpu) do { } while (0)
1da177e4
LT
5335#endif
5336
1a20ff27 5337static int sd_degenerate(struct sched_domain *sd)
245af2c7
SS
5338{
5339 if (cpus_weight(sd->span) == 1)
5340 return 1;
5341
5342 /* Following flags need at least 2 groups */
5343 if (sd->flags & (SD_LOAD_BALANCE |
5344 SD_BALANCE_NEWIDLE |
5345 SD_BALANCE_FORK |
89c4710e
SS
5346 SD_BALANCE_EXEC |
5347 SD_SHARE_CPUPOWER |
5348 SD_SHARE_PKG_RESOURCES)) {
245af2c7
SS
5349 if (sd->groups != sd->groups->next)
5350 return 0;
5351 }
5352
5353 /* Following flags don't use groups */
5354 if (sd->flags & (SD_WAKE_IDLE |
5355 SD_WAKE_AFFINE |
5356 SD_WAKE_BALANCE))
5357 return 0;
5358
5359 return 1;
5360}
5361
48f24c4d
IM
5362static int
5363sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
245af2c7
SS
5364{
5365 unsigned long cflags = sd->flags, pflags = parent->flags;
5366
5367 if (sd_degenerate(parent))
5368 return 1;
5369
5370 if (!cpus_equal(sd->span, parent->span))
5371 return 0;
5372
5373 /* Does parent contain flags not in child? */
5374 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5375 if (cflags & SD_WAKE_AFFINE)
5376 pflags &= ~SD_WAKE_BALANCE;
5377 /* Flags needing groups don't count if only 1 group in parent */
5378 if (parent->groups == parent->groups->next) {
5379 pflags &= ~(SD_LOAD_BALANCE |
5380 SD_BALANCE_NEWIDLE |
5381 SD_BALANCE_FORK |
89c4710e
SS
5382 SD_BALANCE_EXEC |
5383 SD_SHARE_CPUPOWER |
5384 SD_SHARE_PKG_RESOURCES);
245af2c7
SS
5385 }
5386 if (~cflags & pflags)
5387 return 0;
5388
5389 return 1;
5390}
5391
1da177e4
LT
5392/*
5393 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5394 * hold the hotplug lock.
5395 */
9c1cfda2 5396static void cpu_attach_domain(struct sched_domain *sd, int cpu)
1da177e4 5397{
70b97a7f 5398 struct rq *rq = cpu_rq(cpu);
245af2c7
SS
5399 struct sched_domain *tmp;
5400
5401 /* Remove the sched domains which do not contribute to scheduling. */
5402 for (tmp = sd; tmp; tmp = tmp->parent) {
5403 struct sched_domain *parent = tmp->parent;
5404 if (!parent)
5405 break;
1a848870 5406 if (sd_parent_degenerate(tmp, parent)) {
245af2c7 5407 tmp->parent = parent->parent;
1a848870
SS
5408 if (parent->parent)
5409 parent->parent->child = tmp;
5410 }
245af2c7
SS
5411 }
5412
1a848870 5413 if (sd && sd_degenerate(sd)) {
245af2c7 5414 sd = sd->parent;
1a848870
SS
5415 if (sd)
5416 sd->child = NULL;
5417 }
1da177e4
LT
5418
5419 sched_domain_debug(sd, cpu);
5420
674311d5 5421 rcu_assign_pointer(rq->sd, sd);
1da177e4
LT
5422}
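/*
 * Worked illustration (not from the source): if the chain passed in is
 * SMT -> MC -> phys and the MC level turns out to span the same CPUs as SMT
 * while adding no flags beyond it, sd_parent_degenerate() lets the loop above
 * splice MC out, leaving SMT -> phys attached to the runqueue; the ->child
 * back-pointers are patched up accordingly.
 */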
5423
5424/* cpus with isolated domains */
67af63a6 5425static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
1da177e4
LT
5426
5427/* Setup the mask of cpus configured for isolated domains */
5428static int __init isolated_cpu_setup(char *str)
5429{
5430 int ints[NR_CPUS], i;
5431
5432 str = get_options(str, ARRAY_SIZE(ints), ints);
5433 cpus_clear(cpu_isolated_map);
5434 for (i = 1; i <= ints[0]; i++)
5435 if (ints[i] < NR_CPUS)
5436 cpu_set(ints[i], cpu_isolated_map);
5437 return 1;
5438}
5439
5440__setup ("isolcpus=", isolated_cpu_setup);
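/*
 * Usage sketch (an aside, not from this file): isolated CPUs are requested on
 * the kernel command line, e.g.
 *
 *	isolcpus=2,3
 *
 * which keeps CPUs 2 and 3 out of the default sched-domain setup below, so
 * the load balancer never migrates tasks onto them; work has to be placed
 * there explicitly, for instance via sched_setaffinity() or cpusets.
 */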
5441
5442/*
6711cab4
SS
5443 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5444 * to a function which identifies what group (along with its sched_group) a
5445 * CPU belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
5446 * (because we keep track of the groups covered with a cpumask_t).
1da177e4
LT
5447 *
5448 * init_sched_build_groups will build a circular linked list of the groups
5449 * covered by the given span, and will set each group's ->cpumask correctly,
5450 * and ->cpu_power to 0.
5451 */
a616058b 5452static void
6711cab4
SS
5453init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5454 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5455 struct sched_group **sg))
1da177e4
LT
5456{
5457 struct sched_group *first = NULL, *last = NULL;
5458 cpumask_t covered = CPU_MASK_NONE;
5459 int i;
5460
5461 for_each_cpu_mask(i, span) {
6711cab4
SS
5462 struct sched_group *sg;
5463 int group = group_fn(i, cpu_map, &sg);
1da177e4
LT
5464 int j;
5465
5466 if (cpu_isset(i, covered))
5467 continue;
5468
5469 sg->cpumask = CPU_MASK_NONE;
5517d86b 5470 sg->__cpu_power = 0;
1da177e4
LT
5471
5472 for_each_cpu_mask(j, span) {
6711cab4 5473 if (group_fn(j, cpu_map, NULL) != group)
1da177e4
LT
5474 continue;
5475
5476 cpu_set(j, covered);
5477 cpu_set(j, sg->cpumask);
5478 }
5479 if (!first)
5480 first = sg;
5481 if (last)
5482 last->next = sg;
5483 last = sg;
5484 }
5485 last->next = first;
5486}
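/*
 * Illustrative sketch only (not part of this file): the simplest possible
 * group_fn places every CPU in a group of its own, which is essentially what
 * cpu_to_cpu_group() below does for the SMT level.  The per-CPU array
 * "example_groups" is a made-up name used only for this illustration.
 */
static DEFINE_PER_CPU(struct sched_group, example_groups);

static int example_group_fn(int cpu, const cpumask_t *cpu_map,
			    struct sched_group **sg)
{
	if (sg)
		*sg = &per_cpu(example_groups, cpu);
	return cpu;
}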
5487
9c1cfda2 5488#define SD_NODES_PER_DOMAIN 16
1da177e4 5489
9c1cfda2 5490#ifdef CONFIG_NUMA
198e2f18 5491
9c1cfda2
JH
5492/**
5493 * find_next_best_node - find the next node to include in a sched_domain
5494 * @node: node whose sched_domain we're building
5495 * @used_nodes: nodes already in the sched_domain
5496 *
5497 * Find the next node to include in a given scheduling domain. Simply
5498 * finds the closest node not already in the @used_nodes map.
5499 *
5500 * Should use nodemask_t.
5501 */
5502static int find_next_best_node(int node, unsigned long *used_nodes)
5503{
5504 int i, n, val, min_val, best_node = 0;
5505
5506 min_val = INT_MAX;
5507
5508 for (i = 0; i < MAX_NUMNODES; i++) {
5509 /* Start at @node */
5510 n = (node + i) % MAX_NUMNODES;
5511
5512 if (!nr_cpus_node(n))
5513 continue;
5514
5515 /* Skip already used nodes */
5516 if (test_bit(n, used_nodes))
5517 continue;
5518
5519 /* Simple min distance search */
5520 val = node_distance(node, n);
5521
5522 if (val < min_val) {
5523 min_val = val;
5524 best_node = n;
5525 }
5526 }
5527
5528 set_bit(best_node, used_nodes);
5529 return best_node;
5530}
5531
5532/**
5533 * sched_domain_node_span - get a cpumask for a node's sched_domain
5534 * @node: node whose cpumask we're constructing
5535 * @size: number of nodes to include in this span
5536 *
5537 * Given a node, construct a good cpumask for its sched_domain to span. It
5538 * should be one that prevents unnecessary balancing, but also spreads tasks
5539 * out optimally.
5540 */
5541static cpumask_t sched_domain_node_span(int node)
5542{
9c1cfda2 5543 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
48f24c4d
IM
5544 cpumask_t span, nodemask;
5545 int i;
9c1cfda2
JH
5546
5547 cpus_clear(span);
5548 bitmap_zero(used_nodes, MAX_NUMNODES);
5549
5550 nodemask = node_to_cpumask(node);
5551 cpus_or(span, span, nodemask);
5552 set_bit(node, used_nodes);
5553
5554 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5555 int next_node = find_next_best_node(node, used_nodes);
48f24c4d 5556
9c1cfda2
JH
5557 nodemask = node_to_cpumask(next_node);
5558 cpus_or(span, span, nodemask);
5559 }
5560
5561 return span;
5562}
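/*
 * Worked illustration (an aside, not from the source): with
 * SD_NODES_PER_DOMAIN == 16, the span for node 0 on a 4-node machine simply
 * covers the CPUs of all four nodes, added in order of increasing
 * node_distance() from node 0.  On a larger machine only the node itself plus
 * its 15 nearest neighbours are included, keeping each NUMA sched_domain to a
 * bounded size.
 */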
5563#endif
5564
5c45bf27 5565int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
48f24c4d 5566
9c1cfda2 5567/*
48f24c4d 5568 * SMT sched-domains:
9c1cfda2 5569 */
1da177e4
LT
5570#ifdef CONFIG_SCHED_SMT
5571static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
6711cab4 5572static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
48f24c4d 5573
6711cab4
SS
5574static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
5575 struct sched_group **sg)
1da177e4 5576{
6711cab4
SS
5577 if (sg)
5578 *sg = &per_cpu(sched_group_cpus, cpu);
1da177e4
LT
5579 return cpu;
5580}
5581#endif
5582
48f24c4d
IM
5583/*
5584 * multi-core sched-domains:
5585 */
1e9f28fa
SS
5586#ifdef CONFIG_SCHED_MC
5587static DEFINE_PER_CPU(struct sched_domain, core_domains);
6711cab4 5588static DEFINE_PER_CPU(struct sched_group, sched_group_core);
1e9f28fa
SS
5589#endif
5590
5591#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6711cab4
SS
5592static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5593 struct sched_group **sg)
1e9f28fa 5594{
6711cab4 5595 int group;
a616058b
SS
5596 cpumask_t mask = cpu_sibling_map[cpu];
5597 cpus_and(mask, mask, *cpu_map);
6711cab4
SS
5598 group = first_cpu(mask);
5599 if (sg)
5600 *sg = &per_cpu(sched_group_core, group);
5601 return group;
1e9f28fa
SS
5602}
5603#elif defined(CONFIG_SCHED_MC)
6711cab4
SS
5604static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5605 struct sched_group **sg)
1e9f28fa 5606{
6711cab4
SS
5607 if (sg)
5608 *sg = &per_cpu(sched_group_core, cpu);
1e9f28fa
SS
5609 return cpu;
5610}
5611#endif
5612
1da177e4 5613static DEFINE_PER_CPU(struct sched_domain, phys_domains);
6711cab4 5614static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
48f24c4d 5615
6711cab4
SS
5616static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
5617 struct sched_group **sg)
1da177e4 5618{
6711cab4 5619 int group;
48f24c4d 5620#ifdef CONFIG_SCHED_MC
1e9f28fa 5621 cpumask_t mask = cpu_coregroup_map(cpu);
a616058b 5622 cpus_and(mask, mask, *cpu_map);
6711cab4 5623 group = first_cpu(mask);
1e9f28fa 5624#elif defined(CONFIG_SCHED_SMT)
a616058b
SS
5625 cpumask_t mask = cpu_sibling_map[cpu];
5626 cpus_and(mask, mask, *cpu_map);
6711cab4 5627 group = first_cpu(mask);
1da177e4 5628#else
6711cab4 5629 group = cpu;
1da177e4 5630#endif
6711cab4
SS
5631 if (sg)
5632 *sg = &per_cpu(sched_group_phys, group);
5633 return group;
1da177e4
LT
5634}
5635
5636#ifdef CONFIG_NUMA
1da177e4 5637/*
9c1cfda2
JH
5638 * The init_sched_build_groups can't handle what we want to do with node
5639 * groups, so roll our own. Now each node has its own list of groups which
5640 * gets dynamically allocated.
1da177e4 5641 */
9c1cfda2 5642static DEFINE_PER_CPU(struct sched_domain, node_domains);
d1b55138 5643static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
1da177e4 5644
9c1cfda2 5645static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
6711cab4 5646static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
9c1cfda2 5647
6711cab4
SS
5648static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
5649 struct sched_group **sg)
9c1cfda2 5650{
6711cab4
SS
5651 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
5652 int group;
5653
5654 cpus_and(nodemask, nodemask, *cpu_map);
5655 group = first_cpu(nodemask);
5656
5657 if (sg)
5658 *sg = &per_cpu(sched_group_allnodes, group);
5659 return group;
1da177e4 5660}
6711cab4 5661
08069033
SS
5662static void init_numa_sched_groups_power(struct sched_group *group_head)
5663{
5664 struct sched_group *sg = group_head;
5665 int j;
5666
5667 if (!sg)
5668 return;
5669next_sg:
5670 for_each_cpu_mask(j, sg->cpumask) {
5671 struct sched_domain *sd;
5672
5673 sd = &per_cpu(phys_domains, j);
5674 if (j != first_cpu(sd->groups->cpumask)) {
5675 /*
5676 * Only add "power" once for each
5677 * physical package.
5678 */
5679 continue;
5680 }
5681
5517d86b 5682 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
08069033
SS
5683 }
5684 sg = sg->next;
5685 if (sg != group_head)
5686 goto next_sg;
5687}
1da177e4
LT
5688#endif
5689
a616058b 5690#ifdef CONFIG_NUMA
51888ca2
SV
5691/* Free memory allocated for various sched_group structures */
5692static void free_sched_groups(const cpumask_t *cpu_map)
5693{
a616058b 5694 int cpu, i;
51888ca2
SV
5695
5696 for_each_cpu_mask(cpu, *cpu_map) {
51888ca2
SV
5697 struct sched_group **sched_group_nodes
5698 = sched_group_nodes_bycpu[cpu];
5699
51888ca2
SV
5700 if (!sched_group_nodes)
5701 continue;
5702
5703 for (i = 0; i < MAX_NUMNODES; i++) {
5704 cpumask_t nodemask = node_to_cpumask(i);
5705 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5706
5707 cpus_and(nodemask, nodemask, *cpu_map);
5708 if (cpus_empty(nodemask))
5709 continue;
5710
5711 if (sg == NULL)
5712 continue;
5713 sg = sg->next;
5714next_sg:
5715 oldsg = sg;
5716 sg = sg->next;
5717 kfree(oldsg);
5718 if (oldsg != sched_group_nodes[i])
5719 goto next_sg;
5720 }
5721 kfree(sched_group_nodes);
5722 sched_group_nodes_bycpu[cpu] = NULL;
5723 }
51888ca2 5724}
a616058b
SS
5725#else
5726static void free_sched_groups(const cpumask_t *cpu_map)
5727{
5728}
5729#endif
51888ca2 5730
89c4710e
SS
5731/*
5732 * Initialize sched groups cpu_power.
5733 *
5734 * cpu_power indicates the capacity of a sched group, which is used while
5735 * distributing the load between different sched groups in a sched domain.
5736 * Typically cpu_power for all the groups in a sched domain will be the same
5737 * unless there are asymmetries in the topology. If there are asymmetries, the
5738 * group having more cpu_power will pick up more load compared to the group
5739 * having less cpu_power.
5740 *
5741 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
5742 * the maximum number of tasks a group can handle in the presence of other idle
5743 * or lightly loaded groups in the same sched domain.
5744 */
5745static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5746{
5747 struct sched_domain *child;
5748 struct sched_group *group;
5749
5750 WARN_ON(!sd || !sd->groups);
5751
5752 if (cpu != first_cpu(sd->groups->cpumask))
5753 return;
5754
5755 child = sd->child;
5756
5517d86b
ED
5757 sd->groups->__cpu_power = 0;
5758
89c4710e
SS
5759 /*
5760 * For the performance policy, if the groups in the child domain share
5761 * resources (for example cores sharing some portions of the cache hierarchy,
5762 * or SMT), then set this domain's group cpu_power such that each group
5763 * can handle only one task when there are other idle groups in the
5764 * same sched domain.
5765 */
5766 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
5767 (child->flags &
5768 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
5517d86b 5769 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
89c4710e
SS
5770 return;
5771 }
5772
89c4710e
SS
5773 /*
5774 * Add the cpu_power of each child group to this group's cpu_power.
5775 */
5776 group = child->groups;
5777 do {
5517d86b 5778 sg_inc_cpu_power(sd->groups, group->__cpu_power);
89c4710e
SS
5779 group = group->next;
5780 } while (group != child->groups);
5781}
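/*
 * Worked illustration (an aside, not from the source): with the default
 * performance policy, a group whose child groups share resources (SMT
 * siblings, or cores sharing a cache) is clamped to a single
 * SCHED_LOAD_SCALE above, i.e. "one task per group while other groups sit
 * idle".  Only when that clamp does not apply (no resource sharing in the
 * child, or SD_POWERSAVINGS_BALANCE set on the domain) does the loop below
 * sum the children; a group over two child groups of SCHED_LOAD_SCALE each
 * then ends up with __cpu_power == 2 * SCHED_LOAD_SCALE.
 */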
5782
1da177e4 5783/*
1a20ff27
DG
5784 * Build sched domains for a given set of cpus and attach the sched domains
5785 * to the individual cpus
1da177e4 5786 */
51888ca2 5787static int build_sched_domains(const cpumask_t *cpu_map)
1da177e4
LT
5788{
5789 int i;
d1b55138
JH
5790#ifdef CONFIG_NUMA
5791 struct sched_group **sched_group_nodes = NULL;
6711cab4 5792 int sd_allnodes = 0;
d1b55138
JH
5793
5794 /*
5795 * Allocate the per-node list of sched groups
5796 */
dd41f596 5797 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
d3a5aa98 5798 GFP_KERNEL);
d1b55138
JH
5799 if (!sched_group_nodes) {
5800 printk(KERN_WARNING "Can not alloc sched group node list\n");
51888ca2 5801 return -ENOMEM;
d1b55138
JH
5802 }
5803 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5804#endif
1da177e4
LT
5805
5806 /*
1a20ff27 5807 * Set up domains for cpus specified by the cpu_map.
1da177e4 5808 */
1a20ff27 5809 for_each_cpu_mask(i, *cpu_map) {
1da177e4
LT
5810 struct sched_domain *sd = NULL, *p;
5811 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
5812
1a20ff27 5813 cpus_and(nodemask, nodemask, *cpu_map);
1da177e4
LT
5814
5815#ifdef CONFIG_NUMA
dd41f596
IM
5816 if (cpus_weight(*cpu_map) >
5817 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
9c1cfda2
JH
5818 sd = &per_cpu(allnodes_domains, i);
5819 *sd = SD_ALLNODES_INIT;
5820 sd->span = *cpu_map;
6711cab4 5821 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
9c1cfda2 5822 p = sd;
6711cab4 5823 sd_allnodes = 1;
9c1cfda2
JH
5824 } else
5825 p = NULL;
5826
1da177e4 5827 sd = &per_cpu(node_domains, i);
1da177e4 5828 *sd = SD_NODE_INIT;
9c1cfda2
JH
5829 sd->span = sched_domain_node_span(cpu_to_node(i));
5830 sd->parent = p;
1a848870
SS
5831 if (p)
5832 p->child = sd;
9c1cfda2 5833 cpus_and(sd->span, sd->span, *cpu_map);
1da177e4
LT
5834#endif
5835
5836 p = sd;
5837 sd = &per_cpu(phys_domains, i);
1da177e4
LT
5838 *sd = SD_CPU_INIT;
5839 sd->span = nodemask;
5840 sd->parent = p;
1a848870
SS
5841 if (p)
5842 p->child = sd;
6711cab4 5843 cpu_to_phys_group(i, cpu_map, &sd->groups);
1da177e4 5844
1e9f28fa
SS
5845#ifdef CONFIG_SCHED_MC
5846 p = sd;
5847 sd = &per_cpu(core_domains, i);
1e9f28fa
SS
5848 *sd = SD_MC_INIT;
5849 sd->span = cpu_coregroup_map(i);
5850 cpus_and(sd->span, sd->span, *cpu_map);
5851 sd->parent = p;
1a848870 5852 p->child = sd;
6711cab4 5853 cpu_to_core_group(i, cpu_map, &sd->groups);
1e9f28fa
SS
5854#endif
5855
1da177e4
LT
5856#ifdef CONFIG_SCHED_SMT
5857 p = sd;
5858 sd = &per_cpu(cpu_domains, i);
1da177e4
LT
5859 *sd = SD_SIBLING_INIT;
5860 sd->span = cpu_sibling_map[i];
1a20ff27 5861 cpus_and(sd->span, sd->span, *cpu_map);
1da177e4 5862 sd->parent = p;
1a848870 5863 p->child = sd;
6711cab4 5864 cpu_to_cpu_group(i, cpu_map, &sd->groups);
1da177e4
LT
5865#endif
5866 }
5867
5868#ifdef CONFIG_SCHED_SMT
5869 /* Set up CPU (sibling) groups */
9c1cfda2 5870 for_each_cpu_mask(i, *cpu_map) {
1da177e4 5871 cpumask_t this_sibling_map = cpu_sibling_map[i];
1a20ff27 5872 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
1da177e4
LT
5873 if (i != first_cpu(this_sibling_map))
5874 continue;
5875
dd41f596
IM
5876 init_sched_build_groups(this_sibling_map, cpu_map,
5877 &cpu_to_cpu_group);
1da177e4
LT
5878 }
5879#endif
5880
1e9f28fa
SS
5881#ifdef CONFIG_SCHED_MC
5882 /* Set up multi-core groups */
5883 for_each_cpu_mask(i, *cpu_map) {
5884 cpumask_t this_core_map = cpu_coregroup_map(i);
5885 cpus_and(this_core_map, this_core_map, *cpu_map);
5886 if (i != first_cpu(this_core_map))
5887 continue;
dd41f596
IM
5888 init_sched_build_groups(this_core_map, cpu_map,
5889 &cpu_to_core_group);
1e9f28fa
SS
5890 }
5891#endif
5892
1da177e4
LT
5893 /* Set up physical groups */
5894 for (i = 0; i < MAX_NUMNODES; i++) {
5895 cpumask_t nodemask = node_to_cpumask(i);
5896
1a20ff27 5897 cpus_and(nodemask, nodemask, *cpu_map);
1da177e4
LT
5898 if (cpus_empty(nodemask))
5899 continue;
5900
6711cab4 5901 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
1da177e4
LT
5902 }
5903
5904#ifdef CONFIG_NUMA
5905 /* Set up node groups */
6711cab4 5906 if (sd_allnodes)
dd41f596
IM
5907 init_sched_build_groups(*cpu_map, cpu_map,
5908 &cpu_to_allnodes_group);
9c1cfda2
JH
5909
5910 for (i = 0; i < MAX_NUMNODES; i++) {
5911 /* Set up node groups */
5912 struct sched_group *sg, *prev;
5913 cpumask_t nodemask = node_to_cpumask(i);
5914 cpumask_t domainspan;
5915 cpumask_t covered = CPU_MASK_NONE;
5916 int j;
5917
5918 cpus_and(nodemask, nodemask, *cpu_map);
d1b55138
JH
5919 if (cpus_empty(nodemask)) {
5920 sched_group_nodes[i] = NULL;
9c1cfda2 5921 continue;
d1b55138 5922 }
9c1cfda2
JH
5923
5924 domainspan = sched_domain_node_span(i);
5925 cpus_and(domainspan, domainspan, *cpu_map);
5926
15f0b676 5927 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
51888ca2
SV
5928 if (!sg) {
5929 printk(KERN_WARNING "Can not alloc domain group for "
5930 "node %d\n", i);
5931 goto error;
5932 }
9c1cfda2
JH
5933 sched_group_nodes[i] = sg;
5934 for_each_cpu_mask(j, nodemask) {
5935 struct sched_domain *sd;
9761eea8 5936
9c1cfda2
JH
5937 sd = &per_cpu(node_domains, j);
5938 sd->groups = sg;
9c1cfda2 5939 }
5517d86b 5940 sg->__cpu_power = 0;
9c1cfda2 5941 sg->cpumask = nodemask;
51888ca2 5942 sg->next = sg;
9c1cfda2
JH
5943 cpus_or(covered, covered, nodemask);
5944 prev = sg;
5945
5946 for (j = 0; j < MAX_NUMNODES; j++) {
5947 cpumask_t tmp, notcovered;
5948 int n = (i + j) % MAX_NUMNODES;
5949
5950 cpus_complement(notcovered, covered);
5951 cpus_and(tmp, notcovered, *cpu_map);
5952 cpus_and(tmp, tmp, domainspan);
5953 if (cpus_empty(tmp))
5954 break;
5955
5956 nodemask = node_to_cpumask(n);
5957 cpus_and(tmp, tmp, nodemask);
5958 if (cpus_empty(tmp))
5959 continue;
5960
15f0b676
SV
5961 sg = kmalloc_node(sizeof(struct sched_group),
5962 GFP_KERNEL, i);
9c1cfda2
JH
5963 if (!sg) {
5964 printk(KERN_WARNING
5965 "Can not alloc domain group for node %d\n", j);
51888ca2 5966 goto error;
9c1cfda2 5967 }
5517d86b 5968 sg->__cpu_power = 0;
9c1cfda2 5969 sg->cpumask = tmp;
51888ca2 5970 sg->next = prev->next;
9c1cfda2
JH
5971 cpus_or(covered, covered, tmp);
5972 prev->next = sg;
5973 prev = sg;
5974 }
9c1cfda2 5975 }
1da177e4
LT
5976#endif
5977
5978 /* Calculate CPU power for physical packages and nodes */
5c45bf27 5979#ifdef CONFIG_SCHED_SMT
1a20ff27 5980 for_each_cpu_mask(i, *cpu_map) {
dd41f596
IM
5981 struct sched_domain *sd = &per_cpu(cpu_domains, i);
5982
89c4710e 5983 init_sched_groups_power(i, sd);
5c45bf27 5984 }
1da177e4 5985#endif
1e9f28fa 5986#ifdef CONFIG_SCHED_MC
5c45bf27 5987 for_each_cpu_mask(i, *cpu_map) {
dd41f596
IM
5988 struct sched_domain *sd = &per_cpu(core_domains, i);
5989
89c4710e 5990 init_sched_groups_power(i, sd);
5c45bf27
SS
5991 }
5992#endif
1e9f28fa 5993
5c45bf27 5994 for_each_cpu_mask(i, *cpu_map) {
dd41f596
IM
5995 struct sched_domain *sd = &per_cpu(phys_domains, i);
5996
89c4710e 5997 init_sched_groups_power(i, sd);
1da177e4
LT
5998 }
5999
9c1cfda2 6000#ifdef CONFIG_NUMA
08069033
SS
6001 for (i = 0; i < MAX_NUMNODES; i++)
6002 init_numa_sched_groups_power(sched_group_nodes[i]);
9c1cfda2 6003
6711cab4
SS
6004 if (sd_allnodes) {
6005 struct sched_group *sg;
f712c0c7 6006
6711cab4 6007 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
f712c0c7
SS
6008 init_numa_sched_groups_power(sg);
6009 }
9c1cfda2
JH
6010#endif
6011
1da177e4 6012 /* Attach the domains */
1a20ff27 6013 for_each_cpu_mask(i, *cpu_map) {
1da177e4
LT
6014 struct sched_domain *sd;
6015#ifdef CONFIG_SCHED_SMT
6016 sd = &per_cpu(cpu_domains, i);
1e9f28fa
SS
6017#elif defined(CONFIG_SCHED_MC)
6018 sd = &per_cpu(core_domains, i);
1da177e4
LT
6019#else
6020 sd = &per_cpu(phys_domains, i);
6021#endif
6022 cpu_attach_domain(sd, i);
6023 }
51888ca2
SV
6024
6025 return 0;
6026
a616058b 6027#ifdef CONFIG_NUMA
51888ca2
SV
6028error:
6029 free_sched_groups(cpu_map);
6030 return -ENOMEM;
a616058b 6031#endif
1da177e4 6032}
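/*
 * Summary illustration (an aside, not from the source): with CONFIG_SCHED_SMT,
 * CONFIG_SCHED_MC and CONFIG_NUMA all enabled, each CPU in *cpu_map ends up
 * with a chain of domains, lowest level first:
 *
 *	cpu_domains   (SMT siblings)
 *	 -> core_domains   (cores of one package)
 *	  -> phys_domains   (CPUs of one node)
 *	   -> node_domains   (nearby nodes, up to SD_NODES_PER_DOMAIN wide)
 *	    -> allnodes_domains   (the whole map, only on large systems)
 *
 * cpu_attach_domain() then prunes any level that sd_degenerate() or
 * sd_parent_degenerate() shows to be redundant before attaching the chain to
 * the runqueue.
 */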
1a20ff27
DG
6033/*
6034 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6035 */
51888ca2 6036static int arch_init_sched_domains(const cpumask_t *cpu_map)
1a20ff27
DG
6037{
6038 cpumask_t cpu_default_map;
51888ca2 6039 int err;
1da177e4 6040
1a20ff27
DG
6041 /*
6042 * Setup mask for cpus without special case scheduling requirements.
6043 * For now this just excludes isolated cpus, but could be used to
6044 * exclude other special cases in the future.
6045 */
6046 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6047
51888ca2
SV
6048 err = build_sched_domains(&cpu_default_map);
6049
6050 return err;
1a20ff27
DG
6051}
6052
6053static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
1da177e4 6054{
51888ca2 6055 free_sched_groups(cpu_map);
9c1cfda2 6056}
1da177e4 6057
1a20ff27
DG
6058/*
6059 * Detach sched domains from a group of cpus specified in cpu_map
6060 * These cpus will now be attached to the NULL domain
6061 */
858119e1 6062static void detach_destroy_domains(const cpumask_t *cpu_map)
1a20ff27
DG
6063{
6064 int i;
6065
6066 for_each_cpu_mask(i, *cpu_map)
6067 cpu_attach_domain(NULL, i);
6068 synchronize_sched();
6069 arch_destroy_sched_domains(cpu_map);
6070}
6071
6072/*
6073 * Partition sched domains as specified by the cpumasks below.
6074 * This attaches all cpus from the cpumasks to the NULL domain,
6075 * waits for an RCU quiescent period, recalculates sched
6076 * domain information and then attaches them back to the
6077 * correct sched domains.
6078 * Call with the hotplug lock held.
6079 */
51888ca2 6080int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
1a20ff27
DG
6081{
6082 cpumask_t change_map;
51888ca2 6083 int err = 0;
1a20ff27
DG
6084
6085 cpus_and(*partition1, *partition1, cpu_online_map);
6086 cpus_and(*partition2, *partition2, cpu_online_map);
6087 cpus_or(change_map, *partition1, *partition2);
6088
6089 /* Detach sched domains from all of the affected cpus */
6090 detach_destroy_domains(&change_map);
6091 if (!cpus_empty(*partition1))
51888ca2
SV
6092 err = build_sched_domains(partition1);
6093 if (!err && !cpus_empty(*partition2))
6094 err = build_sched_domains(partition2);
6095
6096 return err;
1a20ff27
DG
6097}
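/*
 * Usage sketch (illustrative only, not part of this file): a caller that
 * wants two independent balancing sets, CPUs 0-1 and CPUs 2-3.  The hotplug
 * lock must already be held, as noted above; "example_partition" is a
 * made-up helper, not something wired into the build.
 */
static void example_partition(void)
{
	cpumask_t set_a = CPU_MASK_NONE, set_b = CPU_MASK_NONE;

	cpu_set(0, set_a);
	cpu_set(1, set_a);
	cpu_set(2, set_b);
	cpu_set(3, set_b);

	/* tear down and rebuild domains so the two sets balance separately */
	partition_sched_domains(&set_a, &set_b);
}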
6098
5c45bf27
SS
6099#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6100int arch_reinit_sched_domains(void)
6101{
6102 int err;
6103
5be9361c 6104 mutex_lock(&sched_hotcpu_mutex);
5c45bf27
SS
6105 detach_destroy_domains(&cpu_online_map);
6106 err = arch_init_sched_domains(&cpu_online_map);
5be9361c 6107 mutex_unlock(&sched_hotcpu_mutex);
5c45bf27
SS
6108
6109 return err;
6110}
6111
6112static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6113{
6114 int ret;
6115
6116 if (buf[0] != '0' && buf[0] != '1')
6117 return -EINVAL;
6118
6119 if (smt)
6120 sched_smt_power_savings = (buf[0] == '1');
6121 else
6122 sched_mc_power_savings = (buf[0] == '1');
6123
6124 ret = arch_reinit_sched_domains();
6125
6126 return ret ? ret : count;
6127}
6128
6129int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6130{
6131 int err = 0;
48f24c4d 6132
5c45bf27
SS
6133#ifdef CONFIG_SCHED_SMT
6134 if (smt_capable())
6135 err = sysfs_create_file(&cls->kset.kobj,
6136 &attr_sched_smt_power_savings.attr);
6137#endif
6138#ifdef CONFIG_SCHED_MC
6139 if (!err && mc_capable())
6140 err = sysfs_create_file(&cls->kset.kobj,
6141 &attr_sched_mc_power_savings.attr);
6142#endif
6143 return err;
6144}
6145#endif
6146
6147#ifdef CONFIG_SCHED_MC
6148static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6149{
6150 return sprintf(page, "%u\n", sched_mc_power_savings);
6151}
48f24c4d
IM
6152static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6153 const char *buf, size_t count)
5c45bf27
SS
6154{
6155 return sched_power_savings_store(buf, count, 0);
6156}
6157SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6158 sched_mc_power_savings_store);
6159#endif
6160
6161#ifdef CONFIG_SCHED_SMT
6162static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6163{
6164 return sprintf(page, "%u\n", sched_smt_power_savings);
6165}
48f24c4d
IM
6166static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6167 const char *buf, size_t count)
5c45bf27
SS
6168{
6169 return sched_power_savings_store(buf, count, 1);
6170}
6171SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6172 sched_smt_power_savings_store);
6173#endif
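/*
 * Usage note (an aside): once the attributes above are registered by
 * sched_create_sysfs_power_savings_entries(), the policies can be toggled
 * from userspace, e.g.
 *
 *	echo 1 > /sys/devices/system/cpu/sched_mc_power_savings
 *	echo 0 > /sys/devices/system/cpu/sched_smt_power_savings
 *
 * Each write rebuilds the sched domains via arch_reinit_sched_domains().
 */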
6174
1da177e4
LT
6175/*
6176 * Force a reinitialization of the sched domains hierarchy. The domains
6177 * and groups cannot be updated in place without racing with the balancing
41c7ce9a 6178 * code, so we temporarily attach all running cpus to the NULL domain
1da177e4
LT
6179 * which will prevent rebalancing while the sched domains are recalculated.
6180 */
6181static int update_sched_domains(struct notifier_block *nfb,
6182 unsigned long action, void *hcpu)
6183{
1da177e4
LT
6184 switch (action) {
6185 case CPU_UP_PREPARE:
8bb78442 6186 case CPU_UP_PREPARE_FROZEN:
1da177e4 6187 case CPU_DOWN_PREPARE:
8bb78442 6188 case CPU_DOWN_PREPARE_FROZEN:
1a20ff27 6189 detach_destroy_domains(&cpu_online_map);
1da177e4
LT
6190 return NOTIFY_OK;
6191
6192 case CPU_UP_CANCELED:
8bb78442 6193 case CPU_UP_CANCELED_FROZEN:
1da177e4 6194 case CPU_DOWN_FAILED:
8bb78442 6195 case CPU_DOWN_FAILED_FROZEN:
1da177e4 6196 case CPU_ONLINE:
8bb78442 6197 case CPU_ONLINE_FROZEN:
1da177e4 6198 case CPU_DEAD:
8bb78442 6199 case CPU_DEAD_FROZEN:
1da177e4
LT
6200 /*
6201 * Fall through and re-initialise the domains.
6202 */
6203 break;
6204 default:
6205 return NOTIFY_DONE;
6206 }
6207
6208 /* The hotplug lock is already held by cpu_up/cpu_down */
1a20ff27 6209 arch_init_sched_domains(&cpu_online_map);
1da177e4
LT
6210
6211 return NOTIFY_OK;
6212}
1da177e4
LT
6213
6214void __init sched_init_smp(void)
6215{
5c1e1767
NP
6216 cpumask_t non_isolated_cpus;
6217
5be9361c 6218 mutex_lock(&sched_hotcpu_mutex);
1a20ff27 6219 arch_init_sched_domains(&cpu_online_map);
e5e5673f 6220 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
5c1e1767
NP
6221 if (cpus_empty(non_isolated_cpus))
6222 cpu_set(smp_processor_id(), non_isolated_cpus);
5be9361c 6223 mutex_unlock(&sched_hotcpu_mutex);
1da177e4
LT
6224 /* XXX: Theoretical race here - CPU may be hotplugged now */
6225 hotcpu_notifier(update_sched_domains, 0);
5c1e1767
NP
6226
6227 /* Move init over to a non-isolated CPU */
6228 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6229 BUG();
dd41f596 6230 sched_init_granularity();
1da177e4
LT
6231}
6232#else
6233void __init sched_init_smp(void)
6234{
dd41f596 6235 sched_init_granularity();
1da177e4
LT
6236}
6237#endif /* CONFIG_SMP */
6238
6239int in_sched_functions(unsigned long addr)
6240{
6241 /* Linker adds these: start and end of __sched functions */
6242 extern char __sched_text_start[], __sched_text_end[];
48f24c4d 6243
1da177e4
LT
6244 return in_lock_functions(addr) ||
6245 (addr >= (unsigned long)__sched_text_start
6246 && addr < (unsigned long)__sched_text_end);
6247}
6248
dd41f596
IM
6249static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6250{
6251 cfs_rq->tasks_timeline = RB_ROOT;
6252 cfs_rq->fair_clock = 1;
6253#ifdef CONFIG_FAIR_GROUP_SCHED
6254 cfs_rq->rq = rq;
6255#endif
6256}
6257
1da177e4
LT
6258void __init sched_init(void)
6259{
dd41f596 6260 u64 now = sched_clock();
476f3534 6261 int highest_cpu = 0;
dd41f596
IM
6262 int i, j;
6263
6264 /*
6265 * Link up the scheduling class hierarchy:
6266 */
6267 rt_sched_class.next = &fair_sched_class;
6268 fair_sched_class.next = &idle_sched_class;
6269 idle_sched_class.next = NULL;
1da177e4 6270
0a945022 6271 for_each_possible_cpu(i) {
dd41f596 6272 struct rt_prio_array *array;
70b97a7f 6273 struct rq *rq;
1da177e4
LT
6274
6275 rq = cpu_rq(i);
6276 spin_lock_init(&rq->lock);
fcb99371 6277 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
7897986b 6278 rq->nr_running = 0;
dd41f596
IM
6279 rq->clock = 1;
6280 init_cfs_rq(&rq->cfs, rq);
6281#ifdef CONFIG_FAIR_GROUP_SCHED
6282 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6283 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6284#endif
6285 rq->ls.load_update_last = now;
6286 rq->ls.load_update_start = now;
1da177e4 6287
dd41f596
IM
6288 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6289 rq->cpu_load[j] = 0;
1da177e4 6290#ifdef CONFIG_SMP
41c7ce9a 6291 rq->sd = NULL;
1da177e4 6292 rq->active_balance = 0;
dd41f596 6293 rq->next_balance = jiffies;
1da177e4 6294 rq->push_cpu = 0;
0a2966b4 6295 rq->cpu = i;
1da177e4
LT
6296 rq->migration_thread = NULL;
6297 INIT_LIST_HEAD(&rq->migration_queue);
6298#endif
6299 atomic_set(&rq->nr_iowait, 0);
6300
dd41f596
IM
6301 array = &rq->rt.active;
6302 for (j = 0; j < MAX_RT_PRIO; j++) {
6303 INIT_LIST_HEAD(array->queue + j);
6304 __clear_bit(j, array->bitmap);
1da177e4 6305 }
476f3534 6306 highest_cpu = i;
dd41f596
IM
6307 /* delimiter for bitsearch: */
6308 __set_bit(MAX_RT_PRIO, array->bitmap);
1da177e4
LT
6309 }
6310
2dd73a4f 6311 set_load_weight(&init_task);
b50f60ce 6312
c9819f45 6313#ifdef CONFIG_SMP
476f3534 6314 nr_cpu_ids = highest_cpu + 1;
c9819f45
CL
6315 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6316#endif
6317
b50f60ce
HC
6318#ifdef CONFIG_RT_MUTEXES
6319 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6320#endif
6321
1da177e4
LT
6322 /*
6323 * The boot idle thread does lazy MMU switching as well:
6324 */
6325 atomic_inc(&init_mm.mm_count);
6326 enter_lazy_tlb(&init_mm, current);
6327
6328 /*
6329 * Make us the idle thread. Technically, schedule() should not be
6330 * called from this thread, however somewhere below it might be,
6331 * but because we are the idle thread, we just pick up running again
6332 * when this runqueue becomes "idle".
6333 */
6334 init_idle(current, smp_processor_id());
dd41f596
IM
6335 /*
6336 * During early bootup we pretend to be a normal task:
6337 */
6338 current->sched_class = &fair_sched_class;
1da177e4
LT
6339}
6340
6341#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6342void __might_sleep(char *file, int line)
6343{
48f24c4d 6344#ifdef in_atomic
1da177e4
LT
6345 static unsigned long prev_jiffy; /* ratelimiting */
6346
6347 if ((in_atomic() || irqs_disabled()) &&
6348 system_state == SYSTEM_RUNNING && !oops_in_progress) {
6349 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6350 return;
6351 prev_jiffy = jiffies;
91368d73 6352 printk(KERN_ERR "BUG: sleeping function called from invalid"
1da177e4
LT
6353 " context at %s:%d\n", file, line);
6354 printk("in_atomic():%d, irqs_disabled():%d\n",
6355 in_atomic(), irqs_disabled());
a4c410f0 6356 debug_show_held_locks(current);
3117df04
IM
6357 if (irqs_disabled())
6358 print_irqtrace_events(current);
1da177e4
LT
6359 dump_stack();
6360 }
6361#endif
6362}
6363EXPORT_SYMBOL(__might_sleep);
6364#endif
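/*
 * Illustrative sketch (not part of this file): callers normally reach
 * __might_sleep() through the might_sleep() macro, placed at the top of a
 * function that may block, so misuse from atomic context is flagged early.
 * "example_settle" is a made-up name for the illustration.
 */
static void example_settle(void)
{
	might_sleep();		/* warns if called with irqs off or in_atomic() */
	msleep(10);		/* this really can sleep */
}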
6365
6366#ifdef CONFIG_MAGIC_SYSRQ
6367void normalize_rt_tasks(void)
6368{
a0f98a1c 6369 struct task_struct *g, *p;
1da177e4 6370 unsigned long flags;
70b97a7f 6371 struct rq *rq;
dd41f596 6372 int on_rq;
1da177e4
LT
6373
6374 read_lock_irq(&tasklist_lock);
a0f98a1c 6375 do_each_thread(g, p) {
dd41f596
IM
6376 p->se.fair_key = 0;
6377 p->se.wait_runtime = 0;
6378 p->se.wait_start_fair = 0;
6379 p->se.wait_start = 0;
6380 p->se.exec_start = 0;
6381 p->se.sleep_start = 0;
6382 p->se.sleep_start_fair = 0;
6383 p->se.block_start = 0;
6384 task_rq(p)->cfs.fair_clock = 0;
6385 task_rq(p)->clock = 0;
6386
6387 if (!rt_task(p)) {
6388 /*
6389 * Renice negative nice level userspace
6390 * tasks back to 0:
6391 */
6392 if (TASK_NICE(p) < 0 && p->mm)
6393 set_user_nice(p, 0);
1da177e4 6394 continue;
dd41f596 6395 }
1da177e4 6396
b29739f9
IM
6397 spin_lock_irqsave(&p->pi_lock, flags);
6398 rq = __task_rq_lock(p);
dd41f596
IM
6399#ifdef CONFIG_SMP
6400 /*
6401 * Do not touch the migration thread:
6402 */
6403 if (p == rq->migration_thread)
6404 goto out_unlock;
6405#endif
1da177e4 6406
dd41f596
IM
6407 on_rq = p->se.on_rq;
6408 if (on_rq)
6409 deactivate_task(task_rq(p), p, 0);
6410 __setscheduler(rq, p, SCHED_NORMAL, 0);
6411 if (on_rq) {
6412 activate_task(task_rq(p), p, 0);
1da177e4
LT
6413 resched_task(rq->curr);
6414 }
dd41f596
IM
6415#ifdef CONFIG_SMP
6416 out_unlock:
6417#endif
b29739f9
IM
6418 __task_rq_unlock(rq);
6419 spin_unlock_irqrestore(&p->pi_lock, flags);
a0f98a1c
IM
6420 } while_each_thread(g, p);
6421
1da177e4
LT
6422 read_unlock_irq(&tasklist_lock);
6423}
6424
6425#endif /* CONFIG_MAGIC_SYSRQ */
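/*
 * Usage note (an aside): normalize_rt_tasks() is reached via the magic SysRq
 * 'n' key, so an administrator can demote runaway real-time tasks back to
 * SCHED_NORMAL with, for example:
 *
 *	echo n > /proc/sysrq-trigger
 */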
1df5c10a
LT
6426
6427#ifdef CONFIG_IA64
6428/*
6429 * These functions are only useful for the IA64 MCA handling.
6430 *
6431 * They can only be called when the whole system has been
6432 * stopped - every CPU needs to be quiescent, and no scheduling
6433 * activity can take place. Using them for anything else would
6434 * be a serious bug, and as a result, they aren't even visible
6435 * under any other configuration.
6436 */
6437
6438/**
6439 * curr_task - return the current task for a given cpu.
6440 * @cpu: the processor in question.
6441 *
6442 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6443 */
36c8b586 6444struct task_struct *curr_task(int cpu)
1df5c10a
LT
6445{
6446 return cpu_curr(cpu);
6447}
6448
6449/**
6450 * set_curr_task - set the current task for a given cpu.
6451 * @cpu: the processor in question.
6452 * @p: the task pointer to set.
6453 *
6454 * Description: This function must only be used when non-maskable interrupts
6455 * are serviced on a separate stack. It allows the architecture to switch the
6456 * notion of the current task on a cpu in a non-blocking manner. This function
6457 * must be called with all CPUs synchronized and interrupts disabled; the
6458 * caller must save the original value of the current task (see
6459 * curr_task() above) and restore that value before re-enabling interrupts and
6460 * restarting the system.
6461 *
6462 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6463 */
36c8b586 6464void set_curr_task(int cpu, struct task_struct *p)
1df5c10a
LT
6465{
6466 cpu_curr(cpu) = p;
6467}
6468
6469#endif